# HG changeset patch # User michaelni # Date 1042754095 0 # Node ID 3b7cc8e4b83f4fa2498ff340a38ec325c3a59a92 # Parent fb6cbb8a04a3779c3ad1425022580e0b958e4553 AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau ) diff -r fb6cbb8a04a3 -r 3b7cc8e4b83f fft.c --- a/fft.c Wed Jan 15 19:21:21 2003 +0000 +++ b/fft.c Thu Jan 16 21:54:55 2003 +0000 @@ -53,12 +53,12 @@ /* compute constant table for HAVE_SSE version */ #if (defined(HAVE_MMX) && defined(HAVE_BUILTIN_VECTOR)) || defined(HAVE_ALTIVEC) { - int has_vectors; + int has_vectors = 0; #if defined(HAVE_MMX) has_vectors = mm_support() & MM_SSE; #endif -#if defined(HAVE_ALTIVEC) +#if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE) has_vectors = mm_support() & MM_ALTIVEC; #endif if (has_vectors) { diff -r fb6cbb8a04a3 -r 3b7cc8e4b83f ppc/dsputil_altivec.c --- a/ppc/dsputil_altivec.c Wed Jan 15 19:21:21 2003 +0000 +++ b/ppc/dsputil_altivec.c Thu Jan 16 21:54:55 2003 +0000 @@ -24,6 +24,22 @@ #include #endif +#ifdef ALTIVEC_TBL_PERFORMANCE_REPORT +unsigned long long perfdata[altivec_perf_total][altivec_data_total]; +/* list below must match enum in dsputil_altivec.h */ +static unsigned char* perfname[] = { + "fft_calc", + "gmc1", + "dct_unquantize_h263", + "idct_add", + "idct_put", + "put_pixels_clamped", + "put_pixels16", + "avg_pixels16" +}; +#include +#endif + int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; @@ -594,7 +610,7 @@ } void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { -#if 0 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE int i; for(i=0; i+7l); + *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); + *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); + *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); + pixels+=line_size; + block +=line_size; + } + +ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + + register vector unsigned char perm = vec_lvsl(0, pixels); + register vector unsigned char pixelsv1, pixelsv2; + int i; + +ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1); + + for(i=0; i>1) ) +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +ALTIVEC_TBL_DECLARE(altivec_avg_pixels16_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + +ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); + + for(i=0; il)); + op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); + op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); + op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); + pixels+=line_size; + block +=line_size; + } + +ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + + register vector unsigned char perm = vec_lvsl(0, pixels); + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; + int i; + +ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); + + for(i=0; i tbl_start) \ + { \ + unsigned long diff = tbl_stop - tbl_start; \ + if (cond) \ + { \ + if (diff < perfdata[a][altivec_data_min]) \ + perfdata[a][altivec_data_min] = diff; \ + if (diff > perfdata[a][altivec_data_max]) \ + perfdata[a][altivec_data_max] = diff; \ + perfdata[a][altivec_data_sum] += diff; \ + perfdata[a][altivec_data_num] ++; \ + } \ + } \ +} while (0) +#else /* ALTIVEC_TBL_PERFORMANCE_REPORT */ +#define ALTIVEC_TBL_DECLARE(a, cond) +#define ALTIVEC_TBL_START_COUNT(a, cond) +#define ALTIVEC_TBL_STOP_COUNT(a, cond) +#endif /* ALTIVEC_TBL_PERFORMANCE_REPORT */ + +#else /* HAVE_ALTIVEC */ +#ifdef ALTIVEC_USE_REFERENCE_C_CODE +#error "I can't use ALTIVEC_USE_REFERENCE_C_CODE if I don't use HAVE_ALTIVEC" +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +#ifdef ALTIVEC_TBL_PERFORMANCE_REPORT +#error "I can't use ALTIVEC_TBL_PERFORMANCE_REPORT if I don't use HAVE_ALTIVEC" +#endif /* ALTIVEC_TBL_PERFORMANCE_REPORT */ +#endif /* HAVE_ALTIVEC */ diff -r fb6cbb8a04a3 -r 3b7cc8e4b83f ppc/dsputil_ppc.c --- a/ppc/dsputil_ppc.c Wed Jan 15 19:21:21 2003 +0000 +++ b/ppc/dsputil_ppc.c Thu Jan 16 21:54:55 2003 +0000 @@ -60,11 +60,27 @@ c->pix_sum = pix_sum_altivec; c->diff_pixels = diff_pixels_altivec; c->get_pixels = get_pixels_altivec; -// next one disabled as it it untested. +// next two disabled as they're untested. #if 0 c->add_bytes= add_bytes_altivec; + c->put_pixels_clamped = put_pixels_clamped_altivec; #endif + c->put_pixels_tab[0][0] = put_pixels16_altivec; + c->avg_pixels_tab[0][0] = avg_pixels16_altivec; c->gmc1 = gmc1_altivec; + +#ifdef ALTIVEC_TBL_PERFORMANCE_REPORT + { + int i; + for (i = 0 ; i < altivec_perf_total ; i++) + { + perfdata[i][altivec_data_min] = 0xFFFFFFFFFFFFFFFF; + perfdata[i][altivec_data_max] = 0x0000000000000000; + perfdata[i][altivec_data_sum] = 0x0000000000000000; + perfdata[i][altivec_data_num] = 0x0000000000000000; + } + } +#endif } else #endif { diff -r fb6cbb8a04a3 -r 3b7cc8e4b83f ppc/fft_altivec.c --- a/ppc/fft_altivec.c Wed Jan 15 19:21:21 2003 +0000 +++ b/ppc/fft_altivec.c Thu Jan 16 21:54:55 2003 +0000 @@ -22,6 +22,31 @@ #include "dsputil_altivec.h" +/* + those three macros are from libavcodec/fft.c + and are required for the reference C code +*/ +/* butter fly op */ +#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ +{\ + FFTSample ax, ay, bx, by;\ + bx=pre1;\ + by=pim1;\ + ax=qre1;\ + ay=qim1;\ + pre = (bx + ax);\ + pim = (by + ay);\ + qre = (bx - ax);\ + qim = (by - ay);\ +} +#define MUL16(a,b) ((a) * (b)) +#define CMUL(pre, pim, are, aim, bre, bim) \ +{\ + pre = (MUL16(are, bre) - MUL16(aim, bim));\ + pim = (MUL16(are, bim) + MUL16(bre, aim));\ +} + + /** * Do a complex FFT with the parameters defined in fft_init(). The * input data must be permuted before with s->revtab table. No @@ -35,6 +60,84 @@ */ void fft_calc_altivec(FFTContext *s, FFTComplex *z) { +ALTIVEC_TBL_DECLARE(altivec_fft_num, s->nbits >= 6); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int ln = s->nbits; + int j, np, np2; + int nblocks, nloops; + register FFTComplex *p, *q; + FFTComplex *exptab = s->exptab; + int l; + FFTSample tmp_re, tmp_im; + +ALTIVEC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6); + + np = 1 << ln; + + /* pass 0 */ + + p=&z[0]; + j=(np >> 1); + do { + BF(p[0].re, p[0].im, p[1].re, p[1].im, + p[0].re, p[0].im, p[1].re, p[1].im); + p+=2; + } while (--j != 0); + + /* pass 1 */ + + + p=&z[0]; + j=np >> 2; + if (s->inverse) { + do { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, -p[3].im, p[3].re); + p+=4; + } while (--j != 0); + } else { + do { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, p[3].im, -p[3].re); + p+=4; + } while (--j != 0); + } + /* pass 2 .. ln-1 */ + + nblocks = np >> 3; + nloops = 1 << 2; + np2 = np >> 1; + do { + p = z; + q = z + nloops; + for (j = 0; j < nblocks; ++j) { + BF(p->re, p->im, q->re, q->im, + p->re, p->im, q->re, q->im); + + p++; + q++; + for(l = nblocks; l < np2; l += nblocks) { + CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); + BF(p->re, p->im, q->re, q->im, + p->re, p->im, tmp_re, tmp_im); + p++; + q++; + } + + p += nloops; + q += nloops; + } + nblocks = nblocks >> 1; + nloops = nloops << 1; + } while (nblocks != 0); + +ALTIVEC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ register const vector float vczero = (const vector float)(0.); int ln = s->nbits; @@ -44,6 +147,8 @@ FFTComplex *cptr, *cptr1; int k; +ALTIVEC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6); + np = 1 << ln; { @@ -129,5 +234,8 @@ nblocks = nblocks >> 1; nloops = nloops << 1; } while (nblocks != 0); + +ALTIVEC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } - diff -r fb6cbb8a04a3 -r 3b7cc8e4b83f ppc/gmc_altivec.c --- a/ppc/gmc_altivec.c Wed Jan 15 19:21:21 2003 +0000 +++ b/ppc/gmc_altivec.c Thu Jan 16 21:54:55 2003 +0000 @@ -28,13 +28,15 @@ */ void gmc1_altivec(UINT8 *dst /* align 8 */, UINT8 *src /* align1 */, int stride, int h, int x16, int y16, int rounder) { -#if 0 +ALTIVEC_TBL_DECLARE(altivec_gmc1_num, h == 8); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE const int A=(16-x16)*(16-y16); const int B=( x16)*(16-y16); const int C=(16-x16)*( y16); const int D=( x16)*( y16); - int i; + +ALTIVEC_TBL_START_COUNT(altivec_gmc1_num, h == 8); for(i=0; i /* malloc(), free() */ #include #include "../dsputil.h" +#include "dsputil_altivec.h" #define vector_s16_t vector signed short #define vector_u16_t vector unsigned short @@ -160,8 +161,17 @@ void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) { +ALTIVEC_TBL_DECLARE(altivec_idct_put_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE +ALTIVEC_TBL_START_COUNT(altivec_idct_put_num, 1); + void simple_idct_put(UINT8 *dest, int line_size, INT16 *block); + simple_idct_put(dest, stride, (INT16*)block); +ALTIVEC_TBL_STOP_COUNT(altivec_idct_put_num, 1); +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ vector_u8_t tmp; +ALTIVEC_TBL_START_COUNT(altivec_idct_put_num, 1); + IDCT #define COPY(dest,src) \ @@ -177,16 +187,28 @@ COPY (dest, vx5) dest += stride; COPY (dest, vx6) dest += stride; COPY (dest, vx7) + +ALTIVEC_TBL_STOP_COUNT(altivec_idct_put_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block) { +ALTIVEC_TBL_DECLARE(altivec_idct_add_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE +ALTIVEC_TBL_START_COUNT(altivec_idct_add_num, 1); + void simple_idct_add(UINT8 *dest, int line_size, INT16 *block); + simple_idct_add(dest, stride, (INT16*)block); +ALTIVEC_TBL_STOP_COUNT(altivec_idct_add_num, 1); +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ vector_u8_t tmp; vector_s16_t tmp2, tmp3; vector_u8_t perm0; vector_u8_t perm1; vector_u8_t p0, p1, p; +ALTIVEC_TBL_START_COUNT(altivec_idct_add_num, 1); + IDCT p0 = vec_lvsl (0, dest); @@ -212,5 +234,8 @@ ADD (dest, vx5, perm1) dest += stride; ADD (dest, vx6, perm0) dest += stride; ADD (dest, vx7, perm1) + +ALTIVEC_TBL_STOP_COUNT(altivec_idct_add_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } diff -r fb6cbb8a04a3 -r 3b7cc8e4b83f ppc/mpegvideo_altivec.c --- a/ppc/mpegvideo_altivec.c Wed Jan 15 19:21:21 2003 +0000 +++ b/ppc/mpegvideo_altivec.c Thu Jan 16 21:54:55 2003 +0000 @@ -20,6 +20,7 @@ #include #include "../dsputil.h" #include "../mpegvideo.h" +#include "dsputil_altivec.h" // Swaps two variables (used for altivec registers) #define SWAP(a,b) \ @@ -510,10 +511,13 @@ void dct_unquantize_h263_altivec(MpegEncContext *s, DCTELEM *block, int n, int qscale) { +ALTIVEC_TBL_DECLARE(altivec_dct_unquantize_h263_num, 1); int i, level, qmul, qadd; int nCoeffs; assert(s->block_last_index[n]>=0); + +ALTIVEC_TBL_START_COUNT(altivec_dct_unquantize_h263_num, 1); qadd = (qscale - 1) | 1; qmul = qscale << 1; @@ -533,7 +537,7 @@ nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; } -#if 0 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE for(;i<=nCoeffs;i++) { level = block[i]; if (level) { @@ -545,7 +549,7 @@ block[i] = level; } } -#else +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ { register const vector short vczero = (const vector short)(0); short __attribute__ ((aligned(16))) qmul8[] = @@ -572,6 +576,7 @@ qaddv = vec_ld(0, qadd8); nqaddv = vec_ld(0, nqadd8); +#if 0 // block *is* 16 bytes-aligned, it seems. // first make sure block[j] is 16 bytes-aligned for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { level = block[j]; @@ -584,6 +589,7 @@ block[j] = level; } } +#endif // vectorize all the 16 bytes-aligned blocks // of 8 elements @@ -622,5 +628,7 @@ block[0] = backup_0; } } -#endif +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ + +ALTIVEC_TBL_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); } diff -r fb6cbb8a04a3 -r 3b7cc8e4b83f ppc/mpegvideo_ppc.c --- a/ppc/mpegvideo_ppc.c Wed Jan 15 19:21:21 2003 +0000 +++ b/ppc/mpegvideo_ppc.c Thu Jan 16 21:54:55 2003 +0000 @@ -44,7 +44,11 @@ { s->idct_put = idct_put_altivec; s->idct_add = idct_add_altivec; +#ifndef ALTIVEC_USE_REFERENCE_C_CODE s->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + s->idct_permutation_type = FF_NO_IDCT_PERM; +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } // Test to make sure that the dct required alignments are met.