changeset 7333:a8a79f5385f6 libavcodec
cosmetics: Reformat PPC code in libavcodec according to style guidelines.
This includes indentation changes, comment reformatting, consistent brace
placement and some prettyprinting.
author: diego
date: Sun, 20 Jul 2008 18:58:30 +0000
parents: b1003e468c3d
children: 3a93377e8b76
files: ppc/dsputil_ppc.c ppc/dsputil_ppc.h ppc/fft_altivec.c ppc/gmc_altivec.c ppc/h264_altivec.c ppc/h264_template_altivec.c ppc/idct_altivec.c ppc/imgresample_altivec.c ppc/int_altivec.c ppc/mathops.h ppc/mpegvideo_altivec.c ppc/snow_altivec.c
diffstat: 12 files changed, 794 insertions(+), 847 deletions(-)
--- a/ppc/dsputil_ppc.c Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/dsputil_ppc.c Sun Jul 20 18:58:30 2008 +0000 @@ -60,33 +60,33 @@ unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; /* list below must match enum in dsputil_ppc.h */ static unsigned char* perfname[] = { - "ff_fft_calc_altivec", - "gmc1_altivec", - "dct_unquantize_h263_altivec", - "fdct_altivec", - "idct_add_altivec", - "idct_put_altivec", - "put_pixels16_altivec", - "avg_pixels16_altivec", - "avg_pixels8_altivec", - "put_pixels8_xy2_altivec", - "put_no_rnd_pixels8_xy2_altivec", - "put_pixels16_xy2_altivec", - "put_no_rnd_pixels16_xy2_altivec", - "hadamard8_diff8x8_altivec", - "hadamard8_diff16_altivec", - "avg_pixels8_xy2_altivec", - "clear_blocks_dcbz32_ppc", - "clear_blocks_dcbz128_ppc", - "put_h264_chroma_mc8_altivec", - "avg_h264_chroma_mc8_altivec", - "put_h264_qpel16_h_lowpass_altivec", - "avg_h264_qpel16_h_lowpass_altivec", - "put_h264_qpel16_v_lowpass_altivec", - "avg_h264_qpel16_v_lowpass_altivec", - "put_h264_qpel16_hv_lowpass_altivec", - "avg_h264_qpel16_hv_lowpass_altivec", - "" + "ff_fft_calc_altivec", + "gmc1_altivec", + "dct_unquantize_h263_altivec", + "fdct_altivec", + "idct_add_altivec", + "idct_put_altivec", + "put_pixels16_altivec", + "avg_pixels16_altivec", + "avg_pixels8_altivec", + "put_pixels8_xy2_altivec", + "put_no_rnd_pixels8_xy2_altivec", + "put_pixels16_xy2_altivec", + "put_no_rnd_pixels16_xy2_altivec", + "hadamard8_diff8x8_altivec", + "hadamard8_diff16_altivec", + "avg_pixels8_xy2_altivec", + "clear_blocks_dcbz32_ppc", + "clear_blocks_dcbz128_ppc", + "put_h264_chroma_mc8_altivec", + "avg_h264_chroma_mc8_altivec", + "put_h264_qpel16_h_lowpass_altivec", + "avg_h264_qpel16_h_lowpass_altivec", + "put_h264_qpel16_v_lowpass_altivec", + "avg_h264_qpel16_v_lowpass_altivec", + "put_h264_qpel16_hv_lowpass_altivec", + "avg_h264_qpel16_hv_lowpass_altivec", + "" }; #include <stdio.h> #endif @@ -94,51 +94,44 @@ #ifdef CONFIG_POWERPC_PERF void powerpc_display_perf_report(void) { - int i, j; - av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); - for(i = 0 ; i < powerpc_perf_total ; i++) - { - for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) - { - if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) - av_log(NULL, AV_LOG_INFO, - " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n", - perfname[i], - j+1, - perfdata[j][i][powerpc_data_min], - perfdata[j][i][powerpc_data_max], - (double)perfdata[j][i][powerpc_data_sum] / - (double)perfdata[j][i][powerpc_data_num], - perfdata[j][i][powerpc_data_num]); - } - } + int i, j; + av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); + for(i = 0 ; i < powerpc_perf_total ; i++) { + for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) { + if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) + av_log(NULL, AV_LOG_INFO, + " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n", + perfname[i], + j+1, + perfdata[j][i][powerpc_data_min], + perfdata[j][i][powerpc_data_max], + (double)perfdata[j][i][powerpc_data_sum] / + (double)perfdata[j][i][powerpc_data_num], + perfdata[j][i][powerpc_data_num]); + } + } } #endif /* CONFIG_POWERPC_PERF */ /* ***** WARNING ***** WARNING ***** WARNING ***** */ /* - clear_blocks_dcbz32_ppc will not work properly - on PowerPC 
processors with a cache line size - not equal to 32 bytes. - Fortunately all processor used by Apple up to - at least the 7450 (aka second generation G4) - use 32 bytes cache line. - This is due to the use of the 'dcbz' instruction. - It simply clear to zero a single cache line, - so you need to know the cache line size to use it ! - It's absurd, but it's fast... +clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a +cache line size not equal to 32 bytes. +Fortunately all processor used by Apple up to at least the 7450 (aka second +generation G4) use 32 bytes cache line. +This is due to the use of the 'dcbz' instruction. It simply clear to zero a +single cache line, so you need to know the cache line size to use it ! +It's absurd, but it's fast... - update 24/06/2003 : Apple released yesterday the G5, - with a PPC970. cache line size : 128 bytes. Oups. - The semantic of dcbz was changed, it always clear - 32 bytes. so the function below will work, but will - be slow. So I fixed check_dcbz_effect to use dcbzl, - which is defined to clear a cache line (as dcbz before). - So we still can distinguish, and use dcbz (32 bytes) - or dcbzl (one cache line) as required. +update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line +size: 128 bytes. Oups. +The semantic of dcbz was changed, it always clear 32 bytes. so the function +below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl, +which is defined to clear a cache line (as dcbz before). So we still can +distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required. - see <http://developer.apple.com/technotes/tn/tn2087.html> - and <http://developer.apple.com/technotes/tn/tn2086.html> +see <http://developer.apple.com/technotes/tn/tn2087.html> +and <http://developer.apple.com/technotes/tn/tn2086.html> */ void clear_blocks_dcbz32_ppc(DCTELEM *blocks) { @@ -148,21 +141,21 @@ POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); #if 1 if (misal) { - ((unsigned long*)blocks)[0] = 0L; - ((unsigned long*)blocks)[1] = 0L; - ((unsigned long*)blocks)[2] = 0L; - ((unsigned long*)blocks)[3] = 0L; - i += 16; + ((unsigned long*)blocks)[0] = 0L; + ((unsigned long*)blocks)[1] = 0L; + ((unsigned long*)blocks)[2] = 0L; + ((unsigned long*)blocks)[3] = 0L; + i += 16; } for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) { - asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); + asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); } if (misal) { - ((unsigned long*)blocks)[188] = 0L; - ((unsigned long*)blocks)[189] = 0L; - ((unsigned long*)blocks)[190] = 0L; - ((unsigned long*)blocks)[191] = 0L; - i += 16; + ((unsigned long*)blocks)[188] = 0L; + ((unsigned long*)blocks)[189] = 0L; + ((unsigned long*)blocks)[190] = 0L; + ((unsigned long*)blocks)[191] = 0L; + i += 16; } #else memset(blocks, 0, sizeof(DCTELEM)*6*64); @@ -180,16 +173,16 @@ register int i = 0; POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); #if 1 - if (misal) { - // we could probably also optimize this case, - // but there's not much point as the machines - // aren't available yet (2003-06-26) - memset(blocks, 0, sizeof(DCTELEM)*6*64); + if (misal) { + // we could probably also optimize this case, + // but there's not much point as the machines + // aren't available yet (2003-06-26) + memset(blocks, 0, sizeof(DCTELEM)*6*64); } else - for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { - asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); - } + for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) 
{ + asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); + } #else memset(blocks, 0, sizeof(DCTELEM)*6*64); #endif @@ -198,7 +191,7 @@ #else void clear_blocks_dcbz128_ppc(DCTELEM *blocks) { - memset(blocks, 0, sizeof(DCTELEM)*6*64); + memset(blocks, 0, sizeof(DCTELEM)*6*64); } #endif @@ -210,34 +203,32 @@ knows about dcbzl ... */ long check_dcbzl_effect(void) { - register char *fakedata = av_malloc(1024); - register char *fakedata_middle; - register long zero = 0; - register long i = 0; - long count = 0; + register char *fakedata = av_malloc(1024); + register char *fakedata_middle; + register long zero = 0; + register long i = 0; + long count = 0; - if (!fakedata) - { - return 0L; - } + if (!fakedata) { + return 0L; + } - fakedata_middle = (fakedata + 512); + fakedata_middle = (fakedata + 512); - memset(fakedata, 0xFF, 1024); + memset(fakedata, 0xFF, 1024); - /* below the constraint "b" seems to mean "Address base register" - in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */ - asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); + /* below the constraint "b" seems to mean "Address base register" + in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */ + asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); - for (i = 0; i < 1024 ; i ++) - { - if (fakedata[i] == (char)0) - count++; - } + for (i = 0; i < 1024 ; i ++) { + if (fakedata[i] == (char)0) + count++; + } - av_free(fakedata); + av_free(fakedata); - return count; + return count; } #else long check_dcbzl_effect(void) @@ -286,36 +277,31 @@ #ifdef CONFIG_ENCODERS if (avctx->dct_algo == FF_DCT_AUTO || - avctx->dct_algo == FF_DCT_ALTIVEC) - { + avctx->dct_algo == FF_DCT_ALTIVEC) { c->fdct = fdct_altivec; } #endif //CONFIG_ENCODERS - if (avctx->lowres==0) - { - if ((avctx->idct_algo == FF_IDCT_AUTO) || - (avctx->idct_algo == FF_IDCT_ALTIVEC)) - { - c->idct_put = idct_put_altivec; - c->idct_add = idct_add_altivec; - c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } + if (avctx->lowres==0) { + if ((avctx->idct_algo == FF_IDCT_AUTO) || + (avctx->idct_algo == FF_IDCT_ALTIVEC)) { + c->idct_put = idct_put_altivec; + c->idct_add = idct_add_altivec; + c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + } } #ifdef CONFIG_POWERPC_PERF { - int i, j; - for (i = 0 ; i < powerpc_perf_total ; i++) - { - for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) - { - perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; - perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; - perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; - perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; + int i, j; + for (i = 0 ; i < powerpc_perf_total ; i++) { + for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) { + perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; + perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; + perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; + perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; + } } - } } #endif /* CONFIG_POWERPC_PERF */ }
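The long comment reindented in the dsputil_ppc.c hunks above is the heart of this file: on G3/G4-class CPUs dcbz zeroes one 32-byte cache line, while on the G5 (PPC970, 128-byte lines) dcbz still clears only 32 bytes and dcbzl clears a full line, and check_dcbzl_effect() probes the machine by counting how many bytes a single dcbzl actually zeroed. As a reading aid only (not part of this changeset), here is a minimal sketch of how such a probe can drive the choice of block clearer; DCTELEM is assumed to be the usual 16-bit type and the externs are the functions shown in the diff.

```c
/* Illustration only; not part of this changeset.  Selects a block
 * clearer from the value returned by check_dcbzl_effect(), i.e. how
 * many bytes one dcbzl zeroes.  Assumes a PowerPC build and the
 * declarations from dsputil_ppc.c; DCTELEM assumed to be short. */
#include <string.h>

typedef short DCTELEM;                         /* assumption, see lead-in */

extern long check_dcbzl_effect(void);
extern void clear_blocks_dcbz32_ppc(DCTELEM *blocks);
extern void clear_blocks_dcbz128_ppc(DCTELEM *blocks);

static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM) * 6 * 64);   /* portable fallback */
}

static void (*clear_blocks)(DCTELEM *blocks) = clear_blocks_c;

static void pick_clear_blocks(void)
{
    switch (check_dcbzl_effect()) {
    case 32:                      /* 32-byte lines: G3/G4 class CPUs */
        clear_blocks = clear_blocks_dcbz32_ppc;
        break;
    case 128:                     /* 128-byte lines: G5 (PPC970) */
        clear_blocks = clear_blocks_dcbz128_ppc;
        break;
    default:                      /* unknown line size: stay portable */
        break;
    }
}
```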
--- a/ppc/dsputil_ppc.h Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/dsputil_ppc.h Sun Jul 20 18:58:30 2008 +0000 @@ -31,40 +31,40 @@ /* if you add to the enum below, also add to the perfname array in dsputil_ppc.c */ enum powerpc_perf_index { - altivec_fft_num = 0, - altivec_gmc1_num, - altivec_dct_unquantize_h263_num, - altivec_fdct, - altivec_idct_add_num, - altivec_idct_put_num, - altivec_put_pixels16_num, - altivec_avg_pixels16_num, - altivec_avg_pixels8_num, - altivec_put_pixels8_xy2_num, - altivec_put_no_rnd_pixels8_xy2_num, - altivec_put_pixels16_xy2_num, - altivec_put_no_rnd_pixels16_xy2_num, - altivec_hadamard8_diff8x8_num, - altivec_hadamard8_diff16_num, - altivec_avg_pixels8_xy2_num, - powerpc_clear_blocks_dcbz32, - powerpc_clear_blocks_dcbz128, - altivec_put_h264_chroma_mc8_num, - altivec_avg_h264_chroma_mc8_num, - altivec_put_h264_qpel16_h_lowpass_num, - altivec_avg_h264_qpel16_h_lowpass_num, - altivec_put_h264_qpel16_v_lowpass_num, - altivec_avg_h264_qpel16_v_lowpass_num, - altivec_put_h264_qpel16_hv_lowpass_num, - altivec_avg_h264_qpel16_hv_lowpass_num, - powerpc_perf_total + altivec_fft_num = 0, + altivec_gmc1_num, + altivec_dct_unquantize_h263_num, + altivec_fdct, + altivec_idct_add_num, + altivec_idct_put_num, + altivec_put_pixels16_num, + altivec_avg_pixels16_num, + altivec_avg_pixels8_num, + altivec_put_pixels8_xy2_num, + altivec_put_no_rnd_pixels8_xy2_num, + altivec_put_pixels16_xy2_num, + altivec_put_no_rnd_pixels16_xy2_num, + altivec_hadamard8_diff8x8_num, + altivec_hadamard8_diff16_num, + altivec_avg_pixels8_xy2_num, + powerpc_clear_blocks_dcbz32, + powerpc_clear_blocks_dcbz128, + altivec_put_h264_chroma_mc8_num, + altivec_avg_h264_chroma_mc8_num, + altivec_put_h264_qpel16_h_lowpass_num, + altivec_avg_h264_qpel16_h_lowpass_num, + altivec_put_h264_qpel16_v_lowpass_num, + altivec_avg_h264_qpel16_v_lowpass_num, + altivec_put_h264_qpel16_hv_lowpass_num, + altivec_avg_h264_qpel16_hv_lowpass_num, + powerpc_perf_total }; enum powerpc_data_index { - powerpc_data_min = 0, - powerpc_data_max, - powerpc_data_sum, - powerpc_data_num, - powerpc_data_total + powerpc_data_min = 0, + powerpc_data_max, + powerpc_data_sum, + powerpc_data_num, + powerpc_data_total }; extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; @@ -105,45 +105,42 @@ #define POWERPC_GET_PMC6(a) do {} while (0) #endif #endif /* HAVE_PPC64 */ -#define POWERPC_PERF_DECLARE(a, cond) \ - POWERP_PMC_DATATYPE \ - pmc_start[POWERPC_NUM_PMC_ENABLED], \ - pmc_stop[POWERPC_NUM_PMC_ENABLED], \ - pmc_loop_index; +#define POWERPC_PERF_DECLARE(a, cond) \ + POWERP_PMC_DATATYPE \ + pmc_start[POWERPC_NUM_PMC_ENABLED], \ + pmc_stop[POWERPC_NUM_PMC_ENABLED], \ + pmc_loop_index; #define POWERPC_PERF_START_COUNT(a, cond) do { \ - POWERPC_GET_PMC6(pmc_start[5]); \ - POWERPC_GET_PMC5(pmc_start[4]); \ - POWERPC_GET_PMC4(pmc_start[3]); \ - POWERPC_GET_PMC3(pmc_start[2]); \ - POWERPC_GET_PMC2(pmc_start[1]); \ - POWERPC_GET_PMC1(pmc_start[0]); \ - } while (0) + POWERPC_GET_PMC6(pmc_start[5]); \ + POWERPC_GET_PMC5(pmc_start[4]); \ + POWERPC_GET_PMC4(pmc_start[3]); \ + POWERPC_GET_PMC3(pmc_start[2]); \ + POWERPC_GET_PMC2(pmc_start[1]); \ + POWERPC_GET_PMC1(pmc_start[0]); \ + } while (0) #define POWERPC_PERF_STOP_COUNT(a, cond) do { \ - POWERPC_GET_PMC1(pmc_stop[0]); \ - POWERPC_GET_PMC2(pmc_stop[1]); \ - POWERPC_GET_PMC3(pmc_stop[2]); \ - POWERPC_GET_PMC4(pmc_stop[3]); \ - POWERPC_GET_PMC5(pmc_stop[4]); \ - POWERPC_GET_PMC6(pmc_stop[5]); \ - if (cond) \ - { \ - for(pmc_loop_index = 0; \ - 
pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ - pmc_loop_index++) \ - { \ - if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \ - { \ - POWERP_PMC_DATATYPE diff = \ - pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ - if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ - perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ - if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ - perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ - perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ - perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ - } \ - } \ - } \ + POWERPC_GET_PMC1(pmc_stop[0]); \ + POWERPC_GET_PMC2(pmc_stop[1]); \ + POWERPC_GET_PMC3(pmc_stop[2]); \ + POWERPC_GET_PMC4(pmc_stop[3]); \ + POWERPC_GET_PMC5(pmc_stop[4]); \ + POWERPC_GET_PMC6(pmc_stop[5]); \ + if (cond) { \ + for(pmc_loop_index = 0; \ + pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ + pmc_loop_index++) { \ + if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { \ + POWERP_PMC_DATATYPE diff = \ + pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ + if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ + perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ + if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ + perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ + perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ + perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ + } \ + } \ + } \ } while (0) #else /* CONFIG_POWERPC_PERF */ // those are needed to avoid empty statements.
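The PMC macros reformatted in the dsputil_ppc.h hunk above are used in pairs around each AltiVec kernel elsewhere in this patch; a minimal usage sketch (function name hypothetical, behaviour as defined by the macros) follows.

```c
/* Usage sketch for the PMC macros; not part of the patch.  Assumes
 * dsputil_ppc.h is included and CONFIG_POWERPC_PERF is defined;
 * otherwise the macros collapse to no-ops. */
#include "dsputil_ppc.h"

static void some_altivec_kernel(void)             /* hypothetical name */
{
    POWERPC_PERF_DECLARE(altivec_fft_num, 1);     /* pmc_start/pmc_stop/pmc_loop_index */

    POWERPC_PERF_START_COUNT(altivec_fft_num, 1); /* snapshot PMC1..PMC6 */
    /* ... the code being measured ... */
    POWERPC_PERF_STOP_COUNT(altivec_fft_num, 1);  /* fold the deltas into perfdata[] */
}
```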
--- a/ppc/fft_altivec.c Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/fft_altivec.c Sun Jul 20 18:58:30 2008 +0000 @@ -33,21 +33,21 @@ /* butter fly op */ #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ {\ - FFTSample ax, ay, bx, by;\ - bx=pre1;\ - by=pim1;\ - ax=qre1;\ - ay=qim1;\ - pre = (bx + ax);\ - pim = (by + ay);\ - qre = (bx - ax);\ - qim = (by - ay);\ + FFTSample ax, ay, bx, by;\ + bx=pre1;\ + by=pim1;\ + ax=qre1;\ + ay=qim1;\ + pre = (bx + ax);\ + pim = (by + ay);\ + qre = (bx - ax);\ + qim = (by - ay);\ } #define MUL16(a,b) ((a) * (b)) #define CMUL(pre, pim, are, aim, bre, bim) \ {\ - pre = (MUL16(are, bre) - MUL16(aim, bim));\ - pim = (MUL16(are, bim) + MUL16(bre, aim));\ + pre = (MUL16(are, bre) - MUL16(aim, bim));\ + pim = (MUL16(are, bim) + MUL16(bre, aim));\ } @@ -85,14 +85,11 @@ c1 = vcii(p,p,n,n); - if (s->inverse) - { - c2 = vcii(p,p,n,p); - } - else - { - c2 = vcii(p,p,p,n); - } + if (s->inverse) { + c2 = vcii(p,p,n,p); + } else { + c2 = vcii(p,p,p,n); + } j = (np >> 2); do {
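For readers not fluent in the macro style used in the fft_altivec.c hunk above: BF is the radix-2 FFT butterfly and CMUL an ordinary complex multiplication. A scalar restatement (illustration only, a plain struct standing in for the FFTSample pairs):

```c
/* Scalar equivalents of the BF and CMUL macros above (sketch). */
typedef struct { float re, im; } cplx;        /* stand-in for FFTSample pairs */

/* BF: p = b + a, q = b - a, component-wise on complex values. */
static void bf(cplx *p, cplx *q, cplx b, cplx a)
{
    p->re = b.re + a.re;  p->im = b.im + a.im;
    q->re = b.re - a.re;  q->im = b.im - a.im;
}

/* CMUL: p = a * b as complex numbers. */
static cplx cmul(cplx a, cplx b)
{
    cplx p = { a.re * b.re - a.im * b.im,
               a.re * b.im + a.im * b.re };
    return p;
}
```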
--- a/ppc/gmc_altivec.c Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/gmc_altivec.c Sun Jul 20 18:58:30 2008 +0000 @@ -36,16 +36,16 @@ { POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) = - {rounder, rounder, rounder, rounder, - rounder, rounder, rounder, rounder}; + {rounder, rounder, rounder, rounder, + rounder, rounder, rounder, rounder}; const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) = - { - (16-x16)*(16-y16), /* A */ - ( x16)*(16-y16), /* B */ - (16-x16)*( y16), /* C */ - ( x16)*( y16), /* D */ - 0, 0, 0, 0 /* padding */ - }; + { + (16-x16)*(16-y16), /* A */ + ( x16)*(16-y16), /* B */ + (16-x16)*( y16), /* C */ + ( x16)*( y16), /* D */ + 0, 0, 0, 0 /* padding */ + }; register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8); register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; @@ -74,73 +74,67 @@ src_1 = vec_ld(16, src); srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); - if (src_really_odd != 0x0000000F) - { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. - srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); - } - else - { - srcvB = src_1; + if (src_really_odd != 0x0000000F) { + // if src & 0xF == 0xF, then (src+1) is properly aligned + // on the second vector. + srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); + } else { + srcvB = src_1; } srcvA = vec_mergeh(vczero, srcvA); srcvB = vec_mergeh(vczero, srcvB); - for(i=0; i<h; i++) - { - dst_odd = (unsigned long)dst & 0x0000000F; - src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; + for(i=0; i<h; i++) { + dst_odd = (unsigned long)dst & 0x0000000F; + src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; - dstv = vec_ld(0, dst); + dstv = vec_ld(0, dst); - // we we'll be able to pick-up our 9 char elements - // at src + stride from those 32 bytes - // then reuse the resulting 2 vectors srvcC and srcvD - // as the next srcvA and srcvB - src_0 = vec_ld(stride + 0, src); - src_1 = vec_ld(stride + 16, src); - srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); + // we we'll be able to pick-up our 9 char elements + // at src + stride from those 32 bytes + // then reuse the resulting 2 vectors srvcC and srcvD + // as the next srcvA and srcvB + src_0 = vec_ld(stride + 0, src); + src_1 = vec_ld(stride + 16, src); + srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); - if (src_really_odd != 0x0000000F) - { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. - srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); - } - else - { - srcvD = src_1; - } + if (src_really_odd != 0x0000000F) { + // if src & 0xF == 0xF, then (src+1) is properly aligned + // on the second vector. + srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); + } else { + srcvD = src_1; + } - srcvC = vec_mergeh(vczero, srcvC); - srcvD = vec_mergeh(vczero, srcvD); + srcvC = vec_mergeh(vczero, srcvC); + srcvD = vec_mergeh(vczero, srcvD); - // OK, now we (finally) do the math :-) - // those four instructions replaces 32 int muls & 32 int adds. - // isn't AltiVec nice ? 
- tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); - tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); - tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); - tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); + // OK, now we (finally) do the math :-) + // those four instructions replaces 32 int muls & 32 int adds. + // isn't AltiVec nice ? + tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); + tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); + tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); + tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); - srcvA = srcvC; - srcvB = srcvD; + srcvA = srcvC; + srcvB = srcvD; - tempD = vec_sr(tempD, vcsr8); + tempD = vec_sr(tempD, vcsr8); - dstv2 = vec_pack(tempD, (vector unsigned short)vczero); + dstv2 = vec_pack(tempD, (vector unsigned short)vczero); - if (dst_odd) - { - dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); - } - else - { - dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); - } + if (dst_odd) { + dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); + } else { + dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); + } - vec_st(dstv2, 0, dst); + vec_st(dstv2, 0, dst); - dst += stride; - src += stride; + dst += stride; + src += stride; } POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
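The ABCD weight table and the four vec_mladd calls in the gmc_altivec.c loop above implement the gmc1 bilinear blend; as the comment notes, they replace 32 scalar multiplies and adds per iteration. A scalar version of the per-pixel arithmetic (illustration only):

```c
#include <stdint.h>

/* Scalar form of one output pixel computed by the AltiVec loop above
 * (sketch): a bilinear blend of four neighbours weighted by the 4-bit
 * fractional offsets x16/y16, plus the rounding constant, then >>8. */
static uint8_t gmc1_pixel(const uint8_t *src, int stride,
                          int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);

    return (A * src[0]      + B * src[1] +
            C * src[stride] + D * src[stride + 1] + rounder) >> 8;
}
```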
--- a/ppc/h264_altivec.c Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/h264_altivec.c Sun Jul 20 18:58:30 2008 +0000 @@ -196,7 +196,7 @@ const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); LOAD_ZERO; const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); - const vec_u16_t v6us = vec_splat_u16(6); + const vec_u16_t v6us = vec_splat_u16(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; @@ -392,8 +392,8 @@ #define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h) */ - H264_MC(put_, 16, altivec) - H264_MC(avg_, 16, altivec) +H264_MC(put_, 16, altivec) +H264_MC(avg_, 16, altivec) /**************************************************************************** @@ -685,9 +685,9 @@ r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \ \ /*Third merge*/ \ - r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ - r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ - r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ + r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ + r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ + r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \ r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \ r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \
--- a/ppc/h264_template_altivec.c Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/h264_template_altivec.c Sun Jul 20 18:58:30 2008 +0000 @@ -206,489 +206,489 @@ /* this code assume stride % 16 == 0 */ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); - register int i; + POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); + register int i; - LOAD_ZERO; - const vec_u8_t permM2 = vec_lvsl(-2, src); - const vec_u8_t permM1 = vec_lvsl(-1, src); - const vec_u8_t permP0 = vec_lvsl(+0, src); - const vec_u8_t permP1 = vec_lvsl(+1, src); - const vec_u8_t permP2 = vec_lvsl(+2, src); - const vec_u8_t permP3 = vec_lvsl(+3, src); - const vec_s16_t v5ss = vec_splat_s16(5); - const vec_u16_t v5us = vec_splat_u16(5); - const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); + LOAD_ZERO; + const vec_u8_t permM2 = vec_lvsl(-2, src); + const vec_u8_t permM1 = vec_lvsl(-1, src); + const vec_u8_t permP0 = vec_lvsl(+0, src); + const vec_u8_t permP1 = vec_lvsl(+1, src); + const vec_u8_t permP2 = vec_lvsl(+2, src); + const vec_u8_t permP3 = vec_lvsl(+3, src); + const vec_s16_t v5ss = vec_splat_s16(5); + const vec_u16_t v5us = vec_splat_u16(5); + const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); + const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - register int align = ((((unsigned long)src) - 2) % 16); + register int align = ((((unsigned long)src) - 2) % 16); - vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, - srcP2A, srcP2B, srcP3A, srcP3B, - srcM1A, srcM1B, srcM2A, srcM2B, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, - pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, - psumA, psumB, sumA, sumB; + vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, + srcP2A, srcP2B, srcP3A, srcP3B, + srcM1A, srcM1B, srcM2A, srcM2B, + sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, + pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, + psumA, psumB, sumA, sumB; - vec_u8_t sum, vdst, fsum; + vec_u8_t sum, vdst, fsum; - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); + POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); - for (i = 0 ; i < 16 ; i ++) { - vec_u8_t srcR1 = vec_ld(-2, src); - vec_u8_t srcR2 = vec_ld(14, src); + for (i = 0 ; i < 16 ; i ++) { + vec_u8_t srcR1 = vec_ld(-2, src); + vec_u8_t srcR2 = vec_ld(14, src); - switch (align) { - default: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = vec_perm(srcR1, srcR2, permP3); - } break; - case 11: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = srcR2; - } break; - case 12: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = srcR2; - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 13: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = 
srcR2; - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 14: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = srcR2; - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 15: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = srcR2; - srcP0 = vec_perm(srcR2, srcR3, permP0); - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; + switch (align) { + default: { + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = vec_perm(srcR1, srcR2, permP2); + srcP3 = vec_perm(srcR1, srcR2, permP3); + } break; + case 11: { + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = vec_perm(srcR1, srcR2, permP2); + srcP3 = srcR2; + } break; + case 12: { + vec_u8_t srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = srcR2; + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 13: { + vec_u8_t srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = srcR2; + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 14: { + vec_u8_t srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = srcR2; + srcP1 = vec_perm(srcR2, srcR3, permP1); + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 15: { + vec_u8_t srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = srcR2; + srcP0 = vec_perm(srcR2, srcR3, permP0); + srcP1 = vec_perm(srcR2, srcR3, permP1); + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + } + + srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); + srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); + srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); + srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); + + srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); + srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); + srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); + srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); + + srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); + srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); + srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); + srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); + + sum1A = vec_adds(srcP0A, srcP1A); + sum1B = vec_adds(srcP0B, srcP1B); + sum2A = vec_adds(srcM1A, srcP2A); + sum2B = vec_adds(srcM1B, srcP2B); + sum3A = vec_adds(srcM2A, srcP3A); + sum3B = vec_adds(srcM2B, srcP3B); + + pp1A = vec_mladd(sum1A, v20ss, v16ss); + pp1B = vec_mladd(sum1B, v20ss, v16ss); + + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); + + pp3A = vec_add(sum3A, pp1A); + pp3B = vec_add(sum3B, pp1B); + + psumA = vec_sub(pp3A, pp2A); + psumB = vec_sub(pp3B, pp2B); + + sumA = vec_sra(psumA, 
v5us); + sumB = vec_sra(psumB, v5us); + + sum = vec_packsu(sumA, sumB); + + ASSERT_ALIGNED(dst); + vdst = vec_ld(0, dst); + + OP_U8_ALTIVEC(fsum, sum, vdst); + + vec_st(fsum, 0, dst); + + src += srcStride; + dst += dstStride; } - - srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); - srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); - srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); - srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); - - srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); - srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); - srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); - srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); - - srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); - srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); - srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); - srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); - - sum1A = vec_adds(srcP0A, srcP1A); - sum1B = vec_adds(srcP0B, srcP1B); - sum2A = vec_adds(srcM1A, srcP2A); - sum2B = vec_adds(srcM1B, srcP2B); - sum3A = vec_adds(srcM2A, srcP3A); - sum3B = vec_adds(srcM2B, srcP3B); - - pp1A = vec_mladd(sum1A, v20ss, v16ss); - pp1B = vec_mladd(sum1B, v20ss, v16ss); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - pp3A = vec_add(sum3A, pp1A); - pp3B = vec_add(sum3B, pp1B); - - psumA = vec_sub(pp3A, pp2A); - psumB = vec_sub(pp3B, pp2B); - - sumA = vec_sra(psumA, v5us); - sumB = vec_sra(psumB, v5us); - - sum = vec_packsu(sumA, sumB); - - ASSERT_ALIGNED(dst); - vdst = vec_ld(0, dst); - - OP_U8_ALTIVEC(fsum, sum, vdst); - - vec_st(fsum, 0, dst); - - src += srcStride; - dst += dstStride; - } -POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); + POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); } /* this code assume stride % 16 == 0 */ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); + POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); - register int i; + register int i; - LOAD_ZERO; - const vec_u8_t perm = vec_lvsl(0, src); - const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_u16_t v5us = vec_splat_u16(5); - const vec_s16_t v5ss = vec_splat_s16(5); - const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); + LOAD_ZERO; + const vec_u8_t perm = vec_lvsl(0, src); + const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); + const vec_u16_t v5us = vec_splat_u16(5); + const vec_s16_t v5ss = vec_splat_s16(5); + const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - uint8_t *srcbis = src - (srcStride * 2); + uint8_t *srcbis = src - (srcStride * 2); - const vec_u8_t srcM2a = vec_ld(0, srcbis); - const vec_u8_t srcM2b = vec_ld(16, srcbis); - const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); -// srcbis += srcStride; - const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); - const vec_u8_t srcM1b = vec_ld(16, srcbis); - const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); -// srcbis += srcStride; - const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); - const vec_u8_t srcP0b = vec_ld(16, srcbis); - const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); -// srcbis += srcStride; - const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); - const vec_u8_t srcP1b = vec_ld(16, srcbis); - const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); -// srcbis += srcStride; - const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); - const vec_u8_t srcP2b = vec_ld(16, srcbis); - 
const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); -// srcbis += srcStride; + const vec_u8_t srcM2a = vec_ld(0, srcbis); + const vec_u8_t srcM2b = vec_ld(16, srcbis); + const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); + //srcbis += srcStride; + const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); + const vec_u8_t srcM1b = vec_ld(16, srcbis); + const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); + //srcbis += srcStride; + const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); + const vec_u8_t srcP0b = vec_ld(16, srcbis); + const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); + //srcbis += srcStride; + const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); + const vec_u8_t srcP1b = vec_ld(16, srcbis); + const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); + //srcbis += srcStride; + const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); + const vec_u8_t srcP2b = vec_ld(16, srcbis); + const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); + //srcbis += srcStride; - vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); - vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); - vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); - vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); - vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); - vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); - vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); - vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); - vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); - vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); + vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); + vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); + vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); + vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); + vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); + vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); + vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); + vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); + vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); + vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); - vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, - psumA, psumB, sumA, sumB, - srcP3ssA, srcP3ssB, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; + vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, + psumA, psumB, sumA, sumB, + srcP3ssA, srcP3ssB, + sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; - vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; + vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); + POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); - for (i = 0 ; i < 16 ; i++) { - srcP3a = vec_ld(0, srcbis += srcStride); - srcP3b = vec_ld(16, srcbis); - srcP3 = vec_perm(srcP3a, srcP3b, perm); - srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); - srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); -// srcbis += srcStride; + for (i = 0 ; i < 16 ; i++) { + srcP3a = vec_ld(0, srcbis += srcStride); + srcP3b = vec_ld(16, srcbis); + srcP3 = vec_perm(srcP3a, srcP3b, perm); + srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); + srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); + //srcbis += srcStride; - sum1A = vec_adds(srcP0ssA, srcP1ssA); - sum1B = vec_adds(srcP0ssB, srcP1ssB); - sum2A = vec_adds(srcM1ssA, srcP2ssA); - sum2B = vec_adds(srcM1ssB, srcP2ssB); - sum3A = vec_adds(srcM2ssA, srcP3ssA); - 
sum3B = vec_adds(srcM2ssB, srcP3ssB); + sum1A = vec_adds(srcP0ssA, srcP1ssA); + sum1B = vec_adds(srcP0ssB, srcP1ssB); + sum2A = vec_adds(srcM1ssA, srcP2ssA); + sum2B = vec_adds(srcM1ssB, srcP2ssB); + sum3A = vec_adds(srcM2ssA, srcP3ssA); + sum3B = vec_adds(srcM2ssB, srcP3ssB); - srcM2ssA = srcM1ssA; - srcM2ssB = srcM1ssB; - srcM1ssA = srcP0ssA; - srcM1ssB = srcP0ssB; - srcP0ssA = srcP1ssA; - srcP0ssB = srcP1ssB; - srcP1ssA = srcP2ssA; - srcP1ssB = srcP2ssB; - srcP2ssA = srcP3ssA; - srcP2ssB = srcP3ssB; + srcM2ssA = srcM1ssA; + srcM2ssB = srcM1ssB; + srcM1ssA = srcP0ssA; + srcM1ssB = srcP0ssB; + srcP0ssA = srcP1ssA; + srcP0ssB = srcP1ssB; + srcP1ssA = srcP2ssA; + srcP1ssB = srcP2ssB; + srcP2ssA = srcP3ssA; + srcP2ssB = srcP3ssB; - pp1A = vec_mladd(sum1A, v20ss, v16ss); - pp1B = vec_mladd(sum1B, v20ss, v16ss); + pp1A = vec_mladd(sum1A, v20ss, v16ss); + pp1B = vec_mladd(sum1B, v20ss, v16ss); - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - pp3A = vec_add(sum3A, pp1A); - pp3B = vec_add(sum3B, pp1B); + pp3A = vec_add(sum3A, pp1A); + pp3B = vec_add(sum3B, pp1B); - psumA = vec_sub(pp3A, pp2A); - psumB = vec_sub(pp3B, pp2B); + psumA = vec_sub(pp3A, pp2A); + psumB = vec_sub(pp3B, pp2B); - sumA = vec_sra(psumA, v5us); - sumB = vec_sra(psumB, v5us); + sumA = vec_sra(psumA, v5us); + sumB = vec_sra(psumB, v5us); - sum = vec_packsu(sumA, sumB); + sum = vec_packsu(sumA, sumB); - ASSERT_ALIGNED(dst); - vdst = vec_ld(0, dst); + ASSERT_ALIGNED(dst); + vdst = vec_ld(0, dst); - OP_U8_ALTIVEC(fsum, sum, vdst); + OP_U8_ALTIVEC(fsum, sum, vdst); - vec_st(fsum, 0, dst); + vec_st(fsum, 0, dst); - dst += dstStride; - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); + dst += dstStride; + } + POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); } /* this code assume stride % 16 == 0 *and* tmp is properly aligned */ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); - register int i; - LOAD_ZERO; - const vec_u8_t permM2 = vec_lvsl(-2, src); - const vec_u8_t permM1 = vec_lvsl(-1, src); - const vec_u8_t permP0 = vec_lvsl(+0, src); - const vec_u8_t permP1 = vec_lvsl(+1, src); - const vec_u8_t permP2 = vec_lvsl(+2, src); - const vec_u8_t permP3 = vec_lvsl(+3, src); - const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_u32_t v10ui = vec_splat_u32(10); - const vec_s16_t v5ss = vec_splat_s16(5); - const vec_s16_t v1ss = vec_splat_s16(1); - const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); - const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); + POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); + register int i; + LOAD_ZERO; + const vec_u8_t permM2 = vec_lvsl(-2, src); + const vec_u8_t permM1 = vec_lvsl(-1, src); + const vec_u8_t permP0 = vec_lvsl(+0, src); + const vec_u8_t permP1 = vec_lvsl(+1, src); + const vec_u8_t permP2 = vec_lvsl(+2, src); + const vec_u8_t permP3 = vec_lvsl(+3, src); + const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); + const vec_u32_t v10ui = vec_splat_u32(10); + const vec_s16_t v5ss = vec_splat_s16(5); + const vec_s16_t v1ss = vec_splat_s16(1); + const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); + const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); - register int align = 
((((unsigned long)src) - 2) % 16); + register int align = ((((unsigned long)src) - 2) % 16); - vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, - srcP2A, srcP2B, srcP3A, srcP3B, - srcM1A, srcM1B, srcM2A, srcM2B, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, - pp1A, pp1B, pp2A, pp2B, psumA, psumB; + vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, + srcP2A, srcP2B, srcP3A, srcP3B, + srcM1A, srcM1B, srcM2A, srcM2B, + sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, + pp1A, pp1B, pp2A, pp2B, psumA, psumB; - const vec_u8_t mperm = (const vec_u8_t) - AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, - 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); - int16_t *tmpbis = tmp; + const vec_u8_t mperm = (const vec_u8_t) + AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, + 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); + int16_t *tmpbis = tmp; - vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, - tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, - tmpP2ssA, tmpP2ssB; + vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, + tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, + tmpP2ssA, tmpP2ssB; - vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, - pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, - pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, - ssumAe, ssumAo, ssumBe, ssumBo; - vec_u8_t fsum, sumv, sum, vdst; - vec_s16_t ssume, ssumo; + vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, + pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, + pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, + ssumAe, ssumAo, ssumBe, ssumBo; + vec_u8_t fsum, sumv, sum, vdst; + vec_s16_t ssume, ssumo; - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); - src -= (2 * srcStride); - for (i = 0 ; i < 21 ; i ++) { - vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vec_u8_t srcR1 = vec_ld(-2, src); - vec_u8_t srcR2 = vec_ld(14, src); + POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); + src -= (2 * srcStride); + for (i = 0 ; i < 21 ; i ++) { + vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + vec_u8_t srcR1 = vec_ld(-2, src); + vec_u8_t srcR2 = vec_ld(14, src); - switch (align) { - default: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = vec_perm(srcR1, srcR2, permP3); - } break; - case 11: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = srcR2; - } break; - case 12: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = srcR2; - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 13: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = srcR2; - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 14: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = srcR2; - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 15: { - 
vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = srcR2; - srcP0 = vec_perm(srcR2, srcR3, permP0); - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; + switch (align) { + default: { + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = vec_perm(srcR1, srcR2, permP2); + srcP3 = vec_perm(srcR1, srcR2, permP3); + } break; + case 11: { + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = vec_perm(srcR1, srcR2, permP2); + srcP3 = srcR2; + } break; + case 12: { + vec_u8_t srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = srcR2; + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 13: { + vec_u8_t srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = srcR2; + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 14: { + vec_u8_t srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = srcR2; + srcP1 = vec_perm(srcR2, srcR3, permP1); + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 15: { + vec_u8_t srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = srcR2; + srcP0 = vec_perm(srcR2, srcR3, permP0); + srcP1 = vec_perm(srcR2, srcR3, permP1); + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + } + + srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); + srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); + srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); + srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); + + srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); + srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); + srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); + srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); + + srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); + srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); + srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); + srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); + + sum1A = vec_adds(srcP0A, srcP1A); + sum1B = vec_adds(srcP0B, srcP1B); + sum2A = vec_adds(srcM1A, srcP2A); + sum2B = vec_adds(srcM1B, srcP2B); + sum3A = vec_adds(srcM2A, srcP3A); + sum3B = vec_adds(srcM2B, srcP3B); + + pp1A = vec_mladd(sum1A, v20ss, sum3A); + pp1B = vec_mladd(sum1B, v20ss, sum3B); + + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); + + psumA = vec_sub(pp1A, pp2A); + psumB = vec_sub(pp1B, pp2B); + + vec_st(psumA, 0, tmp); + vec_st(psumB, 16, tmp); + + src += srcStride; + tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ } - srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); - srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); - srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); - srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); - - srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); - srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); - srcP3A = (vec_s16_t) 
vec_mergeh(zero_u8v, srcP3); - srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); - - srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); - srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); - srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); - srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); - - sum1A = vec_adds(srcP0A, srcP1A); - sum1B = vec_adds(srcP0B, srcP1B); - sum2A = vec_adds(srcM1A, srcP2A); - sum2B = vec_adds(srcM1B, srcP2B); - sum3A = vec_adds(srcM2A, srcP3A); - sum3B = vec_adds(srcM2B, srcP3B); - - pp1A = vec_mladd(sum1A, v20ss, sum3A); - pp1B = vec_mladd(sum1B, v20ss, sum3B); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - psumA = vec_sub(pp1A, pp2A); - psumB = vec_sub(pp1B, pp2B); - - vec_st(psumA, 0, tmp); - vec_st(psumB, 16, tmp); - - src += srcStride; - tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ - } - - tmpM2ssA = vec_ld(0, tmpbis); - tmpM2ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpM1ssA = vec_ld(0, tmpbis); - tmpM1ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP0ssA = vec_ld(0, tmpbis); - tmpP0ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP1ssA = vec_ld(0, tmpbis); - tmpP1ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP2ssA = vec_ld(0, tmpbis); - tmpP2ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - - for (i = 0 ; i < 16 ; i++) { - const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); - const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); - - const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); - const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); - const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); - const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); - const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); - const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); - + tmpM2ssA = vec_ld(0, tmpbis); + tmpM2ssB = vec_ld(16, tmpbis); + tmpbis += tmpStride; + tmpM1ssA = vec_ld(0, tmpbis); + tmpM1ssB = vec_ld(16, tmpbis); + tmpbis += tmpStride; + tmpP0ssA = vec_ld(0, tmpbis); + tmpP0ssB = vec_ld(16, tmpbis); + tmpbis += tmpStride; + tmpP1ssA = vec_ld(0, tmpbis); + tmpP1ssB = vec_ld(16, tmpbis); + tmpbis += tmpStride; + tmpP2ssA = vec_ld(0, tmpbis); + tmpP2ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; - tmpM2ssA = tmpM1ssA; - tmpM2ssB = tmpM1ssB; - tmpM1ssA = tmpP0ssA; - tmpM1ssB = tmpP0ssB; - tmpP0ssA = tmpP1ssA; - tmpP0ssB = tmpP1ssB; - tmpP1ssA = tmpP2ssA; - tmpP1ssB = tmpP2ssB; - tmpP2ssA = tmpP3ssA; - tmpP2ssB = tmpP3ssB; + for (i = 0 ; i < 16 ; i++) { + const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); + const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); + + const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); + const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); + const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); + const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); + const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); + const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); + + tmpbis += tmpStride; - pp1Ae = vec_mule(sum1A, v20ss); - pp1Ao = vec_mulo(sum1A, v20ss); - pp1Be = vec_mule(sum1B, v20ss); - pp1Bo = vec_mulo(sum1B, v20ss); - - pp2Ae = vec_mule(sum2A, v5ss); - pp2Ao = vec_mulo(sum2A, v5ss); - pp2Be = vec_mule(sum2B, v5ss); - pp2Bo = vec_mulo(sum2B, v5ss); + tmpM2ssA = tmpM1ssA; + tmpM2ssB = tmpM1ssB; + tmpM1ssA = tmpP0ssA; + tmpM1ssB = tmpP0ssB; + tmpP0ssA = tmpP1ssA; + tmpP0ssB = tmpP1ssB; + tmpP1ssA = tmpP2ssA; + tmpP1ssB = tmpP2ssB; + tmpP2ssA = tmpP3ssA; + tmpP2ssB = tmpP3ssB; - pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); - pp3Ao = vec_mulo(sum3A, v1ss); - pp3Be = 
vec_sra((vec_s32_t)sum3B, v16ui); - pp3Bo = vec_mulo(sum3B, v1ss); + pp1Ae = vec_mule(sum1A, v20ss); + pp1Ao = vec_mulo(sum1A, v20ss); + pp1Be = vec_mule(sum1B, v20ss); + pp1Bo = vec_mulo(sum1B, v20ss); - pp1cAe = vec_add(pp1Ae, v512si); - pp1cAo = vec_add(pp1Ao, v512si); - pp1cBe = vec_add(pp1Be, v512si); - pp1cBo = vec_add(pp1Bo, v512si); + pp2Ae = vec_mule(sum2A, v5ss); + pp2Ao = vec_mulo(sum2A, v5ss); + pp2Be = vec_mule(sum2B, v5ss); + pp2Bo = vec_mulo(sum2B, v5ss); - pp32Ae = vec_sub(pp3Ae, pp2Ae); - pp32Ao = vec_sub(pp3Ao, pp2Ao); - pp32Be = vec_sub(pp3Be, pp2Be); - pp32Bo = vec_sub(pp3Bo, pp2Bo); + pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); + pp3Ao = vec_mulo(sum3A, v1ss); + pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); + pp3Bo = vec_mulo(sum3B, v1ss); + + pp1cAe = vec_add(pp1Ae, v512si); + pp1cAo = vec_add(pp1Ao, v512si); + pp1cBe = vec_add(pp1Be, v512si); + pp1cBo = vec_add(pp1Bo, v512si); - sumAe = vec_add(pp1cAe, pp32Ae); - sumAo = vec_add(pp1cAo, pp32Ao); - sumBe = vec_add(pp1cBe, pp32Be); - sumBo = vec_add(pp1cBo, pp32Bo); + pp32Ae = vec_sub(pp3Ae, pp2Ae); + pp32Ao = vec_sub(pp3Ao, pp2Ao); + pp32Be = vec_sub(pp3Be, pp2Be); + pp32Bo = vec_sub(pp3Bo, pp2Bo); - ssumAe = vec_sra(sumAe, v10ui); - ssumAo = vec_sra(sumAo, v10ui); - ssumBe = vec_sra(sumBe, v10ui); - ssumBo = vec_sra(sumBo, v10ui); + sumAe = vec_add(pp1cAe, pp32Ae); + sumAo = vec_add(pp1cAo, pp32Ao); + sumBe = vec_add(pp1cBe, pp32Be); + sumBo = vec_add(pp1cBo, pp32Bo); - ssume = vec_packs(ssumAe, ssumBe); - ssumo = vec_packs(ssumAo, ssumBo); + ssumAe = vec_sra(sumAe, v10ui); + ssumAo = vec_sra(sumAo, v10ui); + ssumBe = vec_sra(sumBe, v10ui); + ssumBo = vec_sra(sumBo, v10ui); - sumv = vec_packsu(ssume, ssumo); - sum = vec_perm(sumv, sumv, mperm); + ssume = vec_packs(ssumAe, ssumBe); + ssumo = vec_packs(ssumAo, ssumBo); - ASSERT_ALIGNED(dst); - vdst = vec_ld(0, dst); + sumv = vec_packsu(ssume, ssumo); + sum = vec_perm(sumv, sumv, mperm); - OP_U8_ALTIVEC(fsum, sum, vdst); + ASSERT_ALIGNED(dst); + vdst = vec_ld(0, dst); - vec_st(fsum, 0, dst); + OP_U8_ALTIVEC(fsum, sum, vdst); - dst += dstStride; - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); + vec_st(fsum, 0, dst); + + dst += dstStride; + } + POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); }
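The three lowpass routines in the h264_template_altivec.c diff above all evaluate the H.264 6-tap half-pel filter (1, -5, 20, 20, -5, 1); the vector code splits it into the 20*(p0+p1), 5*(m1+p2) and (m2+p3) terms, adds the rounding constant (16 for the h and v cases, 512 with a 10-bit shift after the second pass of the hv case) and packs with unsigned saturation. A scalar reference for the 1-D case (illustration only):

```c
#include <stdint.h>

/* Scalar reference for the 1-D 6-tap lowpass computed above (sketch).
 * m2..p3 are the six input samples centred on the half-pel position. */
static uint8_t lowpass6(int m2, int m1, int p0, int p1, int p2, int p3)
{
    int v = 20 * (p0 + p1) - 5 * (m1 + p2) + (m2 + p3);
    v = (v + 16) >> 5;          /* v16ss rounding, v5us shift       */
    if (v < 0)   v = 0;         /* vec_packsu saturates to [0,255]  */
    if (v > 255) v = 255;
    return (uint8_t)v;
}
```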
--- a/ppc/idct_altivec.c Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/idct_altivec.c Sun Jul 20 18:58:30 2008 +0000 @@ -22,7 +22,6 @@ * NOTE: This code is based on GPL code from the libmpeg2 project. The * author, Michel Lespinasses, has given explicit permission to release * under LGPL as part of ffmpeg. - * */ /*
--- a/ppc/imgresample_altivec.c Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/imgresample_altivec.c Sun Jul 20 18:58:30 2008 +0000 @@ -46,8 +46,7 @@ vector signed short zeros, sumhv, sumlv; s = src; - for(i=0;i<4;i++) - { + for(i=0;i<4;i++) { /* The vec_madds later on does an implicit >>15 on the result. Since FILTER_BITS is 8, and we have 15 bits of magnitude in @@ -86,13 +85,11 @@ /* Do our altivec resampling on 16 pixels at once. */ while(dst_width>=16) { - /* - Read 16 (potentially unaligned) bytes from each of + /* Read 16 (potentially unaligned) bytes from each of 4 lines into 4 vectors, and split them into shorts. Interleave the multipy/accumulate for the resample filter with the loads to hide the 3 cycle latency - the vec_madds have. - */ + the vec_madds have. */ tv = (vector unsigned char *) &s[0 * wrap]; tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap])); srchv[0].v = (vector signed short) vec_mergeh(zero, tmp); @@ -121,10 +118,8 @@ sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv); sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv); - /* - Pack the results into our destination vector, - and do an aligned write of that back to memory. - */ + /* Pack the results into our destination vector, + and do an aligned write of that back to memory. */ dstv = vec_packsu(sumhv, sumlv) ; vec_st(dstv, 0, (vector unsigned char *) dst); @@ -133,10 +128,8 @@ dst_width-=16; } - /* - If there are any leftover pixels, resample them - with the slow scalar method. - */ + /* If there are any leftover pixels, resample them + with the slow scalar method. */ while(dst_width>0) { sum = s[0 * wrap] * filter[0] + s[1 * wrap] * filter[1] +
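The first comment in the imgresample_altivec.c hunk above leans on the semantics of vec_madds (vmhaddshs): each 16-bit lane multiplies its operands, shifts the 32-bit product right by 15, adds the third operand and saturates. A per-lane scalar model (illustration only; the rounding variant, vec_mradds, would add 0x4000 before the shift):

```c
#include <stdint.h>

/* Per-lane model of vec_madds, the operation the resample filter
 * above is built around: saturate(((a * b) >> 15) + c). */
static int16_t madds_lane(int16_t a, int16_t b, int16_t c)
{
    int32_t t = ((int32_t)a * b) >> 15;   /* the "implicit >>15"      */
    t += c;                               /* accumulate previous taps */
    if (t >  32767) t =  32767;           /* signed 16-bit saturation */
    if (t < -32768) t = -32768;
    return (int16_t)t;
}
```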
--- a/ppc/int_altivec.c Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/int_altivec.c Sun Jul 20 18:58:30 2008 +0000 @@ -38,7 +38,7 @@ vector signed short vpix2, vdiff, vpix1l,vpix1h; union { vector signed int vscore; int32_t score[4]; - } u; + } u; u.vscore = vec_splat_s32(0); // //XXX lazy way, fix it later
--- a/ppc/mathops.h Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/mathops.h Sun Jul 20 18:58:30 2008 +0000 @@ -25,14 +25,14 @@ #if defined(ARCH_POWERPC_405) /* signed 16x16 -> 32 multiply add accumulate */ -# define MAC16(rt, ra, rb) \ - asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); +#define MAC16(rt, ra, rb) \ + asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); /* signed 16x16 -> 32 multiply */ -# define MUL16(ra, rb) \ - ({ int __rt; \ - asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ - __rt; }) +#define MUL16(ra, rb) \ + ({ int __rt; \ + asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ + __rt; }) #endif #endif /* FFMPEG_PPC_MATHOPS_H */
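MAC16 and MUL16 in the mathops.h hunk above map 16x16 to 32-bit multiply(-accumulate) onto the PPC405's maclhw/mullhw instructions. A usage sketch (hypothetical helper, assuming an ARCH_POWERPC_405 build where these macros are defined):

```c
#include <stdint.h>

/* Hypothetical helper, illustration only: a 16-bit dot product
 * accumulated in 32 bits with the MAC16 macro defined above. */
static int dot16(const int16_t *a, const int16_t *b, int n)
{
    int acc = 0, i;
    for (i = 0; i < n; i++) {
        MAC16(acc, a[i], b[i]);     /* acc += a[i] * b[i] via maclhw */
    }
    return acc;
}
```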
--- a/ppc/mpegvideo_altivec.c Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/mpegvideo_altivec.c Sun Jul 20 18:58:30 2008 +0000 @@ -41,15 +41,15 @@ // transposes a matrix consisting of four vectors with four elements each #define TRANSPOSE4(a,b,c,d) \ do { \ - __typeof__(a) _trans_ach = vec_mergeh(a, c); \ - __typeof__(a) _trans_acl = vec_mergel(a, c); \ - __typeof__(a) _trans_bdh = vec_mergeh(b, d); \ - __typeof__(a) _trans_bdl = vec_mergel(b, d); \ - \ - a = vec_mergeh(_trans_ach, _trans_bdh); \ - b = vec_mergel(_trans_ach, _trans_bdh); \ - c = vec_mergeh(_trans_acl, _trans_bdl); \ - d = vec_mergel(_trans_acl, _trans_bdl); \ + __typeof__(a) _trans_ach = vec_mergeh(a, c); \ + __typeof__(a) _trans_acl = vec_mergel(a, c); \ + __typeof__(a) _trans_bdh = vec_mergeh(b, d); \ + __typeof__(a) _trans_bdl = vec_mergel(b, d); \ + \ + a = vec_mergeh(_trans_ach, _trans_bdh); \ + b = vec_mergel(_trans_ach, _trans_bdh); \ + c = vec_mergeh(_trans_acl, _trans_bdl); \ + d = vec_mergel(_trans_acl, _trans_bdl); \ } while (0) @@ -58,19 +58,19 @@ // target address is four-byte aligned (which should be always). #define LOAD4(vec, address) \ { \ - __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \ - vector unsigned char _perm_vec = vec_lvsl(0,(address)); \ - vec = vec_ld(0, _load_addr); \ - vec = vec_perm(vec, vec, _perm_vec); \ - vec = vec_splat(vec, 0); \ + __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \ + vector unsigned char _perm_vec = vec_lvsl(0,(address)); \ + vec = vec_ld(0, _load_addr); \ + vec = vec_perm(vec, vec, _perm_vec); \ + vec = vec_splat(vec, 0); \ } #define FOUROF(a) AVV(a,a,a,a) int dct_quantize_altivec(MpegEncContext* s, - DCTELEM* data, int n, - int qscale, int* overflow) + DCTELEM* data, int n, + int qscale, int* overflow) { int lastNonZero; vector float row0, row1, row2, row3, row4, row5, row6, row7; @@ -137,10 +137,8 @@ int whichPass, whichHalf; - for(whichPass = 1; whichPass<=2; whichPass++) - { - for(whichHalf = 1; whichHalf<=2; whichHalf++) - { + for(whichPass = 1; whichPass<=2; whichPass++) { + for(whichHalf = 1; whichHalf<=2; whichHalf++) { vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; vector float tmp10, tmp11, tmp12, tmp13; vector float z1, z2, z3, z4, z5; @@ -235,8 +233,7 @@ SWAP(row7, alt7); } - if (whichPass == 1) - { + if (whichPass == 1) { // transpose the data for the second pass // First, block transpose the upper right with lower left. @@ -261,8 +258,7 @@ const vector signed int* qmat; vector float bias, negBias; - if (s->mb_intra) - { + if (s->mb_intra) { vector signed int baseVector; // We must cache element 0 in the intra case @@ -272,9 +268,7 @@ qmat = (vector signed int*)s->q_intra_matrix[qscale]; biasAddr = &(s->intra_quant_bias); - } - else - { + } else { qmat = (vector signed int*)s->q_inter_matrix[qscale]; biasAddr = &(s->inter_quant_bias); } @@ -439,8 +433,7 @@ // and handle it using the vector unit if we can. This is the permute used // by the altivec idct, so it is common when using the altivec dct. - if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) - { + if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) { TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); } @@ -456,10 +449,8 @@ } // special handling of block[0] - if (s->mb_intra) - { - if (!s->h263_aic) - { + if (s->mb_intra) { + if (!s->h263_aic) { if (n < 4) oldBaseValue /= s->y_dc_scale; else @@ -474,8 +465,7 @@ // need to permute the "no" permutation case. 
if ((lastNonZero > 0) && (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && - (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) - { + (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) { ff_block_permute(data, s->dsp.idct_permutation, s->intra_scantable.scantable, lastNonZero); } @@ -483,10 +473,8 @@ return lastNonZero; } -/* - AltiVec version of dct_unquantize_h263 - this code assumes `block' is 16 bytes-aligned -*/ +/* AltiVec version of dct_unquantize_h263 + this code assumes `block' is 16 bytes-aligned */ void dct_unquantize_h263_altivec(MpegEncContext *s, DCTELEM *block, int n, int qscale) { @@ -517,82 +505,81 @@ } { - register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); - DECLARE_ALIGNED_16(short, qmul8[]) = - { - qmul, qmul, qmul, qmul, - qmul, qmul, qmul, qmul - }; - DECLARE_ALIGNED_16(short, qadd8[]) = - { - qadd, qadd, qadd, qadd, - qadd, qadd, qadd, qadd - }; - DECLARE_ALIGNED_16(short, nqadd8[]) = - { - -qadd, -qadd, -qadd, -qadd, - -qadd, -qadd, -qadd, -qadd - }; - register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; - register vector bool short blockv_null, blockv_neg; - register short backup_0 = block[0]; - register int j = 0; + register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); + DECLARE_ALIGNED_16(short, qmul8[]) = + { + qmul, qmul, qmul, qmul, + qmul, qmul, qmul, qmul + }; + DECLARE_ALIGNED_16(short, qadd8[]) = + { + qadd, qadd, qadd, qadd, + qadd, qadd, qadd, qadd + }; + DECLARE_ALIGNED_16(short, nqadd8[]) = + { + -qadd, -qadd, -qadd, -qadd, + -qadd, -qadd, -qadd, -qadd + }; + register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; + register vector bool short blockv_null, blockv_neg; + register short backup_0 = block[0]; + register int j = 0; - qmulv = vec_ld(0, qmul8); - qaddv = vec_ld(0, qadd8); - nqaddv = vec_ld(0, nqadd8); + qmulv = vec_ld(0, qmul8); + qaddv = vec_ld(0, qadd8); + nqaddv = vec_ld(0, nqadd8); -#if 0 // block *is* 16 bytes-aligned, it seems. - // first make sure block[j] is 16 bytes-aligned - for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { - level = block[j]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; +#if 0 // block *is* 16 bytes-aligned, it seems. 
+ // first make sure block[j] is 16 bytes-aligned + for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { + level = block[j]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[j] = level; } - block[j] = level; } - } #endif - // vectorize all the 16 bytes-aligned blocks - // of 8 elements - for(; (j + 7) <= nCoeffs ; j+=8) - { - blockv = vec_ld(j << 1, block); - blockv_neg = vec_cmplt(blockv, vczero); - blockv_null = vec_cmpeq(blockv, vczero); - // choose between +qadd or -qadd as the third operand - temp1 = vec_sel(qaddv, nqaddv, blockv_neg); - // multiply & add (block{i,i+7} * qmul [+-] qadd) - temp1 = vec_mladd(blockv, qmulv, temp1); - // put 0 where block[{i,i+7} used to have 0 - blockv = vec_sel(temp1, blockv, blockv_null); - vec_st(blockv, j << 1, block); - } + // vectorize all the 16 bytes-aligned blocks + // of 8 elements + for(; (j + 7) <= nCoeffs ; j+=8) { + blockv = vec_ld(j << 1, block); + blockv_neg = vec_cmplt(blockv, vczero); + blockv_null = vec_cmpeq(blockv, vczero); + // choose between +qadd or -qadd as the third operand + temp1 = vec_sel(qaddv, nqaddv, blockv_neg); + // multiply & add (block{i,i+7} * qmul [+-] qadd) + temp1 = vec_mladd(blockv, qmulv, temp1); + // put 0 where block[{i,i+7} used to have 0 + blockv = vec_sel(temp1, blockv, blockv_null); + vec_st(blockv, j << 1, block); + } - // if nCoeffs isn't a multiple of 8, finish the job - // using good old scalar units. - // (we could do it using a truncated vector, - // but I'm not sure it's worth the hassle) - for(; j <= nCoeffs ; j++) { - level = block[j]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; + // if nCoeffs isn't a multiple of 8, finish the job + // using good old scalar units. + // (we could do it using a truncated vector, + // but I'm not sure it's worth the hassle) + for(; j <= nCoeffs ; j++) { + level = block[j]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[j] = level; } - block[j] = level; } - } - if (i == 1) - { // cheat. this avoid special-casing the first iteration - block[0] = backup_0; - } + if (i == 1) { + // cheat. this avoid special-casing the first iteration + block[0] = backup_0; + } } POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); } @@ -605,11 +592,9 @@ { if ((mm_flags & MM_ALTIVEC) == 0) return; - if (s->avctx->lowres==0) - { + if (s->avctx->lowres==0) { if ((s->avctx->idct_algo == FF_IDCT_AUTO) || - (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) - { + (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) { s->dsp.idct_put = idct_put_altivec; s->dsp.idct_add = idct_add_altivec; s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; @@ -618,15 +603,13 @@ // Test to make sure that the dct required alignments are met. if ((((long)(s->q_intra_matrix) & 0x0f) != 0) || - (((long)(s->q_inter_matrix) & 0x0f) != 0)) - { + (((long)(s->q_inter_matrix) & 0x0f) != 0)) { av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned " "to use AltiVec DCT. Reverting to non-AltiVec version.\n"); return; } - if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) - { + if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) { av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned " "to use AltiVec DCT. 
Reverting to non-AltiVec version.\n"); return; @@ -634,8 +617,7 @@ if ((s->avctx->dct_algo == FF_DCT_AUTO) || - (s->avctx->dct_algo == FF_DCT_ALTIVEC)) - { + (s->avctx->dct_algo == FF_DCT_ALTIVEC)) { #if 0 /* seems to cause trouble under some circumstances */ s->dct_quantize = dct_quantize_altivec; #endif
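The reformatted loop in dct_unquantize_h263_altivec above is a branchless select-and-multiply-add: vec_cmplt and vec_cmpeq build per-coefficient sign and zero masks, vec_sel chooses between +qadd and -qadd, vec_mladd applies level*qmul plus the chosen offset, and a final vec_sel restores the zero coefficients. A minimal scalar sketch of that same pattern follows; the qmul/qadd values, buffer contents and function name are invented for illustration and are not tied to any MpegEncContext:

/* Illustrative sketch: the branchless pattern used by the AltiVec
 * dequantization loop above, written in portable scalar C. */
#include <stdint.h>
#include <stdio.h>

static void dequant_h263_scalar(int16_t *block, int n, int qmul, int qadd)
{
    for (int i = 0; i < n; i++) {
        int level = block[i];
        int neg   = -(level < 0);                 /* sign mask, mirrors vec_cmplt */
        int zero  = -(level == 0);                /* zero mask, mirrors vec_cmpeq */
        int add   = (qadd & ~neg) | (-qadd & neg);/* +qadd or -qadd, mirrors vec_sel */
        int out   = level * qmul + add;           /* mirrors vec_mladd */
        block[i]  = (int16_t)(out & ~zero);       /* keep zeros at zero, final vec_sel */
    }
}

int main(void)
{
    int16_t blk[8] = { 0, 3, -2, 0, 1, -5, 0, 7 };
    dequant_h263_scalar(blk, 8, 4, 1);            /* hypothetical qmul=4, qadd=1 */
    for (int i = 0; i < 8; i++)
        printf("%d ", blk[i]);                    /* 0 13 -9 0 5 -21 0 29 */
    printf("\n");
    return 0;
}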
--- a/ppc/snow_altivec.c Sun Jul 20 18:06:41 2008 +0000 +++ b/ppc/snow_altivec.c Sun Jul 20 18:58:30 2008 +0000 @@ -379,8 +379,7 @@ v4=(vector signed int *)b4; v5=(vector signed int *)b5; - for (i=0; i< w4;i++) - { + for (i=0; i< w4;i++) { #if 0 b4[i] -= (3*(b3[i] + b5[i])+4)>>3; @@ -782,8 +781,8 @@ void snow_init_altivec(DSPContext* c, AVCodecContext *avctx) { #if 0 - c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; - c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; - c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; + c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; + c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; + c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; #endif }
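The snow hunks above mostly move braces and re-indent the (currently disabled) AltiVec wavelet hooks; the commented-out reference line b4[i] -= (3*(b3[i] + b5[i])+4)>>3; is the scalar lifting step that ff_snow_vertical_compose97i_altivec vectorizes. A minimal scalar sketch of that single step, with an invented width, invented test data and a DWTELEM typedef assumed to be plain int, might read:

/* Illustrative sketch: the scalar lifting step shown in the disabled
 * reference code above, applied across one row. All data are invented. */
#include <stdio.h>

typedef int DWTELEM;  /* assumption: coefficients treated as plain ints here */

static void vertical_compose_step(DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width)
{
    for (int i = 0; i < width; i++)
        b4[i] -= (3 * (b3[i] + b5[i]) + 4) >> 3;  /* rounded lifting update */
}

int main(void)
{
    DWTELEM b3[4] = { 8, -8, 16, 0 };
    DWTELEM b4[4] = { 1,  2,  3, 4 };
    DWTELEM b5[4] = { 0,  8, -8, 0 };
    vertical_compose_step(b3, b4, b5, 4);
    for (int i = 0; i < 4; i++)
        printf("%d ", b4[i]);                     /* -2 2 0 4 */
    printf("\n");
    return 0;
}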