# HG changeset patch # User gpoirier # Date 1231193622 0 # Node ID 992e2f8bfba722bc5af41abab16a8189cd7c9c1d # Parent 1cb8c8e14a0a7b5ed85c1637d6da2a663e1c0304 AltiVec version of h264_idct(8)_dc_add Patch by David Conrad %lessen42 A gmail P com% diff -r 1cb8c8e14a0a -r 992e2f8bfba7 ppc/h264_altivec.c --- a/ppc/h264_altivec.c Mon Jan 05 18:44:32 2009 +0000 +++ b/ppc/h264_altivec.c Mon Jan 05 22:13:42 2009 +0000 @@ -596,19 +596,61 @@ ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); } -// TODO: implement this in AltiVec -static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) { - int i, j; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - int dc = (block[0] + 32) >> 6; - for( j = 0; j < 8; j++ ) - { - for( i = 0; i < 8; i++ ) - dst[i] = cm[ dst[i] + dc ]; - dst += stride; +static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size) +{ + vec_s16 dc16; + vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner; + LOAD_ZERO; + DECLARE_ALIGNED_16(int, dc); + int i; + + dc = (block[0] + 32) >> 6; + dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); + + if (size == 4) + dc16 = vec_sld(dc16, zero_s16v, 8); + dcplus = vec_packsu(dc16, zero_s16v); + dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v); + + aligner = vec_lvsr(0, dst); + dcplus = vec_perm(dcplus, dcplus, aligner); + dcminus = vec_perm(dcminus, dcminus, aligner); + + for (i = 0; i < size; i += 4) { + v0 = vec_ld(0, dst+0*stride); + v1 = vec_ld(0, dst+1*stride); + v2 = vec_ld(0, dst+2*stride); + v3 = vec_ld(0, dst+3*stride); + + v0 = vec_adds(v0, dcplus); + v1 = vec_adds(v1, dcplus); + v2 = vec_adds(v2, dcplus); + v3 = vec_adds(v3, dcplus); + + v0 = vec_subs(v0, dcminus); + v1 = vec_subs(v1, dcminus); + v2 = vec_subs(v2, dcminus); + v3 = vec_subs(v3, dcminus); + + vec_st(v0, 0, dst+0*stride); + vec_st(v1, 0, dst+1*stride); + vec_st(v2, 0, dst+2*stride); + vec_st(v3, 0, dst+3*stride); + + dst += 4*stride; } } +static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) +{ + h264_idct_dc_add_internal(dst, block, stride, 4); +} + +static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) +{ + h264_idct_dc_add_internal(dst, block, stride, 8); +} + static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ int i; for(i=0; i<16; i+=4){ @@ -903,6 +945,8 @@ h264_idct_add16, h264_idct_add16intra, h264_idct_add8 are implemented c->h264_idct_add = ff_h264_idct_add_altivec; */ + c->h264_idct_dc_add= h264_idct_dc_add_altivec; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec; c->h264_idct8_add = ff_h264_idct8_add_altivec; c->h264_idct8_add4 = ff_h264_idct8_add4_altivec; c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;