comparison ppc/h264_altivec.c @ 8530:992e2f8bfba7 libavcodec

AltiVec version of h264_idct(8)_dc_add. Patch by David Conrad %lessen42 A gmail P com%
author gpoirier
date Mon, 05 Jan 2009 22:13:42 +0000
parents 1615d6b75ada
children 961e40a13102
comparison
equal deleted inserted replaced
8529:1cb8c8e14a0a 8530:992e2f8bfba7
594 ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel); 594 ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
595 ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); 595 ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
596 ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); 596 ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
597 } 597 }
598 598
599 // TODO: implement this in AltiVec 599 static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size)
600 static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) { 600 {
601 int i, j; 601 vec_s16 dc16;
602 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 602 vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
603 int dc = (block[0] + 32) >> 6; 603 LOAD_ZERO;
604 for( j = 0; j < 8; j++ ) 604 DECLARE_ALIGNED_16(int, dc);
605 { 605 int i;
606 for( i = 0; i < 8; i++ ) 606
607 dst[i] = cm[ dst[i] + dc ]; 607 dc = (block[0] + 32) >> 6;
608 dst += stride; 608 dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
609
610 if (size == 4)
611 dc16 = vec_sld(dc16, zero_s16v, 8);
612 dcplus = vec_packsu(dc16, zero_s16v);
613 dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
614
615 aligner = vec_lvsr(0, dst);
616 dcplus = vec_perm(dcplus, dcplus, aligner);
617 dcminus = vec_perm(dcminus, dcminus, aligner);
618
619 for (i = 0; i < size; i += 4) {
620 v0 = vec_ld(0, dst+0*stride);
621 v1 = vec_ld(0, dst+1*stride);
622 v2 = vec_ld(0, dst+2*stride);
623 v3 = vec_ld(0, dst+3*stride);
624
625 v0 = vec_adds(v0, dcplus);
626 v1 = vec_adds(v1, dcplus);
627 v2 = vec_adds(v2, dcplus);
628 v3 = vec_adds(v3, dcplus);
629
630 v0 = vec_subs(v0, dcminus);
631 v1 = vec_subs(v1, dcminus);
632 v2 = vec_subs(v2, dcminus);
633 v3 = vec_subs(v3, dcminus);
634
635 vec_st(v0, 0, dst+0*stride);
636 vec_st(v1, 0, dst+1*stride);
637 vec_st(v2, 0, dst+2*stride);
638 vec_st(v3, 0, dst+3*stride);
639
640 dst += 4*stride;
609 } 641 }
642 }
643
644 static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
645 {
646 h264_idct_dc_add_internal(dst, block, stride, 4);
647 }
648
649 static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
650 {
651 h264_idct_dc_add_internal(dst, block, stride, 8);
610 } 652 }
611 653
612 static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 654 static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
613 int i; 655 int i;
614 for(i=0; i<16; i+=4){ 656 for(i=0; i<16; i+=4){
901 c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; 943 c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
902 /* ff_h264_idct_add_altivec may be re-enabled once AltiVec versions of 944 /* ff_h264_idct_add_altivec may be re-enabled once AltiVec versions of
903 h264_idct_add16, h264_idct_add16intra, h264_idct_add8 are implemented 945 h264_idct_add16, h264_idct_add16intra, h264_idct_add8 are implemented
904 c->h264_idct_add = ff_h264_idct_add_altivec; 946 c->h264_idct_add = ff_h264_idct_add_altivec;
905 */ 947 */
948 c->h264_idct_dc_add= h264_idct_dc_add_altivec;
949 c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec;
906 c->h264_idct8_add = ff_h264_idct8_add_altivec; 950 c->h264_idct8_add = ff_h264_idct8_add_altivec;
907 c->h264_idct8_add4 = ff_h264_idct8_add4_altivec; 951 c->h264_idct8_add4 = ff_h264_idct8_add4_altivec;
908 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; 952 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
909 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; 953 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
910 954