comparison ppc/h264_altivec.c @ 5119:ad0c45e0008c libavcodec

Altivec version of h264_(h|v)_loop_filter_luma, patch by Graham Booker %perian A cod3r P com% with some minor fixes by me. History of the patch: http://trac.perian.org/ticket/113 Original thread: Date: May 11, 2007 9:45 PM Subject: [FFmpeg-devel] [PATCH] Altivec version of h264_h/v_loop_filter_luma
author gpoirier
date Sat, 09 Jun 2007 19:13:34 +0000
parents ce57e3f2b2a7
children ae7f2c01811f
    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}

#define transpose4x16(r0, r1, r2, r3) {      \
    register vector unsigned char r4;        \
    register vector unsigned char r5;        \
    register vector unsigned char r6;        \
    register vector unsigned char r7;        \
                                             \
    r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
    r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
    r6 = vec_mergeh(r1, r3);  /*1, 3 set 0*/ \
    r7 = vec_mergel(r1, r3);  /*1, 3 set 1*/ \
                                             \
    r0 = vec_mergeh(r4, r6);  /*all set 0*/  \
    r1 = vec_mergel(r4, r6);  /*all set 1*/  \
    r2 = vec_mergeh(r5, r7);  /*all set 2*/  \
    r3 = vec_mergel(r5, r7);  /*all set 3*/  \
}
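
/* Illustration (editor's note, not part of the original patch): two rounds of
 * vec_mergeh/vec_mergel form a 4-way byte interleave. With input rows
 * a, b, c, d, the first round produces a0 c0 a1 c1 ... and b0 d0 b1 d1 ...,
 * and the second round produces a0 b0 c0 d0 a1 b1 c1 d1 ..., so each 4-byte
 * group of the outputs holds one column of the original 4x16 block, ready to
 * be scattered row by row by write16x4() below. */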

static inline void write16x4(uint8_t *dst, int dst_stride, register vector unsigned char r0,
                             register vector unsigned char r1, register vector unsigned char r2,
                             register vector unsigned char r3) {
    DECLARE_ALIGNED_16(unsigned char, result[64]);
    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
    int int_dst_stride = dst_stride/4;

    vec_st(r0,  0, result);
    vec_st(r1, 16, result);
    vec_st(r2, 32, result);
    vec_st(r3, 48, result);
    /* FIXME: there has to be a better way!!!! */
    *dst_int = *src_int;
    *(dst_int+   int_dst_stride) = *(src_int + 1);
    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
    *(dst_int+10*int_dst_stride) = *(src_int + 10);
    *(dst_int+11*int_dst_stride) = *(src_int + 11);
    *(dst_int+12*int_dst_stride) = *(src_int + 12);
    *(dst_int+13*int_dst_stride) = *(src_int + 13);
    *(dst_int+14*int_dst_stride) = *(src_int + 14);
    *(dst_int+15*int_dst_stride) = *(src_int + 15);
}
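
/* Editor's note (not part of the original patch): classic AltiVec has no
 * unaligned vector store, so the four vectors are staged through a
 * 16-byte-aligned scratch buffer with vec_st() and the resulting 4-byte rows
 * are copied out with scalar 32-bit stores -- hence the FIXME above. */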

/** \brief performs a 16x6 transpose of the data in src, leaving the
    result in registers r8..r13
    \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
    out of unaligned_load() */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
    register vector unsigned char r0  = unaligned_load(0,             src);\
    register vector unsigned char r1  = unaligned_load(   src_stride, src);\
    register vector unsigned char r2  = unaligned_load(2* src_stride, src);\
    register vector unsigned char r3  = unaligned_load(3* src_stride, src);\
    register vector unsigned char r4  = unaligned_load(4* src_stride, src);\
    register vector unsigned char r5  = unaligned_load(5* src_stride, src);\
    register vector unsigned char r6  = unaligned_load(6* src_stride, src);\
    register vector unsigned char r7  = unaligned_load(7* src_stride, src);\
    register vector unsigned char r14 = unaligned_load(14*src_stride, src);\
    register vector unsigned char r15 = unaligned_load(15*src_stride, src);\
                                                                           \
    r8  = unaligned_load( 8*src_stride, src);                              \
    r9  = unaligned_load( 9*src_stride, src);                              \
    r10 = unaligned_load(10*src_stride, src);                              \
    r11 = unaligned_load(11*src_stride, src);                              \
    r12 = unaligned_load(12*src_stride, src);                              \
    r13 = unaligned_load(13*src_stride, src);                              \
                                                                           \
    /*Merge first pairs*/                                                  \
    r0 = vec_mergeh(r0, r8);    /*0, 8*/                                   \
    r1 = vec_mergeh(r1, r9);    /*1, 9*/                                   \
    r2 = vec_mergeh(r2, r10);   /*2,10*/                                   \
    r3 = vec_mergeh(r3, r11);   /*3,11*/                                   \
    r4 = vec_mergeh(r4, r12);   /*4,12*/                                   \
    r5 = vec_mergeh(r5, r13);   /*5,13*/                                   \
    r6 = vec_mergeh(r6, r14);   /*6,14*/                                   \
    r7 = vec_mergeh(r7, r15);   /*7,15*/                                   \
                                                                           \
    /*Merge second pairs*/                                                 \
    r8  = vec_mergeh(r0, r4);   /*0,4, 8,12 set 0*/                        \
    r9  = vec_mergel(r0, r4);   /*0,4, 8,12 set 1*/                        \
    r10 = vec_mergeh(r1, r5);   /*1,5, 9,13 set 0*/                        \
    r11 = vec_mergel(r1, r5);   /*1,5, 9,13 set 1*/                        \
    r12 = vec_mergeh(r2, r6);   /*2,6,10,14 set 0*/                        \
    r13 = vec_mergel(r2, r6);   /*2,6,10,14 set 1*/                        \
    r14 = vec_mergeh(r3, r7);   /*3,7,11,15 set 0*/                        \
    r15 = vec_mergel(r3, r7);   /*3,7,11,15 set 1*/                        \
                                                                           \
    /*Third merge*/                                                        \
    r0 = vec_mergeh(r8, r12);   /*0,2,4,6,8,10,12,14 set 0*/               \
    r1 = vec_mergel(r8, r12);   /*0,2,4,6,8,10,12,14 set 1*/               \
    r2 = vec_mergeh(r9, r13);   /*0,2,4,6,8,10,12,14 set 2*/               \
    r4 = vec_mergeh(r10, r14);  /*1,3,5,7,9,11,13,15 set 0*/               \
    r5 = vec_mergel(r10, r14);  /*1,3,5,7,9,11,13,15 set 1*/               \
    r6 = vec_mergeh(r11, r15);  /*1,3,5,7,9,11,13,15 set 2*/               \
    /* Don't need to compute 3 and 7*/                                     \
                                                                           \
    /*Final merge*/                                                        \
    r8  = vec_mergeh(r0, r4);   /*all set 0*/                              \
    r9  = vec_mergel(r0, r4);   /*all set 1*/                              \
    r10 = vec_mergeh(r1, r5);   /*all set 2*/                              \
    r11 = vec_mergel(r1, r5);   /*all set 3*/                              \
    r12 = vec_mergeh(r2, r6);   /*all set 4*/                              \
    r13 = vec_mergel(r2, r6);   /*all set 5*/                              \
    /* Don't need to compute 14 and 15*/                                   \
                                                                           \
}
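
/* Editor's note (not part of the original patch): only six of the sixteen
 * transposed rows are kept -- the luma deblocking filter reads p2..q2, i.e.
 * six pixels across the edge -- which is why the final merge stages that
 * would produce the remaining rows are skipped. */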

// out: o = |x-y| < a
static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x,
                                                     register vector unsigned char y,
                                                     register vector unsigned char a) {

    register vector unsigned char diff = vec_subs(x, y);
    register vector unsigned char diffneg = vec_subs(y, x);
    register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
    o = (vector unsigned char)vec_cmplt(o, a);
    return o;
}
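
/* Editor's note (not part of the original patch): with unsigned saturation,
 * vec_subs(x,y) is max(x-y, 0), so one of the two one-sided differences is
 * always zero and OR-ing them yields |x-y| exactly; vec_cmplt then turns
 * that into an all-ones/all-zeros per-byte mask. */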

static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0,
                                                       register vector unsigned char p1,
                                                       register vector unsigned char q0,
                                                       register vector unsigned char q1,
                                                       register vector unsigned char alpha,
                                                       register vector unsigned char beta) {

    register vector unsigned char mask;
    register vector unsigned char tempmask;

    mask = diff_lt_altivec(p0, q0, alpha);
    tempmask = diff_lt_altivec(p1, p0, beta);
    mask = vec_and(mask, tempmask);
    tempmask = diff_lt_altivec(q1, q0, beta);
    mask = vec_and(mask, tempmask);

    return mask;
}

// out: p1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
#define h264_deblock_q1(p0, p1, p2, q0, tc0) {                              \
                                                                            \
    register vector unsigned char average = vec_avg(p0, q0);                \
    register vector unsigned char temp;                                     \
    register vector unsigned char unclipped;                                \
    register vector unsigned char ones;                                     \
    register vector unsigned char max;                                      \
    register vector unsigned char min;                                      \
                                                                            \
    temp = vec_xor(average, p2);                                            \
    average = vec_avg(average, p2);      /*avg(p2, avg(p0, q0)) */          \
    ones = vec_splat_u8(1);                                                 \
    temp = vec_and(temp, ones);          /*(p2^avg(p0, q0)) & 1 */          \
    unclipped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */        \
    max = vec_adds(p1, tc0);                                                \
    min = vec_subs(p1, tc0);                                                \
    p1 = vec_max(min, unclipped);                                           \
    p1 = vec_min(max, p1);                                                  \
}
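
/* Editor's note (not part of the original patch): vec_avg() rounds up, so
 * XOR-ing the operands, keeping the low bit and subtracting it converts the
 * rounded average into the truncating (p2 + ((p0+q0+1)>>1)) >> 1 the
 * comment above describes; the result is then clamped to [p1-tc0, p1+tc0]. */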

#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {                                           \
                                                                                                  \
    const vec_u8_t A0v = (vec_u8_t) AVV(0xA0,0xA0,0xA0,0xA0,0xA0,0xA0,0xA0,0xA0,                  \
                                        0xA0,0xA0,0xA0,0xA0,0xA0,0xA0,0xA0,0xA0);                 \
                                                                                                  \
    register vector unsigned char pq0bit = vec_xor(p0,q0);                                        \
    register vector unsigned char temp;                                                           \
    register vector unsigned char q1minus;                                                        \
    register vector unsigned char p0minus;                                                        \
    register vector unsigned char stage1;                                                         \
    register vector unsigned char stage2;                                                         \
    register vector unsigned char vec160;                                                         \
    register vector unsigned char delta;                                                          \
    register vector unsigned char deltaneg;                                                       \
                                                                                                  \
    temp = (vector unsigned char)vec_cmpeq(p0, p0);  /* all ones (255) */                         \
    q1minus = vec_xor(temp, q1);                     /* 255 - q1 */                               \
    stage1 = vec_avg(p1, q1minus);                   /* (p1 - q1 + 256)>>1 */                     \
    stage2 = vec_sr(stage1, vec_splat_u8(1));        /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
    p0minus = vec_xor(temp, p0);                     /* 255 - p0 */                               \
    stage1 = vec_avg(q0, p0minus);                   /* (q0 - p0 + 256)>>1 */                     \
    pq0bit = vec_and(pq0bit, vec_splat_u8(1));                                                    \
    stage2 = vec_avg(stage2, pq0bit);                /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
    stage2 = vec_adds(stage2, stage1);               /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
    vec160 = vec_ld(0, &A0v);                                                                     \
    deltaneg = vec_subs(vec160, stage2);             /* -d */                                     \
    delta = vec_subs(stage2, vec160);                /* d */                                      \
    deltaneg = vec_min(tc0masked, deltaneg);                                                      \
    delta = vec_min(tc0masked, delta);                                                            \
    p0 = vec_subs(p0, deltaneg);                                                                  \
    q0 = vec_subs(q0, delta);                                                                     \
    p0 = vec_adds(p0, delta);                                                                     \
    q0 = vec_adds(q0, deltaneg);                                                                  \
}
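
/* Reference (editor's sketch, not part of the original patch): the scalar
 * form of this step in the H.264 normal-strength filter, written with
 * FFmpeg-style clip helpers, is roughly
 *     int d = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
 *     p0 = av_clip_uint8(p0 + d);
 *     q0 = av_clip_uint8(q0 - d);
 * The macro reaches the same result while staying in the unsigned domain:
 * it computes the delta biased around 0xA0 (160), clamps the positive and
 * negative halves against tc0masked separately, and applies them with
 * saturating adds/subtracts. */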

#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {              \
    DECLARE_ALIGNED_16(unsigned char, temp[16]);                                               \
    register vector unsigned char alphavec;                                                    \
    register vector unsigned char betavec;                                                     \
    register vector unsigned char mask;                                                        \
    register vector unsigned char p1mask;                                                      \
    register vector unsigned char q1mask;                                                      \
    register vector signed char tc0vec;                                                        \
    register vector unsigned char finaltc0;                                                    \
    register vector unsigned char tc0masked;                                                   \
                                                                                               \
    temp[0] = alpha;                                                                           \
    temp[1] = beta;                                                                            \
    alphavec = vec_ld(0, temp);                                                                \
    betavec = vec_splat(alphavec, 0x1);                                                        \
    alphavec = vec_splat(alphavec, 0x0);                                                       \
    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec);  /*if in block */             \
                                                                                               \
    *((int *)temp) = *((int *)tc0);                                                            \
    tc0vec = vec_ld(0, (signed char*)temp);                                                    \
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                       \
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                       \
    mask = vec_and(mask, (vector unsigned char)vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
    finaltc0 = vec_and((vector unsigned char)tc0vec, mask);      /*tc = tc0[i]*/               \
                                                                                               \
    p1mask = diff_lt_altivec(p2, p0, betavec);                                                 \
    p1mask = vec_and(p1mask, mask);                              /* if( |p2 - p0| < beta) */   \
    tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec);                                 \
    finaltc0 = vec_sub(finaltc0, p1mask);                        /* tc++ */                    \
    h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                                \
    /*end if*/                                                                                 \
                                                                                               \
    q1mask = diff_lt_altivec(q2, q0, betavec);                                                 \
    q1mask = vec_and(q1mask, mask);                              /* if ( |q2 - q0| < beta ) */ \
    tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec);                                 \
    finaltc0 = vec_sub(finaltc0, q1mask);                        /* tc++ */                    \
    h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                                \
    /*end if*/                                                                                 \
                                                                                               \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);                                              \
}
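
/* Overview (editor's sketch, not part of the original patch): this mirrors
 * the scalar normal-strength luma filter,
 *     if (|p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta && tc0[i] >= 0) {
 *         int tc = tc0[i];
 *         if (|p2-p0| < beta) { update p1; tc++; }
 *         if (|q2-q0| < beta) { update q1; tc++; }
 *         update p0 and q0 with clip range tc;
 *     }
 * with every "if" replaced by an all-ones/all-zeros byte mask so that all 16
 * pixels along the edge are processed branch-free. */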

static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

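    /* Editor's note (not part of the original patch): the bitwise AND of the
     * four tc0 bytes has its sign bit set only when all four are negative,
     * so the edge is skipped only when every tc0[i] < 0; mixed signs are
     * handled per-pixel by the masks inside the macro. */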
    if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
        register vector unsigned char p2 = vec_ld(-3*stride, pix);
        register vector unsigned char p1 = vec_ld(-2*stride, pix);
        register vector unsigned char p0 = vec_ld(-1*stride, pix);
        register vector unsigned char q0 = vec_ld(0, pix);
        register vector unsigned char q1 = vec_ld(stride, pix);
        register vector unsigned char q2 = vec_ld(2*stride, pix);
        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
        vec_st(p1, -2*stride, pix);
        vec_st(p0, -1*stride, pix);
        vec_st(q0, 0, pix);
        vec_st(q1, stride, pix);
    }
}

static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

    register vector unsigned char line0, line1, line2, line3, line4, line5;
    if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
        return;
    readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
    h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
    transpose4x16(line1, line2, line3, line4);
    write16x4(pix-2, stride, line1, line2, line3, line4);
}
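
/* Editor's note (not part of the original patch): for the horizontal filter
 * the edge runs vertically in memory, so 16 rows are loaded starting at
 * pix-3, the six columns p2..q2 are transposed into registers, the same
 * filter macro is applied, and only the four modified lines (p1, p0, q0, q1)
 * are transposed back and written out starting at pix-2. */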

void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {

#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
        c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
        c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
        c->h264_idct_add = ff_h264_idct_add_altivec;
        c->h264_idct8_add = ff_h264_idct8_add_altivec;
        c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
        c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;

#define dspfunc(PFX, IDX, NUM) \
        c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
        c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
        c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \