comparison i386/dsputil_mmx.c @ 2633:72e6ffa1f3a5 libavcodec

MMX for H.264 deblocking filter
author lorenm
date Mon, 25 Apr 2005 01:01:41 +0000
parents 86e2b1424801
children 24472383b36f
comparison
equal deleted inserted replaced
2632:67171616ead6 2633:72e6ffa1f3a5
/* 64-bit constants holding one 16-bit value replicated in all four word
   lanes ("pw" = packed word), used as memory operands for MMX packed-word
   arithmetic. 8-byte alignment allows a single aligned movq load. */
static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL; /* rounding bias for H264_DEBLOCK_P0_Q0 */
static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
46 47
687 :: "r" (src), 688 :: "r" (src),
688 "r" (src + 4*stride), 689 "r" (src + 4*stride),
689 "r" ((long) stride ), 690 "r" ((long) stride ),
690 "r" ((long)(3*stride)) 691 "r" ((long)(3*stride))
691 ); 692 );
693 }
694
// dst = ABS( a - b )
// Unsigned saturating subtraction in both directions leaves the positive
// difference in one register and zero in the other; pmaxsw then selects the
// nonzero one. Valid for word lanes in [0, 0x7fff]. Clobbers z.
#define MMABS_DIFF_MMX2(a,b,dst,z)\
    "movq " #b ", " #dst " \n\t"\
    "movq " #a ", " #z " \n\t"\
    "psubusw " #b ", " #z " \n\t"\
    "psubusw " #a ", " #dst " \n\t"\
    "pmaxsw " #z ", " #dst " \n\t"
702
// a = clip( a, -tc, tc )
// z is a scratch register: it is built up as -tc (0 - tc), then a is clamped
// from below with pmaxsw (a = max(a, -tc)) and from above with pminsw
// (a = min(a, tc)). Clobbers z.
#define CLIP_MMX2(a,tc,z)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #tc ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"\
    "pminsw " #tc ", " #a " \n\t"
709
// in: mm0=p1, mm1=p0, mm2=q0, mm3=q1 (4 pixels each, packed bytes in the low dword)
// out: mm7 = do we filter this pixel? (per-word 0 / 0xffff mask:
//            ABS(p0-q0) < alpha && ABS(p1-p0) < beta && ABS(q1-q0) < beta)
// side effects: mm0..mm3 are zero-extended from bytes to words;
//               mm4 is left holding broadcast beta (callers reuse it).
#define H264_DEBLOCK_THRESH(alpha,beta)\
    "pxor %%mm7, %%mm7 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpcklbw %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    MMABS_DIFF_MMX2(%%mm1, %%mm2, %%mm5, %%mm4)\
    "movd " #alpha ", %%mm6 \n\t"\
    "pshufw $0, %%mm6, %%mm6 \n\t"\
    "pcmpgtw %%mm5, %%mm6 \n\t" /* ABS(p0-q0) < alpha */\
    MMABS_DIFF_MMX2(%%mm0, %%mm1, %%mm5, %%mm4)\
    MMABS_DIFF_MMX2(%%mm3, %%mm2, %%mm7, %%mm4)\
    "pmaxsw %%mm7, %%mm5 \n\t"\
    "movd " #beta ", %%mm7 \n\t"\
    "pshufw $0, %%mm7, %%mm7 \n\t"\
    "movq %%mm7, %%mm4 \n\t"\
    "pcmpgtw %%mm5, %%mm7 \n\t" /* ABS(p1-p0) < beta && ABS(q1-q0) < beta */\
    "pand %%mm6, %%mm7 \n\t"
730
// in: mm0=p1, mm1=p0, mm2=q0, mm3=q1, mm6=tc
// out: mm1=p0', mm2=q0'
// Computes delta = clip( (((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc ),
// then p0 += delta and q0 -= delta. pw4 is a memory operand supplying the
// +4 rounding bias in every word lane (ff_pw_4). Clobbers mm0, mm4, mm5.
#define H264_DEBLOCK_P0_Q0(pw4)\
    "movq " #pw4 ", %%mm4 \n\t"\
    "movq %%mm2, %%mm5 \n\t"\
    "paddw %%mm4, %%mm0 \n\t"\
    "psubw %%mm1, %%mm5 \n\t"\
    "psubw %%mm3, %%mm0 \n\t"\
    "psllw $2, %%mm5 \n\t"\
    "paddw %%mm0, %%mm5 \n\t"\
    "psraw $3, %%mm5 \n\t" /* mm5 = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */\
    CLIP_MMX2(%%mm5, %%mm6, %%mm4) /* delta = clip( mm5, -tc, tc ) */\
    "paddw %%mm5, %%mm1 \n\t" /* p0 += delta */\
    "psubw %%mm5, %%mm2 \n\t" /* q0 -= delta */
745
// in: mm1=p0, mm2=q0, mm6=tc0
// out: mm5 = delta = clip( (( p2 + ((p0+q0+1)>>1) ) >> 1) - p1, -tc0, tc0 )
// Note: pavgb on the word lanes is intentional — the caller zero-extended the
// pixels, so the high byte of each lane is 0 and the per-byte average of the
// low bytes equals (p0+q0+1)>>1 with no cross-byte carry. Clobbers z.
#define H264_DEBLOCK_DELTA_PQ1(p1,p2,z)\
    "movq %%mm1, %%mm5 \n\t"\
    "pavgb %%mm2, %%mm5 \n\t"\
    "paddw " #p2 ", %%mm5 \n\t"\
    "psraw $1, %%mm5 \n\t"\
    "psubw " #p1 ", %%mm5 \n\t" /* ( ( q2 + ((p0+q0+1)>>1) ) >> 1 ) - q1 */\
    CLIP_MMX2(%%mm5, %%mm6, z)
755
// Deblocks one 4-pixel-wide segment of a horizontal luma edge in place.
// pix points at the q0 row; the p rows lie above it (pix-3*stride .. pix-stride).
// tc0 is the clipping threshold for this segment (callers skip tc0 < 0).
// tmp0/tmp1 are memory spill slots: tmp0 holds the filter-enable mask from
// H264_DEBLOCK_THRESH, tmp1 the tc value after the p1-stage adjustment
// (tc is incremented per lane where p1 was filtered, and again for q1).
// asm operands: %0=tmp0 %1=tmp1 %2=pix-3*stride %3=pix %4=stride
//               %5=tc0 %6=alpha %7=beta %8=ff_pw_4
static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int tc0)
{
    uint64_t tmp0, tmp1;
    asm volatile(
        "movd (%2,%4), %%mm0 \n\t" //p1
        "movd (%2,%4,2), %%mm1 \n\t" //p0
        "movd (%3), %%mm2 \n\t" //q0
        "movd (%3,%4), %%mm3 \n\t" //q1
        H264_DEBLOCK_THRESH(%6,%7)
        "movq %%mm7, %0 \n\t" // spill the enable mask; registers run out below

        // filter p1 if ABS(p2-p0) < beta
        "movd (%2), %%mm3 \n\t"
        "pxor %%mm6, %%mm6 \n\t"
        "punpcklbw %%mm6, %%mm3 \n\t" //p2
        MMABS_DIFF_MMX2(%%mm1, %%mm3, %%mm5, %%mm6)
        "pcmpgtw %%mm5, %%mm4 \n\t" // mm4 still holds broadcast beta from H264_DEBLOCK_THRESH
        "pand %%mm7, %%mm4 \n\t" // mm4 = ( ABS( p2 - p0 ) < beta && filterp )
        "movd %5, %%mm6 \n\t"
        "pshufw $0, %%mm6, %%mm6 \n\t" //tc

        H264_DEBLOCK_DELTA_PQ1(%%mm0, %%mm3, %%mm7) // delta = clip( ( p2 + ((p0+q0+1)>>1) ) >> 1 ) - p1 )
        "pand %%mm4, %%mm5 \n\t" // apply delta only where the p1 test passed
        "paddw %%mm0, %%mm5 \n\t"
        "packuswb %%mm5, %%mm5 \n\t"
        "movd %%mm5, (%2,%4) \n\t" // *p1 += delta
        "psrlw $15, %%mm4 \n\t" // 0xffff mask -> per-lane 0/1
        "paddw %%mm6, %%mm4 \n\t" // tc++
        "movq %%mm4, %1 \n\t" // spill the adjusted tc for the p0/q0 stage

        // filter q1 if ABS(q2-q0) < beta
        "pxor %%mm7, %%mm7 \n\t"
        "movd (%3,%4), %%mm3 \n\t" //q1
        "movd (%3,%4,2), %%mm4 \n\t" //q2
        "punpcklbw %%mm7, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        MMABS_DIFF_MMX2(%%mm2, %%mm4, %%mm5, %%mm7)
        "movd %7, %%mm7 \n\t"
        "pshufw $0, %%mm7, %%mm7 \n\t"
        "pcmpgtw %%mm5, %%mm7 \n\t"

        H264_DEBLOCK_DELTA_PQ1(%%mm3, %%mm4, %%mm4) // delta = clip( ( q2 + ((p0+q0+1)>>1) ) >> 1 ) - q1 )
        "movq %0, %%mm4 \n\t" // reload the enable mask
        "pand %%mm4, %%mm7 \n\t" // mm7 = ( ABS( q2 - q0 ) < beta && filterp )
        "pand %%mm7, %%mm5 \n\t"
        "paddw %%mm3, %%mm5 \n\t"
        "packuswb %%mm5, %%mm5 \n\t"
        "movd %%mm5, (%3,%4) \n\t" // *q1 += delta
        "movq %1, %%mm6 \n\t" // reload tc from the p1 stage
        "psrlw $15, %%mm7 \n\t"
        "paddw %%mm7, %%mm6 \n\t" // tc++
        "movq %0, %%mm4 \n\t"
        "pand %%mm4, %%mm6 \n\t" // zero tc where the edge is not filtered at all

        H264_DEBLOCK_P0_Q0(%8)
        "packuswb %%mm1, %%mm1 \n\t"
        "packuswb %%mm2, %%mm2 \n\t"
        "movd %%mm1, (%2,%4,2) \n\t" // store p0'
        "movd %%mm2, (%3) \n\t" // store q0'

        : "=m"(tmp0), "=m"(tmp1)
        : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
          "r"(tc0), "r"(alpha), "r"(beta), "m"(ff_pw_4)
    );
}
821
/* Filter a horizontal luma edge: four independent 4-pixel segments,
   each enabled only when its tc0 threshold is non-negative. */
static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    int seg;
    for(seg = 0; seg < 4; seg++) {
        if(tc0[seg] >= 0)
            h264_loop_filter_luma_mmx2(pix + 4*seg, stride, alpha, beta, tc0[seg]);
    }
}
831
/* Filter a vertical luma edge: transpose each enabled 4x8 strip into a
   scratch buffer, run the horizontal-edge filter on it, and transpose the
   four modified columns (p1..q1) back. */
static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    uint8_t trans[4*8];
    int seg;
    for(seg = 0; seg < 4; seg++) {
        uint8_t *row = pix + 4*seg*stride;
        if(tc0[seg] < 0)
            continue;
        //FIXME: could cut some load/stores by merging transpose with filter
        transpose4x4(trans, row-4, 4, stride);
        transpose4x4(trans+4*4, row, 4, stride);
        h264_loop_filter_luma_mmx2(trans+4*4, 4, alpha, beta, tc0[seg]);
        transpose4x4(row-2, trans+2*4, stride, 4);
    }
}
846
// Deblocks one 4-pixel-wide segment of a horizontal chroma edge in place.
// Chroma applies only the p0/q0 filter (no p1/q1 stage). tc0 points at two
// thresholds covering 2 pixels each; they are packed into one register and
// broadcast as tc[0],tc[0],tc[1],tc[1] across the word lanes, then masked to
// zero where the threshold test failed (a zero tc clips delta to 0, i.e. no change).
// asm operands: %0=pix-2*stride %1=pix %2=stride %3=(tc0[1]<<16|tc0[0])
//               %4=alpha %5=beta %6=ff_pw_4
static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    asm volatile(
        "movd (%0), %%mm0 \n\t" //p1
        "movd (%0,%2), %%mm1 \n\t" //p0
        "movd (%1), %%mm2 \n\t" //q0
        "movd (%1,%2), %%mm3 \n\t" //q1
        H264_DEBLOCK_THRESH(%4,%5)
        "movd %3, %%mm6 \n\t"
        "pshufw $0x50, %%mm6, %%mm6 \n\t" // mm6 = tc[1], tc[1], tc[0], tc[0]
        "pand %%mm7, %%mm6 \n\t" // zero tc where the filter is disabled
        H264_DEBLOCK_P0_Q0(%6)
        "packuswb %%mm1, %%mm1 \n\t"
        "packuswb %%mm2, %%mm2 \n\t"
        "movd %%mm1, (%0,%2) \n\t" // store p0'
        "movd %%mm2, (%1) \n\t" // store q0'
        :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
           "r"(tc0[1]<<16 | tc0[0]),
           "r"(alpha), "r"(beta), "m"(ff_pw_4)
    );
}
868
/* Filter a horizontal chroma edge: the 8-pixel edge is processed as two
   4-pixel halves, each with its own pair of tc0 thresholds. */
static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    int half;
    for(half = 0; half < 2; half++)
        h264_loop_filter_chroma_mmx2(pix + 4*half, stride, alpha, beta, tc0 + 2*half);
}
878
/* Filter a vertical chroma edge: transpose each 4x4 half into a scratch
   buffer, run the horizontal-edge filter on it, then transpose back. */
static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    uint8_t scratch[4*4];
    int half;
    for(half = 0; half < 2; half++) {
        uint8_t *base = pix + 4*half*stride;
        //FIXME: could cut some load/stores by merging transpose with filter
        transpose4x4(scratch, base-2, 4, stride);
        h264_loop_filter_chroma_mmx2(scratch+2*4, 4, alpha, beta, tc0 + 2*half);
        transpose4x4(base-2, scratch, stride, 4);
    }
}
693 892
694 #ifdef CONFIG_ENCODERS 893 #ifdef CONFIG_ENCODERS
695 static int pix_norm1_mmx(uint8_t *pix, int line_size) { 894 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
696 int tmp; 895 int tmp;
3182 dspfunc(avg_h264_qpel, 0, 16); 3381 dspfunc(avg_h264_qpel, 0, 16);
3183 dspfunc(avg_h264_qpel, 1, 8); 3382 dspfunc(avg_h264_qpel, 1, 8);
3184 dspfunc(avg_h264_qpel, 2, 4); 3383 dspfunc(avg_h264_qpel, 2, 4);
3185 #undef dspfunc 3384 #undef dspfunc
3186 3385
3386 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
3387 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
3388 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
3389 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
3390
3187 #ifdef CONFIG_ENCODERS 3391 #ifdef CONFIG_ENCODERS
3188 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; 3392 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
3189 #endif //CONFIG_ENCODERS 3393 #endif //CONFIG_ENCODERS
3190 } else if (mm_flags & MM_3DNOW) { 3394 } else if (mm_flags & MM_3DNOW) {
3191 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; 3395 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;