Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 2633:72e6ffa1f3a5 libavcodec
MMX for H.264 deblocking filter
author | lorenm |
---|---|
date | Mon, 25 Apr 2005 01:01:41 +0000 |
parents | 86e2b1424801 |
children | 24472383b36f |
comparison
equal
deleted
inserted
replaced
2632:67171616ead6 | 2633:72e6ffa1f3a5 |
---|---|
37 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | 37 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL; |
38 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL; | 38 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL; |
39 | 39 |
40 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL; | 40 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL; |
41 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL; | 41 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL; |
42 static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL; | |
42 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL; | 43 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL; |
43 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL; | 44 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL; |
44 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL; | 45 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL; |
45 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; | 46 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; |
46 | 47 |
687 :: "r" (src), | 688 :: "r" (src), |
688 "r" (src + 4*stride), | 689 "r" (src + 4*stride), |
689 "r" ((long) stride ), | 690 "r" ((long) stride ), |
690 "r" ((long)(3*stride)) | 691 "r" ((long)(3*stride)) |
691 ); | 692 ); |
693 } | |
694 | |
695 // dst = ABS( a - b ) for each 16-bit word; a and b are only read, z is a scratch register that gets clobbered | |
696 #define MMABS_DIFF_MMX2(a,b,dst,z)\ | |
697 "movq " #b ", " #dst " \n\t"\ | |
698 "movq " #a ", " #z " \n\t"\ | |
699 "psubusw " #b ", " #z " \n\t" /* z = a - b with unsigned saturation: 0 where b > a */\ | |
700 "psubusw " #a ", " #dst " \n\t" /* dst = b - a with unsigned saturation: 0 where a > b */\ | |
701 "pmaxsw " #z ", " #dst " \n\t" /* one of the two is 0, so the max is the absolute difference (signed max: assumes the difference fits in 15 bits) */ | |
702 | |
703 // a = clip( a, -tc, tc ) for each signed 16-bit word; tc is only read, z is clobbered and left holding -tc | |
704 #define CLIP_MMX2(a,tc,z)\ | |
705 "pxor " #z ", " #z " \n\t"\ | |
706 "psubw " #tc ", " #z " \n\t" /* z = 0 - tc */\ | |
707 "pmaxsw " #z ", " #a " \n\t" /* a = max( a, -tc ) */\ | |
708 "pminsw " #tc ", " #a " \n\t" /* a = min( a, tc ) */ | |
709 | |
710 // in: mm0=p1, mm1=p0, mm2=q0, mm3=q1 as packed bytes in the low half of each register; alpha and beta are operands that movd can load (only their low 16 bits are used) | |
711 // out: mm7 = do we filter this pixel? (one word per pixel, all ones if yes); mm0-mm3 are left unpacked to words; mm4 = beta in every word; mm5 and mm6 are clobbered | |
712 #define H264_DEBLOCK_THRESH(alpha,beta)\ | |
713 "pxor %%mm7, %%mm7 \n\t"\ | |
714 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
715 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
716 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
717 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
718 MMABS_DIFF_MMX2(%%mm1, %%mm2, %%mm5, %%mm4) /* mm5 = ABS(p0-q0), mm4 is scratch */\ | |
719 "movd " #alpha ", %%mm6 \n\t"\ | |
720 "pshufw $0, %%mm6, %%mm6 \n\t" /* broadcast the low word of alpha to all four words */\ | |
721 "pcmpgtw %%mm5, %%mm6 \n\t" /* ABS(p0-q0) < alpha */\ | |
722 MMABS_DIFF_MMX2(%%mm0, %%mm1, %%mm5, %%mm4) /* mm5 = ABS(p1-p0) */\ | |
723 MMABS_DIFF_MMX2(%%mm3, %%mm2, %%mm7, %%mm4) /* mm7 = ABS(q1-q0) */\ | |
724 "pmaxsw %%mm7, %%mm5 \n\t" /* mm5 = larger of the two, so a single compare against beta covers both */\ | |
725 "movd " #beta ", %%mm7 \n\t"\ | |
726 "pshufw $0, %%mm7, %%mm7 \n\t" /* broadcast the low word of beta to all four words */\ | |
727 "movq %%mm7, %%mm4 \n\t" /* keep an unmodified copy of beta in mm4 for the code that follows the macro */\ | |
728 "pcmpgtw %%mm5, %%mm7 \n\t" /* ABS(p1-p0) < beta && ABS(q1-q0) < beta */\ | |
729 "pand %%mm6, %%mm7 \n\t" /* mm7 = all three conditions hold */ | |
730 | |
731 // in: mm0=p1, mm1=p0, mm2=q0, mm3=q1 as 16-bit words, mm6=tc (both callers in this file zero it beforehand for pixels that must not be filtered); pw4 = memory operand holding 4 in every word | |
732 // out: mm1=p0', mm2=q0' as words, not yet clamped to 0..255 (callers pack with packuswb); mm0, mm4 and mm5 are clobbered | |
733 #define H264_DEBLOCK_P0_Q0(pw4)\ | |
734 "movq " #pw4 ", %%mm4 \n\t"\ | |
735 "movq %%mm2, %%mm5 \n\t"\ | |
736 "paddw %%mm4, %%mm0 \n\t" /* mm0 = p1 + 4 */\ | |
737 "psubw %%mm1, %%mm5 \n\t" /* mm5 = q0 - p0 */\ | |
738 "psubw %%mm3, %%mm0 \n\t" /* mm0 = p1 - q1 + 4 */\ | |
739 "psllw $2, %%mm5 \n\t"\ | |
740 "paddw %%mm0, %%mm5 \n\t"\ | |
741 "psraw $3, %%mm5 \n\t" /* mm5 = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */\ | |
742 CLIP_MMX2(%%mm5, %%mm6, %%mm4) /* delta = clip( mm5, -tc, tc ) */\ | |
743 "paddw %%mm5, %%mm1 \n\t" /* p0 += delta */\ | |
744 "psubw %%mm5, %%mm2 \n\t" /* q0 -= delta */ | |
745 | |
746 // in: mm1=p0, mm2=q0, mm6=tc0 as 16-bit words; p1 and p2 = registers holding the sample to correct and its outer neighbour (p1,p2 or q1,q2); z = scratch register, clobbered (it may be the same register as p2) | |
747 // out: mm5=delta, already clipped to -tc0..tc0 | |
748 #define H264_DEBLOCK_DELTA_PQ1(p1,p2,z)\ | |
749 "movq %%mm1, %%mm5 \n\t"\ | |
750 "pavgb %%mm2, %%mm5 \n\t" /* byte average used on word data: equals (p0+q0+1)>>1 as long as the high byte of every word is 0 */\ | |
751 "paddw " #p2 ", %%mm5 \n\t"\ | |
752 "psraw $1, %%mm5 \n\t"\ | |
753 "psubw " #p1 ", %%mm5 \n\t" /* ( ( p2 + ((p0+q0+1)>>1) ) >> 1 ) - p1, with p1/p2 standing for the macro arguments */\ | |
754 CLIP_MMX2(%%mm5, %%mm6, z) | |
755 /* Deblocks one 4-pixel segment of a luma edge. Loads 4 bytes from each of the six lines pix-3*stride .. pix+2*stride (p2, p1, p0, q0, q1, q2) and stores new p1, p0, q0 and q1. Only the low 16 bits of alpha, beta and tc0 are used (pshufw $0 broadcast). NOTE(review): the asm stores through pix but lists no memory clobber and no mm-register clobbers - confirm this is safe with the compilers in use. */ | |
756 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int tc0) | |
757 { | |
758 uint64_t tmp0, tmp1; /* spill slots: tmp0 = filter mask, tmp1 = tc after the p1 step */ | |
759 asm volatile( | |
760 "movd (%2,%4), %%mm0 \n\t" //p1 | |
761 "movd (%2,%4,2), %%mm1 \n\t" //p0 | |
762 "movd (%3), %%mm2 \n\t" //q0 | |
763 "movd (%3,%4), %%mm3 \n\t" //q1 | |
764 H264_DEBLOCK_THRESH(%6,%7) /* mm7 = filter mask, mm4 = beta, mm0-mm3 now hold words */ | |
765 "movq %%mm7, %0 \n\t" /* save the filter mask in tmp0; mm7 is reused as scratch below */ | |
766 | |
767 // filter p1 if ABS(p2-p0) < beta | |
768 "movd (%2), %%mm3 \n\t" /* overwrites q1, which is reloaded from memory further down */ | |
769 "pxor %%mm6, %%mm6 \n\t" | |
770 "punpcklbw %%mm6, %%mm3 \n\t" //p2 | |
771 MMABS_DIFF_MMX2(%%mm1, %%mm3, %%mm5, %%mm6) /* mm5 = ABS(p0-p2), mm6 is scratch */ | |
772 "pcmpgtw %%mm5, %%mm4 \n\t" /* mm4 held beta: now ABS(p2-p0) < beta */ | |
773 "pand %%mm7, %%mm4 \n\t" // mm4 = ( ABS( p2 - p0 ) < beta && filterp ) | |
774 "movd %5, %%mm6 \n\t" | |
775 "pshufw $0, %%mm6, %%mm6 \n\t" //tc | |
776 | |
777 H264_DEBLOCK_DELTA_PQ1(%%mm0, %%mm3, %%mm7) // delta = clip( ( ( p2 + ((p0+q0+1)>>1) ) >> 1 ) - p1, -tc0, tc0 ); mm7 is scratch | |
778 "pand %%mm4, %%mm5 \n\t" /* delta = 0 where p1 is not filtered */ | |
779 "paddw %%mm0, %%mm5 \n\t" /* mm0 keeps the original p1 for the p0/q0 step */ | |
780 "packuswb %%mm5, %%mm5 \n\t" | |
781 "movd %%mm5, (%2,%4) \n\t" // *p1 += delta | |
782 "psrlw $15, %%mm4 \n\t" /* mask word 0xFFFF becomes 1, 0 stays 0 */ | |
783 "paddw %%mm6, %%mm4 \n\t" // tc++ where p1 was filtered | |
784 "movq %%mm4, %1 \n\t" /* save the partial tc in tmp1 */ | |
785 | |
786 // filter q1 if ABS(q2-q0) < beta | |
787 "pxor %%mm7, %%mm7 \n\t" | |
788 "movd (%3,%4), %%mm3 \n\t" //q1 | |
789 "movd (%3,%4,2), %%mm4 \n\t" //q2 | |
790 "punpcklbw %%mm7, %%mm3 \n\t" | |
791 "punpcklbw %%mm7, %%mm4 \n\t" | |
792 MMABS_DIFF_MMX2(%%mm2, %%mm4, %%mm5, %%mm7) /* mm5 = ABS(q0-q2), mm7 is scratch */ | |
793 "movd %7, %%mm7 \n\t" | |
794 "pshufw $0, %%mm7, %%mm7 \n\t" /* reload beta; the copy in mm4 was consumed above */ | |
795 "pcmpgtw %%mm5, %%mm7 \n\t" /* ABS(q2-q0) < beta */ | |
796 | |
797 H264_DEBLOCK_DELTA_PQ1(%%mm3, %%mm4, %%mm4) // delta = clip( ( ( q2 + ((p0+q0+1)>>1) ) >> 1 ) - q1, -tc0, tc0 ); mm4 is read as q2 first, then reused as scratch | |
798 "movq %0, %%mm4 \n\t" /* reload the filter mask from tmp0 */ | |
799 "pand %%mm4, %%mm7 \n\t" // mm7 = ( ABS( q2 - q0 ) < beta && filterp ) | |
800 "pand %%mm7, %%mm5 \n\t" /* delta = 0 where q1 is not filtered */ | |
801 "paddw %%mm3, %%mm5 \n\t" /* mm3 keeps the original q1 for the p0/q0 step */ | |
802 "packuswb %%mm5, %%mm5 \n\t" | |
803 "movd %%mm5, (%3,%4) \n\t" // *q1 += delta | |
804 "movq %1, %%mm6 \n\t" /* reload the partial tc from tmp1 */ | |
805 "psrlw $15, %%mm7 \n\t" /* mask word 0xFFFF becomes 1, 0 stays 0 */ | |
806 "paddw %%mm7, %%mm6 \n\t" // tc++ where q1 was filtered | |
807 "movq %0, %%mm4 \n\t" | |
808 "pand %%mm4, %%mm6 \n\t" /* tc = 0 where the pixel is not filtered, so the delta below clips to 0 there */ | |
809 | |
810 H264_DEBLOCK_P0_Q0(%8) /* mm1 = p0', mm2 = q0' */ | |
811 "packuswb %%mm1, %%mm1 \n\t" | |
812 "packuswb %%mm2, %%mm2 \n\t" | |
813 "movd %%mm1, (%2,%4,2) \n\t" /* store p0 */ | |
814 "movd %%mm2, (%3) \n\t" /* store q0 */ | |
815 | |
816 : "=m"(tmp0), "=m"(tmp1) /* %0, %1 */ | |
817 : "r"(pix-3*stride), "r"(pix), "r"((long)stride), /* %2 = address of the p2 line, %3 = address of the q0 line, %4 = stride */ | |
818 "r"(tc0), "r"(alpha), "r"(beta), "m"(ff_pw_4) /* %5, %6, %7, %8 */ | |
819 ); | |
820 } | |
821 /* Runs h264_loop_filter_luma_mmx2 on four consecutive 4-byte segments starting at pix, using the caller's stride unchanged and tc0[i] for segment i. Segments with tc0[i] < 0 are skipped. */ | |
822 static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0) | |
823 { | |
824 int i; | |
825 for(i=0; i<4; i++, pix+=4) { /* the increment also runs on continue, so skipped segments still advance pix */ | |
826 if(tc0[i] < 0) | |
827 continue; | |
828 h264_loop_filter_luma_mmx2(pix, stride, alpha, beta, tc0[i]); | |
829 } | |
830 } | |
831 /* Same edge filter applied through a transposed scratch copy: for each of four segments spaced 4*stride apart (skipping those with tc0[i] < 0), two 4x4 blocks at pix-4 and pix are transposed into trans, filtered there with a stride of 4, and the middle four lines are transposed back to pix-2. transpose4x4 is defined outside this view; the comments below assume its arguments are (dst, src, dst_stride, src_stride) - TODO confirm. */ | |
832 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0) | |
833 { | |
834 uint8_t trans[4*8]; /* 8 lines of 4 bytes */ | |
835 int i; | |
836 for(i=0; i<4; i++, pix+=4*stride) { | |
837 if(tc0[i] < 0) | |
838 continue; | |
839 //FIXME: could cut some load/stores by merging transpose with filter | |
840 transpose4x4(trans, pix-4, 4, stride); /* presumably fills lines 0-3 from the 4 bytes left of pix */ | |
841 transpose4x4(trans+4*4, pix, 4, stride); /* presumably fills lines 4-7 from the 4 bytes starting at pix */ | |
842 h264_loop_filter_luma_mmx2(trans+4*4, 4, alpha, beta, tc0[i]); /* line 4 plays the role of q0; the filter stores lines 2-5 */ | |
843 transpose4x4(pix-2, trans+2*4, stride, 4); /* writes lines 2-5 back, starting at pix-2 */ | |
844 } | |
845 } | |
846 /* Deblocks one 4-pixel segment of a chroma edge. Loads 4 bytes from each of the four lines pix-2*stride .. pix+stride (p1, p0, q0, q1) and stores new p0 and q0 only. tc0[0] is the clip limit for the first two pixels and tc0[1] for the last two. Only the low 16 bits of alpha and beta are used. NOTE(review): as in the luma filter, the asm stores through pix without a memory clobber - confirm. */ | |
847 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0) | |
848 { | |
849 asm volatile( | |
850 "movd (%0), %%mm0 \n\t" /* p1 */ | |
851 "movd (%0,%2), %%mm1 \n\t" /* p0 */ | |
852 "movd (%1), %%mm2 \n\t" /* q0 */ | |
853 "movd (%1,%2), %%mm3 \n\t" /* q1 */ | |
854 H264_DEBLOCK_THRESH(%4,%5) /* mm7 = filter mask, mm0-mm3 now hold words */ | |
855 "movd %3, %%mm6 \n\t" | |
856 "pshufw $0x50, %%mm6, %%mm6 \n\t" // mm6 = tc[1], tc[1], tc[0], tc[0] | |
857 "pand %%mm7, %%mm6 \n\t" /* tc = 0 where the pixel is not filtered */ | |
858 H264_DEBLOCK_P0_Q0(%6) /* mm1 = p0', mm2 = q0' */ | |
859 "packuswb %%mm1, %%mm1 \n\t" | |
860 "packuswb %%mm2, %%mm2 \n\t" | |
861 "movd %%mm1, (%0,%2) \n\t" /* store p0 */ | |
862 "movd %%mm2, (%1) \n\t" /* store q0 */ | |
863 :: "r"(pix-2*stride), "r"(pix), "r"((long)stride), /* %0 = address of the p1 line, %1 = address of the q0 line, %2 = stride */ | |
864 "r"(tc0[1]<<16 | tc0[0]), /* %3: both limits packed into one register; assumes each value is non-negative and fits in 16 bits - TODO confirm against callers */ | |
865 "r"(alpha), "r"(beta), "m"(ff_pw_4) /* %4, %5, %6 */ | |
866 ); | |
867 } | |
868 /* Runs h264_loop_filter_chroma_mmx2 on two consecutive 4-byte segments starting at pix, using the caller's stride unchanged. Each call consumes two entries of tc0, so four entries are read in total. */ | |
869 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0) | |
870 { | |
871 int i; | |
872 for(i=0; i<2; i++) { | |
873 h264_loop_filter_chroma_mmx2(pix, stride, alpha, beta, tc0); | |
874 pix += 4; | |
875 tc0 += 2; | |
876 } | |
877 } | |
878 /* Same chroma filter applied through a transposed scratch copy: for each of two segments spaced 4*stride apart, the 4x4 block at pix-2 is transposed into trans, filtered there with a stride of 4, and transposed back to pix-2. Each segment consumes two entries of tc0. transpose4x4 is defined outside this view; the comments below assume its arguments are (dst, src, dst_stride, src_stride) - TODO confirm. */ | |
879 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0) | |
880 { | |
881 uint8_t trans[4*4]; /* 4 lines of 4 bytes */ | |
882 int i; | |
883 for(i=0; i<2; i++) { | |
884 //FIXME: could cut some load/stores by merging transpose with filter | |
885 transpose4x4(trans, pix-2, 4, stride); | |
886 h264_loop_filter_chroma_mmx2(trans+2*4, 4, alpha, beta, tc0); /* line 2 plays the role of q0; the filter stores lines 1 and 2 */ | |
887 transpose4x4(pix-2, trans, stride, 4); /* the whole block is written back, including the two lines the filter did not store to */ | |
888 pix += 4*stride; | |
889 tc0 += 2; | |
890 } | |
692 } | 891 } |
693 | 892 |
694 #ifdef CONFIG_ENCODERS | 893 #ifdef CONFIG_ENCODERS |
695 static int pix_norm1_mmx(uint8_t *pix, int line_size) { | 894 static int pix_norm1_mmx(uint8_t *pix, int line_size) { |
696 int tmp; | 895 int tmp; |
3182 dspfunc(avg_h264_qpel, 0, 16); | 3381 dspfunc(avg_h264_qpel, 0, 16); |
3183 dspfunc(avg_h264_qpel, 1, 8); | 3382 dspfunc(avg_h264_qpel, 1, 8); |
3184 dspfunc(avg_h264_qpel, 2, 4); | 3383 dspfunc(avg_h264_qpel, 2, 4); |
3185 #undef dspfunc | 3384 #undef dspfunc |
3186 | 3385 |
3386 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; | |
3387 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; | |
3388 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; | |
3389 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; | |
3390 | |
3187 #ifdef CONFIG_ENCODERS | 3391 #ifdef CONFIG_ENCODERS |
3188 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; | 3392 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; |
3189 #endif //CONFIG_ENCODERS | 3393 #endif //CONFIG_ENCODERS |
3190 } else if (mm_flags & MM_3DNOW) { | 3394 } else if (mm_flags & MM_3DNOW) { |
3191 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; | 3395 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; |