Mercurial > libavcodec.hg
comparison x86/h264dsp_mmx.c @ 11017:aa10bb3c244c libavcodec
Optimize h264_loop_filter_strength_mmx2:
244 -> 160 cycles on Core 2.
author | lorenm |
---|---|
date | Tue, 26 Jan 2010 17:17:48 +0000 |
parents | 34a65026fa06 |
children | 98970e51365a |
comparison (legend: equal | deleted | inserted | replaced)
11016:4aee091df934 | 11017:aa10bb3c244c |
---|---|
794 | 794 |
795 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], | 795 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], |
796 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { | 796 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { |
797 int dir; | 797 int dir; |
798 __asm__ volatile( | 798 __asm__ volatile( |
799 "pxor %%mm7, %%mm7 \n\t" | 799 "movq %0, %%mm7 \n" |
800 "movq %0, %%mm6 \n\t" | 800 "movq %1, %%mm6 \n" |
801 "movq %1, %%mm5 \n\t" | 801 ::"m"(ff_pb_1), "m"(ff_pb_3) |
802 "movq %2, %%mm4 \n\t" | |
803 ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7) | |
804 ); | 802 ); |
805 if(field) | 803 if(field) |
806 __asm__ volatile( | 804 __asm__ volatile( |
807 "movq %0, %%mm5 \n\t" | 805 "movq %0, %%mm6 \n" |
808 "movq %1, %%mm4 \n\t" | 806 ::"m"(ff_pb_3_1) |
809 ::"m"(ff_pb_3_1), "m"(ff_pb_7_3) | |
810 ); | 807 ); |
808 __asm__ volatile( | |
809 "movq %%mm6, %%mm5 \n" | |
810 "paddb %%mm5, %%mm5 \n" | |
811 :); | |
811 | 812 |
812 // could do a special case for dir==0 && edges==1, but it only reduces the | 813 // could do a special case for dir==0 && edges==1, but it only reduces the |
813 // average filter time by 1.2% | 814 // average filter time by 1.2% |
814 for( dir=1; dir>=0; dir-- ) { | 815 for( dir=1; dir>=0; dir-- ) { |
815 const x86_reg d_idx = dir ? -8 : -1; | 816 const x86_reg d_idx = dir ? -8 : -1; |
816 const int mask_mv = dir ? mask_mv1 : mask_mv0; | 817 const int mask_mv = dir ? mask_mv1 : mask_mv0; |
817 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; | 818 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; |
818 int b_idx, edge, l; | 819 int b_idx, edge; |
819 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { | 820 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { |
820 __asm__ volatile( | 821 __asm__ volatile( |
821 "pand %0, %%mm0 \n\t" | 822 "pand %0, %%mm0 \n\t" |
822 ::"m"(mask_dir) | 823 ::"m"(mask_dir) |
823 ); | 824 ); |
824 if(!(mask_mv & edge)) { | 825 if(!(mask_mv & edge)) { |
825 __asm__ volatile("pxor %%mm0, %%mm0 \n\t":); | 826 if(bidir) { |
826 for( l = bidir; l >= 0; l-- ) { | |
827 __asm__ volatile( | 827 __asm__ volatile( |
828 "movd (%0), %%mm1 \n\t" | 828 "movd (%1,%0), %%mm2 \n" |
829 "punpckldq (%0,%1), %%mm1 \n\t" | 829 "punpckldq 40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] } |
830 "punpckldq %%mm1, %%mm2 \n\t" | 830 "pshufw $0x44, (%1), %%mm0 \n" // { ref0[b], ref0[b] } |
831 "pcmpeqb %%mm2, %%mm1 \n\t" | 831 "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] } |
832 "paddb %%mm6, %%mm1 \n\t" | 832 "pshufw $0x4E, %%mm2, %%mm3 \n" |
833 "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] | 833 "psubb %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } |
834 "por %%mm1, %%mm0 \n\t" | 834 "psubb %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } |
835 | 835 "1: \n" |
836 "movq (%2), %%mm1 \n\t" | 836 "por %%mm1, %%mm0 \n" |
837 "movq 8(%2), %%mm2 \n\t" | 837 "movq (%2,%0,4), %%mm1 \n" |
838 "psubw (%2,%1,4), %%mm1 \n\t" | 838 "movq 8(%2,%0,4), %%mm2 \n" |
839 "psubw 8(%2,%1,4), %%mm2 \n\t" | 839 "movq %%mm1, %%mm3 \n" |
840 "packsswb %%mm2, %%mm1 \n\t" | 840 "movq %%mm2, %%mm4 \n" |
841 "paddb %%mm5, %%mm1 \n\t" | 841 "psubw (%2), %%mm1 \n" |
842 "pminub %%mm4, %%mm1 \n\t" | 842 "psubw 8(%2), %%mm2 \n" |
843 "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit | 843 "psubw 160(%2), %%mm3 \n" |
844 "por %%mm1, %%mm0 \n\t" | 844 "psubw 168(%2), %%mm4 \n" |
845 ::"r"(ref[l]+b_idx), | 845 "packsswb %%mm2, %%mm1 \n" |
846 "r"(d_idx), | 846 "packsswb %%mm4, %%mm3 \n" |
847 "r"(mv[l]+b_idx) | 847 "paddb %%mm6, %%mm1 \n" |
848 "paddb %%mm6, %%mm3 \n" | |
849 "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit | |
850 "psubusb %%mm5, %%mm3 \n" | |
851 "packsswb %%mm3, %%mm1 \n" | |
852 "add $40, %0 \n" | |
853 "cmp $40, %0 \n" | |
854 "jl 1b \n" | |
855 "sub $80, %0 \n" | |
856 "pshufw $0x4E, %%mm1, %%mm1 \n" | |
857 "por %%mm1, %%mm0 \n" | |
858 "pshufw $0x4E, %%mm0, %%mm1 \n" | |
859 "pminub %%mm1, %%mm0 \n" | |
860 ::"r"(d_idx), | |
861 "r"(ref[0]+b_idx), | |
862 "r"(mv[0]+b_idx) | |
848 ); | 863 ); |
849 } | 864 } else { |
850 if(bidir==1){ | |
851 __asm__ volatile("pxor %%mm3, %%mm3 \n\t":); | |
852 for( l = bidir; l >= 0; l-- ) { | |
853 __asm__ volatile( | 865 __asm__ volatile( |
854 "movd (%0), %%mm1 \n\t" | 866 "movd (%1), %%mm0 \n" |
855 "punpckldq (%1), %%mm1 \n\t" | 867 "psubb (%1,%0), %%mm0 \n" // ref[b] != ref[bn] |
856 "punpckldq %%mm1, %%mm2 \n\t" | 868 "movq (%2), %%mm1 \n" |
857 "pcmpeqb %%mm2, %%mm1 \n\t" | 869 "movq 8(%2), %%mm2 \n" |
858 "paddb %%mm6, %%mm1 \n\t" | 870 "psubw (%2,%0,4), %%mm1 \n" |
859 "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] | 871 "psubw 8(%2,%0,4), %%mm2 \n" |
860 "por %%mm1, %%mm3 \n\t" | 872 "packsswb %%mm2, %%mm1 \n" |
861 | 873 "paddb %%mm6, %%mm1 \n" |
862 "movq (%2), %%mm1 \n\t" | 874 "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit |
863 "movq 8(%2), %%mm2 \n\t" | 875 "packsswb %%mm1, %%mm1 \n" |
864 "psubw (%3), %%mm1 \n\t" | 876 "por %%mm1, %%mm0 \n" |
865 "psubw 8(%3), %%mm2 \n\t" | 877 ::"r"(d_idx), |
866 "packsswb %%mm2, %%mm1 \n\t" | 878 "r"(ref[0]+b_idx), |
867 "paddb %%mm5, %%mm1 \n\t" | 879 "r"(mv[0]+b_idx) |
868 "pminub %%mm4, %%mm1 \n\t" | |
869 "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit | |
870 "por %%mm1, %%mm3 \n\t" | |
871 ::"r"(ref[l]+b_idx), | |
872 "r"(ref[1-l]+b_idx+d_idx), | |
873 "r"(mv[l][b_idx]), | |
874 "r"(mv[1-l][b_idx+d_idx]) | |
875 ); | 880 ); |
876 } | |
877 __asm__ volatile( | |
878 "pcmpeqw %%mm7, %%mm3 \n\t" | |
879 "psubusw %%mm3, %%mm0 \n\t" | |
880 :); | |
881 } | 881 } |
882 } | 882 } |
883 __asm__ volatile( | 883 __asm__ volatile( |
884 "movd %0, %%mm1 \n\t" | 884 "movd %0, %%mm1 \n" |
885 "por %1, %%mm1 \n\t" | 885 "por %1, %%mm1 \n" // nnz[b] || nnz[bn] |
886 "punpcklbw %%mm7, %%mm1 \n\t" | |
887 "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn] | |
888 ::"m"(nnz[b_idx]), | 886 ::"m"(nnz[b_idx]), |
889 "m"(nnz[b_idx+d_idx]) | 887 "m"(nnz[b_idx+d_idx]) |
890 ); | 888 ); |
891 __asm__ volatile( | 889 __asm__ volatile( |
892 "pcmpeqw %%mm7, %%mm0 \n\t" | 890 "pminub %%mm7, %%mm1 \n" |
893 "pcmpeqw %%mm7, %%mm0 \n\t" | 891 "pminub %%mm7, %%mm0 \n" |
894 "psrlw $15, %%mm0 \n\t" // nonzero -> 1 | 892 "psllw $1, %%mm1 \n" |
895 "psrlw $14, %%mm1 \n\t" | 893 "pxor %%mm2, %%mm2 \n" |
896 "movq %%mm0, %%mm2 \n\t" | 894 "pmaxub %%mm0, %%mm1 \n" |
897 "por %%mm1, %%mm2 \n\t" | 895 "punpcklbw %%mm2, %%mm1 \n" |
898 "psrlw $1, %%mm1 \n\t" | 896 "movq %%mm1, %0 \n" |
899 "pandn %%mm2, %%mm1 \n\t" | |
900 "movq %%mm1, %0 \n\t" | |
901 :"=m"(*bS[dir][edge]) | 897 :"=m"(*bS[dir][edge]) |
902 ::"memory" | 898 ::"memory" |
903 ); | 899 ); |
904 } | 900 } |
905 edges = 4; | 901 edges = 4; |