comparison x86/h264dsp_mmx.c @ 10953:eb9a2581f50e libavcodec

Attempt to fix an asm compilation failure (memory-operand constraints replaced with register-based addressing). Only tested with gcc 4 on x86_64.
author michael
date Wed, 20 Jan 2010 19:23:19 +0000
parents 3a723e8dcd26
children fdddf3d4238f
comparison
equal deleted inserted replaced
10952:ea8f891d997d 10953:eb9a2581f50e
810 ); 810 );
811 811
812 // could do a special case for dir==0 && edges==1, but it only reduces the 812 // could do a special case for dir==0 && edges==1, but it only reduces the
813 // average filter time by 1.2% 813 // average filter time by 1.2%
814 for( dir=1; dir>=0; dir-- ) { 814 for( dir=1; dir>=0; dir-- ) {
815 const int d_idx = dir ? -8 : -1; 815 const x86_reg d_idx = dir ? -8 : -1;
816 const int mask_mv = dir ? mask_mv1 : mask_mv0; 816 const int mask_mv = dir ? mask_mv1 : mask_mv0;
817 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; 817 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
818 int b_idx, edge, l; 818 int b_idx, edge, l;
819 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { 819 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
820 __asm__ volatile( 820 __asm__ volatile(
823 ); 823 );
824 if(!(mask_mv & edge)) { 824 if(!(mask_mv & edge)) {
825 __asm__ volatile("pxor %%mm0, %%mm0 \n\t":); 825 __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
826 for( l = bidir; l >= 0; l-- ) { 826 for( l = bidir; l >= 0; l-- ) {
827 __asm__ volatile( 827 __asm__ volatile(
828 "movd %0, %%mm1 \n\t" 828 "movd (%0), %%mm1 \n\t"
829 "punpckldq %1, %%mm1 \n\t" 829 "punpckldq (%0,%1), %%mm1 \n\t"
830 "punpckldq %%mm1, %%mm2 \n\t" 830 "punpckldq %%mm1, %%mm2 \n\t"
831 "pcmpeqb %%mm2, %%mm1 \n\t" 831 "pcmpeqb %%mm2, %%mm1 \n\t"
832 "paddb %%mm6, %%mm1 \n\t" 832 "paddb %%mm6, %%mm1 \n\t"
833 "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] 833 "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
834 "por %%mm1, %%mm0 \n\t" 834 "por %%mm1, %%mm0 \n\t"
835 835
836 "movq %2, %%mm1 \n\t" 836 "movq (%2), %%mm1 \n\t"
837 "movq 8+1*%2, %%mm2 \n\t" 837 "movq 8(%2), %%mm2 \n\t"
838 "psubw %3, %%mm1 \n\t" 838 "psubw (%2,%1,4), %%mm1 \n\t"
839 "psubw 8+1*%3, %%mm2 \n\t" 839 "psubw 8(%2,%1,4), %%mm2 \n\t"
840 "packsswb %%mm2, %%mm1 \n\t" 840 "packsswb %%mm2, %%mm1 \n\t"
841 "paddb %%mm5, %%mm1 \n\t" 841 "paddb %%mm5, %%mm1 \n\t"
842 "pminub %%mm4, %%mm1 \n\t" 842 "pminub %%mm4, %%mm1 \n\t"
843 "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit 843 "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
844 "por %%mm1, %%mm0 \n\t" 844 "por %%mm1, %%mm0 \n\t"
845 ::"m"(ref[l][b_idx]), 845 ::"r"(ref[l]+b_idx),
846 "m"(ref[l][b_idx+d_idx]), 846 "r"(d_idx),
847 "m"(mv[l][b_idx][0]), 847 "r"(mv[l]+b_idx)
848 "m"(mv[l][b_idx+d_idx][0])
849 ); 848 );
850 } 849 }
851 if(bidir==1){ 850 if(bidir==1){
852 __asm__ volatile("pxor %%mm3, %%mm3 \n\t":); 851 __asm__ volatile("pxor %%mm3, %%mm3 \n\t":);
853 for( l = bidir; l >= 0; l-- ) { 852 for( l = bidir; l >= 0; l-- ) {
854 __asm__ volatile( 853 __asm__ volatile(
855 "movd %0, %%mm1 \n\t" 854 "movd (%0), %%mm1 \n\t"
856 "punpckldq %1, %%mm1 \n\t" 855 "punpckldq (%1), %%mm1 \n\t"
857 "punpckldq %%mm1, %%mm2 \n\t" 856 "punpckldq %%mm1, %%mm2 \n\t"
858 "pcmpeqb %%mm2, %%mm1 \n\t" 857 "pcmpeqb %%mm2, %%mm1 \n\t"
859 "paddb %%mm6, %%mm1 \n\t" 858 "paddb %%mm6, %%mm1 \n\t"
860 "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] 859 "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
861 "por %%mm1, %%mm3 \n\t" 860 "por %%mm1, %%mm3 \n\t"
862 861
863 "movq %2, %%mm1 \n\t" 862 "movq (%2), %%mm1 \n\t"
864 "movq 8+1*%2, %%mm2 \n\t" 863 "movq 8(%2), %%mm2 \n\t"
865 "psubw %3, %%mm1 \n\t" 864 "psubw (%3), %%mm1 \n\t"
866 "psubw 8+1*%3, %%mm2 \n\t" 865 "psubw 8(%3), %%mm2 \n\t"
867 "packsswb %%mm2, %%mm1 \n\t" 866 "packsswb %%mm2, %%mm1 \n\t"
868 "paddb %%mm5, %%mm1 \n\t" 867 "paddb %%mm5, %%mm1 \n\t"
869 "pminub %%mm4, %%mm1 \n\t" 868 "pminub %%mm4, %%mm1 \n\t"
870 "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit 869 "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
871 "por %%mm1, %%mm3 \n\t" 870 "por %%mm1, %%mm3 \n\t"
872 ::"m"(ref[l][b_idx]), 871 ::"r"(ref[l]+b_idx),
873 "m"(ref[1-l][b_idx+d_idx]), 872 "r"(ref[1-l]+b_idx+d_idx),
874 "m"(mv[l][b_idx][0]), 873 "r"(mv[l][b_idx]),
875 "m"(mv[1-l][b_idx+d_idx][0]) 874 "r"(mv[1-l][b_idx+d_idx])
876 ); 875 );
877 } 876 }
878 __asm__ volatile( 877 __asm__ volatile(
879 "pcmpeqw %%mm7, %%mm3 \n\t" 878 "pcmpeqw %%mm7, %%mm3 \n\t"
880 "psubusw %%mm3, %%mm0 \n\t" 879 "psubusw %%mm3, %%mm0 \n\t"