comparison i386/dsputil_mmx.c @ 2967:ef2149182f1c libavcodec

COSMETICS: Remove all trailing whitespace.
author diego
date Sat, 17 Dec 2005 18:14:38 +0000
parents 8aa244d7c274
children bfabfdf9ce55
comparison of 2966:564788471dd4 and 2967:ef2149182f1c
600
601 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
602 const int strength= ff_h263_loop_filter_strength[qscale];
603
604 asm volatile(
605
606 H263_LOOP_FILTER
607
608 "movq %%mm3, %1 \n\t"
609 "movq %%mm4, %2 \n\t"
610 "movq %%mm5, %0 \n\t"
611 "movq %%mm6, %3 \n\t"
612 : "+m" (*(uint64_t*)(src - 2*stride)),
632 "punpckhdq %%mm0, %%mm0 \n\t"
633 "movd %%mm0, %1 \n\t"
634 "movd %%mm1, %2 \n\t"
635 "punpckhdq %%mm1, %%mm1 \n\t"
636 "movd %%mm1, %3 \n\t"
637
638 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
639 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
640 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
641 "=m" (*(uint32_t*)(dst + 3*dst_stride))
642 : "m" (*(uint32_t*)(src + 0*src_stride)),
648
649 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
650 const int strength= ff_h263_loop_filter_strength[qscale];
651 uint64_t temp[4] __attribute__ ((aligned(8)));
652 uint8_t *btemp= (uint8_t*)temp;
653
654 src -= 2;
655
656 transpose4x4(btemp , src , 8, stride);
657 transpose4x4(btemp+4, src + 4*stride, 8, stride);
658 asm volatile(
659 H263_LOOP_FILTER // 5 3 4 6
660
661 : "+m" (temp[0]),
662 "+m" (temp[1]),
663 "+m" (temp[2]),
664 "+m" (temp[3])
665 : "g" (2*strength), "m"(ff_pb_FC)
794
795 "movq %%mm7,%%mm1\n"
796 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
797 "paddd %%mm7,%%mm1\n"
798 "movd %%mm1,%2\n"
799 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
800 : "r" ((long)line_size) , "m" (h)
801 : "%ecx");
802 return tmp;
803 }
804
854
855 "movq %%mm7,%%mm1\n"
856 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
857 "paddd %%mm7,%%mm1\n"
858 "movd %%mm1,%2\n"
859 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
860 : "r" ((long)line_size) , "m" (h)
861 : "%ecx");
862 return tmp;
863 }
864
917 "paddd %%xmm1,%%xmm7\n"
918 "movdqa %%xmm7,%%xmm1\n"
919 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
920 "paddd %%xmm1,%%xmm7\n"
921 "movd %%xmm7,%3\n"
922 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
923 : "r" ((long)line_size));
924 return tmp;
925 }
926
927 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
928 int tmp;
929 asm volatile (
930 "movl %3,%%ecx\n"
931 "pxor %%mm7,%%mm7\n"
932 "pxor %%mm6,%%mm6\n"
933
934 "movq (%0),%%mm0\n"
935 "movq %%mm0, %%mm1\n"
936 "psllq $8, %%mm0\n"
937 "psrlq $8, %%mm1\n"
938 "psrlq $8, %%mm0\n"
942 "punpcklbw %%mm7,%%mm1\n"
943 "punpckhbw %%mm7,%%mm2\n"
944 "punpckhbw %%mm7,%%mm3\n"
945 "psubw %%mm1, %%mm0\n"
946 "psubw %%mm3, %%mm2\n"
947
948 "add %2,%0\n"
949
950 "movq (%0),%%mm4\n"
951 "movq %%mm4, %%mm1\n"
952 "psllq $8, %%mm4\n"
953 "psrlq $8, %%mm1\n"
954 "psrlq $8, %%mm4\n"
966 "pxor %%mm1, %%mm1\n"
967 "pcmpgtw %%mm0, %%mm3\n\t"
968 "pcmpgtw %%mm2, %%mm1\n\t"
969 "pxor %%mm3, %%mm0\n"
970 "pxor %%mm1, %%mm2\n"
971 "psubw %%mm3, %%mm0\n"
972 "psubw %%mm1, %%mm2\n"
973 "paddw %%mm0, %%mm2\n"
974 "paddw %%mm2, %%mm6\n"
975
976 "add %2,%0\n"
977 "1:\n"
978
979 "movq (%0),%%mm0\n"
980 "movq %%mm0, %%mm1\n"
981 "psllq $8, %%mm0\n"
982 "psrlq $8, %%mm1\n"
983 "psrlq $8, %%mm0\n"
995 "pxor %%mm1, %%mm1\n"
996 "pcmpgtw %%mm4, %%mm3\n\t"
997 "pcmpgtw %%mm5, %%mm1\n\t"
998 "pxor %%mm3, %%mm4\n"
999 "pxor %%mm1, %%mm5\n"
1000 "psubw %%mm3, %%mm4\n"
1001 "psubw %%mm1, %%mm5\n"
1002 "paddw %%mm4, %%mm5\n"
1003 "paddw %%mm5, %%mm6\n"
1004
1005 "add %2,%0\n"
1006
1007 "movq (%0),%%mm4\n"
1008 "movq %%mm4, %%mm1\n"
1009 "psllq $8, %%mm4\n"
1010 "psrlq $8, %%mm1\n"
1011 "psrlq $8, %%mm4\n"
1023 "pxor %%mm1, %%mm1\n"
1024 "pcmpgtw %%mm0, %%mm3\n\t"
1025 "pcmpgtw %%mm2, %%mm1\n\t"
1026 "pxor %%mm3, %%mm0\n"
1027 "pxor %%mm1, %%mm2\n"
1028 "psubw %%mm3, %%mm0\n"
1029 "psubw %%mm1, %%mm2\n"
1030 "paddw %%mm0, %%mm2\n"
1031 "paddw %%mm2, %%mm6\n"
1032
1033 "add %2,%0\n"
1036
1037 "movq %%mm6, %%mm0\n"
1038 "punpcklwd %%mm7,%%mm0\n"
1039 "punpckhwd %%mm7,%%mm6\n"
1040 "paddd %%mm0, %%mm6\n"
1041
1042 "movq %%mm6,%%mm0\n"
1043 "psrlq $32, %%mm6\n"
1044 "paddd %%mm6,%%mm0\n"
1045 "movd %%mm0,%1\n"
1046 : "+r" (pix1), "=r"(tmp)
1047 : "r" ((long)line_size) , "g" (h-2)
1048 : "%ecx");
1049 return tmp;
1050 }
1051
1054 uint8_t * pix= pix1;
1055 asm volatile (
1056 "movl %3,%%ecx\n"
1057 "pxor %%mm7,%%mm7\n"
1058 "pxor %%mm6,%%mm6\n"
1059
1060 "movq (%0),%%mm0\n"
1061 "movq 1(%0),%%mm1\n"
1062 "movq %%mm0, %%mm2\n"
1063 "movq %%mm1, %%mm3\n"
1064 "punpcklbw %%mm7,%%mm0\n"
1065 "punpcklbw %%mm7,%%mm1\n"
1066 "punpckhbw %%mm7,%%mm2\n"
1067 "punpckhbw %%mm7,%%mm3\n"
1068 "psubw %%mm1, %%mm0\n"
1069 "psubw %%mm3, %%mm2\n"
1070
1071 "add %2,%0\n"
1072
1073 "movq (%0),%%mm4\n"
1074 "movq 1(%0),%%mm1\n"
1075 "movq %%mm4, %%mm5\n"
1076 "movq %%mm1, %%mm3\n"
1077 "punpcklbw %%mm7,%%mm4\n"
1086 "pxor %%mm1, %%mm1\n"
1087 "pcmpgtw %%mm0, %%mm3\n\t"
1088 "pcmpgtw %%mm2, %%mm1\n\t"
1089 "pxor %%mm3, %%mm0\n"
1090 "pxor %%mm1, %%mm2\n"
1091 "psubw %%mm3, %%mm0\n"
1092 "psubw %%mm1, %%mm2\n"
1093 "paddw %%mm0, %%mm2\n"
1094 "paddw %%mm2, %%mm6\n"
1095
1096 "add %2,%0\n"
1097 "1:\n"
1098
1099 "movq (%0),%%mm0\n"
1100 "movq 1(%0),%%mm1\n"
1101 "movq %%mm0, %%mm2\n"
1102 "movq %%mm1, %%mm3\n"
1103 "punpcklbw %%mm7,%%mm0\n"
1116 "pxor %%mm1, %%mm5\n"
1117 "psubw %%mm3, %%mm4\n"
1118 "psubw %%mm1, %%mm5\n"
1119 "paddw %%mm4, %%mm5\n"
1120 "paddw %%mm5, %%mm6\n"
1121
1122 "add %2,%0\n"
1123
1124 "movq (%0),%%mm4\n"
1125 "movq 1(%0),%%mm1\n"
1126 "movq %%mm4, %%mm5\n"
1127 "movq %%mm1, %%mm3\n"
1128 "punpcklbw %%mm7,%%mm4\n"
1137 "pxor %%mm1, %%mm1\n"
1138 "pcmpgtw %%mm0, %%mm3\n\t"
1139 "pcmpgtw %%mm2, %%mm1\n\t"
1140 "pxor %%mm3, %%mm0\n"
1141 "pxor %%mm1, %%mm2\n"
1142 "psubw %%mm3, %%mm0\n"
1143 "psubw %%mm1, %%mm2\n"
1144 "paddw %%mm0, %%mm2\n"
1145 "paddw %%mm2, %%mm6\n"
1146
1147 "add %2,%0\n"
1150
1151 "movq %%mm6, %%mm0\n"
1152 "punpcklwd %%mm7,%%mm0\n"
1153 "punpckhwd %%mm7,%%mm6\n"
1154 "paddd %%mm0, %%mm6\n"
1155
1156 "movq %%mm6,%%mm0\n"
1157 "psrlq $32, %%mm6\n"
1158 "paddd %%mm6,%%mm0\n"
1159 "movd %%mm0,%1\n"
1160 : "+r" (pix1), "=r"(tmp)
1161 : "r" ((long)line_size) , "g" (h-2)
1162 : "%ecx");
1163 return tmp + hf_noise8_mmx(pix+8, line_size, h);
1164 }
1165
1184 else return score1 + ABS(score2)*8;
1185 }
1186
1187 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1188 int tmp;
1189
1190 assert( (((int)pix) & 7) == 0);
1191 assert((line_size &7) ==0);
1192
1193 #define SUM(in0, in1, out0, out1) \
1194 "movq (%0), %%mm2\n"\
1195 "movq 8(%0), %%mm3\n"\
1196 "add %2,%0\n"\
1197 "movq %%mm2, " #out0 "\n"\
1211 "paddw " #in1 ", " #in0 "\n"\
1212 "paddw %%mm3, %%mm2\n"\
1213 "paddw %%mm2, " #in0 "\n"\
1214 "paddw " #in0 ", %%mm6\n"
1215
1216
1217 asm volatile (
1218 "movl %3,%%ecx\n"
1219 "pxor %%mm6,%%mm6\n"
1220 "pxor %%mm7,%%mm7\n"
1221 "movq (%0),%%mm0\n"
1222 "movq 8(%0),%%mm1\n"
1223 "add %2,%0\n"
1224 "subl $2, %%ecx\n"
1225 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1226 "1:\n"
1227
1228 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1229
1230 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1231
1232 "subl $2, %%ecx\n"
1233 "jnz 1b\n"
1234
1235 "movq %%mm6,%%mm0\n"
1236 "psrlq $32, %%mm6\n"
1237 "paddw %%mm6,%%mm0\n"
1238 "movq %%mm0,%%mm6\n"
1239 "psrlq $16, %%mm0\n"
1240 "paddw %%mm6,%%mm0\n"
1241 "movd %%mm0,%1\n"
1242 : "+r" (pix), "=r"(tmp)
1243 : "r" ((long)line_size) , "m" (h)
1244 : "%ecx");
1245 return tmp & 0xFFFF;
1246 }
1247 #undef SUM
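For orientation, a plain-C sketch of the quantity the vsad_intra16 routines here compute (the MMX version via word-wise absolute differences, the MMX2 version below via psadbw): the sum of absolute differences between vertically adjacent pixels of a 16-pixel-wide block. This is an illustrative reference only, not the C fallback from dsputil.c; the name vsad_intra16_ref is made up here.

static int vsad_intra16_ref(uint8_t *pix, int line_size, int h)
{
    int score = 0, x, y;
    /* h-1 row pairs, matching the loop structure of the asm above */
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 16; x++)
            score += ABS(pix[x] - pix[x + line_size]);
        pix += line_size;
    }
    return score;
}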
1248
1249 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1250 int tmp;
1251
1252 assert( (((int)pix) & 7) == 0);
1253 assert((line_size &7) ==0);
1254
1255 #define SUM(in0, in1, out0, out1) \
1256 "movq (%0), " #out0 "\n"\
1257 "movq 8(%0), " #out1 "\n"\
1258 "add %2,%0\n"\
1259 "psadbw " #out0 ", " #in0 "\n"\
1269 "movq 8(%0),%%mm1\n"
1270 "add %2,%0\n"
1271 "subl $2, %%ecx\n"
1272 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1273 "1:\n"
1274
1275 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1276
1277 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1278
1279 "subl $2, %%ecx\n"
1280 "jnz 1b\n"
1281
1282 "movd %%mm6,%1\n"
1283 : "+r" (pix), "=r"(tmp)
1284 : "r" ((long)line_size) , "m" (h)
1285 : "%ecx");
1286 return tmp;
1287 }
1288 #undef SUM
1289
1290 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1291 int tmp;
1292
1293 assert( (((int)pix1) & 7) == 0);
1294 assert( (((int)pix2) & 7) == 0);
1295 assert((line_size &7) ==0);
1296
1297 #define SUM(in0, in1, out0, out1) \
1298 "movq (%0),%%mm2\n"\
1299 "movq (%1)," #out0 "\n"\
1300 "movq 8(%0),%%mm3\n"\
1301 "movq 8(%1)," #out1 "\n"\
1322 "paddw " #in1 ", " #in0 "\n"\
1323 "paddw %%mm3, %%mm2\n"\
1324 "paddw %%mm2, " #in0 "\n"\
1325 "paddw " #in0 ", %%mm6\n"
1326
1327
1328 asm volatile (
1329 "movl %4,%%ecx\n"
1330 "pxor %%mm6,%%mm6\n"
1331 "pcmpeqw %%mm7,%%mm7\n"
1332 "psllw $15, %%mm7\n"
1342 "psubb %%mm3, %%mm1\n"
1343 "pxor %%mm7, %%mm0\n"
1344 "pxor %%mm7, %%mm1\n"
1345 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1346 "1:\n"
1347
1348 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1349
1350 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1351
1352 "subl $2, %%ecx\n"
1353 "jnz 1b\n"
1354
1355 "movq %%mm6,%%mm0\n"
1356 "psrlq $32, %%mm6\n"
1357 "paddw %%mm6,%%mm0\n"
1358 "movq %%mm0,%%mm6\n"
1359 "psrlq $16, %%mm0\n"
1360 "paddw %%mm6,%%mm0\n"
1361 "movd %%mm0,%2\n"
1362 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1363 : "r" ((long)line_size) , "m" (h)
1364 : "%ecx");
1365 return tmp & 0x7FFF;
1366 }
1367 #undef SUM
1368
1369 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1370 int tmp;
1371
1372 assert( (((int)pix1) & 7) == 0);
1373 assert( (((int)pix2) & 7) == 0);
1374 assert((line_size &7) ==0);
1375
1376 #define SUM(in0, in1, out0, out1) \
1377 "movq (%0)," #out0 "\n"\
1378 "movq (%1),%%mm2\n"\
1379 "movq 8(%0)," #out1 "\n"\
1380 "movq 8(%1),%%mm3\n"\
1406 "psubb %%mm3, %%mm1\n"
1407 "pxor %%mm7, %%mm0\n"
1408 "pxor %%mm7, %%mm1\n"
1409 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1410 "1:\n"
1411
1412 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1413
1414 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1415
1416 "subl $2, %%ecx\n"
1417 "jnz 1b\n"
1418
1419 "movd %%mm6,%2\n"
1420 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1421 : "r" ((long)line_size) , "m" (h)
1422 : "%ecx");
1423 return tmp;
1424 }
1425 #undef SUM
1447 }
1448
1449 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1450 long i=0;
1451 uint8_t l, lt;
1452
1453 asm volatile(
1454 "1: \n\t"
1455 "movq -1(%1, %0), %%mm0 \n\t" // LT
1456 "movq (%1, %0), %%mm1 \n\t" // T
1457 "movq -1(%2, %0), %%mm2 \n\t" // L
1460 "psubb %%mm0, %%mm2 \n\t"
1461 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1462 "movq %%mm4, %%mm5 \n\t" // L
1463 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1464 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
1465 "pminub %%mm2, %%mm4 \n\t"
1466 "pmaxub %%mm1, %%mm4 \n\t"
1467 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1468 "movq %%mm3, (%3, %0) \n\t"
1469 "add $8, %0 \n\t"
1470 "cmp %4, %0 \n\t"
1473 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1474 );
1475
1476 l= *left;
1477 lt= *left_top;
1478
1479 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1480
1481 *left_top= src1[w-1];
1482 *left = src2[w-1];
1483 }
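A scalar sketch of the median prediction the loop above vectorizes (hypothetical helper name; src1 is the previous line and src2 the current line, matching the LT/T/L comments in the asm): each output byte is the current pixel minus mid_pred(L, T, L + T - LT), with the first pixel and the carried left/left_top state handled as in the C tail above.

static void sub_hfyu_median_prediction_ref(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top)
{
    int i;
    uint8_t l  = *left;      /* predictor state carried between calls */
    uint8_t lt = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        dst[i] = src2[i] - pred; /* store the residual, as the asm does */
        lt = src1[i];            /* top-left for the next column */
        l  = src2[i];            /* left neighbour for the next column */
    }

    *left_top = lt;
    *left     = l;
}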
1484
1485 #define LBUTTERFLY2(a1,b1,a2,b2)\
1519 #define MMABS_SUM_MMX2(a,z, sum)\
1520 "pxor " #z ", " #z " \n\t"\
1521 "psubw " #a ", " #z " \n\t"\
1522 "pmaxsw " #z ", " #a " \n\t"\
1523 "paddusw " #a ", " #sum " \n\t"
1524
1525 #define SBUTTERFLY(a,b,t,n)\
1526 "movq " #a ", " #t " \n\t" /* abcd */\
1527 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
1528 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
1529
1546 "movq "#d", "#o"+48(%1) \n\t"\
1547
1548 static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
1549 uint64_t temp[16] __align8;
1550 int sum=0;
1551
1552 assert(h==8);
1553
1554 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
1555
1556 asm volatile(
1557 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1558 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
1559
1560 HADAMARD48
1561
1562 "movq %%mm7, 112(%1) \n\t"
1563
1564 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1565 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
1566
1567 "movq 112(%1), %%mm7 \n\t"
1568 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1569 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
1570
1571 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
1572 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1573
1574 HADAMARD48
1575
1576 "movq %%mm7, 120(%1) \n\t"
1577
1578 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1579 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
1580
1581 "movq 120(%1), %%mm7 \n\t"
1582 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1583 "movq %%mm7, %%mm5 \n\t"//FIXME remove
1584 "movq %%mm6, %%mm7 \n\t"
1585 "movq %%mm0, %%mm6 \n\t"
1586 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
1587
1588 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
1589 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1590
1591 HADAMARD48
1592 "movq %%mm7, 64(%1) \n\t"
1593 MMABS(%%mm0, %%mm7)
1594 MMABS_SUM(%%mm1, %%mm7, %%mm0)
1595 MMABS_SUM(%%mm2, %%mm7, %%mm0)
1598 MMABS_SUM(%%mm5, %%mm7, %%mm0)
1599 MMABS_SUM(%%mm6, %%mm7, %%mm0)
1600 "movq 64(%1), %%mm1 \n\t"
1601 MMABS_SUM(%%mm1, %%mm7, %%mm0)
1602 "movq %%mm0, 64(%1) \n\t"
1603
1604 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1605 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
1606
1607 HADAMARD48
1608 "movq %%mm7, (%1) \n\t"
1609 MMABS(%%mm0, %%mm7)
1610 MMABS_SUM(%%mm1, %%mm7, %%mm0)
1611 MMABS_SUM(%%mm2, %%mm7, %%mm0)
1615 MMABS_SUM(%%mm6, %%mm7, %%mm0)
1616 "movq (%1), %%mm1 \n\t"
1617 MMABS_SUM(%%mm1, %%mm7, %%mm0)
1618 "movq 64(%1), %%mm1 \n\t"
1619 MMABS_SUM(%%mm1, %%mm7, %%mm0)
1620
1621 "movq %%mm0, %%mm1 \n\t"
1622 "psrlq $32, %%mm0 \n\t"
1623 "paddusw %%mm1, %%mm0 \n\t"
1624 "movq %%mm0, %%mm1 \n\t"
1625 "psrlq $16, %%mm0 \n\t"
1626 "paddusw %%mm1, %%mm0 \n\t"
1627 "movd %%mm0, %0 \n\t"
1628
1629 : "=r" (sum)
1630 : "r"(temp)
1631 );
1632 return sum&0xFFFF;
1633 }
1634
1635 static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
1636 uint64_t temp[16] __align8;
1637 int sum=0;
1638
1639 assert(h==8);
1640
1641 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
1642
1643 asm volatile(
1644 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1645 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
1646
1647 HADAMARD48
1648
1649 "movq %%mm7, 112(%1) \n\t"
1650
1651 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1652 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
1653
1654 "movq 112(%1), %%mm7 \n\t"
1655 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1656 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
1657
1658 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
1659 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1660
1661 HADAMARD48
1662
1663 "movq %%mm7, 120(%1) \n\t"
1664
1665 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1666 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
1667
1668 "movq 120(%1), %%mm7 \n\t"
1669 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1670 "movq %%mm7, %%mm5 \n\t"//FIXME remove
1671 "movq %%mm6, %%mm7 \n\t"
1672 "movq %%mm0, %%mm6 \n\t"
1673 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
1674
1675 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
1676 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1677
1678 HADAMARD48
1679 "movq %%mm7, 64(%1) \n\t"
1680 MMABS_MMX2(%%mm0, %%mm7)
1681 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1682 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
1685 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
1686 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
1687 "movq 64(%1), %%mm1 \n\t"
1688 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1689 "movq %%mm0, 64(%1) \n\t"
1690
1691 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1692 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
1693
1694 HADAMARD48
1695 "movq %%mm7, (%1) \n\t"
1696 MMABS_MMX2(%%mm0, %%mm7)
1697 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1698 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
1702 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
1703 "movq (%1), %%mm1 \n\t"
1704 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1705 "movq 64(%1), %%mm1 \n\t"
1706 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1707
1708 "pshufw $0x0E, %%mm0, %%mm1 \n\t"
1709 "paddusw %%mm1, %%mm0 \n\t"
1710 "pshufw $0x01, %%mm0, %%mm1 \n\t"
1711 "paddusw %%mm1, %%mm0 \n\t"
1712 "movd %%mm0, %0 \n\t"
1713
1714 : "=r" (sum)
1715 : "r"(temp)
1716 );
1717 return sum&0xFFFF;
1718 }
2403 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
2404 c->avg_ ## postfix1 = avg_ ## postfix2;
2405
2406 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2407 long i=0;
2408
2409 assert(ABS(scale) < 256);
2410 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2411
2412 asm volatile(
2413 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2414 "psrlw $15, %%mm6 \n\t" // 1w
2415 "pxor %%mm7, %%mm7 \n\t"
2416 "movd %4, %%mm5 \n\t"
2417 "punpcklwd %%mm5, %%mm5 \n\t"
2418 "punpcklwd %%mm5, %%mm5 \n\t"
2419 "1: \n\t"
2420 "movq (%1, %0), %%mm0 \n\t"
2421 "movq 8(%1, %0), %%mm1 \n\t"
2422 "pmulhw %%mm5, %%mm0 \n\t"
2423 "pmulhw %%mm5, %%mm1 \n\t"
2424 "paddw %%mm6, %%mm0 \n\t"
2425 "paddw %%mm6, %%mm1 \n\t"
2442 "movq %%mm7, %%mm6 \n\t"
2443 "psrlq $32, %%mm7 \n\t"
2444 "paddd %%mm6, %%mm7 \n\t"
2445 "psrld $2, %%mm7 \n\t"
2446 "movd %%mm7, %0 \n\t"
2447
2448 : "+r" (i)
2449 : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
2450 );
2451 return i;
2452 }
2453
2454 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
2455 long i=0;
2456
2457 if(ABS(scale) < 256){
2458 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2459 asm volatile(
2460 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2461 "psrlw $15, %%mm6 \n\t" // 1w
2462 "movd %3, %%mm5 \n\t"
2463 "punpcklwd %%mm5, %%mm5 \n\t"
2464 "punpcklwd %%mm5, %%mm5 \n\t"
2465 "1: \n\t"
2466 "movq (%1, %0), %%mm0 \n\t"
2467 "movq 8(%1, %0), %%mm1 \n\t"
2468 "pmulhw %%mm5, %%mm0 \n\t"
2469 "pmulhw %%mm5, %%mm1 \n\t"
2470 "paddw %%mm6, %%mm0 \n\t"
2471 "paddw %%mm6, %%mm1 \n\t"
2472 "psraw $1, %%mm0 \n\t"
2473 "psraw $1, %%mm1 \n\t"
2474 "paddw (%2, %0), %%mm0 \n\t"
2475 "paddw 8(%2, %0), %%mm1 \n\t"
2476 "movq %%mm0, (%2, %0) \n\t"
2477 "movq %%mm1, 8(%2, %0) \n\t"
2478 "add $16, %0 \n\t"
2479 "cmp $128, %0 \n\t" //FIXME optimize & bench
2480 " jb 1b \n\t"
2481
2482 : "+r" (i)
2483 : "r"(basis), "r"(rem), "g"(scale)
2484 );
2485 }else{
2486 for(i=0; i<8*8; i++){
2487 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2488 }
2489 }
2490 }
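A short note (not part of the file) on why the pre-shifted scale in the MMX path reproduces the scalar fallback above: pmulhw returns the high 16 bits of a signed 16x16 multiply, i.e. (a*b)>>16, so shifting scale left beforehand turns that into the right shift the C code performs, and the paddw 1 / psraw 1 pair supplies the rounding term.

/* Sketch of the equivalence, assuming the pre-shifted scale fits in a
 * signed 16-bit word (which is what ABS(scale) < 256 guarantees):
 *
 *   scale2 = scale << (16 + 1 - BASIS_SHIFT + RECON_SHIFT)
 *
 *   pmulhw:        (basis[i] * scale2) >> 16
 *                = (basis[i] * scale)  >> (BASIS_SHIFT - RECON_SHIFT - 1)
 *   paddw/psraw:   (... + 1) >> 1
 *
 * which matches the scalar path
 *
 *   (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1))) >> (BASIS_SHIFT - RECON_SHIFT)
 *
 * when the shifts are arithmetic (floor) shifts. */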
2491
2492 #include "h264dsp_mmx.c"
2493
2494 /* external functions, from idct_mmx.c */
2495 void ff_mmx_idct(DCTELEM *block);
2496 void ff_mmxext_idct(DCTELEM *block);
2497
2498 void ff_vp3_idct_sse2(int16_t *input_data);
2561 {
2562 ff_idct_xvid_mmx2 (block);
2563 add_pixels_clamped_mmx(block, dest, line_size);
2564 }
2565 #endif
2566
2567 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2568 {
2569 mm_flags = mm_support();
2570
2571 if (avctx->dsp_mask) {
2699
2700 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
2701 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
2702 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
2703 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
2704
2705 c->add_bytes= add_bytes_mmx;
2706 #ifdef CONFIG_ENCODERS
2707 c->diff_bytes= diff_bytes_mmx;
2708
2709 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
2710 c->hadamard8_diff[1]= hadamard8_diff_mmx;
2711
2712 c->pix_norm1 = pix_norm1_mmx;
2713 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
2714 c->sse[1] = sse8_mmx;
2715 c->vsad[4]= vsad_intra16_mmx;
2716
2717 c->nsse[0] = nsse16_mmx;
2718 c->nsse[1] = nsse8_mmx;
2719 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2720 c->vsad[0] = vsad16_mmx;
2721 }
2722
2723 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2724 c->try_8x8basis= try_8x8basis_mmx;
2725 }
2726 c->add_8x8basis= add_8x8basis_mmx;
2727
2728 #endif //CONFIG_ENCODERS
2729
2730 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2731 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
2732 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
2733 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
2734
2735 if (mm_flags & MM_MMXEXT) {
2736 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2737 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2738
2739 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2943
2944 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
2945 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
2946 }
2947 }
2948
2949 #ifdef CONFIG_ENCODERS
2950 dsputil_init_pix_mmx(c, avctx);
2951 #endif //CONFIG_ENCODERS
2952 #if 0
2953 // for speed testing