comparison: x86/dsputilenc_mmx.c @ 12498:c997f09d1e10 (libavcodec)

Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
which will hopefully solve the Win64/FATE failures caused by these functions.

author:   rbultje
date:     Fri, 17 Sep 2010 01:56:06 +0000
parents:  12497:c5ffa8b81f9c
children: bc17df45daa3
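
For orientation, hadamard8_diff scores two 8x8 pixel blocks by summing the absolute values of the 2-D Walsh-Hadamard transform of their difference (SATD). The sketch below is a plain-C illustration of that computation, not the code being moved; the helper names wht8 and hadamard8_diff_sketch are made up, and the real asm additionally uses saturating adds and masks the result to 16 bits.

    #include <stdint.h>
    #include <stdlib.h>

    /* Unnormalized 8-point Walsh-Hadamard transform: three butterfly passes. */
    static void wht8(int x[8])
    {
        for (int step = 1; step < 8; step <<= 1)
            for (int i = 0; i < 8; i += 2 * step)
                for (int j = i; j < i + step; j++) {
                    int a = x[j], b = x[j + step];
                    x[j]        = a + b;
                    x[j + step] = a - b;
                }
    }

    /* Illustrative scalar SATD of the difference of two 8x8 blocks. */
    static int hadamard8_diff_sketch(const uint8_t *src1, const uint8_t *src2,
                                     int stride)
    {
        int d[8][8], sum = 0;

        for (int y = 0; y < 8; y++)          /* pixel differences */
            for (int x = 0; x < 8; x++)
                d[y][x] = src1[y * stride + x] - src2[y * stride + x];

        for (int y = 0; y < 8; y++)          /* transform rows */
            wht8(d[y]);

        for (int x = 0; x < 8; x++) {        /* transform columns, sum |coeff| */
            int col[8];
            for (int y = 0; y < 8; y++)
                col[y] = d[y][x];
            wht8(col);
            for (int y = 0; y < 8; y++)
                sum += abs(col[y]);
        }
        return sum;                          /* the asm clips this to 16 bits */
    }
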
--- a/x86/dsputilenc_mmx.c	(12497:c5ffa8b81f9c)
+++ b/x86/dsputilenc_mmx.c	(12498:c997f09d1e10)
@@ -877,59 +877,10 @@
 
     *left_top= src1[w-1];
     *left = src2[w-1];
 }
 
-#define DIFF_PIXELS_1(m,a,t,p1,p2)\
-    "mov"#m" "#p1", "#a" \n\t"\
-    "mov"#m" "#p2", "#t" \n\t"\
-    "punpcklbw "#a", "#t" \n\t"\
-    "punpcklbw "#a", "#a" \n\t"\
-    "psubw "#t", "#a" \n\t"\
-
-#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
-    uint8_t *p1b=p1, *p2b=p2;\
-    __asm__ volatile(\
-        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
-        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
-        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
-        "add %4, %1 \n\t"\
-        "add %4, %2 \n\t"\
-        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
-        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
-        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
-        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
-        "mov"#m1" "#mm"0, %0 \n\t"\
-        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
-        "mov"#m1" %0, "#mm"0 \n\t"\
-        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
-        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
-    );\
-}
-//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
-
-#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
-#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
-
-#define LBUTTERFLY2(a1,b1,a2,b2)\
-    "paddw " #b1 ", " #a1 " \n\t"\
-    "paddw " #b2 ", " #a2 " \n\t"\
-    "paddw " #b1 ", " #b1 " \n\t"\
-    "paddw " #b2 ", " #b2 " \n\t"\
-    "psubw " #a1 ", " #b1 " \n\t"\
-    "psubw " #a2 ", " #b2 " \n\t"
-
-#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
-        LBUTTERFLY2(m0, m1, m2, m3)\
-        LBUTTERFLY2(m4, m5, m6, m7)\
-        LBUTTERFLY2(m0, m2, m1, m3)\
-        LBUTTERFLY2(m4, m6, m5, m7)\
-        LBUTTERFLY2(m0, m4, m1, m5)\
-        LBUTTERFLY2(m2, m6, m3, m7)\
-
-#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
-
 #define MMABS_MMX(a,z)\
     "pxor " #z ", " #z " \n\t"\
     "pcmpgtw " #a ", " #z " \n\t"\
     "pxor " #z ", " #a " \n\t"\
     "psubw " #z ", " #a " \n\t"
943 "pabsw " #a ", " #a " \n\t" 894 "pabsw " #a ", " #a " \n\t"
944 895
945 #define MMABS_SUM(a,z, sum)\ 896 #define MMABS_SUM(a,z, sum)\
946 MMABS(a,z)\ 897 MMABS(a,z)\
947 "paddusw " #a ", " #sum " \n\t" 898 "paddusw " #a ", " #sum " \n\t"
948
949 #define MMABS_SUM_8x8_NOSPILL\
950 MMABS(%%xmm0, %%xmm8)\
951 MMABS(%%xmm1, %%xmm9)\
952 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
953 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
954 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
955 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
956 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
957 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
958 "paddusw %%xmm1, %%xmm0 \n\t"
959
960 #if ARCH_X86_64
961 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
962 #else
963 #define MMABS_SUM_8x8_SSE2\
964 "movdqa %%xmm7, (%1) \n\t"\
965 MMABS(%%xmm0, %%xmm7)\
966 MMABS(%%xmm1, %%xmm7)\
967 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
968 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
969 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
970 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
971 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
972 "movdqa (%1), %%xmm2 \n\t"\
973 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
974 "paddusw %%xmm1, %%xmm0 \n\t"
975 #endif
976 899
977 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to 900 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
978 * about 100k on extreme inputs. But that's very unlikely to occur in natural video, 901 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
979 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ 902 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
980 #define HSUM_MMX(a, t, dst)\ 903 #define HSUM_MMX(a, t, dst)\
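
MMABS_MMX above is the classic branchless packed absolute value for pre-SSSE3 targets: pcmpgtw builds a per-word sign mask, then xor-and-subtract with that mask negates only the negative words. The FIXME on HSUM_* refers to paddusw clamping each 16-bit lane at 65535, which is why an extreme 8x8 block (up to roughly 100k) could in principle saturate. A scalar model of the abs trick follows; the helper name mmabs_model is made up.

    #include <stdint.h>

    /* Scalar model of MMABS_MMX on one 16-bit lane:
     *   z = (0 > a) ? 0xFFFF : 0;   pcmpgtw
     *   a ^= z; a -= z;             pxor, psubw  ->  |a|
     * Like psubw, this wraps for a == -32768. */
    static int16_t mmabs_model(int16_t a)
    {
        int16_t z = (a < 0) ? -1 : 0;   /* sign mask */
        a ^= z;                         /* one's complement if negative */
        return (int16_t)(a - z);        /* +1 completes the negation */
    }
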
1000 "paddusw "#t", "#a" \n\t"\ 923 "paddusw "#t", "#a" \n\t"\
1001 "pshuflw $0x01, "#a", "#t" \n\t"\ 924 "pshuflw $0x01, "#a", "#t" \n\t"\
1002 "paddusw "#t", "#a" \n\t"\ 925 "paddusw "#t", "#a" \n\t"\
1003 "movd "#a", "#dst" \n\t"\ 926 "movd "#a", "#dst" \n\t"\
1004 927
1005 #define HADAMARD8_DIFF_MMX(cpu) \ 928 #define hadamard_func(cpu) \
1006 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ 929 int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
1007 DECLARE_ALIGNED(8, uint64_t, temp)[13];\ 930 int stride, int h); \
1008 int sum;\ 931 int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
1009 \ 932 int stride, int h);
1010 assert(h==8);\ 933
1011 \ 934 hadamard_func(mmx)
1012 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\ 935 hadamard_func(mmx2)
1013 \ 936 hadamard_func(sse2)
1014 __asm__ volatile(\ 937 hadamard_func(ssse3)
1015 HADAMARD48\
1016 \
1017 "movq %%mm7, 96(%1) \n\t"\
1018 \
1019 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1020 STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
1021 \
1022 "movq 96(%1), %%mm7 \n\t"\
1023 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1024 STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
1025 \
1026 : "=r" (sum)\
1027 : "r"(temp)\
1028 );\
1029 \
1030 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1031 \
1032 __asm__ volatile(\
1033 HADAMARD48\
1034 \
1035 "movq %%mm7, 96(%1) \n\t"\
1036 \
1037 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1038 STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
1039 \
1040 "movq 96(%1), %%mm7 \n\t"\
1041 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1042 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1043 "movq %%mm6, %%mm7 \n\t"\
1044 "movq %%mm0, %%mm6 \n\t"\
1045 \
1046 LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
1047 \
1048 HADAMARD48\
1049 "movq %%mm7, 64(%1) \n\t"\
1050 MMABS(%%mm0, %%mm7)\
1051 MMABS(%%mm1, %%mm7)\
1052 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1053 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1054 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1055 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1056 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1057 "movq 64(%1), %%mm2 \n\t"\
1058 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1059 "paddusw %%mm1, %%mm0 \n\t"\
1060 "movq %%mm0, 64(%1) \n\t"\
1061 \
1062 LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
1063 LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
1064 \
1065 HADAMARD48\
1066 "movq %%mm7, (%1) \n\t"\
1067 MMABS(%%mm0, %%mm7)\
1068 MMABS(%%mm1, %%mm7)\
1069 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1070 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1071 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1072 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1073 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1074 "movq (%1), %%mm2 \n\t"\
1075 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1076 "paddusw 64(%1), %%mm0 \n\t"\
1077 "paddusw %%mm1, %%mm0 \n\t"\
1078 \
1079 HSUM(%%mm0, %%mm1, %0)\
1080 \
1081 : "=r" (sum)\
1082 : "r"(temp)\
1083 );\
1084 return sum&0xFFFF;\
1085 }\
1086 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1087
1088 #define HADAMARD8_DIFF_SSE2(cpu) \
1089 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1090 DECLARE_ALIGNED(16, uint64_t, temp)[4];\
1091 int sum;\
1092 \
1093 assert(h==8);\
1094 \
1095 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1096 \
1097 __asm__ volatile(\
1098 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1099 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1100 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1101 MMABS_SUM_8x8\
1102 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1103 : "=r" (sum)\
1104 : "r"(temp)\
1105 );\
1106 return sum&0xFFFF;\
1107 }\
1108 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1109
1110 #define MMABS(a,z) MMABS_MMX(a,z)
1111 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1112 HADAMARD8_DIFF_MMX(mmx)
1113 #undef MMABS
1114 #undef HSUM
1115
1116 #define MMABS(a,z) MMABS_MMX2(a,z)
1117 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1118 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1119 HADAMARD8_DIFF_MMX(mmx2)
1120 HADAMARD8_DIFF_SSE2(sse2)
1121 #undef MMABS
1122 #undef MMABS_SUM_8x8
1123 #undef HSUM
1124
1125 #if HAVE_SSSE3
1126 #define MMABS(a,z) MMABS_SSSE3(a,z)
1127 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1128 HADAMARD8_DIFF_SSE2(ssse3)
1129 #undef MMABS
1130 #undef MMABS_SUM_8x8
1131 #endif
1132 938
1133 #define DCT_SAD4(m,mm,o)\ 939 #define DCT_SAD4(m,mm,o)\
1134 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ 940 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1135 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ 941 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1136 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ 942 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
@@ -1310,12 +1116,12 @@
         c->pix_sum = pix_sum16_mmx;
 
         c->diff_bytes= diff_bytes_mmx;
         c->sum_abs_dctelem= sum_abs_dctelem_mmx;
 
-        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
-        c->hadamard8_diff[1]= hadamard8_diff_mmx;
+        c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
+        c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
 
         c->pix_norm1 = pix_norm1_mmx;
         c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
         c->sse[1] = sse8_mmx;
         c->vsad[4]= vsad_intra16_mmx;
@@ -1334,12 +1140,12 @@
         c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
 
 
         if (mm_flags & AV_CPU_FLAG_MMX2) {
             c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
-            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
-            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
+            c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
+            c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
             c->vsad[4]= vsad_intra16_mmx2;
 
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->vsad[0] = vsad16_mmx2;
             }
@@ -1348,12 +1154,12 @@
         }
 
         if(mm_flags & AV_CPU_FLAG_SSE2){
             c->get_pixels = get_pixels_sse2;
             c->sum_abs_dctelem= sum_abs_dctelem_sse2;
-            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
-            c->hadamard8_diff[1]= hadamard8_diff_sse2;
+            c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
+            c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
         }
 
         if (CONFIG_LPC && mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
             c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
         }
@@ -1363,12 +1169,12 @@
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->try_8x8basis= try_8x8basis_ssse3;
             }
             c->add_8x8basis= add_8x8basis_ssse3;
             c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
-            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
-            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
+            c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
+            c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
         }
 #endif
 
         if(mm_flags & AV_CPU_FLAG_3DNOW){
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
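
The initialization hunks above only retarget the existing DSPContext function pointers at the new ff_-prefixed yasm symbols; callers are unchanged. As the assignments show, index 0 holds the 16-wide variant and index 1 the 8x8 one. An illustrative caller is sketched below; score_8x8 is a made-up name, while DSPContext, the hadamard8_diff[] array and the void *s context pointer are the ones this file already uses, so the snippet assumes dsputil.h is in scope.

    /* Illustrative only: how a caller reaches whichever kernel was selected
     * above (mmx, mmx2, sse2 or ssse3, depending on mm_flags). */
    static int score_8x8(DSPContext *c, void *s,
                         uint8_t *cur, uint8_t *ref, int stride)
    {
        /* [0] = 16-wide variant, [1] = 8x8 variant, per the assignments above */
        return c->hadamard8_diff[1](s, cur, ref, stride, 8);
    }
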