comparison x86/dsputilenc_mmx.c @ 12498:c997f09d1e10 (libavcodec.hg)

Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
which will hopefully solve the Win64/FATE failures caused by these functions.

author      rbultje
date        Fri, 17 Sep 2010 01:56:06 +0000
parents     c5ffa8b81f9c
children    bc17df45daa3
old (left): 12497:c5ffa8b81f9c | new (right): 12498:c997f09d1e10
877 | 877 |
878 *left_top= src1[w-1]; | 878 *left_top= src1[w-1]; |
879 *left = src2[w-1]; | 879 *left = src2[w-1]; |
880 } | 880 } |
881 | 881 |
882 #define DIFF_PIXELS_1(m,a,t,p1,p2)\ | |
883 "mov"#m" "#p1", "#a" \n\t"\ | |
884 "mov"#m" "#p2", "#t" \n\t"\ | |
885 "punpcklbw "#a", "#t" \n\t"\ | |
886 "punpcklbw "#a", "#a" \n\t"\ | |
887 "psubw "#t", "#a" \n\t"\ | |
888 | |
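[note] DIFF_PIXELS_1 widens and subtracts one row of bytes without needing a zero register: after the two punpcklbw, each word of a holds p1[i] in both bytes while each word of t holds p2[i] in the low byte and p1[i] in the high byte, so the high bytes cancel in psubw and a is left with the signed 16-bit difference p1[i] - p2[i]. A scalar model (a sketch; the helper name is ours, not FFmpeg's):

    #include <stdint.h>

    /* Scalar model of DIFF_PIXELS_1: one row of p1 - p2,
     * widened to signed 16-bit words. */
    static void diff_pixels_row(int16_t *dst, const uint8_t *p1,
                                const uint8_t *p2, int w)
    {
        for (int i = 0; i < w; i++)
            dst[i] = p1[i] - p2[i];
    }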
889 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\ | |
890 uint8_t *p1b=p1, *p2b=p2;\ | |
891 __asm__ volatile(\ | |
892 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\ | |
893 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\ | |
894 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\ | |
895 "add %4, %1 \n\t"\ | |
896 "add %4, %2 \n\t"\ | |
897 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\ | |
898 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\ | |
899 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\ | |
900 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\ | |
901 "mov"#m1" "#mm"0, %0 \n\t"\ | |
902 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\ | |
903 "mov"#m1" %0, "#mm"0 \n\t"\ | |
904 : "+m"(temp), "+r"(p1b), "+r"(p2b)\ | |
905 : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\ | |
906 );\ | |
907 } | |
908 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp) | |
909 | |
910 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp) | |
911 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp) | |
912 | |
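[note] DIFF_PIXELS_8 fills mm0..mm7 (or xmm0..xmm7) with eight such rows, taking stride and 3*stride as separate operands so six rows are addressable before the one pointer bump; row 0 is parked in temp so its register can serve as scratch while row 7 is computed, then reloaded. A hedged scalar equivalent of DIFF_PIXELS_8x8:

    #include <stdint.h>

    /* Scalar model of DIFF_PIXELS_8x8: an 8x8 block of differences,
     * row r landing in dst[8*r .. 8*r+7]. Illustrative only. */
    static void diff_pixels_8x8(int16_t dst[64], const uint8_t *p1,
                                const uint8_t *p2, int stride)
    {
        for (int r = 0; r < 8; r++) {
            for (int c = 0; c < 8; c++)
                dst[8 * r + c] = p1[c] - p2[c];
            p1 += stride;
            p2 += stride;
        }
    }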
913 #define LBUTTERFLY2(a1,b1,a2,b2)\ | |
914 "paddw " #b1 ", " #a1 " \n\t"\ | |
915 "paddw " #b2 ", " #a2 " \n\t"\ | |
916 "paddw " #b1 ", " #b1 " \n\t"\ | |
917 "paddw " #b2 ", " #b2 " \n\t"\ | |
918 "psubw " #a1 ", " #b1 " \n\t"\ | |
919 "psubw " #a2 ", " #b2 " \n\t" | |
920 | |
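[note] LBUTTERFLY2 performs two Hadamard butterflies at once: each pair (a, b) becomes (a + b, b - a), with the doubling trick (paddw b,b then psubw a,b) computing b - a in place without a temporary register. One lane in scalar form (a sketch):

    #include <stdint.h>

    /* One butterfly lane as LBUTTERFLY2 computes it: (a, b) -> (a+b, b-a).
     * Wraps modulo 2^16, like paddw/psubw. */
    static void butterfly(int16_t *a, int16_t *b)
    {
        int16_t sum  = *a + *b;
        int16_t diff = *b - *a;
        *a = sum;
        *b = diff;
    }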
921 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\ | |
922 LBUTTERFLY2(m0, m1, m2, m3)\ | |
923 LBUTTERFLY2(m4, m5, m6, m7)\ | |
924 LBUTTERFLY2(m0, m2, m1, m3)\ | |
925 LBUTTERFLY2(m4, m6, m5, m7)\ | |
926 LBUTTERFLY2(m0, m4, m1, m5)\ | |
927 LBUTTERFLY2(m2, m6, m3, m7)\ | |
928 | |
929 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7) | |
930 | |
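[note] HADAMARD8 chains three butterfly stages (step 1, 2, 4) across the eight registers, i.e. an unnormalized 8-point Hadamard transform applied vertically to every 16-bit lane at once (4 lanes for MMX, 8 for SSE2); HADAMARD48 binds it to mm0..mm7. Coefficient order and signs follow the butterfly wiring, which is all the sum-of-absolute-values below cares about. Hedged scalar model of one lane:

    #include <stdint.h>

    /* Unnormalized 8-point Hadamard transform, matching the
     * stage-1/2/4 butterfly pairing of the HADAMARD8 macro. */
    static void hadamard8(int16_t v[8])
    {
        for (int step = 1; step < 8; step <<= 1)
            for (int i = 0; i < 8; i++)
                if (!(i & step)) {
                    int16_t a = v[i], b = v[i + step];
                    v[i]        = a + b;
                    v[i + step] = b - a;
                }
    }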
931 #define MMABS_MMX(a,z)\ | 882 #define MMABS_MMX(a,z)\ |
932 "pxor " #z ", " #z " \n\t"\ | 883 "pxor " #z ", " #z " \n\t"\ |
933 "pcmpgtw " #a ", " #z " \n\t"\ | 884 "pcmpgtw " #a ", " #z " \n\t"\ |
934 "pxor " #z ", " #a " \n\t"\ | 885 "pxor " #z ", " #a " \n\t"\ |
935 "psubw " #z ", " #a " \n\t" | 886 "psubw " #z ", " #a " \n\t" |
943 "pabsw " #a ", " #a " \n\t" | 894 "pabsw " #a ", " #a " \n\t" |
944 | 895 |
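[note] MMABS_MMX is the classic branchless absolute value: pcmpgtw builds an all-ones mask in z exactly where a is negative, and a ^= z; a -= z is two's-complement negation under that mask. SSSE3's pabsw (the surviving right-hand line above) does the same in one instruction. Per-lane scalar sketch:

    #include <stdint.h>

    /* Branchless abs as MMABS_MMX does it per 16-bit lane.
     * Like the asm, -32768 stays -32768. */
    static int16_t mmabs(int16_t a)
    {
        int16_t z = -(a < 0);   /* 0x0000 or 0xFFFF, like pcmpgtw */
        return (a ^ z) - z;     /* identity, or ~a + 1 when negative */
    }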
945 #define MMABS_SUM(a,z, sum)\ | 896 #define MMABS_SUM(a,z, sum)\ |
946 MMABS(a,z)\ | 897 MMABS(a,z)\ |
947 "paddusw " #a ", " #sum " \n\t" | 898 "paddusw " #a ", " #sum " \n\t" |
948 | |
949 #define MMABS_SUM_8x8_NOSPILL\ | |
950 MMABS(%%xmm0, %%xmm8)\ | |
951 MMABS(%%xmm1, %%xmm9)\ | |
952 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\ | |
953 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\ | |
954 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\ | |
955 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\ | |
956 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\ | |
957 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\ | |
958 "paddusw %%xmm1, %%xmm0 \n\t" | |
959 | |
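[note] MMABS_SUM_8x8_NOSPILL keeps two running sums (xmm0, xmm1) with two scratch registers (xmm8, xmm9), so consecutive saturating adds do not serialize on a single accumulator; the two sums merge in the final paddusw. The same idea in scalar form (hedged):

    #include <stdint.h>
    #include <stdlib.h>

    /* Dual-accumulator sum of |v[i]|, mirroring the interleaved
     * xmm0/xmm1 accumulation above. Assumes n is even. */
    static unsigned sum_abs_dual(const int16_t *v, int n)
    {
        unsigned s0 = 0, s1 = 0;
        for (int i = 0; i + 1 < n; i += 2) {
            s0 += abs(v[i]);
            s1 += abs(v[i + 1]);
        }
        return s0 + s1;
    }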
960 #if ARCH_X86_64 | |
961 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL | |
962 #else | |
963 #define MMABS_SUM_8x8_SSE2\ | |
964 "movdqa %%xmm7, (%1) \n\t"\ | |
965 MMABS(%%xmm0, %%xmm7)\ | |
966 MMABS(%%xmm1, %%xmm7)\ | |
967 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\ | |
968 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\ | |
969 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\ | |
970 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\ | |
971 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\ | |
972 "movdqa (%1), %%xmm2 \n\t"\ | |
973 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\ | |
974 "paddusw %%xmm1, %%xmm0 \n\t" | |
975 #endif | |
976 | 899 |
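[note] On 32-bit x86 only xmm0-xmm7 exist, so MMABS_SUM_8x8_SSE2 parks xmm7 in the temp buffer at (%1), reuses it as the abs scratch register, and reloads the saved row into xmm2 for the last accumulation; ARCH_X86_64 simply takes the no-spill version. All accumulation is paddusw, a saturating add. One lane of it in C (a sketch):

    #include <stdint.h>

    /* paddusw, one 16-bit lane: unsigned add, clamped to 0xFFFF. */
    static uint16_t paddusw1(uint16_t a, uint16_t b)
    {
        uint32_t s = (uint32_t)a + b;
        return s > 0xFFFF ? 0xFFFF : (uint16_t)s;
    }

That clamp is the 64k ceiling the FIXME below is talking about.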
977 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to | 900 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to |
978 * about 100k on extreme inputs. But that's very unlikely to occur in natural video, | 901 * about 100k on extreme inputs. But that's very unlikely to occur in natural video, |
979 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ | 902 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ |
980 #define HSUM_MMX(a, t, dst)\ | 903 #define HSUM_MMX(a, t, dst)\ |
1000 "paddusw "#t", "#a" \n\t"\ | 923 "paddusw "#t", "#a" \n\t"\ |
1001 "pshuflw $0x01, "#a", "#t" \n\t"\ | 924 "pshuflw $0x01, "#a", "#t" \n\t"\ |
1002 "paddusw "#t", "#a" \n\t"\ | 925 "paddusw "#t", "#a" \n\t"\ |
1003 "movd "#a", "#dst" \n\t"\ | 926 "movd "#a", "#dst" \n\t"\ |
1004 | 927 |
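[note] Only the tail of the HSUM_* macros survives in this hunk; the full versions fold the four (MMX) or eight (SSE2) word lanes of the accumulator pairwise with shuffle-and-paddusw steps, pshuflw $0x01 being the last swap before movd extracts the result. A hedged scalar model of the reduction:

    #include <stdint.h>

    /* Horizontal fold as the HSUM_* macros perform it: halve the
     * lane count with saturating adds until one value remains
     * (n = 4 or 8, a power of two; v is clobbered). */
    static unsigned hsum(uint16_t *v, int n)
    {
        for (int half = n / 2; half >= 1; half /= 2)
            for (int i = 0; i < half; i++) {
                uint32_t s = (uint32_t)v[i] + v[i + half];
                v[i] = s > 0xFFFF ? 0xFFFF : (uint16_t)s;
            }
        return v[0];
    }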
1005 #define HADAMARD8_DIFF_MMX(cpu) \ | 928 #define hadamard_func(cpu) \ |
1006 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ | 929 int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \ |
1007 DECLARE_ALIGNED(8, uint64_t, temp)[13];\ | 930 int stride, int h); \ |
1008 int sum;\ | 931 int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \ |
1009 \ | 932 int stride, int h); |
1010 assert(h==8);\ | 933 |
1011 \ | 934 hadamard_func(mmx) |
1012 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\ | 935 hadamard_func(mmx2) |
1013 \ | 936 hadamard_func(sse2) |
1014 __asm__ volatile(\ | 937 hadamard_func(ssse3) |
1015 HADAMARD48\ | |
1016 \ | |
1017 "movq %%mm7, 96(%1) \n\t"\ | |
1018 \ | |
1019 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ | |
1020 STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ | |
1021 \ | |
1022 "movq 96(%1), %%mm7 \n\t"\ | |
1023 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ | |
1024 STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\ | |
1025 \ | |
1026 : "=r" (sum)\ | |
1027 : "r"(temp)\ | |
1028 );\ | |
1029 \ | |
1030 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\ | |
1031 \ | |
1032 __asm__ volatile(\ | |
1033 HADAMARD48\ | |
1034 \ | |
1035 "movq %%mm7, 96(%1) \n\t"\ | |
1036 \ | |
1037 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ | |
1038 STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ | |
1039 \ | |
1040 "movq 96(%1), %%mm7 \n\t"\ | |
1041 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ | |
1042 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ | |
1043 "movq %%mm6, %%mm7 \n\t"\ | |
1044 "movq %%mm0, %%mm6 \n\t"\ | |
1045 \ | |
1046 LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ | |
1047 \ | |
1048 HADAMARD48\ | |
1049 "movq %%mm7, 64(%1) \n\t"\ | |
1050 MMABS(%%mm0, %%mm7)\ | |
1051 MMABS(%%mm1, %%mm7)\ | |
1052 MMABS_SUM(%%mm2, %%mm7, %%mm0)\ | |
1053 MMABS_SUM(%%mm3, %%mm7, %%mm1)\ | |
1054 MMABS_SUM(%%mm4, %%mm7, %%mm0)\ | |
1055 MMABS_SUM(%%mm5, %%mm7, %%mm1)\ | |
1056 MMABS_SUM(%%mm6, %%mm7, %%mm0)\ | |
1057 "movq 64(%1), %%mm2 \n\t"\ | |
1058 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ | |
1059 "paddusw %%mm1, %%mm0 \n\t"\ | |
1060 "movq %%mm0, 64(%1) \n\t"\ | |
1061 \ | |
1062 LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ | |
1063 LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\ | |
1064 \ | |
1065 HADAMARD48\ | |
1066 "movq %%mm7, (%1) \n\t"\ | |
1067 MMABS(%%mm0, %%mm7)\ | |
1068 MMABS(%%mm1, %%mm7)\ | |
1069 MMABS_SUM(%%mm2, %%mm7, %%mm0)\ | |
1070 MMABS_SUM(%%mm3, %%mm7, %%mm1)\ | |
1071 MMABS_SUM(%%mm4, %%mm7, %%mm0)\ | |
1072 MMABS_SUM(%%mm5, %%mm7, %%mm1)\ | |
1073 MMABS_SUM(%%mm6, %%mm7, %%mm0)\ | |
1074 "movq (%1), %%mm2 \n\t"\ | |
1075 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ | |
1076 "paddusw 64(%1), %%mm0 \n\t"\ | |
1077 "paddusw %%mm1, %%mm0 \n\t"\ | |
1078 \ | |
1079 HSUM(%%mm0, %%mm1, %0)\ | |
1080 \ | |
1081 : "=r" (sum)\ | |
1082 : "r"(temp)\ | |
1083 );\ | |
1084 return sum&0xFFFF;\ | |
1085 }\ | |
1086 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) | |
1087 | |
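[note] WRAPPER8_16_SQ (defined elsewhere in dsputil) builds the 16-wide scorer out of four 8x8 calls; the sketch below assumes that shape, with illustrative names standing in for the macro arguments:

    #include <stdint.h>

    /* An 8x8 scorer with dsputil's comparison signature; stands in
     * for hadamard8_diff_##cpu here (definition omitted). */
    static int hadamard8_diff(void *s, uint8_t *src1, uint8_t *src2,
                              int stride, int h);

    /* Hedged expansion of WRAPPER8_16_SQ(hadamard8_diff, ...):
     * two 8x8 scores across the top, two more if h == 16. */
    static int hadamard8_diff16(void *s, uint8_t *src1, uint8_t *src2,
                                int stride, int h)
    {
        int score  = hadamard8_diff(s, src1,     src2,     stride, 8);
        score     += hadamard8_diff(s, src1 + 8, src2 + 8, stride, 8);
        if (h == 16) {
            src1 += 8 * stride;
            src2 += 8 * stride;
            score += hadamard8_diff(s, src1,     src2,     stride, 8);
            score += hadamard8_diff(s, src1 + 8, src2 + 8, stride, 8);
        }
        return score;
    }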
1088 #define HADAMARD8_DIFF_SSE2(cpu) \ | |
1089 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ | |
1090 DECLARE_ALIGNED(16, uint64_t, temp)[4];\ | |
1091 int sum;\ | |
1092 \ | |
1093 assert(h==8);\ | |
1094 \ | |
1095 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\ | |
1096 \ | |
1097 __asm__ volatile(\ | |
1098 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\ | |
1099 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\ | |
1100 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\ | |
1101 MMABS_SUM_8x8\ | |
1102 HSUM_SSE2(%%xmm0, %%xmm1, %0)\ | |
1103 : "=r" (sum)\ | |
1104 : "r"(temp)\ | |
1105 );\ | |
1106 return sum&0xFFFF;\ | |
1107 }\ | |
1108 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) | |
1109 | |
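[note] Both deleted variants compute the same quantity: an 8x8 SATD, the sum of absolute values of the 2-D Hadamard transform of the block difference, with the final & 0xFFFF matching the saturated 16-bit accumulation. A plain-C reference (our sketch, not FFmpeg's C fallback):

    #include <stdint.h>
    #include <stdlib.h>

    /* Hedged scalar reference for hadamard8_diff: transform rows,
     * then columns, then sum |coefficients|. Uses int throughout,
     * sidestepping the 16-bit saturation the asm tolerates. */
    static int hadamard8_diff_ref(const uint8_t *src1, const uint8_t *src2,
                                  int stride)
    {
        int b[8][8], sum = 0;

        for (int r = 0; r < 8; r++)
            for (int c = 0; c < 8; c++)
                b[r][c] = src1[r * stride + c] - src2[r * stride + c];

        for (int r = 0; r < 8; r++)              /* transform rows */
            for (int step = 1; step < 8; step <<= 1)
                for (int c = 0; c < 8; c++)
                    if (!(c & step)) {
                        int a0 = b[r][c], a1 = b[r][c + step];
                        b[r][c]        = a0 + a1;
                        b[r][c + step] = a1 - a0;
                    }

        for (int c = 0; c < 8; c++)              /* transform columns */
            for (int step = 1; step < 8; step <<= 1)
                for (int r = 0; r < 8; r++)
                    if (!(r & step)) {
                        int a0 = b[r][c], a1 = b[r + step][c];
                        b[r][c]        = a0 + a1;
                        b[r + step][c] = a1 - a0;
                    }

        for (int r = 0; r < 8; r++)
            for (int c = 0; c < 8; c++)
                sum += abs(b[r][c]);

        return sum;
    }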
1110 #define MMABS(a,z) MMABS_MMX(a,z) | |
1111 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst) | |
1112 HADAMARD8_DIFF_MMX(mmx) | |
1113 #undef MMABS | |
1114 #undef HSUM | |
1115 | |
1116 #define MMABS(a,z) MMABS_MMX2(a,z) | |
1117 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2 | |
1118 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) | |
1119 HADAMARD8_DIFF_MMX(mmx2) | |
1120 HADAMARD8_DIFF_SSE2(sse2) | |
1121 #undef MMABS | |
1122 #undef MMABS_SUM_8x8 | |
1123 #undef HSUM | |
1124 | |
1125 #if HAVE_SSSE3 | |
1126 #define MMABS(a,z) MMABS_SSSE3(a,z) | |
1127 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL | |
1128 HADAMARD8_DIFF_SSE2(ssse3) | |
1129 #undef MMABS | |
1130 #undef MMABS_SUM_8x8 | |
1131 #endif | |
1132 | 938 |
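[note] The deleted block above instantiated one function per CPU generation by re-pointing MMABS/HSUM/MMABS_SUM_8x8 before each HADAMARD8_DIFF_* expansion and #undef-ing afterwards; the yasm port replaces that with real symbols (ff_hadamard8_diff{,16}_cpu) declared once in the right-hand column. The template-by-macro pattern in miniature (a toy, not FFmpeg code):

    /* Bind ABS_IMPL, stamp out a function, unbind for the next one. */
    #define ABS_IMPL(x) ((x) < 0 ? -(x) : (x))
    #define MAKE_SUM_ABS(cpu)                           \
    static int sum_abs_##cpu(const short *v, int n)     \
    {                                                   \
        int s = 0;                                      \
        for (int i = 0; i < n; i++)                     \
            s += ABS_IMPL(v[i]);                        \
        return s;                                       \
    }
    MAKE_SUM_ABS(c)   /* defines sum_abs_c() */
    #undef ABS_IMPL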
1133 #define DCT_SAD4(m,mm,o)\ | 939 #define DCT_SAD4(m,mm,o)\ |
1134 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ | 940 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ |
1135 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ | 941 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ |
1136 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ | 942 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ |
1310 c->pix_sum = pix_sum16_mmx; | 1116 c->pix_sum = pix_sum16_mmx; |
1311 | 1117 |
1312 c->diff_bytes= diff_bytes_mmx; | 1118 c->diff_bytes= diff_bytes_mmx; |
1313 c->sum_abs_dctelem= sum_abs_dctelem_mmx; | 1119 c->sum_abs_dctelem= sum_abs_dctelem_mmx; |
1314 | 1120 |
1315 c->hadamard8_diff[0]= hadamard8_diff16_mmx; | 1121 c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx; |
1316 c->hadamard8_diff[1]= hadamard8_diff_mmx; | 1122 c->hadamard8_diff[1]= ff_hadamard8_diff_mmx; |
1317 | 1123 |
1318 c->pix_norm1 = pix_norm1_mmx; | 1124 c->pix_norm1 = pix_norm1_mmx; |
1319 c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx; | 1125 c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx; |
1320 c->sse[1] = sse8_mmx; | 1126 c->sse[1] = sse8_mmx; |
1321 c->vsad[4]= vsad_intra16_mmx; | 1127 c->vsad[4]= vsad_intra16_mmx; |
1334 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; | 1140 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; |
1335 | 1141 |
1336 | 1142 |
1337 if (mm_flags & AV_CPU_FLAG_MMX2) { | 1143 if (mm_flags & AV_CPU_FLAG_MMX2) { |
1338 c->sum_abs_dctelem= sum_abs_dctelem_mmx2; | 1144 c->sum_abs_dctelem= sum_abs_dctelem_mmx2; |
1339 c->hadamard8_diff[0]= hadamard8_diff16_mmx2; | 1145 c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2; |
1340 c->hadamard8_diff[1]= hadamard8_diff_mmx2; | 1146 c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2; |
1341 c->vsad[4]= vsad_intra16_mmx2; | 1147 c->vsad[4]= vsad_intra16_mmx2; |
1342 | 1148 |
1343 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | 1149 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
1344 c->vsad[0] = vsad16_mmx2; | 1150 c->vsad[0] = vsad16_mmx2; |
1345 } | 1151 } |
1348 } | 1154 } |
1349 | 1155 |
1350 if(mm_flags & AV_CPU_FLAG_SSE2){ | 1156 if(mm_flags & AV_CPU_FLAG_SSE2){ |
1351 c->get_pixels = get_pixels_sse2; | 1157 c->get_pixels = get_pixels_sse2; |
1352 c->sum_abs_dctelem= sum_abs_dctelem_sse2; | 1158 c->sum_abs_dctelem= sum_abs_dctelem_sse2; |
1353 c->hadamard8_diff[0]= hadamard8_diff16_sse2; | 1159 c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2; |
1354 c->hadamard8_diff[1]= hadamard8_diff_sse2; | 1160 c->hadamard8_diff[1]= ff_hadamard8_diff_sse2; |
1355 } | 1161 } |
1356 | 1162 |
1357 if (CONFIG_LPC && mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) { | 1163 if (CONFIG_LPC && mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) { |
1358 c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2; | 1164 c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2; |
1359 } | 1165 } |
1363 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | 1169 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
1364 c->try_8x8basis= try_8x8basis_ssse3; | 1170 c->try_8x8basis= try_8x8basis_ssse3; |
1365 } | 1171 } |
1366 c->add_8x8basis= add_8x8basis_ssse3; | 1172 c->add_8x8basis= add_8x8basis_ssse3; |
1367 c->sum_abs_dctelem= sum_abs_dctelem_ssse3; | 1173 c->sum_abs_dctelem= sum_abs_dctelem_ssse3; |
1368 c->hadamard8_diff[0]= hadamard8_diff16_ssse3; | 1174 c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3; |
1369 c->hadamard8_diff[1]= hadamard8_diff_ssse3; | 1175 c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3; |
1370 } | 1176 } |
1371 #endif | 1177 #endif |
1372 | 1178 |
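[note] The init code on the right now wires the DSPContext pointers to the yasm symbols, with each later CPU-flag block overwriting the slower choice, so the fastest supported variant wins. Condensed restatement (hedged; mirrors the assignments above, assuming dsputil's DSPContext and libavutil's AV_CPU_FLAG_* as in the rest of the file):

    static void init_hadamard(DSPContext *c, int mm_flags)
    {
        /* baseline: MMX, then progressively better versions */
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx2;
            c->hadamard8_diff[1] = ff_hadamard8_diff_mmx2;
        }
        if (mm_flags & AV_CPU_FLAG_SSE2) {
            c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
            c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
        }
    #if HAVE_SSSE3
        if (mm_flags & AV_CPU_FLAG_SSSE3) {
            c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
            c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
        }
    #endif
    }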
1373 if(mm_flags & AV_CPU_FLAG_3DNOW){ | 1179 if(mm_flags & AV_CPU_FLAG_3DNOW){ |
1374 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | 1180 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |