Mercurial > mplayer.hg
comparison postproc/swscale.c @ 13720:821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
author | aurel |
---|---|
date | Thu, 21 Oct 2004 11:55:20 +0000 |
parents | 992960f68af0 |
children | 31cb219364a4 |
comparison
equal
deleted
inserted
replaced
13719:43ecd6a73ec0 | 13720:821f464b4d90 |
---|---|
143 | 143 |
144 #define ABS(a) ((a) > 0 ? (a) : (-(a))) | 144 #define ABS(a) ((a) > 0 ? (a) : (-(a))) |
145 #define MIN(a,b) ((a) > (b) ? (b) : (a)) | 145 #define MIN(a,b) ((a) > (b) ? (b) : (a)) |
146 #define MAX(a,b) ((a) < (b) ? (b) : (a)) | 146 #define MAX(a,b) ((a) < (b) ? (b) : (a)) |
147 | 147 |
148 #ifdef ARCH_X86 | 148 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
149 static uint64_t attribute_used __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL; | 149 static uint64_t attribute_used __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL; |
150 static uint64_t attribute_used __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL; | 150 static uint64_t attribute_used __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL; |
151 static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL; | 151 static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL; |
152 static uint64_t attribute_used __attribute__((aligned(8))) w02= 0x0002000200020002LL; | 152 static uint64_t attribute_used __attribute__((aligned(8))) w02= 0x0002000200020002LL; |
153 static uint64_t attribute_used __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL; | 153 static uint64_t attribute_used __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL; |
202 extern const uint8_t dither_2x2_8[2][8]; | 202 extern const uint8_t dither_2x2_8[2][8]; |
203 extern const uint8_t dither_8x8_32[8][8]; | 203 extern const uint8_t dither_8x8_32[8][8]; |
204 extern const uint8_t dither_8x8_73[8][8]; | 204 extern const uint8_t dither_8x8_73[8][8]; |
205 extern const uint8_t dither_8x8_220[8][8]; | 205 extern const uint8_t dither_8x8_220[8][8]; |
206 | 206 |
207 #ifdef ARCH_X86 | 207 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
208 void in_asm_used_var_warning_killer() | 208 void in_asm_used_var_warning_killer() |
209 { | 209 { |
210 volatile int i= bF8+bFC+w10+ | 210 volatile int i= bF8+bFC+w10+ |
211 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+ | 211 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+ |
212 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101; | 212 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101; |
677 #ifdef HAVE_ALTIVEC | 677 #ifdef HAVE_ALTIVEC |
678 #define COMPILE_ALTIVEC | 678 #define COMPILE_ALTIVEC |
679 #endif //HAVE_ALTIVEC | 679 #endif //HAVE_ALTIVEC |
680 #endif //ARCH_POWERPC | 680 #endif //ARCH_POWERPC |
681 | 681 |
682 #ifdef ARCH_X86 | 682 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
683 | 683 |
684 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) | 684 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) |
685 #define COMPILE_MMX | 685 #define COMPILE_MMX |
686 #endif | 686 #endif |
687 | 687 |
690 #endif | 690 #endif |
691 | 691 |
692 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) | 692 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) |
693 #define COMPILE_3DNOW | 693 #define COMPILE_3DNOW |
694 #endif | 694 #endif |
695 #endif //ARCH_X86 | 695 #endif //ARCH_X86 || ARCH_X86_64 |
696 | 696 |
697 #undef HAVE_MMX | 697 #undef HAVE_MMX |
698 #undef HAVE_MMX2 | 698 #undef HAVE_MMX2 |
699 #undef HAVE_3DNOW | 699 #undef HAVE_3DNOW |
700 | 700 |
714 #define RENAME(a) a ## _altivec | 714 #define RENAME(a) a ## _altivec |
715 #include "swscale_template.c" | 715 #include "swscale_template.c" |
716 #endif | 716 #endif |
717 #endif //ARCH_POWERPC | 717 #endif //ARCH_POWERPC |
718 | 718 |
719 #ifdef ARCH_X86 | 719 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
720 | 720 |
721 //X86 versions | 721 //X86 versions |
722 /* | 722 /* |
723 #undef RENAME | 723 #undef RENAME |
724 #undef HAVE_MMX | 724 #undef HAVE_MMX |
756 #define HAVE_3DNOW | 756 #define HAVE_3DNOW |
757 #define RENAME(a) a ## _3DNow | 757 #define RENAME(a) a ## _3DNow |
758 #include "swscale_template.c" | 758 #include "swscale_template.c" |
759 #endif | 759 #endif |
760 | 760 |
761 #endif //ARCH_X86 | 761 #endif //ARCH_X86 || ARCH_X86_64 |
762 | 762 |
763 // minor note: the HAVE_xyz is messed up after that line so don't use it | 763 // minor note: the HAVE_xyz is messed up after that line so don't use it |
764 | 764 |
765 static double getSplineCoeff(double a, double b, double c, double d, double dist) | 765 static double getSplineCoeff(double a, double b, double c, double d, double dist) |
766 { | 766 { |
781 int filterSize; | 781 int filterSize; |
782 int filter2Size; | 782 int filter2Size; |
783 int minFilterSize; | 783 int minFilterSize; |
784 double *filter=NULL; | 784 double *filter=NULL; |
785 double *filter2=NULL; | 785 double *filter2=NULL; |
786 #ifdef ARCH_X86 | 786 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
787 if(flags & SWS_CPU_CAPS_MMX) | 787 if(flags & SWS_CPU_CAPS_MMX) |
788 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldn't be required but it IS (even for non mmx versions) | 788 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldn't be required but it IS (even for non mmx versions) |
789 #endif | 789 #endif |
790 | 790 |
791 // Note the +1 is for the MMXscaler which reads over the end | 791 // Note the +1 is for the MMXscaler which reads over the end |
1140 } | 1140 } |
1141 | 1141 |
1142 free(filter); | 1142 free(filter); |
1143 } | 1143 } |
1144 | 1144 |
1145 #ifdef ARCH_X86 | 1145 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1146 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits) | 1146 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits) |
1147 { | 1147 { |
1148 uint8_t *fragmentA; | 1148 uint8_t *fragmentA; |
1149 int imm8OfPShufW1A; | 1149 long imm8OfPShufW1A; |
1150 int imm8OfPShufW2A; | 1150 long imm8OfPShufW2A; |
1151 int fragmentLengthA; | 1151 long fragmentLengthA; |
1152 uint8_t *fragmentB; | 1152 uint8_t *fragmentB; |
1153 int imm8OfPShufW1B; | 1153 long imm8OfPShufW1B; |
1154 int imm8OfPShufW2B; | 1154 long imm8OfPShufW2B; |
1155 int fragmentLengthB; | 1155 long fragmentLengthB; |
1156 int fragmentPos; | 1156 int fragmentPos; |
1157 | 1157 |
1158 int xpos, i; | 1158 int xpos, i; |
1159 | 1159 |
1160 // create an optimized horizontal scaling routine | 1160 // create an optimized horizontal scaling routine |
1163 | 1163 |
1164 asm volatile( | 1164 asm volatile( |
1165 "jmp 9f \n\t" | 1165 "jmp 9f \n\t" |
1166 // Begin | 1166 // Begin |
1167 "0: \n\t" | 1167 "0: \n\t" |
1168 "movq (%%edx, %%eax), %%mm3 \n\t" | 1168 "movq (%%"REG_d", %%"REG_a"), %%mm3\n\t" |
1169 "movd (%%ecx, %%esi), %%mm0 \n\t" | 1169 "movd (%%"REG_c", %%"REG_S"), %%mm0\n\t" |
1170 "movd 1(%%ecx, %%esi), %%mm1 \n\t" | 1170 "movd 1(%%"REG_c", %%"REG_S"), %%mm1\n\t" |
1171 "punpcklbw %%mm7, %%mm1 \n\t" | 1171 "punpcklbw %%mm7, %%mm1 \n\t" |
1172 "punpcklbw %%mm7, %%mm0 \n\t" | 1172 "punpcklbw %%mm7, %%mm0 \n\t" |
1173 "pshufw $0xFF, %%mm1, %%mm1 \n\t" | 1173 "pshufw $0xFF, %%mm1, %%mm1 \n\t" |
1174 "1: \n\t" | 1174 "1: \n\t" |
1175 "pshufw $0xFF, %%mm0, %%mm0 \n\t" | 1175 "pshufw $0xFF, %%mm0, %%mm0 \n\t" |
1176 "2: \n\t" | 1176 "2: \n\t" |
1177 "psubw %%mm1, %%mm0 \n\t" | 1177 "psubw %%mm1, %%mm0 \n\t" |
1178 "movl 8(%%ebx, %%eax), %%esi \n\t" | 1178 "mov 8(%%"REG_b", %%"REG_a"), %%"REG_S"\n\t" |
1179 "pmullw %%mm3, %%mm0 \n\t" | 1179 "pmullw %%mm3, %%mm0 \n\t" |
1180 "psllw $7, %%mm1 \n\t" | 1180 "psllw $7, %%mm1 \n\t" |
1181 "paddw %%mm1, %%mm0 \n\t" | 1181 "paddw %%mm1, %%mm0 \n\t" |
1182 | 1182 |
1183 "movq %%mm0, (%%edi, %%eax) \n\t" | 1183 "movq %%mm0, (%%"REG_D", %%"REG_a")\n\t" |
1184 | 1184 |
1185 "addl $8, %%eax \n\t" | 1185 "add $8, %%"REG_a" \n\t" |
1186 // End | 1186 // End |
1187 "9: \n\t" | 1187 "9: \n\t" |
1188 // "int $3\n\t" | 1188 // "int $3\n\t" |
1189 "leal 0b, %0 \n\t" | 1189 "lea 0b, %0 \n\t" |
1190 "leal 1b, %1 \n\t" | 1190 "lea 1b, %1 \n\t" |
1191 "leal 2b, %2 \n\t" | 1191 "lea 2b, %2 \n\t" |
1192 "decl %1 \n\t" | 1192 "dec %1 \n\t" |
1193 "decl %2 \n\t" | 1193 "dec %2 \n\t" |
1194 "subl %0, %1 \n\t" | 1194 "sub %0, %1 \n\t" |
1195 "subl %0, %2 \n\t" | 1195 "sub %0, %2 \n\t" |
1196 "leal 9b, %3 \n\t" | 1196 "lea 9b, %3 \n\t" |
1197 "subl %0, %3 \n\t" | 1197 "sub %0, %3 \n\t" |
1198 | 1198 |
1199 | 1199 |
1200 :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A), | 1200 :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A), |
1201 "=r" (fragmentLengthA) | 1201 "=r" (fragmentLengthA) |
1202 ); | 1202 ); |
1203 | 1203 |
1204 asm volatile( | 1204 asm volatile( |
1205 "jmp 9f \n\t" | 1205 "jmp 9f \n\t" |
1206 // Begin | 1206 // Begin |
1207 "0: \n\t" | 1207 "0: \n\t" |
1208 "movq (%%edx, %%eax), %%mm3 \n\t" | 1208 "movq (%%"REG_d", %%"REG_a"), %%mm3\n\t" |
1209 "movd (%%ecx, %%esi), %%mm0 \n\t" | 1209 "movd (%%"REG_c", %%"REG_S"), %%mm0\n\t" |
1210 "punpcklbw %%mm7, %%mm0 \n\t" | 1210 "punpcklbw %%mm7, %%mm0 \n\t" |
1211 "pshufw $0xFF, %%mm0, %%mm1 \n\t" | 1211 "pshufw $0xFF, %%mm0, %%mm1 \n\t" |
1212 "1: \n\t" | 1212 "1: \n\t" |
1213 "pshufw $0xFF, %%mm0, %%mm0 \n\t" | 1213 "pshufw $0xFF, %%mm0, %%mm0 \n\t" |
1214 "2: \n\t" | 1214 "2: \n\t" |
1215 "psubw %%mm1, %%mm0 \n\t" | 1215 "psubw %%mm1, %%mm0 \n\t" |
1216 "movl 8(%%ebx, %%eax), %%esi \n\t" | 1216 "mov 8(%%"REG_b", %%"REG_a"), %%"REG_S"\n\t" |
1217 "pmullw %%mm3, %%mm0 \n\t" | 1217 "pmullw %%mm3, %%mm0 \n\t" |
1218 "psllw $7, %%mm1 \n\t" | 1218 "psllw $7, %%mm1 \n\t" |
1219 "paddw %%mm1, %%mm0 \n\t" | 1219 "paddw %%mm1, %%mm0 \n\t" |
1220 | 1220 |
1221 "movq %%mm0, (%%edi, %%eax) \n\t" | 1221 "movq %%mm0, (%%"REG_D", %%"REG_a")\n\t" |
1222 | 1222 |
1223 "addl $8, %%eax \n\t" | 1223 "add $8, %%"REG_a" \n\t" |
1224 // End | 1224 // End |
1225 "9: \n\t" | 1225 "9: \n\t" |
1226 // "int $3\n\t" | 1226 // "int $3\n\t" |
1227 "leal 0b, %0 \n\t" | 1227 "lea 0b, %0 \n\t" |
1228 "leal 1b, %1 \n\t" | 1228 "lea 1b, %1 \n\t" |
1229 "leal 2b, %2 \n\t" | 1229 "lea 2b, %2 \n\t" |
1230 "decl %1 \n\t" | 1230 "dec %1 \n\t" |
1231 "decl %2 \n\t" | 1231 "dec %2 \n\t" |
1232 "subl %0, %1 \n\t" | 1232 "sub %0, %1 \n\t" |
1233 "subl %0, %2 \n\t" | 1233 "sub %0, %2 \n\t" |
1234 "leal 9b, %3 \n\t" | 1234 "lea 9b, %3 \n\t" |
1235 "subl %0, %3 \n\t" | 1235 "sub %0, %3 \n\t" |
1236 | 1236 |
1237 | 1237 |
1238 :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B), | 1238 :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B), |
1239 "=r" (fragmentLengthB) | 1239 "=r" (fragmentLengthB) |
1240 ); | 1240 ); |
1311 } | 1311 } |
1312 xpos+=xInc; | 1312 xpos+=xInc; |
1313 } | 1313 } |
1314 filterPos[i/2]= xpos>>16; // needed to jump to the next part | 1314 filterPos[i/2]= xpos>>16; // needed to jump to the next part |
1315 } | 1315 } |
1316 #endif // ARCH_X86 | 1316 #endif // ARCH_X86 || ARCH_X86_64 |
1317 | 1317 |
1318 static void globalInit(){ | 1318 static void globalInit(){ |
1319 // generating tables: | 1319 // generating tables: |
1320 int i; | 1320 int i; |
1321 for(i=0; i<768; i++){ | 1321 for(i=0; i<768; i++){ |
1325 } | 1325 } |
1326 | 1326 |
1327 static SwsFunc getSwsFunc(int flags){ | 1327 static SwsFunc getSwsFunc(int flags){ |
1328 | 1328 |
1329 #ifdef RUNTIME_CPUDETECT | 1329 #ifdef RUNTIME_CPUDETECT |
1330 #ifdef ARCH_X86 | 1330 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1331 // ordered by speed, fastest first | 1331 // ordered by speed, fastest first |
1332 if(flags & SWS_CPU_CAPS_MMX2) | 1332 if(flags & SWS_CPU_CAPS_MMX2) |
1333 return swScale_MMX2; | 1333 return swScale_MMX2; |
1334 else if(flags & SWS_CPU_CAPS_3DNOW) | 1334 else if(flags & SWS_CPU_CAPS_3DNOW) |
1335 return swScale_3DNow; | 1335 return swScale_3DNow; |
1753 int i; | 1753 int i; |
1754 int usesVFilter, usesHFilter; | 1754 int usesVFilter, usesHFilter; |
1755 int unscaled, needsDither; | 1755 int unscaled, needsDither; |
1756 int srcFormat, dstFormat; | 1756 int srcFormat, dstFormat; |
1757 SwsFilter dummyFilter= {NULL, NULL, NULL, NULL}; | 1757 SwsFilter dummyFilter= {NULL, NULL, NULL, NULL}; |
1758 #ifdef ARCH_X86 | 1758 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1759 if(flags & SWS_CPU_CAPS_MMX) | 1759 if(flags & SWS_CPU_CAPS_MMX) |
1760 asm volatile("emms\n\t"::: "memory"); | 1760 asm volatile("emms\n\t"::: "memory"); |
1761 #endif | 1761 #endif |
1762 | 1762 |
1763 #ifndef RUNTIME_CPUDETECT //ensure that the flags match the compiled variant if cpudetect is off | 1763 #ifndef RUNTIME_CPUDETECT //ensure that the flags match the compiled variant if cpudetect is off |
1993 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, | 1993 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, |
1994 c->chrSrcW, c->chrDstW, filterAlign, 1<<14, | 1994 c->chrSrcW, c->chrDstW, filterAlign, 1<<14, |
1995 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, | 1995 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, |
1996 srcFilter->chrH, dstFilter->chrH, c->param); | 1996 srcFilter->chrH, dstFilter->chrH, c->param); |
1997 | 1997 |
1998 #ifdef ARCH_X86 | 1998 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1999 // can't downscale !!! | 1999 // can't downscale !!! |
2000 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) | 2000 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) |
2001 { | 2001 { |
2002 c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t)); | 2002 c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t)); |
2003 c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t)); | 2003 c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t)); |
2134 MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n"); | 2134 MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n"); |
2135 } | 2135 } |
2136 } | 2136 } |
2137 else | 2137 else |
2138 { | 2138 { |
2139 #ifdef ARCH_X86 | 2139 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
2140 MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n"); | 2140 MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n"); |
2141 #else | 2141 #else |
2142 if(flags & SWS_FAST_BILINEAR) | 2142 if(flags & SWS_FAST_BILINEAR) |
2143 MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n"); | 2143 MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n"); |
2144 else | 2144 else |