comparison postproc/swscale.c @ 13720:821f464b4d90

adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
author aurel
date Thu, 21 Oct 2004 11:55:20 +0000
parents 992960f68af0
children 31cb219364a4
comparison
equal deleted inserted replaced
13719:43ecd6a73ec0 13720:821f464b4d90
143 143
144 #define ABS(a) ((a) > 0 ? (a) : (-(a))) 144 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
145 #define MIN(a,b) ((a) > (b) ? (b) : (a)) 145 #define MIN(a,b) ((a) > (b) ? (b) : (a))
146 #define MAX(a,b) ((a) < (b) ? (b) : (a)) 146 #define MAX(a,b) ((a) < (b) ? (b) : (a))
147 147
148 #ifdef ARCH_X86 148 #if defined(ARCH_X86) || defined(ARCH_X86_64)
149 static uint64_t attribute_used __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL; 149 static uint64_t attribute_used __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
150 static uint64_t attribute_used __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL; 150 static uint64_t attribute_used __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
151 static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL; 151 static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
152 static uint64_t attribute_used __attribute__((aligned(8))) w02= 0x0002000200020002LL; 152 static uint64_t attribute_used __attribute__((aligned(8))) w02= 0x0002000200020002LL;
153 static uint64_t attribute_used __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL; 153 static uint64_t attribute_used __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
202 extern const uint8_t dither_2x2_8[2][8]; 202 extern const uint8_t dither_2x2_8[2][8];
203 extern const uint8_t dither_8x8_32[8][8]; 203 extern const uint8_t dither_8x8_32[8][8];
204 extern const uint8_t dither_8x8_73[8][8]; 204 extern const uint8_t dither_8x8_73[8][8];
205 extern const uint8_t dither_8x8_220[8][8]; 205 extern const uint8_t dither_8x8_220[8][8];
206 206
207 #ifdef ARCH_X86 207 #if defined(ARCH_X86) || defined(ARCH_X86_64)
208 void in_asm_used_var_warning_killer() 208 void in_asm_used_var_warning_killer()
209 { 209 {
210 volatile int i= bF8+bFC+w10+ 210 volatile int i= bF8+bFC+w10+
211 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+ 211 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+
212 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101; 212 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
677 #ifdef HAVE_ALTIVEC 677 #ifdef HAVE_ALTIVEC
678 #define COMPILE_ALTIVEC 678 #define COMPILE_ALTIVEC
679 #endif //HAVE_ALTIVEC 679 #endif //HAVE_ALTIVEC
680 #endif //ARCH_POWERPC 680 #endif //ARCH_POWERPC
681 681
682 #ifdef ARCH_X86 682 #if defined(ARCH_X86) || defined(ARCH_X86_64)
683 683
684 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) 684 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
685 #define COMPILE_MMX 685 #define COMPILE_MMX
686 #endif 686 #endif
687 687
690 #endif 690 #endif
691 691
692 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) 692 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
693 #define COMPILE_3DNOW 693 #define COMPILE_3DNOW
694 #endif 694 #endif
695 #endif //ARCH_X86 695 #endif //ARCH_X86 || ARCH_X86_64
696 696
697 #undef HAVE_MMX 697 #undef HAVE_MMX
698 #undef HAVE_MMX2 698 #undef HAVE_MMX2
699 #undef HAVE_3DNOW 699 #undef HAVE_3DNOW
700 700
714 #define RENAME(a) a ## _altivec 714 #define RENAME(a) a ## _altivec
715 #include "swscale_template.c" 715 #include "swscale_template.c"
716 #endif 716 #endif
717 #endif //ARCH_POWERPC 717 #endif //ARCH_POWERPC
718 718
719 #ifdef ARCH_X86 719 #if defined(ARCH_X86) || defined(ARCH_X86_64)
720 720
721 //X86 versions 721 //X86 versions
722 /* 722 /*
723 #undef RENAME 723 #undef RENAME
724 #undef HAVE_MMX 724 #undef HAVE_MMX
756 #define HAVE_3DNOW 756 #define HAVE_3DNOW
757 #define RENAME(a) a ## _3DNow 757 #define RENAME(a) a ## _3DNow
758 #include "swscale_template.c" 758 #include "swscale_template.c"
759 #endif 759 #endif
760 760
761 #endif //ARCH_X86 761 #endif //ARCH_X86 || ARCH_X86_64
762 762
763 // minor note: the HAVE_xyz is messed up after that line so don't use it 763 // minor note: the HAVE_xyz is messed up after that line so don't use it
764 764
765 static double getSplineCoeff(double a, double b, double c, double d, double dist) 765 static double getSplineCoeff(double a, double b, double c, double d, double dist)
766 { 766 {
781 int filterSize; 781 int filterSize;
782 int filter2Size; 782 int filter2Size;
783 int minFilterSize; 783 int minFilterSize;
784 double *filter=NULL; 784 double *filter=NULL;
785 double *filter2=NULL; 785 double *filter2=NULL;
786 #ifdef ARCH_X86 786 #if defined(ARCH_X86) || defined(ARCH_X86_64)
787 if(flags & SWS_CPU_CAPS_MMX) 787 if(flags & SWS_CPU_CAPS_MMX)
788 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldn't be required but it IS (even for non mmx versions) 788 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldn't be required but it IS (even for non mmx versions)
789 #endif 789 #endif
790 790
791 // Note the +1 is for the MMXscaler which reads over the end 791 // Note the +1 is for the MMXscaler which reads over the end
1140 } 1140 }
1141 1141
1142 free(filter); 1142 free(filter);
1143 } 1143 }
1144 1144
1145 #ifdef ARCH_X86 1145 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1146 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits) 1146 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
1147 { 1147 {
1148 uint8_t *fragmentA; 1148 uint8_t *fragmentA;
1149 int imm8OfPShufW1A; 1149 long imm8OfPShufW1A;
1150 int imm8OfPShufW2A; 1150 long imm8OfPShufW2A;
1151 int fragmentLengthA; 1151 long fragmentLengthA;
1152 uint8_t *fragmentB; 1152 uint8_t *fragmentB;
1153 int imm8OfPShufW1B; 1153 long imm8OfPShufW1B;
1154 int imm8OfPShufW2B; 1154 long imm8OfPShufW2B;
1155 int fragmentLengthB; 1155 long fragmentLengthB;
1156 int fragmentPos; 1156 int fragmentPos;
1157 1157
1158 int xpos, i; 1158 int xpos, i;
1159 1159
1160 // create an optimized horizontal scaling routine 1160 // create an optimized horizontal scaling routine
1163 1163
1164 asm volatile( 1164 asm volatile(
1165 "jmp 9f \n\t" 1165 "jmp 9f \n\t"
1166 // Begin 1166 // Begin
1167 "0: \n\t" 1167 "0: \n\t"
1168 "movq (%%edx, %%eax), %%mm3 \n\t" 1168 "movq (%%"REG_d", %%"REG_a"), %%mm3\n\t"
1169 "movd (%%ecx, %%esi), %%mm0 \n\t" 1169 "movd (%%"REG_c", %%"REG_S"), %%mm0\n\t"
1170 "movd 1(%%ecx, %%esi), %%mm1 \n\t" 1170 "movd 1(%%"REG_c", %%"REG_S"), %%mm1\n\t"
1171 "punpcklbw %%mm7, %%mm1 \n\t" 1171 "punpcklbw %%mm7, %%mm1 \n\t"
1172 "punpcklbw %%mm7, %%mm0 \n\t" 1172 "punpcklbw %%mm7, %%mm0 \n\t"
1173 "pshufw $0xFF, %%mm1, %%mm1 \n\t" 1173 "pshufw $0xFF, %%mm1, %%mm1 \n\t"
1174 "1: \n\t" 1174 "1: \n\t"
1175 "pshufw $0xFF, %%mm0, %%mm0 \n\t" 1175 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
1176 "2: \n\t" 1176 "2: \n\t"
1177 "psubw %%mm1, %%mm0 \n\t" 1177 "psubw %%mm1, %%mm0 \n\t"
1178 "movl 8(%%ebx, %%eax), %%esi \n\t" 1178 "mov 8(%%"REG_b", %%"REG_a"), %%"REG_S"\n\t"
1179 "pmullw %%mm3, %%mm0 \n\t" 1179 "pmullw %%mm3, %%mm0 \n\t"
1180 "psllw $7, %%mm1 \n\t" 1180 "psllw $7, %%mm1 \n\t"
1181 "paddw %%mm1, %%mm0 \n\t" 1181 "paddw %%mm1, %%mm0 \n\t"
1182 1182
1183 "movq %%mm0, (%%edi, %%eax) \n\t" 1183 "movq %%mm0, (%%"REG_D", %%"REG_a")\n\t"
1184 1184
1185 "addl $8, %%eax \n\t" 1185 "add $8, %%"REG_a" \n\t"
1186 // End 1186 // End
1187 "9: \n\t" 1187 "9: \n\t"
1188 // "int $3\n\t" 1188 // "int $3\n\t"
1189 "leal 0b, %0 \n\t" 1189 "lea 0b, %0 \n\t"
1190 "leal 1b, %1 \n\t" 1190 "lea 1b, %1 \n\t"
1191 "leal 2b, %2 \n\t" 1191 "lea 2b, %2 \n\t"
1192 "decl %1 \n\t" 1192 "dec %1 \n\t"
1193 "decl %2 \n\t" 1193 "dec %2 \n\t"
1194 "subl %0, %1 \n\t" 1194 "sub %0, %1 \n\t"
1195 "subl %0, %2 \n\t" 1195 "sub %0, %2 \n\t"
1196 "leal 9b, %3 \n\t" 1196 "lea 9b, %3 \n\t"
1197 "subl %0, %3 \n\t" 1197 "sub %0, %3 \n\t"
1198 1198
1199 1199
1200 :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A), 1200 :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
1201 "=r" (fragmentLengthA) 1201 "=r" (fragmentLengthA)
1202 ); 1202 );
1203 1203
1204 asm volatile( 1204 asm volatile(
1205 "jmp 9f \n\t" 1205 "jmp 9f \n\t"
1206 // Begin 1206 // Begin
1207 "0: \n\t" 1207 "0: \n\t"
1208 "movq (%%edx, %%eax), %%mm3 \n\t" 1208 "movq (%%"REG_d", %%"REG_a"), %%mm3\n\t"
1209 "movd (%%ecx, %%esi), %%mm0 \n\t" 1209 "movd (%%"REG_c", %%"REG_S"), %%mm0\n\t"
1210 "punpcklbw %%mm7, %%mm0 \n\t" 1210 "punpcklbw %%mm7, %%mm0 \n\t"
1211 "pshufw $0xFF, %%mm0, %%mm1 \n\t" 1211 "pshufw $0xFF, %%mm0, %%mm1 \n\t"
1212 "1: \n\t" 1212 "1: \n\t"
1213 "pshufw $0xFF, %%mm0, %%mm0 \n\t" 1213 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
1214 "2: \n\t" 1214 "2: \n\t"
1215 "psubw %%mm1, %%mm0 \n\t" 1215 "psubw %%mm1, %%mm0 \n\t"
1216 "movl 8(%%ebx, %%eax), %%esi \n\t" 1216 "mov 8(%%"REG_b", %%"REG_a"), %%"REG_S"\n\t"
1217 "pmullw %%mm3, %%mm0 \n\t" 1217 "pmullw %%mm3, %%mm0 \n\t"
1218 "psllw $7, %%mm1 \n\t" 1218 "psllw $7, %%mm1 \n\t"
1219 "paddw %%mm1, %%mm0 \n\t" 1219 "paddw %%mm1, %%mm0 \n\t"
1220 1220
1221 "movq %%mm0, (%%edi, %%eax) \n\t" 1221 "movq %%mm0, (%%"REG_D", %%"REG_a")\n\t"
1222 1222
1223 "addl $8, %%eax \n\t" 1223 "add $8, %%"REG_a" \n\t"
1224 // End 1224 // End
1225 "9: \n\t" 1225 "9: \n\t"
1226 // "int $3\n\t" 1226 // "int $3\n\t"
1227 "leal 0b, %0 \n\t" 1227 "lea 0b, %0 \n\t"
1228 "leal 1b, %1 \n\t" 1228 "lea 1b, %1 \n\t"
1229 "leal 2b, %2 \n\t" 1229 "lea 2b, %2 \n\t"
1230 "decl %1 \n\t" 1230 "dec %1 \n\t"
1231 "decl %2 \n\t" 1231 "dec %2 \n\t"
1232 "subl %0, %1 \n\t" 1232 "sub %0, %1 \n\t"
1233 "subl %0, %2 \n\t" 1233 "sub %0, %2 \n\t"
1234 "leal 9b, %3 \n\t" 1234 "lea 9b, %3 \n\t"
1235 "subl %0, %3 \n\t" 1235 "sub %0, %3 \n\t"
1236 1236
1237 1237
1238 :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B), 1238 :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
1239 "=r" (fragmentLengthB) 1239 "=r" (fragmentLengthB)
1240 ); 1240 );
1311 } 1311 }
1312 xpos+=xInc; 1312 xpos+=xInc;
1313 } 1313 }
1314 filterPos[i/2]= xpos>>16; // needed to jump to the next part 1314 filterPos[i/2]= xpos>>16; // needed to jump to the next part
1315 } 1315 }
1316 #endif // ARCH_X86 1316 #endif // ARCH_X86 || ARCH_X86_64
1317 1317
1318 static void globalInit(){ 1318 static void globalInit(){
1319 // generating tables: 1319 // generating tables:
1320 int i; 1320 int i;
1321 for(i=0; i<768; i++){ 1321 for(i=0; i<768; i++){
1325 } 1325 }
1326 1326
1327 static SwsFunc getSwsFunc(int flags){ 1327 static SwsFunc getSwsFunc(int flags){
1328 1328
1329 #ifdef RUNTIME_CPUDETECT 1329 #ifdef RUNTIME_CPUDETECT
1330 #ifdef ARCH_X86 1330 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1331 // ordered per speed fastest first 1331 // ordered per speed fastest first
1332 if(flags & SWS_CPU_CAPS_MMX2) 1332 if(flags & SWS_CPU_CAPS_MMX2)
1333 return swScale_MMX2; 1333 return swScale_MMX2;
1334 else if(flags & SWS_CPU_CAPS_3DNOW) 1334 else if(flags & SWS_CPU_CAPS_3DNOW)
1335 return swScale_3DNow; 1335 return swScale_3DNow;
1753 int i; 1753 int i;
1754 int usesVFilter, usesHFilter; 1754 int usesVFilter, usesHFilter;
1755 int unscaled, needsDither; 1755 int unscaled, needsDither;
1756 int srcFormat, dstFormat; 1756 int srcFormat, dstFormat;
1757 SwsFilter dummyFilter= {NULL, NULL, NULL, NULL}; 1757 SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1758 #ifdef ARCH_X86 1758 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1759 if(flags & SWS_CPU_CAPS_MMX) 1759 if(flags & SWS_CPU_CAPS_MMX)
1760 asm volatile("emms\n\t"::: "memory"); 1760 asm volatile("emms\n\t"::: "memory");
1761 #endif 1761 #endif
1762 1762
1763 #ifndef RUNTIME_CPUDETECT //ensure that the flags match the compiled variant if cpudetect is off 1763 #ifndef RUNTIME_CPUDETECT //ensure that the flags match the compiled variant if cpudetect is off
1993 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, 1993 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1994 c->chrSrcW, c->chrDstW, filterAlign, 1<<14, 1994 c->chrSrcW, c->chrDstW, filterAlign, 1<<14,
1995 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, 1995 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
1996 srcFilter->chrH, dstFilter->chrH, c->param); 1996 srcFilter->chrH, dstFilter->chrH, c->param);
1997 1997
1998 #ifdef ARCH_X86 1998 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1999 // can't downscale !!! 1999 // can't downscale !!!
2000 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) 2000 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
2001 { 2001 {
2002 c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t)); 2002 c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t));
2003 c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t)); 2003 c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t));
2134 MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n"); 2134 MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
2135 } 2135 }
2136 } 2136 }
2137 else 2137 else
2138 { 2138 {
2139 #ifdef ARCH_X86 2139 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2140 MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n"); 2140 MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
2141 #else 2141 #else
2142 if(flags & SWS_FAST_BILINEAR) 2142 if(flags & SWS_FAST_BILINEAR)
2143 MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n"); 2143 MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
2144 else 2144 else