diff postproc/swscale.c @ 13720:821f464b4d90

adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
author aurel
date Thu, 21 Oct 2004 11:55:20 +0000
parents 992960f68af0
children 31cb219364a4
line wrap: on
line diff
--- a/postproc/swscale.c	Thu Oct 21 11:36:20 2004 +0000
+++ b/postproc/swscale.c	Thu Oct 21 11:55:20 2004 +0000
@@ -145,7 +145,7 @@
 #define MIN(a,b) ((a) > (b) ? (b) : (a))
 #define MAX(a,b) ((a) < (b) ? (b) : (a))
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static uint64_t attribute_used __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
 static uint64_t attribute_used __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
 static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
@@ -204,7 +204,7 @@
 extern const uint8_t dither_8x8_73[8][8];
 extern const uint8_t dither_8x8_220[8][8];
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 void in_asm_used_var_warning_killer()
 {
  volatile int i= bF8+bFC+w10+
@@ -679,7 +679,7 @@
 #endif //HAVE_ALTIVEC
 #endif //ARCH_POWERPC
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 
 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 #define COMPILE_MMX
@@ -692,7 +692,7 @@
 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 #define COMPILE_3DNOW
 #endif
-#endif //ARCH_X86
+#endif //ARCH_X86 || ARCH_X86_64
 
 #undef HAVE_MMX
 #undef HAVE_MMX2
@@ -716,7 +716,7 @@
 #endif
 #endif //ARCH_POWERPC
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 
 //X86 versions
 /*
@@ -758,7 +758,7 @@
 #include "swscale_template.c"
 #endif
 
-#endif //ARCH_X86
+#endif //ARCH_X86 || ARCH_X86_64
 
 // minor note: the HAVE_xyz is messed up after that line so don't use it
 
@@ -783,7 +783,7 @@
 	int minFilterSize;
 	double *filter=NULL;
 	double *filter2=NULL;
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	if(flags & SWS_CPU_CAPS_MMX)
 		asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
 #endif
@@ -1142,17 +1142,17 @@
 	free(filter);
 }
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
 {
 	uint8_t *fragmentA;
-	int imm8OfPShufW1A;
-	int imm8OfPShufW2A;
-	int fragmentLengthA;
+	long imm8OfPShufW1A;
+	long imm8OfPShufW2A;
+	long fragmentLengthA;
 	uint8_t *fragmentB;
-	int imm8OfPShufW1B;
-	int imm8OfPShufW2B;
-	int fragmentLengthB;
+	long imm8OfPShufW1B;
+	long imm8OfPShufW2B;
+	long fragmentLengthB;
 	int fragmentPos;
 
 	int xpos, i;
@@ -1165,9 +1165,9 @@
 		"jmp 9f				\n\t"
 	// Begin
 		"0:				\n\t"
-		"movq (%%edx, %%eax), %%mm3	\n\t" 
-		"movd (%%ecx, %%esi), %%mm0	\n\t" 
-		"movd 1(%%ecx, %%esi), %%mm1	\n\t"
+		"movq (%%"REG_d", %%"REG_a"), %%mm3\n\t" 
+		"movd (%%"REG_c", %%"REG_S"), %%mm0\n\t" 
+		"movd 1(%%"REG_c", %%"REG_S"), %%mm1\n\t"
 		"punpcklbw %%mm7, %%mm1		\n\t"
 		"punpcklbw %%mm7, %%mm0		\n\t"
 		"pshufw $0xFF, %%mm1, %%mm1	\n\t"
@@ -1175,26 +1175,26 @@
 		"pshufw $0xFF, %%mm0, %%mm0	\n\t"
 		"2:				\n\t"
 		"psubw %%mm1, %%mm0		\n\t"
-		"movl 8(%%ebx, %%eax), %%esi	\n\t"
+		"mov 8(%%"REG_b", %%"REG_a"), %%"REG_S"\n\t"
 		"pmullw %%mm3, %%mm0		\n\t"
 		"psllw $7, %%mm1		\n\t"
 		"paddw %%mm1, %%mm0		\n\t"
 
-		"movq %%mm0, (%%edi, %%eax)	\n\t"
+		"movq %%mm0, (%%"REG_D", %%"REG_a")\n\t"
 
-		"addl $8, %%eax			\n\t"
+		"add $8, %%"REG_a"		\n\t"
 	// End
 		"9:				\n\t"
 //		"int $3\n\t"
-		"leal 0b, %0			\n\t"
-		"leal 1b, %1			\n\t"
-		"leal 2b, %2			\n\t"
-		"decl %1			\n\t"
-		"decl %2			\n\t"
-		"subl %0, %1			\n\t"
-		"subl %0, %2			\n\t"
-		"leal 9b, %3			\n\t"
-		"subl %0, %3			\n\t"
+		"lea 0b, %0			\n\t"
+		"lea 1b, %1			\n\t"
+		"lea 2b, %2			\n\t"
+		"dec %1				\n\t"
+		"dec %2				\n\t"
+		"sub %0, %1			\n\t"
+		"sub %0, %2			\n\t"
+		"lea 9b, %3			\n\t"
+		"sub %0, %3			\n\t"
 
 
 		:"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
@@ -1205,34 +1205,34 @@
 		"jmp 9f				\n\t"
 	// Begin
 		"0:				\n\t"
-		"movq (%%edx, %%eax), %%mm3	\n\t" 
-		"movd (%%ecx, %%esi), %%mm0	\n\t" 
+		"movq (%%"REG_d", %%"REG_a"), %%mm3\n\t" 
+		"movd (%%"REG_c", %%"REG_S"), %%mm0\n\t" 
 		"punpcklbw %%mm7, %%mm0		\n\t"
 		"pshufw $0xFF, %%mm0, %%mm1	\n\t"
 		"1:				\n\t"
 		"pshufw $0xFF, %%mm0, %%mm0	\n\t"
 		"2:				\n\t"
 		"psubw %%mm1, %%mm0		\n\t"
-		"movl 8(%%ebx, %%eax), %%esi	\n\t"
+		"mov 8(%%"REG_b", %%"REG_a"), %%"REG_S"\n\t"
 		"pmullw %%mm3, %%mm0		\n\t"
 		"psllw $7, %%mm1		\n\t"
 		"paddw %%mm1, %%mm0		\n\t"
 
-		"movq %%mm0, (%%edi, %%eax)	\n\t"
+		"movq %%mm0, (%%"REG_D", %%"REG_a")\n\t"
 
-		"addl $8, %%eax			\n\t"
+		"add $8, %%"REG_a"		\n\t"
 	// End
 		"9:				\n\t"
 //		"int $3\n\t"
-		"leal 0b, %0			\n\t"
-		"leal 1b, %1			\n\t"
-		"leal 2b, %2			\n\t"
-		"decl %1			\n\t"
-		"decl %2			\n\t"
-		"subl %0, %1			\n\t"
-		"subl %0, %2			\n\t"
-		"leal 9b, %3			\n\t"
-		"subl %0, %3			\n\t"
+		"lea 0b, %0			\n\t"
+		"lea 1b, %1			\n\t"
+		"lea 2b, %2			\n\t"
+		"dec %1				\n\t"
+		"dec %2				\n\t"
+		"sub %0, %1			\n\t"
+		"sub %0, %2			\n\t"
+		"lea 9b, %3			\n\t"
+		"sub %0, %3			\n\t"
 
 
 		:"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
@@ -1313,7 +1313,7 @@
 	}
 	filterPos[i/2]= xpos>>16; // needed to jump to the next part
 }
-#endif // ARCH_X86
+#endif // ARCH_X86 || ARCH_X86_64
 
 static void globalInit(){
     // generating tables:
@@ -1327,7 +1327,7 @@
 static SwsFunc getSwsFunc(int flags){
     
 #ifdef RUNTIME_CPUDETECT
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	// ordered per speed fasterst first
 	if(flags & SWS_CPU_CAPS_MMX2)
 		return swScale_MMX2;
@@ -1755,7 +1755,7 @@
 	int unscaled, needsDither;
 	int srcFormat, dstFormat;
 	SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	if(flags & SWS_CPU_CAPS_MMX)
 		asm volatile("emms\n\t"::: "memory");
 #endif
@@ -1995,7 +1995,7 @@
 				 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
 				 srcFilter->chrH, dstFilter->chrH, c->param);
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 // can't downscale !!!
 		if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
 		{
@@ -2136,7 +2136,7 @@
 		}
 		else
 		{
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 			MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
 #else
 			if(flags & SWS_FAST_BILINEAR)