diff postproc/swscale_template.c @ 5452:eb87391a5292

overread in the mmx2 horizontal scaler fixed 2% faster horizontal mmx2 scaler
author michael
date Mon, 01 Apr 2002 14:01:22 +0000
parents 3cc0f4938be1
children 4b18bf35f153
line wrap: on
line diff
--- a/postproc/swscale_template.c	Mon Apr 01 13:26:38 2002 +0000
+++ b/postproc/swscale_template.c	Mon Apr 01 14:01:22 2002 +0000
@@ -2238,7 +2238,8 @@
 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
 				   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
 				   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
-				   int srcFormat, uint8_t *formatConvBuffer)
+				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
+				   int32_t *mmx2FilterPos)
 {
     if(srcFormat==IMGFMT_YUY2)
     {
@@ -2294,35 +2295,21 @@
 	{
 		asm volatile(
 			"pxor %%mm7, %%mm7		\n\t"
-			"pxor %%mm2, %%mm2		\n\t" // 2*xalpha
-			"movd %5, %%mm6			\n\t" // xInc&0xFFFF
-			"punpcklwd %%mm6, %%mm6		\n\t"
-			"punpcklwd %%mm6, %%mm6		\n\t"
-			"movq %%mm6, %%mm2		\n\t"
-			"psllq $16, %%mm2		\n\t"
-			"paddw %%mm6, %%mm2		\n\t"
-			"psllq $16, %%mm2		\n\t"
-			"paddw %%mm6, %%mm2		\n\t"
-			"psllq $16, %%mm2		\n\t" //0,t,2t,3t		t=xInc&0xFF
-			"movq %%mm2, %%mm4		\n\t"
-			"movd %4, %%mm6			\n\t" //(xInc*4)&0xFFFF
-			"punpcklwd %%mm6, %%mm6		\n\t"
-			"punpcklwd %%mm6, %%mm6		\n\t"
+			"movl %0, %%ecx			\n\t"
+			"movl %1, %%edi			\n\t"
+			"movl %2, %%edx			\n\t"
+			"movl %3, %%ebx			\n\t"
 			"xorl %%eax, %%eax		\n\t" // i
-			"movl %0, %%esi			\n\t" // src
-			"movl %1, %%edi			\n\t" // buf1
-			"movl %3, %%edx			\n\t" // (xInc*4)>>16
-			"xorl %%ecx, %%ecx		\n\t"
-			"xorl %%ebx, %%ebx		\n\t"
-			"movw %4, %%bx			\n\t" // (xInc*4)&0xFFFF
+			PREFETCH" (%%ecx)		\n\t"
+			PREFETCH" 32(%%ecx)		\n\t"
+			PREFETCH" 64(%%ecx)		\n\t"
 
 #define FUNNY_Y_CODE \
-			PREFETCH" 1024(%%esi)		\n\t"\
-			PREFETCH" 1056(%%esi)		\n\t"\
-			PREFETCH" 1088(%%esi)		\n\t"\
-			"call *%6			\n\t"\
-			"movq %%mm4, %%mm2		\n\t"\
-			"xorl %%ecx, %%ecx		\n\t"
+			"movl (%%ebx), %%esi		\n\t"\
+			"call *%4			\n\t"\
+			"addl (%%ebx, %%eax), %%ecx	\n\t"\
+			"addl %%eax, %%edi		\n\t"\
+			"xorl %%eax, %%eax		\n\t"\
 
 FUNNY_Y_CODE
 FUNNY_Y_CODE
@@ -2333,8 +2320,8 @@
 FUNNY_Y_CODE
 FUNNY_Y_CODE
 
-			:: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
-			"m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
+			:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
+			"m" (funnyYCode)
 			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
 		);
 		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
@@ -2402,7 +2389,8 @@
 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
 				   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
 				   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
-				   int srcFormat, uint8_t *formatConvBuffer)
+				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
+				   int32_t *mmx2FilterPos)
 {
     if(srcFormat==IMGFMT_YUY2)
     {
@@ -2469,65 +2457,44 @@
 	if(canMMX2BeUsed)
 	{
 		asm volatile(
-		"pxor %%mm7, %%mm7		\n\t"
-		"pxor %%mm2, %%mm2		\n\t" // 2*xalpha
-		"movd %5, %%mm6			\n\t" // xInc&0xFFFF
-		"punpcklwd %%mm6, %%mm6		\n\t"
-		"punpcklwd %%mm6, %%mm6		\n\t"
-		"movq %%mm6, %%mm2		\n\t"
-		"psllq $16, %%mm2		\n\t"
-		"paddw %%mm6, %%mm2		\n\t"
-		"psllq $16, %%mm2		\n\t"
-		"paddw %%mm6, %%mm2		\n\t"
-		"psllq $16, %%mm2		\n\t" //0,t,2t,3t		t=xInc&0xFFFF
-		"movq %%mm2, %%mm4		\n\t"
-		"movd %4, %%mm6			\n\t" //(xInc*4)&0xFFFF
-		"punpcklwd %%mm6, %%mm6		\n\t"
-		"punpcklwd %%mm6, %%mm6		\n\t"
-		"xorl %%eax, %%eax		\n\t" // i
-		"movl %0, %%esi			\n\t" // src
-		"movl %1, %%edi			\n\t" // buf1
-		"movl %3, %%edx			\n\t" // (xInc*4)>>16
-		"xorl %%ecx, %%ecx		\n\t"
-		"xorl %%ebx, %%ebx		\n\t"
-		"movw %4, %%bx			\n\t" // (xInc*4)&0xFFFF
+			"pxor %%mm7, %%mm7		\n\t"
+			"movl %0, %%ecx			\n\t"
+			"movl %1, %%edi			\n\t"
+			"movl %2, %%edx			\n\t"
+			"movl %3, %%ebx			\n\t"
+			"xorl %%eax, %%eax		\n\t" // i
+			PREFETCH" (%%ecx)		\n\t"
+			PREFETCH" 32(%%ecx)		\n\t"
+			PREFETCH" 64(%%ecx)		\n\t"
+
+#define FUNNY_UV_CODE \
+			"movl (%%ebx), %%esi		\n\t"\
+			"call *%4			\n\t"\
+			"addl (%%ebx, %%eax), %%ecx	\n\t"\
+			"addl %%eax, %%edi		\n\t"\
+			"xorl %%eax, %%eax		\n\t"\
 
-#define FUNNYUVCODE \
-			PREFETCH" 1024(%%esi)		\n\t"\
-			PREFETCH" 1056(%%esi)		\n\t"\
-			PREFETCH" 1088(%%esi)		\n\t"\
-			"call *%7			\n\t"\
-			"movq %%mm4, %%mm2	\n\t"\
-			"xorl %%ecx, %%ecx		\n\t"
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+			"xorl %%eax, %%eax		\n\t" // i
+			"movl %5, %%ecx			\n\t" // src
+			"movl %1, %%edi			\n\t" // buf1
+			"addl $4096, %%edi		\n\t"
+			PREFETCH" (%%ecx)		\n\t"
+			PREFETCH" 32(%%ecx)		\n\t"
+			PREFETCH" 64(%%ecx)		\n\t"
 
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-		"xorl %%eax, %%eax		\n\t" // i
-		"movl %6, %%esi			\n\t" // src
-		"movl %1, %%edi			\n\t" // buf1
-		"addl $4096, %%edi		\n\t"
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
 
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-		:: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
-		  "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
-		: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
-	);
+			:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
+			"m" (funnyUVCode), "m" (src2)
+			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
+		);
 		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
 		{
 //			printf("%d %d %d\n", dstWidth, i, srcW);
@@ -2749,7 +2716,8 @@
 //				printf("%d %d\n", lumBufIndex, vLumBufSize);
 				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
 						flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
-						funnyYCode, c->srcFormat, formatConvBuffer);
+						funnyYCode, c->srcFormat, formatConvBuffer, 
+						c->lumMmx2Filter, c->lumMmx2FilterPos);
 				lastInLumBuf++;
 			}
 			while(lastInChrBuf < lastChrSrcY)
@@ -2763,7 +2731,8 @@
 				//FIXME replace parameters through context struct (some at least)
 				RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
 						flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
-						funnyUVCode, c->srcFormat, formatConvBuffer);
+						funnyUVCode, c->srcFormat, formatConvBuffer, 
+						c->chrMmx2Filter, c->chrMmx2FilterPos);
 				lastInChrBuf++;
 			}
 			//wrap buf index around to stay inside the ring buffer
@@ -2787,7 +2756,8 @@
 				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
 				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
 						flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
-						funnyYCode, c->srcFormat, formatConvBuffer);
+						funnyYCode, c->srcFormat, formatConvBuffer, 
+						c->lumMmx2Filter, c->lumMmx2FilterPos);
 				lastInLumBuf++;
 			}
 			while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
@@ -2800,7 +2770,8 @@
 				ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
 				RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
 						flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
-						funnyUVCode, c->srcFormat, formatConvBuffer);
+						funnyUVCode, c->srcFormat, formatConvBuffer, 
+						c->chrMmx2Filter, c->chrMmx2FilterPos);
 				lastInChrBuf++;
 			}
 			//wrap buf index around to stay inside the ring buffer