changeset 5452:eb87391a5292

overread in the mmx2 horizontal scaler fixed; 2% faster horizontal mmx2 scaler
author michael
date Mon, 01 Apr 2002 14:01:22 +0000
parents b716977c47d9
children 0e63c2f19ba6
files postproc/swscale.c postproc/swscale.h postproc/swscale_template.c
diffstat 3 files changed, 195 insertions(+), 123 deletions(-) [+]
line wrap: on
line diff
--- a/postproc/swscale.c	Mon Apr 01 13:26:38 2002 +0000
+++ b/postproc/swscale.c	Mon Apr 01 14:01:22 2002 +0000
@@ -117,10 +117,6 @@
 extern int verbose; // defined in mplayer.c
 /*
 NOTES
-
-known BUGS with known cause (no bugreports please!, but patches are welcome :) )
-horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11)
-
 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
 
 TODO
@@ -1020,12 +1016,17 @@
 }
 
 #ifdef ARCH_X86
-static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
+static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
 {
-	uint8_t *fragment;
-	int imm8OfPShufW1;
-	int imm8OfPShufW2;
-	int fragmentLength;
+	uint8_t *fragmentA;
+	int imm8OfPShufW1A;
+	int imm8OfPShufW2A;
+	int fragmentLengthA;
+	uint8_t *fragmentB;
+	int imm8OfPShufW1B;
+	int imm8OfPShufW2B;
+	int fragmentLengthB;
+	int fragmentPos;
 
 	int xpos, i;
 
@@ -1037,22 +1038,18 @@
 		"jmp 9f				\n\t"
 	// Begin
 		"0:				\n\t"
-		"movq (%%esi), %%mm0		\n\t" //FIXME Alignment
-		"movq %%mm0, %%mm1		\n\t"
-		"psrlq $8, %%mm0		\n\t"
-		"punpcklbw %%mm7, %%mm1	\n\t"
-		"movq %%mm2, %%mm3		\n\t"
-		"punpcklbw %%mm7, %%mm0	\n\t"
-		"addw %%bx, %%cx		\n\t" //2*xalpha += (4*lumXInc)&0xFFFF
+		"movq (%%edx, %%eax), %%mm3	\n\t" 
+		"movd (%%ecx, %%esi), %%mm0	\n\t" 
+		"movd 1(%%ecx, %%esi), %%mm1	\n\t"
+		"punpcklbw %%mm7, %%mm1		\n\t"
+		"punpcklbw %%mm7, %%mm0		\n\t"
 		"pshufw $0xFF, %%mm1, %%mm1	\n\t"
 		"1:				\n\t"
-		"adcl %%edx, %%esi		\n\t" //xx+= (4*lumXInc)>>16 + carry
 		"pshufw $0xFF, %%mm0, %%mm0	\n\t"
 		"2:				\n\t"
-		"psrlw $9, %%mm3		\n\t"
 		"psubw %%mm1, %%mm0		\n\t"
+		"movl 8(%%ebx, %%eax), %%esi	\n\t"
 		"pmullw %%mm3, %%mm0		\n\t"
-		"paddw %%mm6, %%mm2		\n\t" // 2*alpha += xpos&0xFFFF
 		"psllw $7, %%mm1		\n\t"
 		"paddw %%mm1, %%mm0		\n\t"
 
@@ -1071,13 +1068,54 @@
 		"subl %0, %2			\n\t"
 		"leal 9b, %3			\n\t"
 		"subl %0, %3			\n\t"
-		:"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
-		"=r" (fragmentLength)
+
+
+		:"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
+		"=r" (fragmentLengthA)
+	);
+
+	asm volatile(
+		"jmp 9f				\n\t"
+	// Begin
+		"0:				\n\t"
+		"movq (%%edx, %%eax), %%mm3	\n\t" 
+		"movd (%%ecx, %%esi), %%mm0	\n\t" 
+		"punpcklbw %%mm7, %%mm0		\n\t"
+		"pshufw $0xFF, %%mm0, %%mm1	\n\t"
+		"1:				\n\t"
+		"pshufw $0xFF, %%mm0, %%mm0	\n\t"
+		"2:				\n\t"
+		"psubw %%mm1, %%mm0		\n\t"
+		"movl 8(%%ebx, %%eax), %%esi	\n\t"
+		"pmullw %%mm3, %%mm0		\n\t"
+		"psllw $7, %%mm1		\n\t"
+		"paddw %%mm1, %%mm0		\n\t"
+
+		"movq %%mm0, (%%edi, %%eax)	\n\t"
+
+		"addl $8, %%eax			\n\t"
+	// End
+		"9:				\n\t"
+//		"int $3\n\t"
+		"leal 0b, %0			\n\t"
+		"leal 1b, %1			\n\t"
+		"leal 2b, %2			\n\t"
+		"decl %1			\n\t"
+		"decl %2			\n\t"
+		"subl %0, %1			\n\t"
+		"subl %0, %2			\n\t"
+		"leal 9b, %3			\n\t"
+		"subl %0, %3			\n\t"
+
+
+		:"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
+		"=r" (fragmentLengthB)
 	);
 
 	xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
-
-	for(i=0; i<dstW/8; i++)
+	fragmentPos=0;
+	
+	for(i=0; i<dstW/numSplits; i++)
 	{
 		int xx=xpos>>16;
 
@@ -1088,20 +1126,65 @@
 			int c=((xpos+xInc*2)>>16) - xx;
 			int d=((xpos+xInc*3)>>16) - xx;
 
-			memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
+			filter[i  ] = (( xpos         & 0xFFFF) ^ 0xFFFF)>>9;
+			filter[i+1] = (((xpos+xInc  ) & 0xFFFF) ^ 0xFFFF)>>9;
+			filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
+			filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
+			filterPos[i/2]= xx;
+
+			if(d+1<4)
+			{
+				int maxShift= 3-(d+1);
+				int shift=0;
+
+				memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
 
-			funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
-			funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
-				a | (b<<2) | (c<<4) | (d<<6);
+				funnyCode[fragmentPos + imm8OfPShufW1B]=
+					(a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
+				funnyCode[fragmentPos + imm8OfPShufW2B]=
+					a | (b<<2) | (c<<4) | (d<<6);
+
+				if(i+3>=dstW) shift=maxShift; //avoid overread
+				else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
+
+				if(shift && i>=shift)
+				{
+					funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
+					funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
+					filterPos[i/2]-=shift;
+				}
 
-			// if we dont need to read 8 bytes than dont :), reduces the chance of
-			// crossing a cache line
-			if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
+				fragmentPos+= fragmentLengthB;
+			}
+			else
+			{
+				int maxShift= 3-d;
+				int shift=0;
+
+				memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
+
+				funnyCode[fragmentPos + imm8OfPShufW1A]=
+				funnyCode[fragmentPos + imm8OfPShufW2A]=
+					a | (b<<2) | (c<<4) | (d<<6);
 
-			funnyCode[fragmentLength*(i+4)/4]= RET;
+				if(i+4>=dstW) shift=maxShift; //avoid overread
+				else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
+
+				if(shift && i>=shift)
+				{
+					funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
+					funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
+					filterPos[i/2]-=shift;
+				}
+
+				fragmentPos+= fragmentLengthA;
+			}
+
+			funnyCode[fragmentPos]= RET;
 		}
 		xpos+=xInc;
 	}
+	filterPos[i/2]= xpos>>16; // needed to jump to the next part
 }
 #endif // ARCH_X86
 
@@ -1565,8 +1648,13 @@
 // cant downscale !!!
 		if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
 		{
-			initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode);
-			initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
+			c->lumMmx2Filter   = (int16_t*)memalign(8, (dstW        /8+8)*sizeof(int16_t));
+			c->chrMmx2Filter   = (int16_t*)memalign(8, (c->chrDstW  /4+8)*sizeof(int16_t));
+			c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW      /2/8+8)*sizeof(int32_t));
+			c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
+
+			initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
+			initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
 		}
 #endif
 	} // Init Horizontal stuff
@@ -2014,6 +2102,15 @@
 	if(c->chrMmxFilter) free(c->chrMmxFilter);
 	c->chrMmxFilter = NULL;
 
+	if(c->lumMmx2Filter) free(c->lumMmx2Filter);
+	c->lumMmx2Filter=NULL;
+	if(c->chrMmx2Filter) free(c->chrMmx2Filter);
+	c->chrMmx2Filter=NULL;
+	if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
+	c->lumMmx2FilterPos=NULL;
+	if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
+	c->chrMmx2FilterPos=NULL;
+
 	free(c);
 }
 
--- a/postproc/swscale.h	Mon Apr 01 13:26:38 2002 +0000
+++ b/postproc/swscale.h	Mon Apr 01 14:01:22 2002 +0000
@@ -69,6 +69,10 @@
 
 	uint8_t __attribute__((aligned(32))) funnyYCode[10000];
 	uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
+	int32_t *lumMmx2FilterPos;
+	int32_t *chrMmx2FilterPos;
+	int16_t *lumMmx2Filter;
+	int16_t *chrMmx2Filter;
 
 	int canMMX2BeUsed;
 
--- a/postproc/swscale_template.c	Mon Apr 01 13:26:38 2002 +0000
+++ b/postproc/swscale_template.c	Mon Apr 01 14:01:22 2002 +0000
@@ -2238,7 +2238,8 @@
 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
 				   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
 				   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
-				   int srcFormat, uint8_t *formatConvBuffer)
+				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
+				   int32_t *mmx2FilterPos)
 {
     if(srcFormat==IMGFMT_YUY2)
     {
@@ -2294,35 +2295,21 @@
 	{
 		asm volatile(
 			"pxor %%mm7, %%mm7		\n\t"
-			"pxor %%mm2, %%mm2		\n\t" // 2*xalpha
-			"movd %5, %%mm6			\n\t" // xInc&0xFFFF
-			"punpcklwd %%mm6, %%mm6		\n\t"
-			"punpcklwd %%mm6, %%mm6		\n\t"
-			"movq %%mm6, %%mm2		\n\t"
-			"psllq $16, %%mm2		\n\t"
-			"paddw %%mm6, %%mm2		\n\t"
-			"psllq $16, %%mm2		\n\t"
-			"paddw %%mm6, %%mm2		\n\t"
-			"psllq $16, %%mm2		\n\t" //0,t,2t,3t		t=xInc&0xFF
-			"movq %%mm2, %%mm4		\n\t"
-			"movd %4, %%mm6			\n\t" //(xInc*4)&0xFFFF
-			"punpcklwd %%mm6, %%mm6		\n\t"
-			"punpcklwd %%mm6, %%mm6		\n\t"
+			"movl %0, %%ecx			\n\t"
+			"movl %1, %%edi			\n\t"
+			"movl %2, %%edx			\n\t"
+			"movl %3, %%ebx			\n\t"
 			"xorl %%eax, %%eax		\n\t" // i
-			"movl %0, %%esi			\n\t" // src
-			"movl %1, %%edi			\n\t" // buf1
-			"movl %3, %%edx			\n\t" // (xInc*4)>>16
-			"xorl %%ecx, %%ecx		\n\t"
-			"xorl %%ebx, %%ebx		\n\t"
-			"movw %4, %%bx			\n\t" // (xInc*4)&0xFFFF
+			PREFETCH" (%%ecx)		\n\t"
+			PREFETCH" 32(%%ecx)		\n\t"
+			PREFETCH" 64(%%ecx)		\n\t"
 
 #define FUNNY_Y_CODE \
-			PREFETCH" 1024(%%esi)		\n\t"\
-			PREFETCH" 1056(%%esi)		\n\t"\
-			PREFETCH" 1088(%%esi)		\n\t"\
-			"call *%6			\n\t"\
-			"movq %%mm4, %%mm2		\n\t"\
-			"xorl %%ecx, %%ecx		\n\t"
+			"movl (%%ebx), %%esi		\n\t"\
+			"call *%4			\n\t"\
+			"addl (%%ebx, %%eax), %%ecx	\n\t"\
+			"addl %%eax, %%edi		\n\t"\
+			"xorl %%eax, %%eax		\n\t"\
 
 FUNNY_Y_CODE
 FUNNY_Y_CODE
@@ -2333,8 +2320,8 @@
 FUNNY_Y_CODE
 FUNNY_Y_CODE
 
-			:: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
-			"m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
+			:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
+			"m" (funnyYCode)
 			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
 		);
 		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
@@ -2402,7 +2389,8 @@
 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
 				   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
 				   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
-				   int srcFormat, uint8_t *formatConvBuffer)
+				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
+				   int32_t *mmx2FilterPos)
 {
     if(srcFormat==IMGFMT_YUY2)
     {
@@ -2469,65 +2457,44 @@
 	if(canMMX2BeUsed)
 	{
 		asm volatile(
-		"pxor %%mm7, %%mm7		\n\t"
-		"pxor %%mm2, %%mm2		\n\t" // 2*xalpha
-		"movd %5, %%mm6			\n\t" // xInc&0xFFFF
-		"punpcklwd %%mm6, %%mm6		\n\t"
-		"punpcklwd %%mm6, %%mm6		\n\t"
-		"movq %%mm6, %%mm2		\n\t"
-		"psllq $16, %%mm2		\n\t"
-		"paddw %%mm6, %%mm2		\n\t"
-		"psllq $16, %%mm2		\n\t"
-		"paddw %%mm6, %%mm2		\n\t"
-		"psllq $16, %%mm2		\n\t" //0,t,2t,3t		t=xInc&0xFFFF
-		"movq %%mm2, %%mm4		\n\t"
-		"movd %4, %%mm6			\n\t" //(xInc*4)&0xFFFF
-		"punpcklwd %%mm6, %%mm6		\n\t"
-		"punpcklwd %%mm6, %%mm6		\n\t"
-		"xorl %%eax, %%eax		\n\t" // i
-		"movl %0, %%esi			\n\t" // src
-		"movl %1, %%edi			\n\t" // buf1
-		"movl %3, %%edx			\n\t" // (xInc*4)>>16
-		"xorl %%ecx, %%ecx		\n\t"
-		"xorl %%ebx, %%ebx		\n\t"
-		"movw %4, %%bx			\n\t" // (xInc*4)&0xFFFF
+			"pxor %%mm7, %%mm7		\n\t"
+			"movl %0, %%ecx			\n\t"
+			"movl %1, %%edi			\n\t"
+			"movl %2, %%edx			\n\t"
+			"movl %3, %%ebx			\n\t"
+			"xorl %%eax, %%eax		\n\t" // i
+			PREFETCH" (%%ecx)		\n\t"
+			PREFETCH" 32(%%ecx)		\n\t"
+			PREFETCH" 64(%%ecx)		\n\t"
+
+#define FUNNY_UV_CODE \
+			"movl (%%ebx), %%esi		\n\t"\
+			"call *%4			\n\t"\
+			"addl (%%ebx, %%eax), %%ecx	\n\t"\
+			"addl %%eax, %%edi		\n\t"\
+			"xorl %%eax, %%eax		\n\t"\
 
-#define FUNNYUVCODE \
-			PREFETCH" 1024(%%esi)		\n\t"\
-			PREFETCH" 1056(%%esi)		\n\t"\
-			PREFETCH" 1088(%%esi)		\n\t"\
-			"call *%7			\n\t"\
-			"movq %%mm4, %%mm2	\n\t"\
-			"xorl %%ecx, %%ecx		\n\t"
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+			"xorl %%eax, %%eax		\n\t" // i
+			"movl %5, %%ecx			\n\t" // src
+			"movl %1, %%edi			\n\t" // buf1
+			"addl $4096, %%edi		\n\t"
+			PREFETCH" (%%ecx)		\n\t"
+			PREFETCH" 32(%%ecx)		\n\t"
+			PREFETCH" 64(%%ecx)		\n\t"
 
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-		"xorl %%eax, %%eax		\n\t" // i
-		"movl %6, %%esi			\n\t" // src
-		"movl %1, %%edi			\n\t" // buf1
-		"addl $4096, %%edi		\n\t"
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
 
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-		:: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
-		  "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
-		: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
-	);
+			:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
+			"m" (funnyUVCode), "m" (src2)
+			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
+		);
 		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
 		{
 //			printf("%d %d %d\n", dstWidth, i, srcW);
@@ -2749,7 +2716,8 @@
 //				printf("%d %d\n", lumBufIndex, vLumBufSize);
 				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
 						flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
-						funnyYCode, c->srcFormat, formatConvBuffer);
+						funnyYCode, c->srcFormat, formatConvBuffer, 
+						c->lumMmx2Filter, c->lumMmx2FilterPos);
 				lastInLumBuf++;
 			}
 			while(lastInChrBuf < lastChrSrcY)
@@ -2763,7 +2731,8 @@
 				//FIXME replace parameters through context struct (some at least)
 				RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
 						flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
-						funnyUVCode, c->srcFormat, formatConvBuffer);
+						funnyUVCode, c->srcFormat, formatConvBuffer, 
+						c->chrMmx2Filter, c->chrMmx2FilterPos);
 				lastInChrBuf++;
 			}
 			//wrap buf index around to stay inside the ring buffer
@@ -2787,7 +2756,8 @@
 				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
 				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
 						flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
-						funnyYCode, c->srcFormat, formatConvBuffer);
+						funnyYCode, c->srcFormat, formatConvBuffer, 
+						c->lumMmx2Filter, c->lumMmx2FilterPos);
 				lastInLumBuf++;
 			}
 			while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
@@ -2800,7 +2770,8 @@
 				ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
 				RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
 						flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
-						funnyUVCode, c->srcFormat, formatConvBuffer);
+						funnyUVCode, c->srcFormat, formatConvBuffer, 
+						c->chrMmx2Filter, c->chrMmx2FilterPos);
 				lastInChrBuf++;
 			}
 			//wrap buf index around to stay inside the ring buffer