changeset 2835:86fdf7897315

Minor speedup for K6-2, K7, P3
author nick
date Sun, 11 Nov 2001 15:35:11 +0000
parents b7d3b450f8fd
children ec672ea5ac2c
files libvo/osd.c libvo/osd_template.c
diffstat 2 files changed, 68 insertions(+), 66 deletions(-) [+]
line wrap: on
line diff
--- a/libvo/osd.c	Sun Nov 11 15:20:19 2001 +0000
+++ b/libvo/osd.c	Sun Nov 11 15:35:11 2001 +0000
@@ -85,40 +85,41 @@
         register int x;
 #ifdef ARCH_X86
 #ifdef HAVE_MMX
+    asm volatile(
+	PREFETCHW" %0\n\t"
+	PREFETCH" %1\n\t"
+	PREFETCH" %2\n\t"
+	"pxor %%mm7, %%mm7\n\t"
+	"pcmpeqb %%mm6, %%mm6\n\t" // F..F
+	::"m"(dstbase),"m"(srca),"m"(src):"memory");
+    for(x=0;x<w;x+=2){
 	asm volatile(
-		"pxor %%mm7, %%mm7		\n\t"
-		"xorl %%eax, %%eax		\n\t"
-		"pcmpeqb %%mm6, %%mm6		\n\t" // F..F
-		".balign 16\n\t"
-		"1:				\n\t"
-		"movq (%0, %%eax, 4), %%mm0	\n\t" // dstbase
-		"movq %%mm0, %%mm1		\n\t"
-		"punpcklbw %%mm7, %%mm0		\n\t"
-		"punpckhbw %%mm7, %%mm1		\n\t"
-		"movd (%1, %%eax), %%mm2	\n\t" // srca ABCD0000
-		"paddb %%mm6, %%mm2		\n\t"
-		"punpcklbw %%mm2, %%mm2		\n\t" // srca AABBCCDD
-		"punpcklbw %%mm2, %%mm2		\n\t" // srca AAAABBBB
-		"movq %%mm2, %%mm3		\n\t"
-		"punpcklbw %%mm7, %%mm2		\n\t" // srca 0A0A0A0A
-		"punpckhbw %%mm7, %%mm3		\n\t" // srca 0B0B0B0B
-		"pmullw %%mm2, %%mm0		\n\t"
-		"pmullw %%mm3, %%mm1		\n\t"
-		"psrlw $8, %%mm0		\n\t"
-		"psrlw $8, %%mm1		\n\t"
-		"packuswb %%mm1, %%mm0		\n\t"
-		"movd (%2, %%eax), %%mm2	\n\t" // src ABCD0000
-		"punpcklbw %%mm2, %%mm2		\n\t" // src AABBCCDD
-		"punpcklbw %%mm2, %%mm2		\n\t" // src AAAABBBB
-		"paddb %%mm2, %%mm0		\n\t"
-		"movq %%mm0, (%0, %%eax, 4)	\n\t"
-		"addl $2, %%eax			\n\t"
-		"cmpl %3, %%eax			\n\t"
-		" jb 1b				\n\t"
-
-		:: "r" (dstbase), "r" (srca), "r" (src), "r" (w)
-		: "%eax"
-		);
+		PREFETCHW" 32%0\n\t"
+		PREFETCH" 32%1\n\t"
+		PREFETCH" 32%2\n\t"
+		"movq	%0, %%mm0\n\t" // dstbase
+		"movq	%%mm0, %%mm1\n\t"
+		"punpcklbw %%mm7, %%mm0\n\t"
+		"punpckhbw %%mm7, %%mm1\n\t"
+		"movd	%1, %%mm2\n\t" // srca ABCD0000
+		"paddb	%%mm6, %%mm2\n\t"
+		"punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
+		"punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
+		"movq	%%mm2, %%mm3\n\t"
+		"punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
+		"punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
+		"pmullw	%%mm2, %%mm0\n\t"
+		"pmullw	%%mm3, %%mm1\n\t"
+		"psrlw	$8, %%mm0\n\t"
+		"psrlw	$8, %%mm1\n\t"
+		"packuswb %%mm1, %%mm0\n\t"
+		"movd %2, %%mm2	\n\t" // src ABCD0000
+		"punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
+		"punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
+		"paddb	%%mm2, %%mm0\n\t"
+		"movq	%%mm0, %0\n\t"
+		:: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
+	}
 #else /* 0 HAVE_MMX2*/
     for(x=0;x<w;x++){
         if(srca[x]){
--- a/libvo/osd_template.c	Sun Nov 11 15:20:19 2001 +0000
+++ b/libvo/osd_template.c	Sun Nov 11 15:35:11 2001 +0000
@@ -85,40 +85,41 @@
         register int x;
 #ifdef ARCH_X86
 #ifdef HAVE_MMX
+    asm volatile(
+	PREFETCHW" %0\n\t"
+	PREFETCH" %1\n\t"
+	PREFETCH" %2\n\t"
+	"pxor %%mm7, %%mm7\n\t"
+	"pcmpeqb %%mm6, %%mm6\n\t" // F..F
+	::"m"(dstbase),"m"(srca),"m"(src):"memory");
+    for(x=0;x<w;x+=2){
 	asm volatile(
-		"pxor %%mm7, %%mm7		\n\t"
-		"xorl %%eax, %%eax		\n\t"
-		"pcmpeqb %%mm6, %%mm6		\n\t" // F..F
-		".balign 16\n\t"
-		"1:				\n\t"
-		"movq (%0, %%eax, 4), %%mm0	\n\t" // dstbase
-		"movq %%mm0, %%mm1		\n\t"
-		"punpcklbw %%mm7, %%mm0		\n\t"
-		"punpckhbw %%mm7, %%mm1		\n\t"
-		"movd (%1, %%eax), %%mm2	\n\t" // srca ABCD0000
-		"paddb %%mm6, %%mm2		\n\t"
-		"punpcklbw %%mm2, %%mm2		\n\t" // srca AABBCCDD
-		"punpcklbw %%mm2, %%mm2		\n\t" // srca AAAABBBB
-		"movq %%mm2, %%mm3		\n\t"
-		"punpcklbw %%mm7, %%mm2		\n\t" // srca 0A0A0A0A
-		"punpckhbw %%mm7, %%mm3		\n\t" // srca 0B0B0B0B
-		"pmullw %%mm2, %%mm0		\n\t"
-		"pmullw %%mm3, %%mm1		\n\t"
-		"psrlw $8, %%mm0		\n\t"
-		"psrlw $8, %%mm1		\n\t"
-		"packuswb %%mm1, %%mm0		\n\t"
-		"movd (%2, %%eax), %%mm2	\n\t" // src ABCD0000
-		"punpcklbw %%mm2, %%mm2		\n\t" // src AABBCCDD
-		"punpcklbw %%mm2, %%mm2		\n\t" // src AAAABBBB
-		"paddb %%mm2, %%mm0		\n\t"
-		"movq %%mm0, (%0, %%eax, 4)	\n\t"
-		"addl $2, %%eax			\n\t"
-		"cmpl %3, %%eax			\n\t"
-		" jb 1b				\n\t"
-
-		:: "r" (dstbase), "r" (srca), "r" (src), "r" (w)
-		: "%eax"
-		);
+		PREFETCHW" 32%0\n\t"
+		PREFETCH" 32%1\n\t"
+		PREFETCH" 32%2\n\t"
+		"movq	%0, %%mm0\n\t" // dstbase
+		"movq	%%mm0, %%mm1\n\t"
+		"punpcklbw %%mm7, %%mm0\n\t"
+		"punpckhbw %%mm7, %%mm1\n\t"
+		"movd	%1, %%mm2\n\t" // srca ABCD0000
+		"paddb	%%mm6, %%mm2\n\t"
+		"punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
+		"punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
+		"movq	%%mm2, %%mm3\n\t"
+		"punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
+		"punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
+		"pmullw	%%mm2, %%mm0\n\t"
+		"pmullw	%%mm3, %%mm1\n\t"
+		"psrlw	$8, %%mm0\n\t"
+		"psrlw	$8, %%mm1\n\t"
+		"packuswb %%mm1, %%mm0\n\t"
+		"movd %2, %%mm2	\n\t" // src ABCD0000
+		"punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
+		"punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
+		"paddb	%%mm2, %%mm0\n\t"
+		"movq	%%mm0, %0\n\t"
+		:: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
+	}
 #else /* 0 HAVE_MMX2*/
     for(x=0;x<w;x++){
         if(srca[x]){