changeset 2823:004ee19ebfcf

Extract parallelism from the OSD stuff + MMX2 optimization. I've found that mplayer's own timing measurement is not precise :( so here are my test results using RDTSC. Old stuff: rd_tsc: 774377, rd_tsc: 765985, rd_tsc: 265309. New CPU-optimized stuff: rd_tsc: 661154, rd_tsc: 641317, rd_tsc: 222448. New MMX2-optimized stuff: rd_tsc: 269544, rd_tsc: 329189, rd_tsc: 173110.
author nick
date Sun, 11 Nov 2001 11:18:50 +0000
parents 7679d983c52f
children 2f7f02fa1199
files libvo/osd.c libvo/osd_template.c
diffstat 2 files changed, 86 insertions(+), 68 deletions(-) [+]
line wrap: on
line diff
--- a/libvo/osd.c	Sun Nov 11 04:31:59 2001 +0000
+++ b/libvo/osd.c	Sun Nov 11 11:18:50 2001 +0000
@@ -76,12 +76,25 @@
     return;
 }
 
+#ifdef PROFILE_ME
+static inline unsigned long long int read_tsc( void )
+{
+  unsigned long long int retval;
+  __asm __volatile ("rdtsc":"=A"(retval)::"memory");
+  return retval;
+}
+#endif
+
 void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
+#ifdef PROFILE_ME
+unsigned long long v1,v2;
+v1 = read_tsc();
+#endif
     for(y=0;y<h;y++){
         register int x;
 #ifdef ARCH_X86
-#if 0 /*def HAVE_MMX2*/
+#ifdef HAVE_MMX2
 	asm volatile(
 		"pxor %%mm7, %%mm7		\n\t"
 		"xorl %%eax, %%eax		\n\t"
@@ -117,41 +130,33 @@
 		: "%eax"
 		);
 #else /* 0 HAVE_MMX2*/
-	asm volatile(
-		"xorl %%eax, %%eax		\n\t"
-		"xorl %%ebx, %%ebx		\n\t"
-		"xorl %%edx, %%edx		\n\t"
-		".balign 16\n\t"
-		"1:				\n\t"
-		"movb (%1, %%eax), %%bl		\n\t"
-		"cmpb $0, %%bl			\n\t"
-		" jz 2f				\n\t"
-		"movzbl (%2, %%eax), %%edx	\n\t"
-		"shll $8, %%edx			\n\t"
-		"decb %%bl			\n\t"
-		"movzbl (%0, %%eax, 4), %%ecx	\n\t"
-		"imull %%ebx, %%ecx		\n\t"
-		"addl %%edx, %%ecx		\n\t"
-		"movb %%ch, (%0, %%eax, 4)	\n\t"
+    for(x=0;x<w;x++){
+        if(srca[x]){
+	    asm volatile(
+		"movzbl (%0), %%ecx\n\t"
+		"movzbl 1(%0), %%eax\n\t"
+		"movzbl 2(%0), %%edx\n\t"
+
+		"imull %1, %%ecx\n\t"
+		"imull %1, %%eax\n\t"
+		"imull %1, %%edx\n\t"
 
-		"movzbl 1(%0, %%eax, 4), %%ecx	\n\t"
-		"imull %%ebx, %%ecx		\n\t"
-		"addl %%edx, %%ecx		\n\t"
-		"movb %%ch, 1(%0, %%eax, 4)	\n\t"
+ 		"addl %2, %%ecx\n\t"
+		"addl %2, %%eax\n\t"
+		"addl %2, %%edx\n\t"
+
+		"movb %%ch, (%0)\n\t"
+		"movb %%ah, 1(%0)\n\t"
+		"movb %%dh, 2(%0)\n\t"
 
-		"movzbl 2(%0, %%eax, 4), %%ecx	\n\t"
-		"imull %%ebx, %%ecx		\n\t"
-		"addl %%edx, %%ecx		\n\t"
-		"movb %%ch, 2(%0, %%eax, 4)	\n\t"
-
-		"2:				\n\t"
-		"addl $1, %%eax			\n\t"
-		"cmpl %3, %%eax			\n\t"
-		" jb 1b				\n\t"
-
-		:: "r" (dstbase), "r" (srca), "r" (src), "m" (w)
-		: "%eax", "%ebx", "%ecx", "%edx"
+		:
+		:"r" (&dstbase[4*x]),
+		 "r" ((unsigned)srca[x]),
+		 "r" (((unsigned)src[x])<<8)
+		:"%eax", "%ecx", "%edx"
 		);
+            }
+        }
 #endif /* 0 HAVE_MMX*/
 #else /*non x86 arch*/
         for(x=0;x<w;x++){
@@ -170,10 +175,14 @@
         srca+=srcstride;
         dstbase+=dststride;
     }
-#if 0 /*def HAVE_MMX2*/
+#ifdef HAVE_MMX2
 	asm volatile(SFENCE:::"memory");
 	asm volatile(EMMS:::"memory");
 #endif
+#ifdef PROFILE_ME
+v2 = read_tsc();
+printf("rd_tsc: %llu\n\t",v2-v1);
+#endif
     return;
 }
 
--- a/libvo/osd_template.c	Sun Nov 11 04:31:59 2001 +0000
+++ b/libvo/osd_template.c	Sun Nov 11 11:18:50 2001 +0000
@@ -76,12 +76,25 @@
     return;
 }
 
+#ifdef PROFILE_ME
+static inline unsigned long long int read_tsc( void )
+{
+  unsigned long long int retval;
+  __asm __volatile ("rdtsc":"=A"(retval)::"memory");
+  return retval;
+}
+#endif
+
 void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
+#ifdef PROFILE_ME
+unsigned long long v1,v2;
+v1 = read_tsc();
+#endif
     for(y=0;y<h;y++){
         register int x;
 #ifdef ARCH_X86
-#if 0 /*def HAVE_MMX2*/
+#ifdef HAVE_MMX2
 	asm volatile(
 		"pxor %%mm7, %%mm7		\n\t"
 		"xorl %%eax, %%eax		\n\t"
@@ -117,41 +130,33 @@
 		: "%eax"
 		);
 #else /* 0 HAVE_MMX2*/
-	asm volatile(
-		"xorl %%eax, %%eax		\n\t"
-		"xorl %%ebx, %%ebx		\n\t"
-		"xorl %%edx, %%edx		\n\t"
-		".balign 16\n\t"
-		"1:				\n\t"
-		"movb (%1, %%eax), %%bl		\n\t"
-		"cmpb $0, %%bl			\n\t"
-		" jz 2f				\n\t"
-		"movzbl (%2, %%eax), %%edx	\n\t"
-		"shll $8, %%edx			\n\t"
-		"decb %%bl			\n\t"
-		"movzbl (%0, %%eax, 4), %%ecx	\n\t"
-		"imull %%ebx, %%ecx		\n\t"
-		"addl %%edx, %%ecx		\n\t"
-		"movb %%ch, (%0, %%eax, 4)	\n\t"
+    for(x=0;x<w;x++){
+        if(srca[x]){
+	    asm volatile(
+		"movzbl (%0), %%ecx\n\t"
+		"movzbl 1(%0), %%eax\n\t"
+		"movzbl 2(%0), %%edx\n\t"
+
+		"imull %1, %%ecx\n\t"
+		"imull %1, %%eax\n\t"
+		"imull %1, %%edx\n\t"
 
-		"movzbl 1(%0, %%eax, 4), %%ecx	\n\t"
-		"imull %%ebx, %%ecx		\n\t"
-		"addl %%edx, %%ecx		\n\t"
-		"movb %%ch, 1(%0, %%eax, 4)	\n\t"
+ 		"addl %2, %%ecx\n\t"
+		"addl %2, %%eax\n\t"
+		"addl %2, %%edx\n\t"
+
+		"movb %%ch, (%0)\n\t"
+		"movb %%ah, 1(%0)\n\t"
+		"movb %%dh, 2(%0)\n\t"
 
-		"movzbl 2(%0, %%eax, 4), %%ecx	\n\t"
-		"imull %%ebx, %%ecx		\n\t"
-		"addl %%edx, %%ecx		\n\t"
-		"movb %%ch, 2(%0, %%eax, 4)	\n\t"
-
-		"2:				\n\t"
-		"addl $1, %%eax			\n\t"
-		"cmpl %3, %%eax			\n\t"
-		" jb 1b				\n\t"
-
-		:: "r" (dstbase), "r" (srca), "r" (src), "m" (w)
-		: "%eax", "%ebx", "%ecx", "%edx"
+		:
+		:"r" (&dstbase[4*x]),
+		 "r" ((unsigned)srca[x]),
+		 "r" (((unsigned)src[x])<<8)
+		:"%eax", "%ecx", "%edx"
 		);
+            }
+        }
 #endif /* 0 HAVE_MMX*/
 #else /*non x86 arch*/
         for(x=0;x<w;x++){
@@ -170,10 +175,14 @@
         srca+=srcstride;
         dstbase+=dststride;
     }
-#if 0 /*def HAVE_MMX2*/
+#ifdef HAVE_MMX2
 	asm volatile(SFENCE:::"memory");
 	asm volatile(EMMS:::"memory");
 #endif
+#ifdef PROFILE_ME
+v2 = read_tsc();
+printf("rd_tsc: %llu\n\t",v2-v1);
+#endif
     return;
 }