changeset 3142:0f6cce3a8059

runtime CPU detection
author michael
date Mon, 26 Nov 2001 21:12:15 +0000
parents 21b6aab15ec9
children 86910f54c391
files libvo/osd.c libvo/osd_template.c
diffstat 2 files changed, 157 insertions(+), 486 deletions(-)
--- a/libvo/osd.c	Mon Nov 26 18:53:32 2001 +0000
+++ b/libvo/osd.c	Mon Nov 26 21:12:15 2001 +0000
@@ -1,414 +1,140 @@
 // Generic alpha renderers for all YUV modes and RGB depths.
 // These are "reference implementations", should be optimized later (MMX, etc)
-// Optimized by Nick and Michael
+// Templating Code from Michael Niedermayer (michaelni@gmx.at) is under GPL
 
 //#define FAST_OSD
 //#define FAST_OSD_TABLE
 
 #include "config.h"
 #include "osd.h"
-#include "../mmx_defs.h"
 //#define ENABLE_PROFILE
 #include "../my_profile.h"
 #include <inttypes.h>
+#include "../cpudetect.h"
 
-#ifdef HAVE_MMX
-static const uint64_t bFF  __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
+extern int verbose; // defined in mplayer.c
+
+#ifdef ARCH_X86
+#define CAN_COMPILE_X86_ASM
 #endif
 
-void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-#if defined(FAST_OSD) && !defined(HAVE_MMX)
-    w=w>>1;
-#endif
-PROFILE_START();
-    for(y=0;y<h;y++){
-        register int x;
-#ifdef HAVE_MMX
-    asm volatile(
-	PREFETCHW" %0\n\t"
-	PREFETCH" %1\n\t"
-	PREFETCH" %2\n\t"
-//	"pxor %%mm7, %%mm7\n\t"
-	"pcmpeqb %%mm5, %%mm5\n\t" // F..F
-	"movq %%mm5, %%mm4\n\t"
-	"psllw $8, %%mm5\n\t" //FF00FF00FF00
-	"psrlw $8, %%mm4\n\t" //00FF00FF00FF
-	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
-    for(x=0;x<w;x+=8){
-	asm volatile(
-		"movl %1, %%eax\n\t"
-		"orl 4%1, %%eax\n\t"
-		" jz 1f\n\t"
-		PREFETCHW" 32%0\n\t"
-		PREFETCH" 32%1\n\t"
-		PREFETCH" 32%2\n\t"
-		"movq	%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"pand %%mm4, %%mm0\n\t" 	//0Y0Y0Y0Y
-		"psrlw $8, %%mm1\n\t"		//0Y0Y0Y0Y
-		"movq	%1, %%mm2\n\t" 		//srca HGFEDCBA
-		"paddb	bFF, %%mm2\n\t"
-		"movq %%mm2, %%mm3\n\t"
-		"pand %%mm4, %%mm2\n\t" 	//0G0E0C0A
-		"psrlw $8, %%mm3\n\t"		//0H0F0D0B
-		"pmullw	%%mm2, %%mm0\n\t"
-		"pmullw	%%mm3, %%mm1\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"pand %%mm5, %%mm1\n\t"
-		"por %%mm1, %%mm0\n\t"
-		"paddb	%2, %%mm0\n\t"
-		"movq	%%mm0, %0\n\t"
-		"1:\n\t"
-		:: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
-		: "%eax");
-	}
-#else
-        for(x=0;x<w;x++){
-#ifdef FAST_OSD
-            if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
-            if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1];
-#else
-            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
-#endif
-        }
-#endif
-        src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-#ifdef HAVE_MMX
-	asm volatile(EMMS:::"memory");
-#endif
-PROFILE_END("vo_draw_alpha_yv12");
-    return;
-}
-
-void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-#if defined(FAST_OSD) && !defined(HAVE_MMX)
-    w=w>>1;
-#endif
-PROFILE_START();
-    for(y=0;y<h;y++){
-        register int x;
-#ifdef HAVE_MMX
-    asm volatile(
-	PREFETCHW" %0\n\t"
-	PREFETCH" %1\n\t"
-	PREFETCH" %2\n\t"
-	"pxor %%mm7, %%mm7\n\t"
-	"pcmpeqb %%mm5, %%mm5\n\t" // F..F
-	"movq %%mm5, %%mm4\n\t"
-	"psllw $8, %%mm5\n\t" //FF00FF00FF00
-	"psrlw $8, %%mm4\n\t" //00FF00FF00FF
-	::"m"(*dstbase),"m"(*srca),"m"(*src));
-    for(x=0;x<w;x+=4){
-	asm volatile(
-		"movl %1, %%eax\n\t"
-		"orl %%eax, %%eax\n\t"
-		" jz 1f\n\t"
-		PREFETCHW" 32%0\n\t"
-		PREFETCH" 32%1\n\t"
-		PREFETCH" 32%2\n\t"
-		"movq	%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"pand %%mm4, %%mm0\n\t" 	//0Y0Y0Y0Y
-		"movd	%%eax, %%mm2\n\t"	//srca 0000DCBA
-		"paddb	bFF, %%mm2\n\t"
-		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0D0C0B0A
-		"pmullw	%%mm2, %%mm0\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"pand %%mm5, %%mm1\n\t" 	//U0V0U0V0
-		"movd %2, %%mm2\n\t"		//src 0000DCBA
-		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0D0C0B0A
-		"por %%mm1, %%mm0\n\t"
-		"paddb	%%mm2, %%mm0\n\t"
-		"movq	%%mm0, %0\n\t"
-		"1:\n\t"
-		:: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
-		: "%eax");
-	}
-#else
-        for(x=0;x<w;x++){
-#ifdef FAST_OSD
-            if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
-            if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1];
-#else
-            if(srca[x]) dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
-#endif
-        }
-#endif
-	src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-#ifdef HAVE_MMX
-	asm volatile(EMMS:::"memory");
-#endif
-PROFILE_END("vo_draw_alpha_yuy2");
-    return;
-}
-
-#ifdef HAVE_MMX
+#ifdef CAN_COMPILE_X86_ASM
+static const uint64_t bFF  __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
 static const unsigned long long mask24lh  __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
 static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
 #endif
+
+//Note: we have C, X86-nommx, MMX, MMX2 and 3DNOW versions; there is no combined 3DNOW+MMX2 one
+//Plain C versions
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_3DNOW
+#undef ARCH_X86
+#define RENAME(a) a ## _C
+#include "osd_template.c"
+
+#ifdef CAN_COMPILE_X86_ASM
+
+//X86 noMMX versions
+#undef RENAME
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_3DNOW
+#define ARCH_X86
+#define RENAME(a) a ## _X86
+#include "osd_template.c"
+
+//MMX versions
+#undef RENAME
+#define HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_3DNOW
+#define ARCH_X86
+#define RENAME(a) a ## _MMX
+#include "osd_template.c"
+
+//MMX2 versions
+#undef RENAME
+#define HAVE_MMX
+#define HAVE_MMX2
+#undef HAVE_3DNOW
+#define ARCH_X86
+#define RENAME(a) a ## _MMX2
+#include "osd_template.c"
+
+//3DNOW versions
+#undef RENAME
+#define HAVE_MMX
+#undef HAVE_MMX2
+#define HAVE_3DNOW
+#define ARCH_X86
+#define RENAME(a) a ## _3DNow
+#include "osd_template.c"
+
+#endif //CAN_COMPILE_X86_ASM
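The block above is the whole templating trick: osd_template.c is textually included once per instruction set, and the RENAME(a) a ## _SUFFIX macro token-pastes a suffix onto every function name, so each pass emits its own symbols (vo_draw_alpha_yv12_C, _X86, _MMX, _MMX2, _3DNow, and so on). A minimal standalone illustration of the same mechanism, using an invented demo function name that is not part of this patch:

    /* sketch: how RENAME token-pasting yields per-CPU symbols;
       vo_draw_alpha_demo is a made-up name for this illustration */
    #include <stdio.h>

    #define RENAME(a) a ## _MMX
    static void RENAME(vo_draw_alpha_demo)(void){ puts("MMX variant"); }

    #undef RENAME
    #define RENAME(a) a ## _C
    static void RENAME(vo_draw_alpha_demo)(void){ puts("C variant"); }

    int main(void){
        vo_draw_alpha_demo_MMX();   /* both expansions now exist side by side */
        vo_draw_alpha_demo_C();
        return 0;
    }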
+
+void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+#ifdef CAN_COMPILE_X86_ASM
+	// ordered by speed, fastest first
+	if(gCpuCaps.hasMMX2)
+		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.has3DNow)
+		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX)
+		vo_draw_alpha_yv12_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+	else
+		vo_draw_alpha_yv12_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+}
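Each public entry point is now just a dispatcher: it reads the capability flags that are filled in at startup (gCpuCaps comes from ../cpudetect.h, included above) and branches to the fastest compiled variant, so the public ABI stays unchanged at the cost of a few predictable branches per call. The detection itself lives outside this patch; as a hedged sketch only, capability flags like these are typically derived from CPUID roughly as below (the CpuCaps struct, detect_cpu name and bit choices are assumptions for illustration, not MPlayer's actual cpudetect code):

    /* sketch: CPUID-based feature probing; names are invented */
    #include <stdio.h>

    typedef struct { int hasMMX, hasMMX2, has3DNow; } CpuCaps;

    #if defined(__i386__) || defined(__x86_64__)
    static void cpuid(unsigned op, unsigned *a, unsigned *b, unsigned *c, unsigned *d){
        __asm__ volatile("cpuid"
                         : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
                         : "a"(op));
    }
    #endif

    static CpuCaps detect_cpu(void){
        CpuCaps caps = {0, 0, 0};
    #if defined(__i386__) || defined(__x86_64__)
        unsigned a, b, c, d;
        cpuid(1, &a, &b, &c, &d);
        caps.hasMMX  = (d >> 23) & 1;        /* standard leaf 1, EDX bit 23 = MMX */
        caps.hasMMX2 = (d >> 25) & 1;        /* SSE bit, one common proxy for "MMX2" */
        cpuid(0x80000000u, &a, &b, &c, &d);
        if (a >= 0x80000001u) {
            cpuid(0x80000001u, &a, &b, &c, &d);
            caps.has3DNow = (d >> 31) & 1;   /* AMD extended leaf, EDX bit 31 */
        }
    #endif
        return caps;
    }

    int main(void){
        CpuCaps caps = detect_cpu();
        printf("MMX:%d MMX2:%d 3DNow:%d\n", caps.hasMMX, caps.hasMMX2, caps.has3DNow);
        return 0;
    }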
+
+void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+#ifdef CAN_COMPILE_X86_ASM
+	// ordered by speed, fastest first
+	if(gCpuCaps.hasMMX2)
+		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.has3DNow)
+		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX)
+		vo_draw_alpha_yuy2_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+	else
+		vo_draw_alpha_yuy2_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+}
+
 void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-    for(y=0;y<h;y++){
-        register unsigned char *dst = dstbase;
-        register int x;
-#ifdef ARCH_X86
-#ifdef HAVE_MMX
-    asm volatile(
-	PREFETCHW" %0\n\t"
-	PREFETCH" %1\n\t"
-	PREFETCH" %2\n\t"
-	"pxor %%mm7, %%mm7\n\t"
-	"pcmpeqb %%mm6, %%mm6\n\t" // F..F
-	::"m"(*dst),"m"(*srca),"m"(*src):"memory");
-    for(x=0;x<w;x+=2){
-     if(srca[x] || srca[x+1])
-	asm volatile(
-		PREFETCHW" 32%0\n\t"
-		PREFETCH" 32%1\n\t"
-		PREFETCH" 32%2\n\t"
-		"movq	%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"movq	%%mm0, %%mm5\n\t"
-		"punpcklbw %%mm7, %%mm0\n\t"
-		"punpckhbw %%mm7, %%mm1\n\t"
-		"movd	%1, %%mm2\n\t" // srca ABCD0000
-		"paddb	%%mm6, %%mm2\n\t"
-		"punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
-		"punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
-		"movq	%%mm2, %%mm3\n\t"
-		"punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
-		"punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
-		"pmullw	%%mm2, %%mm0\n\t"
-		"pmullw	%%mm3, %%mm1\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"psrlw	$8, %%mm1\n\t"
-		"packuswb %%mm1, %%mm0\n\t"
-		"movd %2, %%mm2	\n\t" // src ABCD0000
-		"punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
-		"punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
-		"paddb	%%mm2, %%mm0\n\t"
-		"pand	%4, %%mm5\n\t"
-		"pand	%3, %%mm0\n\t"
-		"por	%%mm0, %%mm5\n\t"
-		"movq	%%mm5, %0\n\t"
-		:: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh));
-		dst += 6;
-	}
-#else /* HAVE_MMX */
-    for(x=0;x<w;x++){
-        if(srca[x]){
-	    asm volatile(
-		"movzbl (%0), %%ecx\n\t"
-		"movzbl 1(%0), %%eax\n\t"
-		"movzbl 2(%0), %%edx\n\t"
-
-		"imull %1, %%ecx\n\t"
-		"imull %1, %%eax\n\t"
-		"imull %1, %%edx\n\t"
-
- 		"addl %2, %%ecx\n\t"
-		"addl %2, %%eax\n\t"
-		"addl %2, %%edx\n\t"
-
-		"movb %%ch, (%0)\n\t"
-		"movb %%ah, 1(%0)\n\t"
-		"movb %%dh, 2(%0)\n\t"
-
-		:
-		:"r" (dst),
-		 "r" ((unsigned)srca[x]),
-		 "r" (((unsigned)src[x])<<8)
-		:"%eax", "%ecx", "%edx"
-		);
-            }
-	    dst += 3;
-        }
-#endif /* HAVE_MMX */
-#else /*non x86 arch*/
-        for(x=0;x<w;x++){
-            if(srca[x]){
-#ifdef FAST_OSD
-		dst[0]=dst[1]=dst[2]=src[x];
+#ifdef CAN_COMPILE_X86_ASM
+	// ordered by speed, fastest first
+	if(gCpuCaps.hasMMX2)
+		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.has3DNow)
+		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX)
+		vo_draw_alpha_rgb24_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+	else
+		vo_draw_alpha_rgb24_X86(w, h, src, srca, srcstride, dstbase, dststride);
 #else
-		dst[0]=((dst[0]*srca[x])>>8)+src[x];
-		dst[1]=((dst[1]*srca[x])>>8)+src[x];
-		dst[2]=((dst[2]*srca[x])>>8)+src[x];
+		vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
-            }
-            dst+=3; // 24bpp
-        }
-#endif /* arch_x86 */
-        src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-#ifdef HAVE_MMX
-	asm volatile(EMMS:::"memory");
-#endif
-    return;
 }
 
 void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-PROFILE_START();
-    for(y=0;y<h;y++){
-        register int x;
-#ifdef ARCH_X86
-#ifdef HAVE_MMX
-#ifdef HAVE_3DNOW
-    asm volatile(
-	PREFETCHW" %0\n\t"
-	PREFETCH" %1\n\t"
-	PREFETCH" %2\n\t"
-	"pxor %%mm7, %%mm7\n\t"
-	"pcmpeqb %%mm6, %%mm6\n\t" // F..F
-	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
-    for(x=0;x<w;x+=2){
-     if(srca[x] || srca[x+1])
-	asm volatile(
-		PREFETCHW" 32%0\n\t"
-		PREFETCH" 32%1\n\t"
-		PREFETCH" 32%2\n\t"
-		"movq	%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"punpcklbw %%mm7, %%mm0\n\t"
-		"punpckhbw %%mm7, %%mm1\n\t"
-		"movd	%1, %%mm2\n\t" // srca ABCD0000
-		"paddb	%%mm6, %%mm2\n\t"
-		"punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
-		"punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
-		"movq	%%mm2, %%mm3\n\t"
-		"punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
-		"punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
-		"pmullw	%%mm2, %%mm0\n\t"
-		"pmullw	%%mm3, %%mm1\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"psrlw	$8, %%mm1\n\t"
-		"packuswb %%mm1, %%mm0\n\t"
-		"movd %2, %%mm2	\n\t" // src ABCD0000
-		"punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
-		"punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
-		"paddb	%%mm2, %%mm0\n\t"
-		"movq	%%mm0, %0\n\t"
-		:: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
-	}
-#else //this is faster for intels crap
-    asm volatile(
-	PREFETCHW" %0\n\t"
-	PREFETCH" %1\n\t"
-	PREFETCH" %2\n\t"
-	"pxor %%mm7, %%mm7\n\t"
-	"pcmpeqb %%mm5, %%mm5\n\t" // F..F
-	"movq %%mm5, %%mm4\n\t"
-	"psllw $8, %%mm5\n\t" //FF00FF00FF00
-	"psrlw $8, %%mm4\n\t" //00FF00FF00FF
-	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
-    for(x=0;x<w;x+=4){
-	asm volatile(
-		"movl %1, %%eax\n\t"
-		"orl %%eax, %%eax\n\t"
-		" jz 1f\n\t"
-		PREFETCHW" 32%0\n\t"
-		PREFETCH" 32%1\n\t"
-		PREFETCH" 32%2\n\t"
-		"movq	%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"pand %%mm4, %%mm0\n\t" 	//0R0B0R0B
-		"psrlw $8, %%mm1\n\t"		//0?0G0?0G
-		"movd	%%eax, %%mm2\n\t" 	//srca 0000DCBA
-		"paddb	bFF, %%mm2\n\t"
-		"punpcklbw %%mm2, %%mm2\n\t"	//srca DDCCBBAA
-		"movq %%mm2, %%mm3\n\t"
-		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0B0B0A0A
-		"pmullw	%%mm2, %%mm0\n\t"
-		"pmullw	%%mm2, %%mm1\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"pand %%mm5, %%mm1\n\t"
-		"por %%mm1, %%mm0\n\t"
-		"movd %2, %%mm2	\n\t"		//src 0000DCBA
-		"punpcklbw %%mm2, %%mm2\n\t" 	//src DDCCBBAA
-		"movq %%mm2, %%mm6\n\t"
-		"punpcklbw %%mm2, %%mm2\n\t"	//src BBBBAAAA
-		"paddb	%%mm2, %%mm0\n\t"
-		"movq	%%mm0, %0\n\t"
-
-		"movq	8%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"pand %%mm4, %%mm0\n\t" 	//0R0B0R0B
-		"psrlw $8, %%mm1\n\t"		//0?0G0?0G
-		"punpckhbw %%mm7, %%mm3\n\t"	//srca 0D0D0C0C
-		"pmullw	%%mm3, %%mm0\n\t"
-		"pmullw	%%mm3, %%mm1\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"pand %%mm5, %%mm1\n\t"
-		"por %%mm1, %%mm0\n\t"
-		"punpckhbw %%mm6, %%mm6\n\t"	//src DDDDCCCC
-		"paddb	%%mm6, %%mm0\n\t"
-		"movq	%%mm0, 8%0\n\t"
-		"1:\n\t"
-		:: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x])
-		: "%eax");
-	}
+#ifdef CAN_COMPILE_X86_ASM
+	// ordered by speed, fastest first
+	if(gCpuCaps.hasMMX2)
+		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.has3DNow)
+		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX)
+		vo_draw_alpha_rgb32_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+	else
+		vo_draw_alpha_rgb32_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
-#else /* HAVE_MMX */
-    for(x=0;x<w;x++){
-        if(srca[x]){
-	    asm volatile(
-		"movzbl (%0), %%ecx\n\t"
-		"movzbl 1(%0), %%eax\n\t"
-		"movzbl 2(%0), %%edx\n\t"
-
-		"imull %1, %%ecx\n\t"
-		"imull %1, %%eax\n\t"
-		"imull %1, %%edx\n\t"
-
- 		"addl %2, %%ecx\n\t"
-		"addl %2, %%eax\n\t"
-		"addl %2, %%edx\n\t"
-
-		"movb %%ch, (%0)\n\t"
-		"movb %%ah, 1(%0)\n\t"
-		"movb %%dh, 2(%0)\n\t"
-
-		:
-		:"r" (&dstbase[4*x]),
-		 "r" ((unsigned)srca[x]),
-		 "r" (((unsigned)src[x])<<8)
-		:"%eax", "%ecx", "%edx"
-		);
-            }
-        }
-#endif /* HAVE_MMX */
-#else /*non x86 arch*/
-        for(x=0;x<w;x++){
-            if(srca[x]){
-#ifdef FAST_OSD
-		dstbase[4*x+0]=dstbase[4*x+1]=dstbase[4*x+2]=src[x];
-#else
-		dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
-		dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
-		dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
-#endif
-            }
-        }
-#endif /* arch_x86 */
-        src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-#ifdef HAVE_MMX
-	asm volatile(EMMS:::"memory");
-#endif
-PROFILE_END("vo_draw_alpha_rgb32");
-    return;
 }
 
 #ifdef FAST_OSD_TABLE
@@ -424,6 +150,23 @@
         fast_osd_16bpp_table[i]=((i>>3)<<11)|((i>>2)<<5)|(i>>3);
     }
 #endif
+//FIXME the "optimized" message is misleading for 15/16bpp, as those renderers aren't optimized yet
+	if(verbose)
+	{
+#ifdef CAN_COMPILE_X86_ASM
+		// ordered by speed, fastest first
+		if(gCpuCaps.hasMMX2)
+			printf("Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
+		else if(gCpuCaps.has3DNow)
+			printf("Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
+		else if(gCpuCaps.hasMMX)
+			printf("Using MMX Optimized OnScreenDisplay\n");
+		else
+			printf("Using X86 Optimized OnScreenDisplay\n");
+#else
+			printf("Using Unoptimized OnScreenDisplay\n");
+#endif
+	}
 }
 
 void vo_draw_alpha_rgb15(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
--- a/libvo/osd_template.c	Mon Nov 26 18:53:32 2001 +0000
+++ b/libvo/osd_template.c	Mon Nov 26 21:12:15 2001 +0000
@@ -1,22 +1,33 @@
 // Generic alpha renderers for all YUV modes and RGB depths.
-// These are "reference implementations", should be optimized later (MMX, etc)
 // Optimized by Nick and Michael
+// Code from Michael Niedermayer (michaelni@gmx.at) is under GPL
 
-//#define FAST_OSD
-//#define FAST_OSD_TABLE
+#undef PREFETCH
+#undef EMMS
+#undef PREFETCHW
+#undef PAVGB
 
-#include "config.h"
-#include "osd.h"
-#include "../mmx_defs.h"
-//#define ENABLE_PROFILE
-#include "../my_profile.h"
-#include <inttypes.h>
-
-#ifdef HAVE_MMX
-static const uint64_t bFF  __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
+#ifdef HAVE_3DNOW
+#define PREFETCH  "prefetch"
+#define PREFETCHW "prefetchw"
+#define PAVGB	  "pavgusb"
+#elif defined ( HAVE_MMX2 )
+#define PREFETCH "prefetchnta"
+#define PREFETCHW "prefetcht0"
+#define PAVGB	  "pavgb"
+#else
+#define PREFETCH "/nop"
+#define PREFETCHW "/nop"
 #endif
 
-void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+#ifdef HAVE_3DNOW
+/* On K6, femms is faster than emms. On K7, femms maps directly to emms. */
+#define EMMS     "femms"
+#else
+#define EMMS     "emms"
+#endif
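These macros let the one template body target several instruction sets: the 3DNOW build gets prefetch/prefetchw/pavgusb and the cheaper femms, the MMX2 build gets prefetchnta/prefetcht0/pavgb, and plain MMX/C builds fall back to the "/nop" stand-in and emms. A tiny standalone sketch of the same selection, compiled with -DHAVE_3DNOW or -DHAVE_MMX2 to see the strings change (illustration only, not part of the patch):

    /* sketch: per-build opcode-string selection as used by the template */
    #include <stdio.h>

    #ifdef HAVE_3DNOW
    #define PREFETCH "prefetch"
    #define EMMS     "femms"        /* cheaper than emms on K6, equivalent on K7 */
    #elif defined(HAVE_MMX2)
    #define PREFETCH "prefetchnta"
    #define EMMS     "emms"
    #else
    #define PREFETCH "/nop"         /* the template's stand-in when no prefetch exists */
    #define EMMS     "emms"
    #endif

    int main(void){
        /* the template pastes these strings straight into inline asm, e.g.
           asm volatile(PREFETCH" %0\n\t" :: "m"(*src));  (not executed here) */
        printf("PREFETCH -> %s, EMMS -> %s\n", PREFETCH, EMMS);
        return 0;
    }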
+
+static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
 #if defined(FAST_OSD) && !defined(HAVE_MMX)
     w=w>>1;
@@ -84,7 +95,7 @@
     return;
 }
 
-void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
 #if defined(FAST_OSD) && !defined(HAVE_MMX)
     w=w>>1;
@@ -150,11 +161,7 @@
     return;
 }
 
-#ifdef HAVE_MMX
-static const unsigned long long mask24lh  __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
-static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
-#endif
-void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
     for(y=0;y<h;y++){
         register unsigned char *dst = dstbase;
@@ -256,7 +263,7 @@
     return;
 }
 
-void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+static inline void RENAME(vo_draw_alpha_rgb32)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
 PROFILE_START();
     for(y=0;y<h;y++){
@@ -410,82 +417,3 @@
 PROFILE_END("vo_draw_alpha_rgb32");
     return;
 }
-
-#ifdef FAST_OSD_TABLE
-static unsigned short fast_osd_15bpp_table[256];
-static unsigned short fast_osd_16bpp_table[256];
-#endif
-
-void vo_draw_alpha_init(){
-#ifdef FAST_OSD_TABLE
-    int i;
-    for(i=0;i<256;i++){
-        fast_osd_15bpp_table[i]=((i>>3)<<10)|((i>>3)<<5)|(i>>3);
-        fast_osd_16bpp_table[i]=((i>>3)<<11)|((i>>2)<<5)|(i>>3);
-    }
-#endif
-}
-
-void vo_draw_alpha_rgb15(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-    for(y=0;y<h;y++){
-        register unsigned short *dst = (unsigned short*) dstbase;
-        register int x;
-        for(x=0;x<w;x++){
-            if(srca[x]){
-#ifdef FAST_OSD
-#ifdef FAST_OSD_TABLE
-                dst[x]=fast_osd_15bpp_table[src[x]];
-#else
-		register unsigned int a=src[x]>>3;
-                dst[x]=(a<<10)|(a<<5)|a;
-#endif
-#else
-                unsigned char r=dst[x]&0x1F;
-                unsigned char g=(dst[x]>>5)&0x1F;
-                unsigned char b=(dst[x]>>10)&0x1F;
-                r=(((r*srca[x])>>5)+src[x])>>3;
-                g=(((g*srca[x])>>5)+src[x])>>3;
-                b=(((b*srca[x])>>5)+src[x])>>3;
-                dst[x]=(b<<10)|(g<<5)|r;
-#endif
-            }
-        }
-        src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-    return;
-}
-
-void vo_draw_alpha_rgb16(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-    for(y=0;y<h;y++){
-        register unsigned short *dst = (unsigned short*) dstbase;
-        register int x;
-        for(x=0;x<w;x++){
-            if(srca[x]){
-#ifdef FAST_OSD
-#ifdef FAST_OSD_TABLE
-                dst[x]=fast_osd_16bpp_table[src[x]];
-#else
-                dst[x]=((src[x]>>3)<<11)|((src[x]>>2)<<5)|(src[x]>>3);
-#endif
-#else
-                unsigned char r=dst[x]&0x1F;
-                unsigned char g=(dst[x]>>5)&0x3F;
-                unsigned char b=(dst[x]>>11)&0x1F;
-                r=(((r*srca[x])>>5)+src[x])>>3;
-                g=(((g*srca[x])>>6)+src[x])>>2;
-                b=(((b*srca[x])>>5)+src[x])>>3;
-                dst[x]=(b<<11)|(g<<5)|r;
-#endif
-            }
-        }
-        src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-    return;
-}
-