changeset 32469:3fef2e17a03f

Move osd.[ch] and osd_template.c from libvo to sub.
author cigaes
date Wed, 27 Oct 2010 17:53:24 +0000
parents 0c7c4ed0b7eb
children 139876e79725
files Makefile libmenu/menu.c libmpcodecs/vf_expand.c libvo/osd.c libvo/osd.h libvo/osd_template.c libvo/vesa_lvo.c libvo/video_out_internal.h libvo/vosub_vidix.c sub/osd.c sub/osd.h sub/osd_template.c
diffstat 12 files changed, 955 insertions(+), 955 deletions(-)
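
The files move verbatim; for any code using these renderers, the only source change is the include path, as in the hunks below:

    #include "sub/osd.h"   /* was: #include "libvo/osd.h" */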
--- a/Makefile	Wed Oct 27 17:52:45 2010 +0000
+++ b/Makefile	Wed Oct 27 17:53:24 2010 +0000
@@ -504,7 +504,6 @@
               libmpdemux/video.c \
               libmpdemux/yuv4mpeg.c \
               libmpdemux/yuv4mpeg_ratio.c \
-              libvo/osd.c \
               osdep/$(GETCH) \
               osdep/$(TIMER) \
               stream/open.c \
@@ -517,6 +516,7 @@
               stream/url.c \
               sub/eosd.c \
               sub/find_sub.c \
+              sub/osd.c \
               sub/spudec.c \
               sub/sub.c \
               sub/sub_cc.c \
--- a/libmenu/menu.c	Wed Oct 27 17:52:45 2010 +0000
+++ b/libmenu/menu.c	Wed Oct 27 17:53:24 2010 +0000
@@ -26,7 +26,7 @@
 #include <fcntl.h>
 #include <unistd.h>
 
-#include "libvo/osd.h"
+#include "sub/osd.h"
 #include "sub/font_load.h"
 #include "sub/sub.h"
 #include "osdep/keycodes.h"
--- a/libmpcodecs/vf_expand.c	Wed Oct 27 17:52:45 2010 +0000
+++ b/libmpcodecs/vf_expand.c	Wed Oct 27 17:53:24 2010 +0000
@@ -36,7 +36,7 @@
 
 #ifdef OSD_SUPPORT
 #include "sub/sub.h"
-#include "libvo/osd.h"
+#include "sub/osd.h"
 #endif
 
 #include "m_option.h"
--- a/libvo/osd.c	Wed Oct 27 17:52:45 2010 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,428 +0,0 @@
-/*
- * generic alpha renderers for all YUV modes and RGB depths
- * These are "reference implementations", should be optimized later (MMX, etc).
- * templating code by Michael Niedermayer (michaelni@gmx.at)
- *
- * This file is part of MPlayer.
- *
- * MPlayer is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * MPlayer is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with MPlayer; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- */
-
-//#define FAST_OSD
-//#define FAST_OSD_TABLE
-
-#include "config.h"
-#include "osd.h"
-#include "mp_msg.h"
-#include <inttypes.h>
-#include "cpudetect.h"
-
-#if ARCH_X86
-static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
-static const unsigned long long mask24lh  __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
-static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
-#endif
-
-//Note: we have C, X86-nommx, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
-//Plain C versions
-#if !HAVE_MMX || CONFIG_RUNTIME_CPUDETECT
-#define COMPILE_C
-#endif
-
-#if ARCH_X86
-
-#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
-#define COMPILE_MMX
-#endif
-
-#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
-#define COMPILE_MMX2
-#endif
-
-#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
-#define COMPILE_3DNOW
-#endif
-
-#endif /* ARCH_X86 */
-
-#undef HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_AMD3DNOW
-#define HAVE_MMX 0
-#define HAVE_MMX2 0
-#define HAVE_AMD3DNOW 0
-
-#if ! ARCH_X86
-
-#ifdef COMPILE_C
-#undef HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_AMD3DNOW
-#define HAVE_MMX 0
-#define HAVE_MMX2 0
-#define HAVE_AMD3DNOW 0
-#define RENAME(a) a ## _C
-#include "osd_template.c"
-#endif
-
-#else
-
-//X86 noMMX versions
-#ifdef COMPILE_C
-#undef RENAME
-#undef HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_AMD3DNOW
-#define HAVE_MMX 0
-#define HAVE_MMX2 0
-#define HAVE_AMD3DNOW 0
-#define RENAME(a) a ## _X86
-#include "osd_template.c"
-#endif
-
-//MMX versions
-#ifdef COMPILE_MMX
-#undef RENAME
-#undef HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_AMD3DNOW
-#define HAVE_MMX 1
-#define HAVE_MMX2 0
-#define HAVE_AMD3DNOW 0
-#define RENAME(a) a ## _MMX
-#include "osd_template.c"
-#endif
-
-//MMX2 versions
-#ifdef COMPILE_MMX2
-#undef RENAME
-#undef HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_AMD3DNOW
-#define HAVE_MMX 1
-#define HAVE_MMX2 1
-#define HAVE_AMD3DNOW 0
-#define RENAME(a) a ## _MMX2
-#include "osd_template.c"
-#endif
-
-//3DNOW versions
-#ifdef COMPILE_3DNOW
-#undef RENAME
-#undef HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_AMD3DNOW
-#define HAVE_MMX 1
-#define HAVE_MMX2 0
-#define HAVE_AMD3DNOW 1
-#define RENAME(a) a ## _3DNow
-#include "osd_template.c"
-#endif
-
-#endif /* ARCH_X86 */
-
-void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-#if CONFIG_RUNTIME_CPUDETECT
-#if ARCH_X86
-	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
-		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
-	else if(gCpuCaps.has3DNow)
-		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
-	else if(gCpuCaps.hasMMX)
-		vo_draw_alpha_yv12_MMX(w, h, src, srca, srcstride, dstbase, dststride);
-	else
-		vo_draw_alpha_yv12_X86(w, h, src, srca, srcstride, dstbase, dststride);
-#else
-		vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
-#endif
-#else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
-		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
-#elif HAVE_AMD3DNOW
-		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
-#elif HAVE_MMX
-		vo_draw_alpha_yv12_MMX(w, h, src, srca, srcstride, dstbase, dststride);
-#elif ARCH_X86
-		vo_draw_alpha_yv12_X86(w, h, src, srca, srcstride, dstbase, dststride);
-#else
-		vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
-#endif
-#endif //!CONFIG_RUNTIME_CPUDETECT
-}
-
-void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-#if CONFIG_RUNTIME_CPUDETECT
-#if ARCH_X86
-	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
-		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
-	else if(gCpuCaps.has3DNow)
-		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
-	else if(gCpuCaps.hasMMX)
-		vo_draw_alpha_yuy2_MMX(w, h, src, srca, srcstride, dstbase, dststride);
-	else
-		vo_draw_alpha_yuy2_X86(w, h, src, srca, srcstride, dstbase, dststride);
-#else
-		vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
-#endif
-#else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
-		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
-#elif HAVE_AMD3DNOW
-		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
-#elif HAVE_MMX
-		vo_draw_alpha_yuy2_MMX(w, h, src, srca, srcstride, dstbase, dststride);
-#elif ARCH_X86
-		vo_draw_alpha_yuy2_X86(w, h, src, srca, srcstride, dstbase, dststride);
-#else
-		vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
-#endif
-#endif //!CONFIG_RUNTIME_CPUDETECT
-}
-
-void vo_draw_alpha_uyvy(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-#if CONFIG_RUNTIME_CPUDETECT
-#if ARCH_X86
-	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
-		vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
-	else if(gCpuCaps.has3DNow)
-		vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
-	else if(gCpuCaps.hasMMX)
-		vo_draw_alpha_uyvy_MMX(w, h, src, srca, srcstride, dstbase, dststride);
-	else
-		vo_draw_alpha_uyvy_X86(w, h, src, srca, srcstride, dstbase, dststride);
-#else
-		vo_draw_alpha_uyvy_C(w, h, src, srca, srcstride, dstbase, dststride);
-#endif
-#else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
-		vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
-#elif HAVE_AMD3DNOW
-		vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
-#elif HAVE_MMX
-		vo_draw_alpha_uyvy_MMX(w, h, src, srca, srcstride, dstbase, dststride);
-#elif ARCH_X86
-		vo_draw_alpha_uyvy_X86(w, h, src, srca, srcstride, dstbase, dststride);
-#else
-		vo_draw_alpha_uyvy_C(w, h, src, srca, srcstride, dstbase, dststride);
-#endif
-#endif //!CONFIG_RUNTIME_CPUDETECT
-}
-
-void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-#if CONFIG_RUNTIME_CPUDETECT
-#if ARCH_X86
-	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
-		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
-	else if(gCpuCaps.has3DNow)
-		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
-	else if(gCpuCaps.hasMMX)
-		vo_draw_alpha_rgb24_MMX(w, h, src, srca, srcstride, dstbase, dststride);
-	else
-		vo_draw_alpha_rgb24_X86(w, h, src, srca, srcstride, dstbase, dststride);
-#else
-		vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
-#endif
-#else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
-		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
-#elif HAVE_AMD3DNOW
-		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
-#elif HAVE_MMX
-		vo_draw_alpha_rgb24_MMX(w, h, src, srca, srcstride, dstbase, dststride);
-#elif ARCH_X86
-		vo_draw_alpha_rgb24_X86(w, h, src, srca, srcstride, dstbase, dststride);
-#else
-		vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
-#endif
-#endif //!CONFIG_RUNTIME_CPUDETECT
-}
-
-void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-#if CONFIG_RUNTIME_CPUDETECT
-#if ARCH_X86
-	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
-		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
-	else if(gCpuCaps.has3DNow)
-		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
-	else if(gCpuCaps.hasMMX)
-		vo_draw_alpha_rgb32_MMX(w, h, src, srca, srcstride, dstbase, dststride);
-	else
-		vo_draw_alpha_rgb32_X86(w, h, src, srca, srcstride, dstbase, dststride);
-#else
-		vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
-#endif
-#else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
-		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
-#elif HAVE_AMD3DNOW
-		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
-#elif HAVE_MMX
-		vo_draw_alpha_rgb32_MMX(w, h, src, srca, srcstride, dstbase, dststride);
-#elif ARCH_X86
-		vo_draw_alpha_rgb32_X86(w, h, src, srca, srcstride, dstbase, dststride);
-#else
-		vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
-#endif
-#endif //!CONFIG_RUNTIME_CPUDETECT
-}
-
-#ifdef FAST_OSD_TABLE
-static unsigned short fast_osd_12bpp_table[256];
-static unsigned short fast_osd_15bpp_table[256];
-static unsigned short fast_osd_16bpp_table[256];
-#endif
-
-void vo_draw_alpha_init(void){
-#ifdef FAST_OSD_TABLE
-    int i;
-    for(i=0;i<256;i++){
-        fast_osd_12bpp_table[i]=((i>>4)<< 8)|((i>>4)<<4)|(i>>4);
-        fast_osd_15bpp_table[i]=((i>>3)<<10)|((i>>3)<<5)|(i>>3);
-        fast_osd_16bpp_table[i]=((i>>3)<<11)|((i>>2)<<5)|(i>>3);
-    }
-#endif
-//FIXME the optimized stuff is a lie for 15/16bpp as they aren't optimized yet
-	if( mp_msg_test(MSGT_OSD,MSGL_V) )
-	{
-#if CONFIG_RUNTIME_CPUDETECT
-#if ARCH_X86
-		// ordered per speed fasterst first
-		if(gCpuCaps.hasMMX2)
-			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
-		else if(gCpuCaps.has3DNow)
-			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
-		else if(gCpuCaps.hasMMX)
-			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX Optimized OnScreenDisplay\n");
-		else
-			mp_msg(MSGT_OSD,MSGL_INFO,"Using X86 Optimized OnScreenDisplay\n");
-#else
-			mp_msg(MSGT_OSD,MSGL_INFO,"Using Unoptimized OnScreenDisplay\n");
-#endif
-#else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
-			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
-#elif HAVE_AMD3DNOW
-			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
-#elif HAVE_MMX
-			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX Optimized OnScreenDisplay\n");
-#elif ARCH_X86
-			mp_msg(MSGT_OSD,MSGL_INFO,"Using X86 Optimized OnScreenDisplay\n");
-#else
-			mp_msg(MSGT_OSD,MSGL_INFO,"Using Unoptimized OnScreenDisplay\n");
-#endif
-#endif //!CONFIG_RUNTIME_CPUDETECT
-	}
-}
-
-void vo_draw_alpha_rgb12(int w, int h, unsigned char* src, unsigned char *srca,
-                         int srcstride, unsigned char* dstbase, int dststride) {
-    int y;
-    for (y = 0; y < h; y++) {
-        register unsigned short *dst = (unsigned short*) dstbase;
-        register int x;
-        for (x = 0; x < w; x++) {
-            if(srca[x]){
-#ifdef FAST_OSD
-#ifdef FAST_OSD_TABLE
-                dst[x] = fast_osd_12bpp_table[src[x]];
-#else
-                register unsigned int a = src[x] >> 4;
-                dst[x] = (a << 8) | (a << 4) | a;
-#endif
-#else
-                unsigned char r = dst[x] & 0x0F;
-                unsigned char g = (dst[x] >> 4) & 0x0F;
-                unsigned char b = (dst[x] >> 8) & 0x0F;
-                r = (((r*srca[x]) >> 4) + src[x]) >> 4;
-                g = (((g*srca[x]) >> 4) + src[x]) >> 4;
-                b = (((b*srca[x]) >> 4) + src[x]) >> 4;
-                dst[x] = (b << 8) | (g << 4) | r;
-#endif
-            }
-        }
-        src += srcstride;
-        srca += srcstride;
-        dstbase += dststride;
-    }
-    return;
-}
-
-void vo_draw_alpha_rgb15(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-    for(y=0;y<h;y++){
-        register unsigned short *dst = (unsigned short*) dstbase;
-        register int x;
-        for(x=0;x<w;x++){
-            if(srca[x]){
-#ifdef FAST_OSD
-#ifdef FAST_OSD_TABLE
-                dst[x]=fast_osd_15bpp_table[src[x]];
-#else
-		register unsigned int a=src[x]>>3;
-                dst[x]=(a<<10)|(a<<5)|a;
-#endif
-#else
-                unsigned char r=dst[x]&0x1F;
-                unsigned char g=(dst[x]>>5)&0x1F;
-                unsigned char b=(dst[x]>>10)&0x1F;
-                r=(((r*srca[x])>>5)+src[x])>>3;
-                g=(((g*srca[x])>>5)+src[x])>>3;
-                b=(((b*srca[x])>>5)+src[x])>>3;
-                dst[x]=(b<<10)|(g<<5)|r;
-#endif
-            }
-        }
-        src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-    return;
-}
-
-void vo_draw_alpha_rgb16(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-    for(y=0;y<h;y++){
-        register unsigned short *dst = (unsigned short*) dstbase;
-        register int x;
-        for(x=0;x<w;x++){
-            if(srca[x]){
-#ifdef FAST_OSD
-#ifdef FAST_OSD_TABLE
-                dst[x]=fast_osd_16bpp_table[src[x]];
-#else
-                dst[x]=((src[x]>>3)<<11)|((src[x]>>2)<<5)|(src[x]>>3);
-#endif
-#else
-                unsigned char r=dst[x]&0x1F;
-                unsigned char g=(dst[x]>>5)&0x3F;
-                unsigned char b=(dst[x]>>11)&0x1F;
-                r=(((r*srca[x])>>5)+src[x])>>3;
-                g=(((g*srca[x])>>6)+src[x])>>2;
-                b=(((b*srca[x])>>5)+src[x])>>3;
-                dst[x]=(b<<11)|(g<<5)|r;
-#endif
-            }
-        }
-        src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-    return;
-}
--- a/libvo/osd.h	Wed Oct 27 17:52:45 2010 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-/*
- * generic alpha renderers for all YUV modes and RGB depths
- * These are "reference implementations", should be optimized later (MMX, etc).
- *
- * This file is part of MPlayer.
- *
- * MPlayer is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * MPlayer is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with MPlayer; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- */
-
-#ifndef MPLAYER_OSD_H
-#define MPLAYER_OSD_H
-
-void vo_draw_alpha_init(void); // build tables
-
-void vo_draw_alpha_yv12(int w,  int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
-void vo_draw_alpha_yuy2(int w,  int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
-void vo_draw_alpha_uyvy(int w,  int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
-void vo_draw_alpha_rgb24(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
-void vo_draw_alpha_rgb32(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
-void vo_draw_alpha_rgb12(int w, int h, unsigned char* src, unsigned char *srca,
-                         int srcstride, unsigned char* dstbase, int dststride);
-void vo_draw_alpha_rgb15(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
-void vo_draw_alpha_rgb16(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
-
-#endif /* MPLAYER_OSD_H */
--- a/libvo/osd_template.c	Wed Oct 27 17:52:45 2010 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,484 +0,0 @@
-/*
- * generic alpha renderers for all YUV modes and RGB depths
- * Optimized by Nick and Michael.
- *
- * This file is part of MPlayer.
- *
- * MPlayer is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * MPlayer is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with MPlayer; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- */
-
-#undef PREFETCH
-#undef EMMS
-#undef PREFETCHW
-#undef PAVGB
-
-#if HAVE_AMD3DNOW
-#define PREFETCH  "prefetch"
-#define PREFETCHW "prefetchw"
-#define PAVGB	  "pavgusb"
-#elif HAVE_MMX2
-#define PREFETCH "prefetchnta"
-#define PREFETCHW "prefetcht0"
-#define PAVGB	  "pavgb"
-#else
-#define PREFETCH " # nop"
-#define PREFETCHW " # nop"
-#endif
-
-#if HAVE_AMD3DNOW
-/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
-#define EMMS     "femms"
-#else
-#define EMMS     "emms"
-#endif
-
-static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-#if defined(FAST_OSD) && !HAVE_MMX
-    w=w>>1;
-#endif
-#if HAVE_MMX
-    __asm__ volatile(
-        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
-        "movq %%mm5, %%mm4\n\t"
-        "movq %%mm5, %%mm7\n\t"
-        "psllw $8, %%mm5\n\t" //FF00FF00FF00
-        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
-        ::);
-#endif
-    for(y=0;y<h;y++){
-        register int x;
-#if HAVE_MMX
-    __asm__ volatile(
-	PREFETCHW" %0\n\t"
-	PREFETCH" %1\n\t"
-	PREFETCH" %2\n\t"
-	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
-    for(x=0;x<w;x+=8){
-	__asm__ volatile(
-		"movl %1, %%eax\n\t"
-		"orl 4%1, %%eax\n\t"
-		" jz 1f\n\t"
-		PREFETCHW" 32%0\n\t"
-		PREFETCH" 32%1\n\t"
-		PREFETCH" 32%2\n\t"
-		"movq	%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"pand %%mm4, %%mm0\n\t" 	//0Y0Y0Y0Y
-		"psrlw $8, %%mm1\n\t"		//0Y0Y0Y0Y
-		"movq	%1, %%mm2\n\t" 		//srca HGFEDCBA
-		"paddb	%%mm7, %%mm2\n\t"
-		"movq %%mm2, %%mm3\n\t"
-		"pand %%mm4, %%mm2\n\t" 	//0G0E0C0A
-		"psrlw $8, %%mm3\n\t"		//0H0F0D0B
-		"pmullw	%%mm2, %%mm0\n\t"
-		"pmullw	%%mm3, %%mm1\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"pand %%mm5, %%mm1\n\t"
-		"por %%mm1, %%mm0\n\t"
-		"paddb	%2, %%mm0\n\t"
-		"movq	%%mm0, %0\n\t"
-		"1:\n\t"
-		:: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
-		: "%eax");
-	}
-#else
-        for(x=0;x<w;x++){
-#ifdef FAST_OSD
-            if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
-            if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1];
-#else
-            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
-#endif
-        }
-#endif
-        src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-#if HAVE_MMX
-	__asm__ volatile(EMMS:::"memory");
-#endif
-    return;
-}
-
-static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-#if defined(FAST_OSD) && !HAVE_MMX
-    w=w>>1;
-#endif
-#if HAVE_MMX
-    __asm__ volatile(
-        "pxor %%mm7, %%mm7\n\t"
-        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
-        "movq %%mm5, %%mm6\n\t"
-        "movq %%mm5, %%mm4\n\t"
-        "psllw $8, %%mm5\n\t" //FF00FF00FF00
-        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
-        ::);
-#endif
-    for(y=0;y<h;y++){
-        register int x;
-#if HAVE_MMX
-    __asm__ volatile(
-	PREFETCHW" %0\n\t"
-	PREFETCH" %1\n\t"
-	PREFETCH" %2\n\t"
-	::"m"(*dstbase),"m"(*srca),"m"(*src));
-    for(x=0;x<w;x+=4){
-	__asm__ volatile(
-		"movl %1, %%eax\n\t"
-		"orl %%eax, %%eax\n\t"
-		" jz 1f\n\t"
-		PREFETCHW" 32%0\n\t"
-		PREFETCH" 32%1\n\t"
-		PREFETCH" 32%2\n\t"
-		"movq	%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"pand %%mm4, %%mm0\n\t" 	//0Y0Y0Y0Y
-		"movd	%%eax, %%mm2\n\t"	//srca 0000DCBA
-		"paddb	%%mm6, %%mm2\n\t"
-		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0D0C0B0A
-		"pmullw	%%mm2, %%mm0\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"pand %%mm5, %%mm1\n\t" 	//U0V0U0V0
-		"movd %2, %%mm2\n\t"		//src 0000DCBA
-		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0D0C0B0A
-		"por %%mm1, %%mm0\n\t"
-		"paddb	%%mm2, %%mm0\n\t"
-		"movq	%%mm0, %0\n\t"
-		"1:\n\t"
-		:: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
-		: "%eax");
-	}
-#else
-        for(x=0;x<w;x++){
-#ifdef FAST_OSD
-            if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
-            if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1];
-#else
-            if(srca[x]) {
-               dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
-               dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
-           }
-#endif
-        }
-#endif
-	src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-#if HAVE_MMX
-	__asm__ volatile(EMMS:::"memory");
-#endif
-    return;
-}
-
-static inline void RENAME(vo_draw_alpha_uyvy)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-  int y;
-#if defined(FAST_OSD)
-  w=w>>1;
-#endif
-  for(y=0;y<h;y++){
-    register int x;
-    for(x=0;x<w;x++){
-#ifdef FAST_OSD
-      if(srca[2*x+0]) dstbase[4*x+2]=src[2*x+0];
-      if(srca[2*x+1]) dstbase[4*x+0]=src[2*x+1];
-#else
-      if(srca[x]) {
-	dstbase[2*x+1]=((dstbase[2*x+1]*srca[x])>>8)+src[x];
-	dstbase[2*x]=((((signed)dstbase[2*x]-128)*srca[x])>>8)+128;
-      }
-#endif
-    }
-    src+=srcstride;
-    srca+=srcstride;
-    dstbase+=dststride;
-  }
-}
-
-static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-#if HAVE_MMX
-    __asm__ volatile(
-        "pxor %%mm7, %%mm7\n\t"
-        "pcmpeqb %%mm6, %%mm6\n\t" // F..F
-        ::);
-#endif
-    for(y=0;y<h;y++){
-        register unsigned char *dst = dstbase;
-        register int x;
-#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX)
-#if HAVE_MMX
-    __asm__ volatile(
-	PREFETCHW" %0\n\t"
-	PREFETCH" %1\n\t"
-	PREFETCH" %2\n\t"
-	::"m"(*dst),"m"(*srca),"m"(*src):"memory");
-    for(x=0;x<w;x+=2){
-     if(srca[x] || srca[x+1])
-	__asm__ volatile(
-		PREFETCHW" 32%0\n\t"
-		PREFETCH" 32%1\n\t"
-		PREFETCH" 32%2\n\t"
-		"movq	%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"movq	%%mm0, %%mm5\n\t"
-		"punpcklbw %%mm7, %%mm0\n\t"
-		"punpckhbw %%mm7, %%mm1\n\t"
-		"movd	%1, %%mm2\n\t" // srca ABCD0000
-		"paddb	%%mm6, %%mm2\n\t"
-		"punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
-		"punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
-		"psrlq  $8, %%mm2\n\t" // srca AAABBBB0
-		"movq	%%mm2, %%mm3\n\t"
-		"punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0B
-		"punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B00
-		"pmullw	%%mm2, %%mm0\n\t"
-		"pmullw	%%mm3, %%mm1\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"psrlw	$8, %%mm1\n\t"
-		"packuswb %%mm1, %%mm0\n\t"
-		"movd %2, %%mm2	\n\t" // src ABCD0000
-		"punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
-		"punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
-		"psrlq  $8, %%mm2\n\t" // src AAABBBB0
-		"paddb	%%mm2, %%mm0\n\t"
-		"pand	%4, %%mm5\n\t"
-		"pand	%3, %%mm0\n\t"
-		"por	%%mm0, %%mm5\n\t"
-		"movq	%%mm5, %0\n\t"
-		:: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh));
-		dst += 6;
-	}
-#else /* HAVE_MMX */
-    for(x=0;x<w;x++){
-        if(srca[x]){
-	    __asm__ volatile(
-		"movzbl (%0), %%ecx\n\t"
-		"movzbl 1(%0), %%eax\n\t"
-
-		"imull %1, %%ecx\n\t"
-		"imull %1, %%eax\n\t"
-
-		"addl %2, %%ecx\n\t"
-		"addl %2, %%eax\n\t"
-
-		"movb %%ch, (%0)\n\t"
-		"movb %%ah, 1(%0)\n\t"
-
-                "movzbl 2(%0), %%eax\n\t"
-		"imull %1, %%eax\n\t"
-		"addl %2, %%eax\n\t"
-		"movb %%ah, 2(%0)\n\t"
-		:
-		:"D" (dst),
-		 "r" ((unsigned)srca[x]),
-		 "r" (((unsigned)src[x])<<8)
-		:"%eax", "%ecx"
-		);
-            }
-	    dst += 3;
-        }
-#endif /* !HAVE_MMX */
-#else /*non x86 arch or x86_64 with MMX disabled */
-        for(x=0;x<w;x++){
-            if(srca[x]){
-#ifdef FAST_OSD
-		dst[0]=dst[1]=dst[2]=src[x];
-#else
-		dst[0]=((dst[0]*srca[x])>>8)+src[x];
-		dst[1]=((dst[1]*srca[x])>>8)+src[x];
-		dst[2]=((dst[2]*srca[x])>>8)+src[x];
-#endif
-            }
-            dst+=3; // 24bpp
-        }
-#endif /* arch_x86 */
-        src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-#if HAVE_MMX
-	__asm__ volatile(EMMS:::"memory");
-#endif
-    return;
-}
-
-static inline void RENAME(vo_draw_alpha_rgb32)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    int y;
-#if HAVE_BIGENDIAN
-    dstbase++;
-#endif
-#if HAVE_MMX
-#if HAVE_AMD3DNOW
-    __asm__ volatile(
-        "pxor %%mm7, %%mm7\n\t"
-        "pcmpeqb %%mm6, %%mm6\n\t" // F..F
-        ::);
-#else /* HAVE_AMD3DNOW */
-    __asm__ volatile(
-        "pxor %%mm7, %%mm7\n\t"
-        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
-        "movq %%mm5, %%mm4\n\t"
-        "psllw $8, %%mm5\n\t" //FF00FF00FF00
-        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
-        ::);
-#endif /* HAVE_AMD3DNOW */
-#endif /* HAVE_MMX */
-    for(y=0;y<h;y++){
-        register int x;
-#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX)
-#if HAVE_MMX
-#if HAVE_AMD3DNOW
-    __asm__ volatile(
-	PREFETCHW" %0\n\t"
-	PREFETCH" %1\n\t"
-	PREFETCH" %2\n\t"
-	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
-    for(x=0;x<w;x+=2){
-     if(srca[x] || srca[x+1])
-	__asm__ volatile(
-		PREFETCHW" 32%0\n\t"
-		PREFETCH" 32%1\n\t"
-		PREFETCH" 32%2\n\t"
-		"movq	%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"punpcklbw %%mm7, %%mm0\n\t"
-		"punpckhbw %%mm7, %%mm1\n\t"
-		"movd	%1, %%mm2\n\t" // srca ABCD0000
-		"paddb	%%mm6, %%mm2\n\t"
-		"punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
-		"punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
-		"movq	%%mm2, %%mm3\n\t"
-		"punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
-		"punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
-		"pmullw	%%mm2, %%mm0\n\t"
-		"pmullw	%%mm3, %%mm1\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"psrlw	$8, %%mm1\n\t"
-		"packuswb %%mm1, %%mm0\n\t"
-		"movd %2, %%mm2	\n\t" // src ABCD0000
-		"punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
-		"punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
-		"paddb	%%mm2, %%mm0\n\t"
-		"movq	%%mm0, %0\n\t"
-		:: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
-	}
-#else //this is faster for intels crap
-    __asm__ volatile(
-	PREFETCHW" %0\n\t"
-	PREFETCH" %1\n\t"
-	PREFETCH" %2\n\t"
-	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
-    for(x=0;x<w;x+=4){
-	__asm__ volatile(
-		"movl %1, %%eax\n\t"
-		"orl %%eax, %%eax\n\t"
-		" jz 1f\n\t"
-		PREFETCHW" 32%0\n\t"
-		PREFETCH" 32%1\n\t"
-		PREFETCH" 32%2\n\t"
-		"movq	%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"pand %%mm4, %%mm0\n\t" 	//0R0B0R0B
-		"psrlw $8, %%mm1\n\t"		//0?0G0?0G
-		"movd	%%eax, %%mm2\n\t" 	//srca 0000DCBA
-		"paddb	%3, %%mm2\n\t"
-		"punpcklbw %%mm2, %%mm2\n\t"	//srca DDCCBBAA
-		"movq %%mm2, %%mm3\n\t"
-		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0B0B0A0A
-		"pmullw	%%mm2, %%mm0\n\t"
-		"pmullw	%%mm2, %%mm1\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"pand %%mm5, %%mm1\n\t"
-		"por %%mm1, %%mm0\n\t"
-		"movd %2, %%mm2	\n\t"		//src 0000DCBA
-		"punpcklbw %%mm2, %%mm2\n\t" 	//src DDCCBBAA
-		"movq %%mm2, %%mm6\n\t"
-		"punpcklbw %%mm2, %%mm2\n\t"	//src BBBBAAAA
-		"paddb	%%mm2, %%mm0\n\t"
-		"movq	%%mm0, %0\n\t"
-
-		"movq	8%0, %%mm0\n\t" // dstbase
-		"movq	%%mm0, %%mm1\n\t"
-		"pand %%mm4, %%mm0\n\t" 	//0R0B0R0B
-		"psrlw $8, %%mm1\n\t"		//0?0G0?0G
-		"punpckhbw %%mm7, %%mm3\n\t"	//srca 0D0D0C0C
-		"pmullw	%%mm3, %%mm0\n\t"
-		"pmullw	%%mm3, %%mm1\n\t"
-		"psrlw	$8, %%mm0\n\t"
-		"pand %%mm5, %%mm1\n\t"
-		"por %%mm1, %%mm0\n\t"
-		"punpckhbw %%mm6, %%mm6\n\t"	//src DDDDCCCC
-		"paddb	%%mm6, %%mm0\n\t"
-		"movq	%%mm0, 8%0\n\t"
-		"1:\n\t"
-		:: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]), "m" (bFF)
-		: "%eax");
-	}
-#endif
-#else /* HAVE_MMX */
-    for(x=0;x<w;x++){
-        if(srca[x]){
-	    __asm__ volatile(
-		"movzbl (%0), %%ecx\n\t"
-		"movzbl 1(%0), %%eax\n\t"
-		"movzbl 2(%0), %%edx\n\t"
-
-		"imull %1, %%ecx\n\t"
-		"imull %1, %%eax\n\t"
-		"imull %1, %%edx\n\t"
-
- 		"addl %2, %%ecx\n\t"
-		"addl %2, %%eax\n\t"
-		"addl %2, %%edx\n\t"
-
-		"movb %%ch, (%0)\n\t"
-		"movb %%ah, 1(%0)\n\t"
-		"movb %%dh, 2(%0)\n\t"
-
-		:
-		:"r" (&dstbase[4*x]),
-		 "r" ((unsigned)srca[x]),
-		 "r" (((unsigned)src[x])<<8)
-		:"%eax", "%ecx", "%edx"
-		);
-            }
-        }
-#endif /* HAVE_MMX */
-#else /*non x86 arch or x86_64 with MMX disabled */
-        for(x=0;x<w;x++){
-            if(srca[x]){
-#ifdef FAST_OSD
-		dstbase[4*x+0]=dstbase[4*x+1]=dstbase[4*x+2]=src[x];
-#else
-		dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
-		dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
-		dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
-#endif
-            }
-        }
-#endif /* arch_x86 */
-        src+=srcstride;
-        srca+=srcstride;
-        dstbase+=dststride;
-    }
-#if HAVE_MMX
-	__asm__ volatile(EMMS:::"memory");
-#endif
-    return;
-}
--- a/libvo/vesa_lvo.c	Wed Oct 27 17:52:45 2010 +0000
+++ b/libvo/vesa_lvo.c	Wed Oct 27 17:53:24 2010 +0000
@@ -38,7 +38,7 @@
 #include "libmpcodecs/img_format.h"
 #include "drivers/mga_vid.h" /* <- should be changed to "linux/'something'.h" */
 #include "fastmemcpy.h"
-#include "osd.h"
+#include "sub/osd.h"
 #include "video_out.h"
 #include "sub/sub.h"
 #include "libmpcodecs/vfcap.h"
--- a/libvo/video_out_internal.h	Wed Oct 27 17:52:45 2010 +0000
+++ b/libvo/video_out_internal.h	Wed Oct 27 17:53:24 2010 +0000
@@ -55,6 +55,6 @@
 	uninit\
 };
 
-#include "osd.h"
+#include "sub/osd.h"
 
 #endif /* MPLAYER_VIDEO_OUT_INTERNAL_H */
--- a/libvo/vosub_vidix.c	Wed Oct 27 17:52:45 2010 +0000
+++ b/libvo/vosub_vidix.c	Wed Oct 27 17:53:24 2010 +0000
@@ -40,7 +40,7 @@
 
 #include "vidix/vidix.h"
 #include "fastmemcpy.h"
-#include "osd.h"
+#include "sub/osd.h"
 #include "video_out.h"
 #include "sub/sub.h"
 #include "vosub_vidix.h"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sub/osd.c	Wed Oct 27 17:53:24 2010 +0000
@@ -0,0 +1,428 @@
+/*
+ * generic alpha renderers for all YUV modes and RGB depths
+ * These are "reference implementations", should be optimized later (MMX, etc).
+ * templating code by Michael Niedermayer (michaelni@gmx.at)
+ *
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+//#define FAST_OSD
+//#define FAST_OSD_TABLE
+
+#include "config.h"
+#include "osd.h"
+#include "mp_msg.h"
+#include <inttypes.h>
+#include "cpudetect.h"
+
+#if ARCH_X86
+static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
+static const unsigned long long mask24lh  __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
+static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
+#endif
+
+//Note: we have C, X86-nommx, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
+//Plain C versions
+#if !HAVE_MMX || CONFIG_RUNTIME_CPUDETECT
+#define COMPILE_C
+#endif
+
+#if ARCH_X86
+
+#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
+#define COMPILE_MMX
+#endif
+
+#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
+#define COMPILE_MMX2
+#endif
+
+#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
+#define COMPILE_3DNOW
+#endif
+
+#endif /* ARCH_X86 */
+
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX 0
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
+
+#if ! ARCH_X86
+
+#ifdef COMPILE_C
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX 0
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
+#define RENAME(a) a ## _C
+#include "osd_template.c"
+#endif
+
+#else
+
+//X86 noMMX versions
+#ifdef COMPILE_C
+#undef RENAME
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX 0
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
+#define RENAME(a) a ## _X86
+#include "osd_template.c"
+#endif
+
+//MMX versions
+#ifdef COMPILE_MMX
+#undef RENAME
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX 1
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
+#define RENAME(a) a ## _MMX
+#include "osd_template.c"
+#endif
+
+//MMX2 versions
+#ifdef COMPILE_MMX2
+#undef RENAME
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX 1
+#define HAVE_MMX2 1
+#define HAVE_AMD3DNOW 0
+#define RENAME(a) a ## _MMX2
+#include "osd_template.c"
+#endif
+
+//3DNOW versions
+#ifdef COMPILE_3DNOW
+#undef RENAME
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX 1
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 1
+#define RENAME(a) a ## _3DNow
+#include "osd_template.c"
+#endif
+
+#endif /* ARCH_X86 */
+
+void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+#if CONFIG_RUNTIME_CPUDETECT
+#if ARCH_X86
+	// ordered by speed / fastest first
+	if(gCpuCaps.hasMMX2)
+		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.has3DNow)
+		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX)
+		vo_draw_alpha_yv12_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+	else
+		vo_draw_alpha_yv12_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+#else //CONFIG_RUNTIME_CPUDETECT
+#if HAVE_MMX2
+		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_AMD3DNOW
+		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX
+		vo_draw_alpha_yv12_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+#elif ARCH_X86
+		vo_draw_alpha_yv12_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+#endif //!CONFIG_RUNTIME_CPUDETECT
+}
+
+void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+#if CONFIG_RUNTIME_CPUDETECT
+#if ARCH_X86
+	// ordered by speed / fastest first
+	if(gCpuCaps.hasMMX2)
+		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.has3DNow)
+		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX)
+		vo_draw_alpha_yuy2_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+	else
+		vo_draw_alpha_yuy2_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+#else //CONFIG_RUNTIME_CPUDETECT
+#if HAVE_MMX2
+		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_AMD3DNOW
+		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX
+		vo_draw_alpha_yuy2_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+#elif ARCH_X86
+		vo_draw_alpha_yuy2_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+#endif //!CONFIG_RUNTIME_CPUDETECT
+}
+
+void vo_draw_alpha_uyvy(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+#if CONFIG_RUNTIME_CPUDETECT
+#if ARCH_X86
+	// ordered by speed / fastest first
+	if(gCpuCaps.hasMMX2)
+		vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.has3DNow)
+		vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX)
+		vo_draw_alpha_uyvy_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+	else
+		vo_draw_alpha_uyvy_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_uyvy_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+#else //CONFIG_RUNTIME_CPUDETECT
+#if HAVE_MMX2
+		vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_AMD3DNOW
+		vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX
+		vo_draw_alpha_uyvy_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+#elif ARCH_X86
+		vo_draw_alpha_uyvy_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_uyvy_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+#endif //!CONFIG_RUNTIME_CPUDETECT
+}
+
+void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+#if CONFIG_RUNTIME_CPUDETECT
+#if ARCH_X86
+	// ordered by speed / fastest first
+	if(gCpuCaps.hasMMX2)
+		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.has3DNow)
+		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX)
+		vo_draw_alpha_rgb24_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+	else
+		vo_draw_alpha_rgb24_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+#else //CONFIG_RUNTIME_CPUDETECT
+#if HAVE_MMX2
+		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_AMD3DNOW
+		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX
+		vo_draw_alpha_rgb24_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+#elif ARCH_X86
+		vo_draw_alpha_rgb24_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+#endif //!CONFIG_RUNTIME_CPUDETECT
+}
+
+void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+#if CONFIG_RUNTIME_CPUDETECT
+#if ARCH_X86
+	// ordered by speed / fastest first
+	if(gCpuCaps.hasMMX2)
+		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.has3DNow)
+		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX)
+		vo_draw_alpha_rgb32_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+	else
+		vo_draw_alpha_rgb32_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+#else //CONFIG_RUNTIME_CPUDETECT
+#if HAVE_MMX2
+		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_AMD3DNOW
+		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX
+		vo_draw_alpha_rgb32_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+#elif ARCH_X86
+		vo_draw_alpha_rgb32_X86(w, h, src, srca, srcstride, dstbase, dststride);
+#else
+		vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
+#endif
+#endif //!CONFIG_RUNTIME_CPUDETECT
+}
+
+#ifdef FAST_OSD_TABLE
+static unsigned short fast_osd_12bpp_table[256];
+static unsigned short fast_osd_15bpp_table[256];
+static unsigned short fast_osd_16bpp_table[256];
+#endif
+
+void vo_draw_alpha_init(void){
+#ifdef FAST_OSD_TABLE
+    int i;
+    for(i=0;i<256;i++){
+        fast_osd_12bpp_table[i]=((i>>4)<< 8)|((i>>4)<<4)|(i>>4);
+        fast_osd_15bpp_table[i]=((i>>3)<<10)|((i>>3)<<5)|(i>>3);
+        fast_osd_16bpp_table[i]=((i>>3)<<11)|((i>>2)<<5)|(i>>3);
+    }
+#endif
+//FIXME the optimized stuff is a lie for 15/16bpp as they aren't optimized yet
+	if( mp_msg_test(MSGT_OSD,MSGL_V) )
+	{
+#if CONFIG_RUNTIME_CPUDETECT
+#if ARCH_X86
+		// ordered per speed fasterst first
+		if(gCpuCaps.hasMMX2)
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
+		else if(gCpuCaps.has3DNow)
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
+		else if(gCpuCaps.hasMMX)
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX Optimized OnScreenDisplay\n");
+		else
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using X86 Optimized OnScreenDisplay\n");
+#else
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using Unoptimized OnScreenDisplay\n");
+#endif
+#else //CONFIG_RUNTIME_CPUDETECT
+#if HAVE_MMX2
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
+#elif HAVE_AMD3DNOW
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
+#elif HAVE_MMX
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX Optimized OnScreenDisplay\n");
+#elif ARCH_X86
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using X86 Optimized OnScreenDisplay\n");
+#else
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using Unoptimized OnScreenDisplay\n");
+#endif
+#endif //!CONFIG_RUNTIME_CPUDETECT
+	}
+}
+
+void vo_draw_alpha_rgb12(int w, int h, unsigned char* src, unsigned char *srca,
+                         int srcstride, unsigned char* dstbase, int dststride) {
+    int y;
+    for (y = 0; y < h; y++) {
+        register unsigned short *dst = (unsigned short*) dstbase;
+        register int x;
+        for (x = 0; x < w; x++) {
+            if(srca[x]){
+#ifdef FAST_OSD
+#ifdef FAST_OSD_TABLE
+                dst[x] = fast_osd_12bpp_table[src[x]];
+#else
+                register unsigned int a = src[x] >> 4;
+                dst[x] = (a << 8) | (a << 4) | a;
+#endif
+#else
+                unsigned char r = dst[x] & 0x0F;
+                unsigned char g = (dst[x] >> 4) & 0x0F;
+                unsigned char b = (dst[x] >> 8) & 0x0F;
+                r = (((r*srca[x]) >> 4) + src[x]) >> 4;
+                g = (((g*srca[x]) >> 4) + src[x]) >> 4;
+                b = (((b*srca[x]) >> 4) + src[x]) >> 4;
+                dst[x] = (b << 8) | (g << 4) | r;
+#endif
+            }
+        }
+        src += srcstride;
+        srca += srcstride;
+        dstbase += dststride;
+    }
+    return;
+}
+
+void vo_draw_alpha_rgb15(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+    int y;
+    for(y=0;y<h;y++){
+        register unsigned short *dst = (unsigned short*) dstbase;
+        register int x;
+        for(x=0;x<w;x++){
+            if(srca[x]){
+#ifdef FAST_OSD
+#ifdef FAST_OSD_TABLE
+                dst[x]=fast_osd_15bpp_table[src[x]];
+#else
+		register unsigned int a=src[x]>>3;
+                dst[x]=(a<<10)|(a<<5)|a;
+#endif
+#else
+                unsigned char r=dst[x]&0x1F;
+                unsigned char g=(dst[x]>>5)&0x1F;
+                unsigned char b=(dst[x]>>10)&0x1F;
+                r=(((r*srca[x])>>5)+src[x])>>3;
+                g=(((g*srca[x])>>5)+src[x])>>3;
+                b=(((b*srca[x])>>5)+src[x])>>3;
+                dst[x]=(b<<10)|(g<<5)|r;
+#endif
+            }
+        }
+        src+=srcstride;
+        srca+=srcstride;
+        dstbase+=dststride;
+    }
+    return;
+}
+
+void vo_draw_alpha_rgb16(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+    int y;
+    for(y=0;y<h;y++){
+        register unsigned short *dst = (unsigned short*) dstbase;
+        register int x;
+        for(x=0;x<w;x++){
+            if(srca[x]){
+#ifdef FAST_OSD
+#ifdef FAST_OSD_TABLE
+                dst[x]=fast_osd_16bpp_table[src[x]];
+#else
+                dst[x]=((src[x]>>3)<<11)|((src[x]>>2)<<5)|(src[x]>>3);
+#endif
+#else
+                unsigned char r=dst[x]&0x1F;
+                unsigned char g=(dst[x]>>5)&0x3F;
+                unsigned char b=(dst[x]>>11)&0x1F;
+                r=(((r*srca[x])>>5)+src[x])>>3;
+                g=(((g*srca[x])>>6)+src[x])>>2;
+                b=(((b*srca[x])>>5)+src[x])>>3;
+                dst[x]=(b<<11)|(g<<5)|r;
+#endif
+            }
+        }
+        src+=srcstride;
+        srca+=srcstride;
+        dstbase+=dststride;
+    }
+    return;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sub/osd.h	Wed Oct 27 17:53:24 2010 +0000
@@ -0,0 +1,37 @@
+/*
+ * generic alpha renderers for all YUV modes and RGB depths
+ * These are "reference implementations", should be optimized later (MMX, etc).
+ *
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef MPLAYER_OSD_H
+#define MPLAYER_OSD_H
+
+void vo_draw_alpha_init(void); // build tables
+
+void vo_draw_alpha_yv12(int w,  int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
+void vo_draw_alpha_yuy2(int w,  int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
+void vo_draw_alpha_uyvy(int w,  int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
+void vo_draw_alpha_rgb24(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
+void vo_draw_alpha_rgb32(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
+void vo_draw_alpha_rgb12(int w, int h, unsigned char* src, unsigned char *srca,
+                         int srcstride, unsigned char* dstbase, int dststride);
+void vo_draw_alpha_rgb15(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
+void vo_draw_alpha_rgb16(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride);
+
+#endif /* MPLAYER_OSD_H */
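
The relocated header keeps the existing vo_draw_alpha_* interface unchanged. A minimal caller sketch under the new path (the function and buffer names are hypothetical, not part of this changeset):

    #include "sub/osd.h"

    /* Hypothetical example: blend an 8-bit luma glyph with its 8-bit alpha
     * plane into a 32 bpp frame. vo_draw_alpha_init() is normally called
     * once at video-out init to build the optional FAST_OSD tables. */
    static void blend_osd_into_frame(unsigned char *frame, int frame_stride,
                                     unsigned char *glyph, unsigned char *alpha,
                                     int glyph_stride, int w, int h)
    {
        vo_draw_alpha_rgb32(w, h, glyph, alpha, glyph_stride,
                            frame, frame_stride);
    }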
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sub/osd_template.c	Wed Oct 27 17:53:24 2010 +0000
@@ -0,0 +1,484 @@
+/*
+ * generic alpha renderers for all YUV modes and RGB depths
+ * Optimized by Nick and Michael.
+ *
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#undef PREFETCH
+#undef EMMS
+#undef PREFETCHW
+#undef PAVGB
+
+#if HAVE_AMD3DNOW
+#define PREFETCH  "prefetch"
+#define PREFETCHW "prefetchw"
+#define PAVGB	  "pavgusb"
+#elif HAVE_MMX2
+#define PREFETCH "prefetchnta"
+#define PREFETCHW "prefetcht0"
+#define PAVGB	  "pavgb"
+#else
+#define PREFETCH " # nop"
+#define PREFETCHW " # nop"
+#endif
+
+#if HAVE_AMD3DNOW
+/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
+#define EMMS     "femms"
+#else
+#define EMMS     "emms"
+#endif
+
+static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+    int y;
+#if defined(FAST_OSD) && !HAVE_MMX
+    w=w>>1;
+#endif
+#if HAVE_MMX
+    __asm__ volatile(
+        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
+        "movq %%mm5, %%mm4\n\t"
+        "movq %%mm5, %%mm7\n\t"
+        "psllw $8, %%mm5\n\t" //FF00FF00FF00
+        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
+        ::);
+#endif
+    for(y=0;y<h;y++){
+        register int x;
+#if HAVE_MMX
+    __asm__ volatile(
+	PREFETCHW" %0\n\t"
+	PREFETCH" %1\n\t"
+	PREFETCH" %2\n\t"
+	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
+    for(x=0;x<w;x+=8){
+	__asm__ volatile(
+		"movl %1, %%eax\n\t"
+		"orl 4%1, %%eax\n\t"
+		" jz 1f\n\t"
+		PREFETCHW" 32%0\n\t"
+		PREFETCH" 32%1\n\t"
+		PREFETCH" 32%2\n\t"
+		"movq	%0, %%mm0\n\t" // dstbase
+		"movq	%%mm0, %%mm1\n\t"
+		"pand %%mm4, %%mm0\n\t" 	//0Y0Y0Y0Y
+		"psrlw $8, %%mm1\n\t"		//0Y0Y0Y0Y
+		"movq	%1, %%mm2\n\t" 		//srca HGFEDCBA
+		"paddb	%%mm7, %%mm2\n\t"
+		"movq %%mm2, %%mm3\n\t"
+		"pand %%mm4, %%mm2\n\t" 	//0G0E0C0A
+		"psrlw $8, %%mm3\n\t"		//0H0F0D0B
+		"pmullw	%%mm2, %%mm0\n\t"
+		"pmullw	%%mm3, %%mm1\n\t"
+		"psrlw	$8, %%mm0\n\t"
+		"pand %%mm5, %%mm1\n\t"
+		"por %%mm1, %%mm0\n\t"
+		"paddb	%2, %%mm0\n\t"
+		"movq	%%mm0, %0\n\t"
+		"1:\n\t"
+		:: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
+		: "%eax");
+	}
+#else
+        for(x=0;x<w;x++){
+#ifdef FAST_OSD
+            if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
+            if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1];
+#else
+            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
+#endif
+        }
+#endif
+        src+=srcstride;
+        srca+=srcstride;
+        dstbase+=dststride;
+    }
+#if HAVE_MMX
+	__asm__ volatile(EMMS:::"memory");
+#endif
+    return;
+}
+
+static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+    int y;
+#if defined(FAST_OSD) && !HAVE_MMX
+    w=w>>1;
+#endif
+#if HAVE_MMX
+    __asm__ volatile(
+        "pxor %%mm7, %%mm7\n\t"
+        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
+        "movq %%mm5, %%mm6\n\t"
+        "movq %%mm5, %%mm4\n\t"
+        "psllw $8, %%mm5\n\t" //FF00FF00FF00
+        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
+        ::);
+#endif
+    for(y=0;y<h;y++){
+        register int x;
+#if HAVE_MMX
+    __asm__ volatile(
+	PREFETCHW" %0\n\t"
+	PREFETCH" %1\n\t"
+	PREFETCH" %2\n\t"
+	::"m"(*dstbase),"m"(*srca),"m"(*src));
+    for(x=0;x<w;x+=4){
+	__asm__ volatile(
+		"movl %1, %%eax\n\t"
+		"orl %%eax, %%eax\n\t"
+		" jz 1f\n\t"
+		PREFETCHW" 32%0\n\t"
+		PREFETCH" 32%1\n\t"
+		PREFETCH" 32%2\n\t"
+		"movq	%0, %%mm0\n\t" // dstbase
+		"movq	%%mm0, %%mm1\n\t"
+		"pand %%mm4, %%mm0\n\t" 	//0Y0Y0Y0Y
+		"movd	%%eax, %%mm2\n\t"	//srca 0000DCBA
+		"paddb	%%mm6, %%mm2\n\t"
+		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0D0C0B0A
+		"pmullw	%%mm2, %%mm0\n\t"
+		"psrlw	$8, %%mm0\n\t"
+		"pand %%mm5, %%mm1\n\t" 	//U0V0U0V0
+		"movd %2, %%mm2\n\t"		//src 0000DCBA
+		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0D0C0B0A
+		"por %%mm1, %%mm0\n\t"
+		"paddb	%%mm2, %%mm0\n\t"
+		"movq	%%mm0, %0\n\t"
+		"1:\n\t"
+		:: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
+		: "%eax");
+	}
+#else
+        for(x=0;x<w;x++){
+#ifdef FAST_OSD
+            if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
+            if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1];
+#else
+            if(srca[x]) {
+               dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
+               dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
+           }
+#endif
+        }
+#endif
+	src+=srcstride;
+        srca+=srcstride;
+        dstbase+=dststride;
+    }
+#if HAVE_MMX
+	__asm__ volatile(EMMS:::"memory");
+#endif
+    return;
+}
+
+static inline void RENAME(vo_draw_alpha_uyvy)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+  int y;
+#if defined(FAST_OSD)
+  w=w>>1;
+#endif
+  for(y=0;y<h;y++){
+    register int x;
+    for(x=0;x<w;x++){
+#ifdef FAST_OSD
+      if(srca[2*x+0]) dstbase[4*x+2]=src[2*x+0];
+      if(srca[2*x+1]) dstbase[4*x+0]=src[2*x+1];
+#else
+      if(srca[x]) {
+	dstbase[2*x+1]=((dstbase[2*x+1]*srca[x])>>8)+src[x];
+	dstbase[2*x]=((((signed)dstbase[2*x]-128)*srca[x])>>8)+128;
+      }
+#endif
+    }
+    src+=srcstride;
+    srca+=srcstride;
+    dstbase+=dststride;
+  }
+}
+
+static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+    int y;
+#if HAVE_MMX
+    __asm__ volatile(
+        "pxor %%mm7, %%mm7\n\t"
+        "pcmpeqb %%mm6, %%mm6\n\t" // F..F
+        ::);
+#endif
+    for(y=0;y<h;y++){
+        register unsigned char *dst = dstbase;
+        register int x;
+#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX)
+#if HAVE_MMX
+    __asm__ volatile(
+	PREFETCHW" %0\n\t"
+	PREFETCH" %1\n\t"
+	PREFETCH" %2\n\t"
+	::"m"(*dst),"m"(*srca),"m"(*src):"memory");
+    for(x=0;x<w;x+=2){
+     if(srca[x] || srca[x+1])
+	__asm__ volatile(
+		PREFETCHW" 32%0\n\t"
+		PREFETCH" 32%1\n\t"
+		PREFETCH" 32%2\n\t"
+		"movq	%0, %%mm0\n\t" // dstbase
+		"movq	%%mm0, %%mm1\n\t"
+		"movq	%%mm0, %%mm5\n\t"
+		"punpcklbw %%mm7, %%mm0\n\t"
+		"punpckhbw %%mm7, %%mm1\n\t"
+		"movd	%1, %%mm2\n\t" // srca ABCD0000
+		"paddb	%%mm6, %%mm2\n\t"
+		"punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
+		"punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
+		"psrlq  $8, %%mm2\n\t" // srca AAABBBB0
+		"movq	%%mm2, %%mm3\n\t"
+		"punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0B
+		"punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B00
+		"pmullw	%%mm2, %%mm0\n\t"
+		"pmullw	%%mm3, %%mm1\n\t"
+		"psrlw	$8, %%mm0\n\t"
+		"psrlw	$8, %%mm1\n\t"
+		"packuswb %%mm1, %%mm0\n\t"
+		"movd %2, %%mm2	\n\t" // src ABCD0000
+		"punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
+		"punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
+		"psrlq  $8, %%mm2\n\t" // src AAABBBB0
+		"paddb	%%mm2, %%mm0\n\t"
+		"pand	%4, %%mm5\n\t"
+		"pand	%3, %%mm0\n\t"
+		"por	%%mm0, %%mm5\n\t"
+		"movq	%%mm5, %0\n\t"
+		:: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh));
+		dst += 6;
+	}
+#else /* HAVE_MMX */
+    for(x=0;x<w;x++){
+        if(srca[x]){
+	    __asm__ volatile(
+		"movzbl (%0), %%ecx\n\t"
+		"movzbl 1(%0), %%eax\n\t"
+
+		"imull %1, %%ecx\n\t"
+		"imull %1, %%eax\n\t"
+
+		"addl %2, %%ecx\n\t"
+		"addl %2, %%eax\n\t"
+
+		"movb %%ch, (%0)\n\t"
+		"movb %%ah, 1(%0)\n\t"
+
+                "movzbl 2(%0), %%eax\n\t"
+		"imull %1, %%eax\n\t"
+		"addl %2, %%eax\n\t"
+		"movb %%ah, 2(%0)\n\t"
+		:
+		:"D" (dst),
+		 "r" ((unsigned)srca[x]),
+		 "r" (((unsigned)src[x])<<8)
+		:"%eax", "%ecx"
+		);
+            }
+	    dst += 3;
+        }
+#endif /* !HAVE_MMX */
+#else /*non x86 arch or x86_64 with MMX disabled */
+        for(x=0;x<w;x++){
+            if(srca[x]){
+#ifdef FAST_OSD
+		dst[0]=dst[1]=dst[2]=src[x];
+#else
+		dst[0]=((dst[0]*srca[x])>>8)+src[x];
+		dst[1]=((dst[1]*srca[x])>>8)+src[x];
+		dst[2]=((dst[2]*srca[x])>>8)+src[x];
+#endif
+            }
+            dst+=3; // 24bpp
+        }
+#endif /* arch_x86 */
+        src+=srcstride;
+        srca+=srcstride;
+        dstbase+=dststride;
+    }
+#if HAVE_MMX
+	__asm__ volatile(EMMS:::"memory");
+#endif
+    return;
+}
+
+static inline void RENAME(vo_draw_alpha_rgb32)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+    int y;
+#if HAVE_BIGENDIAN
+    dstbase++;
+#endif
+#if HAVE_MMX
+#if HAVE_AMD3DNOW
+    __asm__ volatile(
+        "pxor %%mm7, %%mm7\n\t"
+        "pcmpeqb %%mm6, %%mm6\n\t" // F..F
+        ::);
+#else /* HAVE_AMD3DNOW */
+    __asm__ volatile(
+        "pxor %%mm7, %%mm7\n\t"
+        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
+        "movq %%mm5, %%mm4\n\t"
+        "psllw $8, %%mm5\n\t" //FF00FF00FF00
+        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
+        ::);
+#endif /* HAVE_AMD3DNOW */
+#endif /* HAVE_MMX */
+    for(y=0;y<h;y++){
+        register int x;
+#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX)
+#if HAVE_MMX
+#if HAVE_AMD3DNOW
+    __asm__ volatile(
+	PREFETCHW" %0\n\t"
+	PREFETCH" %1\n\t"
+	PREFETCH" %2\n\t"
+	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
+    for(x=0;x<w;x+=2){
+     if(srca[x] || srca[x+1])
+	__asm__ volatile(
+		PREFETCHW" 32%0\n\t"
+		PREFETCH" 32%1\n\t"
+		PREFETCH" 32%2\n\t"
+		"movq	%0, %%mm0\n\t" // dstbase
+		"movq	%%mm0, %%mm1\n\t"
+		"punpcklbw %%mm7, %%mm0\n\t"
+		"punpckhbw %%mm7, %%mm1\n\t"
+		"movd	%1, %%mm2\n\t" // srca ABCD0000
+		"paddb	%%mm6, %%mm2\n\t"
+		"punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
+		"punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
+		"movq	%%mm2, %%mm3\n\t"
+		"punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
+		"punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
+		"pmullw	%%mm2, %%mm0\n\t"
+		"pmullw	%%mm3, %%mm1\n\t"
+		"psrlw	$8, %%mm0\n\t"
+		"psrlw	$8, %%mm1\n\t"
+		"packuswb %%mm1, %%mm0\n\t"
+		"movd %2, %%mm2	\n\t" // src ABCD0000
+		"punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
+		"punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
+		"paddb	%%mm2, %%mm0\n\t"
+		"movq	%%mm0, %0\n\t"
+		:: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
+	}
+#else //this is faster for intels crap
+    __asm__ volatile(
+	PREFETCHW" %0\n\t"
+	PREFETCH" %1\n\t"
+	PREFETCH" %2\n\t"
+	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
+    for(x=0;x<w;x+=4){
+	__asm__ volatile(
+		"movl %1, %%eax\n\t"
+		"orl %%eax, %%eax\n\t"
+		" jz 1f\n\t"
+		PREFETCHW" 32%0\n\t"
+		PREFETCH" 32%1\n\t"
+		PREFETCH" 32%2\n\t"
+		"movq	%0, %%mm0\n\t" // dstbase
+		"movq	%%mm0, %%mm1\n\t"
+		"pand %%mm4, %%mm0\n\t" 	//0R0B0R0B
+		"psrlw $8, %%mm1\n\t"		//0?0G0?0G
+		"movd	%%eax, %%mm2\n\t" 	//srca 0000DCBA
+		"paddb	%3, %%mm2\n\t"
+		"punpcklbw %%mm2, %%mm2\n\t"	//srca DDCCBBAA
+		"movq %%mm2, %%mm3\n\t"
+		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0B0B0A0A
+		"pmullw	%%mm2, %%mm0\n\t"
+		"pmullw	%%mm2, %%mm1\n\t"
+		"psrlw	$8, %%mm0\n\t"
+		"pand %%mm5, %%mm1\n\t"
+		"por %%mm1, %%mm0\n\t"
+		"movd %2, %%mm2	\n\t"		//src 0000DCBA
+		"punpcklbw %%mm2, %%mm2\n\t" 	//src DDCCBBAA
+		"movq %%mm2, %%mm6\n\t"
+		"punpcklbw %%mm2, %%mm2\n\t"	//src BBBBAAAA
+		"paddb	%%mm2, %%mm0\n\t"
+		"movq	%%mm0, %0\n\t"
+
+		"movq	8%0, %%mm0\n\t" // dstbase
+		"movq	%%mm0, %%mm1\n\t"
+		"pand %%mm4, %%mm0\n\t" 	//0R0B0R0B
+		"psrlw $8, %%mm1\n\t"		//0?0G0?0G
+		"punpckhbw %%mm7, %%mm3\n\t"	//srca 0D0D0C0C
+		"pmullw	%%mm3, %%mm0\n\t"
+		"pmullw	%%mm3, %%mm1\n\t"
+		"psrlw	$8, %%mm0\n\t"
+		"pand %%mm5, %%mm1\n\t"
+		"por %%mm1, %%mm0\n\t"
+		"punpckhbw %%mm6, %%mm6\n\t"	//src DDDDCCCC
+		"paddb	%%mm6, %%mm0\n\t"
+		"movq	%%mm0, 8%0\n\t"
+		"1:\n\t"
+		:: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]), "m" (bFF)
+		: "%eax");
+	}
+#endif
+#else /* HAVE_MMX */
+    for(x=0;x<w;x++){
+        if(srca[x]){
+	    __asm__ volatile(
+		"movzbl (%0), %%ecx\n\t"
+		"movzbl 1(%0), %%eax\n\t"
+		"movzbl 2(%0), %%edx\n\t"
+
+		"imull %1, %%ecx\n\t"
+		"imull %1, %%eax\n\t"
+		"imull %1, %%edx\n\t"
+
+ 		"addl %2, %%ecx\n\t"
+		"addl %2, %%eax\n\t"
+		"addl %2, %%edx\n\t"
+
+		"movb %%ch, (%0)\n\t"
+		"movb %%ah, 1(%0)\n\t"
+		"movb %%dh, 2(%0)\n\t"
+
+		:
+		:"r" (&dstbase[4*x]),
+		 "r" ((unsigned)srca[x]),
+		 "r" (((unsigned)src[x])<<8)
+		:"%eax", "%ecx", "%edx"
+		);
+            }
+        }
+#endif /* HAVE_MMX */
+#else /*non x86 arch or x86_64 with MMX disabled */
+        for(x=0;x<w;x++){
+            if(srca[x]){
+#ifdef FAST_OSD
+		dstbase[4*x+0]=dstbase[4*x+1]=dstbase[4*x+2]=src[x];
+#else
+		dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
+		dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
+		dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
+#endif
+            }
+        }
+#endif /* arch_x86 */
+        src+=srcstride;
+        srca+=srcstride;
+        dstbase+=dststride;
+    }
+#if HAVE_MMX
+	__asm__ volatile(EMMS:::"memory");
+#endif
+    return;
+}
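
For reference, every plain-C fallback in the template reduces to the same per-byte operation, dst = ((dst * srca) >> 8) + src, skipped where srca is zero. A standalone sketch of that arithmetic (identifiers hypothetical, not part of the changeset):

    #include <stddef.h>

    /* Reference blend over n bytes, matching the C loops above:
     * positions with srca == 0 are left untouched. */
    static void blend_bytes(unsigned char *dst, const unsigned char *src,
                            const unsigned char *srca, size_t n)
    {
        size_t i;
        for (i = 0; i < n; i++)
            if (srca[i])
                dst[i] = (unsigned char)(((dst[i] * srca[i]) >> 8) + src[i]);
    }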