annotate libvo/osd_template.c @ 9987:988c2ffc5bc1

remove remaining cpudetect dependancy
author michael
date Fri, 25 Apr 2003 17:16:55 +0000
parents 9d7477d0d64d
children 14c8c762c2b7
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
1 // Generic alpha renderers for all YUV modes and RGB depths.
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
2 // Optimized by Nick and Michael
3142
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
3 // Code from Michael Niedermayer (michaelni@gmx.at) is under GPL
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
4
3142
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
5 #undef PREFETCH
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
6 #undef EMMS
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
7 #undef PREFETCHW
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
8 #undef PAVGB
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
9
3142
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
10 #ifdef HAVE_3DNOW
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
11 #define PREFETCH "prefetch"
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
12 #define PREFETCHW "prefetchw"
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
13 #define PAVGB "pavgusb"
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
14 #elif defined ( HAVE_MMX2 )
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
15 #define PREFETCH "prefetchnta"
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
16 #define PREFETCHW "prefetcht0"
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
17 #define PAVGB "pavgb"
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
18 #else
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
19 #define PREFETCH "/nop"
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
20 #define PREFETCHW "/nop"
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
21 #endif
622
6737025afed0 to be sure in that header is okey
arpi_esp
parents: 326
diff changeset
22
3142
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
23 #ifdef HAVE_3DNOW
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
24 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
25 #define EMMS "femms"
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
26 #else
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
27 #define EMMS "emms"
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
28 #endif
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
29
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
30 static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
31 int y;
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
32 #if defined(FAST_OSD) && !defined(HAVE_MMX)
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
33 w=w>>1;
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
34 #endif
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
35 PROFILE_START();
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
36 for(y=0;y<h;y++){
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
37 register int x;
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
38 #ifdef HAVE_MMX
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
39 asm volatile(
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
40 PREFETCHW" %0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
41 PREFETCH" %1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
42 PREFETCH" %2\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
43 // "pxor %%mm7, %%mm7\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
44 "pcmpeqb %%mm5, %%mm5\n\t" // F..F
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
45 "movq %%mm5, %%mm4\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
46 "psllw $8, %%mm5\n\t" //FF00FF00FF00
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
47 "psrlw $8, %%mm4\n\t" //00FF00FF00FF
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
48 ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
49 for(x=0;x<w;x+=8){
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
50 asm volatile(
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
51 "movl %1, %%eax\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
52 "orl 4%1, %%eax\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
53 " jz 1f\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
54 PREFETCHW" 32%0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
55 PREFETCH" 32%1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
56 PREFETCH" 32%2\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
57 "movq %0, %%mm0\n\t" // dstbase
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
58 "movq %%mm0, %%mm1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
59 "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
60 "psrlw $8, %%mm1\n\t" //0Y0Y0Y0Y
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
61 "movq %1, %%mm2\n\t" //srca HGFEDCBA
4245
27cb0e43de32 mangling in libvo
atmos4
parents: 3431
diff changeset
62 "paddb "MANGLE(bFF)", %%mm2\n\t"
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
63 "movq %%mm2, %%mm3\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
64 "pand %%mm4, %%mm2\n\t" //0G0E0C0A
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
65 "psrlw $8, %%mm3\n\t" //0H0F0D0B
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
66 "pmullw %%mm2, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
67 "pmullw %%mm3, %%mm1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
68 "psrlw $8, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
69 "pand %%mm5, %%mm1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
70 "por %%mm1, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
71 "paddb %2, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
72 "movq %%mm0, %0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
73 "1:\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
74 :: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
75 : "%eax");
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
76 }
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
77 #else
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
78 for(x=0;x<w;x++){
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
79 #ifdef FAST_OSD
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
80 if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
81 if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1];
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
82 #else
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
83 if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
84 #endif
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
85 }
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
86 #endif
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
87 src+=srcstride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
88 srca+=srcstride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
89 dstbase+=dststride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
90 }
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
91 #ifdef HAVE_MMX
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
92 asm volatile(EMMS:::"memory");
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
93 #endif
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
94 PROFILE_END("vo_draw_alpha_yv12");
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
95 return;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
96 }
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
97
3142
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
98 static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
99 int y;
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
100 #if defined(FAST_OSD) && !defined(HAVE_MMX)
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
101 w=w>>1;
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
102 #endif
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
103 PROFILE_START();
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
104 for(y=0;y<h;y++){
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
105 register int x;
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
106 #ifdef HAVE_MMX
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
107 asm volatile(
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
108 PREFETCHW" %0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
109 PREFETCH" %1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
110 PREFETCH" %2\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
111 "pxor %%mm7, %%mm7\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
112 "pcmpeqb %%mm5, %%mm5\n\t" // F..F
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
113 "movq %%mm5, %%mm4\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
114 "psllw $8, %%mm5\n\t" //FF00FF00FF00
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
115 "psrlw $8, %%mm4\n\t" //00FF00FF00FF
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
116 ::"m"(*dstbase),"m"(*srca),"m"(*src));
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
117 for(x=0;x<w;x+=4){
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
118 asm volatile(
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
119 "movl %1, %%eax\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
120 "orl %%eax, %%eax\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
121 " jz 1f\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
122 PREFETCHW" 32%0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
123 PREFETCH" 32%1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
124 PREFETCH" 32%2\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
125 "movq %0, %%mm0\n\t" // dstbase
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
126 "movq %%mm0, %%mm1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
127 "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
128 "movd %%eax, %%mm2\n\t" //srca 0000DCBA
4245
27cb0e43de32 mangling in libvo
atmos4
parents: 3431
diff changeset
129 "paddb "MANGLE(bFF)", %%mm2\n\t"
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
130 "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
131 "pmullw %%mm2, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
132 "psrlw $8, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
133 "pand %%mm5, %%mm1\n\t" //U0V0U0V0
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
134 "movd %2, %%mm2\n\t" //src 0000DCBA
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
135 "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
136 "por %%mm1, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
137 "paddb %%mm2, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
138 "movq %%mm0, %0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
139 "1:\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
140 :: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
141 : "%eax");
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
142 }
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
143 #else
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
144 for(x=0;x<w;x++){
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
145 #ifdef FAST_OSD
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
146 if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
147 if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1];
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
148 #else
3431
63ecec3bdf93 yuy2 in C color bugfix patch from Artur Zaprzala <artur.zaprzala@talex.com.pl>
michael
parents: 3142
diff changeset
149 if(srca[x]) {
63ecec3bdf93 yuy2 in C color bugfix patch from Artur Zaprzala <artur.zaprzala@talex.com.pl>
michael
parents: 3142
diff changeset
150 dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
63ecec3bdf93 yuy2 in C color bugfix patch from Artur Zaprzala <artur.zaprzala@talex.com.pl>
michael
parents: 3142
diff changeset
151 dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
63ecec3bdf93 yuy2 in C color bugfix patch from Artur Zaprzala <artur.zaprzala@talex.com.pl>
michael
parents: 3142
diff changeset
152 }
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
153 #endif
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
154 }
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
155 #endif
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
156 src+=srcstride;
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
157 srca+=srcstride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
158 dstbase+=dststride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
159 }
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
160 #ifdef HAVE_MMX
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
161 asm volatile(EMMS:::"memory");
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
162 #endif
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
163 PROFILE_END("vo_draw_alpha_yuy2");
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
164 return;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
165 }
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
166
3142
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
167 static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
168 int y;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
169 for(y=0;y<h;y++){
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
170 register unsigned char *dst = dstbase;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
171 register int x;
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
172 #ifdef ARCH_X86
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
173 #ifdef HAVE_MMX
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
174 asm volatile(
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
175 PREFETCHW" %0\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
176 PREFETCH" %1\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
177 PREFETCH" %2\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
178 "pxor %%mm7, %%mm7\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
179 "pcmpeqb %%mm6, %%mm6\n\t" // F..F
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
180 ::"m"(*dst),"m"(*srca),"m"(*src):"memory");
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
181 for(x=0;x<w;x+=2){
2843
5be2017077fb Use new logic suggested by Michael Niedermayer
nick
parents: 2839
diff changeset
182 if(srca[x] || srca[x+1])
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
183 asm volatile(
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
184 PREFETCHW" 32%0\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
185 PREFETCH" 32%1\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
186 PREFETCH" 32%2\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
187 "movq %0, %%mm0\n\t" // dstbase
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
188 "movq %%mm0, %%mm1\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
189 "movq %%mm0, %%mm5\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
190 "punpcklbw %%mm7, %%mm0\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
191 "punpckhbw %%mm7, %%mm1\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
192 "movd %1, %%mm2\n\t" // srca ABCD0000
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
193 "paddb %%mm6, %%mm2\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
194 "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
195 "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
196 "movq %%mm2, %%mm3\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
197 "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
198 "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
199 "pmullw %%mm2, %%mm0\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
200 "pmullw %%mm3, %%mm1\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
201 "psrlw $8, %%mm0\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
202 "psrlw $8, %%mm1\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
203 "packuswb %%mm1, %%mm0\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
204 "movd %2, %%mm2 \n\t" // src ABCD0000
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
205 "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
206 "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
207 "paddb %%mm2, %%mm0\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
208 "pand %4, %%mm5\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
209 "pand %3, %%mm0\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
210 "por %%mm0, %%mm5\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
211 "movq %%mm5, %0\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
212 :: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh));
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
213 dst += 6;
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
214 }
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
215 #else /* HAVE_MMX */
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
216 for(x=0;x<w;x++){
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
217 if(srca[x]){
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
218 asm volatile(
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
219 "movzbl (%0), %%ecx\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
220 "movzbl 1(%0), %%eax\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
221
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
222 "imull %1, %%ecx\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
223 "imull %1, %%eax\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
224
5139
473058a6211e workaround lack of -fomit-frame-pointer
michael
parents: 4245
diff changeset
225 "addl %2, %%ecx\n\t"
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
226 "addl %2, %%eax\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
227
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
228 "movb %%ch, (%0)\n\t"
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
229 "movb %%ah, 1(%0)\n\t"
5139
473058a6211e workaround lack of -fomit-frame-pointer
michael
parents: 4245
diff changeset
230
473058a6211e workaround lack of -fomit-frame-pointer
michael
parents: 4245
diff changeset
231 "movzbl 2(%0), %%eax\n\t"
473058a6211e workaround lack of -fomit-frame-pointer
michael
parents: 4245
diff changeset
232 "imull %1, %%eax\n\t"
473058a6211e workaround lack of -fomit-frame-pointer
michael
parents: 4245
diff changeset
233 "addl %2, %%eax\n\t"
473058a6211e workaround lack of -fomit-frame-pointer
michael
parents: 4245
diff changeset
234 "movb %%ah, 2(%0)\n\t"
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
235 :
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
236 :"r" (dst),
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
237 "r" ((unsigned)srca[x]),
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
238 "r" (((unsigned)src[x])<<8)
5139
473058a6211e workaround lack of -fomit-frame-pointer
michael
parents: 4245
diff changeset
239 :"%eax", "%ecx"
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
240 );
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
241 }
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
242 dst += 3;
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
243 }
5139
473058a6211e workaround lack of -fomit-frame-pointer
michael
parents: 4245
diff changeset
244 #endif /* !HAVE_MMX */
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
245 #else /*non x86 arch*/
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
246 for(x=0;x<w;x++){
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
247 if(srca[x]){
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
248 #ifdef FAST_OSD
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
249 dst[0]=dst[1]=dst[2]=src[x];
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
250 #else
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
251 dst[0]=((dst[0]*srca[x])>>8)+src[x];
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
252 dst[1]=((dst[1]*srca[x])>>8)+src[x];
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
253 dst[2]=((dst[2]*srca[x])>>8)+src[x];
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
254 #endif
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
255 }
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
256 dst+=3; // 24bpp
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
257 }
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
258 #endif /* arch_x86 */
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
259 src+=srcstride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
260 srca+=srcstride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
261 dstbase+=dststride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
262 }
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
263 #ifdef HAVE_MMX
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
264 asm volatile(EMMS:::"memory");
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
265 #endif
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
266 return;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
267 }
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
268
3142
0f6cce3a8059 runtime cpu detection
michael
parents: 2850
diff changeset
269 static inline void RENAME(vo_draw_alpha_rgb32)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
270 int y;
2833
1b6c207c0410 Enable MMX stuff
nick
parents: 2823
diff changeset
271 PROFILE_START();
9960
9d7477d0d64d big endian yellow fix?
michael
parents: 5139
diff changeset
272 #ifdef WORDS_BIGENDIAN
9d7477d0d64d big endian yellow fix?
michael
parents: 5139
diff changeset
273 dstbase++;
9d7477d0d64d big endian yellow fix?
michael
parents: 5139
diff changeset
274 #endif
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
275 for(y=0;y<h;y++){
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
276 register int x;
2798
ee2cd36a81a2 Code cleanup - emms is not required when MMX block is commented out.
nick
parents: 2578
diff changeset
277 #ifdef ARCH_X86
2833
1b6c207c0410 Enable MMX stuff
nick
parents: 2823
diff changeset
278 #ifdef HAVE_MMX
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
279 #ifdef HAVE_3DNOW
2835
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
280 asm volatile(
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
281 PREFETCHW" %0\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
282 PREFETCH" %1\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
283 PREFETCH" %2\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
284 "pxor %%mm7, %%mm7\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
285 "pcmpeqb %%mm6, %%mm6\n\t" // F..F
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
286 ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
2835
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
287 for(x=0;x<w;x+=2){
2843
5be2017077fb Use new logic suggested by Michael Niedermayer
nick
parents: 2839
diff changeset
288 if(srca[x] || srca[x+1])
2798
ee2cd36a81a2 Code cleanup - emms is not required when MMX block is commented out.
nick
parents: 2578
diff changeset
289 asm volatile(
2835
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
290 PREFETCHW" 32%0\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
291 PREFETCH" 32%1\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
292 PREFETCH" 32%2\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
293 "movq %0, %%mm0\n\t" // dstbase
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
294 "movq %%mm0, %%mm1\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
295 "punpcklbw %%mm7, %%mm0\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
296 "punpckhbw %%mm7, %%mm1\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
297 "movd %1, %%mm2\n\t" // srca ABCD0000
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
298 "paddb %%mm6, %%mm2\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
299 "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
300 "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
301 "movq %%mm2, %%mm3\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
302 "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
303 "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
304 "pmullw %%mm2, %%mm0\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
305 "pmullw %%mm3, %%mm1\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
306 "psrlw $8, %%mm0\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
307 "psrlw $8, %%mm1\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
308 "packuswb %%mm1, %%mm0\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
309 "movd %2, %%mm2 \n\t" // src ABCD0000
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
310 "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
311 "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
312 "paddb %%mm2, %%mm0\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
313 "movq %%mm0, %0\n\t"
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
314 :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
86fdf7897315 Minor speedup for K6-2, K7, P3
nick
parents: 2833
diff changeset
315 }
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
316 #else //this is faster for intels crap
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
317 asm volatile(
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
318 PREFETCHW" %0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
319 PREFETCH" %1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
320 PREFETCH" %2\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
321 "pxor %%mm7, %%mm7\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
322 "pcmpeqb %%mm5, %%mm5\n\t" // F..F
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
323 "movq %%mm5, %%mm4\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
324 "psllw $8, %%mm5\n\t" //FF00FF00FF00
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
325 "psrlw $8, %%mm4\n\t" //00FF00FF00FF
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
326 ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
327 for(x=0;x<w;x+=4){
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
328 asm volatile(
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
329 "movl %1, %%eax\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
330 "orl %%eax, %%eax\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
331 " jz 1f\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
332 PREFETCHW" 32%0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
333 PREFETCH" 32%1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
334 PREFETCH" 32%2\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
335 "movq %0, %%mm0\n\t" // dstbase
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
336 "movq %%mm0, %%mm1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
337 "pand %%mm4, %%mm0\n\t" //0R0B0R0B
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
338 "psrlw $8, %%mm1\n\t" //0?0G0?0G
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
339 "movd %%eax, %%mm2\n\t" //srca 0000DCBA
4245
27cb0e43de32 mangling in libvo
atmos4
parents: 3431
diff changeset
340 "paddb "MANGLE(bFF)", %%mm2\n\t"
2846
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
341 "punpcklbw %%mm2, %%mm2\n\t" //srca DDCCBBAA
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
342 "movq %%mm2, %%mm3\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
343 "punpcklbw %%mm7, %%mm2\n\t" //srca 0B0B0A0A
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
344 "pmullw %%mm2, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
345 "pmullw %%mm2, %%mm1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
346 "psrlw $8, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
347 "pand %%mm5, %%mm1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
348 "por %%mm1, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
349 "movd %2, %%mm2 \n\t" //src 0000DCBA
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
350 "punpcklbw %%mm2, %%mm2\n\t" //src DDCCBBAA
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
351 "movq %%mm2, %%mm6\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
352 "punpcklbw %%mm2, %%mm2\n\t" //src BBBBAAAA
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
353 "paddb %%mm2, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
354 "movq %%mm0, %0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
355
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
356 "movq 8%0, %%mm0\n\t" // dstbase
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
357 "movq %%mm0, %%mm1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
358 "pand %%mm4, %%mm0\n\t" //0R0B0R0B
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
359 "psrlw $8, %%mm1\n\t" //0?0G0?0G
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
360 "punpckhbw %%mm7, %%mm3\n\t" //srca 0D0D0C0C
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
361 "pmullw %%mm3, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
362 "pmullw %%mm3, %%mm1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
363 "psrlw $8, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
364 "pand %%mm5, %%mm1\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
365 "por %%mm1, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
366 "punpckhbw %%mm6, %%mm6\n\t" //src DDDDCCCC
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
367 "paddb %%mm6, %%mm0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
368 "movq %%mm0, 8%0\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
369 "1:\n\t"
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
370 :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x])
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
371 : "%eax");
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
372 }
ab51228bf3cf p2/p3 bgr32 version (20%faster)
michael
parents: 2843
diff changeset
373 #endif
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
374 #else /* HAVE_MMX */
2823
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
375 for(x=0;x<w;x++){
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
376 if(srca[x]){
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
377 asm volatile(
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
378 "movzbl (%0), %%ecx\n\t"
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
379 "movzbl 1(%0), %%eax\n\t"
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
380 "movzbl 2(%0), %%edx\n\t"
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
381
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
382 "imull %1, %%ecx\n\t"
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
383 "imull %1, %%eax\n\t"
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
384 "imull %1, %%edx\n\t"
2578
d363fde389b5 slow mmx & not so slow asm versions (outcommented)
michael
parents: 947
diff changeset
385
2823
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
386 "addl %2, %%ecx\n\t"
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
387 "addl %2, %%eax\n\t"
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
388 "addl %2, %%edx\n\t"
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
389
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
390 "movb %%ch, (%0)\n\t"
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
391 "movb %%ah, 1(%0)\n\t"
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
392 "movb %%dh, 2(%0)\n\t"
2578
d363fde389b5 slow mmx & not so slow asm versions (outcommented)
michael
parents: 947
diff changeset
393
2823
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
394 :
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
395 :"r" (&dstbase[4*x]),
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
396 "r" ((unsigned)srca[x]),
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
397 "r" (((unsigned)src[x])<<8)
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
398 :"%eax", "%ecx", "%edx"
2578
d363fde389b5 slow mmx & not so slow asm versions (outcommented)
michael
parents: 947
diff changeset
399 );
2823
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
400 }
004ee19ebfcf Extract parallelism from OSD stuff + MMX2 optimization.
nick
parents: 2807
diff changeset
401 }
2839
03ccbb72e2e9 Cloning 32 stuff to 24
nick
parents: 2835
diff changeset
402 #endif /* HAVE_MMX */
2798
ee2cd36a81a2 Code cleanup - emms is not required when MMX block is commented out.
nick
parents: 2578
diff changeset
403 #else /*non x86 arch*/
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
404 for(x=0;x<w;x++){
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
405 if(srca[x]){
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
406 #ifdef FAST_OSD
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
407 dstbase[4*x+0]=dstbase[4*x+1]=dstbase[4*x+2]=src[x];
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
408 #else
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
409 dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
410 dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
411 dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
947
76fd9463b9d3 FAST_OSD option to disable font outline antialiasing
arpi_esp
parents: 622
diff changeset
412 #endif
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
413 }
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
414 }
2798
ee2cd36a81a2 Code cleanup - emms is not required when MMX block is commented out.
nick
parents: 2578
diff changeset
415 #endif /* arch_x86 */
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
416 src+=srcstride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
417 srca+=srcstride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
418 dstbase+=dststride;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
419 }
2833
1b6c207c0410 Enable MMX stuff
nick
parents: 2823
diff changeset
420 #ifdef HAVE_MMX
2798
ee2cd36a81a2 Code cleanup - emms is not required when MMX block is commented out.
nick
parents: 2578
diff changeset
421 asm volatile(EMMS:::"memory");
2578
d363fde389b5 slow mmx & not so slow asm versions (outcommented)
michael
parents: 947
diff changeset
422 #endif
2833
1b6c207c0410 Enable MMX stuff
nick
parents: 2823
diff changeset
423 PROFILE_END("vo_draw_alpha_rgb32");
326
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
424 return;
f6b5c2dbc88e OSD alpha renderers moved to osd.c
arpi_esp
parents:
diff changeset
425 }