Mercurial > mplayer.hg
comparison libvo/osd.c @ 2846:ab51228bf3cf
p2/p3 bgr32 version (20% faster)
yv12 and yuy2 versions in mmx
author | michael |
---|---|
date | Sun, 11 Nov 2001 22:14:13 +0000 |
parents | 5be2017077fb |
children | 2f1e40539fe2 |
comparison
equal
deleted
inserted
replaced
2845:b512c5b40b0d | 2846:ab51228bf3cf |
---|---|
1 // Generic alpha renderers for all YUV modes and RGB depths. | 1 // Generic alpha renderers for all YUV modes and RGB depths. |
2 // These are "reference implementations", should be optimized later (MMX, etc) | 2 // These are "reference implementations", should be optimized later (MMX, etc) |
3 // Optimized by Nick and Michael | |
3 | 4 |
4 //#define FAST_OSD | 5 //#define FAST_OSD |
5 //#define FAST_OSD_TABLE | 6 //#define FAST_OSD_TABLE |
6 | 7 |
7 #include "config.h" | 8 #include "config.h" |
8 #include "osd.h" | 9 #include "osd.h" |
9 #include "../mmx_defs.h" | 10 #include "../mmx_defs.h" |
10 //#define ENABLE_PROFILE | 11 //#define ENABLE_PROFILE |
11 #include "../my_profile.h" | 12 #include "../my_profile.h" |
13 #include <inttypes.h> | |
14 | |
15 #ifndef HAVE_3DNOW | |
16 static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL; | |
17 #endif | |
12 | 18 |
13 void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ | 19 void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ |
14 int y; | 20 int y; |
15 #ifdef FAST_OSD | 21 #if defined(FAST_OSD) && !defined(HAVE_MMX) |
16 w=w>>1; | 22 w=w>>1; |
17 #endif | 23 #endif |
18 for(y=0;y<h;y++){ | 24 PROFILE_START(); |
19 register int x; | 25 for(y=0;y<h;y++){ |
26 register int x; | |
27 #ifdef HAVE_MMX | |
28 asm volatile( | |
29 PREFETCHW" %0\n\t" | |
30 PREFETCH" %1\n\t" | |
31 PREFETCH" %2\n\t" | |
32 // "pxor %%mm7, %%mm7\n\t" | |
33 "pcmpeqb %%mm5, %%mm5\n\t" // F..F | |
34 "movq %%mm5, %%mm4\n\t" | |
35 "psllw $8, %%mm5\n\t" //FF00FF00FF00 | |
36 "psrlw $8, %%mm4\n\t" //00FF00FF00FF | |
37 ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory"); | |
38 for(x=0;x<w;x+=8){ | |
39 asm volatile( | |
40 "movl %1, %%eax\n\t" | |
41 "orl 4%1, %%eax\n\t" | |
42 " jz 1f\n\t" | |
43 PREFETCHW" 32%0\n\t" | |
44 PREFETCH" 32%1\n\t" | |
45 PREFETCH" 32%2\n\t" | |
46 "movq %0, %%mm0\n\t" // dstbase | |
47 "movq %%mm0, %%mm1\n\t" | |
48 "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y | |
49 "psrlw $8, %%mm1\n\t" //0Y0Y0Y0Y | |
50 "movq %1, %%mm2\n\t" //srca HGFEDCBA | |
51 "paddb bFF, %%mm2\n\t" | |
52 "movq %%mm2, %%mm3\n\t" | |
53 "pand %%mm4, %%mm2\n\t" //0G0E0C0A | |
54 "psrlw $8, %%mm3\n\t" //0H0F0D0B | |
55 "pmullw %%mm2, %%mm0\n\t" | |
56 "pmullw %%mm3, %%mm1\n\t" | |
57 "psrlw $8, %%mm0\n\t" | |
58 "pand %%mm5, %%mm1\n\t" | |
59 "por %%mm1, %%mm0\n\t" | |
60 "paddb %2, %%mm0\n\t" | |
61 "movq %%mm0, %0\n\t" | |
62 "1:\n\t" | |
63 :: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x]) | |
64 : "%eax"); | |
65 } | |
66 #else | |
20 for(x=0;x<w;x++){ | 67 for(x=0;x<w;x++){ |
21 #ifdef FAST_OSD | 68 #ifdef FAST_OSD |
22 if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0]; | 69 if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0]; |
23 if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1]; | 70 if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1]; |
24 #else | 71 #else |
25 if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x]; | 72 if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x]; |
26 #endif | 73 #endif |
27 } | 74 } |
75 #endif | |
28 src+=srcstride; | 76 src+=srcstride; |
29 srca+=srcstride; | 77 srca+=srcstride; |
30 dstbase+=dststride; | 78 dstbase+=dststride; |
31 } | 79 } |
80 #ifdef HAVE_MMX | |
81 asm volatile(EMMS:::"memory"); | |
82 #endif | |
83 PROFILE_END("vo_draw_alpha_yv12"); | |
32 return; | 84 return; |
33 } | 85 } |
34 | 86 |
35 void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ | 87 void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ |
36 int y; | 88 int y; |
37 #ifdef FAST_OSD | 89 #if defined(FAST_OSD) && !defined(HAVE_MMX) |
38 w=w>>1; | 90 w=w>>1; |
39 #endif | 91 #endif |
40 for(y=0;y<h;y++){ | 92 PROFILE_START(); |
41 register int x; | 93 for(y=0;y<h;y++){ |
94 register int x; | |
95 #ifdef HAVE_MMX | |
96 asm volatile( | |
97 PREFETCHW" %0\n\t" | |
98 PREFETCH" %1\n\t" | |
99 PREFETCH" %2\n\t" | |
100 "pxor %%mm7, %%mm7\n\t" | |
101 "pcmpeqb %%mm5, %%mm5\n\t" // F..F | |
102 "movq %%mm5, %%mm4\n\t" | |
103 "psllw $8, %%mm5\n\t" //FF00FF00FF00 | |
104 "psrlw $8, %%mm4\n\t" //00FF00FF00FF | |
105 ::"m"(*dstbase),"m"(*srca),"m"(*src)); | |
106 for(x=0;x<w;x+=4){ | |
107 asm volatile( | |
108 "movl %1, %%eax\n\t" | |
109 "orl %%eax, %%eax\n\t" | |
110 " jz 1f\n\t" | |
111 PREFETCHW" 32%0\n\t" | |
112 PREFETCH" 32%1\n\t" | |
113 PREFETCH" 32%2\n\t" | |
114 "movq %0, %%mm0\n\t" // dstbase | |
115 "movq %%mm0, %%mm1\n\t" | |
116 "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y | |
117 "movd %%eax, %%mm2\n\t" //srca 0000DCBA | |
118 "paddb bFF, %%mm2\n\t" | |
119 "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A | |
120 "pmullw %%mm2, %%mm0\n\t" | |
121 "psrlw $8, %%mm0\n\t" | |
122 "pand %%mm5, %%mm1\n\t" //U0V0U0V0 | |
123 "movd %2, %%mm2\n\t" //src 0000DCBA | |
124 "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A | |
125 "por %%mm1, %%mm0\n\t" | |
126 "paddb %%mm2, %%mm0\n\t" | |
127 "movq %%mm0, %0\n\t" | |
128 "1:\n\t" | |
129 :: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x]) | |
130 : "%eax"); | |
131 } | |
132 #else | |
42 for(x=0;x<w;x++){ | 133 for(x=0;x<w;x++){ |
43 #ifdef FAST_OSD | 134 #ifdef FAST_OSD |
44 if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0]; | 135 if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0]; |
45 if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1]; | 136 if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1]; |
46 #else | 137 #else |
47 if(srca[x]) dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x]; | 138 if(srca[x]) dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x]; |
48 #endif | 139 #endif |
49 } | 140 } |
50 src+=srcstride; | 141 #endif |
51 srca+=srcstride; | 142 src+=srcstride; |
52 dstbase+=dststride; | 143 srca+=srcstride; |
53 } | 144 dstbase+=dststride; |
145 } | |
146 #ifdef HAVE_MMX | |
147 asm volatile(EMMS:::"memory"); | |
148 #endif | |
149 PROFILE_END("vo_draw_alpha_yuy2"); | |
54 return; | 150 return; |
55 } | 151 } |
56 | 152 |
57 #ifdef HAVE_MMX | 153 #ifdef HAVE_MMX |
58 static const unsigned long long mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL; | 154 static const unsigned long long mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL; |
165 PROFILE_START(); | 261 PROFILE_START(); |
166 for(y=0;y<h;y++){ | 262 for(y=0;y<h;y++){ |
167 register int x; | 263 register int x; |
168 #ifdef ARCH_X86 | 264 #ifdef ARCH_X86 |
169 #ifdef HAVE_MMX | 265 #ifdef HAVE_MMX |
266 #ifdef HAVE_3DNOW | |
170 asm volatile( | 267 asm volatile( |
171 PREFETCHW" %0\n\t" | 268 PREFETCHW" %0\n\t" |
172 PREFETCH" %1\n\t" | 269 PREFETCH" %1\n\t" |
173 PREFETCH" %2\n\t" | 270 PREFETCH" %2\n\t" |
174 "pxor %%mm7, %%mm7\n\t" | 271 "pxor %%mm7, %%mm7\n\t" |
201 "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB | 298 "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB |
202 "paddb %%mm2, %%mm0\n\t" | 299 "paddb %%mm2, %%mm0\n\t" |
203 "movq %%mm0, %0\n\t" | 300 "movq %%mm0, %0\n\t" |
204 :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x])); | 301 :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x])); |
205 } | 302 } |
303 #else //this is faster for intels crap | |
304 asm volatile( | |
305 PREFETCHW" %0\n\t" | |
306 PREFETCH" %1\n\t" | |
307 PREFETCH" %2\n\t" | |
308 "pxor %%mm7, %%mm7\n\t" | |
309 "pcmpeqb %%mm5, %%mm5\n\t" // F..F | |
310 "movq %%mm5, %%mm4\n\t" | |
311 "psllw $8, %%mm5\n\t" //FF00FF00FF00 | |
312 "psrlw $8, %%mm4\n\t" //00FF00FF00FF | |
313 ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory"); | |
314 for(x=0;x<w;x+=4){ | |
315 asm volatile( | |
316 "movl %1, %%eax\n\t" | |
317 "orl %%eax, %%eax\n\t" | |
318 " jz 1f\n\t" | |
319 PREFETCHW" 32%0\n\t" | |
320 PREFETCH" 32%1\n\t" | |
321 PREFETCH" 32%2\n\t" | |
322 "movq %0, %%mm0\n\t" // dstbase | |
323 "movq %%mm0, %%mm1\n\t" | |
324 "pand %%mm4, %%mm0\n\t" //0R0B0R0B | |
325 "psrlw $8, %%mm1\n\t" //0?0G0?0G | |
326 "movd %%eax, %%mm2\n\t" //srca 0000DCBA | |
327 "paddb bFF, %%mm2\n\t" | |
328 "punpcklbw %%mm2, %%mm2\n\t" //srca DDCCBBAA | |
329 "movq %%mm2, %%mm3\n\t" | |
330 "punpcklbw %%mm7, %%mm2\n\t" //srca 0B0B0A0A | |
331 "pmullw %%mm2, %%mm0\n\t" | |
332 "pmullw %%mm2, %%mm1\n\t" | |
333 "psrlw $8, %%mm0\n\t" | |
334 "pand %%mm5, %%mm1\n\t" | |
335 "por %%mm1, %%mm0\n\t" | |
336 "movd %2, %%mm2 \n\t" //src 0000DCBA | |
337 "punpcklbw %%mm2, %%mm2\n\t" //src DDCCBBAA | |
338 "movq %%mm2, %%mm6\n\t" | |
339 "punpcklbw %%mm2, %%mm2\n\t" //src BBBBAAAA | |
340 "paddb %%mm2, %%mm0\n\t" | |
341 "movq %%mm0, %0\n\t" | |
342 | |
343 "movq 8%0, %%mm0\n\t" // dstbase | |
344 "movq %%mm0, %%mm1\n\t" | |
345 "pand %%mm4, %%mm0\n\t" //0R0B0R0B | |
346 "psrlw $8, %%mm1\n\t" //0?0G0?0G | |
347 "punpckhbw %%mm7, %%mm3\n\t" //srca 0D0D0C0C | |
348 "pmullw %%mm3, %%mm0\n\t" | |
349 "pmullw %%mm3, %%mm1\n\t" | |
350 "psrlw $8, %%mm0\n\t" | |
351 "pand %%mm5, %%mm1\n\t" | |
352 "por %%mm1, %%mm0\n\t" | |
353 "punpckhbw %%mm6, %%mm6\n\t" //src DDDDCCCC | |
354 "paddb %%mm6, %%mm0\n\t" | |
355 "movq %%mm0, 8%0\n\t" | |
356 "1:\n\t" | |
357 :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]) | |
358 : "%eax"); | |
359 } | |
360 #endif | |
206 #else /* HAVE_MMX */ | 361 #else /* HAVE_MMX */ |
207 for(x=0;x<w;x++){ | 362 for(x=0;x<w;x++){ |
208 if(srca[x]){ | 363 if(srca[x]){ |
209 asm volatile( | 364 asm volatile( |
210 "movzbl (%0), %%ecx\n\t" | 365 "movzbl (%0), %%ecx\n\t" |