comparison libvo/osd.c @ 2846:ab51228bf3cf

P2/P3 BGR32 version (20% faster); YV12 and YUY2 versions in MMX
author michael
date Sun, 11 Nov 2001 22:14:13 +0000
parents 5be2017077fb
children 2f1e40539fe2
comparison
equal deleted inserted replaced
2845:b512c5b40b0d 2846:ab51228bf3cf
1 // Generic alpha renderers for all YUV modes and RGB depths. 1 // Generic alpha renderers for all YUV modes and RGB depths.
2 // These are "reference implementations", should be optimized later (MMX, etc) 2 // These are "reference implementations", should be optimized later (MMX, etc)
3 // Optimized by Nick and Michael
3 4
4 //#define FAST_OSD 5 //#define FAST_OSD
5 //#define FAST_OSD_TABLE 6 //#define FAST_OSD_TABLE
6 7
7 #include "config.h" 8 #include "config.h"
8 #include "osd.h" 9 #include "osd.h"
9 #include "../mmx_defs.h" 10 #include "../mmx_defs.h"
10 //#define ENABLE_PROFILE 11 //#define ENABLE_PROFILE
11 #include "../my_profile.h" 12 #include "../my_profile.h"
13 #include <inttypes.h>
14
15 #ifndef HAVE_3DNOW
16 static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
17 #endif
12 18
// Blend an OSD bitmap into the luma (Y) plane of a YV12 frame.
// For every pixel with nonzero alpha the scalar path computes
//   dst = ((dst * srca) >> 8) + src
// i.e. srca acts as an inverse alpha (0 = leave the destination untouched)
// and src holds the OSD luma that is added after the destination is faded.
//
//  w, h      - size of the OSD rectangle in pixels
//  src       - OSD luma bitmap, srcstride bytes per row
//  srca      - OSD alpha bitmap, srcstride bytes per row (same stride as src)
//  srcstride - row stride of both src and srca
//  dstbase   - top-left of the destination area inside the Y plane
//  dststride - row stride of the destination plane
void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
    int y;
#if defined(FAST_OSD) && !defined(HAVE_MMX)
    // FAST_OSD halves the horizontal work in the scalar path only; the MMX
    // path below always performs the full-precision blend.
    w=w>>1;
#endif
PROFILE_START();
    for(y=0;y<h;y++){
	register int x;
#ifdef HAVE_MMX
	// Per-row constant setup: mm4 = 0x00FF repeated (even-byte mask),
	// mm5 = 0xFF00 repeated (odd-byte mask). Prefetches warm the row.
	asm volatile(
		PREFETCHW" %0\n\t"
		PREFETCH" %1\n\t"
		PREFETCH" %2\n\t"
//		"pxor %%mm7, %%mm7\n\t"
		"pcmpeqb %%mm5, %%mm5\n\t" // F..F
		"movq %%mm5, %%mm4\n\t"
		"psllw $8, %%mm5\n\t" //FF00FF00FF00
		"psrlw $8, %%mm4\n\t" //00FF00FF00FF
		::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
	// Blend 8 pixels per iteration; a group whose 8 alpha bytes are all
	// zero is skipped entirely via the jz below.
	// NOTE(review): x advances in steps of 8, so up to 7 bytes past the
	// nominal width are read/written when w%8 != 0 - presumably the
	// buffers are padded; confirm with the callers.
	for(x=0;x<w;x+=8){
		asm volatile(
		"movl %1, %%eax\n\t"            // low 4 alpha bytes
		"orl 4%1, %%eax\n\t"            // OR in high 4 alpha bytes
		" jz 1f\n\t"                    // all 8 transparent -> skip group
		PREFETCHW" 32%0\n\t"
		PREFETCH" 32%1\n\t"
		PREFETCH" 32%2\n\t"
		"movq %0, %%mm0\n\t" // dstbase
		"movq %%mm0, %%mm1\n\t"
		"pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y (even dst bytes)
		"psrlw $8, %%mm1\n\t" //0Y0Y0Y0Y (odd dst bytes)
		"movq %1, %%mm2\n\t" //srca HGFEDCBA
		"paddb bFF, %%mm2\n\t"          // byte-wise +255 == alpha-1 mod 256
		"movq %%mm2, %%mm3\n\t"
		"pand %%mm4, %%mm2\n\t" //0G0E0C0A
		"psrlw $8, %%mm3\n\t" //0H0F0D0B
		"pmullw %%mm2, %%mm0\n\t"       // fade even dst bytes
		"pmullw %%mm3, %%mm1\n\t"       // fade odd dst bytes
		"psrlw $8, %%mm0\n\t"
		"pand %%mm5, %%mm1\n\t"
		"por %%mm1, %%mm0\n\t"          // recombine faded dst bytes
		"paddb %2, %%mm0\n\t"           // + src (presumably 0 where srca==0
		                                //   within a partially-opaque group)
		"movq %%mm0, %0\n\t"
		"1:\n\t"
		:: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
		: "%eax");
	}
	// NOTE(review): bFF is declared only #ifndef HAVE_3DNOW at the top of
	// this file, yet this plain-HAVE_MMX path references it by name -
	// verify that MMX+3DNow builds still link, and that the absolute
	// symbol reference is acceptable for PIC builds.
#else
	for(x=0;x<w;x++){
#ifdef FAST_OSD
	    // Approximate blend: copy the glyph wherever it is at all visible.
	    if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
	    if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1];
#else
	    if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
#endif
	}
#endif
	src+=srcstride;
	srca+=srcstride;
	dstbase+=dststride;
    }
#ifdef HAVE_MMX
	// Leave MMX state so subsequent x87 FPU code works again.
	asm volatile(EMMS:::"memory");
#endif
	PROFILE_END("vo_draw_alpha_yv12");
    return;
}
34 86
// Blend an OSD bitmap into a packed YUY2 (Y U Y V ...) surface.
// Only the luma bytes (every second byte, dstbase[2*x]) are modified; the
// interleaved chroma bytes pass through unchanged. Per visible pixel the
// scalar path computes: dst_y = ((dst_y * srca) >> 8) + src.
//
//  w, h      - size of the OSD rectangle in pixels
//  src       - OSD luma bitmap, srcstride bytes per row
//  srca      - OSD alpha bitmap (0 = transparent), same stride as src
//  srcstride - row stride of src and srca
//  dstbase   - top-left of the destination area in the packed YUY2 buffer
//  dststride - row stride of the destination buffer in bytes
void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
    int y;
#if defined(FAST_OSD) && !defined(HAVE_MMX)
    // FAST_OSD halves the horizontal work in the scalar path only.
    w=w>>1;
#endif
PROFILE_START();
    for(y=0;y<h;y++){
	register int x;
#ifdef HAVE_MMX
	// Per-row setup: mm7 = 0 (for byte->word unpacking),
	// mm4 = 0x00FF repeated (luma byte mask),
	// mm5 = 0xFF00 repeated (chroma byte mask).
	asm volatile(
		PREFETCHW" %0\n\t"
		PREFETCH" %1\n\t"
		PREFETCH" %2\n\t"
		"pxor %%mm7, %%mm7\n\t"
		"pcmpeqb %%mm5, %%mm5\n\t" // F..F
		"movq %%mm5, %%mm4\n\t"
		"psllw $8, %%mm5\n\t" //FF00FF00FF00
		"psrlw $8, %%mm4\n\t" //00FF00FF00FF
		::"m"(*dstbase),"m"(*srca),"m"(*src));
	// 4 pixels (8 destination bytes) per iteration; a group whose 4 alpha
	// bytes are all zero is skipped.
	// NOTE(review): x advances in steps of 4, so the tail past w is
	// processed when w%4 != 0 - presumably buffers are padded; confirm.
	for(x=0;x<w;x+=4){
		asm volatile(
		"movl %1, %%eax\n\t"            // the group's 4 alpha bytes
		"orl %%eax, %%eax\n\t"          // set ZF
		" jz 1f\n\t"                    // fully transparent -> skip
		PREFETCHW" 32%0\n\t"
		PREFETCH" 32%1\n\t"
		PREFETCH" 32%2\n\t"
		"movq %0, %%mm0\n\t" // dstbase
		"movq %%mm0, %%mm1\n\t"
		"pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y (isolated luma)
		"movd %%eax, %%mm2\n\t" //srca 0000DCBA
		"paddb bFF, %%mm2\n\t"          // byte-wise +255 == alpha-1 mod 256
		"punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A
		"pmullw %%mm2, %%mm0\n\t"       // fade luma
		"psrlw $8, %%mm0\n\t"
		"pand %%mm5, %%mm1\n\t" //U0V0U0V0 (chroma kept as-is)
		"movd %2, %%mm2\n\t" //src 0000DCBA
		"punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A
		"por %%mm1, %%mm0\n\t"          // merge untouched chroma back in
		"paddb %%mm2, %%mm0\n\t"        // + OSD luma (presumably 0 where
		                                //   srca==0 within the group)
		"movq %%mm0, %0\n\t"
		"1:\n\t"
		:: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
		: "%eax");
	}
	// NOTE(review): bFF is declared only #ifndef HAVE_3DNOW at the top of
	// this file but referenced here under plain HAVE_MMX; verify that
	// MMX+3DNow builds still link.
#else
	for(x=0;x<w;x++){
#ifdef FAST_OSD
	    // Approximate blend: copy the glyph wherever it is at all visible.
	    if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
	    if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1];
#else
	    if(srca[x]) dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
#endif
	}
#endif
	src+=srcstride;
	srca+=srcstride;
	dstbase+=dststride;
    }
#ifdef HAVE_MMX
	// Leave MMX state so subsequent x87 FPU code works again.
	asm volatile(EMMS:::"memory");
#endif
	PROFILE_END("vo_draw_alpha_yuy2");
    return;
}
56 152
57 #ifdef HAVE_MMX 153 #ifdef HAVE_MMX
58 static const unsigned long long mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL; 154 static const unsigned long long mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
165 PROFILE_START(); 261 PROFILE_START();
166 for(y=0;y<h;y++){ 262 for(y=0;y<h;y++){
167 register int x; 263 register int x;
168 #ifdef ARCH_X86 264 #ifdef ARCH_X86
169 #ifdef HAVE_MMX 265 #ifdef HAVE_MMX
266 #ifdef HAVE_3DNOW
170 asm volatile( 267 asm volatile(
171 PREFETCHW" %0\n\t" 268 PREFETCHW" %0\n\t"
172 PREFETCH" %1\n\t" 269 PREFETCH" %1\n\t"
173 PREFETCH" %2\n\t" 270 PREFETCH" %2\n\t"
174 "pxor %%mm7, %%mm7\n\t" 271 "pxor %%mm7, %%mm7\n\t"
201 "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB 298 "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
202 "paddb %%mm2, %%mm0\n\t" 299 "paddb %%mm2, %%mm0\n\t"
203 "movq %%mm0, %0\n\t" 300 "movq %%mm0, %0\n\t"
204 :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x])); 301 :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
205 } 302 }
303 #else //this is faster for intels crap
304 asm volatile(
305 PREFETCHW" %0\n\t"
306 PREFETCH" %1\n\t"
307 PREFETCH" %2\n\t"
308 "pxor %%mm7, %%mm7\n\t"
309 "pcmpeqb %%mm5, %%mm5\n\t" // F..F
310 "movq %%mm5, %%mm4\n\t"
311 "psllw $8, %%mm5\n\t" //FF00FF00FF00
312 "psrlw $8, %%mm4\n\t" //00FF00FF00FF
313 ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
314 for(x=0;x<w;x+=4){
315 asm volatile(
316 "movl %1, %%eax\n\t"
317 "orl %%eax, %%eax\n\t"
318 " jz 1f\n\t"
319 PREFETCHW" 32%0\n\t"
320 PREFETCH" 32%1\n\t"
321 PREFETCH" 32%2\n\t"
322 "movq %0, %%mm0\n\t" // dstbase
323 "movq %%mm0, %%mm1\n\t"
324 "pand %%mm4, %%mm0\n\t" //0R0B0R0B
325 "psrlw $8, %%mm1\n\t" //0?0G0?0G
326 "movd %%eax, %%mm2\n\t" //srca 0000DCBA
327 "paddb bFF, %%mm2\n\t"
328 "punpcklbw %%mm2, %%mm2\n\t" //srca DDCCBBAA
329 "movq %%mm2, %%mm3\n\t"
330 "punpcklbw %%mm7, %%mm2\n\t" //srca 0B0B0A0A
331 "pmullw %%mm2, %%mm0\n\t"
332 "pmullw %%mm2, %%mm1\n\t"
333 "psrlw $8, %%mm0\n\t"
334 "pand %%mm5, %%mm1\n\t"
335 "por %%mm1, %%mm0\n\t"
336 "movd %2, %%mm2 \n\t" //src 0000DCBA
337 "punpcklbw %%mm2, %%mm2\n\t" //src DDCCBBAA
338 "movq %%mm2, %%mm6\n\t"
339 "punpcklbw %%mm2, %%mm2\n\t" //src BBBBAAAA
340 "paddb %%mm2, %%mm0\n\t"
341 "movq %%mm0, %0\n\t"
342
343 "movq 8%0, %%mm0\n\t" // dstbase
344 "movq %%mm0, %%mm1\n\t"
345 "pand %%mm4, %%mm0\n\t" //0R0B0R0B
346 "psrlw $8, %%mm1\n\t" //0?0G0?0G
347 "punpckhbw %%mm7, %%mm3\n\t" //srca 0D0D0C0C
348 "pmullw %%mm3, %%mm0\n\t"
349 "pmullw %%mm3, %%mm1\n\t"
350 "psrlw $8, %%mm0\n\t"
351 "pand %%mm5, %%mm1\n\t"
352 "por %%mm1, %%mm0\n\t"
353 "punpckhbw %%mm6, %%mm6\n\t" //src DDDDCCCC
354 "paddb %%mm6, %%mm0\n\t"
355 "movq %%mm0, 8%0\n\t"
356 "1:\n\t"
357 :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x])
358 : "%eax");
359 }
360 #endif
206 #else /* HAVE_MMX */ 361 #else /* HAVE_MMX */
207 for(x=0;x<w;x++){ 362 for(x=0;x<w;x++){
208 if(srca[x]){ 363 if(srca[x]){
209 asm volatile( 364 asm volatile(
210 "movzbl (%0), %%ecx\n\t" 365 "movzbl (%0), %%ecx\n\t"