Mercurial > mplayer.hg
comparison libswscale/rgb2rgb_template.c @ 18861:8579acff875e
Move postproc ---> libswscale
author | lucabe |
---|---|
date | Fri, 30 Jun 2006 12:00:31 +0000 |
parents | |
children | 6334c14b38eb |
comparison
equal
deleted
inserted
replaced
18860:ef741a3e90f5 | 18861:8579acff875e |
---|---|
1 /* | |
2 * | |
3 * rgb2rgb.c, Software RGB to RGB convertor | |
4 * pluralize by Software PAL8 to RGB convertor | |
5 * Software YUV to YUV convertor | |
6 * Software YUV to RGB convertor | |
7 * Written by Nick Kurshev. | |
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL) | |
9 * lot of big-endian byteorder fixes by Alex Beregszaszi | |
10 */ | |
11 | |
12 #include <stddef.h> | |
13 #include <inttypes.h> /* for __WORDSIZE */ | |
14 | |
15 #include "asmalign.h" | |
16 | |
17 #ifndef __WORDSIZE | |
18 // #warning You have misconfigured system and probably will lose performance! | |
19 #define __WORDSIZE MP_WORDSIZE | |
20 #endif | |
21 | |
22 #undef PREFETCH | |
23 #undef MOVNTQ | |
24 #undef EMMS | |
25 #undef SFENCE | |
26 #undef MMREG_SIZE | |
27 #undef PREFETCHW | |
28 #undef PAVGB | |
29 | |
30 #ifdef HAVE_SSE2 | |
31 #define MMREG_SIZE 16 | |
32 #else | |
33 #define MMREG_SIZE 8 | |
34 #endif | |
35 | |
36 #ifdef HAVE_3DNOW | |
37 #define PREFETCH "prefetch" | |
38 #define PREFETCHW "prefetchw" | |
39 #define PAVGB "pavgusb" | |
40 #elif defined ( HAVE_MMX2 ) | |
41 #define PREFETCH "prefetchnta" | |
42 #define PREFETCHW "prefetcht0" | |
43 #define PAVGB "pavgb" | |
44 #else | |
45 #ifdef __APPLE__ | |
46 #define PREFETCH "#" | |
47 #define PREFETCHW "#" | |
48 #else | |
49 #define PREFETCH "/nop" | |
50 #define PREFETCHW "/nop" | |
51 #endif | |
52 #endif | |
53 | |
54 #ifdef HAVE_3DNOW | |
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
56 #define EMMS "femms" | |
57 #else | |
58 #define EMMS "emms" | |
59 #endif | |
60 | |
61 #ifdef HAVE_MMX2 | |
62 #define MOVNTQ "movntq" | |
63 #define SFENCE "sfence" | |
64 #else | |
65 #define MOVNTQ "movq" | |
66 #ifdef __APPLE__ | |
67 #define SFENCE "#" | |
68 #else | |
69 #define SFENCE "/nop" | |
70 #endif | |
71 #endif | |
72 | |
/* Convert packed 24bpp RGB to 32bpp by inserting one zero filler byte per
 * pixel.  src_size is the source length in bytes; dst must be able to hold
 * src_size/3*4 bytes. */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	/* each MMX iteration consumes 24 source bytes and emits 32 dest bytes */
	mm_end = end - 23;
	__asm __volatile("movq	%0, %%mm7"::"m"(mask32):"memory");
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			"movd	%1, %%mm0\n\t"
			"punpckldq	3%1, %%mm0\n\t"
			"movd	6%1, %%mm1\n\t"
			"punpckldq	9%1, %%mm1\n\t"
			"movd	12%1, %%mm2\n\t"
			"punpckldq	15%1, %%mm2\n\t"
			"movd	18%1, %%mm3\n\t"
			"punpckldq	21%1, %%mm3\n\t"
			"pand	%%mm7, %%mm0\n\t"
			"pand	%%mm7, %%mm1\n\t"
			"pand	%%mm7, %%mm2\n\t"
			"pand	%%mm7, %%mm3\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm1, 8%0\n\t"
			MOVNTQ"	%%mm2, 16%0\n\t"
			MOVNTQ"	%%mm3, 24%0"
			:"=m"(*dest)
			:"m"(*s)
			:"memory");
		dest += 32;
		s += 24;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole job when MMX is unavailable) */
	while(s < end)
	{
#ifdef WORDS_BIGENDIAN
		/* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
		*dest++ = 0;
		*dest++ = s[2];
		*dest++ = s[1];
		*dest++ = s[0];
		s+=3;
#else
		/* copy 3 bytes, append zero filler */
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = 0;
#endif
	}
}
132 | |
/* Convert 32bpp to packed 24bpp by dropping the filler byte of every pixel.
 * src_size is the source length in bytes; dst receives src_size/4*3 bytes. */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	/* each MMX iteration consumes 32 source bytes and emits 24 dest bytes;
	   the mask24* constants select/recombine the 3 payload bytes per pixel */
	mm_end = end - 31;
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			"movq	%1, %%mm0\n\t"
			"movq	8%1, %%mm1\n\t"
			"movq	16%1, %%mm4\n\t"
			"movq	24%1, %%mm5\n\t"
			"movq	%%mm0, %%mm2\n\t"
			"movq	%%mm1, %%mm3\n\t"
			"movq	%%mm4, %%mm6\n\t"
			"movq	%%mm5, %%mm7\n\t"
			"psrlq	$8, %%mm2\n\t"
			"psrlq	$8, %%mm3\n\t"
			"psrlq	$8, %%mm6\n\t"
			"psrlq	$8, %%mm7\n\t"
			"pand	%2, %%mm0\n\t"
			"pand	%2, %%mm1\n\t"
			"pand	%2, %%mm4\n\t"
			"pand	%2, %%mm5\n\t"
			"pand	%3, %%mm2\n\t"
			"pand	%3, %%mm3\n\t"
			"pand	%3, %%mm6\n\t"
			"pand	%3, %%mm7\n\t"
			"por	%%mm2, %%mm0\n\t"
			"por	%%mm3, %%mm1\n\t"
			"por	%%mm6, %%mm4\n\t"
			"por	%%mm7, %%mm5\n\t"

			"movq	%%mm1, %%mm2\n\t"
			"movq	%%mm4, %%mm3\n\t"
			"psllq	$48, %%mm2\n\t"
			"psllq	$32, %%mm3\n\t"
			"pand	%4, %%mm2\n\t"
			"pand	%5, %%mm3\n\t"
			"por	%%mm2, %%mm0\n\t"
			"psrlq	$16, %%mm1\n\t"
			"psrlq	$32, %%mm4\n\t"
			"psllq	$16, %%mm5\n\t"
			"por	%%mm3, %%mm1\n\t"
			"pand	%6, %%mm5\n\t"
			"por	%%mm5, %%mm4\n\t"

			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm1, 8%0\n\t"
			MOVNTQ"	%%mm4, 16%0"
			:"=m"(*dest)
			:"m"(*s),"m"(mask24l),
			 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
			:"memory");
		dest += 24;
		s += 32;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole job when MMX is unavailable) */
	while(s < end)
	{
#ifdef WORDS_BIGENDIAN
		/* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
		s++;
		dest[2] = *s++;
		dest[1] = *s++;
		dest[0] = *s++;
		dest += 3;
#else
		/* copy 3 bytes, skip the filler byte */
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = *s++;
		s++;
#endif
	}
}
218 | |
219 /* | |
220 Original by Strepto/Astral | |
221 ported to gcc & bugfixed : A'rpi | |
222 MMX2, 3DNOW optimization by Nick Kurshev | |
223 32bit c version, and and&add trick by Michael Niedermayer | |
224 */ | |
/* Convert RGB15 (xRRRRRGGGGGBBBBB) to RGB16 (RRRRRGGGGGGBBBBB) in place-
 * compatible fashion: red and green fields are shifted up one bit, blue is
 * kept.  src_size is in bytes.  The "and & add" trick below performs the
 * one-bit left shift of the upper fields without disturbing blue. */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
{
	register const uint8_t* s=src;
	register uint8_t* d=dst;
	register const uint8_t *end;
	const uint8_t *mm_end;	/* also reused below for the 32bit scalar loop */
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s));
	__asm __volatile("movq	%0, %%mm4"::"m"(mask15s));
	mm_end = end - 15;	/* 16 bytes (8 pixels) per MMX iteration */
	while(s<mm_end)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			"movq	%1, %%mm0\n\t"
			"movq	8%1, %%mm2\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm2, %%mm3\n\t"
			"pand	%%mm4, %%mm0\n\t"
			"pand	%%mm4, %%mm2\n\t"
			"paddw	%%mm1, %%mm0\n\t"
			"paddw	%%mm3, %%mm2\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm2, 8%0"
			:"=m"(*d)
			:"m"(*s)
			);
		d+=16;
		s+=16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* 32bit scalar version: two pixels at a time */
	mm_end = end - 3;
	while(s < mm_end)
	{
		register unsigned x= *((uint32_t *)s);
		*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
		d+=4;
		s+=4;
	}
	/* at most one 16bit pixel may remain */
	if(s < end)
	{
		register unsigned short x= *((uint16_t *)s);
		*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
	}
}
273 | |
/* Convert RGB16 (RRRRRGGGGGGBBBBB) to RGB15 (xRRRRRGGGGGBBBBB): red and
 * green are shifted down one bit (green loses its least significant bit),
 * blue is kept.  src_size is in bytes. */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
{
	register const uint8_t* s=src;
	register uint8_t* d=dst;
	register const uint8_t *end;
	const uint8_t *mm_end;	/* also reused below for the 32bit scalar loop */
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s));
	__asm __volatile("movq	%0, %%mm7"::"m"(mask15rg));
	__asm __volatile("movq	%0, %%mm6"::"m"(mask15b));
	mm_end = end - 15;	/* 16 bytes (8 pixels) per MMX iteration */
	while(s<mm_end)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			"movq	%1, %%mm0\n\t"
			"movq	8%1, %%mm2\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm2, %%mm3\n\t"
			"psrlq	$1, %%mm0\n\t"
			"psrlq	$1, %%mm2\n\t"
			"pand	%%mm7, %%mm0\n\t"
			"pand	%%mm7, %%mm2\n\t"
			"pand	%%mm6, %%mm1\n\t"
			"pand	%%mm6, %%mm3\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm3, %%mm2\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm2, 8%0"
			:"=m"(*d)
			:"m"(*s)
			);
		d+=16;
		s+=16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* 32bit scalar version: two pixels at a time */
	mm_end = end - 3;
	while(s < mm_end)
	{
		register uint32_t x= *((uint32_t *)s);
		*((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
		s+=4;
		d+=4;
	}
	/* at most one 16bit pixel may remain */
	if(s < end)
	{
		register uint16_t x= *((uint16_t *)s);
		*((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
		s+=2;
		d+=2;
	}
}
329 | |
330 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size) | |
331 { | |
332 const uint8_t *s = src; | |
333 const uint8_t *end; | |
334 #ifdef HAVE_MMX | |
335 const uint8_t *mm_end; | |
336 #endif | |
337 uint16_t *d = (uint16_t *)dst; | |
338 end = s + src_size; | |
339 #ifdef HAVE_MMX | |
340 mm_end = end - 15; | |
341 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) | |
342 asm volatile( | |
343 "movq %3, %%mm5 \n\t" | |
344 "movq %4, %%mm6 \n\t" | |
345 "movq %5, %%mm7 \n\t" | |
346 ASMALIGN16 | |
347 "1: \n\t" | |
348 PREFETCH" 32(%1) \n\t" | |
349 "movd (%1), %%mm0 \n\t" | |
350 "movd 4(%1), %%mm3 \n\t" | |
351 "punpckldq 8(%1), %%mm0 \n\t" | |
352 "punpckldq 12(%1), %%mm3 \n\t" | |
353 "movq %%mm0, %%mm1 \n\t" | |
354 "movq %%mm3, %%mm4 \n\t" | |
355 "pand %%mm6, %%mm0 \n\t" | |
356 "pand %%mm6, %%mm3 \n\t" | |
357 "pmaddwd %%mm7, %%mm0 \n\t" | |
358 "pmaddwd %%mm7, %%mm3 \n\t" | |
359 "pand %%mm5, %%mm1 \n\t" | |
360 "pand %%mm5, %%mm4 \n\t" | |
361 "por %%mm1, %%mm0 \n\t" | |
362 "por %%mm4, %%mm3 \n\t" | |
363 "psrld $5, %%mm0 \n\t" | |
364 "pslld $11, %%mm3 \n\t" | |
365 "por %%mm3, %%mm0 \n\t" | |
366 MOVNTQ" %%mm0, (%0) \n\t" | |
367 "add $16, %1 \n\t" | |
368 "add $8, %0 \n\t" | |
369 "cmp %2, %1 \n\t" | |
370 " jb 1b \n\t" | |
371 : "+r" (d), "+r"(s) | |
372 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) | |
373 ); | |
374 #else | |
375 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
376 __asm __volatile( | |
377 "movq %0, %%mm7\n\t" | |
378 "movq %1, %%mm6\n\t" | |
379 ::"m"(red_16mask),"m"(green_16mask)); | |
380 while(s < mm_end) | |
381 { | |
382 __asm __volatile( | |
383 PREFETCH" 32%1\n\t" | |
384 "movd %1, %%mm0\n\t" | |
385 "movd 4%1, %%mm3\n\t" | |
386 "punpckldq 8%1, %%mm0\n\t" | |
387 "punpckldq 12%1, %%mm3\n\t" | |
388 "movq %%mm0, %%mm1\n\t" | |
389 "movq %%mm0, %%mm2\n\t" | |
390 "movq %%mm3, %%mm4\n\t" | |
391 "movq %%mm3, %%mm5\n\t" | |
392 "psrlq $3, %%mm0\n\t" | |
393 "psrlq $3, %%mm3\n\t" | |
394 "pand %2, %%mm0\n\t" | |
395 "pand %2, %%mm3\n\t" | |
396 "psrlq $5, %%mm1\n\t" | |
397 "psrlq $5, %%mm4\n\t" | |
398 "pand %%mm6, %%mm1\n\t" | |
399 "pand %%mm6, %%mm4\n\t" | |
400 "psrlq $8, %%mm2\n\t" | |
401 "psrlq $8, %%mm5\n\t" | |
402 "pand %%mm7, %%mm2\n\t" | |
403 "pand %%mm7, %%mm5\n\t" | |
404 "por %%mm1, %%mm0\n\t" | |
405 "por %%mm4, %%mm3\n\t" | |
406 "por %%mm2, %%mm0\n\t" | |
407 "por %%mm5, %%mm3\n\t" | |
408 "psllq $16, %%mm3\n\t" | |
409 "por %%mm3, %%mm0\n\t" | |
410 MOVNTQ" %%mm0, %0\n\t" | |
411 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | |
412 d += 4; | |
413 s += 16; | |
414 } | |
415 #endif | |
416 __asm __volatile(SFENCE:::"memory"); | |
417 __asm __volatile(EMMS:::"memory"); | |
418 #endif | |
419 while(s < end) | |
420 { | |
421 register int rgb = *(uint32_t*)s; s += 4; | |
422 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); | |
423 } | |
424 } | |
425 | |
/* Convert 32bpp RGB to BGR565 (red and blue swapped relative to rgb32to16).
 * src_size is in bytes; dst receives src_size/4 16bit pixels. */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	mm_end = end - 15;	/* 16 source bytes (4 pixels) per iteration */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd	%1, %%mm0\n\t"
			"movd	4%1, %%mm3\n\t"
			"punpckldq 8%1, %%mm0\n\t"
			"punpckldq 12%1, %%mm3\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm0, %%mm2\n\t"
			"movq	%%mm3, %%mm4\n\t"
			"movq	%%mm3, %%mm5\n\t"
			"psllq	$8, %%mm0\n\t"
			"psllq	$8, %%mm3\n\t"
			"pand	%%mm7, %%mm0\n\t"
			"pand	%%mm7, %%mm3\n\t"
			"psrlq	$5, %%mm1\n\t"
			"psrlq	$5, %%mm4\n\t"
			"pand	%%mm6, %%mm1\n\t"
			"pand	%%mm6, %%mm4\n\t"
			"psrlq	$19, %%mm2\n\t"
			"psrlq	$19, %%mm5\n\t"
			"pand	%2, %%mm2\n\t"
			"pand	%2, %%mm5\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm4, %%mm3\n\t"
			"por	%%mm2, %%mm0\n\t"
			"por	%%mm5, %%mm3\n\t"
			"psllq	$16, %%mm3\n\t"
			"por	%%mm3, %%mm0\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole job when MMX is unavailable) */
	while(s < end)
	{
		register int rgb = *(uint32_t*)s; s += 4;
		*d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
	}
}
486 | |
487 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size) | |
488 { | |
489 const uint8_t *s = src; | |
490 const uint8_t *end; | |
491 #ifdef HAVE_MMX | |
492 const uint8_t *mm_end; | |
493 #endif | |
494 uint16_t *d = (uint16_t *)dst; | |
495 end = s + src_size; | |
496 #ifdef HAVE_MMX | |
497 mm_end = end - 15; | |
498 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) | |
499 asm volatile( | |
500 "movq %3, %%mm5 \n\t" | |
501 "movq %4, %%mm6 \n\t" | |
502 "movq %5, %%mm7 \n\t" | |
503 ASMALIGN16 | |
504 "1: \n\t" | |
505 PREFETCH" 32(%1) \n\t" | |
506 "movd (%1), %%mm0 \n\t" | |
507 "movd 4(%1), %%mm3 \n\t" | |
508 "punpckldq 8(%1), %%mm0 \n\t" | |
509 "punpckldq 12(%1), %%mm3 \n\t" | |
510 "movq %%mm0, %%mm1 \n\t" | |
511 "movq %%mm3, %%mm4 \n\t" | |
512 "pand %%mm6, %%mm0 \n\t" | |
513 "pand %%mm6, %%mm3 \n\t" | |
514 "pmaddwd %%mm7, %%mm0 \n\t" | |
515 "pmaddwd %%mm7, %%mm3 \n\t" | |
516 "pand %%mm5, %%mm1 \n\t" | |
517 "pand %%mm5, %%mm4 \n\t" | |
518 "por %%mm1, %%mm0 \n\t" | |
519 "por %%mm4, %%mm3 \n\t" | |
520 "psrld $6, %%mm0 \n\t" | |
521 "pslld $10, %%mm3 \n\t" | |
522 "por %%mm3, %%mm0 \n\t" | |
523 MOVNTQ" %%mm0, (%0) \n\t" | |
524 "add $16, %1 \n\t" | |
525 "add $8, %0 \n\t" | |
526 "cmp %2, %1 \n\t" | |
527 " jb 1b \n\t" | |
528 : "+r" (d), "+r"(s) | |
529 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) | |
530 ); | |
531 #else | |
532 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
533 __asm __volatile( | |
534 "movq %0, %%mm7\n\t" | |
535 "movq %1, %%mm6\n\t" | |
536 ::"m"(red_15mask),"m"(green_15mask)); | |
537 while(s < mm_end) | |
538 { | |
539 __asm __volatile( | |
540 PREFETCH" 32%1\n\t" | |
541 "movd %1, %%mm0\n\t" | |
542 "movd 4%1, %%mm3\n\t" | |
543 "punpckldq 8%1, %%mm0\n\t" | |
544 "punpckldq 12%1, %%mm3\n\t" | |
545 "movq %%mm0, %%mm1\n\t" | |
546 "movq %%mm0, %%mm2\n\t" | |
547 "movq %%mm3, %%mm4\n\t" | |
548 "movq %%mm3, %%mm5\n\t" | |
549 "psrlq $3, %%mm0\n\t" | |
550 "psrlq $3, %%mm3\n\t" | |
551 "pand %2, %%mm0\n\t" | |
552 "pand %2, %%mm3\n\t" | |
553 "psrlq $6, %%mm1\n\t" | |
554 "psrlq $6, %%mm4\n\t" | |
555 "pand %%mm6, %%mm1\n\t" | |
556 "pand %%mm6, %%mm4\n\t" | |
557 "psrlq $9, %%mm2\n\t" | |
558 "psrlq $9, %%mm5\n\t" | |
559 "pand %%mm7, %%mm2\n\t" | |
560 "pand %%mm7, %%mm5\n\t" | |
561 "por %%mm1, %%mm0\n\t" | |
562 "por %%mm4, %%mm3\n\t" | |
563 "por %%mm2, %%mm0\n\t" | |
564 "por %%mm5, %%mm3\n\t" | |
565 "psllq $16, %%mm3\n\t" | |
566 "por %%mm3, %%mm0\n\t" | |
567 MOVNTQ" %%mm0, %0\n\t" | |
568 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | |
569 d += 4; | |
570 s += 16; | |
571 } | |
572 #endif | |
573 __asm __volatile(SFENCE:::"memory"); | |
574 __asm __volatile(EMMS:::"memory"); | |
575 #endif | |
576 while(s < end) | |
577 { | |
578 register int rgb = *(uint32_t*)s; s += 4; | |
579 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); | |
580 } | |
581 } | |
582 | |
/* Convert 32bpp RGB to BGR555 (red and blue swapped relative to rgb32to15).
 * src_size is in bytes; dst receives src_size/4 16bit pixels. */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	mm_end = end - 15;	/* 16 source bytes (4 pixels) per iteration */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd	%1, %%mm0\n\t"
			"movd	4%1, %%mm3\n\t"
			"punpckldq 8%1, %%mm0\n\t"
			"punpckldq 12%1, %%mm3\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm0, %%mm2\n\t"
			"movq	%%mm3, %%mm4\n\t"
			"movq	%%mm3, %%mm5\n\t"
			"psllq	$7, %%mm0\n\t"
			"psllq	$7, %%mm3\n\t"
			"pand	%%mm7, %%mm0\n\t"
			"pand	%%mm7, %%mm3\n\t"
			"psrlq	$6, %%mm1\n\t"
			"psrlq	$6, %%mm4\n\t"
			"pand	%%mm6, %%mm1\n\t"
			"pand	%%mm6, %%mm4\n\t"
			"psrlq	$19, %%mm2\n\t"
			"psrlq	$19, %%mm5\n\t"
			"pand	%2, %%mm2\n\t"
			"pand	%2, %%mm5\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm4, %%mm3\n\t"
			"por	%%mm2, %%mm0\n\t"
			"por	%%mm5, %%mm3\n\t"
			"psllq	$16, %%mm3\n\t"
			"por	%%mm3, %%mm0\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole job when MMX is unavailable) */
	while(s < end)
	{
		register int rgb = *(uint32_t*)s; s += 4;
		*d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
	}
}
643 | |
/* Convert packed 24bpp (B,G,R byte order per scalar loop below) to RGB565.
 * src_size is in bytes; dst receives src_size/3 16bit pixels. */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	mm_end = end - 11;	/* 12 source bytes (4 pixels) per iteration */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd	%1, %%mm0\n\t"
			"movd	3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm0, %%mm2\n\t"
			"movq	%%mm3, %%mm4\n\t"
			"movq	%%mm3, %%mm5\n\t"
			"psrlq	$3, %%mm0\n\t"
			"psrlq	$3, %%mm3\n\t"
			"pand	%2, %%mm0\n\t"
			"pand	%2, %%mm3\n\t"
			"psrlq	$5, %%mm1\n\t"
			"psrlq	$5, %%mm4\n\t"
			"pand	%%mm6, %%mm1\n\t"
			"pand	%%mm6, %%mm4\n\t"
			"psrlq	$8, %%mm2\n\t"
			"psrlq	$8, %%mm5\n\t"
			"pand	%%mm7, %%mm2\n\t"
			"pand	%%mm7, %%mm5\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm4, %%mm3\n\t"
			"por	%%mm2, %%mm0\n\t"
			"por	%%mm5, %%mm3\n\t"
			"psllq	$16, %%mm3\n\t"
			"por	%%mm3, %%mm0\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole job when MMX is unavailable) */
	while(s < end)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	}
}
706 | |
/* Convert packed 24bpp (R,G,B byte order per scalar loop below) to RGB565
 * with swapped channel order relative to rgb24to16.
 * src_size is in bytes; dst receives src_size/3 16bit pixels. */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	/* NOTE(review): bound is end-15 although only 12 bytes are consumed per
	 * iteration; the loads read 12 bytes so this is a conservative margin */
	mm_end = end - 15;
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd	%1, %%mm0\n\t"
			"movd	3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm0, %%mm2\n\t"
			"movq	%%mm3, %%mm4\n\t"
			"movq	%%mm3, %%mm5\n\t"
			"psllq	$8, %%mm0\n\t"
			"psllq	$8, %%mm3\n\t"
			"pand	%%mm7, %%mm0\n\t"
			"pand	%%mm7, %%mm3\n\t"
			"psrlq	$5, %%mm1\n\t"
			"psrlq	$5, %%mm4\n\t"
			"pand	%%mm6, %%mm1\n\t"
			"pand	%%mm6, %%mm4\n\t"
			"psrlq	$19, %%mm2\n\t"
			"psrlq	$19, %%mm5\n\t"
			"pand	%2, %%mm2\n\t"
			"pand	%2, %%mm5\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm4, %%mm3\n\t"
			"por	%%mm2, %%mm0\n\t"
			"por	%%mm5, %%mm3\n\t"
			"psllq	$16, %%mm3\n\t"
			"por	%%mm3, %%mm0\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole job when MMX is unavailable) */
	while(s < end)
	{
		const int r= *s++;
		const int g= *s++;
		const int b= *s++;
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	}
}
769 | |
/* Convert packed 24bpp (B,G,R byte order per scalar loop below) to RGB555.
 * src_size is in bytes; dst receives src_size/3 16bit pixels. */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	mm_end = end - 11;	/* 12 source bytes (4 pixels) per iteration */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd	%1, %%mm0\n\t"
			"movd	3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm0, %%mm2\n\t"
			"movq	%%mm3, %%mm4\n\t"
			"movq	%%mm3, %%mm5\n\t"
			"psrlq	$3, %%mm0\n\t"
			"psrlq	$3, %%mm3\n\t"
			"pand	%2, %%mm0\n\t"
			"pand	%2, %%mm3\n\t"
			"psrlq	$6, %%mm1\n\t"
			"psrlq	$6, %%mm4\n\t"
			"pand	%%mm6, %%mm1\n\t"
			"pand	%%mm6, %%mm4\n\t"
			"psrlq	$9, %%mm2\n\t"
			"psrlq	$9, %%mm5\n\t"
			"pand	%%mm7, %%mm2\n\t"
			"pand	%%mm7, %%mm5\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm4, %%mm3\n\t"
			"por	%%mm2, %%mm0\n\t"
			"por	%%mm5, %%mm3\n\t"
			"psllq	$16, %%mm3\n\t"
			"por	%%mm3, %%mm0\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole job when MMX is unavailable) */
	while(s < end)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	}
}
832 | |
/* Convert packed 24bpp (R,G,B byte order per scalar loop below) to RGB555
 * with swapped channel order relative to rgb24to15.
 * src_size is in bytes; dst receives src_size/3 16bit pixels. */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	/* NOTE(review): bound is end-15 although only 12 bytes are consumed per
	 * iteration; the loads read 12 bytes so this is a conservative margin */
	mm_end = end - 15;
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd	%1, %%mm0\n\t"
			"movd	3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm0, %%mm2\n\t"
			"movq	%%mm3, %%mm4\n\t"
			"movq	%%mm3, %%mm5\n\t"
			"psllq	$7, %%mm0\n\t"
			"psllq	$7, %%mm3\n\t"
			"pand	%%mm7, %%mm0\n\t"
			"pand	%%mm7, %%mm3\n\t"
			"psrlq	$6, %%mm1\n\t"
			"psrlq	$6, %%mm4\n\t"
			"pand	%%mm6, %%mm1\n\t"
			"pand	%%mm6, %%mm4\n\t"
			"psrlq	$19, %%mm2\n\t"
			"psrlq	$19, %%mm5\n\t"
			"pand	%2, %%mm2\n\t"
			"pand	%2, %%mm5\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm4, %%mm3\n\t"
			"por	%%mm2, %%mm0\n\t"
			"por	%%mm5, %%mm3\n\t"
			"psllq	$16, %%mm3\n\t"
			"por	%%mm3, %%mm0\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole job when MMX is unavailable) */
	while(s < end)
	{
		const int r= *s++;
		const int g= *s++;
		const int b= *s++;
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	}
}
895 | |
896 /* | |
897 I use here less accurate approximation by simply | |
898 left-shifting the input | |
899 value and filling the low order bits with | |
900 zeroes. This method improves png's | |
901 compression but this scheme cannot reproduce white exactly, since it does not | |
902 generate an all-ones maximum value; the net effect is to darken the | |
903 image slightly. | |
904 | |
A better method would be "left bit replication":
906 | |
907 4 3 2 1 0 | |
908 --------- | |
909 1 1 0 1 1 | |
910 | |
911 7 6 5 4 3 2 1 0 | |
912 ---------------- | |
913 1 1 0 1 1 1 1 0 | |
914 |=======| |===| | |
915 | Leftmost Bits Repeated to Fill Open Bits | |
916 | | |
917 Original Bits | |
918 */ | |
/* Expand RGB555 to packed 24bpp by left-shifting each 5bit field into the
 * top of its byte (low bits zero-filled; see the comment above this
 * function for why this slightly darkens the image).
 * src_size is in bytes; dst receives src_size/2*3 bytes.
 * The MMX path processes 8 pixels per iteration in two asm statements:
 * the first unpacks the fields into 32bpp-like form left in mm0/mm3/mm6/mm7,
 * the second (a borrowed 32->24 pack) must run immediately after since it
 * relies on those register contents surviving between the two asm blocks. */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	mm_end = end - 7;	/* 8 source pixels per iteration */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq	%1, %%mm0\n\t"
			"movq	%1, %%mm1\n\t"
			"movq	%1, %%mm2\n\t"
			"pand	%2, %%mm0\n\t"
			"pand	%3, %%mm1\n\t"
			"pand	%4, %%mm2\n\t"
			"psllq	$3, %%mm0\n\t"
			"psrlq	$2, %%mm1\n\t"
			"psrlq	$7, %%mm2\n\t"
			"movq	%%mm0, %%mm3\n\t"
			"movq	%%mm1, %%mm4\n\t"
			"movq	%%mm2, %%mm5\n\t"
			"punpcklwd %5, %%mm0\n\t"
			"punpcklwd %5, %%mm1\n\t"
			"punpcklwd %5, %%mm2\n\t"
			"punpckhwd %5, %%mm3\n\t"
			"punpckhwd %5, %%mm4\n\t"
			"punpckhwd %5, %%mm5\n\t"
			"psllq	$8, %%mm1\n\t"
			"psllq	$16, %%mm2\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm2, %%mm0\n\t"
			"psllq	$8, %%mm4\n\t"
			"psllq	$16, %%mm5\n\t"
			"por	%%mm4, %%mm3\n\t"
			"por	%%mm5, %%mm3\n\t"

			"movq	%%mm0, %%mm6\n\t"
			"movq	%%mm3, %%mm7\n\t"

			"movq	8%1, %%mm0\n\t"
			"movq	8%1, %%mm1\n\t"
			"movq	8%1, %%mm2\n\t"
			"pand	%2, %%mm0\n\t"
			"pand	%3, %%mm1\n\t"
			"pand	%4, %%mm2\n\t"
			"psllq	$3, %%mm0\n\t"
			"psrlq	$2, %%mm1\n\t"
			"psrlq	$7, %%mm2\n\t"
			"movq	%%mm0, %%mm3\n\t"
			"movq	%%mm1, %%mm4\n\t"
			"movq	%%mm2, %%mm5\n\t"
			"punpcklwd %5, %%mm0\n\t"
			"punpcklwd %5, %%mm1\n\t"
			"punpcklwd %5, %%mm2\n\t"
			"punpckhwd %5, %%mm3\n\t"
			"punpckhwd %5, %%mm4\n\t"
			"punpckhwd %5, %%mm5\n\t"
			"psllq	$8, %%mm1\n\t"
			"psllq	$16, %%mm2\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm2, %%mm0\n\t"
			"psllq	$8, %%mm4\n\t"
			"psllq	$16, %%mm5\n\t"
			"por	%%mm4, %%mm3\n\t"
			"por	%%mm5, %%mm3\n\t"

			:"=m"(*d)
			:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
			:"memory");
		/* Borrowed 32 to 24 */
		__asm __volatile(
			"movq	%%mm0, %%mm4\n\t"
			"movq	%%mm3, %%mm5\n\t"
			"movq	%%mm6, %%mm0\n\t"
			"movq	%%mm7, %%mm1\n\t"

			"movq	%%mm4, %%mm6\n\t"
			"movq	%%mm5, %%mm7\n\t"
			"movq	%%mm0, %%mm2\n\t"
			"movq	%%mm1, %%mm3\n\t"

			"psrlq	$8, %%mm2\n\t"
			"psrlq	$8, %%mm3\n\t"
			"psrlq	$8, %%mm6\n\t"
			"psrlq	$8, %%mm7\n\t"
			"pand	%2, %%mm0\n\t"
			"pand	%2, %%mm1\n\t"
			"pand	%2, %%mm4\n\t"
			"pand	%2, %%mm5\n\t"
			"pand	%3, %%mm2\n\t"
			"pand	%3, %%mm3\n\t"
			"pand	%3, %%mm6\n\t"
			"pand	%3, %%mm7\n\t"
			"por	%%mm2, %%mm0\n\t"
			"por	%%mm3, %%mm1\n\t"
			"por	%%mm6, %%mm4\n\t"
			"por	%%mm7, %%mm5\n\t"

			"movq	%%mm1, %%mm2\n\t"
			"movq	%%mm4, %%mm3\n\t"
			"psllq	$48, %%mm2\n\t"
			"psllq	$32, %%mm3\n\t"
			"pand	%4, %%mm2\n\t"
			"pand	%5, %%mm3\n\t"
			"por	%%mm2, %%mm0\n\t"
			"psrlq	$16, %%mm1\n\t"
			"psrlq	$32, %%mm4\n\t"
			"psllq	$16, %%mm5\n\t"
			"por	%%mm3, %%mm1\n\t"
			"pand	%6, %%mm5\n\t"
			"por	%%mm5, %%mm4\n\t"

			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm1, 8%0\n\t"
			MOVNTQ"	%%mm4, 16%0"

			:"=m"(*d)
			:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
			:"memory");
		d += 24;
		s += 8;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole job when MMX is unavailable):
	   shift each field to the top of its own byte, low bits zero */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x7C00)>>7;
	}
}
1060 | |
/*
 * Convert packed RGB565 (16bpp) pixels in src (src_size bytes) to packed
 * 24bpp pixels in dst.
 * MMX path: 8 pixels per iteration. Each group of 4 pixels is first
 * expanded to 32bpp in registers (B/G/R fields isolated, scaled to 8 bits,
 * widened to dwords, OR-merged); the resulting quadwords are then funneled
 * down to 24 packed bytes ("borrowed 32 to 24" shuffle).
 * The scalar loop converts the remainder, or everything without MMX.
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (const uint16_t *)src;
	end = s + src_size/2; /* one past the last 16bit source pixel */
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	mm_end = end - 7; /* MMX loop eats 8 pixels at a time; rest goes to the tail loop */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		/* pixels 0-3: isolate colour fields and align each to 8 bits */
		"movq %1, %%mm0\n\t"
		"movq %1, %%mm1\n\t"
		"movq %1, %%mm2\n\t"
		"pand %2, %%mm0\n\t" /* blue (5 bits) */
		"pand %3, %%mm1\n\t" /* green (6 bits) */
		"pand %4, %%mm2\n\t" /* red (5 bits) */
		"psllq $3, %%mm0\n\t"
		"psrlq $3, %%mm1\n\t"
		"psrlq $8, %%mm2\n\t"
		/* widen words to dwords (%5 = mmx_null, a zero qword) and
		   merge into B | G<<8 | R<<16 32bpp pixels */
		"movq %%mm0, %%mm3\n\t"
		"movq %%mm1, %%mm4\n\t"
		"movq %%mm2, %%mm5\n\t"
		"punpcklwd %5, %%mm0\n\t"
		"punpcklwd %5, %%mm1\n\t"
		"punpcklwd %5, %%mm2\n\t"
		"punpckhwd %5, %%mm3\n\t"
		"punpckhwd %5, %%mm4\n\t"
		"punpckhwd %5, %%mm5\n\t"
		"psllq $8, %%mm1\n\t"
		"psllq $16, %%mm2\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm2, %%mm0\n\t"
		"psllq $8, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm5, %%mm3\n\t"

		/* park pixels 0-3 (now 32bpp) in mm6/mm7 */
		"movq %%mm0, %%mm6\n\t"
		"movq %%mm3, %%mm7\n\t"

		/* pixels 4-7: same expansion, results stay in mm0/mm3 */
		"movq 8%1, %%mm0\n\t"
		"movq 8%1, %%mm1\n\t"
		"movq 8%1, %%mm2\n\t"
		"pand %2, %%mm0\n\t"
		"pand %3, %%mm1\n\t"
		"pand %4, %%mm2\n\t"
		"psllq $3, %%mm0\n\t"
		"psrlq $3, %%mm1\n\t"
		"psrlq $8, %%mm2\n\t"
		"movq %%mm0, %%mm3\n\t"
		"movq %%mm1, %%mm4\n\t"
		"movq %%mm2, %%mm5\n\t"
		"punpcklwd %5, %%mm0\n\t"
		"punpcklwd %5, %%mm1\n\t"
		"punpcklwd %5, %%mm2\n\t"
		"punpckhwd %5, %%mm3\n\t"
		"punpckhwd %5, %%mm4\n\t"
		"punpckhwd %5, %%mm5\n\t"
		"psllq $8, %%mm1\n\t"
		"psllq $16, %%mm2\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm2, %%mm0\n\t"
		"psllq $8, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm5, %%mm3\n\t"
		:"=m"(*d)
		:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
		:"memory");
	    /* Borrowed 32 to 24: drop the unused top byte of every 32bpp pixel
	       and pack the 8 pixels (in mm0,mm3,mm6,mm7) into 24 output bytes.
	       Registers carry over from the previous asm statement. */
	    __asm __volatile(
		"movq %%mm0, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"movq %%mm6, %%mm0\n\t"
		"movq %%mm7, %%mm1\n\t"

		"movq %%mm4, %%mm6\n\t"
		"movq %%mm5, %%mm7\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm1, %%mm3\n\t"

		"psrlq $8, %%mm2\n\t"
		"psrlq $8, %%mm3\n\t"
		"psrlq $8, %%mm6\n\t"
		"psrlq $8, %%mm7\n\t"
		"pand %2, %%mm0\n\t"
		"pand %2, %%mm1\n\t"
		"pand %2, %%mm4\n\t"
		"pand %2, %%mm5\n\t"
		"pand %3, %%mm2\n\t"
		"pand %3, %%mm3\n\t"
		"pand %3, %%mm6\n\t"
		"pand %3, %%mm7\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm3, %%mm1\n\t"
		"por %%mm6, %%mm4\n\t"
		"por %%mm7, %%mm5\n\t"

		"movq %%mm1, %%mm2\n\t"
		"movq %%mm4, %%mm3\n\t"
		"psllq $48, %%mm2\n\t"
		"psllq $32, %%mm3\n\t"
		"pand %4, %%mm2\n\t"
		"pand %5, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"psrlq $16, %%mm1\n\t"
		"psrlq $32, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm3, %%mm1\n\t"
		"pand %6, %%mm5\n\t"
		"por %%mm5, %%mm4\n\t"

		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm1, 8%0\n\t"
		MOVNTQ" %%mm4, 16%0"

		:"=m"(*d)
		:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
		:"memory");
	    d += 24; /* wrote 8 pixels * 3 bytes */
	    s += 8;  /* consumed 8 pixels * 2 bytes */
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar fallback / tail: emit B, G, R bytes scaled up from 5/6/5 bits */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0xF800)>>8;
	}
}
1201 | |
/*
 * Convert packed RGB555 (15bpp) pixels in src (src_size bytes) to packed
 * 32bpp pixels in dst; the 4th output byte is set to 0 (no alpha).
 * MMX path: 4 pixels per iteration, mm7 is kept zero and used by the
 * punpck*wd instructions to widen words to dwords.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (const uint16_t *)src;
	end = s + src_size/2; /* one past the last 16bit source pixel */
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); /* zero register for unpacking */
	mm_end = end - 3; /* 4 pixels per MMX iteration */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		/* isolate B/G/R fields and scale each to 8 bits */
		"movq %1, %%mm0\n\t"
		"movq %1, %%mm1\n\t"
		"movq %1, %%mm2\n\t"
		"pand %2, %%mm0\n\t" /* blue */
		"pand %3, %%mm1\n\t" /* green */
		"pand %4, %%mm2\n\t" /* red */
		"psllq $3, %%mm0\n\t"
		"psrlq $2, %%mm1\n\t"
		"psrlq $7, %%mm2\n\t"
		/* widen to dwords and OR the channels into place */
		"movq %%mm0, %%mm3\n\t"
		"movq %%mm1, %%mm4\n\t"
		"movq %%mm2, %%mm5\n\t"
		"punpcklwd %%mm7, %%mm0\n\t"
		"punpcklwd %%mm7, %%mm1\n\t"
		"punpcklwd %%mm7, %%mm2\n\t"
		"punpckhwd %%mm7, %%mm3\n\t"
		"punpckhwd %%mm7, %%mm4\n\t"
		"punpckhwd %%mm7, %%mm5\n\t"
		"psllq $8, %%mm1\n\t"
		"psllq $16, %%mm2\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm2, %%mm0\n\t"
		"psllq $8, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm5, %%mm3\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm3, 8%0\n\t"
		:"=m"(*d)
		:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
		:"memory");
	    d += 16; /* 4 pixels * 4 bytes */
	    s += 4;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar fallback / tail */
	while(s < end)
	{
#if 0 //slightly slower on athlon
		int bgr= *s++;
		*((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
		register uint16_t bgr;
		bgr = *s++;
#ifdef WORDS_BIGENDIAN
		*d++ = 0;
		*d++ = (bgr&0x7C00)>>7;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x1F)<<3;
#else
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x7C00)>>7;
		*d++ = 0;
#endif

#endif
	}
}
1279 | |
/*
 * Convert packed RGB565 (16bpp) pixels in src (src_size bytes) to packed
 * 32bpp pixels in dst; the 4th output byte is set to 0 (no alpha).
 * Same structure as rgb15to32, only the field masks and shift counts
 * differ (5/6/5 bit layout instead of 5/5/5).
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (uint16_t *)src;
	end = s + src_size/2; /* one past the last 16bit source pixel */
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); /* zero register for unpacking */
	mm_end = end - 3; /* 4 pixels per MMX iteration */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		/* isolate B/G/R fields and scale each to 8 bits */
		"movq %1, %%mm0\n\t"
		"movq %1, %%mm1\n\t"
		"movq %1, %%mm2\n\t"
		"pand %2, %%mm0\n\t" /* blue */
		"pand %3, %%mm1\n\t" /* green */
		"pand %4, %%mm2\n\t" /* red */
		"psllq $3, %%mm0\n\t"
		"psrlq $3, %%mm1\n\t"
		"psrlq $8, %%mm2\n\t"
		/* widen to dwords and OR the channels into place */
		"movq %%mm0, %%mm3\n\t"
		"movq %%mm1, %%mm4\n\t"
		"movq %%mm2, %%mm5\n\t"
		"punpcklwd %%mm7, %%mm0\n\t"
		"punpcklwd %%mm7, %%mm1\n\t"
		"punpcklwd %%mm7, %%mm2\n\t"
		"punpckhwd %%mm7, %%mm3\n\t"
		"punpckhwd %%mm7, %%mm4\n\t"
		"punpckhwd %%mm7, %%mm5\n\t"
		"psllq $8, %%mm1\n\t"
		"psllq $16, %%mm2\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm2, %%mm0\n\t"
		"psllq $8, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm5, %%mm3\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm3, 8%0\n\t"
		:"=m"(*d)
		:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
		:"memory");
	    d += 16; /* 4 pixels * 4 bytes */
	    s += 4;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar fallback / tail */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
#ifdef WORDS_BIGENDIAN
		*d++ = 0;
		*d++ = (bgr&0xF800)>>8;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0x1F)<<3;
#else
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0xF800)>>8;
		*d++ = 0;
#endif
	}
}
1351 | |
/*
 * Swap the R and B channels of a packed 32bpp buffer.
 * MMX path: 2 pixels per quadword; the two shifted copies move each of
 * the swapped channels into the other's slot, the unshifted copy keeps
 * the bytes selected by mask32g, and the three are OR-merged.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
	asm volatile (
		"xor %%"REG_a", %%"REG_a" \n\t"
		ASMALIGN16
		"1: \n\t"
		PREFETCH" 32(%0, %%"REG_a") \n\t"
		"movq (%0, %%"REG_a"), %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"movq %%mm0, %%mm2 \n\t"
		"pslld $16, %%mm0 \n\t" /* low channel moved up */
		"psrld $16, %%mm1 \n\t" /* high channel moved down */
		"pand "MANGLE(mask32r)", %%mm0 \n\t"
		"pand "MANGLE(mask32g)", %%mm2 \n\t" /* unswapped bytes pass through */
		"pand "MANGLE(mask32b)", %%mm1 \n\t"
		"por %%mm0, %%mm2 \n\t"
		"por %%mm1, %%mm2 \n\t"
		MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
		"add $8, %%"REG_a" \n\t"
		"cmp %2, %%"REG_a" \n\t" /* bound is src_size-7: whole quadwords only */
		" jb 1b \n\t"
		:: "r" (src), "r"(dst), "r" (src_size-7)
		: "%"REG_a
	);

	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#else
	unsigned i;
	unsigned num_pixels = src_size >> 2;
	for(i=0; i<num_pixels; i++)
	{
	/* NOTE(review): in both byte orders one byte per pixel (the alpha
	   slot) is never written, so dst keeps its previous contents there
	   - confirm callers rely on / tolerate this */
#ifdef WORDS_BIGENDIAN
		dst[4*i + 1] = src[4*i + 3];
		dst[4*i + 2] = src[4*i + 2];
		dst[4*i + 3] = src[4*i + 1];
#else
		dst[4*i + 0] = src[4*i + 2];
		dst[4*i + 1] = src[4*i + 1];
		dst[4*i + 2] = src[4*i + 0];
#endif
	}
#endif
}
1398 | |
/*
 * Swap the R and B channels of a packed 24bpp buffer.
 * MMX path: converts 24 source bytes (8 pixels) per iteration, indexing
 * with a negative counter (mmx_size) that runs up toward 0; the masks in
 * mm5/mm6/mm7 pick the R, G and B byte lanes out of three overlapping
 * loads. The scalar loop afterwards converts any leftover pixels.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
	unsigned i;
#ifdef HAVE_MMX
	long mmx_size= 23 - src_size; /* negative loop counter, stops when >= 0 */
	asm volatile (
		"movq "MANGLE(mask24r)", %%mm5 \n\t"
		"movq "MANGLE(mask24g)", %%mm6 \n\t"
		"movq "MANGLE(mask24b)", %%mm7 \n\t"
		ASMALIGN16
		"1: \n\t"
		PREFETCH" 32(%1, %%"REG_a") \n\t"
		"movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
		"movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
		"movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
		"psllq $16, %%mm0 \n\t" // 00 BGR BGR
		"pand %%mm5, %%mm0 \n\t"
		"pand %%mm6, %%mm1 \n\t"
		"pand %%mm7, %%mm2 \n\t"
		"por %%mm0, %%mm1 \n\t"
		"por %%mm2, %%mm1 \n\t"
		"movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
		MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
		"movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
		"movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
		"pand %%mm7, %%mm0 \n\t"
		"pand %%mm5, %%mm1 \n\t"
		"pand %%mm6, %%mm2 \n\t"
		"por %%mm0, %%mm1 \n\t"
		"por %%mm2, %%mm1 \n\t"
		"movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
		MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
		"movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
		"movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
		"pand %%mm6, %%mm0 \n\t"
		"pand %%mm7, %%mm1 \n\t"
		"pand %%mm5, %%mm2 \n\t"
		"por %%mm0, %%mm1 \n\t"
		"por %%mm2, %%mm1 \n\t"
		MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
		"add $24, %%"REG_a" \n\t"
		" js 1b \n\t"
		: "+a" (mmx_size)
		: "r" (src-mmx_size), "r"(dst-mmx_size)
	);

	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");

	if(mmx_size==23) return; //finished, was multiple of 8

	/* rewind src/dst/src_size to cover the pixels the MMX loop skipped */
	src+= src_size;
	dst+= src_size;
	src_size= 23-mmx_size;
	src-= src_size;
	dst-= src_size;
#endif
	/* scalar fallback / tail: swap bytes 0 and 2 of every pixel */
	for(i=0; i<src_size; i+=3)
	{
		register uint8_t x;
		x = src[i + 2];
		dst[i + 1] = src[i + 1];
		dst[i + 2] = src[i + 0];
		dst[i + 0] = x;
	}
}
1465 | |
/*
 * Interleave planar Y, U, V into packed YUY2 (Y0 U0 Y1 V0 ...).
 * vertLumPerChroma = number of luma lines that share one chroma line
 * (2 for YV12 input, 1 for 4:2:2 planar); it must be a power of two,
 * since the "advance chroma" test below uses it as a bit mask.
 * width is expected to be even (chromWidth = width/2 pairs per line).
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	long width, long height,
	long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
	long y;
	const long chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		/* 16 luma + 8 U + 8 V bytes -> 32 output bytes per iteration */
		asm volatile(
			"xor %%"REG_a", %%"REG_a" \n\t"
			ASMALIGN16
			"1: \n\t"
			PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
			PREFETCH" 32(%2, %%"REG_a") \n\t"
			PREFETCH" 32(%3, %%"REG_a") \n\t"
			"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
			"movq %%mm0, %%mm2 \n\t" // U(0)
			"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
			"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

			"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
			"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
			"movq %%mm3, %%mm4 \n\t" // Y(0)
			"movq %%mm5, %%mm6 \n\t" // Y(8)
			"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
			"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
			"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
			"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

			"add $8, %%"REG_a" \n\t"
			"cmp %4, %%"REG_a" \n\t"
			" jb 1b \n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
			: "%"REG_a
		);
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
		/* Alpha/MVI: builds one output quadword (4 pixels) per macro
		   call, processing two luma lines at once (hence the extra
		   y++/stride advance at the end of this branch) */
#define pl2yuy2(n)					\
	y1 = yc[n];					\
	y2 = yc2[n];					\
	u = uc[n];					\
	v = vc[n];					\
	asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));	\
	asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));	\
	asm("unpkbl %1, %0" : "=r"(u) : "r"(u));	\
	asm("unpkbl %1, %0" : "=r"(v) : "r"(v));	\
	yuv1 = (u << 8) + (v << 24);			\
	yuv2 = yuv1 + y2;				\
	yuv1 += y1;					\
	qdst[n] = yuv1;					\
	qdst2[n] = yuv2;

		int i;
		uint64_t *qdst = (uint64_t *) dst;
		uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
		const uint32_t *yc = (uint32_t *) ysrc;
		const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
		const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
		for(i = 0; i < chromWidth; i += 8){
			uint64_t y1, y2, yuv1, yuv2;
			uint64_t u, v;
			/* Prefetch */
			asm("ldq $31,64(%0)" :: "r"(yc));
			asm("ldq $31,64(%0)" :: "r"(yc2));
			asm("ldq $31,64(%0)" :: "r"(uc));
			asm("ldq $31,64(%0)" :: "r"(vc));

			pl2yuy2(0);
			pl2yuy2(1);
			pl2yuy2(2);
			pl2yuy2(3);

			yc += 4;
			yc2 += 4;
			uc += 4;
			vc += 4;
			qdst += 4;
			qdst2 += 4;
		}
		y++; /* second line was produced above */
		ysrc += lumStride;
		dst += dstStride;

#elif __WORDSIZE >= 64
		/* 64bit scalar: assemble two pixels per 64bit store */
		int i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			uint64_t k, l;
			k = yc[0] + (uc[0] << 8) +
			    (yc[1] << 16) + (vc[0] << 24);
			l = yc[2] + (uc[1] << 8) +
			    (yc[3] << 16) + (vc[1] << 24);
			*ldst++ = k + (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		/* 32bit scalar: one Y-U-Y-V pixel pair per 32bit store */
		int i, *idst = (int32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
			*idst++ = (yc[0] << 24)+ (uc[0] << 16) +
				(yc[1] << 8) + (vc[0] << 0);
#else
			*idst++ = yc[0] + (uc[0] << 8) +
				(yc[1] << 16) + (vc[0] << 24);
#endif
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		/* advance chroma only every vertLumPerChroma'th luma line */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
	asm( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1605 | |
1606 /** | |
1607 * | |
1608 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1609 * problem for anyone then tell me, and ill fix it) | |
1610 */ | |
1611 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1612 long width, long height, | |
1613 long lumStride, long chromStride, long dstStride) | |
1614 { | |
1615 //FIXME interpolate chroma | |
1616 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1617 } | |
1618 | |
/*
 * Interleave planar Y, U, V into packed UYVY (U0 Y0 V0 Y1 ...).
 * Same structure as yuvPlanartoyuy2, only the byte order of the merge
 * differs. vertLumPerChroma must be a power of two (used as bit mask).
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	long width, long height,
	long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
	long y;
	const long chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		/* 16 luma + 8 U + 8 V bytes -> 32 output bytes per iteration */
		asm volatile(
			"xor %%"REG_a", %%"REG_a" \n\t"
			ASMALIGN16
			"1: \n\t"
			PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
			PREFETCH" 32(%2, %%"REG_a") \n\t"
			PREFETCH" 32(%3, %%"REG_a") \n\t"
			"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
			"movq %%mm0, %%mm2 \n\t" // U(0)
			"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
			"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

			"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
			"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
			"movq %%mm0, %%mm4 \n\t" // Y(0)
			"movq %%mm2, %%mm6 \n\t" // Y(8)
			"punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
			"punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
			"punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
			"punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

			"add $8, %%"REG_a" \n\t"
			"cmp %4, %%"REG_a" \n\t"
			" jb 1b \n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
			: "%"REG_a
		);
#else
//FIXME adapt the alpha asm code from yv12->yuy2

#if __WORDSIZE >= 64
		/* 64bit scalar: assemble two pixels per 64bit store */
		int i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			uint64_t k, l;
			k = uc[0] + (yc[0] << 8) +
			    (vc[0] << 16) + (yc[1] << 24);
			l = uc[1] + (yc[2] << 8) +
			    (vc[1] << 16) + (yc[3] << 24);
			*ldst++ = k + (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		/* 32bit scalar: one U-Y-V-Y pixel pair per 32bit store */
		int i, *idst = (int32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
			*idst++ = (uc[0] << 24)+ (yc[0] << 16) +
				(vc[0] << 8) + (yc[1] << 0);
#else
			*idst++ = uc[0] + (yc[0] << 8) +
				(vc[0] << 16) + (yc[1] << 24);
#endif
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		/* advance chroma only every vertLumPerChroma'th luma line */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
	asm( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1712 | |
1713 /** | |
1714 * | |
1715 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1716 * problem for anyone then tell me, and ill fix it) | |
1717 */ | |
1718 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1719 long width, long height, | |
1720 long lumStride, long chromStride, long dstStride) | |
1721 { | |
1722 //FIXME interpolate chroma | |
1723 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1724 } | |
1725 | |
1726 /** | |
1727 * | |
1728 * width should be a multiple of 16 | |
1729 */ | |
1730 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1731 long width, long height, | |
1732 long lumStride, long chromStride, long dstStride) | |
1733 { | |
1734 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1735 } | |
1736 | |
1737 /** | |
1738 * | |
1739 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1740 * problem for anyone then tell me, and ill fix it) | |
1741 */ | |
1742 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
1743 long width, long height, | |
1744 long lumStride, long chromStride, long srcStride) | |
1745 { | |
1746 long y; | |
1747 const long chromWidth= width>>1; | |
1748 for(y=0; y<height; y+=2) | |
1749 { | |
1750 #ifdef HAVE_MMX | |
1751 asm volatile( | |
1752 "xor %%"REG_a", %%"REG_a" \n\t" | |
1753 "pcmpeqw %%mm7, %%mm7 \n\t" | |
1754 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
1755 ASMALIGN16 | |
1756 "1: \n\t" | |
1757 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | |
1758 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | |
1759 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) | |
1760 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) | |
1761 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | |
1762 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | |
1763 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | |
1764 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
1765 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
1766 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
1767 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
1768 | |
1769 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t" | |
1770 | |
1771 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8) | |
1772 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12) | |
1773 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) | |
1774 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | |
1775 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | |
1776 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | |
1777 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
1778 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
1779 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
1780 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
1781 | |
1782 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t" | |
1783 | |
1784 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
1785 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
1786 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
1787 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
1788 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
1789 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
1790 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
1791 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
1792 | |
1793 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" | |
1794 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" | |
1795 | |
1796 "add $8, %%"REG_a" \n\t" | |
1797 "cmp %4, %%"REG_a" \n\t" | |
1798 " jb 1b \n\t" | |
1799 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | |
1800 : "memory", "%"REG_a | |
1801 ); | |
1802 | |
1803 ydst += lumStride; | |
1804 src += srcStride; | |
1805 | |
1806 asm volatile( | |
1807 "xor %%"REG_a", %%"REG_a" \n\t" | |
1808 ASMALIGN16 | |
1809 "1: \n\t" | |
1810 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | |
1811 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | |
1812 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) | |
1813 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8) | |
1814 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12) | |
1815 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | |
1816 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
1817 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
1818 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
1819 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
1820 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
1821 | |
1822 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t" | |
1823 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t" | |
1824 | |
1825 "add $8, %%"REG_a" \n\t" | |
1826 "cmp %4, %%"REG_a" \n\t" | |
1827 " jb 1b \n\t" | |
1828 | |
1829 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | |
1830 : "memory", "%"REG_a | |
1831 ); | |
1832 #else | |
1833 long i; | |
1834 for(i=0; i<chromWidth; i++) | |
1835 { | |
1836 ydst[2*i+0] = src[4*i+0]; | |
1837 udst[i] = src[4*i+1]; | |
1838 ydst[2*i+1] = src[4*i+2]; | |
1839 vdst[i] = src[4*i+3]; | |
1840 } | |
1841 ydst += lumStride; | |
1842 src += srcStride; | |
1843 | |
1844 for(i=0; i<chromWidth; i++) | |
1845 { | |
1846 ydst[2*i+0] = src[4*i+0]; | |
1847 ydst[2*i+1] = src[4*i+2]; | |
1848 } | |
1849 #endif | |
1850 udst += chromStride; | |
1851 vdst += chromStride; | |
1852 ydst += lumStride; | |
1853 src += srcStride; | |
1854 } | |
1855 #ifdef HAVE_MMX | |
1856 asm volatile( EMMS" \n\t" | |
1857 SFENCE" \n\t" | |
1858 :::"memory"); | |
1859 #endif | |
1860 } | |
1861 | |
1862 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, | |
1863 uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
1864 long width, long height, long lumStride, long chromStride) | |
1865 { | |
1866 /* Y Plane */ | |
1867 memcpy(ydst, ysrc, width*height); | |
1868 | |
1869 /* XXX: implement upscaling for U,V */ | |
1870 } | |
1871 | |
/*
 * Upscale one plane by 2x in both directions with bilinear-style
 * 3:1 / 1:3 weighting between neighbouring source pixels.
 * First and last output lines/columns replicate the edge samples.
 * The MMX2/3DNow path approximates the 3:1 filter with repeated pavgb
 * (two averages toward the same pixel), so its rounding can differ
 * slightly from the C loop that handles the non-multiple-of-16 tail.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
	long x,y;

	dst[0]= src[0];

	// first line: horizontal interpolation only
	for(x=0; x<srcWidth-1; x++){
		dst[2*x+1]= (3*src[x] + src[x+1])>>2;
		dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
	}
	dst[2*srcWidth-1]= src[srcWidth-1];

	dst+= dstStride;

	for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		const long mmxSize= srcWidth&~15; /* 16-byte aligned part done in asm */
		/* negative index trick: REG_a runs from -mmxSize up to 0 */
		asm volatile(
			"mov %4, %%"REG_a" \n\t"
			"1: \n\t"
			"movq (%0, %%"REG_a"), %%mm0 \n\t"
			"movq (%1, %%"REG_a"), %%mm1 \n\t"
			"movq 1(%0, %%"REG_a"), %%mm2 \n\t"
			"movq 1(%1, %%"REG_a"), %%mm3 \n\t"
			"movq -1(%0, %%"REG_a"), %%mm4 \n\t"
			"movq -1(%1, %%"REG_a"), %%mm5 \n\t"
			/* two pavgb toward the same register = ~3:1 weighting */
			PAVGB" %%mm0, %%mm5 \n\t"
			PAVGB" %%mm0, %%mm3 \n\t"
			PAVGB" %%mm0, %%mm5 \n\t"
			PAVGB" %%mm0, %%mm3 \n\t"
			PAVGB" %%mm1, %%mm4 \n\t"
			PAVGB" %%mm1, %%mm2 \n\t"
			PAVGB" %%mm1, %%mm4 \n\t"
			PAVGB" %%mm1, %%mm2 \n\t"
			/* interleave the two weighted results into 2x-wide lines */
			"movq %%mm5, %%mm7 \n\t"
			"movq %%mm4, %%mm6 \n\t"
			"punpcklbw %%mm3, %%mm5 \n\t"
			"punpckhbw %%mm3, %%mm7 \n\t"
			"punpcklbw %%mm2, %%mm4 \n\t"
			"punpckhbw %%mm2, %%mm6 \n\t"
#if 1
			MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
			MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
			MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
			MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
#else
			"movq %%mm5, (%2, %%"REG_a", 2) \n\t"
			"movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
			"movq %%mm4, (%3, %%"REG_a", 2) \n\t"
			"movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
#endif
			"add $8, %%"REG_a" \n\t"
			" js 1b \n\t"
			:: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
			   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
			   "g" (-mmxSize)
			: "%"REG_a

		);
#else
		const long mmxSize=1; /* C-only build: scalar loop does the whole line */
#endif
		/* left edge of the two output lines */
		dst[0 ]= (3*src[0] + src[srcStride])>>2;
		dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;

		/* scalar tail (or whole line without MMX2/3DNow) */
		for(x=mmxSize-1; x<srcWidth-1; x++){
			dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
			dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
			dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
			dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
		}
		/* right edge of the two output lines */
		dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
		dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

		dst+=dstStride*2;
		src+=srcStride;
	}

	// last line: horizontal interpolation only
#if 1
	dst[0]= src[0];

	for(x=0; x<srcWidth-1; x++){
		dst[2*x+1]= (3*src[x] + src[x+1])>>2;
		dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
	}
	dst[2*srcWidth-1]= src[srcWidth-1];
#else
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
#endif

#ifdef HAVE_MMX
	asm volatile( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1973 | |
1974 /** | |
1975 * | |
1976 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a | |
1977 * problem for anyone then tell me, and ill fix it) | |
1978 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version | |
1979 */ | |
1980 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
1981 long width, long height, | |
1982 long lumStride, long chromStride, long srcStride) | |
1983 { | |
1984 long y; | |
1985 const long chromWidth= width>>1; | |
1986 for(y=0; y<height; y+=2) | |
1987 { | |
1988 #ifdef HAVE_MMX | |
1989 asm volatile( | |
1990 "xorl %%eax, %%eax \n\t" | |
1991 "pcmpeqw %%mm7, %%mm7 \n\t" | |
1992 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |
1993 ASMALIGN16 | |
1994 "1: \n\t" | |
1995 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
1996 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) | |
1997 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) | |
1998 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) | |
1999 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) | |
2000 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) | |
2001 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) | |
2002 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | |
2003 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | |
2004 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |
2005 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |
2006 | |
2007 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" | |
2008 | |
2009 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) | |
2010 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) | |
2011 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) | |
2012 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) | |
2013 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) | |
2014 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) | |
2015 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | |
2016 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | |
2017 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |
2018 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |
2019 | |
2020 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" | |
2021 | |
2022 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |
2023 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |
2024 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | |
2025 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | |
2026 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | |
2027 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | |
2028 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |
2029 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |
2030 | |
2031 MOVNTQ" %%mm0, (%3, %%eax) \n\t" | |
2032 MOVNTQ" %%mm2, (%2, %%eax) \n\t" | |
2033 | |
2034 "addl $8, %%eax \n\t" | |
2035 "cmpl %4, %%eax \n\t" | |
2036 " jb 1b \n\t" | |
2037 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | |
2038 : "memory", "%eax" | |
2039 ); | |
2040 | |
2041 ydst += lumStride; | |
2042 src += srcStride; | |
2043 | |
2044 asm volatile( | |
2045 "xorl %%eax, %%eax \n\t" | |
2046 ASMALIGN16 | |
2047 "1: \n\t" | |
2048 PREFETCH" 64(%0, %%eax, 4) \n\t" | |
2049 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) | |
2050 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) | |
2051 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) | |
2052 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) | |
2053 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | |
2054 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |
2055 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |
2056 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | |
2057 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |
2058 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |
2059 | |
2060 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" | |
2061 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" | |
2062 | |
2063 "addl $8, %%eax \n\t" | |
2064 "cmpl %4, %%eax \n\t" | |
2065 " jb 1b \n\t" | |
2066 | |
2067 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | |
2068 : "memory", "%eax" | |
2069 ); | |
2070 #else | |
2071 long i; | |
2072 for(i=0; i<chromWidth; i++) | |
2073 { | |
2074 udst[i] = src[4*i+0]; | |
2075 ydst[2*i+0] = src[4*i+1]; | |
2076 vdst[i] = src[4*i+2]; | |
2077 ydst[2*i+1] = src[4*i+3]; | |
2078 } | |
2079 ydst += lumStride; | |
2080 src += srcStride; | |
2081 | |
2082 for(i=0; i<chromWidth; i++) | |
2083 { | |
2084 ydst[2*i+0] = src[4*i+1]; | |
2085 ydst[2*i+1] = src[4*i+3]; | |
2086 } | |
2087 #endif | |
2088 udst += chromStride; | |
2089 vdst += chromStride; | |
2090 ydst += lumStride; | |
2091 src += srcStride; | |
2092 } | |
2093 #ifdef HAVE_MMX | |
2094 asm volatile( EMMS" \n\t" | |
2095 SFENCE" \n\t" | |
2096 :::"memory"); | |
2097 #endif | |
2098 } | |
2099 | |
2100 /** | |
2101 * | |
2102 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a | |
2103 * problem for anyone then tell me, and I'll fix it) | |
2104 * chrominance data is only taken from every second line; others are ignored in the C version. FIXME write HQ version | |
2105 */ | |
2106 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
2107 long width, long height, | |
2108 long lumStride, long chromStride, long srcStride) | |
2109 { | |
2110 long y; | |
2111 const long chromWidth= width>>1; | |
2112 #ifdef HAVE_MMX | |
2113 for(y=0; y<height-2; y+=2) | |
2114 { | |
2115 long i; | |
2116 for(i=0; i<2; i++) | |
2117 { | |
2118 asm volatile( | |
2119 "mov %2, %%"REG_a" \n\t" | |
2120 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" | |
2121 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
2122 "pxor %%mm7, %%mm7 \n\t" | |
2123 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" | |
2124 ASMALIGN16 | |
2125 "1: \n\t" | |
2126 PREFETCH" 64(%0, %%"REG_b") \n\t" | |
2127 "movd (%0, %%"REG_b"), %%mm0 \n\t" | |
2128 "movd 3(%0, %%"REG_b"), %%mm1 \n\t" | |
2129 "punpcklbw %%mm7, %%mm0 \n\t" | |
2130 "punpcklbw %%mm7, %%mm1 \n\t" | |
2131 "movd 6(%0, %%"REG_b"), %%mm2 \n\t" | |
2132 "movd 9(%0, %%"REG_b"), %%mm3 \n\t" | |
2133 "punpcklbw %%mm7, %%mm2 \n\t" | |
2134 "punpcklbw %%mm7, %%mm3 \n\t" | |
2135 "pmaddwd %%mm6, %%mm0 \n\t" | |
2136 "pmaddwd %%mm6, %%mm1 \n\t" | |
2137 "pmaddwd %%mm6, %%mm2 \n\t" | |
2138 "pmaddwd %%mm6, %%mm3 \n\t" | |
2139 #ifndef FAST_BGR2YV12 | |
2140 "psrad $8, %%mm0 \n\t" | |
2141 "psrad $8, %%mm1 \n\t" | |
2142 "psrad $8, %%mm2 \n\t" | |
2143 "psrad $8, %%mm3 \n\t" | |
2144 #endif | |
2145 "packssdw %%mm1, %%mm0 \n\t" | |
2146 "packssdw %%mm3, %%mm2 \n\t" | |
2147 "pmaddwd %%mm5, %%mm0 \n\t" | |
2148 "pmaddwd %%mm5, %%mm2 \n\t" | |
2149 "packssdw %%mm2, %%mm0 \n\t" | |
2150 "psraw $7, %%mm0 \n\t" | |
2151 | |
2152 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" | |
2153 "movd 15(%0, %%"REG_b"), %%mm1 \n\t" | |
2154 "punpcklbw %%mm7, %%mm4 \n\t" | |
2155 "punpcklbw %%mm7, %%mm1 \n\t" | |
2156 "movd 18(%0, %%"REG_b"), %%mm2 \n\t" | |
2157 "movd 21(%0, %%"REG_b"), %%mm3 \n\t" | |
2158 "punpcklbw %%mm7, %%mm2 \n\t" | |
2159 "punpcklbw %%mm7, %%mm3 \n\t" | |
2160 "pmaddwd %%mm6, %%mm4 \n\t" | |
2161 "pmaddwd %%mm6, %%mm1 \n\t" | |
2162 "pmaddwd %%mm6, %%mm2 \n\t" | |
2163 "pmaddwd %%mm6, %%mm3 \n\t" | |
2164 #ifndef FAST_BGR2YV12 | |
2165 "psrad $8, %%mm4 \n\t" | |
2166 "psrad $8, %%mm1 \n\t" | |
2167 "psrad $8, %%mm2 \n\t" | |
2168 "psrad $8, %%mm3 \n\t" | |
2169 #endif | |
2170 "packssdw %%mm1, %%mm4 \n\t" | |
2171 "packssdw %%mm3, %%mm2 \n\t" | |
2172 "pmaddwd %%mm5, %%mm4 \n\t" | |
2173 "pmaddwd %%mm5, %%mm2 \n\t" | |
2174 "add $24, %%"REG_b" \n\t" | |
2175 "packssdw %%mm2, %%mm4 \n\t" | |
2176 "psraw $7, %%mm4 \n\t" | |
2177 | |
2178 "packuswb %%mm4, %%mm0 \n\t" | |
2179 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" | |
2180 | |
2181 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" | |
2182 "add $8, %%"REG_a" \n\t" | |
2183 " js 1b \n\t" | |
2184 : : "r" (src+width*3), "r" (ydst+width), "g" (-width) | |
2185 : "%"REG_a, "%"REG_b | |
2186 ); | |
2187 ydst += lumStride; | |
2188 src += srcStride; | |
2189 } | |
2190 src -= srcStride*2; | |
2191 asm volatile( | |
2192 "mov %4, %%"REG_a" \n\t" | |
2193 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
2194 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
2195 "pxor %%mm7, %%mm7 \n\t" | |
2196 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" | |
2197 "add %%"REG_b", %%"REG_b" \n\t" | |
2198 ASMALIGN16 | |
2199 "1: \n\t" | |
2200 PREFETCH" 64(%0, %%"REG_b") \n\t" | |
2201 PREFETCH" 64(%1, %%"REG_b") \n\t" | |
2202 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
2203 "movq (%0, %%"REG_b"), %%mm0 \n\t" | |
2204 "movq (%1, %%"REG_b"), %%mm1 \n\t" | |
2205 "movq 6(%0, %%"REG_b"), %%mm2 \n\t" | |
2206 "movq 6(%1, %%"REG_b"), %%mm3 \n\t" | |
2207 PAVGB" %%mm1, %%mm0 \n\t" | |
2208 PAVGB" %%mm3, %%mm2 \n\t" | |
2209 "movq %%mm0, %%mm1 \n\t" | |
2210 "movq %%mm2, %%mm3 \n\t" | |
2211 "psrlq $24, %%mm0 \n\t" | |
2212 "psrlq $24, %%mm2 \n\t" | |
2213 PAVGB" %%mm1, %%mm0 \n\t" | |
2214 PAVGB" %%mm3, %%mm2 \n\t" | |
2215 "punpcklbw %%mm7, %%mm0 \n\t" | |
2216 "punpcklbw %%mm7, %%mm2 \n\t" | |
2217 #else | |
2218 "movd (%0, %%"REG_b"), %%mm0 \n\t" | |
2219 "movd (%1, %%"REG_b"), %%mm1 \n\t" | |
2220 "movd 3(%0, %%"REG_b"), %%mm2 \n\t" | |
2221 "movd 3(%1, %%"REG_b"), %%mm3 \n\t" | |
2222 "punpcklbw %%mm7, %%mm0 \n\t" | |
2223 "punpcklbw %%mm7, %%mm1 \n\t" | |
2224 "punpcklbw %%mm7, %%mm2 \n\t" | |
2225 "punpcklbw %%mm7, %%mm3 \n\t" | |
2226 "paddw %%mm1, %%mm0 \n\t" | |
2227 "paddw %%mm3, %%mm2 \n\t" | |
2228 "paddw %%mm2, %%mm0 \n\t" | |
2229 "movd 6(%0, %%"REG_b"), %%mm4 \n\t" | |
2230 "movd 6(%1, %%"REG_b"), %%mm1 \n\t" | |
2231 "movd 9(%0, %%"REG_b"), %%mm2 \n\t" | |
2232 "movd 9(%1, %%"REG_b"), %%mm3 \n\t" | |
2233 "punpcklbw %%mm7, %%mm4 \n\t" | |
2234 "punpcklbw %%mm7, %%mm1 \n\t" | |
2235 "punpcklbw %%mm7, %%mm2 \n\t" | |
2236 "punpcklbw %%mm7, %%mm3 \n\t" | |
2237 "paddw %%mm1, %%mm4 \n\t" | |
2238 "paddw %%mm3, %%mm2 \n\t" | |
2239 "paddw %%mm4, %%mm2 \n\t" | |
2240 "psrlw $2, %%mm0 \n\t" | |
2241 "psrlw $2, %%mm2 \n\t" | |
2242 #endif | |
2243 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" | |
2244 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
2245 | |
2246 "pmaddwd %%mm0, %%mm1 \n\t" | |
2247 "pmaddwd %%mm2, %%mm3 \n\t" | |
2248 "pmaddwd %%mm6, %%mm0 \n\t" | |
2249 "pmaddwd %%mm6, %%mm2 \n\t" | |
2250 #ifndef FAST_BGR2YV12 | |
2251 "psrad $8, %%mm0 \n\t" | |
2252 "psrad $8, %%mm1 \n\t" | |
2253 "psrad $8, %%mm2 \n\t" | |
2254 "psrad $8, %%mm3 \n\t" | |
2255 #endif | |
2256 "packssdw %%mm2, %%mm0 \n\t" | |
2257 "packssdw %%mm3, %%mm1 \n\t" | |
2258 "pmaddwd %%mm5, %%mm0 \n\t" | |
2259 "pmaddwd %%mm5, %%mm1 \n\t" | |
2260 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
2261 "psraw $7, %%mm0 \n\t" | |
2262 | |
2263 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
2264 "movq 12(%0, %%"REG_b"), %%mm4 \n\t" | |
2265 "movq 12(%1, %%"REG_b"), %%mm1 \n\t" | |
2266 "movq 18(%0, %%"REG_b"), %%mm2 \n\t" | |
2267 "movq 18(%1, %%"REG_b"), %%mm3 \n\t" | |
2268 PAVGB" %%mm1, %%mm4 \n\t" | |
2269 PAVGB" %%mm3, %%mm2 \n\t" | |
2270 "movq %%mm4, %%mm1 \n\t" | |
2271 "movq %%mm2, %%mm3 \n\t" | |
2272 "psrlq $24, %%mm4 \n\t" | |
2273 "psrlq $24, %%mm2 \n\t" | |
2274 PAVGB" %%mm1, %%mm4 \n\t" | |
2275 PAVGB" %%mm3, %%mm2 \n\t" | |
2276 "punpcklbw %%mm7, %%mm4 \n\t" | |
2277 "punpcklbw %%mm7, %%mm2 \n\t" | |
2278 #else | |
2279 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" | |
2280 "movd 12(%1, %%"REG_b"), %%mm1 \n\t" | |
2281 "movd 15(%0, %%"REG_b"), %%mm2 \n\t" | |
2282 "movd 15(%1, %%"REG_b"), %%mm3 \n\t" | |
2283 "punpcklbw %%mm7, %%mm4 \n\t" | |
2284 "punpcklbw %%mm7, %%mm1 \n\t" | |
2285 "punpcklbw %%mm7, %%mm2 \n\t" | |
2286 "punpcklbw %%mm7, %%mm3 \n\t" | |
2287 "paddw %%mm1, %%mm4 \n\t" | |
2288 "paddw %%mm3, %%mm2 \n\t" | |
2289 "paddw %%mm2, %%mm4 \n\t" | |
2290 "movd 18(%0, %%"REG_b"), %%mm5 \n\t" | |
2291 "movd 18(%1, %%"REG_b"), %%mm1 \n\t" | |
2292 "movd 21(%0, %%"REG_b"), %%mm2 \n\t" | |
2293 "movd 21(%1, %%"REG_b"), %%mm3 \n\t" | |
2294 "punpcklbw %%mm7, %%mm5 \n\t" | |
2295 "punpcklbw %%mm7, %%mm1 \n\t" | |
2296 "punpcklbw %%mm7, %%mm2 \n\t" | |
2297 "punpcklbw %%mm7, %%mm3 \n\t" | |
2298 "paddw %%mm1, %%mm5 \n\t" | |
2299 "paddw %%mm3, %%mm2 \n\t" | |
2300 "paddw %%mm5, %%mm2 \n\t" | |
2301 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
2302 "psrlw $2, %%mm4 \n\t" | |
2303 "psrlw $2, %%mm2 \n\t" | |
2304 #endif | |
2305 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" | |
2306 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
2307 | |
2308 "pmaddwd %%mm4, %%mm1 \n\t" | |
2309 "pmaddwd %%mm2, %%mm3 \n\t" | |
2310 "pmaddwd %%mm6, %%mm4 \n\t" | |
2311 "pmaddwd %%mm6, %%mm2 \n\t" | |
2312 #ifndef FAST_BGR2YV12 | |
2313 "psrad $8, %%mm4 \n\t" | |
2314 "psrad $8, %%mm1 \n\t" | |
2315 "psrad $8, %%mm2 \n\t" | |
2316 "psrad $8, %%mm3 \n\t" | |
2317 #endif | |
2318 "packssdw %%mm2, %%mm4 \n\t" | |
2319 "packssdw %%mm3, %%mm1 \n\t" | |
2320 "pmaddwd %%mm5, %%mm4 \n\t" | |
2321 "pmaddwd %%mm5, %%mm1 \n\t" | |
2322 "add $24, %%"REG_b" \n\t" | |
2323 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | |
2324 "psraw $7, %%mm4 \n\t" | |
2325 | |
2326 "movq %%mm0, %%mm1 \n\t" | |
2327 "punpckldq %%mm4, %%mm0 \n\t" | |
2328 "punpckhdq %%mm4, %%mm1 \n\t" | |
2329 "packsswb %%mm1, %%mm0 \n\t" | |
2330 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" | |
2331 "movd %%mm0, (%2, %%"REG_a") \n\t" | |
2332 "punpckhdq %%mm0, %%mm0 \n\t" | |
2333 "movd %%mm0, (%3, %%"REG_a") \n\t" | |
2334 "add $4, %%"REG_a" \n\t" | |
2335 " js 1b \n\t" | |
2336 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) | |
2337 : "%"REG_a, "%"REG_b | |
2338 ); | |
2339 | |
2340 udst += chromStride; | |
2341 vdst += chromStride; | |
2342 src += srcStride*2; | |
2343 } | |
2344 | |
2345 asm volatile( EMMS" \n\t" | |
2346 SFENCE" \n\t" | |
2347 :::"memory"); | |
2348 #else | |
2349 y=0; | |
2350 #endif | |
2351 for(; y<height; y+=2) | |
2352 { | |
2353 long i; | |
2354 for(i=0; i<chromWidth; i++) | |
2355 { | |
2356 unsigned int b= src[6*i+0]; | |
2357 unsigned int g= src[6*i+1]; | |
2358 unsigned int r= src[6*i+2]; | |
2359 | |
2360 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | |
2361 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; | |
2362 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; | |
2363 | |
2364 udst[i] = U; | |
2365 vdst[i] = V; | |
2366 ydst[2*i] = Y; | |
2367 | |
2368 b= src[6*i+3]; | |
2369 g= src[6*i+4]; | |
2370 r= src[6*i+5]; | |
2371 | |
2372 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | |
2373 ydst[2*i+1] = Y; | |
2374 } | |
2375 ydst += lumStride; | |
2376 src += srcStride; | |
2377 | |
2378 for(i=0; i<chromWidth; i++) | |
2379 { | |
2380 unsigned int b= src[6*i+0]; | |
2381 unsigned int g= src[6*i+1]; | |
2382 unsigned int r= src[6*i+2]; | |
2383 | |
2384 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | |
2385 | |
2386 ydst[2*i] = Y; | |
2387 | |
2388 b= src[6*i+3]; | |
2389 g= src[6*i+4]; | |
2390 r= src[6*i+5]; | |
2391 | |
2392 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | |
2393 ydst[2*i+1] = Y; | |
2394 } | |
2395 udst += chromStride; | |
2396 vdst += chromStride; | |
2397 ydst += lumStride; | |
2398 src += srcStride; | |
2399 } | |
2400 } | |
2401 | |
2402 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest, | |
2403 long width, long height, long src1Stride, | |
2404 long src2Stride, long dstStride){ | |
2405 long h; | |
2406 | |
2407 for(h=0; h < height; h++) | |
2408 { | |
2409 long w; | |
2410 | |
2411 #ifdef HAVE_MMX | |
2412 #ifdef HAVE_SSE2 | |
2413 asm( | |
2414 "xor %%"REG_a", %%"REG_a" \n\t" | |
2415 "1: \n\t" | |
2416 PREFETCH" 64(%1, %%"REG_a") \n\t" | |
2417 PREFETCH" 64(%2, %%"REG_a") \n\t" | |
2418 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" | |
2419 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" | |
2420 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" | |
2421 "punpcklbw %%xmm2, %%xmm0 \n\t" | |
2422 "punpckhbw %%xmm2, %%xmm1 \n\t" | |
2423 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t" | |
2424 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t" | |
2425 "add $16, %%"REG_a" \n\t" | |
2426 "cmp %3, %%"REG_a" \n\t" | |
2427 " jb 1b \n\t" | |
2428 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | |
2429 : "memory", "%"REG_a"" | |
2430 ); | |
2431 #else | |
2432 asm( | |
2433 "xor %%"REG_a", %%"REG_a" \n\t" | |
2434 "1: \n\t" | |
2435 PREFETCH" 64(%1, %%"REG_a") \n\t" | |
2436 PREFETCH" 64(%2, %%"REG_a") \n\t" | |
2437 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
2438 "movq 8(%1, %%"REG_a"), %%mm2 \n\t" | |
2439 "movq %%mm0, %%mm1 \n\t" | |
2440 "movq %%mm2, %%mm3 \n\t" | |
2441 "movq (%2, %%"REG_a"), %%mm4 \n\t" | |
2442 "movq 8(%2, %%"REG_a"), %%mm5 \n\t" | |
2443 "punpcklbw %%mm4, %%mm0 \n\t" | |
2444 "punpckhbw %%mm4, %%mm1 \n\t" | |
2445 "punpcklbw %%mm5, %%mm2 \n\t" | |
2446 "punpckhbw %%mm5, %%mm3 \n\t" | |
2447 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t" | |
2448 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t" | |
2449 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t" | |
2450 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t" | |
2451 "add $16, %%"REG_a" \n\t" | |
2452 "cmp %3, %%"REG_a" \n\t" | |
2453 " jb 1b \n\t" | |
2454 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | |
2455 : "memory", "%"REG_a | |
2456 ); | |
2457 #endif | |
2458 for(w= (width&(~15)); w < width; w++) | |
2459 { | |
2460 dest[2*w+0] = src1[w]; | |
2461 dest[2*w+1] = src2[w]; | |
2462 } | |
2463 #else | |
2464 for(w=0; w < width; w++) | |
2465 { | |
2466 dest[2*w+0] = src1[w]; | |
2467 dest[2*w+1] = src2[w]; | |
2468 } | |
2469 #endif | |
2470 dest += dstStride; | |
2471 src1 += src1Stride; | |
2472 src2 += src2Stride; | |
2473 } | |
2474 #ifdef HAVE_MMX | |
2475 asm( | |
2476 EMMS" \n\t" | |
2477 SFENCE" \n\t" | |
2478 ::: "memory" | |
2479 ); | |
2480 #endif | |
2481 } | |
2482 | |
2483 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, | |
2484 uint8_t *dst1, uint8_t *dst2, | |
2485 long width, long height, | |
2486 long srcStride1, long srcStride2, | |
2487 long dstStride1, long dstStride2) | |
2488 { | |
2489 long y,x,w,h; | |
2490 w=width/2; h=height/2; | |
2491 #ifdef HAVE_MMX | |
2492 asm volatile( | |
2493 PREFETCH" %0\n\t" | |
2494 PREFETCH" %1\n\t" | |
2495 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); | |
2496 #endif | |
2497 for(y=0;y<h;y++){ | |
2498 const uint8_t* s1=src1+srcStride1*(y>>1); | |
2499 uint8_t* d=dst1+dstStride1*y; | |
2500 x=0; | |
2501 #ifdef HAVE_MMX | |
2502 for(;x<w-31;x+=32) | |
2503 { | |
2504 asm volatile( | |
2505 PREFETCH" 32%1\n\t" | |
2506 "movq %1, %%mm0\n\t" | |
2507 "movq 8%1, %%mm2\n\t" | |
2508 "movq 16%1, %%mm4\n\t" | |
2509 "movq 24%1, %%mm6\n\t" | |
2510 "movq %%mm0, %%mm1\n\t" | |
2511 "movq %%mm2, %%mm3\n\t" | |
2512 "movq %%mm4, %%mm5\n\t" | |
2513 "movq %%mm6, %%mm7\n\t" | |
2514 "punpcklbw %%mm0, %%mm0\n\t" | |
2515 "punpckhbw %%mm1, %%mm1\n\t" | |
2516 "punpcklbw %%mm2, %%mm2\n\t" | |
2517 "punpckhbw %%mm3, %%mm3\n\t" | |
2518 "punpcklbw %%mm4, %%mm4\n\t" | |
2519 "punpckhbw %%mm5, %%mm5\n\t" | |
2520 "punpcklbw %%mm6, %%mm6\n\t" | |
2521 "punpckhbw %%mm7, %%mm7\n\t" | |
2522 MOVNTQ" %%mm0, %0\n\t" | |
2523 MOVNTQ" %%mm1, 8%0\n\t" | |
2524 MOVNTQ" %%mm2, 16%0\n\t" | |
2525 MOVNTQ" %%mm3, 24%0\n\t" | |
2526 MOVNTQ" %%mm4, 32%0\n\t" | |
2527 MOVNTQ" %%mm5, 40%0\n\t" | |
2528 MOVNTQ" %%mm6, 48%0\n\t" | |
2529 MOVNTQ" %%mm7, 56%0" | |
2530 :"=m"(d[2*x]) | |
2531 :"m"(s1[x]) | |
2532 :"memory"); | |
2533 } | |
2534 #endif | |
2535 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; | |
2536 } | |
2537 for(y=0;y<h;y++){ | |
2538 const uint8_t* s2=src2+srcStride2*(y>>1); | |
2539 uint8_t* d=dst2+dstStride2*y; | |
2540 x=0; | |
2541 #ifdef HAVE_MMX | |
2542 for(;x<w-31;x+=32) | |
2543 { | |
2544 asm volatile( | |
2545 PREFETCH" 32%1\n\t" | |
2546 "movq %1, %%mm0\n\t" | |
2547 "movq 8%1, %%mm2\n\t" | |
2548 "movq 16%1, %%mm4\n\t" | |
2549 "movq 24%1, %%mm6\n\t" | |
2550 "movq %%mm0, %%mm1\n\t" | |
2551 "movq %%mm2, %%mm3\n\t" | |
2552 "movq %%mm4, %%mm5\n\t" | |
2553 "movq %%mm6, %%mm7\n\t" | |
2554 "punpcklbw %%mm0, %%mm0\n\t" | |
2555 "punpckhbw %%mm1, %%mm1\n\t" | |
2556 "punpcklbw %%mm2, %%mm2\n\t" | |
2557 "punpckhbw %%mm3, %%mm3\n\t" | |
2558 "punpcklbw %%mm4, %%mm4\n\t" | |
2559 "punpckhbw %%mm5, %%mm5\n\t" | |
2560 "punpcklbw %%mm6, %%mm6\n\t" | |
2561 "punpckhbw %%mm7, %%mm7\n\t" | |
2562 MOVNTQ" %%mm0, %0\n\t" | |
2563 MOVNTQ" %%mm1, 8%0\n\t" | |
2564 MOVNTQ" %%mm2, 16%0\n\t" | |
2565 MOVNTQ" %%mm3, 24%0\n\t" | |
2566 MOVNTQ" %%mm4, 32%0\n\t" | |
2567 MOVNTQ" %%mm5, 40%0\n\t" | |
2568 MOVNTQ" %%mm6, 48%0\n\t" | |
2569 MOVNTQ" %%mm7, 56%0" | |
2570 :"=m"(d[2*x]) | |
2571 :"m"(s2[x]) | |
2572 :"memory"); | |
2573 } | |
2574 #endif | |
2575 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; | |
2576 } | |
2577 #ifdef HAVE_MMX | |
2578 asm( | |
2579 EMMS" \n\t" | |
2580 SFENCE" \n\t" | |
2581 ::: "memory" | |
2582 ); | |
2583 #endif | |
2584 } | |
2585 | |
2586 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, | |
2587 uint8_t *dst, | |
2588 long width, long height, | |
2589 long srcStride1, long srcStride2, | |
2590 long srcStride3, long dstStride) | |
2591 { | |
2592 long y,x,w,h; | |
2593 w=width/2; h=height; | |
2594 for(y=0;y<h;y++){ | |
2595 const uint8_t* yp=src1+srcStride1*y; | |
2596 const uint8_t* up=src2+srcStride2*(y>>2); | |
2597 const uint8_t* vp=src3+srcStride3*(y>>2); | |
2598 uint8_t* d=dst+dstStride*y; | |
2599 x=0; | |
2600 #ifdef HAVE_MMX | |
2601 for(;x<w-7;x+=8) | |
2602 { | |
2603 asm volatile( | |
2604 PREFETCH" 32(%1, %0)\n\t" | |
2605 PREFETCH" 32(%2, %0)\n\t" | |
2606 PREFETCH" 32(%3, %0)\n\t" | |
2607 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | |
2608 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */ | |
2609 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */ | |
2610 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | |
2611 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */ | |
2612 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */ | |
2613 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */ | |
2614 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */ | |
2615 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */ | |
2616 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */ | |
2617 | |
2618 "movq %%mm1, %%mm6\n\t" | |
2619 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/ | |
2620 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ | |
2621 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ | |
2622 MOVNTQ" %%mm0, (%4, %0, 8)\n\t" | |
2623 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t" | |
2624 | |
2625 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/ | |
2626 "movq 8(%1, %0, 4), %%mm0\n\t" | |
2627 "movq %%mm0, %%mm3\n\t" | |
2628 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/ | |
2629 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/ | |
2630 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t" | |
2631 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t" | |
2632 | |
2633 "movq %%mm4, %%mm6\n\t" | |
2634 "movq 16(%1, %0, 4), %%mm0\n\t" | |
2635 "movq %%mm0, %%mm3\n\t" | |
2636 "punpcklbw %%mm5, %%mm4\n\t" | |
2637 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/ | |
2638 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/ | |
2639 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t" | |
2640 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t" | |
2641 | |
2642 "punpckhbw %%mm5, %%mm6\n\t" | |
2643 "movq 24(%1, %0, 4), %%mm0\n\t" | |
2644 "movq %%mm0, %%mm3\n\t" | |
2645 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/ | |
2646 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/ | |
2647 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t" | |
2648 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t" | |
2649 | |
2650 : "+r" (x) | |
2651 : "r"(yp), "r" (up), "r"(vp), "r"(d) | |
2652 :"memory"); | |
2653 } | |
2654 #endif | |
2655 for(; x<w; x++) | |
2656 { | |
2657 const long x2= x<<2; | |
2658 d[8*x+0]=yp[x2]; | |
2659 d[8*x+1]=up[x]; | |
2660 d[8*x+2]=yp[x2+1]; | |
2661 d[8*x+3]=vp[x]; | |
2662 d[8*x+4]=yp[x2+2]; | |
2663 d[8*x+5]=up[x]; | |
2664 d[8*x+6]=yp[x2+3]; | |
2665 d[8*x+7]=vp[x]; | |
2666 } | |
2667 } | |
2668 #ifdef HAVE_MMX | |
2669 asm( | |
2670 EMMS" \n\t" | |
2671 SFENCE" \n\t" | |
2672 ::: "memory" | |
2673 ); | |
2674 #endif | |
2675 } |