comparison libswscale/rgb2rgb_template.c @ 18861:8579acff875e

Move postproc ---> libswscale
author lucabe
date Fri, 30 Jun 2006 12:00:31 +0000
parents
children 6334c14b38eb
comparison
equal deleted inserted replaced
18860:ef741a3e90f5 18861:8579acff875e
/*
 * rgb2rgb.c, software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) (under GPL)
 * lots of big-endian byte-order fixes by Alex Beregszaszi
 */
11
12 #include <stddef.h>
13 #include <inttypes.h> /* for __WORDSIZE */
14
15 #include "asmalign.h"
16
17 #ifndef __WORDSIZE
18 // #warning You have misconfigured system and probably will lose performance!
19 #define __WORDSIZE MP_WORDSIZE
20 #endif
21
22 #undef PREFETCH
23 #undef MOVNTQ
24 #undef EMMS
25 #undef SFENCE
26 #undef MMREG_SIZE
27 #undef PREFETCHW
28 #undef PAVGB
29
30 #ifdef HAVE_SSE2
31 #define MMREG_SIZE 16
32 #else
33 #define MMREG_SIZE 8
34 #endif
35
36 #ifdef HAVE_3DNOW
37 #define PREFETCH "prefetch"
38 #define PREFETCHW "prefetchw"
39 #define PAVGB "pavgusb"
40 #elif defined ( HAVE_MMX2 )
41 #define PREFETCH "prefetchnta"
42 #define PREFETCHW "prefetcht0"
43 #define PAVGB "pavgb"
44 #else
45 #ifdef __APPLE__
46 #define PREFETCH "#"
47 #define PREFETCHW "#"
48 #else
49 #define PREFETCH "/nop"
50 #define PREFETCHW "/nop"
51 #endif
52 #endif
53
54 #ifdef HAVE_3DNOW
55 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
56 #define EMMS "femms"
57 #else
58 #define EMMS "emms"
59 #endif
60
61 #ifdef HAVE_MMX2
62 #define MOVNTQ "movntq"
63 #define SFENCE "sfence"
64 #else
65 #define MOVNTQ "movq"
66 #ifdef __APPLE__
67 #define SFENCE "#"
68 #else
69 #define SFENCE "/nop"
70 #endif
71 #endif
72
73 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
74 {
75 uint8_t *dest = dst;
76 const uint8_t *s = src;
77 const uint8_t *end;
78 #ifdef HAVE_MMX
79 const uint8_t *mm_end;
80 #endif
81 end = s + src_size;
82 #ifdef HAVE_MMX
83 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
84 mm_end = end - 23;
85 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
86 while(s < mm_end)
87 {
88 __asm __volatile(
89 PREFETCH" 32%1\n\t"
90 "movd %1, %%mm0\n\t"
91 "punpckldq 3%1, %%mm0\n\t"
92 "movd 6%1, %%mm1\n\t"
93 "punpckldq 9%1, %%mm1\n\t"
94 "movd 12%1, %%mm2\n\t"
95 "punpckldq 15%1, %%mm2\n\t"
96 "movd 18%1, %%mm3\n\t"
97 "punpckldq 21%1, %%mm3\n\t"
98 "pand %%mm7, %%mm0\n\t"
99 "pand %%mm7, %%mm1\n\t"
100 "pand %%mm7, %%mm2\n\t"
101 "pand %%mm7, %%mm3\n\t"
102 MOVNTQ" %%mm0, %0\n\t"
103 MOVNTQ" %%mm1, 8%0\n\t"
104 MOVNTQ" %%mm2, 16%0\n\t"
105 MOVNTQ" %%mm3, 24%0"
106 :"=m"(*dest)
107 :"m"(*s)
108 :"memory");
109 dest += 32;
110 s += 24;
111 }
112 __asm __volatile(SFENCE:::"memory");
113 __asm __volatile(EMMS:::"memory");
114 #endif
115 while(s < end)
116 {
117 #ifdef WORDS_BIGENDIAN
118 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
119 *dest++ = 0;
120 *dest++ = s[2];
121 *dest++ = s[1];
122 *dest++ = s[0];
123 s+=3;
124 #else
125 *dest++ = *s++;
126 *dest++ = *s++;
127 *dest++ = *s++;
128 *dest++ = 0;
129 #endif
130 }
131 }
132
/*
 * Convert packed 32-bit RGB (one filler byte per pixel) to 24-bit RGB.
 * src_size is the source size in bytes.
 *
 * The MMX path packs 8 pixels (32 source bytes -> 24 dest bytes) per
 * iteration by masking out the filler bytes and shifting/merging the
 * remaining 24-bit triples across three qwords; the scalar tail handles
 * the remainder and, on big-endian hosts, the swapped byte order.
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* each iteration reads exactly 32 bytes (movq at offsets 0,8,16,24) */
    mm_end = end - 31;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            /* drop each pixel's high (filler) byte and close the gap */
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            /* splice the four 6-byte groups into three output qwords */
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
218
219 /*
220 Original by Strepto/Astral
221 ported to gcc & bugfixed : A'rpi
222 MMX2, 3DNOW optimization by Nick Kurshev
223 32bit c version, and and&add trick by Michael Niedermayer
224 */
/*
 * Convert RGB555 to RGB565 in place of byte streams: green and red are
 * shifted up one bit, blue stays put, and the low bit of 6-bit green
 * becomes 0.
 *
 * Uses the "and & add" trick: x + (x & 0x7FE07FE0) doubles the G/R field
 * (i.e. shifts it left by one) while leaving the blue bits untouched,
 * four (MMX) or two (scalar) pixels at a time.
 *
 * NOTE(review): the MMX asm writes 16 bytes through a single "=m"(*d)
 * operand with no "memory" clobber — relies on the surrounding
 * SFENCE/EMMS statements for ordering; left as-is.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while(s<mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar tail: two pixels per 32-bit word */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* at most one 16-bit pixel left */
    if(s < end)
    {
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
273
/*
 * Convert RGB565 to RGB555: green and red move down one bit (dropping
 * green's least-significant bit), blue stays put, and the top bit of the
 * result is 0.  Processes 8 pixels per MMX iteration, then 2, then 1.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while(s<mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "psrlq $1, %%mm0\n\t"    /* shift R+G fields down one bit */
            "psrlq $1, %%mm2\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm6, %%mm1\n\t"  /* keep the original blue bits */
            "pand %%mm6, %%mm3\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar tail: two pixels per 32-bit word */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x= *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if(s < end)
    {
        register uint16_t x= *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}
329
/*
 * Convert 32-bit RGB to RGB565.  src_size is the source size in bytes.
 *
 * Two MMX strategies are kept: the enabled one uses pmaddwd with a
 * precomputed multiplier (mul3216) to fuse the per-channel shifts; the
 * disabled alternative does explicit shift/mask/merge per channel.
 * Scalar tail packs B into bits 0-4, G into 5-10, R into 11-15.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
    asm volatile(
        "movq %3, %%mm5 \n\t"    /* mask3216g */
        "movq %4, %%mm6 \n\t"    /* mask3216br */
        "movq %5, %%mm7 \n\t"    /* mul3216 */
        ASMALIGN16
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"   /* multiply B/R into position */
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
        );
#else
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register int rgb = *(uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
425
/*
 * Convert 32-bit RGB to BGR565 (channel order swapped relative to
 * rgb32to16): the low source byte is shifted up into the 5-bit top field
 * and the third byte down into the 5-bit bottom field; green stays in
 * the middle 6 bits.  Four pixels per MMX iteration.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"    /* first byte up into the top field */
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"    /* green into bits 5-10 */
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"   /* third byte down into bits 0-4 */
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register int rgb = *(uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
486
/*
 * Convert 32-bit RGB to RGB555.  Mirrors rgb32to16 but with 5-bit green
 * (shifts of 6/9 instead of 5/8 and the 15-bit masks/multiplier).
 * Two MMX strategies kept; scalar tail packs B into bits 0-4,
 * G into 5-9, R into 10-14.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
    asm volatile(
        "movq %3, %%mm5 \n\t"    /* mask3215g */
        "movq %4, %%mm6 \n\t"    /* mask3216br */
        "movq %5, %%mm7 \n\t"    /* mul3215 */
        ASMALIGN16
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
        );
#else
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register int rgb = *(uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
582
/*
 * Convert 32-bit RGB to BGR555 (channel order swapped relative to
 * rgb32to15): first source byte shifted up into bits 10-14, third byte
 * down into bits 0-4, green into bits 5-9.  Four pixels per MMX
 * iteration.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register int rgb = *(uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
643
644 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
645 {
646 const uint8_t *s = src;
647 const uint8_t *end;
648 #ifdef HAVE_MMX
649 const uint8_t *mm_end;
650 #endif
651 uint16_t *d = (uint16_t *)dst;
652 end = s + src_size;
653 #ifdef HAVE_MMX
654 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
655 __asm __volatile(
656 "movq %0, %%mm7\n\t"
657 "movq %1, %%mm6\n\t"
658 ::"m"(red_16mask),"m"(green_16mask));
659 mm_end = end - 11;
660 while(s < mm_end)
661 {
662 __asm __volatile(
663 PREFETCH" 32%1\n\t"
664 "movd %1, %%mm0\n\t"
665 "movd 3%1, %%mm3\n\t"
666 "punpckldq 6%1, %%mm0\n\t"
667 "punpckldq 9%1, %%mm3\n\t"
668 "movq %%mm0, %%mm1\n\t"
669 "movq %%mm0, %%mm2\n\t"
670 "movq %%mm3, %%mm4\n\t"
671 "movq %%mm3, %%mm5\n\t"
672 "psrlq $3, %%mm0\n\t"
673 "psrlq $3, %%mm3\n\t"
674 "pand %2, %%mm0\n\t"
675 "pand %2, %%mm3\n\t"
676 "psrlq $5, %%mm1\n\t"
677 "psrlq $5, %%mm4\n\t"
678 "pand %%mm6, %%mm1\n\t"
679 "pand %%mm6, %%mm4\n\t"
680 "psrlq $8, %%mm2\n\t"
681 "psrlq $8, %%mm5\n\t"
682 "pand %%mm7, %%mm2\n\t"
683 "pand %%mm7, %%mm5\n\t"
684 "por %%mm1, %%mm0\n\t"
685 "por %%mm4, %%mm3\n\t"
686 "por %%mm2, %%mm0\n\t"
687 "por %%mm5, %%mm3\n\t"
688 "psllq $16, %%mm3\n\t"
689 "por %%mm3, %%mm0\n\t"
690 MOVNTQ" %%mm0, %0\n\t"
691 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
692 d += 4;
693 s += 12;
694 }
695 __asm __volatile(SFENCE:::"memory");
696 __asm __volatile(EMMS:::"memory");
697 #endif
698 while(s < end)
699 {
700 const int b= *s++;
701 const int g= *s++;
702 const int r= *s++;
703 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
704 }
705 }
706
/*
 * Convert packed 24-bit data to BGR565 (channel order swapped relative
 * to rgb24to16): first source byte shifted up into bits 11-15, third
 * byte down into bits 0-4.  Four pixels per MMX iteration; the bound
 * end - 15 leaves enough slack for the 13-byte read per iteration.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int r= *s++;
        const int g= *s++;
        const int b= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
769
770 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
771 {
772 const uint8_t *s = src;
773 const uint8_t *end;
774 #ifdef HAVE_MMX
775 const uint8_t *mm_end;
776 #endif
777 uint16_t *d = (uint16_t *)dst;
778 end = s + src_size;
779 #ifdef HAVE_MMX
780 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
781 __asm __volatile(
782 "movq %0, %%mm7\n\t"
783 "movq %1, %%mm6\n\t"
784 ::"m"(red_15mask),"m"(green_15mask));
785 mm_end = end - 11;
786 while(s < mm_end)
787 {
788 __asm __volatile(
789 PREFETCH" 32%1\n\t"
790 "movd %1, %%mm0\n\t"
791 "movd 3%1, %%mm3\n\t"
792 "punpckldq 6%1, %%mm0\n\t"
793 "punpckldq 9%1, %%mm3\n\t"
794 "movq %%mm0, %%mm1\n\t"
795 "movq %%mm0, %%mm2\n\t"
796 "movq %%mm3, %%mm4\n\t"
797 "movq %%mm3, %%mm5\n\t"
798 "psrlq $3, %%mm0\n\t"
799 "psrlq $3, %%mm3\n\t"
800 "pand %2, %%mm0\n\t"
801 "pand %2, %%mm3\n\t"
802 "psrlq $6, %%mm1\n\t"
803 "psrlq $6, %%mm4\n\t"
804 "pand %%mm6, %%mm1\n\t"
805 "pand %%mm6, %%mm4\n\t"
806 "psrlq $9, %%mm2\n\t"
807 "psrlq $9, %%mm5\n\t"
808 "pand %%mm7, %%mm2\n\t"
809 "pand %%mm7, %%mm5\n\t"
810 "por %%mm1, %%mm0\n\t"
811 "por %%mm4, %%mm3\n\t"
812 "por %%mm2, %%mm0\n\t"
813 "por %%mm5, %%mm3\n\t"
814 "psllq $16, %%mm3\n\t"
815 "por %%mm3, %%mm0\n\t"
816 MOVNTQ" %%mm0, %0\n\t"
817 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
818 d += 4;
819 s += 12;
820 }
821 __asm __volatile(SFENCE:::"memory");
822 __asm __volatile(EMMS:::"memory");
823 #endif
824 while(s < end)
825 {
826 const int b= *s++;
827 const int g= *s++;
828 const int r= *s++;
829 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
830 }
831 }
832
/*
 * Convert packed 24-bit data to BGR555 (channel order swapped relative
 * to rgb24to15): first source byte shifted up into bits 10-14, third
 * byte down into bits 0-4.  Four pixels per MMX iteration; the bound
 * end - 15 leaves enough slack for the 13-byte read per iteration.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int r= *s++;
        const int g= *s++;
        const int b= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
895
/*
  I use here a less accurate approximation by simply
  left-shifting the input
  value and filling the low-order bits with
  zeroes. This method improves PNG
  compression, but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
/*
 * Convert RGB555 to packed 24-bit RGB, expanding each 5-bit channel to
 * 8 bits by a plain left shift (low bits zero-filled — see the comment
 * above about the accuracy trade-off).
 *
 * MMX path: the first asm block unpacks 8 pixels into two sets of
 * byte-per-channel qwords (results parked in mm6/mm7 and mm0/mm3);
 * the second block reuses the 32->24 packing sequence to emit 24 bytes.
 * Register state is carried between the two asm statements, so they
 * must not be separated or reordered.
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"     /* isolate B, G, R fields */
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"    /* scale each field to 8 bits */
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"  /* park first 4 pixels in mm6/mm7 */
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"    /* same unpack for pixels 4-7 */
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;     /* blue: 5 -> 8 bits */
        *d++ = (bgr&0x3E0)>>2;    /* green: 5 -> 8 bits */
        *d++ = (bgr&0x7C00)>>7;   /* red: 5 -> 8 bits */
    }
}
1060
/*
 * Convert RGB565 to packed 24-bit RGB, expanding each channel to 8 bits
 * by a plain left shift (low bits zero-filled).  Same structure as
 * rgb15to24 but with the 565 masks and shifts (6-bit green).
 * The two asm blocks share MMX register state and must stay adjacent.
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"     /* isolate B, G, R fields */
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"    /* scale each field to 8 bits */
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"  /* park first 4 pixels in mm6/mm7 */
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"    /* same unpack for pixels 4-7 */
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;     /* blue: 5 -> 8 bits */
        *d++ = (bgr&0x7E0)>>3;    /* green: 6 -> 8 bits */
        *d++ = (bgr&0xF800)>>8;   /* red: 5 -> 8 bits */
    }
}
1201
/*
 * Convert RGB555 to 32-bit RGB (one zero filler byte per pixel), each
 * 5-bit channel expanded to 8 bits by a plain left shift.  Four pixels
 * per MMX iteration (mm7 holds zero for the word->dword unpack); the
 * scalar tail honours big-endian byte order.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    mm_end = end - 3;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"     /* isolate B, G, R fields */
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"    /* scale each field to 8 bits */
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
#if 0 //slightly slower on athlon
        int bgr= *s++;
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
#endif

#endif
    }
}
1279
/*
 * rgb16to32: expand 16bpp RGB565 pixels (RRRRRGGGGGGBBBBB in each
 * little-endian uint16) to 32bpp with a zero 4th (alpha) byte.
 * Identical structure to rgb15to32 above, only the field masks and
 * shift amounts differ (6-bit green). src_size is in bytes.
 */
1280 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1281 {
1282 const uint16_t *end;
1283 #ifdef HAVE_MMX
1284 const uint16_t *mm_end;
1285 #endif
1286 uint8_t *d = (uint8_t *)dst;
1287 const uint16_t *s = (uint16_t *)src;
1288 end = s + src_size/2;
1289 #ifdef HAVE_MMX
1290 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 := 0 for the widening unpacks */
1291 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
/* 4 pixels per MMX iteration */
1292 mm_end = end - 3;
1293 while(s < mm_end)
1294 {
1295 __asm __volatile(
1296 PREFETCH" 32%1\n\t"
/* extract and 8-bit-align the B, G (6 bit) and R fields of 4 pixels */
1297 "movq %1, %%mm0\n\t"
1298 "movq %1, %%mm1\n\t"
1299 "movq %1, %%mm2\n\t"
1300 "pand %2, %%mm0\n\t"
1301 "pand %3, %%mm1\n\t"
1302 "pand %4, %%mm2\n\t"
1303 "psllq $3, %%mm0\n\t"
1304 "psrlq $3, %%mm1\n\t"
1305 "psrlq $8, %%mm2\n\t"
1306 "movq %%mm0, %%mm3\n\t"
1307 "movq %%mm1, %%mm4\n\t"
1308 "movq %%mm2, %%mm5\n\t"
/* widen word lanes to dwords (low 2 pixels / high 2 pixels) */
1309 "punpcklwd %%mm7, %%mm0\n\t"
1310 "punpcklwd %%mm7, %%mm1\n\t"
1311 "punpcklwd %%mm7, %%mm2\n\t"
1312 "punpckhwd %%mm7, %%mm3\n\t"
1313 "punpckhwd %%mm7, %%mm4\n\t"
1314 "punpckhwd %%mm7, %%mm5\n\t"
/* merge to B | G<<8 | R<<16, top byte 0 */
1315 "psllq $8, %%mm1\n\t"
1316 "psllq $16, %%mm2\n\t"
1317 "por %%mm1, %%mm0\n\t"
1318 "por %%mm2, %%mm0\n\t"
1319 "psllq $8, %%mm4\n\t"
1320 "psllq $16, %%mm5\n\t"
1321 "por %%mm4, %%mm3\n\t"
1322 "por %%mm5, %%mm3\n\t"
1323 MOVNTQ" %%mm0, %0\n\t"
1324 MOVNTQ" %%mm3, 8%0\n\t"
1325 :"=m"(*d)
1326 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1327 :"memory");
1328 d += 16;
1329 s += 4;
1330 }
1331 __asm __volatile(SFENCE:::"memory");
1332 __asm __volatile(EMMS:::"memory");
1333 #endif
/* scalar tail / non-MMX fallback */
1334 while(s < end)
1335 {
1336 register uint16_t bgr;
1337 bgr = *s++;
1338 #ifdef WORDS_BIGENDIAN
1339 *d++ = 0;
1340 *d++ = (bgr&0xF800)>>8;
1341 *d++ = (bgr&0x7E0)>>3;
1342 *d++ = (bgr&0x1F)<<3;
1343 #else
1344 *d++ = (bgr&0x1F)<<3;
1345 *d++ = (bgr&0x7E0)>>3;
1346 *d++ = (bgr&0xF800)>>8;
1347 *d++ = 0;
1348 #endif
1349 }
1350 }
1351
/*
 * rgb32tobgr32: swap the R and B channels of 32bpp pixels in place of
 * order (byte 0 <-> byte 2 within every 4-byte pixel), i.e. xRGB<->xBGR.
 * src_size is in bytes.
 * NOTE(review): the C fallback copies only 3 channel bytes and leaves the
 * 4th byte of dst untouched, while the MMX path rewrites whole qwords
 * (the 4th byte becomes whatever the mask32* tables dictate) — presumably
 * equivalent for callers that ignore alpha; verify if alpha matters.
 */
1352 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1353 {
1354 #ifdef HAVE_MMX
1355 /* TODO: unroll this loop */
1356 asm volatile (
1357 "xor %%"REG_a", %%"REG_a" \n\t"
1358 ASMALIGN16
1359 "1: \n\t"
1360 PREFETCH" 32(%0, %%"REG_a") \n\t"
/* load 2 pixels; build R<<16 in mm0, B>>16 in mm1, keep G/x via mm2 */
1361 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1362 "movq %%mm0, %%mm1 \n\t"
1363 "movq %%mm0, %%mm2 \n\t"
1364 "pslld $16, %%mm0 \n\t"
1365 "psrld $16, %%mm1 \n\t"
1366 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1367 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1368 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1369 "por %%mm0, %%mm2 \n\t"
1370 "por %%mm1, %%mm2 \n\t"
1371 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
1372 "add $8, %%"REG_a" \n\t"
/* bound is src_size-7 so the last partial qword (if any) is skipped */
1373 "cmp %2, %%"REG_a" \n\t"
1374 " jb 1b \n\t"
1375 :: "r" (src), "r"(dst), "r" (src_size-7)
1376 : "%"REG_a
1377 );
1378 
1379 __asm __volatile(SFENCE:::"memory");
1380 __asm __volatile(EMMS:::"memory");
1381 #else
1382 unsigned i;
1383 unsigned num_pixels = src_size >> 2;
1384 for(i=0; i<num_pixels; i++)
1385 {
1386 #ifdef WORDS_BIGENDIAN
/* big endian: pixel bytes are [x R G B] at offsets 0..3 */
1387 dst[4*i + 1] = src[4*i + 3];
1388 dst[4*i + 2] = src[4*i + 2];
1389 dst[4*i + 3] = src[4*i + 1];
1390 #else
1391 dst[4*i + 0] = src[4*i + 2];
1392 dst[4*i + 1] = src[4*i + 1];
1393 dst[4*i + 2] = src[4*i + 0];
1394 #endif
1395 }
1396 #endif
1397 }
1398
/*
 * rgb24tobgr24: swap R and B of packed 24bpp pixels (3 bytes/pixel).
 * The MMX path uses a negative-counter trick: mmx_size = 23 - src_size
 * (negative), the src/dst operands are pre-biased by -mmx_size, and the
 * counter in REG_a advances by 24 bytes (8 pixels) until it turns
 * non-negative ("js 1b"). Afterwards the scalar loop redoes/finishes
 * the remaining <= 23 bytes.
 */
1399 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1400 {
1401 unsigned i;
1402 #ifdef HAVE_MMX
1403 long mmx_size= 23 - src_size;
1404 asm volatile (
/* mm5/mm6/mm7 hold the per-qword R/G/B selection masks */
1405 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1406 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1407 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1408 ASMALIGN16
1409 "1: \n\t"
1410 PREFETCH" 32(%1, %%"REG_a") \n\t"
1411 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1412 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1413 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1414 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1415 "pand %%mm5, %%mm0 \n\t"
1416 "pand %%mm6, %%mm1 \n\t"
1417 "pand %%mm7, %%mm2 \n\t"
1418 "por %%mm0, %%mm1 \n\t"
1419 "por %%mm2, %%mm1 \n\t"
1420 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1421 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1422 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1423 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1424 "pand %%mm7, %%mm0 \n\t"
1425 "pand %%mm5, %%mm1 \n\t"
1426 "pand %%mm6, %%mm2 \n\t"
1427 "por %%mm0, %%mm1 \n\t"
1428 "por %%mm2, %%mm1 \n\t"
1429 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1430 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1431 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1432 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1433 "pand %%mm6, %%mm0 \n\t"
1434 "pand %%mm7, %%mm1 \n\t"
1435 "pand %%mm5, %%mm2 \n\t"
1436 "por %%mm0, %%mm1 \n\t"
1437 "por %%mm2, %%mm1 \n\t"
1438 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1439 "add $24, %%"REG_a" \n\t"
1440 " js 1b \n\t"
1441 : "+a" (mmx_size)
1442 : "r" (src-mmx_size), "r"(dst-mmx_size)
1443 );
1444 
1445 __asm __volatile(SFENCE:::"memory");
1446 __asm __volatile(EMMS:::"memory");
1447 
1448 if(mmx_size==23) return; //finished, was multiple of 8
1449 
/* rewind to the start of the unconverted tail for the scalar loop */
1450 src+= src_size;
1451 dst+= src_size;
1452 src_size= 23-mmx_size;
1453 src-= src_size;
1454 dst-= src_size;
1455 #endif
1456 for(i=0; i<src_size; i+=3)
1457 {
/* temp copy allows src == dst (in-place swap) */
1458 register uint8_t x;
1459 x = src[i + 2];
1460 dst[i + 1] = src[i + 1];
1461 dst[i + 2] = src[i + 0];
1462 dst[i + 0] = x;
1463 }
1464 }
1465
/*
 * yuvPlanartoyuy2: interleave planar Y, U, V into packed YUY2
 * (Y0 U0 Y1 V0 ...). vertLumPerChroma is the number of luma lines per
 * chroma line (2 for 4:2:0/YV12, 1 for 4:2:2) and must be a power of
 * two, since the chroma-advance test below uses & (vertLumPerChroma-1).
 * Strides are in bytes. Three implementations: x86 MMX, Alpha MVI,
 * and portable C (64-bit and 32-bit variants).
 */
1466 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1467 long width, long height,
1468 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1469 {
1470 long y;
1471 const long chromWidth= width>>1;
1472 for(y=0; y<height; y++)
1473 {
1474 #ifdef HAVE_MMX
1475 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1476 asm volatile(
1477 "xor %%"REG_a", %%"REG_a" \n\t"
1478 ASMALIGN16
1479 "1: \n\t"
1480 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1481 PREFETCH" 32(%2, %%"REG_a") \n\t"
1482 PREFETCH" 32(%3, %%"REG_a") \n\t"
/* build UVUV pairs, then interleave with 16 Y bytes -> 32 output bytes */
1483 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1484 "movq %%mm0, %%mm2 \n\t" // U(0)
1485 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1486 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1487 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1488 
1489 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1490 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1491 "movq %%mm3, %%mm4 \n\t" // Y(0)
1492 "movq %%mm5, %%mm6 \n\t" // Y(8)
1493 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1494 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1495 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1496 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1497 
1498 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1499 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1500 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1501 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1502 
1503 "add $8, %%"REG_a" \n\t"
1504 "cmp %4, %%"REG_a" \n\t"
1505 " jb 1b \n\t"
1506 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1507 : "%"REG_a
1508 );
1509 #else
1510 
1511 #if defined ARCH_ALPHA && defined HAVE_MVI
/* Alpha MVI: processes TWO lines per outer-loop pass (note the y++ and
   extra ysrc/dst advance below) using 64-bit unpack instructions */
1512 #define pl2yuy2(n) \
1513 y1 = yc[n]; \
1514 y2 = yc2[n]; \
1515 u = uc[n]; \
1516 v = vc[n]; \
1517 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1518 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1519 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1520 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1521 yuv1 = (u << 8) + (v << 24); \
1522 yuv2 = yuv1 + y2; \
1523 yuv1 += y1; \
1524 qdst[n] = yuv1; \
1525 qdst2[n] = yuv2;
1526 
1527 int i;
1528 uint64_t *qdst = (uint64_t *) dst;
1529 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1530 const uint32_t *yc = (uint32_t *) ysrc;
1531 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1532 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1533 for(i = 0; i < chromWidth; i += 8){
1534 uint64_t y1, y2, yuv1, yuv2;
1535 uint64_t u, v;
1536 /* Prefetch */
1537 asm("ldq $31,64(%0)" :: "r"(yc));
1538 asm("ldq $31,64(%0)" :: "r"(yc2));
1539 asm("ldq $31,64(%0)" :: "r"(uc));
1540 asm("ldq $31,64(%0)" :: "r"(vc));
1541 
1542 pl2yuy2(0);
1543 pl2yuy2(1);
1544 pl2yuy2(2);
1545 pl2yuy2(3);
1546 
1547 yc += 4;
1548 yc2 += 4;
1549 uc += 4;
1550 vc += 4;
1551 qdst += 4;
1552 qdst2 += 4;
1553 }
/* second line of the pair was written above; skip it in the outer loop */
1554 y++;
1555 ysrc += lumStride;
1556 dst += dstStride;
1557 
1558 #elif __WORDSIZE >= 64
/* generic 64-bit C: pack 4 pixels (2 Y pairs) per 64-bit store */
1559 int i;
1560 uint64_t *ldst = (uint64_t *) dst;
1561 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1562 for(i = 0; i < chromWidth; i += 2){
1563 uint64_t k, l;
1564 k = yc[0] + (uc[0] << 8) +
1565 (yc[1] << 16) + (vc[0] << 24);
1566 l = yc[2] + (uc[1] << 8) +
1567 (yc[3] << 16) + (vc[1] << 24);
1568 *ldst++ = k + (l << 32);
1569 yc += 4;
1570 uc += 2;
1571 vc += 2;
1572 }
1573 
1574 #else
/* generic 32-bit C: one YUYV dword per chroma sample */
1575 int i, *idst = (int32_t *) dst;
1576 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1577 for(i = 0; i < chromWidth; i++){
1578 #ifdef WORDS_BIGENDIAN
1579 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1580 (yc[1] << 8) + (vc[0] << 0);
1581 #else
1582 *idst++ = yc[0] + (uc[0] << 8) +
1583 (yc[1] << 16) + (vc[0] << 24);
1584 #endif
1585 yc += 2;
1586 uc++;
1587 vc++;
1588 }
1589 #endif
1590 #endif
/* advance chroma only once every vertLumPerChroma luma lines */
1591 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1592 {
1593 usrc += chromStride;
1594 vsrc += chromStride;
1595 }
1596 ysrc += lumStride;
1597 dst += dstStride;
1598 }
1599 #ifdef HAVE_MMX
1600 asm( EMMS" \n\t"
1601 SFENCE" \n\t"
1602 :::"memory");
1603 #endif
1604 }
1605
1606 /**
1607 * Convert planar YV12 (4:2:0) to packed YUY2.
1608 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1609 * problem for anyone then tell me, and I'll fix it)
1610 */
1611 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1612 long width, long height,
1613 long lumStride, long chromStride, long dstStride)
1614 {
1615 //FIXME interpolate chroma
/* 2 luma lines per chroma line (4:2:0) */
1616 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1617 }
1618
/*
 * yuvPlanartouyvy: same as yuvPlanartoyuy2 but emits UYVY byte order
 * (U0 Y0 V0 Y1 ...). vertLumPerChroma must be a power of two (used as
 * a bitmask below). No Alpha path here yet (see FIXME).
 */
1619 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1620 long width, long height,
1621 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1622 {
1623 long y;
1624 const long chromWidth= width>>1;
1625 for(y=0; y<height; y++)
1626 {
1627 #ifdef HAVE_MMX
1628 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1629 asm volatile(
1630 "xor %%"REG_a", %%"REG_a" \n\t"
1631 ASMALIGN16
1632 "1: \n\t"
1633 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1634 PREFETCH" 32(%2, %%"REG_a") \n\t"
1635 PREFETCH" 32(%3, %%"REG_a") \n\t"
/* like yuy2 above, but Y is interleaved as the HIGH byte of each pair */
1636 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1637 "movq %%mm0, %%mm2 \n\t" // U(0)
1638 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1639 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1640 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1641 
1642 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1643 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1644 "movq %%mm0, %%mm4 \n\t" // Y(0)
1645 "movq %%mm2, %%mm6 \n\t" // Y(8)
1646 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1647 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1648 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1649 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1650 
1651 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1652 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1653 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1654 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1655 
1656 "add $8, %%"REG_a" \n\t"
1657 "cmp %4, %%"REG_a" \n\t"
1658 " jb 1b \n\t"
1659 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1660 : "%"REG_a
1661 );
1662 #else
1663 //FIXME adapt the alpha asm code from yv12->yuy2
1664 
1665 #if __WORDSIZE >= 64
/* 64-bit C path: 4 pixels per 64-bit store, UYVY order */
1666 int i;
1667 uint64_t *ldst = (uint64_t *) dst;
1668 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1669 for(i = 0; i < chromWidth; i += 2){
1670 uint64_t k, l;
1671 k = uc[0] + (yc[0] << 8) +
1672 (vc[0] << 16) + (yc[1] << 24);
1673 l = uc[1] + (yc[2] << 8) +
1674 (vc[1] << 16) + (yc[3] << 24);
1675 *ldst++ = k + (l << 32);
1676 yc += 4;
1677 uc += 2;
1678 vc += 2;
1679 }
1680 
1681 #else
/* 32-bit C path: one UYVY dword per chroma sample */
1682 int i, *idst = (int32_t *) dst;
1683 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1684 for(i = 0; i < chromWidth; i++){
1685 #ifdef WORDS_BIGENDIAN
1686 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1687 (vc[0] << 8) + (yc[1] << 0);
1688 #else
1689 *idst++ = uc[0] + (yc[0] << 8) +
1690 (vc[0] << 16) + (yc[1] << 24);
1691 #endif
1692 yc += 2;
1693 uc++;
1694 vc++;
1695 }
1696 #endif
1697 #endif
/* advance chroma only once every vertLumPerChroma luma lines */
1698 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1699 {
1700 usrc += chromStride;
1701 vsrc += chromStride;
1702 }
1703 ysrc += lumStride;
1704 dst += dstStride;
1705 }
1706 #ifdef HAVE_MMX
1707 asm( EMMS" \n\t"
1708 SFENCE" \n\t"
1709 :::"memory");
1710 #endif
1711 }
1712
1713 /**
1714 * Convert planar YV12 (4:2:0) to packed UYVY.
1715 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1716 * problem for anyone then tell me, and I'll fix it)
1717 */
1718 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1719 long width, long height,
1720 long lumStride, long chromStride, long dstStride)
1721 {
1722 //FIXME interpolate chroma
/* 2 luma lines per chroma line (4:2:0) */
1723 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1724 }
1725
1726 /**
1727 * Convert planar 4:2:2 YUV to packed YUY2.
1728 * width should be a multiple of 16
1729 */
1730 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1731 long width, long height,
1732 long lumStride, long chromStride, long dstStride)
1733 {
/* 1 luma line per chroma line (4:2:2) */
1734 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1735 }
1736
1737 /**
1738 * Deinterleave packed YUY2 into planar YV12 (4:2:0).
1739 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1740 * problem for anyone then tell me, and I'll fix it)
1741 * Chroma is taken from the even lines only; the odd lines contribute Y only.
1742 */
1742 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1743 long width, long height,
1744 long lumStride, long chromStride, long srcStride)
1745 {
1746 long y;
1747 const long chromWidth= width>>1;
1748 for(y=0; y<height; y+=2)
1749 {
1750 #ifdef HAVE_MMX
/* even line: split YUYV into Y plane + U/V planes */
1751 asm volatile(
1752 "xor %%"REG_a", %%"REG_a" \n\t"
1753 "pcmpeqw %%mm7, %%mm7 \n\t"
1754 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1755 ASMALIGN16
1756 "1: \n\t"
1757 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1758 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1759 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1760 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1761 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1762 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1763 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1764 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1765 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1766 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1767 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1768 
1769 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1770 
1771 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1772 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1773 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1774 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1775 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1776 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1777 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1778 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1779 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1780 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1781 
1782 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1783 
/* separate the interleaved UV words into U and V planes */
1784 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1785 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1786 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1787 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1788 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1789 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1790 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1791 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1792 
1793 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1794 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1795 
1796 "add $8, %%"REG_a" \n\t"
1797 "cmp %4, %%"REG_a" \n\t"
1798 " jb 1b \n\t"
1799 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1800 : "memory", "%"REG_a
1801 );
1802 
1803 ydst += lumStride;
1804 src += srcStride;
1805 
/* odd line: extract Y only, chroma of this line is dropped.
   NOTE(review): this block relies on %%mm7 still holding the
   0x00FF... mask set by the previous asm statement — fragile across
   asm statements; verify the compiler emits nothing in between. */
1806 asm volatile(
1807 "xor %%"REG_a", %%"REG_a" \n\t"
1808 ASMALIGN16
1809 "1: \n\t"
1810 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1811 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1812 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1813 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1814 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1815 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1816 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1817 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1818 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1819 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1820 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1821 
1822 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1823 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1824 
1825 "add $8, %%"REG_a" \n\t"
1826 "cmp %4, %%"REG_a" \n\t"
1827 " jb 1b \n\t"
1828 
1829 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1830 : "memory", "%"REG_a
1831 );
1832 #else
1833 long i;
1834 for(i=0; i<chromWidth; i++)
1835 {
1836 ydst[2*i+0] = src[4*i+0];
1837 udst[i] = src[4*i+1];
1838 ydst[2*i+1] = src[4*i+2];
1839 vdst[i] = src[4*i+3];
1840 }
1841 ydst += lumStride;
1842 src += srcStride;
1843 
/* odd line: Y only */
1844 for(i=0; i<chromWidth; i++)
1845 {
1846 ydst[2*i+0] = src[4*i+0];
1847 ydst[2*i+1] = src[4*i+2];
1848 }
1849 #endif
1850 udst += chromStride;
1851 vdst += chromStride;
1852 ydst += lumStride;
1853 src += srcStride;
1854 }
1855 #ifdef HAVE_MMX
1856 asm volatile( EMMS" \n\t"
1857 SFENCE" \n\t"
1858 :::"memory");
1859 #endif
1860 }
1861
/*
 * yvu9toyv12: INCOMPLETE conversion from YVU9 (4:1:0) to YV12 (4:2:0).
 * Only the luma plane is copied; usrc/vsrc/udst/vdst/lumStride/
 * chromStride are currently unused and the destination chroma planes
 * are left untouched (see XXX below).
 */
1862 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1863 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1864 long width, long height, long lumStride, long chromStride)
1865 {
1866 /* Y Plane */
/* NOTE(review): assumes ysrc/ydst are contiguous (stride == width) —
   the stride parameters are ignored here */
1867 memcpy(ydst, ysrc, width*height);
1868 
1869 /* XXX: implement upscaling for U,V */
1870 }
1871
/*
 * planar2x: upscale a single plane by 2x in both directions using a
 * [3 1; 1 3]/4 bilinear-style filter (each output pixel is a 3:1 blend
 * of its two nearest source pixels). First and last rows/columns are
 * handled separately. MMX2/3DNow path approximates the 3:1 blend with
 * two chained PAVGB operations (avg(avg(a,b),b) ~= (a+3b)/4, with
 * pavgb rounding).
 */
1872 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1873 {
1874 long x,y;
1875 
1876 dst[0]= src[0];
1877 
1878 // first line
1879 for(x=0; x<srcWidth-1; x++){
1880 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1881 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1882 }
1883 dst[2*srcWidth-1]= src[srcWidth-1];
1884 
1885 dst+= dstStride;
1886 
1887 for(y=1; y<srcHeight; y++){
1888 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* columns processed by MMX, rounded down to a multiple of 16; the
   scalar loop below covers the remainder starting at mmxSize-1 */
1889 const long mmxSize= srcWidth&~15;
1890 asm volatile(
/* negative index counts up to 0; pointers are pre-biased by mmxSize */
1891 "mov %4, %%"REG_a" \n\t"
1892 "1: \n\t"
1893 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1894 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1895 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1896 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1897 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1898 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
/* two chained averages toward mm0/mm1 give the 1:3 weighting */
1899 PAVGB" %%mm0, %%mm5 \n\t"
1900 PAVGB" %%mm0, %%mm3 \n\t"
1901 PAVGB" %%mm0, %%mm5 \n\t"
1902 PAVGB" %%mm0, %%mm3 \n\t"
1903 PAVGB" %%mm1, %%mm4 \n\t"
1904 PAVGB" %%mm1, %%mm2 \n\t"
1905 PAVGB" %%mm1, %%mm4 \n\t"
1906 PAVGB" %%mm1, %%mm2 \n\t"
/* interleave left/right blends into the two output rows */
1907 "movq %%mm5, %%mm7 \n\t"
1908 "movq %%mm4, %%mm6 \n\t"
1909 "punpcklbw %%mm3, %%mm5 \n\t"
1910 "punpckhbw %%mm3, %%mm7 \n\t"
1911 "punpcklbw %%mm2, %%mm4 \n\t"
1912 "punpckhbw %%mm2, %%mm6 \n\t"
1913 #if 1
1914 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1915 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1916 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1917 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1918 #else
1919 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1920 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1921 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1922 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1923 #endif
1924 "add $8, %%"REG_a" \n\t"
1925 " js 1b \n\t"
1926 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1927 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1928 "g" (-mmxSize)
1929 : "%"REG_a
1930 
1931 );
1932 #else
1933 const long mmxSize=1;
1934 #endif
/* leftmost output column of both rows: vertical-only blend */
1935 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1936 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1937 
1938 for(x=mmxSize-1; x<srcWidth-1; x++){
1939 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1940 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1941 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1942 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1943 }
/* rightmost output column of both rows */
1944 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1945 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1946 
1947 dst+=dstStride*2;
1948 src+=srcStride;
1949 }
1950 
1951 // last line
1952 #if 1
1953 dst[0]= src[0];
1954 
1955 for(x=0; x<srcWidth-1; x++){
1956 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1957 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1958 }
1959 dst[2*srcWidth-1]= src[srcWidth-1];
1960 #else
1961 for(x=0; x<srcWidth; x++){
1962 dst[2*x+0]=
1963 dst[2*x+1]= src[x];
1964 }
1965 #endif
1966 
1967 #ifdef HAVE_MMX
1968 asm volatile( EMMS" \n\t"
1969 SFENCE" \n\t"
1970 :::"memory");
1971 #endif
1972 }
1973
1974 /**
1975 * Deinterleave packed UYVY into planar YV12 (4:2:0).
1976 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1977 * problem for anyone then tell me, and I'll fix it)
1978 * Chrominance data is only taken from every second line; others are ignored. FIXME: write an HQ version.
1979 */
1980 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1981 long width, long height,
1982 long lumStride, long chromStride, long srcStride)
1983 {
1984 long y;
1985 const long chromWidth= width>>1;
1986 for(y=0; y<height; y+=2)
1987 {
1988 #ifdef HAVE_MMX
/* even line: split UYVY into Y plane + U/V planes.
   NOTE(review): unlike the REG_a-based siblings above, this function
   hardcodes 32-bit %%eax/addl/cmpl — presumably predates the 64-bit
   cleanup; verify behavior on x86_64 builds. */
1989 asm volatile(
1990 "xorl %%eax, %%eax \n\t"
1991 "pcmpeqw %%mm7, %%mm7 \n\t"
1992 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1993 ASMALIGN16
1994 "1: \n\t"
1995 PREFETCH" 64(%0, %%eax, 4) \n\t"
1996 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1997 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1998 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1999 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
/* in UYVY, chroma is the LOW byte and Y the HIGH byte of each word */
2000 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2001 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2002 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2003 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2004 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2005 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2006 
2007 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2008 
2009 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2010 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2011 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2012 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2013 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2014 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2015 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2016 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2017 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2018 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2019 
2020 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
2021 
/* separate interleaved UV words into the U and V planes */
2022 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2023 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2024 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2025 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2026 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2027 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2028 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2029 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2030 
2031 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2032 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2033 
2034 "addl $8, %%eax \n\t"
2035 "cmpl %4, %%eax \n\t"
2036 " jb 1b \n\t"
2037 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2038 : "memory", "%eax"
2039 );
2040 
2041 ydst += lumStride;
2042 src += srcStride;
2043 
/* odd line: Y only (high bytes), chroma of this line is dropped */
2044 asm volatile(
2045 "xorl %%eax, %%eax \n\t"
2046 ASMALIGN16
2047 "1: \n\t"
2048 PREFETCH" 64(%0, %%eax, 4) \n\t"
2049 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
2050 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
2051 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
2052 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
2053 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2054 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2055 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2056 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2057 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2058 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2059 
2060 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2061 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2062 
2063 "addl $8, %%eax \n\t"
2064 "cmpl %4, %%eax \n\t"
2065 " jb 1b \n\t"
2066 
2067 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2068 : "memory", "%eax"
2069 );
2070 #else
2071 long i;
2072 for(i=0; i<chromWidth; i++)
2073 {
2074 udst[i] = src[4*i+0];
2075 ydst[2*i+0] = src[4*i+1];
2076 vdst[i] = src[4*i+2];
2077 ydst[2*i+1] = src[4*i+3];
2078 }
2079 ydst += lumStride;
2080 src += srcStride;
2081 
/* odd line: Y only */
2082 for(i=0; i<chromWidth; i++)
2083 {
2084 ydst[2*i+0] = src[4*i+1];
2085 ydst[2*i+1] = src[4*i+3];
2086 }
2087 #endif
2088 udst += chromStride;
2089 vdst += chromStride;
2090 ydst += lumStride;
2091 src += srcStride;
2092 }
2093 #ifdef HAVE_MMX
2094 asm volatile( EMMS" \n\t"
2095 SFENCE" \n\t"
2096 :::"memory");
2097 #endif
2098 }
2099
2100 /**
2101 * Convert packed 24bpp RGB to planar YV12 (4:2:0).
2102 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2103 * problem for anyone then tell me, and I'll fix it)
2104 * chrominance data is only taken from every second line; others are ignored in the C version. FIXME: write an HQ version.
2105 */
2106 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2107 long width, long height,
2108 long lumStride, long chromStride, long srcStride)
2109 {
2110 long y;
2111 const long chromWidth= width>>1;
2112 #ifdef HAVE_MMX
2113 for(y=0; y<height-2; y+=2)
2114 {
2115 long i;
2116 for(i=0; i<2; i++)
2117 {
2118 asm volatile(
2119 "mov %2, %%"REG_a" \n\t"
2120 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2121 "movq "MANGLE(w1111)", %%mm5 \n\t"
2122 "pxor %%mm7, %%mm7 \n\t"
2123 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2124 ASMALIGN16
2125 "1: \n\t"
2126 PREFETCH" 64(%0, %%"REG_b") \n\t"
2127 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2128 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
2129 "punpcklbw %%mm7, %%mm0 \n\t"
2130 "punpcklbw %%mm7, %%mm1 \n\t"
2131 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
2132 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
2133 "punpcklbw %%mm7, %%mm2 \n\t"
2134 "punpcklbw %%mm7, %%mm3 \n\t"
2135 "pmaddwd %%mm6, %%mm0 \n\t"
2136 "pmaddwd %%mm6, %%mm1 \n\t"
2137 "pmaddwd %%mm6, %%mm2 \n\t"
2138 "pmaddwd %%mm6, %%mm3 \n\t"
2139 #ifndef FAST_BGR2YV12
2140 "psrad $8, %%mm0 \n\t"
2141 "psrad $8, %%mm1 \n\t"
2142 "psrad $8, %%mm2 \n\t"
2143 "psrad $8, %%mm3 \n\t"
2144 #endif
2145 "packssdw %%mm1, %%mm0 \n\t"
2146 "packssdw %%mm3, %%mm2 \n\t"
2147 "pmaddwd %%mm5, %%mm0 \n\t"
2148 "pmaddwd %%mm5, %%mm2 \n\t"
2149 "packssdw %%mm2, %%mm0 \n\t"
2150 "psraw $7, %%mm0 \n\t"
2151
2152 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2153 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
2154 "punpcklbw %%mm7, %%mm4 \n\t"
2155 "punpcklbw %%mm7, %%mm1 \n\t"
2156 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
2157 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
2158 "punpcklbw %%mm7, %%mm2 \n\t"
2159 "punpcklbw %%mm7, %%mm3 \n\t"
2160 "pmaddwd %%mm6, %%mm4 \n\t"
2161 "pmaddwd %%mm6, %%mm1 \n\t"
2162 "pmaddwd %%mm6, %%mm2 \n\t"
2163 "pmaddwd %%mm6, %%mm3 \n\t"
2164 #ifndef FAST_BGR2YV12
2165 "psrad $8, %%mm4 \n\t"
2166 "psrad $8, %%mm1 \n\t"
2167 "psrad $8, %%mm2 \n\t"
2168 "psrad $8, %%mm3 \n\t"
2169 #endif
2170 "packssdw %%mm1, %%mm4 \n\t"
2171 "packssdw %%mm3, %%mm2 \n\t"
2172 "pmaddwd %%mm5, %%mm4 \n\t"
2173 "pmaddwd %%mm5, %%mm2 \n\t"
2174 "add $24, %%"REG_b" \n\t"
2175 "packssdw %%mm2, %%mm4 \n\t"
2176 "psraw $7, %%mm4 \n\t"
2177
2178 "packuswb %%mm4, %%mm0 \n\t"
2179 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2180
2181 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2182 "add $8, %%"REG_a" \n\t"
2183 " js 1b \n\t"
2184 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2185 : "%"REG_a, "%"REG_b
2186 );
2187 ydst += lumStride;
2188 src += srcStride;
2189 }
2190 src -= srcStride*2;
2191 asm volatile(
2192 "mov %4, %%"REG_a" \n\t"
2193 "movq "MANGLE(w1111)", %%mm5 \n\t"
2194 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2195 "pxor %%mm7, %%mm7 \n\t"
2196 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2197 "add %%"REG_b", %%"REG_b" \n\t"
2198 ASMALIGN16
2199 "1: \n\t"
2200 PREFETCH" 64(%0, %%"REG_b") \n\t"
2201 PREFETCH" 64(%1, %%"REG_b") \n\t"
2202 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2203 "movq (%0, %%"REG_b"), %%mm0 \n\t"
2204 "movq (%1, %%"REG_b"), %%mm1 \n\t"
2205 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
2206 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
2207 PAVGB" %%mm1, %%mm0 \n\t"
2208 PAVGB" %%mm3, %%mm2 \n\t"
2209 "movq %%mm0, %%mm1 \n\t"
2210 "movq %%mm2, %%mm3 \n\t"
2211 "psrlq $24, %%mm0 \n\t"
2212 "psrlq $24, %%mm2 \n\t"
2213 PAVGB" %%mm1, %%mm0 \n\t"
2214 PAVGB" %%mm3, %%mm2 \n\t"
2215 "punpcklbw %%mm7, %%mm0 \n\t"
2216 "punpcklbw %%mm7, %%mm2 \n\t"
2217 #else
2218 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2219 "movd (%1, %%"REG_b"), %%mm1 \n\t"
2220 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
2221 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
2222 "punpcklbw %%mm7, %%mm0 \n\t"
2223 "punpcklbw %%mm7, %%mm1 \n\t"
2224 "punpcklbw %%mm7, %%mm2 \n\t"
2225 "punpcklbw %%mm7, %%mm3 \n\t"
2226 "paddw %%mm1, %%mm0 \n\t"
2227 "paddw %%mm3, %%mm2 \n\t"
2228 "paddw %%mm2, %%mm0 \n\t"
2229 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
2230 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
2231 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
2232 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
2233 "punpcklbw %%mm7, %%mm4 \n\t"
2234 "punpcklbw %%mm7, %%mm1 \n\t"
2235 "punpcklbw %%mm7, %%mm2 \n\t"
2236 "punpcklbw %%mm7, %%mm3 \n\t"
2237 "paddw %%mm1, %%mm4 \n\t"
2238 "paddw %%mm3, %%mm2 \n\t"
2239 "paddw %%mm4, %%mm2 \n\t"
2240 "psrlw $2, %%mm0 \n\t"
2241 "psrlw $2, %%mm2 \n\t"
2242 #endif
2243 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2244 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2245
2246 "pmaddwd %%mm0, %%mm1 \n\t"
2247 "pmaddwd %%mm2, %%mm3 \n\t"
2248 "pmaddwd %%mm6, %%mm0 \n\t"
2249 "pmaddwd %%mm6, %%mm2 \n\t"
2250 #ifndef FAST_BGR2YV12
2251 "psrad $8, %%mm0 \n\t"
2252 "psrad $8, %%mm1 \n\t"
2253 "psrad $8, %%mm2 \n\t"
2254 "psrad $8, %%mm3 \n\t"
2255 #endif
2256 "packssdw %%mm2, %%mm0 \n\t"
2257 "packssdw %%mm3, %%mm1 \n\t"
2258 "pmaddwd %%mm5, %%mm0 \n\t"
2259 "pmaddwd %%mm5, %%mm1 \n\t"
2260 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2261 "psraw $7, %%mm0 \n\t"
2262
2263 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2264 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
2265 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
2266 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
2267 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
2268 PAVGB" %%mm1, %%mm4 \n\t"
2269 PAVGB" %%mm3, %%mm2 \n\t"
2270 "movq %%mm4, %%mm1 \n\t"
2271 "movq %%mm2, %%mm3 \n\t"
2272 "psrlq $24, %%mm4 \n\t"
2273 "psrlq $24, %%mm2 \n\t"
2274 PAVGB" %%mm1, %%mm4 \n\t"
2275 PAVGB" %%mm3, %%mm2 \n\t"
2276 "punpcklbw %%mm7, %%mm4 \n\t"
2277 "punpcklbw %%mm7, %%mm2 \n\t"
2278 #else
2279 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2280 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
2281 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
2282 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
2283 "punpcklbw %%mm7, %%mm4 \n\t"
2284 "punpcklbw %%mm7, %%mm1 \n\t"
2285 "punpcklbw %%mm7, %%mm2 \n\t"
2286 "punpcklbw %%mm7, %%mm3 \n\t"
2287 "paddw %%mm1, %%mm4 \n\t"
2288 "paddw %%mm3, %%mm2 \n\t"
2289 "paddw %%mm2, %%mm4 \n\t"
2290 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
2291 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
2292 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
2293 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
2294 "punpcklbw %%mm7, %%mm5 \n\t"
2295 "punpcklbw %%mm7, %%mm1 \n\t"
2296 "punpcklbw %%mm7, %%mm2 \n\t"
2297 "punpcklbw %%mm7, %%mm3 \n\t"
2298 "paddw %%mm1, %%mm5 \n\t"
2299 "paddw %%mm3, %%mm2 \n\t"
2300 "paddw %%mm5, %%mm2 \n\t"
2301 "movq "MANGLE(w1111)", %%mm5 \n\t"
2302 "psrlw $2, %%mm4 \n\t"
2303 "psrlw $2, %%mm2 \n\t"
2304 #endif
2305 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2306 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2307
2308 "pmaddwd %%mm4, %%mm1 \n\t"
2309 "pmaddwd %%mm2, %%mm3 \n\t"
2310 "pmaddwd %%mm6, %%mm4 \n\t"
2311 "pmaddwd %%mm6, %%mm2 \n\t"
2312 #ifndef FAST_BGR2YV12
2313 "psrad $8, %%mm4 \n\t"
2314 "psrad $8, %%mm1 \n\t"
2315 "psrad $8, %%mm2 \n\t"
2316 "psrad $8, %%mm3 \n\t"
2317 #endif
2318 "packssdw %%mm2, %%mm4 \n\t"
2319 "packssdw %%mm3, %%mm1 \n\t"
2320 "pmaddwd %%mm5, %%mm4 \n\t"
2321 "pmaddwd %%mm5, %%mm1 \n\t"
2322 "add $24, %%"REG_b" \n\t"
2323 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2324 "psraw $7, %%mm4 \n\t"
2325
2326 "movq %%mm0, %%mm1 \n\t"
2327 "punpckldq %%mm4, %%mm0 \n\t"
2328 "punpckhdq %%mm4, %%mm1 \n\t"
2329 "packsswb %%mm1, %%mm0 \n\t"
2330 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2331 "movd %%mm0, (%2, %%"REG_a") \n\t"
2332 "punpckhdq %%mm0, %%mm0 \n\t"
2333 "movd %%mm0, (%3, %%"REG_a") \n\t"
2334 "add $4, %%"REG_a" \n\t"
2335 " js 1b \n\t"
2336 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2337 : "%"REG_a, "%"REG_b
2338 );
2339
2340 udst += chromStride;
2341 vdst += chromStride;
2342 src += srcStride*2;
2343 }
2344
2345 asm volatile( EMMS" \n\t"
2346 SFENCE" \n\t"
2347 :::"memory");
2348 #else
2349 y=0;
2350 #endif
2351 for(; y<height; y+=2)
2352 {
2353 long i;
2354 for(i=0; i<chromWidth; i++)
2355 {
2356 unsigned int b= src[6*i+0];
2357 unsigned int g= src[6*i+1];
2358 unsigned int r= src[6*i+2];
2359
2360 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2361 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2362 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2363
2364 udst[i] = U;
2365 vdst[i] = V;
2366 ydst[2*i] = Y;
2367
2368 b= src[6*i+3];
2369 g= src[6*i+4];
2370 r= src[6*i+5];
2371
2372 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2373 ydst[2*i+1] = Y;
2374 }
2375 ydst += lumStride;
2376 src += srcStride;
2377
2378 for(i=0; i<chromWidth; i++)
2379 {
2380 unsigned int b= src[6*i+0];
2381 unsigned int g= src[6*i+1];
2382 unsigned int r= src[6*i+2];
2383
2384 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2385
2386 ydst[2*i] = Y;
2387
2388 b= src[6*i+3];
2389 g= src[6*i+4];
2390 r= src[6*i+5];
2391
2392 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2393 ydst[2*i+1] = Y;
2394 }
2395 udst += chromStride;
2396 vdst += chromStride;
2397 ydst += lumStride;
2398 src += srcStride;
2399 }
2400 }
2401
/**
 * Interleave two byte planes into one packed plane:
 * dest[2*w+0] = src1[w], dest[2*w+1] = src2[w] for each of 'width' bytes,
 * repeated for 'height' lines.
 *
 * src1/src2/dest: source and destination base pointers.
 * width/height:   line length in bytes and number of lines.
 * *Stride:        per-line pointer advance, in bytes, for each buffer.
 *
 * With MMX the bulk of each line is done 16 source bytes at a time in asm;
 * the scalar loop below the asm handles the width&15 tail bytes.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
			long width, long height, long src1Stride,
			long src2Stride, long dstStride){
	long h;

	for(h=0; h < height; h++)
	{
		long w;

#ifdef HAVE_MMX
#ifdef HAVE_SSE2
		/* SSE2: load 16 bytes of src1 twice (a copy is needed because
		   punpck* overwrites its destination), interleave with 16 bytes
		   of src2 via punpcklbw/punpckhbw, and store the 32 result bytes
		   with non-temporal movntdq (paired with SFENCE at the end).
		   NOTE(review): movdqa traps on unaligned operands -- this
		   presumably assumes 16-byte-aligned src1/src2/dest; confirm
		   callers guarantee that.
		   NOTE(review): the loop bound is width-15 compared with an
		   unsigned 'jb', so width < 16 would wrap; callers presumably
		   pass width >= 16 -- verify. */
		asm(
			"xor %%"REG_a", %%"REG_a"	\n\t"
			"1:				\n\t"
			PREFETCH" 64(%1, %%"REG_a")	\n\t"
			PREFETCH" 64(%2, %%"REG_a")	\n\t"
			"movdqa (%1, %%"REG_a"), %%xmm0	\n\t"
			"movdqa (%1, %%"REG_a"), %%xmm1	\n\t"
			"movdqa (%2, %%"REG_a"), %%xmm2	\n\t"
			"punpcklbw %%xmm2, %%xmm0	\n\t"
			"punpckhbw %%xmm2, %%xmm1	\n\t"
			"movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
			"movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
			"add $16, %%"REG_a"		\n\t"
			"cmp %3, %%"REG_a"		\n\t"
			" jb 1b				\n\t"
			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
			: "memory", "%"REG_a""
		);
#else
		/* Plain MMX: same interleave done as two 8-byte src1 loads
		   (mm0/mm2, with mm1/mm3 as the copies punpck* needs) merged
		   with two 8-byte src2 loads (mm4/mm5); 32 output bytes per
		   iteration via MOVNTQ.
		   NOTE(review): same width-15 unsigned 'jb' bound as the SSE2
		   path -- presumably width >= 16 is required; verify. */
		asm(
			"xor %%"REG_a", %%"REG_a"	\n\t"
			"1:				\n\t"
			PREFETCH" 64(%1, %%"REG_a")	\n\t"
			PREFETCH" 64(%2, %%"REG_a")	\n\t"
			"movq (%1, %%"REG_a"), %%mm0	\n\t"
			"movq 8(%1, %%"REG_a"), %%mm2	\n\t"
			"movq %%mm0, %%mm1		\n\t"
			"movq %%mm2, %%mm3		\n\t"
			"movq (%2, %%"REG_a"), %%mm4	\n\t"
			"movq 8(%2, %%"REG_a"), %%mm5	\n\t"
			"punpcklbw %%mm4, %%mm0		\n\t"
			"punpckhbw %%mm4, %%mm1		\n\t"
			"punpcklbw %%mm5, %%mm2		\n\t"
			"punpckhbw %%mm5, %%mm3		\n\t"
			MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
			MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
			MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
			MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
			"add $16, %%"REG_a"		\n\t"
			"cmp %3, %%"REG_a"		\n\t"
			" jb 1b				\n\t"
			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
			: "memory", "%"REG_a
		);
#endif
		/* Scalar tail: bytes from the last full 16-byte group onward. */
		for(w= (width&(~15)); w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#else
		/* Pure C fallback: interleave the whole line byte by byte. */
		for(w=0; w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#endif
		dest += dstStride;
		src1 += src1Stride;
		src2 += src2Stride;
	}
#ifdef HAVE_MMX
	/* Leave MMX state clean and flush the non-temporal stores. */
	asm(
			EMMS"	\n\t"
			SFENCE"	\n\t"
			::: "memory"
		);
#endif
}
2482
/**
 * Upsample two chroma planes by 2x in each direction (pixel replication):
 * output line y is built from source line y>>1, and every source byte is
 * written twice horizontally (d[2*x] = d[2*x+1] = s[x]).  This turns the
 * 4x4-subsampled chroma of a "VU9"-style layout into 4:2:0 ("VU12") chroma.
 *
 * src1/src2:   the two quarter-resolution source planes (e.g. V and U).
 * dst1/dst2:   the corresponding half-resolution destination planes.
 * width/height: full-picture dimensions; the loops run over w=width/2
 *               output columns (before doubling) and h=height/2 lines.
 * srcStride*/dstStride*: per-line byte strides for each plane.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
			uint8_t *dst1, uint8_t *dst2,
			long width, long height,
			long srcStride1, long srcStride2,
			long dstStride1, long dstStride2)
{
    long y,x,w,h;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    /* Warm the cache with the second line of each source plane. */
    asm volatile(
	PREFETCH" %0\n\t"
	PREFETCH" %1\n\t"
	::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* First plane: vertical doubling via (y>>1) line selection, horizontal
       doubling in the inner loop. */
    for(y=0;y<h;y++){
	const uint8_t* s1=src1+srcStride1*(y>>1);
	uint8_t* d=dst1+dstStride1*y;
	x=0;
#ifdef HAVE_MMX
	/* 32 source bytes -> 64 output bytes per iteration: punpcklbw/
	   punpckhbw of a register with itself duplicates each byte.
	   AT&T memory-operand offsets like "8%1" address s1[x+8] etc. */
	for(;x<w-31;x+=32)
	{
		asm volatile(
		PREFETCH" 32%1\n\t"
		"movq	%1, %%mm0\n\t"
		"movq	8%1, %%mm2\n\t"
		"movq	16%1, %%mm4\n\t"
		"movq	24%1, %%mm6\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm2, %%mm3\n\t"
		"movq	%%mm4, %%mm5\n\t"
		"movq	%%mm6, %%mm7\n\t"
		"punpcklbw %%mm0, %%mm0\n\t"
		"punpckhbw %%mm1, %%mm1\n\t"
		"punpcklbw %%mm2, %%mm2\n\t"
		"punpckhbw %%mm3, %%mm3\n\t"
		"punpcklbw %%mm4, %%mm4\n\t"
		"punpckhbw %%mm5, %%mm5\n\t"
		"punpcklbw %%mm6, %%mm6\n\t"
		"punpckhbw %%mm7, %%mm7\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		MOVNTQ"	%%mm1, 8%0\n\t"
		MOVNTQ"	%%mm2, 16%0\n\t"
		MOVNTQ"	%%mm3, 24%0\n\t"
		MOVNTQ"	%%mm4, 32%0\n\t"
		MOVNTQ"	%%mm5, 40%0\n\t"
		MOVNTQ"	%%mm6, 48%0\n\t"
		MOVNTQ"	%%mm7, 56%0"
		:"=m"(d[2*x])
		:"m"(s1[x])
		:"memory");
	}
#endif
	/* Scalar tail (or full line without MMX): duplicate each byte. */
	for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* Second plane: identical processing for src2 -> dst2. */
    for(y=0;y<h;y++){
	const uint8_t* s2=src2+srcStride2*(y>>1);
	uint8_t* d=dst2+dstStride2*y;
	x=0;
#ifdef HAVE_MMX
	for(;x<w-31;x+=32)
	{
		asm volatile(
		PREFETCH" 32%1\n\t"
		"movq	%1, %%mm0\n\t"
		"movq	8%1, %%mm2\n\t"
		"movq	16%1, %%mm4\n\t"
		"movq	24%1, %%mm6\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm2, %%mm3\n\t"
		"movq	%%mm4, %%mm5\n\t"
		"movq	%%mm6, %%mm7\n\t"
		"punpcklbw %%mm0, %%mm0\n\t"
		"punpckhbw %%mm1, %%mm1\n\t"
		"punpcklbw %%mm2, %%mm2\n\t"
		"punpckhbw %%mm3, %%mm3\n\t"
		"punpcklbw %%mm4, %%mm4\n\t"
		"punpckhbw %%mm5, %%mm5\n\t"
		"punpcklbw %%mm6, %%mm6\n\t"
		"punpckhbw %%mm7, %%mm7\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		MOVNTQ"	%%mm1, 8%0\n\t"
		MOVNTQ"	%%mm2, 16%0\n\t"
		MOVNTQ"	%%mm3, 24%0\n\t"
		MOVNTQ"	%%mm4, 32%0\n\t"
		MOVNTQ"	%%mm5, 40%0\n\t"
		MOVNTQ"	%%mm6, 48%0\n\t"
		MOVNTQ"	%%mm7, 56%0"
		:"=m"(d[2*x])
		:"m"(s2[x])
		:"memory");
	}
#endif
	for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
	/* Leave MMX state clean and flush the non-temporal stores. */
	asm(
		EMMS"	\n\t"
		SFENCE"	\n\t"
		::: "memory"
		);
#endif
}
2585
/**
 * Pack planar YVU9-style data (chroma subsampled 4x horizontally and 4x
 * vertically: chroma line y>>2, one U and one V shared by 4 luma samples)
 * into packed YUY2 (Y U Y V byte order).  Chroma is upsampled by simple
 * replication (nearest neighbour), as the scalar fallback loop shows.
 *
 * src1: luma plane; src2: U plane; src3: V plane; dst: packed output.
 * width/height: picture dimensions; the loops run over w=width/2 chroma
 *               columns, each producing 8 output bytes (4 luma samples).
 * srcStride1..3/dstStride: per-line byte strides.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
			uint8_t *dst,
			long width, long height,
			long srcStride1, long srcStride2,
			long srcStride3, long dstStride)
{
    long y,x,w,h;
    w=width/2; h=height;
    for(y=0;y<h;y++){
	const uint8_t* yp=src1+srcStride1*y;
	/* chroma advances one line per 4 luma lines */
	const uint8_t* up=src2+srcStride2*(y>>2);
	const uint8_t* vp=src3+srcStride3*(y>>2);
	uint8_t* d=dst+dstStride*y;
	x=0;
#ifdef HAVE_MMX
	/* 8 chroma samples (32 luma samples, 64 output bytes) per iteration.
	   The U/V bytes are first self-unpacked to duplicate them, then
	   interleaved as UV pairs and finally merged with the luma bytes;
	   per-register layouts are traced in the comments on each line. */
	for(;x<w-7;x+=8)
	{
		asm volatile(
		PREFETCH" 32(%1, %0)\n\t"
		PREFETCH" 32(%2, %0)\n\t"
		PREFETCH" 32(%3, %0)\n\t"
		"movq	(%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
		"movq	(%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
		"movq	(%3, %0), %%mm2\n\t"	     /* V0V1V2V3V4V5V6V7 */
		"movq	%%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
		"movq	%%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
		"movq	%%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
		"punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
		"punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
		"punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
		"punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */

		"movq	%%mm1, %%mm6\n\t"
		"punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
		"punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
		"punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
		MOVNTQ"	%%mm0, (%4, %0, 8)\n\t"
		MOVNTQ"	%%mm3, 8(%4, %0, 8)\n\t"

		"punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
		"movq	8(%1, %0, 4), %%mm0\n\t"
		"movq	%%mm0, %%mm3\n\t"
		"punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
		"punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
		MOVNTQ"	%%mm0, 16(%4, %0, 8)\n\t"
		MOVNTQ"	%%mm3, 24(%4, %0, 8)\n\t"

		"movq	%%mm4, %%mm6\n\t"
		"movq	16(%1, %0, 4), %%mm0\n\t"
		"movq	%%mm0, %%mm3\n\t"
		"punpcklbw %%mm5, %%mm4\n\t"
		"punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
		"punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
		MOVNTQ"	%%mm0, 32(%4, %0, 8)\n\t"
		MOVNTQ"	%%mm3, 40(%4, %0, 8)\n\t"

		"punpckhbw %%mm5, %%mm6\n\t"
		"movq	24(%1, %0, 4), %%mm0\n\t"
		"movq	%%mm0, %%mm3\n\t"
		"punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
		"punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
		MOVNTQ"	%%mm0, 48(%4, %0, 8)\n\t"
		MOVNTQ"	%%mm3, 56(%4, %0, 8)\n\t"

		: "+r" (x)
		: "r"(yp), "r" (up), "r"(vp), "r"(d)
		:"memory");
	}
#endif
	/* Scalar tail (or whole line without MMX): 4 luma samples share
	   one U and one V -> 8 packed YUYV bytes per x. */
	for(; x<w; x++)
	{
		const long x2= x<<2;
		d[8*x+0]=yp[x2];
		d[8*x+1]=up[x];
		d[8*x+2]=yp[x2+1];
		d[8*x+3]=vp[x];
		d[8*x+4]=yp[x2+2];
		d[8*x+5]=up[x];
		d[8*x+6]=yp[x2+3];
		d[8*x+7]=vp[x];
	}
    }
#ifdef HAVE_MMX
	/* Leave MMX state clean and flush the non-temporal stores. */
	asm(
		EMMS"	\n\t"
		SFENCE"	\n\t"
		::: "memory"
		);
#endif
}