mplayer.hg: comparison of libswscale/rgb2rgb_template.c @ 26059:ba2efd11abf9
__asm __volatile -> asm volatile part 3
| field | value |
|---|---|
| author | reimar |
| date | Sun, 24 Feb 2008 14:59:16 +0000 |
| parents | 2ad528dd42a0 |
| children | b7eeb3590bc9 |
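
The change in this series is purely mechanical: each occurrence of GCC's double-underscore spellings `__asm __volatile` is rewritten to the plain keywords `asm volatile`, with the assembly templates, operands, and clobber lists left untouched. A minimal sketch of the before/after pattern is shown below; the standalone `prefetchnta` instruction is an assumed stand-in for MPlayer's PREFETCH macro, which the patch itself never expands.

```c
#include <stdint.h>

/* Before this series: GCC's double-underscore spellings. */
static void prefetch_old(const uint8_t *s)
{
    __asm __volatile("prefetchnta %0" :: "m"(*s) : "memory");
}

/* After revision 26059: the plain form used throughout the right-hand
 * column of the comparison below. */
static void prefetch_new(const uint8_t *s)
{
    asm volatile("prefetchnta %0" :: "m"(*s) : "memory");
}
```

Both spellings are GNU extensions and compile to identical code; the bare `asm` keyword is only rejected in strict `-std=c99`-style modes, which MPlayer's GNU-dialect build does not use.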
26058:e2ee46838240 | 26059:ba2efd11abf9 |
---|---|
90 #ifdef HAVE_MMX | 90 #ifdef HAVE_MMX |
91 const uint8_t *mm_end; | 91 const uint8_t *mm_end; |
92 #endif | 92 #endif |
93 end = s + src_size; | 93 end = s + src_size; |
94 #ifdef HAVE_MMX | 94 #ifdef HAVE_MMX |
95 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); | 95 asm volatile(PREFETCH" %0"::"m"(*s):"memory"); |
96 mm_end = end - 23; | 96 mm_end = end - 23; |
97 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory"); | 97 asm volatile("movq %0, %%mm7"::"m"(mask32):"memory"); |
98 while (s < mm_end) | 98 while (s < mm_end) |
99 { | 99 { |
100 __asm __volatile( | 100 asm volatile( |
101 PREFETCH" 32%1 \n\t" | 101 PREFETCH" 32%1 \n\t" |
102 "movd %1, %%mm0 \n\t" | 102 "movd %1, %%mm0 \n\t" |
103 "punpckldq 3%1, %%mm0 \n\t" | 103 "punpckldq 3%1, %%mm0 \n\t" |
104 "movd 6%1, %%mm1 \n\t" | 104 "movd 6%1, %%mm1 \n\t" |
105 "punpckldq 9%1, %%mm1 \n\t" | 105 "punpckldq 9%1, %%mm1 \n\t" |
119 :"m"(*s) | 119 :"m"(*s) |
120 :"memory"); | 120 :"memory"); |
121 dest += 32; | 121 dest += 32; |
122 s += 24; | 122 s += 24; |
123 } | 123 } |
124 __asm __volatile(SFENCE:::"memory"); | 124 asm volatile(SFENCE:::"memory"); |
125 __asm __volatile(EMMS:::"memory"); | 125 asm volatile(EMMS:::"memory"); |
126 #endif | 126 #endif |
127 while (s < end) | 127 while (s < end) |
128 { | 128 { |
129 #ifdef WORDS_BIGENDIAN | 129 #ifdef WORDS_BIGENDIAN |
130 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ | 130 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ |
150 #ifdef HAVE_MMX | 150 #ifdef HAVE_MMX |
151 const uint8_t *mm_end; | 151 const uint8_t *mm_end; |
152 #endif | 152 #endif |
153 end = s + src_size; | 153 end = s + src_size; |
154 #ifdef HAVE_MMX | 154 #ifdef HAVE_MMX |
155 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); | 155 asm volatile(PREFETCH" %0"::"m"(*s):"memory"); |
156 mm_end = end - 31; | 156 mm_end = end - 31; |
157 while (s < mm_end) | 157 while (s < mm_end) |
158 { | 158 { |
159 __asm __volatile( | 159 asm volatile( |
160 PREFETCH" 32%1 \n\t" | 160 PREFETCH" 32%1 \n\t" |
161 "movq %1, %%mm0 \n\t" | 161 "movq %1, %%mm0 \n\t" |
162 "movq 8%1, %%mm1 \n\t" | 162 "movq 8%1, %%mm1 \n\t" |
163 "movq 16%1, %%mm4 \n\t" | 163 "movq 16%1, %%mm4 \n\t" |
164 "movq 24%1, %%mm5 \n\t" | 164 "movq 24%1, %%mm5 \n\t" |
205 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) | 205 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) |
206 :"memory"); | 206 :"memory"); |
207 dest += 24; | 207 dest += 24; |
208 s += 32; | 208 s += 32; |
209 } | 209 } |
210 __asm __volatile(SFENCE:::"memory"); | 210 asm volatile(SFENCE:::"memory"); |
211 __asm __volatile(EMMS:::"memory"); | 211 asm volatile(EMMS:::"memory"); |
212 #endif | 212 #endif |
213 while (s < end) | 213 while (s < end) |
214 { | 214 { |
215 #ifdef WORDS_BIGENDIAN | 215 #ifdef WORDS_BIGENDIAN |
216 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */ | 216 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */ |
240 register uint8_t* d=dst; | 240 register uint8_t* d=dst; |
241 register const uint8_t *end; | 241 register const uint8_t *end; |
242 const uint8_t *mm_end; | 242 const uint8_t *mm_end; |
243 end = s + src_size; | 243 end = s + src_size; |
244 #ifdef HAVE_MMX | 244 #ifdef HAVE_MMX |
245 __asm __volatile(PREFETCH" %0"::"m"(*s)); | 245 asm volatile(PREFETCH" %0"::"m"(*s)); |
246 __asm __volatile("movq %0, %%mm4"::"m"(mask15s)); | 246 asm volatile("movq %0, %%mm4"::"m"(mask15s)); |
247 mm_end = end - 15; | 247 mm_end = end - 15; |
248 while (s<mm_end) | 248 while (s<mm_end) |
249 { | 249 { |
250 __asm __volatile( | 250 asm volatile( |
251 PREFETCH" 32%1 \n\t" | 251 PREFETCH" 32%1 \n\t" |
252 "movq %1, %%mm0 \n\t" | 252 "movq %1, %%mm0 \n\t" |
253 "movq 8%1, %%mm2 \n\t" | 253 "movq 8%1, %%mm2 \n\t" |
254 "movq %%mm0, %%mm1 \n\t" | 254 "movq %%mm0, %%mm1 \n\t" |
255 "movq %%mm2, %%mm3 \n\t" | 255 "movq %%mm2, %%mm3 \n\t" |
263 :"m"(*s) | 263 :"m"(*s) |
264 ); | 264 ); |
265 d+=16; | 265 d+=16; |
266 s+=16; | 266 s+=16; |
267 } | 267 } |
268 __asm __volatile(SFENCE:::"memory"); | 268 asm volatile(SFENCE:::"memory"); |
269 __asm __volatile(EMMS:::"memory"); | 269 asm volatile(EMMS:::"memory"); |
270 #endif | 270 #endif |
271 mm_end = end - 3; | 271 mm_end = end - 3; |
272 while (s < mm_end) | 272 while (s < mm_end) |
273 { | 273 { |
274 register unsigned x= *((uint32_t *)s); | 274 register unsigned x= *((uint32_t *)s); |
289 register uint8_t* d=dst; | 289 register uint8_t* d=dst; |
290 register const uint8_t *end; | 290 register const uint8_t *end; |
291 const uint8_t *mm_end; | 291 const uint8_t *mm_end; |
292 end = s + src_size; | 292 end = s + src_size; |
293 #ifdef HAVE_MMX | 293 #ifdef HAVE_MMX |
294 __asm __volatile(PREFETCH" %0"::"m"(*s)); | 294 asm volatile(PREFETCH" %0"::"m"(*s)); |
295 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg)); | 295 asm volatile("movq %0, %%mm7"::"m"(mask15rg)); |
296 __asm __volatile("movq %0, %%mm6"::"m"(mask15b)); | 296 asm volatile("movq %0, %%mm6"::"m"(mask15b)); |
297 mm_end = end - 15; | 297 mm_end = end - 15; |
298 while (s<mm_end) | 298 while (s<mm_end) |
299 { | 299 { |
300 __asm __volatile( | 300 asm volatile( |
301 PREFETCH" 32%1 \n\t" | 301 PREFETCH" 32%1 \n\t" |
302 "movq %1, %%mm0 \n\t" | 302 "movq %1, %%mm0 \n\t" |
303 "movq 8%1, %%mm2 \n\t" | 303 "movq 8%1, %%mm2 \n\t" |
304 "movq %%mm0, %%mm1 \n\t" | 304 "movq %%mm0, %%mm1 \n\t" |
305 "movq %%mm2, %%mm3 \n\t" | 305 "movq %%mm2, %%mm3 \n\t" |
317 :"m"(*s) | 317 :"m"(*s) |
318 ); | 318 ); |
319 d+=16; | 319 d+=16; |
320 s+=16; | 320 s+=16; |
321 } | 321 } |
322 __asm __volatile(SFENCE:::"memory"); | 322 asm volatile(SFENCE:::"memory"); |
323 __asm __volatile(EMMS:::"memory"); | 323 asm volatile(EMMS:::"memory"); |
324 #endif | 324 #endif |
325 mm_end = end - 3; | 325 mm_end = end - 3; |
326 while (s < mm_end) | 326 while (s < mm_end) |
327 { | 327 { |
328 register uint32_t x= *((uint32_t *)s); | 328 register uint32_t x= *((uint32_t *)s); |
384 " jb 1b \n\t" | 384 " jb 1b \n\t" |
385 : "+r" (d), "+r"(s) | 385 : "+r" (d), "+r"(s) |
386 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) | 386 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) |
387 ); | 387 ); |
388 #else | 388 #else |
389 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | 389 asm volatile(PREFETCH" %0"::"m"(*src):"memory"); |
390 __asm __volatile( | 390 asm volatile( |
391 "movq %0, %%mm7 \n\t" | 391 "movq %0, %%mm7 \n\t" |
392 "movq %1, %%mm6 \n\t" | 392 "movq %1, %%mm6 \n\t" |
393 ::"m"(red_16mask),"m"(green_16mask)); | 393 ::"m"(red_16mask),"m"(green_16mask)); |
394 while (s < mm_end) | 394 while (s < mm_end) |
395 { | 395 { |
396 __asm __volatile( | 396 asm volatile( |
397 PREFETCH" 32%1 \n\t" | 397 PREFETCH" 32%1 \n\t" |
398 "movd %1, %%mm0 \n\t" | 398 "movd %1, %%mm0 \n\t" |
399 "movd 4%1, %%mm3 \n\t" | 399 "movd 4%1, %%mm3 \n\t" |
400 "punpckldq 8%1, %%mm0 \n\t" | 400 "punpckldq 8%1, %%mm0 \n\t" |
401 "punpckldq 12%1, %%mm3 \n\t" | 401 "punpckldq 12%1, %%mm3 \n\t" |
425 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | 425 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
426 d += 4; | 426 d += 4; |
427 s += 16; | 427 s += 16; |
428 } | 428 } |
429 #endif | 429 #endif |
430 __asm __volatile(SFENCE:::"memory"); | 430 asm volatile(SFENCE:::"memory"); |
431 __asm __volatile(EMMS:::"memory"); | 431 asm volatile(EMMS:::"memory"); |
432 #endif | 432 #endif |
433 while (s < end) | 433 while (s < end) |
434 { | 434 { |
435 register int rgb = *(uint32_t*)s; s += 4; | 435 register int rgb = *(uint32_t*)s; s += 4; |
436 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); | 436 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); |
445 const uint8_t *mm_end; | 445 const uint8_t *mm_end; |
446 #endif | 446 #endif |
447 uint16_t *d = (uint16_t *)dst; | 447 uint16_t *d = (uint16_t *)dst; |
448 end = s + src_size; | 448 end = s + src_size; |
449 #ifdef HAVE_MMX | 449 #ifdef HAVE_MMX |
450 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | 450 asm volatile(PREFETCH" %0"::"m"(*src):"memory"); |
451 __asm __volatile( | 451 asm volatile( |
452 "movq %0, %%mm7 \n\t" | 452 "movq %0, %%mm7 \n\t" |
453 "movq %1, %%mm6 \n\t" | 453 "movq %1, %%mm6 \n\t" |
454 ::"m"(red_16mask),"m"(green_16mask)); | 454 ::"m"(red_16mask),"m"(green_16mask)); |
455 mm_end = end - 15; | 455 mm_end = end - 15; |
456 while (s < mm_end) | 456 while (s < mm_end) |
457 { | 457 { |
458 __asm __volatile( | 458 asm volatile( |
459 PREFETCH" 32%1 \n\t" | 459 PREFETCH" 32%1 \n\t" |
460 "movd %1, %%mm0 \n\t" | 460 "movd %1, %%mm0 \n\t" |
461 "movd 4%1, %%mm3 \n\t" | 461 "movd 4%1, %%mm3 \n\t" |
462 "punpckldq 8%1, %%mm0 \n\t" | 462 "punpckldq 8%1, %%mm0 \n\t" |
463 "punpckldq 12%1, %%mm3 \n\t" | 463 "punpckldq 12%1, %%mm3 \n\t" |
486 MOVNTQ" %%mm0, %0 \n\t" | 486 MOVNTQ" %%mm0, %0 \n\t" |
487 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | 487 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
488 d += 4; | 488 d += 4; |
489 s += 16; | 489 s += 16; |
490 } | 490 } |
491 __asm __volatile(SFENCE:::"memory"); | 491 asm volatile(SFENCE:::"memory"); |
492 __asm __volatile(EMMS:::"memory"); | 492 asm volatile(EMMS:::"memory"); |
493 #endif | 493 #endif |
494 while (s < end) | 494 while (s < end) |
495 { | 495 { |
496 register int rgb = *(uint32_t*)s; s += 4; | 496 register int rgb = *(uint32_t*)s; s += 4; |
497 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); | 497 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); |
543 " jb 1b \n\t" | 543 " jb 1b \n\t" |
544 : "+r" (d), "+r"(s) | 544 : "+r" (d), "+r"(s) |
545 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) | 545 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) |
546 ); | 546 ); |
547 #else | 547 #else |
548 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | 548 asm volatile(PREFETCH" %0"::"m"(*src):"memory"); |
549 __asm __volatile( | 549 asm volatile( |
550 "movq %0, %%mm7 \n\t" | 550 "movq %0, %%mm7 \n\t" |
551 "movq %1, %%mm6 \n\t" | 551 "movq %1, %%mm6 \n\t" |
552 ::"m"(red_15mask),"m"(green_15mask)); | 552 ::"m"(red_15mask),"m"(green_15mask)); |
553 while (s < mm_end) | 553 while (s < mm_end) |
554 { | 554 { |
555 __asm __volatile( | 555 asm volatile( |
556 PREFETCH" 32%1 \n\t" | 556 PREFETCH" 32%1 \n\t" |
557 "movd %1, %%mm0 \n\t" | 557 "movd %1, %%mm0 \n\t" |
558 "movd 4%1, %%mm3 \n\t" | 558 "movd 4%1, %%mm3 \n\t" |
559 "punpckldq 8%1, %%mm0 \n\t" | 559 "punpckldq 8%1, %%mm0 \n\t" |
560 "punpckldq 12%1, %%mm3 \n\t" | 560 "punpckldq 12%1, %%mm3 \n\t" |
584 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | 584 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
585 d += 4; | 585 d += 4; |
586 s += 16; | 586 s += 16; |
587 } | 587 } |
588 #endif | 588 #endif |
589 __asm __volatile(SFENCE:::"memory"); | 589 asm volatile(SFENCE:::"memory"); |
590 __asm __volatile(EMMS:::"memory"); | 590 asm volatile(EMMS:::"memory"); |
591 #endif | 591 #endif |
592 while (s < end) | 592 while (s < end) |
593 { | 593 { |
594 register int rgb = *(uint32_t*)s; s += 4; | 594 register int rgb = *(uint32_t*)s; s += 4; |
595 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); | 595 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); |
604 const uint8_t *mm_end; | 604 const uint8_t *mm_end; |
605 #endif | 605 #endif |
606 uint16_t *d = (uint16_t *)dst; | 606 uint16_t *d = (uint16_t *)dst; |
607 end = s + src_size; | 607 end = s + src_size; |
608 #ifdef HAVE_MMX | 608 #ifdef HAVE_MMX |
609 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | 609 asm volatile(PREFETCH" %0"::"m"(*src):"memory"); |
610 __asm __volatile( | 610 asm volatile( |
611 "movq %0, %%mm7 \n\t" | 611 "movq %0, %%mm7 \n\t" |
612 "movq %1, %%mm6 \n\t" | 612 "movq %1, %%mm6 \n\t" |
613 ::"m"(red_15mask),"m"(green_15mask)); | 613 ::"m"(red_15mask),"m"(green_15mask)); |
614 mm_end = end - 15; | 614 mm_end = end - 15; |
615 while (s < mm_end) | 615 while (s < mm_end) |
616 { | 616 { |
617 __asm __volatile( | 617 asm volatile( |
618 PREFETCH" 32%1 \n\t" | 618 PREFETCH" 32%1 \n\t" |
619 "movd %1, %%mm0 \n\t" | 619 "movd %1, %%mm0 \n\t" |
620 "movd 4%1, %%mm3 \n\t" | 620 "movd 4%1, %%mm3 \n\t" |
621 "punpckldq 8%1, %%mm0 \n\t" | 621 "punpckldq 8%1, %%mm0 \n\t" |
622 "punpckldq 12%1, %%mm3 \n\t" | 622 "punpckldq 12%1, %%mm3 \n\t" |
645 MOVNTQ" %%mm0, %0 \n\t" | 645 MOVNTQ" %%mm0, %0 \n\t" |
646 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | 646 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
647 d += 4; | 647 d += 4; |
648 s += 16; | 648 s += 16; |
649 } | 649 } |
650 __asm __volatile(SFENCE:::"memory"); | 650 asm volatile(SFENCE:::"memory"); |
651 __asm __volatile(EMMS:::"memory"); | 651 asm volatile(EMMS:::"memory"); |
652 #endif | 652 #endif |
653 while (s < end) | 653 while (s < end) |
654 { | 654 { |
655 register int rgb = *(uint32_t*)s; s += 4; | 655 register int rgb = *(uint32_t*)s; s += 4; |
656 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); | 656 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); |
665 const uint8_t *mm_end; | 665 const uint8_t *mm_end; |
666 #endif | 666 #endif |
667 uint16_t *d = (uint16_t *)dst; | 667 uint16_t *d = (uint16_t *)dst; |
668 end = s + src_size; | 668 end = s + src_size; |
669 #ifdef HAVE_MMX | 669 #ifdef HAVE_MMX |
670 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | 670 asm volatile(PREFETCH" %0"::"m"(*src):"memory"); |
671 __asm __volatile( | 671 asm volatile( |
672 "movq %0, %%mm7 \n\t" | 672 "movq %0, %%mm7 \n\t" |
673 "movq %1, %%mm6 \n\t" | 673 "movq %1, %%mm6 \n\t" |
674 ::"m"(red_16mask),"m"(green_16mask)); | 674 ::"m"(red_16mask),"m"(green_16mask)); |
675 mm_end = end - 11; | 675 mm_end = end - 11; |
676 while (s < mm_end) | 676 while (s < mm_end) |
677 { | 677 { |
678 __asm __volatile( | 678 asm volatile( |
679 PREFETCH" 32%1 \n\t" | 679 PREFETCH" 32%1 \n\t" |
680 "movd %1, %%mm0 \n\t" | 680 "movd %1, %%mm0 \n\t" |
681 "movd 3%1, %%mm3 \n\t" | 681 "movd 3%1, %%mm3 \n\t" |
682 "punpckldq 6%1, %%mm0 \n\t" | 682 "punpckldq 6%1, %%mm0 \n\t" |
683 "punpckldq 9%1, %%mm3 \n\t" | 683 "punpckldq 9%1, %%mm3 \n\t" |
706 MOVNTQ" %%mm0, %0 \n\t" | 706 MOVNTQ" %%mm0, %0 \n\t" |
707 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | 707 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
708 d += 4; | 708 d += 4; |
709 s += 12; | 709 s += 12; |
710 } | 710 } |
711 __asm __volatile(SFENCE:::"memory"); | 711 asm volatile(SFENCE:::"memory"); |
712 __asm __volatile(EMMS:::"memory"); | 712 asm volatile(EMMS:::"memory"); |
713 #endif | 713 #endif |
714 while (s < end) | 714 while (s < end) |
715 { | 715 { |
716 const int b = *s++; | 716 const int b = *s++; |
717 const int g = *s++; | 717 const int g = *s++; |
728 const uint8_t *mm_end; | 728 const uint8_t *mm_end; |
729 #endif | 729 #endif |
730 uint16_t *d = (uint16_t *)dst; | 730 uint16_t *d = (uint16_t *)dst; |
731 end = s + src_size; | 731 end = s + src_size; |
732 #ifdef HAVE_MMX | 732 #ifdef HAVE_MMX |
733 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | 733 asm volatile(PREFETCH" %0"::"m"(*src):"memory"); |
734 __asm __volatile( | 734 asm volatile( |
735 "movq %0, %%mm7 \n\t" | 735 "movq %0, %%mm7 \n\t" |
736 "movq %1, %%mm6 \n\t" | 736 "movq %1, %%mm6 \n\t" |
737 ::"m"(red_16mask),"m"(green_16mask)); | 737 ::"m"(red_16mask),"m"(green_16mask)); |
738 mm_end = end - 15; | 738 mm_end = end - 15; |
739 while (s < mm_end) | 739 while (s < mm_end) |
740 { | 740 { |
741 __asm __volatile( | 741 asm volatile( |
742 PREFETCH" 32%1 \n\t" | 742 PREFETCH" 32%1 \n\t" |
743 "movd %1, %%mm0 \n\t" | 743 "movd %1, %%mm0 \n\t" |
744 "movd 3%1, %%mm3 \n\t" | 744 "movd 3%1, %%mm3 \n\t" |
745 "punpckldq 6%1, %%mm0 \n\t" | 745 "punpckldq 6%1, %%mm0 \n\t" |
746 "punpckldq 9%1, %%mm3 \n\t" | 746 "punpckldq 9%1, %%mm3 \n\t" |
769 MOVNTQ" %%mm0, %0 \n\t" | 769 MOVNTQ" %%mm0, %0 \n\t" |
770 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | 770 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
771 d += 4; | 771 d += 4; |
772 s += 12; | 772 s += 12; |
773 } | 773 } |
774 __asm __volatile(SFENCE:::"memory"); | 774 asm volatile(SFENCE:::"memory"); |
775 __asm __volatile(EMMS:::"memory"); | 775 asm volatile(EMMS:::"memory"); |
776 #endif | 776 #endif |
777 while (s < end) | 777 while (s < end) |
778 { | 778 { |
779 const int r = *s++; | 779 const int r = *s++; |
780 const int g = *s++; | 780 const int g = *s++; |
791 const uint8_t *mm_end; | 791 const uint8_t *mm_end; |
792 #endif | 792 #endif |
793 uint16_t *d = (uint16_t *)dst; | 793 uint16_t *d = (uint16_t *)dst; |
794 end = s + src_size; | 794 end = s + src_size; |
795 #ifdef HAVE_MMX | 795 #ifdef HAVE_MMX |
796 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | 796 asm volatile(PREFETCH" %0"::"m"(*src):"memory"); |
797 __asm __volatile( | 797 asm volatile( |
798 "movq %0, %%mm7 \n\t" | 798 "movq %0, %%mm7 \n\t" |
799 "movq %1, %%mm6 \n\t" | 799 "movq %1, %%mm6 \n\t" |
800 ::"m"(red_15mask),"m"(green_15mask)); | 800 ::"m"(red_15mask),"m"(green_15mask)); |
801 mm_end = end - 11; | 801 mm_end = end - 11; |
802 while (s < mm_end) | 802 while (s < mm_end) |
803 { | 803 { |
804 __asm __volatile( | 804 asm volatile( |
805 PREFETCH" 32%1 \n\t" | 805 PREFETCH" 32%1 \n\t" |
806 "movd %1, %%mm0 \n\t" | 806 "movd %1, %%mm0 \n\t" |
807 "movd 3%1, %%mm3 \n\t" | 807 "movd 3%1, %%mm3 \n\t" |
808 "punpckldq 6%1, %%mm0 \n\t" | 808 "punpckldq 6%1, %%mm0 \n\t" |
809 "punpckldq 9%1, %%mm3 \n\t" | 809 "punpckldq 9%1, %%mm3 \n\t" |
832 MOVNTQ" %%mm0, %0 \n\t" | 832 MOVNTQ" %%mm0, %0 \n\t" |
833 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | 833 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
834 d += 4; | 834 d += 4; |
835 s += 12; | 835 s += 12; |
836 } | 836 } |
837 __asm __volatile(SFENCE:::"memory"); | 837 asm volatile(SFENCE:::"memory"); |
838 __asm __volatile(EMMS:::"memory"); | 838 asm volatile(EMMS:::"memory"); |
839 #endif | 839 #endif |
840 while (s < end) | 840 while (s < end) |
841 { | 841 { |
842 const int b = *s++; | 842 const int b = *s++; |
843 const int g = *s++; | 843 const int g = *s++; |
854 const uint8_t *mm_end; | 854 const uint8_t *mm_end; |
855 #endif | 855 #endif |
856 uint16_t *d = (uint16_t *)dst; | 856 uint16_t *d = (uint16_t *)dst; |
857 end = s + src_size; | 857 end = s + src_size; |
858 #ifdef HAVE_MMX | 858 #ifdef HAVE_MMX |
859 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | 859 asm volatile(PREFETCH" %0"::"m"(*src):"memory"); |
860 __asm __volatile( | 860 asm volatile( |
861 "movq %0, %%mm7 \n\t" | 861 "movq %0, %%mm7 \n\t" |
862 "movq %1, %%mm6 \n\t" | 862 "movq %1, %%mm6 \n\t" |
863 ::"m"(red_15mask),"m"(green_15mask)); | 863 ::"m"(red_15mask),"m"(green_15mask)); |
864 mm_end = end - 15; | 864 mm_end = end - 15; |
865 while (s < mm_end) | 865 while (s < mm_end) |
866 { | 866 { |
867 __asm __volatile( | 867 asm volatile( |
868 PREFETCH" 32%1 \n\t" | 868 PREFETCH" 32%1 \n\t" |
869 "movd %1, %%mm0 \n\t" | 869 "movd %1, %%mm0 \n\t" |
870 "movd 3%1, %%mm3 \n\t" | 870 "movd 3%1, %%mm3 \n\t" |
871 "punpckldq 6%1, %%mm0 \n\t" | 871 "punpckldq 6%1, %%mm0 \n\t" |
872 "punpckldq 9%1, %%mm3 \n\t" | 872 "punpckldq 9%1, %%mm3 \n\t" |
895 MOVNTQ" %%mm0, %0 \n\t" | 895 MOVNTQ" %%mm0, %0 \n\t" |
896 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | 896 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
897 d += 4; | 897 d += 4; |
898 s += 12; | 898 s += 12; |
899 } | 899 } |
900 __asm __volatile(SFENCE:::"memory"); | 900 asm volatile(SFENCE:::"memory"); |
901 __asm __volatile(EMMS:::"memory"); | 901 asm volatile(EMMS:::"memory"); |
902 #endif | 902 #endif |
903 while (s < end) | 903 while (s < end) |
904 { | 904 { |
905 const int r = *s++; | 905 const int r = *s++; |
906 const int g = *s++; | 906 const int g = *s++; |
938 #endif | 938 #endif |
939 uint8_t *d = (uint8_t *)dst; | 939 uint8_t *d = (uint8_t *)dst; |
940 const uint16_t *s = (uint16_t *)src; | 940 const uint16_t *s = (uint16_t *)src; |
941 end = s + src_size/2; | 941 end = s + src_size/2; |
942 #ifdef HAVE_MMX | 942 #ifdef HAVE_MMX |
943 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); | 943 asm volatile(PREFETCH" %0"::"m"(*s):"memory"); |
944 mm_end = end - 7; | 944 mm_end = end - 7; |
945 while (s < mm_end) | 945 while (s < mm_end) |
946 { | 946 { |
947 __asm __volatile( | 947 asm volatile( |
948 PREFETCH" 32%1 \n\t" | 948 PREFETCH" 32%1 \n\t" |
949 "movq %1, %%mm0 \n\t" | 949 "movq %1, %%mm0 \n\t" |
950 "movq %1, %%mm1 \n\t" | 950 "movq %1, %%mm1 \n\t" |
951 "movq %1, %%mm2 \n\t" | 951 "movq %1, %%mm2 \n\t" |
952 "pand %2, %%mm0 \n\t" | 952 "pand %2, %%mm0 \n\t" |
1005 | 1005 |
1006 :"=m"(*d) | 1006 :"=m"(*d) |
1007 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) | 1007 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) |
1008 :"memory"); | 1008 :"memory"); |
1009 /* Borrowed 32 to 24 */ | 1009 /* Borrowed 32 to 24 */ |
1010 __asm __volatile( | 1010 asm volatile( |
1011 "movq %%mm0, %%mm4 \n\t" | 1011 "movq %%mm0, %%mm4 \n\t" |
1012 "movq %%mm3, %%mm5 \n\t" | 1012 "movq %%mm3, %%mm5 \n\t" |
1013 "movq %%mm6, %%mm0 \n\t" | 1013 "movq %%mm6, %%mm0 \n\t" |
1014 "movq %%mm7, %%mm1 \n\t" | 1014 "movq %%mm7, %%mm1 \n\t" |
1015 | 1015 |
1057 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) | 1057 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) |
1058 :"memory"); | 1058 :"memory"); |
1059 d += 24; | 1059 d += 24; |
1060 s += 8; | 1060 s += 8; |
1061 } | 1061 } |
1062 __asm __volatile(SFENCE:::"memory"); | 1062 asm volatile(SFENCE:::"memory"); |
1063 __asm __volatile(EMMS:::"memory"); | 1063 asm volatile(EMMS:::"memory"); |
1064 #endif | 1064 #endif |
1065 while (s < end) | 1065 while (s < end) |
1066 { | 1066 { |
1067 register uint16_t bgr; | 1067 register uint16_t bgr; |
1068 bgr = *s++; | 1068 bgr = *s++; |
1080 #endif | 1080 #endif |
1081 uint8_t *d = (uint8_t *)dst; | 1081 uint8_t *d = (uint8_t *)dst; |
1082 const uint16_t *s = (const uint16_t *)src; | 1082 const uint16_t *s = (const uint16_t *)src; |
1083 end = s + src_size/2; | 1083 end = s + src_size/2; |
1084 #ifdef HAVE_MMX | 1084 #ifdef HAVE_MMX |
1085 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); | 1085 asm volatile(PREFETCH" %0"::"m"(*s):"memory"); |
1086 mm_end = end - 7; | 1086 mm_end = end - 7; |
1087 while (s < mm_end) | 1087 while (s < mm_end) |
1088 { | 1088 { |
1089 __asm __volatile( | 1089 asm volatile( |
1090 PREFETCH" 32%1 \n\t" | 1090 PREFETCH" 32%1 \n\t" |
1091 "movq %1, %%mm0 \n\t" | 1091 "movq %1, %%mm0 \n\t" |
1092 "movq %1, %%mm1 \n\t" | 1092 "movq %1, %%mm1 \n\t" |
1093 "movq %1, %%mm2 \n\t" | 1093 "movq %1, %%mm2 \n\t" |
1094 "pand %2, %%mm0 \n\t" | 1094 "pand %2, %%mm0 \n\t" |
1146 "por %%mm5, %%mm3 \n\t" | 1146 "por %%mm5, %%mm3 \n\t" |
1147 :"=m"(*d) | 1147 :"=m"(*d) |
1148 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) | 1148 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) |
1149 :"memory"); | 1149 :"memory"); |
1150 /* Borrowed 32 to 24 */ | 1150 /* Borrowed 32 to 24 */ |
1151 __asm __volatile( | 1151 asm volatile( |
1152 "movq %%mm0, %%mm4 \n\t" | 1152 "movq %%mm0, %%mm4 \n\t" |
1153 "movq %%mm3, %%mm5 \n\t" | 1153 "movq %%mm3, %%mm5 \n\t" |
1154 "movq %%mm6, %%mm0 \n\t" | 1154 "movq %%mm6, %%mm0 \n\t" |
1155 "movq %%mm7, %%mm1 \n\t" | 1155 "movq %%mm7, %%mm1 \n\t" |
1156 | 1156 |
1198 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) | 1198 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) |
1199 :"memory"); | 1199 :"memory"); |
1200 d += 24; | 1200 d += 24; |
1201 s += 8; | 1201 s += 8; |
1202 } | 1202 } |
1203 __asm __volatile(SFENCE:::"memory"); | 1203 asm volatile(SFENCE:::"memory"); |
1204 __asm __volatile(EMMS:::"memory"); | 1204 asm volatile(EMMS:::"memory"); |
1205 #endif | 1205 #endif |
1206 while (s < end) | 1206 while (s < end) |
1207 { | 1207 { |
1208 register uint16_t bgr; | 1208 register uint16_t bgr; |
1209 bgr = *s++; | 1209 bgr = *s++; |
1221 #endif | 1221 #endif |
1222 uint8_t *d = (uint8_t *)dst; | 1222 uint8_t *d = (uint8_t *)dst; |
1223 const uint16_t *s = (const uint16_t *)src; | 1223 const uint16_t *s = (const uint16_t *)src; |
1224 end = s + src_size/2; | 1224 end = s + src_size/2; |
1225 #ifdef HAVE_MMX | 1225 #ifdef HAVE_MMX |
1226 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); | 1226 asm volatile(PREFETCH" %0"::"m"(*s):"memory"); |
1227 __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); | 1227 asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); |
1228 mm_end = end - 3; | 1228 mm_end = end - 3; |
1229 while (s < mm_end) | 1229 while (s < mm_end) |
1230 { | 1230 { |
1231 __asm __volatile( | 1231 asm volatile( |
1232 PREFETCH" 32%1 \n\t" | 1232 PREFETCH" 32%1 \n\t" |
1233 "movq %1, %%mm0 \n\t" | 1233 "movq %1, %%mm0 \n\t" |
1234 "movq %1, %%mm1 \n\t" | 1234 "movq %1, %%mm1 \n\t" |
1235 "movq %1, %%mm2 \n\t" | 1235 "movq %1, %%mm2 \n\t" |
1236 "pand %2, %%mm0 \n\t" | 1236 "pand %2, %%mm0 \n\t" |
1262 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) | 1262 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) |
1263 :"memory"); | 1263 :"memory"); |
1264 d += 16; | 1264 d += 16; |
1265 s += 4; | 1265 s += 4; |
1266 } | 1266 } |
1267 __asm __volatile(SFENCE:::"memory"); | 1267 asm volatile(SFENCE:::"memory"); |
1268 __asm __volatile(EMMS:::"memory"); | 1268 asm volatile(EMMS:::"memory"); |
1269 #endif | 1269 #endif |
1270 while (s < end) | 1270 while (s < end) |
1271 { | 1271 { |
1272 #if 0 //slightly slower on Athlon | 1272 #if 0 //slightly slower on Athlon |
1273 int bgr= *s++; | 1273 int bgr= *s++; |
1299 #endif | 1299 #endif |
1300 uint8_t *d = (uint8_t *)dst; | 1300 uint8_t *d = (uint8_t *)dst; |
1301 const uint16_t *s = (uint16_t *)src; | 1301 const uint16_t *s = (uint16_t *)src; |
1302 end = s + src_size/2; | 1302 end = s + src_size/2; |
1303 #ifdef HAVE_MMX | 1303 #ifdef HAVE_MMX |
1304 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); | 1304 asm volatile(PREFETCH" %0"::"m"(*s):"memory"); |
1305 __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); | 1305 asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); |
1306 mm_end = end - 3; | 1306 mm_end = end - 3; |
1307 while (s < mm_end) | 1307 while (s < mm_end) |
1308 { | 1308 { |
1309 __asm __volatile( | 1309 asm volatile( |
1310 PREFETCH" 32%1 \n\t" | 1310 PREFETCH" 32%1 \n\t" |
1311 "movq %1, %%mm0 \n\t" | 1311 "movq %1, %%mm0 \n\t" |
1312 "movq %1, %%mm1 \n\t" | 1312 "movq %1, %%mm1 \n\t" |
1313 "movq %1, %%mm2 \n\t" | 1313 "movq %1, %%mm2 \n\t" |
1314 "pand %2, %%mm0 \n\t" | 1314 "pand %2, %%mm0 \n\t" |
1340 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) | 1340 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) |
1341 :"memory"); | 1341 :"memory"); |
1342 d += 16; | 1342 d += 16; |
1343 s += 4; | 1343 s += 4; |
1344 } | 1344 } |
1345 __asm __volatile(SFENCE:::"memory"); | 1345 asm volatile(SFENCE:::"memory"); |
1346 __asm __volatile(EMMS:::"memory"); | 1346 asm volatile(EMMS:::"memory"); |
1347 #endif | 1347 #endif |
1348 while (s < end) | 1348 while (s < end) |
1349 { | 1349 { |
1350 register uint16_t bgr; | 1350 register uint16_t bgr; |
1351 bgr = *s++; | 1351 bgr = *s++; |
1366 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) | 1366 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) |
1367 { | 1367 { |
1368 long idx = 15 - src_size; | 1368 long idx = 15 - src_size; |
1369 uint8_t *s = (uint8_t *) src-idx, *d = dst-idx; | 1369 uint8_t *s = (uint8_t *) src-idx, *d = dst-idx; |
1370 #ifdef HAVE_MMX | 1370 #ifdef HAVE_MMX |
1371 __asm __volatile( | 1371 asm volatile( |
1372 "test %0, %0 \n\t" | 1372 "test %0, %0 \n\t" |
1373 "jns 2f \n\t" | 1373 "jns 2f \n\t" |
1374 PREFETCH" (%1, %0) \n\t" | 1374 PREFETCH" (%1, %0) \n\t" |
1375 "movq %3, %%mm7 \n\t" | 1375 "movq %3, %%mm7 \n\t" |
1376 "pxor %4, %%mm7 \n\t" | 1376 "pxor %4, %%mm7 \n\t" |
1473 "2: \n\t" | 1473 "2: \n\t" |
1474 : "+a" (mmx_size) | 1474 : "+a" (mmx_size) |
1475 : "r" (src-mmx_size), "r"(dst-mmx_size) | 1475 : "r" (src-mmx_size), "r"(dst-mmx_size) |
1476 ); | 1476 ); |
1477 | 1477 |
1478 __asm __volatile(SFENCE:::"memory"); | 1478 asm volatile(SFENCE:::"memory"); |
1479 __asm __volatile(EMMS:::"memory"); | 1479 asm volatile(EMMS:::"memory"); |
1480 | 1480 |
1481 if (mmx_size==23) return; //finished, was multiple of 8 | 1481 if (mmx_size==23) return; //finished, was multiple of 8 |
1482 | 1482 |
1483 src+= src_size; | 1483 src+= src_size; |
1484 dst+= src_size; | 1484 dst+= src_size; |
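
Every conversion routine touched above follows the same skeleton, and only the spelling of its asm statements changes: prefetch the source, run an MMX block loop bounded by `mm_end`, flush with SFENCE and EMMS, then finish the remaining bytes with a plain C tail loop. The sketch below restates that skeleton outside MPlayer's macros; the raw `prefetchnta`/`movntq`/`sfence`/`emms` spellings, the 16-byte block size, and the trivial copy body are illustrative assumptions rather than code taken from the file.

```c
#include <stddef.h>
#include <stdint.h>

/* Hedged sketch of the loop structure shared by the routines in this
 * changeset, not code from rgb2rgb_template.c.  The per-block work is
 * reduced to a plain copy so the control flow stays visible. */
static void copy_mmx_skeleton(const uint8_t *src, uint8_t *dst, size_t size)
{
    const uint8_t *s = src;
    const uint8_t *end = s + size;
    uint8_t *d = dst;

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    /* As in the original, the MMX path assumes at least one full block. */
    const uint8_t *mm_end = end - 15;
    asm volatile("prefetchnta %0" :: "m"(*s) : "memory");
    while (s < mm_end)
    {
        asm volatile(
            "movq    (%1), %%mm0  \n\t"   /* load 16 source bytes          */
            "movq   8(%1), %%mm1  \n\t"
            "movntq %%mm0,  (%0)  \n\t"   /* store them non-temporally,    */
            "movntq %%mm1, 8(%0)  \n\t"   /* bypassing the cache           */
            :: "r"(d), "r"(s) : "memory");
        d += 16;
        s += 16;
    }
    asm volatile("sfence" ::: "memory");  /* order the non-temporal stores */
    asm volatile("emms"   ::: "memory");  /* release MMX regs back to x87  */
#endif
    while (s < end)                       /* scalar tail for leftover bytes */
        *d++ = *s++;
}
```

The `end - 15` bound mirrors the file's own convention of stopping the vector loop one block early and letting the scalar `while (s < end)` loop pick up whatever is left.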