comparison postproc/rgb2rgb_template.c @ 9454:50ef22bcc0c3

optimize
author michael
date Tue, 18 Feb 2003 19:22:34 +0000
parents 4cd88c2a44bf
children 988c2ffc5bc1
comparing 9453:41cd0f009115 with 9454:50ef22bcc0c3
@@ -316,16 +316,50 @@
 	const uint8_t *mm_end;
 #endif
 	uint16_t *d = (uint16_t *)dst;
 	end = s + src_size;
 #ifdef HAVE_MMX
+	mm_end = end - 15;
+#if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this wins; on Athlon it is slightly faster)
+	asm volatile(
+		"movq %3, %%mm5			\n\t"
+		"movq %4, %%mm6			\n\t"
+		"movq %5, %%mm7			\n\t"
+		".balign 16			\n\t"
+		"1:				\n\t"
+		PREFETCH" 32(%1)		\n\t"
+		"movd	(%1), %%mm0		\n\t"
+		"movd	4(%1), %%mm3		\n\t"
+		"punpckldq 8(%1), %%mm0		\n\t"
+		"punpckldq 12(%1), %%mm3	\n\t"
+		"movq %%mm0, %%mm1		\n\t"
+		"movq %%mm3, %%mm4		\n\t"
+		"pand %%mm6, %%mm0		\n\t"
+		"pand %%mm6, %%mm3		\n\t"
+		"pmaddwd %%mm7, %%mm0		\n\t"
+		"pmaddwd %%mm7, %%mm3		\n\t"
+		"pand %%mm5, %%mm1		\n\t"
+		"pand %%mm5, %%mm4		\n\t"
+		"por %%mm1, %%mm0		\n\t"
+		"por %%mm4, %%mm3		\n\t"
+		"psrld $5, %%mm0		\n\t"
+		"pslld $11, %%mm3		\n\t"
+		"por %%mm3, %%mm0		\n\t"
+		MOVNTQ"	%%mm0, (%0)		\n\t"
+		"addl $16, %1			\n\t"
+		"addl $8, %0			\n\t"
+		"cmpl %2, %1			\n\t"
+		" jb 1b				\n\t"
+		: "+r" (d), "+r" (s)
+		: "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
+	);
+#else
 	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
 	__asm __volatile(
 		"movq %0, %%mm7\n\t"
 		"movq %1, %%mm6\n\t"
 		::"m"(red_16mask),"m"(green_16mask));
-	mm_end = end - 15;
 	while(s < mm_end)
 	{
 		__asm __volatile(
 			PREFETCH" 32%1\n\t"
 			"movd	%1, %%mm0\n\t"
@@ -357,10 +391,11 @@
 			MOVNTQ"	%%mm0, %0\n\t"
 			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
 		d += 4;
 		s += 16;
 	}
+#endif
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
 	while(s < end)
 	{
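
The added fast path converts four RGB32 pixels to RGB565 per iteration: pmaddwd scales the masked blue and red words into their target bit positions with a single multiply-add, green is merged in by mask, and one final shift aligns each 16-bit result. Below is a minimal scalar sketch of the same per-pixel math, assuming little-endian BGRA input (B in bits 0-7, G in 8-15, R in 16-23); the function name is hypothetical, and the literal masks and the {4, 0x2000} word pair stand in for what mask3216br, mask3216g and mul3216 presumably hold:

#include <stdint.h>
#include <stddef.h>

/* Scalar sketch of the pmaddwd-based RGB32 -> RGB565 packing above.
 * Assumes little-endian BGRA pixels; the function name is hypothetical. */
static void rgb32to16_ref(const uint8_t *src, uint8_t *dst, size_t src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;

	while (s < end) {
		const uint32_t px = *(const uint32_t *)s;
		const uint32_t br = px & 0x00F800F8;  /* top 5 bits of B and R */
		const uint32_t g  = px & 0x0000FC00;  /* top 6 bits of G */
		/* pmaddwd equivalent: B*4 lands in bits 5-9, R*0x2000 in 16-20 */
		const uint32_t mix = (br & 0xFFFF) * 4 + (br >> 16) * 0x2000;
		*d++ = (uint16_t)((mix | g) >> 5);    /* RRRRRGGGGGGBBBBB */
		s += 4;
	}
}

The point of the trick is that one pmaddwd replaces the shift/mask/or sequence that the old loop needed per register; as the FIXME notes, this only pays off where the multiply is cheap.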
@@ -439,16 +474,50 @@
 	const uint8_t *mm_end;
 #endif
 	uint16_t *d = (uint16_t *)dst;
 	end = s + src_size;
 #ifdef HAVE_MMX
+	mm_end = end - 15;
+#if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this wins; on Athlon it is slightly faster)
+	asm volatile(
+		"movq %3, %%mm5			\n\t"
+		"movq %4, %%mm6			\n\t"
+		"movq %5, %%mm7			\n\t"
+		".balign 16			\n\t"
+		"1:				\n\t"
+		PREFETCH" 32(%1)		\n\t"
+		"movd	(%1), %%mm0		\n\t"
+		"movd	4(%1), %%mm3		\n\t"
+		"punpckldq 8(%1), %%mm0		\n\t"
+		"punpckldq 12(%1), %%mm3	\n\t"
+		"movq %%mm0, %%mm1		\n\t"
+		"movq %%mm3, %%mm4		\n\t"
+		"pand %%mm6, %%mm0		\n\t"
+		"pand %%mm6, %%mm3		\n\t"
+		"pmaddwd %%mm7, %%mm0		\n\t"
+		"pmaddwd %%mm7, %%mm3		\n\t"
+		"pand %%mm5, %%mm1		\n\t"
+		"pand %%mm5, %%mm4		\n\t"
+		"por %%mm1, %%mm0		\n\t"
+		"por %%mm4, %%mm3		\n\t"
+		"psrld $6, %%mm0		\n\t"
+		"pslld $10, %%mm3		\n\t"
+		"por %%mm3, %%mm0		\n\t"
+		MOVNTQ"	%%mm0, (%0)		\n\t"
+		"addl $16, %1			\n\t"
+		"addl $8, %0			\n\t"
+		"cmpl %2, %1			\n\t"
+		" jb 1b				\n\t"
+		: "+r" (d), "+r" (s)
+		: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
+	);
+#else
 	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
 	__asm __volatile(
 		"movq %0, %%mm7\n\t"
 		"movq %1, %%mm6\n\t"
 		::"m"(red_15mask),"m"(green_15mask));
-	mm_end = end - 15;
 	while(s < mm_end)
 	{
 		__asm __volatile(
 			PREFETCH" 32%1\n\t"
 			"movd	%1, %%mm0\n\t"
@@ -480,10 +549,11 @@
 			MOVNTQ"	%%mm0, %0\n\t"
 			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
 		d += 4;
 		s += 16;
 	}
+#endif
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
 	while(s < end)
 	{
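
The 15-bit variant differs only in the green mask (5 bits instead of 6), the blue multiplier, and the final shifts (psrld $6 / pslld $10 instead of $5 / $11); note that mask3216br is shared between both paths, since blue and red keep their top 5 bits in RGB565 and RGB555 alike. The corresponding scalar step, under the same byte-order assumption as the sketch after the first hunk:

#include <stdint.h>

/* RGB32 -> RGB555 counterpart of the sketch above: the {8, 0x2000}
 * word pair presumably matches mul3215, and green keeps 5 bits. */
static inline uint16_t pack_rgb555(uint32_t px)
{
	const uint32_t br = px & 0x00F800F8;  /* top 5 bits of B and R */
	const uint32_t g  = px & 0x0000F800;  /* top 5 bits of G */
	/* B*8 lands in bits 6-10, R*0x2000 in 16-20 */
	const uint32_t mix = (br & 0xFFFF) * 8 + (br >> 16) * 0x2000;
	return (uint16_t)((mix | g) >> 6);    /* 0RRRRRGGGGGBBBBB */
}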