Mercurial > mplayer.hg
comparison postproc/rgb2rgb_template.c @ 9454:50ef22bcc0c3
optimize
author | michael |
---|---|
date | Tue, 18 Feb 2003 19:22:34 +0000 |
parents | 4cd88c2a44bf |
children | 988c2ffc5bc1 |
comparison
equal
deleted
inserted
replaced
9453:41cd0f009115 | 9454:50ef22bcc0c3 |
---|---|
316 const uint8_t *mm_end; | 316 const uint8_t *mm_end; |
317 #endif | 317 #endif |
318 uint16_t *d = (uint16_t *)dst; | 318 uint16_t *d = (uint16_t *)dst; |
319 end = s + src_size; | 319 end = s + src_size; |
320 #ifdef HAVE_MMX | 320 #ifdef HAVE_MMX |
321 mm_end = end - 15; | |
322 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it's slightly faster) | |
323 asm volatile( | |
324 "movq %3, %%mm5 \n\t" | |
325 "movq %4, %%mm6 \n\t" | |
326 "movq %5, %%mm7 \n\t" | |
327 ".balign 16 \n\t" | |
328 "1: \n\t" | |
329 PREFETCH" 32(%1) \n\t" | |
330 "movd (%1), %%mm0 \n\t" | |
331 "movd 4(%1), %%mm3 \n\t" | |
332 "punpckldq 8(%1), %%mm0 \n\t" | |
333 "punpckldq 12(%1), %%mm3 \n\t" | |
334 "movq %%mm0, %%mm1 \n\t" | |
335 "movq %%mm3, %%mm4 \n\t" | |
336 "pand %%mm6, %%mm0 \n\t" | |
337 "pand %%mm6, %%mm3 \n\t" | |
338 "pmaddwd %%mm7, %%mm0 \n\t" | |
339 "pmaddwd %%mm7, %%mm3 \n\t" | |
340 "pand %%mm5, %%mm1 \n\t" | |
341 "pand %%mm5, %%mm4 \n\t" | |
342 "por %%mm1, %%mm0 \n\t" | |
343 "por %%mm4, %%mm3 \n\t" | |
344 "psrld $5, %%mm0 \n\t" | |
345 "pslld $11, %%mm3 \n\t" | |
346 "por %%mm3, %%mm0 \n\t" | |
347 MOVNTQ" %%mm0, (%0) \n\t" | |
348 "addl $16, %1 \n\t" | |
349 "addl $8, %0 \n\t" | |
350 "cmpl %2, %1 \n\t" | |
351 " jb 1b \n\t" | |
352 : "+r" (d), "+r"(s) | |
353 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) | |
354 ); | |
355 #else | |
321 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | 356 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
322 __asm __volatile( | 357 __asm __volatile( |
323 "movq %0, %%mm7\n\t" | 358 "movq %0, %%mm7\n\t" |
324 "movq %1, %%mm6\n\t" | 359 "movq %1, %%mm6\n\t" |
325 ::"m"(red_16mask),"m"(green_16mask)); | 360 ::"m"(red_16mask),"m"(green_16mask)); |
326 mm_end = end - 15; | |
327 while(s < mm_end) | 361 while(s < mm_end) |
328 { | 362 { |
329 __asm __volatile( | 363 __asm __volatile( |
330 PREFETCH" 32%1\n\t" | 364 PREFETCH" 32%1\n\t" |
331 "movd %1, %%mm0\n\t" | 365 "movd %1, %%mm0\n\t" |
357 MOVNTQ" %%mm0, %0\n\t" | 391 MOVNTQ" %%mm0, %0\n\t" |
358 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | 392 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); |
359 d += 4; | 393 d += 4; |
360 s += 16; | 394 s += 16; |
361 } | 395 } |
396 #endif | |
362 __asm __volatile(SFENCE:::"memory"); | 397 __asm __volatile(SFENCE:::"memory"); |
363 __asm __volatile(EMMS:::"memory"); | 398 __asm __volatile(EMMS:::"memory"); |
364 #endif | 399 #endif |
365 while(s < end) | 400 while(s < end) |
366 { | 401 { |
439 const uint8_t *mm_end; | 474 const uint8_t *mm_end; |
440 #endif | 475 #endif |
441 uint16_t *d = (uint16_t *)dst; | 476 uint16_t *d = (uint16_t *)dst; |
442 end = s + src_size; | 477 end = s + src_size; |
443 #ifdef HAVE_MMX | 478 #ifdef HAVE_MMX |
479 mm_end = end - 15; | |
480 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it's slightly faster) | |
481 asm volatile( | |
482 "movq %3, %%mm5 \n\t" | |
483 "movq %4, %%mm6 \n\t" | |
484 "movq %5, %%mm7 \n\t" | |
485 ".balign 16 \n\t" | |
486 "1: \n\t" | |
487 PREFETCH" 32(%1) \n\t" | |
488 "movd (%1), %%mm0 \n\t" | |
489 "movd 4(%1), %%mm3 \n\t" | |
490 "punpckldq 8(%1), %%mm0 \n\t" | |
491 "punpckldq 12(%1), %%mm3 \n\t" | |
492 "movq %%mm0, %%mm1 \n\t" | |
493 "movq %%mm3, %%mm4 \n\t" | |
494 "pand %%mm6, %%mm0 \n\t" | |
495 "pand %%mm6, %%mm3 \n\t" | |
496 "pmaddwd %%mm7, %%mm0 \n\t" | |
497 "pmaddwd %%mm7, %%mm3 \n\t" | |
498 "pand %%mm5, %%mm1 \n\t" | |
499 "pand %%mm5, %%mm4 \n\t" | |
500 "por %%mm1, %%mm0 \n\t" | |
501 "por %%mm4, %%mm3 \n\t" | |
502 "psrld $6, %%mm0 \n\t" | |
503 "pslld $10, %%mm3 \n\t" | |
504 "por %%mm3, %%mm0 \n\t" | |
505 MOVNTQ" %%mm0, (%0) \n\t" | |
506 "addl $16, %1 \n\t" | |
507 "addl $8, %0 \n\t" | |
508 "cmpl %2, %1 \n\t" | |
509 " jb 1b \n\t" | |
510 : "+r" (d), "+r"(s) | |
511 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) | |
512 ); | |
513 #else | |
444 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); | 514 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); |
445 __asm __volatile( | 515 __asm __volatile( |
446 "movq %0, %%mm7\n\t" | 516 "movq %0, %%mm7\n\t" |
447 "movq %1, %%mm6\n\t" | 517 "movq %1, %%mm6\n\t" |
448 ::"m"(red_15mask),"m"(green_15mask)); | 518 ::"m"(red_15mask),"m"(green_15mask)); |
449 mm_end = end - 15; | |
450 while(s < mm_end) | 519 while(s < mm_end) |
451 { | 520 { |
452 __asm __volatile( | 521 __asm __volatile( |
453 PREFETCH" 32%1\n\t" | 522 PREFETCH" 32%1\n\t" |
454 "movd %1, %%mm0\n\t" | 523 "movd %1, %%mm0\n\t" |
480 MOVNTQ" %%mm0, %0\n\t" | 549 MOVNTQ" %%mm0, %0\n\t" |
481 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | 550 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); |
482 d += 4; | 551 d += 4; |
483 s += 16; | 552 s += 16; |
484 } | 553 } |
554 #endif | |
485 __asm __volatile(SFENCE:::"memory"); | 555 __asm __volatile(SFENCE:::"memory"); |
486 __asm __volatile(EMMS:::"memory"); | 556 __asm __volatile(EMMS:::"memory"); |
487 #endif | 557 #endif |
488 while(s < end) | 558 while(s < end) |
489 { | 559 { |