Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12278:da5b503f050d libavcodec
VP8: Much faster SSE2 MC
5-10% faster or more on Phenom, Athlon 64, and some others.
Helps some on pre-SSSE3 Intel chips as well, but not as much.
author | darkshikari |
---|---|
date | Mon, 26 Jul 2010 19:34:00 +0000 |
parents | 1c299b8f2930 |
children | 7fb91885433c |
Comparison legend: equal, deleted, inserted, replaced
12277:17adb33ac603 | 12278:da5b503f050d |
---|---|
436 add r2, r3 | 436 add r2, r3 |
437 dec r4 ; next row | 437 dec r4 ; next row |
438 jg .nextrow | 438 jg .nextrow |
439 REP_RET | 439 REP_RET |
440 | 440 |
441 ; 4x4 block, H-only 4-tap filter | |
442 INIT_XMM | 441 INIT_XMM |
443 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 | 442 cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 |
444 shl r5d, 4 | 443 shl r5d, 5 |
445 %ifdef PIC | 444 %ifdef PIC |
446 lea r11, [fourtap_filter_hw_m] | 445 lea r11, [fourtap_filter_v_m] |
447 %endif | 446 %endif |
448 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | 447 lea r5, [fourtap_filter_v+r5-32] |
449 mova m6, [fourtap_filter_hw+r5] | |
450 pxor m7, m7 | 448 pxor m7, m7 |
451 | 449 mova m4, [pw_64] |
450 mova m5, [r5+ 0] | |
451 mova m6, [r5+16] | |
452 %ifdef m8 | |
453 mova m8, [r5+32] | |
454 mova m9, [r5+48] | |
455 %endif | |
452 .nextrow | 456 .nextrow |
453 movh m0, [r2-1] | 457 movq m0, [r2-1] |
454 punpcklbw m0, m7 ; ABCDEFGH | 458 movq m1, [r2-0] |
455 mova m1, m0 | 459 movq m2, [r2+1] |
456 mova m2, m0 | 460 movq m3, [r2+2] |
457 mova m3, m0 | 461 punpcklbw m0, m7 |
458 psrldq m1, 2 ; BCDEFGH | 462 punpcklbw m1, m7 |
459 psrldq m2, 4 ; CDEFGH | 463 punpcklbw m2, m7 |
460 psrldq m3, 6 ; DEFGH | 464 punpcklbw m3, m7 |
461 punpcklwd m0, m1 ; ABBCCDDE | 465 pmullw m0, m5 |
462 punpcklwd m2, m3 ; CDDEEFFG | 466 pmullw m1, m6 |
463 pmaddwd m0, m5 | 467 %ifdef m8 |
464 pmaddwd m2, m6 | 468 pmullw m2, m8 |
465 paddd m0, m2 | 469 pmullw m3, m9 |
466 | 470 %else |
467 movh m1, [r2+3] | 471 pmullw m2, [r5+32] |
468 punpcklbw m1, m7 ; ABCDEFGH | 472 pmullw m3, [r5+48] |
469 mova m2, m1 | 473 %endif |
470 mova m3, m1 | 474 paddsw m0, m1 |
471 mova m4, m1 | 475 paddsw m2, m3 |
472 psrldq m2, 2 ; BCDEFGH | 476 paddsw m0, m2 |
473 psrldq m3, 4 ; CDEFGH | 477 paddsw m0, m4 |
474 psrldq m4, 6 ; DEFGH | |
475 punpcklwd m1, m2 ; ABBCCDDE | |
476 punpcklwd m3, m4 ; CDDEEFFG | |
477 pmaddwd m1, m5 | |
478 pmaddwd m3, m6 | |
479 paddd m1, m3 | |
480 | |
481 packssdw m0, m1 | |
482 paddsw m0, [pw_64] | |
483 psraw m0, 7 | 478 psraw m0, 7 |
484 packuswb m0, m7 | 479 packuswb m0, m7 |
485 movh [r0], m0 ; store | 480 movh [r0], m0 ; store |
486 | 481 |
487 ; go to next line | 482 ; go to next line |
489 add r2, r3 | 484 add r2, r3 |
490 dec r4 ; next row | 485 dec r4 ; next row |
491 jg .nextrow | 486 jg .nextrow |
492 REP_RET | 487 REP_RET |
493 | 488 |
494 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 | 489 cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 |
495 lea r5d, [r5*3] | 490 lea r5d, [r5*3] |
491 shl r5d, 4 | |
496 %ifdef PIC | 492 %ifdef PIC |
497 lea r11, [sixtap_filter_hw_m] | 493 lea r11, [sixtap_filter_v_m] |
498 %endif | 494 %endif |
499 lea r5, [sixtap_filter_hw+r5*8] | 495 lea r5, [sixtap_filter_v+r5-96] |
500 pxor m7, m7 | 496 pxor m7, m7 |
501 | 497 mova m6, [pw_64] |
498 %ifdef m8 | |
499 mova m8, [r5+ 0] | |
500 mova m9, [r5+16] | |
501 mova m10, [r5+32] | |
502 mova m11, [r5+48] | |
503 mova m12, [r5+64] | |
504 mova m13, [r5+80] | |
505 %endif | |
502 .nextrow | 506 .nextrow |
503 movu m0, [r2-2] | 507 movq m0, [r2-2] |
504 mova m6, m0 | 508 movq m1, [r2-1] |
505 mova m4, m0 | 509 movq m2, [r2-0] |
506 punpcklbw m0, m7 ; ABCDEFGHI | 510 movq m3, [r2+1] |
507 mova m1, m0 | 511 movq m4, [r2+2] |
508 mova m2, m0 | 512 movq m5, [r2+3] |
509 mova m3, m0 | 513 punpcklbw m0, m7 |
510 psrldq m1, 2 ; BCDEFGH | 514 punpcklbw m1, m7 |
511 psrldq m2, 4 ; CDEFGH | 515 punpcklbw m2, m7 |
512 psrldq m3, 6 ; DEFGH | 516 punpcklbw m3, m7 |
513 psrldq m4, 4 | 517 punpcklbw m4, m7 |
514 punpcklbw m4, m7 ; EFGH | 518 punpcklbw m5, m7 |
515 mova m5, m4 | 519 %ifdef m8 |
516 psrldq m5, 2 ; FGH | 520 pmullw m0, m8 |
517 punpcklwd m0, m1 ; ABBCCDDE | 521 pmullw m1, m9 |
518 punpcklwd m2, m3 ; CDDEEFFG | 522 pmullw m2, m10 |
519 punpcklwd m4, m5 ; EFFGGHHI | 523 pmullw m3, m11 |
520 pmaddwd m0, [r5-48] | 524 pmullw m4, m12 |
521 pmaddwd m2, [r5-32] | 525 pmullw m5, m13 |
522 pmaddwd m4, [r5-16] | 526 %else |
523 paddd m0, m2 | 527 pmullw m0, [r5+ 0] |
524 paddd m0, m4 | 528 pmullw m1, [r5+16] |
525 | 529 pmullw m2, [r5+32] |
526 psrldq m6, 4 | 530 pmullw m3, [r5+48] |
527 mova m4, m6 | 531 pmullw m4, [r5+64] |
528 punpcklbw m6, m7 ; ABCDEFGHI | 532 pmullw m5, [r5+80] |
529 mova m1, m6 | 533 %endif |
530 mova m2, m6 | 534 paddsw m1, m4 |
531 mova m3, m6 | 535 paddsw m0, m5 |
532 psrldq m1, 2 ; BCDEFGH | 536 paddsw m1, m2 |
533 psrldq m2, 4 ; CDEFGH | 537 paddsw m0, m3 |
534 psrldq m3, 6 ; DEFGH | 538 paddsw m0, m1 |
535 psrldq m4, 4 | 539 paddsw m0, m6 |
536 punpcklbw m4, m7 ; EFGH | |
537 mova m5, m4 | |
538 psrldq m5, 2 ; FGH | |
539 punpcklwd m6, m1 ; ABBCCDDE | |
540 punpcklwd m2, m3 ; CDDEEFFG | |
541 punpcklwd m4, m5 ; EFFGGHHI | |
542 pmaddwd m6, [r5-48] | |
543 pmaddwd m2, [r5-32] | |
544 pmaddwd m4, [r5-16] | |
545 paddd m6, m2 | |
546 paddd m6, m4 | |
547 | |
548 packssdw m0, m6 | |
549 paddsw m0, [pw_64] | |
550 psraw m0, 7 | 540 psraw m0, 7 |
551 packuswb m0, m7 | 541 packuswb m0, m7 |
552 movh [r0], m0 ; store | 542 movh [r0], m0 ; store |
553 | 543 |
554 ; go to next line | 544 ; go to next line |