comparison x86/vp8dsp.asm @ 12278:da5b503f050d libavcodec

VP8: Much faster SSE2 MC. 5-10% faster or more on Phenom, Athlon 64, and some others. Helps some on pre-SSSE3 Intel chips as well, but not as much.
author darkshikari
date Mon, 26 Jul 2010 19:34:00 +0000
parents 1c299b8f2930
children 7fb91885433c
comparison
equal deleted inserted replaced
12277:17adb33ac603 12278:da5b503f050d
436 add r2, r3 436 add r2, r3
437 dec r4 ; next row 437 dec r4 ; next row
438 jg .nextrow 438 jg .nextrow
439 REP_RET 439 REP_RET
440 440
441 ; 4x4 block, H-only 4-tap filter
442 INIT_XMM 441 INIT_XMM
443 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 442 cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
444 shl r5d, 4 443 shl r5d, 5
445 %ifdef PIC 444 %ifdef PIC
446 lea r11, [fourtap_filter_hw_m] 445 lea r11, [fourtap_filter_v_m]
447 %endif 446 %endif
448 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words 447 lea r5, [fourtap_filter_v+r5-32]
449 mova m6, [fourtap_filter_hw+r5]
450 pxor m7, m7 448 pxor m7, m7
451 449 mova m4, [pw_64]
450 mova m5, [r5+ 0]
451 mova m6, [r5+16]
452 %ifdef m8
453 mova m8, [r5+32]
454 mova m9, [r5+48]
455 %endif
452 .nextrow 456 .nextrow
453 movh m0, [r2-1] 457 movq m0, [r2-1]
454 punpcklbw m0, m7 ; ABCDEFGH 458 movq m1, [r2-0]
455 mova m1, m0 459 movq m2, [r2+1]
456 mova m2, m0 460 movq m3, [r2+2]
457 mova m3, m0 461 punpcklbw m0, m7
458 psrldq m1, 2 ; BCDEFGH 462 punpcklbw m1, m7
459 psrldq m2, 4 ; CDEFGH 463 punpcklbw m2, m7
460 psrldq m3, 6 ; DEFGH 464 punpcklbw m3, m7
461 punpcklwd m0, m1 ; ABBCCDDE 465 pmullw m0, m5
462 punpcklwd m2, m3 ; CDDEEFFG 466 pmullw m1, m6
463 pmaddwd m0, m5 467 %ifdef m8
464 pmaddwd m2, m6 468 pmullw m2, m8
465 paddd m0, m2 469 pmullw m3, m9
466 470 %else
467 movh m1, [r2+3] 471 pmullw m2, [r5+32]
468 punpcklbw m1, m7 ; ABCDEFGH 472 pmullw m3, [r5+48]
469 mova m2, m1 473 %endif
470 mova m3, m1 474 paddsw m0, m1
471 mova m4, m1 475 paddsw m2, m3
472 psrldq m2, 2 ; BCDEFGH 476 paddsw m0, m2
473 psrldq m3, 4 ; CDEFGH 477 paddsw m0, m4
474 psrldq m4, 6 ; DEFGH
475 punpcklwd m1, m2 ; ABBCCDDE
476 punpcklwd m3, m4 ; CDDEEFFG
477 pmaddwd m1, m5
478 pmaddwd m3, m6
479 paddd m1, m3
480
481 packssdw m0, m1
482 paddsw m0, [pw_64]
483 psraw m0, 7 478 psraw m0, 7
484 packuswb m0, m7 479 packuswb m0, m7
485 movh [r0], m0 ; store 480 movh [r0], m0 ; store
486 481
487 ; go to next line 482 ; go to next line
489 add r2, r3 484 add r2, r3
490 dec r4 ; next row 485 dec r4 ; next row
491 jg .nextrow 486 jg .nextrow
492 REP_RET 487 REP_RET
493 488
494 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 489 cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
495 lea r5d, [r5*3] 490 lea r5d, [r5*3]
491 shl r5d, 4
496 %ifdef PIC 492 %ifdef PIC
497 lea r11, [sixtap_filter_hw_m] 493 lea r11, [sixtap_filter_v_m]
498 %endif 494 %endif
499 lea r5, [sixtap_filter_hw+r5*8] 495 lea r5, [sixtap_filter_v+r5-96]
500 pxor m7, m7 496 pxor m7, m7
501 497 mova m6, [pw_64]
498 %ifdef m8
499 mova m8, [r5+ 0]
500 mova m9, [r5+16]
501 mova m10, [r5+32]
502 mova m11, [r5+48]
503 mova m12, [r5+64]
504 mova m13, [r5+80]
505 %endif
502 .nextrow 506 .nextrow
503 movu m0, [r2-2] 507 movq m0, [r2-2]
504 mova m6, m0 508 movq m1, [r2-1]
505 mova m4, m0 509 movq m2, [r2-0]
506 punpcklbw m0, m7 ; ABCDEFGHI 510 movq m3, [r2+1]
507 mova m1, m0 511 movq m4, [r2+2]
508 mova m2, m0 512 movq m5, [r2+3]
509 mova m3, m0 513 punpcklbw m0, m7
510 psrldq m1, 2 ; BCDEFGH 514 punpcklbw m1, m7
511 psrldq m2, 4 ; CDEFGH 515 punpcklbw m2, m7
512 psrldq m3, 6 ; DEFGH 516 punpcklbw m3, m7
513 psrldq m4, 4 517 punpcklbw m4, m7
514 punpcklbw m4, m7 ; EFGH 518 punpcklbw m5, m7
515 mova m5, m4 519 %ifdef m8
516 psrldq m5, 2 ; FGH 520 pmullw m0, m8
517 punpcklwd m0, m1 ; ABBCCDDE 521 pmullw m1, m9
518 punpcklwd m2, m3 ; CDDEEFFG 522 pmullw m2, m10
519 punpcklwd m4, m5 ; EFFGGHHI 523 pmullw m3, m11
520 pmaddwd m0, [r5-48] 524 pmullw m4, m12
521 pmaddwd m2, [r5-32] 525 pmullw m5, m13
522 pmaddwd m4, [r5-16] 526 %else
523 paddd m0, m2 527 pmullw m0, [r5+ 0]
524 paddd m0, m4 528 pmullw m1, [r5+16]
525 529 pmullw m2, [r5+32]
526 psrldq m6, 4 530 pmullw m3, [r5+48]
527 mova m4, m6 531 pmullw m4, [r5+64]
528 punpcklbw m6, m7 ; ABCDEFGHI 532 pmullw m5, [r5+80]
529 mova m1, m6 533 %endif
530 mova m2, m6 534 paddsw m1, m4
531 mova m3, m6 535 paddsw m0, m5
532 psrldq m1, 2 ; BCDEFGH 536 paddsw m1, m2
533 psrldq m2, 4 ; CDEFGH 537 paddsw m0, m3
534 psrldq m3, 6 ; DEFGH 538 paddsw m0, m1
535 psrldq m4, 4 539 paddsw m0, m6
536 punpcklbw m4, m7 ; EFGH
537 mova m5, m4
538 psrldq m5, 2 ; FGH
539 punpcklwd m6, m1 ; ABBCCDDE
540 punpcklwd m2, m3 ; CDDEEFFG
541 punpcklwd m4, m5 ; EFFGGHHI
542 pmaddwd m6, [r5-48]
543 pmaddwd m2, [r5-32]
544 pmaddwd m4, [r5-16]
545 paddd m6, m2
546 paddd m6, m4
547
548 packssdw m0, m6
549 paddsw m0, [pw_64]
550 psraw m0, 7 540 psraw m0, 7
551 packuswb m0, m7 541 packuswb m0, m7
552 movh [r0], m0 ; store 542 movh [r0], m0 ; store
553 543
554 ; go to next line 544 ; go to next line