Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 11991:a6d24fc1deb7 libavcodec
Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
author | darkshikari |
---|---|
date | Mon, 28 Jun 2010 18:56:24 +0000 |
parents | c3afb5be0d9b |
children | da388061b227 |
comparison
equal
deleted
inserted
replaced
11990:3c51d7ac41c9 | 11991:a6d24fc1deb7 |
---|---|
96 times 8 dw 36 | 96 times 8 dw 36 |
97 times 8 dw 108 | 97 times 8 dw 108 |
98 times 8 dw -11 | 98 times 8 dw -11 |
99 times 8 dw 2 | 99 times 8 dw 2 |
100 | 100 |
101 %ifdef PIC | 101 bilinear_filter_vw_m: times 8 dw 1 |
102 %define fourtap_filter_hw r11 | 102 times 8 dw 2 |
103 %define sixtap_filter_hw r11 | 103 times 8 dw 3 |
104 %define fourtap_filter_hb r11 | 104 times 8 dw 4 |
105 %define sixtap_filter_hb r11 | 105 times 8 dw 5 |
106 %define fourtap_filter_v r11 | 106 times 8 dw 6 |
107 %define sixtap_filter_v r11 | 107 times 8 dw 7 |
108 | |
109 bilinear_filter_vb_m: times 8 db 7, 1 | |
110 times 8 db 6, 2 | |
111 times 8 db 5, 3 | |
112 times 8 db 4, 4 | |
113 times 8 db 3, 5 | |
114 times 8 db 2, 6 | |
115 times 8 db 1, 7 | |
116 | |
117 %ifdef PIC | |
118 %define fourtap_filter_hw r11 | |
119 %define sixtap_filter_hw r11 | |
120 %define fourtap_filter_hb r11 | |
121 %define sixtap_filter_hb r11 | |
122 %define fourtap_filter_v r11 | |
123 %define sixtap_filter_v r11 | |
124 %define bilinear_filter_vw r11 | |
125 %define bilinear_filter_vb r11 | |
108 %else | 126 %else |
109 %define fourtap_filter_hw fourtap_filter_hw_m | 127 %define fourtap_filter_hw fourtap_filter_hw_m |
110 %define sixtap_filter_hw sixtap_filter_hw_m | 128 %define sixtap_filter_hw sixtap_filter_hw_m |
111 %define fourtap_filter_hb fourtap_filter_hb_m | 129 %define fourtap_filter_hb fourtap_filter_hb_m |
112 %define sixtap_filter_hb sixtap_filter_hb_m | 130 %define sixtap_filter_hb sixtap_filter_hb_m |
113 %define fourtap_filter_v fourtap_filter_v_m | 131 %define fourtap_filter_v fourtap_filter_v_m |
114 %define sixtap_filter_v sixtap_filter_v_m | 132 %define sixtap_filter_v sixtap_filter_v_m |
115 %endif | 133 %define bilinear_filter_vw bilinear_filter_vw_m |
116 | 134 %define bilinear_filter_vb bilinear_filter_vb_m |
117 filter_v4_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 | 135 %endif |
118 filter_v4_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | 136 |
119 | 137 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
120 filter_v6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 | 138 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 |
121 filter_v6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | 139 |
122 filter_v6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | 140 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
141 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
142 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
123 | 143 |
124 cextern pw_4 | 144 cextern pw_4 |
125 cextern pw_64 | 145 cextern pw_64 |
126 | 146 |
127 SECTION .text | 147 SECTION .text |
359 REP_RET | 379 REP_RET |
360 | 380 |
361 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7 | 381 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7 |
362 shl r5d, 4 | 382 shl r5d, 4 |
363 mova m2, [pw_64] | 383 mova m2, [pw_64] |
364 mova m3, [filter_v4_shuf1] | 384 mova m3, [filter_h4_shuf] |
365 mova m4, [filter_v4_shuf2] | 385 mova m4, [filter_h6_shuf2] |
366 %ifdef PIC | 386 %ifdef PIC |
367 lea r11, [fourtap_filter_hb_m] | 387 lea r11, [fourtap_filter_hb_m] |
368 %endif | 388 %endif |
369 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes | 389 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |
370 mova m6, [fourtap_filter_hb+r5] | 390 mova m6, [fourtap_filter_hb+r5] |
389 jg .nextrow | 409 jg .nextrow |
390 REP_RET | 410 REP_RET |
391 | 411 |
392 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8 | 412 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8 |
393 lea r5d, [r5*3] | 413 lea r5d, [r5*3] |
394 mova m3, [filter_v6_shuf1] | 414 mova m3, [filter_h6_shuf1] |
395 mova m4, [filter_v6_shuf2] | 415 mova m4, [filter_h6_shuf2] |
396 %ifdef PIC | 416 %ifdef PIC |
397 lea r11, [sixtap_filter_hb_m] | 417 lea r11, [sixtap_filter_hb_m] |
398 %endif | 418 %endif |
399 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes | 419 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |
400 mova m6, [sixtap_filter_hb+r5*8-32] | 420 mova m6, [sixtap_filter_hb+r5*8-32] |
404 movu m0, [r2-2] | 424 movu m0, [r2-2] |
405 mova m1, m0 | 425 mova m1, m0 |
406 mova m2, m0 | 426 mova m2, m0 |
407 pshufb m0, m3 | 427 pshufb m0, m3 |
408 pshufb m1, m4 | 428 pshufb m1, m4 |
409 pshufb m2, [filter_v6_shuf3] | 429 pshufb m2, [filter_h6_shuf3] |
410 pmaddubsw m0, m5 | 430 pmaddubsw m0, m5 |
411 pmaddubsw m1, m6 | 431 pmaddubsw m1, m6 |
412 pmaddubsw m2, m7 | 432 pmaddubsw m2, m7 |
413 paddsw m0, m1 | 433 paddsw m0, m1 |
414 paddsw m0, m2 | 434 paddsw m0, m2 |
629 | 649 |
630 ; go to next line | 650 ; go to next line |
631 add r0, r1 | 651 add r0, r1 |
632 add r2, r3 | 652 add r2, r3 |
633 dec r4 ; next row | 653 dec r4 ; next row |
654 jg .nextrow | |
655 REP_RET | |
656 | |
;-----------------------------------------------------------------------------
; FILTER_BILINEAR suffix, width, xmm_regs
;   %1 = instruction-set suffix appended to the function name (also selects
;        the store sequence: "mmxext" stores each row separately)
;   %2 = block width used in the function name
;   %3 = third cglobal count (number of SIMD registers the function uses)
;
; Emits put_vp8_bilinear%2_v_%1 and put_vp8_bilinear%2_h_%1.
; Register roles as used below:
;   r0 = destination, r1 = destination stride
;   r2 = source,      r3 = source stride
;   r4 = row count (two rows are produced per iteration)
;   r5 = horizontal fraction, r6 = vertical fraction (presumably mx / my)
; Each output is (a*(8-f) + b*f + 4) >> 3, computed as psraw 2 followed by
; pavgw against zero.
; NOTE: bilinear_filter_vw_m has 7 rows (weights 1..7), so the fraction used
; for the lookup must be in 1..7; 0 would index outside the table.
;-----------------------------------------------------------------------------
657 %macro FILTER_BILINEAR 3 | |
658 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 | |
; r6 = f*16 (byte offset of weight row f), r5 = (8-f)*16
659 mov r5d, 8*16 | |
660 shl r6d, 4 | |
661 sub r5d, r6d | |
662 %ifdef PIC | |
663 lea r11, [bilinear_filter_vw_m] | |
664 %endif | |
665 pxor m6, m6 | |
; Index with the full-width r5/r6: under PIC the base is r11, and a 64-bit
; base cannot be combined with a 32-bit index. The 32-bit writes above have
; already cleared the upper halves, so the values are identical.
666 mova m4, [bilinear_filter_vw+r5-16] | |
667 mova m5, [bilinear_filter_vw+r6-16] | |
668 .nextrow: | |
; three consecutive source rows yield two output rows
669 movh m0, [r2+r3*0] | |
670 movh m1, [r2+r3*1] | |
671 movh m3, [r2+r3*2] | |
672 punpcklbw m0, m6 | |
673 punpcklbw m1, m6 | |
674 punpcklbw m3, m6 | |
; the middle row is the lower tap of output row 0 and the upper tap of row 1
675 mova m2, m1 | |
676 pmullw m0, m4 | |
677 pmullw m1, m5 | |
678 pmullw m2, m4 | |
679 pmullw m3, m5 | |
680 paddsw m0, m1 | |
681 paddsw m2, m3 | |
; (x >> 2) then average with 0 == (x + 4) >> 3
682 psraw m0, 2 | |
683 psraw m2, 2 | |
684 pavgw m0, m6 | |
685 pavgw m2, m6 | |
686 %ifidn %1, mmxext | |
687 packuswb m0, m0 | |
688 packuswb m2, m2 | |
689 movh [r0+r1*0], m0 | |
690 movh [r0+r1*1], m2 | |
691 %else | |
; both rows are packed into one register: low half = row 0, high half = row 1
692 packuswb m0, m2 | |
693 movh [r0+r1*0], m0 | |
694 movhps [r0+r1*1], m0 | |
695 %endif | |
696 | |
697 lea r0, [r0+r1*2] | |
698 lea r2, [r2+r3*2] | |
699 sub r4, 2 | |
700 jg .nextrow | |
701 REP_RET | |
702 | |
703 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 | |
; r5 = f*16 (byte offset of weight row f), r6 = (8-f)*16
704 mov r6d, 8*16 | |
705 shl r5d, 4 | |
706 sub r6d, r5d | |
707 %ifdef PIC | |
708 lea r11, [bilinear_filter_vw_m] | |
709 %endif | |
710 pxor m6, m6 | |
; full-width index registers, same reason as in the vertical variant
711 mova m4, [bilinear_filter_vw+r6-16] | |
712 mova m5, [bilinear_filter_vw+r5-16] | |
713 .nextrow: | |
; pixels x and x+1 of two consecutive source rows
714 movh m0, [r2+r3*0+0] | |
715 movh m1, [r2+r3*0+1] | |
716 movh m2, [r2+r3*1+0] | |
717 movh m3, [r2+r3*1+1] | |
718 punpcklbw m0, m6 | |
719 punpcklbw m1, m6 | |
720 punpcklbw m2, m6 | |
721 punpcklbw m3, m6 | |
722 pmullw m0, m4 | |
723 pmullw m1, m5 | |
724 pmullw m2, m4 | |
725 pmullw m3, m5 | |
726 paddsw m0, m1 | |
727 paddsw m2, m3 | |
; (x >> 2) then average with 0 == (x + 4) >> 3
728 psraw m0, 2 | |
729 psraw m2, 2 | |
730 pavgw m0, m6 | |
731 pavgw m2, m6 | |
732 %ifidn %1, mmxext | |
733 packuswb m0, m0 | |
734 packuswb m2, m2 | |
735 movh [r0+r1*0], m0 | |
736 movh [r0+r1*1], m2 | |
737 %else | |
738 packuswb m0, m2 | |
739 movh [r0+r1*0], m0 | |
740 movhps [r0+r1*1], m0 | |
741 %endif | |
742 | |
743 lea r0, [r0+r1*2] | |
744 lea r2, [r2+r3*2] | |
745 sub r4, 2 | |
746 jg .nextrow | |
747 REP_RET | |
748 %endmacro | |
749 | |
; Instantiate the word-multiply bilinear functions twice:
;   put_vp8_bilinear4_{v,h}_mmxext  (generated after INIT_MMX)
;   put_vp8_bilinear8_{v,h}_sse2    (generated after INIT_XMM; m0-m6 in use,
;                                    hence the register count of 7)
750 INIT_MMX | |
751 FILTER_BILINEAR mmxext, 4, 0 | |
752 INIT_XMM | |
753 FILTER_BILINEAR sse2, 8, 7 | |
754 | |
;-----------------------------------------------------------------------------
; put_vp8_bilinear8_v_ssse3: vertical bilinear filter, two rows per iteration.
; Register roles as used below:
;   r0 = destination, r1 = destination stride
;   r2 = source,      r3 = source stride
;   r4 = row count,   r6 = vertical fraction f (presumably my)
; bilinear_filter_vb_m holds byte pairs (8-f, f) for f = 1..7, so f must be in
; that range; 0 would index outside the table.
;-----------------------------------------------------------------------------
755 cglobal put_vp8_bilinear8_v_ssse3, 7,7,5 | |
; r6 = f*16, byte offset of the 16-byte weight row
756 shl r6d, 4 | |
757 %ifdef PIC | |
758 lea r11, [bilinear_filter_vb_m] | |
759 %endif | |
760 pxor m4, m4 | |
; Index with the full-width r6: under PIC the base is r11, and a 64-bit base
; cannot be combined with a 32-bit index. The shl on r6d above has already
; cleared the upper half.
761 mova m3, [bilinear_filter_vb+r6-16] | |
762 .nextrow: | |
763 movh m0, [r2+r3*0] | |
764 movh m1, [r2+r3*1] | |
765 movh m2, [r2+r3*2] | |
; interleave vertically adjacent pixels so pmaddubsw sees (a, b) byte pairs
766 punpcklbw m0, m1 | |
767 punpcklbw m1, m2 | |
; a*(8-f) + b*f per output pixel
768 pmaddubsw m0, m3 | |
769 pmaddubsw m1, m3 | |
; (x >> 2) then average with 0 == (x + 4) >> 3
770 psraw m0, 2 | |
771 psraw m1, 2 | |
772 pavgw m0, m4 | |
773 pavgw m1, m4 | |
; low half = output row 0, high half = output row 1
774 packuswb m0, m1 | |
775 movh [r0+r1*0], m0 | |
776 movhps [r0+r1*1], m0 | |
777 | |
778 lea r0, [r0+r1*2] | |
779 lea r2, [r2+r3*2] | |
780 sub r4, 2 | |
781 jg .nextrow | |
782 REP_RET | |
783 | |
;-----------------------------------------------------------------------------
; put_vp8_bilinear8_h_ssse3: horizontal bilinear filter, two rows per
; iteration.
; Register roles as used below:
;   r0 = destination, r1 = destination stride
;   r2 = source,      r3 = source stride
;   r4 = row count,   r5 = horizontal fraction f (presumably mx)
; bilinear_filter_vb_m holds byte pairs (8-f, f) for f = 1..7, so f must be in
; that range; 0 would index outside the table.
; NOTE: each row is loaded with a full-register movu although the shuffle only
; consumes bytes 0..8.
;-----------------------------------------------------------------------------
784 cglobal put_vp8_bilinear8_h_ssse3, 7,7,5 | |
; r5 = f*16, byte offset of the 16-byte weight row
785 shl r5d, 4 | |
786 %ifdef PIC | |
787 lea r11, [bilinear_filter_vb_m] | |
788 %endif | |
789 pxor m4, m4 | |
790 mova m2, [filter_h2_shuf] | |
; Index with the full-width r5: under PIC the base is r11, and a 64-bit base
; cannot be combined with a 32-bit index. The shl on r5d above has already
; cleared the upper half.
791 mova m3, [bilinear_filter_vb+r5-16] | |
792 .nextrow: | |
793 movu m0, [r2+r3*0] | |
794 movu m1, [r2+r3*1] | |
; filter_h2_shuf gathers bytes (0,1),(1,2),...,(7,8): horizontal neighbours
795 pshufb m0, m2 | |
796 pshufb m1, m2 | |
; a*(8-f) + b*f per output pixel
797 pmaddubsw m0, m3 | |
798 pmaddubsw m1, m3 | |
; (x >> 2) then average with 0 == (x + 4) >> 3
799 psraw m0, 2 | |
800 psraw m1, 2 | |
801 pavgw m0, m4 | |
802 pavgw m1, m4 | |
; low half = output row 0, high half = output row 1
803 packuswb m0, m1 | |
804 movh [r0+r1*0], m0 | |
805 movhps [r0+r1*1], m0 | |
806 | |
807 lea r0, [r0+r1*2] | |
808 lea r2, [r2+r3*2] | |
809 sub r4, 2 | |
634 jg .nextrow | 810 jg .nextrow |
635 REP_RET | 811 REP_RET |
636 | 812 |
637 ;----------------------------------------------------------------------------- | 813 ;----------------------------------------------------------------------------- |
638 ; IDCT functions: | 814 ; IDCT functions: |