comparison x86/vp8dsp.asm @ 11991:a6d24fc1deb7 libavcodec

Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
author darkshikari
date Mon, 28 Jun 2010 18:56:24 +0000
parents c3afb5be0d9b
children da388061b227
comparison
equal deleted inserted replaced
11990:3c51d7ac41c9 11991:a6d24fc1deb7
96 times 8 dw 36 96 times 8 dw 36
97 times 8 dw 108 97 times 8 dw 108
98 times 8 dw -11 98 times 8 dw -11
99 times 8 dw 2 99 times 8 dw 2
100 100
101 %ifdef PIC 101 bilinear_filter_vw_m: times 8 dw 1
102 %define fourtap_filter_hw r11 102 times 8 dw 2
103 %define sixtap_filter_hw r11 103 times 8 dw 3
104 %define fourtap_filter_hb r11 104 times 8 dw 4
105 %define sixtap_filter_hb r11 105 times 8 dw 5
106 %define fourtap_filter_v r11 106 times 8 dw 6
107 %define sixtap_filter_v r11 107 times 8 dw 7
108
109 bilinear_filter_vb_m: times 8 db 7, 1
110 times 8 db 6, 2
111 times 8 db 5, 3
112 times 8 db 4, 4
113 times 8 db 3, 5
114 times 8 db 2, 6
115 times 8 db 1, 7
116
117 %ifdef PIC
118 %define fourtap_filter_hw r11
119 %define sixtap_filter_hw r11
120 %define fourtap_filter_hb r11
121 %define sixtap_filter_hb r11
122 %define fourtap_filter_v r11
123 %define sixtap_filter_v r11
124 %define bilinear_filter_vw r11
125 %define bilinear_filter_vb r11
108 %else 126 %else
109 %define fourtap_filter_hw fourtap_filter_hw_m 127 %define fourtap_filter_hw fourtap_filter_hw_m
110 %define sixtap_filter_hw sixtap_filter_hw_m 128 %define sixtap_filter_hw sixtap_filter_hw_m
111 %define fourtap_filter_hb fourtap_filter_hb_m 129 %define fourtap_filter_hb fourtap_filter_hb_m
112 %define sixtap_filter_hb sixtap_filter_hb_m 130 %define sixtap_filter_hb sixtap_filter_hb_m
113 %define fourtap_filter_v fourtap_filter_v_m 131 %define fourtap_filter_v fourtap_filter_v_m
114 %define sixtap_filter_v sixtap_filter_v_m 132 %define sixtap_filter_v sixtap_filter_v_m
115 %endif 133 %define bilinear_filter_vw bilinear_filter_vw_m
116 134 %define bilinear_filter_vb bilinear_filter_vb_m
117 filter_v4_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 135 %endif
118 filter_v4_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 136
119 137 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
120 filter_v6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 138 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
121 filter_v6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 139
122 filter_v6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 140 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
141 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
142 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
123 143
124 cextern pw_4 144 cextern pw_4
125 cextern pw_64 145 cextern pw_64
126 146
127 SECTION .text 147 SECTION .text
359 REP_RET 379 REP_RET
360 380
361 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7 381 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
362 shl r5d, 4 382 shl r5d, 4
363 mova m2, [pw_64] 383 mova m2, [pw_64]
364 mova m3, [filter_v4_shuf1] 384 mova m3, [filter_h4_shuf]
365 mova m4, [filter_v4_shuf2] 385 mova m4, [filter_h6_shuf2]
366 %ifdef PIC 386 %ifdef PIC
367 lea r11, [fourtap_filter_hb_m] 387 lea r11, [fourtap_filter_hb_m]
368 %endif 388 %endif
369 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes 389 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
370 mova m6, [fourtap_filter_hb+r5] 390 mova m6, [fourtap_filter_hb+r5]
389 jg .nextrow 409 jg .nextrow
390 REP_RET 410 REP_RET
391 411
392 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8 412 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
393 lea r5d, [r5*3] 413 lea r5d, [r5*3]
394 mova m3, [filter_v6_shuf1] 414 mova m3, [filter_h6_shuf1]
395 mova m4, [filter_v6_shuf2] 415 mova m4, [filter_h6_shuf2]
396 %ifdef PIC 416 %ifdef PIC
397 lea r11, [sixtap_filter_hb_m] 417 lea r11, [sixtap_filter_hb_m]
398 %endif 418 %endif
399 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes 419 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
400 mova m6, [sixtap_filter_hb+r5*8-32] 420 mova m6, [sixtap_filter_hb+r5*8-32]
404 movu m0, [r2-2] 424 movu m0, [r2-2]
405 mova m1, m0 425 mova m1, m0
406 mova m2, m0 426 mova m2, m0
407 pshufb m0, m3 427 pshufb m0, m3
408 pshufb m1, m4 428 pshufb m1, m4
409 pshufb m2, [filter_v6_shuf3] 429 pshufb m2, [filter_h6_shuf3]
410 pmaddubsw m0, m5 430 pmaddubsw m0, m5
411 pmaddubsw m1, m6 431 pmaddubsw m1, m6
412 pmaddubsw m2, m7 432 pmaddubsw m2, m7
413 paddsw m0, m1 433 paddsw m0, m1
414 paddsw m0, m2 434 paddsw m0, m2
629 649
630 ; go to next line 650 ; go to next line
631 add r0, r1 651 add r0, r1
632 add r2, r3 652 add r2, r3
633 dec r4 ; next row 653 dec r4 ; next row
654 jg .nextrow
655 REP_RET
656
;-----------------------------------------------------------------------------
; FILTER_BILINEAR cpu_suffix, block_width, xmm_regs_used
;
; Emits two functions, put_vp8_bilinear<width>_v_<cpu> and
; put_vp8_bilinear<width>_h_<cpu>, that blend each pixel with its lower (v)
; or right-hand (h) neighbour using 16-bit multiplies.
;
; Register roles, as used by the code below (the C prototype is not visible
; here -- presumably dst, dststride, src, srcstride, height, mx, my):
;   r0 = destination pointer, advanced by r1 per row
;   r2 = source pointer,      advanced by r3 per row
;   r4 = remaining rows; two rows are produced per loop iteration
;   r5 = horizontal fraction (h variant), scratch in the v variant
;   r6 = vertical fraction   (v variant), scratch in the h variant
;
; The fraction f selects a word weight from bilinear_filter_vw (16 bytes per
; entry, entry k-1 holds the weight k), so the loads at "+f*16-16" and
; "+(8-f)*16-16" fetch the weights f and 8-f.
; NOTE(review): f == 0 would index before the table; callers are assumed to
; pass only f in 1..7 -- confirm.
;
; The table is indexed with the full-width registers r5/r6 rather than
; r5d/r6d: in PIC builds the table base is r11, and a 64-bit base cannot be
; combined with a 32-bit index in one effective address. Both registers were
; last written through their 32-bit names, so their upper halves are zero.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4                     ; r6 = my*16
    sub      r5d, r6d                   ; r5 = (8-my)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                    ; zero: unpack helper and pavgw operand
    mova      m4, [bilinear_filter_vw+r5-16] ; weight (8-my) for the upper row
    mova      m5, [bilinear_filter_vw+r6-16] ; weight my     for the lower row
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6                    ; bytes -> words
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                    ; middle row feeds both output rows
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                    ; ((x>>2)+1)>>1, i.e. rounded x>>3
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                    ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4                     ; r5 = mx*16
    sub      r6d, r5d                   ; r6 = (8-mx)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                    ; zero: unpack helper and pavgw operand
    mova      m4, [bilinear_filter_vw+r6-16] ; weight (8-mx) for the left pixel
    mova      m5, [bilinear_filter_vw+r5-16] ; weight mx     for the right pixel
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6                    ; bytes -> words
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                    ; ((x>>2)+1)>>1, i.e. rounded x>>3
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                    ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
%endmacro
749
; Instantiate the word-multiply bilinear filters.
; Arguments: cpu suffix, block width, number of xmm registers used.

INIT_MMX                                ; 4-pixel-wide variant, mm registers
FILTER_BILINEAR mmxext, 4, 0

INIT_XMM                                ; 8-pixel-wide variant, m0-m6 in use
FILTER_BILINEAR   sse2, 8, 7
754
;-----------------------------------------------------------------------------
; put_vp8_bilinear8_v_ssse3
;
; Vertical bilinear filter, 8 pixels wide, two output rows per iteration.
; Register roles as used below (the C prototype is not visible here):
;   r0/r1 = destination pointer / row step
;   r2/r3 = source pointer / row step
;   r4    = remaining rows
;   r6    = vertical fraction my; selects a 16-byte entry of
;           bilinear_filter_vb holding the byte pairs (8-my, my)
; NOTE(review): my == 0 would index before the table; callers are assumed
; to pass only my in 1..7 -- confirm.
;
; The table is indexed with r6 rather than r6d: in PIC builds the table base
; is r11, and a 64-bit base cannot be combined with a 32-bit index. The
; preceding "shl r6d" leaves the upper half of r6 zero.
;-----------------------------------------------------------------------------
cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
    shl      r6d, 4                     ; r6 = my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                    ; zero operand for the pavgw rounding
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                    ; interleave rows 0/1 as (upper, lower) byte pairs
    punpcklbw m1, m2                    ; interleave rows 1/2
    pmaddubsw m0, m3                    ; upper*(8-my) + lower*my
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                    ; ((x>>2)+1)>>1, i.e. rounded x>>3
    pavgw     m1, m4
    packuswb  m0, m1                    ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
783
;-----------------------------------------------------------------------------
; put_vp8_bilinear8_h_ssse3
;
; Horizontal bilinear filter, 8 pixels wide, two output rows per iteration.
; Register roles as used below (the C prototype is not visible here):
;   r0/r1 = destination pointer / row step
;   r2/r3 = source pointer / row step
;   r4    = remaining rows
;   r5    = horizontal fraction mx; selects a 16-byte entry of
;           bilinear_filter_vb holding the byte pairs (8-mx, mx)
; NOTE(review): mx == 0 would index before the table; callers are assumed
; to pass only mx in 1..7 -- confirm.
;
; Each row is fetched with a full-register unaligned load although the
; shuffle only consumes bytes 0..8.
;
; The table is indexed with r5 rather than r5d: in PIC builds the table base
; is r11, and a 64-bit base cannot be combined with a 32-bit index. The
; preceding "shl r5d" leaves the upper half of r5 zero.
;-----------------------------------------------------------------------------
cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
    shl      r5d, 4                     ; r5 = mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                    ; zero operand for the pavgw rounding
    mova      m2, [filter_h2_shuf]      ; pairs byte i with byte i+1
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2                    ; (left, right) byte pairs for row 0
    pshufb    m1, m2                    ; (left, right) byte pairs for row 1
    pmaddubsw m0, m3                    ; left*(8-mx) + right*mx
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                    ; ((x>>2)+1)>>1, i.e. rounded x>>3
    pavgw     m1, m4
    packuswb  m0, m1                    ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
636 812
637 ;----------------------------------------------------------------------------- 813 ;-----------------------------------------------------------------------------
638 ; IDCT functions: 814 ; IDCT functions: