comparison x86/vp8dsp.asm @ 12279:7fb91885433c libavcodec

Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
author rbultje
date Mon, 26 Jul 2010 21:18:19 +0000
parents da5b503f050d
children 435319d67bd8
comparison
equal deleted inserted replaced
12278:da5b503f050d 12279:7fb91885433c
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
144 144
145 pw_20091: times 4 dw 20091 145 pw_20091: times 4 dw 20091
146 pw_17734: times 4 dw 17734 146 pw_17734: times 4 dw 17734
147
148 pb_27_63: times 8 db 27, 63
149 pb_18_63: times 8 db 18, 63
150 pb_9_63: times 8 db 9, 63
147 151
148 cextern pb_1 152 cextern pb_1
149 cextern pw_3 153 cextern pw_3
150 cextern pb_3 154 cextern pb_3
151 cextern pw_4 155 cextern pw_4
2173 %define dst2_reg I_reg 2177 %define dst2_reg I_reg
2174 %ifndef m8 2178 %ifndef m8
2175 %define stack_reg hev_thr_reg 2179 %define stack_reg hev_thr_reg
2176 %endif 2180 %endif
2177 2181
2182 %define ssse3_or_higher 0
2178 %ifnidn %1, sse2 2183 %ifnidn %1, sse2
2179 %if mmsize == 16 2184 %if mmsize == 16
2185 %define ssse3_or_higher 1
2186 %endif
2187 %endif
2188
2189 %if ssse3_or_higher
2180 pxor m7, m7 2190 pxor m7, m7
2181 %endif
2182 %endif 2191 %endif
2183 2192
2184 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 2193 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
2185 ; splat function arguments 2194 ; splat function arguments
2186 SPLATB_REG m0, E_reg, m7 ; E 2195 SPLATB_REG m0, E_reg, m7 ; E
2574 pandn m0, m6 2583 pandn m0, m6
2575 psubusb m4, m0 2584 psubusb m4, m0
2576 paddusb m4, m1 ; q0-f1 2585 paddusb m4, m1 ; q0-f1
2577 2586
2578 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) 2587 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
2588 %if ssse3_or_higher
2589 mova m7, [pb_1]
2590 %else
2579 mova m7, [pw_63] 2591 mova m7, [pw_63]
2592 %endif
2580 %ifdef m8 2593 %ifdef m8
2581 SWAP 1, 8 2594 SWAP 1, 8
2582 %else 2595 %else
2583 mova m1, lim_res 2596 mova m1, lim_res
2584 %endif 2597 %endif
2585 pxor m0, m0 2598 pxor m0, m0
2586 mova m6, m1 2599 mova m6, m1
2587 pcmpgtb m0, m1 ; which are negative 2600 pcmpgtb m0, m1 ; which are negative
2601 %if ssse3_or_higher
2602 punpcklbw m6, m7 ; interleave with "1" for rounding
2603 punpckhbw m1, m7
2604 %else
2588 punpcklbw m6, m0 ; signed byte->word 2605 punpcklbw m6, m0 ; signed byte->word
2589 punpckhbw m1, m0 2606 punpckhbw m1, m0
2607 %endif
2590 mova lim_sign, m0 2608 mova lim_sign, m0
2609 %if ssse3_or_higher
2610 mova m7, [pb_27_63]
2611 %ifndef m8
2612 mova lim_res, m1
2613 %endif
2614 %ifdef m10
2615 SWAP 0, 10 ; don't lose lim_sign copy
2616 %endif
2617 mova m0, m7
2618 pmaddubsw m7, m6
2619 SWAP 6, 7
2620 pmaddubsw m0, m1
2621 SWAP 1, 0
2622 %ifdef m10
2623 SWAP 0, 10
2624 %else
2625 mova m0, lim_sign
2626 %endif
2627 %else
2591 mova mask_res, m6 ; backup for later in filter 2628 mova mask_res, m6 ; backup for later in filter
2592 mova lim_res, m1 2629 mova lim_res, m1
2593 pmullw m6, [pw_27] 2630 pmullw m6, [pw_27]
2594 pmullw m1, [pw_27] 2631 pmullw m1, [pw_27]
2595 paddw m6, m7 2632 paddw m6, m7
2596 paddw m1, m7 2633 paddw m1, m7
2634 %endif
2597 psraw m6, 7 2635 psraw m6, 7
2598 psraw m1, 7 2636 psraw m1, 7
2599 packsswb m6, m1 ; a0 2637 packsswb m6, m1 ; a0
2600 pxor m1, m1 2638 pxor m1, m1
2601 psubb m1, m6 2639 psubb m1, m6
2602 pand m1, m0 ; -a0 2640 pand m1, m0 ; -a0
2603 pandn m0, m6 ; +a0 2641 pandn m0, m6 ; +a0
2642 %if ssse3_or_higher
2643 mova m6, [pb_18_63] ; pipelining
2644 %endif
2604 psubusb m3, m1 2645 psubusb m3, m1
2605 paddusb m4, m1 2646 paddusb m4, m1
2606 paddusb m3, m0 ; p0+a0 2647 paddusb m3, m0 ; p0+a0
2607 psubusb m4, m0 ; q0-a0 2648 psubusb m4, m0 ; q0-a0
2608 2649
2650 %if ssse3_or_higher
2651 SWAP 6, 7
2652 %ifdef m10
2653 SWAP 1, 10
2654 %else
2655 mova m1, lim_res
2656 %endif
2657 mova m0, m7
2658 pmaddubsw m7, m6
2659 SWAP 6, 7
2660 pmaddubsw m0, m1
2661 SWAP 1, 0
2662 %ifdef m10
2663 SWAP 0, 10
2664 %endif
2665 mova m0, lim_sign
2666 %else
2609 mova m6, mask_res 2667 mova m6, mask_res
2610 mova m1, lim_res 2668 mova m1, lim_res
2611 mova m0, lim_sign
2612 pmullw m6, [pw_18] 2669 pmullw m6, [pw_18]
2613 pmullw m1, [pw_18] 2670 pmullw m1, [pw_18]
2614 paddw m6, m7 2671 paddw m6, m7
2615 paddw m1, m7 2672 paddw m1, m7
2673 %endif
2674 mova m0, lim_sign
2616 psraw m6, 7 2675 psraw m6, 7
2617 psraw m1, 7 2676 psraw m1, 7
2618 packsswb m6, m1 ; a1 2677 packsswb m6, m1 ; a1
2619 pxor m1, m1 2678 pxor m1, m1
2620 psubb m1, m6 2679 psubb m1, m6
2621 pand m1, m0 ; -a1 2680 pand m1, m0 ; -a1
2622 pandn m0, m6 ; +a1 2681 pandn m0, m6 ; +a1
2682 %if ssse3_or_higher
2683 mova m6, [pb_9_63]
2684 %endif
2623 psubusb m2, m1 2685 psubusb m2, m1
2624 paddusb m5, m1 2686 paddusb m5, m1
2625 paddusb m2, m0 ; p1+a1 2687 paddusb m2, m0 ; p1+a1
2626 psubusb m5, m0 ; q1-a1 2688 psubusb m5, m0 ; q1-a1
2627 2689
2690 %if ssse3_or_higher
2691 SWAP 6, 7
2692 %ifdef m10
2693 SWAP 1, 10
2694 %else
2695 mova m1, lim_res
2696 %endif
2697 mova m0, m7
2698 pmaddubsw m7, m6
2699 SWAP 6, 7
2700 pmaddubsw m0, m1
2701 SWAP 1, 0
2702 %else
2628 %ifdef m8 2703 %ifdef m8
2629 SWAP 6, 12 2704 SWAP 6, 12
2630 SWAP 1, 8 2705 SWAP 1, 8
2631 %else 2706 %else
2632 mova m6, mask_res 2707 mova m6, mask_res
2633 mova m1, lim_res 2708 mova m1, lim_res
2634 %endif 2709 %endif
2635 pmullw m6, [pw_9] 2710 pmullw m6, [pw_9]
2636 pmullw m1, [pw_9] 2711 pmullw m1, [pw_9]
2637 paddw m6, m7 2712 paddw m6, m7
2638 paddw m1, m7 2713 paddw m1, m7
2714 %endif
2639 %ifdef m9 2715 %ifdef m9
2640 SWAP 7, 9 2716 SWAP 7, 9
2641 %else 2717 %else
2642 mova m7, lim_sign 2718 mova m7, lim_sign
2643 %endif 2719 %endif