Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12279:7fb91885433c libavcodec
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
author | rbultje |
---|---|
date | Mon, 26 Jul 2010 21:18:19 +0000 |
parents | da5b503f050d |
children | 435319d67bd8 |
Comparison legend: equal, deleted, inserted, replaced
12278:da5b503f050d | 12279:7fb91885433c |
---|---|
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | 142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | 143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 |
144 | 144 |
145 pw_20091: times 4 dw 20091 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | 146 pw_17734: times 4 dw 17734 |
147 | |
148 pb_27_63: times 8 db 27, 63 | |
149 pb_18_63: times 8 db 18, 63 | |
150 pb_9_63: times 8 db 9, 63 | |
147 | 151 |
148 cextern pb_1 | 152 cextern pb_1 |
149 cextern pw_3 | 153 cextern pw_3 |
150 cextern pb_3 | 154 cextern pb_3 |
151 cextern pw_4 | 155 cextern pw_4 |
2173 %define dst2_reg I_reg | 2177 %define dst2_reg I_reg |
2174 %ifndef m8 | 2178 %ifndef m8 |
2175 %define stack_reg hev_thr_reg | 2179 %define stack_reg hev_thr_reg |
2176 %endif | 2180 %endif |
2177 | 2181 |
2182 %define ssse3_or_higher 0 | |
2178 %ifnidn %1, sse2 | 2183 %ifnidn %1, sse2 |
2179 %if mmsize == 16 | 2184 %if mmsize == 16 |
2185 %define ssse3_or_higher 1 | |
2186 %endif | |
2187 %endif | |
2188 | |
2189 %if ssse3_or_higher | |
2180 pxor m7, m7 | 2190 pxor m7, m7 |
2181 %endif | |
2182 %endif | 2191 %endif |
2183 | 2192 |
2184 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 | 2193 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
2185 ; splat function arguments | 2194 ; splat function arguments |
2186 SPLATB_REG m0, E_reg, m7 ; E | 2195 SPLATB_REG m0, E_reg, m7 ; E |
2574 pandn m0, m6 | 2583 pandn m0, m6 |
2575 psubusb m4, m0 | 2584 psubusb m4, m0 |
2576 paddusb m4, m1 ; q0-f1 | 2585 paddusb m4, m1 ; q0-f1 |
2577 | 2586 |
2578 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) | 2587 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) |
2588 %if ssse3_or_higher | |
2589 mova m7, [pb_1] | |
2590 %else | |
2579 mova m7, [pw_63] | 2591 mova m7, [pw_63] |
2592 %endif | |
2580 %ifdef m8 | 2593 %ifdef m8 |
2581 SWAP 1, 8 | 2594 SWAP 1, 8 |
2582 %else | 2595 %else |
2583 mova m1, lim_res | 2596 mova m1, lim_res |
2584 %endif | 2597 %endif |
2585 pxor m0, m0 | 2598 pxor m0, m0 |
2586 mova m6, m1 | 2599 mova m6, m1 |
2587 pcmpgtb m0, m1 ; which are negative | 2600 pcmpgtb m0, m1 ; which are negative |
2601 %if ssse3_or_higher | |
2602 punpcklbw m6, m7 ; interleave with "1" for rounding | |
2603 punpckhbw m1, m7 | |
2604 %else | |
2588 punpcklbw m6, m0 ; signed byte->word | 2605 punpcklbw m6, m0 ; signed byte->word |
2589 punpckhbw m1, m0 | 2606 punpckhbw m1, m0 |
2607 %endif | |
2590 mova lim_sign, m0 | 2608 mova lim_sign, m0 |
2609 %if ssse3_or_higher | |
2610 mova m7, [pb_27_63] | |
2611 %ifndef m8 | |
2612 mova lim_res, m1 | |
2613 %endif | |
2614 %ifdef m10 | |
2615 SWAP 0, 10 ; don't lose lim_sign copy | |
2616 %endif | |
2617 mova m0, m7 | |
2618 pmaddubsw m7, m6 | |
2619 SWAP 6, 7 | |
2620 pmaddubsw m0, m1 | |
2621 SWAP 1, 0 | |
2622 %ifdef m10 | |
2623 SWAP 0, 10 | |
2624 %else | |
2625 mova m0, lim_sign | |
2626 %endif | |
2627 %else | |
2591 mova mask_res, m6 ; backup for later in filter | 2628 mova mask_res, m6 ; backup for later in filter |
2592 mova lim_res, m1 | 2629 mova lim_res, m1 |
2593 pmullw m6, [pw_27] | 2630 pmullw m6, [pw_27] |
2594 pmullw m1, [pw_27] | 2631 pmullw m1, [pw_27] |
2595 paddw m6, m7 | 2632 paddw m6, m7 |
2596 paddw m1, m7 | 2633 paddw m1, m7 |
2634 %endif | |
2597 psraw m6, 7 | 2635 psraw m6, 7 |
2598 psraw m1, 7 | 2636 psraw m1, 7 |
2599 packsswb m6, m1 ; a0 | 2637 packsswb m6, m1 ; a0 |
2600 pxor m1, m1 | 2638 pxor m1, m1 |
2601 psubb m1, m6 | 2639 psubb m1, m6 |
2602 pand m1, m0 ; -a0 | 2640 pand m1, m0 ; -a0 |
2603 pandn m0, m6 ; +a0 | 2641 pandn m0, m6 ; +a0 |
2642 %if ssse3_or_higher | |
2643 mova m6, [pb_18_63] ; pipelining | |
2644 %endif | |
2604 psubusb m3, m1 | 2645 psubusb m3, m1 |
2605 paddusb m4, m1 | 2646 paddusb m4, m1 |
2606 paddusb m3, m0 ; p0+a0 | 2647 paddusb m3, m0 ; p0+a0 |
2607 psubusb m4, m0 ; q0-a0 | 2648 psubusb m4, m0 ; q0-a0 |
2608 | 2649 |
2650 %if ssse3_or_higher | |
2651 SWAP 6, 7 | |
2652 %ifdef m10 | |
2653 SWAP 1, 10 | |
2654 %else | |
2655 mova m1, lim_res | |
2656 %endif | |
2657 mova m0, m7 | |
2658 pmaddubsw m7, m6 | |
2659 SWAP 6, 7 | |
2660 pmaddubsw m0, m1 | |
2661 SWAP 1, 0 | |
2662 %ifdef m10 | |
2663 SWAP 0, 10 | |
2664 %endif | |
2665 mova m0, lim_sign | |
2666 %else | |
2609 mova m6, mask_res | 2667 mova m6, mask_res |
2610 mova m1, lim_res | 2668 mova m1, lim_res |
2611 mova m0, lim_sign | |
2612 pmullw m6, [pw_18] | 2669 pmullw m6, [pw_18] |
2613 pmullw m1, [pw_18] | 2670 pmullw m1, [pw_18] |
2614 paddw m6, m7 | 2671 paddw m6, m7 |
2615 paddw m1, m7 | 2672 paddw m1, m7 |
2673 %endif | |
2674 mova m0, lim_sign | |
2616 psraw m6, 7 | 2675 psraw m6, 7 |
2617 psraw m1, 7 | 2676 psraw m1, 7 |
2618 packsswb m6, m1 ; a1 | 2677 packsswb m6, m1 ; a1 |
2619 pxor m1, m1 | 2678 pxor m1, m1 |
2620 psubb m1, m6 | 2679 psubb m1, m6 |
2621 pand m1, m0 ; -a1 | 2680 pand m1, m0 ; -a1 |
2622 pandn m0, m6 ; +a1 | 2681 pandn m0, m6 ; +a1 |
2682 %if ssse3_or_higher | |
2683 mova m6, [pb_9_63] | |
2684 %endif | |
2623 psubusb m2, m1 | 2685 psubusb m2, m1 |
2624 paddusb m5, m1 | 2686 paddusb m5, m1 |
2625 paddusb m2, m0 ; p1+a1 | 2687 paddusb m2, m0 ; p1+a1 |
2626 psubusb m5, m0 ; q1-a1 | 2688 psubusb m5, m0 ; q1-a1 |
2627 | 2689 |
2690 %if ssse3_or_higher | |
2691 SWAP 6, 7 | |
2692 %ifdef m10 | |
2693 SWAP 1, 10 | |
2694 %else | |
2695 mova m1, lim_res | |
2696 %endif | |
2697 mova m0, m7 | |
2698 pmaddubsw m7, m6 | |
2699 SWAP 6, 7 | |
2700 pmaddubsw m0, m1 | |
2701 SWAP 1, 0 | |
2702 %else | |
2628 %ifdef m8 | 2703 %ifdef m8 |
2629 SWAP 6, 12 | 2704 SWAP 6, 12 |
2630 SWAP 1, 8 | 2705 SWAP 1, 8 |
2631 %else | 2706 %else |
2632 mova m6, mask_res | 2707 mova m6, mask_res |
2633 mova m1, lim_res | 2708 mova m1, lim_res |
2634 %endif | 2709 %endif |
2635 pmullw m6, [pw_9] | 2710 pmullw m6, [pw_9] |
2636 pmullw m1, [pw_9] | 2711 pmullw m1, [pw_9] |
2637 paddw m6, m7 | 2712 paddw m6, m7 |
2638 paddw m1, m7 | 2713 paddw m1, m7 |
2714 %endif | |
2639 %ifdef m9 | 2715 %ifdef m9 |
2640 SWAP 7, 9 | 2716 SWAP 7, 9 |
2641 %else | 2717 %else |
2642 mova m7, lim_sign | 2718 mova m7, lim_sign |
2643 %endif | 2719 %endif |