Mercurial > libavcodec.hg
comparison x86/h264_idct.asm @ 12510:ef2f2db5b7be libavcodec
Unroll the loop in h264_idct_add8_sse2(). This also lets us inline the scan8[]
values directly in the code and remove the loop setup. 20% faster in the function, 0.8% overall.
See the "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on the mailing list.
author | rbultje |
---|---|
date | Fri, 24 Sep 2010 14:05:45 +0000 |
parents | 58a960d6e34c |
children | 41ebcc0afb40 |
comparison
equal
deleted
inserted
replaced
12509:7220936dc29c | 12510:ef2f2db5b7be |
---|---|
802 add r2, 64 | 802 add r2, 64 |
803 cmp r5, 16 | 803 cmp r5, 16 |
804 jl .next2blocks | 804 jl .next2blocks |
805 REP_RET | 805 REP_RET |
806 | 806 |
807 h264_idct_add8_sse2_plane: | 807 %macro add8_sse2_cycle 2 |
808 .next2blocks | 808 movzx r0, word [r4+%2] |
809 movzx r0, byte [scan8+r5] | |
810 movzx r0, word [r4+r0] | |
811 test r0, r0 | 809 test r0, r0 |
812 jz .try_dc | 810 jz .try%1dc |
813 %ifdef ARCH_X86_64 | 811 %ifdef ARCH_X86_64 |
814 mov r0d, dword [r1+r5*4] | 812 mov r0d, dword [r1+%1*8+64] |
815 add r0, [r10] | 813 add r0, [r10] |
816 %else | 814 %else |
817 mov r0, r1m ; XXX r1m here is actually r0m of the calling func | 815 mov r0, r0m |
818 mov r0, [r0] | 816 mov r0, [r0] |
819 add r0, dword [r1+r5*4] | 817 add r0, dword [r1+%1*8+64] |
820 %endif | 818 %endif |
821 call x264_add8x4_idct_sse2 | 819 call x264_add8x4_idct_sse2 |
822 add r5, 2 | 820 jmp .cycle%1end |
823 add r2, 64 | 821 .try%1dc |
824 test r5, 3 | |
825 jnz .next2blocks | |
826 rep ret | |
827 .try_dc | |
828 movsx r0, word [r2 ] | 822 movsx r0, word [r2 ] |
829 or r0w, word [r2+32] | 823 or r0w, word [r2+32] |
830 jz .skip2blocks | 824 jz .cycle%1end |
831 %ifdef ARCH_X86_64 | 825 %ifdef ARCH_X86_64 |
832 mov r0d, dword [r1+r5*4] | 826 mov r0d, dword [r1+%1*8+64] |
833 add r0, [r10] | 827 add r0, [r10] |
834 %else | 828 %else |
835 mov r0, r1m ; XXX r1m here is actually r0m of the calling func | 829 mov r0, r0m |
836 mov r0, [r0] | 830 mov r0, [r0] |
837 add r0, dword [r1+r5*4] | 831 add r0, dword [r1+%1*8+64] |
838 %endif | 832 %endif |
839 call h264_idct_dc_add8_mmx2 | 833 call h264_idct_dc_add8_mmx2 |
840 .skip2blocks | 834 .cycle%1end |
841 add r5, 2 | 835 %if %1 < 3 |
842 add r2, 64 | 836 add r2, 64 |
843 test r5, 3 | 837 %endif |
844 jnz .next2blocks | 838 %endmacro |
845 rep ret | |
846 | 839 |
847 ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, | 840 ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, |
848 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | 841 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
849 cglobal h264_idct_add8_sse2, 5, 7, 8 | 842 cglobal h264_idct_add8_sse2, 5, 7, 8 |
850 mov r5, 16 | |
851 add r2, 512 | 843 add r2, 512 |
852 %ifdef PIC | |
853 lea r11, [scan8_mem] | |
854 %endif | |
855 %ifdef ARCH_X86_64 | 844 %ifdef ARCH_X86_64 |
856 mov r10, r0 | 845 mov r10, r0 |
857 %endif | 846 %endif |
858 call h264_idct_add8_sse2_plane | 847 add8_sse2_cycle 0, 0x09 |
| | 848 add8_sse2_cycle 1, 0x11 |
859 %ifdef ARCH_X86_64 | 849 %ifdef ARCH_X86_64 |
860 add r10, gprsize | 850 add r10, gprsize |
861 %else | 851 %else |
862 add r0mp, gprsize | 852 add r0mp, gprsize |
863 %endif | 853 %endif |
864 call h264_idct_add8_sse2_plane | 854 add8_sse2_cycle 2, 0x21 |
865 RET | 855 add8_sse2_cycle 3, 0x29 |
| | 856 RET |