comparison x86/h264_idct.asm @ 12510:ef2f2db5b7be libavcodec

Unroll the loop in h264_idct_add8_sse2(). This also lets us inline the scan8[] values directly in the code and remove the loop setup. 20% faster in the function, 0.8% overall. See the "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on the ML.
author rbultje
date Fri, 24 Sep 2010 14:05:45 +0000
parents 58a960d6e34c
children 41ebcc0afb40
comparison
equal deleted inserted replaced
12509:7220936dc29c 12510:ef2f2db5b7be
802 add r2, 64 802 add r2, 64
803 cmp r5, 16 803 cmp r5, 16
804 jl .next2blocks 804 jl .next2blocks
805 REP_RET 805 REP_RET
806 806
807 h264_idct_add8_sse2_plane: 807 %macro add8_sse2_cycle 2
808 .next2blocks 808 movzx r0, word [r4+%2]
809 movzx r0, byte [scan8+r5]
810 movzx r0, word [r4+r0]
811 test r0, r0 809 test r0, r0
812 jz .try_dc 810 jz .try%1dc
813 %ifdef ARCH_X86_64 811 %ifdef ARCH_X86_64
814 mov r0d, dword [r1+r5*4] 812 mov r0d, dword [r1+%1*8+64]
815 add r0, [r10] 813 add r0, [r10]
816 %else 814 %else
817 mov r0, r1m ; XXX r1m here is actually r0m of the calling func 815 mov r0, r0m
818 mov r0, [r0] 816 mov r0, [r0]
819 add r0, dword [r1+r5*4] 817 add r0, dword [r1+%1*8+64]
820 %endif 818 %endif
821 call x264_add8x4_idct_sse2 819 call x264_add8x4_idct_sse2
822 add r5, 2 820 jmp .cycle%1end
823 add r2, 64 821 .try%1dc
824 test r5, 3
825 jnz .next2blocks
826 rep ret
827 .try_dc
828 movsx r0, word [r2 ] 822 movsx r0, word [r2 ]
829 or r0w, word [r2+32] 823 or r0w, word [r2+32]
830 jz .skip2blocks 824 jz .cycle%1end
831 %ifdef ARCH_X86_64 825 %ifdef ARCH_X86_64
832 mov r0d, dword [r1+r5*4] 826 mov r0d, dword [r1+%1*8+64]
833 add r0, [r10] 827 add r0, [r10]
834 %else 828 %else
835 mov r0, r1m ; XXX r1m here is actually r0m of the calling func 829 mov r0, r0m
836 mov r0, [r0] 830 mov r0, [r0]
837 add r0, dword [r1+r5*4] 831 add r0, dword [r1+%1*8+64]
838 %endif 832 %endif
839 call h264_idct_dc_add8_mmx2 833 call h264_idct_dc_add8_mmx2
840 .skip2blocks 834 .cycle%1end
841 add r5, 2 835 %if %1 < 3
842 add r2, 64 836 add r2, 64
843 test r5, 3 837 %endif
844 jnz .next2blocks 838 %endmacro
845 rep ret
846 839
847 ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, 840 ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
848 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) 841 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
849 cglobal h264_idct_add8_sse2, 5, 7, 8 842 cglobal h264_idct_add8_sse2, 5, 7, 8
850 mov r5, 16
851 add r2, 512 843 add r2, 512
852 %ifdef PIC
853 lea r11, [scan8_mem]
854 %endif
855 %ifdef ARCH_X86_64 844 %ifdef ARCH_X86_64
856 mov r10, r0 845 mov r10, r0
857 %endif 846 %endif
858 call h264_idct_add8_sse2_plane 847 add8_sse2_cycle 0, 0x09
848 add8_sse2_cycle 1, 0x11
859 %ifdef ARCH_X86_64 849 %ifdef ARCH_X86_64
860 add r10, gprsize 850 add r10, gprsize
861 %else 851 %else
862 add r0mp, gprsize 852 add r0mp, gprsize
863 %endif 853 %endif
864 call h264_idct_add8_sse2_plane 854 add8_sse2_cycle 2, 0x21
865 RET 855 add8_sse2_cycle 3, 0x29
856 RET