libavcodec.hg: x86/h264_idct.asm comparison

comparison x86/h264_idct.asm @ 12510:ef2f2db5b7be libavcodec

Unroll the loop in h264_idct_add8_sse2(). This also lets us inline the scan8[] values directly in the code and remove the loop setup. 20% faster in the function, 0.8% overall. See the "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on the ML.

author	rbultje
date	Fri, 24 Sep 2010 14:05:45 +0000
parents	58a960d6e34c
children	41ebcc0afb40

comparison

equal deleted inserted replaced

-:7220936dc29c
+:ef2f2db5b7be
 add         r2, 64
 cmp         r5, 16
 jl .next2blocks
 REP_RET
-h264_idct_add8_sse2_plane:
+%macro add8_sse2_cycle 2
-.next2blocks
+movzx       r0, word [r4+%2]
-movzx       r0, byte [scan8+r5]
-movzx       r0, word [r4+r0]
 test        r0, r0
-jz .try_dc
+jz .try%1dc
 %ifdef ARCH_X86_64
-mov        r0d, dword [r1+r5*4]
+mov        r0d, dword [r1+%1*8+64]
 add         r0, [r10]
 %else
-mov         r0, r1m ; XXX r1m here is actually r0m of the calling func
+mov         r0, r0m
 mov         r0, [r0]
-add         r0, dword [r1+r5*4]
+add         r0, dword [r1+%1*8+64]
 %endif
 call        x264_add8x4_idct_sse2
-add         r5, 2
+jmp .cycle%1end
-add         r2, 64
+.try%1dc
-test        r5, 3
-jnz .next2blocks
-rep ret
-.try_dc
 movsx       r0, word [r2   ]
 or         r0w, word [r2+32]
-jz .skip2blocks
+jz .cycle%1end
 %ifdef ARCH_X86_64
-mov        r0d, dword [r1+r5*4]
+mov        r0d, dword [r1+%1*8+64]
 add         r0, [r10]
 %else
-mov         r0, r1m ; XXX r1m here is actually r0m of the calling func
+mov         r0, r0m
 mov         r0, [r0]
-add         r0, dword [r1+r5*4]
+add         r0, dword [r1+%1*8+64]
 %endif
 call        h264_idct_dc_add8_mmx2
-.skip2blocks
+.cycle%1end
-add         r5, 2
+%if %1 < 3
 add         r2, 64
-test        r5, 3
+%endif
-jnz .next2blocks
+%endmacro
-rep ret
 ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
 ;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
 cglobal h264_idct_add8_sse2, 5, 7, 8
-mov          r5, 16
 add          r2, 512
-%ifdef PIC
-lea        r11, [scan8_mem]
-%endif
 %ifdef ARCH_X86_64
 mov         r10, r0
 %endif
-call h264_idct_add8_sse2_plane
+add8_sse2_cycle 0, 0x09
+add8_sse2_cycle 1, 0x11
 %ifdef ARCH_X86_64
 add         r10, gprsize
 %else
 add        r0mp, gprsize
 %endif
-call h264_idct_add8_sse2_plane
+add8_sse2_cycle 2, 0x21
-RET
+add8_sse2_cycle 3, 0x29
+RET

Mercurial > libavcodec.hg

comparison x86/h264_idct.asm @ 12510:ef2f2db5b7be libavcodec