# HG changeset patch # User darkshikari # Date 1249371775 0 # Node ID c08ca946c80acd8305431f0b495b2ed623975d91 # Parent 46f8d58fbdfb1c385f0b4cdf4c8cfab4b2dec13b Update x264 asm code to latest to add support for 64-bit Windows. Use the new x86inc features to support 64-bit Windows on all non-x264 nasm assembly code as well. Patch by John Adcock, dscaler.johnad AT googlemail DOT com. Win64 changes originally by Anton Mitrofanov. x86util changes mostly by Holger Lubitz. diff -r 46f8d58fbdfb -r c08ca946c80a x86/fft_mmx.asm --- a/x86/fft_mmx.asm Mon Aug 03 23:22:46 2009 +0000 +++ b/x86/fft_mmx.asm Tue Aug 04 07:42:55 2009 +0000 @@ -457,7 +457,7 @@ ; On x86_32, this function does the register saving and restoring for all of fft. ; The others pass args in registers and don't spill anything. -cglobal fft_dispatch%3%2, 2,5,0, z, nbits +cglobal fft_dispatch%3%2, 2,5,8, z, nbits lea r2, [dispatch_tab%3%2 GLOBAL] mov r2, [r2 + (nbitsq-2)*gprsize] call r2 diff -r 46f8d58fbdfb -r c08ca946c80a x86/h264_deblock_sse2.asm --- a/x86/h264_deblock_sse2.asm Mon Aug 03 23:22:46 2009 +0000 +++ b/x86/h264_deblock_sse2.asm Tue Aug 04 07:42:55 2009 +0000 @@ -278,7 +278,7 @@ ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_XMM -cglobal x264_deblock_v_luma_sse2 +cglobal x264_deblock_v_luma_sse2, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] dec r2d ; alpha-1 @@ -318,54 +318,66 @@ DEBLOCK_P0_Q0 mova [r4+2*r1], m1 mova [r0], m2 - ret + RET ;----------------------------------------------------------------------------- ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_sse2 - movsxd r10, esi +cglobal x264_deblock_h_luma_sse2, 5,7 + movsxd r10, r1d lea r11, [r10+r10*2] - lea rax, [r0-4] - lea r9, [r0-4+r11] + lea r6, [r0-4] + lea r5, [r0-4+r11] +%ifdef WIN64 + sub rsp, 0x98 + %define pix_tmp rsp+0x30 +%else sub rsp, 0x68 %define pix_tmp rsp +%endif ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp - lea rax, [rax+r10*8] - lea r9, [r9 +r10*8] - TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8 + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp + lea r6, [r6+r10*8] + lea r5, [r5+r10*8] + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] - mov esi, 0x10 + mov r1d, 0x10 +%ifdef WIN64 + mov [rsp+0x20], r4 +%endif call x264_deblock_v_luma_sse2 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - add rax, 2 - add r9, 2 + add r6, 2 + add r5, 2 movq m0, [pix_tmp+0x18] movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) + TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 - sub rax, r10 - sub r9, r10 + sub r6, r10 + sub r5, r10 shr r10, 3 movq m0, [pix_tmp+0x10] movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) + TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) +%ifdef WIN64 + add rsp, 0x98 +%else add rsp, 0x68 - ret +%endif + RET %else @@ -388,7 +400,7 @@ mova m3, 
[r0+r1] ; q1 LOAD_MASK r2, r3 - mov r3, r4m + mov r3, r4mp movd m4, [r3] ; tc0 punpcklbw m4, m4 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] @@ -428,7 +440,7 @@ ;----------------------------------------------------------------------------- INIT_MMX cglobal x264_deblock_h_luma_%1, 0,5 - mov r0, r0m + mov r0, r0mp mov r3, r1m lea r4, [r3*3] sub r0, 4 @@ -459,7 +471,7 @@ ADD esp, 20 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - mov r0, r0m + mov r0, r0mp sub r0, 2 lea r1, [r0+r4] @@ -607,7 +619,7 @@ ;----------------------------------------------------------------------------- ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_intra_%1, 4,6 +cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 %ifndef ARCH_X86_64 sub esp, 0x60 %endif @@ -669,34 +681,34 @@ ;----------------------------------------------------------------------------- ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_intra_%1 +cglobal x264_deblock_h_luma_intra_%1, 4,7 movsxd r10, r1d lea r11, [r10*3] - lea rax, [r0-4] - lea r9, [r0-4+r11] + lea r6, [r0-4] + lea r5, [r0-4+r11] sub rsp, 0x88 %define pix_tmp rsp ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea rax, [rax+r10*8] - lea r9, [r9+r10*8] - TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea r6, [r6+r10*8] + lea r5, [r5+r10*8] + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) lea r0, [pix_tmp+0x40] mov r1, 0x10 call x264_deblock_v_luma_intra_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - lea r9, [rax+r11] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11) + lea r5, [r6+r11] + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) shl r10, 3 - sub rax, r10 - sub r9, r10 + sub r6, r10 + sub r5, r10 shr r10, 3 - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11) + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) add rsp, 0x88 - ret + RET %else cglobal x264_deblock_h_luma_intra_%1, 2,4 lea r3, [r1*3] @@ -725,7 +737,7 @@ ADD esp, 16 mov r1, r1m - mov r0, r0m + mov r0, r0mp lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] diff -r 46f8d58fbdfb -r c08ca946c80a x86/h264_idct_sse2.asm --- a/x86/h264_idct_sse2.asm Mon Aug 03 23:22:46 2009 +0000 +++ b/x86/h264_idct_sse2.asm Tue Aug 04 07:42:55 2009 +0000 @@ -31,15 +31,8 @@ SECTION .text -%macro IDCT4_1D 6 - SUMSUB_BA m%3, m%1 - SUMSUBD2_AB m%2, m%4, m%6, m%5 - SUMSUB_BADC m%2, m%3, m%5, m%1 - SWAP %1, %2, %5, %4, %3 -%endmacro - INIT_XMM -cglobal x264_add8x4_idct_sse2, 3,3 +cglobal x264_add8x4_idct_sse2, 3,3,8 movq m0, [r1+ 0] movq m1, [r1+ 8] movq m2, [r1+16] diff -r 46f8d58fbdfb -r c08ca946c80a x86/x86inc.asm --- a/x86/x86inc.asm Mon Aug 03 23:22:46 2009 +0000 +++ b/x86/x86inc.asm Tue Aug 04 07:42:55 2009 +0000 @@ -20,6 +20,14 @@ ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 
;***************************************************************************** +%ifdef ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 + %else + %define UNIX64 + %endif +%endif + ; FIXME: All of the 64bit asm functions that take a stride as an argument ; via register, assume that the high dword of that register is filled with 0. ; This is true in practice (since we never do any 64bit arithmetic on strides, @@ -28,68 +36,39 @@ ; Name of the .rodata section. ; Kludge: Something on OS X fails to align .rodata even given an align attribute, ; so use a different read-only section. -%macro SECTION_RODATA 0 +%macro SECTION_RODATA 0-1 16 %ifidn __OUTPUT_FORMAT__,macho64 - SECTION .text align=16 + SECTION .text align=%1 %elifidn __OUTPUT_FORMAT__,macho - SECTION .text align=16 + SECTION .text align=%1 fakegot: %else - SECTION .rodata align=16 + SECTION .rodata align=%1 %endif %endmacro -; PIC support macros. All these macros are totally harmless when PIC is -; not defined but can ruin everything if misused in PIC mode. On x86_32, shared -; objects cannot directly access global variables by address, they need to -; go through the GOT (global offset table). Most OSes do not care about it -; and let you load non-shared .so objects (Linux, Win32...). However, OS X -; requires PIC code in its .dylib objects. -; -; - GLOBAL should be used as a suffix for global addressing, eg. -; picgetgot ebx +; PIC support macros. +; x86_64 can't fit 64bit address literals in most instruction types, +; so shared objects (under the assumption that they might be anywhere +; in memory) must use an address mode that does fit. +; So all accesses to global variables must use this macro, e.g. ; mov eax, [foo GLOBAL] ; instead of ; mov eax, [foo] ; -; - picgetgot computes the GOT address into the given register in PIC -; mode, otherwise does nothing. You need to do this before using GLOBAL. -; Before in both execution order and compiled code order (so GLOBAL knows -; which register the GOT is in). +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. -%ifndef PIC - %define GLOBAL - %macro picgetgot 1 - %endmacro -%elifdef ARCH_X86_64 - %define PIC64 +%ifdef WIN64 + %define PIC +%elifndef ARCH_X86_64 + %undef PIC +%endif +%ifdef PIC %define GLOBAL wrt rip - %macro picgetgot 1 - %endmacro %else - %define PIC32 - %ifidn __OUTPUT_FORMAT__,macho - ; There is no real global offset table on OS X, but we still - ; need to reference our variables by offset. - %macro picgetgot 1 - call %%getgot - %%getgot: - pop %1 - add %1, $$ - %%getgot - %undef GLOBAL - %define GLOBAL + %1 - fakegot - %endmacro - %else ; elf - extern _GLOBAL_OFFSET_TABLE_ - %macro picgetgot 1 - call %%getgot - %%getgot: - pop %1 - add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc - %undef GLOBAL - %define GLOBAL + %1 wrt ..gotoff - %endmacro - %endif + %define GLOBAL %endif ; Macros to eliminate most code duplication between x86_32 and x86_64: @@ -99,14 +78,14 @@ ; PROLOGUE: ; %1 = number of arguments. loads them from stack if needed. -; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed. -; %3 = whether global constants are used in this function. inits x86_32 PIC if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. 
; %4 = list of names to define to registers ; PROLOGUE can also be invoked by adding the same options to cglobal ; e.g. -; cglobal foo, 2,3,0, dst, src, tmp -; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals +; cglobal foo, 2,3, dst, src, tmp +; declares a function (foo), taking two args (dst and src) and one local variable (tmp) ; TODO Some functions can use some args directly from the stack. If they're the ; last args then you can just not declare them, but if they're in the middle @@ -119,12 +98,25 @@ ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons ; which are slow when a normal ret follows a branch. +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + %macro DECLARE_REG 6 %define r%1q %2 %define r%1d %3 %define r%1w %4 %define r%1b %5 %define r%1m %6 + %ifid %6 ; i.e. it's a register + %define r%1mp %2 + %elifdef ARCH_X86_64 ; memory + %define r%1mp qword %6 + %else + %define r%1mp dword %6 + %endif %define r%1 %2 %endmacro @@ -150,6 +142,29 @@ DECLARE_REG_SIZE di, dil DECLARE_REG_SIZE bp, bpl +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7 + %ifdef ARCH_X86_64 %define gprsize 8 %else @@ -224,8 +239,7 @@ %assign n_arg_names %%i %endmacro -%ifdef ARCH_X86_64 ;========================================================== -%ifidn __OUTPUT_FORMAT__,win32 +%ifdef WIN64 ; Windows x64 ;================================================= DECLARE_REG 0, rcx, ecx, cx, cl, ecx DECLARE_REG 1, rdx, edx, dx, dl, edx @@ -239,11 +253,75 @@ %macro LOAD_IF_USED 2 ; reg_id, number_of_args %if %1 < %2 - mov r%1, [rsp + 8 + %1*8] + mov r%1, [rsp + stack_offset + 8 + %1*8] %endif %endmacro -%else ;======================================================================= +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 
+ ASSERT %2 >= %1 + %assign regs_used %2 + ASSERT regs_used <= 7 + %if %0 > 2 + %assign xmm_regs_used %3 + %else + %assign xmm_regs_used 0 + %endif + ASSERT xmm_regs_used <= 16 + %if regs_used > 4 + push r4 + push r5 + %assign stack_offset stack_offset+16 + %endif + %if xmm_regs_used > 6 + sub rsp, (xmm_regs_used-6)*16+16 + %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i + %endrep + %endif + LOAD_IF_USED 4, %1 + LOAD_IF_USED 5, %1 + LOAD_IF_USED 6, %1 + DEFINE_ARGS %4 +%endmacro + +%macro RESTORE_XMM_INTERNAL 1 + %if xmm_regs_used > 6 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8] + %endrep + add %1, (xmm_regs_used-6)*16+16 + %endif +%endmacro + +%macro RESTORE_XMM 1 + RESTORE_XMM_INTERNAL %1 + %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 + %assign xmm_regs_used 0 +%endmacro + +%macro RET 0 + RESTORE_XMM_INTERNAL rsp + %if regs_used > 4 + pop r5 + pop r4 + %endif + ret +%endmacro + +%macro REP_RET 0 + %if regs_used > 4 || xmm_regs_used > 6 + RET + %else + rep ret + %endif +%endmacro + +%elifdef ARCH_X86_64 ; *nix x64 ;============================================= DECLARE_REG 0, rdi, edi, di, dil, edi DECLARE_REG 1, rsi, esi, si, sil, esi @@ -261,16 +339,9 @@ %endif %endmacro -%endif ; !WIN64 - -%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names... +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... ASSERT %2 >= %1 ASSERT %2 <= 7 - %assign stack_offset 0 -%ifidn __OUTPUT_FORMAT__,win32 - LOAD_IF_USED 4, %1 - LOAD_IF_USED 5, %1 -%endif LOAD_IF_USED 6, %1 DEFINE_ARGS %4 %endmacro @@ -315,15 +386,9 @@ %endif %endmacro -%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names... +%macro PROLOGUE 2-4+ ; #args, #regs, arg_names... 
ASSERT %2 >= %1 - %assign stack_offset 0 %assign regs_used %2 - %ifdef PIC - %if %3 - %assign regs_used regs_used+1 - %endif - %endif ASSERT regs_used <= 7 PUSH_IF_USED 3 PUSH_IF_USED 4 @@ -336,9 +401,6 @@ LOAD_IF_USED 4, %1 LOAD_IF_USED 5, %1 LOAD_IF_USED 6, %1 - %if %3 - picgetgot r%2 - %endif DEFINE_ARGS %4 %endmacro @@ -382,6 +444,7 @@ align function_align %1: RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer + %assign stack_offset 0 %if %0 > 1 PROLOGUE %2 %endif @@ -389,11 +452,9 @@ %macro cextern 1 %ifdef PREFIX - extern _%1 - %define %1 _%1 - %else - extern %1 + %xdefine %1 _%1 %endif + extern %1 %endmacro ; This is needed for ELF, otherwise the GNU linker assumes the stack is @@ -523,6 +584,7 @@ %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE n, m %+ %%i, %%i %assign %%i %%i+1 %endrep %endmacro @@ -534,7 +596,30 @@ %endif %endmacro -; substitutions which are functionally identical but reduce code size +;Substitutions that reduce instruction size but are functionally equivalent %define movdqa movaps %define movdqu movups +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro diff -r 46f8d58fbdfb -r c08ca946c80a x86/x86util.asm --- a/x86/x86util.asm Mon Aug 03 23:22:46 2009 +0000 +++ b/x86/x86util.asm Tue Aug 04 07:42:55 2009 +0000 @@ -93,7 +93,7 @@ SBUTTERFLY qdq, %4, %8, %2 SWAP %2, %5 SWAP %4, %7 -%if 0<11 +%if %0<11 movdqa m%5, %10 %endif %endif @@ -165,28 +165,203 @@ palignr %1, %2, %3 %endmacro -%macro SUMSUB_BA 2 +%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from +%ifnum %5 + mova m%1, m%5 + mova m%3, m%5 +%else + mova m%1, %5 + mova m%3, m%1 +%endif + pand m%1, m%2 ; dst .. y6 .. y4 + pand m%3, m%4 ; src .. y6 .. y4 + psrlw m%2, 8 ; dst .. y7 .. y5 + psrlw m%4, 8 ; src .. y7 .. y5 +%endmacro + +%macro SUMSUB_BA 2-3 +%if %0==2 paddw %1, %2 paddw %2, %2 psubw %2, %1 +%else + mova %3, %1 + paddw %1, %2 + psubw %2, %3 +%endif %endmacro -%macro SUMSUB_BADC 4 +%macro SUMSUB_BADC 4-5 +%if %0==5 + SUMSUB_BA %1, %2, %5 + SUMSUB_BA %3, %4, %5 +%else paddw %1, %2 paddw %3, %4 paddw %2, %2 paddw %4, %4 psubw %2, %1 psubw %4, %3 +%endif %endmacro -%macro HADAMARD8_1D 8 +%macro HADAMARD4_V 4+ + SUMSUB_BADC %1, %2, %3, %4 + SUMSUB_BADC %1, %3, %2, %4 +%endmacro + +%macro HADAMARD8_V 8+ + SUMSUB_BADC %1, %2, %3, %4 + SUMSUB_BADC %5, %6, %7, %8 + SUMSUB_BADC %1, %3, %2, %4 + SUMSUB_BADC %5, %7, %6, %8 SUMSUB_BADC %1, %5, %2, %6 SUMSUB_BADC %3, %7, %4, %8 - SUMSUB_BADC %1, %3, %2, %4 - SUMSUB_BADC %5, %7, %6, %8 - SUMSUB_BADC %1, %2, %3, %4 - SUMSUB_BADC %5, %6, %7, %8 +%endmacro + +%macro TRANS_SSE2 5-6 +; TRANSPOSE2x2 +; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq +; %2: ord/unord (for compat with sse4, unused) +; %3/%4: source regs +; %5/%6: tmp regs +%ifidn %1, d +%define mask [mask_10 GLOBAL] +%define shift 16 +%elifidn %1, q +%define mask [mask_1100 GLOBAL] +%define shift 32 +%endif +%if %0==6 ; less dependency if we have two tmp + mova m%5, mask ; ff00 + mova m%6, m%4 ; x5x4 + psll%1 m%4, shift ; x4.. + pand m%6, m%5 ; x5.. + pandn m%5, m%3 ; ..x0 + psrl%1 m%3, shift ; ..x1 + por m%4, m%5 ; x4x0 + por m%3, m%6 ; x5x1 +%else ; more dependency, one insn less. sometimes faster, sometimes not + mova m%5, m%4 ; x5x4 + psll%1 m%4, shift ; x4.. + pxor m%4, m%3 ; (x4^x1)x0 + pand m%4, mask ; (x4^x1).. 
+ pxor m%3, m%4 ; x4x0 + psrl%1 m%4, shift ; ..(x1^x4) + pxor m%5, m%4 ; x5x1 + SWAP %4, %3, %5 +%endif +%endmacro + +%macro TRANS_SSE4 5-6 ; see above +%ifidn %1, d + mova m%5, m%3 +%ifidn %2, ord + psrl%1 m%3, 16 +%endif + pblendw m%3, m%4, 10101010b + psll%1 m%4, 16 +%ifidn %2, ord + pblendw m%4, m%5, 01010101b +%else + psrl%1 m%5, 16 + por m%4, m%5 +%endif +%elifidn %1, q + mova m%5, m%3 + shufps m%3, m%4, 10001000b + shufps m%5, m%4, 11011101b + SWAP %4, %5 +%endif +%endmacro + +%macro HADAMARD 5-6 +; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes) +; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes) +; %3/%4: regs +; %5(%6): tmpregs +%if %1!=0 ; have to reorder stuff for horizontal op + %ifidn %2, sumsub + %define ORDER ord + ; sumsub needs order because a-b != b-a unless a=b + %else + %define ORDER unord + ; if we just max, order doesn't matter (allows pblendw+or in sse4) + %endif + %if %1==1 + TRANS d, ORDER, %3, %4, %5, %6 + %elif %1==2 + %if mmsize==8 + SBUTTERFLY dq, %3, %4, %5 + %else + TRANS q, ORDER, %3, %4, %5, %6 + %endif + %elif %1==4 + SBUTTERFLY qdq, %3, %4, %5 + %endif +%endif +%ifidn %2, sumsub + SUMSUB_BA m%3, m%4, m%5 +%else + %ifidn %2, amax + %if %0==6 + ABS2 m%3, m%4, m%5, m%6 + %else + ABS1 m%3, m%5 + ABS1 m%4, m%5 + %endif + %endif + pmaxsw m%3, m%4 +%endif +%endmacro + + +%macro HADAMARD2_2D 6-7 sumsub + HADAMARD 0, sumsub, %1, %2, %5 + HADAMARD 0, sumsub, %3, %4, %5 + SBUTTERFLY %6, %1, %2, %5 +%ifnum %7 + HADAMARD 0, amax, %1, %2, %5, %7 +%else + HADAMARD 0, %7, %1, %2, %5 +%endif + SBUTTERFLY %6, %3, %4, %5 +%ifnum %7 + HADAMARD 0, amax, %3, %4, %5, %7 +%else + HADAMARD 0, %7, %3, %4, %5 +%endif +%endmacro + +%macro HADAMARD4_2D 5-6 sumsub + HADAMARD2_2D %1, %2, %3, %4, %5, wd + HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6 + SWAP %2, %3 +%endmacro + +%macro HADAMARD4_2D_SSE 5-6 sumsub + HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1 + HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3 + SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0 + SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2 + HADAMARD2_2D %1, %3, %2, %4, %5, dq + SBUTTERFLY qdq, %1, %2, %5 + HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1 + SBUTTERFLY qdq, %3, %4, %5 + HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3 +%endmacro + +%macro HADAMARD8_2D 9-10 sumsub + HADAMARD2_2D %1, %2, %3, %4, %9, wd + HADAMARD2_2D %5, %6, %7, %8, %9, wd + HADAMARD2_2D %1, %3, %2, %4, %9, dq + HADAMARD2_2D %5, %7, %6, %8, %9, dq + HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10 + HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10 +%ifnidn %10, amax + SWAP %2, %5 + SWAP %4, %7 +%endif %endmacro %macro SUMSUB2_AB 3 @@ -197,13 +372,49 @@ psubw %3, %2 %endmacro +%macro SUMSUB2_BA 3 + mova m%3, m%1 + paddw m%1, m%2 + paddw m%1, m%2 + psubw m%2, m%3 + psubw m%2, m%3 +%endmacro + %macro SUMSUBD2_AB 4 mova %4, %1 mova %3, %2 psraw %2, 1 - psraw %4, 1 - paddw %1, %2 - psubw %4, %3 + psraw %1, 1 + paddw %2, %4 + psubw %1, %3 +%endmacro + +%macro DCT4_1D 5 +%ifnum %5 + SUMSUB_BADC m%4, m%1, m%3, m%2; m%5 + SUMSUB_BA m%3, m%4, m%5 + SUMSUB2_AB m%1, m%2, m%5 + SWAP %1, %3, %4, %5, %2 +%else + SUMSUB_BADC m%4, m%1, m%3, m%2 + SUMSUB_BA m%3, m%4 + mova [%5], m%2 + SUMSUB2_AB m%1, [%5], m%2 + SWAP %1, %3, %4, %2 +%endif +%endmacro + +%macro IDCT4_1D 5-6 +%ifnum %5 + SUMSUBD2_AB m%2, m%4, m%6, m%5 + SUMSUB_BA m%3, m%1, m%6 + SUMSUB_BADC m%4, m%3, m%2, m%1, m%6 +%else + SUMSUBD2_AB m%2, m%4, [%5], [%5+16] + SUMSUB_BA m%3, m%1 + SUMSUB_BADC m%4, m%3, m%2, m%1 +%endif + SWAP %1, %4, %3 %endmacro %macro 
LOAD_DIFF 5 @@ -222,17 +433,81 @@ %endif %endmacro -%macro LOAD_DIFF_8x4P 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer - LOAD_DIFF %1, %5, none, [%7], [%8] - LOAD_DIFF %2, %6, none, [%7+r1], [%8+r3] - LOAD_DIFF %3, %5, none, [%7+2*r1], [%8+2*r3] - LOAD_DIFF %4, %6, none, [%7+r4], [%8+r5] +%macro LOAD_DIFF8x4_SSE2 8 + LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE] + LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE] + LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE] + LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE] +%endmacro + +%macro LOAD_DIFF8x4_SSSE3 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr + movh m%2, [%8+%1*FDEC_STRIDE] + movh m%1, [%7+%1*FENC_STRIDE] + punpcklbw m%1, m%2 + movh m%3, [%8+%2*FDEC_STRIDE] + movh m%2, [%7+%2*FENC_STRIDE] + punpcklbw m%2, m%3 + movh m%4, [%8+%3*FDEC_STRIDE] + movh m%3, [%7+%3*FENC_STRIDE] + punpcklbw m%3, m%4 + movh m%5, [%8+%4*FDEC_STRIDE] + movh m%4, [%7+%4*FENC_STRIDE] + punpcklbw m%4, m%5 + pmaddubsw m%1, m%6 + pmaddubsw m%2, m%6 + pmaddubsw m%3, m%6 + pmaddubsw m%4, m%6 +%endmacro + +%macro STORE_DCT 6 + movq [%5+%6+ 0], m%1 + movq [%5+%6+ 8], m%2 + movq [%5+%6+16], m%3 + movq [%5+%6+24], m%4 + movhps [%5+%6+32], m%1 + movhps [%5+%6+40], m%2 + movhps [%5+%6+48], m%3 + movhps [%5+%6+56], m%4 +%endmacro + +%macro STORE_IDCT 4 + movhps [r0-4*FDEC_STRIDE], %1 + movh [r0-3*FDEC_STRIDE], %1 + movhps [r0-2*FDEC_STRIDE], %2 + movh [r0-1*FDEC_STRIDE], %2 + movhps [r0+0*FDEC_STRIDE], %3 + movh [r0+1*FDEC_STRIDE], %3 + movhps [r0+2*FDEC_STRIDE], %4 + movh [r0+3*FDEC_STRIDE], %4 +%endmacro + +%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment? + LOAD_DIFF m%1, m%5, m%7, [%8], [%9] + LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3] + LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3] + LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro DIFFx2 6-7 + movh %3, %5 + punpcklbw %3, %4 + psraw %1, 6 + paddsw %1, %3 + movh %3, %6 + punpcklbw %3, %4 + psraw %2, 6 + paddsw %2, %3 + packuswb %2, %1 %endmacro %macro STORE_DIFF 4 - psraw %1, 6 movh %2, %4 punpcklbw %2, %3 + psraw %1, 6 paddsw %1, %2 packuswb %1, %1 movh %4, %1
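
The sketch below illustrates how a function is expected to declare itself against the updated x86inc.asm interface introduced above: the third numeric argument to cglobal is now the number of XMM registers the function clobbers (so the Win64 PROLOGUE can spill xmm6 and above), epilogues use RET/REP_RET rather than a bare ret so the saved registers actually get restored, and constants are addressed through GLOBAL because Win64 defines PIC. It is a minimal sketch only; the function name, the pw_4 constant and the include layout are illustrative and do not appear in this changeset, and it assumes the updated x86inc.asm is on the include path.

%include "x86inc.asm"

SECTION_RODATA
; illustrative 16-byte constant; accessed RIP-relative wherever PIC is defined
pw_4: times 8 dw 4

SECTION .text
INIT_XMM
; 2 args, 2 GPRs, declared as if it clobbered 8 XMM registers (only to show
; the Win64 behaviour: with a count above 6 the PROLOGUE spills xmm6/xmm7 and
; RET restores them; this trivial body touches only m0).
; Assumes dst and src are 16-byte aligned, since mova is used.
cglobal example_add4_sse2, 2,2,8, dst, src
    mova    m0, [srcq]
    paddw   m0, [pw_4 GLOBAL]   ; GLOBAL expands to "wrt rip" when PIC is set
    mova    [dstq], m0
    RET                         ; not "ret": the macro runs the Win64 epilogue

Two related details from the diff are worth keeping in mind when porting other functions. Loads of pointer arguments should use rNmp rather than rNm, since rNm is defined as a dword and would truncate a pointer on a 64-bit target, which is why the deblock code switches r0m/r4m to r0mp/r4mp. And the overridden add/sub macros rewrite "add reg, 128" as "sub reg, -128" (and vice versa) because -128 fits in a sign-extended 8-bit immediate while +128 does not, typically saving three bytes per instruction.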