changeset 10019:c08ca946c80a libavcodec

Update the x264 asm code to the latest upstream version to add support for 64-bit Windows. Use the new x86inc features to support 64-bit Windows in all non-x264 NASM assembly code as well. Patch by John Adcock, dscaler.johnad AT googlemail DOT com. Win64 changes originally by Anton Mitrofanov. x86util changes mostly by Holger Lubitz.
author darkshikari
date Tue, 04 Aug 2009 07:42:55 +0000
parents 46f8d58fbdfb
children 617166c76faf
files x86/fft_mmx.asm x86/h264_deblock_sse2.asm x86/h264_idct_sse2.asm x86/x86inc.asm x86/x86util.asm
diffstat 5 files changed, 507 insertions(+), 142 deletions(-)
--- a/x86/fft_mmx.asm	Mon Aug 03 23:22:46 2009 +0000
+++ b/x86/fft_mmx.asm	Tue Aug 04 07:42:55 2009 +0000
@@ -457,7 +457,7 @@
 
 ; On x86_32, this function does the register saving and restoring for all of fft.
 ; The others pass args in registers and don't spill anything.
-cglobal fft_dispatch%3%2, 2,5,0, z, nbits
+cglobal fft_dispatch%3%2, 2,5,8, z, nbits
     lea r2, [dispatch_tab%3%2 GLOBAL]
     mov r2, [r2 + (nbitsq-2)*gprsize]
     call r2
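
For reference: the third cglobal parameter now declares how many xmm registers the function clobbers, so fft_dispatch and the fft kernels it calls are marked as using xmm0-xmm7. On Win64, where xmm6-xmm15 are callee-saved, the new PROLOGUE therefore spills the top two; on every other target the count is ignored. A rough sketch of the extra Win64 prologue code this one-character change generates, derived by hand from the x86inc macros later in this patch:

    sub    rsp, 48             ; (8-6)*16+16 bytes of xmm spill space
    movdqa [rsp+24], xmm7      ; save the callee-saved xmm regs the fft clobbers;
    movdqa [rsp+8],  xmm6      ; RET restores them via RESTORE_XMM_INTERNAL
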
--- a/x86/h264_deblock_sse2.asm	Mon Aug 03 23:22:46 2009 +0000
+++ b/x86/h264_deblock_sse2.asm	Tue Aug 04 07:42:55 2009 +0000
@@ -278,7 +278,7 @@
 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_XMM
-cglobal x264_deblock_v_luma_sse2
+cglobal x264_deblock_v_luma_sse2, 5,5,10
     movd    m8, [r4] ; tc0
     lea     r4, [r1*3]
     dec     r2d        ; alpha-1
@@ -318,54 +318,66 @@
     DEBLOCK_P0_Q0
     mova    [r4+2*r1], m1
     mova    [r0], m2
-    ret
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_deblock_h_luma_sse2
-    movsxd r10, esi
+cglobal x264_deblock_h_luma_sse2, 5,7
+    movsxd r10, r1d
     lea    r11, [r10+r10*2]
-    lea    rax, [r0-4]
-    lea    r9,  [r0-4+r11]
+    lea    r6,  [r0-4]
+    lea    r5,  [r0-4+r11]
+%ifdef WIN64
+    sub    rsp, 0x98
+    %define pix_tmp rsp+0x30
+%else
     sub    rsp, 0x68
     %define pix_tmp rsp
+%endif
 
     ; transpose 6x16 -> tmp space
-    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp
-    lea    rax, [rax+r10*8]
-    lea    r9,  [r9 +r10*8]
-    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
+    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
+    lea    r6, [r6+r10*8]
+    lea    r5, [r5+r10*8]
+    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
 
     ; vertical filter
     ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
     lea    r0, [pix_tmp+0x30]
-    mov    esi, 0x10
+    mov    r1d, 0x10
+%ifdef WIN64
+    mov    [rsp+0x20], r4
+%endif
     call   x264_deblock_v_luma_sse2
 
     ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
-    add    rax, 2
-    add    r9,  2
+    add    r6, 2
+    add    r5, 2
     movq   m0, [pix_tmp+0x18]
     movq   m1, [pix_tmp+0x28]
     movq   m2, [pix_tmp+0x38]
     movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
 
     shl    r10, 3
-    sub    rax, r10
-    sub    r9,  r10
+    sub    r6,  r10
+    sub    r5,  r10
     shr    r10, 3
     movq   m0, [pix_tmp+0x10]
     movq   m1, [pix_tmp+0x20]
     movq   m2, [pix_tmp+0x30]
     movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
 
+%ifdef WIN64
+    add    rsp, 0x98
+%else
     add    rsp, 0x68
-    ret
+%endif
+    RET
 
 %else
 
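Some context on the WIN64 branch above: the Microsoft x64 calling convention makes the caller reserve a 32-byte shadow space at [rsp] .. [rsp+0x1f] for the callee, and the fifth integer argument is passed on the stack at [rsp+0x20]. That is (roughly) why the scratch frame grows from 0x68 to 0x98 bytes with pix_tmp moved up to rsp+0x30, and why

    mov    [rsp+0x20], r4      ; pass tc0 as the 5th stack argument

is needed before the call, so that the PROLOGUE of x264_deblock_v_luma_sse2 can reload tc0 from its argument-4 stack slot.
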
@@ -388,7 +400,7 @@
     mova    m3, [r0+r1]   ; q1
     LOAD_MASK r2, r3
 
-    mov     r3, r4m
+    mov     r3, r4mp
     movd    m4, [r3] ; tc0
     punpcklbw m4, m4
     punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
@@ -428,7 +440,7 @@
 ;-----------------------------------------------------------------------------
 INIT_MMX
 cglobal x264_deblock_h_luma_%1, 0,5
-    mov    r0, r0m
+    mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]
     sub    r0, 4
@@ -459,7 +471,7 @@
     ADD    esp, 20
 
     ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
-    mov    r0, r0m
+    mov    r0, r0mp
     sub    r0, 2
     lea    r1, [r0+r4]
 
@@ -607,7 +619,7 @@
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6
+cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
 %ifndef ARCH_X86_64
     sub     esp, 0x60
 %endif
@@ -669,34 +681,34 @@
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1
+cglobal x264_deblock_h_luma_intra_%1, 4,7
     movsxd r10, r1d
     lea    r11, [r10*3]
-    lea    rax, [r0-4]
-    lea    r9,  [r0-4+r11]
+    lea    r6,  [r0-4]
+    lea    r5,  [r0-4+r11]
     sub    rsp, 0x88
     %define pix_tmp rsp
 
     ; transpose 8x16 -> tmp space
-    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
-    lea    rax, [rax+r10*8]
-    lea    r9,  [r9+r10*8]
-    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+    lea    r6, [r6+r10*8]
+    lea    r5, [r5+r10*8]
+    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
 
     lea    r0,  [pix_tmp+0x40]
     mov    r1,  0x10
     call   x264_deblock_v_luma_intra_%1
 
     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
-    lea    r9, [rax+r11]
-    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+    lea    r5, [r6+r11]
+    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
     shl    r10, 3
-    sub    rax, r10
-    sub    r9,  r10
+    sub    r6,  r10
+    sub    r5,  r10
     shr    r10, 3
-    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
     add    rsp, 0x88
-    ret
+    RET
 %else
 cglobal x264_deblock_h_luma_intra_%1, 2,4
     lea    r3,  [r1*3]
@@ -725,7 +737,7 @@
     ADD    esp, 16
 
     mov    r1,  r1m
-    mov    r0,  r0m
+    mov    r0,  r0mp
     lea    r3,  [r1*3]
     sub    r0,  4
     lea    r2,  [r0+r3]
--- a/x86/h264_idct_sse2.asm	Mon Aug 03 23:22:46 2009 +0000
+++ b/x86/h264_idct_sse2.asm	Tue Aug 04 07:42:55 2009 +0000
@@ -31,15 +31,8 @@
 
 SECTION .text
 
-%macro IDCT4_1D 6
-    SUMSUB_BA   m%3, m%1
-    SUMSUBD2_AB m%2, m%4, m%6, m%5
-    SUMSUB_BADC m%2, m%3, m%5, m%1
-    SWAP %1, %2, %5, %4, %3
-%endmacro
-
 INIT_XMM
-cglobal x264_add8x4_idct_sse2, 3,3
+cglobal x264_add8x4_idct_sse2, 3,3,8
     movq   m0, [r1+ 0]
     movq   m1, [r1+ 8]
     movq   m2, [r1+16]
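
The local IDCT4_1D macro deleted here is superseded by the generalized 5/6-operand version added to x86util.asm further down. An illustrative invocation of the register form (register numbers chosen for the example, not taken from this patch) would be

    IDCT4_1D 0, 1, 2, 3, 4, 5   ; transform the rows in m0-m3, with m4/m5 as temporaries
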
--- a/x86/x86inc.asm	Mon Aug 03 23:22:46 2009 +0000
+++ b/x86/x86inc.asm	Tue Aug 04 07:42:55 2009 +0000
@@ -20,6 +20,14 @@
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
+%ifdef ARCH_X86_64
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64
+    %else
+        %define UNIX64
+    %endif
+%endif
+
 ; FIXME: All of the 64bit asm functions that take a stride as an argument
 ; via register, assume that the high dword of that register is filled with 0.
 ; This is true in practice (since we never do any 64bit arithmetic on strides,
@@ -28,68 +36,39 @@
 ; Name of the .rodata section.
 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
 ; so use a different read-only section.
-%macro SECTION_RODATA 0
+%macro SECTION_RODATA 0-1 16
     %ifidn __OUTPUT_FORMAT__,macho64
-        SECTION .text align=16
+        SECTION .text align=%1
     %elifidn __OUTPUT_FORMAT__,macho
-        SECTION .text align=16
+        SECTION .text align=%1
         fakegot:
     %else
-        SECTION .rodata align=16
+        SECTION .rodata align=%1
     %endif
 %endmacro
 
-; PIC support macros. All these macros are totally harmless when PIC is
-; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
-; objects cannot directly access global variables by address, they need to
-; go through the GOT (global offset table). Most OSes do not care about it
-; and let you load non-shared .so objects (Linux, Win32...). However, OS X
-; requires PIC code in its .dylib objects.
-;
-; - GLOBAL should be used as a suffix for global addressing, eg.
-;     picgetgot ebx
+; PIC support macros.
+; x86_64 can't fit 64bit address literals in most instruction types,
+; so shared objects (under the assumption that they might be anywhere
+; in memory) must use an address mode that does fit.
+; So all accesses to global variables must use this macro, e.g.
 ;     mov eax, [foo GLOBAL]
 ;   instead of
 ;     mov eax, [foo]
 ;
-; - picgetgot computes the GOT address into the given register in PIC
-;   mode, otherwise does nothing. You need to do this before using GLOBAL.
-;   Before in both execution order and compiled code order (so GLOBAL knows
-;   which register the GOT is in).
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
 
-%ifndef PIC
-    %define GLOBAL
-    %macro picgetgot 1
-    %endmacro
-%elifdef ARCH_X86_64
-    %define PIC64
+%ifdef WIN64
+    %define PIC
+%elifndef ARCH_X86_64
+    %undef PIC
+%endif
+%ifdef PIC
     %define GLOBAL wrt rip
-    %macro picgetgot 1
-    %endmacro
 %else
-    %define PIC32
-    %ifidn __OUTPUT_FORMAT__,macho
-        ; There is no real global offset table on OS X, but we still
-        ; need to reference our variables by offset.
-        %macro picgetgot 1
-            call %%getgot
-          %%getgot:
-            pop %1
-            add %1, $$ - %%getgot
-            %undef GLOBAL
-            %define GLOBAL + %1 - fakegot
-        %endmacro
-    %else ; elf
-        extern _GLOBAL_OFFSET_TABLE_
-        %macro picgetgot 1
-            call %%getgot
-          %%getgot:
-            pop %1
-            add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
-            %undef GLOBAL
-            %define GLOBAL + %1 wrt ..gotoff
-        %endmacro
-    %endif
+    %define GLOBAL
 %endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
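
As a quick example of the simplified PIC scheme: whenever PIC ends up defined (always on Win64, and on other x86_64 targets when the build requests position-independent code), GLOBAL expands to 'wrt rip', so an access like the one in fft_mmx.asm

    lea r2, [dispatch_tab%3%2 GLOBAL]   ; RIP-relative under PIC

assembles as a RIP-relative lea; in non-PIC builds and on x86_32, GLOBAL expands to nothing and a plain absolute address is used.
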
@@ -99,14 +78,14 @@
 
 ; PROLOGUE:
 ; %1 = number of arguments. loads them from stack if needed.
-; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
-; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
 ; %4 = list of names to define to registers
 ; PROLOGUE can also be invoked by adding the same options to cglobal
 
 ; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
+; cglobal foo, 2,3, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
 
 ; TODO Some functions can use some args directly from the stack. If they're the
 ; last args then you can just not declare them, but if they're in the middle
@@ -119,12 +98,25 @@
 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
 ; which are slow when a normal ret follows a branch.
 
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
 %macro DECLARE_REG 6
     %define r%1q %2
     %define r%1d %3
     %define r%1w %4
     %define r%1b %5
     %define r%1m %6
+    %ifid %6 ; i.e. it's a register
+        %define r%1mp %2
+    %elifdef ARCH_X86_64 ; memory
+        %define r%1mp qword %6
+    %else
+        %define r%1mp dword %6
+    %endif
     %define r%1  %2
 %endmacro
 
@@ -150,6 +142,29 @@
 DECLARE_REG_SIZE di, dil
 DECLARE_REG_SIZE bp, bpl
 
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
+
 %ifdef ARCH_X86_64
     %define gprsize 8
 %else
@@ -224,8 +239,7 @@
     %assign n_arg_names %%i
 %endmacro
 
-%ifdef ARCH_X86_64 ;==========================================================
-%ifidn __OUTPUT_FORMAT__,win32
+%ifdef WIN64 ; Windows x64 ;=================================================
 
 DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
 DECLARE_REG 1, rdx, edx, dx,  dl,  edx
@@ -239,11 +253,75 @@
 
 %macro LOAD_IF_USED 2 ; reg_id, number_of_args
     %if %1 < %2
-        mov r%1, [rsp + 8 + %1*8]
+        mov r%1, [rsp + stack_offset + 8 + %1*8]
     %endif
 %endmacro
 
-%else ;=======================================================================
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+    ASSERT %2 >= %1
+    %assign regs_used %2
+    ASSERT regs_used <= 7
+    %if %0 > 2
+        %assign xmm_regs_used %3
+    %else
+        %assign xmm_regs_used 0
+    %endif
+    ASSERT xmm_regs_used <= 16
+    %if regs_used > 4
+        push r4
+        push r5
+        %assign stack_offset stack_offset+16
+    %endif
+    %if xmm_regs_used > 6
+        sub rsp, (xmm_regs_used-6)*16+16
+        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
+        %endrep
+    %endif
+    LOAD_IF_USED 4, %1
+    LOAD_IF_USED 5, %1
+    LOAD_IF_USED 6, %1
+    DEFINE_ARGS %4
+%endmacro
+
+%macro RESTORE_XMM_INTERNAL 1
+    %if xmm_regs_used > 6
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
+        %endrep
+        add %1, (xmm_regs_used-6)*16+16
+    %endif
+%endmacro
+
+%macro RESTORE_XMM 1
+    RESTORE_XMM_INTERNAL %1
+    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+    %assign xmm_regs_used 0
+%endmacro
+
+%macro RET 0
+    RESTORE_XMM_INTERNAL rsp
+    %if regs_used > 4
+        pop r5
+        pop r4
+    %endif
+    ret
+%endmacro
+
+%macro REP_RET 0
+    %if regs_used > 4 || xmm_regs_used > 6
+        RET
+    %else
+        rep ret
+    %endif
+%endmacro
+
+%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
 
 DECLARE_REG 0, rdi, edi, di,  dil, edi
 DECLARE_REG 1, rsi, esi, si,  sil, esi
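
To illustrate the new Win64 PROLOGUE, here is a hand-expanded sketch of a declaration using six GPRs and eight xmm registers (the function and argument names are placeholders; the offsets follow from the macros above):

    cglobal foo, 5,6,8, dst, src, stride, w, h
    ; roughly expands to:
    ;   push   r4                       ; regs_used > 4: save the two extra GPRs,
    ;   push   r5                       ;   which map to callee-saved regs here
    ;   sub    rsp, 48                  ; (8-6)*16+16 bytes for xmm6/xmm7
    ;   movdqa [rsp+24], xmm7
    ;   movdqa [rsp+8],  xmm6
    ;   mov    r4, [rsp+64+8+4*8]       ; the 5th argument arrives on the stack
    ; ...and RET later restores xmm6/xmm7, pops r5/r4, and returns.
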
@@ -261,16 +339,9 @@
     %endif
 %endmacro
 
-%endif ; !WIN64
-
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
     ASSERT %2 >= %1
     ASSERT %2 <= 7
-    %assign stack_offset 0
-%ifidn __OUTPUT_FORMAT__,win32
-    LOAD_IF_USED 4, %1
-    LOAD_IF_USED 5, %1
-%endif
     LOAD_IF_USED 6, %1
     DEFINE_ARGS %4
 %endmacro
@@ -315,15 +386,9 @@
     %endif
 %endmacro
 
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
     ASSERT %2 >= %1
-    %assign stack_offset 0
     %assign regs_used %2
-    %ifdef PIC
-    %if %3
-        %assign regs_used regs_used+1
-    %endif
-    %endif
     ASSERT regs_used <= 7
     PUSH_IF_USED 3
     PUSH_IF_USED 4
@@ -336,9 +401,6 @@
     LOAD_IF_USED 4, %1
     LOAD_IF_USED 5, %1
     LOAD_IF_USED 6, %1
-    %if %3
-        picgetgot r%2
-    %endif
     DEFINE_ARGS %4
 %endmacro
 
@@ -382,6 +444,7 @@
     align function_align
     %1:
     RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+    %assign stack_offset 0
     %if %0 > 1
         PROLOGUE %2
     %endif
@@ -389,11 +452,9 @@
 
 %macro cextern 1
     %ifdef PREFIX
-        extern _%1
-        %define %1 _%1
-    %else
-        extern %1
+        %xdefine %1 _%1
     %endif
+    extern %1
 %endmacro
 
 ; This is needed for ELF, otherwise the GNU linker assumes the stack is
@@ -523,6 +584,7 @@
     %assign %%i 0
     %rep num_mmregs
     CAT_XDEFINE m, %%i, %1_m %+ %%i
+    CAT_XDEFINE n, m %+ %%i, %%i
     %assign %%i %%i+1
     %endrep
 %endmacro
@@ -534,7 +596,30 @@
     %endif
 %endmacro
 
-; substitutions which are functionally identical but reduce code size
+;Substitutions that reduce instruction size but are functionally equivalent
 %define movdqa movaps
 %define movdqu movups
 
+%macro add 2
+    %ifnum %2
+        %if %2==128
+            sub %1, -128
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+%macro sub 2
+    %ifnum %2
+        %if %2==128
+            add %1, -128
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro
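
The add/sub overrides at the end rely on x86 immediates in the range -128..127 fitting in a single signed byte: 'add reg, 128' needs a 4-byte immediate, while the equivalent 'sub reg, -128' needs only one, saving 3 bytes per instruction. So, for example,

    add r0, 128                 ; assembled as: sub r0, -128

while any other immediate, or a non-numeric second operand, falls through to the ordinary instruction.
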
--- a/x86/x86util.asm	Mon Aug 03 23:22:46 2009 +0000
+++ b/x86/x86util.asm	Tue Aug 04 07:42:55 2009 +0000
@@ -93,7 +93,7 @@
     SBUTTERFLY qdq, %4, %8, %2
     SWAP %2, %5
     SWAP %4, %7
-%if 0<11
+%if %0<11
     movdqa m%5, %10
 %endif
 %endif
@@ -165,28 +165,203 @@
     palignr %1, %2, %3
 %endmacro
 
-%macro SUMSUB_BA 2
+%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
+%ifnum %5
+    mova   m%1, m%5
+    mova   m%3, m%5
+%else
+    mova   m%1, %5
+    mova   m%3, m%1
+%endif
+    pand   m%1, m%2 ; dst .. y6 .. y4
+    pand   m%3, m%4 ; src .. y6 .. y4
+    psrlw  m%2, 8   ; dst .. y7 .. y5
+    psrlw  m%4, 8   ; src .. y7 .. y5
+%endmacro
+
+%macro SUMSUB_BA 2-3
+%if %0==2
     paddw   %1, %2
     paddw   %2, %2
     psubw   %2, %1
+%else
+    mova    %3, %1
+    paddw   %1, %2
+    psubw   %2, %3
+%endif
 %endmacro
 
-%macro SUMSUB_BADC 4
+%macro SUMSUB_BADC 4-5
+%if %0==5
+    SUMSUB_BA %1, %2, %5
+    SUMSUB_BA %3, %4, %5
+%else
     paddw   %1, %2
     paddw   %3, %4
     paddw   %2, %2
     paddw   %4, %4
     psubw   %2, %1
     psubw   %4, %3
+%endif
 %endmacro
 
-%macro HADAMARD8_1D 8
+%macro HADAMARD4_V 4+
+    SUMSUB_BADC %1, %2, %3, %4
+    SUMSUB_BADC %1, %3, %2, %4
+%endmacro
+
+%macro HADAMARD8_V 8+
+    SUMSUB_BADC %1, %2, %3, %4
+    SUMSUB_BADC %5, %6, %7, %8
+    SUMSUB_BADC %1, %3, %2, %4
+    SUMSUB_BADC %5, %7, %6, %8
     SUMSUB_BADC %1, %5, %2, %6
     SUMSUB_BADC %3, %7, %4, %8
-    SUMSUB_BADC %1, %3, %2, %4
-    SUMSUB_BADC %5, %7, %6, %8
-    SUMSUB_BADC %1, %2, %3, %4
-    SUMSUB_BADC %5, %6, %7, %8
+%endmacro
+
+%macro TRANS_SSE2 5-6
+; TRANSPOSE2x2
+; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
+; %2: ord/unord (for compat with sse4, unused)
+; %3/%4: source regs
+; %5/%6: tmp regs
+%ifidn %1, d
+%define mask [mask_10 GLOBAL]
+%define shift 16
+%elifidn %1, q
+%define mask [mask_1100 GLOBAL]
+%define shift 32
+%endif
+%if %0==6 ; less dependency if we have two tmp
+    mova   m%5, mask   ; ff00
+    mova   m%6, m%4    ; x5x4
+    psll%1 m%4, shift  ; x4..
+    pand   m%6, m%5    ; x5..
+    pandn  m%5, m%3    ; ..x0
+    psrl%1 m%3, shift  ; ..x1
+    por    m%4, m%5    ; x4x0
+    por    m%3, m%6    ; x5x1
+%else ; more dependency, one insn less. sometimes faster, sometimes not
+    mova   m%5, m%4    ; x5x4
+    psll%1 m%4, shift  ; x4..
+    pxor   m%4, m%3    ; (x4^x1)x0
+    pand   m%4, mask   ; (x4^x1)..
+    pxor   m%3, m%4    ; x4x0
+    psrl%1 m%4, shift  ; ..(x1^x4)
+    pxor   m%5, m%4    ; x5x1
+    SWAP   %4, %3, %5
+%endif
+%endmacro
+
+%macro TRANS_SSE4 5-6 ; see above
+%ifidn %1, d
+    mova   m%5, m%3
+%ifidn %2, ord
+    psrl%1 m%3, 16
+%endif
+    pblendw m%3, m%4, 10101010b
+    psll%1 m%4, 16
+%ifidn %2, ord
+    pblendw m%4, m%5, 01010101b
+%else
+    psrl%1 m%5, 16
+    por    m%4, m%5
+%endif
+%elifidn %1, q
+    mova   m%5, m%3
+    shufps m%3, m%4, 10001000b
+    shufps m%5, m%4, 11011101b
+    SWAP   %4, %5
+%endif
+%endmacro
+
+%macro HADAMARD 5-6
+; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
+; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
+; %3/%4: regs
+; %5(%6): tmpregs
+%if %1!=0 ; have to reorder stuff for horizontal op
+    %ifidn %2, sumsub
+         %define ORDER ord
+         ; sumsub needs order because a-b != b-a unless a=b
+    %else
+         %define ORDER unord
+         ; if we just max, order doesn't matter (allows pblendw+or in sse4)
+    %endif
+    %if %1==1
+         TRANS d, ORDER, %3, %4, %5, %6
+    %elif %1==2
+         %if mmsize==8
+             SBUTTERFLY dq, %3, %4, %5
+         %else
+             TRANS q, ORDER, %3, %4, %5, %6
+         %endif
+    %elif %1==4
+         SBUTTERFLY qdq, %3, %4, %5
+    %endif
+%endif
+%ifidn %2, sumsub
+    SUMSUB_BA m%3, m%4, m%5
+%else
+    %ifidn %2, amax
+        %if %0==6
+            ABS2 m%3, m%4, m%5, m%6
+        %else
+            ABS1 m%3, m%5
+            ABS1 m%4, m%5
+        %endif
+    %endif
+    pmaxsw m%3, m%4
+%endif
+%endmacro
+
+
+%macro HADAMARD2_2D 6-7 sumsub
+    HADAMARD 0, sumsub, %1, %2, %5
+    HADAMARD 0, sumsub, %3, %4, %5
+    SBUTTERFLY %6, %1, %2, %5
+%ifnum %7
+    HADAMARD 0, amax, %1, %2, %5, %7
+%else
+    HADAMARD 0, %7, %1, %2, %5
+%endif
+    SBUTTERFLY %6, %3, %4, %5
+%ifnum %7
+    HADAMARD 0, amax, %3, %4, %5, %7
+%else
+    HADAMARD 0, %7, %3, %4, %5
+%endif
+%endmacro
+
+%macro HADAMARD4_2D 5-6 sumsub
+    HADAMARD2_2D %1, %2, %3, %4, %5, wd
+    HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
+    SWAP %2, %3
+%endmacro
+
+%macro HADAMARD4_2D_SSE 5-6 sumsub
+    HADAMARD  0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
+    HADAMARD  0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
+    SBUTTERFLY   wd, %1, %2, %5     ; %1: m0 1+0 %2: m1 1+0
+    SBUTTERFLY   wd, %3, %4, %5     ; %3: m0 3+2 %4: m1 3+2
+    HADAMARD2_2D %1, %3, %2, %4, %5, dq
+    SBUTTERFLY  qdq, %1, %2, %5
+    HADAMARD  0, %6, %1, %2, %5     ; 2nd H m1/m0 row 0+1
+    SBUTTERFLY  qdq, %3, %4, %5
+    HADAMARD  0, %6, %3, %4, %5     ; 2nd H m1/m0 row 2+3
+%endmacro
+
+%macro HADAMARD8_2D 9-10 sumsub
+    HADAMARD2_2D %1, %2, %3, %4, %9, wd
+    HADAMARD2_2D %5, %6, %7, %8, %9, wd
+    HADAMARD2_2D %1, %3, %2, %4, %9, dq
+    HADAMARD2_2D %5, %7, %6, %8, %9, dq
+    HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
+    HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
+%ifnidn %10, amax
+    SWAP %2, %5
+    SWAP %4, %7
+%endif
 %endmacro
 
 %macro SUMSUB2_AB 3
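
A worked example for the reworked butterfly helpers: with a in the first operand and b in the second, SUMSUB_BA leaves the first = a+b and the second = b-a. The two-operand form manages without a temporary (paddw %1,%2 / paddw %2,%2 / psubw %2,%1 gives 2b-(a+b) = b-a), while the new optional third operand supplies an explicit scratch register:

    SUMSUB_BA  m0, m1, m2       ; m0 = a+b, m1 = b-a, m2 used as scratch

SUMSUB_BADC is two such butterflies, and HADAMARD4_V/HADAMARD8_V chain them into vertical 4- and 8-point Hadamard passes.
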
@@ -197,13 +372,49 @@
     psubw   %3, %2
 %endmacro
 
+%macro SUMSUB2_BA 3
+    mova    m%3, m%1
+    paddw   m%1, m%2
+    paddw   m%1, m%2
+    psubw   m%2, m%3
+    psubw   m%2, m%3
+%endmacro
+
 %macro SUMSUBD2_AB 4
     mova    %4, %1
     mova    %3, %2
     psraw   %2, 1
-    psraw   %4, 1
-    paddw   %1, %2
-    psubw   %4, %3
+    psraw   %1, 1
+    paddw   %2, %4
+    psubw   %1, %3
+%endmacro
+
+%macro DCT4_1D 5
+%ifnum %5
+    SUMSUB_BADC m%4, m%1, m%3, m%2; m%5
+    SUMSUB_BA   m%3, m%4, m%5
+    SUMSUB2_AB  m%1, m%2, m%5
+    SWAP %1, %3, %4, %5, %2
+%else
+    SUMSUB_BADC m%4, m%1, m%3, m%2
+    SUMSUB_BA   m%3, m%4
+    mova       [%5], m%2
+    SUMSUB2_AB m%1, [%5], m%2
+    SWAP %1, %3, %4, %2
+%endif
+%endmacro
+
+%macro IDCT4_1D 5-6
+%ifnum %5
+    SUMSUBD2_AB m%2, m%4, m%6, m%5
+    SUMSUB_BA   m%3, m%1, m%6
+    SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
+%else
+    SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
+    SUMSUB_BA   m%3, m%1
+    SUMSUB_BADC m%4, m%3, m%2, m%1
+%endif
+    SWAP %1, %4, %3
 %endmacro
 
 %macro LOAD_DIFF 5
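
For reference, here is what the arithmetic helpers above compute, writing a and b for the contents of the first two operands on entry (derived directly from the macro bodies):

    SUMSUB2_BA:    first <- a + 2*b       second <- b - 2*a
    SUMSUBD2_AB:   first <- (a>>1) - b    second <- a + (b>>1)

These are the x2 and >>1 butterflies from which DCT4_1D and IDCT4_1D build the 4-point H.264 transform; in both macros the register form takes explicit temporaries, while the memory form (a non-numeric %5) spills through [%5] (and [%5+16] in IDCT4_1D) instead.
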
@@ -222,17 +433,81 @@
 %endif
 %endmacro
 
-%macro LOAD_DIFF_8x4P 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer
-    LOAD_DIFF %1, %5, none, [%7],      [%8]
-    LOAD_DIFF %2, %6, none, [%7+r1],   [%8+r3]
-    LOAD_DIFF %3, %5, none, [%7+2*r1], [%8+2*r3]
-    LOAD_DIFF %4, %6, none, [%7+r4],   [%8+r5]
+%macro LOAD_DIFF8x4_SSE2 8
+    LOAD_DIFF  m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE]
+    LOAD_DIFF  m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE]
+    LOAD_DIFF  m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE]
+    LOAD_DIFF  m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE]
+%endmacro
+
+%macro LOAD_DIFF8x4_SSSE3 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
+    movh       m%2, [%8+%1*FDEC_STRIDE]
+    movh       m%1, [%7+%1*FENC_STRIDE]
+    punpcklbw  m%1, m%2
+    movh       m%3, [%8+%2*FDEC_STRIDE]
+    movh       m%2, [%7+%2*FENC_STRIDE]
+    punpcklbw  m%2, m%3
+    movh       m%4, [%8+%3*FDEC_STRIDE]
+    movh       m%3, [%7+%3*FENC_STRIDE]
+    punpcklbw  m%3, m%4
+    movh       m%5, [%8+%4*FDEC_STRIDE]
+    movh       m%4, [%7+%4*FENC_STRIDE]
+    punpcklbw  m%4, m%5
+    pmaddubsw  m%1, m%6
+    pmaddubsw  m%2, m%6
+    pmaddubsw  m%3, m%6
+    pmaddubsw  m%4, m%6
+%endmacro
+
+%macro STORE_DCT 6
+    movq   [%5+%6+ 0], m%1
+    movq   [%5+%6+ 8], m%2
+    movq   [%5+%6+16], m%3
+    movq   [%5+%6+24], m%4
+    movhps [%5+%6+32], m%1
+    movhps [%5+%6+40], m%2
+    movhps [%5+%6+48], m%3
+    movhps [%5+%6+56], m%4
+%endmacro
+
+%macro STORE_IDCT 4
+    movhps [r0-4*FDEC_STRIDE], %1
+    movh   [r0-3*FDEC_STRIDE], %1
+    movhps [r0-2*FDEC_STRIDE], %2
+    movh   [r0-1*FDEC_STRIDE], %2
+    movhps [r0+0*FDEC_STRIDE], %3
+    movh   [r0+1*FDEC_STRIDE], %3
+    movhps [r0+2*FDEC_STRIDE], %4
+    movh   [r0+3*FDEC_STRIDE], %4
+%endmacro
+
+%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
+    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9]
+    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3]
+    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
+    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5]
+%if %10
+    lea %8, [%8+4*r1]
+    lea %9, [%9+4*r3]
+%endif
+%endmacro
+
+%macro DIFFx2 6-7
+    movh       %3, %5
+    punpcklbw  %3, %4
+    psraw      %1, 6
+    paddsw     %1, %3
+    movh       %3, %6
+    punpcklbw  %3, %4
+    psraw      %2, 6
+    paddsw     %2, %3
+    packuswb   %2, %1
 %endmacro
 
 %macro STORE_DIFF 4
-    psraw      %1, 6
     movh       %2, %4
     punpcklbw  %2, %3
+    psraw      %1, 6
     paddsw     %1, %2
     packuswb   %1, %1
     movh       %4, %1
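
A note on the SSSE3 loader above: each row interleaves the source (FENC) and reconstructed (FDEC) pixels with punpcklbw and then applies pmaddubsw against the multiplier register m%6, which is expected to hold a repeating +1,-1 byte pattern, so every word lane becomes enc - dec in one step, replacing the unpack-and-psubw sequence of the SSE2 variant. A hypothetical constant for that multiplier (the name is made up here; the real constant and its setup live in the callers, outside this patch) would look like

    hsub_mul: times 8 db 1, -1  ; 16 bytes of +1,-1: pmaddubsw then yields enc-dec per word

loaded once into the mul register before the first LOAD_DIFF8x4_SSSE3.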