diff x86/x86inc.asm @ 10019:c08ca946c80a libavcodec

Update x264 asm code to latest to add support for 64-bit Windows. Use the new x86inc features to support 64-bit Windows on all non-x264 nasm assembly code as well. Patch by John Adcock, dscaler.johnad AT googlemail DOT com. Win64 changes originally by Anton Mitrofanov. x86util changes mostly by Holger Lubitz.
author darkshikari
date Tue, 04 Aug 2009 07:42:55 +0000
parents 7768bdfd4f7b
children 12c8175d6db5
line wrap: on
line diff
--- a/x86/x86inc.asm	Mon Aug 03 23:22:46 2009 +0000
+++ b/x86/x86inc.asm	Tue Aug 04 07:42:55 2009 +0000
@@ -20,6 +20,14 @@
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
+%ifdef ARCH_X86_64
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64
+    %else
+        %define UNIX64
+    %endif
+%endif
+
 ; FIXME: All of the 64bit asm functions that take a stride as an argument
 ; via register, assume that the high dword of that register is filled with 0.
 ; This is true in practice (since we never do any 64bit arithmetic on strides,
@@ -28,68 +36,39 @@
 ; Name of the .rodata section.
 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
 ; so use a different read-only section.
-%macro SECTION_RODATA 0
+%macro SECTION_RODATA 0-1 16
     %ifidn __OUTPUT_FORMAT__,macho64
-        SECTION .text align=16
+        SECTION .text align=%1
     %elifidn __OUTPUT_FORMAT__,macho
-        SECTION .text align=16
+        SECTION .text align=%1
         fakegot:
     %else
-        SECTION .rodata align=16
+        SECTION .rodata align=%1
     %endif
 %endmacro
 
-; PIC support macros. All these macros are totally harmless when PIC is
-; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
-; objects cannot directly access global variables by address, they need to
-; go through the GOT (global offset table). Most OSes do not care about it
-; and let you load non-shared .so objects (Linux, Win32...). However, OS X
-; requires PIC code in its .dylib objects.
-;
-; - GLOBAL should be used as a suffix for global addressing, eg.
-;     picgetgot ebx
+; PIC support macros.
+; x86_64 can't fit 64bit address literals in most instruction types,
+; so shared objects (under the assumption that they might be anywhere
+; in memory) must use an address mode that does fit.
+; So all accesses to global variables must use this macro, e.g.
 ;     mov eax, [foo GLOBAL]
 ;   instead of
 ;     mov eax, [foo]
 ;
-; - picgetgot computes the GOT address into the given register in PIC
-;   mode, otherwise does nothing. You need to do this before using GLOBAL.
-;   Before in both execution order and compiled code order (so GLOBAL knows
-;   which register the GOT is in).
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
 
-%ifndef PIC
-    %define GLOBAL
-    %macro picgetgot 1
-    %endmacro
-%elifdef ARCH_X86_64
-    %define PIC64
+%ifdef WIN64
+    %define PIC
+%elifndef ARCH_X86_64
+    %undef PIC
+%endif
+%ifdef PIC
     %define GLOBAL wrt rip
-    %macro picgetgot 1
-    %endmacro
 %else
-    %define PIC32
-    %ifidn __OUTPUT_FORMAT__,macho
-        ; There is no real global offset table on OS X, but we still
-        ; need to reference our variables by offset.
-        %macro picgetgot 1
-            call %%getgot
-          %%getgot:
-            pop %1
-            add %1, $$ - %%getgot
-            %undef GLOBAL
-            %define GLOBAL + %1 - fakegot
-        %endmacro
-    %else ; elf
-        extern _GLOBAL_OFFSET_TABLE_
-        %macro picgetgot 1
-            call %%getgot
-          %%getgot:
-            pop %1
-            add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
-            %undef GLOBAL
-            %define GLOBAL + %1 wrt ..gotoff
-        %endmacro
-    %endif
+    %define GLOBAL
 %endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
@@ -99,14 +78,14 @@
 
 ; PROLOGUE:
 ; %1 = number of arguments. loads them from stack if needed.
-; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
-; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
 ; %4 = list of names to define to registers
 ; PROLOGUE can also be invoked by adding the same options to cglobal
 
 ; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
+; cglobal foo, 2,3, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
 
 ; TODO Some functions can use some args directly from the stack. If they're the
 ; last args then you can just not declare them, but if they're in the middle
@@ -119,12 +98,25 @@
 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
 ; which are slow when a normal ret follows a branch.
 
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
 %macro DECLARE_REG 6
     %define r%1q %2
     %define r%1d %3
     %define r%1w %4
     %define r%1b %5
     %define r%1m %6
+    %ifid %6 ; i.e. it's a register
+        %define r%1mp %2
+    %elifdef ARCH_X86_64 ; memory
+        %define r%1mp qword %6
+    %else
+        %define r%1mp dword %6
+    %endif
     %define r%1  %2
 %endmacro
 
@@ -150,6 +142,29 @@
 DECLARE_REG_SIZE di, dil
 DECLARE_REG_SIZE bp, bpl
 
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
+
 %ifdef ARCH_X86_64
     %define gprsize 8
 %else
@@ -224,8 +239,7 @@
     %assign n_arg_names %%i
 %endmacro
 
-%ifdef ARCH_X86_64 ;==========================================================
-%ifidn __OUTPUT_FORMAT__,win32
+%ifdef WIN64 ; Windows x64 ;=================================================
 
 DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
 DECLARE_REG 1, rdx, edx, dx,  dl,  edx
@@ -239,11 +253,75 @@
 
 %macro LOAD_IF_USED 2 ; reg_id, number_of_args
     %if %1 < %2
-        mov r%1, [rsp + 8 + %1*8]
+        mov r%1, [rsp + stack_offset + 8 + %1*8]
     %endif
 %endmacro
 
-%else ;=======================================================================
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+    ASSERT %2 >= %1
+    %assign regs_used %2
+    ASSERT regs_used <= 7
+    %if %0 > 2
+        %assign xmm_regs_used %3
+    %else
+        %assign xmm_regs_used 0
+    %endif
+    ASSERT xmm_regs_used <= 16
+    %if regs_used > 4
+        push r4
+        push r5
+        %assign stack_offset stack_offset+16
+    %endif
+    %if xmm_regs_used > 6
+        sub rsp, (xmm_regs_used-6)*16+16
+        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
+        %endrep
+    %endif
+    LOAD_IF_USED 4, %1
+    LOAD_IF_USED 5, %1
+    LOAD_IF_USED 6, %1
+    DEFINE_ARGS %4
+%endmacro
+
+%macro RESTORE_XMM_INTERNAL 1
+    %if xmm_regs_used > 6
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
+        %endrep
+        add %1, (xmm_regs_used-6)*16+16
+    %endif
+%endmacro
+
+%macro RESTORE_XMM 1
+    RESTORE_XMM_INTERNAL %1
+    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+    %assign xmm_regs_used 0
+%endmacro
+
+%macro RET 0
+    RESTORE_XMM_INTERNAL rsp
+    %if regs_used > 4
+        pop r5
+        pop r4
+    %endif
+    ret
+%endmacro
+
+%macro REP_RET 0
+    %if regs_used > 4 || xmm_regs_used > 6
+        RET
+    %else
+        rep ret
+    %endif
+%endmacro
+
+%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
 
 DECLARE_REG 0, rdi, edi, di,  dil, edi
 DECLARE_REG 1, rsi, esi, si,  sil, esi
@@ -261,16 +339,9 @@
     %endif
 %endmacro
 
-%endif ; !WIN64
-
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
     ASSERT %2 >= %1
     ASSERT %2 <= 7
-    %assign stack_offset 0
-%ifidn __OUTPUT_FORMAT__,win32
-    LOAD_IF_USED 4, %1
-    LOAD_IF_USED 5, %1
-%endif
     LOAD_IF_USED 6, %1
     DEFINE_ARGS %4
 %endmacro
@@ -315,15 +386,9 @@
     %endif
 %endmacro
 
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
     ASSERT %2 >= %1
-    %assign stack_offset 0
     %assign regs_used %2
-    %ifdef PIC
-    %if %3
-        %assign regs_used regs_used+1
-    %endif
-    %endif
     ASSERT regs_used <= 7
     PUSH_IF_USED 3
     PUSH_IF_USED 4
@@ -336,9 +401,6 @@
     LOAD_IF_USED 4, %1
     LOAD_IF_USED 5, %1
     LOAD_IF_USED 6, %1
-    %if %3
-        picgetgot r%2
-    %endif
     DEFINE_ARGS %4
 %endmacro
 
@@ -382,6 +444,7 @@
     align function_align
     %1:
     RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+    %assign stack_offset 0
     %if %0 > 1
         PROLOGUE %2
     %endif
@@ -389,11 +452,9 @@
 
 %macro cextern 1
     %ifdef PREFIX
-        extern _%1
-        %define %1 _%1
-    %else
-        extern %1
+        %xdefine %1 _%1
     %endif
+    extern %1
 %endmacro
 
 ; This is needed for ELF, otherwise the GNU linker assumes the stack is
@@ -523,6 +584,7 @@
     %assign %%i 0
     %rep num_mmregs
     CAT_XDEFINE m, %%i, %1_m %+ %%i
+    CAT_XDEFINE n, m %+ %%i, %%i
     %assign %%i %%i+1
     %endrep
 %endmacro
@@ -534,7 +596,30 @@
     %endif
 %endmacro
 
-; substitutions which are functionally identical but reduce code size
+;Substitutions that reduce instruction size but are functionally equivalent
 %define movdqa movaps
 %define movdqu movups
 
+%macro add 2
+    %ifnum %2
+        %if %2==128
+            sub %1, -128
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+%macro sub 2
+    %ifnum %2
+        %if %2==128
+            add %1, -128
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro