comparison x86/x86inc.asm @ 10019:c08ca946c80a libavcodec

Update x264 asm code to latest to add support for 64-bit Windows. Use the new x86inc features to support 64-bit Windows on all non-x264 nasm assembly code as well. Patch by John Adcock, dscaler.johnad AT googlemail DOT com. Win64 changes originally by Anton Mitrofanov. x86util changes mostly by Holger Lubitz.
author darkshikari
date Tue, 04 Aug 2009 07:42:55 +0000
parents 7768bdfd4f7b
children 12c8175d6db5
comparison
equal deleted inserted replaced
10018:46f8d58fbdfb 10019:c08ca946c80a
18 ;* You should have received a copy of the GNU Lesser General Public 18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software 19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;***************************************************************************** 21 ;*****************************************************************************
22 22
23 %ifdef ARCH_X86_64 ; choose the 64-bit ABI flavour; nothing is defined here when ARCH_X86_64 is unset
24 %ifidn __OUTPUT_FORMAT__,win32
25 %define WIN64 ; win32 output format combined with ARCH_X86_64 selects the Windows x64 code paths
26 %else
27 %define UNIX64 ; every other output format selects the *nix x64 code paths
28 %endif
29 %endif
30
23 ; FIXME: All of the 64bit asm functions that take a stride as an argument 31 ; FIXME: All of the 64bit asm functions that take a stride as an argument
24 ; via register, assume that the high dword of that register is filled with 0. 32 ; via register, assume that the high dword of that register is filled with 0.
25 ; This is true in practice (since we never do any 64bit arithmetic on strides, 33 ; This is true in practice (since we never do any 64bit arithmetic on strides,
26 ; and x264's strides are all positive), but is not guaranteed by the ABI. 34 ; and x264's strides are all positive), but is not guaranteed by the ABI.
27 35
28 ; Name of the .rodata section. 36 ; Name of the .rodata section.
29 ; Kludge: Something on OS X fails to align .rodata even given an align attribute, 37 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
30 ; so use a different read-only section. 38 ; so use a different read-only section.
31 %macro SECTION_RODATA 0 39 %macro SECTION_RODATA 0-1 16
32 %ifidn __OUTPUT_FORMAT__,macho64 40 %ifidn __OUTPUT_FORMAT__,macho64
33 SECTION .text align=16 41 SECTION .text align=%1
34 %elifidn __OUTPUT_FORMAT__,macho 42 %elifidn __OUTPUT_FORMAT__,macho
35 SECTION .text align=16 43 SECTION .text align=%1
36 fakegot: 44 fakegot:
37 %else 45 %else
38 SECTION .rodata align=16 46 SECTION .rodata align=%1
39 %endif 47 %endif
40 %endmacro 48 %endmacro
41 49
42 ; PIC support macros. All these macros are totally harmless when PIC is 50 ; PIC support macros.
43 ; not defined but can ruin everything if misused in PIC mode. On x86_32, shared 51 ; x86_64 can't fit 64bit address literals in most instruction types,
44 ; objects cannot directly access global variables by address, they need to 52 ; so shared objects (under the assumption that they might be anywhere
45 ; go through the GOT (global offset table). Most OSes do not care about it 53 ; in memory) must use an address mode that does fit.
46 ; and let you load non-shared .so objects (Linux, Win32...). However, OS X 54 ; So all accesses to global variables must use this macro, e.g.
47 ; requires PIC code in its .dylib objects.
48 ;
49 ; - GLOBAL should be used as a suffix for global addressing, eg.
50 ; picgetgot ebx
51 ; mov eax, [foo GLOBAL] 55 ; mov eax, [foo GLOBAL]
52 ; instead of 56 ; instead of
53 ; mov eax, [foo] 57 ; mov eax, [foo]
54 ; 58 ;
55 ; - picgetgot computes the GOT address into the given register in PIC 59 ; x86_32 doesn't require PIC.
56 ; mode, otherwise does nothing. You need to do this before using GLOBAL. 60 ; Some distros prefer shared objects to be PIC, but nothing breaks if
57 ; Before in both execution order and compiled code order (so GLOBAL knows 61 ; the code contains a few textrels, so we'll skip that complexity.
58 ; which register the GOT is in). 62
59 63 %ifdef WIN64
60 %ifndef PIC 64 %define PIC
65 %elifndef ARCH_X86_64
66 %undef PIC
67 %endif
68 %ifdef PIC
69 %define GLOBAL wrt rip
70 %else
61 %define GLOBAL 71 %define GLOBAL
62 %macro picgetgot 1
63 %endmacro
64 %elifdef ARCH_X86_64
65 %define PIC64
66 %define GLOBAL wrt rip
67 %macro picgetgot 1
68 %endmacro
69 %else
70 %define PIC32
71 %ifidn __OUTPUT_FORMAT__,macho
72 ; There is no real global offset table on OS X, but we still
73 ; need to reference our variables by offset.
74 %macro picgetgot 1
75 call %%getgot
76 %%getgot:
77 pop %1
78 add %1, $$ - %%getgot
79 %undef GLOBAL
80 %define GLOBAL + %1 - fakegot
81 %endmacro
82 %else ; elf
83 extern _GLOBAL_OFFSET_TABLE_
84 %macro picgetgot 1
85 call %%getgot
86 %%getgot:
87 pop %1
88 add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
89 %undef GLOBAL
90 %define GLOBAL + %1 wrt ..gotoff
91 %endmacro
92 %endif
93 %endif 72 %endif
94 73
95 ; Macros to eliminate most code duplication between x86_32 and x86_64: 74 ; Macros to eliminate most code duplication between x86_32 and x86_64:
96 ; Currently this works only for leaf functions which load all their arguments 75 ; Currently this works only for leaf functions which load all their arguments
97 ; into registers at the start, and make no other use of the stack. Luckily that 76 ; into registers at the start, and make no other use of the stack. Luckily that
98 ; covers most of x264's asm. 77 ; covers most of x264's asm.
99 78
100 ; PROLOGUE: 79 ; PROLOGUE:
101 ; %1 = number of arguments. loads them from stack if needed. 80 ; %1 = number of arguments. loads them from stack if needed.
102 ; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed. 81 ; %2 = number of registers used. pushes callee-saved regs if needed.
103 ; %3 = whether global constants are used in this function. inits x86_32 PIC if needed. 82 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
104 ; %4 = list of names to define to registers 83 ; %4 = list of names to define to registers
105 ; PROLOGUE can also be invoked by adding the same options to cglobal 84 ; PROLOGUE can also be invoked by adding the same options to cglobal
106 85
107 ; e.g. 86 ; e.g.
108 ; cglobal foo, 2,3,0, dst, src, tmp 87 ; cglobal foo, 2,3,0, dst, src, tmp
109 ; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals 88 ; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
110 89
111 ; TODO Some functions can use some args directly from the stack. If they're the 90 ; TODO Some functions can use some args directly from the stack. If they're the
112 ; last args then you can just not declare them, but if they're in the middle 91 ; last args then you can just not declare them, but if they're in the middle
113 ; we need a more flexible macro. 92 ; we need a more flexible macro.
114 93
116 ; Pops anything that was pushed by PROLOGUE 95 ; Pops anything that was pushed by PROLOGUE
117 96
118 ; REP_RET: 97 ; REP_RET:
119 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons 98 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
120 ; which are slow when a normal ret follows a branch. 99 ; which are slow when a normal ret follows a branch.
100
101 ; registers:
102 ; rN and rNq are the native-size register holding function argument N
103 ; rNd, rNw, rNb are dword, word, and byte size
104 ; rNm is the original location of arg N (a register or on the stack), dword
105 ; rNmp is native size
121 106
122 %macro DECLARE_REG 6 107 %macro DECLARE_REG 6
123 %define r%1q %2 108 %define r%1q %2
124 %define r%1d %3 109 %define r%1d %3
125 %define r%1w %4 110 %define r%1w %4
126 %define r%1b %5 111 %define r%1b %5
127 %define r%1m %6 112 %define r%1m %6
113 %ifid %6 ; i.e. it's a register
114 %define r%1mp %2
115 %elifdef ARCH_X86_64 ; memory
116 %define r%1mp qword %6
117 %else
118 %define r%1mp dword %6
119 %endif
128 %define r%1 %2 120 %define r%1 %2
129 %endmacro 121 %endmacro
130 122
131 %macro DECLARE_REG_SIZE 2 123 %macro DECLARE_REG_SIZE 2
132 %define r%1q r%1 124 %define r%1q r%1
148 DECLARE_REG_SIZE dx, dl 140 DECLARE_REG_SIZE dx, dl
149 DECLARE_REG_SIZE si, sil 141 DECLARE_REG_SIZE si, sil
150 DECLARE_REG_SIZE di, dil 142 DECLARE_REG_SIZE di, dil
151 DECLARE_REG_SIZE bp, bpl 143 DECLARE_REG_SIZE bp, bpl
152 144
145 ; t# defines for when per-arch register allocation is more complex than just function arguments
146
147 %macro DECLARE_REG_TMP 1-* ; args: one register suffix per temporary; the k-th argument (from 0) is bound to t<k> as r<arg>
148 %assign %%i 0
149 %rep %0 ; one pass per macro argument
150 CAT_XDEFINE t, %%i, r%1
151 %assign %%i %%i+1
152 %rotate 1 ; shift the argument list so the next suffix becomes %1
153 %endrep
154 %endmacro
155
156 %macro DECLARE_REG_TMP_SIZE 0-* ; args: temporary indices N; defines tNq/tNd/tNw/tNb as tN's current expansion with the size suffix appended
157 %rep %0 ; one pass per index argument
158 %define t%1q t%1 %+ q
159 %define t%1d t%1 %+ d
160 %define t%1w t%1 %+ w
161 %define t%1b t%1 %+ b
162 %rotate 1 ; shift the argument list so the next index becomes %1
163 %endrep
164 %endmacro
165
166 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
167
153 %ifdef ARCH_X86_64 168 %ifdef ARCH_X86_64
154 %define gprsize 8 169 %define gprsize 8
155 %else 170 %else
156 %define gprsize 4 171 %define gprsize 4
157 %endif 172 %endif
222 %rotate 1 237 %rotate 1
223 %endrep 238 %endrep
224 %assign n_arg_names %%i 239 %assign n_arg_names %%i
225 %endmacro 240 %endmacro
226 241
227 %ifdef ARCH_X86_64 ;========================================================== 242 %ifdef WIN64 ; Windows x64 ;=================================================
228 %ifidn __OUTPUT_FORMAT__,win32
229 243
230 DECLARE_REG 0, rcx, ecx, cx, cl, ecx 244 DECLARE_REG 0, rcx, ecx, cx, cl, ecx
231 DECLARE_REG 1, rdx, edx, dx, dl, edx 245 DECLARE_REG 1, rdx, edx, dx, dl, edx
232 DECLARE_REG 2, r8, r8d, r8w, r8b, r8d 246 DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
233 DECLARE_REG 3, r9, r9d, r9w, r9b, r9d 247 DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
237 %define r7m [rsp + stack_offset + 64] 251 %define r7m [rsp + stack_offset + 64]
238 %define r8m [rsp + stack_offset + 72] 252 %define r8m [rsp + stack_offset + 72]
239 253
240 %macro LOAD_IF_USED 2 ; reg_id, number_of_args 254 %macro LOAD_IF_USED 2 ; reg_id, number_of_args
241 %if %1 < %2 255 %if %1 < %2
242 mov r%1, [rsp + 8 + %1*8] 256 mov r%1, [rsp + stack_offset + 8 + %1*8]
243 %endif 257 %endif
244 %endmacro 258 %endmacro
245 259
246 %else ;======================================================================= 260 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
261 ASSERT %2 >= %1
262 %assign regs_used %2
263 ASSERT regs_used <= 7
264 %if %0 > 2 ; the xmm register count is optional and defaults to 0
265 %assign xmm_regs_used %3
266 %else
267 %assign xmm_regs_used 0
268 %endif
269 ASSERT xmm_regs_used <= 16
270 %if regs_used > 4 ; r4 and r5 are always saved as a pair once more than four registers are used
271 push r4
272 push r5
273 %assign stack_offset stack_offset+16 ; account for the two 8-byte pushes
274 %endif
275 %if xmm_regs_used > 6 ; xmm6 and above are spilled to a freshly reserved stack area
276 sub rsp, (xmm_regs_used-6)*16+16
277 %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16 ; keep stack-argument references valid after moving rsp
278 %assign %%i xmm_regs_used
279 %rep (xmm_regs_used-6) ; stores xmm(used-1) down to xmm6
280 %assign %%i %%i-1
281 movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
282 %endrep
283 %endif
284 LOAD_IF_USED 4, %1 ; arguments 4..6 are fetched from the stack when the function declares that many
285 LOAD_IF_USED 5, %1
286 LOAD_IF_USED 6, %1
287 DEFINE_ARGS %4
288 %endmacro
289
290 %macro RESTORE_XMM_INTERNAL 1 ; %1 = register pointing at the xmm save area laid out by PROLOGUE; expands to nothing when xmm_regs_used <= 6
291 %if xmm_regs_used > 6
292 %assign %%i xmm_regs_used
293 %rep (xmm_regs_used-6) ; reloads xmm(used-1) down to xmm6
294 %assign %%i %%i-1
295 movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
296 %endrep
297 add %1, (xmm_regs_used-6)*16+16 ; releases the save area; stack_offset and xmm_regs_used are left unchanged here
298 %endif
299 %endmacro
300
301 %macro RESTORE_XMM 1 ; %1 = base register of the xmm save area; restores xmm6+ before the epilogue. NOTE(review): the stack_offset fix-up is unguarded, so only use this when xmm_regs_used > 6
302 RESTORE_XMM_INTERNAL %1
303 %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16) ; exact inverse of the amount PROLOGUE added for the save area
304 %assign xmm_regs_used 0 ; a later RET/REP_RET then skips the xmm restore
305 %endmacro
306
307 %macro RET 0 ; Win64 epilogue: undoes PROLOGUE in reverse order, then returns
308 RESTORE_XMM_INTERNAL rsp
309 %if regs_used > 4 ; same condition under which PROLOGUE pushed r4 and r5
310 pop r5
311 pop r4
312 %endif
313 ret
314 %endmacro
315
316 %macro REP_RET 0 ; Win64: a bare two-byte "rep ret" when PROLOGUE saved nothing, otherwise the full RET epilogue
317 %if regs_used <= 4 && xmm_regs_used <= 6
318 rep ret
319 %else
320 RET
321 %endif
322 %endmacro
323
324 %elifdef ARCH_X86_64 ; *nix x64 ;=============================================
247 325
248 DECLARE_REG 0, rdi, edi, di, dil, edi 326 DECLARE_REG 0, rdi, edi, di, dil, edi
249 DECLARE_REG 1, rsi, esi, si, sil, esi 327 DECLARE_REG 1, rsi, esi, si, sil, esi
250 DECLARE_REG 2, rdx, edx, dx, dl, edx 328 DECLARE_REG 2, rdx, edx, dx, dl, edx
251 DECLARE_REG 3, rcx, ecx, cx, cl, ecx 329 DECLARE_REG 3, rcx, ecx, cx, cl, ecx
259 %if %1 < %2 337 %if %1 < %2
260 mov r%1, [rsp - 40 + %1*8] 338 mov r%1, [rsp - 40 + %1*8]
261 %endif 339 %endif
262 %endmacro 340 %endmacro
263 341
264 %endif ; !WIN64 342 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
265
266 %macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
267 ASSERT %2 >= %1 343 ASSERT %2 >= %1
268 ASSERT %2 <= 7 344 ASSERT %2 <= 7
269 %assign stack_offset 0
270 %ifidn __OUTPUT_FORMAT__,win32
271 LOAD_IF_USED 4, %1
272 LOAD_IF_USED 5, %1
273 %endif
274 LOAD_IF_USED 6, %1 345 LOAD_IF_USED 6, %1
275 DEFINE_ARGS %4 346 DEFINE_ARGS %4
276 %endmacro 347 %endmacro
277 348
278 %macro RET 0 349 %macro RET 0
313 %if %1 < %2 384 %if %1 < %2
314 mov r%1, [esp + stack_offset + 4 + %1*4] 385 mov r%1, [esp + stack_offset + 4 + %1*4]
315 %endif 386 %endif
316 %endmacro 387 %endmacro
317 388
318 %macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names... 389 %macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
319 ASSERT %2 >= %1 390 ASSERT %2 >= %1
320 %assign stack_offset 0
321 %assign regs_used %2 391 %assign regs_used %2
322 %ifdef PIC
323 %if %3
324 %assign regs_used regs_used+1
325 %endif
326 %endif
327 ASSERT regs_used <= 7 392 ASSERT regs_used <= 7
328 PUSH_IF_USED 3 393 PUSH_IF_USED 3
329 PUSH_IF_USED 4 394 PUSH_IF_USED 4
330 PUSH_IF_USED 5 395 PUSH_IF_USED 5
331 PUSH_IF_USED 6 396 PUSH_IF_USED 6
334 LOAD_IF_USED 2, %1 399 LOAD_IF_USED 2, %1
335 LOAD_IF_USED 3, %1 400 LOAD_IF_USED 3, %1
336 LOAD_IF_USED 4, %1 401 LOAD_IF_USED 4, %1
337 LOAD_IF_USED 5, %1 402 LOAD_IF_USED 5, %1
338 LOAD_IF_USED 6, %1 403 LOAD_IF_USED 6, %1
339 %if %3
340 picgetgot r%2
341 %endif
342 DEFINE_ARGS %4 404 DEFINE_ARGS %4
343 %endmacro 405 %endmacro
344 406
345 %macro RET 0 407 %macro RET 0
346 POP_IF_USED 6 408 POP_IF_USED 6
380 global %1 442 global %1
381 %endif 443 %endif
382 align function_align 444 align function_align
383 %1: 445 %1:
384 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer 446 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
447 %assign stack_offset 0
385 %if %0 > 1 448 %if %0 > 1
386 PROLOGUE %2 449 PROLOGUE %2
387 %endif 450 %endif
388 %endmacro 451 %endmacro
389 452
390 %macro cextern 1 453 %macro cextern 1
391 %ifdef PREFIX 454 %ifdef PREFIX
392 extern _%1 455 %xdefine %1 _%1
393 %define %1 _%1 456 %endif
394 %else 457 extern %1
395 extern %1
396 %endif
397 %endmacro 458 %endmacro
398 459
399 ; This is needed for ELF, otherwise the GNU linker assumes the stack is 460 ; This is needed for ELF, otherwise the GNU linker assumes the stack is
400 ; executable by default. 461 ; executable by default.
401 %ifidn __OUTPUT_FORMAT__,elf 462 %ifidn __OUTPUT_FORMAT__,elf
521 582
522 %macro LOAD_MM_PERMUTATION 1 583 %macro LOAD_MM_PERMUTATION 1
523 %assign %%i 0 584 %assign %%i 0
524 %rep num_mmregs 585 %rep num_mmregs
525 CAT_XDEFINE m, %%i, %1_m %+ %%i 586 CAT_XDEFINE m, %%i, %1_m %+ %%i
587 CAT_XDEFINE n, m %+ %%i, %%i
526 %assign %%i %%i+1 588 %assign %%i %%i+1
527 %endrep 589 %endrep
528 %endmacro 590 %endmacro
529 591
530 %macro call 1 592 %macro call 1
532 %ifdef %1_m0 594 %ifdef %1_m0
533 LOAD_MM_PERMUTATION %1 595 LOAD_MM_PERMUTATION %1
534 %endif 596 %endif
535 %endmacro 597 %endmacro
536 598
537 ; substitutions which are functionally identical but reduce code size 599 ;Substitutions that reduce instruction size but are functionally equivalent
538 %define movdqa movaps 600 %define movdqa movaps
539 %define movdqu movups 601 %define movdqu movups
540 602
603 %macro add 2 ; wraps the add instruction: a literal 128 is rewritten so the immediate fits in one byte
604 %ifnum %2
605 %if %2==128
606 sub %1, -128 ; -128 fits a sign-extended 8-bit immediate, +128 does not
607 %else
608 add %1, %2
609 %endif
610 %else ; second operand is not a plain number: emit the instruction unchanged
611 add %1, %2
612 %endif
613 %endmacro
614
615 %macro sub 2 ; wraps the sub instruction: a literal 128 is rewritten so the immediate fits in one byte
616 %ifnum %2
617 %if %2==128
618 add %1, -128 ; -128 fits a sign-extended 8-bit immediate, +128 does not
619 %else
620 sub %1, %2
621 %endif
622 %else ; second operand is not a plain number: emit the instruction unchanged
623 sub %1, %2
624 %endif
625 %endmacro