Mercurial > libavcodec.hg
comparison x86/x86inc.asm @ 10019:c08ca946c80a libavcodec
Update x264 asm code to latest to add support for 64-bit Windows.
Use the new x86inc features to support 64-bit Windows on all non-x264 nasm
assembly code as well.
Patch by John Adcock, dscaler.johnad AT googlemail DOT com.
Win64 changes originally by Anton Mitrofanov.
x86util changes mostly by Holger Lubitz.
author | darkshikari |
---|---|
date | Tue, 04 Aug 2009 07:42:55 +0000 |
parents | 7768bdfd4f7b |
children | 12c8175d6db5 |
comparison
equal
deleted
inserted
replaced
10018:46f8d58fbdfb | 10019:c08ca946c80a |
---|---|
18 ;* You should have received a copy of the GNU Lesser General Public | 18 ;* You should have received a copy of the GNU Lesser General Public |
19 ;* License along with FFmpeg; if not, write to the Free Software | 19 ;* License along with FFmpeg; if not, write to the Free Software |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | 20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 ;***************************************************************************** | 21 ;***************************************************************************** |
22 | 22 |
23 %ifdef ARCH_X86_64 | |
24 %ifidn __OUTPUT_FORMAT__,win32 | |
25 %define WIN64 | |
26 %else | |
27 %define UNIX64 | |
28 %endif | |
29 %endif | |
30 | |
23 ; FIXME: All of the 64bit asm functions that take a stride as an argument | 31 ; FIXME: All of the 64bit asm functions that take a stride as an argument |
24 ; via register, assume that the high dword of that register is filled with 0. | 32 ; via register, assume that the high dword of that register is filled with 0. |
25 ; This is true in practice (since we never do any 64bit arithmetic on strides, | 33 ; This is true in practice (since we never do any 64bit arithmetic on strides, |
26 ; and x264's strides are all positive), but is not guaranteed by the ABI. | 34 ; and x264's strides are all positive), but is not guaranteed by the ABI. |
27 | 35 |
28 ; Name of the .rodata section. | 36 ; Name of the .rodata section. |
29 ; Kludge: Something on OS X fails to align .rodata even given an align attribute, | 37 ; Kludge: Something on OS X fails to align .rodata even given an align attribute, |
30 ; so use a different read-only section. | 38 ; so use a different read-only section. |
31 %macro SECTION_RODATA 0 | 39 %macro SECTION_RODATA 0-1 16 |
32 %ifidn __OUTPUT_FORMAT__,macho64 | 40 %ifidn __OUTPUT_FORMAT__,macho64 |
33 SECTION .text align=16 | 41 SECTION .text align=%1 |
34 %elifidn __OUTPUT_FORMAT__,macho | 42 %elifidn __OUTPUT_FORMAT__,macho |
35 SECTION .text align=16 | 43 SECTION .text align=%1 |
36 fakegot: | 44 fakegot: |
37 %else | 45 %else |
38 SECTION .rodata align=16 | 46 SECTION .rodata align=%1 |
39 %endif | 47 %endif |
40 %endmacro | 48 %endmacro |
41 | 49 |
42 ; PIC support macros. All these macros are totally harmless when PIC is | 50 ; PIC support macros. |
43 ; not defined but can ruin everything if misused in PIC mode. On x86_32, shared | 51 ; x86_64 can't fit 64bit address literals in most instruction types, |
44 ; objects cannot directly access global variables by address, they need to | 52 ; so shared objects (under the assumption that they might be anywhere |
45 ; go through the GOT (global offset table). Most OSes do not care about it | 53 ; in memory) must use an address mode that does fit. |
46 ; and let you load non-shared .so objects (Linux, Win32...). However, OS X | 54 ; So all accesses to global variables must use this macro, e.g. |
47 ; requires PIC code in its .dylib objects. | |
48 ; | |
49 ; - GLOBAL should be used as a suffix for global addressing, eg. | |
50 ; picgetgot ebx | |
51 ; mov eax, [foo GLOBAL] | 55 ; mov eax, [foo GLOBAL] |
52 ; instead of | 56 ; instead of |
53 ; mov eax, [foo] | 57 ; mov eax, [foo] |
54 ; | 58 ; |
55 ; - picgetgot computes the GOT address into the given register in PIC | 59 ; x86_32 doesn't require PIC. |
56 ; mode, otherwise does nothing. You need to do this before using GLOBAL. | 60 ; Some distros prefer shared objects to be PIC, but nothing breaks if |
57 ; Before in both execution order and compiled code order (so GLOBAL knows | 61 ; the code contains a few textrels, so we'll skip that complexity. |
58 ; which register the GOT is in). | 62 |
59 | 63 %ifdef WIN64 |
60 %ifndef PIC | 64 %define PIC |
65 %elifndef ARCH_X86_64 | |
66 %undef PIC | |
67 %endif | |
68 %ifdef PIC | |
69 %define GLOBAL wrt rip | |
70 %else | |
61 %define GLOBAL | 71 %define GLOBAL |
62 %macro picgetgot 1 | |
63 %endmacro | |
64 %elifdef ARCH_X86_64 | |
65 %define PIC64 | |
66 %define GLOBAL wrt rip | |
67 %macro picgetgot 1 | |
68 %endmacro | |
69 %else | |
70 %define PIC32 | |
71 %ifidn __OUTPUT_FORMAT__,macho | |
72 ; There is no real global offset table on OS X, but we still | |
73 ; need to reference our variables by offset. | |
74 %macro picgetgot 1 | |
75 call %%getgot | |
76 %%getgot: | |
77 pop %1 | |
78 add %1, $$ - %%getgot | |
79 %undef GLOBAL | |
80 %define GLOBAL + %1 - fakegot | |
81 %endmacro | |
82 %else ; elf | |
83 extern _GLOBAL_OFFSET_TABLE_ | |
84 %macro picgetgot 1 | |
85 call %%getgot | |
86 %%getgot: | |
87 pop %1 | |
88 add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc | |
89 %undef GLOBAL | |
90 %define GLOBAL + %1 wrt ..gotoff | |
91 %endmacro | |
92 %endif | |
93 %endif | 72 %endif |
94 | 73 |
95 ; Macros to eliminate most code duplication between x86_32 and x86_64: | 74 ; Macros to eliminate most code duplication between x86_32 and x86_64: |
96 ; Currently this works only for leaf functions which load all their arguments | 75 ; Currently this works only for leaf functions which load all their arguments |
97 ; into registers at the start, and make no other use of the stack. Luckily that | 76 ; into registers at the start, and make no other use of the stack. Luckily that |
98 ; covers most of x264's asm. | 77 ; covers most of x264's asm. |
99 | 78 |
100 ; PROLOGUE: | 79 ; PROLOGUE: |
101 ; %1 = number of arguments. loads them from stack if needed. | 80 ; %1 = number of arguments. loads them from stack if needed. |
102 ; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed. | 81 ; %2 = number of registers used. pushes callee-saved regs if needed. |
103 ; %3 = whether global constants are used in this function. inits x86_32 PIC if needed. | 82 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. |
104 ; %4 = list of names to define to registers | 83 ; %4 = list of names to define to registers |
105 ; PROLOGUE can also be invoked by adding the same options to cglobal | 84 ; PROLOGUE can also be invoked by adding the same options to cglobal |
106 | 85 |
107 ; e.g. | 86 ; e.g. |
108 ; cglobal foo, 2,3,0, dst, src, tmp | 87 ; cglobal foo, 2,3, dst, src, tmp |
109 ; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals | 88 ; declares a function (foo), taking two args (dst and src) and one local variable (tmp) |
110 | 89 |
111 ; TODO Some functions can use some args directly from the stack. If they're the | 90 ; TODO Some functions can use some args directly from the stack. If they're the |
112 ; last args then you can just not declare them, but if they're in the middle | 91 ; last args then you can just not declare them, but if they're in the middle |
113 ; we need a more flexible macro. | 92 ; we need a more flexible macro. |
114 | 93 |
116 ; Pops anything that was pushed by PROLOGUE | 95 ; Pops anything that was pushed by PROLOGUE |
117 | 96 |
118 ; REP_RET: | 97 ; REP_RET: |
119 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons | 98 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons |
120 ; which are slow when a normal ret follows a branch. | 99 ; which are slow when a normal ret follows a branch. |
100 | |
101 ; registers: | |
102 ; rN and rNq are the native-size register holding function argument N | |
103 ; rNd, rNw, rNb are dword, word, and byte size | |
104 ; rNm is the original location of arg N (a register or on the stack), dword | |
105 ; rNmp is native size | |
121 | 106 |
122 %macro DECLARE_REG 6 | 107 %macro DECLARE_REG 6 |
123 %define r%1q %2 | 108 %define r%1q %2 |
124 %define r%1d %3 | 109 %define r%1d %3 |
125 %define r%1w %4 | 110 %define r%1w %4 |
126 %define r%1b %5 | 111 %define r%1b %5 |
127 %define r%1m %6 | 112 %define r%1m %6 |
113 %ifid %6 ; i.e. it's a register | |
114 %define r%1mp %2 | |
115 %elifdef ARCH_X86_64 ; memory | |
116 %define r%1mp qword %6 | |
117 %else | |
118 %define r%1mp dword %6 | |
119 %endif | |
128 %define r%1 %2 | 120 %define r%1 %2 |
129 %endmacro | 121 %endmacro |
130 | 122 |
131 %macro DECLARE_REG_SIZE 2 | 123 %macro DECLARE_REG_SIZE 2 |
132 %define r%1q r%1 | 124 %define r%1q r%1 |
148 DECLARE_REG_SIZE dx, dl | 140 DECLARE_REG_SIZE dx, dl |
149 DECLARE_REG_SIZE si, sil | 141 DECLARE_REG_SIZE si, sil |
150 DECLARE_REG_SIZE di, dil | 142 DECLARE_REG_SIZE di, dil |
151 DECLARE_REG_SIZE bp, bpl | 143 DECLARE_REG_SIZE bp, bpl |
152 | 144 |
145 ; t# defines for when per-arch register allocation is more complex than just function arguments | |
146 | |
147 %macro DECLARE_REG_TMP 1-* | |
148 %assign %%i 0 | |
149 %rep %0 | |
150 CAT_XDEFINE t, %%i, r%1 | |
151 %assign %%i %%i+1 | |
152 %rotate 1 | |
153 %endrep | |
154 %endmacro | |
155 | |
156 %macro DECLARE_REG_TMP_SIZE 0-* | |
157 %rep %0 | |
158 %define t%1q t%1 %+ q | |
159 %define t%1d t%1 %+ d | |
160 %define t%1w t%1 %+ w | |
161 %define t%1b t%1 %+ b | |
162 %rotate 1 | |
163 %endrep | |
164 %endmacro | |
165 | |
166 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7 | |
167 | |
153 %ifdef ARCH_X86_64 | 168 %ifdef ARCH_X86_64 |
154 %define gprsize 8 | 169 %define gprsize 8 |
155 %else | 170 %else |
156 %define gprsize 4 | 171 %define gprsize 4 |
157 %endif | 172 %endif |
222 %rotate 1 | 237 %rotate 1 |
223 %endrep | 238 %endrep |
224 %assign n_arg_names %%i | 239 %assign n_arg_names %%i |
225 %endmacro | 240 %endmacro |
226 | 241 |
227 %ifdef ARCH_X86_64 ;========================================================== | 242 %ifdef WIN64 ; Windows x64 ;================================================= |
228 %ifidn __OUTPUT_FORMAT__,win32 | |
229 | 243 |
230 DECLARE_REG 0, rcx, ecx, cx, cl, ecx | 244 DECLARE_REG 0, rcx, ecx, cx, cl, ecx |
231 DECLARE_REG 1, rdx, edx, dx, dl, edx | 245 DECLARE_REG 1, rdx, edx, dx, dl, edx |
232 DECLARE_REG 2, r8, r8d, r8w, r8b, r8d | 246 DECLARE_REG 2, r8, r8d, r8w, r8b, r8d |
233 DECLARE_REG 3, r9, r9d, r9w, r9b, r9d | 247 DECLARE_REG 3, r9, r9d, r9w, r9b, r9d |
237 %define r7m [rsp + stack_offset + 64] | 251 %define r7m [rsp + stack_offset + 64] |
238 %define r8m [rsp + stack_offset + 72] | 252 %define r8m [rsp + stack_offset + 72] |
239 | 253 |
240 %macro LOAD_IF_USED 2 ; reg_id, number_of_args | 254 %macro LOAD_IF_USED 2 ; reg_id, number_of_args |
241 %if %1 < %2 | 255 %if %1 < %2 |
242 mov r%1, [rsp + 8 + %1*8] | 256 mov r%1, [rsp + stack_offset + 8 + %1*8] |
243 %endif | 257 %endif |
244 %endmacro | 258 %endmacro |
245 | 259 |
246 %else ;======================================================================= | 260 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... |
261 ASSERT %2 >= %1 | |
262 %assign regs_used %2 | |
263 ASSERT regs_used <= 7 | |
264 %if %0 > 2 | |
265 %assign xmm_regs_used %3 | |
266 %else | |
267 %assign xmm_regs_used 0 | |
268 %endif | |
269 ASSERT xmm_regs_used <= 16 | |
270 %if regs_used > 4 | |
271 push r4 | |
272 push r5 | |
273 %assign stack_offset stack_offset+16 | |
274 %endif | |
275 %if xmm_regs_used > 6 | |
276 sub rsp, (xmm_regs_used-6)*16+16 | |
277 %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16 | |
278 %assign %%i xmm_regs_used | |
279 %rep (xmm_regs_used-6) | |
280 %assign %%i %%i-1 | |
281 movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i | |
282 %endrep | |
283 %endif | |
284 LOAD_IF_USED 4, %1 | |
285 LOAD_IF_USED 5, %1 | |
286 LOAD_IF_USED 6, %1 | |
287 DEFINE_ARGS %4 | |
288 %endmacro | |
289 | |
290 %macro RESTORE_XMM_INTERNAL 1 | |
291 %if xmm_regs_used > 6 | |
292 %assign %%i xmm_regs_used | |
293 %rep (xmm_regs_used-6) | |
294 %assign %%i %%i-1 | |
295 movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8] | |
296 %endrep | |
297 add %1, (xmm_regs_used-6)*16+16 | |
298 %endif | |
299 %endmacro | |
300 | |
301 %macro RESTORE_XMM 1 | |
302 RESTORE_XMM_INTERNAL %1 | |
303 %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 | |
304 %assign xmm_regs_used 0 | |
305 %endmacro | |
306 | |
307 %macro RET 0 | |
308 RESTORE_XMM_INTERNAL rsp | |
309 %if regs_used > 4 | |
310 pop r5 | |
311 pop r4 | |
312 %endif | |
313 ret | |
314 %endmacro | |
315 | |
316 %macro REP_RET 0 | |
317 %if regs_used > 4 || xmm_regs_used > 6 | |
318 RET | |
319 %else | |
320 rep ret | |
321 %endif | |
322 %endmacro | |
323 | |
324 %elifdef ARCH_X86_64 ; *nix x64 ;============================================= | |
247 | 325 |
248 DECLARE_REG 0, rdi, edi, di, dil, edi | 326 DECLARE_REG 0, rdi, edi, di, dil, edi |
249 DECLARE_REG 1, rsi, esi, si, sil, esi | 327 DECLARE_REG 1, rsi, esi, si, sil, esi |
250 DECLARE_REG 2, rdx, edx, dx, dl, edx | 328 DECLARE_REG 2, rdx, edx, dx, dl, edx |
251 DECLARE_REG 3, rcx, ecx, cx, cl, ecx | 329 DECLARE_REG 3, rcx, ecx, cx, cl, ecx |
259 %if %1 < %2 | 337 %if %1 < %2 |
260 mov r%1, [rsp - 40 + %1*8] | 338 mov r%1, [rsp - 40 + %1*8] |
261 %endif | 339 %endif |
262 %endmacro | 340 %endmacro |
263 | 341 |
264 %endif ; !WIN64 | 342 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... |
265 | |
266 %macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names... | |
267 ASSERT %2 >= %1 | 343 ASSERT %2 >= %1 |
268 ASSERT %2 <= 7 | 344 ASSERT %2 <= 7 |
269 %assign stack_offset 0 | |
270 %ifidn __OUTPUT_FORMAT__,win32 | |
271 LOAD_IF_USED 4, %1 | |
272 LOAD_IF_USED 5, %1 | |
273 %endif | |
274 LOAD_IF_USED 6, %1 | 345 LOAD_IF_USED 6, %1 |
275 DEFINE_ARGS %4 | 346 DEFINE_ARGS %4 |
276 %endmacro | 347 %endmacro |
277 | 348 |
278 %macro RET 0 | 349 %macro RET 0 |
313 %if %1 < %2 | 384 %if %1 < %2 |
314 mov r%1, [esp + stack_offset + 4 + %1*4] | 385 mov r%1, [esp + stack_offset + 4 + %1*4] |
315 %endif | 386 %endif |
316 %endmacro | 387 %endmacro |
317 | 388 |
318 %macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names... | 389 %macro PROLOGUE 2-4+ ; #args, #regs, arg_names... |
319 ASSERT %2 >= %1 | 390 ASSERT %2 >= %1 |
320 %assign stack_offset 0 | |
321 %assign regs_used %2 | 391 %assign regs_used %2 |
322 %ifdef PIC | |
323 %if %3 | |
324 %assign regs_used regs_used+1 | |
325 %endif | |
326 %endif | |
327 ASSERT regs_used <= 7 | 392 ASSERT regs_used <= 7 |
328 PUSH_IF_USED 3 | 393 PUSH_IF_USED 3 |
329 PUSH_IF_USED 4 | 394 PUSH_IF_USED 4 |
330 PUSH_IF_USED 5 | 395 PUSH_IF_USED 5 |
331 PUSH_IF_USED 6 | 396 PUSH_IF_USED 6 |
334 LOAD_IF_USED 2, %1 | 399 LOAD_IF_USED 2, %1 |
335 LOAD_IF_USED 3, %1 | 400 LOAD_IF_USED 3, %1 |
336 LOAD_IF_USED 4, %1 | 401 LOAD_IF_USED 4, %1 |
337 LOAD_IF_USED 5, %1 | 402 LOAD_IF_USED 5, %1 |
338 LOAD_IF_USED 6, %1 | 403 LOAD_IF_USED 6, %1 |
339 %if %3 | |
340 picgetgot r%2 | |
341 %endif | |
342 DEFINE_ARGS %4 | 404 DEFINE_ARGS %4 |
343 %endmacro | 405 %endmacro |
344 | 406 |
345 %macro RET 0 | 407 %macro RET 0 |
346 POP_IF_USED 6 | 408 POP_IF_USED 6 |
380 global %1 | 442 global %1 |
381 %endif | 443 %endif |
382 align function_align | 444 align function_align |
383 %1: | 445 %1: |
384 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer | 446 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer |
447 %assign stack_offset 0 | |
385 %if %0 > 1 | 448 %if %0 > 1 |
386 PROLOGUE %2 | 449 PROLOGUE %2 |
387 %endif | 450 %endif |
388 %endmacro | 451 %endmacro |
389 | 452 |
390 %macro cextern 1 | 453 %macro cextern 1 |
391 %ifdef PREFIX | 454 %ifdef PREFIX |
392 extern _%1 | 455 %xdefine %1 _%1 |
393 %define %1 _%1 | 456 %endif |
394 %else | 457 extern %1 |
395 extern %1 | |
396 %endif | |
397 %endmacro | 458 %endmacro |
398 | 459 |
399 ; This is needed for ELF, otherwise the GNU linker assumes the stack is | 460 ; This is needed for ELF, otherwise the GNU linker assumes the stack is |
400 ; executable by default. | 461 ; executable by default. |
401 %ifidn __OUTPUT_FORMAT__,elf | 462 %ifidn __OUTPUT_FORMAT__,elf |
521 | 582 |
522 %macro LOAD_MM_PERMUTATION 1 | 583 %macro LOAD_MM_PERMUTATION 1 |
523 %assign %%i 0 | 584 %assign %%i 0 |
524 %rep num_mmregs | 585 %rep num_mmregs |
525 CAT_XDEFINE m, %%i, %1_m %+ %%i | 586 CAT_XDEFINE m, %%i, %1_m %+ %%i |
587 CAT_XDEFINE n, m %+ %%i, %%i | |
526 %assign %%i %%i+1 | 588 %assign %%i %%i+1 |
527 %endrep | 589 %endrep |
528 %endmacro | 590 %endmacro |
529 | 591 |
530 %macro call 1 | 592 %macro call 1 |
532 %ifdef %1_m0 | 594 %ifdef %1_m0 |
533 LOAD_MM_PERMUTATION %1 | 595 LOAD_MM_PERMUTATION %1 |
534 %endif | 596 %endif |
535 %endmacro | 597 %endmacro |
536 | 598 |
537 ; substitutions which are functionally identical but reduce code size | 599 ;Substitutions that reduce instruction size but are functionally equivalent |
538 %define movdqa movaps | 600 %define movdqa movaps |
539 %define movdqu movups | 601 %define movdqu movups |
540 | 602 |
603 %macro add 2 | |
604 %ifnum %2 | |
605 %if %2==128 | |
606 sub %1, -128 | |
607 %else | |
608 add %1, %2 | |
609 %endif | |
610 %else | |
611 add %1, %2 | |
612 %endif | |
613 %endmacro | |
614 | |
615 %macro sub 2 | |
616 %ifnum %2 | |
617 %if %2==128 | |
618 add %1, -128 | |
619 %else | |
620 sub %1, %2 | |
621 %endif | |
622 %else | |
623 sub %1, %2 | |
624 %endif | |
625 %endmacro |