Mercurial > libavcodec.hg
annotate x86/fft_mmx.asm @ 11060:daff45175333 libavcodec
Make the jump-table section-relative for x86_64 with PIC enabled.
This allows to get rid of the macho64 specific hack that moves them
to rodata (with worse cache behaviour) and avoids textrels which
e.g. Gentoo does not allow for x86_64 libraries.
author | reimar |
---|---|
date | Sat, 30 Jan 2010 19:26:47 +0000 |
parents | c6aa538c0bc3 |
children | 980030a3e315 |
rev | line source |
---|---|
8430 | 1 ;****************************************************************************** |
2 ;* FFT transform with SSE/3DNow optimizations | |
3 ;* Copyright (c) 2008 Loren Merritt | |
4 ;* | |
5 ;* This file is part of FFmpeg. | |
6 ;* | |
7 ;* FFmpeg is free software; you can redistribute it and/or | |
8 ;* modify it under the terms of the GNU Lesser General Public | |
9 ;* License as published by the Free Software Foundation; either | |
10 ;* version 2.1 of the License, or (at your option) any later version. | |
11 ;* | |
12 ;* FFmpeg is distributed in the hope that it will be useful, | |
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 ;* Lesser General Public License for more details. | |
16 ;* | |
17 ;* You should have received a copy of the GNU Lesser General Public | |
18 ;* License along with FFmpeg; if not, write to the Free Software | |
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 ;****************************************************************************** | |
21 | |
22 ; These functions are not individually interchangeable with the C versions. | |
23 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results | |
24 ; in blocks as convenient to the vector size. | |
25 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) | |
26 | |
27 %include "x86inc.asm" | |
28 | |
29 SECTION_RODATA | |
30 | |
31 %define M_SQRT1_2 0.70710678118654752440 | |
32 ps_root2: times 4 dd M_SQRT1_2 | |
33 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 | |
34 ps_m1p1: dd 1<<31, 0 | |
35 | |
36 %assign i 16 | |
37 %rep 13 | |
38 cextern ff_cos_ %+ i | |
39 %assign i i<<1 | |
40 %endrep | |
41 | |
42 %ifdef ARCH_X86_64 | |
43 %define pointer dq | |
44 %else | |
45 %define pointer dd | |
46 %endif | |
47 | |
48 %macro IF0 1+ | |
49 %endmacro | |
50 %macro IF1 1+ | |
51 %1 | |
52 %endmacro | |
53 | |
54 section .text align=16 | |
55 | |
56 %macro T2_3DN 4 ; z0, z1, mem0, mem1: z0 = mem0 + mem1, z1 = mem0 - mem1 | |
57 mova %1, %3 | |
58 mova %2, %1 | |
59 pfadd %1, %4 | |
60 pfsub %2, %4 | |
61 %endmacro | |
62 | |
63 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1 | |
64 mova %5, %3 | |
65 pfsub %3, %4 | |
66 pfadd %5, %4 ; {t6,t5} | |
67 pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7} | |
68 mova %6, %1 | |
69 pswapd %3, %3 | |
70 pfadd %1, %5 ; {r0,i0} | |
71 pfsub %6, %5 ; {r2,i2} | |
72 mova %4, %2 | |
73 pfadd %2, %3 ; {r1,i1} | |
74 pfsub %4, %3 ; {r3,i3} | |
75 SWAP %3, %6 | |
76 %endmacro | |
77 | |
78 ; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3} | |
79 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} | |
80 %macro T4_SSE 3 | |
81 mova %3, %1 | |
82 shufps %1, %2, 0x64 ; {r0,i0,r3,i2} | |
83 shufps %3, %2, 0xce ; {r1,i1,r2,i3} | |
84 mova %2, %1 | |
85 addps %1, %3 ; {t1,t2,t6,t5} | |
86 subps %2, %3 ; {t3,t4,t8,t7} | |
87 mova %3, %1 | |
88 shufps %1, %2, 0x44 ; {t1,t2,t3,t4} | |
89 shufps %3, %2, 0xbe ; {t6,t5,t7,t8} | |
90 mova %2, %1 | |
91 addps %1, %3 ; {r0,i0,r1,i1} | |
92 subps %2, %3 ; {r2,i2,r3,i3} | |
93 mova %3, %1 | |
94 shufps %1, %2, 0x88 ; {r0,r1,r2,r3} | |
95 shufps %3, %2, 0xdd ; {i0,i1,i2,i3} | |
96 SWAP %2, %3 | |
97 %endmacro | |
98 | |
99 %macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1 | |
100 mova %5, %3 | |
101 shufps %3, %4, 0x44 ; {r4,i4,r6,i6} | |
102 shufps %5, %4, 0xee ; {r5,i5,r7,i7} | |
103 mova %6, %3 | |
104 subps %3, %5 ; {r5,i5,r7,i7} | |
105 addps %6, %5 ; {t1,t2,t3,t4} | |
106 mova %5, %3 | |
107 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} | |
108 mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7} | |
109 mulps %5, [ps_root2 GLOBAL] | |
110 addps %3, %5 ; {t8,t7,ta,t9} | |
111 mova %5, %6 | |
112 shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | |
113 shufps %5, %3, 0x9c ; {t1,t4,t7,ta} | |
114 mova %3, %6 | |
115 addps %6, %5 ; {t1,t2,t9,ta} | |
116 subps %3, %5 ; {t6,t5,tc,tb} | |
117 mova %5, %6 | |
118 shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} | |
119 shufps %5, %3, 0x8d ; {t2,ta,t6,tc} | |
120 mova %3, %1 | |
121 mova %4, %2 | |
122 addps %1, %6 ; {r0,r1,r2,r3} | |
123 addps %2, %5 ; {i0,i1,i2,i3} | |
124 subps %3, %6 ; {r4,r5,r6,r7} | |
125 subps %4, %5 ; {i4,i5,i6,i7} | |
126 %endmacro | |
127 | |
128 ; scheduled for cpu-bound sizes | |
129 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim | |
130 IF%1 mova m4, Z(4) | |
131 IF%1 mova m5, Z(5) | |
132 mova m0, %2 ; wre | |
133 mova m2, m4 | |
134 mova m1, %3 ; wim | |
135 mova m3, m5 | |
136 mulps m2, m0 ; r2*wre | |
137 IF%1 mova m6, Z(6) | |
138 mulps m3, m1 ; i2*wim | |
139 IF%1 mova m7, Z(7) | |
140 mulps m4, m1 ; r2*wim | |
141 mulps m5, m0 ; i2*wre | |
142 addps m2, m3 ; r2*wre + i2*wim | |
143 mova m3, m1 | |
144 mulps m1, m6 ; r3*wim | |
145 subps m5, m4 ; i2*wre - r2*wim | |
146 mova m4, m0 | |
147 mulps m3, m7 ; i3*wim | |
148 mulps m4, m6 ; r3*wre | |
149 mulps m0, m7 ; i3*wre | |
150 subps m4, m3 ; r3*wre - i3*wim | |
151 mova m3, Z(0) | |
152 addps m0, m1 ; i3*wre + r3*wim | |
153 mova m1, m4 | |
154 addps m4, m2 ; t5 | |
155 subps m1, m2 ; t3 | |
156 subps m3, m4 ; r2 | |
157 addps m4, Z(0) ; r0 | |
158 mova m6, Z(2) | |
159 mova Z(4), m3 | |
160 mova Z(0), m4 | |
161 mova m3, m5 | |
162 subps m5, m0 ; t4 | |
163 mova m4, m6 | |
164 subps m6, m5 ; r3 | |
165 addps m5, m4 ; r1 | |
166 mova Z(6), m6 | |
167 mova Z(2), m5 | |
168 mova m2, Z(3) | |
169 addps m3, m0 ; t6 | |
170 subps m2, m1 ; i3 | |
171 mova m7, Z(1) | |
172 addps m1, Z(3) ; i1 | |
173 mova Z(7), m2 | |
174 mova Z(3), m1 | |
175 mova m4, m7 | |
176 subps m7, m3 ; i2 | |
177 addps m3, m4 ; i0 | |
178 mova Z(5), m7 | |
179 mova Z(1), m3 | |
180 %endmacro | |
181 | |
182 ; scheduled to avoid store->load aliasing | |
183 %macro PASS_BIG 1 ; (!interleave) | |
184 mova m4, Z(4) ; r2 | |
185 mova m5, Z(5) ; i2 | |
186 mova m2, m4 | |
187 mova m0, [wq] ; wre | |
188 mova m3, m5 | |
189 mova m1, [wq+o1q] ; wim | |
190 mulps m2, m0 ; r2*wre | |
191 mova m6, Z(6) ; r3 | |
192 mulps m3, m1 ; i2*wim | |
193 mova m7, Z(7) ; i3 | |
194 mulps m4, m1 ; r2*wim | |
195 mulps m5, m0 ; i2*wre | |
196 addps m2, m3 ; r2*wre + i2*wim | |
197 mova m3, m1 | |
198 mulps m1, m6 ; r3*wim | |
199 subps m5, m4 ; i2*wre - r2*wim | |
200 mova m4, m0 | |
201 mulps m3, m7 ; i3*wim | |
202 mulps m4, m6 ; r3*wre | |
203 mulps m0, m7 ; i3*wre | |
204 subps m4, m3 ; r3*wre - i3*wim | |
205 mova m3, Z(0) | |
206 addps m0, m1 ; i3*wre + r3*wim | |
207 mova m1, m4 | |
208 addps m4, m2 ; t5 | |
209 subps m1, m2 ; t3 | |
210 subps m3, m4 ; r2 | |
211 addps m4, Z(0) ; r0 | |
212 mova m6, Z(2) | |
213 mova Z(4), m3 | |
214 mova Z(0), m4 | |
215 mova m3, m5 | |
216 subps m5, m0 ; t4 | |
217 mova m4, m6 | |
218 subps m6, m5 ; r3 | |
219 addps m5, m4 ; r1 | |
220 IF%1 mova Z(6), m6 | |
221 IF%1 mova Z(2), m5 | |
222 mova m2, Z(3) | |
223 addps m3, m0 ; t6 | |
224 subps m2, m1 ; i3 | |
225 mova m7, Z(1) | |
226 addps m1, Z(3) ; i1 | |
227 IF%1 mova Z(7), m2 | |
228 IF%1 mova Z(3), m1 | |
229 mova m4, m7 | |
230 subps m7, m3 ; i2 | |
231 addps m3, m4 ; i0 | |
232 IF%1 mova Z(5), m7 | |
233 IF%1 mova Z(1), m3 | |
234 %if %1==0 | |
235 mova m4, m5 ; r1 | |
236 mova m0, m6 ; r3 | |
237 unpcklps m5, m1 | |
238 unpckhps m4, m1 | |
239 unpcklps m6, m2 | |
240 unpckhps m0, m2 | |
241 mova m1, Z(0) | |
242 mova m2, Z(4) | |
243 mova Z(2), m5 | |
244 mova Z(3), m4 | |
245 mova Z(6), m6 | |
246 mova Z(7), m0 | |
247 mova m5, m1 ; r0 | |
248 mova m4, m2 ; r2 | |
249 unpcklps m1, m3 | |
250 unpckhps m5, m3 | |
251 unpcklps m2, m7 | |
252 unpckhps m4, m7 | |
253 mova Z(0), m1 | |
254 mova Z(1), m5 | |
255 mova Z(4), m2 | |
256 mova Z(5), m4 | |
257 %endif | |
258 %endmacro | |
259 | |
260 %macro PUNPCK 3 | |
261 mova %3, %1 | |
262 punpckldq %1, %2 | |
263 punpckhdq %3, %2 | |
264 %endmacro | |
265 | |
266 INIT_XMM | |
10452
c6aa538c0bc3
s/movdqa/movaps/ in sse1 fft. (regression in r20293)
lorenm
parents:
10019
diff
changeset
|
267 %define mova movaps |
8430 | 268 |
269 %define Z(x) [r0+mmsize*x] | |
270 | |
271 align 16 | |
272 fft4_sse: | |
273 mova m0, Z(0) | |
274 mova m1, Z(1) | |
275 T4_SSE m0, m1, m2 | |
276 mova Z(0), m0 | |
277 mova Z(1), m1 | |
278 ret | |
279 | |
280 align 16 | |
281 fft8_sse: | |
282 mova m0, Z(0) | |
283 mova m1, Z(1) | |
284 T4_SSE m0, m1, m2 | |
285 mova m2, Z(2) | |
286 mova m3, Z(3) | |
287 T8_SSE m0, m1, m2, m3, m4, m5 | |
288 mova Z(0), m0 | |
289 mova Z(1), m1 | |
290 mova Z(2), m2 | |
291 mova Z(3), m3 | |
292 ret | |
293 | |
294 align 16 | |
295 fft16_sse: | |
296 mova m0, Z(0) | |
297 mova m1, Z(1) | |
298 T4_SSE m0, m1, m2 | |
299 mova m2, Z(2) | |
300 mova m3, Z(3) | |
301 T8_SSE m0, m1, m2, m3, m4, m5 | |
302 mova m4, Z(4) | |
303 mova m5, Z(5) | |
304 mova Z(0), m0 | |
305 mova Z(1), m1 | |
306 mova Z(2), m2 | |
307 mova Z(3), m3 | |
308 T4_SSE m4, m5, m6 | |
309 mova m6, Z(6) | |
310 mova m7, Z(7) | |
311 T4_SSE m6, m7, m0 | |
312 PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL] | |
313 ret | |
314 | |
315 | |
316 INIT_MMX | |
317 | |
318 %macro FFT48_3DN 1 | |
319 align 16 | |
320 fft4%1: | |
321 T2_3DN m0, m1, Z(0), Z(1) | |
322 mova m2, Z(2) | |
323 mova m3, Z(3) | |
324 T4_3DN m0, m1, m2, m3, m4, m5 | |
325 PUNPCK m0, m1, m4 | |
326 PUNPCK m2, m3, m5 | |
327 mova Z(0), m0 | |
328 mova Z(1), m4 | |
329 mova Z(2), m2 | |
330 mova Z(3), m5 | |
331 ret | |
332 | |
333 align 16 | |
334 fft8%1: | |
335 T2_3DN m0, m1, Z(0), Z(1) | |
336 mova m2, Z(2) | |
337 mova m3, Z(3) | |
338 T4_3DN m0, m1, m2, m3, m4, m5 | |
339 mova Z(0), m0 | |
340 mova Z(2), m2 | |
341 T2_3DN m4, m5, Z(4), Z(5) | |
342 T2_3DN m6, m7, Z(6), Z(7) | |
343 pswapd m0, m5 | |
344 pswapd m2, m7 | |
345 pxor m0, [ps_m1p1 GLOBAL] | |
346 pxor m2, [ps_m1p1 GLOBAL] | |
347 pfsub m5, m0 | |
348 pfadd m7, m2 | |
349 pfmul m5, [ps_root2 GLOBAL] | |
350 pfmul m7, [ps_root2 GLOBAL] | |
351 T4_3DN m1, m3, m5, m7, m0, m2 | |
352 mova Z(5), m5 | |
353 mova Z(7), m7 | |
354 mova m0, Z(0) | |
355 mova m2, Z(2) | |
356 T4_3DN m0, m2, m4, m6, m5, m7 | |
357 PUNPCK m0, m1, m5 | |
358 PUNPCK m2, m3, m7 | |
359 mova Z(0), m0 | |
360 mova Z(1), m5 | |
361 mova Z(2), m2 | |
362 mova Z(3), m7 | |
363 PUNPCK m4, Z(5), m5 | |
364 PUNPCK m6, Z(7), m7 | |
365 mova Z(4), m4 | |
366 mova Z(5), m5 | |
367 mova Z(6), m6 | |
368 mova Z(7), m7 | |
369 ret | |
370 %endmacro | |
371 | |
372 FFT48_3DN _3dn2 | |
373 | |
374 %macro pswapd 2 | |
375 %ifidn %1, %2 | |
376 movd [r0+12], %1 | |
377 punpckhdq %1, [r0+8] | |
378 %else | |
379 movq %1, %2 | |
380 psrlq %1, 32 | |
381 punpckldq %1, %2 | |
382 %endif | |
383 %endmacro | |
384 | |
385 FFT48_3DN _3dn | |
386 | |
387 | |
388 %define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)] | |
389 | |
390 %macro DECL_PASS 2+ ; name, payload: emits label name, then repeats payload, advancing zq by mmsize*2 and wq by mmsize and subtracting mmsize/8 from nd, while nd stays > 0 | |
391 align 16 | |
392 %1: | |
393 DEFINE_ARGS z, w, n, o1, o3 | |
394 lea o3q, [nq*3] | |
395 lea o1q, [nq*8] | |
396 shl o3q, 4 | |
397 .loop: | |
398 %2 | |
399 add zq, mmsize*2 | |
400 add wq, mmsize | |
401 sub nd, mmsize/8 | |
402 jg .loop | |
403 rep ret | |
404 %endmacro | |
405 | |
406 INIT_XMM | |
10452
c6aa538c0bc3
s/movdqa/movaps/ in sse1 fft. (regression in r20293)
lorenm
parents:
10019
diff
changeset
|
407 %define mova movaps |
8430 | 408 DECL_PASS pass_sse, PASS_BIG 1 |
409 DECL_PASS pass_interleave_sse, PASS_BIG 0 | |
410 | |
411 INIT_MMX | |
412 %define mulps pfmul | |
413 %define addps pfadd | |
414 %define subps pfsub | |
415 %define unpcklps punpckldq | |
416 %define unpckhps punpckhdq | |
417 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q] | |
418 DECL_PASS pass_interleave_3dn, PASS_BIG 0 | |
419 %define pass_3dn2 pass_3dn | |
420 %define pass_interleave_3dn2 pass_interleave_3dn | |
421 | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
422 %ifdef PIC |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
423 %define SECTION_REL - $$ |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
424 %else |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
425 %define SECTION_REL |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
426 %endif |
8430 | 427 |
428 %macro DECL_FFT 2-3 ; nbits, cpu, suffix | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
429 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL |
8430 | 430 %if %1==5 |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
431 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL |
8430 | 432 %endif |
433 | |
434 %assign n 1<<%1 | |
435 %rep 17-%1 | |
436 %assign n2 n/2 | |
437 %assign n4 n/4 | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
438 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL |
8430 | 439 |
440 align 16 | |
441 fft %+ n %+ %3%2: | |
442 call fft %+ n2 %+ %2 | |
443 add r0, n*4 - (n&(-2<<%1)) | |
444 call fft %+ n4 %+ %2 | |
445 add r0, n*2 - (n2&(-2<<%1)) | |
446 call fft %+ n4 %+ %2 | |
447 sub r0, n*6 + (n2&(-2<<%1)) | |
448 lea r1, [ff_cos_ %+ n GLOBAL] | |
449 mov r2d, n4/2 | |
450 jmp pass%3%2 | |
451 | |
452 %assign n n*2 | |
453 %endrep | |
454 %undef n | |
455 | |
456 align 8 | |
457 dispatch_tab%3%2: pointer list_of_fft | |
458 | |
8820
7400956a815d
Put dispatch_tab in the rodata section for macho64.
astrange
parents:
8430
diff
changeset
|
459 section .text |
7400956a815d
Put dispatch_tab in the rodata section for macho64.
astrange
parents:
8430
diff
changeset
|
460 |
8430 | 461 ; On x86_32, this function does the register saving and restoring for all of fft. |
462 ; The others pass args in registers and don't spill anything. | |
10019
c08ca946c80a
Update x264 asm code to latest to add support for 64-bit Windows.
darkshikari
parents:
8820
diff
changeset
|
463 cglobal fft_dispatch%3%2, 2,5,8, z, nbits |
8430 | 464 lea r2, [dispatch_tab%3%2 GLOBAL] |
465 mov r2, [r2 + (nbitsq-2)*gprsize] | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
466 %ifdef PIC |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
467 lea r3, [$$ GLOBAL] |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
468 add r2, r3 |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
469 %endif |
8430 | 470 call r2 |
471 RET | |
472 %endmacro ; DECL_FFT | |
473 | |
474 DECL_FFT 5, _sse | |
475 DECL_FFT 5, _sse, _interleave | |
476 DECL_FFT 4, _3dn | |
477 DECL_FFT 4, _3dn, _interleave | |
478 DECL_FFT 4, _3dn2 | |
479 DECL_FFT 4, _3dn2, _interleave | |
480 |