Mercurial > libavcodec.hg
annotate x86/fft_mmx.asm @ 12479:ca1896830b44 libavcodec
Fix indentation.
author | reimar |
---|---|
date | Thu, 09 Sep 2010 20:23:41 +0000 |
parents | f61e22f8cf28 |
children | 67e7e49058c2 |
rev | line source |
---|---|
8430 | 1 ;****************************************************************************** |
2 ;* FFT transform with SSE/3DNow optimizations | |
3 ;* Copyright (c) 2008 Loren Merritt | |
4 ;* | |
12188 | 5 ;* This algorithm (though not any of the implementation details) is |
6 ;* based on libdjbfft by D. J. Bernstein. | |
7 ;* | |
8430 | 8 ;* This file is part of FFmpeg. |
9 ;* | |
10 ;* FFmpeg is free software; you can redistribute it and/or | |
11 ;* modify it under the terms of the GNU Lesser General Public | |
12 ;* License as published by the Free Software Foundation; either | |
13 ;* version 2.1 of the License, or (at your option) any later version. | |
14 ;* | |
15 ;* FFmpeg is distributed in the hope that it will be useful, | |
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 ;* Lesser General Public License for more details. | |
19 ;* | |
20 ;* You should have received a copy of the GNU Lesser General Public | |
21 ;* License along with FFmpeg; if not, write to the Free Software | |
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 ;****************************************************************************** | |
24 | |
25 ; These functions are not individually interchangeable with the C versions. | |
26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results | |
27 ; in blocks as convenient to the vector size. | |
28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) | |
29 | |
30 %include "x86inc.asm" | |
31 | |
; Assembler-side layout of the FFTContext fields read by this file
; (nbits, revtab, mdctsize, tcos and tsin are used by imdct_half_sse).
; "pointer" reserves one pointer-sized slot: resq on x86-64, resd otherwise.
; NOTE(review): field order and sizes presumably have to match the C-side
; FFTContext declaration -- confirm whenever that struct changes.
12399 | 32 %ifdef ARCH_X86_64 |
33 %define pointer resq | |
34 %else | |
35 %define pointer resd | |
36 %endif | |
37 | |
38 struc FFTContext | |
39 .nbits: resd 1 | |
40 .reverse: resd 1 | |
41 .revtab: pointer 1 | |
42 .tmpbuf: pointer 1 | |
43 .mdctsize: resd 1 | |
44 .mdctbits: resd 1 | |
45 .tcos: pointer 1 | |
46 .tsin: pointer 1 | |
47 endstruc | |
48 | |
; Read-only constants shared by the transforms in this file:
;   ps_root2      four copies of sqrt(1/2)
;   ps_root2mppm  sqrt(1/2) with signs (-, +, +, -)
;   ps_m1p1       {1<<31, 0}: XOR mask that flips the sign bit of the low dword only
; The %rep loop declares the external symbols cos_16, cos_32, ... cos_65536
; (13 symbols; i doubles on every iteration).
8430 | 49 SECTION_RODATA |
50 | |
51 %define M_SQRT1_2 0.70710678118654752440 | |
52 ps_root2: times 4 dd M_SQRT1_2 | |
53 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 | |
54 ps_m1p1: dd 1<<31, 0 | |
55 | |
56 %assign i 16 | |
57 %rep 13 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
58 cextern cos_ %+ i |
8430 | 59 %assign i i<<1 |
60 %endrep | |
61 | |
; From here on "pointer" emits pointer-sized initialised data (dq / dd)
; instead of reserving space; the dispatch tables use it.
62 %ifdef ARCH_X86_64 | |
63 %define pointer dq | |
64 %else | |
65 %define pointer dd | |
66 %endif | |
67 | |
; Conditional-emit helpers, invoked as "IF%1 <instruction>" inside macros:
; IF0 discards its arguments, IF1 emits them unchanged.
68 %macro IF0 1+ | |
69 %endmacro | |
70 %macro IF1 1+ | |
71 %1 | |
72 %endmacro | |
73 | |
74 section .text align=16 | |
75 | |
; T2_3DN: add/subtract pair on packed floats (3DNow! path).
;   %1 = %3 + %4
;   %2 = %3 - %4
; %3 is read once and %4 twice; the callers in this file pass memory operands.
76 %macro T2_3DN 4 ; z0, z1, mem0, mem1 | |
77 mova %1, %3 | |
78 mova %2, %1 | |
79 pfadd %1, %4 | |
80 pfsub %2, %4 | |
81 %endmacro | |
82 | |
; T4_3DN: combines four two-float vectors in place (3DNow! path).
; Following the inline lane comments, on exit %1={r0,i0}, %2={r1,i1} and
; %4={r3,i3}; {r2,i2} is computed in %6 and handed back under the name %3 by
; the final SWAP. %5 and %6 are scratch.
; pswapd is the native instruction or the macro emulation defined later in
; this file, depending on where FFT48_3DN is instantiated.
; NOTE(review): SWAP comes from x86inc.asm; presumably it exchanges register
; names rather than moving data -- confirm there.
83 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1 | |
84 mova %5, %3 | |
85 pfsub %3, %4 | |
86 pfadd %5, %4 ; {t6,t5} | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
87 pxor %3, [ps_m1p1] ; {t8,t7} |
8430 | 88 mova %6, %1 |
89 pswapd %3, %3 | |
90 pfadd %1, %5 ; {r0,i0} | |
91 pfsub %6, %5 ; {r2,i2} | |
92 mova %4, %2 | |
93 pfadd %2, %3 ; {r1,i1} | |
94 pfsub %4, %3 ; {r3,i3} | |
95 SWAP %3, %6 | |
96 %endmacro | |
97 | |
; T4_SSE: 4-point transform held in two SSE registers; %3 is scratch.
; fft4_sse consists of just this macro between its loads and stores.
; The lane comments on each instruction show the register contents after it.
98 ; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3} | |
99 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} | |
100 %macro T4_SSE 3 | |
101 mova %3, %1 | |
102 shufps %1, %2, 0x64 ; {r0,i0,r3,i2} | |
103 shufps %3, %2, 0xce ; {r1,i1,r2,i3} | |
104 mova %2, %1 | |
105 addps %1, %3 ; {t1,t2,t6,t5} | |
106 subps %2, %3 ; {t3,t4,t8,t7} | |
107 mova %3, %1 | |
108 shufps %1, %2, 0x44 ; {t1,t2,t3,t4} | |
109 shufps %3, %2, 0xbe ; {t6,t5,t7,t8} | |
110 mova %2, %1 | |
111 addps %1, %3 ; {r0,i0,r1,i1} | |
112 subps %2, %3 ; {r2,i2,r3,i3} | |
113 mova %3, %1 | |
114 shufps %1, %2, 0x88 ; {r0,r1,r2,r3} | |
115 shufps %3, %2, 0xdd ; {i0,i1,i2,i3} | |
116 SWAP %2, %3 | |
117 %endmacro | |
118 | |
; T8_SSE: extends a T4_SSE result to 8 points.
;   %1,%2  in:  {r0..r3},{i0..i3} (T4_SSE output layout)
;          out: {r0..r3},{i0..i3}
;   %3,%4  in:  points 4..7 interleaved: {r4,i4,r5,i5},{r6,i6,r7,i7}
;          out: {r4..r7},{i4..i7}
;   %5,%6  scratch
; Reads the constants ps_root2mppm and ps_root2.
119 %macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1 | |
120 mova %5, %3 | |
121 shufps %3, %4, 0x44 ; {r4,i4,r6,i6} | |
122 shufps %5, %4, 0xee ; {r5,i5,r7,i7} | |
123 mova %6, %3 | |
124 subps %3, %5 ; {r5,i5,r7,i7} | |
125 addps %6, %5 ; {t1,t2,t3,t4} | |
126 mova %5, %3 | |
127 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
128 mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} |
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
129 mulps %5, [ps_root2] |
8430 | 130 addps %3, %5 ; {t8,t7,ta,t9} |
131 mova %5, %6 | |
132 shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | |
133 shufps %5, %3, 0x9c ; {t1,t4,t7,ta} | |
134 mova %3, %6 | |
135 addps %6, %5 ; {t1,t2,t9,ta} | |
136 subps %3, %5 ; {t6,t5,tc,tb} | |
137 mova %5, %6 | |
138 shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} | |
139 shufps %5, %3, 0x8d ; {t2,ta,t6,tc} | |
140 mova %3, %1 | |
141 mova %4, %2 | |
142 addps %1, %6 ; {r0,r1,r2,r3} | |
143 addps %2, %5 ; {i0,i1,i2,i3} | |
144 subps %3, %6 ; {r4,r5,r6,r7} | |
145 subps %4, %5 ; {i4,i5,i6,i7} | |
146 %endmacro | |
147 | |
; PASS_SMALL load, wre, wim
; One combining pass over the eight vectors Z(0)..Z(7): the vectors Z(4)..Z(7)
; are multiplied by the twiddle vectors wre (%2) and wim (%3) and then
; added to / subtracted from Z(0)..Z(3); all eight results are stored back.
;   %1 = 1: m4-m7 are loaded from Z(4)..Z(7) here.
;   %1 = 0: the loads are dropped (IF0), so the caller must already hold
;           Z(4)..Z(7) in m4..m7 (as fft16_sse does).
; Uses m0-m7.
148 ; scheduled for cpu-bound sizes | |
149 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim | |
150 IF%1 mova m4, Z(4) | |
151 IF%1 mova m5, Z(5) | |
152 mova m0, %2 ; wre | |
153 mova m2, m4 | |
154 mova m1, %3 ; wim | |
155 mova m3, m5 | |
156 mulps m2, m0 ; r2*wre | |
157 IF%1 mova m6, Z(6) | |
158 mulps m3, m1 ; i2*wim | |
159 IF%1 mova m7, Z(7) | |
160 mulps m4, m1 ; r2*wim | |
161 mulps m5, m0 ; i2*wre | |
162 addps m2, m3 ; r2*wre + i2*wim | |
163 mova m3, m1 | |
164 mulps m1, m6 ; r3*wim | |
165 subps m5, m4 ; i2*wre - r2*wim | |
166 mova m4, m0 | |
167 mulps m3, m7 ; i3*wim | |
168 mulps m4, m6 ; r3*wre | |
169 mulps m0, m7 ; i3*wre | |
170 subps m4, m3 ; r3*wre - i3*wim | |
171 mova m3, Z(0) | |
172 addps m0, m1 ; i3*wre + r3*wim | |
173 mova m1, m4 | |
174 addps m4, m2 ; t5 | |
175 subps m1, m2 ; t3 | |
176 subps m3, m4 ; r2 | |
177 addps m4, Z(0) ; r0 | |
178 mova m6, Z(2) | |
179 mova Z(4), m3 | |
180 mova Z(0), m4 | |
181 mova m3, m5 | |
182 subps m5, m0 ; t4 | |
183 mova m4, m6 | |
184 subps m6, m5 ; r3 | |
185 addps m5, m4 ; r1 | |
186 mova Z(6), m6 | |
187 mova Z(2), m5 | |
188 mova m2, Z(3) | |
189 addps m3, m0 ; t6 | |
190 subps m2, m1 ; i3 | |
191 mova m7, Z(1) | |
192 addps m1, Z(3) ; i1 | |
193 mova Z(7), m2 | |
194 mova Z(3), m1 | |
195 mova m4, m7 | |
196 subps m7, m3 ; i2 | |
197 addps m3, m4 ; i0 | |
198 mova Z(5), m7 | |
199 mova Z(1), m3 | |
200 %endmacro | |
201 | |
; PASS_BIG mode
; Same arithmetic as PASS_SMALL, but Z(4)..Z(7) are always loaded here and
; the twiddles always come from [wq] (wre) and [wq+o1q] (wim).
;   %1 = 1: results are stored as they are produced, as separate real and
;           imaginary vectors.
;   %1 = 0: only Z(0) and Z(4) are stored in the first phase; afterwards each
;           real vector is interleaved with its imaginary counterpart via
;           unpcklps/unpckhps and all eight vectors are written.
; Uses m0-m7.
202 ; scheduled to avoid store->load aliasing | |
203 %macro PASS_BIG 1 ; (!interleave) | |
204 mova m4, Z(4) ; r2 | |
205 mova m5, Z(5) ; i2 | |
206 mova m2, m4 | |
207 mova m0, [wq] ; wre | |
208 mova m3, m5 | |
209 mova m1, [wq+o1q] ; wim | |
210 mulps m2, m0 ; r2*wre | |
211 mova m6, Z(6) ; r3 | |
212 mulps m3, m1 ; i2*wim | |
213 mova m7, Z(7) ; i3 | |
214 mulps m4, m1 ; r2*wim | |
215 mulps m5, m0 ; i2*wre | |
216 addps m2, m3 ; r2*wre + i2*wim | |
217 mova m3, m1 | |
218 mulps m1, m6 ; r3*wim | |
219 subps m5, m4 ; i2*wre - r2*wim | |
220 mova m4, m0 | |
221 mulps m3, m7 ; i3*wim | |
222 mulps m4, m6 ; r3*wre | |
223 mulps m0, m7 ; i3*wre | |
224 subps m4, m3 ; r3*wre - i3*wim | |
225 mova m3, Z(0) | |
226 addps m0, m1 ; i3*wre + r3*wim | |
227 mova m1, m4 | |
228 addps m4, m2 ; t5 | |
229 subps m1, m2 ; t3 | |
230 subps m3, m4 ; r2 | |
231 addps m4, Z(0) ; r0 | |
232 mova m6, Z(2) | |
233 mova Z(4), m3 | |
234 mova Z(0), m4 | |
235 mova m3, m5 | |
236 subps m5, m0 ; t4 | |
237 mova m4, m6 | |
238 subps m6, m5 ; r3 | |
239 addps m5, m4 ; r1 | |
240 IF%1 mova Z(6), m6 | |
241 IF%1 mova Z(2), m5 | |
242 mova m2, Z(3) | |
243 addps m3, m0 ; t6 | |
244 subps m2, m1 ; i3 | |
245 mova m7, Z(1) | |
246 addps m1, Z(3) ; i1 | |
247 IF%1 mova Z(7), m2 | |
248 IF%1 mova Z(3), m1 | |
249 mova m4, m7 | |
250 subps m7, m3 ; i2 | |
251 addps m3, m4 ; i0 | |
252 IF%1 mova Z(5), m7 | |
253 IF%1 mova Z(1), m3 | |
254 %if %1==0 | |
255 mova m4, m5 ; r1 | |
256 mova m0, m6 ; r3 | |
257 unpcklps m5, m1 | |
258 unpckhps m4, m1 | |
259 unpcklps m6, m2 | |
260 unpckhps m0, m2 | |
261 mova m1, Z(0) | |
262 mova m2, Z(4) | |
263 mova Z(2), m5 | |
264 mova Z(3), m4 | |
265 mova Z(6), m6 | |
266 mova Z(7), m0 | |
267 mova m5, m1 ; r0 | |
268 mova m4, m2 ; r2 | |
269 unpcklps m1, m3 | |
270 unpckhps m5, m3 | |
271 unpcklps m2, m7 | |
272 unpckhps m4, m7 | |
273 mova Z(0), m1 | |
274 mova Z(1), m5 | |
275 mova Z(4), m2 | |
276 mova Z(5), m4 | |
277 %endif | |
278 %endmacro | |
279 | |
; PUNPCK a, b, tmp
; Interleaves the dwords of a and b:
;   a   <- low dwords of (a, b)
;   tmp <- high dwords of (a, b)
; b may be a register or a memory operand and is not modified.
280 %macro PUNPCK 3 | |
281 mova %3, %1 | |
282 punpckldq %1, %2 | |
283 punpckhdq %3, %2 | |
284 %endmacro | |
285 | |
; Set-up for the hand-written SSE base cases:
;   mova is forced to movaps (the float form of the aligned move),
;   Z(x) addresses the x-th mmsize-byte vector starting at r0.
286 INIT_XMM | |
10452
c6aa538c0bc3
s/movdqa/movaps/ in sse1 fft. (regression in r20293)
lorenm
parents:
10019
diff
changeset
|
287 %define mova movaps |
8430 | 288 |
289 %define Z(x) [r0+mmsize*x] | |
290 | |
; fft4_sse: in-place 4-point transform of the two vectors at r0.
; In:  r0 -> {r0,i0,r1,i1},{r2,i2,r3,i3}
; Out: same memory holds {r0,r1,r2,r3},{i0,i1,i2,i3} (T4_SSE layout)
; Uses m0-m2; r0 is left unchanged. Internal routine: plain label, no prologue.
291 align 16 | |
292 fft4_sse: | |
293 mova m0, Z(0) | |
294 mova m1, Z(1) | |
295 T4_SSE m0, m1, m2 | |
296 mova Z(0), m0 | |
297 mova Z(1), m1 | |
298 ret | |
299 | |
; fft8_sse: in-place 8-point transform of the four vectors at r0.
; T4_SSE handles Z(0..1); T8_SSE then folds in Z(2..3).
; Out (per the T8_SSE lane comments): Z(0)={r0..r3}, Z(1)={i0..i3},
; Z(2)={r4..r7}, Z(3)={i4..i7}.
; Uses m0-m5; r0 is left unchanged.
300 align 16 | |
301 fft8_sse: | |
302 mova m0, Z(0) | |
303 mova m1, Z(1) | |
304 T4_SSE m0, m1, m2 | |
305 mova m2, Z(2) | |
306 mova m3, Z(3) | |
307 T8_SSE m0, m1, m2, m3, m4, m5 | |
308 mova Z(0), m0 | |
309 mova Z(1), m1 | |
310 mova Z(2), m2 | |
311 mova Z(3), m3 | |
312 ret | |
313 | |
; fft16_sse: in-place 16-point transform of the eight vectors at r0.
;   1. T4_SSE + T8_SSE on Z(0..3), stored back to memory.
;   2. Two independent T4_SSE on Z(4..5) and Z(6..7), kept in m4-m7.
;   3. PASS_SMALL 0 combines both halves using cos_16 / cos_16+16 as
;      wre / wim; mode 0 means m4-m7 are not reloaded from memory.
; Uses m0-m7; r0 is left unchanged.
314 align 16 | |
315 fft16_sse: | |
316 mova m0, Z(0) | |
317 mova m1, Z(1) | |
318 T4_SSE m0, m1, m2 | |
319 mova m2, Z(2) | |
320 mova m3, Z(3) | |
321 T8_SSE m0, m1, m2, m3, m4, m5 | |
322 mova m4, Z(4) | |
323 mova m5, Z(5) | |
324 mova Z(0), m0 | |
325 mova Z(1), m1 | |
326 mova Z(2), m2 | |
327 mova Z(3), m3 | |
328 T4_SSE m4, m5, m6 | |
329 mova m6, Z(6) | |
330 mova m7, Z(7) | |
331 T4_SSE m6, m7, m0 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
332 PASS_SMALL 0, [cos_16], [cos_16+16] |
8430 | 333 ret |
334 | |
335 | |
336 INIT_MMX | |
337 | |
; FFT48_3DN suffix
; Emits fft4<suffix> and fft8<suffix> for the MMX/3DNow! register set.
; Both work in place on the vectors at r0 and finish by interleaving pairs of
; registers with PUNPCK before storing.
; The macro is expanded twice in this file: once before the pswapd emulation
; macro is defined (suffix _3dn2, so pswapd assembles as the instruction) and
; once after it (suffix _3dn, so pswapd expands to the emulation).
; fft8 uses Z(0), Z(2), Z(5) and Z(7) as temporary storage between steps.
338 %macro FFT48_3DN 1 | |
339 align 16 | |
340 fft4%1: | |
341 T2_3DN m0, m1, Z(0), Z(1) | |
342 mova m2, Z(2) | |
343 mova m3, Z(3) | |
344 T4_3DN m0, m1, m2, m3, m4, m5 | |
345 PUNPCK m0, m1, m4 | |
346 PUNPCK m2, m3, m5 | |
347 mova Z(0), m0 | |
348 mova Z(1), m4 | |
349 mova Z(2), m2 | |
350 mova Z(3), m5 | |
351 ret | |
352 | |
353 align 16 | |
354 fft8%1: | |
355 T2_3DN m0, m1, Z(0), Z(1) | |
356 mova m2, Z(2) | |
357 mova m3, Z(3) | |
358 T4_3DN m0, m1, m2, m3, m4, m5 | |
359 mova Z(0), m0 | |
360 mova Z(2), m2 | |
361 T2_3DN m4, m5, Z(4), Z(5) | |
362 T2_3DN m6, m7, Z(6), Z(7) | |
363 pswapd m0, m5 | |
364 pswapd m2, m7 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
365 pxor m0, [ps_m1p1] |
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
366 pxor m2, [ps_m1p1] |
8430 | 367 pfsub m5, m0 |
368 pfadd m7, m2 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
369 pfmul m5, [ps_root2] |
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
370 pfmul m7, [ps_root2] |
8430 | 371 T4_3DN m1, m3, m5, m7, m0, m2 |
372 mova Z(5), m5 | |
373 mova Z(7), m7 | |
374 mova m0, Z(0) | |
375 mova m2, Z(2) | |
376 T4_3DN m0, m2, m4, m6, m5, m7 | |
377 PUNPCK m0, m1, m5 | |
378 PUNPCK m2, m3, m7 | |
379 mova Z(0), m0 | |
380 mova Z(1), m5 | |
381 mova Z(2), m2 | |
382 mova Z(3), m7 | |
383 PUNPCK m4, Z(5), m5 | |
384 PUNPCK m6, Z(7), m7 | |
385 mova Z(4), m4 | |
386 mova Z(5), m5 | |
387 mova Z(6), m6 | |
388 mova Z(7), m7 | |
389 ret | |
390 %endmacro | |
391 | |
392 FFT48_3DN _3dn2 | |
393 | |
; pswapd dst, src -- emulation of the pswapd instruction (swap the two dwords
; of a 64-bit MMX value) for the plain-3DNow! instantiation below.
;   dst != src: copy, shift the high dword down, then merge src's low dword
;               into the high half. No memory is touched.
;   dst == src: no spare register, so the low dword is spilled to [r0+12] and
;               read back as the high dword of the qword at [r0+8].
;               This OVERWRITES the 4 bytes at [r0+12]; callers must not need
;               the previous contents of that location afterwards.
394 %macro pswapd 2 | |
395 %ifidn %1, %2 | |
396 movd [r0+12], %1 | |
397 punpckhdq %1, [r0+8] | |
398 %else | |
399 movq %1, %2 | |
400 psrlq %1, 32 | |
401 punpckldq %1, %2 | |
402 %endif | |
403 %endmacro | |
404 | |
405 FFT48_3DN _3dn | |
406 | |
407 | |
; Strided Z(x) for the pass loops. With o1q = n*8 and o3q = n*3*16 (set up in
; DECL_PASS), the expression resolves to:
;   x = 0..5: zq + o1q*{0,2,4} + mmsize*(x&1)
;   x = 6,7 : zq + o3q         + mmsize*(x&1)      (o3q equals 6*o1q)
; i.e. four pairs of vectors spaced 2*o1q bytes apart.
408 %define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)] | |
409 | |
; DECL_PASS name, payload...
; Emits the routine <name>: a loop that runs <payload> once per iteration,
; then advances z by two vectors and w by one vector and decrements n by
; mmsize/8, until n is no longer positive.
; Arguments arrive in the first three argument registers as z, w, n
; (DECL_FFT sets r1 = cos table and r2d = n/8 before jumping here).
; o1 and o3 are derived from n on entry and stay constant in the loop.
; NOTE(review): "rep ret" is presumably the two-byte return form used to
; avoid a branch-predictor penalty on some AMD CPUs -- confirm.
410 %macro DECL_PASS 2+ ; name, payload | |
411 align 16 | |
412 %1: | |
413 DEFINE_ARGS z, w, n, o1, o3 | |
414 lea o3q, [nq*3] | |
415 lea o1q, [nq*8] | |
416 shl o3q, 4 | |
417 .loop: | |
418 %2 | |
419 add zq, mmsize*2 | |
420 add wq, mmsize | |
421 sub nd, mmsize/8 | |
422 jg .loop | |
423 rep ret | |
424 %endmacro | |
425 | |
; SSE pass routines: pass_sse stores separate real/imag vectors (PASS_BIG 1),
; pass_interleave_sse stores interleaved output (PASS_BIG 0).
426 INIT_XMM | |
10452
c6aa538c0bc3
s/movdqa/movaps/ in sse1 fft. (regression in r20293)
lorenm
parents:
10019
diff
changeset
|
427 %define mova movaps |
8430 | 428 DECL_PASS pass_sse, PASS_BIG 1 |
429 DECL_PASS pass_interleave_sse, PASS_BIG 0 | |
430 | |
; 3DNow! pass routines reuse the same macro bodies: the SSE mnemonics used by
; PASS_SMALL / PASS_BIG are aliased to their 3DNow!/MMX counterparts.
; These aliases stay in force until they are %undef'd further down.
; The _3dn2 names are plain aliases of the _3dn routines.
431 INIT_MMX | |
432 %define mulps pfmul | |
433 %define addps pfadd | |
434 %define subps pfsub | |
435 %define unpcklps punpckldq | |
436 %define unpckhps punpckhdq | |
437 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q] | |
438 DECL_PASS pass_interleave_3dn, PASS_BIG 0 | |
439 %define pass_3dn2 pass_3dn | |
440 %define pass_interleave_3dn2 pass_interleave_3dn | |
441 | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
; SECTION_REL is appended to every entry of the dispatch tables.
;   PIC:     "- $$" turns each entry into an offset from the start of the
;            current section; FFT_DISPATCH adds the section base back.
;   non-PIC: expands to nothing, so the tables hold absolute addresses.
442 %ifdef PIC |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
443 %define SECTION_REL - $$ |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
444 %else |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
445 %define SECTION_REL |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
446 %endif |
8430 | 447 |
; FFT_DISPATCH tab_suffix, nbits_reg
; Calls the transform for 2^nbits points through dispatch_tab<tab_suffix>.
;   %1  suffix selecting the table (e.g. _sse)
;   %2  register name (without size suffix) holding nbits; entry 0 of the
;       table is fft4, hence the "-2" bias on the index
; With PIC the table holds section-relative offsets, so the section base
; ($$) is added before the call.
; Overwrites r2 (and r3 with PIC) in addition to whatever the callee uses.
12399 | 448 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs |
449 lea r2, [dispatch_tab%1] | |
450 mov r2, [r2 + (%2q-2)*gprsize] | |
451 %ifdef PIC | |
452 lea r3, [$$] | |
453 add r2, r3 | |
454 %endif | |
455 call r2 | |
456 %endmacro ; FFT_DISPATCH | |
457 | |
; DECL_FFT nbits, cpu [, suffix]
; For every size n = 1<<nbits, 2<<nbits, ... 65536 this generates
;   fft<n><suffix><cpu>:
;     call fft<n/2><cpu>          ; always the non-suffixed variants
;     call fft<n/4><cpu>  (twice, on the following sub-blocks)
;     tail-jump to pass<suffix><cpu> with r1 = cos_<n>, r2d = n/8
; The (n & (-2<<nbits)) and (n2 & (-2<<nbits)) terms evaluate to 0 for the
; smallest generated sizes and to n / n2 for the larger ones.
; NOTE(review): presumably this compensates for the hand-written base cases
; leaving r0 untouched while generated routines return with r0 advanced by
; the pass loop -- confirm before changing the offsets.
; It also emits
;   dispatch_tab<suffix><cpu>: fft4, fft8, [fft16 when nbits==5], followed by
;     every generated size, each entry with SECTION_REL applied;
;   fft_dispatch<suffix><cpu>: the cglobal entry point (z, nbits) that calls
;     through the table.
8430 | 458 %macro DECL_FFT 2-3 ; nbits, cpu, suffix |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
459 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL |
8430 | 460 %if %1==5 |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
461 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL |
8430 | 462 %endif |
463 | |
464 %assign n 1<<%1 | |
465 %rep 17-%1 | |
466 %assign n2 n/2 | |
467 %assign n4 n/4 | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
468 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL |
8430 | 469 |
470 align 16 | |
471 fft %+ n %+ %3%2: | |
472 call fft %+ n2 %+ %2 | |
473 add r0, n*4 - (n&(-2<<%1)) | |
474 call fft %+ n4 %+ %2 | |
475 add r0, n*2 - (n2&(-2<<%1)) | |
476 call fft %+ n4 %+ %2 | |
477 sub r0, n*6 + (n2&(-2<<%1)) | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
478 lea r1, [cos_ %+ n] |
8430 | 479 mov r2d, n4/2 |
480 jmp pass%3%2 | |
481 | |
482 %assign n n*2 | |
483 %endrep | |
484 %undef n | |
485 | |
486 align 8 | |
487 dispatch_tab%3%2: pointer list_of_fft | |
488 | |
8820
7400956a815d
Put dispatch_tab in the rodata section for macho64.
astrange
parents:
8430
diff
changeset
|
489 section .text |
7400956a815d
Put dispatch_tab in the rodata section for macho64.
astrange
parents:
8430
diff
changeset
|
490 |
8430 | 491 ; On x86_32, this function does the register saving and restoring for all of fft. |
492 ; The others pass args in registers and don't spill anything. | |
10019
c08ca946c80a
Update x264 asm code to latest to add support for 64-bit Windows.
darkshikari
parents:
8820
diff
changeset
|
493 cglobal fft_dispatch%3%2, 2,5,8, z, nbits |
12399 | 494 FFT_DISPATCH %3%2, nbits |
8430 | 495 RET |
496 %endmacro ; DECL_FFT | |
497 | |
; Instantiate every transform family. The first argument is log2 of the
; smallest generated size: the SSE family starts at fft32 (fft4/8/16_sse are
; hand-written), the 3DNow! families start at fft16 (fft4/fft8 come from
; FFT48_3DN). For the _3dn2 families, pass_3dn2 / pass_interleave_3dn2
; resolve to the _3dn pass routines through the %defines made earlier.
498 DECL_FFT 5, _sse | |
499 DECL_FFT 5, _sse, _interleave | |
500 DECL_FFT 4, _3dn | |
501 DECL_FFT 4, _3dn, _interleave | |
502 DECL_FFT 4, _3dn2 | |
503 DECL_FFT 4, _3dn2, _interleave | |
504 | |
12399 | 505 INIT_XMM |
506 %undef mulps | |
507 %undef addps | |
508 %undef subps | |
509 %undef unpcklps | |
510 %undef unpckhps | |
511 | |
; PREROTATER mk, k, in, tcos, tsin
; Pre-rotation step of imdct_half_sse for one pair of positions.
;   Loads four floats from [in + k*4] and four from [in + mk*4 - 16],
;   de-interleaves them with shufps into two operand vectors, and loads the
;   matching cos/sin values (low half from index k, high half from index mk).
;   With a = xmm0 and b = xmm1 after the shuffles, the complex products are
;     b*cos - a*sin   and   a*cos + b*sin
;   which are finally interleaved pairwise.
; Out:      xmm1 = low pairs, xmm0 = high pairs of the interleaved result
; Clobbers: xmm2-xmm5
; The movaps loads require both source addresses to be 16-byte aligned.
512 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 | |
513 movaps xmm0, [%3+%2*4] | |
514 movaps xmm1, [%3+%1*4-0x10] | |
515 movaps xmm2, xmm0 | |
516 shufps xmm0, xmm1, 0x88 | |
517 shufps xmm1, xmm2, 0x77 | |
518 movlps xmm4, [%4+%2*2] | |
519 movlps xmm5, [%5+%2*2+0x0] | |
520 movhps xmm4, [%4+%1*2-0x8] | |
521 movhps xmm5, [%5+%1*2-0x8] | |
522 movaps xmm2, xmm0 | |
523 movaps xmm3, xmm1 | |
524 mulps xmm0, xmm5 | |
525 mulps xmm1, xmm4 | |
526 mulps xmm2, xmm4 | |
527 mulps xmm3, xmm5 | |
528 subps xmm1, xmm0 | |
529 addps xmm2, xmm3 | |
530 movaps xmm0, xmm1 | |
531 unpcklps xmm1, xmm2 | |
532 unpckhps xmm0, xmm2 | |
533 %endmacro | |
534 | |
; CMUL idx, outA, outB, base, tabA, tabB
; Loads v0 = [base + idx*2] and v1 = [base + idx*2 + 16], then computes
;   outA = v1*[tabB + idx] - v0*[tabA + idx]
;   outB = v0*[tabB + idx] + v1*[tabA + idx]
; Clobbers xmm6 and xmm7 (so outA/outB must not be xmm6/xmm7).
; All memory operands must be 16-byte aligned.
535 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 | |
536 movaps xmm6, [%4+%1*2] | |
537 movaps %2, [%4+%1*2+0x10] | |
538 movaps %3, xmm6 | |
539 movaps xmm7, %2 | |
12432 | 540 mulps xmm6, [%5+%1] |
541 mulps %2, [%6+%1] | |
542 mulps %3, [%6+%1] | |
543 mulps xmm7, [%5+%1] | |
12399 | 544 subps %2, xmm6 |
545 addps %3, xmm7 | |
546 %endmacro | |
547 | |
; POSROTATESHUF j, k, z, tcos, tsin
; Post-rotation loop of imdct_half_sse. Each iteration:
;   - runs CMUL at byte index j (results in xmm0/xmm1) and at byte index k
;     (results in xmm4/xmm5),
;   - reverses the element order of xmm1 and xmm5 (shufps 0x1b),
;   - interleaves the k results with the reversed j results and vice versa,
;   - stores 32 bytes at z + k*2 and 32 bytes at z + j*2.
; j is advanced by 16 and k reduced by 16; the loop repeats while j is still
; negative after the add. Defines the local label .post.
; Clobbers: xmm0-xmm7, j, k.
548 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 | |
549 .post: | |
550 CMUL %1, xmm0, xmm1, %3, %4, %5 | |
551 CMUL %2, xmm4, xmm5, %3, %4, %5 | |
552 shufps xmm1, xmm1, 0x1b | |
553 shufps xmm5, xmm5, 0x1b | |
554 movaps xmm6, xmm4 | |
555 unpckhps xmm4, xmm1 | |
556 unpcklps xmm6, xmm1 | |
557 movaps xmm2, xmm0 | |
558 unpcklps xmm0, xmm5 | |
559 unpckhps xmm2, xmm5 | |
560 movaps [%3+%2*2], xmm6 | |
561 movaps [%3+%2*2+0x10], xmm4 | |
562 movaps [%3+%1*2], xmm0 | |
563 movaps [%3+%1*2+0x10], xmm2 | |
564 sub %2, 0x10 | |
565 add %1, 0x10 | |
566 jl .post | |
567 %endmacro | |
568 | |
; imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
; Three phases:
;   1. .pre loop: PREROTATER on the input, walking r3 downwards and r4
;      upwards; each rotated pair is scattered into output at the 8-byte slot
;      selected by a 16-bit revtab entry.
;   2. FFT_DISPATCH _sse: in-place transform of output, size taken from
;      s->nbits.
;   3. POSROTATESHUF: post-rotation of the output.
; Register/stack usage differs by architecture:
;   x86-64: revtab/tcos/tsin stay in r10/r11/r12; r12-r14 are pushed on entry
;           and popped before RET.
;   x86-32: not enough registers, so tcos, tsin and revtab are pushed; inside
;           the loops [esp] = revtab, [esp+4] = tsin, [esp+8] = tcos. r4 is
;           recomputed as -r3 on every .pre iteration because it is reused as
;           a scratch index. The three slots are released with "add esp, 12".
; s is kept in r5 and the output pointer in r6 across the FFT call.
569 cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input | |
570 %ifdef ARCH_X86_64 | |
571 %define rrevtab r10 | |
572 %define rtcos r11 | |
573 %define rtsin r12 | |
574 push r12 | |
575 push r13 | |
576 push r14 | |
577 %else | |
578 %define rrevtab r6 | |
579 %define rtsin r6 | |
580 %define rtcos r5 | |
581 %endif | |
582 mov r3d, [r0+FFTContext.mdctsize] | |
583 add r2, r3 | |
584 shr r3, 1 | |
585 mov rtcos, [r0+FFTContext.tcos] | |
586 mov rtsin, [r0+FFTContext.tsin] | |
587 add rtcos, r3 | |
588 add rtsin, r3 | |
589 %ifndef ARCH_X86_64 | |
590 push rtcos | |
591 push rtsin | |
592 %endif | |
593 shr r3, 1 | |
594 mov rrevtab, [r0+FFTContext.revtab] | |
595 add rrevtab, r3 | |
596 %ifndef ARCH_X86_64 | |
597 push rrevtab | |
598 %endif | |
599 | |
600 sub r3, 4 | |
601 %ifdef ARCH_X86_64 | |
602 xor r4, r4 | |
603 sub r4, r3 | |
604 %endif | |
605 .pre: | |
606 %ifndef ARCH_X86_64 | |
607 ;unspill | |
608 xor r4, r4 | |
609 sub r4, r3 | |
610 mov rtsin, [esp+4] | |
611 mov rtcos, [esp+8] | |
612 %endif | |
613 | |
614 PREROTATER r4, r3, r2, rtcos, rtsin | |
615 %ifdef ARCH_X86_64 | |
12432 | 616 movzx r5, word [rrevtab+r4-4] |
617 movzx r6, word [rrevtab+r4-2] | |
618 movzx r13, word [rrevtab+r3] | |
619 movzx r14, word [rrevtab+r3+2] | |
620 movlps [r1+r5 *8], xmm0 | |
621 movhps [r1+r6 *8], xmm0 | |
622 movlps [r1+r13*8], xmm1 | |
623 movhps [r1+r14*8], xmm1 | |
12399 | 624 add r4, 4 |
625 %else | |
626 mov r6, [esp] | |
12432 | 627 movzx r5, word [r6+r4-4] |
628 movzx r4, word [r6+r4-2] | |
629 movlps [r1+r5*8], xmm0 | |
630 movhps [r1+r4*8], xmm0 | |
631 movzx r5, word [r6+r3] | |
632 movzx r4, word [r6+r3+2] | |
633 movlps [r1+r5*8], xmm1 | |
634 movhps [r1+r4*8], xmm1 | |
12399 | 635 %endif |
636 sub r3, 4 | |
637 jns .pre | |
638 | |
639 mov r5, r0 | |
640 mov r6, r1 | |
641 mov r0, r1 | |
642 mov r1d, [r5+FFTContext.nbits] | |
643 | |
644 FFT_DISPATCH _sse, r1 | |
645 | |
646 mov r0d, [r5+FFTContext.mdctsize] | |
647 add r6, r0 | |
648 shr r0, 1 | |
649 %ifndef ARCH_X86_64 | |
650 %define rtcos r2 | |
651 %define rtsin r3 | |
652 mov rtcos, [esp+8] | |
653 mov rtsin, [esp+4] | |
654 %endif | |
655 neg r0 | |
656 mov r1, -16 | |
657 sub r1, r0 | |
658 POSROTATESHUF r0, r1, r6, rtcos, rtsin | |
659 %ifdef ARCH_X86_64 | |
660 pop r14 | |
661 pop r13 | |
662 pop r12 | |
663 %else | |
664 add esp, 12 | |
665 %endif | |
666 RET