Mercurial > libavcodec.hg
annotate x86/fft_mmx.asm @ 12483:0159a19bfff7 libavcodec
aacdec: Rework channel mapping compatibility hacks.
For a PCE based configuration map the channels solely based on tags.
For an indexed configuration map the channels solely based on position.
This works with all known exotic samples including al17, elem_id0, bad_concat,
and lfe_is_sce.
author | alexc |
---|---|
date | Fri, 10 Sep 2010 18:01:48 +0000 |
parents | f61e22f8cf28 |
children | 67e7e49058c2 |
rev | line source |
---|---|
8430 | 1 ;****************************************************************************** |
2 ;* FFT transform with SSE/3DNow optimizations | |
3 ;* Copyright (c) 2008 Loren Merritt | |
4 ;* | |
12188 | 5 ;* This algorithm (though not any of the implementation details) is |
6 ;* based on libdjbfft by D. J. Bernstein. | |
7 ;* | |
8430 | 8 ;* This file is part of FFmpeg. |
9 ;* | |
10 ;* FFmpeg is free software; you can redistribute it and/or | |
11 ;* modify it under the terms of the GNU Lesser General Public | |
12 ;* License as published by the Free Software Foundation; either | |
13 ;* version 2.1 of the License, or (at your option) any later version. | |
14 ;* | |
15 ;* FFmpeg is distributed in the hope that it will be useful, | |
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 ;* Lesser General Public License for more details. | |
19 ;* | |
20 ;* You should have received a copy of the GNU Lesser General Public | |
21 ;* License along with FFmpeg; if not, write to the Free Software | |
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 ;****************************************************************************** | |
24 | |
25 ; These functions are not individually interchangeable with the C versions. | |
26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results | |
27 ; in blocks as convenient to the vector size. | |
28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) | |
29 | |
30 %include "x86inc.asm" | |
31 | |
12399 | 32 %ifdef ARCH_X86_64 |
33 %define pointer resq | |
34 %else | |
35 %define pointer resd | |
36 %endif | |
37 | |
38 struc FFTContext | |
39 .nbits: resd 1 | |
40 .reverse: resd 1 | |
41 .revtab: pointer 1 | |
42 .tmpbuf: pointer 1 | |
43 .mdctsize: resd 1 | |
44 .mdctbits: resd 1 | |
45 .tcos: pointer 1 | |
46 .tsin: pointer 1 | |
47 endstruc | |
48 | |
8430 | 49 SECTION_RODATA |
50 | |
51 %define M_SQRT1_2 0.70710678118654752440 | |
52 ps_root2: times 4 dd M_SQRT1_2 | |
53 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 | |
54 ps_m1p1: dd 1<<31, 0 | |
55 | |
56 %assign i 16 | |
57 %rep 13 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
58 cextern cos_ %+ i |
8430 | 59 %assign i i<<1 |
60 %endrep | |
61 | |
62 %ifdef ARCH_X86_64 | |
63 %define pointer dq | |
64 %else | |
65 %define pointer dd | |
66 %endif | |
67 | |
68 %macro IF0 1+ | |
69 %endmacro | |
70 %macro IF1 1+ | |
71 %1 | |
72 %endmacro | |
73 | |
74 section .text align=16 | |
75 | |
76 %macro T2_3DN 4 ; z0, z1, mem0, mem1 | |
77 mova %1, %3 | |
78 mova %2, %1 | |
79 pfadd %1, %4 | |
80 pfsub %2, %4 | |
81 %endmacro | |
82 | |
83 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1 | |
84 mova %5, %3 | |
85 pfsub %3, %4 | |
86 pfadd %5, %4 ; {t6,t5} | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
87 pxor %3, [ps_m1p1] ; {t8,t7} |
8430 | 88 mova %6, %1 |
89 pswapd %3, %3 | |
90 pfadd %1, %5 ; {r0,i0} | |
91 pfsub %6, %5 ; {r2,i2} | |
92 mova %4, %2 | |
93 pfadd %2, %3 ; {r1,i1} | |
94 pfsub %4, %3 ; {r3,i3} | |
95 SWAP %3, %6 | |
96 %endmacro | |
97 | |
98 ; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3} | |
99 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} | |
100 %macro T4_SSE 3 | |
101 mova %3, %1 | |
102 shufps %1, %2, 0x64 ; {r0,i0,r3,i2} | |
103 shufps %3, %2, 0xce ; {r1,i1,r2,i3} | |
104 mova %2, %1 | |
105 addps %1, %3 ; {t1,t2,t6,t5} | |
106 subps %2, %3 ; {t3,t4,t8,t7} | |
107 mova %3, %1 | |
108 shufps %1, %2, 0x44 ; {t1,t2,t3,t4} | |
109 shufps %3, %2, 0xbe ; {t6,t5,t7,t8} | |
110 mova %2, %1 | |
111 addps %1, %3 ; {r0,i0,r1,i1} | |
112 subps %2, %3 ; {r2,i2,r3,i3} | |
113 mova %3, %1 | |
114 shufps %1, %2, 0x88 ; {r0,r1,r2,r3} | |
115 shufps %3, %2, 0xdd ; {i0,i1,i2,i3} | |
116 SWAP %2, %3 | |
117 %endmacro | |
118 | |
119 %macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1 | |
120 mova %5, %3 | |
121 shufps %3, %4, 0x44 ; {r4,i4,r6,i6} | |
122 shufps %5, %4, 0xee ; {r5,i5,r7,i7} | |
123 mova %6, %3 | |
124 subps %3, %5 ; {r5,i5,r7,i7} | |
125 addps %6, %5 ; {t1,t2,t3,t4} | |
126 mova %5, %3 | |
127 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
128 mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} |
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
129 mulps %5, [ps_root2] |
8430 | 130 addps %3, %5 ; {t8,t7,ta,t9} |
131 mova %5, %6 | |
132 shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | |
133 shufps %5, %3, 0x9c ; {t1,t4,t7,ta} | |
134 mova %3, %6 | |
135 addps %6, %5 ; {t1,t2,t9,ta} | |
136 subps %3, %5 ; {t6,t5,tc,tb} | |
137 mova %5, %6 | |
138 shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} | |
139 shufps %5, %3, 0x8d ; {t2,ta,t6,tc} | |
140 mova %3, %1 | |
141 mova %4, %2 | |
142 addps %1, %6 ; {r0,r1,r2,r3} | |
143 addps %2, %5 ; {i0,i1,i2,i3} | |
144 subps %3, %6 ; {r4,r5,r6,r7} | |
145 subps %4, %5 ; {i4,i5,i6,i7} | |
146 %endmacro | |
147 | |
148 ; scheduled for cpu-bound sizes | |
149 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim | |
150 IF%1 mova m4, Z(4) | |
151 IF%1 mova m5, Z(5) | |
152 mova m0, %2 ; wre | |
153 mova m2, m4 | |
154 mova m1, %3 ; wim | |
155 mova m3, m5 | |
156 mulps m2, m0 ; r2*wre | |
157 IF%1 mova m6, Z(6) | |
158 mulps m3, m1 ; i2*wim | |
159 IF%1 mova m7, Z(7) | |
160 mulps m4, m1 ; r2*wim | |
161 mulps m5, m0 ; i2*wre | |
162 addps m2, m3 ; r2*wre + i2*wim | |
163 mova m3, m1 | |
164 mulps m1, m6 ; r3*wim | |
165 subps m5, m4 ; i2*wre - r2*wim | |
166 mova m4, m0 | |
167 mulps m3, m7 ; i3*wim | |
168 mulps m4, m6 ; r3*wre | |
169 mulps m0, m7 ; i3*wre | |
170 subps m4, m3 ; r3*wre - i3*wim | |
171 mova m3, Z(0) | |
172 addps m0, m1 ; i3*wre + r3*wim | |
173 mova m1, m4 | |
174 addps m4, m2 ; t5 | |
175 subps m1, m2 ; t3 | |
176 subps m3, m4 ; r2 | |
177 addps m4, Z(0) ; r0 | |
178 mova m6, Z(2) | |
179 mova Z(4), m3 | |
180 mova Z(0), m4 | |
181 mova m3, m5 | |
182 subps m5, m0 ; t4 | |
183 mova m4, m6 | |
184 subps m6, m5 ; r3 | |
185 addps m5, m4 ; r1 | |
186 mova Z(6), m6 | |
187 mova Z(2), m5 | |
188 mova m2, Z(3) | |
189 addps m3, m0 ; t6 | |
190 subps m2, m1 ; i3 | |
191 mova m7, Z(1) | |
192 addps m1, Z(3) ; i1 | |
193 mova Z(7), m2 | |
194 mova Z(3), m1 | |
195 mova m4, m7 | |
196 subps m7, m3 ; i2 | |
197 addps m3, m4 ; i0 | |
198 mova Z(5), m7 | |
199 mova Z(1), m3 | |
200 %endmacro | |
201 | |
202 ; scheduled to avoid store->load aliasing | |
203 %macro PASS_BIG 1 ; (!interleave) | |
204 mova m4, Z(4) ; r2 | |
205 mova m5, Z(5) ; i2 | |
206 mova m2, m4 | |
207 mova m0, [wq] ; wre | |
208 mova m3, m5 | |
209 mova m1, [wq+o1q] ; wim | |
210 mulps m2, m0 ; r2*wre | |
211 mova m6, Z(6) ; r3 | |
212 mulps m3, m1 ; i2*wim | |
213 mova m7, Z(7) ; i3 | |
214 mulps m4, m1 ; r2*wim | |
215 mulps m5, m0 ; i2*wre | |
216 addps m2, m3 ; r2*wre + i2*wim | |
217 mova m3, m1 | |
218 mulps m1, m6 ; r3*wim | |
219 subps m5, m4 ; i2*wre - r2*wim | |
220 mova m4, m0 | |
221 mulps m3, m7 ; i3*wim | |
222 mulps m4, m6 ; r3*wre | |
223 mulps m0, m7 ; i3*wre | |
224 subps m4, m3 ; r3*wre - i3*wim | |
225 mova m3, Z(0) | |
226 addps m0, m1 ; i3*wre + r3*wim | |
227 mova m1, m4 | |
228 addps m4, m2 ; t5 | |
229 subps m1, m2 ; t3 | |
230 subps m3, m4 ; r2 | |
231 addps m4, Z(0) ; r0 | |
232 mova m6, Z(2) | |
233 mova Z(4), m3 | |
234 mova Z(0), m4 | |
235 mova m3, m5 | |
236 subps m5, m0 ; t4 | |
237 mova m4, m6 | |
238 subps m6, m5 ; r3 | |
239 addps m5, m4 ; r1 | |
240 IF%1 mova Z(6), m6 | |
241 IF%1 mova Z(2), m5 | |
242 mova m2, Z(3) | |
243 addps m3, m0 ; t6 | |
244 subps m2, m1 ; i3 | |
245 mova m7, Z(1) | |
246 addps m1, Z(3) ; i1 | |
247 IF%1 mova Z(7), m2 | |
248 IF%1 mova Z(3), m1 | |
249 mova m4, m7 | |
250 subps m7, m3 ; i2 | |
251 addps m3, m4 ; i0 | |
252 IF%1 mova Z(5), m7 | |
253 IF%1 mova Z(1), m3 | |
254 %if %1==0 | |
255 mova m4, m5 ; r1 | |
256 mova m0, m6 ; r3 | |
257 unpcklps m5, m1 | |
258 unpckhps m4, m1 | |
259 unpcklps m6, m2 | |
260 unpckhps m0, m2 | |
261 mova m1, Z(0) | |
262 mova m2, Z(4) | |
263 mova Z(2), m5 | |
264 mova Z(3), m4 | |
265 mova Z(6), m6 | |
266 mova Z(7), m0 | |
267 mova m5, m1 ; r0 | |
268 mova m4, m2 ; r2 | |
269 unpcklps m1, m3 | |
270 unpckhps m5, m3 | |
271 unpcklps m2, m7 | |
272 unpckhps m4, m7 | |
273 mova Z(0), m1 | |
274 mova Z(1), m5 | |
275 mova Z(4), m2 | |
276 mova Z(5), m4 | |
277 %endif | |
278 %endmacro | |
279 | |
280 %macro PUNPCK 3 | |
281 mova %3, %1 | |
282 punpckldq %1, %2 | |
283 punpckhdq %3, %2 | |
284 %endmacro | |
285 | |
286 INIT_XMM | |
10452
c6aa538c0bc3
s/movdqa/movaps/ in sse1 fft. (regression in r20293)
lorenm
parents:
10019
diff
changeset
|
287 %define mova movaps |
8430 | 288 |
289 %define Z(x) [r0+mmsize*x] | |
290 | |
291 align 16 | |
292 fft4_sse: | |
293 mova m0, Z(0) | |
294 mova m1, Z(1) | |
295 T4_SSE m0, m1, m2 | |
296 mova Z(0), m0 | |
297 mova Z(1), m1 | |
298 ret | |
299 | |
300 align 16 | |
301 fft8_sse: | |
302 mova m0, Z(0) | |
303 mova m1, Z(1) | |
304 T4_SSE m0, m1, m2 | |
305 mova m2, Z(2) | |
306 mova m3, Z(3) | |
307 T8_SSE m0, m1, m2, m3, m4, m5 | |
308 mova Z(0), m0 | |
309 mova Z(1), m1 | |
310 mova Z(2), m2 | |
311 mova Z(3), m3 | |
312 ret | |
313 | |
314 align 16 | |
315 fft16_sse: | |
316 mova m0, Z(0) | |
317 mova m1, Z(1) | |
318 T4_SSE m0, m1, m2 | |
319 mova m2, Z(2) | |
320 mova m3, Z(3) | |
321 T8_SSE m0, m1, m2, m3, m4, m5 | |
322 mova m4, Z(4) | |
323 mova m5, Z(5) | |
324 mova Z(0), m0 | |
325 mova Z(1), m1 | |
326 mova Z(2), m2 | |
327 mova Z(3), m3 | |
328 T4_SSE m4, m5, m6 | |
329 mova m6, Z(6) | |
330 mova m7, Z(7) | |
331 T4_SSE m6, m7, m0 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
332 PASS_SMALL 0, [cos_16], [cos_16+16] |
8430 | 333 ret |
334 | |
335 | |
336 INIT_MMX | |
337 | |
338 %macro FFT48_3DN 1 | |
339 align 16 | |
340 fft4%1: | |
341 T2_3DN m0, m1, Z(0), Z(1) | |
342 mova m2, Z(2) | |
343 mova m3, Z(3) | |
344 T4_3DN m0, m1, m2, m3, m4, m5 | |
345 PUNPCK m0, m1, m4 | |
346 PUNPCK m2, m3, m5 | |
347 mova Z(0), m0 | |
348 mova Z(1), m4 | |
349 mova Z(2), m2 | |
350 mova Z(3), m5 | |
351 ret | |
352 | |
353 align 16 | |
354 fft8%1: | |
355 T2_3DN m0, m1, Z(0), Z(1) | |
356 mova m2, Z(2) | |
357 mova m3, Z(3) | |
358 T4_3DN m0, m1, m2, m3, m4, m5 | |
359 mova Z(0), m0 | |
360 mova Z(2), m2 | |
361 T2_3DN m4, m5, Z(4), Z(5) | |
362 T2_3DN m6, m7, Z(6), Z(7) | |
363 pswapd m0, m5 | |
364 pswapd m2, m7 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
365 pxor m0, [ps_m1p1] |
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
366 pxor m2, [ps_m1p1] |
8430 | 367 pfsub m5, m0 |
368 pfadd m7, m2 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
369 pfmul m5, [ps_root2] |
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
370 pfmul m7, [ps_root2] |
8430 | 371 T4_3DN m1, m3, m5, m7, m0, m2 |
372 mova Z(5), m5 | |
373 mova Z(7), m7 | |
374 mova m0, Z(0) | |
375 mova m2, Z(2) | |
376 T4_3DN m0, m2, m4, m6, m5, m7 | |
377 PUNPCK m0, m1, m5 | |
378 PUNPCK m2, m3, m7 | |
379 mova Z(0), m0 | |
380 mova Z(1), m5 | |
381 mova Z(2), m2 | |
382 mova Z(3), m7 | |
383 PUNPCK m4, Z(5), m5 | |
384 PUNPCK m6, Z(7), m7 | |
385 mova Z(4), m4 | |
386 mova Z(5), m5 | |
387 mova Z(6), m6 | |
388 mova Z(7), m7 | |
389 ret | |
390 %endmacro | |
391 | |
392 FFT48_3DN _3dn2 | |
393 | |
394 %macro pswapd 2 | |
395 %ifidn %1, %2 | |
396 movd [r0+12], %1 | |
397 punpckhdq %1, [r0+8] | |
398 %else | |
399 movq %1, %2 | |
400 psrlq %1, 32 | |
401 punpckldq %1, %2 | |
402 %endif | |
403 %endmacro | |
404 | |
405 FFT48_3DN _3dn | |
406 | |
407 | |
408 %define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)] | |
409 | |
410 %macro DECL_PASS 2+ ; name, payload | |
411 align 16 | |
412 %1: | |
413 DEFINE_ARGS z, w, n, o1, o3 | |
414 lea o3q, [nq*3] | |
415 lea o1q, [nq*8] | |
416 shl o3q, 4 | |
417 .loop: | |
418 %2 | |
419 add zq, mmsize*2 | |
420 add wq, mmsize | |
421 sub nd, mmsize/8 | |
422 jg .loop | |
423 rep ret | |
424 %endmacro | |
425 | |
426 INIT_XMM | |
10452
c6aa538c0bc3
s/movdqa/movaps/ in sse1 fft. (regression in r20293)
lorenm
parents:
10019
diff
changeset
|
427 %define mova movaps |
8430 | 428 DECL_PASS pass_sse, PASS_BIG 1 |
429 DECL_PASS pass_interleave_sse, PASS_BIG 0 | |
430 | |
431 INIT_MMX | |
432 %define mulps pfmul | |
433 %define addps pfadd | |
434 %define subps pfsub | |
435 %define unpcklps punpckldq | |
436 %define unpckhps punpckhdq | |
437 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q] | |
438 DECL_PASS pass_interleave_3dn, PASS_BIG 0 | |
439 %define pass_3dn2 pass_3dn | |
440 %define pass_interleave_3dn2 pass_interleave_3dn | |
441 | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
442 %ifdef PIC |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
443 %define SECTION_REL - $$ |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
444 %else |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
445 %define SECTION_REL |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
446 %endif |
8430 | 447 |
12399 | 448 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs |
449 lea r2, [dispatch_tab%1] | |
450 mov r2, [r2 + (%2q-2)*gprsize] | |
451 %ifdef PIC | |
452 lea r3, [$$] | |
453 add r2, r3 | |
454 %endif | |
455 call r2 | |
456 %endmacro ; FFT_DISPATCH | |
457 | |
8430 | 458 %macro DECL_FFT 2-3 ; nbits, cpu, suffix |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
459 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL |
8430 | 460 %if %1==5 |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
461 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL |
8430 | 462 %endif |
463 | |
464 %assign n 1<<%1 | |
465 %rep 17-%1 | |
466 %assign n2 n/2 | |
467 %assign n4 n/4 | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
468 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL |
8430 | 469 |
470 align 16 | |
471 fft %+ n %+ %3%2: | |
472 call fft %+ n2 %+ %2 | |
473 add r0, n*4 - (n&(-2<<%1)) | |
474 call fft %+ n4 %+ %2 | |
475 add r0, n*2 - (n2&(-2<<%1)) | |
476 call fft %+ n4 %+ %2 | |
477 sub r0, n*6 + (n2&(-2<<%1)) | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
478 lea r1, [cos_ %+ n] |
8430 | 479 mov r2d, n4/2 |
480 jmp pass%3%2 | |
481 | |
482 %assign n n*2 | |
483 %endrep | |
484 %undef n | |
485 | |
486 align 8 | |
487 dispatch_tab%3%2: pointer list_of_fft | |
488 | |
8820
7400956a815d
Put dispatch_tab in the rodata section for macho64.
astrange
parents:
8430
diff
changeset
|
489 section .text |
7400956a815d
Put dispatch_tab in the rodata section for macho64.
astrange
parents:
8430
diff
changeset
|
490 |
8430 | 491 ; On x86_32, this function does the register saving and restoring for all of fft. |
492 ; The others pass args in registers and don't spill anything. | |
10019
c08ca946c80a
Update x264 asm code to latest to add support for 64-bit Windows.
darkshikari
parents:
8820
diff
changeset
|
493 cglobal fft_dispatch%3%2, 2,5,8, z, nbits |
12399 | 494 FFT_DISPATCH %3%2, nbits |
8430 | 495 RET |
496 %endmacro ; DECL_FFT | |
497 | |
498 DECL_FFT 5, _sse | |
499 DECL_FFT 5, _sse, _interleave | |
500 DECL_FFT 4, _3dn | |
501 DECL_FFT 4, _3dn, _interleave | |
502 DECL_FFT 4, _3dn2 | |
503 DECL_FFT 4, _3dn2, _interleave | |
504 | |
12399 | 505 INIT_XMM |
506 %undef mulps | |
507 %undef addps | |
508 %undef subps | |
509 %undef unpcklps | |
510 %undef unpckhps | |
511 | |
512 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 | |
513 movaps xmm0, [%3+%2*4] | |
514 movaps xmm1, [%3+%1*4-0x10] | |
515 movaps xmm2, xmm0 | |
516 shufps xmm0, xmm1, 0x88 | |
517 shufps xmm1, xmm2, 0x77 | |
518 movlps xmm4, [%4+%2*2] | |
519 movlps xmm5, [%5+%2*2+0x0] | |
520 movhps xmm4, [%4+%1*2-0x8] | |
521 movhps xmm5, [%5+%1*2-0x8] | |
522 movaps xmm2, xmm0 | |
523 movaps xmm3, xmm1 | |
524 mulps xmm0, xmm5 | |
525 mulps xmm1, xmm4 | |
526 mulps xmm2, xmm4 | |
527 mulps xmm3, xmm5 | |
528 subps xmm1, xmm0 | |
529 addps xmm2, xmm3 | |
530 movaps xmm0, xmm1 | |
531 unpcklps xmm1, xmm2 | |
532 unpckhps xmm0, xmm2 | |
533 %endmacro | |
534 | |
535 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 | |
536 movaps xmm6, [%4+%1*2] | |
537 movaps %2, [%4+%1*2+0x10] | |
538 movaps %3, xmm6 | |
539 movaps xmm7, %2 | |
12432 | 540 mulps xmm6, [%5+%1] |
541 mulps %2, [%6+%1] | |
542 mulps %3, [%6+%1] | |
543 mulps xmm7, [%5+%1] | |
12399 | 544 subps %2, xmm6 |
545 addps %3, xmm7 | |
546 %endmacro | |
547 | |
548 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 | |
549 .post: | |
550 CMUL %1, xmm0, xmm1, %3, %4, %5 | |
551 CMUL %2, xmm4, xmm5, %3, %4, %5 | |
552 shufps xmm1, xmm1, 0x1b | |
553 shufps xmm5, xmm5, 0x1b | |
554 movaps xmm6, xmm4 | |
555 unpckhps xmm4, xmm1 | |
556 unpcklps xmm6, xmm1 | |
557 movaps xmm2, xmm0 | |
558 unpcklps xmm0, xmm5 | |
559 unpckhps xmm2, xmm5 | |
560 movaps [%3+%2*2], xmm6 | |
561 movaps [%3+%2*2+0x10], xmm4 | |
562 movaps [%3+%1*2], xmm0 | |
563 movaps [%3+%1*2+0x10], xmm2 | |
564 sub %2, 0x10 | |
565 add %1, 0x10 | |
566 jl .post | |
567 %endmacro | |
568 | |
569 cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input | |
570 %ifdef ARCH_X86_64 | |
571 %define rrevtab r10 | |
572 %define rtcos r11 | |
573 %define rtsin r12 | |
574 push r12 | |
575 push r13 | |
576 push r14 | |
577 %else | |
578 %define rrevtab r6 | |
579 %define rtsin r6 | |
580 %define rtcos r5 | |
581 %endif | |
582 mov r3d, [r0+FFTContext.mdctsize] | |
583 add r2, r3 | |
584 shr r3, 1 | |
585 mov rtcos, [r0+FFTContext.tcos] | |
586 mov rtsin, [r0+FFTContext.tsin] | |
587 add rtcos, r3 | |
588 add rtsin, r3 | |
589 %ifndef ARCH_X86_64 | |
590 push rtcos | |
591 push rtsin | |
592 %endif | |
593 shr r3, 1 | |
594 mov rrevtab, [r0+FFTContext.revtab] | |
595 add rrevtab, r3 | |
596 %ifndef ARCH_X86_64 | |
597 push rrevtab | |
598 %endif | |
599 | |
600 sub r3, 4 | |
601 %ifdef ARCH_X86_64 | |
602 xor r4, r4 | |
603 sub r4, r3 | |
604 %endif | |
605 .pre: | |
606 %ifndef ARCH_X86_64 | |
607 ;unspill | |
608 xor r4, r4 | |
609 sub r4, r3 | |
610 mov rtsin, [esp+4] | |
611 mov rtcos, [esp+8] | |
612 %endif | |
613 | |
614 PREROTATER r4, r3, r2, rtcos, rtsin | |
615 %ifdef ARCH_X86_64 | |
12432 | 616 movzx r5, word [rrevtab+r4-4] |
617 movzx r6, word [rrevtab+r4-2] | |
618 movzx r13, word [rrevtab+r3] | |
619 movzx r14, word [rrevtab+r3+2] | |
620 movlps [r1+r5 *8], xmm0 | |
621 movhps [r1+r6 *8], xmm0 | |
622 movlps [r1+r13*8], xmm1 | |
623 movhps [r1+r14*8], xmm1 | |
12399 | 624 add r4, 4 |
625 %else | |
626 mov r6, [esp] | |
12432 | 627 movzx r5, word [r6+r4-4] |
628 movzx r4, word [r6+r4-2] | |
629 movlps [r1+r5*8], xmm0 | |
630 movhps [r1+r4*8], xmm0 | |
631 movzx r5, word [r6+r3] | |
632 movzx r4, word [r6+r3+2] | |
633 movlps [r1+r5*8], xmm1 | |
634 movhps [r1+r4*8], xmm1 | |
12399 | 635 %endif |
636 sub r3, 4 | |
637 jns .pre | |
638 | |
639 mov r5, r0 | |
640 mov r6, r1 | |
641 mov r0, r1 | |
642 mov r1d, [r5+FFTContext.nbits] | |
643 | |
644 FFT_DISPATCH _sse, r1 | |
645 | |
646 mov r0d, [r5+FFTContext.mdctsize] | |
647 add r6, r0 | |
648 shr r0, 1 | |
649 %ifndef ARCH_X86_64 | |
650 %define rtcos r2 | |
651 %define rtsin r3 | |
652 mov rtcos, [esp+8] | |
653 mov rtsin, [esp+4] | |
654 %endif | |
655 neg r0 | |
656 mov r1, -16 | |
657 sub r1, r0 | |
658 POSROTATESHUF r0, r1, r6, rtcos, rtsin | |
659 %ifdef ARCH_X86_64 | |
660 pop r14 | |
661 pop r13 | |
662 pop r12 | |
663 %else | |
664 add esp, 12 | |
665 %endif | |
666 RET |