comparison i386/fft_mmx.asm @ 7542:a8a8205a9081 libavcodec

split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse.
author lorenm
date Tue, 12 Aug 2008 00:26:58 +0000
parents
children 4877d4c6d8ae
comparison
equal deleted inserted replaced
7541:570c0c027998 7542:a8a8205a9081
1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
4 ;*
5 ;* This file is part of FFmpeg.
6 ;*
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
11 ;*
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
16 ;*
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
21
22 ; These functions are not individually interchangeable with the C versions.
23 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
24 ; in blocks as convenient to the vector size.
25 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
26
27 %include "x86inc.asm"
28
29 SECTION_RODATA
30
; sqrt(1/2) as a decimal literal, used to build the packed-float constants below.
31 %define M_SQRT1_2 0.70710678118654752440
; Four dwords of +sqrt(1/2); multiplied in by T8_SSE and by the 3DNow fft8 kernel.
32 ps_root2: times 4 dd M_SQRT1_2
; sqrt(1/2) with the sign pattern -,+,+,- ("mppm"); multiplied in by T8_SSE.
33 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; Two dwords {0x80000000, 0} ("m1p1"): the 3DNow code XORs this in (pxor) to flip
; the sign bit of the first float of a pair while leaving the second untouched.
34 ps_m1p1: dd 1<<31, 0
35
; Declare the external symbols ff_cos_16, ff_cos_32, ..., ff_cos_65536
; (13 names: i starts at 16 and doubles on every repetition).
; These are the tables referenced by fft16_sse and by the functions DECL_FFT emits.
; cextern is an x86inc.asm macro that is not shown in this file.
36 %assign i 16
37 %rep 13
38 cextern ff_cos_ %+ i
39 %assign i i<<1
40 %endrep
41
; "pointer" is the data directive used to emit the code addresses in the
; dispatch tables built by DECL_FFT: dq when ARCH_X86_64 is defined, dd otherwise.
42 %ifdef ARCH_X86_64
43 %define pointer dq
44 %else
45 %define pointer dd
46 %endif
47
; Conditional-emit helpers. They are invoked as "IF%1 <instruction>" from inside
; other macros, so a macro argument of 0 or 1 decides whether the instruction
; is assembled:
;   IF0 swallows the rest of the line (expands to nothing);
;   IF1 emits the rest of the line unchanged.
48 %macro IF0 1+
49 %endmacro
50 %macro IF1 1+
51 %1
52 %endmacro
53
54 section .text align=16
55
; 3DNow sum/difference of two packed-float operands:
;   %1 = %3 + %4
;   %2 = %3 - %4
; %1 and %2 are destination registers; %3 and %4 are the source operands
; (the callers in this file pass memory operands Z(n)).
; %3 is read once; %4 is read twice.
56 %macro T2_3DN 4 ; z0, z1, mem0, mem1
57 mova %1, %3
58 mova %2, %1
59 pfadd %1, %4
60 pfsub %2, %4
61 %endmacro
62
; 3DNow 4-point combining step on four packed pairs z0..z3 (%1..%4), using %5 and
; %6 as temporaries. With the value names used in the inline comments:
;   %1 <- z0 + (z2 + z3)                      {r0,i0}
;   %6 <- z0 - (z2 + z3)                      {r2,i2}
;   %2 <- z1 + swap(signflip_lo(z2 - z3))     {r1,i1}
;   %4 <- z1 - swap(signflip_lo(z2 - z3))     {r3,i3}
; where signflip_lo negates the first float (pxor with ps_m1p1) and swap exchanges
; the two floats of the pair (pswapd).
; The trailing SWAP %3, %6 is what lets callers go on referring to {r2,i2} as %3.
; SWAP is an x86inc.asm macro (not shown); presumably it exchanges the register
; names at assembly time without emitting code - confirm against x86inc.asm.
; NOTE: "pswapd %3, %3" is the real instruction or the emulation macro defined
; later in this file, depending on where the enclosing code is instantiated; the
; emulation spills through [r0+12].
63 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
64 mova %5, %3
65 pfsub %3, %4
66 pfadd %5, %4 ; {t6,t5}
67 pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7}
68 mova %6, %1
69 pswapd %3, %3
70 pfadd %1, %5 ; {r0,i0}
71 pfsub %6, %5 ; {r2,i2}
72 mova %4, %2
73 pfadd %2, %3 ; {r1,i1}
74 pfsub %4, %3 ; {r3,i3}
75 SWAP %3, %6
76 %endmacro
77
; SSE 4-point transform on two xmm registers, converting from interleaved
; {re,im} pairs to separate real/imaginary vectors.
; %3 is a scratch register; its previous contents are lost.
78 ; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
79 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
; The last shufps leaves the imaginary vector in the register passed as %3; the
; closing SWAP makes it available under the name %2, as promised above.
; SWAP is an x86inc.asm macro (not shown); presumably a pure assembly-time
; rename - confirm against x86inc.asm.
80 %macro T4_SSE 3
81 mova %3, %1
82 shufps %1, %2, 0x64 ; {r0,i0,r3,i2}
83 shufps %3, %2, 0xce ; {r1,i1,r2,i3}
84 mova %2, %1
85 addps %1, %3 ; {t1,t2,t6,t5}
86 subps %2, %3 ; {t3,t4,t8,t7}
87 mova %3, %1
88 shufps %1, %2, 0x44 ; {t1,t2,t3,t4}
89 shufps %3, %2, 0xbe ; {t6,t5,t7,t8}
90 mova %2, %1
91 addps %1, %3 ; {r0,i0,r1,i1}
92 subps %2, %3 ; {r2,i2,r3,i3}
93 mova %3, %1
94 shufps %1, %2, 0x88 ; {r0,r1,r2,r3}
95 shufps %3, %2, 0xdd ; {i0,i1,i2,i3}
96 SWAP %2, %3
97 %endmacro
98
; SSE 8-point combining step.
; in:  %1, %2 = real and imaginary vectors of the first four points
;               (the callers in this file produce them with T4_SSE)
;      %3 = {r4,i4,r5,i5}, %4 = {r6,i6,r7,i7}  (interleaved, as the first two
;               shuffles and their comments show)
; out: %1 = {r0..r3}, %2 = {i0..i3}, %3 = {r4..r7}, %4 = {i4..i7}
; %5 and %6 are scratch registers; their previous contents are lost.
; Reads the constants ps_root2mppm and ps_root2.
99 %macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
100 mova %5, %3
101 shufps %3, %4, 0x44 ; {r4,i4,r6,i6}
102 shufps %5, %4, 0xee ; {r5,i5,r7,i7}
103 mova %6, %3
104 subps %3, %5 ; {r5,i5,r7,i7}
105 addps %6, %5 ; {t1,t2,t3,t4}
106 mova %5, %3
107 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
108 mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
109 mulps %5, [ps_root2 GLOBAL]
110 addps %3, %5 ; {t8,t7,ta,t9}
111 mova %5, %6
112 shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
113 shufps %5, %3, 0x9c ; {t1,t4,t7,ta}
114 mova %3, %6
115 addps %6, %5 ; {t1,t2,t9,ta}
116 subps %3, %5 ; {t6,t5,tc,tb}
117 mova %5, %6
118 shufps %6, %3, 0xd8 ; {t1,t9,t5,tb}
119 shufps %5, %3, 0x8d ; {t2,ta,t6,tc}
120 mova %3, %1
121 mova %4, %2
122 addps %1, %6 ; {r0,r1,r2,r3}
123 addps %2, %5 ; {i0,i1,i2,i3}
124 subps %3, %6 ; {r4,r5,r6,r7}
125 subps %4, %5 ; {i4,i5,i6,i7}
126 %endmacro
127
; One combining pass over the eight vectors Z(0)..Z(7): the vectors held in
; m4..m7 (r2,i2,r3,i3 in the inline comments) are multiplied by the twiddle
; factors wre/wim and then added to / subtracted from Z(0)..Z(3).
;   %1      1 = load m4..m7 from Z(4)..Z(7) inside the macro;
;           0 = the caller already holds those values in m4..m7 (fft16_sse)
;   %2, %3  operands supplying the wre and wim vectors
; Reads Z(0)..Z(3) (plus Z(4)..Z(7) when %1 is 1), writes all of Z(0)..Z(7),
; and uses every register m0..m7.
; Z() must be defined by the surrounding code before the macro is expanded.
128 ; scheduled for cpu-bound sizes
129 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
130 IF%1 mova m4, Z(4)
131 IF%1 mova m5, Z(5)
132 mova m0, %2 ; wre
133 mova m2, m4
134 mova m1, %3 ; wim
135 mova m3, m5
136 mulps m2, m0 ; r2*wre
137 IF%1 mova m6, Z(6)
138 mulps m3, m1 ; i2*wim
139 IF%1 mova m7, Z(7)
140 mulps m4, m1 ; r2*wim
141 mulps m5, m0 ; i2*wre
142 addps m2, m3 ; r2*wre + i2*wim
143 mova m3, m1
144 mulps m1, m6 ; r3*wim
145 subps m5, m4 ; i2*wre - r2*wim
146 mova m4, m0
147 mulps m3, m7 ; i3*wim
148 mulps m4, m6 ; r3*wre
149 mulps m0, m7 ; i3*wre
150 subps m4, m3 ; r3*wre - i3*wim
151 mova m3, Z(0)
152 addps m0, m1 ; i3*wre + r3*wim
153 mova m1, m4
154 addps m4, m2 ; t5
155 subps m1, m2 ; t3
156 subps m3, m4 ; r2
157 addps m4, Z(0) ; r0
158 mova m6, Z(2)
159 mova Z(4), m3
160 mova Z(0), m4
161 mova m3, m5
162 subps m5, m0 ; t4
163 mova m4, m6
164 subps m6, m5 ; r3
165 addps m5, m4 ; r1
166 mova Z(6), m6
167 mova Z(2), m5
168 mova m2, Z(3)
169 addps m3, m0 ; t6
170 subps m2, m1 ; i3
171 mova m7, Z(1)
172 addps m1, Z(3) ; i1
173 mova Z(7), m2
174 mova Z(3), m1
175 mova m4, m7
176 subps m7, m3 ; i2
177 addps m3, m4 ; i0
178 mova Z(5), m7
179 mova Z(1), m3
180 %endmacro
181
; Same arithmetic as PASS_SMALL, but the twiddle factors always come from [wq]
; (wre) and [wq+o1q] (wim), so this macro can only be expanded where w and o1
; are defined (i.e. inside DECL_PASS).
;   %1  1 = store the six remaining result vectors straight to Z(1..3), Z(5..7);
;       0 = skip those stores and run the "%if %1==0" tail instead, which
;           interleaves each real vector with its imaginary vector
;           (unpcklps/unpckhps) and stores the interleaved halves to Z(0)..Z(7)
; Z(0) and Z(4) are stored in both modes; in mode 0 they are read back and
; overwritten by the interleaving tail.
; Uses every register m0..m7.
182 ; scheduled to avoid store->load aliasing
183 %macro PASS_BIG 1 ; (!interleave)
184 mova m4, Z(4) ; r2
185 mova m5, Z(5) ; i2
186 mova m2, m4
187 mova m0, [wq] ; wre
188 mova m3, m5
189 mova m1, [wq+o1q] ; wim
190 mulps m2, m0 ; r2*wre
191 mova m6, Z(6) ; r3
192 mulps m3, m1 ; i2*wim
193 mova m7, Z(7) ; i3
194 mulps m4, m1 ; r2*wim
195 mulps m5, m0 ; i2*wre
196 addps m2, m3 ; r2*wre + i2*wim
197 mova m3, m1
198 mulps m1, m6 ; r3*wim
199 subps m5, m4 ; i2*wre - r2*wim
200 mova m4, m0
201 mulps m3, m7 ; i3*wim
202 mulps m4, m6 ; r3*wre
203 mulps m0, m7 ; i3*wre
204 subps m4, m3 ; r3*wre - i3*wim
205 mova m3, Z(0)
206 addps m0, m1 ; i3*wre + r3*wim
207 mova m1, m4
208 addps m4, m2 ; t5
209 subps m1, m2 ; t3
210 subps m3, m4 ; r2
211 addps m4, Z(0) ; r0
212 mova m6, Z(2)
213 mova Z(4), m3
214 mova Z(0), m4
215 mova m3, m5
216 subps m5, m0 ; t4
217 mova m4, m6
218 subps m6, m5 ; r3
219 addps m5, m4 ; r1
220 IF%1 mova Z(6), m6
221 IF%1 mova Z(2), m5
222 mova m2, Z(3)
223 addps m3, m0 ; t6
224 subps m2, m1 ; i3
225 mova m7, Z(1)
226 addps m1, Z(3) ; i1
227 IF%1 mova Z(7), m2
228 IF%1 mova Z(3), m1
229 mova m4, m7
230 subps m7, m3 ; i2
231 addps m3, m4 ; i0
232 IF%1 mova Z(5), m7
233 IF%1 mova Z(1), m3
; Interleaving tail: at this point m5/m1 = r1/i1, m6/m2 = r3/i3, m3 = i0,
; m7 = i2, and r0/r2 are reloaded from Z(0)/Z(4).
234 %if %1==0
235 mova m4, m5 ; r1
236 mova m0, m6 ; r3
237 unpcklps m5, m1
238 unpckhps m4, m1
239 unpcklps m6, m2
240 unpckhps m0, m2
241 mova m1, Z(0)
242 mova m2, Z(4)
243 mova Z(2), m5
244 mova Z(3), m4
245 mova Z(6), m6
246 mova Z(7), m0
247 mova m5, m1 ; r0
248 mova m4, m2 ; r2
249 unpcklps m1, m3
250 unpckhps m5, m3
251 unpcklps m2, m7
252 unpckhps m4, m7
253 mova Z(0), m1
254 mova Z(1), m5
255 mova Z(4), m2
256 mova Z(5), m4
257 %endif
258 %endmacro
259
; Dword interleave of %1 with %2, producing both halves:
;   %3 <- high dwords of the original %1 and of %2 interleaved (punpckhdq)
;   %1 <- low dwords of %1 and of %2 interleaved (punpckldq)
; %1 and %3 must be registers; %2 may be a register or a memory operand
; (fft8 passes Z(5) and Z(7)). %2 is read twice.
260 %macro PUNPCK 3
261 mova %3, %1
262 punpckldq %1, %2
263 punpckhdq %3, %2
264 %endmacro
265
266 INIT_XMM
267
; Flat addressing used by the fixed-size kernels below: Z(x) is the x-th
; mmsize-byte slot starting at r0. mmsize is supplied by x86inc.asm (not shown);
; presumably the vector width selected by INIT_XMM / INIT_MMX - confirm.
268 %define Z(x) [r0+mmsize*x]
269
; fft4_sse: 4-point transform in place on the two vectors at Z(0), Z(1).
; In:    r0 = data pointer; Z(0),Z(1) hold the interleaved layout documented
;        at T4_SSE
; Out:   Z(0) = real vector, Z(1) = imaginary vector (T4_SSE output layout)
; Clobb: m0, m1, m2. r0 is left unchanged.
; File-local label: it is reached through the dispatch table and from the
; larger transforms, not declared with cglobal.
270 align 16
271 fft4_sse:
272 mova m0, Z(0)
273 mova m1, Z(1)
274 T4_SSE m0, m1, m2
275 mova Z(0), m0
276 mova Z(1), m1
277 ret
278
; fft8_sse: 8-point transform in place on the four vectors Z(0)..Z(3).
; In:    r0 = data pointer
; Out:   Z(0)..Z(3) in the T8_SSE output order: two real/imaginary vector
;        pairs, {r0..r3},{i0..i3},{r4..r7},{i4..i7}
; Clobb: m0..m5. r0 is left unchanged.
279 align 16
280 fft8_sse:
281 mova m0, Z(0)
282 mova m1, Z(1)
; 4-point transform of the first two vectors; m2 is only scratch here ...
283 T4_SSE m0, m1, m2
; ... and is then reused for the second half of the input.
284 mova m2, Z(2)
285 mova m3, Z(3)
286 T8_SSE m0, m1, m2, m3, m4, m5
287 mova Z(0), m0
288 mova Z(1), m1
289 mova Z(2), m2
290 mova Z(3), m3
291 ret
292
; fft16_sse: 16-point transform in place on the eight vectors Z(0)..Z(7).
; In:    r0 = data pointer
; Out:   Z(0)..Z(7) as written by PASS_SMALL
; Clobb: m0..m7. r0 is left unchanged.
; Structure: an 8-point transform of Z(0)..Z(3) (same sequence as fft8_sse),
; two 4-point transforms of Z(4),Z(5) and Z(6),Z(7) whose results stay in
; m4..m7, then one PASS_SMALL that combines everything.
293 align 16
294 fft16_sse:
295 mova m0, Z(0)
296 mova m1, Z(1)
297 T4_SSE m0, m1, m2
298 mova m2, Z(2)
299 mova m3, Z(3)
300 T8_SSE m0, m1, m2, m3, m4, m5
; m4/m5 were scratch for T8_SSE; start loading the next inputs before the stores.
301 mova m4, Z(4)
302 mova m5, Z(5)
303 mova Z(0), m0
304 mova Z(1), m1
305 mova Z(2), m2
306 mova Z(3), m3
307 T4_SSE m4, m5, m6
308 mova m6, Z(6)
309 mova m7, Z(7)
; m0 has already been stored to Z(0), so it is free to serve as scratch.
310 T4_SSE m6, m7, m0
; First argument 0: m4..m7 are already live, so PASS_SMALL must not reload them.
; wre is the first 16 bytes of ff_cos_16, wim the following 16 bytes.
311 PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
312 ret
313
314
315 INIT_MMX
316
; Emits the 3DNow 4-point and 8-point kernels as labels fft4%1 and fft8%1.
;   %1  suffix appended to the label names (_3dn2 or _3dn in this file)
; Both kernels take the data pointer in r0, work in place, and leave r0
; unchanged. Z(x) is the flat [r0+mmsize*x] form here; each slot is loaded as
; one packed pair of floats.
; The bodies use pswapd (directly and through T4_3DN). This macro is expanded
; once before and once after the pswapd emulation macro is defined, which is
; how the two flavours end up with different instruction sequences.
317 %macro FFT48_3DN 1
318 align 16
; fft4%1: transforms Z(0)..Z(3); clobbers m0..m5.
319 fft4%1:
320 T2_3DN m0, m1, Z(0), Z(1)
321 mova m2, Z(2)
322 mova m3, Z(3)
323 T4_3DN m0, m1, m2, m3, m4, m5
; Regroup the {re,im} pairs so that each stored slot holds either two real
; parts or two imaginary parts.
324 PUNPCK m0, m1, m4
325 PUNPCK m2, m3, m5
326 mova Z(0), m0
327 mova Z(1), m4
328 mova Z(2), m2
329 mova Z(3), m5
330 ret
331
332 align 16
; fft8%1: transforms Z(0)..Z(7); clobbers m0..m7.
; Z(0), Z(2), Z(5) and Z(7) double as spill slots while all eight registers
; are busy.
333 fft8%1:
334 T2_3DN m0, m1, Z(0), Z(1)
335 mova m2, Z(2)
336 mova m3, Z(3)
337 T4_3DN m0, m1, m2, m3, m4, m5
; Park two of the four results so m0/m2 can be reused as temporaries below.
338 mova Z(0), m0
339 mova Z(2), m2
340 T2_3DN m4, m5, Z(4), Z(5)
341 T2_3DN m6, m7, Z(6), Z(7)
; Combine m5 and m7 with their swapped, sign-adjusted copies and scale the
; results by sqrt(1/2) (ps_root2).
342 pswapd m0, m5
343 pswapd m2, m7
344 pxor m0, [ps_m1p1 GLOBAL]
345 pxor m2, [ps_m1p1 GLOBAL]
346 pfsub m5, m0
347 pfadd m7, m2
348 pfmul m5, [ps_root2 GLOBAL]
349 pfmul m7, [ps_root2 GLOBAL]
350 T4_3DN m1, m3, m5, m7, m0, m2
351 mova Z(5), m5
352 mova Z(7), m7
353 mova m0, Z(0)
354 mova m2, Z(2)
355 T4_3DN m0, m2, m4, m6, m5, m7
356 PUNPCK m0, m1, m5
357 PUNPCK m2, m3, m7
358 mova Z(0), m0
359 mova Z(1), m5
360 mova Z(2), m2
361 mova Z(3), m7
; The second operands come from the spill slots written above.
362 PUNPCK m4, Z(5), m5
363 PUNPCK m6, Z(7), m7
364 mova Z(4), m4
365 mova Z(5), m5
366 mova Z(6), m6
367 mova Z(7), m7
368 ret
369 %endmacro
370
; First expansion: no pswapd macro exists yet, so "pswapd" inside the kernels
; is assembled as the instruction itself.
371 FFT48_3DN _3dn2
372
; From here on "pswapd" is this macro: an emulation of swapping the two dwords
; of an MMX register using only MMX instructions.
;   %1 = destination register, %2 = source register
; Same register (%ifidn): the low dword is stored to [r0+12]; punpckhdq with
;   the qword at [r0+8] then yields {old high, old low}. This overwrites the
;   four bytes at [r0+12] and relies on r0 pointing at writable data.
;   NOTE(review): every use site in this file has already loaded that slot,
;   or rewrites it afterwards - re-check this if the kernels are reordered.
; Different registers: copy, shift the high dword down (psrlq 32), then
;   punpckldq with the source to put its low dword on top. No memory is touched.
373 %macro pswapd 2
374 %ifidn %1, %2
375 movd [r0+12], %1
376 punpckhdq %1, [r0+8]
377 %else
378 movq %1, %2
379 psrlq %1, 32
380 punpckldq %1, %2
381 %endif
382 %endmacro
383
; Second expansion: identical source, but every pswapd now goes through the
; emulation above.
384 FFT48_3DN _3dn
385
386
; Strided addressing used by the pass loops (replaces the flat Z(x) above).
; For x = 0..7 the slot address is zq + region + mmsize*(x&1), where region is
;   x = 0,1 : 0          x = 2,3 : 2*o1q
;   x = 4,5 : 4*o1q      x = 6,7 : o3q
; ((x/6) is 1 only for x = 6,7, which switches the o1q term off and the o3q
; term on.) DECL_PASS sets o1q = n*8 and o3q = n*48, so the four regions are
; evenly spaced. Presumably o3q exists because 6*o1q is not an encodable
; address scale.
387 %define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
388
; Emits one pass function: a loop that runs the payload once per group of
; vectors.
;   %1   label of the generated function
;   %2+  payload, i.e. the PASS_SMALL/PASS_BIG invocation used as the loop body
; Register interface of the generated function (see the code DECL_FFT emits,
; which sets r0/r1/r2d and jumps here):
;   z = data pointer, w = twiddle table pointer, n = iteration count
; DEFINE_ARGS is an x86inc.asm macro (not shown); presumably it binds the names
; z, w, n, o1, o3 to r0..r4 in that order - confirm.
; o1 and o3 are computed here and feed the strided Z(x) definition.
; On return z and w have been advanced by the loop and n is <= 0.
389 %macro DECL_PASS 2+ ; name, payload
390 align 16
391 %1:
392 DEFINE_ARGS z, w, n, o1, o3
; o3 = n*3*16 = n*48 and o1 = n*8, both derived from the entry value of n.
393 lea o3q, [nq*3]
394 lea o1q, [nq*8]
395 shl o3q, 4
396 .loop:
397 %2
; Step to the next pair of data vectors and the next twiddle vector.
398 add zq, mmsize*2
399 add wq, mmsize
; Signed test: the loop repeats while n is still greater than zero.
400 sub nd, mmsize/8
401 jg .loop
; "rep ret" rather than a plain ret. NOTE(review): presumably the usual
; workaround for returns that directly follow a conditional branch on some
; AMD CPUs - confirm.
402 rep ret
403 %endmacro
404
; SSE pass loops: the plain and the interleaving variant, both built on PASS_BIG.
405 INIT_XMM
406 DECL_PASS pass_sse, PASS_BIG 1
407 DECL_PASS pass_interleave_sse, PASS_BIG 0
408
; 3DNow pass loops: map the SSE mnemonics used inside PASS_SMALL/PASS_BIG onto
; their 3DNow/MMX counterparts so the same macro bodies can be reused on
; MMX registers.
409 INIT_MMX
410 %define mulps pfmul
411 %define addps pfadd
412 %define subps pfsub
413 %define unpcklps punpckldq
414 %define unpckhps punpckhdq
; The non-interleaving 3DNow pass uses PASS_SMALL (loading m4..m7 itself, with
; twiddles at [wq] and [wq+o1q]); the interleaving one uses PASS_BIG.
415 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
416 DECL_PASS pass_interleave_3dn, PASS_BIG 0
; The _3dn2 flavour gets no pass loops of its own; its names alias the _3dn ones.
417 %define pass_3dn2 pass_3dn
418 %define pass_interleave_3dn2 pass_interleave_3dn
419
420
; Generates, for one CPU flavour, every transform larger than the hand-written
; kernels, plus a dispatch table and the exported entry point.
;   %1  nbits: log2 of the smallest size generated here (1<<%1)
;   %2  cpu suffix (_sse, _3dn, _3dn2)
;   %3  optional extra suffix (_interleave); it is applied to the generated
;       fftN labels, the table, the entry point and the pass function jumped
;       to, but NOT to the sub-transform calls.
; Sizes 1<<%1 up to 65536 are generated (17-%1 doublings starting at 1<<%1).
421 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
; The table starts with the hand-written kernels; fft16 is included only when
; the generated sizes start at 32.
422 %xdefine list_of_fft fft4%2, fft8%2
423 %if %1==5
424 %xdefine list_of_fft list_of_fft, fft16%2
425 %endif
426
427 %assign n 1<<%1
428 %rep 17-%1
429 %assign n2 n/2
430 %assign n4 n/4
431 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2
432
; fftN = fft(N/2) on the first part of the data, fft(N/4) on each of the two
; following parts, then one pass over the whole block using ff_cos_N.
; The (x&(-2<<%1)) terms are x for x >= 2<<%1 and 0 below that.
; NOTE(review): they appear to compensate for r0 having been advanced by the
; sub-transform: generated sizes end in a pass loop that steps zq, while the
; hand-written kernels leave r0 alone - confirm against DECL_PASS.
433 align 16
434 fft %+ n %+ %3%2:
435 call fft %+ n2 %+ %2
436 add r0, n*4 - (n&(-2<<%1))
437 call fft %+ n4 %+ %2
438 add r0, n*2 - (n2&(-2<<%1))
439 call fft %+ n4 %+ %2
440 sub r0, n*6 + (n2&(-2<<%1))
; Arguments for the pass loop: r1 = twiddle table, r2d = count (N/8).
441 lea r1, [ff_cos_ %+ n GLOBAL]
442 mov r2d, n4/2
; Tail jump: the pass function's ret returns directly to this function's caller.
443 jmp pass%3%2
444
445 %assign n n*2
446 %endrep
447 %undef n
448
; One code pointer per size, in increasing order starting at fft4 (nbits = 2).
449 align 8
450 dispatch_tab%3%2: pointer list_of_fft
451
452 ; On x86_32, this function does the register saving and restoring for all of fft.
453 ; The others pass args in registers and don't spill anything.
; cglobal and RET are x86inc.asm macros (not shown); the arguments here name
; the two parameters z and nbits. The table is indexed with nbits-2 because
; its first entry is the 4-point kernel.
454 cglobal ff_fft_dispatch%3%2, 2,5,0, z, nbits
455 lea r2, [dispatch_tab%3%2 GLOBAL]
456 mov r2, [r2 + (nbitsq-2)*gprsize]
457 call r2
458 RET
459 %endmacro ; DECL_FFT
460
; Instantiate the generated transforms, dispatch table and entry point for each
; flavour, with and without the _interleave suffix.
; SSE starts at nbits 5 because fft16_sse is hand-written above; the 3DNow
; flavours start at nbits 4 since only fft4/fft8 exist for them.
461 DECL_FFT 5, _sse
462 DECL_FFT 5, _sse, _interleave
463 DECL_FFT 4, _3dn
464 DECL_FFT 4, _3dn, _interleave
465 DECL_FFT 4, _3dn2
466 DECL_FFT 4, _3dn2, _interleave
467