Mercurial > libavcodec.hg
annotate x86/fft_mmx.asm @ 12233:10b02cbc3cc2 libavcodec
Get rid of more unnecessary dereferences in VP8 deblocking
author | darkshikari |
---|---|
date | Thu, 22 Jul 2010 23:11:40 +0000 |
parents | 6f064ab48463 |
children | 020540442072 |
rev | line source |
---|---|
8430 | 1 ;****************************************************************************** |
2 ;* FFT transform with SSE/3DNow optimizations | |
3 ;* Copyright (c) 2008 Loren Merritt | |
4 ;* | |
12188 | 5 ;* This algorithm (though not any of the implementation details) is |
6 ;* based on libdjbfft by D. J. Bernstein. | |
7 ;* | |
8430 | 8 ;* This file is part of FFmpeg. |
9 ;* | |
10 ;* FFmpeg is free software; you can redistribute it and/or | |
11 ;* modify it under the terms of the GNU Lesser General Public | |
12 ;* License as published by the Free Software Foundation; either | |
13 ;* version 2.1 of the License, or (at your option) any later version. | |
14 ;* | |
15 ;* FFmpeg is distributed in the hope that it will be useful, | |
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 ;* Lesser General Public License for more details. | |
19 ;* | |
20 ;* You should have received a copy of the GNU Lesser General Public | |
21 ;* License along with FFmpeg; if not, write to the Free Software | |
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 ;****************************************************************************** | |
24 | |
25 ; These functions are not individually interchangeable with the C versions. | |
26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results | |
27 ; in blocks as convenient to the vector size. | |
28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) | |
29 | |
30 %include "x86inc.asm" | |
31 | |
; Read-only constants used by the SSE and 3DNow! butterflies below.
32 SECTION_RODATA | |
33 | |
; M_SQRT1_2 is 1/sqrt(2).
34 %define M_SQRT1_2 0.70710678118654752440 | |
; ps_root2: M_SQRT1_2 replicated into four consecutive floats.
35 ps_root2: times 4 dd M_SQRT1_2 | |
; ps_root2mppm: M_SQRT1_2 with the sign pattern minus, plus, plus, minus.
36 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 | |
; ps_m1p1: XOR mask with only bit 31 of the first dword set. The 3DNow! code
; applies it with pxor, which flips the sign bit of the low float of a pair.
37 ps_m1p1: dd 1<<31, 0 | |
38 | |
; Declare the externally defined tables cos_16, cos_32, ... (13 symbols, the
; suffix doubling each time, so the last one is cos_65536).
; cextern comes from x86inc.asm; presumably it declares the symbol as external
; with the project's symbol prefix -- confirm against x86inc.asm.
39 %assign i 16 | |
40 %rep 13 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
41 cextern cos_ %+ i |
8430 | 42 %assign i i<<1 |
43 %endrep | |
44 | |
; "pointer" is the data directive for one pointer-sized table entry:
; dq (8 bytes) when ARCH_X86_64 is defined, dd (4 bytes) otherwise.
; It is used to emit the dispatch_tab jump table in DECL_FFT.
45 %ifdef ARCH_X86_64 | |
46 %define pointer dq | |
47 %else | |
48 %define pointer dd | |
49 %endif | |
50 | |
; Conditional-emit helpers, invoked as "IF%1 <instruction>" inside other macros:
;   IF0 <line> discards its argument (emits nothing);
;   IF1 <line> emits its argument unchanged.
; The 1+ parameter count makes the whole rest of the line a single argument.
51 %macro IF0 1+ | |
52 %endmacro | |
53 %macro IF1 1+ | |
54 %1 | |
55 %endmacro | |
56 | |
57 section .text align=16 | |
58 | |
; T2_3DN z0, z1, mem0, mem1
;   z0 = mem0 + mem1
;   z1 = mem0 - mem1
; mem0 is read once (into z0) and then copied to z1; mem1 is read twice.
59 %macro T2_3DN 4 ; z0, z1, mem0, mem1 | |
60 mova %1, %3 | |
61 mova %2, %1 | |
62 pfadd %1, %4 | |
63 pfsub %2, %4 | |
64 %endmacro | |
65 | |
; T4_3DN z0, z1, z2, z3, tmp0, tmp1
;   Combines four register operands in place; tmp0 and tmp1 are clobbered.
;   Steps, in the order the instructions appear:
;     tmp0 = z2 + z3
;     z2   = z2 - z3, low element sign-flipped (ps_m1p1), then halves swapped
;     z0, tmp1 = z0 + tmp0, z0 - tmp0
;     z1, z3   = z1 + z2,   z1 - z2
;   The closing SWAP is x86inc's assemble-time renaming (no instruction is
;   presumably emitted -- confirm against x86inc.asm): afterwards the name
;   passed as z2 refers to the register holding {r2,i2}.
;   The pswapd here has identical source and destination; in the _3dn build
;   that resolves to the emulation macro defined later in this file, which
;   writes to [r0+12].
66 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1 | |
67 mova %5, %3 | |
68 pfsub %3, %4 | |
69 pfadd %5, %4 ; {t6,t5} | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
70 pxor %3, [ps_m1p1] ; {t8,t7} |
8430 | 71 mova %6, %1 |
72 pswapd %3, %3 | |
73 pfadd %1, %5 ; {r0,i0} | |
74 pfsub %6, %5 ; {r2,i2} | |
75 mova %4, %2 | |
76 pfadd %2, %3 ; {r1,i1} | |
77 pfsub %4, %3 ; {r3,i3} | |
78 SWAP %3, %6 | |
79 %endmacro | |
80 | |
; T4_SSE a, b, tmp
;   4-point butterfly on two 4-float registers; tmp (%3) is scratch.
;   It runs three rounds of "copy, two shufps, add/sub". The last round
;   de-interleaves the result so reals and imaginaries end up in separate
;   registers.
81 ; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3} | |
82 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} | |
83 %macro T4_SSE 3 | |
84 mova %3, %1 | |
85 shufps %1, %2, 0x64 ; {r0,i0,r3,i2} | |
86 shufps %3, %2, 0xce ; {r1,i1,r2,i3} | |
87 mova %2, %1 | |
88 addps %1, %3 ; {t1,t2,t6,t5} | |
89 subps %2, %3 ; {t3,t4,t8,t7} | |
90 mova %3, %1 | |
91 shufps %1, %2, 0x44 ; {t1,t2,t3,t4} | |
92 shufps %3, %2, 0xbe ; {t6,t5,t7,t8} | |
93 mova %2, %1 | |
94 addps %1, %3 ; {r0,i0,r1,i1} | |
95 subps %2, %3 ; {r2,i2,r3,i3} | |
96 mova %3, %1 | |
97 shufps %1, %2, 0x88 ; {r0,r1,r2,r3} | |
98 shufps %3, %2, 0xdd ; {i0,i1,i2,i3} | |
; The imaginary vector was built in %3; the rename below makes it the value
; seen under the caller's second operand name, matching the "out:" contract.
99 SWAP %2, %3 | |
100 %endmacro | |
101 | |
; T8_SSE r0, i0, r1, i1, t0, t1
;   %1/%2 : in  = split real/imag vectors (the layout T4_SSE produces)
;           out = {r0..r3} / {i0..i3}
;   %3/%4 : in  = two vectors of interleaved {re,im} pairs (see the comments on
;                 the first two shufps for which elements land where)
;           out = {r4..r7} / {i4..i7}
;   %5/%6 : scratch, clobbered.
;   The ps_root2mppm / ps_root2 multiplies apply the 1/sqrt(2) factor with the
;   sign pattern noted in the inline comment.
102 %macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1 | |
103 mova %5, %3 | |
104 shufps %3, %4, 0x44 ; {r4,i4,r6,i6} | |
105 shufps %5, %4, 0xee ; {r5,i5,r7,i7} | |
106 mova %6, %3 | |
107 subps %3, %5 ; {r5,i5,r7,i7} | |
108 addps %6, %5 ; {t1,t2,t3,t4} | |
109 mova %5, %3 | |
110 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
111 mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} |
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
112 mulps %5, [ps_root2] |
8430 | 113 addps %3, %5 ; {t8,t7,ta,t9} |
114 mova %5, %6 | |
115 shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | |
116 shufps %5, %3, 0x9c ; {t1,t4,t7,ta} | |
117 mova %3, %6 | |
118 addps %6, %5 ; {t1,t2,t9,ta} | |
119 subps %3, %5 ; {t6,t5,tc,tb} | |
120 mova %5, %6 | |
121 shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} | |
122 shufps %5, %3, 0x8d ; {t2,ta,t6,tc} | |
; Final butterfly: the first-half inputs are copied into %3/%4 so that sums go
; to %1/%2 and differences go to %3/%4.
123 mova %3, %1 | |
124 mova %4, %2 | |
125 addps %1, %6 ; {r0,r1,r2,r3} | |
126 addps %2, %5 ; {i0,i1,i2,i3} | |
127 subps %3, %6 ; {r4,r5,r6,r7} | |
128 subps %4, %5 ; {i4,i5,i6,i7} | |
129 %endmacro | |
130 | |
; PASS_SMALL load, wre, wim
;   load (%1): 1 -> the "IF%1" lines expand through IF1 and load m4..m7 from
;                   Z(4)..Z(7);
;              0 -> they expand through IF0 and vanish, so m4..m7 must already
;                   hold those four vectors (fft16_sse relies on this).
;   wre (%2), wim (%3): operands copied into m0 / m1 and used as the two
;                   multiplier vectors.
;   Reads Z(0)..Z(3), writes Z(0)..Z(7), and uses all of m0..m7.
;   Loads, multiplies and stores are deliberately interleaved, so the
;   statement order is part of the design.
131 ; scheduled for cpu-bound sizes | |
132 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim | |
133 IF%1 mova m4, Z(4) | |
134 IF%1 mova m5, Z(5) | |
135 mova m0, %2 ; wre | |
136 mova m2, m4 | |
137 mova m1, %3 ; wim | |
138 mova m3, m5 | |
139 mulps m2, m0 ; r2*wre | |
140 IF%1 mova m6, Z(6) | |
141 mulps m3, m1 ; i2*wim | |
142 IF%1 mova m7, Z(7) | |
143 mulps m4, m1 ; r2*wim | |
144 mulps m5, m0 ; i2*wre | |
145 addps m2, m3 ; r2*wre + i2*wim | |
146 mova m3, m1 | |
147 mulps m1, m6 ; r3*wim | |
148 subps m5, m4 ; i2*wre - r2*wim | |
149 mova m4, m0 | |
150 mulps m3, m7 ; i3*wim | |
151 mulps m4, m6 ; r3*wre | |
152 mulps m0, m7 ; i3*wre | |
153 subps m4, m3 ; r3*wre - i3*wim | |
154 mova m3, Z(0) | |
155 addps m0, m1 ; i3*wre + r3*wim | |
156 mova m1, m4 | |
157 addps m4, m2 ; t5 | |
158 subps m1, m2 ; t3 | |
159 subps m3, m4 ; r2 | |
160 addps m4, Z(0) ; r0 | |
161 mova m6, Z(2) | |
162 mova Z(4), m3 | |
163 mova Z(0), m4 | |
164 mova m3, m5 | |
165 subps m5, m0 ; t4 | |
166 mova m4, m6 | |
167 subps m6, m5 ; r3 | |
168 addps m5, m4 ; r1 | |
169 mova Z(6), m6 | |
170 mova Z(2), m5 | |
171 mova m2, Z(3) | |
172 addps m3, m0 ; t6 | |
173 subps m2, m1 ; i3 | |
174 mova m7, Z(1) | |
175 addps m1, Z(3) ; i1 | |
176 mova Z(7), m2 | |
177 mova Z(3), m1 | |
178 mova m4, m7 | |
179 subps m7, m3 ; i2 | |
180 addps m3, m4 ; i0 | |
181 mova Z(5), m7 | |
182 mova Z(1), m3 | |
183 %endmacro | |
184 | |
; PASS_BIG split
;   Always loads m4..m7 from Z(4)..Z(7) and takes the two multiplier vectors
;   from [wq] (wre) and [wq+o1q] (wim). The arithmetic that follows is the same
;   instruction sequence as in PASS_SMALL.
;   split (%1) == 1: every result is stored directly to Z(0)..Z(7).
;   split (%1) == 0: only Z(0) and Z(4) are stored directly. The "IF%1" stores
;                    vanish, and the %if block at the end interleaves each
;                    real/imag register pair with unpcklps/unpckhps before
;                    writing all eight slots (Z(0) and Z(4) are reloaded for
;                    that step).
185 ; scheduled to avoid store->load aliasing | |
186 %macro PASS_BIG 1 ; (!interleave) | |
187 mova m4, Z(4) ; r2 | |
188 mova m5, Z(5) ; i2 | |
189 mova m2, m4 | |
190 mova m0, [wq] ; wre | |
191 mova m3, m5 | |
192 mova m1, [wq+o1q] ; wim | |
193 mulps m2, m0 ; r2*wre | |
194 mova m6, Z(6) ; r3 | |
195 mulps m3, m1 ; i2*wim | |
196 mova m7, Z(7) ; i3 | |
197 mulps m4, m1 ; r2*wim | |
198 mulps m5, m0 ; i2*wre | |
199 addps m2, m3 ; r2*wre + i2*wim | |
200 mova m3, m1 | |
201 mulps m1, m6 ; r3*wim | |
202 subps m5, m4 ; i2*wre - r2*wim | |
203 mova m4, m0 | |
204 mulps m3, m7 ; i3*wim | |
205 mulps m4, m6 ; r3*wre | |
206 mulps m0, m7 ; i3*wre | |
207 subps m4, m3 ; r3*wre - i3*wim | |
208 mova m3, Z(0) | |
209 addps m0, m1 ; i3*wre + r3*wim | |
210 mova m1, m4 | |
211 addps m4, m2 ; t5 | |
212 subps m1, m2 ; t3 | |
213 subps m3, m4 ; r2 | |
214 addps m4, Z(0) ; r0 | |
215 mova m6, Z(2) | |
216 mova Z(4), m3 | |
217 mova Z(0), m4 | |
218 mova m3, m5 | |
219 subps m5, m0 ; t4 | |
220 mova m4, m6 | |
221 subps m6, m5 ; r3 | |
222 addps m5, m4 ; r1 | |
223 IF%1 mova Z(6), m6 | |
224 IF%1 mova Z(2), m5 | |
225 mova m2, Z(3) | |
226 addps m3, m0 ; t6 | |
227 subps m2, m1 ; i3 | |
228 mova m7, Z(1) | |
229 addps m1, Z(3) ; i1 | |
230 IF%1 mova Z(7), m2 | |
231 IF%1 mova Z(3), m1 | |
232 mova m4, m7 | |
233 subps m7, m3 ; i2 | |
234 addps m3, m4 ; i0 | |
235 IF%1 mova Z(5), m7 | |
236 IF%1 mova Z(1), m3 | |
; Interleaving tail. At this point m5/m1 = r1/i1, m6/m2 = r3/i3, m3 = i0 and
; m7 = i2 are still live in registers; r0 and r2 are fetched back from Z(0)
; and Z(4).
237 %if %1==0 | |
238 mova m4, m5 ; r1 | |
239 mova m0, m6 ; r3 | |
240 unpcklps m5, m1 | |
241 unpckhps m4, m1 | |
242 unpcklps m6, m2 | |
243 unpckhps m0, m2 | |
244 mova m1, Z(0) | |
245 mova m2, Z(4) | |
246 mova Z(2), m5 | |
247 mova Z(3), m4 | |
248 mova Z(6), m6 | |
249 mova Z(7), m0 | |
250 mova m5, m1 ; r0 | |
251 mova m4, m2 ; r2 | |
252 unpcklps m1, m3 | |
253 unpckhps m5, m3 | |
254 unpcklps m2, m7 | |
255 unpckhps m4, m7 | |
256 mova Z(0), m1 | |
257 mova Z(1), m5 | |
258 mova Z(4), m2 | |
259 mova Z(5), m4 | |
260 %endif | |
261 %endmacro | |
262 | |
; PUNPCK a, b, hi
;   Interleaves the dwords of a and b:
;     a  = punpckldq(a, b)   (low dwords of a and b)
;     hi = punpckhdq(a, b)   (high dwords, using the original value of a)
;   b is only read, so it may be a memory operand (fft8 passes Z(5) and Z(7)).
263 %macro PUNPCK 3 | |
264 mova %3, %1 | |
265 punpckldq %1, %2 | |
266 punpckhdq %3, %2 | |
267 %endmacro | |
268 | |
; Set-up for the fixed-size SSE kernels that follow.
; INIT_XMM is an x86inc macro; presumably it maps m0..m7 to xmm registers and
; sets mmsize to 16 -- confirm against x86inc.asm.
269 INIT_XMM | |
10452
c6aa538c0bc3
s/movdqa/movaps/ in sse1 fft. (regression in r20293)
lorenm
parents:
10019
diff
changeset
|
; Force mova to the SSE1 instruction movaps (see the changeset note above:
; movdqa was a regression for the SSE1 build).
270 %define mova movaps |
8430 | 271 |
; Z(x) addresses the x-th register-sized slot of the buffer pointed to by r0.
272 %define Z(x) [r0+mmsize*x] | |
273 | |
; fft4_sse: in-place 4-point transform of the two vectors at Z(0) and Z(1).
;   In : r0 = data pointer; Z(0) = {r0,i0,r1,i1}, Z(1) = {r2,i2,r3,i3}
;   Out: Z(0) = {r0,r1,r2,r3}, Z(1) = {i0,i1,i2,i3}   (per the T4_SSE contract)
;   Uses m0..m2; r0 is not modified.
274 align 16 | |
275 fft4_sse: | |
276 mova m0, Z(0) | |
277 mova m1, Z(1) | |
278 T4_SSE m0, m1, m2 | |
279 mova Z(0), m0 | |
280 mova Z(1), m1 | |
281 ret | |
282 | |
; fft8_sse: in-place 8-point transform of the four vectors at Z(0)..Z(3).
;   Runs T4_SSE on Z(0), Z(1), then T8_SSE to merge that result with Z(2), Z(3).
;   Out (per the T8_SSE comments): Z(0) = {r0..r3}, Z(1) = {i0..i3},
;                                  Z(2) = {r4..r7}, Z(3) = {i4..i7}.
;   Uses m0..m5; r0 is not modified.
283 align 16 | |
284 fft8_sse: | |
285 mova m0, Z(0) | |
286 mova m1, Z(1) | |
287 T4_SSE m0, m1, m2 | |
288 mova m2, Z(2) | |
289 mova m3, Z(3) | |
290 T8_SSE m0, m1, m2, m3, m4, m5 | |
291 mova Z(0), m0 | |
292 mova Z(1), m1 | |
293 mova Z(2), m2 | |
294 mova Z(3), m3 | |
295 ret | |
296 | |
; fft16_sse: in-place 16-point transform of the eight vectors at Z(0)..Z(7).
;   1. 8-point transform of Z(0)..Z(3) (T4_SSE + T8_SSE), stored back.
;   2. Two independent T4_SSE transforms of Z(4), Z(5) and Z(6), Z(7), whose
;      results are left in m4, m5 and m6, m7.
;   3. PASS_SMALL with load = 0, so it consumes m4..m7 straight from registers
;      instead of reloading them; the multipliers are cos_16 and cos_16+16.
;   Uses m0..m7; r0 is not modified.
297 align 16 | |
298 fft16_sse: | |
299 mova m0, Z(0) | |
300 mova m1, Z(1) | |
301 T4_SSE m0, m1, m2 | |
302 mova m2, Z(2) | |
303 mova m3, Z(3) | |
304 T8_SSE m0, m1, m2, m3, m4, m5 | |
; The loads of Z(4), Z(5) are issued before the stores of the first half.
305 mova m4, Z(4) | |
306 mova m5, Z(5) | |
307 mova Z(0), m0 | |
308 mova Z(1), m1 | |
309 mova Z(2), m2 | |
310 mova Z(3), m3 | |
311 T4_SSE m4, m5, m6 | |
312 mova m6, Z(6) | |
313 mova m7, Z(7) | |
; m0 is free again here (already stored), so it serves as the T4_SSE scratch.
314 T4_SSE m6, m7, m0 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
315 PASS_SMALL 0, [cos_16], [cos_16+16] |
8430 | 316 ret |
317 | |
318 | |
319 INIT_MMX | |
320 | |
; FFT48_3DN suffix
;   Emits the two fixed-size 3DNow! kernels fft4<suffix> and fft8<suffix>.
;   Both work in place on the buffer at r0 through Z(x); r0 is not modified.
;   The macro is expanded twice in this file: once as _3dn2 before the pswapd
;   emulation macro is defined, and once as _3dn after it. The pswapd uses in
;   the body therefore assemble differently in the two expansions.
321 %macro FFT48_3DN 1 | |
322 align 16 | |
; fft4<suffix>: transform of Z(0)..Z(3); results are interleaved with PUNPCK
; before being stored.
323 fft4%1: | |
324 T2_3DN m0, m1, Z(0), Z(1) | |
325 mova m2, Z(2) | |
326 mova m3, Z(3) | |
327 T4_3DN m0, m1, m2, m3, m4, m5 | |
328 PUNPCK m0, m1, m4 | |
329 PUNPCK m2, m3, m5 | |
330 mova Z(0), m0 | |
331 mova Z(1), m4 | |
332 mova Z(2), m2 | |
333 mova Z(3), m5 | |
334 ret | |
335 | |
336 align 16 | |
; fft8<suffix>: transform of Z(0)..Z(7). Z(0), Z(2), Z(5) and Z(7) are used as
; spill slots for intermediate values while all eight mm registers are busy.
337 fft8%1: | |
338 T2_3DN m0, m1, Z(0), Z(1) | |
339 mova m2, Z(2) | |
340 mova m3, Z(3) | |
341 T4_3DN m0, m1, m2, m3, m4, m5 | |
; Spill two of the first-half results; they are reloaded before the last T4_3DN.
342 mova Z(0), m0 | |
343 mova Z(2), m2 | |
344 T2_3DN m4, m5, Z(4), Z(5) | |
345 T2_3DN m6, m7, Z(6), Z(7) | |
; Build the 1/sqrt(2)-scaled terms: swap halves, flip the low sign, combine,
; then scale by ps_root2.
346 pswapd m0, m5 | |
347 pswapd m2, m7 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
348 pxor m0, [ps_m1p1] |
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
349 pxor m2, [ps_m1p1] |
8430 | 350 pfsub m5, m0 |
351 pfadd m7, m2 | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
352 pfmul m5, [ps_root2] |
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
353 pfmul m7, [ps_root2] |
8430 | 354 T4_3DN m1, m3, m5, m7, m0, m2 |
355 mova Z(5), m5 | |
356 mova Z(7), m7 | |
357 mova m0, Z(0) | |
358 mova m2, Z(2) | |
359 T4_3DN m0, m2, m4, m6, m5, m7 | |
360 PUNPCK m0, m1, m5 | |
361 PUNPCK m2, m3, m7 | |
362 mova Z(0), m0 | |
363 mova Z(1), m5 | |
364 mova Z(2), m2 | |
365 mova Z(3), m7 | |
; Second half: the partner operands come from the Z(5) / Z(7) spill slots.
366 PUNPCK m4, Z(5), m5 | |
367 PUNPCK m6, Z(7), m7 | |
368 mova Z(4), m4 | |
369 mova Z(5), m5 | |
370 mova Z(6), m6 | |
371 mova Z(7), m7 | |
372 ret | |
373 %endmacro | |
374 | |
375 FFT48_3DN _3dn2 | |
376 | |
; pswapd dst, src -- software replacement for the pswapd instruction (swap the
; two dwords of a 64-bit MMX value). It is defined after the _3dn2 expansion of
; FFT48_3DN and before the _3dn expansion, so only the latter uses it.
;   dst == src : stores the low dword to [r0+12], then punpckhdq with [r0+8]
;                gives {old high, old low}.
;                Side effect: the four bytes at [r0+12] are overwritten.
;   dst != src : dst = src >> 32, then punpckldq with src gives
;                {src high, src low}. No memory is touched.
377 %macro pswapd 2 | |
378 %ifidn %1, %2 | |
379 movd [r0+12], %1 | |
380 punpckhdq %1, [r0+8] | |
381 %else | |
382 movq %1, %2 | |
383 psrlq %1, 32 | |
384 punpckldq %1, %2 | |
385 %endif | |
386 %endmacro | |
387 | |
388 FFT48_3DN _3dn | |
389 | |
390 | |
; Z(x) as used inside the pass loops (replaces the earlier r0-based definition).
;   x&1 selects the first or second register-sized slot (+mmsize).
;   x = 0..5 : base offset is o1q*(x&6), i.e. o1q*0, o1q*2 or o1q*4.
;   x = 6, 7 : (x/6)^1 is 0, so the o1q term drops out and o3q is used instead.
; DECL_PASS sets o1q = n*8 and o3q = n*48 = 6*o1q, so o3q supplies the "times
; six" stride that cannot be written as an addressing-mode scale.
391 %define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)] | |
392 | |
; DECL_PASS name, payload
;   Emits a 16-byte-aligned label <name> wrapping <payload> in a loop.
;   DEFINE_ARGS (x86inc) names the registers z, w, n, o1, o3; presumably these
;   are r0..r4 in that order, matching DECL_FFT which loads r1 and r2 before
;   jumping here -- confirm against x86inc.asm.
;   On entry: z = data pointer, w = multiplier-table pointer, n = loop count.
;   Set-up:   o1 = n*8, o3 = n*3, then shifted left by 4 (= n*48).
;   Each iteration runs the payload, advances z by 2*mmsize and w by mmsize,
;   and subtracts mmsize/8 from n; the loop repeats while n stays above zero
;   (signed compare).
;   z and w are left advanced when the routine returns.
393 %macro DECL_PASS 2+ ; name, payload | |
394 align 16 | |
395 %1: | |
396 DEFINE_ARGS z, w, n, o1, o3 | |
397 lea o3q, [nq*3] | |
398 lea o1q, [nq*8] | |
399 shl o3q, 4 | |
400 .loop: | |
401 %2 | |
402 add zq, mmsize*2 | |
403 add wq, mmsize | |
404 sub nd, mmsize/8 | |
405 jg .loop | |
; NOTE(review): "rep ret" is presumably the two-byte return used to avoid a
; branch-prediction penalty on some AMD CPUs when ret directly follows a
; conditional jump -- confirm.
406 rep ret | |
407 %endmacro | |
408 | |
; Instantiate the pass routines.
; SSE: pass_sse stores split results (PASS_BIG 1); pass_interleave_sse
; interleaves them (PASS_BIG 0).
409 INIT_XMM | |
10452
c6aa538c0bc3
s/movdqa/movaps/ in sse1 fft. (regression in r20293)
lorenm
parents:
10019
diff
changeset
|
410 %define mova movaps |
8430 | 411 DECL_PASS pass_sse, PASS_BIG 1 |
412 DECL_PASS pass_interleave_sse, PASS_BIG 0 | |
413 | |
; 3DNow!: the SSE mnemonics used inside PASS_SMALL / PASS_BIG are redefined to
; their 64-bit 3DNow!/MMX counterparts, so the same macro bodies assemble for
; mm registers.
414 INIT_MMX | |
415 %define mulps pfmul | |
416 %define addps pfadd | |
417 %define subps pfsub | |
418 %define unpcklps punpckldq | |
419 %define unpckhps punpckhdq | |
; The non-interleaving 3DNow! pass uses PASS_SMALL (load = 1) with the same
; [wq] / [wq+o1q] operands that PASS_BIG hard-codes.
420 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q] | |
421 DECL_PASS pass_interleave_3dn, PASS_BIG 0 | |
; The _3dn2 variants reuse the _3dn pass routines unchanged.
422 %define pass_3dn2 pass_3dn | |
423 %define pass_interleave_3dn2 pass_interleave_3dn | |
424 | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
; SECTION_REL is appended to every label placed in the dispatch table.
;   PIC     : expands to "- $$", so each entry is the label's offset from the
;             start of the current section; fft_dispatch adds the section base
;             back at run time.
;   non-PIC : expands to nothing, so each entry is the label's address.
425 %ifdef PIC |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
426 %define SECTION_REL - $$ |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
427 %else |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
428 %define SECTION_REL |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
429 %endif |
8430 | 430 |
; DECL_FFT nbits, cpu [, suffix]
;   nbits  : log2 of the first size to generate; sizes below it must already
;            exist as fixed kernels (fft4/fft8, plus fft16 when nbits == 5).
;   cpu    : label suffix of the instruction-set variant (_sse, _3dn, _3dn2).
;   suffix : optional extra label suffix (_interleave) selecting which pass
;            routine finishes each generated size.
;   Emits, for this cpu/suffix combination:
;     - fft<n><suffix><cpu> for n = 1<<nbits up to 65536 (17-nbits functions);
;     - dispatch_tab<suffix><cpu>, a table of entry points whose first entry
;       is fft4;
;     - the exported fft_dispatch<suffix><cpu>(z, nbits), which indexes the
;       table with nbits-2.
431 %macro DECL_FFT 2-3 ; nbits, cpu, suffix | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
; The table always starts with the fixed-size kernels. They carry only the cpu
; suffix, so the _interleave variant shares them with the plain one.
432 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL |
8430 | 433 %if %1==5 |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
434 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL |
8430 | 435 %endif |
436 | |
437 %assign n 1<<%1 | |
438 %rep 17-%1 | |
439 %assign n2 n/2 | |
440 %assign n4 n/4 | |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
441 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL |
8430 | 442 |
443 align 16 | |
; Size-n transform: one half-size and two quarter-size sub-transforms (always
; the non-suffixed variants), followed by a tail jump into the pass routine.
; The mask -2<<%1 clears the low %1+1 bits, so (n & mask) is 0 only when
; n == 1<<%1 and equals n otherwise; likewise for n2. The r0 adjustments thus
; differ by whether the callee was a fixed kernel or a generated function.
; NOTE(review): presumably because generated functions return with r0 already
; advanced by their pass loop, while the fixed kernels leave r0 untouched.
444 fft %+ n %+ %3%2: | |
445 call fft %+ n2 %+ %2 | |
446 add r0, n*4 - (n&(-2<<%1)) | |
447 call fft %+ n4 %+ %2 | |
448 add r0, n*2 - (n2&(-2<<%1)) | |
449 call fft %+ n4 %+ %2 | |
450 sub r0, n*6 + (n2&(-2<<%1)) | |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
; Arguments for the pass: r1 = multiplier table for this size, r2d = count.
451 lea r1, [cos_ %+ n] |
8430 | 452 mov r2d, n4/2 |
453 jmp pass%3%2 | |
454 | |
455 %assign n n*2 | |
456 %endrep | |
457 %undef n | |
458 | |
; Jump table; entry k corresponds to nbits = k+2. Entries are pointer-sized
; ("pointer" is dd or dq) and section-relative when PIC is defined.
459 align 8 | |
460 dispatch_tab%3%2: pointer list_of_fft | |
461 | |
8820
7400956a815d
Put dispatch_tab in the rodata section for macho64.
astrange
parents:
8430
diff
changeset
|
462 section .text |
7400956a815d
Put dispatch_tab in the rodata section for macho64.
astrange
parents:
8430
diff
changeset
|
463 |
8430 | 464 ; On x86_32, this function does the register saving and restoring for all of fft. |
465 ; The others pass args in registers and don't spill anything. | |
10019
c08ca946c80a
Update x264 asm code to latest to add support for 64-bit Windows.
darkshikari
parents:
8820
diff
changeset
|
; cglobal is the x86inc function prologue macro. NOTE(review): the numeric
; arguments are presumably "2 arguments, 5 GPRs, 8 vector registers" -- confirm
; against x86inc.asm.
466 cglobal fft_dispatch%3%2, 2,5,8, z, nbits |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
; Fetch table entry (nbits-2); gprsize scales the index by the pointer width.
467 lea r2, [dispatch_tab%3%2] |
8430 | 468 mov r2, [r2 + (nbitsq-2)*gprsize] |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
; With PIC the entry is an offset from the section start; add the base back.
469 %ifdef PIC |
11931
980030a3e315
Update x264asm header files to latest versions.
darkshikari
parents:
11060
diff
changeset
|
470 lea r3, [$$] |
11060
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
471 add r2, r3 |
daff45175333
Make the jump-table section-relative for x86_64 with PIC enabled.
reimar
parents:
10452
diff
changeset
|
472 %endif |
8430 | 473 call r2 |
474 RET | |
475 %endmacro ; DECL_FFT | |
476 | |
; Instantiate the dispatchers and recursive transforms for every variant.
; SSE starts generating at 32 points (nbits = 5) because fft16_sse is
; hand-written. The 3DNow! variants have only fft4/fft8 hand-written, so they
; start generating at 16 points (nbits = 4).
; Each variant is emitted twice: plain and with the _interleave suffix.
477 DECL_FFT 5, _sse | |
478 DECL_FFT 5, _sse, _interleave | |
479 DECL_FFT 4, _3dn | |
480 DECL_FFT 4, _3dn, _interleave | |
481 DECL_FFT 4, _3dn2 | |
482 DECL_FFT 4, _3dn2, _interleave | |
483 |