Mercurial > libavcodec.hg
comparison x86/fft_mmx.asm @ 12518:67e7e49058c2 libavcodec
Split and then simplify address generation macro.
Allows nasm to work for this code.
author | reimar |
---|---|
date | Sun, 26 Sep 2010 09:08:11 +0000 |
parents | f61e22f8cf28 |
children |
comparison
equal
deleted
inserted
replaced
12517:be85455cab8f | 12518:67e7e49058c2 |
---|---|
152 mova m0, %2 ; wre | 152 mova m0, %2 ; wre |
153 mova m2, m4 | 153 mova m2, m4 |
154 mova m1, %3 ; wim | 154 mova m1, %3 ; wim |
155 mova m3, m5 | 155 mova m3, m5 |
156 mulps m2, m0 ; r2*wre | 156 mulps m2, m0 ; r2*wre |
157 IF%1 mova m6, Z(6) | 157 IF%1 mova m6, Z2(6) |
158 mulps m3, m1 ; i2*wim | 158 mulps m3, m1 ; i2*wim |
159 IF%1 mova m7, Z(7) | 159 IF%1 mova m7, Z2(7) |
160 mulps m4, m1 ; r2*wim | 160 mulps m4, m1 ; r2*wim |
161 mulps m5, m0 ; i2*wre | 161 mulps m5, m0 ; i2*wre |
162 addps m2, m3 ; r2*wre + i2*wim | 162 addps m2, m3 ; r2*wre + i2*wim |
163 mova m3, m1 | 163 mova m3, m1 |
164 mulps m1, m6 ; r3*wim | 164 mulps m1, m6 ; r3*wim |
181 mova m3, m5 | 181 mova m3, m5 |
182 subps m5, m0 ; t4 | 182 subps m5, m0 ; t4 |
183 mova m4, m6 | 183 mova m4, m6 |
184 subps m6, m5 ; r3 | 184 subps m6, m5 ; r3 |
185 addps m5, m4 ; r1 | 185 addps m5, m4 ; r1 |
186 mova Z(6), m6 | 186 mova Z2(6), m6 |
187 mova Z(2), m5 | 187 mova Z(2), m5 |
188 mova m2, Z(3) | 188 mova m2, Z(3) |
189 addps m3, m0 ; t6 | 189 addps m3, m0 ; t6 |
190 subps m2, m1 ; i3 | 190 subps m2, m1 ; i3 |
191 mova m7, Z(1) | 191 mova m7, Z(1) |
192 addps m1, Z(3) ; i1 | 192 addps m1, Z(3) ; i1 |
193 mova Z(7), m2 | 193 mova Z2(7), m2 |
194 mova Z(3), m1 | 194 mova Z(3), m1 |
195 mova m4, m7 | 195 mova m4, m7 |
196 subps m7, m3 ; i2 | 196 subps m7, m3 ; i2 |
197 addps m3, m4 ; i0 | 197 addps m3, m4 ; i0 |
198 mova Z(5), m7 | 198 mova Z(5), m7 |
206 mova m2, m4 | 206 mova m2, m4 |
207 mova m0, [wq] ; wre | 207 mova m0, [wq] ; wre |
208 mova m3, m5 | 208 mova m3, m5 |
209 mova m1, [wq+o1q] ; wim | 209 mova m1, [wq+o1q] ; wim |
210 mulps m2, m0 ; r2*wre | 210 mulps m2, m0 ; r2*wre |
211 mova m6, Z(6) ; r3 | 211 mova m6, Z2(6) ; r3 |
212 mulps m3, m1 ; i2*wim | 212 mulps m3, m1 ; i2*wim |
213 mova m7, Z(7) ; i3 | 213 mova m7, Z2(7) ; i3 |
214 mulps m4, m1 ; r2*wim | 214 mulps m4, m1 ; r2*wim |
215 mulps m5, m0 ; i2*wre | 215 mulps m5, m0 ; i2*wre |
216 addps m2, m3 ; r2*wre + i2*wim | 216 addps m2, m3 ; r2*wre + i2*wim |
217 mova m3, m1 | 217 mova m3, m1 |
218 mulps m1, m6 ; r3*wim | 218 mulps m1, m6 ; r3*wim |
235 mova m3, m5 | 235 mova m3, m5 |
236 subps m5, m0 ; t4 | 236 subps m5, m0 ; t4 |
237 mova m4, m6 | 237 mova m4, m6 |
238 subps m6, m5 ; r3 | 238 subps m6, m5 ; r3 |
239 addps m5, m4 ; r1 | 239 addps m5, m4 ; r1 |
240 IF%1 mova Z(6), m6 | 240 IF%1 mova Z2(6), m6 |
241 IF%1 mova Z(2), m5 | 241 IF%1 mova Z(2), m5 |
242 mova m2, Z(3) | 242 mova m2, Z(3) |
243 addps m3, m0 ; t6 | 243 addps m3, m0 ; t6 |
244 subps m2, m1 ; i3 | 244 subps m2, m1 ; i3 |
245 mova m7, Z(1) | 245 mova m7, Z(1) |
246 addps m1, Z(3) ; i1 | 246 addps m1, Z(3) ; i1 |
247 IF%1 mova Z(7), m2 | 247 IF%1 mova Z2(7), m2 |
248 IF%1 mova Z(3), m1 | 248 IF%1 mova Z(3), m1 |
249 mova m4, m7 | 249 mova m4, m7 |
250 subps m7, m3 ; i2 | 250 subps m7, m3 ; i2 |
251 addps m3, m4 ; i0 | 251 addps m3, m4 ; i0 |
252 IF%1 mova Z(5), m7 | 252 IF%1 mova Z(5), m7 |
260 unpckhps m0, m2 | 260 unpckhps m0, m2 |
261 mova m1, Z(0) | 261 mova m1, Z(0) |
262 mova m2, Z(4) | 262 mova m2, Z(4) |
263 mova Z(2), m5 | 263 mova Z(2), m5 |
264 mova Z(3), m4 | 264 mova Z(3), m4 |
265 mova Z(6), m6 | 265 mova Z2(6), m6 |
266 mova Z(7), m0 | 266 mova Z2(7), m0 |
267 mova m5, m1 ; r0 | 267 mova m5, m1 ; r0 |
268 mova m4, m2 ; r2 | 268 mova m4, m2 ; r2 |
269 unpcklps m1, m3 | 269 unpcklps m1, m3 |
270 unpckhps m5, m3 | 270 unpckhps m5, m3 |
271 unpcklps m2, m7 | 271 unpcklps m2, m7 |
285 | 285 |
286 INIT_XMM | 286 INIT_XMM |
287 %define mova movaps | 287 %define mova movaps |
288 | 288 |
289 %define Z(x) [r0+mmsize*x] | 289 %define Z(x) [r0+mmsize*x] |
 | 290 %define Z2(x) [r0+mmsize*x] |
290 | 291 |
291 align 16 | 292 align 16 |
292 fft4_sse: | 293 fft4_sse: |
293 mova m0, Z(0) | 294 mova m0, Z(0) |
294 mova m1, Z(1) | 295 mova m1, Z(1) |
324 mova Z(0), m0 | 325 mova Z(0), m0 |
325 mova Z(1), m1 | 326 mova Z(1), m1 |
326 mova Z(2), m2 | 327 mova Z(2), m2 |
327 mova Z(3), m3 | 328 mova Z(3), m3 |
328 T4_SSE m4, m5, m6 | 329 T4_SSE m4, m5, m6 |
329 mova m6, Z(6) | 330 mova m6, Z2(6) |
330 mova m7, Z(7) | 331 mova m7, Z2(7) |
331 T4_SSE m6, m7, m0 | 332 T4_SSE m6, m7, m0 |
332 PASS_SMALL 0, [cos_16], [cos_16+16] | 333 PASS_SMALL 0, [cos_16], [cos_16+16] |
333 ret | 334 ret |
334 | 335 |
335 | 336 |
356 mova m2, Z(2) | 357 mova m2, Z(2) |
357 mova m3, Z(3) | 358 mova m3, Z(3) |
358 T4_3DN m0, m1, m2, m3, m4, m5 | 359 T4_3DN m0, m1, m2, m3, m4, m5 |
359 mova Z(0), m0 | 360 mova Z(0), m0 |
360 mova Z(2), m2 | 361 mova Z(2), m2 |
361 T2_3DN m4, m5, Z(4), Z(5) | 362 T2_3DN m4, m5, Z(4), Z(5) |
362 T2_3DN m6, m7, Z(6), Z(7) | 363 T2_3DN m6, m7, Z2(6), Z2(7) |
363 pswapd m0, m5 | 364 pswapd m0, m5 |
364 pswapd m2, m7 | 365 pswapd m2, m7 |
365 pxor m0, [ps_m1p1] | 366 pxor m0, [ps_m1p1] |
366 pxor m2, [ps_m1p1] | 367 pxor m2, [ps_m1p1] |
367 pfsub m5, m0 | 368 pfsub m5, m0 |
368 pfadd m7, m2 | 369 pfadd m7, m2 |
369 pfmul m5, [ps_root2] | 370 pfmul m5, [ps_root2] |
370 pfmul m7, [ps_root2] | 371 pfmul m7, [ps_root2] |
371 T4_3DN m1, m3, m5, m7, m0, m2 | 372 T4_3DN m1, m3, m5, m7, m0, m2 |
372 mova Z(5), m5 | 373 mova Z(5), m5 |
373 mova Z(7), m7 | 374 mova Z2(7), m7 |
374 mova m0, Z(0) | 375 mova m0, Z(0) |
375 mova m2, Z(2) | 376 mova m2, Z(2) |
376 T4_3DN m0, m2, m4, m6, m5, m7 | 377 T4_3DN m0, m2, m4, m6, m5, m7 |
377 PUNPCK m0, m1, m5 | 378 PUNPCK m0, m1, m5 |
378 PUNPCK m2, m3, m7 | 379 PUNPCK m2, m3, m7 |
379 mova Z(0), m0 | 380 mova Z(0), m0 |
380 mova Z(1), m5 | 381 mova Z(1), m5 |
381 mova Z(2), m2 | 382 mova Z(2), m2 |
382 mova Z(3), m7 | 383 mova Z(3), m7 |
383 PUNPCK m4, Z(5), m5 | 384 PUNPCK m4, Z(5), m5 |
384 PUNPCK m6, Z(7), m7 | 385 PUNPCK m6, Z2(7), m7 |
385 mova Z(4), m4 | 386 mova Z(4), m4 |
386 mova Z(5), m5 | 387 mova Z(5), m5 |
387 mova Z(6), m6 | 388 mova Z2(6), m6 |
388 mova Z(7), m7 | 389 mova Z2(7), m7 |
389 ret | 390 ret |
390 %endmacro | 391 %endmacro |
391 | 392 |
392 FFT48_3DN _3dn2 | 393 FFT48_3DN _3dn2 |
393 | 394 |
403 %endmacro | 404 %endmacro |
404 | 405 |
405 FFT48_3DN _3dn | 406 FFT48_3DN _3dn |
406 | 407 |
407 | 408 |
408 %define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)] | 409 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)] |
 | 410 %define Z2(x) [zq + o3q + mmsize*(x&1)] |
409 | 411 |
410 %macro DECL_PASS 2+ ; name, payload | 412 %macro DECL_PASS 2+ ; name, payload |
411 align 16 | 413 align 16 |
412 %1: | 414 %1: |
413 DEFINE_ARGS z, w, n, o1, o3 | 415 DEFINE_ARGS z, w, n, o1, o3 |