Mercurial > libavcodec.hg
comparison x86/fft_mmx.asm @ 11931:980030a3e315 libavcodec
Update x264asm header files to latest versions.
Modify the asm accordingly.
GLOBAL is now no longer necessary for PIC-compliant loads.
author | darkshikari |
---|---|
date | Wed, 23 Jun 2010 19:20:46 +0000 |
parents | daff45175333 |
children | 6f064ab48463 |
comparison
equal
deleted
inserted
replaced
11930:1e8556438209 | 11931:980030a3e315 |
---|---|
33 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 | 33 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 |
34 ps_m1p1: dd 1<<31, 0 | 34 ps_m1p1: dd 1<<31, 0 |
35 | 35 |
36 %assign i 16 | 36 %assign i 16 |
37 %rep 13 | 37 %rep 13 |
38 cextern ff_cos_ %+ i | 38 cextern cos_ %+ i |
39 %assign i i<<1 | 39 %assign i i<<1 |
40 %endrep | 40 %endrep |
41 | 41 |
42 %ifdef ARCH_X86_64 | 42 %ifdef ARCH_X86_64 |
43 %define pointer dq | 43 %define pointer dq |
62 | 62 |
63 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1 | 63 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1 |
64 mova %5, %3 | 64 mova %5, %3 |
65 pfsub %3, %4 | 65 pfsub %3, %4 |
66 pfadd %5, %4 ; {t6,t5} | 66 pfadd %5, %4 ; {t6,t5} |
67 pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7} | 67 pxor %3, [ps_m1p1] ; {t8,t7} |
68 mova %6, %1 | 68 mova %6, %1 |
69 pswapd %3, %3 | 69 pswapd %3, %3 |
70 pfadd %1, %5 ; {r0,i0} | 70 pfadd %1, %5 ; {r0,i0} |
71 pfsub %6, %5 ; {r2,i2} | 71 pfsub %6, %5 ; {r2,i2} |
72 mova %4, %2 | 72 mova %4, %2 |
103 mova %6, %3 | 103 mova %6, %3 |
104 subps %3, %5 ; {r5,i5,r7,i7} | 104 subps %3, %5 ; {r5,i5,r7,i7} |
105 addps %6, %5 ; {t1,t2,t3,t4} | 105 addps %6, %5 ; {t1,t2,t3,t4} |
106 mova %5, %3 | 106 mova %5, %3 |
107 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} | 107 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} |
108 mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7} | 108 mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} |
109 mulps %5, [ps_root2 GLOBAL] | 109 mulps %5, [ps_root2] |
110 addps %3, %5 ; {t8,t7,ta,t9} | 110 addps %3, %5 ; {t8,t7,ta,t9} |
111 mova %5, %6 | 111 mova %5, %6 |
112 shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | 112 shufps %6, %3, 0x36 ; {t3,t2,t9,t8} |
113 shufps %5, %3, 0x9c ; {t1,t4,t7,ta} | 113 shufps %5, %3, 0x9c ; {t1,t4,t7,ta} |
114 mova %3, %6 | 114 mova %3, %6 |
307 mova Z(3), m3 | 307 mova Z(3), m3 |
308 T4_SSE m4, m5, m6 | 308 T4_SSE m4, m5, m6 |
309 mova m6, Z(6) | 309 mova m6, Z(6) |
310 mova m7, Z(7) | 310 mova m7, Z(7) |
311 T4_SSE m6, m7, m0 | 311 T4_SSE m6, m7, m0 |
312 PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL] | 312 PASS_SMALL 0, [cos_16], [cos_16+16] |
313 ret | 313 ret |
314 | 314 |
315 | 315 |
316 INIT_MMX | 316 INIT_MMX |
317 | 317 |
340 mova Z(2), m2 | 340 mova Z(2), m2 |
341 T2_3DN m4, m5, Z(4), Z(5) | 341 T2_3DN m4, m5, Z(4), Z(5) |
342 T2_3DN m6, m7, Z(6), Z(7) | 342 T2_3DN m6, m7, Z(6), Z(7) |
343 pswapd m0, m5 | 343 pswapd m0, m5 |
344 pswapd m2, m7 | 344 pswapd m2, m7 |
345 pxor m0, [ps_m1p1 GLOBAL] | 345 pxor m0, [ps_m1p1] |
346 pxor m2, [ps_m1p1 GLOBAL] | 346 pxor m2, [ps_m1p1] |
347 pfsub m5, m0 | 347 pfsub m5, m0 |
348 pfadd m7, m2 | 348 pfadd m7, m2 |
349 pfmul m5, [ps_root2 GLOBAL] | 349 pfmul m5, [ps_root2] |
350 pfmul m7, [ps_root2 GLOBAL] | 350 pfmul m7, [ps_root2] |
351 T4_3DN m1, m3, m5, m7, m0, m2 | 351 T4_3DN m1, m3, m5, m7, m0, m2 |
352 mova Z(5), m5 | 352 mova Z(5), m5 |
353 mova Z(7), m7 | 353 mova Z(7), m7 |
354 mova m0, Z(0) | 354 mova m0, Z(0) |
355 mova m2, Z(2) | 355 mova m2, Z(2) |
443 add r0, n*4 - (n&(-2<<%1)) | 443 add r0, n*4 - (n&(-2<<%1)) |
444 call fft %+ n4 %+ %2 | 444 call fft %+ n4 %+ %2 |
445 add r0, n*2 - (n2&(-2<<%1)) | 445 add r0, n*2 - (n2&(-2<<%1)) |
446 call fft %+ n4 %+ %2 | 446 call fft %+ n4 %+ %2 |
447 sub r0, n*6 + (n2&(-2<<%1)) | 447 sub r0, n*6 + (n2&(-2<<%1)) |
448 lea r1, [ff_cos_ %+ n GLOBAL] | 448 lea r1, [cos_ %+ n] |
449 mov r2d, n4/2 | 449 mov r2d, n4/2 |
450 jmp pass%3%2 | 450 jmp pass%3%2 |
451 | 451 |
452 %assign n n*2 | 452 %assign n n*2 |
453 %endrep | 453 %endrep |
459 section .text | 459 section .text |
460 | 460 |
461 ; On x86_32, this function does the register saving and restoring for all of fft. | 461 ; On x86_32, this function does the register saving and restoring for all of fft. |
462 ; The others pass args in registers and don't spill anything. | 462 ; The others pass args in registers and don't spill anything. |
463 cglobal fft_dispatch%3%2, 2,5,8, z, nbits | 463 cglobal fft_dispatch%3%2, 2,5,8, z, nbits |
464 lea r2, [dispatch_tab%3%2 GLOBAL] | 464 lea r2, [dispatch_tab%3%2] |
465 mov r2, [r2 + (nbitsq-2)*gprsize] | 465 mov r2, [r2 + (nbitsq-2)*gprsize] |
466 %ifdef PIC | 466 %ifdef PIC |
467 lea r3, [$$ GLOBAL] | 467 lea r3, [$$] |
468 add r2, r3 | 468 add r2, r3 |
469 %endif | 469 %endif |
470 call r2 | 470 call r2 |
471 RET | 471 RET |
472 %endmacro ; DECL_FFT | 472 %endmacro ; DECL_FFT |