comparison x86/fft_mmx.asm @ 11931:980030a3e315 libavcodec

Update x264asm header files to latest versions. Modify the asm accordingly. GLOBAL is now no longer necessary for PIC-compliant loads.
author darkshikari
date Wed, 23 Jun 2010 19:20:46 +0000
parents daff45175333
children 6f064ab48463
comparison
equal deleted inserted replaced
11930:1e8556438209 11931:980030a3e315
33 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 33 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
34 ps_m1p1: dd 1<<31, 0 34 ps_m1p1: dd 1<<31, 0
35 35
36 %assign i 16 36 %assign i 16
37 %rep 13 37 %rep 13
38 cextern ff_cos_ %+ i 38 cextern cos_ %+ i
39 %assign i i<<1 39 %assign i i<<1
40 %endrep 40 %endrep
41 41
42 %ifdef ARCH_X86_64 42 %ifdef ARCH_X86_64
43 %define pointer dq 43 %define pointer dq
62 62
63 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1 63 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
64 mova %5, %3 64 mova %5, %3
65 pfsub %3, %4 65 pfsub %3, %4
66 pfadd %5, %4 ; {t6,t5} 66 pfadd %5, %4 ; {t6,t5}
67 pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7} 67 pxor %3, [ps_m1p1] ; {t8,t7}
68 mova %6, %1 68 mova %6, %1
69 pswapd %3, %3 69 pswapd %3, %3
70 pfadd %1, %5 ; {r0,i0} 70 pfadd %1, %5 ; {r0,i0}
71 pfsub %6, %5 ; {r2,i2} 71 pfsub %6, %5 ; {r2,i2}
72 mova %4, %2 72 mova %4, %2
103 mova %6, %3 103 mova %6, %3
104 subps %3, %5 ; {r5,i5,r7,i7} 104 subps %3, %5 ; {r5,i5,r7,i7}
105 addps %6, %5 ; {t1,t2,t3,t4} 105 addps %6, %5 ; {t1,t2,t3,t4}
106 mova %5, %3 106 mova %5, %3
107 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} 107 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
108 mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7} 108 mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
109 mulps %5, [ps_root2 GLOBAL] 109 mulps %5, [ps_root2]
110 addps %3, %5 ; {t8,t7,ta,t9} 110 addps %3, %5 ; {t8,t7,ta,t9}
111 mova %5, %6 111 mova %5, %6
112 shufps %6, %3, 0x36 ; {t3,t2,t9,t8} 112 shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
113 shufps %5, %3, 0x9c ; {t1,t4,t7,ta} 113 shufps %5, %3, 0x9c ; {t1,t4,t7,ta}
114 mova %3, %6 114 mova %3, %6
307 mova Z(3), m3 307 mova Z(3), m3
308 T4_SSE m4, m5, m6 308 T4_SSE m4, m5, m6
309 mova m6, Z(6) 309 mova m6, Z(6)
310 mova m7, Z(7) 310 mova m7, Z(7)
311 T4_SSE m6, m7, m0 311 T4_SSE m6, m7, m0
312 PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL] 312 PASS_SMALL 0, [cos_16], [cos_16+16]
313 ret 313 ret
314 314
315 315
316 INIT_MMX 316 INIT_MMX
317 317
340 mova Z(2), m2 340 mova Z(2), m2
341 T2_3DN m4, m5, Z(4), Z(5) 341 T2_3DN m4, m5, Z(4), Z(5)
342 T2_3DN m6, m7, Z(6), Z(7) 342 T2_3DN m6, m7, Z(6), Z(7)
343 pswapd m0, m5 343 pswapd m0, m5
344 pswapd m2, m7 344 pswapd m2, m7
345 pxor m0, [ps_m1p1 GLOBAL] 345 pxor m0, [ps_m1p1]
346 pxor m2, [ps_m1p1 GLOBAL] 346 pxor m2, [ps_m1p1]
347 pfsub m5, m0 347 pfsub m5, m0
348 pfadd m7, m2 348 pfadd m7, m2
349 pfmul m5, [ps_root2 GLOBAL] 349 pfmul m5, [ps_root2]
350 pfmul m7, [ps_root2 GLOBAL] 350 pfmul m7, [ps_root2]
351 T4_3DN m1, m3, m5, m7, m0, m2 351 T4_3DN m1, m3, m5, m7, m0, m2
352 mova Z(5), m5 352 mova Z(5), m5
353 mova Z(7), m7 353 mova Z(7), m7
354 mova m0, Z(0) 354 mova m0, Z(0)
355 mova m2, Z(2) 355 mova m2, Z(2)
443 add r0, n*4 - (n&(-2<<%1)) 443 add r0, n*4 - (n&(-2<<%1))
444 call fft %+ n4 %+ %2 444 call fft %+ n4 %+ %2
445 add r0, n*2 - (n2&(-2<<%1)) 445 add r0, n*2 - (n2&(-2<<%1))
446 call fft %+ n4 %+ %2 446 call fft %+ n4 %+ %2
447 sub r0, n*6 + (n2&(-2<<%1)) 447 sub r0, n*6 + (n2&(-2<<%1))
448 lea r1, [ff_cos_ %+ n GLOBAL] 448 lea r1, [cos_ %+ n]
449 mov r2d, n4/2 449 mov r2d, n4/2
450 jmp pass%3%2 450 jmp pass%3%2
451 451
452 %assign n n*2 452 %assign n n*2
453 %endrep 453 %endrep
459 section .text 459 section .text
460 460
461 ; On x86_32, this function does the register saving and restoring for all of fft. 461 ; On x86_32, this function does the register saving and restoring for all of fft.
462 ; The others pass args in registers and don't spill anything. 462 ; The others pass args in registers and don't spill anything.
463 cglobal fft_dispatch%3%2, 2,5,8, z, nbits 463 cglobal fft_dispatch%3%2, 2,5,8, z, nbits
464 lea r2, [dispatch_tab%3%2 GLOBAL] 464 lea r2, [dispatch_tab%3%2]
465 mov r2, [r2 + (nbitsq-2)*gprsize] 465 mov r2, [r2 + (nbitsq-2)*gprsize]
466 %ifdef PIC 466 %ifdef PIC
467 lea r3, [$$ GLOBAL] 467 lea r3, [$$]
468 add r2, r3 468 add r2, r3
469 %endif 469 %endif
470 call r2 470 call r2
471 RET 471 RET
472 %endmacro ; DECL_FFT 472 %endmacro ; DECL_FFT