Mercurial > libavcodec.hg
comparison ppc/fft_altivec_s.S @ 12089:5638941ec8ef libavcodec
PPC: convert Altivec FFT to pure assembler
On PPC, leaf functions have a 288-byte red zone below the stack pointer,
sparing them the chore of setting up a full stack frame.
When a function call is disguised within an inline asm block, the
compiler cannot see it and may treat the enclosing function as a leaf,
leaving the stack pointer unadjusted before the call, so the red zone is clobbered.
Moving the entire function to pure asm avoids this problem and also
results in somewhat better code.
author | mru |
---|---|
date | Sun, 04 Jul 2010 18:33:47 +0000 |
parents | bb603bb20873 |
children | 6f064ab48463 |
comparison
equal
deleted
inserted
replaced
12088:bb603bb20873 | 12089:5638941ec8ef |
---|---|
46 .endif | 46 .endif |
47 .if (\imm+0x8000)>>16 | 47 .if (\imm+0x8000)>>16 |
48 addis \ra, \ra, \imm@ha | 48 addis \ra, \ra, \imm@ha |
49 .endif | 49 .endif |
50 .endm | 50 .endm |
51 | |
52 #if ARCH_PPC64 | |
53 #define PTR .quad | |
54 .macro LOAD_PTR ra, rbase, offset | |
55 ld \ra,(\offset)*8(\rbase) | |
56 .endm | |
57 .macro STORE_PTR ra, rbase, offset | |
58 std \ra,(\offset)*8(\rbase) | |
59 .endm | |
60 #else | |
61 #define PTR .int | |
62 .macro LOAD_PTR ra, rbase, offset | |
63 lwz \ra,(\offset)*4(\rbase) | |
64 .endm | |
65 .macro STORE_PTR ra, rbase, offset | |
66 stw \ra,(\offset)*4(\rbase) | |
67 .endm | |
68 #endif | |
69 | 51 |
70 .macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3 | 52 .macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3 |
71 vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} | 53 vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} |
72 vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} | 54 vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} |
73 vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} | 55 vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} |
312 bdnz 1b | 294 bdnz 1b |
313 sub r3,r3,r5 | 295 sub r3,r3,r5 |
314 blr | 296 blr |
315 .endm | 297 .endm |
316 | 298 |
299 #define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */ | |
300 | |
301 #define WORD_0 0x00,0x01,0x02,0x03 | |
302 #define WORD_1 0x04,0x05,0x06,0x07 | |
303 #define WORD_2 0x08,0x09,0x0a,0x0b | |
304 #define WORD_3 0x0c,0x0d,0x0e,0x0f | |
305 #define WORD_s0 0x10,0x11,0x12,0x13 | |
306 #define WORD_s1 0x14,0x15,0x16,0x17 | |
307 #define WORD_s2 0x18,0x19,0x1a,0x1b | |
308 #define WORD_s3 0x1c,0x1d,0x1e,0x1f | |
309 | |
310 #define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d | |
311 | |
312 .rodata | |
313 .align 4 | |
314 fft_data: | |
315 .float 0, 0, 0, 0 | |
316 .float 1, 0.92387953, M_SQRT1_2, 0.38268343 | |
317 .float 0, 0.38268343, M_SQRT1_2, 0.92387953 | |
318 .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2 | |
319 .float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 | |
320 vcprm(s0,3,2,1) | |
321 vcprm(0,1,s2,s1) | |
322 vcprm(2,3,s0,s3) | |
323 vcprm(2,s3,3,s2) | |
324 vcprm(0,1,s0,s1) | |
325 vcprm(2,3,s2,s3) | |
326 vcprm(2,3,0,1) | |
327 vcprm(1,2,s3,s0) | |
328 vcprm(0,3,s2,s1) | |
329 vcprm(0,2,s1,s3) | |
330 vcprm(1,3,s0,s2) | |
331 | |
332 .macro lvm b, r, regs:vararg | |
333 lvx \r, 0, \b | |
334 addi \b, \b, 16 | |
335 .ifnb \regs | |
336 lvm \b, \regs | |
337 .endif | |
338 .endm | |
339 | |
340 .macro stvm b, r, regs:vararg | |
341 stvx \r, 0, \b | |
342 addi \b, \b, 16 | |
343 .ifnb \regs | |
344 stvm \b, \regs | |
345 .endif | |
346 .endm | |
347 | |
348 .macro fft_calc interleave | |
349 extfunc ff_fft_calc\interleave\()_altivec | |
350 mflr r0 | |
351 stp r0, 2*PS(r1) | |
352 stpu r1, -(160+16*PS)(r1) | |
353 addi r6, r1, 16*PS | |
354 stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 | |
355 mfvrsave r0 | |
356 stw r0, 15*PS(r1) | |
357 li r6, 0xfffffffc | |
358 mtvrsave r6 | |
359 | |
360 movrel r6, fft_data | |
361 lvm r6, v14, v15, v16, v17, v18, v19, v20, v21 | |
362 lvm r6, v22, v23, v24, v25, v26, v27, v28, v29 | |
363 | |
364 li r9, 16 | |
365 movrel r12, X(ff_cos_tabs) | |
366 | |
367 movrel r6, fft_dispatch_tab\interleave\()_altivec | |
368 lwz r3, 0(r3) | |
369 subi r3, r3, 2 | |
370 slwi r3, r3, 2+ARCH_PPC64 | |
371 lpx r3, r3, r6 | |
372 mtctr r3 | |
373 mr r3, r4 | |
374 bctrl | |
375 | |
376 addi r6, r1, 16*PS | |
377 lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 | |
378 lwz r6, 15*PS(r1) | |
379 mtvrsave r6 | |
380 lp r1, 0(r1) | |
381 lp r0, 2*PS(r1) | |
382 mtlr r0 | |
383 blr | |
384 .endm | |
385 | |
317 .macro DECL_FFT suffix, bits, n, n2, n4 | 386 .macro DECL_FFT suffix, bits, n, n2, n4 |
318 fft\n\suffix\()_altivec: | 387 fft\n\suffix\()_altivec: |
319 mflr r0 | 388 mflr r0 |
320 STORE_PTR r0,r1,\bits-5 | 389 stp r0,PS*(\bits-3)(r1) |
321 bl fft\n2\()_altivec | 390 bl fft\n2\()_altivec |
322 addi2 r3,\n*4 | 391 addi2 r3,\n*4 |
323 bl fft\n4\()_altivec | 392 bl fft\n4\()_altivec |
324 addi2 r3,\n*2 | 393 addi2 r3,\n*2 |
325 bl fft\n4\()_altivec | 394 bl fft\n4\()_altivec |
326 addi2 r3,\n*-6 | 395 addi2 r3,\n*-6 |
327 LOAD_PTR r0,r1,\bits-5 | 396 lp r0,PS*(\bits-3)(r1) |
328 LOAD_PTR r4,r12,\bits | 397 lp r4,\bits*PS(r12) |
329 mtlr r0 | 398 mtlr r0 |
330 li r5,\n/16 | 399 li r5,\n/16 |
331 b fft_pass\suffix\()_altivec | 400 b fft_pass\suffix\()_altivec |
332 .endm | 401 .endm |
333 | 402 |
348 DECL_FFT \suffix,13, 8192, 4096, 2048 | 417 DECL_FFT \suffix,13, 8192, 4096, 2048 |
349 DECL_FFT \suffix,14,16384, 8192, 4096 | 418 DECL_FFT \suffix,14,16384, 8192, 4096 |
350 DECL_FFT \suffix,15,32768,16384, 8192 | 419 DECL_FFT \suffix,15,32768,16384, 8192 |
351 DECL_FFT \suffix,16,65536,32768,16384 | 420 DECL_FFT \suffix,16,65536,32768,16384 |
352 | 421 |
422 fft_calc \suffix | |
423 | |
353 .rodata | 424 .rodata |
354 .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec | 425 .align 3 |
355 EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec: | 426 fft_dispatch_tab\suffix\()_altivec: |
356 PTR fft4\suffix\()_altivec | 427 PTR fft4\suffix\()_altivec |
357 PTR fft8\suffix\()_altivec | 428 PTR fft8\suffix\()_altivec |
358 PTR fft16\suffix\()_altivec | 429 PTR fft16\suffix\()_altivec |
359 PTR fft32\suffix\()_altivec | 430 PTR fft32\suffix\()_altivec |
360 PTR fft64\suffix\()_altivec | 431 PTR fft64\suffix\()_altivec |