comparison ppc/fft_altivec_s.S @ 12089:5638941ec8ef libavcodec

PPC: convert Altivec FFT to pure assembler. On PPC, a leaf function has a 288-byte red zone below the stack pointer, sparing these functions the chore of setting up a full stack frame. When a function call is disguised within an inline asm block, the compiler might not adjust the stack pointer as required before the call, resulting in the red zone being clobbered. Moving the entire function to pure asm avoids this problem and also results in somewhat better code.
author mru
date Sun, 04 Jul 2010 18:33:47 +0000
parents bb603bb20873
children 6f064ab48463
comparison
equal deleted inserted replaced
12088:bb603bb20873 12089:5638941ec8ef
46 .endif 46 .endif
47 .if (\imm+0x8000)>>16 47 .if (\imm+0x8000)>>16
48 addis \ra, \ra, \imm@ha 48 addis \ra, \ra, \imm@ha
49 .endif 49 .endif
50 .endm 50 .endm
51
52 #if ARCH_PPC64
53 #define PTR .quad
54 .macro LOAD_PTR ra, rbase, offset
55 ld \ra,(\offset)*8(\rbase)
56 .endm
57 .macro STORE_PTR ra, rbase, offset
58 std \ra,(\offset)*8(\rbase)
59 .endm
60 #else
61 #define PTR .int
62 .macro LOAD_PTR ra, rbase, offset
63 lwz \ra,(\offset)*4(\rbase)
64 .endm
65 .macro STORE_PTR ra, rbase, offset
66 stw \ra,(\offset)*4(\rbase)
67 .endm
68 #endif
69 51
70 .macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3 52 .macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
71 vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} 53 vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
72 vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} 54 vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
73 vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} 55 vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
312 bdnz 1b 294 bdnz 1b
313 sub r3,r3,r5 295 sub r3,r3,r5
314 blr 296 blr
315 .endm 297 .endm
316 298
299 #define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
300
301 #define WORD_0 0x00,0x01,0x02,0x03
302 #define WORD_1 0x04,0x05,0x06,0x07
303 #define WORD_2 0x08,0x09,0x0a,0x0b
304 #define WORD_3 0x0c,0x0d,0x0e,0x0f
305 #define WORD_s0 0x10,0x11,0x12,0x13
306 #define WORD_s1 0x14,0x15,0x16,0x17
307 #define WORD_s2 0x18,0x19,0x1a,0x1b
308 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
309
310 #define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
311
312 .rodata
313 .align 4
314 fft_data:
315 .float 0, 0, 0, 0
316 .float 1, 0.92387953, M_SQRT1_2, 0.38268343
317 .float 0, 0.38268343, M_SQRT1_2, 0.92387953
318 .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
319 .float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
320 vcprm(s0,3,2,1)
321 vcprm(0,1,s2,s1)
322 vcprm(2,3,s0,s3)
323 vcprm(2,s3,3,s2)
324 vcprm(0,1,s0,s1)
325 vcprm(2,3,s2,s3)
326 vcprm(2,3,0,1)
327 vcprm(1,2,s3,s0)
328 vcprm(0,3,s2,s1)
329 vcprm(0,2,s1,s3)
330 vcprm(1,3,s0,s2)
331
332 .macro lvm b, r, regs:vararg
333 lvx \r, 0, \b
334 addi \b, \b, 16
335 .ifnb \regs
336 lvm \b, \regs
337 .endif
338 .endm
339
340 .macro stvm b, r, regs:vararg
341 stvx \r, 0, \b
342 addi \b, \b, 16
343 .ifnb \regs
344 stvm \b, \regs
345 .endif
346 .endm
347
348 .macro fft_calc interleave
349 extfunc ff_fft_calc\interleave\()_altivec
350 mflr r0
351 stp r0, 2*PS(r1)
352 stpu r1, -(160+16*PS)(r1)
353 addi r6, r1, 16*PS
354 stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
355 mfvrsave r0
356 stw r0, 15*PS(r1)
357 li r6, 0xfffffffc
358 mtvrsave r6
359
360 movrel r6, fft_data
361 lvm r6, v14, v15, v16, v17, v18, v19, v20, v21
362 lvm r6, v22, v23, v24, v25, v26, v27, v28, v29
363
364 li r9, 16
365 movrel r12, X(ff_cos_tabs)
366
367 movrel r6, fft_dispatch_tab\interleave\()_altivec
368 lwz r3, 0(r3)
369 subi r3, r3, 2
370 slwi r3, r3, 2+ARCH_PPC64
371 lpx r3, r3, r6
372 mtctr r3
373 mr r3, r4
374 bctrl
375
376 addi r6, r1, 16*PS
377 lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
378 lwz r6, 15*PS(r1)
379 mtvrsave r6
380 lp r1, 0(r1)
381 lp r0, 2*PS(r1)
382 mtlr r0
383 blr
384 .endm
385
317 .macro DECL_FFT suffix, bits, n, n2, n4 386 .macro DECL_FFT suffix, bits, n, n2, n4
318 fft\n\suffix\()_altivec: 387 fft\n\suffix\()_altivec:
319 mflr r0 388 mflr r0
320 STORE_PTR r0,r1,\bits-5 389 stp r0,PS*(\bits-3)(r1)
321 bl fft\n2\()_altivec 390 bl fft\n2\()_altivec
322 addi2 r3,\n*4 391 addi2 r3,\n*4
323 bl fft\n4\()_altivec 392 bl fft\n4\()_altivec
324 addi2 r3,\n*2 393 addi2 r3,\n*2
325 bl fft\n4\()_altivec 394 bl fft\n4\()_altivec
326 addi2 r3,\n*-6 395 addi2 r3,\n*-6
327 LOAD_PTR r0,r1,\bits-5 396 lp r0,PS*(\bits-3)(r1)
328 LOAD_PTR r4,r12,\bits 397 lp r4,\bits*PS(r12)
329 mtlr r0 398 mtlr r0
330 li r5,\n/16 399 li r5,\n/16
331 b fft_pass\suffix\()_altivec 400 b fft_pass\suffix\()_altivec
332 .endm 401 .endm
333 402
348 DECL_FFT \suffix,13, 8192, 4096, 2048 417 DECL_FFT \suffix,13, 8192, 4096, 2048
349 DECL_FFT \suffix,14,16384, 8192, 4096 418 DECL_FFT \suffix,14,16384, 8192, 4096
350 DECL_FFT \suffix,15,32768,16384, 8192 419 DECL_FFT \suffix,15,32768,16384, 8192
351 DECL_FFT \suffix,16,65536,32768,16384 420 DECL_FFT \suffix,16,65536,32768,16384
352 421
422 fft_calc \suffix
423
353 .rodata 424 .rodata
354 .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec 425 .align 3
355 EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec: 426 fft_dispatch_tab\suffix\()_altivec:
356 PTR fft4\suffix\()_altivec 427 PTR fft4\suffix\()_altivec
357 PTR fft8\suffix\()_altivec 428 PTR fft8\suffix\()_altivec
358 PTR fft16\suffix\()_altivec 429 PTR fft16\suffix\()_altivec
359 PTR fft32\suffix\()_altivec 430 PTR fft32\suffix\()_altivec
360 PTR fft64\suffix\()_altivec 431 PTR fft64\suffix\()_altivec