comparison ppc/fft_altivec_s.S @ 12085:8454bb880008 libavcodec

PPC: add _interleave versions of fft{4,8,16}_altivec. This removes the need for a post-swizzle with the small FFTs.
author mru
date Sat, 03 Jul 2010 18:36:10 +0000
parents ae57be2ef58c
children bb603bb20873
comparison
equal deleted inserted replaced
12084:b6cf19580e47 12085:8454bb880008
141 .macro BF d0,d1,s0,s1 141 .macro BF d0,d1,s0,s1
142 vsubfp \d1,\s0,\s1 142 vsubfp \d1,\s0,\s1
143 vaddfp \d0,\s0,\s1 143 vaddfp \d0,\s0,\s1
144 .endm 144 .endm
145 145
146 fft4_altivec: 146 .macro zip d0,d1,s0,s1
147 vmrghw \d0,\s0,\s1
148 vmrglw \d1,\s0,\s1
149 .endm
150
151 .macro def_fft4 interleave
152 fft4\interleave\()_altivec:
147 lvx v0, 0,r3 153 lvx v0, 0,r3
148 lvx v1,r9,r3 154 lvx v1,r9,r3
149 FFT4 v0,v1,v2,v3 155 FFT4 v0,v1,v2,v3
156 .ifnb \interleave
157 zip v0,v1,v2,v3
158 stvx v0, 0,r3
159 stvx v1,r9,r3
160 .else
150 stvx v2, 0,r3 161 stvx v2, 0,r3
151 stvx v3,r9,r3 162 stvx v3,r9,r3
163 .endif
152 blr 164 blr
153 165 .endm
154 fft8_altivec: 166
167 .macro def_fft8 interleave
168 fft8\interleave\()_altivec:
155 addi r4,r3,32 169 addi r4,r3,32
156 lvx v0, 0,r3 170 lvx v0, 0,r3
157 lvx v1,r9,r3 171 lvx v1,r9,r3
158 lvx v2, 0,r4 172 lvx v2, 0,r4
159 lvx v3,r9,r4 173 lvx v3,r9,r4
160 FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8 174 FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
175 .ifnb \interleave
176 zip v4,v5,v0,v1
177 zip v6,v7,v2,v3
178 stvx v4, 0,r3
179 stvx v5,r9,r3
180 stvx v6, 0,r4
181 stvx v7,r9,r4
182 .else
161 stvx v0, 0,r3 183 stvx v0, 0,r3
162 stvx v1,r9,r3 184 stvx v1,r9,r3
163 stvx v2, 0,r4 185 stvx v2, 0,r4
164 stvx v3,r9,r4 186 stvx v3,r9,r4
187 .endif
165 blr 188 blr
166 189 .endm
167 fft16_altivec: 190
191 .macro def_fft16 interleave
192 fft16\interleave\()_altivec:
168 addi r5,r3,64 193 addi r5,r3,64
169 addi r6,r3,96 194 addi r6,r3,96
170 addi r4,r3,32 195 addi r4,r3,32
171 lvx v0, 0,r5 196 lvx v0, 0,r5
172 lvx v1,r9,r5 197 lvx v1,r9,r5
188 vmaddfp v11,v6,v16,v11 // r3*wim 213 vmaddfp v11,v6,v16,v11 // r3*wim
189 BF v10,v12,v10,v8 214 BF v10,v12,v10,v8
190 BF v11,v13,v9,v11 215 BF v11,v13,v9,v11
191 BF v0,v4,v0,v10 216 BF v0,v4,v0,v10
192 BF v3,v7,v3,v12 217 BF v3,v7,v3,v12
218 BF v1,v5,v1,v11
219 BF v2,v6,v2,v13
220 .ifnb \interleave
221 zip v8, v9,v0,v1
222 zip v10,v11,v2,v3
223 zip v12,v13,v4,v5
224 zip v14,v15,v6,v7
225 stvx v8, 0,r3
226 stvx v9,r9,r3
227 stvx v10, 0,r4
228 stvx v11,r9,r4
229 stvx v12, 0,r5
230 stvx v13,r9,r5
231 stvx v14, 0,r6
232 stvx v15,r9,r6
233 .else
193 stvx v0, 0,r3 234 stvx v0, 0,r3
194 stvx v4, 0,r5 235 stvx v4, 0,r5
195 stvx v3,r9,r4 236 stvx v3,r9,r4
196 stvx v7,r9,r6 237 stvx v7,r9,r6
197 BF v1,v5,v1,v11
198 BF v2,v6,v2,v13
199 stvx v1,r9,r3 238 stvx v1,r9,r3
200 stvx v5,r9,r5 239 stvx v5,r9,r5
201 stvx v2, 0,r4 240 stvx v2, 0,r4
202 stvx v6, 0,r6 241 stvx v6, 0,r6
242 .endif
203 blr 243 blr
244 .endm
204 245
205 // void pass(float *z, float *wre, int n) 246 // void pass(float *z, float *wre, int n)
206 .macro PASS interleave, suffix 247 .macro PASS interleave, suffix
207 fft_pass\suffix\()_altivec: 248 fft_pass\suffix\()_altivec:
208 mtctr r5 249 mtctr r5
295 b fft_pass\suffix\()_altivec 336 b fft_pass\suffix\()_altivec
296 .endm 337 .endm
297 338
298 .macro DECL_FFTS interleave, suffix 339 .macro DECL_FFTS interleave, suffix
299 .text 340 .text
341 def_fft4 \suffix
342 def_fft8 \suffix
343 def_fft16 \suffix
300 PASS \interleave, \suffix 344 PASS \interleave, \suffix
301 DECL_FFT \suffix, 5, 32, 16, 8 345 DECL_FFT \suffix, 5, 32, 16, 8
302 DECL_FFT \suffix, 6, 64, 32, 16 346 DECL_FFT \suffix, 6, 64, 32, 16
303 DECL_FFT \suffix, 7, 128, 64, 32 347 DECL_FFT \suffix, 7, 128, 64, 32
304 DECL_FFT \suffix, 8, 256, 128, 64 348 DECL_FFT \suffix, 8, 256, 128, 64
312 DECL_FFT \suffix,16,65536,32768,16384 356 DECL_FFT \suffix,16,65536,32768,16384
313 357
314 .rodata 358 .rodata
315 .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec 359 .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec
316 EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec: 360 EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec:
317 PTR fft4_altivec 361 PTR fft4\suffix\()_altivec
318 PTR fft8_altivec 362 PTR fft8\suffix\()_altivec
319 PTR fft16_altivec 363 PTR fft16\suffix\()_altivec
320 PTR fft32\suffix\()_altivec 364 PTR fft32\suffix\()_altivec
321 PTR fft64\suffix\()_altivec 365 PTR fft64\suffix\()_altivec
322 PTR fft128\suffix\()_altivec 366 PTR fft128\suffix\()_altivec
323 PTR fft256\suffix\()_altivec 367 PTR fft256\suffix\()_altivec
324 PTR fft512\suffix\()_altivec 368 PTR fft512\suffix\()_altivec