Mercurial > libavcodec.hg
comparison ppc/fft_altivec_s.S @ 12085:8454bb880008 libavcodec
PPC: add _interleave versions of fft{4,8,16}_altivec
This removes the need for a post-swizzle with the small FFTs.
author | mru |
---|---|
date | Sat, 03 Jul 2010 18:36:10 +0000 |
parents | ae57be2ef58c |
children | bb603bb20873 |
comparison
equal
deleted
inserted
replaced
12084:b6cf19580e47 | 12085:8454bb880008 |
---|---|
141 .macro BF d0,d1,s0,s1 | 141 .macro BF d0,d1,s0,s1 |
142 vsubfp \d1,\s0,\s1 | 142 vsubfp \d1,\s0,\s1 |
143 vaddfp \d0,\s0,\s1 | 143 vaddfp \d0,\s0,\s1 |
144 .endm | 144 .endm |
145 | 145 |
146 fft4_altivec: | 146 .macro zip d0,d1,s0,s1 |
147 vmrghw \d0,\s0,\s1 | |
148 vmrglw \d1,\s0,\s1 | |
149 .endm | |
150 | |
151 .macro def_fft4 interleave | |
152 fft4\interleave\()_altivec: | |
147 lvx v0, 0,r3 | 153 lvx v0, 0,r3 |
148 lvx v1,r9,r3 | 154 lvx v1,r9,r3 |
149 FFT4 v0,v1,v2,v3 | 155 FFT4 v0,v1,v2,v3 |
156 .ifnb \interleave | |
157 zip v0,v1,v2,v3 | |
158 stvx v0, 0,r3 | |
159 stvx v1,r9,r3 | |
160 .else | |
150 stvx v2, 0,r3 | 161 stvx v2, 0,r3 |
151 stvx v3,r9,r3 | 162 stvx v3,r9,r3 |
163 .endif | |
152 blr | 164 blr |
153 | 165 .endm |
154 fft8_altivec: | 166 |
167 .macro def_fft8 interleave | |
168 fft8\interleave\()_altivec: | |
155 addi r4,r3,32 | 169 addi r4,r3,32 |
156 lvx v0, 0,r3 | 170 lvx v0, 0,r3 |
157 lvx v1,r9,r3 | 171 lvx v1,r9,r3 |
158 lvx v2, 0,r4 | 172 lvx v2, 0,r4 |
159 lvx v3,r9,r4 | 173 lvx v3,r9,r4 |
160 FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8 | 174 FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8 |
175 .ifnb \interleave | |
176 zip v4,v5,v0,v1 | |
177 zip v6,v7,v2,v3 | |
178 stvx v4, 0,r3 | |
179 stvx v5,r9,r3 | |
180 stvx v6, 0,r4 | |
181 stvx v7,r9,r4 | |
182 .else | |
161 stvx v0, 0,r3 | 183 stvx v0, 0,r3 |
162 stvx v1,r9,r3 | 184 stvx v1,r9,r3 |
163 stvx v2, 0,r4 | 185 stvx v2, 0,r4 |
164 stvx v3,r9,r4 | 186 stvx v3,r9,r4 |
187 .endif | |
165 blr | 188 blr |
166 | 189 .endm |
167 fft16_altivec: | 190 |
191 .macro def_fft16 interleave | |
192 fft16\interleave\()_altivec: | |
168 addi r5,r3,64 | 193 addi r5,r3,64 |
169 addi r6,r3,96 | 194 addi r6,r3,96 |
170 addi r4,r3,32 | 195 addi r4,r3,32 |
171 lvx v0, 0,r5 | 196 lvx v0, 0,r5 |
172 lvx v1,r9,r5 | 197 lvx v1,r9,r5 |
188 vmaddfp v11,v6,v16,v11 // r3*wim | 213 vmaddfp v11,v6,v16,v11 // r3*wim |
189 BF v10,v12,v10,v8 | 214 BF v10,v12,v10,v8 |
190 BF v11,v13,v9,v11 | 215 BF v11,v13,v9,v11 |
191 BF v0,v4,v0,v10 | 216 BF v0,v4,v0,v10 |
192 BF v3,v7,v3,v12 | 217 BF v3,v7,v3,v12 |
218 BF v1,v5,v1,v11 | |
219 BF v2,v6,v2,v13 | |
220 .ifnb \interleave | |
221 zip v8, v9,v0,v1 | |
222 zip v10,v11,v2,v3 | |
223 zip v12,v13,v4,v5 | |
224 zip v14,v15,v6,v7 | |
225 stvx v8, 0,r3 | |
226 stvx v9,r9,r3 | |
227 stvx v10, 0,r4 | |
228 stvx v11,r9,r4 | |
229 stvx v12, 0,r5 | |
230 stvx v13,r9,r5 | |
231 stvx v14, 0,r6 | |
232 stvx v15,r9,r6 | |
233 .else | |
193 stvx v0, 0,r3 | 234 stvx v0, 0,r3 |
194 stvx v4, 0,r5 | 235 stvx v4, 0,r5 |
195 stvx v3,r9,r4 | 236 stvx v3,r9,r4 |
196 stvx v7,r9,r6 | 237 stvx v7,r9,r6 |
197 BF v1,v5,v1,v11 | |
198 BF v2,v6,v2,v13 | |
199 stvx v1,r9,r3 | 238 stvx v1,r9,r3 |
200 stvx v5,r9,r5 | 239 stvx v5,r9,r5 |
201 stvx v2, 0,r4 | 240 stvx v2, 0,r4 |
202 stvx v6, 0,r6 | 241 stvx v6, 0,r6 |
242 .endif | |
203 blr | 243 blr |
244 .endm | |
204 | 245 |
205 // void pass(float *z, float *wre, int n) | 246 // void pass(float *z, float *wre, int n) |
206 .macro PASS interleave, suffix | 247 .macro PASS interleave, suffix |
207 fft_pass\suffix\()_altivec: | 248 fft_pass\suffix\()_altivec: |
208 mtctr r5 | 249 mtctr r5 |
295 b fft_pass\suffix\()_altivec | 336 b fft_pass\suffix\()_altivec |
296 .endm | 337 .endm |
297 | 338 |
298 .macro DECL_FFTS interleave, suffix | 339 .macro DECL_FFTS interleave, suffix |
299 .text | 340 .text |
341 def_fft4 \suffix | |
342 def_fft8 \suffix | |
343 def_fft16 \suffix | |
300 PASS \interleave, \suffix | 344 PASS \interleave, \suffix |
301 DECL_FFT \suffix, 5, 32, 16, 8 | 345 DECL_FFT \suffix, 5, 32, 16, 8 |
302 DECL_FFT \suffix, 6, 64, 32, 16 | 346 DECL_FFT \suffix, 6, 64, 32, 16 |
303 DECL_FFT \suffix, 7, 128, 64, 32 | 347 DECL_FFT \suffix, 7, 128, 64, 32 |
304 DECL_FFT \suffix, 8, 256, 128, 64 | 348 DECL_FFT \suffix, 8, 256, 128, 64 |
312 DECL_FFT \suffix,16,65536,32768,16384 | 356 DECL_FFT \suffix,16,65536,32768,16384 |
313 | 357 |
314 .rodata | 358 .rodata |
315 .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec | 359 .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec |
316 EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec: | 360 EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec: |
317 PTR fft4_altivec | 361 PTR fft4\suffix\()_altivec |
318 PTR fft8_altivec | 362 PTR fft8\suffix\()_altivec |
319 PTR fft16_altivec | 363 PTR fft16\suffix\()_altivec |
320 PTR fft32\suffix\()_altivec | 364 PTR fft32\suffix\()_altivec |
321 PTR fft64\suffix\()_altivec | 365 PTR fft64\suffix\()_altivec |
322 PTR fft128\suffix\()_altivec | 366 PTR fft128\suffix\()_altivec |
323 PTR fft256\suffix\()_altivec | 367 PTR fft256\suffix\()_altivec |
324 PTR fft512\suffix\()_altivec | 368 PTR fft512\suffix\()_altivec |