comparison x86/fft_mmx.asm @ 12518:67e7e49058c2 libavcodec

Split the address generation macro into two (Z and Z2) and then simplify it. This allows nasm to work for this code.
author reimar
date Sun, 26 Sep 2010 09:08:11 +0000
parents f61e22f8cf28
children
comparison
equal deleted inserted replaced
12517:be85455cab8f 12518:67e7e49058c2
152 mova m0, %2 ; wre 152 mova m0, %2 ; wre
153 mova m2, m4 153 mova m2, m4
154 mova m1, %3 ; wim 154 mova m1, %3 ; wim
155 mova m3, m5 155 mova m3, m5
156 mulps m2, m0 ; r2*wre 156 mulps m2, m0 ; r2*wre
157 IF%1 mova m6, Z(6) 157 IF%1 mova m6, Z2(6)
158 mulps m3, m1 ; i2*wim 158 mulps m3, m1 ; i2*wim
159 IF%1 mova m7, Z(7) 159 IF%1 mova m7, Z2(7)
160 mulps m4, m1 ; r2*wim 160 mulps m4, m1 ; r2*wim
161 mulps m5, m0 ; i2*wre 161 mulps m5, m0 ; i2*wre
162 addps m2, m3 ; r2*wre + i2*wim 162 addps m2, m3 ; r2*wre + i2*wim
163 mova m3, m1 163 mova m3, m1
164 mulps m1, m6 ; r3*wim 164 mulps m1, m6 ; r3*wim
181 mova m3, m5 181 mova m3, m5
182 subps m5, m0 ; t4 182 subps m5, m0 ; t4
183 mova m4, m6 183 mova m4, m6
184 subps m6, m5 ; r3 184 subps m6, m5 ; r3
185 addps m5, m4 ; r1 185 addps m5, m4 ; r1
186 mova Z(6), m6 186 mova Z2(6), m6
187 mova Z(2), m5 187 mova Z(2), m5
188 mova m2, Z(3) 188 mova m2, Z(3)
189 addps m3, m0 ; t6 189 addps m3, m0 ; t6
190 subps m2, m1 ; i3 190 subps m2, m1 ; i3
191 mova m7, Z(1) 191 mova m7, Z(1)
192 addps m1, Z(3) ; i1 192 addps m1, Z(3) ; i1
193 mova Z(7), m2 193 mova Z2(7), m2
194 mova Z(3), m1 194 mova Z(3), m1
195 mova m4, m7 195 mova m4, m7
196 subps m7, m3 ; i2 196 subps m7, m3 ; i2
197 addps m3, m4 ; i0 197 addps m3, m4 ; i0
198 mova Z(5), m7 198 mova Z(5), m7
206 mova m2, m4 206 mova m2, m4
207 mova m0, [wq] ; wre 207 mova m0, [wq] ; wre
208 mova m3, m5 208 mova m3, m5
209 mova m1, [wq+o1q] ; wim 209 mova m1, [wq+o1q] ; wim
210 mulps m2, m0 ; r2*wre 210 mulps m2, m0 ; r2*wre
211 mova m6, Z(6) ; r3 211 mova m6, Z2(6) ; r3
212 mulps m3, m1 ; i2*wim 212 mulps m3, m1 ; i2*wim
213 mova m7, Z(7) ; i3 213 mova m7, Z2(7) ; i3
214 mulps m4, m1 ; r2*wim 214 mulps m4, m1 ; r2*wim
215 mulps m5, m0 ; i2*wre 215 mulps m5, m0 ; i2*wre
216 addps m2, m3 ; r2*wre + i2*wim 216 addps m2, m3 ; r2*wre + i2*wim
217 mova m3, m1 217 mova m3, m1
218 mulps m1, m6 ; r3*wim 218 mulps m1, m6 ; r3*wim
235 mova m3, m5 235 mova m3, m5
236 subps m5, m0 ; t4 236 subps m5, m0 ; t4
237 mova m4, m6 237 mova m4, m6
238 subps m6, m5 ; r3 238 subps m6, m5 ; r3
239 addps m5, m4 ; r1 239 addps m5, m4 ; r1
240 IF%1 mova Z(6), m6 240 IF%1 mova Z2(6), m6
241 IF%1 mova Z(2), m5 241 IF%1 mova Z(2), m5
242 mova m2, Z(3) 242 mova m2, Z(3)
243 addps m3, m0 ; t6 243 addps m3, m0 ; t6
244 subps m2, m1 ; i3 244 subps m2, m1 ; i3
245 mova m7, Z(1) 245 mova m7, Z(1)
246 addps m1, Z(3) ; i1 246 addps m1, Z(3) ; i1
247 IF%1 mova Z(7), m2 247 IF%1 mova Z2(7), m2
248 IF%1 mova Z(3), m1 248 IF%1 mova Z(3), m1
249 mova m4, m7 249 mova m4, m7
250 subps m7, m3 ; i2 250 subps m7, m3 ; i2
251 addps m3, m4 ; i0 251 addps m3, m4 ; i0
252 IF%1 mova Z(5), m7 252 IF%1 mova Z(5), m7
260 unpckhps m0, m2 260 unpckhps m0, m2
261 mova m1, Z(0) 261 mova m1, Z(0)
262 mova m2, Z(4) 262 mova m2, Z(4)
263 mova Z(2), m5 263 mova Z(2), m5
264 mova Z(3), m4 264 mova Z(3), m4
265 mova Z(6), m6 265 mova Z2(6), m6
266 mova Z(7), m0 266 mova Z2(7), m0
267 mova m5, m1 ; r0 267 mova m5, m1 ; r0
268 mova m4, m2 ; r2 268 mova m4, m2 ; r2
269 unpcklps m1, m3 269 unpcklps m1, m3
270 unpckhps m5, m3 270 unpckhps m5, m3
271 unpcklps m2, m7 271 unpcklps m2, m7
285 285
286 INIT_XMM 286 INIT_XMM
287 %define mova movaps 287 %define mova movaps
288 288
289 %define Z(x) [r0+mmsize*x] 289 %define Z(x) [r0+mmsize*x]
290 %define Z2(x) [r0+mmsize*x]
290 291
291 align 16 292 align 16
292 fft4_sse: 293 fft4_sse:
293 mova m0, Z(0) 294 mova m0, Z(0)
294 mova m1, Z(1) 295 mova m1, Z(1)
324 mova Z(0), m0 325 mova Z(0), m0
325 mova Z(1), m1 326 mova Z(1), m1
326 mova Z(2), m2 327 mova Z(2), m2
327 mova Z(3), m3 328 mova Z(3), m3
328 T4_SSE m4, m5, m6 329 T4_SSE m4, m5, m6
329 mova m6, Z(6) 330 mova m6, Z2(6)
330 mova m7, Z(7) 331 mova m7, Z2(7)
331 T4_SSE m6, m7, m0 332 T4_SSE m6, m7, m0
332 PASS_SMALL 0, [cos_16], [cos_16+16] 333 PASS_SMALL 0, [cos_16], [cos_16+16]
333 ret 334 ret
334 335
335 336
356 mova m2, Z(2) 357 mova m2, Z(2)
357 mova m3, Z(3) 358 mova m3, Z(3)
358 T4_3DN m0, m1, m2, m3, m4, m5 359 T4_3DN m0, m1, m2, m3, m4, m5
359 mova Z(0), m0 360 mova Z(0), m0
360 mova Z(2), m2 361 mova Z(2), m2
361 T2_3DN m4, m5, Z(4), Z(5) 362 T2_3DN m4, m5, Z(4), Z(5)
362 T2_3DN m6, m7, Z(6), Z(7) 363 T2_3DN m6, m7, Z2(6), Z2(7)
363 pswapd m0, m5 364 pswapd m0, m5
364 pswapd m2, m7 365 pswapd m2, m7
365 pxor m0, [ps_m1p1] 366 pxor m0, [ps_m1p1]
366 pxor m2, [ps_m1p1] 367 pxor m2, [ps_m1p1]
367 pfsub m5, m0 368 pfsub m5, m0
368 pfadd m7, m2 369 pfadd m7, m2
369 pfmul m5, [ps_root2] 370 pfmul m5, [ps_root2]
370 pfmul m7, [ps_root2] 371 pfmul m7, [ps_root2]
371 T4_3DN m1, m3, m5, m7, m0, m2 372 T4_3DN m1, m3, m5, m7, m0, m2
372 mova Z(5), m5 373 mova Z(5), m5
373 mova Z(7), m7 374 mova Z2(7), m7
374 mova m0, Z(0) 375 mova m0, Z(0)
375 mova m2, Z(2) 376 mova m2, Z(2)
376 T4_3DN m0, m2, m4, m6, m5, m7 377 T4_3DN m0, m2, m4, m6, m5, m7
377 PUNPCK m0, m1, m5 378 PUNPCK m0, m1, m5
378 PUNPCK m2, m3, m7 379 PUNPCK m2, m3, m7
379 mova Z(0), m0 380 mova Z(0), m0
380 mova Z(1), m5 381 mova Z(1), m5
381 mova Z(2), m2 382 mova Z(2), m2
382 mova Z(3), m7 383 mova Z(3), m7
383 PUNPCK m4, Z(5), m5 384 PUNPCK m4, Z(5), m5
384 PUNPCK m6, Z(7), m7 385 PUNPCK m6, Z2(7), m7
385 mova Z(4), m4 386 mova Z(4), m4
386 mova Z(5), m5 387 mova Z(5), m5
387 mova Z(6), m6 388 mova Z2(6), m6
388 mova Z(7), m7 389 mova Z2(7), m7
389 ret 390 ret
390 %endmacro 391 %endmacro
391 392
392 FFT48_3DN _3dn2 393 FFT48_3DN _3dn2
393 394
403 %endmacro 404 %endmacro
404 405
405 FFT48_3DN _3dn 406 FFT48_3DN _3dn
406 407
407 408
408 %define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)] 409 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
410 %define Z2(x) [zq + o3q + mmsize*(x&1)]
409 411
410 %macro DECL_PASS 2+ ; name, payload 412 %macro DECL_PASS 2+ ; name, payload
411 align 16 413 align 16
412 %1: 414 %1:
413 DEFINE_ARGS z, w, n, o1, o3 415 DEFINE_ARGS z, w, n, o1, o3