Mercurial > mplayer.hg
comparison liba52/liba52_changes.diff @ 19373:87dddfc314fd
Update for changes from r19378 (ASMALIGN macro).
author | diego |
---|---|
date | Sun, 13 Aug 2006 00:44:42 +0000 |
parents | 9960f9ef96fd |
children | 212cf6625a47 |
comparison
equal
deleted
inserted
replaced
19372:6334c14b38eb | 19373:87dddfc314fd |
---|---|
208 return a52_bitstream_get_bh_2 (state, num_bits); | 208 return a52_bitstream_get_bh_2 (state, num_bits); |
209 +#endif | 209 +#endif |
210 } | 210 } |
211 --- liba52/downmix.c 2006-06-12 15:17:53.000000000 +0200 | 211 --- liba52/downmix.c 2006-06-12 15:17:53.000000000 +0200 |
212 +++ liba52/downmix.c 2006-06-05 02:23:02.000000000 +0200 | 212 +++ liba52/downmix.c 2006-06-05 02:23:02.000000000 +0200 |
213 @@ -19,18 +23,47 @@ | 213 @@ -19,18 +23,46 @@ |
214 * You should have received a copy of the GNU General Public License | 214 * You should have received a copy of the GNU General Public License |
215 * along with this program; if not, write to the Free Software | 215 * along with this program; if not, write to the Free Software |
216 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 216 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
217 + * | 217 + * |
218 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) | 218 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) |
219 */ | 219 */ |
220 | 220 |
221 #include "config.h" | 221 #include "config.h" |
222 +#include "asmalign.h" | |
223 | 222 |
224 #include <string.h> | 223 #include <string.h> |
225 #include <inttypes.h> | 224 #include <inttypes.h> |
226 | 225 |
227 #include "a52.h" | 226 #include "a52.h" |
256 +} | 255 +} |
257 + | 256 + |
258 int a52_downmix_init (int input, int flags, sample_t * level, | 257 int a52_downmix_init (int input, int flags, sample_t * level, |
259 sample_t clev, sample_t slev) | 258 sample_t clev, sample_t slev) |
260 { | 259 { |
261 @@ -447,7 +480,7 @@ | 260 @@ -447,7 +479,7 @@ |
262 samples[i] = 0; | 261 samples[i] = 0; |
263 } | 262 } |
264 | 263 |
265 -void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias, | 264 -void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias, |
266 +void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, | 265 +void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, |
267 sample_t clev, sample_t slev) | 266 sample_t clev, sample_t slev) |
268 { | 267 { |
269 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | 268 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { |
270 @@ -559,7 +592,7 @@ | 269 @@ -559,7 +591,7 @@ |
271 break; | 270 break; |
272 | 271 |
273 case CONVERT (A52_3F2R, A52_2F1R): | 272 case CONVERT (A52_3F2R, A52_2F1R): |
274 - mix3to2 (samples, bias); | 273 - mix3to2 (samples, bias); |
275 + mix3to2 (samples, bias); //FIXME possible bug? (output doesnt seem to be used) | 274 + mix3to2 (samples, bias); //FIXME possible bug? (output doesnt seem to be used) |
276 move2to1 (samples + 768, samples + 512, bias); | 275 move2to1 (samples + 768, samples + 512, bias); |
277 break; | 276 break; |
278 | 277 |
279 @@ -583,12 +616,12 @@ | 278 @@ -583,12 +615,12 @@ |
280 break; | 279 break; |
281 | 280 |
282 case CONVERT (A52_3F1R, A52_3F2R): | 281 case CONVERT (A52_3F1R, A52_3F2R): |
283 - memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t)); | 282 - memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t)); |
284 + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); | 283 + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); |
289 -void a52_upmix (sample_t * samples, int acmod, int output) | 288 -void a52_upmix (sample_t * samples, int acmod, int output) |
290 +void upmix_C (sample_t * samples, int acmod, int output) | 289 +void upmix_C (sample_t * samples, int acmod, int output) |
291 { | 290 { |
292 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | 291 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { |
293 | 292 |
294 @@ -653,3 +686,1137 @@ | 293 @@ -653,3 +685,1137 @@ |
295 goto mix_31to21; | 294 goto mix_31to21; |
296 } | 295 } |
297 } | 296 } |
298 + | 297 + |
299 +#if defined(ARCH_X86) || defined(ARCH_X86_64) | 298 +#if defined(ARCH_X86) || defined(ARCH_X86_64) |
301 +{ | 300 +{ |
302 + asm volatile( | 301 + asm volatile( |
303 + "movlps %2, %%xmm7 \n\t" | 302 + "movlps %2, %%xmm7 \n\t" |
304 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 303 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
305 + "mov $-1024, %%"REG_S" \n\t" | 304 + "mov $-1024, %%"REG_S" \n\t" |
306 + ASMALIGN16 | 305 + ASMALIGN(4) |
307 + "1: \n\t" | 306 + "1: \n\t" |
308 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" | 307 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
309 + "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" | 308 + "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" |
310 + "addps (%1, %%"REG_S"), %%xmm0 \n\t" | 309 + "addps (%1, %%"REG_S"), %%xmm0 \n\t" |
311 + "addps 16(%1, %%"REG_S"), %%xmm1\n\t" | 310 + "addps 16(%1, %%"REG_S"), %%xmm1\n\t" |
324 +{ | 323 +{ |
325 + asm volatile( | 324 + asm volatile( |
326 + "movlps %1, %%xmm7 \n\t" | 325 + "movlps %1, %%xmm7 \n\t" |
327 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 326 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
328 + "mov $-1024, %%"REG_S" \n\t" | 327 + "mov $-1024, %%"REG_S" \n\t" |
329 + ASMALIGN16 | 328 + ASMALIGN(4) |
330 + "1: \n\t" | 329 + "1: \n\t" |
331 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" | 330 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
332 + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" | 331 + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" |
333 + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" | 332 + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" |
334 + "addps %%xmm7, %%xmm1 \n\t" | 333 + "addps %%xmm7, %%xmm1 \n\t" |
345 +{ | 344 +{ |
346 + asm volatile( | 345 + asm volatile( |
347 + "movlps %1, %%xmm7 \n\t" | 346 + "movlps %1, %%xmm7 \n\t" |
348 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 347 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
349 + "mov $-1024, %%"REG_S" \n\t" | 348 + "mov $-1024, %%"REG_S" \n\t" |
350 + ASMALIGN16 | 349 + ASMALIGN(4) |
351 + "1: \n\t" | 350 + "1: \n\t" |
352 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" | 351 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
353 + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" | 352 + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" |
354 + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" | 353 + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" |
355 + "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" | 354 + "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" |
367 +{ | 366 +{ |
368 + asm volatile( | 367 + asm volatile( |
369 + "movlps %1, %%xmm7 \n\t" | 368 + "movlps %1, %%xmm7 \n\t" |
370 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 369 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
371 + "mov $-1024, %%"REG_S" \n\t" | 370 + "mov $-1024, %%"REG_S" \n\t" |
372 + ASMALIGN16 | 371 + ASMALIGN(4) |
373 + "1: \n\t" | 372 + "1: \n\t" |
374 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" | 373 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
375 + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" | 374 + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" |
376 + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" | 375 + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" |
377 + "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" | 376 + "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" |
390 +{ | 389 +{ |
391 + asm volatile( | 390 + asm volatile( |
392 + "movlps %1, %%xmm7 \n\t" | 391 + "movlps %1, %%xmm7 \n\t" |
393 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 392 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
394 + "mov $-1024, %%"REG_S" \n\t" | 393 + "mov $-1024, %%"REG_S" \n\t" |
395 + ASMALIGN16 | 394 + ASMALIGN(4) |
396 + "1: \n\t" | 395 + "1: \n\t" |
397 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" | 396 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
398 + "addps %%xmm7, %%xmm0 \n\t" //common | 397 + "addps %%xmm7, %%xmm0 \n\t" //common |
399 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" | 398 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
400 + "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" | 399 + "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" |
413 +{ | 412 +{ |
414 + asm volatile( | 413 + asm volatile( |
415 + "movlps %2, %%xmm7 \n\t" | 414 + "movlps %2, %%xmm7 \n\t" |
416 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 415 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
417 + "mov $-1024, %%"REG_S" \n\t" | 416 + "mov $-1024, %%"REG_S" \n\t" |
418 + ASMALIGN16 | 417 + ASMALIGN(4) |
419 + "1: \n\t" | 418 + "1: \n\t" |
420 + "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t" | 419 + "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t" |
421 + "addps %%xmm7, %%xmm0 \n\t" //common | 420 + "addps %%xmm7, %%xmm0 \n\t" //common |
422 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" | 421 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
423 + "movaps (%1, %%"REG_S"), %%xmm2 \n\t" | 422 + "movaps (%1, %%"REG_S"), %%xmm2 \n\t" |
436 +{ | 435 +{ |
437 + asm volatile( | 436 + asm volatile( |
438 + "movlps %1, %%xmm7 \n\t" | 437 + "movlps %1, %%xmm7 \n\t" |
439 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 438 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
440 + "mov $-1024, %%"REG_S" \n\t" | 439 + "mov $-1024, %%"REG_S" \n\t" |
441 + ASMALIGN16 | 440 + ASMALIGN(4) |
442 + "1: \n\t" | 441 + "1: \n\t" |
443 + "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" // surround | 442 + "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" // surround |
444 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" | 443 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
445 + "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" | 444 + "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" |
446 + "addps %%xmm7, %%xmm1 \n\t" | 445 + "addps %%xmm7, %%xmm1 \n\t" |
460 +{ | 459 +{ |
461 + asm volatile( | 460 + asm volatile( |
462 + "movlps %1, %%xmm7 \n\t" | 461 + "movlps %1, %%xmm7 \n\t" |
463 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 462 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
464 + "mov $-1024, %%"REG_S" \n\t" | 463 + "mov $-1024, %%"REG_S" \n\t" |
465 + ASMALIGN16 | 464 + ASMALIGN(4) |
466 + "1: \n\t" | 465 + "1: \n\t" |
467 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" | 466 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
468 + "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" | 467 + "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" |
469 + "addps %%xmm7, %%xmm0 \n\t" // common | 468 + "addps %%xmm7, %%xmm0 \n\t" // common |
470 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" | 469 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
484 +{ | 483 +{ |
485 + asm volatile( | 484 + asm volatile( |
486 + "movlps %1, %%xmm7 \n\t" | 485 + "movlps %1, %%xmm7 \n\t" |
487 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 486 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
488 + "mov $-1024, %%"REG_S" \n\t" | 487 + "mov $-1024, %%"REG_S" \n\t" |
489 + ASMALIGN16 | 488 + ASMALIGN(4) |
490 + "1: \n\t" | 489 + "1: \n\t" |
491 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" | 490 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
492 + "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround | 491 + "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround |
493 + "addps %%xmm7, %%xmm0 \n\t" // common | 492 + "addps %%xmm7, %%xmm0 \n\t" // common |
494 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" | 493 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
510 +{ | 509 +{ |
511 + asm volatile( | 510 + asm volatile( |
512 + "movlps %1, %%xmm7 \n\t" | 511 + "movlps %1, %%xmm7 \n\t" |
513 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 512 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
514 + "mov $-1024, %%"REG_S" \n\t" | 513 + "mov $-1024, %%"REG_S" \n\t" |
515 + ASMALIGN16 | 514 + ASMALIGN(4) |
516 + "1: \n\t" | 515 + "1: \n\t" |
517 + "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" | 516 + "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" |
518 + "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround | 517 + "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround |
519 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" | 518 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
520 + "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" | 519 + "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" |
535 +{ | 534 +{ |
536 + asm volatile( | 535 + asm volatile( |
537 + "movlps %1, %%xmm7 \n\t" | 536 + "movlps %1, %%xmm7 \n\t" |
538 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 537 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
539 + "mov $-1024, %%"REG_S" \n\t" | 538 + "mov $-1024, %%"REG_S" \n\t" |
540 + ASMALIGN16 | 539 + ASMALIGN(4) |
541 + "1: \n\t" | 540 + "1: \n\t" |
542 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" | 541 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
543 + "addps %%xmm7, %%xmm0 \n\t" // common | 542 + "addps %%xmm7, %%xmm0 \n\t" // common |
544 + "movaps %%xmm0, %%xmm1 \n\t" // common | 543 + "movaps %%xmm0, %%xmm1 \n\t" // common |
545 + "addps (%0, %%"REG_S"), %%xmm0 \n\t" | 544 + "addps (%0, %%"REG_S"), %%xmm0 \n\t" |
559 +{ | 558 +{ |
560 + asm volatile( | 559 + asm volatile( |
561 + "movlps %1, %%xmm7 \n\t" | 560 + "movlps %1, %%xmm7 \n\t" |
562 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 561 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
563 + "mov $-1024, %%"REG_S" \n\t" | 562 + "mov $-1024, %%"REG_S" \n\t" |
564 + ASMALIGN16 | 563 + ASMALIGN(4) |
565 + "1: \n\t" | 564 + "1: \n\t" |
566 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" | 565 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
567 + "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t" | 566 + "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t" |
568 + "addps %%xmm7, %%xmm0 \n\t" // common | 567 + "addps %%xmm7, %%xmm0 \n\t" // common |
569 + "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround | 568 + "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround |
586 +{ | 585 +{ |
587 + asm volatile( | 586 + asm volatile( |
588 + "movlps %2, %%xmm7 \n\t" | 587 + "movlps %2, %%xmm7 \n\t" |
589 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | 588 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" |
590 + "mov $-1024, %%"REG_S" \n\t" | 589 + "mov $-1024, %%"REG_S" \n\t" |
591 + ASMALIGN16 | 590 + ASMALIGN(4) |
592 + "1: \n\t" | 591 + "1: \n\t" |
593 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" | 592 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
594 + "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" | 593 + "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" |
595 + "addps 1024(%0, %%"REG_S"), %%xmm0\n\t" | 594 + "addps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
596 + "addps 1040(%0, %%"REG_S"), %%xmm1\n\t" | 595 + "addps 1040(%0, %%"REG_S"), %%xmm1\n\t" |
608 +static void zero_MMX(sample_t * samples) | 607 +static void zero_MMX(sample_t * samples) |
609 +{ | 608 +{ |
610 + asm volatile( | 609 + asm volatile( |
611 + "mov $-1024, %%"REG_S" \n\t" | 610 + "mov $-1024, %%"REG_S" \n\t" |
612 + "pxor %%mm0, %%mm0 \n\t" | 611 + "pxor %%mm0, %%mm0 \n\t" |
613 + ASMALIGN16 | 612 + ASMALIGN(4) |
614 + "1: \n\t" | 613 + "1: \n\t" |
615 + "movq %%mm0, (%0, %%"REG_S") \n\t" | 614 + "movq %%mm0, (%0, %%"REG_S") \n\t" |
616 + "movq %%mm0, 8(%0, %%"REG_S") \n\t" | 615 + "movq %%mm0, 8(%0, %%"REG_S") \n\t" |
617 + "movq %%mm0, 16(%0, %%"REG_S") \n\t" | 616 + "movq %%mm0, 16(%0, %%"REG_S") \n\t" |
618 + "movq %%mm0, 24(%0, %%"REG_S") \n\t" | 617 + "movq %%mm0, 24(%0, %%"REG_S") \n\t" |
868 +{ | 867 +{ |
869 + asm volatile( | 868 + asm volatile( |
870 + "movd %2, %%mm7 \n\t" | 869 + "movd %2, %%mm7 \n\t" |
871 + "punpckldq %2, %%mm7 \n\t" | 870 + "punpckldq %2, %%mm7 \n\t" |
872 + "mov $-1024, %%"REG_S" \n\t" | 871 + "mov $-1024, %%"REG_S" \n\t" |
873 + ASMALIGN16 | 872 + ASMALIGN(4) |
874 + "1: \n\t" | 873 + "1: \n\t" |
875 + "movq (%0, %%"REG_S"), %%mm0 \n\t" | 874 + "movq (%0, %%"REG_S"), %%mm0 \n\t" |
876 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" | 875 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" |
877 + "movq 16(%0, %%"REG_S"), %%mm2 \n\t" | 876 + "movq 16(%0, %%"REG_S"), %%mm2 \n\t" |
878 + "movq 24(%0, %%"REG_S"), %%mm3 \n\t" | 877 + "movq 24(%0, %%"REG_S"), %%mm3 \n\t" |
899 +{ | 898 +{ |
900 + asm volatile( | 899 + asm volatile( |
901 + "movd %1, %%mm7 \n\t" | 900 + "movd %1, %%mm7 \n\t" |
902 + "punpckldq %1, %%mm7 \n\t" | 901 + "punpckldq %1, %%mm7 \n\t" |
903 + "mov $-1024, %%"REG_S" \n\t" | 902 + "mov $-1024, %%"REG_S" \n\t" |
904 + ASMALIGN16 | 903 + ASMALIGN(4) |
905 + "1: \n\t" | 904 + "1: \n\t" |
906 + "movq (%0, %%"REG_S"), %%mm0 \n\t" | 905 + "movq (%0, %%"REG_S"), %%mm0 \n\t" |
907 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" | 906 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" |
908 + "movq 1024(%0, %%"REG_S"), %%mm2\n\t" | 907 + "movq 1024(%0, %%"REG_S"), %%mm2\n\t" |
909 + "movq 1032(%0, %%"REG_S"), %%mm3\n\t" | 908 + "movq 1032(%0, %%"REG_S"), %%mm3\n\t" |
926 +{ | 925 +{ |
927 + asm volatile( | 926 + asm volatile( |
928 + "movd %1, %%mm7 \n\t" | 927 + "movd %1, %%mm7 \n\t" |
929 + "punpckldq %1, %%mm7 \n\t" | 928 + "punpckldq %1, %%mm7 \n\t" |
930 + "mov $-1024, %%"REG_S" \n\t" | 929 + "mov $-1024, %%"REG_S" \n\t" |
931 + ASMALIGN16 | 930 + ASMALIGN(4) |
932 + "1: \n\t" | 931 + "1: \n\t" |
933 + "movq (%0, %%"REG_S"), %%mm0 \n\t" | 932 + "movq (%0, %%"REG_S"), %%mm0 \n\t" |
934 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" | 933 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" |
935 + "movq 1024(%0, %%"REG_S"), %%mm2\n\t" | 934 + "movq 1024(%0, %%"REG_S"), %%mm2\n\t" |
936 + "movq 1032(%0, %%"REG_S"), %%mm3\n\t" | 935 + "movq 1032(%0, %%"REG_S"), %%mm3\n\t" |
955 +{ | 954 +{ |
956 + asm volatile( | 955 + asm volatile( |
957 + "movd %1, %%mm7 \n\t" | 956 + "movd %1, %%mm7 \n\t" |
958 + "punpckldq %1, %%mm7 \n\t" | 957 + "punpckldq %1, %%mm7 \n\t" |
959 + "mov $-1024, %%"REG_S" \n\t" | 958 + "mov $-1024, %%"REG_S" \n\t" |
960 + ASMALIGN16 | 959 + ASMALIGN(4) |
961 + "1: \n\t" | 960 + "1: \n\t" |
962 + "movq (%0, %%"REG_S"), %%mm0 \n\t" | 961 + "movq (%0, %%"REG_S"), %%mm0 \n\t" |
963 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" | 962 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" |
964 + "movq 1024(%0, %%"REG_S"), %%mm2\n\t" | 963 + "movq 1024(%0, %%"REG_S"), %%mm2\n\t" |
965 + "movq 1032(%0, %%"REG_S"), %%mm3\n\t" | 964 + "movq 1032(%0, %%"REG_S"), %%mm3\n\t" |
986 +{ | 985 +{ |
987 + asm volatile( | 986 + asm volatile( |
988 + "movd %1, %%mm7 \n\t" | 987 + "movd %1, %%mm7 \n\t" |
989 + "punpckldq %1, %%mm7 \n\t" | 988 + "punpckldq %1, %%mm7 \n\t" |
990 + "mov $-1024, %%"REG_S" \n\t" | 989 + "mov $-1024, %%"REG_S" \n\t" |
991 + ASMALIGN16 | 990 + ASMALIGN(4) |
992 + "1: \n\t" | 991 + "1: \n\t" |
993 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" | 992 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" |
994 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" | 993 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" |
995 + "pfadd %%mm7, %%mm0 \n\t" //common | 994 + "pfadd %%mm7, %%mm0 \n\t" //common |
996 + "pfadd %%mm7, %%mm1 \n\t" //common | 995 + "pfadd %%mm7, %%mm1 \n\t" //common |
1017 +{ | 1016 +{ |
1018 + asm volatile( | 1017 + asm volatile( |
1019 + "movd %2, %%mm7 \n\t" | 1018 + "movd %2, %%mm7 \n\t" |
1020 + "punpckldq %2, %%mm7 \n\t" | 1019 + "punpckldq %2, %%mm7 \n\t" |
1021 + "mov $-1024, %%"REG_S" \n\t" | 1020 + "mov $-1024, %%"REG_S" \n\t" |
1022 + ASMALIGN16 | 1021 + ASMALIGN(4) |
1023 + "1: \n\t" | 1022 + "1: \n\t" |
1024 + "movq 1024(%1, %%"REG_S"), %%mm0\n\t" | 1023 + "movq 1024(%1, %%"REG_S"), %%mm0\n\t" |
1025 + "movq 1032(%1, %%"REG_S"), %%mm1\n\t" | 1024 + "movq 1032(%1, %%"REG_S"), %%mm1\n\t" |
1026 + "pfadd %%mm7, %%mm0 \n\t" //common | 1025 + "pfadd %%mm7, %%mm0 \n\t" //common |
1027 + "pfadd %%mm7, %%mm1 \n\t" //common | 1026 + "pfadd %%mm7, %%mm1 \n\t" //common |
1048 +{ | 1047 +{ |
1049 + asm volatile( | 1048 + asm volatile( |
1050 + "movd %1, %%mm7 \n\t" | 1049 + "movd %1, %%mm7 \n\t" |
1051 + "punpckldq %1, %%mm7 \n\t" | 1050 + "punpckldq %1, %%mm7 \n\t" |
1052 + "mov $-1024, %%"REG_S" \n\t" | 1051 + "mov $-1024, %%"REG_S" \n\t" |
1053 + ASMALIGN16 | 1052 + ASMALIGN(4) |
1054 + "1: \n\t" | 1053 + "1: \n\t" |
1055 + "movq 2048(%0, %%"REG_S"), %%mm0\n\t" // surround | 1054 + "movq 2048(%0, %%"REG_S"), %%mm0\n\t" // surround |
1056 + "movq 2056(%0, %%"REG_S"), %%mm1\n\t" // surround | 1055 + "movq 2056(%0, %%"REG_S"), %%mm1\n\t" // surround |
1057 + "movq (%0, %%"REG_S"), %%mm2 \n\t" | 1056 + "movq (%0, %%"REG_S"), %%mm2 \n\t" |
1058 + "movq 8(%0, %%"REG_S"), %%mm3 \n\t" | 1057 + "movq 8(%0, %%"REG_S"), %%mm3 \n\t" |
1081 +{ | 1080 +{ |
1082 + asm volatile( | 1081 + asm volatile( |
1083 + "movd %1, %%mm7 \n\t" | 1082 + "movd %1, %%mm7 \n\t" |
1084 + "punpckldq %1, %%mm7 \n\t" | 1083 + "punpckldq %1, %%mm7 \n\t" |
1085 + "mov $-1024, %%"REG_S" \n\t" | 1084 + "mov $-1024, %%"REG_S" \n\t" |
1086 + ASMALIGN16 | 1085 + ASMALIGN(4) |
1087 + "1: \n\t" | 1086 + "1: \n\t" |
1088 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" | 1087 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" |
1089 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" | 1088 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" |
1090 + "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" | 1089 + "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" |
1091 + "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" | 1090 + "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" |
1114 +{ | 1113 +{ |
1115 + asm volatile( | 1114 + asm volatile( |
1116 + "movd %1, %%mm7 \n\t" | 1115 + "movd %1, %%mm7 \n\t" |
1117 + "punpckldq %1, %%mm7 \n\t" | 1116 + "punpckldq %1, %%mm7 \n\t" |
1118 + "mov $-1024, %%"REG_S" \n\t" | 1117 + "mov $-1024, %%"REG_S" \n\t" |
1119 + ASMALIGN16 | 1118 + ASMALIGN(4) |
1120 + "1: \n\t" | 1119 + "1: \n\t" |
1121 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" | 1120 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" |
1122 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" | 1121 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" |
1123 + "pfadd %%mm7, %%mm0 \n\t" // common | 1122 + "pfadd %%mm7, %%mm0 \n\t" // common |
1124 + "pfadd %%mm7, %%mm1 \n\t" // common | 1123 + "pfadd %%mm7, %%mm1 \n\t" // common |
1151 +{ | 1150 +{ |
1152 + asm volatile( | 1151 + asm volatile( |
1153 + "movd %1, %%mm7 \n\t" | 1152 + "movd %1, %%mm7 \n\t" |
1154 + "punpckldq %1, %%mm7 \n\t" | 1153 + "punpckldq %1, %%mm7 \n\t" |
1155 + "mov $-1024, %%"REG_S" \n\t" | 1154 + "mov $-1024, %%"REG_S" \n\t" |
1156 + ASMALIGN16 | 1155 + ASMALIGN(4) |
1157 + "1: \n\t" | 1156 + "1: \n\t" |
1158 + "movq 2048(%0, %%"REG_S"), %%mm0\n\t" | 1157 + "movq 2048(%0, %%"REG_S"), %%mm0\n\t" |
1159 + "movq 2056(%0, %%"REG_S"), %%mm1\n\t" | 1158 + "movq 2056(%0, %%"REG_S"), %%mm1\n\t" |
1160 + "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround | 1159 + "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround |
1161 + "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround | 1160 + "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround |
1186 +{ | 1185 +{ |
1187 + asm volatile( | 1186 + asm volatile( |
1188 + "movd %1, %%mm7 \n\t" | 1187 + "movd %1, %%mm7 \n\t" |
1189 + "punpckldq %1, %%mm7 \n\t" | 1188 + "punpckldq %1, %%mm7 \n\t" |
1190 + "mov $-1024, %%"REG_S" \n\t" | 1189 + "mov $-1024, %%"REG_S" \n\t" |
1191 + ASMALIGN16 | 1190 + ASMALIGN(4) |
1192 + "1: \n\t" | 1191 + "1: \n\t" |
1193 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" | 1192 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" |
1194 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" | 1193 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" |
1195 + "pfadd %%mm7, %%mm0 \n\t" // common | 1194 + "pfadd %%mm7, %%mm0 \n\t" // common |
1196 + "pfadd %%mm7, %%mm1 \n\t" // common | 1195 + "pfadd %%mm7, %%mm1 \n\t" // common |
1218 +/* todo: should be optimized better */ | 1217 +/* todo: should be optimized better */ |
1219 +static void mix32toS_3dnow (sample_t * samples, sample_t bias) | 1218 +static void mix32toS_3dnow (sample_t * samples, sample_t bias) |
1220 +{ | 1219 +{ |
1221 + asm volatile( | 1220 + asm volatile( |
1222 + "mov $-1024, %%"REG_S" \n\t" | 1221 + "mov $-1024, %%"REG_S" \n\t" |
1223 + ASMALIGN16 | 1222 + ASMALIGN(4) |
1224 + "1: \n\t" | 1223 + "1: \n\t" |
1225 + "movd %1, %%mm7 \n\t" | 1224 + "movd %1, %%mm7 \n\t" |
1226 + "punpckldq %1, %%mm7 \n\t" | 1225 + "punpckldq %1, %%mm7 \n\t" |
1227 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" | 1226 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" |
1228 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" | 1227 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" |
1259 +{ | 1258 +{ |
1260 + asm volatile( | 1259 + asm volatile( |
1261 + "movd %2, %%mm7 \n\t" | 1260 + "movd %2, %%mm7 \n\t" |
1262 + "punpckldq %2, %%mm7 \n\t" | 1261 + "punpckldq %2, %%mm7 \n\t" |
1263 + "mov $-1024, %%"REG_S" \n\t" | 1262 + "mov $-1024, %%"REG_S" \n\t" |
1264 + ASMALIGN16 | 1263 + ASMALIGN(4) |
1265 + "1: \n\t" | 1264 + "1: \n\t" |
1266 + "movq (%0, %%"REG_S"), %%mm0 \n\t" | 1265 + "movq (%0, %%"REG_S"), %%mm0 \n\t" |
1267 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" | 1266 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" |
1268 + "movq 16(%0, %%"REG_S"), %%mm2 \n\t" | 1267 + "movq 16(%0, %%"REG_S"), %%mm2 \n\t" |
1269 + "movq 24(%0, %%"REG_S"), %%mm3 \n\t" | 1268 + "movq 24(%0, %%"REG_S"), %%mm3 \n\t" |
1429 +} | 1428 +} |
1430 + | 1429 + |
1431 +#endif // ARCH_X86 || ARCH_X86_64 | 1430 +#endif // ARCH_X86 || ARCH_X86_64 |
1432 --- liba52/imdct.c 2006-06-12 15:18:27.000000000 +0200 | 1431 --- liba52/imdct.c 2006-06-12 15:18:27.000000000 +0200 |
1433 +++ liba52/imdct.c 2006-06-12 19:18:39.000000000 +0200 | 1432 +++ liba52/imdct.c 2006-06-12 19:18:39.000000000 +0200 |
1434 @@ -22,9 +26,15 @@ | 1433 @@ -26,6 +26,11 @@ |
1435 * You should have received a copy of the GNU General Public License | 1434 * You should have received a copy of the GNU General Public License |
1436 * along with this program; if not, write to the Free Software | 1435 * along with this program; if not, write to the Free Software |
1437 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 1436 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
1438 + * | 1437 + * |
1439 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) | 1438 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) |
1441 + * michael did port them from libac3 (untested, perhaps totally broken) | 1440 + * michael did port them from libac3 (untested, perhaps totally broken) |
1442 + * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org) | 1441 + * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org) |
1443 */ | 1442 */ |
1444 | 1443 |
1445 #include "config.h" | 1444 #include "config.h" |
1446 +#include "asmalign.h" | 1445 @@ -39,12 +48,49 @@ |
1447 | |
1448 #include <math.h> | |
1449 #include <stdio.h> | |
1450 @@ -39,12 +49,49 @@ | |
1451 #include "a52.h" | 1446 #include "a52.h" |
1452 #include "a52_internal.h" | 1447 #include "a52_internal.h" |
1453 #include "mm_accel.h" | 1448 #include "mm_accel.h" |
1454 +#include "mangle.h" | 1449 +#include "mangle.h" |
1455 + | 1450 + |
1495 + 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f}; | 1490 + 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f}; |
1496 + | 1491 + |
1497 static uint8_t fftorder[] = { | 1492 static uint8_t fftorder[] = { |
1498 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176, | 1493 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176, |
1499 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88, | 1494 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88, |
1500 @@ -56,6 +103,40 @@ | 1495 @@ -56,6 +102,40 @@ |
1501 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86 | 1496 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86 |
1502 }; | 1497 }; |
1503 | 1498 |
1504 +static complex_t __attribute__((aligned(16))) buf[128]; | 1499 +static complex_t __attribute__((aligned(16))) buf[128]; |
1505 + | 1500 + |
1536 +#endif | 1531 +#endif |
1537 + | 1532 + |
1538 /* Root values for IFFT */ | 1533 /* Root values for IFFT */ |
1539 static sample_t roots16[3]; | 1534 static sample_t roots16[3]; |
1540 static sample_t roots32[7]; | 1535 static sample_t roots32[7]; |
1541 @@ -241,7 +322,7 @@ | 1536 @@ -241,7 +321,7 @@ |
1542 ifft_pass (buf, roots128 - 32, 32); | 1537 ifft_pass (buf, roots128 - 32, 32); |
1543 } | 1538 } |
1544 | 1539 |
1545 -void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias) | 1540 -void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias) |
1546 +void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias) | 1541 +void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias) |
1547 { | 1542 { |
1548 int i, k; | 1543 int i, k; |
1549 sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2; | 1544 sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2; |
1550 @@ -285,6 +366,714 @@ | 1545 @@ -285,6 +365,714 @@ |
1551 } | 1546 } |
1552 } | 1547 } |
1553 | 1548 |
1554 +#ifdef HAVE_ALTIVEC | 1549 +#ifdef HAVE_ALTIVEC |
1555 + | 1550 + |
1952 + asm volatile( | 1947 + asm volatile( |
1953 + "xor %%"REG_S", %%"REG_S" \n\t" | 1948 + "xor %%"REG_S", %%"REG_S" \n\t" |
1954 + "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t" | 1949 + "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t" |
1955 + "mov $1008, %%"REG_D" \n\t" | 1950 + "mov $1008, %%"REG_D" \n\t" |
1956 + "push %%"REG_BP" \n\t" //use ebp without telling gcc | 1951 + "push %%"REG_BP" \n\t" //use ebp without telling gcc |
1957 + ASMALIGN16 | 1952 + ASMALIGN(4) |
1958 + "1: \n\t" | 1953 + "1: \n\t" |
1959 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI | 1954 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI |
1960 + "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI | 1955 + "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI |
1961 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi | 1956 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi |
1962 + "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi | 1957 + "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi |
2011 + // Note w[0][0]={1,0} | 2006 + // Note w[0][0]={1,0} |
2012 + asm volatile( | 2007 + asm volatile( |
2013 + "xorps %%xmm1, %%xmm1 \n\t" | 2008 + "xorps %%xmm1, %%xmm1 \n\t" |
2014 + "xorps %%xmm2, %%xmm2 \n\t" | 2009 + "xorps %%xmm2, %%xmm2 \n\t" |
2015 + "mov %0, %%"REG_S" \n\t" | 2010 + "mov %0, %%"REG_S" \n\t" |
2016 + ASMALIGN16 | 2011 + ASMALIGN(4) |
2017 + "1: \n\t" | 2012 + "1: \n\t" |
2018 + "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p] | 2013 + "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p] |
2019 + "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q] | 2014 + "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q] |
2020 + "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p] | 2015 + "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p] |
2021 + "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q] | 2016 + "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q] |
2032 + /* 2. iteration */ | 2027 + /* 2. iteration */ |
2033 + // Note w[1]={{1,0}, {0,-1}} | 2028 + // Note w[1]={{1,0}, {0,-1}} |
2034 + asm volatile( | 2029 + asm volatile( |
2035 + "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 | 2030 + "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 |
2036 + "mov %0, %%"REG_S" \n\t" | 2031 + "mov %0, %%"REG_S" \n\t" |
2037 + ASMALIGN16 | 2032 + ASMALIGN(4) |
2038 + "1: \n\t" | 2033 + "1: \n\t" |
2039 + "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3 | 2034 + "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3 |
2040 + "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 | 2035 + "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 |
2041 + "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 | 2036 + "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 |
2042 + "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1 | 2037 + "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1 |
2063 + "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" | 2058 + "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" |
2064 + "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" | 2059 + "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" |
2065 + "xorps %%xmm5, %%xmm5 \n\t" | 2060 + "xorps %%xmm5, %%xmm5 \n\t" |
2066 + "xorps %%xmm2, %%xmm2 \n\t" | 2061 + "xorps %%xmm2, %%xmm2 \n\t" |
2067 + "mov %0, %%"REG_S" \n\t" | 2062 + "mov %0, %%"REG_S" \n\t" |
2068 + ASMALIGN16 | 2063 + ASMALIGN(4) |
2069 + "1: \n\t" | 2064 + "1: \n\t" |
2070 + "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5 | 2065 + "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5 |
2071 + "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7 | 2066 + "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7 |
2072 + "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 | 2067 + "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 |
2073 + "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 | 2068 + "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 |
2104 + two_m_plus_one = two_m<<1; | 2099 + two_m_plus_one = two_m<<1; |
2105 + two_m_plus_one_shl3 = (two_m_plus_one<<3); | 2100 + two_m_plus_one_shl3 = (two_m_plus_one<<3); |
2106 + buf_offset = buf+128; | 2101 + buf_offset = buf+128; |
2107 + asm volatile( | 2102 + asm volatile( |
2108 + "mov %0, %%"REG_S" \n\t" | 2103 + "mov %0, %%"REG_S" \n\t" |
2109 + ASMALIGN16 | 2104 + ASMALIGN(4) |
2110 + "1: \n\t" | 2105 + "1: \n\t" |
2111 + "xor %%"REG_D", %%"REG_D" \n\t" // k | 2106 + "xor %%"REG_D", %%"REG_D" \n\t" // k |
2112 + "lea (%%"REG_S", %3), %%"REG_d" \n\t" | 2107 + "lea (%%"REG_S", %3), %%"REG_d" \n\t" |
2113 + "2: \n\t" | 2108 + "2: \n\t" |
2114 + "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t" | 2109 + "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t" |
2136 + } | 2131 + } |
2137 + | 2132 + |
2138 + /* Post IFFT complex multiply plus IFFT complex conjugate*/ | 2133 + /* Post IFFT complex multiply plus IFFT complex conjugate*/ |
2139 + asm volatile( | 2134 + asm volatile( |
2140 + "mov $-1024, %%"REG_S" \n\t" | 2135 + "mov $-1024, %%"REG_S" \n\t" |
2141 + ASMALIGN16 | 2136 + ASMALIGN(4) |
2142 + "1: \n\t" | 2137 + "1: \n\t" |
2143 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" | 2138 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
2144 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" | 2139 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
2145 + "shufps $0xB1, %%xmm0, %%xmm0 \n\t" | 2140 + "shufps $0xB1, %%xmm0, %%xmm0 \n\t" |
2146 + "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t" | 2141 + "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t" |
2162 + asm volatile( | 2157 + asm volatile( |
2163 + "xor %%"REG_D", %%"REG_D" \n\t" // 0 | 2158 + "xor %%"REG_D", %%"REG_D" \n\t" // 0 |
2164 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 | 2159 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 |
2165 + "movss %3, %%xmm2 \n\t" // bias | 2160 + "movss %3, %%xmm2 \n\t" // bias |
2166 + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | 2161 + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... |
2167 + ASMALIGN16 | 2162 + ASMALIGN(4) |
2168 + "1: \n\t" | 2163 + "1: \n\t" |
2169 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? | 2164 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? |
2170 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? | 2165 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? |
2171 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? | 2166 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? |
2172 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? | 2167 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? |
2189 + asm volatile( | 2184 + asm volatile( |
2190 + "mov $1024, %%"REG_D" \n\t" // 512 | 2185 + "mov $1024, %%"REG_D" \n\t" // 512 |
2191 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 | 2186 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 |
2192 + "movss %3, %%xmm2 \n\t" // bias | 2187 + "movss %3, %%xmm2 \n\t" // bias |
2193 + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | 2188 + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... |
2194 + ASMALIGN16 | 2189 + ASMALIGN(4) |
2195 + "1: \n\t" | 2190 + "1: \n\t" |
2196 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A | 2191 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A |
2197 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C | 2192 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C |
2198 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C | 2193 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C |
2199 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A | 2194 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A |
2216 + delay_ptr = delay; | 2211 + delay_ptr = delay; |
2217 + | 2212 + |
2218 + asm volatile( | 2213 + asm volatile( |
2219 + "xor %%"REG_D", %%"REG_D" \n\t" // 0 | 2214 + "xor %%"REG_D", %%"REG_D" \n\t" // 0 |
2220 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 | 2215 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 |
2221 + ASMALIGN16 | 2216 + ASMALIGN(4) |
2222 + "1: \n\t" | 2217 + "1: \n\t" |
2223 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A | 2218 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A |
2224 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C | 2219 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C |
2225 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C | 2220 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C |
2226 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A | 2221 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A |
2238 +// window_ptr-=128; | 2233 +// window_ptr-=128; |
2239 + | 2234 + |
2240 + asm volatile( | 2235 + asm volatile( |
2241 + "mov $1024, %%"REG_D" \n\t" // 1024 | 2236 + "mov $1024, %%"REG_D" \n\t" // 1024 |
2242 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 | 2237 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 |
2243 + ASMALIGN16 | 2238 + ASMALIGN(4) |
2244 + "1: \n\t" | 2239 + "1: \n\t" |
2245 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? | 2240 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? |
2246 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? | 2241 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? |
2247 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? | 2242 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? |
2248 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? | 2243 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? |
2260 +#endif // ARCH_X86 || ARCH_X86_64 | 2255 +#endif // ARCH_X86 || ARCH_X86_64 |
2261 + | 2256 + |
2262 void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias) | 2257 void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias) |
2263 { | 2258 { |
2264 int i, k; | 2259 int i, k; |
2265 @@ -364,7 +1153,7 @@ | 2260 @@ -364,7 +1152,7 @@ |
2266 | 2261 |
2267 void a52_imdct_init (uint32_t mm_accel) | 2262 void a52_imdct_init (uint32_t mm_accel) |
2268 { | 2263 { |
2269 - int i, k; | 2264 - int i, k; |
2270 + int i, j, k; | 2265 + int i, j, k; |
2271 double sum; | 2266 double sum; |
2272 | 2267 |
2273 /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */ | 2268 /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */ |
2274 @@ -416,6 +1205,99 @@ | 2269 @@ -416,6 +1204,99 @@ |
2275 post2[i].real = cos ((M_PI / 128) * (i + 0.5)); | 2270 post2[i].real = cos ((M_PI / 128) * (i + 0.5)); |
2276 post2[i].imag = sin ((M_PI / 128) * (i + 0.5)); | 2271 post2[i].imag = sin ((M_PI / 128) * (i + 0.5)); |
2277 } | 2272 } |
2278 + for (i = 0; i < 128; i++) { | 2273 + for (i = 0; i < 128; i++) { |
2279 + xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); | 2274 + xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); |
2369 + else | 2364 + else |
2370 +#endif | 2365 +#endif |
2371 | 2366 |
2372 #ifdef LIBA52_DJBFFT | 2367 #ifdef LIBA52_DJBFFT |
2373 if (mm_accel & MM_ACCEL_DJBFFT) { | 2368 if (mm_accel & MM_ACCEL_DJBFFT) { |
2374 @@ -426,7 +1308,5 @@ | 2369 @@ -426,7 +1307,5 @@ |
2375 #endif | 2370 #endif |
2376 { | 2371 { |
2377 fprintf (stderr, "No accelerated IMDCT transform found\n"); | 2372 fprintf (stderr, "No accelerated IMDCT transform found\n"); |
2378 - ifft128 = ifft128_c; | 2373 - ifft128 = ifft128_c; |
2379 - ifft64 = ifft64_c; | 2374 - ifft64 = ifft64_c; |