comparison liba52/liba52_changes.diff @ 19373:87dddfc314fd

Update for changes from r19378 (ASMALIGN macro).
author diego
date Sun, 13 Aug 2006 00:44:42 +0000
parents 9960f9ef96fd
children 212cf6625a47
comparison: 19372:6334c14b38eb vs. 19373:87dddfc314fd
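The r19378 change referenced above replaced the fixed ASMALIGN16/ASMALIGN8 macros with a parameterized ASMALIGN(n) that requests 2^n-byte alignment, so every ASMALIGN16 in this patch becomes ASMALIGN(4) (still 16-byte alignment); the dropped #include "asmalign.h" lines suggest the macro now comes from the already-included config.h. A minimal sketch of such a macro and where it is placed follows; the configure switch name and the demo loop are illustrative only, not the actual MPlayer definitions:

/* Illustrative sketch only; the real ASMALIGN() is generated by configure.
 * ASMALIGN(n) emits an assembler directive that aligns the next instruction
 * to a 2^n byte boundary, so ASMALIGN(4) means 16-byte alignment and matches
 * the old ASMALIGN16 macro. */
#ifdef ASMALIGN_TAKES_LOG2                 /* hypothetical configure result */
#define ASMALIGN(ZEROBITS) ".align " #ZEROBITS "\n\t"
#else                                      /* .align expects a byte count */
#define ASMALIGN(ZEROBITS) ".align 1<<" #ZEROBITS "\n\t"
#endif

/* Typical placement, as in the loops below: align the branch target. */
static void asmalign_demo(void)
{
    asm volatile(
        "mov $-1024, %%esi \n\t"
        ASMALIGN(4)                        /* 16-byte aligned loop head */
        "1: \n\t"
        "add $32, %%esi \n\t"
        "jnz 1b \n\t"
        ::: "%esi");
}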
 return a52_bitstream_get_bh_2 (state, num_bits);
 +#endif
 }
 --- liba52/downmix.c 2006-06-12 15:17:53.000000000 +0200
 +++ liba52/downmix.c 2006-06-05 02:23:02.000000000 +0200
-@@ -19,18 +23,47 @@
+@@ -19,18 +23,46 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 + *
 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
  */

 #include "config.h"
-+#include "asmalign.h"

 #include <string.h>
 #include <inttypes.h>

 #include "a52.h"
[...]
 +}
 +
 int a52_downmix_init (int input, int flags, sample_t * level,
 sample_t clev, sample_t slev)
 {
-@@ -447,7 +480,7 @@
+@@ -447,7 +479,7 @@
 samples[i] = 0;
 }

 -void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias,
 +void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
 sample_t clev, sample_t slev)
 {
 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
-@@ -559,7 +592,7 @@
+@@ -559,7 +591,7 @@
 break;

 case CONVERT (A52_3F2R, A52_2F1R):
 - mix3to2 (samples, bias);
 + mix3to2 (samples, bias); //FIXME possible bug? (output doesnt seem to be used)
 move2to1 (samples + 768, samples + 512, bias);
 break;

-@@ -583,12 +616,12 @@
+@@ -583,12 +615,12 @@
 break;

 case CONVERT (A52_3F1R, A52_3F2R):
 - memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t));
 + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
[...]
 -void a52_upmix (sample_t * samples, int acmod, int output)
 +void upmix_C (sample_t * samples, int acmod, int output)
 {
 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

-@@ -653,3 +686,1137 @@
+@@ -653,3 +685,1137 @@
 goto mix_31to21;
 }
 }
 +
 +#if defined(ARCH_X86) || defined(ARCH_X86_64)
[...]
 +{
 + asm volatile(
 + "movlps %2, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 + "movaps 16(%0, %%"REG_S"), %%xmm1\n\t"
 + "addps (%1, %%"REG_S"), %%xmm0 \n\t"
 + "addps 16(%1, %%"REG_S"), %%xmm1\n\t"
[...]
 +{
 + asm volatile(
 + "movlps %1, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
 + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
 + "addps %%xmm7, %%xmm1 \n\t"
[...]
 +{
 + asm volatile(
 + "movlps %1, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
 + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
 + "addps 3072(%0, %%"REG_S"), %%xmm1\n\t"
[...]
 +{
 + asm volatile(
 + "movlps %1, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
 + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
 + "addps 3072(%0, %%"REG_S"), %%xmm1\n\t"
[...]
 +{
 + asm volatile(
 + "movlps %1, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 + "addps %%xmm7, %%xmm0 \n\t" //common
 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 + "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t"
[...]
 +{
 + asm volatile(
 + "movlps %2, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t"
 + "addps %%xmm7, %%xmm0 \n\t" //common
 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 + "movaps (%1, %%"REG_S"), %%xmm2 \n\t"
[...]
 +{
 + asm volatile(
 + "movlps %1, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" // surround
 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 + "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t"
 + "addps %%xmm7, %%xmm1 \n\t"
[...]
 +{
 + asm volatile(
 + "movlps %1, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 + "addps 3072(%0, %%"REG_S"), %%xmm0\n\t"
 + "addps %%xmm7, %%xmm0 \n\t" // common
 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
[...]
 +{
 + asm volatile(
 + "movlps %1, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 + "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround
 + "addps %%xmm7, %%xmm0 \n\t" // common
 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
[...]
 +{
 + asm volatile(
 + "movlps %1, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t"
 + "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround
 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 + "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t"
[...]
 +{
 + asm volatile(
 + "movlps %1, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 + "addps %%xmm7, %%xmm0 \n\t" // common
 + "movaps %%xmm0, %%xmm1 \n\t" // common
 + "addps (%0, %%"REG_S"), %%xmm0 \n\t"
[...]
 +{
 + asm volatile(
 + "movlps %1, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 + "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t"
 + "addps %%xmm7, %%xmm0 \n\t" // common
 + "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround
[...]
 +{
 + asm volatile(
 + "movlps %2, %%xmm7 \n\t"
 + "shufps $0x00, %%xmm7, %%xmm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 + "movaps 16(%0, %%"REG_S"), %%xmm1\n\t"
 + "addps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 + "addps 1040(%0, %%"REG_S"), %%xmm1\n\t"
[...]
 +static void zero_MMX(sample_t * samples)
 +{
 + asm volatile(
 + "mov $-1024, %%"REG_S" \n\t"
 + "pxor %%mm0, %%mm0 \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq %%mm0, (%0, %%"REG_S") \n\t"
 + "movq %%mm0, 8(%0, %%"REG_S") \n\t"
 + "movq %%mm0, 16(%0, %%"REG_S") \n\t"
 + "movq %%mm0, 24(%0, %%"REG_S") \n\t"
[...]
 +{
 + asm volatile(
 + "movd %2, %%mm7 \n\t"
 + "punpckldq %2, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq (%0, %%"REG_S"), %%mm0 \n\t"
 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t"
 + "movq 16(%0, %%"REG_S"), %%mm2 \n\t"
 + "movq 24(%0, %%"REG_S"), %%mm3 \n\t"
[...]
 +{
 + asm volatile(
 + "movd %1, %%mm7 \n\t"
 + "punpckldq %1, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq (%0, %%"REG_S"), %%mm0 \n\t"
 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t"
 + "movq 1024(%0, %%"REG_S"), %%mm2\n\t"
 + "movq 1032(%0, %%"REG_S"), %%mm3\n\t"
[...]
 +{
 + asm volatile(
 + "movd %1, %%mm7 \n\t"
 + "punpckldq %1, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq (%0, %%"REG_S"), %%mm0 \n\t"
 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t"
 + "movq 1024(%0, %%"REG_S"), %%mm2\n\t"
 + "movq 1032(%0, %%"REG_S"), %%mm3\n\t"
[...]
 +{
 + asm volatile(
 + "movd %1, %%mm7 \n\t"
 + "punpckldq %1, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq (%0, %%"REG_S"), %%mm0 \n\t"
 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t"
 + "movq 1024(%0, %%"REG_S"), %%mm2\n\t"
 + "movq 1032(%0, %%"REG_S"), %%mm3\n\t"
[...]
 +{
 + asm volatile(
 + "movd %1, %%mm7 \n\t"
 + "punpckldq %1, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t"
 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t"
 + "pfadd %%mm7, %%mm0 \n\t" //common
 + "pfadd %%mm7, %%mm1 \n\t" //common
[...]
 +{
 + asm volatile(
 + "movd %2, %%mm7 \n\t"
 + "punpckldq %2, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq 1024(%1, %%"REG_S"), %%mm0\n\t"
 + "movq 1032(%1, %%"REG_S"), %%mm1\n\t"
 + "pfadd %%mm7, %%mm0 \n\t" //common
 + "pfadd %%mm7, %%mm1 \n\t" //common
[...]
 +{
 + asm volatile(
 + "movd %1, %%mm7 \n\t"
 + "punpckldq %1, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq 2048(%0, %%"REG_S"), %%mm0\n\t" // surround
 + "movq 2056(%0, %%"REG_S"), %%mm1\n\t" // surround
 + "movq (%0, %%"REG_S"), %%mm2 \n\t"
 + "movq 8(%0, %%"REG_S"), %%mm3 \n\t"
[...]
 +{
 + asm volatile(
 + "movd %1, %%mm7 \n\t"
 + "punpckldq %1, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t"
 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t"
 + "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t"
 + "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t"
[...]
 +{
 + asm volatile(
 + "movd %1, %%mm7 \n\t"
 + "punpckldq %1, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t"
 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t"
 + "pfadd %%mm7, %%mm0 \n\t" // common
 + "pfadd %%mm7, %%mm1 \n\t" // common
[...]
 +{
 + asm volatile(
 + "movd %1, %%mm7 \n\t"
 + "punpckldq %1, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq 2048(%0, %%"REG_S"), %%mm0\n\t"
 + "movq 2056(%0, %%"REG_S"), %%mm1\n\t"
 + "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround
 + "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround
[...]
 +{
 + asm volatile(
 + "movd %1, %%mm7 \n\t"
 + "punpckldq %1, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t"
 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t"
 + "pfadd %%mm7, %%mm0 \n\t" // common
 + "pfadd %%mm7, %%mm1 \n\t" // common
[...]
 +/* todo: should be optimized better */
 +static void mix32toS_3dnow (sample_t * samples, sample_t bias)
 +{
 + asm volatile(
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movd %1, %%mm7 \n\t"
 + "punpckldq %1, %%mm7 \n\t"
 + "movq 1024(%0, %%"REG_S"), %%mm0\n\t"
 + "movq 1032(%0, %%"REG_S"), %%mm1\n\t"
[...]
 +{
 + asm volatile(
 + "movd %2, %%mm7 \n\t"
 + "punpckldq %2, %%mm7 \n\t"
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movq (%0, %%"REG_S"), %%mm0 \n\t"
 + "movq 8(%0, %%"REG_S"), %%mm1 \n\t"
 + "movq 16(%0, %%"REG_S"), %%mm2 \n\t"
 + "movq 24(%0, %%"REG_S"), %%mm3 \n\t"
[...]
 +}
 +
 +#endif // ARCH_X86 || ARCH_X86_64
 --- liba52/imdct.c 2006-06-12 15:18:27.000000000 +0200
 +++ liba52/imdct.c 2006-06-12 19:18:39.000000000 +0200
-@@ -22,9 +26,15 @@
+@@ -26,6 +26,11 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 + *
 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
[...]
 + * michael did port them from libac3 (untested, perhaps totally broken)
 + * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
  */

 #include "config.h"
-+#include "asmalign.h"
-
-#include <math.h>
-#include <stdio.h>
-@@ -39,12 +49,49 @@
+@@ -39,12 +48,49 @@
 #include "a52.h"
 #include "a52_internal.h"
 #include "mm_accel.h"
 +#include "mangle.h"
 +
[...]
 + 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
 +
 static uint8_t fftorder[] = {
 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
-@@ -56,6 +103,40 @@
+@@ -56,6 +102,40 @@
 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
 };

 +static complex_t __attribute__((aligned(16))) buf[128];
 +
[...]
 +#endif
 +
 /* Root values for IFFT */
 static sample_t roots16[3];
 static sample_t roots32[7];
-@@ -241,7 +322,7 @@
+@@ -241,7 +321,7 @@
 ifft_pass (buf, roots128 - 32, 32);
 }

 -void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias)
 +void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias)
 {
 int i, k;
 sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
-@@ -285,6 +366,714 @@
+@@ -285,6 +365,714 @@
 }
 }

 +#ifdef HAVE_ALTIVEC
 +
[...]
 + asm volatile(
 + "xor %%"REG_S", %%"REG_S" \n\t"
 + "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
 + "mov $1008, %%"REG_D" \n\t"
 + "push %%"REG_BP" \n\t" //use ebp without telling gcc
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
 + "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI
 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi
 + "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
[...]
 + // Note w[0][0]={1,0}
 + asm volatile(
 + "xorps %%xmm1, %%xmm1 \n\t"
 + "xorps %%xmm2, %%xmm2 \n\t"
 + "mov %0, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
 + "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
 + "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
 + "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
[...]
 + /* 2. iteration */
 + // Note w[1]={{1,0}, {0,-1}}
 + asm volatile(
 + "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
 + "mov %0, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3
 + "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
 + "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
 + "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
[...]
 + "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
 + "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
 + "xorps %%xmm5, %%xmm5 \n\t"
 + "xorps %%xmm2, %%xmm2 \n\t"
 + "mov %0, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5
 + "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7
 + "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
 + "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
[...]
 + two_m_plus_one = two_m<<1;
 + two_m_plus_one_shl3 = (two_m_plus_one<<3);
 + buf_offset = buf+128;
 + asm volatile(
 + "mov %0, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "xor %%"REG_D", %%"REG_D" \n\t" // k
 + "lea (%%"REG_S", %3), %%"REG_d" \n\t"
 + "2: \n\t"
 + "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t"
[...]
 + }
 +
 + /* Post IFFT complex multiply plus IFFT complex conjugate*/
 + asm volatile(
 + "mov $-1024, %%"REG_S" \n\t"
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 + "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 + "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
 + "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
[...]
 + asm volatile(
 + "xor %%"REG_D", %%"REG_D" \n\t" // 0
 + "xor %%"REG_S", %%"REG_S" \n\t" // 0
 + "movss %3, %%xmm2 \n\t" // bias
 + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
[...]
 + asm volatile(
 + "mov $1024, %%"REG_D" \n\t" // 512
 + "xor %%"REG_S", %%"REG_S" \n\t" // 0
 + "movss %3, %%xmm2 \n\t" // bias
 + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
[...]
 + delay_ptr = delay;
 +
 + asm volatile(
 + "xor %%"REG_D", %%"REG_D" \n\t" // 0
 + "xor %%"REG_S", %%"REG_S" \n\t" // 0
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
[...]
 +// window_ptr-=128;
 +
 + asm volatile(
 + "mov $1024, %%"REG_D" \n\t" // 1024
 + "xor %%"REG_S", %%"REG_S" \n\t" // 0
-+ ASMALIGN16
++ ASMALIGN(4)
 + "1: \n\t"
 + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
 + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
 + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
 + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
[...]
 +#endif // ARCH_X86 || ARCH_X86_64
 +
 void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias)
 {
 int i, k;
-@@ -364,7 +1153,7 @@
+@@ -364,7 +1152,7 @@

 void a52_imdct_init (uint32_t mm_accel)
 {
 - int i, k;
 + int i, j, k;
 double sum;

 /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
-@@ -416,6 +1205,99 @@
+@@ -416,6 +1204,99 @@
 post2[i].real = cos ((M_PI / 128) * (i + 0.5));
 post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
 }
 + for (i = 0; i < 128; i++) {
 + xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
[...]
 + else
 +#endif

 #ifdef LIBA52_DJBFFT
 if (mm_accel & MM_ACCEL_DJBFFT) {
-@@ -426,7 +1308,5 @@
+@@ -426,7 +1307,5 @@
 #endif
 {
 fprintf (stderr, "No accelerated IMDCT transform found\n");
 - ifft128 = ifft128_c;
 - ifft64 = ifft64_c;
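The tail of this hunk shows the dispatch the patch relies on: a52_imdct_init() inspects the mm_accel flags once, and the renamed imdct_do_512() stays the portable C path. A rough sketch of that init-time selection follows, written as if it sat inside imdct.c so that sample_t and mm_accel.h are already in scope; the SSE flag and function names are assumed, not taken from the patch:

/* Sketch only: choose an IMDCT implementation once at init time. */
static void (*imdct_512) (sample_t * data, sample_t * delay, sample_t bias);

void a52_imdct_init_sketch (uint32_t mm_accel)
{
    imdct_512 = imdct_do_512;            /* portable C version (renamed above) */
#if defined(ARCH_X86) || defined(ARCH_X86_64)
    if (mm_accel & MM_ACCEL_X86_SSE)     /* flag name assumed */
        imdct_512 = imdct_do_512_sse;    /* function name assumed */
#endif
    /* callers then invoke imdct_512(data, delay, bias) instead of a fixed symbol */
}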