comparison liba52/imdct.c @ 9001:01a9cf43074c

An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c) It's nearly bit-perfect, I have a couple of lsb changed in a 128 frames sample. I can't hear the differences :-) patch by Romain Dolbeau <dolbeau@irisa.fr>
author arpi
date Sat, 18 Jan 2003 19:28:29 +0000
parents fb88ccbc5ccc
children 5ba896a38d75
comparison
equal deleted inserted replaced
9000:cc4c2ff1588e 9001:01a9cf43074c
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * 22 *
23 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) 23 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
24 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru> 24 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
25 * michael did port them from libac3 (untested, perhaps totally broken) 25 * michael did port them from libac3 (untested, perhaps totally broken)
26 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
26 */ 27 */
27 28
28 #include "config.h" 29 #include "config.h"
29 30
30 #include <math.h> 31 #include <math.h>
112 static float __attribute__((aligned(16))) sseW6[256]; 113 static float __attribute__((aligned(16))) sseW6[256];
113 static float __attribute__((aligned(16))) *sseW[7]= 114 static float __attribute__((aligned(16))) *sseW[7]=
114 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; 115 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
115 static float __attribute__((aligned(16))) sseWindow[512]; 116 static float __attribute__((aligned(16))) sseWindow[512];
116 #else 117 #else
117 static complex_t buf[128]; 118 static complex_t __attribute__((aligned(16))) buf[128];
118 #endif 119 #endif
119 120
120 /* Twiddle factor LUT */ 121 /* Twiddle factor LUT */
121 static complex_t w_1[1]; 122 static complex_t __attribute__((aligned(16))) w_1[1];
122 static complex_t w_2[2]; 123 static complex_t __attribute__((aligned(16))) w_2[2];
123 static complex_t w_4[4]; 124 static complex_t __attribute__((aligned(16))) w_4[4];
124 static complex_t w_8[8]; 125 static complex_t __attribute__((aligned(16))) w_8[8];
125 static complex_t w_16[16]; 126 static complex_t __attribute__((aligned(16))) w_16[16];
126 static complex_t w_32[32]; 127 static complex_t __attribute__((aligned(16))) w_32[32];
127 static complex_t w_64[64]; 128 static complex_t __attribute__((aligned(16))) w_64[64];
128 static complex_t * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; 129 static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
129 130
130 /* Twiddle factors for IMDCT */ 131 /* Twiddle factors for IMDCT */
131 static sample_t xcos1[128]; 132 static sample_t __attribute__((aligned(16))) xcos1[128];
132 static sample_t xsin1[128]; 133 static sample_t __attribute__((aligned(16))) xsin1[128];
133 static sample_t xcos2[64]; 134 static sample_t __attribute__((aligned(16))) xcos2[64];
134 static sample_t xsin2[64]; 135 static sample_t __attribute__((aligned(16))) xsin2[64];
135 136
136 /* Windowing function for Modified DCT - Thank you acroread */ 137 /* Windowing function for Modified DCT - Thank you acroread */
137 sample_t imdct_window[] = { 138 sample_t imdct_window[] = {
138 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130, 139 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
139 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443, 140 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
381 for(i=0; i<64; i++) { 382 for(i=0; i<64; i++) {
382 *delay_ptr++ = buf[i].imag * *--window_ptr; 383 *delay_ptr++ = buf[i].imag * *--window_ptr;
383 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; 384 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
384 } 385 }
385 } 386 }
387
388 #ifdef HAVE_ALTIVEC
389
390 // used to build registers permutation vectors (vcprm)
391 // the 's' are for words in the _s_econd vector
392 #define WORD_0 0x00,0x01,0x02,0x03
393 #define WORD_1 0x04,0x05,0x06,0x07
394 #define WORD_2 0x08,0x09,0x0a,0x0b
395 #define WORD_3 0x0c,0x0d,0x0e,0x0f
396 #define WORD_s0 0x10,0x11,0x12,0x13
397 #define WORD_s1 0x14,0x15,0x16,0x17
398 #define WORD_s2 0x18,0x19,0x1a,0x1b
399 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
400
401 #define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
402
403 // vcprmle is used to keep the same index as in the SSE version.
404 // it's the same as vcprm, with the index inversed
405 // ('le' is Little Endian)
406 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
407
408 // used to build inverse/identity vectors (vcii)
409 // n is _n_egative, p is _p_ositive
410 #define FLOAT_n -1.
411 #define FLOAT_p 1.
412
413 #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
414
415 void
416 imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
417 {
418 int i;
419 int k;
420 int p,q;
421 int m;
422 int two_m;
423 int two_m_plus_one;
424
425 sample_t tmp_b_i;
426 sample_t tmp_b_r;
427 sample_t tmp_a_i;
428 sample_t tmp_a_r;
429
430 sample_t *data_ptr;
431 sample_t *delay_ptr;
432 sample_t *window_ptr;
433
434 /* 512 IMDCT with source and dest data in 'data' */
435
436 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
437 for( i=0; i < 128; i++) {
438 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
439 int j= bit_reverse_512[i];
440 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
441 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
442 }
443
444 /* 1. iteration */
445 for(i = 0; i < 128; i += 2) {
446 #if 0
447 tmp_a_r = buf[i].real;
448 tmp_a_i = buf[i].imag;
449 tmp_b_r = buf[i+1].real;
450 tmp_b_i = buf[i+1].imag;
451 buf[i].real = tmp_a_r + tmp_b_r;
452 buf[i].imag = tmp_a_i + tmp_b_i;
453 buf[i+1].real = tmp_a_r - tmp_b_r;
454 buf[i+1].imag = tmp_a_i - tmp_b_i;
455 #else
456 vector float temp, bufv;
457
458 bufv = vec_ld(i << 3, (float*)buf);
459 temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
460 bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
461 vec_st(bufv, i << 3, (float*)buf);
462 #endif
463 }
464
465 /* 2. iteration */
466 // Note w[1]={{1,0}, {0,-1}}
467 for(i = 0; i < 128; i += 4) {
468 #if 0
469 tmp_a_r = buf[i].real;
470 tmp_a_i = buf[i].imag;
471 tmp_b_r = buf[i+2].real;
472 tmp_b_i = buf[i+2].imag;
473 buf[i].real = tmp_a_r + tmp_b_r;
474 buf[i].imag = tmp_a_i + tmp_b_i;
475 buf[i+2].real = tmp_a_r - tmp_b_r;
476 buf[i+2].imag = tmp_a_i - tmp_b_i;
477 tmp_a_r = buf[i+1].real;
478 tmp_a_i = buf[i+1].imag;
479 /* WARNING: im <-> re here ! */
480 tmp_b_r = buf[i+3].imag;
481 tmp_b_i = buf[i+3].real;
482 buf[i+1].real = tmp_a_r + tmp_b_r;
483 buf[i+1].imag = tmp_a_i - tmp_b_i;
484 buf[i+3].real = tmp_a_r - tmp_b_r;
485 buf[i+3].imag = tmp_a_i + tmp_b_i;
486 #else
487 vector float buf01, buf23, temp1, temp2;
488
489 buf01 = vec_ld((i + 0) << 3, (float*)buf);
490 buf23 = vec_ld((i + 2) << 3, (float*)buf);
491 buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
492
493 temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
494 temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
495
496 vec_st(temp1, (i + 0) << 3, (float*)buf);
497 vec_st(temp2, (i + 2) << 3, (float*)buf);
498 #endif
499 }
500
501 /* 3. iteration */
502 for(i = 0; i < 128; i += 8) {
503 #if 0
504 tmp_a_r = buf[i].real;
505 tmp_a_i = buf[i].imag;
506 tmp_b_r = buf[i+4].real;
507 tmp_b_i = buf[i+4].imag;
508 buf[i].real = tmp_a_r + tmp_b_r;
509 buf[i].imag = tmp_a_i + tmp_b_i;
510 buf[i+4].real = tmp_a_r - tmp_b_r;
511 buf[i+4].imag = tmp_a_i - tmp_b_i;
512 tmp_a_r = buf[1+i].real;
513 tmp_a_i = buf[1+i].imag;
514 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
515 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
516 buf[1+i].real = tmp_a_r + tmp_b_r;
517 buf[1+i].imag = tmp_a_i + tmp_b_i;
518 buf[i+5].real = tmp_a_r - tmp_b_r;
519 buf[i+5].imag = tmp_a_i - tmp_b_i;
520 tmp_a_r = buf[i+2].real;
521 tmp_a_i = buf[i+2].imag;
522 /* WARNING re <-> im & sign */
523 tmp_b_r = buf[i+6].imag;
524 tmp_b_i = - buf[i+6].real;
525 buf[i+2].real = tmp_a_r + tmp_b_r;
526 buf[i+2].imag = tmp_a_i + tmp_b_i;
527 buf[i+6].real = tmp_a_r - tmp_b_r;
528 buf[i+6].imag = tmp_a_i - tmp_b_i;
529 tmp_a_r = buf[i+3].real;
530 tmp_a_i = buf[i+3].imag;
531 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
532 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
533 buf[i+3].real = tmp_a_r + tmp_b_r;
534 buf[i+3].imag = tmp_a_i + tmp_b_i;
535 buf[i+7].real = tmp_a_r - tmp_b_r;
536 buf[i+7].imag = tmp_a_i - tmp_b_i;
537 #else
538 vector float buf01, buf23, buf45, buf67;
539
540 buf01 = vec_ld((i + 0) << 3, (float*)buf);
541 buf23 = vec_ld((i + 2) << 3, (float*)buf);
542
543 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
544 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
545 buf[i+5].real = tmp_b_r;
546 buf[i+5].imag = tmp_b_i;
547 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
548 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
549 buf[i+7].real = tmp_b_r;
550 buf[i+7].imag = tmp_b_i;
551
552 buf23 = vec_ld((i + 2) << 3, (float*)buf);
553 buf45 = vec_ld((i + 4) << 3, (float*)buf);
554 buf67 = vec_ld((i + 6) << 3, (float*)buf);
555 buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
556
557 vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
558 vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
559 vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
560 vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
561 #endif
562 }
563
564 /* 4-7. iterations */
565 for (m=3; m < 7; m++) {
566 two_m = (1 << m);
567
568 two_m_plus_one = two_m<<1;
569
570 for(i = 0; i < 128; i += two_m_plus_one) {
571 for(k = 0; k < two_m; k+=2) {
572 #if 0
573 int p = k + i;
574 int q = p + two_m;
575 tmp_a_r = buf[p].real;
576 tmp_a_i = buf[p].imag;
577 tmp_b_r =
578 buf[q].real * w[m][k].real -
579 buf[q].imag * w[m][k].imag;
580 tmp_b_i =
581 buf[q].imag * w[m][k].real +
582 buf[q].real * w[m][k].imag;
583 buf[p].real = tmp_a_r + tmp_b_r;
584 buf[p].imag = tmp_a_i + tmp_b_i;
585 buf[q].real = tmp_a_r - tmp_b_r;
586 buf[q].imag = tmp_a_i - tmp_b_i;
587
588 tmp_a_r = buf[(p + 1)].real;
589 tmp_a_i = buf[(p + 1)].imag;
590 tmp_b_r =
591 buf[(q + 1)].real * w[m][(k + 1)].real -
592 buf[(q + 1)].imag * w[m][(k + 1)].imag;
593 tmp_b_i =
594 buf[(q + 1)].imag * w[m][(k + 1)].real +
595 buf[(q + 1)].real * w[m][(k + 1)].imag;
596 buf[(p + 1)].real = tmp_a_r + tmp_b_r;
597 buf[(p + 1)].imag = tmp_a_i + tmp_b_i;
598 buf[(q + 1)].real = tmp_a_r - tmp_b_r;
599 buf[(q + 1)].imag = tmp_a_i - tmp_b_i;
600 #else
601 int p = k + i;
602 int q = p + two_m;
603 vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
604 const vector float vczero = (const vector float)(0);
605 // first compute buf[q] and buf[q+1]
606 vecq = vec_ld(q << 3, (float*)buf);
607 vecw = vec_ld(0, (float*)&(w[m][k]));
608 temp1 = vec_madd(vecq, vecw, vczero);
609 temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
610 temp2 = vec_madd(temp2, vecw, vczero);
611 temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
612 temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
613 vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
614 // then butterfly with buf[p] and buf[p+1]
615 vecp = vec_ld(p << 3, (float*)buf);
616
617 temp1 = vec_add(vecp, vecq);
618 temp2 = vec_sub(vecp, vecq);
619
620 vec_st(temp1, p << 3, (float*)buf);
621 vec_st(temp2, q << 3, (float*)buf);
622 #endif
623 }
624 }
625 }
626
627 /* Post IFFT complex multiply plus IFFT complex conjugate*/
628 for( i=0; i < 128; i+=4) {
629 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
630 #if 0
631 tmp_a_r = buf[(i + 0)].real;
632 tmp_a_i = -1.0 * buf[(i + 0)].imag;
633 buf[(i + 0)].real =
634 (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);
635 buf[(i + 0)].imag =
636 (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);
637
638 tmp_a_r = buf[(i + 1)].real;
639 tmp_a_i = -1.0 * buf[(i + 1)].imag;
640 buf[(i + 1)].real =
641 (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);
642 buf[(i + 1)].imag =
643 (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);
644
645 tmp_a_r = buf[(i + 2)].real;
646 tmp_a_i = -1.0 * buf[(i + 2)].imag;
647 buf[(i + 2)].real =
648 (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);
649 buf[(i + 2)].imag =
650 (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);
651
652 tmp_a_r = buf[(i + 3)].real;
653 tmp_a_i = -1.0 * buf[(i + 3)].imag;
654 buf[(i + 3)].real =
655 (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);
656 buf[(i + 3)].imag =
657 (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);
658 #else
659 vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
660 vector float temp0022, temp1133, tempCS01;
661 const vector float vczero = (const vector float)(0);
662
663 bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
664 bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
665
666 cosv = vec_ld(i << 2, xcos1);
667 sinv = vec_ld(i << 2, xsin1);
668
669 temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
670 temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
671 tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
672 temp1 = vec_madd(temp0022, tempCS01, vczero);
673 tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
674 temp2 = vec_madd(temp1133, tempCS01, vczero);
675 bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
676
677 vec_st(bufv_0, (i + 0) << 3, (float*)buf);
678
679 /* idem with bufv_2 and high-order cosv/sinv */
680
681 temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
682 temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
683 tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
684 temp1 = vec_madd(temp0022, tempCS01, vczero);
685 tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
686 temp2 = vec_madd(temp1133, tempCS01, vczero);
687 bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
688
689 vec_st(bufv_2, (i + 2) << 3, (float*)buf);
690
691 #endif
692 }
693
694 data_ptr = data;
695 delay_ptr = delay;
696 window_ptr = imdct_window;
697
698 /* Window and convert to real valued signal */
699 for(i=0; i< 64; i++) {
700 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
701 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
702 }
703
704 for(i=0; i< 64; i++) {
705 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
706 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
707 }
708
709 /* The trailing edge of the window goes into the delay line */
710 delay_ptr = delay;
711
712 for(i=0; i< 64; i++) {
713 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
714 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
715 }
716
717 for(i=0; i<64; i++) {
718 *delay_ptr++ = buf[i].imag * *--window_ptr;
719 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
720 }
721 }
722 #endif
723
386 724
387 // Stuff below this line is borrowed from libac3 725 // Stuff below this line is borrowed from libac3
388 #include "srfftp.h" 726 #include "srfftp.h"
389 #ifdef ARCH_X86 727 #ifdef ARCH_X86
390 #ifndef HAVE_3DNOW 728 #ifndef HAVE_3DNOW
963 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n"); 1301 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
964 imdct_512 = imdct_do_512_3dnow; 1302 imdct_512 = imdct_do_512_3dnow;
965 } 1303 }
966 else 1304 else
967 #endif // arch_x86 1305 #endif // arch_x86
1306 #ifdef HAVE_ALTIVEC
1307 if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
1308 {
1309 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
1310 imdct_512 = imdct_do_512_altivec;
1311 }
1312 else
1313 #endif
968 fprintf (stderr, "No accelerated IMDCT transform found\n"); 1314 fprintf (stderr, "No accelerated IMDCT transform found\n");
969 imdct_256 = imdct_do_256; 1315 imdct_256 = imdct_do_256;
970 } 1316 }
971 } 1317 }
972 1318