comparison liba52/imdct.c @ 3553:a501627fc6db

sse opt
author michael
date Mon, 17 Dec 2001 03:30:08 +0000
parents 9ff2e3801027
children 831860fada69
comparison
equal deleted inserted replaced
3552:9ff2e3801027 3553:a501627fc6db
85 static float __attribute__((aligned(16))) sseW4[64]; 85 static float __attribute__((aligned(16))) sseW4[64];
86 static float __attribute__((aligned(16))) sseW5[128]; 86 static float __attribute__((aligned(16))) sseW5[128];
87 static float __attribute__((aligned(16))) sseW6[256]; 87 static float __attribute__((aligned(16))) sseW6[256];
88 static float __attribute__((aligned(16))) *sseW[7]= 88 static float __attribute__((aligned(16))) *sseW[7]=
89 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; 89 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
90 static float __attribute__((aligned(16))) sseWindow[256]; 90 static float __attribute__((aligned(16))) sseWindow[512];
91 #else 91 #else
92 static complex_t buf[128]; 92 static complex_t buf[128];
93 #endif 93 #endif
94 94
95 /* Twiddle factor LUT */ 95 /* Twiddle factor LUT */
513 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) 513 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
514 : "%esi", "%edi" 514 : "%esi", "%edi"
515 ); 515 );
516 data_ptr+=128; 516 data_ptr+=128;
517 delay_ptr+=128; 517 delay_ptr+=128;
518 window_ptr+=128; 518 // window_ptr+=128;
519 #else 519 #else
520 for(i=0; i< 64; i++) { 520 for(i=0; i< 64; i++) {
521 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; 521 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
522 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; 522 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
523 } 523 }
546 " jb 1b \n\t" 546 " jb 1b \n\t"
547 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) 547 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
548 : "%esi", "%edi" 548 : "%esi", "%edi"
549 ); 549 );
550 data_ptr+=128; 550 data_ptr+=128;
551 window_ptr+=128; 551 // window_ptr+=128;
552 #else 552 #else
553 for(i=0; i< 64; i++) { 553 for(i=0; i< 64; i++) {
554 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; 554 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
555 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; 555 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
556 } 556 }
557 #endif 557 #endif
558 558
559 /* The trailing edge of the window goes into the delay line */ 559 /* The trailing edge of the window goes into the delay line */
560 delay_ptr = delay; 560 delay_ptr = delay;
561 561
562 #ifdef HAVE_SSE
563 asm volatile(
564 "xorl %%edi, %%edi \n\t" // 0
565 "xorl %%esi, %%esi \n\t" // 0
566 ".balign 16 \n\t"
567 "1: \n\t"
568 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
569 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
570 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
571 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
572 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
573 "mulps 1024+sseWindow(%%esi), %%xmm0 \n\t"
574 "movaps %%xmm0, (%1, %%esi) \n\t"
575 "addl $16, %%esi \n\t"
576 "subl $16, %%edi \n\t"
577 "cmpl $512, %%esi \n\t"
578 " jb 1b \n\t"
579 :: "r" (buf+64), "r" (delay_ptr)
580 : "%esi", "%edi"
581 );
582 delay_ptr+=128;
583 // window_ptr-=128;
584 #else
562 for(i=0; i< 64; i++) { 585 for(i=0; i< 64; i++) {
563 *delay_ptr++ = -buf[64+i].real * *--window_ptr; 586 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
564 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; 587 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
565 } 588 }
566 589 #endif
590
591 #ifdef HAVE_SSE
592 asm volatile(
593 "movl $1024, %%edi \n\t" // 1024
594 "xorl %%esi, %%esi \n\t" // 0
595 ".balign 16 \n\t"
596 "1: \n\t"
597 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
598 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
599 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
600 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
601 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
602 "mulps 1536+sseWindow(%%esi), %%xmm0 \n\t"
603 "movaps %%xmm0, (%1, %%esi) \n\t"
604 "addl $16, %%esi \n\t"
605 "subl $16, %%edi \n\t"
606 "cmpl $512, %%esi \n\t"
607 " jb 1b \n\t"
608 :: "r" (buf), "r" (delay_ptr)
609 : "%esi", "%edi"
610 );
611 #else
567 for(i=0; i<64; i++) { 612 for(i=0; i<64; i++) {
568 *delay_ptr++ = buf[i].imag * *--window_ptr; 613 *delay_ptr++ = buf[i].imag * *--window_ptr;
569 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; 614 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
570 } 615 }
616 #endif
571 } 617 }
572 618
573 void 619 void
574 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias) 620 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
575 { 621 {
770 } 816 }
771 817
772 for(i=0; i<128; i++) 818 for(i=0; i<128; i++)
773 { 819 {
774 sseWindow[2*i+0]= -imdct_window[2*i+0]; 820 sseWindow[2*i+0]= -imdct_window[2*i+0];
775 sseWindow[2*i+1]= imdct_window[2*i+1]; 821 sseWindow[2*i+1]= imdct_window[2*i+1];
776 } 822 }
777 823
824 for(i=0; i<64; i++)
825 {
826 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
827 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
828 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
829 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
830 }
778 #endif 831 #endif
779 832
780 imdct_512 = imdct_do_512; 833 imdct_512 = imdct_do_512;
781 imdct_256 = imdct_do_256; 834 imdct_256 = imdct_do_256;
782 } 835 }