Mercurial > mplayer.hg
comparison liba52/imdct.c @ 3553:a501627fc6db
sse opt
author | michael |
---|---|
date | Mon, 17 Dec 2001 03:30:08 +0000 |
parents | 9ff2e3801027 |
children | 831860fada69 |
Comparison legend: equal, deleted, inserted, replaced
3552:9ff2e3801027 | 3553:a501627fc6db |
---|---|
85 static float __attribute__((aligned(16))) sseW4[64]; | 85 static float __attribute__((aligned(16))) sseW4[64]; |
86 static float __attribute__((aligned(16))) sseW5[128]; | 86 static float __attribute__((aligned(16))) sseW5[128]; |
87 static float __attribute__((aligned(16))) sseW6[256]; | 87 static float __attribute__((aligned(16))) sseW6[256]; |
88 static float __attribute__((aligned(16))) *sseW[7]= | 88 static float __attribute__((aligned(16))) *sseW[7]= |
89 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; | 89 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; |
90 static float __attribute__((aligned(16))) sseWindow[256]; | 90 static float __attribute__((aligned(16))) sseWindow[512]; |
91 #else | 91 #else |
92 static complex_t buf[128]; | 92 static complex_t buf[128]; |
93 #endif | 93 #endif |
94 | 94 |
95 /* Twiddle factor LUT */ | 95 /* Twiddle factor LUT */ |
513 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | 513 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) |
514 : "%esi", "%edi" | 514 : "%esi", "%edi" |
515 ); | 515 ); |
516 data_ptr+=128; | 516 data_ptr+=128; |
517 delay_ptr+=128; | 517 delay_ptr+=128; |
518 window_ptr+=128; | 518 // window_ptr+=128; |
519 #else | 519 #else |
520 for(i=0; i< 64; i++) { | 520 for(i=0; i< 64; i++) { |
521 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; | 521 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; |
522 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | 522 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; |
523 } | 523 } |
546 " jb 1b \n\t" | 546 " jb 1b \n\t" |
547 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | 547 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) |
548 : "%esi", "%edi" | 548 : "%esi", "%edi" |
549 ); | 549 ); |
550 data_ptr+=128; | 550 data_ptr+=128; |
551 window_ptr+=128; | 551 // window_ptr+=128; |
552 #else | 552 #else |
553 for(i=0; i< 64; i++) { | 553 for(i=0; i< 64; i++) { |
554 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; | 554 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; |
555 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | 555 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; |
556 } | 556 } |
557 #endif | 557 #endif |
558 | 558 |
559 /* The trailing edge of the window goes into the delay line */ | 559 /* The trailing edge of the window goes into the delay line */ |
560 delay_ptr = delay; | 560 delay_ptr = delay; |
561 | 561 |
562 #ifdef HAVE_SSE | |
563 asm volatile( | |
564 "xorl %%edi, %%edi \n\t" // 0 | |
565 "xorl %%esi, %%esi \n\t" // 0 | |
566 ".balign 16 \n\t" | |
567 "1: \n\t" | |
568 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | |
569 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | |
570 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | |
571 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | |
572 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | |
573 "mulps 1024+sseWindow(%%esi), %%xmm0 \n\t" | |
574 "movaps %%xmm0, (%1, %%esi) \n\t" | |
575 "addl $16, %%esi \n\t" | |
576 "subl $16, %%edi \n\t" | |
577 "cmpl $512, %%esi \n\t" | |
578 " jb 1b \n\t" | |
579 :: "r" (buf+64), "r" (delay_ptr) | |
580 : "%esi", "%edi" | |
581 ); | |
582 delay_ptr+=128; | |
583 // window_ptr-=128; | |
584 #else | |
562 for(i=0; i< 64; i++) { | 585 for(i=0; i< 64; i++) { |
563 *delay_ptr++ = -buf[64+i].real * *--window_ptr; | 586 *delay_ptr++ = -buf[64+i].real * *--window_ptr; |
564 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; | 587 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; |
565 } | 588 } |
566 | 589 #endif |
590 | |
591 #ifdef HAVE_SSE | |
592 asm volatile( | |
593 "movl $1024, %%edi \n\t" // 1024 | |
594 "xorl %%esi, %%esi \n\t" // 0 | |
595 ".balign 16 \n\t" | |
596 "1: \n\t" | |
597 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | |
598 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | |
599 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | |
600 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | |
601 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | |
602 "mulps 1536+sseWindow(%%esi), %%xmm0 \n\t" | |
603 "movaps %%xmm0, (%1, %%esi) \n\t" | |
604 "addl $16, %%esi \n\t" | |
605 "subl $16, %%edi \n\t" | |
606 "cmpl $512, %%esi \n\t" | |
607 " jb 1b \n\t" | |
608 :: "r" (buf), "r" (delay_ptr) | |
609 : "%esi", "%edi" | |
610 ); | |
611 #else | |
567 for(i=0; i<64; i++) { | 612 for(i=0; i<64; i++) { |
568 *delay_ptr++ = buf[i].imag * *--window_ptr; | 613 *delay_ptr++ = buf[i].imag * *--window_ptr; |
569 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; | 614 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; |
570 } | 615 } |
616 #endif | |
571 } | 617 } |
572 | 618 |
573 void | 619 void |
574 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias) | 620 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias) |
575 { | 621 { |
770 } | 816 } |
771 | 817 |
772 for(i=0; i<128; i++) | 818 for(i=0; i<128; i++) |
773 { | 819 { |
774 sseWindow[2*i+0]= -imdct_window[2*i+0]; | 820 sseWindow[2*i+0]= -imdct_window[2*i+0]; |
775 sseWindow[2*i+1]= imdct_window[2*i+1]; | 821 sseWindow[2*i+1]= imdct_window[2*i+1]; |
776 } | 822 } |
777 | 823 |
824 for(i=0; i<64; i++) | |
825 { | |
826 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1]; | |
827 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0]; | |
828 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1]; | |
829 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0]; | |
830 } | |
778 #endif | 831 #endif |
779 | 832 |
780 imdct_512 = imdct_do_512; | 833 imdct_512 = imdct_do_512; |
781 imdct_256 = imdct_do_256; | 834 imdct_256 = imdct_do_256; |
782 } | 835 } |