# HG changeset patch # User michael # Date 1008555990 0 # Node ID 9ff2e380102749a2a04435ea9f6f76b67616b16e # Parent 314987c0aaf4806eed7f0b67d193ea3a1e3676b7 sse opt diff -r 314987c0aaf4 -r 9ff2e3801027 liba52/imdct.c --- a/liba52/imdct.c Mon Dec 17 01:58:24 2001 +0000 +++ b/liba52/imdct.c Mon Dec 17 02:26:30 2001 +0000 @@ -87,6 +87,7 @@ static float __attribute__((aligned(16))) sseW6[256]; static float __attribute__((aligned(16))) *sseW[7]= {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; +static float __attribute__((aligned(16))) sseWindow[256]; #else static complex_t buf[128]; #endif @@ -488,15 +489,72 @@ window_ptr = imdct_window; /* Window and convert to real valued signal */ +#ifdef HAVE_SSE + asm volatile( + "xorl %%edi, %%edi \n\t" // 0 + "xorl %%esi, %%esi \n\t" // 0 + "movss %3, %%xmm2 \n\t" // bias + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... + ".balign 16 \n\t" + "1: \n\t" + "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? + "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? + "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? + "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? + "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A + "mulps sseWindow(%%esi), %%xmm0 \n\t" + "addps (%2, %%esi), %%xmm0 \n\t" + "addps %%xmm2, %%xmm0 \n\t" + "movaps %%xmm0, (%1, %%esi) \n\t" + "addl $16, %%esi \n\t" + "subl $16, %%edi \n\t" + "cmpl $512, %%esi \n\t" + " jb 1b \n\t" + :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) + : "%esi", "%edi" + ); + data_ptr+=128; + delay_ptr+=128; + window_ptr+=128; +#else for(i=0; i< 64; i++) { *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; } +#endif +#ifdef HAVE_SSE + asm volatile( + "movl $1024, %%edi \n\t" // 1024 = byte offset one past buf[127] + "xorl %%esi, %%esi \n\t" // 0 + "movss %3, %%xmm2 \n\t" // bias + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... + ".balign 16 \n\t" + "1: \n\t" + "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? 
A + "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C + "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C + "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A + "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A + "mulps 512+sseWindow(%%esi), %%xmm0 \n\t" + "addps (%2, %%esi), %%xmm0 \n\t" + "addps %%xmm2, %%xmm0 \n\t" + "movaps %%xmm0, (%1, %%esi) \n\t" + "addl $16, %%esi \n\t" + "subl $16, %%edi \n\t" + "cmpl $512, %%esi \n\t" + " jb 1b \n\t" + :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) + : "%esi", "%edi" + ); + data_ptr+=128; + window_ptr+=128; +#else for(i=0; i< 64; i++) { *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; } +#endif /* The trailing edge of the window goes into the delay line */ delay_ptr = delay; @@ -710,6 +768,13 @@ } } } + + for(i=0; i<128; i++) + { + sseWindow[2*i+0]= -imdct_window[2*i+0]; + sseWindow[2*i+1]= imdct_window[2*i+1]; + } + #endif imdct_512 = imdct_do_512;