comparison liba52/imdct.c @ 3527:5a88b21cfe8a

sse opt
author michael
date Sun, 16 Dec 2001 15:00:02 +0000
parents 1f166e420b15
children a86166b495a6
comparison
equal deleted inserted replaced
3526:cf787373f0aa 3527:5a88b21cfe8a
73 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f}; 73 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
74 74
75 #ifdef HAVE_SSE 75 #ifdef HAVE_SSE
76 // NOTE: SSE needs 16byte alignment or it will segfault 76 // NOTE: SSE needs 16byte alignment or it will segfault
77 static complex_t __attribute__((aligned(16))) buf[128]; 77 static complex_t __attribute__((aligned(16))) buf[128];
78 static float __attribute__((aligned(16))) sseSinCos1a[256];
79 static float __attribute__((aligned(16))) sseSinCos1b[256];
78 static float __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; 80 static float __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
79 #else 81 #else
80 static complex_t buf[128]; 82 static complex_t buf[128];
81 #endif 83 #endif
82 84
172 sample_t *window_ptr; 174 sample_t *window_ptr;
173 175
174 /* 512 IMDCT with source and dest data in 'data' */ 176 /* 512 IMDCT with source and dest data in 'data' */
175 177
176 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ 178 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
179 #ifdef HAVE_SSE
180 asm volatile(
181 "xorl %%esi, %%esi \n\t"
182 "movl $1008, %%edi \n\t"
183 "1: \n\t"
184 "movaps (%0, %%esi), %%xmm0 \n\t"
185 "movaps (%0, %%edi), %%xmm1 \n\t"
186 "shufps $0xA0, %%xmm0, %%xmm0 \n\t"
187 "shufps $0x5F, %%xmm1, %%xmm1 \n\t"
188 "mulps sseSinCos1a(%%esi), %%xmm0 \n\t"
189 "mulps sseSinCos1b(%%esi), %%xmm1 \n\t"
190 "addps %%xmm1, %%xmm0 \n\t"
191 "movaps %%xmm0, (%1, %%esi) \n\t"
192 "addl $16, %%esi \n\t"
193 "subl $16, %%edi \n\t"
194 " jnc 1b \n\t"
195 :: "r" (data), "r" (buf)
196 : "%esi", "%edi"
197 );
198 #else
177 for( i=0; i < 128; i++) { 199 for( i=0; i < 128; i++) {
178 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ 200 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
179 buf[i].real = (data[256-2*i-1] * xcos1[i]) - (data[2*i] * xsin1[i]); 201 buf[i].real = (data[256-2*i-1] * xcos1[i]) - (data[2*i] * xsin1[i]);
180 buf[i].imag = -1.0 * ((data[2*i] * xcos1[i]) + (data[256-2*i-1] * xsin1[i])); 202 buf[i].imag = -1.0 * ((data[2*i] * xcos1[i]) + (data[256-2*i-1] * xsin1[i]));
181 } 203 }
204 #endif
182 205
183 /* Bit reversed shuffling */ 206 /* Bit reversed shuffling */
184 for(i=0; i<128; i++) { 207 for(i=0; i<128; i++) {
185 k = bit_reverse_512[i]; 208 k = bit_reverse_512[i];
186 if (k < i) 209 if (k < i)
512 /* Twiddle factors to turn IFFT into IMDCT */ 535 /* Twiddle factors to turn IFFT into IMDCT */
513 for (i = 0; i < 128; i++) { 536 for (i = 0; i < 128; i++) {
514 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); 537 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
515 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); 538 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
516 } 539 }
540 #ifdef HAVE_SSE
541 for (i = 0; i < 128; i++) {
542 sseSinCos1a[2*i+0]= -xsin1[i];
543 sseSinCos1a[2*i+1]= -xcos1[i];
544 sseSinCos1b[2*i+0]= xcos1[i];
545 sseSinCos1b[2*i+1]= -xsin1[i];
546 }
547 #endif
517 548
518 /* More twiddle factors to turn IFFT into IMDCT */ 549 /* More twiddle factors to turn IFFT into IMDCT */
519 for (i = 0; i < 64; i++) { 550 for (i = 0; i < 64; i++) {
520 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1)); 551 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
521 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1)); 552 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));