Mercurial > mplayer.hg
comparison liba52/imdct.c @ 3527:5a88b21cfe8a
sse opt
author | michael |
---|---|
date | Sun, 16 Dec 2001 15:00:02 +0000 |
parents | 1f166e420b15 |
children | a86166b495a6 |
Comparison legend: equal, deleted, inserted, replaced
3526:cf787373f0aa | 3527:5a88b21cfe8a |
---|---|
73 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f}; | 73 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f}; |
74 | 74 |
75 #ifdef HAVE_SSE | 75 #ifdef HAVE_SSE |
76 // NOTE: SSE needs 16byte alignment or it will segfault | 76 // NOTE: SSE needs 16byte alignment or it will segfault |
77 static complex_t __attribute__((aligned(16))) buf[128]; | 77 static complex_t __attribute__((aligned(16))) buf[128]; |
78 static float __attribute__((aligned(16))) sseSinCos1a[256]; | |
79 static float __attribute__((aligned(16))) sseSinCos1b[256]; | |
78 static float __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; | 80 static float __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; |
79 #else | 81 #else |
80 static complex_t buf[128]; | 82 static complex_t buf[128]; |
81 #endif | 83 #endif |
82 | 84 |
172 sample_t *window_ptr; | 174 sample_t *window_ptr; |
173 | 175 |
174 /* 512 IMDCT with source and dest data in 'data' */ | 176 /* 512 IMDCT with source and dest data in 'data' */ |
175 | 177 |
176 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ | 178 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ |
179 #ifdef HAVE_SSE | |
180 asm volatile( | |
181 "xorl %%esi, %%esi \n\t" | |
182 "movl $1008, %%edi \n\t" | |
183 "1: \n\t" | |
184 "movaps (%0, %%esi), %%xmm0 \n\t" | |
185 "movaps (%0, %%edi), %%xmm1 \n\t" | |
186 "shufps $0xA0, %%xmm0, %%xmm0 \n\t" | |
187 "shufps $0x5F, %%xmm1, %%xmm1 \n\t" | |
188 "mulps sseSinCos1a(%%esi), %%xmm0 \n\t" | |
189 "mulps sseSinCos1b(%%esi), %%xmm1 \n\t" | |
190 "addps %%xmm1, %%xmm0 \n\t" | |
191 "movaps %%xmm0, (%1, %%esi) \n\t" | |
192 "addl $16, %%esi \n\t" | |
193 "subl $16, %%edi \n\t" | |
194 " jnc 1b \n\t" | |
195 :: "r" (data), "r" (buf) | |
196 : "%esi", "%edi" | |
197 ); | |
198 #else | |
177 for( i=0; i < 128; i++) { | 199 for( i=0; i < 128; i++) { |
178 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ | 200 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ |
179 buf[i].real = (data[256-2*i-1] * xcos1[i]) - (data[2*i] * xsin1[i]); | 201 buf[i].real = (data[256-2*i-1] * xcos1[i]) - (data[2*i] * xsin1[i]); |
180 buf[i].imag = -1.0 * ((data[2*i] * xcos1[i]) + (data[256-2*i-1] * xsin1[i])); | 202 buf[i].imag = -1.0 * ((data[2*i] * xcos1[i]) + (data[256-2*i-1] * xsin1[i])); |
181 } | 203 } |
204 #endif | |
182 | 205 |
183 /* Bit reversed shuffling */ | 206 /* Bit reversed shuffling */ |
184 for(i=0; i<128; i++) { | 207 for(i=0; i<128; i++) { |
185 k = bit_reverse_512[i]; | 208 k = bit_reverse_512[i]; |
186 if (k < i) | 209 if (k < i) |
512 /* Twiddle factors to turn IFFT into IMDCT */ | 535 /* Twiddle factors to turn IFFT into IMDCT */ |
513 for (i = 0; i < 128; i++) { | 536 for (i = 0; i < 128; i++) { |
514 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); | 537 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); |
515 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); | 538 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); |
516 } | 539 } |
540 #ifdef HAVE_SSE | |
541 for (i = 0; i < 128; i++) { | |
542 sseSinCos1a[2*i+0]= -xsin1[i]; | |
543 sseSinCos1a[2*i+1]= -xcos1[i]; | |
544 sseSinCos1b[2*i+0]= xcos1[i]; | |
545 sseSinCos1b[2*i+1]= -xsin1[i]; | |
546 } | |
547 #endif | |
517 | 548 |
518 /* More twiddle factors to turn IFFT into IMDCT */ | 549 /* More twiddle factors to turn IFFT into IMDCT */ |
519 for (i = 0; i < 64; i++) { | 550 for (i = 0; i < 64; i++) { |
520 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1)); | 551 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1)); |
521 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1)); | 552 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1)); |