10725
|
1 /*
|
|
2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
|
12527
|
3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
|
10725
|
4 **
|
|
5 ** This program is free software; you can redistribute it and/or modify
|
|
6 ** it under the terms of the GNU General Public License as published by
|
|
7 ** the Free Software Foundation; either version 2 of the License, or
|
|
8 ** (at your option) any later version.
|
|
9 **
|
|
10 ** This program is distributed in the hope that it will be useful,
|
|
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 ** GNU General Public License for more details.
|
|
14 **
|
|
15 ** You should have received a copy of the GNU General Public License
|
|
16 ** along with this program; if not, write to the Free Software
|
|
17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
18 **
|
|
19 ** Any non-GPL usage of this software or parts of this software is strictly
|
|
20 ** forbidden.
|
|
21 **
|
|
22 ** Commercial non-GPL licensing of this software is possible.
|
|
23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
|
|
24 **
|
12527
|
25 ** $Id: mdct.c,v 1.2 2003/10/03 22:22:27 alex Exp $
|
10725
|
26 **/
|
|
27
|
|
28 /*
|
|
29 * Fast (I)MDCT Implementation using (I)FFT ((Inverse) Fast Fourier Transform)
|
|
30 * and consists of three steps: pre-(I)FFT complex multiplication, complex
|
|
31 * (I)FFT, post-(I)FFT complex multiplication.
|
|
32 *
|
|
33 * As described in:
|
|
34 * P. Duhamel, Y. Mahieux, and J.P. Petit, "A Fast Algorithm for the
|
|
35 * Implementation of Filter Banks Based on 'Time Domain Aliasing
|
|
36 * Cancellation'," IEEE Proc. on ICASSP'91, 1991, pp. 2209-2212.
|
|
37 *
|
|
38 *
|
|
39 * As of April 6th 2002 completely rewritten.
|
|
40 * This (I)MDCT can now be used for any data size n, where n is divisible by 8.
|
|
41 *
|
|
42 */
|
|
43
|
|
44 #include "common.h"
|
|
45 #include "structs.h"
|
|
46
|
|
47 #include <stdlib.h>
|
|
48 #ifdef _WIN32_WCE
|
|
49 #define assert(x)
|
|
50 #else
|
|
51 #include <assert.h>
|
|
52 #endif
|
|
53
|
|
54 #include "cfft.h"
|
|
55 #include "mdct.h"
|
|
56
|
|
/* const_tab[]: one row per supported transform size N, five columns each:
     0: scale (the floating-point build computes sqrt(2 / N) instead; the
        rows below store 1, or sqrt(1024/960) for the 1920/960/240 sizes)
     1: cos(2 * PI / N)
     2: sin(2 * PI / N)
     3: cos(2 * PI * (1/8) / N)
     4: sin(2 * PI * (1/8) / N)
   Row order must match the indices returned by map_N_to_idx().
*/
#ifdef FIXED_POINT
real_t const_tab[][5] =
{
    /* N = 2048 */
    { COEF_CONST(1),
      FRAC_CONST(0.99999529380957619), FRAC_CONST(0.0030679567629659761),
      FRAC_CONST(0.99999992646571789), FRAC_CONST(0.00038349518757139556) },

    /* N = 1920 */
    { COEF_CONST(/* sqrt(1024/960) */ 1.0327955589886444),
      FRAC_CONST(0.99999464540169647), FRAC_CONST(0.0032724865065266251),
      FRAC_CONST(0.99999991633432805), FRAC_CONST(0.00040906153202803459) },

    /* N = 1024 */
    { COEF_CONST(1),
      FRAC_CONST(0.99998117528260111), FRAC_CONST(0.0061358846491544753),
      FRAC_CONST(0.99999970586288223), FRAC_CONST(0.00076699031874270449) },

    /* N = 960 */
    { COEF_CONST(/* sqrt(512/480) */ 1.0327955589886444),
      FRAC_CONST(0.99997858166412923), FRAC_CONST(0.0065449379673518581),
      FRAC_CONST(0.99999966533732598), FRAC_CONST(0.00081812299560725323) },

    /* N = 256 */
    { COEF_CONST(1),
      FRAC_CONST(0.99969881869620425), FRAC_CONST(0.024541228522912288),
      FRAC_CONST(0.99999529380957619), FRAC_CONST(0.0030679567629659761) },

    /* N = 240 */
    { COEF_CONST(/* sqrt(256/240) */ 1.0327955589886444),
      FRAC_CONST(0.99965732497555726), FRAC_CONST(0.026176948307873149),
      FRAC_CONST(0.99999464540169647), FRAC_CONST(0.0032724865065266251) }

#ifdef SSR_DEC
    ,
    /* N = 512 */
    { COEF_CONST(1),
      FRAC_CONST(0.9999247018391445), FRAC_CONST(0.012271538285719925),
      FRAC_CONST(0.99999882345170188), FRAC_CONST(0.0015339801862847655) },

    /* N = 64 */
    { COEF_CONST(1),
      FRAC_CONST(0.99518472667219693), FRAC_CONST(0.098017140329560604),
      FRAC_CONST(0.9999247018391445), FRAC_CONST(0.012271538285719925) }
#endif
};
#endif
|
|
121
|
12527
|
122 #ifdef FIXED_POINT
|
|
/*
 * Translate a transform size N into its row index in const_tab.
 *
 * For normal AAC decoding (e.g. no scalable profile) only index 0 and 4
 * will be used. Sizes that are not listed map to index 0.
 */
static uint8_t map_N_to_idx(uint16_t N)
{
    /* position in this list == row index in const_tab */
    static const uint16_t known_sizes[] = {
        2048, 1920, 1024, 960, 256, 240
#ifdef SSR_DEC
        , 512, 64
#endif
    };
    uint8_t idx;

    for (idx = 0; idx < sizeof(known_sizes) / sizeof(known_sizes[0]); idx++)
    {
        if (known_sizes[idx] == N)
            return idx;
    }

    /* unknown size: fall back to the first row */
    return 0;
}
|
12527
|
143 #endif
|
10725
|
144
|
|
145 mdct_info *faad_mdct_init(uint16_t N)
|
|
146 {
|
12527
|
147 uint16_t k;
|
|
148 #ifdef FIXED_POINT
|
|
149 uint16_t N_idx;
|
10725
|
150 real_t cangle, sangle, c, s, cold;
|
12527
|
151 #endif
|
10725
|
152 real_t scale;
|
|
153
|
12527
|
154 mdct_info *mdct = (mdct_info*)faad_malloc(sizeof(mdct_info));
|
10725
|
155
|
|
156 assert(N % 8 == 0);
|
|
157
|
|
158 mdct->N = N;
|
12527
|
159 mdct->sincos = (complex_t*)faad_malloc(N/4*sizeof(complex_t));
|
10725
|
160
|
12527
|
161 #ifdef FIXED_POINT
|
10725
|
162 N_idx = map_N_to_idx(N);
|
|
163
|
|
164 scale = const_tab[N_idx][0];
|
|
165 cangle = const_tab[N_idx][1];
|
|
166 sangle = const_tab[N_idx][2];
|
|
167 c = const_tab[N_idx][3];
|
|
168 s = const_tab[N_idx][4];
|
12527
|
169 #else
|
|
170 scale = (real_t)sqrt(2.0 / (real_t)N);
|
|
171 #endif
|
10725
|
172
|
10989
|
173 /* (co)sine table build using recurrence relations */
|
|
174 /* this can also be done using static table lookup or */
|
|
175 /* some form of interpolation */
|
10725
|
176 for (k = 0; k < N/4; k++)
|
|
177 {
|
12527
|
178 #ifdef FIXED_POINT
|
|
179 RE(mdct->sincos[k]) = c; //MUL_C_C(c,scale);
|
|
180 IM(mdct->sincos[k]) = s; //MUL_C_C(s,scale);
|
10725
|
181
|
|
182 cold = c;
|
12527
|
183 c = MUL_F(c,cangle) - MUL_F(s,sangle);
|
|
184 s = MUL_F(s,cangle) + MUL_F(cold,sangle);
|
10989
|
185 #else
|
|
186 /* no recurrence, just sines */
|
12527
|
187 RE(mdct->sincos[k]) = scale*(real_t)(cos(2.0*M_PI*(k+1./8.) / (real_t)N));
|
|
188 IM(mdct->sincos[k]) = scale*(real_t)(sin(2.0*M_PI*(k+1./8.) / (real_t)N));
|
10989
|
189 #endif
|
10725
|
190 }
|
|
191
|
|
192 /* initialise fft */
|
|
193 mdct->cfft = cffti(N/4);
|
|
194
|
12527
|
195 #ifdef PROFILE
|
|
196 mdct->cycles = 0;
|
|
197 mdct->fft_cycles = 0;
|
|
198 #endif
|
|
199
|
10725
|
200 return mdct;
|
|
201 }
|
|
202
|
|
203 void faad_mdct_end(mdct_info *mdct)
|
|
204 {
|
|
205 if (mdct != NULL)
|
|
206 {
|
12527
|
207 #ifdef PROFILE
|
|
208 printf("MDCT[%.4d]: %I64d cycles\n", mdct->N, mdct->cycles);
|
|
209 printf("CFFT[%.4d]: %I64d cycles\n", mdct->N/4, mdct->fft_cycles);
|
|
210 #endif
|
|
211
|
10725
|
212 cfftu(mdct->cfft);
|
|
213
|
12527
|
214 if (mdct->sincos) faad_free(mdct->sincos);
|
10725
|
215
|
12527
|
216 faad_free(mdct);
|
10725
|
217 }
|
|
218 }
|
|
219
|
|
220 void faad_imdct(mdct_info *mdct, real_t *X_in, real_t *X_out)
|
|
221 {
|
|
222 uint16_t k;
|
|
223
|
|
224 complex_t x;
|
12527
|
225 ALIGN complex_t Z1[512];
|
|
226 complex_t *sincos = mdct->sincos;
|
|
227
|
|
228 uint16_t N = mdct->N;
|
|
229 uint16_t N2 = N >> 1;
|
|
230 uint16_t N4 = N >> 2;
|
|
231 uint16_t N8 = N >> 3;
|
|
232
|
|
233 #ifdef PROFILE
|
|
234 int64_t count1, count2 = faad_get_ts();
|
|
235 #endif
|
|
236
|
|
237 /* pre-IFFT complex multiplication */
|
|
238 for (k = 0; k < N4; k++)
|
|
239 {
|
|
240 ComplexMult(&IM(Z1[k]), &RE(Z1[k]),
|
|
241 X_in[2*k], X_in[N2 - 1 - 2*k], RE(sincos[k]), IM(sincos[k]));
|
|
242 }
|
|
243
|
|
244 #ifdef PROFILE
|
|
245 count1 = faad_get_ts();
|
|
246 #endif
|
|
247
|
|
248 /* complex IFFT, any non-scaling FFT can be used here */
|
|
249 cfftb(mdct->cfft, Z1);
|
|
250
|
|
251 #ifdef PROFILE
|
|
252 count1 = faad_get_ts() - count1;
|
|
253 #endif
|
|
254
|
|
255 /* post-IFFT complex multiplication */
|
|
256 for (k = 0; k < N4; k++)
|
|
257 {
|
|
258 RE(x) = RE(Z1[k]);
|
|
259 IM(x) = IM(Z1[k]);
|
|
260 ComplexMult(&IM(Z1[k]), &RE(Z1[k]),
|
|
261 IM(x), RE(x), RE(sincos[k]), IM(sincos[k]));
|
|
262 }
|
|
263
|
|
264 /* reordering */
|
|
265 for (k = 0; k < N8; k+=2)
|
|
266 {
|
|
267 X_out[ 2*k] = IM(Z1[N8 + k]);
|
|
268 X_out[ 2 + 2*k] = IM(Z1[N8 + 1 + k]);
|
|
269
|
|
270 X_out[ 1 + 2*k] = -RE(Z1[N8 - 1 - k]);
|
|
271 X_out[ 3 + 2*k] = -RE(Z1[N8 - 2 - k]);
|
|
272
|
|
273 X_out[N4 + 2*k] = RE(Z1[ k]);
|
|
274 X_out[N4 + + 2 + 2*k] = RE(Z1[ 1 + k]);
|
|
275
|
|
276 X_out[N4 + 1 + 2*k] = -IM(Z1[N4 - 1 - k]);
|
|
277 X_out[N4 + 3 + 2*k] = -IM(Z1[N4 - 2 - k]);
|
|
278
|
|
279 X_out[N2 + 2*k] = RE(Z1[N8 + k]);
|
|
280 X_out[N2 + + 2 + 2*k] = RE(Z1[N8 + 1 + k]);
|
|
281
|
|
282 X_out[N2 + 1 + 2*k] = -IM(Z1[N8 - 1 - k]);
|
|
283 X_out[N2 + 3 + 2*k] = -IM(Z1[N8 - 2 - k]);
|
|
284
|
|
285 X_out[N2 + N4 + 2*k] = -IM(Z1[ k]);
|
|
286 X_out[N2 + N4 + 2 + 2*k] = -IM(Z1[ 1 + k]);
|
|
287
|
|
288 X_out[N2 + N4 + 1 + 2*k] = RE(Z1[N4 - 1 - k]);
|
|
289 X_out[N2 + N4 + 3 + 2*k] = RE(Z1[N4 - 2 - k]);
|
|
290 }
|
|
291
|
|
292 #ifdef PROFILE
|
|
293 count2 = faad_get_ts() - count2;
|
|
294 mdct->fft_cycles += count1;
|
|
295 mdct->cycles += (count2 - count1);
|
|
296 #endif
|
|
297 }
|
|
298
|
|
299 #ifdef USE_SSE
|
|
300 void faad_imdct_sse(mdct_info *mdct, real_t *X_in, real_t *X_out)
|
|
301 {
|
|
302 uint16_t k;
|
|
303
|
|
304 ALIGN complex_t Z1[512];
|
10725
|
305 complex_t *sincos = mdct->sincos;
|
|
306
|
|
307 uint16_t N = mdct->N;
|
|
308 uint16_t N2 = N >> 1;
|
|
309 uint16_t N4 = N >> 2;
|
|
310 uint16_t N8 = N >> 3;
|
|
311
|
12527
|
312 #ifdef PROFILE
|
|
313 int64_t count1, count2 = faad_get_ts();
|
|
314 #endif
|
|
315
|
10725
|
316 /* pre-IFFT complex multiplication */
|
12527
|
317 for (k = 0; k < N4; k+=4)
|
10725
|
318 {
|
12527
|
319 __m128 m12, m13, m14, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11;
|
|
320 __m128 n12, n13, n14, n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11;
|
|
321 n12 = _mm_load_ps(&X_in[N2 - 2*k - 8]);
|
|
322 m12 = _mm_load_ps(&X_in[N2 - 2*k - 4]);
|
|
323 m13 = _mm_load_ps(&X_in[2*k]);
|
|
324 n13 = _mm_load_ps(&X_in[2*k + 4]);
|
|
325 m1 = _mm_load_ps(&RE(sincos[k]));
|
|
326 n1 = _mm_load_ps(&RE(sincos[k+2]));
|
|
327
|
|
328 m0 = _mm_shuffle_ps(m12, m13, _MM_SHUFFLE(2,0,1,3));
|
|
329 m2 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(2,3,0,1));
|
|
330 m14 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3,1,2,0));
|
|
331 n0 = _mm_shuffle_ps(n12, n13, _MM_SHUFFLE(2,0,1,3));
|
|
332 n2 = _mm_shuffle_ps(n1, n1, _MM_SHUFFLE(2,3,0,1));
|
|
333 n14 = _mm_shuffle_ps(n0, n0, _MM_SHUFFLE(3,1,2,0));
|
|
334
|
|
335 m3 = _mm_mul_ps(m14, m1);
|
|
336 n3 = _mm_mul_ps(n14, n1);
|
|
337 m4 = _mm_mul_ps(m14, m2);
|
|
338 n4 = _mm_mul_ps(n14, n2);
|
|
339
|
|
340 m5 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(2,0,2,0));
|
|
341 n5 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(2,0,2,0));
|
|
342 m6 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(3,1,3,1));
|
|
343 n6 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(3,1,3,1));
|
|
344
|
|
345 m7 = _mm_add_ps(m5, m6);
|
|
346 n7 = _mm_add_ps(n5, n6);
|
|
347 m8 = _mm_sub_ps(m5, m6);
|
|
348 n8 = _mm_sub_ps(n5, n6);
|
|
349
|
|
350 m9 = _mm_shuffle_ps(m7, m7, _MM_SHUFFLE(3,2,3,2));
|
|
351 n9 = _mm_shuffle_ps(n7, n7, _MM_SHUFFLE(3,2,3,2));
|
|
352 m10 = _mm_shuffle_ps(m8, m8, _MM_SHUFFLE(1,0,1,0));
|
|
353 n10 = _mm_shuffle_ps(n8, n8, _MM_SHUFFLE(1,0,1,0));
|
|
354
|
|
355 m11 = _mm_unpacklo_ps(m10, m9);
|
|
356 n11 = _mm_unpacklo_ps(n10, n9);
|
|
357
|
|
358 _mm_store_ps(&RE(Z1[k]), m11);
|
|
359 _mm_store_ps(&RE(Z1[k+2]), n11);
|
10725
|
360 }
|
|
361
|
12527
|
362 #ifdef PROFILE
|
|
363 count1 = faad_get_ts();
|
|
364 #endif
|
|
365
|
10989
|
366 /* complex IFFT, any non-scaling FFT can be used here */
|
12527
|
367 cfftb_sse(mdct->cfft, Z1);
|
|
368
|
|
369 #ifdef PROFILE
|
|
370 count1 = faad_get_ts() - count1;
|
|
371 #endif
|
10725
|
372
|
|
373 /* post-IFFT complex multiplication */
|
12527
|
374 for (k = 0; k < N4; k+=4)
|
10725
|
375 {
|
12527
|
376 __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11;
|
|
377 __m128 n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11;
|
|
378 m0 = _mm_load_ps(&RE(Z1[k]));
|
|
379 n0 = _mm_load_ps(&RE(Z1[k+2]));
|
|
380 m1 = _mm_load_ps(&RE(sincos[k]));
|
|
381 n1 = _mm_load_ps(&RE(sincos[k+2]));
|
|
382
|
|
383 m2 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(2,3,0,1));
|
|
384 n2 = _mm_shuffle_ps(n1, n1, _MM_SHUFFLE(2,3,0,1));
|
|
385
|
|
386 m3 = _mm_mul_ps(m0, m1);
|
|
387 n3 = _mm_mul_ps(n0, n1);
|
|
388 m4 = _mm_mul_ps(m0, m2);
|
|
389 n4 = _mm_mul_ps(n0, n2);
|
10725
|
390
|
12527
|
391 m5 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(2,0,2,0));
|
|
392 n5 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(2,0,2,0));
|
|
393 m6 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(3,1,3,1));
|
|
394 n6 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(3,1,3,1));
|
|
395
|
|
396 m7 = _mm_add_ps(m5, m6);
|
|
397 n7 = _mm_add_ps(n5, n6);
|
|
398 m8 = _mm_sub_ps(m5, m6);
|
|
399 n8 = _mm_sub_ps(n5, n6);
|
|
400
|
|
401 m9 = _mm_shuffle_ps(m7, m7, _MM_SHUFFLE(3,2,3,2));
|
|
402 n9 = _mm_shuffle_ps(n7, n7, _MM_SHUFFLE(3,2,3,2));
|
|
403 m10 = _mm_shuffle_ps(m8, m8, _MM_SHUFFLE(1,0,1,0));
|
|
404 n10 = _mm_shuffle_ps(n8, n8, _MM_SHUFFLE(1,0,1,0));
|
|
405
|
|
406 m11 = _mm_unpacklo_ps(m10, m9);
|
|
407 n11 = _mm_unpacklo_ps(n10, n9);
|
|
408
|
|
409 _mm_store_ps(&RE(Z1[k]), m11);
|
|
410 _mm_store_ps(&RE(Z1[k+2]), n11);
|
10725
|
411 }
|
|
412
|
|
413 /* reordering */
|
12527
|
414 for (k = 0; k < N8; k+=2)
|
10725
|
415 {
|
12527
|
416 __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m13;
|
|
417 __m128 n4, n5, n6, n7, n8, n9;
|
|
418 __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0);
|
|
419 __m128 neg2 = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
|
|
420
|
|
421 m0 = _mm_load_ps(&RE(Z1[k]));
|
|
422 m1 = _mm_load_ps(&RE(Z1[N8 - 2 - k]));
|
|
423 m2 = _mm_load_ps(&RE(Z1[N8 + k]));
|
|
424 m3 = _mm_load_ps(&RE(Z1[N4 - 2 - k]));
|
|
425
|
|
426 m10 = _mm_mul_ps(m0, neg1);
|
|
427 m11 = _mm_mul_ps(m1, neg2);
|
|
428 m13 = _mm_mul_ps(m3, neg1);
|
|
429
|
|
430 m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3,1,2,0));
|
|
431 n4 = _mm_shuffle_ps(m10, m10, _MM_SHUFFLE(3,1,2,0));
|
|
432 m4 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(3,1,2,0));
|
|
433 n5 = _mm_shuffle_ps(m13, m13, _MM_SHUFFLE(3,1,2,0));
|
|
434
|
|
435 m6 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(3,2,1,0));
|
|
436 n6 = _mm_shuffle_ps(n4, n5, _MM_SHUFFLE(3,2,1,0));
|
|
437 m7 = _mm_shuffle_ps(m5, m4, _MM_SHUFFLE(3,2,1,0));
|
|
438 n7 = _mm_shuffle_ps(n5, n4, _MM_SHUFFLE(3,2,1,0));
|
|
439
|
|
440 m8 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0,3,1,2));
|
|
441 n8 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2,1,3,0));
|
|
442 m9 = _mm_shuffle_ps(m7, m7, _MM_SHUFFLE(2,1,3,0));
|
|
443 n9 = _mm_shuffle_ps(n7, n7, _MM_SHUFFLE(0,3,1,2));
|
|
444
|
|
445 _mm_store_ps(&X_out[2*k], m8);
|
|
446 _mm_store_ps(&X_out[N4 + 2*k], n8);
|
|
447 _mm_store_ps(&X_out[N2 + 2*k], m9);
|
|
448 _mm_store_ps(&X_out[N2 + N4 + 2*k], n9);
|
10725
|
449 }
|
12527
|
450
|
|
451 #ifdef PROFILE
|
|
452 count2 = faad_get_ts() - count2;
|
|
453 mdct->fft_cycles += count1;
|
|
454 mdct->cycles += (count2 - count1);
|
|
455 #endif
|
10725
|
456 }
|
12527
|
457 #endif
|
10725
|
458
|
|
459 #ifdef LTP_DEC
|
|
460 void faad_mdct(mdct_info *mdct, real_t *X_in, real_t *X_out)
|
|
461 {
|
|
462 uint16_t k;
|
|
463
|
|
464 complex_t x;
|
12527
|
465 ALIGN complex_t Z1[512];
|
10725
|
466 complex_t *sincos = mdct->sincos;
|
|
467
|
|
468 uint16_t N = mdct->N;
|
|
469 uint16_t N2 = N >> 1;
|
|
470 uint16_t N4 = N >> 2;
|
|
471 uint16_t N8 = N >> 3;
|
|
472
|
12527
|
473 #ifndef FIXED_POINT
|
10725
|
474 real_t scale = REAL_CONST(N);
|
12527
|
475 #else
|
|
476 real_t scale = REAL_CONST(4.0/N);
|
|
477 #endif
|
10725
|
478
|
|
479 /* pre-FFT complex multiplication */
|
|
480 for (k = 0; k < N8; k++)
|
|
481 {
|
|
482 uint16_t n = k << 1;
|
|
483 RE(x) = X_in[N - N4 - 1 - n] + X_in[N - N4 + n];
|
|
484 IM(x) = X_in[ N4 + n] - X_in[ N4 - 1 - n];
|
|
485
|
12527
|
486 ComplexMult(&RE(Z1[k]), &IM(Z1[k]),
|
|
487 RE(x), IM(x), RE(sincos[k]), IM(sincos[k]));
|
|
488
|
|
489 RE(Z1[k]) = MUL_R(RE(Z1[k]), scale);
|
|
490 IM(Z1[k]) = MUL_R(IM(Z1[k]), scale);
|
10725
|
491
|
|
492 RE(x) = X_in[N2 - 1 - n] - X_in[ n];
|
|
493 IM(x) = X_in[N2 + n] + X_in[N - 1 - n];
|
|
494
|
12527
|
495 ComplexMult(&RE(Z1[k + N8]), &IM(Z1[k + N8]),
|
|
496 RE(x), IM(x), RE(sincos[k + N8]), IM(sincos[k + N8]));
|
|
497
|
|
498 RE(Z1[k + N8]) = MUL_R(RE(Z1[k + N8]), scale);
|
|
499 IM(Z1[k + N8]) = MUL_R(IM(Z1[k + N8]), scale);
|
10725
|
500 }
|
|
501
|
10989
|
502 /* complex FFT, any non-scaling FFT can be used here */
|
10725
|
503 cfftf(mdct->cfft, Z1);
|
|
504
|
|
505 /* post-FFT complex multiplication */
|
|
506 for (k = 0; k < N4; k++)
|
|
507 {
|
|
508 uint16_t n = k << 1;
|
12527
|
509 ComplexMult(&RE(x), &IM(x),
|
|
510 RE(Z1[k]), IM(Z1[k]), RE(sincos[k]), IM(sincos[k]));
|
10725
|
511
|
12527
|
512 X_out[ n] = -RE(x);
|
|
513 X_out[N2 - 1 - n] = IM(x);
|
|
514 X_out[N2 + n] = -IM(x);
|
|
515 X_out[N - 1 - n] = RE(x);
|
10725
|
516 }
|
|
517 }
|
|
518 #endif
|