10725
|
1 /*
|
|
2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
|
12527
|
3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
|
|
4 **
|
10725
|
5 ** This program is free software; you can redistribute it and/or modify
|
|
6 ** it under the terms of the GNU General Public License as published by
|
|
7 ** the Free Software Foundation; either version 2 of the License, or
|
|
8 ** (at your option) any later version.
|
12527
|
9 **
|
10725
|
10 ** This program is distributed in the hope that it will be useful,
|
|
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 ** GNU General Public License for more details.
|
12527
|
14 **
|
10725
|
15 ** You should have received a copy of the GNU General Public License
|
12527
|
16 ** along with this program; if not, write to the Free Software
|
10725
|
17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
18 **
|
|
19 ** Any non-GPL usage of this software or parts of this software is strictly
|
|
20 ** forbidden.
|
|
21 **
|
|
22 ** Commercial non-GPL licensing of this software is possible.
|
|
23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
|
|
24 **
|
12527
|
25 ** $Id: cfft.c,v 1.2 2003/10/03 22:22:27 alex Exp $
|
10725
|
26 **/
|
|
27
|
|
28 /*
|
|
29 * Algorithmically based on Fortran-77 FFTPACK
|
|
30 * by Paul N. Swarztrauber(Version 4, 1985).
|
|
31 *
|
|
32 * Does even sized fft only
|
|
33 */
|
|
34
|
|
35 /* isign is +1 for backward and -1 for forward transforms */
|
|
36
|
|
37 #include "common.h"
|
|
38 #include "structs.h"
|
|
39
|
|
40 #include <stdlib.h>
|
|
41
|
|
42 #include "cfft.h"
|
|
43 #include "cfft_tab.h"
|
|
44
|
|
45
|
12527
|
46 /* static function declarations */
|
|
47 #ifdef USE_SSE
|
|
48 static void passf2pos_sse(const uint16_t l1, const complex_t *cc,
|
|
49 complex_t *ch, const complex_t *wa);
|
|
50 static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
51 complex_t *ch, const complex_t *wa);
|
|
52 static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
|
|
53 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
|
|
54 #endif
|
|
55 static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
56 complex_t *ch, const complex_t *wa);
|
|
57 static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
58 complex_t *ch, const complex_t *wa);
|
|
59 static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
60 complex_t *ch, const complex_t *wa1, const complex_t *wa2, const int8_t isign);
|
|
61 static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
|
|
62 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
|
|
63 static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
|
|
64 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
|
|
65 static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
|
|
66 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,
|
|
67 const complex_t *wa4, const int8_t isign);
|
|
68 INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
|
|
69 const uint16_t *ifac, const complex_t *wa, const int8_t isign);
|
|
70 static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac);
|
|
71
|
|
72
|
10725
|
73 /*----------------------------------------------------------------------
|
|
74 passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd.
|
|
75 ----------------------------------------------------------------------*/
|
|
76
|
12527
|
77 #if 0 //def USE_SSE
|
|
78 static void passf2pos_sse(const uint16_t l1, const complex_t *cc,
|
|
79 complex_t *ch, const complex_t *wa)
|
|
80 {
|
|
81 uint16_t k, ah, ac;
|
|
82
|
|
83 for (k = 0; k < l1; k++)
|
|
84 {
|
|
85 ah = 2*k;
|
|
86 ac = 4*k;
|
|
87
|
|
88 RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
|
|
89 IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
|
|
90
|
|
91 RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
|
|
92 IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
|
|
93 }
|
|
94 }
|
|
95
|
|
96 static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
97 complex_t *ch, const complex_t *wa)
|
|
98 {
|
|
99 uint16_t i, k, ah, ac;
|
|
100
|
|
101 for (k = 0; k < l1; k++)
|
|
102 {
|
|
103 ah = k*ido;
|
|
104 ac = 2*k*ido;
|
|
105
|
|
106 for (i = 0; i < ido; i+=4)
|
|
107 {
|
|
108 __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;
|
|
109 __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;
|
|
110 __m128 w1, w2, w3, w4;
|
|
111
|
|
112 m1 = _mm_load_ps(&RE(cc[ac+i]));
|
|
113 m2 = _mm_load_ps(&RE(cc[ac+ido+i]));
|
|
114 m5 = _mm_load_ps(&RE(cc[ac+i+2]));
|
|
115 m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));
|
|
116 w1 = _mm_load_ps(&RE(wa[i]));
|
|
117 w3 = _mm_load_ps(&RE(wa[i+2]));
|
|
118
|
|
119 m3 = _mm_add_ps(m1, m2);
|
|
120 m15 = _mm_add_ps(m5, m6);
|
|
121
|
|
122 m4 = _mm_sub_ps(m1, m2);
|
|
123 m16 = _mm_sub_ps(m5, m6);
|
|
124
|
|
125 _mm_store_ps(&RE(ch[ah+i]), m3);
|
|
126 _mm_store_ps(&RE(ch[ah+i+2]), m15);
|
|
127
|
|
128
|
|
129 w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
|
|
130 w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
|
|
131
|
|
132 m7 = _mm_mul_ps(m4, w1);
|
|
133 m17 = _mm_mul_ps(m16, w3);
|
|
134 m8 = _mm_mul_ps(m4, w2);
|
|
135 m18 = _mm_mul_ps(m16, w4);
|
|
136
|
|
137 m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
|
|
138 m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));
|
|
139 m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
|
|
140 m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));
|
|
141
|
|
142 m11 = _mm_add_ps(m9, m10);
|
|
143 m21 = _mm_add_ps(m19, m20);
|
|
144 m12 = _mm_sub_ps(m9, m10);
|
|
145 m22 = _mm_sub_ps(m19, m20);
|
|
146
|
|
147 m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
|
|
148 m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));
|
|
149
|
|
150 m14 = _mm_unpacklo_ps(m12, m13);
|
|
151 m24 = _mm_unpacklo_ps(m22, m23);
|
|
152
|
|
153 _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
|
|
154 _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);
|
|
155 }
|
|
156 }
|
|
157 }
|
|
158 #endif
|
|
159
|
|
160 static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
161 complex_t *ch, const complex_t *wa)
|
10725
|
162 {
|
|
163 uint16_t i, k, ah, ac;
|
|
164
|
|
165 if (ido == 1)
|
|
166 {
|
|
167 for (k = 0; k < l1; k++)
|
|
168 {
|
|
169 ah = 2*k;
|
|
170 ac = 4*k;
|
|
171
|
12527
|
172 RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
|
|
173 RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
|
|
174 IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
|
|
175 IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
|
|
176 }
|
|
177 } else {
|
|
178 for (k = 0; k < l1; k++)
|
|
179 {
|
|
180 ah = k*ido;
|
|
181 ac = 2*k*ido;
|
|
182
|
|
183 for (i = 0; i < ido; i++)
|
|
184 {
|
|
185 complex_t t2;
|
|
186
|
|
187 RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
|
|
188 RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
|
|
189
|
|
190 IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
|
|
191 IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
|
|
192
|
|
193 #if 1
|
|
194 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
|
|
195 IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
|
|
196 #else
|
|
197 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
|
|
198 RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
|
|
199 #endif
|
|
200 }
|
|
201 }
|
|
202 }
|
|
203 }
|
|
204
|
|
205 static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
206 complex_t *ch, const complex_t *wa)
|
|
207 {
|
|
208 uint16_t i, k, ah, ac;
|
|
209
|
|
210 if (ido == 1)
|
|
211 {
|
|
212 for (k = 0; k < l1; k++)
|
|
213 {
|
|
214 ah = 2*k;
|
|
215 ac = 4*k;
|
|
216
|
|
217 RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
|
10725
|
218 RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
|
10989
|
219 IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
|
10725
|
220 IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
|
|
221 }
|
|
222 } else {
|
|
223 for (k = 0; k < l1; k++)
|
|
224 {
|
|
225 ah = k*ido;
|
|
226 ac = 2*k*ido;
|
|
227
|
|
228 for (i = 0; i < ido; i++)
|
|
229 {
|
|
230 complex_t t2;
|
|
231
|
10989
|
232 RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
|
|
233 RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
|
10725
|
234
|
10989
|
235 IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
|
|
236 IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
|
10725
|
237
|
12527
|
238 #if 1
|
|
239 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
|
|
240 RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
|
|
241 #else
|
|
242 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
|
|
243 IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
|
|
244 #endif
|
10725
|
245 }
|
|
246 }
|
|
247 }
|
|
248 }
|
|
249
|
|
250
|
12527
|
251 static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
252 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
|
|
253 const int8_t isign)
|
10725
|
254 {
|
12527
|
255 static real_t taur = FRAC_CONST(-0.5);
|
|
256 static real_t taui = FRAC_CONST(0.866025403784439);
|
10725
|
257 uint16_t i, k, ac, ah;
|
|
258 complex_t c2, c3, d2, d3, t2;
|
|
259
|
|
260 if (ido == 1)
|
|
261 {
|
12527
|
262 if (isign == 1)
|
10725
|
263 {
|
12527
|
264 for (k = 0; k < l1; k++)
|
|
265 {
|
|
266 ac = 3*k+1;
|
|
267 ah = k;
|
10725
|
268
|
12527
|
269 RE(t2) = RE(cc[ac]) + RE(cc[ac+1]);
|
|
270 IM(t2) = IM(cc[ac]) + IM(cc[ac+1]);
|
|
271 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),taur);
|
|
272 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),taur);
|
|
273
|
|
274 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2);
|
|
275 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2);
|
|
276
|
|
277 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+1])), taui);
|
|
278 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+1])), taui);
|
10725
|
279
|
12527
|
280 RE(ch[ah+l1]) = RE(c2) - IM(c3);
|
|
281 IM(ch[ah+l1]) = IM(c2) + RE(c3);
|
|
282 RE(ch[ah+2*l1]) = RE(c2) + IM(c3);
|
|
283 IM(ch[ah+2*l1]) = IM(c2) - RE(c3);
|
|
284 }
|
|
285 } else {
|
|
286 for (k = 0; k < l1; k++)
|
|
287 {
|
|
288 ac = 3*k+1;
|
|
289 ah = k;
|
10725
|
290
|
12527
|
291 RE(t2) = RE(cc[ac]) + RE(cc[ac+1]);
|
|
292 IM(t2) = IM(cc[ac]) + IM(cc[ac+1]);
|
|
293 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),taur);
|
|
294 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),taur);
|
|
295
|
|
296 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2);
|
|
297 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2);
|
10725
|
298
|
12527
|
299 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+1])), taui);
|
|
300 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+1])), taui);
|
|
301
|
|
302 RE(ch[ah+l1]) = RE(c2) + IM(c3);
|
|
303 IM(ch[ah+l1]) = IM(c2) - RE(c3);
|
|
304 RE(ch[ah+2*l1]) = RE(c2) - IM(c3);
|
|
305 IM(ch[ah+2*l1]) = IM(c2) + RE(c3);
|
|
306 }
|
10725
|
307 }
|
|
308 } else {
|
12527
|
309 if (isign == 1)
|
10725
|
310 {
|
12527
|
311 for (k = 0; k < l1; k++)
|
10725
|
312 {
|
12527
|
313 for (i = 0; i < ido; i++)
|
|
314 {
|
|
315 ac = i + (3*k+1)*ido;
|
|
316 ah = i + k * ido;
|
|
317
|
|
318 RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]);
|
|
319 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur);
|
|
320 IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]);
|
|
321 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur);
|
10725
|
322
|
12527
|
323 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2);
|
|
324 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2);
|
|
325
|
|
326 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui);
|
|
327 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), taui);
|
|
328
|
|
329 RE(d2) = RE(c2) - IM(c3);
|
|
330 IM(d3) = IM(c2) - RE(c3);
|
|
331 RE(d3) = RE(c2) + IM(c3);
|
|
332 IM(d2) = IM(c2) + RE(c3);
|
10725
|
333
|
12527
|
334 #if 1
|
|
335 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
|
|
336 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
|
|
337 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
|
|
338 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
|
|
339 #else
|
|
340 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
|
|
341 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
|
|
342 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
|
|
343 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
|
|
344 #endif
|
|
345 }
|
|
346 }
|
|
347 } else {
|
|
348 for (k = 0; k < l1; k++)
|
|
349 {
|
|
350 for (i = 0; i < ido; i++)
|
|
351 {
|
|
352 ac = i + (3*k+1)*ido;
|
|
353 ah = i + k * ido;
|
10725
|
354
|
12527
|
355 RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]);
|
|
356 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur);
|
|
357 IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]);
|
|
358 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur);
|
|
359
|
|
360 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2);
|
|
361 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2);
|
|
362
|
|
363 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui);
|
|
364 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), taui);
|
10725
|
365
|
12527
|
366 RE(d2) = RE(c2) + IM(c3);
|
|
367 IM(d3) = IM(c2) + RE(c3);
|
|
368 RE(d3) = RE(c2) - IM(c3);
|
|
369 IM(d2) = IM(c2) - RE(c3);
|
|
370
|
|
371 #if 1
|
|
372 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
|
|
373 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
|
|
374 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
|
|
375 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
|
|
376 #else
|
|
377 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
|
|
378 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
|
|
379 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
|
|
380 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
|
|
381 #endif
|
|
382 }
|
10725
|
383 }
|
|
384 }
|
|
385 }
|
|
386 }
|
|
387
|
12527
|
388 #ifdef USE_SSE
|
|
389 ALIGN static const int32_t negate[4] = { 0x0, 0x0, 0x0, 0x80000000 };
|
10725
|
390
|
12527
|
391 __declspec(naked) static void passf4pos_sse(const uint16_t l1, const complex_t *cc,
|
|
392 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
|
|
393 const complex_t *wa3)
|
|
394 {
|
|
395 __asm {
|
|
396 push ebx
|
|
397 mov ebx, esp
|
|
398 and esp, -16
|
|
399 push edi
|
|
400 push esi
|
|
401 sub esp, 8
|
|
402 movzx edi, WORD PTR [ebx+8]
|
|
403
|
|
404 movaps xmm1, XMMWORD PTR negate
|
|
405
|
|
406 test edi, edi
|
|
407 jle l1_is_zero
|
|
408
|
|
409 lea esi, DWORD PTR [edi+edi]
|
|
410 add esi, esi
|
|
411 sub esi, edi
|
|
412 add esi, esi
|
|
413 add esi, esi
|
|
414 add esi, esi
|
|
415 mov eax, DWORD PTR [ebx+16]
|
|
416 add esi, eax
|
|
417 lea ecx, DWORD PTR [edi+edi]
|
|
418 add ecx, ecx
|
|
419 add ecx, ecx
|
|
420 add ecx, ecx
|
|
421 add ecx, eax
|
|
422 lea edx, DWORD PTR [edi+edi]
|
|
423 add edx, edx
|
|
424 add edx, edx
|
|
425 add edx, eax
|
|
426 xor eax, eax
|
|
427 mov DWORD PTR [esp], ebp
|
|
428 mov ebp, DWORD PTR [ebx+12]
|
|
429
|
|
430 fftloop:
|
|
431 lea edi, DWORD PTR [eax+eax]
|
|
432 add edi, edi
|
|
433 movaps xmm2, XMMWORD PTR [ebp+edi*8]
|
|
434 movaps xmm0, XMMWORD PTR [ebp+edi*8+16]
|
|
435 movaps xmm7, XMMWORD PTR [ebp+edi*8+32]
|
|
436 movaps xmm5, XMMWORD PTR [ebp+edi*8+48]
|
|
437 movaps xmm6, xmm2
|
|
438 addps xmm6, xmm0
|
|
439 movaps xmm4, xmm1
|
|
440 xorps xmm4, xmm7
|
|
441 movaps xmm3, xmm1
|
|
442 xorps xmm3, xmm5
|
|
443 xorps xmm2, xmm1
|
|
444 xorps xmm0, xmm1
|
|
445 addps xmm7, xmm5
|
|
446 subps xmm2, xmm0
|
|
447 movaps xmm0, xmm6
|
|
448 shufps xmm0, xmm7, 68
|
|
449 subps xmm4, xmm3
|
|
450 shufps xmm6, xmm7, 238
|
|
451 movaps xmm5, xmm2
|
|
452 shufps xmm5, xmm4, 68
|
|
453 movaps xmm3, xmm0
|
|
454 addps xmm3, xmm6
|
|
455 shufps xmm2, xmm4, 187
|
|
456 subps xmm0, xmm6
|
|
457 movaps xmm4, xmm5
|
|
458 addps xmm4, xmm2
|
|
459 mov edi, DWORD PTR [ebx+16]
|
|
460 movaps XMMWORD PTR [edi+eax*8], xmm3
|
|
461 subps xmm5, xmm2
|
|
462 movaps XMMWORD PTR [edx+eax*8], xmm4
|
|
463 movaps XMMWORD PTR [ecx+eax*8], xmm0
|
|
464 movaps XMMWORD PTR [esi+eax*8], xmm5
|
|
465 add eax, 2
|
|
466 movzx eax, ax
|
|
467 movzx edi, WORD PTR [ebx+8]
|
|
468 cmp eax, edi
|
|
469 jl fftloop
|
|
470
|
|
471 mov ebp, DWORD PTR [esp]
|
|
472
|
|
473 l1_is_zero:
|
|
474
|
|
475 add esp, 8
|
|
476 pop esi
|
|
477 pop edi
|
|
478 mov esp, ebx
|
|
479 pop ebx
|
|
480 ret
|
|
481 }
|
|
482 }
|
|
483 #endif
|
|
484
|
|
485 #if 0
|
|
486 static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
487 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
|
|
488 const complex_t *wa3)
|
|
489 {
|
|
490 uint16_t i, k, ac, ah;
|
|
491
|
|
492 for (k = 0; k < l1; k++)
|
|
493 {
|
|
494 ac = 4*k*ido;
|
|
495 ah = k*ido;
|
|
496
|
|
497 for (i = 0; i < ido; i+=2)
|
|
498 {
|
|
499 __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16;
|
|
500 __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23;
|
|
501 __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30;
|
|
502 __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0);
|
|
503
|
|
504 m1 = _mm_load_ps(&RE(cc[ac+i]));
|
|
505 m2 = _mm_load_ps(&RE(cc[ac+i+2*ido]));
|
|
506 m3 = _mm_add_ps(m1, m2);
|
|
507 m4 = _mm_sub_ps(m1, m2);
|
|
508
|
|
509 n1 = _mm_load_ps(&RE(cc[ac+i+ido]));
|
|
510 n2 = _mm_load_ps(&RE(cc[ac+i+3*ido]));
|
|
511 n3 = _mm_add_ps(n1, n2);
|
|
512
|
|
513 n4 = _mm_mul_ps(neg1, n1);
|
|
514 n5 = _mm_mul_ps(neg1, n2);
|
|
515 n6 = _mm_sub_ps(n4, n5);
|
|
516
|
|
517 m5 = _mm_add_ps(m3, n3);
|
|
518
|
|
519 n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1));
|
|
520 n8 = _mm_add_ps(m4, n7);
|
|
521
|
|
522 m6 = _mm_sub_ps(m3, n3);
|
|
523 n9 = _mm_sub_ps(m4, n7);
|
|
524
|
|
525 _mm_store_ps(&RE(ch[ah+i]), m5);
|
|
526
|
|
527 #if 0
|
|
528 static INLINE void ComplexMult(real_t *y1, real_t *y2,
|
|
529 real_t x1, real_t x2, real_t c1, real_t c2)
|
|
530 {
|
|
531 *y1 = MUL_F(x1, c1) + MUL_F(x2, c2);
|
|
532 *y2 = MUL_F(x2, c1) - MUL_F(x1, c2);
|
|
533 }
|
|
534
|
|
535 m7.0 = RE(c2)*RE(wa1[i])
|
|
536 m7.1 = IM(c2)*IM(wa1[i])
|
|
537 m7.2 = RE(c6)*RE(wa1[i+1])
|
|
538 m7.3 = IM(c6)*IM(wa1[i+1])
|
|
539
|
|
540 m8.0 = RE(c2)*IM(wa1[i])
|
|
541 m8.1 = IM(c2)*RE(wa1[i])
|
|
542 m8.2 = RE(c6)*IM(wa1[i+1])
|
|
543 m8.3 = IM(c6)*RE(wa1[i+1])
|
|
544
|
|
545 RE(0) = m7.0 - m7.1
|
|
546 IM(0) = m8.0 + m8.1
|
|
547 RE(1) = m7.2 - m7.3
|
|
548 IM(1) = m8.2 + m8.3
|
|
549
|
|
550 ////
|
|
551 RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i])
|
|
552 IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i])
|
|
553 RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1])
|
|
554 IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1])
|
|
555 #endif
|
|
556
|
|
557 w1 = _mm_load_ps(&RE(wa1[i]));
|
|
558 w3 = _mm_load_ps(&RE(wa2[i]));
|
|
559 w5 = _mm_load_ps(&RE(wa3[i]));
|
|
560
|
|
561 w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
|
|
562 w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
|
|
563 w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1));
|
|
564
|
|
565 m7 = _mm_mul_ps(n8, w1);
|
|
566 m15 = _mm_mul_ps(m6, w3);
|
|
567 m23 = _mm_mul_ps(n9, w5);
|
|
568 m8 = _mm_mul_ps(n8, w2);
|
|
569 m16 = _mm_mul_ps(m6, w4);
|
|
570 m24 = _mm_mul_ps(n9, w6);
|
|
571
|
|
572 m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
|
|
573 m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0));
|
|
574 m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0));
|
|
575 m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
|
|
576 m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1));
|
|
577 m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1));
|
|
578
|
|
579 m11 = _mm_add_ps(m9, m10);
|
|
580 m19 = _mm_add_ps(m17, m18);
|
|
581 m27 = _mm_add_ps(m25, m26);
|
|
582 m12 = _mm_sub_ps(m9, m10);
|
|
583 m20 = _mm_sub_ps(m17, m18);
|
|
584 m28 = _mm_sub_ps(m25, m26);
|
|
585
|
|
586 m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
|
|
587 m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2));
|
|
588 m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2));
|
|
589 m14 = _mm_unpacklo_ps(m12, m13);
|
|
590 m22 = _mm_unpacklo_ps(m20, m21);
|
|
591 m30 = _mm_unpacklo_ps(m28, m29);
|
|
592
|
|
593 _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
|
|
594 _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22);
|
|
595 _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30);
|
|
596 }
|
|
597 }
|
|
598 }
|
|
599 #endif
|
|
600
|
|
601 static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
602 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
|
|
603 const complex_t *wa3)
|
10725
|
604 {
|
|
605 uint16_t i, k, ac, ah;
|
|
606
|
|
607 if (ido == 1)
|
|
608 {
|
|
609 for (k = 0; k < l1; k++)
|
|
610 {
|
10989
|
611 complex_t t1, t2, t3, t4;
|
|
612
|
10725
|
613 ac = 4*k;
|
|
614 ah = k;
|
|
615
|
12527
|
616 RE(t2) = RE(cc[ac]) + RE(cc[ac+2]);
|
|
617 RE(t1) = RE(cc[ac]) - RE(cc[ac+2]);
|
10989
|
618 IM(t2) = IM(cc[ac]) + IM(cc[ac+2]);
|
12527
|
619 IM(t1) = IM(cc[ac]) - IM(cc[ac+2]);
|
10725
|
620 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
|
10989
|
621 IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
|
|
622 IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
|
10725
|
623 RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
|
|
624
|
12527
|
625 RE(ch[ah]) = RE(t2) + RE(t3);
|
10725
|
626 RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
|
10989
|
627
|
|
628 IM(ch[ah]) = IM(t2) + IM(t3);
|
10725
|
629 IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
|
10989
|
630
|
12527
|
631 RE(ch[ah+l1]) = RE(t1) + RE(t4);
|
|
632 RE(ch[ah+3*l1]) = RE(t1) - RE(t4);
|
10989
|
633
|
12527
|
634 IM(ch[ah+l1]) = IM(t1) + IM(t4);
|
|
635 IM(ch[ah+3*l1]) = IM(t1) - IM(t4);
|
10725
|
636 }
|
|
637 } else {
|
|
638 for (k = 0; k < l1; k++)
|
|
639 {
|
10989
|
640 ac = 4*k*ido;
|
|
641 ah = k*ido;
|
|
642
|
10725
|
643 for (i = 0; i < ido; i++)
|
|
644 {
|
10989
|
645 complex_t c2, c3, c4, t1, t2, t3, t4;
|
10725
|
646
|
10989
|
647 RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
|
|
648 RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
|
|
649 IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
|
|
650 IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
|
|
651 RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
|
|
652 IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
|
|
653 IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
|
|
654 RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
|
10725
|
655
|
12527
|
656 RE(c2) = RE(t1) + RE(t4);
|
|
657 RE(c4) = RE(t1) - RE(t4);
|
10989
|
658
|
12527
|
659 IM(c2) = IM(t1) + IM(t4);
|
|
660 IM(c4) = IM(t1) - IM(t4);
|
10725
|
661
|
10989
|
662 RE(ch[ah+i]) = RE(t2) + RE(t3);
|
12527
|
663 RE(c3) = RE(t2) - RE(t3);
|
10989
|
664
|
|
665 IM(ch[ah+i]) = IM(t2) + IM(t3);
|
12527
|
666 IM(c3) = IM(t2) - IM(t3);
|
10989
|
667
|
12527
|
668 #if 1
|
|
669 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
|
|
670 IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
|
|
671 ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
|
|
672 IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
|
|
673 ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
|
|
674 IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
|
|
675 #else
|
|
676 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
|
|
677 RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
|
|
678 ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
|
|
679 RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
|
|
680 ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
|
|
681 RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
|
|
682 #endif
|
10725
|
683 }
|
|
684 }
|
|
685 }
|
|
686 }
|
|
687
|
12527
|
688 static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
689 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
|
|
690 const complex_t *wa3)
|
10725
|
691 {
|
|
692 uint16_t i, k, ac, ah;
|
|
693
|
|
694 if (ido == 1)
|
|
695 {
|
|
696 for (k = 0; k < l1; k++)
|
|
697 {
|
12527
|
698 complex_t t1, t2, t3, t4;
|
|
699
|
|
700 ac = 4*k;
|
10725
|
701 ah = k;
|
|
702
|
12527
|
703 RE(t2) = RE(cc[ac]) + RE(cc[ac+2]);
|
|
704 RE(t1) = RE(cc[ac]) - RE(cc[ac+2]);
|
|
705 IM(t2) = IM(cc[ac]) + IM(cc[ac+2]);
|
|
706 IM(t1) = IM(cc[ac]) - IM(cc[ac+2]);
|
|
707 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
|
|
708 IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
|
|
709 IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
|
|
710 RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
|
10725
|
711
|
12527
|
712 RE(ch[ah]) = RE(t2) + RE(t3);
|
|
713 RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
|
|
714
|
|
715 IM(ch[ah]) = IM(t2) + IM(t3);
|
|
716 IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
|
10725
|
717
|
12527
|
718 RE(ch[ah+l1]) = RE(t1) - RE(t4);
|
|
719 RE(ch[ah+3*l1]) = RE(t1) + RE(t4);
|
|
720
|
|
721 IM(ch[ah+l1]) = IM(t1) - IM(t4);
|
|
722 IM(ch[ah+3*l1]) = IM(t1) + IM(t4);
|
10725
|
723 }
|
|
724 } else {
|
|
725 for (k = 0; k < l1; k++)
|
|
726 {
|
12527
|
727 ac = 4*k*ido;
|
|
728 ah = k*ido;
|
|
729
|
10725
|
730 for (i = 0; i < ido; i++)
|
|
731 {
|
12527
|
732 complex_t c2, c3, c4, t1, t2, t3, t4;
|
|
733
|
|
734 RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
|
|
735 RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
|
|
736 IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
|
|
737 IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
|
|
738 RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
|
|
739 IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
|
|
740 IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
|
|
741 RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
|
|
742
|
|
743 RE(c2) = RE(t1) - RE(t4);
|
|
744 RE(c4) = RE(t1) + RE(t4);
|
|
745
|
|
746 IM(c2) = IM(t1) - IM(t4);
|
|
747 IM(c4) = IM(t1) + IM(t4);
|
|
748
|
|
749 RE(ch[ah+i]) = RE(t2) + RE(t3);
|
|
750 RE(c3) = RE(t2) - RE(t3);
|
|
751
|
|
752 IM(ch[ah+i]) = IM(t2) + IM(t3);
|
|
753 IM(c3) = IM(t2) - IM(t3);
|
|
754
|
|
755 #if 1
|
|
756 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
|
|
757 RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
|
|
758 ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
|
|
759 RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
|
|
760 ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
|
|
761 RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
|
|
762 #else
|
|
763 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
|
|
764 IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
|
|
765 ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
|
|
766 IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
|
|
767 ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
|
|
768 IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
|
|
769 #endif
|
|
770 }
|
|
771 }
|
|
772 }
|
|
773 }
|
|
774
|
|
775 static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc,
|
|
776 complex_t *ch, const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,
|
|
777 const complex_t *wa4, const int8_t isign)
|
|
778 {
|
|
779 static real_t tr11 = FRAC_CONST(0.309016994374947);
|
|
780 static real_t ti11 = FRAC_CONST(0.951056516295154);
|
|
781 static real_t tr12 = FRAC_CONST(-0.809016994374947);
|
|
782 static real_t ti12 = FRAC_CONST(0.587785252292473);
|
|
783 uint16_t i, k, ac, ah;
|
|
784 complex_t c2, c3, c4, c5, d3, d4, d5, d2, t2, t3, t4, t5;
|
10725
|
785
|
12527
|
786 if (ido == 1)
|
|
787 {
|
|
788 if (isign == 1)
|
|
789 {
|
|
790 for (k = 0; k < l1; k++)
|
|
791 {
|
|
792 ac = 5*k + 1;
|
|
793 ah = k;
|
|
794
|
|
795 RE(t2) = RE(cc[ac]) + RE(cc[ac+3]);
|
|
796 IM(t2) = IM(cc[ac]) + IM(cc[ac+3]);
|
|
797 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]);
|
|
798 IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]);
|
|
799 RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]);
|
|
800 IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]);
|
|
801 RE(t5) = RE(cc[ac]) - RE(cc[ac+3]);
|
|
802 IM(t5) = IM(cc[ac]) - IM(cc[ac+3]);
|
|
803
|
|
804 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3);
|
|
805 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3);
|
|
806
|
|
807 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
|
|
808 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
|
|
809 RE(c3) = RE(cc[ac-1]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
|
|
810 IM(c3) = IM(cc[ac-1]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
|
|
811
|
|
812 ComplexMult(&RE(c5), &RE(c4),
|
|
813 ti11, ti12, RE(t5), RE(t4));
|
|
814 ComplexMult(&IM(c5), &IM(c4),
|
|
815 ti11, ti12, IM(t5), IM(t4));
|
10725
|
816
|
12527
|
817 RE(ch[ah+l1]) = RE(c2) - IM(c5);
|
|
818 IM(ch[ah+l1]) = IM(c2) + RE(c5);
|
|
819 RE(ch[ah+2*l1]) = RE(c3) - IM(c4);
|
|
820 IM(ch[ah+2*l1]) = IM(c3) + RE(c4);
|
|
821 RE(ch[ah+3*l1]) = RE(c3) + IM(c4);
|
|
822 IM(ch[ah+3*l1]) = IM(c3) - RE(c4);
|
|
823 RE(ch[ah+4*l1]) = RE(c2) + IM(c5);
|
|
824 IM(ch[ah+4*l1]) = IM(c2) - RE(c5);
|
|
825 }
|
|
826 } else {
|
|
827 for (k = 0; k < l1; k++)
|
|
828 {
|
|
829 ac = 5*k + 1;
|
|
830 ah = k;
|
|
831
|
|
832 RE(t2) = RE(cc[ac]) + RE(cc[ac+3]);
|
|
833 IM(t2) = IM(cc[ac]) + IM(cc[ac+3]);
|
|
834 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]);
|
|
835 IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]);
|
|
836 RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]);
|
|
837 IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]);
|
|
838 RE(t5) = RE(cc[ac]) - RE(cc[ac+3]);
|
|
839 IM(t5) = IM(cc[ac]) - IM(cc[ac+3]);
|
|
840
|
|
841 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3);
|
|
842 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3);
|
|
843
|
|
844 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
|
|
845 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
|
|
846 RE(c3) = RE(cc[ac-1]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
|
|
847 IM(c3) = IM(cc[ac-1]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
|
|
848
|
|
849 ComplexMult(&RE(c4), &RE(c5),
|
|
850 ti12, ti11, RE(t5), RE(t4));
|
|
851 ComplexMult(&IM(c4), &IM(c5),
|
|
852 ti12, ti12, IM(t5), IM(t4));
|
10725
|
853
|
12527
|
854 RE(ch[ah+l1]) = RE(c2) + IM(c5);
|
|
855 IM(ch[ah+l1]) = IM(c2) - RE(c5);
|
|
856 RE(ch[ah+2*l1]) = RE(c3) + IM(c4);
|
|
857 IM(ch[ah+2*l1]) = IM(c3) - RE(c4);
|
|
858 RE(ch[ah+3*l1]) = RE(c3) - IM(c4);
|
|
859 IM(ch[ah+3*l1]) = IM(c3) + RE(c4);
|
|
860 RE(ch[ah+4*l1]) = RE(c2) - IM(c5);
|
|
861 IM(ch[ah+4*l1]) = IM(c2) + RE(c5);
|
|
862 }
|
|
863 }
|
|
864 } else {
|
|
865 if (isign == 1)
|
|
866 {
|
|
867 for (k = 0; k < l1; k++)
|
|
868 {
|
|
869 for (i = 0; i < ido; i++)
|
|
870 {
|
|
871 ac = i + (k*5 + 1) * ido;
|
|
872 ah = i + k * ido;
|
|
873
|
|
874 RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]);
|
|
875 IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]);
|
|
876 RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]);
|
|
877 IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]);
|
|
878 RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]);
|
|
879 IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]);
|
|
880 RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]);
|
|
881 IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]);
|
|
882
|
|
883 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3);
|
|
884 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3);
|
|
885
|
|
886 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
|
|
887 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
|
|
888 RE(c3) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
|
|
889 IM(c3) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
|
|
890
|
|
891 ComplexMult(&RE(c5), &RE(c4),
|
|
892 ti11, ti12, RE(t5), RE(t4));
|
|
893 ComplexMult(&IM(c5), &IM(c4),
|
|
894 ti11, ti12, IM(t5), IM(t4));
|
|
895
|
|
896 IM(d2) = IM(c2) + RE(c5);
|
|
897 IM(d3) = IM(c3) + RE(c4);
|
|
898 RE(d4) = RE(c3) + IM(c4);
|
|
899 RE(d5) = RE(c2) + IM(c5);
|
|
900 RE(d2) = RE(c2) - IM(c5);
|
|
901 IM(d5) = IM(c2) - RE(c5);
|
|
902 RE(d3) = RE(c3) - IM(c4);
|
|
903 IM(d4) = IM(c3) - RE(c4);
|
10725
|
904
|
12527
|
905 #if 1
|
|
906 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
|
|
907 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
|
|
908 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
|
|
909 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
|
|
910 ComplexMult(&IM(ch[ah+3*l1*ido]), &RE(ch[ah+3*l1*ido]),
|
|
911 IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i]));
|
|
912 ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]),
|
|
913 IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i]));
|
|
914 #else
|
|
915 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
|
|
916 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
|
|
917 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
|
|
918 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
|
|
919 ComplexMult(&RE(ch[ah+3*l1*ido]), &IM(ch[ah+3*l1*ido]),
|
|
920 RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i]));
|
|
921 ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]),
|
|
922 RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i]));
|
|
923 #endif
|
|
924 }
|
|
925 }
|
|
926 } else {
|
|
927 for (k = 0; k < l1; k++)
|
|
928 {
|
|
929 for (i = 0; i < ido; i++)
|
|
930 {
|
|
931 ac = i + (k*5 + 1) * ido;
|
|
932 ah = i + k * ido;
|
|
933
|
|
934 RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]);
|
|
935 IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]);
|
|
936 RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]);
|
|
937 IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]);
|
|
938 RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]);
|
|
939 IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]);
|
|
940 RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]);
|
|
941 IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]);
|
10725
|
942
|
12527
|
943 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3);
|
|
944 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3);
|
|
945
|
|
946 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
|
|
947 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
|
|
948 RE(c3) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
|
|
949 IM(c3) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
|
|
950
|
|
951 ComplexMult(&RE(c4), &RE(c5),
|
|
952 ti12, ti11, RE(t5), RE(t4));
|
|
953 ComplexMult(&IM(c4), &IM(c5),
|
|
954 ti12, ti12, IM(t5), IM(t4));
|
|
955
|
|
956 IM(d2) = IM(c2) - RE(c5);
|
|
957 IM(d3) = IM(c3) - RE(c4);
|
|
958 RE(d4) = RE(c3) - IM(c4);
|
|
959 RE(d5) = RE(c2) - IM(c5);
|
|
960 RE(d2) = RE(c2) + IM(c5);
|
|
961 IM(d5) = IM(c2) + RE(c5);
|
|
962 RE(d3) = RE(c3) + IM(c4);
|
|
963 IM(d4) = IM(c3) + RE(c4);
|
|
964
|
|
965 #if 1
|
|
966 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
|
|
967 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
|
|
968 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
|
|
969 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
|
|
970 ComplexMult(&RE(ch[ah+3*l1*ido]), &IM(ch[ah+3*l1*ido]),
|
|
971 RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i]));
|
|
972 ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]),
|
|
973 RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i]));
|
|
974 #else
|
|
975 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
|
|
976 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
|
|
977 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
|
|
978 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
|
|
979 ComplexMult(&IM(ch[ah+3*l1*ido]), &RE(ch[ah+3*l1*ido]),
|
|
980 IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i]));
|
|
981 ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]),
|
|
982 IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i]));
|
|
983 #endif
|
|
984 }
|
10725
|
985 }
|
|
986 }
|
|
987 }
|
|
988 }
|
|
989
|
|
990
|
|
991 /*----------------------------------------------------------------------
|
|
992 cfftf1, cfftf, cfftb, cffti1, cffti. Complex FFTs.
|
|
993 ----------------------------------------------------------------------*/
|
|
994
|
12527
|
995 #ifdef USE_SSE
|
|
996
|
|
997 #define CONV(A,B,C) ( (A<<2) | ((B & 0x1)<<1) | ((C==1)&0x1) )
|
|
998
|
|
999 static INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch,
|
|
1000 const uint16_t *ifac, const complex_t *wa,
|
|
1001 const int8_t isign)
|
|
1002 {
|
|
1003 uint16_t i;
|
|
1004 uint16_t k1, l1, l2;
|
|
1005 uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
|
|
1006
|
|
1007 nf = ifac[1];
|
|
1008 na = 0;
|
|
1009 l1 = 1;
|
|
1010 iw = 0;
|
|
1011
|
|
1012 for (k1 = 2; k1 <= nf+1; k1++)
|
|
1013 {
|
|
1014 ip = ifac[k1];
|
|
1015 l2 = ip*l1;
|
|
1016 ido = n / l2;
|
|
1017 idl1 = ido*l1;
|
|
1018
|
|
1019 ix2 = iw + ido;
|
|
1020 ix3 = ix2 + ido;
|
|
1021 ix4 = ix3 + ido;
|
|
1022
|
|
1023 switch (CONV(ip,na,ido))
|
|
1024 {
|
|
1025 case CONV(4,0,0):
|
|
1026 //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
|
|
1027 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
|
|
1028 break;
|
|
1029 case CONV(4,0,1):
|
|
1030 passf4pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
|
|
1031 break;
|
|
1032 case CONV(4,1,0):
|
|
1033 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
|
|
1034 //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
|
|
1035 break;
|
|
1036 case CONV(4,1,1):
|
|
1037 passf4pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
|
|
1038 break;
|
|
1039 case CONV(2,0,0):
|
|
1040 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
|
|
1041 //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
|
|
1042 break;
|
|
1043 case CONV(2,0,1):
|
|
1044 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
|
|
1045 //passf2pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
|
|
1046 break;
|
|
1047 case CONV(2,1,0):
|
|
1048 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
|
|
1049 //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
|
|
1050 break;
|
|
1051 case CONV(2,1,1):
|
|
1052 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
|
|
1053 //passf2pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
|
|
1054 break;
|
|
1055 case CONV(3,0,0):
|
|
1056 case CONV(3,0,1):
|
|
1057 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
|
|
1058 break;
|
|
1059 case CONV(3,1,0):
|
|
1060 case CONV(3,1,1):
|
|
1061 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
|
|
1062 break;
|
|
1063 case CONV(5,0,0):
|
|
1064 case CONV(5,0,1):
|
|
1065 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
|
|
1066 break;
|
|
1067 case CONV(5,1,0):
|
|
1068 case CONV(5,1,1):
|
|
1069 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
|
|
1070 break;
|
|
1071 }
|
|
1072
|
|
1073 na = 1 - na;
|
|
1074
|
|
1075 l1 = l2;
|
|
1076 iw += (ip-1) * ido;
|
|
1077 }
|
|
1078
|
|
1079 if (na == 0)
|
|
1080 return;
|
|
1081
|
|
1082 for (i = 0; i < n; i++)
|
|
1083 {
|
|
1084 RE(c[i]) = RE(ch[i]);
|
|
1085 IM(c[i]) = IM(ch[i]);
|
|
1086 }
|
|
1087 }
|
|
1088 #endif
|
|
1089
|
|
1090 static INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch,
|
|
1091 const uint16_t *ifac, const complex_t *wa,
|
|
1092 const int8_t isign)
|
10725
|
1093 {
|
|
1094 uint16_t i;
|
|
1095 uint16_t k1, l1, l2;
|
|
1096 uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
|
|
1097
|
|
1098 nf = ifac[1];
|
|
1099 na = 0;
|
|
1100 l1 = 1;
|
|
1101 iw = 0;
|
|
1102
|
|
1103 for (k1 = 2; k1 <= nf+1; k1++)
|
|
1104 {
|
|
1105 ip = ifac[k1];
|
|
1106 l2 = ip*l1;
|
|
1107 ido = n / l2;
|
|
1108 idl1 = ido*l1;
|
|
1109
|
|
1110 switch (ip)
|
|
1111 {
|
10989
|
1112 case 4:
|
|
1113 ix2 = iw + ido;
|
|
1114 ix3 = ix2 + ido;
|
|
1115
|
|
1116 if (na == 0)
|
12527
|
1117 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
|
10989
|
1118 else
|
12527
|
1119 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
|
10989
|
1120
|
|
1121 na = 1 - na;
|
|
1122 break;
|
10725
|
1123 case 2:
|
|
1124 if (na == 0)
|
12527
|
1125 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
|
10725
|
1126 else
|
12527
|
1127 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
|
10725
|
1128
|
|
1129 na = 1 - na;
|
|
1130 break;
|
|
1131 case 3:
|
|
1132 ix2 = iw + ido;
|
|
1133
|
|
1134 if (na == 0)
|
12527
|
1135 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
|
10725
|
1136 else
|
12527
|
1137 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
|
10725
|
1138
|
|
1139 na = 1 - na;
|
|
1140 break;
|
|
1141 case 5:
|
|
1142 ix2 = iw + ido;
|
|
1143 ix3 = ix2 + ido;
|
|
1144 ix4 = ix3 + ido;
|
|
1145
|
|
1146 if (na == 0)
|
12527
|
1147 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
|
10725
|
1148 else
|
12527
|
1149 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
|
|
1150
|
|
1151 na = 1 - na;
|
|
1152 break;
|
|
1153 }
|
|
1154
|
|
1155 l1 = l2;
|
|
1156 iw += (ip-1) * ido;
|
|
1157 }
|
|
1158
|
|
1159 if (na == 0)
|
|
1160 return;
|
|
1161
|
|
1162 for (i = 0; i < n; i++)
|
|
1163 {
|
|
1164 RE(c[i]) = RE(ch[i]);
|
|
1165 IM(c[i]) = IM(ch[i]);
|
|
1166 }
|
|
1167 }
|
|
1168
|
|
1169 static INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch,
|
|
1170 const uint16_t *ifac, const complex_t *wa,
|
|
1171 const int8_t isign)
|
|
1172 {
|
|
1173 uint16_t i;
|
|
1174 uint16_t k1, l1, l2;
|
|
1175 uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
|
|
1176
|
|
1177 nf = ifac[1];
|
|
1178 na = 0;
|
|
1179 l1 = 1;
|
|
1180 iw = 0;
|
|
1181
|
|
1182 for (k1 = 2; k1 <= nf+1; k1++)
|
|
1183 {
|
|
1184 ip = ifac[k1];
|
|
1185 l2 = ip*l1;
|
|
1186 ido = n / l2;
|
|
1187 idl1 = ido*l1;
|
|
1188
|
|
1189 switch (ip)
|
|
1190 {
|
|
1191 case 4:
|
|
1192 ix2 = iw + ido;
|
|
1193 ix3 = ix2 + ido;
|
|
1194
|
|
1195 if (na == 0)
|
|
1196 passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
|
|
1197 else
|
|
1198 passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
|
|
1199
|
|
1200 na = 1 - na;
|
|
1201 break;
|
|
1202 case 2:
|
|
1203 if (na == 0)
|
|
1204 passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
|
|
1205 else
|
|
1206 passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
|
|
1207
|
|
1208 na = 1 - na;
|
|
1209 break;
|
|
1210 case 3:
|
|
1211 ix2 = iw + ido;
|
|
1212
|
|
1213 if (na == 0)
|
|
1214 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
|
|
1215 else
|
|
1216 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
|
|
1217
|
|
1218 na = 1 - na;
|
|
1219 break;
|
|
1220 case 5:
|
|
1221 ix2 = iw + ido;
|
|
1222 ix3 = ix2 + ido;
|
|
1223 ix4 = ix3 + ido;
|
|
1224
|
|
1225 if (na == 0)
|
|
1226 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
|
|
1227 else
|
|
1228 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
|
10725
|
1229
|
|
1230 na = 1 - na;
|
|
1231 break;
|
|
1232 }
|
|
1233
|
|
1234 l1 = l2;
|
|
1235 iw += (ip-1) * ido;
|
|
1236 }
|
|
1237
|
|
1238 if (na == 0)
|
|
1239 return;
|
|
1240
|
|
1241 for (i = 0; i < n; i++)
|
|
1242 {
|
|
1243 RE(c[i]) = RE(ch[i]);
|
|
1244 IM(c[i]) = IM(ch[i]);
|
|
1245 }
|
|
1246 }
|
|
1247
|
|
1248 void cfftf(cfft_info *cfft, complex_t *c)
|
|
1249 {
|
12527
|
1250 cfftf1neg(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, -1);
|
10725
|
1251 }
|
|
1252
|
|
1253 void cfftb(cfft_info *cfft, complex_t *c)
|
|
1254 {
|
12527
|
1255 cfftf1pos(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
|
10725
|
1256 }
|
|
1257
|
12527
|
1258 #ifdef USE_SSE
|
|
1259 void cfftb_sse(cfft_info *cfft, complex_t *c)
|
|
1260 {
|
|
1261 cfftf1pos_sse(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
|
|
1262 }
|
|
1263 #endif
|
|
1264
|
10725
|
1265 static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac)
|
|
1266 {
|
|
1267 static uint16_t ntryh[4] = {3, 4, 2, 5};
|
|
1268 #ifndef FIXED_POINT
|
|
1269 real_t arg, argh, argld, fi;
|
|
1270 uint16_t ido, ipm;
|
|
1271 uint16_t i1, k1, l1, l2;
|
|
1272 uint16_t ld, ii, ip;
|
|
1273 #endif
|
12527
|
1274 uint16_t ntry = 0, i, j;
|
10725
|
1275 uint16_t ib;
|
|
1276 uint16_t nf, nl, nq, nr;
|
|
1277
|
|
1278 nl = n;
|
|
1279 nf = 0;
|
|
1280 j = 0;
|
|
1281
|
|
1282 startloop:
|
|
1283 j++;
|
|
1284
|
|
1285 if (j <= 4)
|
|
1286 ntry = ntryh[j-1];
|
|
1287 else
|
|
1288 ntry += 2;
|
|
1289
|
|
1290 do
|
|
1291 {
|
|
1292 nq = nl / ntry;
|
|
1293 nr = nl - ntry*nq;
|
|
1294
|
|
1295 if (nr != 0)
|
|
1296 goto startloop;
|
|
1297
|
|
1298 nf++;
|
|
1299 ifac[nf+1] = ntry;
|
|
1300 nl = nq;
|
|
1301
|
|
1302 if (ntry == 2 && nf != 1)
|
|
1303 {
|
|
1304 for (i = 2; i <= nf; i++)
|
|
1305 {
|
|
1306 ib = nf - i + 2;
|
|
1307 ifac[ib+1] = ifac[ib];
|
|
1308 }
|
|
1309 ifac[2] = 2;
|
|
1310 }
|
|
1311 } while (nl != 1);
|
|
1312
|
|
1313 ifac[0] = n;
|
|
1314 ifac[1] = nf;
|
|
1315
|
|
1316 #ifndef FIXED_POINT
|
12527
|
1317 argh = (real_t)2.0*(real_t)M_PI / (real_t)n;
|
10725
|
1318 i = 0;
|
|
1319 l1 = 1;
|
|
1320
|
|
1321 for (k1 = 1; k1 <= nf; k1++)
|
|
1322 {
|
|
1323 ip = ifac[k1+1];
|
|
1324 ld = 0;
|
|
1325 l2 = l1*ip;
|
|
1326 ido = n / l2;
|
|
1327 ipm = ip - 1;
|
|
1328
|
|
1329 for (j = 0; j < ipm; j++)
|
|
1330 {
|
|
1331 i1 = i;
|
|
1332 RE(wa[i]) = 1.0;
|
|
1333 IM(wa[i]) = 0.0;
|
|
1334 ld += l1;
|
|
1335 fi = 0;
|
|
1336 argld = ld*argh;
|
|
1337
|
|
1338 for (ii = 0; ii < ido; ii++)
|
|
1339 {
|
|
1340 i++;
|
|
1341 fi++;
|
|
1342 arg = fi * argld;
|
10989
|
1343 RE(wa[i]) = (real_t)cos(arg);
|
12527
|
1344 #if 1
|
10989
|
1345 IM(wa[i]) = (real_t)sin(arg);
|
12527
|
1346 #else
|
|
1347 IM(wa[i]) = (real_t)-sin(arg);
|
|
1348 #endif
|
10725
|
1349 }
|
|
1350
|
|
1351 if (ip > 5)
|
|
1352 {
|
|
1353 RE(wa[i1]) = RE(wa[i]);
|
|
1354 IM(wa[i1]) = IM(wa[i]);
|
|
1355 }
|
|
1356 }
|
|
1357 l1 = l2;
|
|
1358 }
|
|
1359 #endif
|
|
1360 }
|
|
1361
|
|
1362 cfft_info *cffti(uint16_t n)
|
|
1363 {
|
12527
|
1364 cfft_info *cfft = (cfft_info*)faad_malloc(sizeof(cfft_info));
|
10725
|
1365
|
|
1366 cfft->n = n;
|
12527
|
1367 cfft->work = (complex_t*)faad_malloc(n*sizeof(complex_t));
|
10725
|
1368
|
|
1369 #ifndef FIXED_POINT
|
12527
|
1370 cfft->tab = (complex_t*)faad_malloc(n*sizeof(complex_t));
|
10725
|
1371
|
|
1372 cffti1(n, cfft->tab, cfft->ifac);
|
|
1373 #else
|
|
1374 cffti1(n, NULL, cfft->ifac);
|
|
1375
|
|
1376 switch (n)
|
|
1377 {
|
|
1378 case 64: cfft->tab = cfft_tab_64; break;
|
|
1379 case 512: cfft->tab = cfft_tab_512; break;
|
|
1380 #ifdef LD_DEC
|
12527
|
1381 case 256: cfft->tab = cfft_tab_256; break;
|
|
1382 #endif
|
|
1383
|
|
1384 #ifdef ALLOW_SMALL_FRAMELENGTH
|
|
1385 case 60: cfft->tab = cfft_tab_60; break;
|
|
1386 case 480: cfft->tab = cfft_tab_480; break;
|
|
1387 #ifdef LD_DEC
|
10725
|
1388 case 240: cfft->tab = cfft_tab_240; break;
|
12527
|
1389 #endif
|
10725
|
1390 #endif
|
|
1391 }
|
|
1392 #endif
|
|
1393
|
|
1394 return cfft;
|
|
1395 }
|
|
1396
|
|
1397 void cfftu(cfft_info *cfft)
|
|
1398 {
|
12527
|
1399 if (cfft->work) faad_free(cfft->work);
|
10725
|
1400 #ifndef FIXED_POINT
|
12527
|
1401 if (cfft->tab) faad_free(cfft->tab);
|
10725
|
1402 #endif
|
|
1403
|
12527
|
1404 if (cfft) faad_free(cfft);
|
10989
|
1405 }
|
12527
|
1406
|