comparison libfaad2/cfft.c @ 12527:4a370c80fe5c

update to the 2.0 release of faad, patch by adland
author diego
date Wed, 02 Jun 2004 22:59:04 +0000
parents 3185f64f6350
children d81145997036
comparison
equal deleted inserted replaced
12526:e183ad37d24c 12527:4a370c80fe5c
1 /* 1 /*
2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding 2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
3 ** Copyright (C) 2003 M. Bakker, Ahead Software AG, http://www.nero.com 3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
4 ** 4 **
5 ** This program is free software; you can redistribute it and/or modify 5 ** This program is free software; you can redistribute it and/or modify
6 ** it under the terms of the GNU General Public License as published by 6 ** it under the terms of the GNU General Public License as published by
7 ** the Free Software Foundation; either version 2 of the License, or 7 ** the Free Software Foundation; either version 2 of the License, or
8 ** (at your option) any later version. 8 ** (at your option) any later version.
9 ** 9 **
10 ** This program is distributed in the hope that it will be useful, 10 ** This program is distributed in the hope that it will be useful,
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of 11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ** GNU General Public License for more details. 13 ** GNU General Public License for more details.
14 ** 14 **
15 ** You should have received a copy of the GNU General Public License 15 ** You should have received a copy of the GNU General Public License
16 ** along with this program; if not, write to the Free Software 16 ** along with this program; if not, write to the Free Software
17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 ** 18 **
19 ** Any non-GPL usage of this software or parts of this software is strictly 19 ** Any non-GPL usage of this software or parts of this software is strictly
20 ** forbidden. 20 ** forbidden.
21 ** 21 **
22 ** Commercial non-GPL licensing of this software is possible. 22 ** Commercial non-GPL licensing of this software is possible.
23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com. 23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
24 ** 24 **
25 ** $Id: cfft.c,v 1.1 2003/08/30 22:30:21 arpi Exp $ 25 ** $Id: cfft.c,v 1.2 2003/10/03 22:22:27 alex Exp $
26 **/ 26 **/
27 27
28 /* 28 /*
29 * Algorithmically based on Fortran-77 FFTPACK 29 * Algorithmically based on Fortran-77 FFTPACK
30 * by Paul N. Swarztrauber(Version 4, 1985). 30 * by Paul N. Swarztrauber(Version 4, 1985).
36 36
37 #include "common.h" 37 #include "common.h"
38 #include "structs.h" 38 #include "structs.h"
39 39
40 #include <stdlib.h> 40 #include <stdlib.h>
41 #ifdef _WIN32_WCE
42 #define assert(x)
43 #else
44 #include <assert.h>
45 #endif
46 41
47 #include "cfft.h" 42 #include "cfft.h"
48 #include "cfft_tab.h" 43 #include "cfft_tab.h"
44
45
46 /* static function declarations */
47 #ifdef USE_SSE
48 static void passf2pos_sse(const uint16_t l1, const complex_t *cc,
49 complex_t *ch, const complex_t *wa);
50 static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
51 complex_t *ch, const complex_t *wa);
52 static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
53 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
54 #endif
55 static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
56 complex_t *ch, const complex_t *wa);
57 static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
58 complex_t *ch, const complex_t *wa);
59 static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
60 complex_t *ch, const complex_t *wa1, const complex_t *wa2, const int8_t isign);
61 static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
62 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
63 static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
64 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
65 static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
66 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,
67 const complex_t *wa4, const int8_t isign);
68 INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
69 const uint16_t *ifac, const complex_t *wa, const int8_t isign);
70 static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac);
49 71
50 72
51 /*---------------------------------------------------------------------- 73 /*----------------------------------------------------------------------
52 passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd. 74 passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd.
53 ----------------------------------------------------------------------*/ 75 ----------------------------------------------------------------------*/
54 76
55 static void passf2(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, 77 #if 0 //def USE_SSE
56 complex_t *wa, int8_t isign) 78 static void passf2pos_sse(const uint16_t l1, const complex_t *cc,
79 complex_t *ch, const complex_t *wa)
80 {
81 uint16_t k, ah, ac;
82
83 for (k = 0; k < l1; k++)
84 {
85 ah = 2*k;
86 ac = 4*k;
87
88 RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
89 IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
90
91 RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
92 IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
93 }
94 }
95
96 static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
97 complex_t *ch, const complex_t *wa)
98 {
99 uint16_t i, k, ah, ac;
100
101 for (k = 0; k < l1; k++)
102 {
103 ah = k*ido;
104 ac = 2*k*ido;
105
106 for (i = 0; i < ido; i+=4)
107 {
108 __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;
109 __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;
110 __m128 w1, w2, w3, w4;
111
112 m1 = _mm_load_ps(&RE(cc[ac+i]));
113 m2 = _mm_load_ps(&RE(cc[ac+ido+i]));
114 m5 = _mm_load_ps(&RE(cc[ac+i+2]));
115 m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));
116 w1 = _mm_load_ps(&RE(wa[i]));
117 w3 = _mm_load_ps(&RE(wa[i+2]));
118
119 m3 = _mm_add_ps(m1, m2);
120 m15 = _mm_add_ps(m5, m6);
121
122 m4 = _mm_sub_ps(m1, m2);
123 m16 = _mm_sub_ps(m5, m6);
124
125 _mm_store_ps(&RE(ch[ah+i]), m3);
126 _mm_store_ps(&RE(ch[ah+i+2]), m15);
127
128
129 w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
130 w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
131
132 m7 = _mm_mul_ps(m4, w1);
133 m17 = _mm_mul_ps(m16, w3);
134 m8 = _mm_mul_ps(m4, w2);
135 m18 = _mm_mul_ps(m16, w4);
136
137 m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
138 m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));
139 m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
140 m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));
141
142 m11 = _mm_add_ps(m9, m10);
143 m21 = _mm_add_ps(m19, m20);
144 m12 = _mm_sub_ps(m9, m10);
145 m22 = _mm_sub_ps(m19, m20);
146
147 m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
148 m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));
149
150 m14 = _mm_unpacklo_ps(m12, m13);
151 m24 = _mm_unpacklo_ps(m22, m23);
152
153 _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
154 _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);
155 }
156 }
157 }
158 #endif
159
160 static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
161 complex_t *ch, const complex_t *wa)
57 { 162 {
58 uint16_t i, k, ah, ac; 163 uint16_t i, k, ah, ac;
59 164
60 if (ido == 1) 165 if (ido == 1)
61 { 166 {
62 for (k = 0; k < l1; k++) 167 for (k = 0; k < l1; k++)
63 { 168 {
64 ah = 2*k; 169 ah = 2*k;
65 ac = 4*k; 170 ac = 4*k;
66 171
67 RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]); 172 RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
68 RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]); 173 RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
69 IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]); 174 IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
70 IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]); 175 IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
71 } 176 }
72 } else { 177 } else {
83 RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]); 188 RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
84 189
85 IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]); 190 IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
86 IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]); 191 IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
87 192
88 RE(ch[ah+i+l1*ido]) = MUL_R_C(RE(t2),RE(wa[i])) - MUL_R_C(IM(t2),IM(wa[i]))*isign; 193 #if 1
89 IM(ch[ah+i+l1*ido]) = MUL_R_C(IM(t2),RE(wa[i])) + MUL_R_C(RE(t2),IM(wa[i]))*isign; 194 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
90 } 195 IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
91 } 196 #else
92 } 197 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
93 } 198 RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
94 199 #endif
95 200 }
96 static void passf3(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, 201 }
97 complex_t *wa1, complex_t *wa2, int8_t isign) 202 }
98 { 203 }
99 static real_t taur = COEF_CONST(-0.5); 204
100 static real_t taui = COEF_CONST(0.866025403784439); 205 static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
206 complex_t *ch, const complex_t *wa)
207 {
208 uint16_t i, k, ah, ac;
209
210 if (ido == 1)
211 {
212 for (k = 0; k < l1; k++)
213 {
214 ah = 2*k;
215 ac = 4*k;
216
217 RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
218 RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
219 IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
220 IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
221 }
222 } else {
223 for (k = 0; k < l1; k++)
224 {
225 ah = k*ido;
226 ac = 2*k*ido;
227
228 for (i = 0; i < ido; i++)
229 {
230 complex_t t2;
231
232 RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
233 RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
234
235 IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
236 IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
237
238 #if 1
239 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
240 RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
241 #else
242 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
243 IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
244 #endif
245 }
246 }
247 }
248 }
249
250
251 static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
252 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
253 const int8_t isign)
254 {
255 static real_t taur = FRAC_CONST(-0.5);
256 static real_t taui = FRAC_CONST(0.866025403784439);
101 uint16_t i, k, ac, ah; 257 uint16_t i, k, ac, ah;
102 complex_t c2, c3, d2, d3, t2; 258 complex_t c2, c3, d2, d3, t2;
103 259
104 if (ido == 1) 260 if (ido == 1)
105 { 261 {
106 for (k = 0; k < l1; k++) 262 if (isign == 1)
107 { 263 {
108 ac = 3*k+1; 264 for (k = 0; k < l1; k++)
109 ah = k; 265 {
110 266 ac = 3*k+1;
111 RE(t2) = RE(cc[ac]) + RE(cc[ac+1]); 267 ah = k;
112 IM(t2) = IM(cc[ac]) + IM(cc[ac+1]); 268
113 RE(c2) = RE(cc[ac-1]) + MUL_R_C(RE(t2),taur); 269 RE(t2) = RE(cc[ac]) + RE(cc[ac+1]);
114 IM(c2) = IM(cc[ac-1]) + MUL_R_C(IM(t2),taur); 270 IM(t2) = IM(cc[ac]) + IM(cc[ac+1]);
115 271 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),taur);
116 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2); 272 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),taur);
117 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2); 273
118 274 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2);
119 RE(c3) = MUL_R_C((RE(cc[ac]) - RE(cc[ac+1])), taui)*isign; 275 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2);
120 IM(c3) = MUL_R_C((IM(cc[ac]) - IM(cc[ac+1])), taui)*isign; 276
121 277 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+1])), taui);
122 RE(ch[ah+l1]) = RE(c2) - IM(c3); 278 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+1])), taui);
123 IM(ch[ah+l1]) = IM(c2) + RE(c3); 279
124 RE(ch[ah+2*l1]) = RE(c2) + IM(c3); 280 RE(ch[ah+l1]) = RE(c2) - IM(c3);
125 IM(ch[ah+2*l1]) = IM(c2) - RE(c3); 281 IM(ch[ah+l1]) = IM(c2) + RE(c3);
282 RE(ch[ah+2*l1]) = RE(c2) + IM(c3);
283 IM(ch[ah+2*l1]) = IM(c2) - RE(c3);
284 }
285 } else {
286 for (k = 0; k < l1; k++)
287 {
288 ac = 3*k+1;
289 ah = k;
290
291 RE(t2) = RE(cc[ac]) + RE(cc[ac+1]);
292 IM(t2) = IM(cc[ac]) + IM(cc[ac+1]);
293 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),taur);
294 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),taur);
295
296 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2);
297 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2);
298
299 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+1])), taui);
300 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+1])), taui);
301
302 RE(ch[ah+l1]) = RE(c2) + IM(c3);
303 IM(ch[ah+l1]) = IM(c2) - RE(c3);
304 RE(ch[ah+2*l1]) = RE(c2) - IM(c3);
305 IM(ch[ah+2*l1]) = IM(c2) + RE(c3);
306 }
126 } 307 }
127 } else { 308 } else {
128 for (k = 0; k < l1; k++) 309 if (isign == 1)
129 { 310 {
130 for (i = 0; i < ido; i++) 311 for (k = 0; k < l1; k++)
131 { 312 {
132 ac = i + (3*k+1)*ido; 313 for (i = 0; i < ido; i++)
133 ah = i + k * ido; 314 {
134 315 ac = i + (3*k+1)*ido;
135 RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]); 316 ah = i + k * ido;
136 RE(c2) = RE(cc[ac-ido]) + MUL_R_C(RE(t2),taur); 317
137 IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]); 318 RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]);
138 IM(c2) = IM(cc[ac-ido]) + MUL_R_C(IM(t2),taur); 319 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur);
139 320 IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]);
140 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2); 321 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur);
141 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2); 322
142 323 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2);
143 RE(c3) = MUL_R_C((RE(cc[ac]) - RE(cc[ac+ido])), taui)*isign; 324 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2);
144 IM(c3) = MUL_R_C((IM(cc[ac]) - IM(cc[ac+ido])), taui)*isign; 325
145 326 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui);
146 RE(d2) = RE(c2) - IM(c3); 327 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), taui);
147 IM(d3) = IM(c2) - RE(c3); 328
148 RE(d3) = RE(c2) + IM(c3); 329 RE(d2) = RE(c2) - IM(c3);
149 IM(d2) = IM(c2) + RE(c3); 330 IM(d3) = IM(c2) - RE(c3);
150 331 RE(d3) = RE(c2) + IM(c3);
151 RE(ch[ah+l1*ido]) = MUL_R_C(RE(d2),RE(wa1[i])) - MUL_R_C(IM(d2),IM(wa1[i]))*isign; 332 IM(d2) = IM(c2) + RE(c3);
152 IM(ch[ah+l1*ido]) = MUL_R_C(IM(d2),RE(wa1[i])) + MUL_R_C(RE(d2),IM(wa1[i]))*isign; 333
153 RE(ch[ah+l1*2*ido]) = MUL_R_C(RE(d3),RE(wa2[i])) - MUL_R_C(IM(d3),IM(wa2[i]))*isign; 334 #if 1
154 IM(ch[ah+l1*2*ido]) = MUL_R_C(IM(d3),RE(wa2[i])) + MUL_R_C(RE(d3),IM(wa2[i]))*isign; 335 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
155 } 336 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
156 } 337 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
157 } 338 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
158 } 339 #else
159 340 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
160 341 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
161 static void passf4(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, 342 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
162 complex_t *wa1, complex_t *wa2, complex_t *wa3, int8_t isign) 343 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
344 #endif
345 }
346 }
347 } else {
348 for (k = 0; k < l1; k++)
349 {
350 for (i = 0; i < ido; i++)
351 {
352 ac = i + (3*k+1)*ido;
353 ah = i + k * ido;
354
355 RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]);
356 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur);
357 IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]);
358 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur);
359
360 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2);
361 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2);
362
363 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui);
364 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), taui);
365
366 RE(d2) = RE(c2) + IM(c3);
367 IM(d3) = IM(c2) + RE(c3);
368 RE(d3) = RE(c2) - IM(c3);
369 IM(d2) = IM(c2) - RE(c3);
370
371 #if 1
372 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
373 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
374 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
375 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
376 #else
377 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
378 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
379 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
380 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
381 #endif
382 }
383 }
384 }
385 }
386 }
387
388 #ifdef USE_SSE
389 ALIGN static const int32_t negate[4] = { 0x0, 0x0, 0x0, 0x80000000 };
390
391 __declspec(naked) static void passf4pos_sse(const uint16_t l1, const complex_t *cc,
392 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
393 const complex_t *wa3)
394 {
395 __asm {
396 push ebx
397 mov ebx, esp
398 and esp, -16
399 push edi
400 push esi
401 sub esp, 8
402 movzx edi, WORD PTR [ebx+8]
403
404 movaps xmm1, XMMWORD PTR negate
405
406 test edi, edi
407 jle l1_is_zero
408
409 lea esi, DWORD PTR [edi+edi]
410 add esi, esi
411 sub esi, edi
412 add esi, esi
413 add esi, esi
414 add esi, esi
415 mov eax, DWORD PTR [ebx+16]
416 add esi, eax
417 lea ecx, DWORD PTR [edi+edi]
418 add ecx, ecx
419 add ecx, ecx
420 add ecx, ecx
421 add ecx, eax
422 lea edx, DWORD PTR [edi+edi]
423 add edx, edx
424 add edx, edx
425 add edx, eax
426 xor eax, eax
427 mov DWORD PTR [esp], ebp
428 mov ebp, DWORD PTR [ebx+12]
429
430 fftloop:
431 lea edi, DWORD PTR [eax+eax]
432 add edi, edi
433 movaps xmm2, XMMWORD PTR [ebp+edi*8]
434 movaps xmm0, XMMWORD PTR [ebp+edi*8+16]
435 movaps xmm7, XMMWORD PTR [ebp+edi*8+32]
436 movaps xmm5, XMMWORD PTR [ebp+edi*8+48]
437 movaps xmm6, xmm2
438 addps xmm6, xmm0
439 movaps xmm4, xmm1
440 xorps xmm4, xmm7
441 movaps xmm3, xmm1
442 xorps xmm3, xmm5
443 xorps xmm2, xmm1
444 xorps xmm0, xmm1
445 addps xmm7, xmm5
446 subps xmm2, xmm0
447 movaps xmm0, xmm6
448 shufps xmm0, xmm7, 68
449 subps xmm4, xmm3
450 shufps xmm6, xmm7, 238
451 movaps xmm5, xmm2
452 shufps xmm5, xmm4, 68
453 movaps xmm3, xmm0
454 addps xmm3, xmm6
455 shufps xmm2, xmm4, 187
456 subps xmm0, xmm6
457 movaps xmm4, xmm5
458 addps xmm4, xmm2
459 mov edi, DWORD PTR [ebx+16]
460 movaps XMMWORD PTR [edi+eax*8], xmm3
461 subps xmm5, xmm2
462 movaps XMMWORD PTR [edx+eax*8], xmm4
463 movaps XMMWORD PTR [ecx+eax*8], xmm0
464 movaps XMMWORD PTR [esi+eax*8], xmm5
465 add eax, 2
466 movzx eax, ax
467 movzx edi, WORD PTR [ebx+8]
468 cmp eax, edi
469 jl fftloop
470
471 mov ebp, DWORD PTR [esp]
472
473 l1_is_zero:
474
475 add esp, 8
476 pop esi
477 pop edi
478 mov esp, ebx
479 pop ebx
480 ret
481 }
482 }
483 #endif
484
485 #if 0
486 static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
487 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
488 const complex_t *wa3)
489 {
490 uint16_t i, k, ac, ah;
491
492 for (k = 0; k < l1; k++)
493 {
494 ac = 4*k*ido;
495 ah = k*ido;
496
497 for (i = 0; i < ido; i+=2)
498 {
499 __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16;
500 __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23;
501 __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30;
502 __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0);
503
504 m1 = _mm_load_ps(&RE(cc[ac+i]));
505 m2 = _mm_load_ps(&RE(cc[ac+i+2*ido]));
506 m3 = _mm_add_ps(m1, m2);
507 m4 = _mm_sub_ps(m1, m2);
508
509 n1 = _mm_load_ps(&RE(cc[ac+i+ido]));
510 n2 = _mm_load_ps(&RE(cc[ac+i+3*ido]));
511 n3 = _mm_add_ps(n1, n2);
512
513 n4 = _mm_mul_ps(neg1, n1);
514 n5 = _mm_mul_ps(neg1, n2);
515 n6 = _mm_sub_ps(n4, n5);
516
517 m5 = _mm_add_ps(m3, n3);
518
519 n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1));
520 n8 = _mm_add_ps(m4, n7);
521
522 m6 = _mm_sub_ps(m3, n3);
523 n9 = _mm_sub_ps(m4, n7);
524
525 _mm_store_ps(&RE(ch[ah+i]), m5);
526
527 #if 0
528 static INLINE void ComplexMult(real_t *y1, real_t *y2,
529 real_t x1, real_t x2, real_t c1, real_t c2)
530 {
531 *y1 = MUL_F(x1, c1) + MUL_F(x2, c2);
532 *y2 = MUL_F(x2, c1) - MUL_F(x1, c2);
533 }
534
535 m7.0 = RE(c2)*RE(wa1[i])
536 m7.1 = IM(c2)*IM(wa1[i])
537 m7.2 = RE(c6)*RE(wa1[i+1])
538 m7.3 = IM(c6)*IM(wa1[i+1])
539
540 m8.0 = RE(c2)*IM(wa1[i])
541 m8.1 = IM(c2)*RE(wa1[i])
542 m8.2 = RE(c6)*IM(wa1[i+1])
543 m8.3 = IM(c6)*RE(wa1[i+1])
544
545 RE(0) = m7.0 - m7.1
546 IM(0) = m8.0 + m8.1
547 RE(1) = m7.2 - m7.3
548 IM(1) = m8.2 + m8.3
549
550 ////
551 RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i])
552 IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i])
553 RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1])
554 IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1])
555 #endif
556
557 w1 = _mm_load_ps(&RE(wa1[i]));
558 w3 = _mm_load_ps(&RE(wa2[i]));
559 w5 = _mm_load_ps(&RE(wa3[i]));
560
561 w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
562 w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
563 w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1));
564
565 m7 = _mm_mul_ps(n8, w1);
566 m15 = _mm_mul_ps(m6, w3);
567 m23 = _mm_mul_ps(n9, w5);
568 m8 = _mm_mul_ps(n8, w2);
569 m16 = _mm_mul_ps(m6, w4);
570 m24 = _mm_mul_ps(n9, w6);
571
572 m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
573 m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0));
574 m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0));
575 m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
576 m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1));
577 m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1));
578
579 m11 = _mm_add_ps(m9, m10);
580 m19 = _mm_add_ps(m17, m18);
581 m27 = _mm_add_ps(m25, m26);
582 m12 = _mm_sub_ps(m9, m10);
583 m20 = _mm_sub_ps(m17, m18);
584 m28 = _mm_sub_ps(m25, m26);
585
586 m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
587 m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2));
588 m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2));
589 m14 = _mm_unpacklo_ps(m12, m13);
590 m22 = _mm_unpacklo_ps(m20, m21);
591 m30 = _mm_unpacklo_ps(m28, m29);
592
593 _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
594 _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22);
595 _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30);
596 }
597 }
598 }
599 #endif
600
601 static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
602 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
603 const complex_t *wa3)
163 { 604 {
164 uint16_t i, k, ac, ah; 605 uint16_t i, k, ac, ah;
165 606
166 if (ido == 1) 607 if (ido == 1)
167 { 608 {
170 complex_t t1, t2, t3, t4; 611 complex_t t1, t2, t3, t4;
171 612
172 ac = 4*k; 613 ac = 4*k;
173 ah = k; 614 ah = k;
174 615
175 RE(t2) = RE(cc[ac]) + RE(cc[ac+2]); 616 RE(t2) = RE(cc[ac]) + RE(cc[ac+2]);
176 RE(t1) = RE(cc[ac]) - RE(cc[ac+2]); 617 RE(t1) = RE(cc[ac]) - RE(cc[ac+2]);
177 IM(t2) = IM(cc[ac]) + IM(cc[ac+2]); 618 IM(t2) = IM(cc[ac]) + IM(cc[ac+2]);
178 IM(t1) = IM(cc[ac]) - IM(cc[ac+2]); 619 IM(t1) = IM(cc[ac]) - IM(cc[ac+2]);
179 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]); 620 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
180 IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]); 621 IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
181 IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]); 622 IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
182 RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]); 623 RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
183 624
184 RE(ch[ah]) = RE(t2) + RE(t3); 625 RE(ch[ah]) = RE(t2) + RE(t3);
185 RE(ch[ah+2*l1]) = RE(t2) - RE(t3); 626 RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
186 627
187 IM(ch[ah]) = IM(t2) + IM(t3); 628 IM(ch[ah]) = IM(t2) + IM(t3);
188 IM(ch[ah+2*l1]) = IM(t2) - IM(t3); 629 IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
189 630
190 RE(ch[ah+l1]) = RE(t1) + RE(t4)*isign; 631 RE(ch[ah+l1]) = RE(t1) + RE(t4);
191 RE(ch[ah+3*l1]) = RE(t1) - RE(t4)*isign; 632 RE(ch[ah+3*l1]) = RE(t1) - RE(t4);
192 633
193 IM(ch[ah+l1]) = IM(t1) + IM(t4)*isign; 634 IM(ch[ah+l1]) = IM(t1) + IM(t4);
194 IM(ch[ah+3*l1]) = IM(t1) - IM(t4)*isign; 635 IM(ch[ah+3*l1]) = IM(t1) - IM(t4);
195 } 636 }
196 } else { 637 } else {
197 for (k = 0; k < l1; k++) 638 for (k = 0; k < l1; k++)
198 { 639 {
199 ac = 4*k*ido; 640 ac = 4*k*ido;
210 RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]); 651 RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
211 IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]); 652 IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
212 IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]); 653 IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
213 RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]); 654 RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
214 655
215 RE(c2) = RE(t1) + RE(t4)*isign; 656 RE(c2) = RE(t1) + RE(t4);
216 RE(c4) = RE(t1) - RE(t4)*isign; 657 RE(c4) = RE(t1) - RE(t4);
217 658
218 IM(c2) = IM(t1) + IM(t4)*isign; 659 IM(c2) = IM(t1) + IM(t4);
219 IM(c4) = IM(t1) - IM(t4)*isign; 660 IM(c4) = IM(t1) - IM(t4);
220 661
221 RE(ch[ah+i]) = RE(t2) + RE(t3); 662 RE(ch[ah+i]) = RE(t2) + RE(t3);
222 RE(c3) = RE(t2) - RE(t3); 663 RE(c3) = RE(t2) - RE(t3);
223 664
224 IM(ch[ah+i]) = IM(t2) + IM(t3); 665 IM(ch[ah+i]) = IM(t2) + IM(t3);
225 IM(c3) = IM(t2) - IM(t3); 666 IM(c3) = IM(t2) - IM(t3);
226 667
227 IM(ch[ah+i+l1*ido]) = MUL_R_C(IM(c2),RE(wa1[i])) + MUL_R_C(RE(c2),IM(wa1[i]))*isign; 668 #if 1
228 RE(ch[ah+i+l1*ido]) = MUL_R_C(RE(c2),RE(wa1[i])) - MUL_R_C(IM(c2),IM(wa1[i]))*isign; 669 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
229 IM(ch[ah+i+2*l1*ido]) = MUL_R_C(IM(c3),RE(wa2[i])) + MUL_R_C(RE(c3),IM(wa2[i]))*isign; 670 IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
230 RE(ch[ah+i+2*l1*ido]) = MUL_R_C(RE(c3),RE(wa2[i])) - MUL_R_C(IM(c3),IM(wa2[i]))*isign; 671 ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
231 IM(ch[ah+i+3*l1*ido]) = MUL_R_C(IM(c4),RE(wa3[i])) + MUL_R_C(RE(c4),IM(wa3[i]))*isign; 672 IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
232 RE(ch[ah+i+3*l1*ido]) = MUL_R_C(RE(c4),RE(wa3[i])) - MUL_R_C(IM(c4),IM(wa3[i]))*isign; 673 ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
233 } 674 IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
234 } 675 #else
235 } 676 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
236 } 677 RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
237 678 ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
238 679 RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
239 static void passf5(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, 680 ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
240 complex_t *wa1, complex_t *wa2, complex_t *wa3, complex_t *wa4, 681 RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
241 int8_t isign) 682 #endif
242 { 683 }
243 static real_t tr11 = COEF_CONST(0.309016994374947); 684 }
244 static real_t ti11 = COEF_CONST(0.951056516295154); 685 }
245 static real_t tr12 = COEF_CONST(-0.809016994374947); 686 }
246 static real_t ti12 = COEF_CONST(0.587785252292473); 687
688 static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
689 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
690 const complex_t *wa3)
691 {
692 uint16_t i, k, ac, ah;
693
694 if (ido == 1)
695 {
696 for (k = 0; k < l1; k++)
697 {
698 complex_t t1, t2, t3, t4;
699
700 ac = 4*k;
701 ah = k;
702
703 RE(t2) = RE(cc[ac]) + RE(cc[ac+2]);
704 RE(t1) = RE(cc[ac]) - RE(cc[ac+2]);
705 IM(t2) = IM(cc[ac]) + IM(cc[ac+2]);
706 IM(t1) = IM(cc[ac]) - IM(cc[ac+2]);
707 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
708 IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
709 IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
710 RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
711
712 RE(ch[ah]) = RE(t2) + RE(t3);
713 RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
714
715 IM(ch[ah]) = IM(t2) + IM(t3);
716 IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
717
718 RE(ch[ah+l1]) = RE(t1) - RE(t4);
719 RE(ch[ah+3*l1]) = RE(t1) + RE(t4);
720
721 IM(ch[ah+l1]) = IM(t1) - IM(t4);
722 IM(ch[ah+3*l1]) = IM(t1) + IM(t4);
723 }
724 } else {
725 for (k = 0; k < l1; k++)
726 {
727 ac = 4*k*ido;
728 ah = k*ido;
729
730 for (i = 0; i < ido; i++)
731 {
732 complex_t c2, c3, c4, t1, t2, t3, t4;
733
734 RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
735 RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
736 IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
737 IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
738 RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
739 IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
740 IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
741 RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
742
743 RE(c2) = RE(t1) - RE(t4);
744 RE(c4) = RE(t1) + RE(t4);
745
746 IM(c2) = IM(t1) - IM(t4);
747 IM(c4) = IM(t1) + IM(t4);
748
749 RE(ch[ah+i]) = RE(t2) + RE(t3);
750 RE(c3) = RE(t2) - RE(t3);
751
752 IM(ch[ah+i]) = IM(t2) + IM(t3);
753 IM(c3) = IM(t2) - IM(t3);
754
755 #if 1
756 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
757 RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
758 ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
759 RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
760 ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
761 RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
762 #else
763 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
764 IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
765 ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
766 IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
767 ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
768 IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
769 #endif
770 }
771 }
772 }
773 }
774
775 static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc,
776 complex_t *ch, const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,
777 const complex_t *wa4, const int8_t isign)
778 {
779 static real_t tr11 = FRAC_CONST(0.309016994374947);
780 static real_t ti11 = FRAC_CONST(0.951056516295154);
781 static real_t tr12 = FRAC_CONST(-0.809016994374947);
782 static real_t ti12 = FRAC_CONST(0.587785252292473);
247 uint16_t i, k, ac, ah; 783 uint16_t i, k, ac, ah;
248 complex_t c2, c3, c4, c5, d3, d4, d5, d2, t2, t3, t4, t5; 784 complex_t c2, c3, c4, c5, d3, d4, d5, d2, t2, t3, t4, t5;
249 785
250 if (ido == 1) 786 if (ido == 1)
251 { 787 {
252 for (k = 0; k < l1; k++) 788 if (isign == 1)
253 { 789 {
254 ac = 5*k + 1; 790 for (k = 0; k < l1; k++)
255 ah = k; 791 {
256 792 ac = 5*k + 1;
257 RE(t2) = RE(cc[ac]) + RE(cc[ac+3]); 793 ah = k;
258 IM(t2) = IM(cc[ac]) + IM(cc[ac+3]); 794
259 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]); 795 RE(t2) = RE(cc[ac]) + RE(cc[ac+3]);
260 IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]); 796 IM(t2) = IM(cc[ac]) + IM(cc[ac+3]);
261 RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]); 797 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]);
262 IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]); 798 IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]);
263 RE(t5) = RE(cc[ac]) - RE(cc[ac+3]); 799 RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]);
264 IM(t5) = IM(cc[ac]) - IM(cc[ac+3]); 800 IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]);
265 801 RE(t5) = RE(cc[ac]) - RE(cc[ac+3]);
266 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3); 802 IM(t5) = IM(cc[ac]) - IM(cc[ac+3]);
267 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3); 803
268 804 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3);
269 RE(c2) = RE(cc[ac-1]) + MUL_R_C(RE(t2),tr11) + MUL_R_C(RE(t3),tr12); 805 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3);
270 IM(c2) = IM(cc[ac-1]) + MUL_R_C(IM(t2),tr11) + MUL_R_C(IM(t3),tr12); 806
271 RE(c3) = RE(cc[ac-1]) + MUL_R_C(RE(t2),tr12) + MUL_R_C(RE(t3),tr11); 807 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
272 IM(c3) = IM(cc[ac-1]) + MUL_R_C(IM(t2),tr12) + MUL_R_C(IM(t3),tr11); 808 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
273 RE(c4) = (MUL_R_C(RE(t5),ti12)*isign - MUL_R_C(RE(t4),ti11)); 809 RE(c3) = RE(cc[ac-1]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
274 IM(c4) = (MUL_R_C(IM(t5),ti12)*isign - MUL_R_C(IM(t4),ti11)); 810 IM(c3) = IM(cc[ac-1]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
275 RE(c5) = (MUL_R_C(RE(t5),ti11)*isign + MUL_R_C(RE(t4),ti12)); 811
276 IM(c5) = (MUL_R_C(IM(t5),ti11)*isign + MUL_R_C(IM(t4),ti12)); 812 ComplexMult(&RE(c5), &RE(c4),
277 813 ti11, ti12, RE(t5), RE(t4));
278 RE(ch[ah+l1]) = RE(c2) - IM(c5); 814 ComplexMult(&IM(c5), &IM(c4),
279 IM(ch[ah+l1]) = IM(c2) + RE(c5); 815 ti11, ti12, IM(t5), IM(t4));
280 RE(ch[ah+2*l1]) = RE(c3) - IM(c4); 816
281 IM(ch[ah+2*l1]) = IM(c3) + RE(c4); 817 RE(ch[ah+l1]) = RE(c2) - IM(c5);
282 RE(ch[ah+3*l1]) = RE(c3) + IM(c4); 818 IM(ch[ah+l1]) = IM(c2) + RE(c5);
283 IM(ch[ah+3*l1]) = IM(c3) - RE(c4); 819 RE(ch[ah+2*l1]) = RE(c3) - IM(c4);
284 RE(ch[ah+4*l1]) = RE(c2) + IM(c5); 820 IM(ch[ah+2*l1]) = IM(c3) + RE(c4);
285 IM(ch[ah+4*l1]) = IM(c2) - RE(c5); 821 RE(ch[ah+3*l1]) = RE(c3) + IM(c4);
822 IM(ch[ah+3*l1]) = IM(c3) - RE(c4);
823 RE(ch[ah+4*l1]) = RE(c2) + IM(c5);
824 IM(ch[ah+4*l1]) = IM(c2) - RE(c5);
825 }
826 } else {
827 for (k = 0; k < l1; k++)
828 {
829 ac = 5*k + 1;
830 ah = k;
831
832 RE(t2) = RE(cc[ac]) + RE(cc[ac+3]);
833 IM(t2) = IM(cc[ac]) + IM(cc[ac+3]);
834 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]);
835 IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]);
836 RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]);
837 IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]);
838 RE(t5) = RE(cc[ac]) - RE(cc[ac+3]);
839 IM(t5) = IM(cc[ac]) - IM(cc[ac+3]);
840
841 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3);
842 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3);
843
844 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
845 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
846 RE(c3) = RE(cc[ac-1]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
847 IM(c3) = IM(cc[ac-1]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
848
849 ComplexMult(&RE(c4), &RE(c5),
850 ti12, ti11, RE(t5), RE(t4));
851 ComplexMult(&IM(c4), &IM(c5),
852 ti12, ti12, IM(t5), IM(t4));
853
854 RE(ch[ah+l1]) = RE(c2) + IM(c5);
855 IM(ch[ah+l1]) = IM(c2) - RE(c5);
856 RE(ch[ah+2*l1]) = RE(c3) + IM(c4);
857 IM(ch[ah+2*l1]) = IM(c3) - RE(c4);
858 RE(ch[ah+3*l1]) = RE(c3) - IM(c4);
859 IM(ch[ah+3*l1]) = IM(c3) + RE(c4);
860 RE(ch[ah+4*l1]) = RE(c2) - IM(c5);
861 IM(ch[ah+4*l1]) = IM(c2) + RE(c5);
862 }
286 } 863 }
287 } else { 864 } else {
288 for (k = 0; k < l1; k++) 865 if (isign == 1)
289 { 866 {
290 for (i = 0; i < ido; i++) 867 for (k = 0; k < l1; k++)
291 { 868 {
292 ac = i + (k*5 + 1) * ido; 869 for (i = 0; i < ido; i++)
293 ah = i + k * ido; 870 {
294 871 ac = i + (k*5 + 1) * ido;
295 RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]); 872 ah = i + k * ido;
296 IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]); 873
297 RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]); 874 RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]);
298 IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]); 875 IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]);
299 RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]); 876 RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]);
300 IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]); 877 IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]);
301 RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]); 878 RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]);
302 IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]); 879 IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]);
303 880 RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]);
304 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3); 881 IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]);
305 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3); 882
306 883 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3);
307 RE(c2) = RE(cc[ac-ido]) + MUL_R_C(RE(t2),tr11) + MUL_R_C(RE(t3),tr12); 884 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3);
308 IM(c2) = IM(cc[ac-ido]) + MUL_R_C(IM(t2),tr11) + MUL_R_C(IM(t3),tr12); 885
309 RE(c3) = RE(cc[ac-ido]) + MUL_R_C(RE(t2),tr12) + MUL_R_C(RE(t3),tr11); 886 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
310 IM(c3) = IM(cc[ac-ido]) + MUL_R_C(IM(t2),tr12) + MUL_R_C(IM(t3),tr11); 887 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
311 RE(c4) = (MUL_R_C(RE(t5),ti12)*isign - MUL_R_C(RE(t4),ti11)); 888 RE(c3) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
312 IM(c4) = (MUL_R_C(IM(t5),ti12)*isign - MUL_R_C(IM(t4),ti11)); 889 IM(c3) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
313 RE(c5) = (MUL_R_C(RE(t5),ti11)*isign + MUL_R_C(RE(t4),ti12)); 890
314 IM(c5) = (MUL_R_C(IM(t5),ti11)*isign + MUL_R_C(IM(t4),ti12)); 891 ComplexMult(&RE(c5), &RE(c4),
315 892 ti11, ti12, RE(t5), RE(t4));
316 IM(d2) = IM(c2) + RE(c5); 893 ComplexMult(&IM(c5), &IM(c4),
317 IM(d3) = IM(c3) + RE(c4); 894 ti11, ti12, IM(t5), IM(t4));
318 RE(d4) = RE(c3) + IM(c4); 895
319 RE(d5) = RE(c2) + IM(c5); 896 IM(d2) = IM(c2) + RE(c5);
320 RE(d2) = RE(c2) - IM(c5); 897 IM(d3) = IM(c3) + RE(c4);
321 IM(d5) = IM(c2) - RE(c5); 898 RE(d4) = RE(c3) + IM(c4);
322 RE(d3) = RE(c3) - IM(c4); 899 RE(d5) = RE(c2) + IM(c5);
323 IM(d4) = IM(c3) - RE(c4); 900 RE(d2) = RE(c2) - IM(c5);
324 901 IM(d5) = IM(c2) - RE(c5);
325 RE(ch[ah+l1*ido]) = MUL_R_C(RE(d2),RE(wa1[i])) - MUL_R_C(IM(d2),IM(wa1[i]))*isign; 902 RE(d3) = RE(c3) - IM(c4);
326 IM(ch[ah+l1*ido]) = MUL_R_C(IM(d2),RE(wa1[i])) + MUL_R_C(RE(d2),IM(wa1[i]))*isign; 903 IM(d4) = IM(c3) - RE(c4);
327 RE(ch[ah+2*l1*ido]) = MUL_R_C(RE(d3),RE(wa2[i])) - MUL_R_C(IM(d3),IM(wa2[i]))*isign; 904
328 IM(ch[ah+2*l1*ido]) = MUL_R_C(IM(d3),RE(wa2[i])) + MUL_R_C(RE(d3),IM(wa2[i]))*isign; 905 #if 1
329 RE(ch[ah+3*l1*ido]) = MUL_R_C(RE(d4),RE(wa3[i])) - MUL_R_C(IM(d4),IM(wa3[i]))*isign; 906 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
330 IM(ch[ah+3*l1*ido]) = MUL_R_C(IM(d4),RE(wa3[i])) + MUL_R_C(RE(d4),IM(wa3[i]))*isign; 907 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
331 RE(ch[ah+4*l1*ido]) = MUL_R_C(RE(d5),RE(wa4[i])) - MUL_R_C(IM(d5),IM(wa4[i]))*isign; 908 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
332 IM(ch[ah+4*l1*ido]) = MUL_R_C(IM(d5),RE(wa4[i])) + MUL_R_C(RE(d5),IM(wa4[i]))*isign; 909 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
910 ComplexMult(&IM(ch[ah+3*l1*ido]), &RE(ch[ah+3*l1*ido]),
911 IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i]));
912 ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]),
913 IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i]));
914 #else
915 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
916 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
917 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
918 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
919 ComplexMult(&RE(ch[ah+3*l1*ido]), &IM(ch[ah+3*l1*ido]),
920 RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i]));
921 ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]),
922 RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i]));
923 #endif
924 }
925 }
926 } else {
927 for (k = 0; k < l1; k++)
928 {
929 for (i = 0; i < ido; i++)
930 {
931 ac = i + (k*5 + 1) * ido;
932 ah = i + k * ido;
933
934 RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]);
935 IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]);
936 RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]);
937 IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]);
938 RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]);
939 IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]);
940 RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]);
941 IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]);
942
943 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3);
944 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3);
945
946 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
947 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
948 RE(c3) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
949 IM(c3) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
950
951 ComplexMult(&RE(c4), &RE(c5),
952 ti12, ti11, RE(t5), RE(t4));
953 ComplexMult(&IM(c4), &IM(c5),
954 ti12, ti12, IM(t5), IM(t4));
955
956 IM(d2) = IM(c2) - RE(c5);
957 IM(d3) = IM(c3) - RE(c4);
958 RE(d4) = RE(c3) - IM(c4);
959 RE(d5) = RE(c2) - IM(c5);
960 RE(d2) = RE(c2) + IM(c5);
961 IM(d5) = IM(c2) + RE(c5);
962 RE(d3) = RE(c3) + IM(c4);
963 IM(d4) = IM(c3) + RE(c4);
964
965 #if 1
966 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
967 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
968 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
969 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
970 ComplexMult(&RE(ch[ah+3*l1*ido]), &IM(ch[ah+3*l1*ido]),
971 RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i]));
972 ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]),
973 RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i]));
974 #else
975 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
976 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
977 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
978 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
979 ComplexMult(&IM(ch[ah+3*l1*ido]), &RE(ch[ah+3*l1*ido]),
980 IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i]));
981 ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]),
982 IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i]));
983 #endif
984 }
333 } 985 }
334 } 986 }
335 } 987 }
336 } 988 }
337 989
338 990
339 /*---------------------------------------------------------------------- 991 /*----------------------------------------------------------------------
340 cfftf1, cfftf, cfftb, cffti1, cffti. Complex FFTs. 992 cfftf1, cfftf, cfftb, cffti1, cffti. Complex FFTs.
341 ----------------------------------------------------------------------*/ 993 ----------------------------------------------------------------------*/
342 994
343 INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch, 995 #ifdef USE_SSE
344 uint16_t *ifac, complex_t *wa, int8_t isign) 996
997 #define CONV(A,B,C) ( (A<<2) | ((B & 0x1)<<1) | ((C==1)&0x1) )
998
999 static INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch,
1000 const uint16_t *ifac, const complex_t *wa,
1001 const int8_t isign)
345 { 1002 {
346 uint16_t i; 1003 uint16_t i;
347 uint16_t k1, l1, l2; 1004 uint16_t k1, l1, l2;
348 uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1; 1005 uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
349 1006
357 ip = ifac[k1]; 1014 ip = ifac[k1];
358 l2 = ip*l1; 1015 l2 = ip*l1;
359 ido = n / l2; 1016 ido = n / l2;
360 idl1 = ido*l1; 1017 idl1 = ido*l1;
361 1018
1019 ix2 = iw + ido;
1020 ix3 = ix2 + ido;
1021 ix4 = ix3 + ido;
1022
1023 switch (CONV(ip,na,ido))
1024 {
1025 case CONV(4,0,0):
1026 //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
1027 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
1028 break;
1029 case CONV(4,0,1):
1030 passf4pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
1031 break;
1032 case CONV(4,1,0):
1033 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
1034 //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
1035 break;
1036 case CONV(4,1,1):
1037 passf4pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
1038 break;
1039 case CONV(2,0,0):
1040 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1041 //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1042 break;
1043 case CONV(2,0,1):
1044 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1045 //passf2pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1046 break;
1047 case CONV(2,1,0):
1048 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1049 //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1050 break;
1051 case CONV(2,1,1):
1052 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1053 //passf2pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1054 break;
1055 case CONV(3,0,0):
1056 case CONV(3,0,1):
1057 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
1058 break;
1059 case CONV(3,1,0):
1060 case CONV(3,1,1):
1061 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
1062 break;
1063 case CONV(5,0,0):
1064 case CONV(5,0,1):
1065 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1066 break;
1067 case CONV(5,1,0):
1068 case CONV(5,1,1):
1069 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1070 break;
1071 }
1072
1073 na = 1 - na;
1074
1075 l1 = l2;
1076 iw += (ip-1) * ido;
1077 }
1078
1079 if (na == 0)
1080 return;
1081
1082 for (i = 0; i < n; i++)
1083 {
1084 RE(c[i]) = RE(ch[i]);
1085 IM(c[i]) = IM(ch[i]);
1086 }
1087 }
1088 #endif
1089
1090 static INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch,
1091 const uint16_t *ifac, const complex_t *wa,
1092 const int8_t isign)
1093 {
1094 uint16_t i;
1095 uint16_t k1, l1, l2;
1096 uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
1097
1098 nf = ifac[1];
1099 na = 0;
1100 l1 = 1;
1101 iw = 0;
1102
1103 for (k1 = 2; k1 <= nf+1; k1++)
1104 {
1105 ip = ifac[k1];
1106 l2 = ip*l1;
1107 ido = n / l2;
1108 idl1 = ido*l1;
1109
362 switch (ip) 1110 switch (ip)
363 { 1111 {
364 case 4: 1112 case 4:
365 ix2 = iw + ido; 1113 ix2 = iw + ido;
366 ix3 = ix2 + ido; 1114 ix3 = ix2 + ido;
367 1115
368 if (na == 0) 1116 if (na == 0)
369 passf4(ido, l1, c, ch, &wa[iw], &wa[ix2], &wa[ix3], isign); 1117 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
370 else 1118 else
371 passf4(ido, l1, ch, c, &wa[iw], &wa[ix2], &wa[ix3], isign); 1119 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
372 1120
373 na = 1 - na; 1121 na = 1 - na;
374 break; 1122 break;
375 case 2: 1123 case 2:
376 if (na == 0) 1124 if (na == 0)
377 passf2(ido, l1, c, ch, &wa[iw], isign); 1125 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
378 else 1126 else
379 passf2(ido, l1, ch, c, &wa[iw], isign); 1127 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
380 1128
381 na = 1 - na; 1129 na = 1 - na;
382 break; 1130 break;
383 case 3: 1131 case 3:
384 ix2 = iw + ido; 1132 ix2 = iw + ido;
385 1133
386 if (na == 0) 1134 if (na == 0)
387 passf3(ido, l1, c, ch, &wa[iw], &wa[ix2], isign); 1135 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
388 else 1136 else
389 passf3(ido, l1, ch, c, &wa[iw], &wa[ix2], isign); 1137 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
390 1138
391 na = 1 - na; 1139 na = 1 - na;
392 break; 1140 break;
393 case 5: 1141 case 5:
394 ix2 = iw + ido; 1142 ix2 = iw + ido;
395 ix3 = ix2 + ido; 1143 ix3 = ix2 + ido;
396 ix4 = ix3 + ido; 1144 ix4 = ix3 + ido;
397 1145
398 if (na == 0) 1146 if (na == 0)
399 passf5(ido, l1, c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); 1147 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
400 else 1148 else
401 passf5(ido, l1, ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); 1149 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
402 1150
403 na = 1 - na; 1151 na = 1 - na;
404 break; 1152 break;
405 } 1153 }
406 1154
416 RE(c[i]) = RE(ch[i]); 1164 RE(c[i]) = RE(ch[i]);
417 IM(c[i]) = IM(ch[i]); 1165 IM(c[i]) = IM(ch[i]);
418 } 1166 }
419 } 1167 }
420 1168
1169 static INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch,
1170 const uint16_t *ifac, const complex_t *wa,
1171 const int8_t isign)
1172 {
1173 uint16_t i;
1174 uint16_t k1, l1, l2;
1175 uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
1176
1177 nf = ifac[1];
1178 na = 0;
1179 l1 = 1;
1180 iw = 0;
1181
1182 for (k1 = 2; k1 <= nf+1; k1++)
1183 {
1184 ip = ifac[k1];
1185 l2 = ip*l1;
1186 ido = n / l2;
1187 idl1 = ido*l1;
1188
1189 switch (ip)
1190 {
1191 case 4:
1192 ix2 = iw + ido;
1193 ix3 = ix2 + ido;
1194
1195 if (na == 0)
1196 passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
1197 else
1198 passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
1199
1200 na = 1 - na;
1201 break;
1202 case 2:
1203 if (na == 0)
1204 passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1205 else
1206 passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1207
1208 na = 1 - na;
1209 break;
1210 case 3:
1211 ix2 = iw + ido;
1212
1213 if (na == 0)
1214 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
1215 else
1216 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
1217
1218 na = 1 - na;
1219 break;
1220 case 5:
1221 ix2 = iw + ido;
1222 ix3 = ix2 + ido;
1223 ix4 = ix3 + ido;
1224
1225 if (na == 0)
1226 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1227 else
1228 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1229
1230 na = 1 - na;
1231 break;
1232 }
1233
1234 l1 = l2;
1235 iw += (ip-1) * ido;
1236 }
1237
1238 if (na == 0)
1239 return;
1240
1241 for (i = 0; i < n; i++)
1242 {
1243 RE(c[i]) = RE(ch[i]);
1244 IM(c[i]) = IM(ch[i]);
1245 }
1246 }
1247
421 void cfftf(cfft_info *cfft, complex_t *c) 1248 void cfftf(cfft_info *cfft, complex_t *c)
422 { 1249 {
423 cfftf1(cfft->n, c, cfft->work, cfft->ifac, cfft->tab, -1); 1250 cfftf1neg(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, -1);
424 } 1251 }
425 1252
426 void cfftb(cfft_info *cfft, complex_t *c) 1253 void cfftb(cfft_info *cfft, complex_t *c)
427 { 1254 {
428 cfftf1(cfft->n, c, cfft->work, cfft->ifac, cfft->tab, +1); 1255 cfftf1pos(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
429 } 1256 }
1257
1258 #ifdef USE_SSE
1259 void cfftb_sse(cfft_info *cfft, complex_t *c)
1260 {
1261 cfftf1pos_sse(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
1262 }
1263 #endif
430 1264
431 static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac) 1265 static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac)
432 { 1266 {
433 static uint16_t ntryh[4] = {3, 4, 2, 5}; 1267 static uint16_t ntryh[4] = {3, 4, 2, 5};
434 #ifndef FIXED_POINT 1268 #ifndef FIXED_POINT
435 real_t arg, argh, argld, fi; 1269 real_t arg, argh, argld, fi;
436 uint16_t ido, ipm; 1270 uint16_t ido, ipm;
437 uint16_t i1, k1, l1, l2; 1271 uint16_t i1, k1, l1, l2;
438 uint16_t ld, ii, ip; 1272 uint16_t ld, ii, ip;
439 #endif 1273 #endif
440 uint16_t ntry, i, j; 1274 uint16_t ntry = 0, i, j;
441 uint16_t ib; 1275 uint16_t ib;
442 uint16_t nf, nl, nq, nr; 1276 uint16_t nf, nl, nq, nr;
443 1277
444 nl = n; 1278 nl = n;
445 nf = 0; 1279 nf = 0;
478 1312
479 ifac[0] = n; 1313 ifac[0] = n;
480 ifac[1] = nf; 1314 ifac[1] = nf;
481 1315
482 #ifndef FIXED_POINT 1316 #ifndef FIXED_POINT
483 argh = (real_t)2.0*M_PI / (real_t)n; 1317 argh = (real_t)2.0*(real_t)M_PI / (real_t)n;
484 i = 0; 1318 i = 0;
485 l1 = 1; 1319 l1 = 1;
486 1320
487 for (k1 = 1; k1 <= nf; k1++) 1321 for (k1 = 1; k1 <= nf; k1++)
488 { 1322 {
505 { 1339 {
506 i++; 1340 i++;
507 fi++; 1341 fi++;
508 arg = fi * argld; 1342 arg = fi * argld;
509 RE(wa[i]) = (real_t)cos(arg); 1343 RE(wa[i]) = (real_t)cos(arg);
1344 #if 1
510 IM(wa[i]) = (real_t)sin(arg); 1345 IM(wa[i]) = (real_t)sin(arg);
1346 #else
1347 IM(wa[i]) = (real_t)-sin(arg);
1348 #endif
511 } 1349 }
512 1350
513 if (ip > 5) 1351 if (ip > 5)
514 { 1352 {
515 RE(wa[i1]) = RE(wa[i]); 1353 RE(wa[i1]) = RE(wa[i]);
521 #endif 1359 #endif
522 } 1360 }
523 1361
524 cfft_info *cffti(uint16_t n) 1362 cfft_info *cffti(uint16_t n)
525 { 1363 {
526 cfft_info *cfft = (cfft_info*)malloc(sizeof(cfft_info)); 1364 cfft_info *cfft = (cfft_info*)faad_malloc(sizeof(cfft_info));
527 1365
528 cfft->n = n; 1366 cfft->n = n;
529 cfft->work = (complex_t*)malloc(n*sizeof(complex_t)); 1367 cfft->work = (complex_t*)faad_malloc(n*sizeof(complex_t));
530 1368
531 #ifndef FIXED_POINT 1369 #ifndef FIXED_POINT
532 cfft->tab = (complex_t*)malloc(n*sizeof(complex_t)); 1370 cfft->tab = (complex_t*)faad_malloc(n*sizeof(complex_t));
533 1371
534 cffti1(n, cfft->tab, cfft->ifac); 1372 cffti1(n, cfft->tab, cfft->ifac);
535 #else 1373 #else
536 cffti1(n, NULL, cfft->ifac); 1374 cffti1(n, NULL, cfft->ifac);
537 1375
538 switch (n) 1376 switch (n)
539 { 1377 {
540 case 60: cfft->tab = cfft_tab_60; break;
541 case 64: cfft->tab = cfft_tab_64; break; 1378 case 64: cfft->tab = cfft_tab_64; break;
542 case 480: cfft->tab = cfft_tab_480; break;
543 case 512: cfft->tab = cfft_tab_512; break; 1379 case 512: cfft->tab = cfft_tab_512; break;
544 #ifdef LD_DEC 1380 #ifdef LD_DEC
1381 case 256: cfft->tab = cfft_tab_256; break;
1382 #endif
1383
1384 #ifdef ALLOW_SMALL_FRAMELENGTH
1385 case 60: cfft->tab = cfft_tab_60; break;
1386 case 480: cfft->tab = cfft_tab_480; break;
1387 #ifdef LD_DEC
545 case 240: cfft->tab = cfft_tab_240; break; 1388 case 240: cfft->tab = cfft_tab_240; break;
546 case 256: cfft->tab = cfft_tab_256; break; 1389 #endif
547 #endif 1390 #endif
548 } 1391 }
549 #endif 1392 #endif
550 1393
551 return cfft; 1394 return cfft;
552 } 1395 }
553 1396
554 void cfftu(cfft_info *cfft) 1397 void cfftu(cfft_info *cfft)
555 { 1398 {
556 if (cfft->work) free(cfft->work); 1399 if (cfft->work) faad_free(cfft->work);
557 #ifndef FIXED_POINT 1400 #ifndef FIXED_POINT
558 if (cfft->tab) free(cfft->tab); 1401 if (cfft->tab) faad_free(cfft->tab);
559 #endif 1402 #endif
560 1403
561 if (cfft) free(cfft); 1404 if (cfft) faad_free(cfft);
562 } 1405 }
1406