10725
|
1 /*
|
|
2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
|
12527
|
3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
|
10725
|
4 **
|
|
5 ** This program is free software; you can redistribute it and/or modify
|
|
6 ** it under the terms of the GNU General Public License as published by
|
|
7 ** the Free Software Foundation; either version 2 of the License, or
|
|
8 ** (at your option) any later version.
|
|
9 **
|
|
10 ** This program is distributed in the hope that it will be useful,
|
|
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 ** GNU General Public License for more details.
|
|
14 **
|
|
15 ** You should have received a copy of the GNU General Public License
|
|
16 ** along with this program; if not, write to the Free Software
|
|
17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
18 **
|
|
19 ** Any non-GPL usage of this software or parts of this software is strictly
|
|
20 ** forbidden.
|
|
21 **
|
|
22 ** Commercial non-GPL licensing of this software is possible.
|
|
23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
|
|
24 **
|
12527
|
25 ** $Id: filtbank.c,v 1.2 2003/10/03 22:22:27 alex Exp $
|
10725
|
26 **/
|
|
27
|
|
28 #include "common.h"
|
|
29 #include "structs.h"
|
|
30
|
|
31 #include <stdlib.h>
|
|
32 #include <string.h>
|
|
33 #ifdef _WIN32_WCE
|
|
34 #define assert(x)
|
|
35 #else
|
|
36 #include <assert.h>
|
|
37 #endif
|
|
38
|
|
39 #include "filtbank.h"
|
|
40 #include "decoder.h"
|
|
41 #include "syntax.h"
|
|
42 #include "kbd_win.h"
|
|
43 #include "sine_win.h"
|
|
44 #include "mdct.h"
|
|
45
|
|
46
|
|
47 fb_info *filter_bank_init(uint16_t frame_len)
|
|
48 {
|
|
49 uint16_t nshort = frame_len/8;
|
|
50 #ifdef LD_DEC
|
|
51 uint16_t frame_len_ld = frame_len/2;
|
|
52 #endif
|
|
53
|
12527
|
54 fb_info *fb = (fb_info*)faad_malloc(sizeof(fb_info));
|
10725
|
55 memset(fb, 0, sizeof(fb_info));
|
|
56
|
|
57 /* normal */
|
|
58 fb->mdct256 = faad_mdct_init(2*nshort);
|
|
59 fb->mdct2048 = faad_mdct_init(2*frame_len);
|
|
60 #ifdef LD_DEC
|
|
61 /* LD */
|
|
62 fb->mdct1024 = faad_mdct_init(2*frame_len_ld);
|
|
63 #endif
|
|
64
|
12527
|
65 #ifdef ALLOW_SMALL_FRAMELENGTH
|
10725
|
66 if (frame_len == 1024)
|
|
67 {
|
12527
|
68 #endif
|
10725
|
69 fb->long_window[0] = sine_long_1024;
|
|
70 fb->short_window[0] = sine_short_128;
|
|
71 fb->long_window[1] = kbd_long_1024;
|
|
72 fb->short_window[1] = kbd_short_128;
|
|
73 #ifdef LD_DEC
|
|
74 fb->ld_window[0] = sine_mid_512;
|
|
75 fb->ld_window[1] = ld_mid_512;
|
|
76 #endif
|
12527
|
77 #ifdef ALLOW_SMALL_FRAMELENGTH
|
10725
|
78 } else /* (frame_len == 960) */ {
|
|
79 fb->long_window[0] = sine_long_960;
|
|
80 fb->short_window[0] = sine_short_120;
|
|
81 fb->long_window[1] = kbd_long_960;
|
|
82 fb->short_window[1] = kbd_short_120;
|
|
83 #ifdef LD_DEC
|
|
84 fb->ld_window[0] = sine_mid_480;
|
|
85 fb->ld_window[1] = ld_mid_480;
|
|
86 #endif
|
|
87 }
|
12527
|
88 #endif
|
|
89
|
|
90 #ifdef USE_SSE
|
|
91 if (cpu_has_sse())
|
|
92 {
|
|
93 fb->if_func = ifilter_bank_sse;
|
|
94 } else {
|
|
95 fb->if_func = ifilter_bank;
|
|
96 }
|
|
97 #endif
|
10725
|
98
|
|
99 return fb;
|
|
100 }
|
|
101
|
|
102 void filter_bank_end(fb_info *fb)
|
|
103 {
|
|
104 if (fb != NULL)
|
|
105 {
|
12527
|
106 #ifdef PROFILE
|
|
107 printf("FB: %I64d cycles\n", fb->cycles);
|
|
108 #endif
|
|
109
|
10725
|
110 faad_mdct_end(fb->mdct256);
|
|
111 faad_mdct_end(fb->mdct2048);
|
|
112 #ifdef LD_DEC
|
|
113 faad_mdct_end(fb->mdct1024);
|
|
114 #endif
|
|
115
|
12527
|
116 faad_free(fb);
|
10725
|
117 }
|
|
118 }
|
|
119
|
12527
|
120 static INLINE void imdct_long(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
|
10725
|
121 {
|
12527
|
122 #ifdef LD_DEC
|
|
123 mdct_info *mdct = NULL;
|
10725
|
124
|
|
125 switch (len)
|
|
126 {
|
|
127 case 2048:
|
|
128 case 1920:
|
|
129 mdct = fb->mdct2048;
|
|
130 break;
|
|
131 case 1024:
|
|
132 case 960:
|
|
133 mdct = fb->mdct1024;
|
|
134 break;
|
|
135 }
|
|
136
|
|
137 faad_imdct(mdct, in_data, out_data);
|
12527
|
138 #else
|
|
139 faad_imdct(fb->mdct2048, in_data, out_data);
|
|
140 #endif
|
10725
|
141 }
|
|
142
|
12527
|
143 #ifdef USE_SSE
|
|
144 static INLINE void imdct_long_sse(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
|
|
145 {
|
|
146 #ifdef LD_DEC
|
|
147 mdct_info *mdct = NULL;
|
|
148
|
|
149 switch (len)
|
|
150 {
|
|
151 case 2048:
|
|
152 case 1920:
|
|
153 mdct = fb->mdct2048;
|
|
154 break;
|
|
155 case 1024:
|
|
156 case 960:
|
|
157 mdct = fb->mdct1024;
|
|
158 break;
|
|
159 }
|
|
160
|
|
161 faad_imdct_sse(mdct, in_data, out_data);
|
|
162 #else
|
|
163 faad_imdct_sse(fb->mdct2048, in_data, out_data);
|
|
164 #endif
|
|
165 }
|
|
166 #endif
|
|
167
|
10725
|
168 #ifdef LTP_DEC
|
|
169 static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
|
|
170 {
|
12527
|
171 mdct_info *mdct = NULL;
|
10725
|
172
|
|
173 switch (len)
|
|
174 {
|
|
175 case 2048:
|
|
176 case 1920:
|
|
177 mdct = fb->mdct2048;
|
|
178 break;
|
|
179 case 256:
|
|
180 case 240:
|
|
181 mdct = fb->mdct256;
|
|
182 break;
|
|
183 #ifdef LD_DEC
|
|
184 case 1024:
|
|
185 case 960:
|
|
186 mdct = fb->mdct1024;
|
|
187 break;
|
|
188 #endif
|
|
189 }
|
|
190
|
|
191 faad_mdct(mdct, in_data, out_data);
|
|
192 }
|
|
193 #endif
|
|
194
|
|
195 void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
|
|
196 uint8_t window_shape_prev, real_t *freq_in,
|
12527
|
197 real_t *time_out, real_t *overlap,
|
|
198 uint8_t object_type, uint16_t frame_len)
|
10725
|
199 {
|
|
200 int16_t i;
|
12527
|
201 ALIGN real_t transf_buf[2*1024] = {0};
|
10725
|
202
|
12527
|
203 const real_t *window_long = NULL;
|
|
204 const real_t *window_long_prev = NULL;
|
|
205 const real_t *window_short = NULL;
|
|
206 const real_t *window_short_prev = NULL;
|
10725
|
207
|
|
208 uint16_t nlong = frame_len;
|
|
209 uint16_t nshort = frame_len/8;
|
|
210 uint16_t trans = nshort/2;
|
|
211
|
|
212 uint16_t nflat_ls = (nlong-nshort)/2;
|
|
213
|
12527
|
214 #ifdef PROFILE
|
|
215 int64_t count = faad_get_ts();
|
|
216 #endif
|
|
217
|
|
218 #ifdef LD_DEC
|
|
219 if (object_type == LD)
|
|
220 {
|
|
221 window_long = fb->ld_window[window_shape];
|
|
222 window_long_prev = fb->ld_window[window_shape_prev];
|
|
223 } else {
|
|
224 #endif
|
|
225 window_long = fb->long_window[window_shape];
|
|
226 window_long_prev = fb->long_window[window_shape_prev];
|
|
227 window_short = fb->short_window[window_shape];
|
|
228 window_short_prev = fb->short_window[window_shape_prev];
|
|
229 #ifdef LD_DEC
|
|
230 }
|
|
231 #endif
|
|
232
|
|
233
|
|
234 switch (window_sequence)
|
|
235 {
|
|
236 case ONLY_LONG_SEQUENCE:
|
|
237 imdct_long(fb, freq_in, transf_buf, 2*nlong);
|
|
238 for (i = 0; i < nlong; i+=4)
|
|
239 {
|
|
240 time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]);
|
|
241 time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]);
|
|
242 time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]);
|
|
243 time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]);
|
|
244 }
|
|
245 for (i = 0; i < nlong; i+=4)
|
|
246 {
|
|
247 overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
|
|
248 overlap[i+1] = MUL_F(transf_buf[nlong+i+1],window_long[nlong-2-i]);
|
|
249 overlap[i+2] = MUL_F(transf_buf[nlong+i+2],window_long[nlong-3-i]);
|
|
250 overlap[i+3] = MUL_F(transf_buf[nlong+i+3],window_long[nlong-4-i]);
|
|
251 }
|
|
252 break;
|
|
253
|
|
254 case LONG_START_SEQUENCE:
|
|
255 imdct_long(fb, freq_in, transf_buf, 2*nlong);
|
|
256 for (i = 0; i < nlong; i+=4)
|
|
257 {
|
|
258 time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]);
|
|
259 time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]);
|
|
260 time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]);
|
|
261 time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]);
|
|
262 }
|
|
263 for (i = 0; i < nflat_ls; i++)
|
|
264 overlap[i] = transf_buf[nlong+i];
|
|
265 for (i = 0; i < nshort; i++)
|
|
266 overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]);
|
|
267 for (i = 0; i < nflat_ls; i++)
|
|
268 overlap[nflat_ls+nshort+i] = 0;
|
|
269 break;
|
|
270
|
|
271 case EIGHT_SHORT_SEQUENCE:
|
|
272 faad_imdct(fb->mdct256, freq_in+0*nshort, transf_buf+2*nshort*0);
|
|
273 faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1);
|
|
274 faad_imdct(fb->mdct256, freq_in+2*nshort, transf_buf+2*nshort*2);
|
|
275 faad_imdct(fb->mdct256, freq_in+3*nshort, transf_buf+2*nshort*3);
|
|
276 faad_imdct(fb->mdct256, freq_in+4*nshort, transf_buf+2*nshort*4);
|
|
277 faad_imdct(fb->mdct256, freq_in+5*nshort, transf_buf+2*nshort*5);
|
|
278 faad_imdct(fb->mdct256, freq_in+6*nshort, transf_buf+2*nshort*6);
|
|
279 faad_imdct(fb->mdct256, freq_in+7*nshort, transf_buf+2*nshort*7);
|
|
280 for (i = 0; i < nflat_ls; i++)
|
|
281 time_out[i] = overlap[i];
|
|
282 for(i = 0; i < nshort; i++)
|
|
283 {
|
|
284 time_out[nflat_ls+ i] = overlap[nflat_ls+ i] + MUL_F(transf_buf[nshort*0+i],window_short_prev[i]);
|
|
285 time_out[nflat_ls+1*nshort+i] = overlap[nflat_ls+nshort*1+i] + MUL_F(transf_buf[nshort*1+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*2+i],window_short[i]);
|
|
286 time_out[nflat_ls+2*nshort+i] = overlap[nflat_ls+nshort*2+i] + MUL_F(transf_buf[nshort*3+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*4+i],window_short[i]);
|
|
287 time_out[nflat_ls+3*nshort+i] = overlap[nflat_ls+nshort*3+i] + MUL_F(transf_buf[nshort*5+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*6+i],window_short[i]);
|
|
288 if (i < trans)
|
|
289 time_out[nflat_ls+4*nshort+i] = overlap[nflat_ls+nshort*4+i] + MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]);
|
|
290 }
|
|
291 for(i = 0; i < nshort; i++)
|
|
292 {
|
|
293 if (i >= trans)
|
|
294 overlap[nflat_ls+4*nshort+i-nlong] = MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]);
|
|
295 overlap[nflat_ls+5*nshort+i-nlong] = MUL_F(transf_buf[nshort*9+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*10+i],window_short[i]);
|
|
296 overlap[nflat_ls+6*nshort+i-nlong] = MUL_F(transf_buf[nshort*11+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*12+i],window_short[i]);
|
|
297 overlap[nflat_ls+7*nshort+i-nlong] = MUL_F(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*14+i],window_short[i]);
|
|
298 overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[nshort*15+i],window_short[nshort-1-i]);
|
|
299 }
|
|
300 for (i = 0; i < nflat_ls; i++)
|
|
301 overlap[nflat_ls+nshort+i] = 0;
|
|
302 break;
|
|
303
|
|
304 case LONG_STOP_SEQUENCE:
|
|
305 imdct_long(fb, freq_in, transf_buf, 2*nlong);
|
|
306 for (i = 0; i < nflat_ls; i++)
|
|
307 time_out[i] = overlap[i];
|
|
308 for (i = 0; i < nshort; i++)
|
|
309 time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]);
|
|
310 for (i = 0; i < nflat_ls; i++)
|
|
311 time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i];
|
|
312 for (i = 0; i < nlong; i++)
|
|
313 overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
|
|
314 break;
|
|
315 }
|
|
316
|
|
317 #ifdef PROFILE
|
|
318 count = faad_get_ts() - count;
|
|
319 fb->cycles += count;
|
|
320 #endif
|
|
321 }
|
|
322
|
|
323 #ifdef USE_SSE
|
|
324 void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
|
|
325 uint8_t window_shape_prev, real_t *freq_in,
|
|
326 real_t *time_out, uint8_t object_type, uint16_t frame_len)
|
|
327 {
|
|
328 int16_t i;
|
|
329 ALIGN real_t transf_buf[2*1024] = {0};
|
|
330
|
|
331 const real_t *window_long = NULL;
|
|
332 const real_t *window_long_prev = NULL;
|
|
333 const real_t *window_short = NULL;
|
|
334 const real_t *window_short_prev = NULL;
|
|
335
|
|
336 uint16_t nlong = frame_len;
|
|
337 uint16_t nshort = frame_len/8;
|
|
338 uint16_t trans = nshort/2;
|
|
339
|
|
340 uint16_t nflat_ls = (nlong-nshort)/2;
|
|
341
|
|
342 #ifdef PROFILE
|
|
343 int64_t count = faad_get_ts();
|
|
344 #endif
|
10725
|
345
|
|
346 #ifdef LD_DEC
|
|
347 if (object_type == LD)
|
|
348 {
|
|
349 window_long = fb->ld_window[window_shape];
|
|
350 window_long_prev = fb->ld_window[window_shape_prev];
|
|
351 } else {
|
|
352 #endif
|
|
353 window_long = fb->long_window[window_shape];
|
|
354 window_long_prev = fb->long_window[window_shape_prev];
|
|
355 window_short = fb->short_window[window_shape];
|
|
356 window_short_prev = fb->short_window[window_shape_prev];
|
|
357 #ifdef LD_DEC
|
|
358 }
|
|
359 #endif
|
|
360
|
|
361 switch (window_sequence)
|
|
362 {
|
|
363 case ONLY_LONG_SEQUENCE:
|
12527
|
364 imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
|
10989
|
365 for (i = 0; i < nlong; i+=4)
|
10725
|
366 {
|
12527
|
367 __m128 m1, m2, m3, m4, m5, m6, m7, m8;
|
|
368
|
|
369 m1 = _mm_load_ps(&transf_buf[i]);
|
|
370 m2 = _mm_load_ps(&window_long_prev[i]);
|
|
371 m6 = _mm_load_ps(&window_long[nlong-4-i]);
|
|
372 m3 = _mm_load_ps(&time_out[nlong+i]);
|
|
373 m5 = _mm_load_ps(&transf_buf[nlong+i]);
|
|
374
|
|
375 m4 = _mm_mul_ps(m1, m2);
|
|
376 m7 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 1, 2, 3));
|
|
377
|
|
378 m4 = _mm_add_ps(m4, m3);
|
|
379 m8 = _mm_mul_ps(m5, m7);
|
|
380
|
|
381 _mm_store_ps(&time_out[i], m4);
|
|
382 _mm_store_ps(&time_out[nlong+i], m8);
|
10725
|
383 }
|
|
384 break;
|
|
385
|
|
386 case LONG_START_SEQUENCE:
|
12527
|
387 imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
|
10989
|
388 for (i = 0; i < nlong; i+=4)
|
|
389 {
|
12527
|
390 __m128 m1 = _mm_load_ps(&transf_buf[i]);
|
|
391 __m128 m2 = _mm_load_ps(&window_long_prev[i]);
|
|
392 __m128 m3 = _mm_load_ps(&time_out[nlong+i]);
|
|
393
|
|
394 __m128 m4 = _mm_mul_ps(m1, m2);
|
|
395 m4 = _mm_add_ps(m4, m3);
|
|
396
|
|
397 _mm_store_ps(&time_out[i], m4);
|
|
398 }
|
|
399 for (i = 0; i < nflat_ls; i+=4)
|
|
400 {
|
|
401 __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]);
|
|
402 _mm_store_ps(&time_out[nlong+i], m1);
|
10989
|
403 }
|
12527
|
404 for (i = 0; i < nshort; i+=4)
|
|
405 {
|
|
406 __m128 m1 = _mm_load_ps(&transf_buf[nlong+nflat_ls+i]);
|
|
407 __m128 m2 = _mm_load_ps(&window_short[nshort-4-i]);
|
|
408 __m128 m3, m4;
|
|
409
|
|
410 m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
411
|
|
412 m4 = _mm_mul_ps(m1, m3);
|
|
413
|
|
414 _mm_store_ps(&time_out[nlong+nflat_ls+i], m4);
|
|
415 }
|
|
416 for (i = 0; i < nflat_ls; i+=4)
|
|
417 {
|
|
418 __m128 m1 = _mm_setzero_ps();
|
|
419 _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1);
|
|
420 }
|
10725
|
421 break;
|
|
422
|
|
423 case EIGHT_SHORT_SEQUENCE:
|
12527
|
424 faad_imdct_sse(fb->mdct256, &freq_in[0*nshort], &transf_buf[2*nshort*0]);
|
|
425 faad_imdct_sse(fb->mdct256, &freq_in[1*nshort], &transf_buf[2*nshort*1]);
|
|
426 faad_imdct_sse(fb->mdct256, &freq_in[2*nshort], &transf_buf[2*nshort*2]);
|
|
427 faad_imdct_sse(fb->mdct256, &freq_in[3*nshort], &transf_buf[2*nshort*3]);
|
|
428 faad_imdct_sse(fb->mdct256, &freq_in[4*nshort], &transf_buf[2*nshort*4]);
|
|
429 faad_imdct_sse(fb->mdct256, &freq_in[5*nshort], &transf_buf[2*nshort*5]);
|
|
430 faad_imdct_sse(fb->mdct256, &freq_in[6*nshort], &transf_buf[2*nshort*6]);
|
|
431 faad_imdct_sse(fb->mdct256, &freq_in[7*nshort], &transf_buf[2*nshort*7]);
|
|
432 for (i = 0; i < nflat_ls; i+=4)
|
|
433 {
|
|
434 __m128 m1 = _mm_load_ps(&time_out[nlong+i]);
|
|
435 _mm_store_ps(&time_out[i], m1);
|
|
436 }
|
|
437 for (i = 0; i < nshort; i+=4)
|
|
438 {
|
|
439 __m128 m1 = _mm_load_ps(&transf_buf[nshort*0+i]);
|
|
440 __m128 m2 = _mm_load_ps(&window_short_prev[i]);
|
|
441 __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]);
|
|
442
|
|
443 __m128 m4 = _mm_mul_ps(m1, m2);
|
|
444 m4 = _mm_add_ps(m4, m3);
|
|
445
|
|
446 _mm_store_ps(&time_out[nflat_ls+i], m4);
|
|
447 }
|
|
448 for (i = 0; i < nshort; i+=4)
|
|
449 {
|
|
450 __m128 m1, m2, m3, m4, m5, m6, m7, m8;
|
|
451 m1 = _mm_load_ps(&transf_buf[nshort*1+i]);
|
|
452 m2 = _mm_load_ps(&window_short[nshort-4-i]);
|
|
453 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*1+i]);
|
|
454 m6 = _mm_load_ps(&transf_buf[nshort*2+i]);
|
|
455 m7 = _mm_load_ps(&window_short[i]);
|
|
456
|
|
457 m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
458
|
|
459 m4 = _mm_mul_ps(m1, m5);
|
|
460 m8 = _mm_mul_ps(m6, m7);
|
|
461 m4 = _mm_add_ps(m4, m3);
|
|
462 m4 = _mm_add_ps(m4, m8);
|
|
463
|
|
464 _mm_store_ps(&time_out[nflat_ls+1*nshort+i], m4);
|
|
465 }
|
|
466 for (i = 0; i < nshort; i+=4)
|
|
467 {
|
|
468 __m128 m1, m2, m3, m4, m5, m6, m7, m8;
|
|
469 m1 = _mm_load_ps(&transf_buf[nshort*3+i]);
|
|
470 m2 = _mm_load_ps(&window_short[nshort-4-i]);
|
|
471 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*2+i]);
|
|
472 m6 = _mm_load_ps(&transf_buf[nshort*4+i]);
|
|
473 m7 = _mm_load_ps(&window_short[i]);
|
|
474
|
|
475 m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
476
|
|
477 m4 = _mm_mul_ps(m1, m5);
|
|
478 m8 = _mm_mul_ps(m6, m7);
|
|
479 m4 = _mm_add_ps(m4, m3);
|
|
480 m4 = _mm_add_ps(m4, m8);
|
|
481
|
|
482 _mm_store_ps(&time_out[nflat_ls+2*nshort+i], m4);
|
|
483 }
|
|
484 for (i = 0; i < nshort; i+=4)
|
|
485 {
|
|
486 __m128 m1, m2, m3, m4, m5, m6, m7, m8;
|
|
487 m1 = _mm_load_ps(&transf_buf[nshort*5+i]);
|
|
488 m2 = _mm_load_ps(&window_short[nshort-4-i]);
|
|
489 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*3+i]);
|
|
490 m6 = _mm_load_ps(&transf_buf[nshort*6+i]);
|
|
491 m7 = _mm_load_ps(&window_short[i]);
|
|
492
|
|
493 m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
494
|
|
495 m4 = _mm_mul_ps(m1, m5);
|
|
496 m8 = _mm_mul_ps(m6, m7);
|
|
497 m4 = _mm_add_ps(m4, m3);
|
|
498 m4 = _mm_add_ps(m4, m8);
|
|
499
|
|
500 _mm_store_ps(&time_out[nflat_ls+3*nshort+i], m4);
|
|
501 }
|
|
502 for(i = 0; i < trans; i+=4)
|
10725
|
503 {
|
12527
|
504 __m128 m1, m2, m3, m4, m5, m6, m7, m8;
|
|
505 m1 = _mm_load_ps(&transf_buf[nshort*7+i]);
|
|
506 m2 = _mm_load_ps(&window_short[nshort-4-i]);
|
|
507 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*4+i]);
|
|
508 m6 = _mm_load_ps(&transf_buf[nshort*8+i]);
|
|
509 m7 = _mm_load_ps(&window_short[i]);
|
|
510
|
|
511 m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
512
|
|
513 m4 = _mm_mul_ps(m1, m5);
|
|
514 m8 = _mm_mul_ps(m6, m7);
|
|
515 m4 = _mm_add_ps(m4, m3);
|
|
516 m4 = _mm_add_ps(m4, m8);
|
|
517
|
|
518 _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m4);
|
|
519 }
|
|
520 for (i = trans; i < nshort; i+=4)
|
|
521 {
|
|
522 __m128 m1, m2, m3, m4, m5, m6, m7, m8;
|
|
523 m1 = _mm_load_ps(&transf_buf[nshort*7+i]);
|
|
524 m2 = _mm_load_ps(&window_short[nshort-4-i]);
|
|
525 m6 = _mm_load_ps(&transf_buf[nshort*8+i]);
|
|
526 m7 = _mm_load_ps(&window_short[i]);
|
|
527
|
|
528 m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
529
|
|
530 m4 = _mm_mul_ps(m1, m5);
|
|
531 m8 = _mm_mul_ps(m6, m7);
|
|
532 m3 = _mm_add_ps(m4, m8);
|
|
533
|
|
534 _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m3);
|
|
535 }
|
|
536 for (i = 0; i < nshort; i+=4)
|
|
537 {
|
|
538 __m128 m1, m2, m3, m4, m5, m6, m7, m8;
|
|
539 m1 = _mm_load_ps(&transf_buf[nshort*9+i]);
|
|
540 m2 = _mm_load_ps(&window_short[nshort-4-i]);
|
|
541 m6 = _mm_load_ps(&transf_buf[nshort*10+i]);
|
|
542 m7 = _mm_load_ps(&window_short[i]);
|
|
543
|
|
544 m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
545
|
|
546 m4 = _mm_mul_ps(m1, m5);
|
|
547 m8 = _mm_mul_ps(m6, m7);
|
|
548 m3 = _mm_add_ps(m4, m8);
|
|
549
|
|
550 _mm_store_ps(&time_out[nflat_ls+5*nshort+i], m3);
|
10725
|
551 }
|
12527
|
552 for (i = 0; i < nshort; i+=4)
|
|
553 {
|
|
554 __m128 m1, m2, m3, m4, m5, m6, m7, m8;
|
|
555 m1 = _mm_load_ps(&transf_buf[nshort*11+i]);
|
|
556 m2 = _mm_load_ps(&window_short[nshort-4-i]);
|
|
557 m6 = _mm_load_ps(&transf_buf[nshort*12+i]);
|
|
558 m7 = _mm_load_ps(&window_short[i]);
|
|
559
|
|
560 m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
561
|
|
562 m4 = _mm_mul_ps(m1, m5);
|
|
563 m8 = _mm_mul_ps(m6, m7);
|
|
564 m3 = _mm_add_ps(m4, m8);
|
|
565
|
|
566 _mm_store_ps(&time_out[nflat_ls+6*nshort+i], m3);
|
|
567 }
|
|
568 for (i = 0; i < nshort; i+=4)
|
|
569 {
|
|
570 __m128 m1, m2, m3, m4, m5, m6, m7, m8;
|
|
571 m1 = _mm_load_ps(&transf_buf[nshort*13+i]);
|
|
572 m2 = _mm_load_ps(&window_short[nshort-4-i]);
|
|
573 m6 = _mm_load_ps(&transf_buf[nshort*14+i]);
|
|
574 m7 = _mm_load_ps(&window_short[i]);
|
|
575
|
|
576 m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
577
|
|
578 m4 = _mm_mul_ps(m1, m5);
|
|
579 m8 = _mm_mul_ps(m6, m7);
|
|
580 m3 = _mm_add_ps(m4, m8);
|
|
581
|
|
582 _mm_store_ps(&time_out[nflat_ls+7*nshort+i], m3);
|
|
583 }
|
|
584 for (i = 0; i < nshort; i+=4)
|
|
585 {
|
|
586 __m128 m1, m2, m3, m5;
|
|
587 m1 = _mm_load_ps(&transf_buf[nshort*15+i]);
|
|
588 m2 = _mm_load_ps(&window_short[nshort-4-i]);
|
|
589
|
|
590 m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
591
|
|
592 m3 = _mm_mul_ps(m1, m5);
|
|
593
|
|
594 _mm_store_ps(&time_out[nflat_ls+8*nshort+i], m3);
|
|
595 }
|
|
596 for (i = 0; i < nflat_ls; i+=4)
|
|
597 {
|
|
598 __m128 m1 = _mm_setzero_ps();
|
|
599 _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1);
|
|
600 }
|
10725
|
601 break;
|
|
602
|
|
603 case LONG_STOP_SEQUENCE:
|
12527
|
604 imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
|
|
605 for (i = 0; i < nflat_ls; i+=4)
|
|
606 {
|
|
607 __m128 m1 = _mm_load_ps(&time_out[nlong+i]);
|
|
608 _mm_store_ps(&time_out[i], m1);
|
|
609 }
|
|
610 for (i = 0; i < nshort; i+=4)
|
|
611 {
|
|
612 __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+i]);
|
|
613 __m128 m2 = _mm_load_ps(&window_short_prev[i]);
|
|
614 __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]);
|
|
615
|
|
616 __m128 m4 = _mm_mul_ps(m1, m2);
|
|
617 m4 = _mm_add_ps(m4, m3);
|
|
618
|
|
619 _mm_store_ps(&time_out[nflat_ls+i], m4);
|
|
620 }
|
|
621 for (i = 0; i < nflat_ls; i+=4)
|
|
622 {
|
|
623 __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+nshort+i]);
|
|
624 __m128 m2 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort+i]);
|
|
625
|
|
626 __m128 m3 = _mm_add_ps(m1, m2);
|
|
627
|
|
628 _mm_store_ps(&time_out[nflat_ls+nshort+i], m3);
|
|
629 }
|
|
630 for (i = 0; i < nlong; i+=4)
|
|
631 {
|
|
632 __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]);
|
|
633 __m128 m2 = _mm_load_ps(&window_long[nlong-4-i]);
|
|
634 __m128 m3, m4;
|
|
635
|
|
636 m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
|
|
637
|
|
638 m4 = _mm_mul_ps(m1, m3);
|
|
639
|
|
640 _mm_store_ps(&time_out[nlong+i], m4);
|
|
641 }
|
10725
|
642 break;
|
|
643 }
|
|
644
|
12527
|
645 #ifdef PROFILE
|
|
646 count = faad_get_ts() - count;
|
|
647 fb->cycles += count;
|
|
648 #endif
|
10725
|
649 }
|
12527
|
650 #endif
|
10725
|
651
|
|
652 #ifdef LTP_DEC
|
|
653 /* only works for LTP -> no overlapping, no short blocks */
|
|
654 void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
|
|
655 uint8_t window_shape_prev, real_t *in_data, real_t *out_mdct,
|
|
656 uint8_t object_type, uint16_t frame_len)
|
|
657 {
|
|
658 int16_t i;
|
12527
|
659 ALIGN real_t windowed_buf[2*1024] = {0};
|
10725
|
660
|
12527
|
661 const real_t *window_long = NULL;
|
|
662 const real_t *window_long_prev = NULL;
|
|
663 const real_t *window_short = NULL;
|
|
664 const real_t *window_short_prev = NULL;
|
10725
|
665
|
|
666 uint16_t nlong = frame_len;
|
|
667 uint16_t nshort = frame_len/8;
|
|
668 uint16_t nflat_ls = (nlong-nshort)/2;
|
|
669
|
|
670 assert(window_sequence != EIGHT_SHORT_SEQUENCE);
|
|
671
|
|
672 #ifdef LD_DEC
|
|
673 if (object_type == LD)
|
|
674 {
|
|
675 window_long = fb->ld_window[window_shape];
|
|
676 window_long_prev = fb->ld_window[window_shape_prev];
|
|
677 } else {
|
|
678 #endif
|
|
679 window_long = fb->long_window[window_shape];
|
|
680 window_long_prev = fb->long_window[window_shape_prev];
|
|
681 window_short = fb->short_window[window_shape];
|
|
682 window_short_prev = fb->short_window[window_shape_prev];
|
|
683 #ifdef LD_DEC
|
|
684 }
|
|
685 #endif
|
|
686
|
|
687 switch(window_sequence)
|
|
688 {
|
|
689 case ONLY_LONG_SEQUENCE:
|
|
690 for (i = nlong-1; i >= 0; i--)
|
|
691 {
|
12527
|
692 windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
|
|
693 windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
|
10725
|
694 }
|
|
695 mdct(fb, windowed_buf, out_mdct, 2*nlong);
|
|
696 break;
|
|
697
|
|
698 case LONG_START_SEQUENCE:
|
|
699 for (i = 0; i < nlong; i++)
|
12527
|
700 windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
|
10725
|
701 for (i = 0; i < nflat_ls; i++)
|
|
702 windowed_buf[i+nlong] = in_data[i+nlong];
|
|
703 for (i = 0; i < nshort; i++)
|
12527
|
704 windowed_buf[i+nlong+nflat_ls] = MUL_F(in_data[i+nlong+nflat_ls], window_short[nshort-1-i]);
|
10725
|
705 for (i = 0; i < nflat_ls; i++)
|
|
706 windowed_buf[i+nlong+nflat_ls+nshort] = 0;
|
|
707 mdct(fb, windowed_buf, out_mdct, 2*nlong);
|
|
708 break;
|
|
709
|
|
710 case LONG_STOP_SEQUENCE:
|
|
711 for (i = 0; i < nflat_ls; i++)
|
|
712 windowed_buf[i] = 0;
|
|
713 for (i = 0; i < nshort; i++)
|
12527
|
714 windowed_buf[i+nflat_ls] = MUL_F(in_data[i+nflat_ls], window_short_prev[i]);
|
10725
|
715 for (i = 0; i < nflat_ls; i++)
|
|
716 windowed_buf[i+nflat_ls+nshort] = in_data[i+nflat_ls+nshort];
|
|
717 for (i = 0; i < nlong; i++)
|
12527
|
718 windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
|
10725
|
719 mdct(fb, windowed_buf, out_mdct, 2*nlong);
|
|
720 break;
|
|
721 }
|
|
722 }
|
|
723 #endif
|