Mercurial > mplayer.hg
annotate libfaad2/filtbank.c @ 13415:0e06453daf08
Cumulative patch 1.727 and 1.722
Better description of Loren Merritt's 3-pass mode, better qns desc., and a
couple of x264 encoding options (based a documentation I read)
author | gpoirier |
---|---|
date | Tue, 21 Sep 2004 09:42:33 +0000 |
parents | d81145997036 |
children | 6d50ef45a058 |
rev | line source |
---|---|
10725 | 1 /* |
2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding | |
12527 | 3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com |
10725 | 4 ** |
5 ** This program is free software; you can redistribute it and/or modify | |
6 ** it under the terms of the GNU General Public License as published by | |
7 ** the Free Software Foundation; either version 2 of the License, or | |
8 ** (at your option) any later version. | |
9 ** | |
10 ** This program is distributed in the hope that it will be useful, | |
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 ** GNU General Public License for more details. | |
14 ** | |
15 ** You should have received a copy of the GNU General Public License | |
16 ** along with this program; if not, write to the Free Software | |
17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
18 ** | |
19 ** Any non-GPL usage of this software or parts of this software is strictly | |
20 ** forbidden. | |
21 ** | |
22 ** Commercial non-GPL licensing of this software is possible. | |
23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com. | |
24 ** | |
12625
d81145997036
More information about modifications to comply more closely with GPL 2a.
diego
parents:
12527
diff
changeset
|
25 ** Initially modified for use with MPlayer by Arpad Gereöffy on 2003/08/30 |
d81145997036
More information about modifications to comply more closely with GPL 2a.
diego
parents:
12527
diff
changeset
|
26 ** $Id: filtbank.c,v 1.3 2004/06/02 22:59:02 diego Exp $ |
d81145997036
More information about modifications to comply more closely with GPL 2a.
diego
parents:
12527
diff
changeset
|
27 ** detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/ |
10725 | 28 **/ |
29 | |
30 #include "common.h" | |
31 #include "structs.h" | |
32 | |
33 #include <stdlib.h> | |
34 #include <string.h> | |
35 #ifdef _WIN32_WCE | |
36 #define assert(x) | |
37 #else | |
38 #include <assert.h> | |
39 #endif | |
40 | |
41 #include "filtbank.h" | |
42 #include "decoder.h" | |
43 #include "syntax.h" | |
44 #include "kbd_win.h" | |
45 #include "sine_win.h" | |
46 #include "mdct.h" | |
47 | |
48 | |
49 fb_info *filter_bank_init(uint16_t frame_len) | |
50 { | |
51 uint16_t nshort = frame_len/8; | |
52 #ifdef LD_DEC | |
53 uint16_t frame_len_ld = frame_len/2; | |
54 #endif | |
55 | |
12527 | 56 fb_info *fb = (fb_info*)faad_malloc(sizeof(fb_info)); |
10725 | 57 memset(fb, 0, sizeof(fb_info)); |
58 | |
59 /* normal */ | |
60 fb->mdct256 = faad_mdct_init(2*nshort); | |
61 fb->mdct2048 = faad_mdct_init(2*frame_len); | |
62 #ifdef LD_DEC | |
63 /* LD */ | |
64 fb->mdct1024 = faad_mdct_init(2*frame_len_ld); | |
65 #endif | |
66 | |
12527 | 67 #ifdef ALLOW_SMALL_FRAMELENGTH |
10725 | 68 if (frame_len == 1024) |
69 { | |
12527 | 70 #endif |
10725 | 71 fb->long_window[0] = sine_long_1024; |
72 fb->short_window[0] = sine_short_128; | |
73 fb->long_window[1] = kbd_long_1024; | |
74 fb->short_window[1] = kbd_short_128; | |
75 #ifdef LD_DEC | |
76 fb->ld_window[0] = sine_mid_512; | |
77 fb->ld_window[1] = ld_mid_512; | |
78 #endif | |
12527 | 79 #ifdef ALLOW_SMALL_FRAMELENGTH |
10725 | 80 } else /* (frame_len == 960) */ { |
81 fb->long_window[0] = sine_long_960; | |
82 fb->short_window[0] = sine_short_120; | |
83 fb->long_window[1] = kbd_long_960; | |
84 fb->short_window[1] = kbd_short_120; | |
85 #ifdef LD_DEC | |
86 fb->ld_window[0] = sine_mid_480; | |
87 fb->ld_window[1] = ld_mid_480; | |
88 #endif | |
89 } | |
12527 | 90 #endif |
91 | |
92 #ifdef USE_SSE | |
93 if (cpu_has_sse()) | |
94 { | |
95 fb->if_func = ifilter_bank_sse; | |
96 } else { | |
97 fb->if_func = ifilter_bank; | |
98 } | |
99 #endif | |
10725 | 100 |
101 return fb; | |
102 } | |
103 | |
104 void filter_bank_end(fb_info *fb) | |
105 { | |
106 if (fb != NULL) | |
107 { | |
12527 | 108 #ifdef PROFILE |
109 printf("FB: %I64d cycles\n", fb->cycles); | |
110 #endif | |
111 | |
10725 | 112 faad_mdct_end(fb->mdct256); |
113 faad_mdct_end(fb->mdct2048); | |
114 #ifdef LD_DEC | |
115 faad_mdct_end(fb->mdct1024); | |
116 #endif | |
117 | |
12527 | 118 faad_free(fb); |
10725 | 119 } |
120 } | |
121 | |
12527 | 122 static INLINE void imdct_long(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len) |
10725 | 123 { |
12527 | 124 #ifdef LD_DEC |
125 mdct_info *mdct = NULL; | |
10725 | 126 |
127 switch (len) | |
128 { | |
129 case 2048: | |
130 case 1920: | |
131 mdct = fb->mdct2048; | |
132 break; | |
133 case 1024: | |
134 case 960: | |
135 mdct = fb->mdct1024; | |
136 break; | |
137 } | |
138 | |
139 faad_imdct(mdct, in_data, out_data); | |
12527 | 140 #else |
141 faad_imdct(fb->mdct2048, in_data, out_data); | |
142 #endif | |
10725 | 143 } |
144 | |
#ifdef USE_SSE
/*
 * SSE counterpart of imdct_long(): same context selection, but the
 * transform is performed by faad_imdct_sse().
 */
static INLINE void imdct_long_sse(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
{
#ifdef LD_DEC
    mdct_info *ctx = NULL;

    if (len == 2048 || len == 1920)
        ctx = fb->mdct2048;
    else if (len == 1024 || len == 960)
        ctx = fb->mdct1024;

    faad_imdct_sse(ctx, in_data, out_data);
#else
    faad_imdct_sse(fb->mdct2048, in_data, out_data);
#endif
}
#endif
169 | |
#ifdef LTP_DEC
/*
 * Run the forward MDCT with the context matching the transform length:
 * 2048/1920 -> fb->mdct2048, 256/240 -> fb->mdct256 and, with LD_DEC,
 * 1024/960 -> fb->mdct1024.  Any other length leaves the context NULL.
 */
static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
{
    mdct_info *ctx = NULL;

    if (len == 2048 || len == 1920)
        ctx = fb->mdct2048;
    else if (len == 256 || len == 240)
        ctx = fb->mdct256;
#ifdef LD_DEC
    else if (len == 1024 || len == 960)
        ctx = fb->mdct1024;
#endif

    faad_mdct(ctx, in_data, out_data);
}
#endif
196 | |
197 void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, | |
198 uint8_t window_shape_prev, real_t *freq_in, | |
12527 | 199 real_t *time_out, real_t *overlap, |
200 uint8_t object_type, uint16_t frame_len) | |
10725 | 201 { |
202 int16_t i; | |
12527 | 203 ALIGN real_t transf_buf[2*1024] = {0}; |
10725 | 204 |
12527 | 205 const real_t *window_long = NULL; |
206 const real_t *window_long_prev = NULL; | |
207 const real_t *window_short = NULL; | |
208 const real_t *window_short_prev = NULL; | |
10725 | 209 |
210 uint16_t nlong = frame_len; | |
211 uint16_t nshort = frame_len/8; | |
212 uint16_t trans = nshort/2; | |
213 | |
214 uint16_t nflat_ls = (nlong-nshort)/2; | |
215 | |
12527 | 216 #ifdef PROFILE |
217 int64_t count = faad_get_ts(); | |
218 #endif | |
219 | |
220 #ifdef LD_DEC | |
221 if (object_type == LD) | |
222 { | |
223 window_long = fb->ld_window[window_shape]; | |
224 window_long_prev = fb->ld_window[window_shape_prev]; | |
225 } else { | |
226 #endif | |
227 window_long = fb->long_window[window_shape]; | |
228 window_long_prev = fb->long_window[window_shape_prev]; | |
229 window_short = fb->short_window[window_shape]; | |
230 window_short_prev = fb->short_window[window_shape_prev]; | |
231 #ifdef LD_DEC | |
232 } | |
233 #endif | |
234 | |
235 | |
236 switch (window_sequence) | |
237 { | |
238 case ONLY_LONG_SEQUENCE: | |
239 imdct_long(fb, freq_in, transf_buf, 2*nlong); | |
240 for (i = 0; i < nlong; i+=4) | |
241 { | |
242 time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]); | |
243 time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]); | |
244 time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]); | |
245 time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]); | |
246 } | |
247 for (i = 0; i < nlong; i+=4) | |
248 { | |
249 overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); | |
250 overlap[i+1] = MUL_F(transf_buf[nlong+i+1],window_long[nlong-2-i]); | |
251 overlap[i+2] = MUL_F(transf_buf[nlong+i+2],window_long[nlong-3-i]); | |
252 overlap[i+3] = MUL_F(transf_buf[nlong+i+3],window_long[nlong-4-i]); | |
253 } | |
254 break; | |
255 | |
256 case LONG_START_SEQUENCE: | |
257 imdct_long(fb, freq_in, transf_buf, 2*nlong); | |
258 for (i = 0; i < nlong; i+=4) | |
259 { | |
260 time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]); | |
261 time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]); | |
262 time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]); | |
263 time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]); | |
264 } | |
265 for (i = 0; i < nflat_ls; i++) | |
266 overlap[i] = transf_buf[nlong+i]; | |
267 for (i = 0; i < nshort; i++) | |
268 overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]); | |
269 for (i = 0; i < nflat_ls; i++) | |
270 overlap[nflat_ls+nshort+i] = 0; | |
271 break; | |
272 | |
273 case EIGHT_SHORT_SEQUENCE: | |
274 faad_imdct(fb->mdct256, freq_in+0*nshort, transf_buf+2*nshort*0); | |
275 faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1); | |
276 faad_imdct(fb->mdct256, freq_in+2*nshort, transf_buf+2*nshort*2); | |
277 faad_imdct(fb->mdct256, freq_in+3*nshort, transf_buf+2*nshort*3); | |
278 faad_imdct(fb->mdct256, freq_in+4*nshort, transf_buf+2*nshort*4); | |
279 faad_imdct(fb->mdct256, freq_in+5*nshort, transf_buf+2*nshort*5); | |
280 faad_imdct(fb->mdct256, freq_in+6*nshort, transf_buf+2*nshort*6); | |
281 faad_imdct(fb->mdct256, freq_in+7*nshort, transf_buf+2*nshort*7); | |
282 for (i = 0; i < nflat_ls; i++) | |
283 time_out[i] = overlap[i]; | |
284 for(i = 0; i < nshort; i++) | |
285 { | |
286 time_out[nflat_ls+ i] = overlap[nflat_ls+ i] + MUL_F(transf_buf[nshort*0+i],window_short_prev[i]); | |
287 time_out[nflat_ls+1*nshort+i] = overlap[nflat_ls+nshort*1+i] + MUL_F(transf_buf[nshort*1+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*2+i],window_short[i]); | |
288 time_out[nflat_ls+2*nshort+i] = overlap[nflat_ls+nshort*2+i] + MUL_F(transf_buf[nshort*3+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*4+i],window_short[i]); | |
289 time_out[nflat_ls+3*nshort+i] = overlap[nflat_ls+nshort*3+i] + MUL_F(transf_buf[nshort*5+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*6+i],window_short[i]); | |
290 if (i < trans) | |
291 time_out[nflat_ls+4*nshort+i] = overlap[nflat_ls+nshort*4+i] + MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]); | |
292 } | |
293 for(i = 0; i < nshort; i++) | |
294 { | |
295 if (i >= trans) | |
296 overlap[nflat_ls+4*nshort+i-nlong] = MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]); | |
297 overlap[nflat_ls+5*nshort+i-nlong] = MUL_F(transf_buf[nshort*9+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*10+i],window_short[i]); | |
298 overlap[nflat_ls+6*nshort+i-nlong] = MUL_F(transf_buf[nshort*11+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*12+i],window_short[i]); | |
299 overlap[nflat_ls+7*nshort+i-nlong] = MUL_F(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*14+i],window_short[i]); | |
300 overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[nshort*15+i],window_short[nshort-1-i]); | |
301 } | |
302 for (i = 0; i < nflat_ls; i++) | |
303 overlap[nflat_ls+nshort+i] = 0; | |
304 break; | |
305 | |
306 case LONG_STOP_SEQUENCE: | |
307 imdct_long(fb, freq_in, transf_buf, 2*nlong); | |
308 for (i = 0; i < nflat_ls; i++) | |
309 time_out[i] = overlap[i]; | |
310 for (i = 0; i < nshort; i++) | |
311 time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]); | |
312 for (i = 0; i < nflat_ls; i++) | |
313 time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i]; | |
314 for (i = 0; i < nlong; i++) | |
315 overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); | |
316 break; | |
317 } | |
318 | |
319 #ifdef PROFILE | |
320 count = faad_get_ts() - count; | |
321 fb->cycles += count; | |
322 #endif | |
323 } | |
324 | |
#ifdef USE_SSE
/* Return v with its four lanes in reverse order. */
static INLINE __m128 fb_sse_reversed(__m128 v)
{
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));
}

/* dst[k] = src[k] for k in [0, count), four floats per step. */
static INLINE void fb_sse_copy(real_t *dst, const real_t *src, uint16_t count)
{
    int16_t k;

    for (k = 0; k < count; k+=4)
    {
        __m128 v = _mm_load_ps(&src[k]);
        _mm_store_ps(&dst[k], v);
    }
}

/* dst[k] = 0 for k in [0, count), four floats per step. */
static INLINE void fb_sse_clear(real_t *dst, uint16_t count)
{
    int16_t k;

    for (k = 0; k < count; k+=4)
    {
        __m128 zero = _mm_setzero_ps();
        _mm_store_ps(&dst[k], zero);
    }
}

/* dst[k] = src[k]*window[k] + prev[k] for k in [0, count). */
static INLINE void fb_sse_window_add(real_t *dst, const real_t *src,
                                     const real_t *window, const real_t *prev,
                                     uint16_t count)
{
    int16_t k;

    for (k = 0; k < count; k+=4)
    {
        __m128 sample = _mm_load_ps(&src[k]);
        __m128 weight = _mm_load_ps(&window[k]);
        __m128 tail   = _mm_load_ps(&prev[k]);

        __m128 sum = _mm_mul_ps(sample, weight);
        sum = _mm_add_ps(sum, tail);

        _mm_store_ps(&dst[k], sum);
    }
}

/* dst[k] = src[k]*window[win_len-1-k] for k in [0, win_len). */
static INLINE void fb_sse_window_mirrored(real_t *dst, const real_t *src,
                                          const real_t *window, uint16_t win_len)
{
    int16_t k;

    for (k = 0; k < win_len; k+=4)
    {
        __m128 sample = _mm_load_ps(&src[k]);
        __m128 weight = fb_sse_reversed(_mm_load_ps(&window[win_len-4-k]));

        _mm_store_ps(&dst[k], _mm_mul_ps(sample, weight));
    }
}

/*
 * For k in [first, last):
 *   dst[k] = (falling[k]*window[win_len-1-k] + prev[k]) + rising[k]*window[k]
 */
static INLINE void fb_sse_short_join_add(real_t *dst, const real_t *falling,
                                         const real_t *rising, const real_t *window,
                                         uint16_t win_len, const real_t *prev,
                                         uint16_t first, uint16_t last)
{
    int16_t k;

    for (k = first; k < last; k+=4)
    {
        __m128 down = _mm_load_ps(&falling[k]);
        __m128 wrev = fb_sse_reversed(_mm_load_ps(&window[win_len-4-k]));
        __m128 tail = _mm_load_ps(&prev[k]);
        __m128 up   = _mm_load_ps(&rising[k]);
        __m128 wfwd = _mm_load_ps(&window[k]);
        __m128 sum;

        down = _mm_mul_ps(down, wrev);
        up   = _mm_mul_ps(up, wfwd);
        sum  = _mm_add_ps(down, tail);
        sum  = _mm_add_ps(sum, up);

        _mm_store_ps(&dst[k], sum);
    }
}

/*
 * For k in [first, last):
 *   dst[k] = falling[k]*window[win_len-1-k] + rising[k]*window[k]
 */
static INLINE void fb_sse_short_join(real_t *dst, const real_t *falling,
                                     const real_t *rising, const real_t *window,
                                     uint16_t win_len, uint16_t first, uint16_t last)
{
    int16_t k;

    for (k = first; k < last; k+=4)
    {
        __m128 down = _mm_load_ps(&falling[k]);
        __m128 wrev = fb_sse_reversed(_mm_load_ps(&window[win_len-4-k]));
        __m128 up   = _mm_load_ps(&rising[k]);
        __m128 wfwd = _mm_load_ps(&window[k]);

        down = _mm_mul_ps(down, wrev);
        up   = _mm_mul_ps(up, wfwd);

        _mm_store_ps(&dst[k], _mm_add_ps(down, up));
    }
}

/*
 * SSE inverse filter bank.
 *
 * Unlike ifilter_bank() there is no separate overlap buffer: the samples
 * carried from one call to the next are read from, and written back to,
 * time_out[frame_len .. 2*frame_len-1].  time_out[0 .. frame_len-1]
 * receives the output of this frame.  All vector accesses use the aligned
 * load/store intrinsics and advance four floats at a time.
 */
void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                      uint8_t window_shape_prev, real_t *freq_in,
                      real_t *time_out, uint8_t object_type, uint16_t frame_len)
{
    int16_t i;
    uint8_t w;
    ALIGN real_t transf_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    uint16_t nlong = frame_len;
    uint16_t nshort = frame_len/8;
    uint16_t trans = nshort/2;

    uint16_t nflat_ls = (nlong-nshort)/2;

    /* upper half of time_out: the carried-over tail */
    real_t *carry = time_out + nlong;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

    /* Pick the window tables; the LD branch leaves the short windows NULL. */
#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long      = fb->ld_window[window_shape];
        window_long_prev = fb->ld_window[window_shape_prev];
    } else {
#endif
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
#ifdef LD_DEC
    }
#endif

    switch (window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
        for (i = 0; i < nlong; i+=4)
        {
            /* every load happens before the two stores of this step */
            __m128 head  = _mm_load_ps(&transf_buf[i]);
            __m128 wprev = _mm_load_ps(&window_long_prev[i]);
            __m128 wcur  = _mm_load_ps(&window_long[nlong-4-i]);
            __m128 tail  = _mm_load_ps(&carry[i]);
            __m128 next  = _mm_load_ps(&transf_buf[nlong+i]);

            head = _mm_mul_ps(head, wprev);
            wcur = fb_sse_reversed(wcur);

            head = _mm_add_ps(head, tail);
            next = _mm_mul_ps(next, wcur);

            _mm_store_ps(&time_out[i], head);
            _mm_store_ps(&carry[i], next);
        }
        break;

    case LONG_START_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);

        /* output: first half shaped by the previous window plus the old tail */
        fb_sse_window_add(time_out, transf_buf, window_long_prev, carry, nlong);

        /* new tail: unwindowed part, mirrored short-window slope, then zeros */
        fb_sse_copy(carry, transf_buf + nlong, nflat_ls);
        fb_sse_window_mirrored(carry + nflat_ls, transf_buf + nlong + nflat_ls,
                               window_short, nshort);
        fb_sse_clear(carry + nflat_ls + nshort, nflat_ls);
        break;

    case EIGHT_SHORT_SEQUENCE:
        /* eight short transforms, each producing 2*nshort samples */
        for (w = 0; w < 8; w++)
        {
            faad_imdct_sse(fb->mdct256, &freq_in[w*nshort], &transf_buf[2*nshort*w]);
        }

        /* the leading part of the output is the old tail unchanged */
        fb_sse_copy(time_out, carry, nflat_ls);

        /* first short block meets the previous frame's short window */
        fb_sse_window_add(time_out + nflat_ls, transf_buf, window_short_prev,
                          carry + nflat_ls, nshort);

        /* sections 1..3: two neighbouring short blocks plus the old tail */
        for (w = 1; w <= 3; w++)
        {
            fb_sse_short_join_add(time_out + nflat_ls + w*nshort,
                                  transf_buf + nshort*(2*w-1),
                                  transf_buf + nshort*(2*w),
                                  window_short, nshort,
                                  carry + nflat_ls + nshort*w,
                                  0, nshort);
        }

        /* section 4, first half: still adds the old tail */
        fb_sse_short_join_add(time_out + nflat_ls + 4*nshort,
                              transf_buf + nshort*7, transf_buf + nshort*8,
                              window_short, nshort,
                              carry + nflat_ls + nshort*4,
                              0, trans);

        /* section 4, second half: stored without adding the old tail */
        fb_sse_short_join(time_out + nflat_ls + 4*nshort,
                          transf_buf + nshort*7, transf_buf + nshort*8,
                          window_short, nshort, trans, nshort);

        /* sections 5..7 are stored without adding the old tail */
        for (w = 5; w <= 7; w++)
        {
            fb_sse_short_join(time_out + nflat_ls + w*nshort,
                              transf_buf + nshort*(2*w-1),
                              transf_buf + nshort*(2*w),
                              window_short, nshort, 0, nshort);
        }

        /* second half of the last block has nothing to pair with yet */
        fb_sse_window_mirrored(time_out + nflat_ls + 8*nshort,
                               transf_buf + nshort*15, window_short, nshort);

        fb_sse_clear(carry + nflat_ls + nshort, nflat_ls);
        break;

    case LONG_STOP_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);

        fb_sse_copy(time_out, carry, nflat_ls);
        fb_sse_window_add(time_out + nflat_ls, transf_buf + nflat_ls,
                          window_short_prev, carry + nflat_ls, nshort);

        /* unwindowed section: transform output plus the old tail */
        for (i = 0; i < nflat_ls; i+=4)
        {
            __m128 sample = _mm_load_ps(&transf_buf[nflat_ls+nshort+i]);
            __m128 tail   = _mm_load_ps(&carry[nflat_ls+nshort+i]);

            _mm_store_ps(&time_out[nflat_ls+nshort+i], _mm_add_ps(sample, tail));
        }

        /* new tail shaped by the mirrored long window */
        fb_sse_window_mirrored(carry, transf_buf + nlong, window_long, nlong);
        break;
    }

#ifdef PROFILE
    count = faad_get_ts() - count;
    fb->cycles += count;
#endif
}
#endif
10725 | 653 |
#ifdef LTP_DEC
/*
 * Forward filter bank used for LTP: windows 2*frame_len input samples and
 * runs the forward MDCT on them.
 *
 * only works for LTP -> no overlapping, no short blocks
 * (EIGHT_SHORT_SEQUENCE is rejected by the assert below).
 *
 * in_data  - 2*frame_len time-domain samples
 * out_mdct - receives the transform output
 */
void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                     uint8_t window_shape_prev, real_t *in_data, real_t *out_mdct,
                     uint8_t object_type, uint16_t frame_len)
{
    int16_t i;
    ALIGN real_t windowed_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    uint16_t nlong = frame_len;
    uint16_t nshort = frame_len/8;
    uint16_t nflat_ls = (nlong-nshort)/2;

    /* second halves of the input and of the windowed buffer */
    const real_t *in_upper = in_data + nlong;
    real_t *buf_upper = windowed_buf + nlong;

    assert(window_sequence != EIGHT_SHORT_SEQUENCE);

    /* Pick the window tables; the LD branch leaves the short windows NULL. */
#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long      = fb->ld_window[window_shape];
        window_long_prev = fb->ld_window[window_shape_prev];
    } else {
#endif
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
#ifdef LD_DEC
    }
#endif

    switch(window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
        /* previous window on the first half, mirrored current window on the second */
        for (i = nlong-1; i >= 0; i--)
        {
            windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
            buf_upper[i] = MUL_F(in_upper[i], window_long[nlong-1-i]);
        }
        mdct(fb, windowed_buf, out_mdct, 2*nlong);
        break;

    case LONG_START_SEQUENCE:
        for (i = 0; i < nlong; i++)
        {
            windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
        }
        /* second half: pass-through, mirrored short-window slope, then zeros */
        for (i = 0; i < nflat_ls; i++)
        {
            buf_upper[i] = in_upper[i];
        }
        for (i = 0; i < nshort; i++)
        {
            buf_upper[i+nflat_ls] = MUL_F(in_upper[i+nflat_ls], window_short[nshort-1-i]);
        }
        for (i = 0; i < nflat_ls; i++)
        {
            buf_upper[i+nflat_ls+nshort] = 0;
        }
        mdct(fb, windowed_buf, out_mdct, 2*nlong);
        break;

    case LONG_STOP_SEQUENCE:
        /* first half: zeros, short-window slope, then pass-through */
        for (i = 0; i < nflat_ls; i++)
        {
            windowed_buf[i] = 0;
        }
        for (i = 0; i < nshort; i++)
        {
            windowed_buf[i+nflat_ls] = MUL_F(in_data[i+nflat_ls], window_short_prev[i]);
        }
        for (i = 0; i < nflat_ls; i++)
        {
            windowed_buf[i+nflat_ls+nshort] = in_data[i+nflat_ls+nshort];
        }
        for (i = 0; i < nlong; i++)
        {
            buf_upper[i] = MUL_F(in_upper[i], window_long[nlong-1-i]);
        }
        mdct(fb, windowed_buf, out_mdct, 2*nlong);
        break;
    }
}
#endif