Mercurial > mplayer.hg
annotate libfaad2/filtbank.c @ 17080:b3ef98ca8677
multithreaded decoding
author | michael |
---|---|
date | Fri, 02 Dec 2005 22:00:50 +0000 |
parents | 2ae5ab4331ca |
children | 59b6fa5b4201 |
rev | line source |
---|---|
10725 | 1 /* |
2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding | |
12527 | 3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com |
10725 | 4 ** |
5 ** This program is free software; you can redistribute it and/or modify | |
6 ** it under the terms of the GNU General Public License as published by | |
7 ** the Free Software Foundation; either version 2 of the License, or | |
8 ** (at your option) any later version. | |
9 ** | |
10 ** This program is distributed in the hope that it will be useful, | |
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 ** GNU General Public License for more details. | |
14 ** | |
15 ** You should have received a copy of the GNU General Public License | |
16 ** along with this program; if not, write to the Free Software | |
17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
18 ** | |
19 ** Any non-GPL usage of this software or parts of this software is strictly | |
20 ** forbidden. | |
21 ** | |
22 ** Commercial non-GPL licensing of this software is possible. | |
23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com. | |
24 ** | |
14727
2ae5ab4331ca
Remove modification notice from files that have not been locally modified.
diego
parents:
13453
diff
changeset
|
25 ** $Id: filtbank.c,v 1.38 2004/06/30 12:45:56 menno Exp $ |
10725 | 26 **/ |
27 | |
28 #include "common.h" | |
29 #include "structs.h" | |
30 | |
31 #include <stdlib.h> | |
32 #include <string.h> | |
33 #ifdef _WIN32_WCE | |
34 #define assert(x) | |
35 #else | |
36 #include <assert.h> | |
37 #endif | |
38 | |
39 #include "filtbank.h" | |
40 #include "decoder.h" | |
41 #include "syntax.h" | |
42 #include "kbd_win.h" | |
43 #include "sine_win.h" | |
44 #include "mdct.h" | |
45 | |
46 | |
47 fb_info *filter_bank_init(uint16_t frame_len) | |
48 { | |
49 uint16_t nshort = frame_len/8; | |
50 #ifdef LD_DEC | |
51 uint16_t frame_len_ld = frame_len/2; | |
52 #endif | |
53 | |
12527 | 54 fb_info *fb = (fb_info*)faad_malloc(sizeof(fb_info)); |
10725 | 55 memset(fb, 0, sizeof(fb_info)); |
56 | |
57 /* normal */ | |
58 fb->mdct256 = faad_mdct_init(2*nshort); | |
59 fb->mdct2048 = faad_mdct_init(2*frame_len); | |
60 #ifdef LD_DEC | |
61 /* LD */ | |
62 fb->mdct1024 = faad_mdct_init(2*frame_len_ld); | |
63 #endif | |
64 | |
12527 | 65 #ifdef ALLOW_SMALL_FRAMELENGTH |
10725 | 66 if (frame_len == 1024) |
67 { | |
12527 | 68 #endif |
10725 | 69 fb->long_window[0] = sine_long_1024; |
70 fb->short_window[0] = sine_short_128; | |
71 fb->long_window[1] = kbd_long_1024; | |
72 fb->short_window[1] = kbd_short_128; | |
73 #ifdef LD_DEC | |
74 fb->ld_window[0] = sine_mid_512; | |
75 fb->ld_window[1] = ld_mid_512; | |
76 #endif | |
12527 | 77 #ifdef ALLOW_SMALL_FRAMELENGTH |
10725 | 78 } else /* (frame_len == 960) */ { |
79 fb->long_window[0] = sine_long_960; | |
80 fb->short_window[0] = sine_short_120; | |
81 fb->long_window[1] = kbd_long_960; | |
82 fb->short_window[1] = kbd_short_120; | |
83 #ifdef LD_DEC | |
84 fb->ld_window[0] = sine_mid_480; | |
85 fb->ld_window[1] = ld_mid_480; | |
86 #endif | |
87 } | |
12527 | 88 #endif |
89 | |
90 #ifdef USE_SSE | |
91 if (cpu_has_sse()) | |
92 { | |
93 fb->if_func = ifilter_bank_sse; | |
94 } else { | |
95 fb->if_func = ifilter_bank; | |
96 } | |
97 #endif | |
10725 | 98 |
99 return fb; | |
100 } | |
101 | |
102 void filter_bank_end(fb_info *fb) | |
103 { | |
104 if (fb != NULL) | |
105 { | |
12527 | 106 #ifdef PROFILE |
107 printf("FB: %I64d cycles\n", fb->cycles); | |
108 #endif | |
109 | |
10725 | 110 faad_mdct_end(fb->mdct256); |
111 faad_mdct_end(fb->mdct2048); | |
112 #ifdef LD_DEC | |
113 faad_mdct_end(fb->mdct1024); | |
114 #endif | |
115 | |
12527 | 116 faad_free(fb); |
10725 | 117 } |
118 } | |
119 | |
12527 | 120 static INLINE void imdct_long(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len) |
10725 | 121 { |
12527 | 122 #ifdef LD_DEC |
123 mdct_info *mdct = NULL; | |
10725 | 124 |
125 switch (len) | |
126 { | |
127 case 2048: | |
128 case 1920: | |
129 mdct = fb->mdct2048; | |
130 break; | |
131 case 1024: | |
132 case 960: | |
133 mdct = fb->mdct1024; | |
134 break; | |
135 } | |
136 | |
137 faad_imdct(mdct, in_data, out_data); | |
12527 | 138 #else |
139 faad_imdct(fb->mdct2048, in_data, out_data); | |
140 #endif | |
10725 | 141 } |
142 | |
#ifdef USE_SSE
/*
 * SSE counterpart of imdct_long(): same transform selection, but the work
 * is done by faad_imdct_sse().
 *
 * With LD_DEC, 2048/1920 use mdct2048, 1024/960 use mdct1024, and any
 * other length hands a NULL transform to faad_imdct_sse(). Without LD_DEC,
 * len is ignored and mdct2048 is always used.
 */
static INLINE void imdct_long_sse(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
{
#ifdef LD_DEC
    mdct_info *transform = NULL;

    if (len == 2048 || len == 1920)
        transform = fb->mdct2048;
    else if (len == 1024 || len == 960)
        transform = fb->mdct1024;

    faad_imdct_sse(transform, in_data, out_data);
#else
    faad_imdct_sse(fb->mdct2048, in_data, out_data);
#endif
}
#endif
167 | |
#ifdef LTP_DEC
/*
 * Run the forward transform that matches len on in_data, writing to
 * out_data: 2048/1920 use mdct2048, 256/240 use mdct256 and, with LD_DEC,
 * 1024/960 use mdct1024. Any other length hands a NULL transform to
 * faad_mdct().
 */
static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
{
    mdct_info *transform = NULL;

    if (len == 2048 || len == 1920)
        transform = fb->mdct2048;
    else if (len == 256 || len == 240)
        transform = fb->mdct256;
#ifdef LD_DEC
    else if (len == 1024 || len == 960)
        transform = fb->mdct1024;
#endif

    faad_mdct(transform, in_data, out_data);
}
#endif
194 | |
195 void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, | |
196 uint8_t window_shape_prev, real_t *freq_in, | |
12527 | 197 real_t *time_out, real_t *overlap, |
198 uint8_t object_type, uint16_t frame_len) | |
10725 | 199 { |
200 int16_t i; | |
12527 | 201 ALIGN real_t transf_buf[2*1024] = {0}; |
10725 | 202 |
12527 | 203 const real_t *window_long = NULL; |
204 const real_t *window_long_prev = NULL; | |
205 const real_t *window_short = NULL; | |
206 const real_t *window_short_prev = NULL; | |
10725 | 207 |
208 uint16_t nlong = frame_len; | |
209 uint16_t nshort = frame_len/8; | |
210 uint16_t trans = nshort/2; | |
211 | |
212 uint16_t nflat_ls = (nlong-nshort)/2; | |
213 | |
12527 | 214 #ifdef PROFILE |
215 int64_t count = faad_get_ts(); | |
216 #endif | |
217 | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
218 /* select windows of current frame and previous frame (Sine or KBD) */ |
12527 | 219 #ifdef LD_DEC |
220 if (object_type == LD) | |
221 { | |
222 window_long = fb->ld_window[window_shape]; | |
223 window_long_prev = fb->ld_window[window_shape_prev]; | |
224 } else { | |
225 #endif | |
226 window_long = fb->long_window[window_shape]; | |
227 window_long_prev = fb->long_window[window_shape_prev]; | |
228 window_short = fb->short_window[window_shape]; | |
229 window_short_prev = fb->short_window[window_shape_prev]; | |
230 #ifdef LD_DEC | |
231 } | |
232 #endif | |
233 | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
234 #if 0 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
235 for (i = 0; i < 1024; i++) |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
236 { |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
237 printf("%d\n", freq_in[i]); |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
238 } |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
239 #endif |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
240 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
241 #if 0 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
242 printf("%d %d\n", window_sequence, window_shape); |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
243 #endif |
12527 | 244 |
245 switch (window_sequence) | |
246 { | |
247 case ONLY_LONG_SEQUENCE: | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
248 /* perform iMDCT */ |
12527 | 249 imdct_long(fb, freq_in, transf_buf, 2*nlong); |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
250 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
251 /* add second half output of previous frame to windowed output of current frame */ |
12527 | 252 for (i = 0; i < nlong; i+=4) |
253 { | |
254 time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]); | |
255 time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]); | |
256 time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]); | |
257 time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]); | |
258 } | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
259 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
260 /* window the second half and save as overlap for next frame */ |
12527 | 261 for (i = 0; i < nlong; i+=4) |
262 { | |
263 overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); | |
264 overlap[i+1] = MUL_F(transf_buf[nlong+i+1],window_long[nlong-2-i]); | |
265 overlap[i+2] = MUL_F(transf_buf[nlong+i+2],window_long[nlong-3-i]); | |
266 overlap[i+3] = MUL_F(transf_buf[nlong+i+3],window_long[nlong-4-i]); | |
267 } | |
268 break; | |
269 | |
270 case LONG_START_SEQUENCE: | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
271 /* perform iMDCT */ |
12527 | 272 imdct_long(fb, freq_in, transf_buf, 2*nlong); |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
273 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
274 /* add second half output of previous frame to windowed output of current frame */ |
12527 | 275 for (i = 0; i < nlong; i+=4) |
276 { | |
277 time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]); | |
278 time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]); | |
279 time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]); | |
280 time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]); | |
281 } | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
282 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
283 /* window the second half and save as overlap for next frame */ |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
284 /* construct second half window using padding with 1's and 0's */ |
12527 | 285 for (i = 0; i < nflat_ls; i++) |
286 overlap[i] = transf_buf[nlong+i]; | |
287 for (i = 0; i < nshort; i++) | |
288 overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]); | |
289 for (i = 0; i < nflat_ls; i++) | |
290 overlap[nflat_ls+nshort+i] = 0; | |
291 break; | |
292 | |
293 case EIGHT_SHORT_SEQUENCE: | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
294 /* perform iMDCT for each short block */ |
12527 | 295 faad_imdct(fb->mdct256, freq_in+0*nshort, transf_buf+2*nshort*0); |
296 faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1); | |
297 faad_imdct(fb->mdct256, freq_in+2*nshort, transf_buf+2*nshort*2); | |
298 faad_imdct(fb->mdct256, freq_in+3*nshort, transf_buf+2*nshort*3); | |
299 faad_imdct(fb->mdct256, freq_in+4*nshort, transf_buf+2*nshort*4); | |
300 faad_imdct(fb->mdct256, freq_in+5*nshort, transf_buf+2*nshort*5); | |
301 faad_imdct(fb->mdct256, freq_in+6*nshort, transf_buf+2*nshort*6); | |
302 faad_imdct(fb->mdct256, freq_in+7*nshort, transf_buf+2*nshort*7); | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
303 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
304 /* add second half output of previous frame to windowed output of current frame */ |
12527 | 305 for (i = 0; i < nflat_ls; i++) |
306 time_out[i] = overlap[i]; | |
307 for(i = 0; i < nshort; i++) | |
308 { | |
309 time_out[nflat_ls+ i] = overlap[nflat_ls+ i] + MUL_F(transf_buf[nshort*0+i],window_short_prev[i]); | |
310 time_out[nflat_ls+1*nshort+i] = overlap[nflat_ls+nshort*1+i] + MUL_F(transf_buf[nshort*1+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*2+i],window_short[i]); | |
311 time_out[nflat_ls+2*nshort+i] = overlap[nflat_ls+nshort*2+i] + MUL_F(transf_buf[nshort*3+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*4+i],window_short[i]); | |
312 time_out[nflat_ls+3*nshort+i] = overlap[nflat_ls+nshort*3+i] + MUL_F(transf_buf[nshort*5+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*6+i],window_short[i]); | |
313 if (i < trans) | |
314 time_out[nflat_ls+4*nshort+i] = overlap[nflat_ls+nshort*4+i] + MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]); | |
315 } | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
316 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
317 /* window the second half and save as overlap for next frame */ |
12527 | 318 for(i = 0; i < nshort; i++) |
319 { | |
320 if (i >= trans) | |
321 overlap[nflat_ls+4*nshort+i-nlong] = MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]); | |
322 overlap[nflat_ls+5*nshort+i-nlong] = MUL_F(transf_buf[nshort*9+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*10+i],window_short[i]); | |
323 overlap[nflat_ls+6*nshort+i-nlong] = MUL_F(transf_buf[nshort*11+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*12+i],window_short[i]); | |
324 overlap[nflat_ls+7*nshort+i-nlong] = MUL_F(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*14+i],window_short[i]); | |
325 overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[nshort*15+i],window_short[nshort-1-i]); | |
326 } | |
327 for (i = 0; i < nflat_ls; i++) | |
328 overlap[nflat_ls+nshort+i] = 0; | |
329 break; | |
330 | |
331 case LONG_STOP_SEQUENCE: | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
332 /* perform iMDCT */ |
12527 | 333 imdct_long(fb, freq_in, transf_buf, 2*nlong); |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
334 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
335 /* add second half output of previous frame to windowed output of current frame */ |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
336 /* construct first half window using padding with 1's and 0's */ |
12527 | 337 for (i = 0; i < nflat_ls; i++) |
338 time_out[i] = overlap[i]; | |
339 for (i = 0; i < nshort; i++) | |
340 time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]); | |
341 for (i = 0; i < nflat_ls; i++) | |
342 time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i]; | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
343 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
344 /* window the second half and save as overlap for next frame */ |
12527 | 345 for (i = 0; i < nlong; i++) |
346 overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); | |
347 break; | |
348 } | |
349 | |
13453
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
350 #if 0 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
351 for (i = 0; i < 1024; i++) |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
352 { |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
353 //printf("%d\n", time_out[i]); |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
354 printf("0x%.8X\n", time_out[i]); |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
355 } |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
356 #endif |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
357 |
6d50ef45a058
Update FAAD to a 2.1 beta CVS snapshot from 2004.07.12.
diego
parents:
12625
diff
changeset
|
358 |
12527 | 359 #ifdef PROFILE |
360 count = faad_get_ts() - count; | |
361 fb->cycles += count; | |
362 #endif | |
363 } | |
364 | |
#ifdef USE_SSE
/* Return v with its four lanes in reverse order. */
static INLINE __m128 fb_sse_reverse(__m128 v)
{
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));
}

/* dst[i] = src[i]*win[i] + acc[i] for i in [0, n), four values per step. */
static INLINE void fb_sse_overlap_add(real_t *dst, const real_t *src,
                                      const real_t *win, const real_t *acc,
                                      uint16_t n)
{
    int16_t i;

    for (i = 0; i < n; i += 4)
    {
        __m128 prod = _mm_mul_ps(_mm_load_ps(&src[i]), _mm_load_ps(&win[i]));
        __m128 prev = _mm_load_ps(&acc[i]);

        _mm_store_ps(&dst[i], _mm_add_ps(prod, prev));
    }
}

/* dst[i] = src[i]*win[n-1-i] for i in [0, n), four values per step. */
static INLINE void fb_sse_window_reversed(real_t *dst, const real_t *src,
                                          const real_t *win, uint16_t n)
{
    int16_t i;

    for (i = 0; i < n; i += 4)
    {
        __m128 w = fb_sse_reverse(_mm_load_ps(&win[n-4-i]));

        _mm_store_ps(&dst[i], _mm_mul_ps(_mm_load_ps(&src[i]), w));
    }
}

/*
 * Cross-fade of two short blocks, four values per step, i in [first, last):
 *   dst[i] = fall[i]*win[nshort-1-i] + acc[i] + rise[i]*win[i]
 * When acc is NULL the acc[i] term is left out.
 */
static INLINE void fb_sse_short_pair(real_t *dst, const real_t *acc,
                                     const real_t *fall, const real_t *rise,
                                     const real_t *win, uint16_t nshort,
                                     uint16_t first, uint16_t last)
{
    int16_t i;

    for (i = first; i < last; i += 4)
    {
        __m128 w_down = fb_sse_reverse(_mm_load_ps(&win[nshort-4-i]));
        __m128 sum    = _mm_mul_ps(_mm_load_ps(&fall[i]), w_down);
        __m128 up     = _mm_mul_ps(_mm_load_ps(&rise[i]), _mm_load_ps(&win[i]));

        if (acc != NULL)
            sum = _mm_add_ps(sum, _mm_load_ps(&acc[i]));

        _mm_store_ps(&dst[i], _mm_add_ps(sum, up));
    }
}

/*
 * SSE inverse filter bank.
 *
 * Unlike ifilter_bank() there is no separate overlap buffer: the tail of
 * the previous frame is read from time_out[frame_len ...] and the tail of
 * this frame is written back there, while time_out[0 .. frame_len) receives
 * the output samples. Every access uses aligned four-float loads/stores.
 *
 *   fb                 transforms and window tables
 *   window_sequence    ONLY_LONG_SEQUENCE, LONG_START_SEQUENCE,
 *                      EIGHT_SHORT_SEQUENCE or LONG_STOP_SEQUENCE; any other
 *                      value leaves time_out untouched
 *   window_shape       window table index for the current frame
 *   window_shape_prev  window table index for the previous frame
 *   freq_in            spectral input of the frame
 *   time_out           2*frame_len values, used as described above
 *   object_type        with LD_DEC, LD selects the ld_window tables
 *   frame_len          frame length; the scratch buffer holds 2*1024 values
 */
void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                      uint8_t window_shape_prev, real_t *freq_in,
                      real_t *time_out, uint8_t object_type, uint16_t frame_len)
{
    int16_t i;
    uint8_t blk;
    ALIGN real_t transf_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    const uint16_t nlong = frame_len;
    const uint16_t nshort = frame_len/8;
    const uint16_t trans = nshort/2;
    const uint16_t nflat_ls = (nlong-nshort)/2;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long      = fb->ld_window[window_shape];
        window_long_prev = fb->ld_window[window_shape_prev];
    }
    else
#endif
    {
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
    }

    switch (window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
    case LONG_START_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);

        /* output = first half * previous long window + saved tail */
        fb_sse_overlap_add(time_out, transf_buf, window_long_prev,
                           &time_out[nlong], nlong);

        if (window_sequence == ONLY_LONG_SEQUENCE)
        {
            /* new tail = second half * long window read backwards */
            fb_sse_window_reversed(&time_out[nlong], &transf_buf[nlong],
                                   window_long, nlong);
        }
        else
        {
            /* new tail = flat copy, short window read backwards, zeros */
            for (i = 0; i < nflat_ls; i += 4)
                _mm_store_ps(&time_out[nlong+i], _mm_load_ps(&transf_buf[nlong+i]));

            fb_sse_window_reversed(&time_out[nlong+nflat_ls],
                                   &transf_buf[nlong+nflat_ls],
                                   window_short, nshort);

            for (i = 0; i < nflat_ls; i += 4)
                _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], _mm_setzero_ps());
        }
        break;

    case EIGHT_SHORT_SEQUENCE:
        /* one short inverse transform per block, 2*nshort outputs each */
        for (blk = 0; blk < 8; blk++)
            faad_imdct_sse(fb->mdct256, &freq_in[blk*nshort], &transf_buf[2*nshort*blk]);

        /* the flat lead-in is the saved tail unchanged */
        for (i = 0; i < nflat_ls; i += 4)
            _mm_store_ps(&time_out[i], _mm_load_ps(&time_out[nlong+i]));

        /* first short block fades in with the previous frame's window */
        fb_sse_overlap_add(&time_out[nflat_ls], transf_buf, window_short_prev,
                           &time_out[nlong+nflat_ls], nshort);

        /* cross-fades that still receive the saved tail; the fourth one
           only does so for its first `trans` values */
        for (blk = 1; blk <= 4; blk++)
        {
            fb_sse_short_pair(&time_out[nflat_ls+blk*nshort],
                              &time_out[nlong+nflat_ls+nshort*blk],
                              &transf_buf[nshort*(2*blk-1)],
                              &transf_buf[nshort*(2*blk)],
                              window_short, nshort,
                              0, (blk < 4) ? nshort : trans);
        }

        /* remaining cross-fades have no saved tail to add */
        fb_sse_short_pair(&time_out[nflat_ls+4*nshort], NULL,
                          &transf_buf[nshort*7], &transf_buf[nshort*8],
                          window_short, nshort, trans, nshort);
        for (blk = 5; blk <= 7; blk++)
        {
            fb_sse_short_pair(&time_out[nflat_ls+blk*nshort], NULL,
                              &transf_buf[nshort*(2*blk-1)],
                              &transf_buf[nshort*(2*blk)],
                              window_short, nshort, 0, nshort);
        }

        /* last short block only fades out */
        fb_sse_window_reversed(&time_out[nflat_ls+8*nshort],
                               &transf_buf[nshort*15],
                               window_short, nshort);

        for (i = 0; i < nflat_ls; i += 4)
            _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], _mm_setzero_ps());
        break;

    case LONG_STOP_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);

        /* saved tail only */
        for (i = 0; i < nflat_ls; i += 4)
            _mm_store_ps(&time_out[i], _mm_load_ps(&time_out[nlong+i]));

        /* short fade-in with the previous frame's window */
        fb_sse_overlap_add(&time_out[nflat_ls], &transf_buf[nflat_ls],
                           window_short_prev, &time_out[nlong+nflat_ls], nshort);

        /* unwindowed output plus saved tail */
        for (i = 0; i < nflat_ls; i += 4)
        {
            __m128 cur  = _mm_load_ps(&transf_buf[nflat_ls+nshort+i]);
            __m128 prev = _mm_load_ps(&time_out[nlong+nflat_ls+nshort+i]);

            _mm_store_ps(&time_out[nflat_ls+nshort+i], _mm_add_ps(cur, prev));
        }

        /* new tail = second half * long window read backwards */
        fb_sse_window_reversed(&time_out[nlong], &transf_buf[nlong],
                               window_long, nlong);
        break;

    default:
        break;
    }

#ifdef PROFILE
    count = faad_get_ts() - count;
    fb->cycles += count;
#endif
}
#endif
10725 | 693 |
#ifdef LTP_DEC
/*
 * Forward filter bank used for LTP: windows 2*frame_len input samples and
 * runs the forward transform on them. Only works for LTP - no overlapping,
 * no short blocks.
 *
 *   fb                 transforms and window tables
 *   window_sequence    ONLY_LONG_SEQUENCE, LONG_START_SEQUENCE or
 *                      LONG_STOP_SEQUENCE; EIGHT_SHORT_SEQUENCE trips the
 *                      assert, and any unhandled value writes nothing
 *   window_shape       window table index for the current frame
 *   window_shape_prev  window table index for the previous frame
 *   in_data            2*frame_len time-domain samples
 *   out_mdct           output of the forward transform
 *   object_type        with LD_DEC, LD selects the ld_window tables
 *   frame_len          frame length; the scratch buffer holds 2*1024 values
 */
void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                     uint8_t window_shape_prev, real_t *in_data, real_t *out_mdct,
                     uint8_t object_type, uint16_t frame_len)
{
    int16_t i;
    ALIGN real_t windowed_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    const uint16_t nlong = frame_len;
    const uint16_t nshort = frame_len/8;
    const uint16_t nflat_ls = (nlong-nshort)/2;

    assert(window_sequence != EIGHT_SHORT_SEQUENCE);

#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long      = fb->ld_window[window_shape];
        window_long_prev = fb->ld_window[window_shape_prev];
    }
    else
#endif
    {
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
    }

    /* nothing to do for sequences this function does not handle */
    if (window_sequence != ONLY_LONG_SEQUENCE &&
        window_sequence != LONG_START_SEQUENCE &&
        window_sequence != LONG_STOP_SEQUENCE)
    {
        return;
    }

    /* first half of the window */
    if (window_sequence == LONG_STOP_SEQUENCE)
    {
        /* zeros, short fade-in with the previous window, then flat */
        for (i = 0; i < nflat_ls; i++)
            windowed_buf[i] = 0;
        for (i = 0; i < nshort; i++)
            windowed_buf[i+nflat_ls] = MUL_F(in_data[i+nflat_ls], window_short_prev[i]);
        memcpy(&windowed_buf[nflat_ls+nshort], &in_data[nflat_ls+nshort],
               nflat_ls*sizeof(real_t));
    }
    else
    {
        /* previous frame's long window */
        for (i = 0; i < nlong; i++)
            windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
    }

    /* second half of the window */
    if (window_sequence == LONG_START_SEQUENCE)
    {
        /* flat, short window read backwards, then zeros */
        memcpy(&windowed_buf[nlong], &in_data[nlong], nflat_ls*sizeof(real_t));
        for (i = 0; i < nshort; i++)
        {
            windowed_buf[i+nlong+nflat_ls] = MUL_F(in_data[i+nlong+nflat_ls],
                                                   window_short[nshort-1-i]);
        }
        for (i = 0; i < nflat_ls; i++)
            windowed_buf[i+nlong+nflat_ls+nshort] = 0;
    }
    else
    {
        /* current long window read backwards */
        for (i = 0; i < nlong; i++)
            windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
    }

    mdct(fb, windowed_buf, out_mdct, 2*nlong);
}
#endif