comparison libfaad2/sbr_qmf.c @ 12527:4a370c80fe5c

update to the 2.0 release of faad, patch by adland
author diego
date Wed, 02 Jun 2004 22:59:04 +0000
parents 3185f64f6350
children d81145997036
comparison
equal deleted inserted replaced
12526:e183ad37d24c 12527:4a370c80fe5c
1 /* 1 /*
2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding 2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
3 ** Copyright (C) 2003 M. Bakker, Ahead Software AG, http://www.nero.com 3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
4 ** 4 **
5 ** This program is free software; you can redistribute it and/or modify 5 ** This program is free software; you can redistribute it and/or modify
6 ** it under the terms of the GNU General Public License as published by 6 ** it under the terms of the GNU General Public License as published by
7 ** the Free Software Foundation; either version 2 of the License, or 7 ** the Free Software Foundation; either version 2 of the License, or
8 ** (at your option) any later version. 8 ** (at your option) any later version.
20 ** forbidden. 20 ** forbidden.
21 ** 21 **
22 ** Commercial non-GPL licensing of this software is possible. 22 ** Commercial non-GPL licensing of this software is possible.
23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com. 23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
24 ** 24 **
25 ** $Id: sbr_qmf.c,v 1.13 2003/09/30 12:43:05 menno Exp $ 25 ** $Id: sbr_qmf.c,v 1.2 2003/10/03 22:22:27 alex Exp $
26 **/ 26 **/
27 27
28 #include "common.h" 28 #include "common.h"
29 #include "structs.h" 29 #include "structs.h"
30 30
39 #include "sbr_syntax.h" 39 #include "sbr_syntax.h"
40 40
41 41
42 qmfa_info *qmfa_init(uint8_t channels) 42 qmfa_info *qmfa_init(uint8_t channels)
43 { 43 {
44 qmfa_info *qmfa = (qmfa_info*)malloc(sizeof(qmfa_info)); 44 qmfa_info *qmfa = (qmfa_info*)faad_malloc(sizeof(qmfa_info));
45 qmfa->x = (real_t*)malloc(channels * 10 * sizeof(real_t)); 45 qmfa->x = (real_t*)faad_malloc(channels * 10 * sizeof(real_t));
46 memset(qmfa->x, 0, channels * 10 * sizeof(real_t)); 46 memset(qmfa->x, 0, channels * 10 * sizeof(real_t));
47 47
48 qmfa->channels = channels; 48 qmfa->channels = channels;
49 49
50 return qmfa; 50 return qmfa;
52 52
53 void qmfa_end(qmfa_info *qmfa) 53 void qmfa_end(qmfa_info *qmfa)
54 { 54 {
55 if (qmfa) 55 if (qmfa)
56 { 56 {
57 if (qmfa->x) free(qmfa->x); 57 if (qmfa->x) faad_free(qmfa->x);
58 free(qmfa); 58 faad_free(qmfa);
59 } 59 }
60 } 60 }
61 61
62 void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input, 62 void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input,
63 qmf_t *X, uint8_t offset, uint8_t kx) 63 qmf_t X[MAX_NTSRHFG][32], uint8_t offset, uint8_t kx)
64 { 64 {
65 ALIGN real_t u[64];
66 #ifndef SBR_LOW_POWER
67 ALIGN real_t x[64], y[64];
68 #else
69 ALIGN real_t y[32];
70 #endif
71 uint16_t in = 0;
65 uint8_t l; 72 uint8_t l;
66 real_t u[64];
67 #ifndef SBR_LOW_POWER
68 real_t x[64], y[64];
69 #else
70 real_t y[32];
71 #endif
72 const real_t *inptr = input;
73 73
74 /* qmf subsample l */ 74 /* qmf subsample l */
75 for (l = 0; l < sbr->numTimeSlotsRate; l++) 75 for (l = 0; l < sbr->numTimeSlotsRate; l++)
76 { 76 {
77 int16_t n; 77 int16_t n;
81 81
82 /* add new samples to input buffer x */ 82 /* add new samples to input buffer x */
83 for (n = 32 - 1; n >= 0; n--) 83 for (n = 32 - 1; n >= 0; n--)
84 { 84 {
85 #ifdef FIXED_POINT 85 #ifdef FIXED_POINT
86 qmfa->x[n] = (*inptr++) >> 5; 86 qmfa->x[n] = (input[in++]) >> 5;
87 #else 87 #else
88 qmfa->x[n] = *inptr++; 88 qmfa->x[n] = input[in++];
89 #endif 89 #endif
90 } 90 }
91 91
92 /* window and summation to create array u */ 92 /* window and summation to create array u */
93 for (n = 0; n < 64; n++) 93 for (n = 0; n < 64; n++)
94 { 94 {
95 u[n] = MUL_R_C(qmfa->x[n], qmf_c[2*n]) + 95 u[n] = MUL_F(qmfa->x[n], qmf_c[2*n]) +
96 MUL_R_C(qmfa->x[n + 64], qmf_c[2*(n + 64)]) + 96 MUL_F(qmfa->x[n + 64], qmf_c[2*(n + 64)]) +
97 MUL_R_C(qmfa->x[n + 128], qmf_c[2*(n + 128)]) + 97 MUL_F(qmfa->x[n + 128], qmf_c[2*(n + 128)]) +
98 MUL_R_C(qmfa->x[n + 192], qmf_c[2*(n + 192)]) + 98 MUL_F(qmfa->x[n + 192], qmf_c[2*(n + 192)]) +
99 MUL_R_C(qmfa->x[n + 256], qmf_c[2*(n + 256)]); 99 MUL_F(qmfa->x[n + 256], qmf_c[2*(n + 256)]);
100 } 100 }
101 101
102 /* calculate 32 subband samples by introducing X */ 102 /* calculate 32 subband samples by introducing X */
103 #ifdef SBR_LOW_POWER 103 #ifdef SBR_LOW_POWER
104 y[0] = u[48]; 104 y[0] = u[48];
112 for (n = 0; n < 32; n++) 112 for (n = 0; n < 32; n++)
113 { 113 {
114 if (n < kx) 114 if (n < kx)
115 { 115 {
116 #ifdef FIXED_POINT 116 #ifdef FIXED_POINT
117 QMF_RE(X[((l + offset)<<5) + n]) = u[n] << 1; 117 QMF_RE(X[l + offset][n]) = u[n] << 1;
118 #else 118 #else
119 QMF_RE(X[((l + offset)<<5) + n]) = 2. * u[n]; 119 QMF_RE(X[l + offset][n]) = 2. * u[n];
120 #endif 120 #endif
121 } else { 121 } else {
122 QMF_RE(X[((l + offset)<<5) + n]) = 0; 122 QMF_RE(X[l + offset][n]) = 0;
123 } 123 }
124 } 124 }
125 #else 125 #else
126 x[0] = u[0]; 126 x[0] = u[0];
127 for (n = 0; n < 31; n++)
128 {
129 x[2*n+1] = u[n+1] + u[63-n];
130 x[2*n+2] = u[n+1] - u[63-n];
131 }
127 x[63] = u[32]; 132 x[63] = u[32];
128 for (n = 2; n < 64; n += 2) 133
129 { 134 DCT4_64_kernel(y, x);
130 x[n-1] = u[(n>>1)];
131 x[n] = -u[64-(n>>1)];
132 }
133
134 DCT4_64(y, x);
135 135
136 for (n = 0; n < 32; n++) 136 for (n = 0; n < 32; n++)
137 { 137 {
138 if (n < kx) 138 if (n < kx)
139 { 139 {
140 #ifdef FIXED_POINT 140 #ifdef FIXED_POINT
141 QMF_RE(X[((l + offset)<<5) + n]) = y[n] << 1; 141 QMF_RE(X[l + offset][n]) = y[n] << 1;
142 QMF_IM(X[((l + offset)<<5) + n]) = -y[63-n] << 1; 142 QMF_IM(X[l + offset][n]) = -y[63-n] << 1;
143 #else 143 #else
144 QMF_RE(X[((l + offset)<<5) + n]) = 2. * y[n]; 144 QMF_RE(X[l + offset][n]) = 2. * y[n];
145 QMF_IM(X[((l + offset)<<5) + n]) = -2. * y[63-n]; 145 QMF_IM(X[l + offset][n]) = -2. * y[63-n];
146 #endif 146 #endif
147 } else { 147 } else {
148 QMF_RE(X[((l + offset)<<5) + n]) = 0; 148 QMF_RE(X[l + offset][n]) = 0;
149 QMF_IM(X[((l + offset)<<5) + n]) = 0; 149 QMF_IM(X[l + offset][n]) = 0;
150 } 150 }
151 } 151 }
152 #endif 152 #endif
153 } 153 }
154 } 154 }
155 155
156 qmfs_info *qmfs_init(uint8_t channels) 156 qmfs_info *qmfs_init(uint8_t channels)
157 { 157 {
158 int size = 0; 158 qmfs_info *qmfs = (qmfs_info*)faad_malloc(sizeof(qmfs_info));
159 qmfs_info *qmfs = (qmfs_info*)malloc(sizeof(qmfs_info)); 159
160 160 #ifndef SBR_LOW_POWER
161 qmfs->v[0] = (real_t*)malloc(channels * 10 * sizeof(real_t)); 161 qmfs->v[0] = (real_t*)faad_malloc(channels * 10 * sizeof(real_t));
162 memset(qmfs->v[0], 0, channels * 10 * sizeof(real_t)); 162 memset(qmfs->v[0], 0, channels * 10 * sizeof(real_t));
163 qmfs->v[1] = (real_t*)malloc(channels * 10 * sizeof(real_t)); 163 qmfs->v[1] = (real_t*)faad_malloc(channels * 10 * sizeof(real_t));
164 memset(qmfs->v[1], 0, channels * 10 * sizeof(real_t)); 164 memset(qmfs->v[1], 0, channels * 10 * sizeof(real_t));
165 #else
166 qmfs->v[0] = (real_t*)faad_malloc(channels * 20 * sizeof(real_t));
167 memset(qmfs->v[0], 0, channels * 20 * sizeof(real_t));
168 qmfs->v[1] = NULL;
169 #endif
165 170
166 qmfs->v_index = 0; 171 qmfs->v_index = 0;
167 172
168 qmfs->channels = channels; 173 qmfs->channels = channels;
169 174
175 #ifdef USE_SSE
176 if (cpu_has_sse())
177 {
178 qmfs->qmf_func = sbr_qmf_synthesis_64_sse;
179 } else {
180 qmfs->qmf_func = sbr_qmf_synthesis_64;
181 }
182 #endif
183
170 return qmfs; 184 return qmfs;
171 } 185 }
172 186
173 void qmfs_end(qmfs_info *qmfs) 187 void qmfs_end(qmfs_info *qmfs)
174 { 188 {
175 if (qmfs) 189 if (qmfs)
176 { 190 {
177 if (qmfs->v[0]) free(qmfs->v[0]); 191 if (qmfs->v[0]) faad_free(qmfs->v[0]);
178 if (qmfs->v[1]) free(qmfs->v[1]); 192 #ifndef SBR_LOW_POWER
179 free(qmfs); 193 if (qmfs->v[1]) faad_free(qmfs->v[1]);
194 #endif
195 faad_free(qmfs);
180 } 196 }
181 } 197 }
182 198
183 #ifdef SBR_LOW_POWER 199 #ifdef SBR_LOW_POWER
184 void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, const qmf_t *X, 200 void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
185 real_t *output) 201 real_t *output)
186 { 202 {
203 ALIGN real_t x[64];
204 ALIGN real_t y[64];
205 int16_t n, k, out = 0;
187 uint8_t l; 206 uint8_t l;
188 int16_t n, k;
189 real_t x[64];
190 real_t *outptr = output;
191 207
192 208
193 /* qmf subsample l */ 209 /* qmf subsample l */
194 for (l = 0; l < sbr->numTimeSlotsRate; l++) 210 for (l = 0; l < sbr->numTimeSlotsRate; l++)
195 { 211 {
212 //real_t *v0, *v1;
213
214 /* shift buffers */
215 //memmove(qmfs->v[0] + 64, qmfs->v[0], (640-64)*sizeof(real_t));
216 //memmove(qmfs->v[1] + 64, qmfs->v[1], (640-64)*sizeof(real_t));
217 memmove(qmfs->v[0] + 128, qmfs->v[0], (1280-128)*sizeof(real_t));
218
219 //v0 = qmfs->v[qmfs->v_index];
220 //v1 = qmfs->v[(qmfs->v_index + 1) & 0x1];
221 //qmfs->v_index = (qmfs->v_index + 1) & 0x1;
222
223 /* calculate 128 samples */
224 for (k = 0; k < 64; k++)
225 {
226 #ifdef FIXED_POINT
227 x[k] = QMF_RE(X[l][k]);
228 #else
229 x[k] = QMF_RE(X[l][k]) / 32.;
230 #endif
231 }
232
233 for (n = 0; n < 32; n++)
234 {
235 y[2*n] = -x[2*n];
236 y[2*n+1] = x[2*n+1];
237 }
238
239 DCT2_64_unscaled(x, x);
240
241 for (n = 0; n < 64; n++)
242 {
243 qmfs->v[0][n+32] = x[n];
244 }
245 for (n = 0; n < 32; n++)
246 {
247 qmfs->v[0][31 - n] = x[n + 1];
248 }
249 DST2_64_unscaled(x, y);
250 qmfs->v[0][96] = 0;
251 for (n = 1; n < 32; n++)
252 {
253 qmfs->v[0][n + 96] = x[n-1];
254 }
255
256 /* calculate 64 output samples and window */
257 for (k = 0; k < 64; k++)
258 {
259 #if 1
260 output[out++] = MUL_F(qmfs->v[0][k], qmf_c[k]) +
261 MUL_F(qmfs->v[0][192 + k], qmf_c[64 + k]) +
262 MUL_F(qmfs->v[0][256 + k], qmf_c[128 + k]) +
263 MUL_F(qmfs->v[0][256 + 192 + k], qmf_c[128 + 64 + k]) +
264 MUL_F(qmfs->v[0][512 + k], qmf_c[256 + k]) +
265 MUL_F(qmfs->v[0][512 + 192 + k], qmf_c[256 + 64 + k]) +
266 MUL_F(qmfs->v[0][768 + k], qmf_c[384 + k]) +
267 MUL_F(qmfs->v[0][768 + 192 + k], qmf_c[384 + 64 + k]) +
268 MUL_F(qmfs->v[0][1024 + k], qmf_c[512 + k]) +
269 MUL_F(qmfs->v[0][1024 + 192 + k], qmf_c[512 + 64 + k]);
270 #else
271 output[out++] = MUL_F(v0[k], qmf_c[k]) +
272 MUL_F(v0[64 + k], qmf_c[64 + k]) +
273 MUL_F(v0[128 + k], qmf_c[128 + k]) +
274 MUL_F(v0[192 + k], qmf_c[192 + k]) +
275 MUL_F(v0[256 + k], qmf_c[256 + k]) +
276 MUL_F(v0[320 + k], qmf_c[320 + k]) +
277 MUL_F(v0[384 + k], qmf_c[384 + k]) +
278 MUL_F(v0[448 + k], qmf_c[448 + k]) +
279 MUL_F(v0[512 + k], qmf_c[512 + k]) +
280 MUL_F(v0[576 + k], qmf_c[576 + k]);
281 #endif
282 }
283 }
284 }
285
286 void sbr_qmf_synthesis_64_sse(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
287 real_t *output)
288 {
289 ALIGN real_t x[64];
290 ALIGN real_t y[64];
291 ALIGN real_t y2[64];
292 int16_t n, k, out = 0;
293 uint8_t l;
294
295 /* qmf subsample l */
296 for (l = 0; l < sbr->numTimeSlotsRate; l++)
297 {
298 //real_t *v0, *v1;
299
300 /* shift buffers */
301 //memmove(qmfs->v[0] + 64, qmfs->v[0], (640-64)*sizeof(real_t));
302 //memmove(qmfs->v[1] + 64, qmfs->v[1], (640-64)*sizeof(real_t));
303 memmove(qmfs->v[0] + 128, qmfs->v[0], (1280-128)*sizeof(real_t));
304
305 //v0 = qmfs->v[qmfs->v_index];
306 //v1 = qmfs->v[(qmfs->v_index + 1) & 0x1];
307 //qmfs->v_index = (qmfs->v_index + 1) & 0x1;
308
309 /* calculate 128 samples */
310 for (k = 0; k < 64; k++)
311 {
312 #ifdef FIXED_POINT
313 x[k] = QMF_RE(X[l][k]);
314 #else
315 x[k] = QMF_RE(X[l][k]) / 32.;
316 #endif
317 }
318
319 for (n = 0; n < 32; n++)
320 {
321 y[2*n] = -x[2*n];
322 y[2*n+1] = x[2*n+1];
323 }
324
325 DCT2_64_unscaled(x, x);
326
327 for (n = 0; n < 64; n++)
328 {
329 qmfs->v[0][n+32] = x[n];
330 }
331 for (n = 0; n < 32; n++)
332 {
333 qmfs->v[0][31 - n] = x[n + 1];
334 }
335
336 DST2_64_unscaled(x, y);
337 qmfs->v[0][96] = 0;
338 for (n = 1; n < 32; n++)
339 {
340 qmfs->v[0][n + 96] = x[n-1];
341 }
342
343 /* calculate 64 output samples and window */
344 for (k = 0; k < 64; k++)
345 {
346 #if 1
347 output[out++] = MUL_F(qmfs->v[0][k], qmf_c[k]) +
348 MUL_F(qmfs->v[0][192 + k], qmf_c[64 + k]) +
349 MUL_F(qmfs->v[0][256 + k], qmf_c[128 + k]) +
350 MUL_F(qmfs->v[0][256 + 192 + k], qmf_c[128 + 64 + k]) +
351 MUL_F(qmfs->v[0][512 + k], qmf_c[256 + k]) +
352 MUL_F(qmfs->v[0][512 + 192 + k], qmf_c[256 + 64 + k]) +
353 MUL_F(qmfs->v[0][768 + k], qmf_c[384 + k]) +
354 MUL_F(qmfs->v[0][768 + 192 + k], qmf_c[384 + 64 + k]) +
355 MUL_F(qmfs->v[0][1024 + k], qmf_c[512 + k]) +
356 MUL_F(qmfs->v[0][1024 + 192 + k], qmf_c[512 + 64 + k]);
357 #else
358 output[out++] = MUL_F(v0[k], qmf_c[k]) +
359 MUL_F(v0[64 + k], qmf_c[64 + k]) +
360 MUL_F(v0[128 + k], qmf_c[128 + k]) +
361 MUL_F(v0[192 + k], qmf_c[192 + k]) +
362 MUL_F(v0[256 + k], qmf_c[256 + k]) +
363 MUL_F(v0[320 + k], qmf_c[320 + k]) +
364 MUL_F(v0[384 + k], qmf_c[384 + k]) +
365 MUL_F(v0[448 + k], qmf_c[448 + k]) +
366 MUL_F(v0[512 + k], qmf_c[512 + k]) +
367 MUL_F(v0[576 + k], qmf_c[576 + k]);
368 #endif
369 }
370 }
371 }
372 #else
373 void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
374 real_t *output)
375 {
376 ALIGN real_t x1[64], x2[64];
377 real_t scale = 1.f/64.f;
378 int16_t n, k, out = 0;
379 uint8_t l;
380
381
382 /* qmf subsample l */
383 for (l = 0; l < sbr->numTimeSlotsRate; l++)
384 {
196 real_t *v0, *v1; 385 real_t *v0, *v1;
197 386
198 /* shift buffers */ 387 /* shift buffers */
199 memmove(qmfs->v[0] + 64, qmfs->v[0], (640-64)*sizeof(real_t)); 388 memmove(qmfs->v[0] + 64, qmfs->v[0], (640-64)*sizeof(real_t));
200 memmove(qmfs->v[1] + 64, qmfs->v[1], (640-64)*sizeof(real_t)); 389 memmove(qmfs->v[1] + 64, qmfs->v[1], (640-64)*sizeof(real_t));
202 v0 = qmfs->v[qmfs->v_index]; 391 v0 = qmfs->v[qmfs->v_index];
203 v1 = qmfs->v[(qmfs->v_index + 1) & 0x1]; 392 v1 = qmfs->v[(qmfs->v_index + 1) & 0x1];
204 qmfs->v_index = (qmfs->v_index + 1) & 0x1; 393 qmfs->v_index = (qmfs->v_index + 1) & 0x1;
205 394
206 /* calculate 128 samples */ 395 /* calculate 128 samples */
207 for (k = 0; k < 64; k++) 396 x1[0] = scale*QMF_RE(X[l][0]);
208 { 397 x2[63] = scale*QMF_IM(X[l][0]);
209 #ifdef FIXED_POINT 398 for (k = 0; k < 31; k++)
210 x[k] = QMF_RE(X[(l<<6) + k]); 399 {
211 #else 400 x1[2*k+1] = scale*(QMF_RE(X[l][2*k+1]) - QMF_RE(X[l][2*k+2]));
212 x[k] = QMF_RE(X[(l<<6) + k]) / 32.; 401 x1[2*k+2] = scale*(QMF_RE(X[l][2*k+1]) + QMF_RE(X[l][2*k+2]));
213 #endif 402
214 } 403 x2[61 - 2*k] = scale*(QMF_IM(X[l][2*k+2]) - QMF_IM(X[l][2*k+1]));
215 404 x2[62 - 2*k] = scale*(QMF_IM(X[l][2*k+2]) + QMF_IM(X[l][2*k+1]));
216 DCT2_64_unscaled(x, x); 405 }
217 406 x1[63] = scale*QMF_RE(X[l][63]);
218 for (n = 0; n < 32; n++) 407 x2[0] = scale*QMF_IM(X[l][63]);
219 { 408
220 v0[n+32] = x[n]; 409 DCT4_64_kernel(x1, x1);
221 v1[n] = x[n+32]; 410 DCT4_64_kernel(x2, x2);
222 } 411
223 v0[0] = v1[0]; 412 for (n = 0; n < 32; n++)
224 for (n = 1; n < 32; n++) 413 {
225 { 414 v0[ 2*n] = x2[2*n] - x1[2*n];
226 v0[32 - n] = v0[n + 32]; 415 v1[63-2*n] = x2[2*n] + x1[2*n];
227 v1[n + 32] = -v1[32 - n]; 416 v0[ 2*n+1] = -x2[2*n+1] - x1[2*n+1];
228 } 417 v1[62-2*n] = -x2[2*n+1] + x1[2*n+1];
229 v1[32] = 0; 418 }
230 419
231 /* calculate 64 output samples and window */ 420 /* calculate 64 output samples and window */
232 for (k = 0; k < 64; k++) 421 for (k = 0; k < 64; k++)
233 { 422 {
234 *outptr++ = MUL_R_C(v0[k], qmf_c[k]) + 423 output[out++] = MUL_F(v0[k], qmf_c[k]) +
235 MUL_R_C(v0[64 + k], qmf_c[64 + k]) + 424 MUL_F(v0[64 + k], qmf_c[64 + k]) +
236 MUL_R_C(v0[128 + k], qmf_c[128 + k]) + 425 MUL_F(v0[128 + k], qmf_c[128 + k]) +
237 MUL_R_C(v0[192 + k], qmf_c[192 + k]) + 426 MUL_F(v0[192 + k], qmf_c[192 + k]) +
238 MUL_R_C(v0[256 + k], qmf_c[256 + k]) + 427 MUL_F(v0[256 + k], qmf_c[256 + k]) +
239 MUL_R_C(v0[320 + k], qmf_c[320 + k]) + 428 MUL_F(v0[320 + k], qmf_c[320 + k]) +
240 MUL_R_C(v0[384 + k], qmf_c[384 + k]) + 429 MUL_F(v0[384 + k], qmf_c[384 + k]) +
241 MUL_R_C(v0[448 + k], qmf_c[448 + k]) + 430 MUL_F(v0[448 + k], qmf_c[448 + k]) +
242 MUL_R_C(v0[512 + k], qmf_c[512 + k]) + 431 MUL_F(v0[512 + k], qmf_c[512 + k]) +
243 MUL_R_C(v0[576 + k], qmf_c[576 + k]); 432 MUL_F(v0[576 + k], qmf_c[576 + k]);
244 } 433 }
245 } 434 }
246 } 435 }
247 #else 436
248 void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, const qmf_t *X, 437 #ifdef USE_SSE
249 real_t *output) 438 void memmove_sse_576(real_t *out, const real_t *in)
250 { 439 {
440 __m128 m[144];
441 uint16_t i;
442
443 for (i = 0; i < 144; i++)
444 {
445 m[i] = _mm_load_ps(&in[i*4]);
446 }
447 for (i = 0; i < 144; i++)
448 {
449 _mm_store_ps(&out[i*4], m[i]);
450 }
451 }
452
453 void sbr_qmf_synthesis_64_sse(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
454 real_t *output)
455 {
456 ALIGN real_t x1[64], x2[64];
457 real_t scale = 1.f/64.f;
458 int16_t n, k, out = 0;
251 uint8_t l; 459 uint8_t l;
252 int16_t n, k;
253 real_t x1[64], x2[64];
254 real_t *outptr = output;
255 460
256 461
257 /* qmf subsample l */ 462 /* qmf subsample l */
258 for (l = 0; l < sbr->numTimeSlotsRate; l++) 463 for (l = 0; l < sbr->numTimeSlotsRate; l++)
259 { 464 {
260 real_t *v0, *v1; 465 real_t *v0, *v1;
261 466
262 /* shift buffers */ 467 /* shift buffers */
263 memmove(qmfs->v[0] + 64, qmfs->v[0], (640-64)*sizeof(real_t)); 468 memmove_sse_576(qmfs->v[0] + 64, qmfs->v[0]);
264 memmove(qmfs->v[1] + 64, qmfs->v[1], (640-64)*sizeof(real_t)); 469 memmove_sse_576(qmfs->v[1] + 64, qmfs->v[1]);
265 470
266 v0 = qmfs->v[qmfs->v_index]; 471 v0 = qmfs->v[qmfs->v_index];
267 v1 = qmfs->v[(qmfs->v_index + 1) & 0x1]; 472 v1 = qmfs->v[(qmfs->v_index + 1) & 0x1];
268 qmfs->v_index = (qmfs->v_index + 1) & 0x1; 473 qmfs->v_index = (qmfs->v_index + 1) & 0x1;
269 474
270 /* calculate 128 samples */ 475 /* calculate 128 samples */
271 for (k = 0; k < 64; k++) 476 x1[0] = scale*QMF_RE(X[l][0]);
272 { 477 x2[63] = scale*QMF_IM(X[l][0]);
273 x1[k] = QMF_RE(X[(l<<6) + k])/64.; 478 for (k = 0; k < 31; k++)
274 x2[63 - k] = QMF_IM(X[(l<<6) + k])/64.; 479 {
275 } 480 x1[2*k+1] = scale*(QMF_RE(X[l][2*k+1]) - QMF_RE(X[l][2*k+2]));
276 481 x1[2*k+2] = scale*(QMF_RE(X[l][2*k+1]) + QMF_RE(X[l][2*k+2]));
277 DCT4_64(x1, x1); 482
278 DCT4_64(x2, x2); 483 x2[61 - 2*k] = scale*(QMF_IM(X[l][2*k+2]) - QMF_IM(X[l][2*k+1]));
279 484 x2[62 - 2*k] = scale*(QMF_IM(X[l][2*k+2]) + QMF_IM(X[l][2*k+1]));
280 for (n = 0; n < 64; n+=2) 485 }
281 { 486 x1[63] = scale*QMF_RE(X[l][63]);
282 v0[n] = x2[n] - x1[n]; 487 x2[0] = scale*QMF_IM(X[l][63]);
283 v0[n+1] = -x2[n+1] - x1[n+1]; 488
284 v1[63-n] = x2[n] + x1[n]; 489 DCT4_64_kernel(x1, x1);
285 v1[63-n-1] = -x2[n+1] + x1[n+1]; 490 DCT4_64_kernel(x2, x2);
491
492 for (n = 0; n < 32; n++)
493 {
494 v0[ 2*n ] = x2[2*n] - x1[2*n];
495 v1[63- 2*n ] = x2[2*n] + x1[2*n];
496 v0[ 2*n+1 ] = -x2[2*n+1] - x1[2*n+1];
497 v1[63-(2*n+1)] = -x2[2*n+1] + x1[2*n+1];
286 } 498 }
287 499
288 /* calculate 64 output samples and window */ 500 /* calculate 64 output samples and window */
289 for (k = 0; k < 64; k++) 501 for (k = 0; k < 64; k+=4)
290 { 502 {
291 *outptr++ = MUL_R_C(v0[k], qmf_c[k]) + 503 __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
292 MUL_R_C(v0[64 + k], qmf_c[64 + k]) + 504 __m128 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
293 MUL_R_C(v0[128 + k], qmf_c[128 + k]) + 505 __m128 s1, s2, s3, s4, s5, s6, s7, s8, s9;
294 MUL_R_C(v0[192 + k], qmf_c[192 + k]) + 506
295 MUL_R_C(v0[256 + k], qmf_c[256 + k]) + 507 m0 = _mm_load_ps(&v0[k]);
296 MUL_R_C(v0[320 + k], qmf_c[320 + k]) + 508 m1 = _mm_load_ps(&v0[k + 64]);
297 MUL_R_C(v0[384 + k], qmf_c[384 + k]) + 509 m2 = _mm_load_ps(&v0[k + 128]);
298 MUL_R_C(v0[448 + k], qmf_c[448 + k]) + 510 m3 = _mm_load_ps(&v0[k + 192]);
299 MUL_R_C(v0[512 + k], qmf_c[512 + k]) + 511 m4 = _mm_load_ps(&v0[k + 256]);
300 MUL_R_C(v0[576 + k], qmf_c[576 + k]); 512 c0 = _mm_load_ps(&qmf_c[k]);
301 } 513 c1 = _mm_load_ps(&qmf_c[k + 64]);
302 } 514 c2 = _mm_load_ps(&qmf_c[k + 128]);
303 } 515 c3 = _mm_load_ps(&qmf_c[k + 192]);
304 #endif 516 c4 = _mm_load_ps(&qmf_c[k + 256]);
305 517
306 #endif 518 m0 = _mm_mul_ps(m0, c0);
519 m1 = _mm_mul_ps(m1, c1);
520 m2 = _mm_mul_ps(m2, c2);
521 m3 = _mm_mul_ps(m3, c3);
522 m4 = _mm_mul_ps(m4, c4);
523
524 s1 = _mm_add_ps(m0, m1);
525 s2 = _mm_add_ps(m2, m3);
526 s6 = _mm_add_ps(s1, s2);
527
528 m5 = _mm_load_ps(&v0[k + 320]);
529 m6 = _mm_load_ps(&v0[k + 384]);
530 m7 = _mm_load_ps(&v0[k + 448]);
531 m8 = _mm_load_ps(&v0[k + 512]);
532 m9 = _mm_load_ps(&v0[k + 576]);
533 c5 = _mm_load_ps(&qmf_c[k + 320]);
534 c6 = _mm_load_ps(&qmf_c[k + 384]);
535 c7 = _mm_load_ps(&qmf_c[k + 448]);
536 c8 = _mm_load_ps(&qmf_c[k + 512]);
537 c9 = _mm_load_ps(&qmf_c[k + 576]);
538
539 m5 = _mm_mul_ps(m5, c5);
540 m6 = _mm_mul_ps(m6, c6);
541 m7 = _mm_mul_ps(m7, c7);
542 m8 = _mm_mul_ps(m8, c8);
543 m9 = _mm_mul_ps(m9, c9);
544
545 s3 = _mm_add_ps(m4, m5);
546 s4 = _mm_add_ps(m6, m7);
547 s5 = _mm_add_ps(m8, m9);
548 s7 = _mm_add_ps(s3, s4);
549 s8 = _mm_add_ps(s5, s6);
550 s9 = _mm_add_ps(s7, s8);
551
552 _mm_store_ps(&output[out], s9);
553 out += 4;
554 }
555 }
556 }
557 #endif
558 #endif
559
560 #endif