Mercurial > mplayer.hg
annotate liba52/downmix.c @ 16943:fab832f37083
Do not show cache-line size message, I've never seen a case where it was useful
author | reimar |
---|---|
date | Mon, 07 Nov 2005 16:16:14 +0000 |
parents | d6219ce521e9 |
children | 7b408d60de9e |
rev | line source |
---|---|
3394 | 1 /* |
2 * downmix.c | |
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org> | |
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> | |
5 * | |
6 * This file is part of a52dec, a free ATSC A-52 stream decoder. | |
7 * See http://liba52.sourceforge.net/ for updates. | |
8 * | |
14991
07f1e7669772
Mark modified files as such to comply more closely with GPL §2a.
diego
parents:
12137
diff
changeset
|
9 * Modified for use with MPlayer, changes contained in liba52_changes.diff. |
07f1e7669772
Mark modified files as such to comply more closely with GPL §2a.
diego
parents:
12137
diff
changeset
|
10 * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/ |
07f1e7669772
Mark modified files as such to comply more closely with GPL §2a.
diego
parents:
12137
diff
changeset
|
11 * $Id$ |
07f1e7669772
Mark modified files as such to comply more closely with GPL §2a.
diego
parents:
12137
diff
changeset
|
12 * |
3394 | 13 * a52dec is free software; you can redistribute it and/or modify |
14 * it under the terms of the GNU General Public License as published by | |
15 * the Free Software Foundation; either version 2 of the License, or | |
16 * (at your option) any later version. | |
17 * | |
18 * a52dec is distributed in the hope that it will be useful, | |
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 * GNU General Public License for more details. | |
22 * | |
23 * You should have received a copy of the GNU General Public License | |
24 * along with this program; if not, write to the Free Software | |
25 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
3625 | 26 * |
27 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) | |
3394 | 28 */ |
29 | |
30 #include "config.h" | |
31 | |
32 #include <string.h> | |
33 #include <inttypes.h> | |
34 | |
35 #include "a52.h" | |
36 #include "a52_internal.h" | |
3910
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
37 #include "mm_accel.h" |
3394 | 38 |
/* Packs an (input mode, output mode) pair into one integer usable as a
 * switch label: the output mode is shifted left by 3 bits and the input
 * mode is added, so acmod is expected to fit in 3 bits. */
#define CONVERT(acmod,output) (((output) << 3) + (acmod))
40 | |
3904 | 41 |
42 void (*downmix)(sample_t * samples, int acmod, int output, sample_t bias, | |
43 sample_t clev, sample_t slev)= NULL; | |
44 void (*upmix)(sample_t * samples, int acmod, int output)= NULL; | |
45 | |
46 static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, | |
47 sample_t clev, sample_t slev); | |
4233 | 48 static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias, |
49 sample_t clev, sample_t slev); | |
3904 | 50 static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, |
51 sample_t clev, sample_t slev); | |
52 static void upmix_MMX (sample_t * samples, int acmod, int output); | |
53 static void upmix_C (sample_t * samples, int acmod, int output); | |
3910
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
54 |
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
55 void downmix_accel_init(uint32_t mm_accel) |
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
56 { |
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
57 upmix= upmix_C; |
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
58 downmix= downmix_C; |
16173 | 59 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
3910
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
60 if(mm_accel & MM_ACCEL_X86_MMX) upmix= upmix_MMX; |
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
61 if(mm_accel & MM_ACCEL_X86_SSE) downmix= downmix_SSE; |
4233 | 62 if(mm_accel & MM_ACCEL_X86_3DNOW) downmix= downmix_3dnow; |
3910
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
63 #endif |
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
64 } |
db1d556fcf58
runtime cpudetect the liba52 way part 2 (downmix.c)
michael
parents:
3904
diff
changeset
|
65 |
3394 | 66 int downmix_init (int input, int flags, sample_t * level, |
67 sample_t clev, sample_t slev) | |
68 { | |
69 static uint8_t table[11][8] = { | |
70 {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_STEREO, | |
71 A52_STEREO, A52_STEREO, A52_STEREO, A52_STEREO}, | |
72 {A52_MONO, A52_MONO, A52_MONO, A52_MONO, | |
73 A52_MONO, A52_MONO, A52_MONO, A52_MONO}, | |
74 {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_STEREO, | |
75 A52_STEREO, A52_STEREO, A52_STEREO, A52_STEREO}, | |
76 {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_3F, | |
77 A52_STEREO, A52_3F, A52_STEREO, A52_3F}, | |
78 {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_STEREO, | |
79 A52_2F1R, A52_2F1R, A52_2F1R, A52_2F1R}, | |
80 {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_STEREO, | |
81 A52_2F1R, A52_3F1R, A52_2F1R, A52_3F1R}, | |
82 {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_3F, | |
83 A52_2F2R, A52_2F2R, A52_2F2R, A52_2F2R}, | |
84 {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_3F, | |
85 A52_2F2R, A52_3F2R, A52_2F2R, A52_3F2R}, | |
86 {A52_CHANNEL1, A52_MONO, A52_MONO, A52_MONO, | |
87 A52_MONO, A52_MONO, A52_MONO, A52_MONO}, | |
88 {A52_CHANNEL2, A52_MONO, A52_MONO, A52_MONO, | |
89 A52_MONO, A52_MONO, A52_MONO, A52_MONO}, | |
90 {A52_CHANNEL, A52_DOLBY, A52_STEREO, A52_DOLBY, | |
91 A52_DOLBY, A52_DOLBY, A52_DOLBY, A52_DOLBY} | |
92 }; | |
93 int output; | |
94 | |
95 output = flags & A52_CHANNEL_MASK; | |
96 if (output > A52_DOLBY) | |
97 return -1; | |
3738 | 98 |
3394 | 99 output = table[output][input & 7]; |
100 | |
101 if ((output == A52_STEREO) && | |
102 ((input == A52_DOLBY) || ((input == A52_3F) && (clev == LEVEL_3DB)))) | |
103 output = A52_DOLBY; | |
104 | |
105 if (flags & A52_ADJUST_LEVEL) | |
106 switch (CONVERT (input & 7, output)) { | |
107 | |
108 case CONVERT (A52_3F, A52_MONO): | |
109 *level *= LEVEL_3DB / (1 + clev); | |
110 break; | |
111 | |
112 case CONVERT (A52_STEREO, A52_MONO): | |
113 case CONVERT (A52_2F2R, A52_2F1R): | |
114 case CONVERT (A52_3F2R, A52_3F1R): | |
115 level_3db: | |
116 *level *= LEVEL_3DB; | |
117 break; | |
118 | |
119 case CONVERT (A52_3F2R, A52_2F1R): | |
120 if (clev < LEVEL_PLUS3DB - 1) | |
121 goto level_3db; | |
122 /* break thru */ | |
123 case CONVERT (A52_3F, A52_STEREO): | |
124 case CONVERT (A52_3F1R, A52_2F1R): | |
125 case CONVERT (A52_3F1R, A52_2F2R): | |
126 case CONVERT (A52_3F2R, A52_2F2R): | |
127 *level /= 1 + clev; | |
128 break; | |
129 | |
130 case CONVERT (A52_2F1R, A52_MONO): | |
131 *level *= LEVEL_PLUS3DB / (2 + slev); | |
132 break; | |
133 | |
134 case CONVERT (A52_2F1R, A52_STEREO): | |
135 case CONVERT (A52_3F1R, A52_3F): | |
136 *level /= 1 + slev * LEVEL_3DB; | |
137 break; | |
138 | |
139 case CONVERT (A52_3F1R, A52_MONO): | |
140 *level *= LEVEL_3DB / (1 + clev + 0.5 * slev); | |
141 break; | |
142 | |
143 case CONVERT (A52_3F1R, A52_STEREO): | |
144 *level /= 1 + clev + slev * LEVEL_3DB; | |
145 break; | |
146 | |
147 case CONVERT (A52_2F2R, A52_MONO): | |
148 *level *= LEVEL_3DB / (1 + slev); | |
149 break; | |
150 | |
151 case CONVERT (A52_2F2R, A52_STEREO): | |
152 case CONVERT (A52_3F2R, A52_3F): | |
153 *level /= 1 + slev; | |
154 break; | |
155 | |
156 case CONVERT (A52_3F2R, A52_MONO): | |
157 *level *= LEVEL_3DB / (1 + clev + slev); | |
158 break; | |
159 | |
160 case CONVERT (A52_3F2R, A52_STEREO): | |
161 *level /= 1 + clev + slev; | |
162 break; | |
163 | |
164 case CONVERT (A52_MONO, A52_DOLBY): | |
165 *level *= LEVEL_PLUS3DB; | |
166 break; | |
167 | |
168 case CONVERT (A52_3F, A52_DOLBY): | |
169 case CONVERT (A52_2F1R, A52_DOLBY): | |
170 *level *= 1 / (1 + LEVEL_3DB); | |
171 break; | |
172 | |
173 case CONVERT (A52_3F1R, A52_DOLBY): | |
174 case CONVERT (A52_2F2R, A52_DOLBY): | |
175 *level *= 1 / (1 + 2 * LEVEL_3DB); | |
176 break; | |
177 | |
178 case CONVERT (A52_3F2R, A52_DOLBY): | |
179 *level *= 1 / (1 + 3 * LEVEL_3DB); | |
180 break; | |
181 } | |
182 return output; | |
183 } | |
184 | |
185 int downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level, | |
186 sample_t clev, sample_t slev) | |
187 { | |
188 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
189 | |
190 case CONVERT (A52_CHANNEL, A52_CHANNEL): | |
191 case CONVERT (A52_MONO, A52_MONO): | |
192 case CONVERT (A52_STEREO, A52_STEREO): | |
193 case CONVERT (A52_3F, A52_3F): | |
194 case CONVERT (A52_2F1R, A52_2F1R): | |
195 case CONVERT (A52_3F1R, A52_3F1R): | |
196 case CONVERT (A52_2F2R, A52_2F2R): | |
197 case CONVERT (A52_3F2R, A52_3F2R): | |
198 case CONVERT (A52_STEREO, A52_DOLBY): | |
199 coeff[0] = coeff[1] = coeff[2] = coeff[3] = coeff[4] = level; | |
200 return 0; | |
201 | |
202 case CONVERT (A52_CHANNEL, A52_MONO): | |
203 coeff[0] = coeff[1] = level * LEVEL_6DB; | |
204 return 3; | |
205 | |
206 case CONVERT (A52_STEREO, A52_MONO): | |
207 coeff[0] = coeff[1] = level * LEVEL_3DB; | |
208 return 3; | |
209 | |
210 case CONVERT (A52_3F, A52_MONO): | |
211 coeff[0] = coeff[2] = level * LEVEL_3DB; | |
212 coeff[1] = level * clev * LEVEL_PLUS3DB; | |
213 return 7; | |
214 | |
215 case CONVERT (A52_2F1R, A52_MONO): | |
216 coeff[0] = coeff[1] = level * LEVEL_3DB; | |
217 coeff[2] = level * slev * LEVEL_3DB; | |
218 return 7; | |
219 | |
220 case CONVERT (A52_2F2R, A52_MONO): | |
221 coeff[0] = coeff[1] = level * LEVEL_3DB; | |
222 coeff[2] = coeff[3] = level * slev * LEVEL_3DB; | |
223 return 15; | |
224 | |
225 case CONVERT (A52_3F1R, A52_MONO): | |
226 coeff[0] = coeff[2] = level * LEVEL_3DB; | |
227 coeff[1] = level * clev * LEVEL_PLUS3DB; | |
228 coeff[3] = level * slev * LEVEL_3DB; | |
229 return 15; | |
230 | |
231 case CONVERT (A52_3F2R, A52_MONO): | |
232 coeff[0] = coeff[2] = level * LEVEL_3DB; | |
233 coeff[1] = level * clev * LEVEL_PLUS3DB; | |
234 coeff[3] = coeff[4] = level * slev * LEVEL_3DB; | |
235 return 31; | |
236 | |
237 case CONVERT (A52_MONO, A52_DOLBY): | |
238 coeff[0] = level * LEVEL_3DB; | |
239 return 0; | |
240 | |
241 case CONVERT (A52_3F, A52_DOLBY): | |
242 clev = LEVEL_3DB; | |
243 case CONVERT (A52_3F, A52_STEREO): | |
244 case CONVERT (A52_3F1R, A52_2F1R): | |
245 case CONVERT (A52_3F2R, A52_2F2R): | |
246 coeff[0] = coeff[2] = coeff[3] = coeff[4] = level; | |
247 coeff[1] = level * clev; | |
248 return 7; | |
249 | |
250 case CONVERT (A52_2F1R, A52_DOLBY): | |
251 slev = 1; | |
252 case CONVERT (A52_2F1R, A52_STEREO): | |
253 coeff[0] = coeff[1] = level; | |
254 coeff[2] = level * slev * LEVEL_3DB; | |
255 return 7; | |
256 | |
257 case CONVERT (A52_3F1R, A52_DOLBY): | |
258 clev = LEVEL_3DB; | |
259 slev = 1; | |
260 case CONVERT (A52_3F1R, A52_STEREO): | |
261 coeff[0] = coeff[2] = level; | |
262 coeff[1] = level * clev; | |
263 coeff[3] = level * slev * LEVEL_3DB; | |
264 return 15; | |
265 | |
266 case CONVERT (A52_2F2R, A52_DOLBY): | |
267 slev = LEVEL_3DB; | |
268 case CONVERT (A52_2F2R, A52_STEREO): | |
269 coeff[0] = coeff[1] = level; | |
270 coeff[2] = coeff[3] = level * slev; | |
271 return 15; | |
272 | |
273 case CONVERT (A52_3F2R, A52_DOLBY): | |
274 clev = LEVEL_3DB; | |
275 case CONVERT (A52_3F2R, A52_2F1R): | |
276 slev = LEVEL_3DB; | |
277 case CONVERT (A52_3F2R, A52_STEREO): | |
278 coeff[0] = coeff[2] = level; | |
279 coeff[1] = level * clev; | |
280 coeff[3] = coeff[4] = level * slev; | |
281 return 31; | |
282 | |
283 case CONVERT (A52_3F1R, A52_3F): | |
284 coeff[0] = coeff[1] = coeff[2] = level; | |
285 coeff[3] = level * slev * LEVEL_3DB; | |
286 return 13; | |
287 | |
288 case CONVERT (A52_3F2R, A52_3F): | |
289 coeff[0] = coeff[1] = coeff[2] = level; | |
290 coeff[3] = coeff[4] = level * slev; | |
291 return 29; | |
292 | |
293 case CONVERT (A52_2F2R, A52_2F1R): | |
294 coeff[0] = coeff[1] = level; | |
295 coeff[2] = coeff[3] = level * LEVEL_3DB; | |
296 return 12; | |
297 | |
298 case CONVERT (A52_3F2R, A52_3F1R): | |
299 coeff[0] = coeff[1] = coeff[2] = level; | |
300 coeff[3] = coeff[4] = level * LEVEL_3DB; | |
301 return 24; | |
302 | |
303 case CONVERT (A52_2F1R, A52_2F2R): | |
304 coeff[0] = coeff[1] = level; | |
305 coeff[2] = level * LEVEL_3DB; | |
306 return 0; | |
307 | |
308 case CONVERT (A52_3F1R, A52_2F2R): | |
309 coeff[0] = coeff[2] = level; | |
310 coeff[1] = level * clev; | |
311 coeff[3] = level * LEVEL_3DB; | |
312 return 7; | |
313 | |
314 case CONVERT (A52_3F1R, A52_3F2R): | |
315 coeff[0] = coeff[1] = coeff[2] = level; | |
316 coeff[3] = level * LEVEL_3DB; | |
317 return 0; | |
318 | |
319 case CONVERT (A52_CHANNEL, A52_CHANNEL1): | |
320 coeff[0] = level; | |
321 coeff[1] = 0; | |
322 return 0; | |
323 | |
324 case CONVERT (A52_CHANNEL, A52_CHANNEL2): | |
325 coeff[0] = 0; | |
326 coeff[1] = level; | |
327 return 0; | |
328 } | |
329 | |
330 return -1; /* NOTREACHED */ | |
331 } | |
332 | |
333 static void mix2to1 (sample_t * dest, sample_t * src, sample_t bias) | |
334 { | |
335 int i; | |
336 | |
337 for (i = 0; i < 256; i++) | |
338 dest[i] += src[i] + bias; | |
339 } | |
340 | |
341 static void mix3to1 (sample_t * samples, sample_t bias) | |
342 { | |
343 int i; | |
344 | |
345 for (i = 0; i < 256; i++) | |
346 samples[i] += samples[i + 256] + samples[i + 512] + bias; | |
347 } | |
348 | |
349 static void mix4to1 (sample_t * samples, sample_t bias) | |
350 { | |
351 int i; | |
352 | |
353 for (i = 0; i < 256; i++) | |
354 samples[i] += (samples[i + 256] + samples[i + 512] + | |
355 samples[i + 768] + bias); | |
356 } | |
357 | |
358 static void mix5to1 (sample_t * samples, sample_t bias) | |
359 { | |
360 int i; | |
361 | |
362 for (i = 0; i < 256; i++) | |
363 samples[i] += (samples[i + 256] + samples[i + 512] + | |
364 samples[i + 768] + samples[i + 1024] + bias); | |
365 } | |
366 | |
367 static void mix3to2 (sample_t * samples, sample_t bias) | |
368 { | |
369 int i; | |
370 sample_t common; | |
371 | |
372 for (i = 0; i < 256; i++) { | |
373 common = samples[i + 256] + bias; | |
374 samples[i] += common; | |
375 samples[i + 256] = samples[i + 512] + common; | |
376 } | |
377 } | |
378 | |
379 static void mix21to2 (sample_t * left, sample_t * right, sample_t bias) | |
380 { | |
381 int i; | |
382 sample_t common; | |
383 | |
384 for (i = 0; i < 256; i++) { | |
385 common = right[i + 256] + bias; | |
386 left[i] += common; | |
387 right[i] += common; | |
388 } | |
389 } | |
390 | |
391 static void mix21toS (sample_t * samples, sample_t bias) | |
392 { | |
393 int i; | |
394 sample_t surround; | |
395 | |
396 for (i = 0; i < 256; i++) { | |
397 surround = samples[i + 512]; | |
398 samples[i] += bias - surround; | |
399 samples[i + 256] += bias + surround; | |
400 } | |
401 } | |
402 | |
403 static void mix31to2 (sample_t * samples, sample_t bias) | |
404 { | |
405 int i; | |
406 sample_t common; | |
407 | |
408 for (i = 0; i < 256; i++) { | |
409 common = samples[i + 256] + samples[i + 768] + bias; | |
410 samples[i] += common; | |
411 samples[i + 256] = samples[i + 512] + common; | |
412 } | |
413 } | |
414 | |
415 static void mix31toS (sample_t * samples, sample_t bias) | |
416 { | |
417 int i; | |
418 sample_t common, surround; | |
419 | |
420 for (i = 0; i < 256; i++) { | |
421 common = samples[i + 256] + bias; | |
422 surround = samples[i + 768]; | |
423 samples[i] += common - surround; | |
424 samples[i + 256] = samples[i + 512] + common + surround; | |
425 } | |
426 } | |
427 | |
428 static void mix22toS (sample_t * samples, sample_t bias) | |
429 { | |
430 int i; | |
431 sample_t surround; | |
432 | |
433 for (i = 0; i < 256; i++) { | |
434 surround = samples[i + 512] + samples[i + 768]; | |
435 samples[i] += bias - surround; | |
436 samples[i + 256] += bias + surround; | |
437 } | |
438 } | |
439 | |
440 static void mix32to2 (sample_t * samples, sample_t bias) | |
441 { | |
442 int i; | |
443 sample_t common; | |
444 | |
445 for (i = 0; i < 256; i++) { | |
446 common = samples[i + 256] + bias; | |
447 samples[i] += common + samples[i + 768]; | |
448 samples[i + 256] = common + samples[i + 512] + samples[i + 1024]; | |
449 } | |
450 } | |
451 | |
452 static void mix32toS (sample_t * samples, sample_t bias) | |
453 { | |
454 int i; | |
455 sample_t common, surround; | |
456 | |
457 for (i = 0; i < 256; i++) { | |
458 common = samples[i + 256] + bias; | |
459 surround = samples[i + 768] + samples[i + 1024]; | |
460 samples[i] += common - surround; | |
461 samples[i + 256] = samples[i + 512] + common + surround; | |
462 } | |
463 } | |
464 | |
465 static void move2to1 (sample_t * src, sample_t * dest, sample_t bias) | |
466 { | |
467 int i; | |
468 | |
469 for (i = 0; i < 256; i++) | |
470 dest[i] = src[i] + src[i + 256] + bias; | |
471 } | |
472 | |
473 static void zero (sample_t * samples) | |
474 { | |
475 int i; | |
476 for (i = 0; i < 256; i++) | |
477 samples[i] = 0; | |
478 } | |
479 | |
3904 | 480 static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, |
3394 | 481 sample_t clev, sample_t slev) |
482 { | |
483 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
484 | |
485 case CONVERT (A52_CHANNEL, A52_CHANNEL2): | |
486 memcpy (samples, samples + 256, 256 * sizeof (sample_t)); | |
487 break; | |
488 | |
489 case CONVERT (A52_CHANNEL, A52_MONO): | |
490 case CONVERT (A52_STEREO, A52_MONO): | |
491 mix_2to1: | |
492 mix2to1 (samples, samples + 256, bias); | |
493 break; | |
494 | |
495 case CONVERT (A52_2F1R, A52_MONO): | |
496 if (slev == 0) | |
497 goto mix_2to1; | |
498 case CONVERT (A52_3F, A52_MONO): | |
499 mix_3to1: | |
500 mix3to1 (samples, bias); | |
501 break; | |
502 | |
503 case CONVERT (A52_3F1R, A52_MONO): | |
504 if (slev == 0) | |
505 goto mix_3to1; | |
506 case CONVERT (A52_2F2R, A52_MONO): | |
507 if (slev == 0) | |
508 goto mix_2to1; | |
509 mix4to1 (samples, bias); | |
510 break; | |
511 | |
512 case CONVERT (A52_3F2R, A52_MONO): | |
513 if (slev == 0) | |
514 goto mix_3to1; | |
515 mix5to1 (samples, bias); | |
516 break; | |
517 | |
518 case CONVERT (A52_MONO, A52_DOLBY): | |
519 memcpy (samples + 256, samples, 256 * sizeof (sample_t)); | |
520 break; | |
521 | |
522 case CONVERT (A52_3F, A52_STEREO): | |
523 case CONVERT (A52_3F, A52_DOLBY): | |
524 mix_3to2: | |
525 mix3to2 (samples, bias); | |
526 break; | |
527 | |
528 case CONVERT (A52_2F1R, A52_STEREO): | |
529 if (slev == 0) | |
530 break; | |
531 mix21to2 (samples, samples + 256, bias); | |
532 break; | |
533 | |
534 case CONVERT (A52_2F1R, A52_DOLBY): | |
535 mix21toS (samples, bias); | |
536 break; | |
537 | |
538 case CONVERT (A52_3F1R, A52_STEREO): | |
539 if (slev == 0) | |
540 goto mix_3to2; | |
541 mix31to2 (samples, bias); | |
542 break; | |
543 | |
544 case CONVERT (A52_3F1R, A52_DOLBY): | |
545 mix31toS (samples, bias); | |
546 break; | |
547 | |
548 case CONVERT (A52_2F2R, A52_STEREO): | |
549 if (slev == 0) | |
550 break; | |
551 mix2to1 (samples, samples + 512, bias); | |
552 mix2to1 (samples + 256, samples + 768, bias); | |
553 break; | |
554 | |
555 case CONVERT (A52_2F2R, A52_DOLBY): | |
556 mix22toS (samples, bias); | |
557 break; | |
558 | |
559 case CONVERT (A52_3F2R, A52_STEREO): | |
560 if (slev == 0) | |
561 goto mix_3to2; | |
562 mix32to2 (samples, bias); | |
563 break; | |
564 | |
565 case CONVERT (A52_3F2R, A52_DOLBY): | |
566 mix32toS (samples, bias); | |
567 break; | |
568 | |
569 case CONVERT (A52_3F1R, A52_3F): | |
570 if (slev == 0) | |
571 break; | |
572 mix21to2 (samples, samples + 512, bias); | |
573 break; | |
574 | |
575 case CONVERT (A52_3F2R, A52_3F): | |
576 if (slev == 0) | |
577 break; | |
578 mix2to1 (samples, samples + 768, bias); | |
579 mix2to1 (samples + 512, samples + 1024, bias); | |
580 break; | |
581 | |
582 case CONVERT (A52_3F1R, A52_2F1R): | |
583 mix3to2 (samples, bias); | |
584 memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
585 break; | |
586 | |
587 case CONVERT (A52_2F2R, A52_2F1R): | |
588 mix2to1 (samples + 512, samples + 768, bias); | |
589 break; | |
590 | |
591 case CONVERT (A52_3F2R, A52_2F1R): | |
3678 | 592 mix3to2 (samples, bias); //FIXME possible bug? (output doesnt seem to be used) |
3394 | 593 move2to1 (samples + 768, samples + 512, bias); |
594 break; | |
595 | |
596 case CONVERT (A52_3F2R, A52_3F1R): | |
597 mix2to1 (samples + 768, samples + 1024, bias); | |
598 break; | |
599 | |
600 case CONVERT (A52_2F1R, A52_2F2R): | |
601 memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); | |
602 break; | |
603 | |
604 case CONVERT (A52_3F1R, A52_2F2R): | |
605 mix3to2 (samples, bias); | |
606 memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
607 break; | |
608 | |
609 case CONVERT (A52_3F2R, A52_2F2R): | |
610 mix3to2 (samples, bias); | |
611 memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
612 memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); | |
613 break; | |
614 | |
615 case CONVERT (A52_3F1R, A52_3F2R): | |
12137 | 616 memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); |
3394 | 617 break; |
618 } | |
619 } | |
620 | |
3904 | 621 static void upmix_C (sample_t * samples, int acmod, int output) |
3394 | 622 { |
623 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
624 | |
625 case CONVERT (A52_CHANNEL, A52_CHANNEL2): | |
626 memcpy (samples + 256, samples, 256 * sizeof (sample_t)); | |
627 break; | |
628 | |
629 case CONVERT (A52_3F2R, A52_MONO): | |
630 zero (samples + 1024); | |
631 case CONVERT (A52_3F1R, A52_MONO): | |
632 case CONVERT (A52_2F2R, A52_MONO): | |
633 zero (samples + 768); | |
634 case CONVERT (A52_3F, A52_MONO): | |
635 case CONVERT (A52_2F1R, A52_MONO): | |
636 zero (samples + 512); | |
637 case CONVERT (A52_CHANNEL, A52_MONO): | |
638 case CONVERT (A52_STEREO, A52_MONO): | |
639 zero (samples + 256); | |
640 break; | |
641 | |
642 case CONVERT (A52_3F2R, A52_STEREO): | |
643 case CONVERT (A52_3F2R, A52_DOLBY): | |
644 zero (samples + 1024); | |
645 case CONVERT (A52_3F1R, A52_STEREO): | |
646 case CONVERT (A52_3F1R, A52_DOLBY): | |
647 zero (samples + 768); | |
648 case CONVERT (A52_3F, A52_STEREO): | |
649 case CONVERT (A52_3F, A52_DOLBY): | |
650 mix_3to2: | |
651 memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t)); | |
652 zero (samples + 256); | |
653 break; | |
654 | |
655 case CONVERT (A52_2F2R, A52_STEREO): | |
656 case CONVERT (A52_2F2R, A52_DOLBY): | |
657 zero (samples + 768); | |
658 case CONVERT (A52_2F1R, A52_STEREO): | |
659 case CONVERT (A52_2F1R, A52_DOLBY): | |
660 zero (samples + 512); | |
661 break; | |
662 | |
663 case CONVERT (A52_3F2R, A52_3F): | |
664 zero (samples + 1024); | |
665 case CONVERT (A52_3F1R, A52_3F): | |
666 case CONVERT (A52_2F2R, A52_2F1R): | |
667 zero (samples + 768); | |
668 break; | |
669 | |
670 case CONVERT (A52_3F2R, A52_3F1R): | |
671 zero (samples + 1024); | |
672 break; | |
673 | |
674 case CONVERT (A52_3F2R, A52_2F1R): | |
675 zero (samples + 1024); | |
676 case CONVERT (A52_3F1R, A52_2F1R): | |
677 mix_31to21: | |
678 memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); | |
679 goto mix_3to2; | |
680 | |
681 case CONVERT (A52_3F2R, A52_2F2R): | |
682 memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); | |
683 goto mix_31to21; | |
684 } | |
685 } | |
3904 | 686 |
16173 | 687 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
3904 | 688 static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias) |
689 { | |
690 asm volatile( | |
691 "movlps %2, %%xmm7 \n\t" | |
692 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 693 "mov $-1024, %%"REG_S" \n\t" |
4233 | 694 ".balign 16\n\t" |
3904 | 695 "1: \n\t" |
16173 | 696 "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
697 "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" | |
698 "addps (%1, %%"REG_S"), %%xmm0 \n\t" | |
699 "addps 16(%1, %%"REG_S"), %%xmm1\n\t" | |
3904 | 700 "addps %%xmm7, %%xmm0 \n\t" |
701 "addps %%xmm7, %%xmm1 \n\t" | |
16173 | 702 "movaps %%xmm0, (%1, %%"REG_S") \n\t" |
703 "movaps %%xmm1, 16(%1, %%"REG_S")\n\t" | |
704 "add $32, %%"REG_S" \n\t" | |
3904 | 705 " jnz 1b \n\t" |
706 :: "r" (src+256), "r" (dest+256), "m" (bias) | |
16173 | 707 : "%"REG_S |
3904 | 708 ); |
709 } | |
710 | |
711 static void mix3to1_SSE (sample_t * samples, sample_t bias) | |
712 { | |
713 asm volatile( | |
714 "movlps %1, %%xmm7 \n\t" | |
715 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 716 "mov $-1024, %%"REG_S" \n\t" |
4233 | 717 ".balign 16\n\t" |
3904 | 718 "1: \n\t" |
16173 | 719 "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
720 "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" | |
721 "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" | |
3904 | 722 "addps %%xmm7, %%xmm1 \n\t" |
723 "addps %%xmm1, %%xmm0 \n\t" | |
16173 | 724 "movaps %%xmm0, (%0, %%"REG_S") \n\t" |
725 "add $16, %%"REG_S" \n\t" | |
3904 | 726 " jnz 1b \n\t" |
727 :: "r" (samples+256), "m" (bias) | |
16173 | 728 : "%"REG_S |
3904 | 729 ); |
730 } | |
731 | |
732 static void mix4to1_SSE (sample_t * samples, sample_t bias) | |
733 { | |
734 asm volatile( | |
735 "movlps %1, %%xmm7 \n\t" | |
736 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 737 "mov $-1024, %%"REG_S" \n\t" |
4233 | 738 ".balign 16\n\t" |
3904 | 739 "1: \n\t" |
16173 | 740 "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
741 "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" | |
742 "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" | |
743 "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" | |
3904 | 744 "addps %%xmm7, %%xmm0 \n\t" |
745 "addps %%xmm1, %%xmm0 \n\t" | |
16173 | 746 "movaps %%xmm0, (%0, %%"REG_S") \n\t" |
747 "add $16, %%"REG_S" \n\t" | |
3904 | 748 " jnz 1b \n\t" |
749 :: "r" (samples+256), "m" (bias) | |
16173 | 750 : "%"REG_S |
3904 | 751 ); |
752 } | |
753 | |
754 static void mix5to1_SSE (sample_t * samples, sample_t bias) | |
755 { | |
756 asm volatile( | |
757 "movlps %1, %%xmm7 \n\t" | |
758 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 759 "mov $-1024, %%"REG_S" \n\t" |
4233 | 760 ".balign 16\n\t" |
3904 | 761 "1: \n\t" |
16173 | 762 "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
763 "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" | |
764 "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" | |
765 "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" | |
3904 | 766 "addps %%xmm7, %%xmm0 \n\t" |
16173 | 767 "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" |
3904 | 768 "addps %%xmm1, %%xmm0 \n\t" |
16173 | 769 "movaps %%xmm0, (%0, %%"REG_S") \n\t" |
770 "add $16, %%"REG_S" \n\t" | |
3904 | 771 " jnz 1b \n\t" |
772 :: "r" (samples+256), "m" (bias) | |
16173 | 773 : "%"REG_S |
3904 | 774 ); |
775 } | |
776 | |
777 static void mix3to2_SSE (sample_t * samples, sample_t bias) | |
778 { | |
779 asm volatile( | |
780 "movlps %1, %%xmm7 \n\t" | |
781 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 782 "mov $-1024, %%"REG_S" \n\t" |
4233 | 783 ".balign 16\n\t" |
3904 | 784 "1: \n\t" |
16173 | 785 "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
3904 | 786 "addps %%xmm7, %%xmm0 \n\t" //common |
16173 | 787 "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
788 "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" | |
3904 | 789 "addps %%xmm0, %%xmm1 \n\t" |
790 "addps %%xmm0, %%xmm2 \n\t" | |
16173 | 791 "movaps %%xmm1, (%0, %%"REG_S") \n\t" |
792 "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" | |
793 "add $16, %%"REG_S" \n\t" | |
3904 | 794 " jnz 1b \n\t" |
795 :: "r" (samples+256), "m" (bias) | |
16173 | 796 : "%"REG_S |
3904 | 797 ); |
798 } | |
799 | |
800 static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias) | |
801 { | |
802 asm volatile( | |
803 "movlps %2, %%xmm7 \n\t" | |
804 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 805 "mov $-1024, %%"REG_S" \n\t" |
4233 | 806 ".balign 16\n\t" |
3904 | 807 "1: \n\t" |
16173 | 808 "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t" |
3904 | 809 "addps %%xmm7, %%xmm0 \n\t" //common |
16173 | 810 "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
811 "movaps (%1, %%"REG_S"), %%xmm2 \n\t" | |
3904 | 812 "addps %%xmm0, %%xmm1 \n\t" |
813 "addps %%xmm0, %%xmm2 \n\t" | |
16173 | 814 "movaps %%xmm1, (%0, %%"REG_S") \n\t" |
815 "movaps %%xmm2, (%1, %%"REG_S") \n\t" | |
816 "add $16, %%"REG_S" \n\t" | |
3904 | 817 " jnz 1b \n\t" |
818 :: "r" (left+256), "r" (right+256), "m" (bias) | |
16173 | 819 : "%"REG_S |
3904 | 820 ); |
821 } | |
822 | |
823 static void mix21toS_SSE (sample_t * samples, sample_t bias) | |
824 { | |
825 asm volatile( | |
826 "movlps %1, %%xmm7 \n\t" | |
827 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 828 "mov $-1024, %%"REG_S" \n\t" |
4233 | 829 ".balign 16\n\t" |
3904 | 830 "1: \n\t" |
16173 | 831 "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" // surround |
832 "movaps (%0, %%"REG_S"), %%xmm1 \n\t" | |
833 "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" | |
3904 | 834 "addps %%xmm7, %%xmm1 \n\t" |
835 "addps %%xmm7, %%xmm2 \n\t" | |
836 "subps %%xmm0, %%xmm1 \n\t" | |
837 "addps %%xmm0, %%xmm2 \n\t" | |
16173 | 838 "movaps %%xmm1, (%0, %%"REG_S") \n\t" |
839 "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" | |
840 "add $16, %%"REG_S" \n\t" | |
3904 | 841 " jnz 1b \n\t" |
842 :: "r" (samples+256), "m" (bias) | |
16173 | 843 : "%"REG_S |
3904 | 844 ); |
845 } | |
846 | |
847 static void mix31to2_SSE (sample_t * samples, sample_t bias) | |
848 { | |
849 asm volatile( | |
850 "movlps %1, %%xmm7 \n\t" | |
851 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 852 "mov $-1024, %%"REG_S" \n\t" |
4233 | 853 ".balign 16\n\t" |
3904 | 854 "1: \n\t" |
16173 | 855 "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
856 "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" | |
3904 | 857 "addps %%xmm7, %%xmm0 \n\t" // common |
16173 | 858 "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
859 "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" | |
3904 | 860 "addps %%xmm0, %%xmm1 \n\t" |
861 "addps %%xmm0, %%xmm2 \n\t" | |
16173 | 862 "movaps %%xmm1, (%0, %%"REG_S") \n\t" |
863 "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" | |
864 "add $16, %%"REG_S" \n\t" | |
3904 | 865 " jnz 1b \n\t" |
866 :: "r" (samples+256), "m" (bias) | |
16173 | 867 : "%"REG_S |
3904 | 868 ); |
869 } | |
870 | |
871 static void mix31toS_SSE (sample_t * samples, sample_t bias) | |
872 { | |
873 asm volatile( | |
874 "movlps %1, %%xmm7 \n\t" | |
875 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 876 "mov $-1024, %%"REG_S" \n\t" |
4233 | 877 ".balign 16\n\t" |
3904 | 878 "1: \n\t" |
16173 | 879 "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
880 "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround | |
3904 | 881 "addps %%xmm7, %%xmm0 \n\t" // common |
16173 | 882 "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
883 "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" | |
3904 | 884 "addps %%xmm0, %%xmm1 \n\t" |
885 "addps %%xmm0, %%xmm2 \n\t" | |
886 "subps %%xmm3, %%xmm1 \n\t" | |
887 "addps %%xmm3, %%xmm2 \n\t" | |
16173 | 888 "movaps %%xmm1, (%0, %%"REG_S") \n\t" |
889 "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" | |
890 "add $16, %%"REG_S" \n\t" | |
3904 | 891 " jnz 1b \n\t" |
892 :: "r" (samples+256), "m" (bias) | |
16173 | 893 : "%"REG_S |
3904 | 894 ); |
895 } | |
896 | |
897 static void mix22toS_SSE (sample_t * samples, sample_t bias) | |
898 { | |
899 asm volatile( | |
900 "movlps %1, %%xmm7 \n\t" | |
901 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 902 "mov $-1024, %%"REG_S" \n\t" |
4233 | 903 ".balign 16\n\t" |
3904 | 904 "1: \n\t" |
16173 | 905 "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" |
906 "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround | |
907 "movaps (%0, %%"REG_S"), %%xmm1 \n\t" | |
908 "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" | |
3904 | 909 "addps %%xmm7, %%xmm1 \n\t" |
910 "addps %%xmm7, %%xmm2 \n\t" | |
911 "subps %%xmm0, %%xmm1 \n\t" | |
912 "addps %%xmm0, %%xmm2 \n\t" | |
16173 | 913 "movaps %%xmm1, (%0, %%"REG_S") \n\t" |
914 "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" | |
915 "add $16, %%"REG_S" \n\t" | |
3904 | 916 " jnz 1b \n\t" |
917 :: "r" (samples+256), "m" (bias) | |
16173 | 918 : "%"REG_S |
3904 | 919 ); |
920 } | |
921 | |
922 static void mix32to2_SSE (sample_t * samples, sample_t bias) | |
923 { | |
924 asm volatile( | |
925 "movlps %1, %%xmm7 \n\t" | |
926 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 927 "mov $-1024, %%"REG_S" \n\t" |
4233 | 928 ".balign 16\n\t" |
3904 | 929 "1: \n\t" |
16173 | 930 "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
3904 | 931 "addps %%xmm7, %%xmm0 \n\t" // common |
932 "movaps %%xmm0, %%xmm1 \n\t" // common | |
16173 | 933 "addps (%0, %%"REG_S"), %%xmm0 \n\t" |
934 "addps 2048(%0, %%"REG_S"), %%xmm1\n\t" | |
935 "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" | |
936 "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" | |
937 "movaps %%xmm0, (%0, %%"REG_S") \n\t" | |
938 "movaps %%xmm1, 1024(%0, %%"REG_S")\n\t" | |
939 "add $16, %%"REG_S" \n\t" | |
3904 | 940 " jnz 1b \n\t" |
941 :: "r" (samples+256), "m" (bias) | |
16173 | 942 : "%"REG_S |
3904 | 943 ); |
944 } | |
945 | |
946 static void mix32toS_SSE (sample_t * samples, sample_t bias) | |
947 { | |
948 asm volatile( | |
949 "movlps %1, %%xmm7 \n\t" | |
950 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 951 "mov $-1024, %%"REG_S" \n\t" |
4233 | 952 ".balign 16\n\t" |
3904 | 953 "1: \n\t" |
16173 | 954 "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" |
955 "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t" | |
3904 | 956 "addps %%xmm7, %%xmm0 \n\t" // common |
16173 | 957 "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround |
958 "movaps (%0, %%"REG_S"), %%xmm1 \n\t" | |
959 "movaps 2048(%0, %%"REG_S"), %%xmm3\n\t" | |
3904 | 960 "subps %%xmm2, %%xmm1 \n\t" |
961 "addps %%xmm2, %%xmm3 \n\t" | |
962 "addps %%xmm0, %%xmm1 \n\t" | |
963 "addps %%xmm0, %%xmm3 \n\t" | |
16173 | 964 "movaps %%xmm1, (%0, %%"REG_S") \n\t" |
965 "movaps %%xmm3, 1024(%0, %%"REG_S")\n\t" | |
966 "add $16, %%"REG_S" \n\t" | |
3904 | 967 " jnz 1b \n\t" |
968 :: "r" (samples+256), "m" (bias) | |
16173 | 969 : "%"REG_S |
3904 | 970 ); |
971 } | |
972 | |
973 static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias) | |
974 { | |
975 asm volatile( | |
976 "movlps %2, %%xmm7 \n\t" | |
977 "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
16173 | 978 "mov $-1024, %%"REG_S" \n\t" |
4233 | 979 ".balign 16\n\t" |
3904 | 980 "1: \n\t" |
16173 | 981 "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
982 "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" | |
983 "addps 1024(%0, %%"REG_S"), %%xmm0\n\t" | |
984 "addps 1040(%0, %%"REG_S"), %%xmm1\n\t" | |
3904 | 985 "addps %%xmm7, %%xmm0 \n\t" |
986 "addps %%xmm7, %%xmm1 \n\t" | |
16173 | 987 "movaps %%xmm0, (%1, %%"REG_S") \n\t" |
988 "movaps %%xmm1, 16(%1, %%"REG_S")\n\t" | |
989 "add $32, %%"REG_S" \n\t" | |
3904 | 990 " jnz 1b \n\t" |
991 :: "r" (src+256), "r" (dest+256), "m" (bias) | |
16173 | 992 : "%"REG_S |
3904 | 993 ); |
994 } | |
995 | |
996 static void zero_MMX(sample_t * samples) | |
997 { | |
998 asm volatile( | |
16173 | 999 "mov $-1024, %%"REG_S" \n\t" |
3904 | 1000 "pxor %%mm0, %%mm0 \n\t" |
4233 | 1001 ".balign 16\n\t" |
3904 | 1002 "1: \n\t" |
16173 | 1003 "movq %%mm0, (%0, %%"REG_S") \n\t" |
1004 "movq %%mm0, 8(%0, %%"REG_S") \n\t" | |
1005 "movq %%mm0, 16(%0, %%"REG_S") \n\t" | |
1006 "movq %%mm0, 24(%0, %%"REG_S") \n\t" | |
1007 "add $32, %%"REG_S" \n\t" | |
3904 | 1008 " jnz 1b \n\t" |
1009 "emms" | |
1010 :: "r" (samples+256) | |
16173 | 1011 : "%"REG_S |
3904 | 1012 ); |
1013 } | |
1014 | |
/*
 * Copy size bytes from src to dest using the MMX registers.
 *
 * Whole 64-byte chunks go through mm0-mm7; a tail shorter than 64 bytes is
 * copied with memcpy(). movq has no alignment requirement, but 8-byte
 * aligned buffers are preferable for speed. The regions must not overlap.
 * emms is executed before returning so the x87 FPU is usable again.
 *
 * Note: currently unused in this file.
 */
static void copy_MMX(void *dest,const void *src,unsigned size)
{
    const unsigned char *in = src;
    unsigned char *out = dest;
    unsigned chunks = size / 64;
    unsigned tail = size % 64;
    unsigned i;

    for (i = 0; i < chunks; i++) {
        /* The pointers are passed in registers so the loads and stores hit
           the buffers themselves, not the pointer variables. */
        __asm __volatile(
            "movq   (%0), %%mm0\n\t"
            "movq  8(%0), %%mm1\n\t"
            "movq 16(%0), %%mm2\n\t"
            "movq 24(%0), %%mm3\n\t"
            "movq 32(%0), %%mm4\n\t"
            "movq 40(%0), %%mm5\n\t"
            "movq 48(%0), %%mm6\n\t"
            "movq 56(%0), %%mm7\n\t"
            "movq %%mm0,   (%1)\n\t"
            "movq %%mm1,  8(%1)\n\t"
            "movq %%mm2, 16(%1)\n\t"
            "movq %%mm3, 24(%1)\n\t"
            "movq %%mm4, 32(%1)\n\t"
            "movq %%mm5, 40(%1)\n\t"
            "movq %%mm6, 48(%1)\n\t"
            "movq %%mm7, 56(%1)\n\t"
            :
            : "r" (in), "r" (out)
            : "memory");
        in += 64;
        out += 64;
    }

    if (tail)
        memcpy (out, in, tail);

    __asm __volatile("emms":::"memory");
}
3904 | 1047 |
1048 static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, | |
1049 sample_t clev, sample_t slev) | |
1050 { | |
1051 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
1052 | |
1053 case CONVERT (A52_CHANNEL, A52_CHANNEL2): | |
1054 memcpy (samples, samples + 256, 256 * sizeof (sample_t)); | |
1055 break; | |
1056 | |
1057 case CONVERT (A52_CHANNEL, A52_MONO): | |
1058 case CONVERT (A52_STEREO, A52_MONO): | |
1059 mix_2to1_SSE: | |
1060 mix2to1_SSE (samples, samples + 256, bias); | |
1061 break; | |
1062 | |
1063 case CONVERT (A52_2F1R, A52_MONO): | |
1064 if (slev == 0) | |
1065 goto mix_2to1_SSE; | |
1066 case CONVERT (A52_3F, A52_MONO): | |
1067 mix_3to1_SSE: | |
1068 mix3to1_SSE (samples, bias); | |
1069 break; | |
1070 | |
1071 case CONVERT (A52_3F1R, A52_MONO): | |
1072 if (slev == 0) | |
1073 goto mix_3to1_SSE; | |
1074 case CONVERT (A52_2F2R, A52_MONO): | |
1075 if (slev == 0) | |
1076 goto mix_2to1_SSE; | |
1077 mix4to1_SSE (samples, bias); | |
1078 break; | |
1079 | |
1080 case CONVERT (A52_3F2R, A52_MONO): | |
1081 if (slev == 0) | |
1082 goto mix_3to1_SSE; | |
1083 mix5to1_SSE (samples, bias); | |
1084 break; | |
1085 | |
1086 case CONVERT (A52_MONO, A52_DOLBY): | |
1087 memcpy (samples + 256, samples, 256 * sizeof (sample_t)); | |
1088 break; | |
1089 | |
1090 case CONVERT (A52_3F, A52_STEREO): | |
1091 case CONVERT (A52_3F, A52_DOLBY): | |
1092 mix_3to2_SSE: | |
1093 mix3to2_SSE (samples, bias); | |
1094 break; | |
1095 | |
1096 case CONVERT (A52_2F1R, A52_STEREO): | |
1097 if (slev == 0) | |
1098 break; | |
1099 mix21to2_SSE (samples, samples + 256, bias); | |
1100 break; | |
1101 | |
1102 case CONVERT (A52_2F1R, A52_DOLBY): | |
1103 mix21toS_SSE (samples, bias); | |
1104 break; | |
1105 | |
1106 case CONVERT (A52_3F1R, A52_STEREO): | |
1107 if (slev == 0) | |
1108 goto mix_3to2_SSE; | |
1109 mix31to2_SSE (samples, bias); | |
1110 break; | |
1111 | |
1112 case CONVERT (A52_3F1R, A52_DOLBY): | |
1113 mix31toS_SSE (samples, bias); | |
1114 break; | |
1115 | |
1116 case CONVERT (A52_2F2R, A52_STEREO): | |
1117 if (slev == 0) | |
1118 break; | |
1119 mix2to1_SSE (samples, samples + 512, bias); | |
1120 mix2to1_SSE (samples + 256, samples + 768, bias); | |
1121 break; | |
1122 | |
1123 case CONVERT (A52_2F2R, A52_DOLBY): | |
1124 mix22toS_SSE (samples, bias); | |
1125 break; | |
1126 | |
1127 case CONVERT (A52_3F2R, A52_STEREO): | |
1128 if (slev == 0) | |
1129 goto mix_3to2_SSE; | |
1130 mix32to2_SSE (samples, bias); | |
1131 break; | |
1132 | |
1133 case CONVERT (A52_3F2R, A52_DOLBY): | |
1134 mix32toS_SSE (samples, bias); | |
1135 break; | |
1136 | |
1137 case CONVERT (A52_3F1R, A52_3F): | |
1138 if (slev == 0) | |
1139 break; | |
1140 mix21to2_SSE (samples, samples + 512, bias); | |
1141 break; | |
1142 | |
1143 case CONVERT (A52_3F2R, A52_3F): | |
1144 if (slev == 0) | |
1145 break; | |
1146 mix2to1_SSE (samples, samples + 768, bias); | |
1147 mix2to1_SSE (samples + 512, samples + 1024, bias); | |
1148 break; | |
1149 | |
1150 case CONVERT (A52_3F1R, A52_2F1R): | |
1151 mix3to2_SSE (samples, bias); | |
1152 memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
1153 break; | |
1154 | |
1155 case CONVERT (A52_2F2R, A52_2F1R): | |
1156 mix2to1_SSE (samples + 512, samples + 768, bias); | |
1157 break; | |
1158 | |
1159 case CONVERT (A52_3F2R, A52_2F1R): | |
1160 mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesnt seem to be used) | |
1161 move2to1_SSE (samples + 768, samples + 512, bias); | |
1162 break; | |
1163 | |
1164 case CONVERT (A52_3F2R, A52_3F1R): | |
1165 mix2to1_SSE (samples + 768, samples + 1024, bias); | |
1166 break; | |
1167 | |
1168 case CONVERT (A52_2F1R, A52_2F2R): | |
1169 memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); | |
1170 break; | |
1171 | |
1172 case CONVERT (A52_3F1R, A52_2F2R): | |
1173 mix3to2_SSE (samples, bias); | |
1174 memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
1175 break; | |
1176 | |
1177 case CONVERT (A52_3F2R, A52_2F2R): | |
1178 mix3to2_SSE (samples, bias); | |
1179 memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
1180 memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); | |
1181 break; | |
1182 | |
1183 case CONVERT (A52_3F1R, A52_3F2R): | |
12137 | 1184 memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); |
3904 | 1185 break; |
1186 } | |
1187 } | |
1188 | |
1189 static void upmix_MMX (sample_t * samples, int acmod, int output) | |
1190 { | |
1191 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
1192 | |
1193 case CONVERT (A52_CHANNEL, A52_CHANNEL2): | |
1194 memcpy (samples + 256, samples, 256 * sizeof (sample_t)); | |
1195 break; | |
1196 | |
1197 case CONVERT (A52_3F2R, A52_MONO): | |
1198 zero_MMX (samples + 1024); | |
1199 case CONVERT (A52_3F1R, A52_MONO): | |
1200 case CONVERT (A52_2F2R, A52_MONO): | |
1201 zero_MMX (samples + 768); | |
1202 case CONVERT (A52_3F, A52_MONO): | |
1203 case CONVERT (A52_2F1R, A52_MONO): | |
1204 zero_MMX (samples + 512); | |
1205 case CONVERT (A52_CHANNEL, A52_MONO): | |
1206 case CONVERT (A52_STEREO, A52_MONO): | |
1207 zero_MMX (samples + 256); | |
1208 break; | |
1209 | |
1210 case CONVERT (A52_3F2R, A52_STEREO): | |
1211 case CONVERT (A52_3F2R, A52_DOLBY): | |
1212 zero_MMX (samples + 1024); | |
1213 case CONVERT (A52_3F1R, A52_STEREO): | |
1214 case CONVERT (A52_3F1R, A52_DOLBY): | |
1215 zero_MMX (samples + 768); | |
1216 case CONVERT (A52_3F, A52_STEREO): | |
1217 case CONVERT (A52_3F, A52_DOLBY): | |
1218 mix_3to2_MMX: | |
1219 memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t)); | |
1220 zero_MMX (samples + 256); | |
1221 break; | |
1222 | |
1223 case CONVERT (A52_2F2R, A52_STEREO): | |
1224 case CONVERT (A52_2F2R, A52_DOLBY): | |
1225 zero_MMX (samples + 768); | |
1226 case CONVERT (A52_2F1R, A52_STEREO): | |
1227 case CONVERT (A52_2F1R, A52_DOLBY): | |
1228 zero_MMX (samples + 512); | |
1229 break; | |
1230 | |
1231 case CONVERT (A52_3F2R, A52_3F): | |
1232 zero_MMX (samples + 1024); | |
1233 case CONVERT (A52_3F1R, A52_3F): | |
1234 case CONVERT (A52_2F2R, A52_2F1R): | |
1235 zero_MMX (samples + 768); | |
1236 break; | |
1237 | |
1238 case CONVERT (A52_3F2R, A52_3F1R): | |
1239 zero_MMX (samples + 1024); | |
1240 break; | |
1241 | |
1242 case CONVERT (A52_3F2R, A52_2F1R): | |
1243 zero_MMX (samples + 1024); | |
1244 case CONVERT (A52_3F1R, A52_2F1R): | |
1245 mix_31to21_MMX: | |
1246 memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); | |
1247 goto mix_3to2_MMX; | |
1248 | |
1249 case CONVERT (A52_3F2R, A52_2F2R): | |
1250 memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); | |
1251 goto mix_31to21_MMX; | |
1252 } | |
1253 } | |
4233 | 1254 |
1255 static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias) | |
1256 { | |
1257 asm volatile( | |
1258 "movd %2, %%mm7 \n\t" | |
1259 "punpckldq %2, %%mm7 \n\t" | |
16173 | 1260 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1261 ".balign 16\n\t" |
1262 "1: \n\t" | |
16173 | 1263 "movq (%0, %%"REG_S"), %%mm0 \n\t" |
1264 "movq 8(%0, %%"REG_S"), %%mm1 \n\t" | |
1265 "movq 16(%0, %%"REG_S"), %%mm2 \n\t" | |
1266 "movq 24(%0, %%"REG_S"), %%mm3 \n\t" | |
1267 "pfadd (%1, %%"REG_S"), %%mm0 \n\t" | |
1268 "pfadd 8(%1, %%"REG_S"), %%mm1 \n\t" | |
1269 "pfadd 16(%1, %%"REG_S"), %%mm2 \n\t" | |
1270 "pfadd 24(%1, %%"REG_S"), %%mm3 \n\t" | |
4233 | 1271 "pfadd %%mm7, %%mm0 \n\t" |
1272 "pfadd %%mm7, %%mm1 \n\t" | |
1273 "pfadd %%mm7, %%mm2 \n\t" | |
1274 "pfadd %%mm7, %%mm3 \n\t" | |
16173 | 1275 "movq %%mm0, (%1, %%"REG_S") \n\t" |
1276 "movq %%mm1, 8(%1, %%"REG_S") \n\t" | |
1277 "movq %%mm2, 16(%1, %%"REG_S") \n\t" | |
1278 "movq %%mm3, 24(%1, %%"REG_S") \n\t" | |
1279 "add $32, %%"REG_S" \n\t" | |
4233 | 1280 " jnz 1b \n\t" |
1281 :: "r" (src+256), "r" (dest+256), "m" (bias) | |
16173 | 1282 : "%"REG_S |
4233 | 1283 ); |
1284 } | |
1285 | |
1286 static void mix3to1_3dnow (sample_t * samples, sample_t bias) | |
1287 { | |
1288 asm volatile( | |
1289 "movd %1, %%mm7 \n\t" | |
1290 "punpckldq %1, %%mm7 \n\t" | |
16173 | 1291 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1292 ".balign 16\n\t" |
1293 "1: \n\t" | |
16173 | 1294 "movq (%0, %%"REG_S"), %%mm0 \n\t" |
1295 "movq 8(%0, %%"REG_S"), %%mm1 \n\t" | |
1296 "movq 1024(%0, %%"REG_S"), %%mm2\n\t" | |
1297 "movq 1032(%0, %%"REG_S"), %%mm3\n\t" | |
1298 "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" | |
1299 "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" | |
4233 | 1300 "pfadd %%mm7, %%mm0 \n\t" |
1301 "pfadd %%mm7, %%mm1 \n\t" | |
1302 "pfadd %%mm2, %%mm0 \n\t" | |
1303 "pfadd %%mm3, %%mm1 \n\t" | |
16173 | 1304 "movq %%mm0, (%0, %%"REG_S") \n\t" |
1305 "movq %%mm1, 8(%0, %%"REG_S") \n\t" | |
1306 "add $16, %%"REG_S" \n\t" | |
4233 | 1307 " jnz 1b \n\t" |
1308 :: "r" (samples+256), "m" (bias) | |
16173 | 1309 : "%"REG_S |
4233 | 1310 ); |
1311 } | |
1312 | |
1313 static void mix4to1_3dnow (sample_t * samples, sample_t bias) | |
1314 { | |
1315 asm volatile( | |
1316 "movd %1, %%mm7 \n\t" | |
1317 "punpckldq %1, %%mm7 \n\t" | |
16173 | 1318 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1319 ".balign 16\n\t" |
1320 "1: \n\t" | |
16173 | 1321 "movq (%0, %%"REG_S"), %%mm0 \n\t" |
1322 "movq 8(%0, %%"REG_S"), %%mm1 \n\t" | |
1323 "movq 1024(%0, %%"REG_S"), %%mm2\n\t" | |
1324 "movq 1032(%0, %%"REG_S"), %%mm3\n\t" | |
1325 "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" | |
1326 "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" | |
1327 "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" | |
1328 "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t" | |
4233 | 1329 "pfadd %%mm7, %%mm0 \n\t" |
1330 "pfadd %%mm7, %%mm1 \n\t" | |
1331 "pfadd %%mm2, %%mm0 \n\t" | |
1332 "pfadd %%mm3, %%mm1 \n\t" | |
16173 | 1333 "movq %%mm0, (%0, %%"REG_S") \n\t" |
1334 "movq %%mm1, 8(%0, %%"REG_S") \n\t" | |
1335 "add $16, %%"REG_S" \n\t" | |
4233 | 1336 " jnz 1b \n\t" |
1337 :: "r" (samples+256), "m" (bias) | |
16173 | 1338 : "%"REG_S |
4233 | 1339 ); |
1340 } | |
1341 | |
1342 static void mix5to1_3dnow (sample_t * samples, sample_t bias) | |
1343 { | |
1344 asm volatile( | |
1345 "movd %1, %%mm7 \n\t" | |
1346 "punpckldq %1, %%mm7 \n\t" | |
16173 | 1347 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1348 ".balign 16\n\t" |
1349 "1: \n\t" | |
16173 | 1350 "movq (%0, %%"REG_S"), %%mm0 \n\t" |
1351 "movq 8(%0, %%"REG_S"), %%mm1 \n\t" | |
1352 "movq 1024(%0, %%"REG_S"), %%mm2\n\t" | |
1353 "movq 1032(%0, %%"REG_S"), %%mm3\n\t" | |
1354 "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" | |
1355 "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" | |
1356 "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" | |
1357 "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t" | |
4233 | 1358 "pfadd %%mm7, %%mm0 \n\t" |
1359 "pfadd %%mm7, %%mm1 \n\t" | |
16173 | 1360 "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t" |
1361 "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t" | |
4233 | 1362 "pfadd %%mm2, %%mm0 \n\t" |
1363 "pfadd %%mm3, %%mm1 \n\t" | |
16173 | 1364 "movq %%mm0, (%0, %%"REG_S") \n\t" |
1365 "movq %%mm1, 8(%0, %%"REG_S") \n\t" | |
1366 "add $16, %%"REG_S" \n\t" | |
4233 | 1367 " jnz 1b \n\t" |
1368 :: "r" (samples+256), "m" (bias) | |
16173 | 1369 : "%"REG_S |
4233 | 1370 ); |
1371 } | |
1372 | |
1373 static void mix3to2_3dnow (sample_t * samples, sample_t bias) | |
1374 { | |
1375 asm volatile( | |
1376 "movd %1, %%mm7 \n\t" | |
1377 "punpckldq %1, %%mm7 \n\t" | |
16173 | 1378 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1379 ".balign 16\n\t" |
1380 "1: \n\t" | |
16173 | 1381 "movq 1024(%0, %%"REG_S"), %%mm0\n\t" |
1382 "movq 1032(%0, %%"REG_S"), %%mm1\n\t" | |
4233 | 1383 "pfadd %%mm7, %%mm0 \n\t" //common |
1384 "pfadd %%mm7, %%mm1 \n\t" //common | |
16173 | 1385 "movq (%0, %%"REG_S"), %%mm2 \n\t" |
1386 "movq 8(%0, %%"REG_S"), %%mm3 \n\t" | |
1387 "movq 2048(%0, %%"REG_S"), %%mm4\n\t" | |
1388 "movq 2056(%0, %%"REG_S"), %%mm5\n\t" | |
4233 | 1389 "pfadd %%mm0, %%mm2 \n\t" |
5912 | 1390 "pfadd %%mm1, %%mm3 \n\t" |
4233 | 1391 "pfadd %%mm0, %%mm4 \n\t" |
5912 | 1392 "pfadd %%mm1, %%mm5 \n\t" |
16173 | 1393 "movq %%mm2, (%0, %%"REG_S") \n\t" |
1394 "movq %%mm3, 8(%0, %%"REG_S") \n\t" | |
1395 "movq %%mm4, 1024(%0, %%"REG_S")\n\t" | |
1396 "movq %%mm5, 1032(%0, %%"REG_S")\n\t" | |
1397 "add $16, %%"REG_S" \n\t" | |
4233 | 1398 " jnz 1b \n\t" |
1399 :: "r" (samples+256), "m" (bias) | |
16173 | 1400 : "%"REG_S |
4233 | 1401 ); |
1402 } | |
1403 | |
1404 static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias) | |
1405 { | |
1406 asm volatile( | |
1407 "movd %2, %%mm7 \n\t" | |
1408 "punpckldq %2, %%mm7 \n\t" | |
16173 | 1409 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1410 ".balign 16\n\t" |
1411 "1: \n\t" | |
16173 | 1412 "movq 1024(%1, %%"REG_S"), %%mm0\n\t" |
1413 "movq 1032(%1, %%"REG_S"), %%mm1\n\t" | |
4233 | 1414 "pfadd %%mm7, %%mm0 \n\t" //common |
1415 "pfadd %%mm7, %%mm1 \n\t" //common | |
16173 | 1416 "movq (%0, %%"REG_S"), %%mm2 \n\t" |
1417 "movq 8(%0, %%"REG_S"), %%mm3 \n\t" | |
1418 "movq (%1, %%"REG_S"), %%mm4 \n\t" | |
1419 "movq 8(%1, %%"REG_S"), %%mm5 \n\t" | |
4233 | 1420 "pfadd %%mm0, %%mm2 \n\t" |
1421 "pfadd %%mm1, %%mm3 \n\t" | |
1422 "pfadd %%mm0, %%mm4 \n\t" | |
1423 "pfadd %%mm1, %%mm5 \n\t" | |
16173 | 1424 "movq %%mm2, (%0, %%"REG_S") \n\t" |
1425 "movq %%mm3, 8(%0, %%"REG_S") \n\t" | |
1426 "movq %%mm4, (%1, %%"REG_S") \n\t" | |
1427 "movq %%mm5, 8(%1, %%"REG_S") \n\t" | |
1428 "add $16, %%"REG_S" \n\t" | |
4233 | 1429 " jnz 1b \n\t" |
1430 :: "r" (left+256), "r" (right+256), "m" (bias) | |
16173 | 1431 : "%"REG_S |
4233 | 1432 ); |
1433 } | |
1434 | |
1435 static void mix21toS_3dnow (sample_t * samples, sample_t bias) | |
1436 { | |
1437 asm volatile( | |
1438 "movd %1, %%mm7 \n\t" | |
1439 "punpckldq %1, %%mm7 \n\t" | |
16173 | 1440 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1441 ".balign 16\n\t" |
1442 "1: \n\t" | |
16173 | 1443 "movq 2048(%0, %%"REG_S"), %%mm0\n\t" // surround |
1444 "movq 2056(%0, %%"REG_S"), %%mm1\n\t" // surround | |
1445 "movq (%0, %%"REG_S"), %%mm2 \n\t" | |
1446 "movq 8(%0, %%"REG_S"), %%mm3 \n\t" | |
1447 "movq 1024(%0, %%"REG_S"), %%mm4\n\t" | |
1448 "movq 1032(%0, %%"REG_S"), %%mm5\n\t" | |
4233 | 1449 "pfadd %%mm7, %%mm2 \n\t" |
1450 "pfadd %%mm7, %%mm3 \n\t" | |
1451 "pfadd %%mm7, %%mm4 \n\t" | |
1452 "pfadd %%mm7, %%mm5 \n\t" | |
1453 "pfsub %%mm0, %%mm2 \n\t" | |
1454 "pfsub %%mm1, %%mm3 \n\t" | |
1455 "pfadd %%mm0, %%mm4 \n\t" | |
1456 "pfadd %%mm1, %%mm5 \n\t" | |
16173 | 1457 "movq %%mm2, (%0, %%"REG_S") \n\t" |
1458 "movq %%mm3, 8(%0, %%"REG_S") \n\t" | |
1459 "movq %%mm4, 1024(%0, %%"REG_S")\n\t" | |
1460 "movq %%mm5, 1032(%0, %%"REG_S")\n\t" | |
1461 "add $16, %%"REG_S" \n\t" | |
4233 | 1462 " jnz 1b \n\t" |
1463 :: "r" (samples+256), "m" (bias) | |
16173 | 1464 : "%"REG_S |
4233 | 1465 ); |
1466 } | |
1467 | |
1468 static void mix31to2_3dnow (sample_t * samples, sample_t bias) | |
1469 { | |
1470 asm volatile( | |
1471 "movd %1, %%mm7 \n\t" | |
1472 "punpckldq %1, %%mm7 \n\t" | |
16173 | 1473 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1474 ".balign 16\n\t" |
1475 "1: \n\t" | |
16173 | 1476 "movq 1024(%0, %%"REG_S"), %%mm0\n\t" |
1477 "movq 1032(%0, %%"REG_S"), %%mm1\n\t" | |
1478 "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" | |
1479 "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" | |
4233 | 1480 "pfadd %%mm7, %%mm0 \n\t" // common |
1481 "pfadd %%mm7, %%mm1 \n\t" // common | |
16173 | 1482 "movq (%0, %%"REG_S"), %%mm2 \n\t" |
1483 "movq 8(%0, %%"REG_S"), %%mm3 \n\t" | |
1484 "movq 2048(%0, %%"REG_S"), %%mm4\n\t" | |
1485 "movq 2056(%0, %%"REG_S"), %%mm5\n\t" | |
4233 | 1486 "pfadd %%mm0, %%mm2 \n\t" |
1487 "pfadd %%mm1, %%mm3 \n\t" | |
1488 "pfadd %%mm0, %%mm4 \n\t" | |
1489 "pfadd %%mm1, %%mm5 \n\t" | |
16173 | 1490 "movq %%mm2, (%0, %%"REG_S") \n\t" |
1491 "movq %%mm3, 8(%0, %%"REG_S") \n\t" | |
1492 "movq %%mm4, 1024(%0, %%"REG_S")\n\t" | |
1493 "movq %%mm5, 1032(%0, %%"REG_S")\n\t" | |
1494 "add $16, %%"REG_S" \n\t" | |
4233 | 1495 " jnz 1b \n\t" |
1496 :: "r" (samples+256), "m" (bias) | |
16173 | 1497 : "%"REG_S |
4233 | 1498 ); |
1499 } | |
1500 | |
1501 static void mix31toS_3dnow (sample_t * samples, sample_t bias) | |
1502 { | |
1503 asm volatile( | |
1504 "movd %1, %%mm7 \n\t" | |
1505 "punpckldq %1, %%mm7 \n\t" | |
16173 | 1506 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1507 ".balign 16\n\t" |
1508 "1: \n\t" | |
16173 | 1509 "movq 1024(%0, %%"REG_S"), %%mm0\n\t" |
1510 "movq 1032(%0, %%"REG_S"), %%mm1\n\t" | |
4233 | 1511 "pfadd %%mm7, %%mm0 \n\t" // common |
1512 "pfadd %%mm7, %%mm1 \n\t" // common | |
16173 | 1513 "movq (%0, %%"REG_S"), %%mm2 \n\t" |
1514 "movq 8(%0, %%"REG_S"), %%mm3 \n\t" | |
1515 "movq 2048(%0, %%"REG_S"), %%mm4\n\t" | |
1516 "movq 2056(%0, %%"REG_S"), %%mm5\n\t" | |
4233 | 1517 "pfadd %%mm0, %%mm2 \n\t" |
1518 "pfadd %%mm1, %%mm3 \n\t" | |
1519 "pfadd %%mm0, %%mm4 \n\t" | |
1520 "pfadd %%mm1, %%mm5 \n\t" | |
16173 | 1521 "movq 3072(%0, %%"REG_S"), %%mm0\n\t" // surround |
1522 "movq 3080(%0, %%"REG_S"), %%mm1\n\t" // surround | |
4233 | 1523 "pfsub %%mm0, %%mm2 \n\t" |
1524 "pfsub %%mm1, %%mm3 \n\t" | |
1525 "pfadd %%mm0, %%mm4 \n\t" | |
1526 "pfadd %%mm1, %%mm5 \n\t" | |
16173 | 1527 "movq %%mm2, (%0, %%"REG_S") \n\t" |
1528 "movq %%mm3, 8(%0, %%"REG_S") \n\t" | |
1529 "movq %%mm4, 1024(%0, %%"REG_S")\n\t" | |
1530 "movq %%mm5, 1032(%0, %%"REG_S")\n\t" | |
1531 "add $16, %%"REG_S" \n\t" | |
4233 | 1532 " jnz 1b \n\t" |
1533 :: "r" (samples+256), "m" (bias) | |
16173 | 1534 : "%"REG_S |
4233 | 1535 ); |
1536 } | |
1537 | |
1538 static void mix22toS_3dnow (sample_t * samples, sample_t bias) | |
1539 { | |
1540 asm volatile( | |
1541 "movd %1, %%mm7 \n\t" | |
1542 "punpckldq %1, %%mm7 \n\t" | |
16173 | 1543 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1544 ".balign 16\n\t" |
1545 "1: \n\t" | |
16173 | 1546 "movq 2048(%0, %%"REG_S"), %%mm0\n\t" |
1547 "movq 2056(%0, %%"REG_S"), %%mm1\n\t" | |
1548 "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround | |
1549 "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround | |
1550 "movq (%0, %%"REG_S"), %%mm2 \n\t" | |
1551 "movq 8(%0, %%"REG_S"), %%mm3 \n\t" | |
1552 "movq 1024(%0, %%"REG_S"), %%mm4\n\t" | |
1553 "movq 1032(%0, %%"REG_S"), %%mm5\n\t" | |
4233 | 1554 "pfadd %%mm7, %%mm2 \n\t" |
1555 "pfadd %%mm7, %%mm3 \n\t" | |
1556 "pfadd %%mm7, %%mm4 \n\t" | |
1557 "pfadd %%mm7, %%mm5 \n\t" | |
1558 "pfsub %%mm0, %%mm2 \n\t" | |
1559 "pfsub %%mm1, %%mm3 \n\t" | |
1560 "pfadd %%mm0, %%mm4 \n\t" | |
1561 "pfadd %%mm1, %%mm5 \n\t" | |
16173 | 1562 "movq %%mm2, (%0, %%"REG_S") \n\t" |
1563 "movq %%mm3, 8(%0, %%"REG_S") \n\t" | |
1564 "movq %%mm4, 1024(%0, %%"REG_S")\n\t" | |
1565 "movq %%mm5, 1032(%0, %%"REG_S")\n\t" | |
1566 "add $16, %%"REG_S" \n\t" | |
4233 | 1567 " jnz 1b \n\t" |
1568 :: "r" (samples+256), "m" (bias) | |
16173 | 1569 : "%"REG_S |
4233 | 1570 ); |
1571 } | |
1572 | |
1573 static void mix32to2_3dnow (sample_t * samples, sample_t bias) | |
1574 { | |
1575 asm volatile( | |
1576 "movd %1, %%mm7 \n\t" | |
1577 "punpckldq %1, %%mm7 \n\t" | |
16173 | 1578 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1579 ".balign 16\n\t" |
1580 "1: \n\t" | |
16173 | 1581 "movq 1024(%0, %%"REG_S"), %%mm0\n\t" |
1582 "movq 1032(%0, %%"REG_S"), %%mm1\n\t" | |
4233 | 1583 "pfadd %%mm7, %%mm0 \n\t" // common |
1584 "pfadd %%mm7, %%mm1 \n\t" // common | |
1585 "movq %%mm0, %%mm2 \n\t" // common | |
1586 "movq %%mm1, %%mm3 \n\t" // common | |
16173 | 1587 "pfadd (%0, %%"REG_S"), %%mm0 \n\t" |
1588 "pfadd 8(%0, %%"REG_S"), %%mm1 \n\t" | |
1589 "pfadd 2048(%0, %%"REG_S"), %%mm2\n\t" | |
1590 "pfadd 2056(%0, %%"REG_S"), %%mm3\n\t" | |
1591 "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" | |
1592 "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" | |
1593 "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t" | |
1594 "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t" | |
1595 "movq %%mm0, (%0, %%"REG_S") \n\t" | |
1596 "movq %%mm1, 8(%0, %%"REG_S") \n\t" | |
1597 "movq %%mm2, 1024(%0, %%"REG_S")\n\t" | |
1598 "movq %%mm3, 1032(%0, %%"REG_S")\n\t" | |
1599 "add $16, %%"REG_S" \n\t" | |
4233 | 1600 " jnz 1b \n\t" |
1601 :: "r" (samples+256), "m" (bias) | |
16173 | 1602 : "%"REG_S |
4233 | 1603 ); |
1604 } | |
1605 | |
1606 /* todo: should be optimized better */ | |
1607 static void mix32toS_3dnow (sample_t * samples, sample_t bias) | |
1608 { | |
1609 asm volatile( | |
16173 | 1610 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1611 ".balign 16\n\t" |
1612 "1: \n\t" | |
1613 "movd %1, %%mm7 \n\t" | |
1614 "punpckldq %1, %%mm7 \n\t" | |
16173 | 1615 "movq 1024(%0, %%"REG_S"), %%mm0\n\t" |
1616 "movq 1032(%0, %%"REG_S"), %%mm1\n\t" | |
1617 "movq 3072(%0, %%"REG_S"), %%mm4\n\t" | |
1618 "movq 3080(%0, %%"REG_S"), %%mm5\n\t" | |
4233 | 1619 "pfadd %%mm7, %%mm0 \n\t" // common |
1620 "pfadd %%mm7, %%mm1 \n\t" // common | |
16173 | 1621 "pfadd 4096(%0, %%"REG_S"), %%mm4\n\t" // surround |
1622 "pfadd 4104(%0, %%"REG_S"), %%mm5\n\t" // surround | |
1623 "movq (%0, %%"REG_S"), %%mm2 \n\t" | |
1624 "movq 8(%0, %%"REG_S"), %%mm3 \n\t" | |
1625 "movq 2048(%0, %%"REG_S"), %%mm6\n\t" | |
1626 "movq 2056(%0, %%"REG_S"), %%mm7\n\t" | |
4233 | 1627 "pfsub %%mm4, %%mm2 \n\t" |
1628 "pfsub %%mm5, %%mm3 \n\t" | |
1629 "pfadd %%mm4, %%mm6 \n\t" | |
1630 "pfadd %%mm5, %%mm7 \n\t" | |
1631 "pfadd %%mm0, %%mm2 \n\t" | |
1632 "pfadd %%mm1, %%mm3 \n\t" | |
1633 "pfadd %%mm0, %%mm6 \n\t" | |
1634 "pfadd %%mm1, %%mm7 \n\t" | |
16173 | 1635 "movq %%mm2, (%0, %%"REG_S") \n\t" |
1636 "movq %%mm3, 8(%0, %%"REG_S") \n\t" | |
1637 "movq %%mm6, 1024(%0, %%"REG_S")\n\t" | |
1638 "movq %%mm7, 1032(%0, %%"REG_S")\n\t" | |
1639 "add $16, %%"REG_S" \n\t" | |
4233 | 1640 " jnz 1b \n\t" |
1641 :: "r" (samples+256), "m" (bias) | |
16173 | 1642 : "%"REG_S |
4233 | 1643 ); |
1644 } | |
1645 | |
1646 static void move2to1_3dnow (sample_t * src, sample_t * dest, sample_t bias) | |
1647 { | |
1648 asm volatile( | |
1649 "movd %2, %%mm7 \n\t" | |
1650 "punpckldq %2, %%mm7 \n\t" | |
16173 | 1651 "mov $-1024, %%"REG_S" \n\t" |
4233 | 1652 ".balign 16\n\t" |
1653 "1: \n\t" | |
16173 | 1654 "movq (%0, %%"REG_S"), %%mm0 \n\t" |
1655 "movq 8(%0, %%"REG_S"), %%mm1 \n\t" | |
1656 "movq 16(%0, %%"REG_S"), %%mm2 \n\t" | |
1657 "movq 24(%0, %%"REG_S"), %%mm3 \n\t" | |
1658 "pfadd 1024(%0, %%"REG_S"), %%mm0\n\t" | |
1659 "pfadd 1032(%0, %%"REG_S"), %%mm1\n\t" | |
1660 "pfadd 1040(%0, %%"REG_S"), %%mm2\n\t" | |
1661 "pfadd 1048(%0, %%"REG_S"), %%mm3\n\t" | |
4233 | 1662 "pfadd %%mm7, %%mm0 \n\t" |
1663 "pfadd %%mm7, %%mm1 \n\t" | |
1664 "pfadd %%mm7, %%mm2 \n\t" | |
1665 "pfadd %%mm7, %%mm3 \n\t" | |
16173 | 1666 "movq %%mm0, (%1, %%"REG_S") \n\t" |
1667 "movq %%mm1, 8(%1, %%"REG_S") \n\t" | |
1668 "movq %%mm2, 16(%1, %%"REG_S") \n\t" | |
1669 "movq %%mm3, 24(%1, %%"REG_S") \n\t" | |
1670 "add $32, %%"REG_S" \n\t" | |
4233 | 1671 " jnz 1b \n\t" |
1672 :: "r" (src+256), "r" (dest+256), "m" (bias) | |
16173 | 1673 : "%"REG_S |
4233 | 1674 ); |
1675 } | |
1676 | |
1677 static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias, | |
1678 sample_t clev, sample_t slev) | |
1679 { | |
1680 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
1681 | |
1682 case CONVERT (A52_CHANNEL, A52_CHANNEL2): | |
1683 memcpy (samples, samples + 256, 256 * sizeof (sample_t)); | |
1684 break; | |
1685 | |
1686 case CONVERT (A52_CHANNEL, A52_MONO): | |
1687 case CONVERT (A52_STEREO, A52_MONO): | |
1688 mix_2to1_3dnow: | |
1689 mix2to1_3dnow (samples, samples + 256, bias); | |
1690 break; | |
1691 | |
1692 case CONVERT (A52_2F1R, A52_MONO): | |
1693 if (slev == 0) | |
1694 goto mix_2to1_3dnow; | |
1695 case CONVERT (A52_3F, A52_MONO): | |
1696 mix_3to1_3dnow: | |
1697 mix3to1_3dnow (samples, bias); | |
1698 break; | |
1699 | |
1700 case CONVERT (A52_3F1R, A52_MONO): | |
1701 if (slev == 0) | |
1702 goto mix_3to1_3dnow; | |
1703 case CONVERT (A52_2F2R, A52_MONO): | |
1704 if (slev == 0) | |
1705 goto mix_2to1_3dnow; | |
1706 mix4to1_3dnow (samples, bias); | |
1707 break; | |
1708 | |
1709 case CONVERT (A52_3F2R, A52_MONO): | |
1710 if (slev == 0) | |
1711 goto mix_3to1_3dnow; | |
1712 mix5to1_3dnow (samples, bias); | |
1713 break; | |
1714 | |
1715 case CONVERT (A52_MONO, A52_DOLBY): | |
1716 memcpy (samples + 256, samples, 256 * sizeof (sample_t)); | |
1717 break; | |
1718 | |
1719 case CONVERT (A52_3F, A52_STEREO): | |
1720 case CONVERT (A52_3F, A52_DOLBY): | |
1721 mix_3to2_3dnow: | |
1722 mix3to2_3dnow (samples, bias); | |
1723 break; | |
1724 | |
1725 case CONVERT (A52_2F1R, A52_STEREO): | |
1726 if (slev == 0) | |
1727 break; | |
1728 mix21to2_3dnow (samples, samples + 256, bias); | |
1729 break; | |
1730 | |
1731 case CONVERT (A52_2F1R, A52_DOLBY): | |
1732 mix21toS_3dnow (samples, bias); | |
1733 break; | |
1734 | |
1735 case CONVERT (A52_3F1R, A52_STEREO): | |
1736 if (slev == 0) | |
1737 goto mix_3to2_3dnow; | |
1738 mix31to2_3dnow (samples, bias); | |
1739 break; | |
1740 | |
1741 case CONVERT (A52_3F1R, A52_DOLBY): | |
1742 mix31toS_3dnow (samples, bias); | |
1743 break; | |
1744 | |
1745 case CONVERT (A52_2F2R, A52_STEREO): | |
1746 if (slev == 0) | |
1747 break; | |
1748 mix2to1_3dnow (samples, samples + 512, bias); | |
1749 mix2to1_3dnow (samples + 256, samples + 768, bias); | |
1750 break; | |
1751 | |
1752 case CONVERT (A52_2F2R, A52_DOLBY): | |
1753 mix22toS_3dnow (samples, bias); | |
1754 break; | |
1755 | |
1756 case CONVERT (A52_3F2R, A52_STEREO): | |
1757 if (slev == 0) | |
1758 goto mix_3to2_3dnow; | |
1759 mix32to2_3dnow (samples, bias); | |
1760 break; | |
1761 | |
1762 case CONVERT (A52_3F2R, A52_DOLBY): | |
1763 mix32toS_3dnow (samples, bias); | |
1764 break; | |
1765 | |
1766 case CONVERT (A52_3F1R, A52_3F): | |
1767 if (slev == 0) | |
1768 break; | |
1769 mix21to2_3dnow (samples, samples + 512, bias); | |
1770 break; | |
1771 | |
1772 case CONVERT (A52_3F2R, A52_3F): | |
1773 if (slev == 0) | |
1774 break; | |
1775 mix2to1_3dnow (samples, samples + 768, bias); | |
1776 mix2to1_3dnow (samples + 512, samples + 1024, bias); | |
1777 break; | |
1778 | |
1779 case CONVERT (A52_3F1R, A52_2F1R): | |
1780 mix3to2_3dnow (samples, bias); | |
1781 memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
1782 break; | |
1783 | |
1784 case CONVERT (A52_2F2R, A52_2F1R): | |
1785 mix2to1_3dnow (samples + 512, samples + 768, bias); | |
1786 break; | |
1787 | |
1788 case CONVERT (A52_3F2R, A52_2F1R): | |
1789 mix3to2_3dnow (samples, bias); //FIXME possible bug? (output doesnt seem to be used) | |
1790 move2to1_3dnow (samples + 768, samples + 512, bias); | |
1791 break; | |
1792 | |
1793 case CONVERT (A52_3F2R, A52_3F1R): | |
1794 mix2to1_3dnow (samples + 768, samples + 1024, bias); | |
1795 break; | |
1796 | |
1797 case CONVERT (A52_2F1R, A52_2F2R): | |
1798 memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); | |
1799 break; | |
1800 | |
1801 case CONVERT (A52_3F1R, A52_2F2R): | |
1802 mix3to2_3dnow (samples, bias); | |
1803 memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
1804 break; | |
1805 | |
1806 case CONVERT (A52_3F2R, A52_2F2R): | |
1807 mix3to2_3dnow (samples, bias); | |
1808 memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
1809 memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); | |
1810 break; | |
1811 | |
1812 case CONVERT (A52_3F1R, A52_3F2R): | |
12137 | 1813 memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); |
4233 | 1814 break; |
1815 } | |
1816 __asm __volatile("femms":::"memory"); | |
1817 } | |
1818 | |
16173 | 1819 #endif // ARCH_X86 || ARCH_X86_64 |