mplayer.hg: comparison of liba52/liba52_changes.diff @ 14990:9de84a73f6d0
MPlayer-specific changes to liba52
author:   diego
date:     Tue, 22 Mar 2005 23:25:06 +0000
parents:
children: 4bad7f00556e
comparing 14989:d55bd88c2b42 with 14990:9de84a73f6d0
--- include/a52.h 2005-03-22 19:58:53.000000000 +0100
+++ a52.h 2004-03-19 01:15:49.000000000 +0100
@@ -19,6 +25,9 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

+#ifndef A52_H
+#define A52_H
+
#ifndef LIBA52_DOUBLE
typedef float sample_t;
#else
@@ -113,3 +122,10 @@
void a52_dynrng (a52_state_t * state,
sample_t (* call) (sample_t, void *), void * data);
int a52_block (a52_state_t * state, sample_t * samples);
+
+void* a52_resample_init(uint32_t mm_accel,int flags,int chans);
+extern int (* a52_resample) (float * _f, int16_t * s16);
+
+uint16_t crc16_block(uint8_t *data,uint32_t num_bytes);
+
+#endif /* A52_H */
--- liba52/a52_internal.h 2005-03-22 19:59:35.000000000 +0100
+++ a52_internal.h 2004-03-19 01:15:49.000000000 +0100
@@ -41,11 +43,12 @@

int downmix_init (int input, int flags, sample_t * level,
sample_t clev, sample_t slev);
+void downmix_accel_init(uint32_t mm_accel);
int downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
sample_t clev, sample_t slev);
-void downmix (sample_t * samples, int acmod, int output, sample_t bias,
+extern void (*downmix) (sample_t * samples, int acmod, int output, sample_t bias,
sample_t clev, sample_t slev);
-void upmix (sample_t * samples, int acmod, int output);
+extern void (*upmix) (sample_t * samples, int acmod, int output);

void imdct_init (uint32_t mm_accel);
extern void (* imdct_256) (sample_t * data, sample_t * delay, sample_t bias);
--- liba52/bitstream.c 2005-03-22 19:59:35.000000000 +0100
+++ bitstream.c 2004-03-19 01:15:49.000000000 +0100
@@ -29,7 +35,12 @@

#define BUFFER_SIZE 4096

+#ifdef ALT_BITSTREAM_READER
+int indx=0;
+uint32_t * buffer_start;
+#else
static uint32_t * buffer_start;
+#endif

uint32_t bits_left;
uint32_t current_word;
@@ -41,6 +52,9 @@
align = (int)buf & 3;
buffer_start = (uint32_t *) (buf - align);
bits_left = 0;
+#ifdef ALT_BITSTREAM_READER
+ indx=0;
+#endif
bitstream_get (align * 8);
}

--- liba52/bitstream.h 2005-03-22 19:59:35.000000000 +0100
+++ bitstream.h 2004-03-19 01:15:49.000000000 +0100
@@ -19,6 +25,48 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

+/* code from ffmpeg/libavcodec */
+#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
+# define always_inline __attribute__((always_inline)) inline
+#else
+# define always_inline inline
+#endif
+
+#if defined(__sparc__) || defined(hpux)
+/*
+ * the alt bitstream reader performs unaligned memory accesses; that doesn't work
+ * on sparc/hpux. For now, disable ALT_BITSTREAM_READER.
+ */
+#undef ALT_BITSTREAM_READER
+#else
+// alternative (faster) bitstream reader (reads up to 3 bytes over the end of the input)
+#define ALT_BITSTREAM_READER
+
+/* used to avoid misaligned exceptions on some archs (alpha, ...) */
+#if defined (ARCH_X86) || defined(ARCH_ARMV4L)
+# define unaligned32(a) (*(uint32_t*)(a))
+#else
+# ifdef __GNUC__
+static always_inline uint32_t unaligned32(const void *v) {
+ struct Unaligned {
+ uint32_t i;
+ } __attribute__((packed));
+
+ return ((const struct Unaligned *) v)->i;
+}
+# elif defined(__DECC)
+static inline uint32_t unaligned32(const void *v) {
+ return *(const __unaligned uint32_t *) v;
+}
+# else
+static inline uint32_t unaligned32(const void *v) {
+ return *(const uint32_t *) v;
+}
+# endif
+#endif //!ARCH_X86
+
+#endif
+
/* (stolen from the kernel) */
#ifdef WORDS_BIGENDIAN

@@ -29,7 +77,7 @@
# if defined (__i386__)

# define swab32(x) __i386_swab32(x)
- static inline const uint32_t __i386_swab32(uint32_t x)
+ static always_inline const uint32_t __i386_swab32(uint32_t x)
{
__asm__("bswap %0" : "=r" (x) : "0" (x));
return x;
@@ -37,25 +85,42 @@

# else

-# define swab32(x)\
-((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) | \
- (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3]))
-
+# define swab32(x) __generic_swab32(x)
+ static always_inline const uint32_t __generic_swab32(uint32_t x)
+ {
+ return ((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) |
+ (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3]));
+ }
# endif
#endif

+#ifdef ALT_BITSTREAM_READER
+extern uint32_t *buffer_start;
+extern int indx;
+#else
extern uint32_t bits_left;
extern uint32_t current_word;
+#endif

void bitstream_set_ptr (uint8_t * buf);
uint32_t bitstream_get_bh(uint32_t num_bits);
int32_t bitstream_get_bh_2(uint32_t num_bits);

+
static inline uint32_t
-bitstream_get(uint32_t num_bits)
+bitstream_get(uint32_t num_bits) // note: num_bits is practically a constant due to inlining
{
+#ifdef ALT_BITSTREAM_READER
+ uint32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) );
+
+ result<<= (indx&0x07);
+ result>>= 32 - num_bits;
+ indx+= num_bits;
+
+ return result;
+#else
uint32_t result;
-
+
if(num_bits < bits_left) {
result = (current_word << (32 - bits_left)) >> (32 - num_bits);
bits_left -= num_bits;
@@ -63,11 +128,30 @@
}

return bitstream_get_bh(num_bits);
+#endif
+}
+
+static inline void bitstream_skip(int num_bits)
+{
+#ifdef ALT_BITSTREAM_READER
+ indx+= num_bits;
+#else
+ bitstream_get(num_bits);
+#endif
}

static inline int32_t
bitstream_get_2(uint32_t num_bits)
{
+#ifdef ALT_BITSTREAM_READER
+ int32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) );
+
+ result<<= (indx&0x07);
+ result>>= 32 - num_bits;
+ indx+= num_bits;
+
+ return result;
+#else
int32_t result;

if(num_bits < bits_left) {
@@ -77,4 +161,5 @@
}

return bitstream_get_bh_2(num_bits);
+#endif
}
--- liba52/downmix.c 2005-03-22 19:59:35.000000000 +0100
+++ downmix.c 2004-04-12 18:42:14.000000000 +0200
@@ -17,18 +23,46 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
*/

#include "config.h"

-#include <inttypes.h>
#include <string.h>
+#include <inttypes.h>

#include "a52.h"
#include "a52_internal.h"
+#include "mm_accel.h"

#define CONVERT(acmod,output) (((output) << 3) + (acmod))

+
+void (*downmix)(sample_t * samples, int acmod, int output, sample_t bias,
+ sample_t clev, sample_t slev)= NULL;
+void (*upmix)(sample_t * samples, int acmod, int output)= NULL;
+
+static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
+ sample_t clev, sample_t slev);
+static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias,
+ sample_t clev, sample_t slev);
+static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
+ sample_t clev, sample_t slev);
+static void upmix_MMX (sample_t * samples, int acmod, int output);
+static void upmix_C (sample_t * samples, int acmod, int output);
+
+void downmix_accel_init(uint32_t mm_accel)
+{
+ upmix= upmix_C;
+ downmix= downmix_C;
+#ifdef ARCH_X86
+ if(mm_accel & MM_ACCEL_X86_MMX) upmix= upmix_MMX;
+ if(mm_accel & MM_ACCEL_X86_SSE) downmix= downmix_SSE;
+ if(mm_accel & MM_ACCEL_X86_3DNOW) downmix= downmix_3dnow;
+#endif
+}
+
int downmix_init (int input, int flags, sample_t * level,
sample_t clev, sample_t slev)
{
@@ -61,7 +95,7 @@
output = flags & A52_CHANNEL_MASK;
if (output > A52_DOLBY)
return -1;
-
+
output = table[output][input & 7];

if ((output == A52_STEREO) &&
@@ -145,7 +179,6 @@
*level *= 1 / (1 + 3 * LEVEL_3DB);
break;
}
-
return output;
}

@@ -440,12 +473,11 @@
static void zero (sample_t * samples)
{
int i;
-
for (i = 0; i < 256; i++)
samples[i] = 0;
}

-void downmix (sample_t * samples, int acmod, int output, sample_t bias,
+static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
sample_t clev, sample_t slev)
{
switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
@@ -557,7 +589,7 @@
break;

case CONVERT (A52_3F2R, A52_2F1R):
- mix3to2 (samples, bias);
+ mix3to2 (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
move2to1 (samples + 768, samples + 512, bias);
break;

@@ -581,12 +613,12 @@
break;

case CONVERT (A52_3F1R, A52_3F2R):
- memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t));
+ memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
break;
}
}

-void upmix (sample_t * samples, int acmod, int output)
+static void upmix_C (sample_t * samples, int acmod, int output)
{
switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

@@ -651,3 +683,1137 @@
goto mix_31to21;
}
}
+
+#ifdef ARCH_X86
+static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias)
+{
+ asm volatile(
+ "movlps %2, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps (%0, %%esi), %%xmm0 \n\t"
+ "movaps 16(%0, %%esi), %%xmm1 \n\t"
+ "addps (%1, %%esi), %%xmm0 \n\t"
+ "addps 16(%1, %%esi), %%xmm1 \n\t"
+ "addps %%xmm7, %%xmm0 \n\t"
+ "addps %%xmm7, %%xmm1 \n\t"
+ "movaps %%xmm0, (%1, %%esi) \n\t"
+ "movaps %%xmm1, 16(%1, %%esi) \n\t"
+ "addl $32, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (src+256), "r" (dest+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix3to1_SSE (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps (%0, %%esi), %%xmm0 \n\t"
+ "movaps 1024(%0, %%esi), %%xmm1 \n\t"
+ "addps 2048(%0, %%esi), %%xmm0 \n\t"
+ "addps %%xmm7, %%xmm1 \n\t"
+ "addps %%xmm1, %%xmm0 \n\t"
+ "movaps %%xmm0, (%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix4to1_SSE (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps (%0, %%esi), %%xmm0 \n\t"
+ "movaps 1024(%0, %%esi), %%xmm1 \n\t"
+ "addps 2048(%0, %%esi), %%xmm0 \n\t"
+ "addps 3072(%0, %%esi), %%xmm1 \n\t"
+ "addps %%xmm7, %%xmm0 \n\t"
+ "addps %%xmm1, %%xmm0 \n\t"
+ "movaps %%xmm0, (%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix5to1_SSE (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps (%0, %%esi), %%xmm0 \n\t"
+ "movaps 1024(%0, %%esi), %%xmm1 \n\t"
+ "addps 2048(%0, %%esi), %%xmm0 \n\t"
+ "addps 3072(%0, %%esi), %%xmm1 \n\t"
+ "addps %%xmm7, %%xmm0 \n\t"
+ "addps 4096(%0, %%esi), %%xmm1 \n\t"
+ "addps %%xmm1, %%xmm0 \n\t"
+ "movaps %%xmm0, (%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix3to2_SSE (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps 1024(%0, %%esi), %%xmm0 \n\t"
+ "addps %%xmm7, %%xmm0 \n\t" //common
+ "movaps (%0, %%esi), %%xmm1 \n\t"
+ "movaps 2048(%0, %%esi), %%xmm2 \n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
+ "movaps %%xmm1, (%0, %%esi) \n\t"
+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias)
+{
+ asm volatile(
+ "movlps %2, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps 1024(%1, %%esi), %%xmm0 \n\t"
+ "addps %%xmm7, %%xmm0 \n\t" //common
+ "movaps (%0, %%esi), %%xmm1 \n\t"
+ "movaps (%1, %%esi), %%xmm2 \n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
+ "movaps %%xmm1, (%0, %%esi) \n\t"
+ "movaps %%xmm2, (%1, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (left+256), "r" (right+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix21toS_SSE (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps 2048(%0, %%esi), %%xmm0 \n\t" // surround
+ "movaps (%0, %%esi), %%xmm1 \n\t"
+ "movaps 1024(%0, %%esi), %%xmm2 \n\t"
+ "addps %%xmm7, %%xmm1 \n\t"
+ "addps %%xmm7, %%xmm2 \n\t"
+ "subps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
+ "movaps %%xmm1, (%0, %%esi) \n\t"
+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix31to2_SSE (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps 1024(%0, %%esi), %%xmm0 \n\t"
+ "addps 3072(%0, %%esi), %%xmm0 \n\t"
+ "addps %%xmm7, %%xmm0 \n\t" // common
+ "movaps (%0, %%esi), %%xmm1 \n\t"
+ "movaps 2048(%0, %%esi), %%xmm2 \n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
+ "movaps %%xmm1, (%0, %%esi) \n\t"
+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix31toS_SSE (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps 1024(%0, %%esi), %%xmm0 \n\t"
+ "movaps 3072(%0, %%esi), %%xmm3 \n\t" // surround
+ "addps %%xmm7, %%xmm0 \n\t" // common
+ "movaps (%0, %%esi), %%xmm1 \n\t"
+ "movaps 2048(%0, %%esi), %%xmm2 \n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
+ "subps %%xmm3, %%xmm1 \n\t"
+ "addps %%xmm3, %%xmm2 \n\t"
+ "movaps %%xmm1, (%0, %%esi) \n\t"
+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix22toS_SSE (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps 2048(%0, %%esi), %%xmm0 \n\t"
+ "addps 3072(%0, %%esi), %%xmm0 \n\t" // surround
+ "movaps (%0, %%esi), %%xmm1 \n\t"
+ "movaps 1024(%0, %%esi), %%xmm2 \n\t"
+ "addps %%xmm7, %%xmm1 \n\t"
+ "addps %%xmm7, %%xmm2 \n\t"
+ "subps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
+ "movaps %%xmm1, (%0, %%esi) \n\t"
+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix32to2_SSE (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps 1024(%0, %%esi), %%xmm0 \n\t"
+ "addps %%xmm7, %%xmm0 \n\t" // common
+ "movaps %%xmm0, %%xmm1 \n\t" // common
+ "addps (%0, %%esi), %%xmm0 \n\t"
+ "addps 2048(%0, %%esi), %%xmm1 \n\t"
+ "addps 3072(%0, %%esi), %%xmm0 \n\t"
+ "addps 4096(%0, %%esi), %%xmm1 \n\t"
+ "movaps %%xmm0, (%0, %%esi) \n\t"
+ "movaps %%xmm1, 1024(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix32toS_SSE (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps 1024(%0, %%esi), %%xmm0 \n\t"
+ "movaps 3072(%0, %%esi), %%xmm2 \n\t"
+ "addps %%xmm7, %%xmm0 \n\t" // common
+ "addps 4096(%0, %%esi), %%xmm2 \n\t" // surround
+ "movaps (%0, %%esi), %%xmm1 \n\t"
+ "movaps 2048(%0, %%esi), %%xmm3 \n\t"
+ "subps %%xmm2, %%xmm1 \n\t"
+ "addps %%xmm2, %%xmm3 \n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm3 \n\t"
+ "movaps %%xmm1, (%0, %%esi) \n\t"
+ "movaps %%xmm3, 1024(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias)
+{
+ asm volatile(
+ "movlps %2, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movaps (%0, %%esi), %%xmm0 \n\t"
+ "movaps 16(%0, %%esi), %%xmm1 \n\t"
+ "addps 1024(%0, %%esi), %%xmm0 \n\t"
+ "addps 1040(%0, %%esi), %%xmm1 \n\t"
+ "addps %%xmm7, %%xmm0 \n\t"
+ "addps %%xmm7, %%xmm1 \n\t"
+ "movaps %%xmm0, (%1, %%esi) \n\t"
+ "movaps %%xmm1, 16(%1, %%esi) \n\t"
+ "addl $32, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (src+256), "r" (dest+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void zero_MMX(sample_t * samples)
+{
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "pxor %%mm0, %%mm0 \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq %%mm0, (%0, %%esi) \n\t"
+ "movq %%mm0, 8(%0, %%esi) \n\t"
+ "movq %%mm0, 16(%0, %%esi) \n\t"
+ "movq %%mm0, 24(%0, %%esi) \n\t"
+ "addl $32, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms"
+ :: "r" (samples+256)
+ : "%esi"
+ );
+}
+
+/*
+ dest and src are expected to be at least 8-byte aligned, and size
+ is expected to be divisible by 8 with no remainder.
+ Note: untested and unused.
+*/
+static void copy_MMX(void *dest,const void *src,unsigned size)
+{
+ unsigned i;
+ size /= 64;
+ for(i=0;i<size;i++)
+ {
+ __asm __volatile(
+ "movq %0, %%mm0\n\t"
+ "movq 8%0, %%mm1\n\t"
+ "movq 16%0, %%mm2\n\t"
+ "movq 24%0, %%mm3\n\t"
+ "movq 32%0, %%mm4\n\t"
+ "movq 40%0, %%mm5\n\t"
+ "movq 48%0, %%mm6\n\t"
+ "movq 56%0, %%mm7\n\t"
+ "movq %%mm0, %1\n\t"
+ "movq %%mm1, 8%1\n\t"
+ "movq %%mm2, 16%1\n\t"
+ "movq %%mm3, 24%1\n\t"
+ "movq %%mm4, 32%1\n\t"
+ "movq %%mm5, 40%1\n\t"
+ "movq %%mm6, 48%1\n\t"
+ "movq %%mm7, 56%1\n\t"
+ :
+ :"m"(src),"m"(dest));
+ }
+}
+
+static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
+ sample_t clev, sample_t slev)
+{
+ switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
+
+ case CONVERT (A52_CHANNEL, A52_CHANNEL2):
+ memcpy (samples, samples + 256, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_CHANNEL, A52_MONO):
+ case CONVERT (A52_STEREO, A52_MONO):
+ mix_2to1_SSE:
+ mix2to1_SSE (samples, samples + 256, bias);
+ break;
+
+ case CONVERT (A52_2F1R, A52_MONO):
+ if (slev == 0)
+ goto mix_2to1_SSE;
+ case CONVERT (A52_3F, A52_MONO):
+ mix_3to1_SSE:
+ mix3to1_SSE (samples, bias);
+ break;
+
+ case CONVERT (A52_3F1R, A52_MONO):
+ if (slev == 0)
+ goto mix_3to1_SSE;
+ case CONVERT (A52_2F2R, A52_MONO):
+ if (slev == 0)
+ goto mix_2to1_SSE;
+ mix4to1_SSE (samples, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_MONO):
+ if (slev == 0)
+ goto mix_3to1_SSE;
+ mix5to1_SSE (samples, bias);
+ break;
+
+ case CONVERT (A52_MONO, A52_DOLBY):
+ memcpy (samples + 256, samples, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_3F, A52_STEREO):
+ case CONVERT (A52_3F, A52_DOLBY):
+ mix_3to2_SSE:
+ mix3to2_SSE (samples, bias);
+ break;
+
+ case CONVERT (A52_2F1R, A52_STEREO):
+ if (slev == 0)
+ break;
+ mix21to2_SSE (samples, samples + 256, bias);
+ break;
+
+ case CONVERT (A52_2F1R, A52_DOLBY):
+ mix21toS_SSE (samples, bias);
+ break;
+
+ case CONVERT (A52_3F1R, A52_STEREO):
+ if (slev == 0)
+ goto mix_3to2_SSE;
+ mix31to2_SSE (samples, bias);
+ break;
+
+ case CONVERT (A52_3F1R, A52_DOLBY):
+ mix31toS_SSE (samples, bias);
+ break;
+
+ case CONVERT (A52_2F2R, A52_STEREO):
+ if (slev == 0)
+ break;
+ mix2to1_SSE (samples, samples + 512, bias);
+ mix2to1_SSE (samples + 256, samples + 768, bias);
+ break;
+
+ case CONVERT (A52_2F2R, A52_DOLBY):
+ mix22toS_SSE (samples, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_STEREO):
+ if (slev == 0)
+ goto mix_3to2_SSE;
+ mix32to2_SSE (samples, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_DOLBY):
+ mix32toS_SSE (samples, bias);
+ break;
+
+ case CONVERT (A52_3F1R, A52_3F):
+ if (slev == 0)
+ break;
+ mix21to2_SSE (samples, samples + 512, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_3F):
+ if (slev == 0)
+ break;
+ mix2to1_SSE (samples, samples + 768, bias);
+ mix2to1_SSE (samples + 512, samples + 1024, bias);
+ break;
+
+ case CONVERT (A52_3F1R, A52_2F1R):
+ mix3to2_SSE (samples, bias);
+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_2F2R, A52_2F1R):
+ mix2to1_SSE (samples + 512, samples + 768, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_2F1R):
+ mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
+ move2to1_SSE (samples + 768, samples + 512, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_3F1R):
+ mix2to1_SSE (samples + 768, samples + 1024, bias);
+ break;
+
+ case CONVERT (A52_2F1R, A52_2F2R):
+ memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_3F1R, A52_2F2R):
+ mix3to2_SSE (samples, bias);
+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_3F2R, A52_2F2R):
+ mix3to2_SSE (samples, bias);
+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+ memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_3F1R, A52_3F2R):
+ memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
+ break;
+ }
+}
+
+static void upmix_MMX (sample_t * samples, int acmod, int output)
+{
+ switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
+
+ case CONVERT (A52_CHANNEL, A52_CHANNEL2):
+ memcpy (samples + 256, samples, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_3F2R, A52_MONO):
+ zero_MMX (samples + 1024);
+ case CONVERT (A52_3F1R, A52_MONO):
+ case CONVERT (A52_2F2R, A52_MONO):
+ zero_MMX (samples + 768);
+ case CONVERT (A52_3F, A52_MONO):
+ case CONVERT (A52_2F1R, A52_MONO):
+ zero_MMX (samples + 512);
+ case CONVERT (A52_CHANNEL, A52_MONO):
+ case CONVERT (A52_STEREO, A52_MONO):
+ zero_MMX (samples + 256);
+ break;
+
+ case CONVERT (A52_3F2R, A52_STEREO):
+ case CONVERT (A52_3F2R, A52_DOLBY):
+ zero_MMX (samples + 1024);
+ case CONVERT (A52_3F1R, A52_STEREO):
+ case CONVERT (A52_3F1R, A52_DOLBY):
+ zero_MMX (samples + 768);
+ case CONVERT (A52_3F, A52_STEREO):
+ case CONVERT (A52_3F, A52_DOLBY):
+ mix_3to2_MMX:
+ memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t));
+ zero_MMX (samples + 256);
+ break;
+
+ case CONVERT (A52_2F2R, A52_STEREO):
+ case CONVERT (A52_2F2R, A52_DOLBY):
+ zero_MMX (samples + 768);
+ case CONVERT (A52_2F1R, A52_STEREO):
+ case CONVERT (A52_2F1R, A52_DOLBY):
+ zero_MMX (samples + 512);
+ break;
+
+ case CONVERT (A52_3F2R, A52_3F):
+ zero_MMX (samples + 1024);
+ case CONVERT (A52_3F1R, A52_3F):
+ case CONVERT (A52_2F2R, A52_2F1R):
+ zero_MMX (samples + 768);
+ break;
+
+ case CONVERT (A52_3F2R, A52_3F1R):
+ zero_MMX (samples + 1024);
+ break;
+
+ case CONVERT (A52_3F2R, A52_2F1R):
+ zero_MMX (samples + 1024);
+ case CONVERT (A52_3F1R, A52_2F1R):
+ mix_31to21_MMX:
+ memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
+ goto mix_3to2_MMX;
+
+ case CONVERT (A52_3F2R, A52_2F2R):
+ memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
+ goto mix_31to21_MMX;
+ }
+}
+
+static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias)
+{
+ asm volatile(
+ "movd %2, %%mm7 \n\t"
+ "punpckldq %2, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq (%0, %%esi), %%mm0 \n\t"
+ "movq 8(%0, %%esi), %%mm1 \n\t"
+ "movq 16(%0, %%esi), %%mm2 \n\t"
+ "movq 24(%0, %%esi), %%mm3 \n\t"
+ "pfadd (%1, %%esi), %%mm0 \n\t"
+ "pfadd 8(%1, %%esi), %%mm1 \n\t"
+ "pfadd 16(%1, %%esi), %%mm2 \n\t"
+ "pfadd 24(%1, %%esi), %%mm3 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t"
+ "pfadd %%mm7, %%mm1 \n\t"
+ "pfadd %%mm7, %%mm2 \n\t"
+ "pfadd %%mm7, %%mm3 \n\t"
+ "movq %%mm0, (%1, %%esi) \n\t"
+ "movq %%mm1, 8(%1, %%esi) \n\t"
+ "movq %%mm2, 16(%1, %%esi) \n\t"
+ "movq %%mm3, 24(%1, %%esi) \n\t"
+ "addl $32, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (src+256), "r" (dest+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix3to1_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq (%0, %%esi), %%mm0 \n\t"
+ "movq 8(%0, %%esi), %%mm1 \n\t"
+ "movq 1024(%0, %%esi), %%mm2 \n\t"
+ "movq 1032(%0, %%esi), %%mm3 \n\t"
+ "pfadd 2048(%0, %%esi), %%mm0 \n\t"
+ "pfadd 2056(%0, %%esi), %%mm1 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t"
+ "pfadd %%mm7, %%mm1 \n\t"
+ "pfadd %%mm2, %%mm0 \n\t"
+ "pfadd %%mm3, %%mm1 \n\t"
+ "movq %%mm0, (%0, %%esi) \n\t"
+ "movq %%mm1, 8(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix4to1_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq (%0, %%esi), %%mm0 \n\t"
+ "movq 8(%0, %%esi), %%mm1 \n\t"
+ "movq 1024(%0, %%esi), %%mm2 \n\t"
+ "movq 1032(%0, %%esi), %%mm3 \n\t"
+ "pfadd 2048(%0, %%esi), %%mm0 \n\t"
+ "pfadd 2056(%0, %%esi), %%mm1 \n\t"
+ "pfadd 3072(%0, %%esi), %%mm2 \n\t"
+ "pfadd 3080(%0, %%esi), %%mm3 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t"
+ "pfadd %%mm7, %%mm1 \n\t"
+ "pfadd %%mm2, %%mm0 \n\t"
+ "pfadd %%mm3, %%mm1 \n\t"
+ "movq %%mm0, (%0, %%esi) \n\t"
+ "movq %%mm1, 8(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix5to1_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq (%0, %%esi), %%mm0 \n\t"
+ "movq 8(%0, %%esi), %%mm1 \n\t"
+ "movq 1024(%0, %%esi), %%mm2 \n\t"
+ "movq 1032(%0, %%esi), %%mm3 \n\t"
+ "pfadd 2048(%0, %%esi), %%mm0 \n\t"
+ "pfadd 2056(%0, %%esi), %%mm1 \n\t"
+ "pfadd 3072(%0, %%esi), %%mm2 \n\t"
+ "pfadd 3080(%0, %%esi), %%mm3 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t"
+ "pfadd %%mm7, %%mm1 \n\t"
+ "pfadd 4096(%0, %%esi), %%mm2 \n\t"
+ "pfadd 4104(%0, %%esi), %%mm3 \n\t"
+ "pfadd %%mm2, %%mm0 \n\t"
+ "pfadd %%mm3, %%mm1 \n\t"
+ "movq %%mm0, (%0, %%esi) \n\t"
+ "movq %%mm1, 8(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix3to2_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq 1024(%0, %%esi), %%mm0 \n\t"
+ "movq 1032(%0, %%esi), %%mm1 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t" //common
+ "pfadd %%mm7, %%mm1 \n\t" //common
+ "movq (%0, %%esi), %%mm2 \n\t"
+ "movq 8(%0, %%esi), %%mm3 \n\t"
+ "movq 2048(%0, %%esi), %%mm4 \n\t"
+ "movq 2056(%0, %%esi), %%mm5 \n\t"
+ "pfadd %%mm0, %%mm2 \n\t"
+ "pfadd %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
+ "movq %%mm2, (%0, %%esi) \n\t"
+ "movq %%mm3, 8(%0, %%esi) \n\t"
+ "movq %%mm4, 1024(%0, %%esi) \n\t"
+ "movq %%mm5, 1032(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias)
+{
+ asm volatile(
+ "movd %2, %%mm7 \n\t"
+ "punpckldq %2, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq 1024(%1, %%esi), %%mm0 \n\t"
+ "movq 1032(%1, %%esi), %%mm1 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t" //common
+ "pfadd %%mm7, %%mm1 \n\t" //common
+ "movq (%0, %%esi), %%mm2 \n\t"
+ "movq 8(%0, %%esi), %%mm3 \n\t"
+ "movq (%1, %%esi), %%mm4 \n\t"
+ "movq 8(%1, %%esi), %%mm5 \n\t"
+ "pfadd %%mm0, %%mm2 \n\t"
+ "pfadd %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
+ "movq %%mm2, (%0, %%esi) \n\t"
+ "movq %%mm3, 8(%0, %%esi) \n\t"
+ "movq %%mm4, (%1, %%esi) \n\t"
+ "movq %%mm5, 8(%1, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (left+256), "r" (right+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix21toS_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq 2048(%0, %%esi), %%mm0 \n\t" // surround
+ "movq 2056(%0, %%esi), %%mm1 \n\t" // surround
+ "movq (%0, %%esi), %%mm2 \n\t"
+ "movq 8(%0, %%esi), %%mm3 \n\t"
+ "movq 1024(%0, %%esi), %%mm4 \n\t"
+ "movq 1032(%0, %%esi), %%mm5 \n\t"
+ "pfadd %%mm7, %%mm2 \n\t"
+ "pfadd %%mm7, %%mm3 \n\t"
+ "pfadd %%mm7, %%mm4 \n\t"
+ "pfadd %%mm7, %%mm5 \n\t"
+ "pfsub %%mm0, %%mm2 \n\t"
+ "pfsub %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
+ "movq %%mm2, (%0, %%esi) \n\t"
+ "movq %%mm3, 8(%0, %%esi) \n\t"
+ "movq %%mm4, 1024(%0, %%esi) \n\t"
+ "movq %%mm5, 1032(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix31to2_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq 1024(%0, %%esi), %%mm0 \n\t"
+ "movq 1032(%0, %%esi), %%mm1 \n\t"
+ "pfadd 3072(%0, %%esi), %%mm0 \n\t"
+ "pfadd 3080(%0, %%esi), %%mm1 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t" // common
+ "pfadd %%mm7, %%mm1 \n\t" // common
+ "movq (%0, %%esi), %%mm2 \n\t"
+ "movq 8(%0, %%esi), %%mm3 \n\t"
+ "movq 2048(%0, %%esi), %%mm4 \n\t"
+ "movq 2056(%0, %%esi), %%mm5 \n\t"
+ "pfadd %%mm0, %%mm2 \n\t"
+ "pfadd %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
+ "movq %%mm2, (%0, %%esi) \n\t"
+ "movq %%mm3, 8(%0, %%esi) \n\t"
+ "movq %%mm4, 1024(%0, %%esi) \n\t"
+ "movq %%mm5, 1032(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix31toS_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq 1024(%0, %%esi), %%mm0 \n\t"
+ "movq 1032(%0, %%esi), %%mm1 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t" // common
+ "pfadd %%mm7, %%mm1 \n\t" // common
+ "movq (%0, %%esi), %%mm2 \n\t"
+ "movq 8(%0, %%esi), %%mm3 \n\t"
+ "movq 2048(%0, %%esi), %%mm4 \n\t"
+ "movq 2056(%0, %%esi), %%mm5 \n\t"
+ "pfadd %%mm0, %%mm2 \n\t"
+ "pfadd %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
+ "movq 3072(%0, %%esi), %%mm0 \n\t" // surround
+ "movq 3080(%0, %%esi), %%mm1 \n\t" // surround
+ "pfsub %%mm0, %%mm2 \n\t"
+ "pfsub %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
+ "movq %%mm2, (%0, %%esi) \n\t"
+ "movq %%mm3, 8(%0, %%esi) \n\t"
+ "movq %%mm4, 1024(%0, %%esi) \n\t"
+ "movq %%mm5, 1032(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix22toS_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq 2048(%0, %%esi), %%mm0 \n\t"
+ "movq 2056(%0, %%esi), %%mm1 \n\t"
+ "pfadd 3072(%0, %%esi), %%mm0 \n\t" // surround
+ "pfadd 3080(%0, %%esi), %%mm1 \n\t" // surround
+ "movq (%0, %%esi), %%mm2 \n\t"
+ "movq 8(%0, %%esi), %%mm3 \n\t"
+ "movq 1024(%0, %%esi), %%mm4 \n\t"
+ "movq 1032(%0, %%esi), %%mm5 \n\t"
+ "pfadd %%mm7, %%mm2 \n\t"
+ "pfadd %%mm7, %%mm3 \n\t"
+ "pfadd %%mm7, %%mm4 \n\t"
+ "pfadd %%mm7, %%mm5 \n\t"
+ "pfsub %%mm0, %%mm2 \n\t"
+ "pfsub %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
+ "movq %%mm2, (%0, %%esi) \n\t"
+ "movq %%mm3, 8(%0, %%esi) \n\t"
+ "movq %%mm4, 1024(%0, %%esi) \n\t"
+ "movq %%mm5, 1032(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void mix32to2_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq 1024(%0, %%esi), %%mm0 \n\t"
+ "movq 1032(%0, %%esi), %%mm1 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t" // common
+ "pfadd %%mm7, %%mm1 \n\t" // common
+ "movq %%mm0, %%mm2 \n\t" // common
+ "movq %%mm1, %%mm3 \n\t" // common
+ "pfadd (%0, %%esi), %%mm0 \n\t"
+ "pfadd 8(%0, %%esi), %%mm1 \n\t"
+ "pfadd 2048(%0, %%esi), %%mm2 \n\t"
+ "pfadd 2056(%0, %%esi), %%mm3 \n\t"
+ "pfadd 3072(%0, %%esi), %%mm0 \n\t"
+ "pfadd 3080(%0, %%esi), %%mm1 \n\t"
+ "pfadd 4096(%0, %%esi), %%mm2 \n\t"
+ "pfadd 4104(%0, %%esi), %%mm3 \n\t"
+ "movq %%mm0, (%0, %%esi) \n\t"
+ "movq %%mm1, 8(%0, %%esi) \n\t"
+ "movq %%mm2, 1024(%0, %%esi) \n\t"
+ "movq %%mm3, 1032(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+/* todo: should be optimized better */
+static void mix32toS_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
+ "movq 1024(%0, %%esi), %%mm0 \n\t"
+ "movq 1032(%0, %%esi), %%mm1 \n\t"
+ "movq 3072(%0, %%esi), %%mm4 \n\t"
+ "movq 3080(%0, %%esi), %%mm5 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t" // common
+ "pfadd %%mm7, %%mm1 \n\t" // common
+ "pfadd 4096(%0, %%esi), %%mm4 \n\t" // surround
+ "pfadd 4104(%0, %%esi), %%mm5 \n\t" // surround
+ "movq (%0, %%esi), %%mm2 \n\t"
+ "movq 8(%0, %%esi), %%mm3 \n\t"
+ "movq 2048(%0, %%esi), %%mm6 \n\t"
+ "movq 2056(%0, %%esi), %%mm7 \n\t"
+ "pfsub %%mm4, %%mm2 \n\t"
+ "pfsub %%mm5, %%mm3 \n\t"
+ "pfadd %%mm4, %%mm6 \n\t"
+ "pfadd %%mm5, %%mm7 \n\t"
+ "pfadd %%mm0, %%mm2 \n\t"
+ "pfadd %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm6 \n\t"
+ "pfadd %%mm1, %%mm7 \n\t"
+ "movq %%mm2, (%0, %%esi) \n\t"
+ "movq %%mm3, 8(%0, %%esi) \n\t"
+ "movq %%mm6, 1024(%0, %%esi) \n\t"
+ "movq %%mm7, 1032(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void move2to1_3dnow (sample_t * src, sample_t * dest, sample_t bias)
+{
+ asm volatile(
+ "movd %2, %%mm7 \n\t"
+ "punpckldq %2, %%mm7 \n\t"
+ "movl $-1024, %%esi \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq (%0, %%esi), %%mm0 \n\t"
+ "movq 8(%0, %%esi), %%mm1 \n\t"
+ "movq 16(%0, %%esi), %%mm2 \n\t"
+ "movq 24(%0, %%esi), %%mm3 \n\t"
+ "pfadd 1024(%0, %%esi), %%mm0 \n\t"
+ "pfadd 1032(%0, %%esi), %%mm1 \n\t"
+ "pfadd 1040(%0, %%esi), %%mm2 \n\t"
+ "pfadd 1048(%0, %%esi), %%mm3 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t"
+ "pfadd %%mm7, %%mm1 \n\t"
+ "pfadd %%mm7, %%mm2 \n\t"
+ "pfadd %%mm7, %%mm3 \n\t"
+ "movq %%mm0, (%1, %%esi) \n\t"
+ "movq %%mm1, 8(%1, %%esi) \n\t"
+ "movq %%mm2, 16(%1, %%esi) \n\t"
+ "movq %%mm3, 24(%1, %%esi) \n\t"
+ "addl $32, %%esi \n\t"
+ " jnz 1b \n\t"
+ :: "r" (src+256), "r" (dest+256), "m" (bias)
+ : "%esi"
+ );
+}
+
+static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias,
+ sample_t clev, sample_t slev)
+{
+ switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
+
+ case CONVERT (A52_CHANNEL, A52_CHANNEL2):
+ memcpy (samples, samples + 256, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_CHANNEL, A52_MONO):
+ case CONVERT (A52_STEREO, A52_MONO):
+ mix_2to1_3dnow:
+ mix2to1_3dnow (samples, samples + 256, bias);
+ break;
+
+ case CONVERT (A52_2F1R, A52_MONO):
+ if (slev == 0)
+ goto mix_2to1_3dnow;
+ case CONVERT (A52_3F, A52_MONO):
+ mix_3to1_3dnow:
+ mix3to1_3dnow (samples, bias);
+ break;
+
+ case CONVERT (A52_3F1R, A52_MONO):
+ if (slev == 0)
+ goto mix_3to1_3dnow;
+ case CONVERT (A52_2F2R, A52_MONO):
+ if (slev == 0)
+ goto mix_2to1_3dnow;
+ mix4to1_3dnow (samples, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_MONO):
+ if (slev == 0)
+ goto mix_3to1_3dnow;
+ mix5to1_3dnow (samples, bias);
+ break;
+
+ case CONVERT (A52_MONO, A52_DOLBY):
+ memcpy (samples + 256, samples, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_3F, A52_STEREO):
+ case CONVERT (A52_3F, A52_DOLBY):
+ mix_3to2_3dnow:
+ mix3to2_3dnow (samples, bias);
+ break;
+
+ case CONVERT (A52_2F1R, A52_STEREO):
+ if (slev == 0)
+ break;
+ mix21to2_3dnow (samples, samples + 256, bias);
+ break;
+
+ case CONVERT (A52_2F1R, A52_DOLBY):
+ mix21toS_3dnow (samples, bias);
+ break;
+
+ case CONVERT (A52_3F1R, A52_STEREO):
+ if (slev == 0)
+ goto mix_3to2_3dnow;
+ mix31to2_3dnow (samples, bias);
+ break;
+
+ case CONVERT (A52_3F1R, A52_DOLBY):
+ mix31toS_3dnow (samples, bias);
+ break;
+
+ case CONVERT (A52_2F2R, A52_STEREO):
+ if (slev == 0)
+ break;
+ mix2to1_3dnow (samples, samples + 512, bias);
+ mix2to1_3dnow (samples + 256, samples + 768, bias);
+ break;
+
+ case CONVERT (A52_2F2R, A52_DOLBY):
+ mix22toS_3dnow (samples, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_STEREO):
+ if (slev == 0)
+ goto mix_3to2_3dnow;
+ mix32to2_3dnow (samples, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_DOLBY):
+ mix32toS_3dnow (samples, bias);
+ break;
+
+ case CONVERT (A52_3F1R, A52_3F):
+ if (slev == 0)
+ break;
+ mix21to2_3dnow (samples, samples + 512, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_3F):
+ if (slev == 0)
+ break;
+ mix2to1_3dnow (samples, samples + 768, bias);
+ mix2to1_3dnow (samples + 512, samples + 1024, bias);
+ break;
+
+ case CONVERT (A52_3F1R, A52_2F1R):
+ mix3to2_3dnow (samples, bias);
+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_2F2R, A52_2F1R):
+ mix2to1_3dnow (samples + 512, samples + 768, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_2F1R):
+ mix3to2_3dnow (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
+ move2to1_3dnow (samples + 768, samples + 512, bias);
+ break;
+
+ case CONVERT (A52_3F2R, A52_3F1R):
+ mix2to1_3dnow (samples + 768, samples + 1024, bias);
+ break;
+
+ case CONVERT (A52_2F1R, A52_2F2R):
+ memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_3F1R, A52_2F2R):
+ mix3to2_3dnow (samples, bias);
+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_3F2R, A52_2F2R):
+ mix3to2_3dnow (samples, bias);
+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+ memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
+ break;
+
+ case CONVERT (A52_3F1R, A52_3F2R):
+ memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
+ break;
+ }
+ __asm __volatile("femms":::"memory");
+}
+
+#endif //ARCH_X86
--- liba52/imdct.c 2005-03-22 19:59:35.000000000 +0100
+++ imdct.c 2004-04-26 22:00:57.000000000 +0200
@@ -17,17 +23,32 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
+ * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
+ * Michael ported them from libac3 (untested, perhaps totally broken)
1465 + * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org) | |
1466 */ | |
1467 | |
1468 #include "config.h" | |
1469 | |
1470 -#include <inttypes.h> | |
1471 #include <math.h> | |
1472 #include <stdio.h> | |
1473 +#ifndef M_PI | |
1474 +#define M_PI 3.1415926535897932384626433832795029 | |
1475 +#endif | |
1476 +#include <inttypes.h> | |
1477 | |
1478 #include "a52.h" | |
1479 #include "a52_internal.h" | |
1480 #include "mm_accel.h" | |
1481 +#include "mangle.h" | |
1482 + | |
1483 +#ifdef RUNTIME_CPUDETECT | |
1484 +#undef HAVE_3DNOWEX | |
1485 +#endif | |
1486 + | |
1487 +#define USE_AC3_C | |
1488 | |
1489 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias); | |
1490 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias); | |
1491 @@ -37,9 +58,22 @@ | |
1492 sample_t imag; | |
1493 } complex_t; | |
1494 | |
1495 +static void fft_128p(complex_t *a); | |
1496 + | |
1497 +static const int pm128[128] attribute_used __attribute__((aligned(16))) = | |
1498 +{ | |
1499 + 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, | |
1500 + 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124, | |
1501 + 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, | |
1502 + 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, | |
1503 + 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, | |
1504 + 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, | |
1505 + 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, | |
1506 + 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 | |
1507 +}; | |
1508 | |
1509 /* 128 point bit-reverse LUT */ | |
1510 -static uint8_t bit_reverse_512[] = { | |
1511 +static uint8_t attribute_used bit_reverse_512[] = { | |
1512 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70, | |
1513 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78, | |
1514 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74, | |
1515 @@ -67,23 +101,42 @@ | |
1516 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b, | |
1517 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f}; | |
1518 | |
1519 -static complex_t buf[128]; | |
1520 +#ifdef ARCH_X86 | |
1521 +// NOTE: SSE needs 16byte alignment or it will segfault | |
1522 +// | |
1523 +static complex_t __attribute__((aligned(16))) buf[128]; | |
1524 +static float __attribute__((aligned(16))) sseSinCos1c[256]; | |
1525 +static float __attribute__((aligned(16))) sseSinCos1d[256]; | |
1526 +static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; | |
1527 +//static float __attribute__((aligned(16))) sseW0[4]; | |
1528 +static float __attribute__((aligned(16))) sseW1[8]; | |
1529 +static float __attribute__((aligned(16))) sseW2[16]; | |
1530 +static float __attribute__((aligned(16))) sseW3[32]; | |
1531 +static float __attribute__((aligned(16))) sseW4[64]; | |
1532 +static float __attribute__((aligned(16))) sseW5[128]; | |
1533 +static float __attribute__((aligned(16))) sseW6[256]; | |
1534 +static float __attribute__((aligned(16))) *sseW[7]= | |
1535 + {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; | |
1536 +static float __attribute__((aligned(16))) sseWindow[512]; | |
1537 +#else | |
1538 +static complex_t __attribute__((aligned(16))) buf[128]; | |
1539 +#endif | |
1540 | |
1541 /* Twiddle factor LUT */ | |
1542 -static complex_t w_1[1]; | |
1543 -static complex_t w_2[2]; | |
1544 -static complex_t w_4[4]; | |
1545 -static complex_t w_8[8]; | |
1546 -static complex_t w_16[16]; | |
1547 -static complex_t w_32[32]; | |
1548 -static complex_t w_64[64]; | |
1549 -static complex_t * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; | |
1550 +static complex_t __attribute__((aligned(16))) w_1[1]; | |
1551 +static complex_t __attribute__((aligned(16))) w_2[2]; | |
1552 +static complex_t __attribute__((aligned(16))) w_4[4]; | |
1553 +static complex_t __attribute__((aligned(16))) w_8[8]; | |
1554 +static complex_t __attribute__((aligned(16))) w_16[16]; | |
1555 +static complex_t __attribute__((aligned(16))) w_32[32]; | |
1556 +static complex_t __attribute__((aligned(16))) w_64[64]; | |
1557 +static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; | |
1558 | |
1559 /* Twiddle factors for IMDCT */ | |
1560 -static sample_t xcos1[128]; | |
1561 -static sample_t xsin1[128]; | |
1562 -static sample_t xcos2[64]; | |
1563 -static sample_t xsin2[64]; | |
1564 +static sample_t __attribute__((aligned(16))) xcos1[128]; | |
1565 +static sample_t __attribute__((aligned(16))) xsin1[128]; | |
1566 +static sample_t __attribute__((aligned(16))) xcos2[64]; | |
1567 +static sample_t __attribute__((aligned(16))) xsin2[64]; | |
1568 | |
1569 /* Windowing function for Modified DCT - Thank you acroread */ | |
1570 sample_t imdct_window[] = { | |
1571 @@ -145,16 +198,19 @@ | |
1572 void | |
1573 imdct_do_512(sample_t data[],sample_t delay[], sample_t bias) | |
1574 { | |
1575 - int i,k; | |
1576 + int i; | |
1577 +#ifndef USE_AC3_C | |
1578 + int k; | |
1579 int p,q; | |
1580 int m; | |
1581 int two_m; | |
1582 int two_m_plus_one; | |
1583 | |
1584 - sample_t tmp_a_i; | |
1585 - sample_t tmp_a_r; | |
1586 sample_t tmp_b_i; | |
1587 sample_t tmp_b_r; | |
1588 +#endif | |
1589 + sample_t tmp_a_i; | |
1590 + sample_t tmp_a_r; | |
1591 | |
1592 sample_t *data_ptr; | |
1593 sample_t *delay_ptr; | |
1594 @@ -162,22 +218,21 @@ | |
1595 | |
1596 /* 512 IMDCT with source and dest data in 'data' */ | |
1597 | |
1598 - /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ | |
1599 + /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ | |
1600 for( i=0; i < 128; i++) { | |
1601 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ | |
1602 - buf[i].real = (data[256-2*i-1] * xcos1[i]) - (data[2*i] * xsin1[i]); | |
1603 - buf[i].imag = -1.0 * ((data[2*i] * xcos1[i]) + (data[256-2*i-1] * xsin1[i])); | |
1604 - } | |
1605 - | |
1606 - /* Bit reversed shuffling */ | |
1607 - for(i=0; i<128; i++) { | |
1608 - k = bit_reverse_512[i]; | |
1609 - if (k < i) | |
1610 - swap_cmplx(&buf[i],&buf[k]); | |
1611 +#ifdef USE_AC3_C | |
1612 + int j= pm128[i]; | |
1613 +#else | |
1614 + int j= bit_reverse_512[i]; | |
1615 +#endif | |
1616 + buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); | |
1617 + buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); | |
1618 } | |
1619 | |
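The pre-twiddle loop above also performs the input reordering: with USE_AC3_C it indexes through pm128[] (the permutation fft_128p expects), otherwise through bit_reverse_512[], so the separate shuffling pass of the old code disappears. Both tables are precomputed; as a sketch of what a 7-bit bit-reversal table such as bit_reverse_512[] encodes (illustrative generator, not the shipped table):

    #include <stdint.h>

    static void build_bit_reverse_128(uint8_t tab[128])
    {
        int i, b;
        for (i = 0; i < 128; i++) {
            uint8_t r = 0;
            for (b = 0; b < 7; b++)      /* 128 == 1 << 7 */
                if (i & (1 << b))
                    r |= 1 << (6 - b);
            tab[i] = r;
        }
    }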
1620 /* FFT Merge */ | |
1621 - for (m=0; m < 7; m++) { | |
1622 +/* unoptimized variant | |
1623 + for (m=1; m < 7; m++) { | |
1624 if(m) | |
1625 two_m = (1 << m); | |
1626 else | |
1627 @@ -185,8 +240,8 @@ | |
1628 | |
1629 two_m_plus_one = (1 << (m+1)); | |
1630 | |
1631 - for(k = 0; k < two_m; k++) { | |
1632 - for(i = 0; i < 128; i += two_m_plus_one) { | |
1633 + for(i = 0; i < 128; i += two_m_plus_one) { | |
1634 + for(k = 0; k < two_m; k++) { | |
1635 p = k + i; | |
1636 q = p + two_m; | |
1637 tmp_a_r = buf[p].real; | |
1638 @@ -200,7 +255,102 @@ | |
1639 } | |
1640 } | |
1641 } | |
1642 +*/ | |
1643 +#ifdef USE_AC3_C | |
1644 + fft_128p (&buf[0]); | |
1645 +#else | |
1646 + | |
1647 + /* 1. iteration */ | |
1648 + for(i = 0; i < 128; i += 2) { | |
1649 + tmp_a_r = buf[i].real; | |
1650 + tmp_a_i = buf[i].imag; | |
1651 + tmp_b_r = buf[i+1].real; | |
1652 + tmp_b_i = buf[i+1].imag; | |
1653 + buf[i].real = tmp_a_r + tmp_b_r; | |
1654 + buf[i].imag = tmp_a_i + tmp_b_i; | |
1655 + buf[i+1].real = tmp_a_r - tmp_b_r; | |
1656 + buf[i+1].imag = tmp_a_i - tmp_b_i; | |
1657 + } | |
1658 + | |
1659 + /* 2. iteration */ | |
1660 + // Note w[1]={{1,0}, {0,-1}} | |
1661 + for(i = 0; i < 128; i += 4) { | |
1662 + tmp_a_r = buf[i].real; | |
1663 + tmp_a_i = buf[i].imag; | |
1664 + tmp_b_r = buf[i+2].real; | |
1665 + tmp_b_i = buf[i+2].imag; | |
1666 + buf[i].real = tmp_a_r + tmp_b_r; | |
1667 + buf[i].imag = tmp_a_i + tmp_b_i; | |
1668 + buf[i+2].real = tmp_a_r - tmp_b_r; | |
1669 + buf[i+2].imag = tmp_a_i - tmp_b_i; | |
1670 + tmp_a_r = buf[i+1].real; | |
1671 + tmp_a_i = buf[i+1].imag; | |
1672 + tmp_b_r = buf[i+3].imag; | |
1673 + tmp_b_i = buf[i+3].real; | |
1674 + buf[i+1].real = tmp_a_r + tmp_b_r; | |
1675 + buf[i+1].imag = tmp_a_i - tmp_b_i; | |
1676 + buf[i+3].real = tmp_a_r - tmp_b_r; | |
1677 + buf[i+3].imag = tmp_a_i + tmp_b_i; | |
1678 + } | |
1679 | |
1680 + /* 3. iteration */ | |
1681 + for(i = 0; i < 128; i += 8) { | |
1682 + tmp_a_r = buf[i].real; | |
1683 + tmp_a_i = buf[i].imag; | |
1684 + tmp_b_r = buf[i+4].real; | |
1685 + tmp_b_i = buf[i+4].imag; | |
1686 + buf[i].real = tmp_a_r + tmp_b_r; | |
1687 + buf[i].imag = tmp_a_i + tmp_b_i; | |
1688 + buf[i+4].real = tmp_a_r - tmp_b_r; | |
1689 + buf[i+4].imag = tmp_a_i - tmp_b_i; | |
1690 + tmp_a_r = buf[1+i].real; | |
1691 + tmp_a_i = buf[1+i].imag; | |
1692 + tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; | |
1693 + tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; | |
1694 + buf[1+i].real = tmp_a_r + tmp_b_r; | |
1695 + buf[1+i].imag = tmp_a_i + tmp_b_i; | |
1696 + buf[i+5].real = tmp_a_r - tmp_b_r; | |
1697 + buf[i+5].imag = tmp_a_i - tmp_b_i; | |
1698 + tmp_a_r = buf[i+2].real; | |
1699 + tmp_a_i = buf[i+2].imag; | |
1700 + tmp_b_r = buf[i+6].imag; | |
1701 + tmp_b_i = - buf[i+6].real; | |
1702 + buf[i+2].real = tmp_a_r + tmp_b_r; | |
1703 + buf[i+2].imag = tmp_a_i + tmp_b_i; | |
1704 + buf[i+6].real = tmp_a_r - tmp_b_r; | |
1705 + buf[i+6].imag = tmp_a_i - tmp_b_i; | |
1706 + tmp_a_r = buf[i+3].real; | |
1707 + tmp_a_i = buf[i+3].imag; | |
1708 + tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; | |
1709 + tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; | |
1710 + buf[i+3].real = tmp_a_r + tmp_b_r; | |
1711 + buf[i+3].imag = tmp_a_i + tmp_b_i; | |
1712 + buf[i+7].real = tmp_a_r - tmp_b_r; | |
1713 + buf[i+7].imag = tmp_a_i - tmp_b_i; | |
1714 + } | |
1715 + | |
1716 + /* 4-7. iterations */ | |
1717 + for (m=3; m < 7; m++) { | |
1718 + two_m = (1 << m); | |
1719 + | |
1720 + two_m_plus_one = two_m<<1; | |
1721 + | |
1722 + for(i = 0; i < 128; i += two_m_plus_one) { | |
1723 + for(k = 0; k < two_m; k++) { | |
1724 + int p = k + i; | |
1725 + int q = p + two_m; | |
1726 + tmp_a_r = buf[p].real; | |
1727 + tmp_a_i = buf[p].imag; | |
1728 + tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
1729 + tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
1730 + buf[p].real = tmp_a_r + tmp_b_r; | |
1731 + buf[p].imag = tmp_a_i + tmp_b_i; | |
1732 + buf[q].real = tmp_a_r - tmp_b_r; | |
1733 + buf[q].imag = tmp_a_i - tmp_b_i; | |
1734 + } | |
1735 + } | |
1736 + } | |
1737 +#endif | |
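Every merge pass pairs elements p and q = p + 2^m and applies the twiddle w[m][k]: a' = a + w*b, b' = a - w*b. A minimal scalar sketch of that radix-2 butterfly, equivalent to the body of the 4.-7. iteration loop above:

    typedef struct { float real, imag; } cnum;   /* stands in for complex_t */

    static void butterfly(cnum *a, cnum *b, cnum w)
    {
        float tr = b->real * w.real - b->imag * w.imag;  /* Re(w*b) */
        float ti = b->imag * w.real + b->real * w.imag;  /* Im(w*b) */
        b->real = a->real - tr;
        b->imag = a->imag - ti;
        a->real += tr;
        a->imag += ti;
    }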
1738 /* Post IFFT complex multiply plus IFFT complex conjugate*/ | |
1739 for( i=0; i < 128; i++) { | |
1740 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ | |
1741 @@ -219,12 +369,12 @@ | |
1742 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
1743 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
1744 } | |
1745 - | |
1746 + | |
1747 for(i=0; i< 64; i++) { | |
1748 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
1749 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
1750 } | |
1751 - | |
1752 + | |
1753 /* The trailing edge of the window goes into the delay line */ | |
1754 delay_ptr = delay; | |
1755 | |
1756 @@ -232,13 +382,717 @@ | |
1757 *delay_ptr++ = -buf[64+i].real * *--window_ptr; | |
1758 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; | |
1759 } | |
1760 - | |
1761 + | |
1762 for(i=0; i<64; i++) { | |
1763 *delay_ptr++ = buf[i].imag * *--window_ptr; | |
1764 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; | |
1765 } | |
1766 } | |
1767 | |
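The four windowing loops at the end implement the 50%-overlap-add output stage: the 256 output samples are the windowed transform result plus the delay saved from the previous block, and the mirrored window tail is written back into the delay line for the next call. Stripped of the sign flips and mirrored indexing, the skeleton is (sketch only; the real loops interleave buf[].real/buf[].imag with sign changes and mirrored indices):

    static void overlap_add(float *out, float *delay,
                            const float *y, const float *win, float bias)
    {
        int n;
        for (n = 0; n < 256; n++)
            out[n] = y[n] * win[n] + delay[n] + bias;
        for (n = 0; n < 256; n++)
            delay[n] = y[n] * win[511 - n];   /* trailing window edge */
    }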
1768 +#ifdef HAVE_ALTIVEC | |
1769 + | |
1770 +#ifndef SYS_DARWIN | |
1771 +#include <altivec.h> | |
1772 +#endif | |
1773 + | |
1774 +// used to build registers permutation vectors (vcprm) | |
1775 +// the 's' are for words in the _s_econd vector | |
1776 +#define WORD_0 0x00,0x01,0x02,0x03 | |
1777 +#define WORD_1 0x04,0x05,0x06,0x07 | |
1778 +#define WORD_2 0x08,0x09,0x0a,0x0b | |
1779 +#define WORD_3 0x0c,0x0d,0x0e,0x0f | |
1780 +#define WORD_s0 0x10,0x11,0x12,0x13 | |
1781 +#define WORD_s1 0x14,0x15,0x16,0x17 | |
1782 +#define WORD_s2 0x18,0x19,0x1a,0x1b | |
1783 +#define WORD_s3 0x1c,0x1d,0x1e,0x1f | |
1784 + | |
1785 +#ifdef SYS_DARWIN | |
1786 +#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) | |
1787 +#else | |
1788 +#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} | |
1789 +#endif | |
1790 + | |
1791 +// vcprmle is used to keep the same index as in the SSE version. | |
1792 +// it's the same as vcprm, with the index reversed | |
1793 +// ('le' is Little Endian) | |
1794 +#define vcprmle(a,b,c,d) vcprm(d,c,b,a) | |
1795 + | |
1796 +// used to build inverse/identity vectors (vcii) | |
1797 +// n is _n_egative, p is _p_ositive | |
1798 +#define FLOAT_n -1. | |
1799 +#define FLOAT_p 1. | |
1800 + | |
1801 +#ifdef SYS_DARWIN | |
1802 +#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) | |
1803 +#else | |
1804 +#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} | |
1805 +#endif | |
1806 + | |
1807 +#ifdef SYS_DARWIN | |
1808 +#define FOUROF(a) (a) | |
1809 +#else | |
1810 +#define FOUROF(a) {a,a,a,a} | |
1811 +#endif | |
1812 + | |
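vcprm(a,b,c,d) assembles the WORD_* byte patterns into a vec_perm control vector that picks 32-bit words (an 's' prefix selecting from the second source), and vcii builds ±1.0 constants for sign-selective multiply-adds. For example, the first iteration below uses vcprm(2,3,0,1) to swap the two complex values inside one 128-bit register; as a standalone sketch (assumes an AltiVec-enabled compiler):

    #ifdef HAVE_ALTIVEC
    /* (r0,i0,r1,i1) -> (r1,i1,r0,i0): swap the two complex pairs. */
    static vector float swap_complex_pair(vector float v)
    {
        return vec_perm(v, v, vcprm(2,3,0,1));
    }
    #endif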
1813 + | |
1814 +void | |
1815 +imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias) | |
1816 +{ | |
1817 + int i; | |
1818 + int k; | |
1819 + int p,q; | |
1820 + int m; | |
1821 + int two_m; | |
1822 + int two_m_plus_one; | |
1823 + | |
1824 + sample_t tmp_b_i; | |
1825 + sample_t tmp_b_r; | |
1826 + sample_t tmp_a_i; | |
1827 + sample_t tmp_a_r; | |
1828 + | |
1829 + sample_t *data_ptr; | |
1830 + sample_t *delay_ptr; | |
1831 + sample_t *window_ptr; | |
1832 + | |
1833 + /* 512 IMDCT with source and dest data in 'data' */ | |
1834 + | |
1835 + /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering */ | |
1836 + for( i=0; i < 128; i++) { | |
1837 + /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ | |
1838 + int j= bit_reverse_512[i]; | |
1839 + buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); | |
1840 + buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); | |
1841 + } | |
1842 + | |
1843 + /* 1. iteration */ | |
1844 + for(i = 0; i < 128; i += 2) { | |
1845 +#if 0 | |
1846 + tmp_a_r = buf[i].real; | |
1847 + tmp_a_i = buf[i].imag; | |
1848 + tmp_b_r = buf[i+1].real; | |
1849 + tmp_b_i = buf[i+1].imag; | |
1850 + buf[i].real = tmp_a_r + tmp_b_r; | |
1851 + buf[i].imag = tmp_a_i + tmp_b_i; | |
1852 + buf[i+1].real = tmp_a_r - tmp_b_r; | |
1853 + buf[i+1].imag = tmp_a_i - tmp_b_i; | |
1854 +#else | |
1855 + vector float temp, bufv; | |
1856 + | |
1857 + bufv = vec_ld(i << 3, (float*)buf); | |
1858 + temp = vec_perm(bufv, bufv, vcprm(2,3,0,1)); | |
1859 + bufv = vec_madd(bufv, vcii(p,p,n,n), temp); | |
1860 + vec_st(bufv, i << 3, (float*)buf); | |
1861 +#endif | |
1862 + } | |
1863 + | |
1864 + /* 2. iteration */ | |
1865 + // Note w[1]={{1,0}, {0,-1}} | |
1866 + for(i = 0; i < 128; i += 4) { | |
1867 +#if 0 | |
1868 + tmp_a_r = buf[i].real; | |
1869 + tmp_a_i = buf[i].imag; | |
1870 + tmp_b_r = buf[i+2].real; | |
1871 + tmp_b_i = buf[i+2].imag; | |
1872 + buf[i].real = tmp_a_r + tmp_b_r; | |
1873 + buf[i].imag = tmp_a_i + tmp_b_i; | |
1874 + buf[i+2].real = tmp_a_r - tmp_b_r; | |
1875 + buf[i+2].imag = tmp_a_i - tmp_b_i; | |
1876 + tmp_a_r = buf[i+1].real; | |
1877 + tmp_a_i = buf[i+1].imag; | |
1878 + /* WARNING: im <-> re here ! */ | |
1879 + tmp_b_r = buf[i+3].imag; | |
1880 + tmp_b_i = buf[i+3].real; | |
1881 + buf[i+1].real = tmp_a_r + tmp_b_r; | |
1882 + buf[i+1].imag = tmp_a_i - tmp_b_i; | |
1883 + buf[i+3].real = tmp_a_r - tmp_b_r; | |
1884 + buf[i+3].imag = tmp_a_i + tmp_b_i; | |
1885 +#else | |
1886 + vector float buf01, buf23, temp1, temp2; | |
1887 + | |
1888 + buf01 = vec_ld((i + 0) << 3, (float*)buf); | |
1889 + buf23 = vec_ld((i + 2) << 3, (float*)buf); | |
1890 + buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2)); | |
1891 + | |
1892 + temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01); | |
1893 + temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01); | |
1894 + | |
1895 + vec_st(temp1, (i + 0) << 3, (float*)buf); | |
1896 + vec_st(temp2, (i + 2) << 3, (float*)buf); | |
1897 +#endif | |
1898 + } | |
1899 + | |
1900 + /* 3. iteration */ | |
1901 + for(i = 0; i < 128; i += 8) { | |
1902 +#if 0 | |
1903 + tmp_a_r = buf[i].real; | |
1904 + tmp_a_i = buf[i].imag; | |
1905 + tmp_b_r = buf[i+4].real; | |
1906 + tmp_b_i = buf[i+4].imag; | |
1907 + buf[i].real = tmp_a_r + tmp_b_r; | |
1908 + buf[i].imag = tmp_a_i + tmp_b_i; | |
1909 + buf[i+4].real = tmp_a_r - tmp_b_r; | |
1910 + buf[i+4].imag = tmp_a_i - tmp_b_i; | |
1911 + tmp_a_r = buf[1+i].real; | |
1912 + tmp_a_i = buf[1+i].imag; | |
1913 + tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; | |
1914 + tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; | |
1915 + buf[1+i].real = tmp_a_r + tmp_b_r; | |
1916 + buf[1+i].imag = tmp_a_i + tmp_b_i; | |
1917 + buf[i+5].real = tmp_a_r - tmp_b_r; | |
1918 + buf[i+5].imag = tmp_a_i - tmp_b_i; | |
1919 + tmp_a_r = buf[i+2].real; | |
1920 + tmp_a_i = buf[i+2].imag; | |
1921 + /* WARNING re <-> im & sign */ | |
1922 + tmp_b_r = buf[i+6].imag; | |
1923 + tmp_b_i = - buf[i+6].real; | |
1924 + buf[i+2].real = tmp_a_r + tmp_b_r; | |
1925 + buf[i+2].imag = tmp_a_i + tmp_b_i; | |
1926 + buf[i+6].real = tmp_a_r - tmp_b_r; | |
1927 + buf[i+6].imag = tmp_a_i - tmp_b_i; | |
1928 + tmp_a_r = buf[i+3].real; | |
1929 + tmp_a_i = buf[i+3].imag; | |
1930 + tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; | |
1931 + tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; | |
1932 + buf[i+3].real = tmp_a_r + tmp_b_r; | |
1933 + buf[i+3].imag = tmp_a_i + tmp_b_i; | |
1934 + buf[i+7].real = tmp_a_r - tmp_b_r; | |
1935 + buf[i+7].imag = tmp_a_i - tmp_b_i; | |
1936 +#else | |
1937 + vector float buf01, buf23, buf45, buf67; | |
1938 + | |
1939 + buf01 = vec_ld((i + 0) << 3, (float*)buf); | |
1940 + buf23 = vec_ld((i + 2) << 3, (float*)buf); | |
1941 + | |
1942 + tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; | |
1943 + tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; | |
1944 + buf[i+5].real = tmp_b_r; | |
1945 + buf[i+5].imag = tmp_b_i; | |
1946 + tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; | |
1947 + tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; | |
1948 + buf[i+7].real = tmp_b_r; | |
1949 + buf[i+7].imag = tmp_b_i; | |
1950 + | |
1951 + buf23 = vec_ld((i + 2) << 3, (float*)buf); | |
1952 + buf45 = vec_ld((i + 4) << 3, (float*)buf); | |
1953 + buf67 = vec_ld((i + 6) << 3, (float*)buf); | |
1954 + buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3)); | |
1955 + | |
1956 + vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf); | |
1957 + vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf); | |
1958 + vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf); | |
1959 + vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf); | |
1960 +#endif | |
1961 + } | |
1962 + | |
1963 + /* 4-7. iterations */ | |
1964 + for (m=3; m < 7; m++) { | |
1965 + two_m = (1 << m); | |
1966 + | |
1967 + two_m_plus_one = two_m<<1; | |
1968 + | |
1969 + for(i = 0; i < 128; i += two_m_plus_one) { | |
1970 + for(k = 0; k < two_m; k+=2) { | |
1971 +#if 0 | |
1972 + int p = k + i; | |
1973 + int q = p + two_m; | |
1974 + tmp_a_r = buf[p].real; | |
1975 + tmp_a_i = buf[p].imag; | |
1976 + tmp_b_r = | |
1977 + buf[q].real * w[m][k].real - | |
1978 + buf[q].imag * w[m][k].imag; | |
1979 + tmp_b_i = | |
1980 + buf[q].imag * w[m][k].real + | |
1981 + buf[q].real * w[m][k].imag; | |
1982 + buf[p].real = tmp_a_r + tmp_b_r; | |
1983 + buf[p].imag = tmp_a_i + tmp_b_i; | |
1984 + buf[q].real = tmp_a_r - tmp_b_r; | |
1985 + buf[q].imag = tmp_a_i - tmp_b_i; | |
1986 + | |
1987 + tmp_a_r = buf[(p + 1)].real; | |
1988 + tmp_a_i = buf[(p + 1)].imag; | |
1989 + tmp_b_r = | |
1990 + buf[(q + 1)].real * w[m][(k + 1)].real - | |
1991 + buf[(q + 1)].imag * w[m][(k + 1)].imag; | |
1992 + tmp_b_i = | |
1993 + buf[(q + 1)].imag * w[m][(k + 1)].real + | |
1994 + buf[(q + 1)].real * w[m][(k + 1)].imag; | |
1995 + buf[(p + 1)].real = tmp_a_r + tmp_b_r; | |
1996 + buf[(p + 1)].imag = tmp_a_i + tmp_b_i; | |
1997 + buf[(q + 1)].real = tmp_a_r - tmp_b_r; | |
1998 + buf[(q + 1)].imag = tmp_a_i - tmp_b_i; | |
1999 +#else | |
2000 + int p = k + i; | |
2001 + int q = p + two_m; | |
2002 + vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4; | |
2003 + const vector float vczero = (const vector float)FOUROF(0.); | |
2004 + // first compute buf[q] and buf[q+1] | |
2005 + vecq = vec_ld(q << 3, (float*)buf); | |
2006 + vecw = vec_ld(0, (float*)&(w[m][k])); | |
2007 + temp1 = vec_madd(vecq, vecw, vczero); | |
2008 + temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2)); | |
2009 + temp2 = vec_madd(temp2, vecw, vczero); | |
2010 + temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2)); | |
2011 + temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3)); | |
2012 + vecq = vec_madd(temp4, vcii(n,p,n,p), temp3); | |
2013 + // then butterfly with buf[p] and buf[p+1] | |
2014 + vecp = vec_ld(p << 3, (float*)buf); | |
2015 + | |
2016 + temp1 = vec_add(vecp, vecq); | |
2017 + temp2 = vec_sub(vecp, vecq); | |
2018 + | |
2019 + vec_st(temp1, p << 3, (float*)buf); | |
2020 + vec_st(temp2, q << 3, (float*)buf); | |
2021 +#endif | |
2022 + } | |
2023 + } | |
2024 + } | |
2025 + | |
2026 + /* Post IFFT complex multiply plus IFFT complex conjugate*/ | |
2027 + for( i=0; i < 128; i+=4) { | |
2028 + /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ | |
2029 +#if 0 | |
2030 + tmp_a_r = buf[(i + 0)].real; | |
2031 + tmp_a_i = -1.0 * buf[(i + 0)].imag; | |
2032 + buf[(i + 0)].real = | |
2033 + (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]); | |
2034 + buf[(i + 0)].imag = | |
2035 + (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]); | |
2036 + | |
2037 + tmp_a_r = buf[(i + 1)].real; | |
2038 + tmp_a_i = -1.0 * buf[(i + 1)].imag; | |
2039 + buf[(i + 1)].real = | |
2040 + (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]); | |
2041 + buf[(i + 1)].imag = | |
2042 + (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]); | |
2043 + | |
2044 + tmp_a_r = buf[(i + 2)].real; | |
2045 + tmp_a_i = -1.0 * buf[(i + 2)].imag; | |
2046 + buf[(i + 2)].real = | |
2047 + (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]); | |
2048 + buf[(i + 2)].imag = | |
2049 + (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]); | |
2050 + | |
2051 + tmp_a_r = buf[(i + 3)].real; | |
2052 + tmp_a_i = -1.0 * buf[(i + 3)].imag; | |
2053 + buf[(i + 3)].real = | |
2054 + (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]); | |
2055 + buf[(i + 3)].imag = | |
2056 + (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]); | |
2057 +#else | |
2058 + vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2; | |
2059 + vector float temp0022, temp1133, tempCS01; | |
2060 + const vector float vczero = (const vector float)FOUROF(0.); | |
2061 + | |
2062 + bufv_0 = vec_ld((i + 0) << 3, (float*)buf); | |
2063 + bufv_2 = vec_ld((i + 2) << 3, (float*)buf); | |
2064 + | |
2065 + cosv = vec_ld(i << 2, xcos1); | |
2066 + sinv = vec_ld(i << 2, xsin1); | |
2067 + | |
2068 + temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2)); | |
2069 + temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3)); | |
2070 + tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1)); | |
2071 + temp1 = vec_madd(temp0022, tempCS01, vczero); | |
2072 + tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1)); | |
2073 + temp2 = vec_madd(temp1133, tempCS01, vczero); | |
2074 + bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1); | |
2075 + | |
2076 + vec_st(bufv_0, (i + 0) << 3, (float*)buf); | |
2077 + | |
2078 + /* idem with bufv_2 and high-order cosv/sinv */ | |
2079 + | |
2080 + temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2)); | |
2081 + temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3)); | |
2082 + tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3)); | |
2083 + temp1 = vec_madd(temp0022, tempCS01, vczero); | |
2084 + tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3)); | |
2085 + temp2 = vec_madd(temp1133, tempCS01, vczero); | |
2086 + bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1); | |
2087 + | |
2088 + vec_st(bufv_2, (i + 2) << 3, (float*)buf); | |
2089 + | |
2090 +#endif | |
2091 + } | |
2092 + | |
2093 + data_ptr = data; | |
2094 + delay_ptr = delay; | |
2095 + window_ptr = imdct_window; | |
2096 + | |
2097 + /* Window and convert to real valued signal */ | |
2098 + for(i=0; i< 64; i++) { | |
2099 + *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
2100 + *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
2101 + } | |
2102 + | |
2103 + for(i=0; i< 64; i++) { | |
2104 + *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
2105 + *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
2106 + } | |
2107 + | |
2108 + /* The trailing edge of the window goes into the delay line */ | |
2109 + delay_ptr = delay; | |
2110 + | |
2111 + for(i=0; i< 64; i++) { | |
2112 + *delay_ptr++ = -buf[64+i].real * *--window_ptr; | |
2113 + *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; | |
2114 + } | |
2115 + | |
2116 + for(i=0; i<64; i++) { | |
2117 + *delay_ptr++ = buf[i].imag * *--window_ptr; | |
2118 + *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; | |
2119 + } | |
2120 +} | |
2121 +#endif | |
2122 + | |
2123 + | |
2124 +// Stuff below this line is borrowed from libac3 | |
2125 +#include "srfftp.h" | |
2126 +#ifdef ARCH_X86 | |
2127 +#ifndef HAVE_3DNOW | |
2128 +#define HAVE_3DNOW 1 | |
2129 +#endif | |
2130 +#include "srfftp_3dnow.h" | |
2131 + | |
2132 +const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }}; | |
2133 +const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }}; | |
2134 +const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 }; | |
2135 + | |
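x_plus_minus_3dnow and x_minus_plus_3dnow carry 0x80000000 over one 32-bit half so that a single pxor toggles the IEEE-754 sign bit of just the real or just the imaginary component of a packed complex value. The scalar equivalent of that trick:

    #include <stdint.h>
    #include <string.h>

    /* XORing the sign bit negates an IEEE-754 float, which is what
     * pxor against 0x80000000 does in the 3DNow! kernels. */
    static float flip_sign(float x)
    {
        uint32_t u;
        memcpy(&u, &x, sizeof u);
        u ^= 0x80000000u;
        memcpy(&x, &u, sizeof u);
        return x;
    }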
2136 +#undef HAVE_3DNOWEX | |
2137 +#include "imdct_3dnow.h" | |
2138 +#define HAVE_3DNOWEX | |
2139 +#include "imdct_3dnow.h" | |
2140 + | |
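Including imdct_3dnow.h twice, first with HAVE_3DNOWEX undefined and then defined, is a preprocessor templating trick: each pass expands the same source into a differently named function flavor (plain 3DNow! and 3DNow!Ex). The pattern in general form (the macro names here are hypothetical, not the ones imdct_3dnow.h actually uses):

    /* flavor.h -- include once per flavor */
    #ifdef USE_EXT
    #define FLAVOR(name) name ## _ext
    #else
    #define FLAVOR(name) name ## _base
    #endif

    static void FLAVOR(do_imdct)(void)
    {
        /* flavor-specific instruction sequence */
    }
    #undef FLAVOR

    /* consumer:
     *   #undef  USE_EXT
     *   #include "flavor.h"    -> defines do_imdct_base()
     *   #define USE_EXT 1
     *   #include "flavor.h"    -> defines do_imdct_ext()  */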
2141 +void | |
2142 +imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias) | |
2143 +{ | |
2144 +/* int i,k; | |
2145 + int p,q;*/ | |
2146 + int m; | |
2147 + int two_m; | |
2148 + int two_m_plus_one; | |
2149 + | |
2150 +/* sample_t tmp_a_i; | |
2151 + sample_t tmp_a_r; | |
2152 + sample_t tmp_b_i; | |
2153 + sample_t tmp_b_r;*/ | |
2154 + | |
2155 + sample_t *data_ptr; | |
2156 + sample_t *delay_ptr; | |
2157 + sample_t *window_ptr; | |
2158 + | |
2159 + /* 512 IMDCT with source and dest data in 'data' */ | |
2160 + /* see the c version (dct_do_512()), its allmost identical, just in C */ | |
2161 + | |
2162 + /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ | |
2163 + /* Bit reversed shuffling */ | |
2164 + asm volatile( | |
2165 + "xorl %%esi, %%esi \n\t" | |
2166 + "leal "MANGLE(bit_reverse_512)", %%eax \n\t" | |
2167 + "movl $1008, %%edi \n\t" | |
2168 + "pushl %%ebp \n\t" //use ebp without telling gcc | |
2169 + ".balign 16 \n\t" | |
2170 + "1: \n\t" | |
2171 + "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI | |
2172 + "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI | |
2173 + "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi | |
2174 + "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi | |
2175 + "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR | |
2176 + "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t" | |
2177 + "mulps %%xmm0, %%xmm2 \n\t" | |
2178 + "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI | |
2179 + "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" | |
2180 + "subps %%xmm0, %%xmm2 \n\t" | |
2181 + "movzbl (%%eax), %%edx \n\t" | |
2182 + "movzbl 1(%%eax), %%ebp \n\t" | |
2183 + "movlps %%xmm2, (%1, %%edx,8) \n\t" | |
2184 + "movhps %%xmm2, (%1, %%ebp,8) \n\t" | |
2185 + "addl $16, %%esi \n\t" | |
2186 + "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap | |
2187 + "subl $16, %%edi \n\t" | |
2188 + " jnc 1b \n\t" | |
2189 + "popl %%ebp \n\t"//no we didnt touch ebp *g* | |
2190 + :: "b" (data), "c" (buf) | |
2191 + : "%esi", "%edi", "%eax", "%edx" | |
2192 + ); | |
2193 + | |
2194 + | |
2195 + /* FFT Merge */ | |
2196 +/* unoptimized variant | |
2197 + for (m=1; m < 7; m++) { | |
2198 + if(m) | |
2199 + two_m = (1 << m); | |
2200 + else | |
2201 + two_m = 1; | |
2202 + | |
2203 + two_m_plus_one = (1 << (m+1)); | |
2204 + | |
2205 + for(i = 0; i < 128; i += two_m_plus_one) { | |
2206 + for(k = 0; k < two_m; k++) { | |
2207 + p = k + i; | |
2208 + q = p + two_m; | |
2209 + tmp_a_r = buf[p].real; | |
2210 + tmp_a_i = buf[p].imag; | |
2211 + tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
2212 + tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
2213 + buf[p].real = tmp_a_r + tmp_b_r; | |
2214 + buf[p].imag = tmp_a_i + tmp_b_i; | |
2215 + buf[q].real = tmp_a_r - tmp_b_r; | |
2216 + buf[q].imag = tmp_a_i - tmp_b_i; | |
2217 + } | |
2218 + } | |
2219 + } | |
2220 +*/ | |
2221 + | |
2222 + /* 1. iteration */ | |
2223 + // Note w[0][0]={1,0} | |
2224 + asm volatile( | |
2225 + "xorps %%xmm1, %%xmm1 \n\t" | |
2226 + "xorps %%xmm2, %%xmm2 \n\t" | |
2227 + "movl %0, %%esi \n\t" | |
2228 + ".balign 16 \n\t" | |
2229 + "1: \n\t" | |
2230 + "movlps (%%esi), %%xmm0 \n\t" //buf[p] | |
2231 + "movlps 8(%%esi), %%xmm1\n\t" //buf[q] | |
2232 + "movhps (%%esi), %%xmm0 \n\t" //buf[p] | |
2233 + "movhps 8(%%esi), %%xmm2\n\t" //buf[q] | |
2234 + "addps %%xmm1, %%xmm0 \n\t" | |
2235 + "subps %%xmm2, %%xmm0 \n\t" | |
2236 + "movaps %%xmm0, (%%esi) \n\t" | |
2237 + "addl $16, %%esi \n\t" | |
2238 + "cmpl %1, %%esi \n\t" | |
2239 + " jb 1b \n\t" | |
2240 + :: "g" (buf), "r" (buf + 128) | |
2241 + : "%esi" | |
2242 + ); | |
2243 + | |
2244 + /* 2. iteration */ | |
2245 + // Note w[1]={{1,0}, {0,-1}} | |
2246 + asm volatile( | |
2247 + "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 | |
2248 + "movl %0, %%esi \n\t" | |
2249 + ".balign 16 \n\t" | |
2250 + "1: \n\t" | |
2251 + "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3 | |
2252 + "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 | |
2253 + "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 | |
2254 + "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | |
2255 + "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1 | |
2256 + "addps %%xmm2, %%xmm0 \n\t" | |
2257 + "subps %%xmm2, %%xmm1 \n\t" | |
2258 + "movaps %%xmm0, (%%esi) \n\t" | |
2259 + "movaps %%xmm1, 16(%%esi) \n\t" | |
2260 + "addl $32, %%esi \n\t" | |
2261 + "cmpl %1, %%esi \n\t" | |
2262 + " jb 1b \n\t" | |
2263 + :: "g" (buf), "r" (buf + 128) | |
2264 + : "%esi" | |
2265 + ); | |
2266 + | |
2267 + /* 3. iteration */ | |
2268 +/* | |
2269 + Note sseW2+0={1,1,sqrt(2),sqrt(2)} | |
2270 + Note sseW2+16={0,0,sqrt(2),-sqrt(2)} | |
2271 + Note sseW2+32={0,0,-sqrt(2),-sqrt(2)} | |
2272 + Note sseW2+48={1,-1,sqrt(2),-sqrt(2)} | |
2273 +*/ | |
2274 + asm volatile( | |
2275 + "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" | |
2276 + "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" | |
2277 + "xorps %%xmm5, %%xmm5 \n\t" | |
2278 + "xorps %%xmm2, %%xmm2 \n\t" | |
2279 + "movl %0, %%esi \n\t" | |
2280 + ".balign 16 \n\t" | |
2281 + "1: \n\t" | |
2282 + "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5 | |
2283 + "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7 | |
2284 + "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 | |
2285 + "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 | |
2286 + "mulps %%xmm2, %%xmm4 \n\t" | |
2287 + "mulps %%xmm3, %%xmm5 \n\t" | |
2288 + "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 | |
2289 + "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 | |
2290 + "mulps %%xmm6, %%xmm3 \n\t" | |
2291 + "mulps %%xmm7, %%xmm2 \n\t" | |
2292 + "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | |
2293 + "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3 | |
2294 + "addps %%xmm4, %%xmm2 \n\t" | |
2295 + "addps %%xmm5, %%xmm3 \n\t" | |
2296 + "movaps %%xmm2, %%xmm4 \n\t" | |
2297 + "movaps %%xmm3, %%xmm5 \n\t" | |
2298 + "addps %%xmm0, %%xmm2 \n\t" | |
2299 + "addps %%xmm1, %%xmm3 \n\t" | |
2300 + "subps %%xmm4, %%xmm0 \n\t" | |
2301 + "subps %%xmm5, %%xmm1 \n\t" | |
2302 + "movaps %%xmm2, (%%esi) \n\t" | |
2303 + "movaps %%xmm3, 16(%%esi) \n\t" | |
2304 + "movaps %%xmm0, 32(%%esi) \n\t" | |
2305 + "movaps %%xmm1, 48(%%esi) \n\t" | |
2306 + "addl $64, %%esi \n\t" | |
2307 + "cmpl %1, %%esi \n\t" | |
2308 + " jb 1b \n\t" | |
2309 + :: "g" (buf), "r" (buf + 128) | |
2310 + : "%esi" | |
2311 + ); | |
2312 + | |
2313 + /* 4-7. iterations */ | |
2314 + for (m=3; m < 7; m++) { | |
2315 + two_m = (1 << m); | |
2316 + two_m_plus_one = two_m<<1; | |
2317 + asm volatile( | |
2318 + "movl %0, %%esi \n\t" | |
2319 + ".balign 16 \n\t" | |
2320 + "1: \n\t" | |
2321 + "xorl %%edi, %%edi \n\t" // k | |
2322 + "leal (%%esi, %3), %%edx \n\t" | |
2323 + "2: \n\t" | |
2324 + "movaps (%%edx, %%edi), %%xmm1 \n\t" | |
2325 + "movaps (%4, %%edi, 2), %%xmm2 \n\t" | |
2326 + "mulps %%xmm1, %%xmm2 \n\t" | |
2327 + "shufps $0xB1, %%xmm1, %%xmm1 \n\t" | |
2328 + "mulps 16(%4, %%edi, 2), %%xmm1 \n\t" | |
2329 + "movaps (%%esi, %%edi), %%xmm0 \n\t" | |
2330 + "addps %%xmm2, %%xmm1 \n\t" | |
2331 + "movaps %%xmm1, %%xmm2 \n\t" | |
2332 + "addps %%xmm0, %%xmm1 \n\t" | |
2333 + "subps %%xmm2, %%xmm0 \n\t" | |
2334 + "movaps %%xmm1, (%%esi, %%edi) \n\t" | |
2335 + "movaps %%xmm0, (%%edx, %%edi) \n\t" | |
2336 + "addl $16, %%edi \n\t" | |
2337 + "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0 | |
2338 + " jb 2b \n\t" | |
2339 + "addl %2, %%esi \n\t" | |
2340 + "cmpl %1, %%esi \n\t" | |
2341 + " jb 1b \n\t" | |
2342 + :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3), | |
2343 + "r" (sseW[m]) | |
2344 + : "%esi", "%edi", "%edx" | |
2345 + ); | |
2346 + } | |
2347 + | |
2348 + /* Post IFFT complex multiply plus IFFT complex conjugate*/ | |
2349 + asm volatile( | |
2350 + "movl $-1024, %%esi \n\t" | |
2351 + ".balign 16 \n\t" | |
2352 + "1: \n\t" | |
2353 + "movaps (%0, %%esi), %%xmm0 \n\t" | |
2354 + "movaps (%0, %%esi), %%xmm1 \n\t" | |
2355 + "shufps $0xB1, %%xmm0, %%xmm0 \n\t" | |
2356 + "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t" | |
2357 + "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" | |
2358 + "addps %%xmm1, %%xmm0 \n\t" | |
2359 + "movaps %%xmm0, (%0, %%esi) \n\t" | |
2360 + "addl $16, %%esi \n\t" | |
2361 + " jnz 1b \n\t" | |
2362 + :: "r" (buf+128) | |
2363 + : "%esi" | |
2364 + ); | |
2365 + | |
2366 + | |
2367 + data_ptr = data; | |
2368 + delay_ptr = delay; | |
2369 + window_ptr = imdct_window; | |
2370 + | |
2371 + /* Window and convert to real valued signal */ | |
2372 + asm volatile( | |
2373 + "xorl %%edi, %%edi \n\t" // 0 | |
2374 + "xorl %%esi, %%esi \n\t" // 0 | |
2375 + "movss %3, %%xmm2 \n\t" // bias | |
2376 + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | |
2377 + ".balign 16 \n\t" | |
2378 + "1: \n\t" | |
2379 + "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | |
2380 + "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | |
2381 + "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | |
2382 + "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | |
2383 + "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | |
2384 + "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | |
2385 + "addps (%2, %%esi), %%xmm0 \n\t" | |
2386 + "addps %%xmm2, %%xmm0 \n\t" | |
2387 + "movaps %%xmm0, (%1, %%esi) \n\t" | |
2388 + "addl $16, %%esi \n\t" | |
2389 + "subl $16, %%edi \n\t" | |
2390 + "cmpl $512, %%esi \n\t" | |
2391 + " jb 1b \n\t" | |
2392 + :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | |
2393 + : "%esi", "%edi" | |
2394 + ); | |
2395 + data_ptr+=128; | |
2396 + delay_ptr+=128; | |
2397 +// window_ptr+=128; | |
2398 + | |
2399 + asm volatile( | |
2400 + "movl $1024, %%edi \n\t" // 512 | |
2401 + "xorl %%esi, %%esi \n\t" // 0 | |
2402 + "movss %3, %%xmm2 \n\t" // bias | |
2403 + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | |
2404 + ".balign 16 \n\t" | |
2405 + "1: \n\t" | |
2406 + "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | |
2407 + "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | |
2408 + "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | |
2409 + "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | |
2410 + "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | |
2411 + "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | |
2412 + "addps (%2, %%esi), %%xmm0 \n\t" | |
2413 + "addps %%xmm2, %%xmm0 \n\t" | |
2414 + "movaps %%xmm0, (%1, %%esi) \n\t" | |
2415 + "addl $16, %%esi \n\t" | |
2416 + "subl $16, %%edi \n\t" | |
2417 + "cmpl $512, %%esi \n\t" | |
2418 + " jb 1b \n\t" | |
2419 + :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | |
2420 + : "%esi", "%edi" | |
2421 + ); | |
2422 + data_ptr+=128; | |
2423 +// window_ptr+=128; | |
2424 + | |
2425 + /* The trailing edge of the window goes into the delay line */ | |
2426 + delay_ptr = delay; | |
2427 + | |
2428 + asm volatile( | |
2429 + "xorl %%edi, %%edi \n\t" // 0 | |
2430 + "xorl %%esi, %%esi \n\t" // 0 | |
2431 + ".balign 16 \n\t" | |
2432 + "1: \n\t" | |
2433 + "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | |
2434 + "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | |
2435 + "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | |
2436 + "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | |
2437 + "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | |
2438 + "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | |
2439 + "movaps %%xmm0, (%1, %%esi) \n\t" | |
2440 + "addl $16, %%esi \n\t" | |
2441 + "subl $16, %%edi \n\t" | |
2442 + "cmpl $512, %%esi \n\t" | |
2443 + " jb 1b \n\t" | |
2444 + :: "r" (buf+64), "r" (delay_ptr) | |
2445 + : "%esi", "%edi" | |
2446 + ); | |
2447 + delay_ptr+=128; | |
2448 +// window_ptr-=128; | |
2449 + | |
2450 + asm volatile( | |
2451 + "movl $1024, %%edi \n\t" // 1024 | |
2452 + "xorl %%esi, %%esi \n\t" // 0 | |
2453 + ".balign 16 \n\t" | |
2454 + "1: \n\t" | |
2455 + "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | |
2456 + "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | |
2457 + "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | |
2458 + "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | |
2459 + "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | |
2460 + "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | |
2461 + "movaps %%xmm0, (%1, %%esi) \n\t" | |
2462 + "addl $16, %%esi \n\t" | |
2463 + "subl $16, %%edi \n\t" | |
2464 + "cmpl $512, %%esi \n\t" | |
2465 + " jb 1b \n\t" | |
2466 + :: "r" (buf), "r" (delay_ptr) | |
2467 + : "%esi", "%edi" | |
2468 + ); | |
2469 +} | |
2470 +#endif //arch_x86 | |
2471 + | |
2472 void | |
2473 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias) | |
2474 { | |
2475 @@ -379,13 +1233,19 @@ | |
2476 { | |
2477 int i, j, k; | |
2478 | |
2479 - fprintf (stderr, "No accelerated IMDCT transform found\n"); | |
2480 - | |
2481 /* Twiddle factors to turn IFFT into IMDCT */ | |
2482 for (i = 0; i < 128; i++) { | |
2483 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); | |
2484 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); | |
2485 } | |
2486 +#ifdef ARCH_X86 | |
2487 + for (i = 0; i < 128; i++) { | |
2488 + sseSinCos1c[2*i+0]= xcos1[i]; | |
2489 + sseSinCos1c[2*i+1]= -xcos1[i]; | |
2490 + sseSinCos1d[2*i+0]= xsin1[i]; | |
2491 + sseSinCos1d[2*i+1]= xsin1[i]; | |
2492 + } | |
2493 +#endif | |
2494 | |
2495 /* More twiddle factors to turn IFFT into IMDCT */ | |
2496 for (i = 0; i < 64; i++) { | |
2497 @@ -400,7 +1260,334 @@ | |
2498 w[i][k].imag = sin (-M_PI * k / j); | |
2499 } | |
2500 } | |
2501 +#ifdef ARCH_X86 | |
2502 + for (i = 1; i < 7; i++) { | |
2503 + j = 1 << i; | |
2504 + for (k = 0; k < j; k+=2) { | |
2505 + | |
2506 + sseW[i][4*k + 0] = w[i][k+0].real; | |
2507 + sseW[i][4*k + 1] = w[i][k+0].real; | |
2508 + sseW[i][4*k + 2] = w[i][k+1].real; | |
2509 + sseW[i][4*k + 3] = w[i][k+1].real; | |
2510 + | |
2511 + sseW[i][4*k + 4] = -w[i][k+0].imag; | |
2512 + sseW[i][4*k + 5] = w[i][k+0].imag; | |
2513 + sseW[i][4*k + 6] = -w[i][k+1].imag; | |
2514 + sseW[i][4*k + 7] = w[i][k+1].imag; | |
2515 + | |
2516 + //we multiply more or less uninitialized numbers, so we need to use exactly 0.0 | |
2517 + if(k==0) | |
2518 + { | |
2519 +// sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0; | |
2520 + sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0; | |
2521 + } | |
2522 + | |
2523 + if(2*k == j) | |
2524 + { | |
2525 + sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0; | |
2526 +// sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0); | |
2527 + } | |
2528 + } | |
2529 + } | |
2530 + | |
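Each sseW[m] entry packs a twiddle pair as {re,re,re,re} followed by {-im,im,-im,im}, so the mulps/shufps/addps sequence in the 4.-7. iteration loop evaluates two complex products per 16 bytes. Per element the packed arithmetic reduces to the ordinary complex multiply (scalar model of one lane pair, matching the layout written above):

    /* With tables {wr,wr} and {-wi,wi} and the shufps operand swap:
     *   t1 = (qr,qi) * (wr,wr)   = (qr*wr,  qi*wr)
     *   t2 = (qi,qr) * (-wi,wi)  = (-qi*wi, qr*wi)
     *   q' = t1 + t2             = (qr*wr - qi*wi, qi*wr + qr*wi)   */
    static void cmul_model(float qr, float qi, float wr, float wi,
                           float *out_r, float *out_i)
    {
        *out_r = qr * wr - qi * wi;
        *out_i = qi * wr + qr * wi;
    }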
2531 + for(i=0; i<128; i++) | |
2532 + { | |
2533 + sseWindow[2*i+0]= -imdct_window[2*i+0]; | |
2534 + sseWindow[2*i+1]= imdct_window[2*i+1]; | |
2535 + } | |
2536 + | |
2537 + for(i=0; i<64; i++) | |
2538 + { | |
2539 + sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1]; | |
2540 + sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0]; | |
2541 + sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1]; | |
2542 + sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0]; | |
2543 + } | |
2544 +#endif // arch_x86 | |
2545 + | |
2546 imdct_512 = imdct_do_512; | |
2547 +#ifdef ARCH_X86 | |
2548 + if(mm_accel & MM_ACCEL_X86_SSE) | |
2549 + { | |
2550 + fprintf (stderr, "Using SSE optimized IMDCT transform\n"); | |
2551 + imdct_512 = imdct_do_512_sse; | |
2552 + } | |
2553 + else | |
2554 + if(mm_accel & MM_ACCEL_X86_3DNOWEXT) | |
2555 + { | |
2556 + fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n"); | |
2557 + imdct_512 = imdct_do_512_3dnowex; | |
2558 + } | |
2559 + else | |
2560 + if(mm_accel & MM_ACCEL_X86_3DNOW) | |
2561 + { | |
2562 + fprintf (stderr, "Using 3DNow optimized IMDCT transform\n"); | |
2563 + imdct_512 = imdct_do_512_3dnow; | |
2564 + } | |
2565 + else | |
2566 +#endif // arch_x86 | |
2567 +#ifdef HAVE_ALTIVEC | |
2568 + if (mm_accel & MM_ACCEL_PPC_ALTIVEC) | |
2569 + { | |
2570 + fprintf(stderr, "Using AltiVec optimized IMDCT transform\n"); | |
2571 + imdct_512 = imdct_do_512_altivec; | |
2572 + } | |
2573 + else | |
2574 +#endif | |
2575 + fprintf (stderr, "No accelerated IMDCT transform found\n"); | |
2576 imdct_256 = imdct_do_256; | |
2577 } | |
2578 } | |
2579 + | |
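imdct_init probes mm_accel once and installs the fastest matching routine into the imdct_512 function pointer, so the per-block decode pays only an indirect call. The dispatch pattern in miniature (names hypothetical):

    #include <stdint.h>

    typedef void (*imdct_fn)(float data[], float delay[], float bias);

    static void imdct_generic(float d[], float dl[], float b)
    { (void)d; (void)dl; (void)b; /* portable C path */ }

    static void imdct_fast(float d[], float dl[], float b)
    { (void)d; (void)dl; (void)b; /* SIMD path */ }

    static imdct_fn imdct_512_impl = imdct_generic;

    static void init_dispatch(uint32_t accel, uint32_t fast_flag)
    {
        if (accel & fast_flag)
            imdct_512_impl = imdct_fast;   /* chosen once, called per block */
    }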
2580 +static void fft_asmb(int k, complex_t *x, complex_t *wTB, | |
2581 + const complex_t *d, const complex_t *d_3) | |
2582 +{ | |
2583 + register complex_t *x2k, *x3k, *x4k, *wB; | |
2584 + register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i; | |
2585 + | |
2586 + x2k = x + 2 * k; | |
2587 + x3k = x2k + 2 * k; | |
2588 + x4k = x3k + 2 * k; | |
2589 + wB = wTB + 2 * k; | |
2590 + | |
2591 + TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]); | |
2592 + TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]); | |
2593 + | |
2594 + --k; | |
2595 + for(;;) { | |
2596 + TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]); | |
2597 + TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]); | |
2598 + if (!--k) break; | |
2599 + x += 2; | |
2600 + x2k += 2; | |
2601 + x3k += 2; | |
2602 + x4k += 2; | |
2603 + d += 2; | |
2604 + d_3 += 2; | |
2605 + wTB += 2; | |
2606 + wB += 2; | |
2607 + } | |
2608 + | |
2609 +} | |
2610 + | |
2611 +static void fft_asmb16(complex_t *x, complex_t *wTB) | |
2612 +{ | |
2613 + register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i; | |
2614 + int k = 2; | |
2615 + | |
2616 + /* transform x[0], x[8], x[4], x[12] */ | |
2617 + TRANSZERO(x[0],x[4],x[8],x[12]); | |
2618 + | |
2619 + /* transform x[1], x[9], x[5], x[13] */ | |
2620 + TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]); | |
2621 + | |
2622 + /* transform x[2], x[10], x[6], x[14] */ | |
2623 + TRANSHALF_16(x[2],x[6],x[10],x[14]); | |
2624 + | |
2625 + /* transform x[3], x[11], x[7], x[15] */ | |
2626 + TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]); | |
2627 + | |
2628 +} | |
2629 + | |
2630 +static void fft_4(complex_t *x) | |
2631 +{ | |
2632 + /* delta_p = 1 here */ | |
2633 + /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} | |
2634 + */ | |
2635 + | |
2636 + register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i; | |
2637 + | |
2638 + yt_r = x[0].real; | |
2639 + yb_r = yt_r - x[2].real; | |
2640 + yt_r += x[2].real; | |
2641 + | |
2642 + u_r = x[1].real; | |
2643 + vi_i = x[3].real - u_r; | |
2644 + u_r += x[3].real; | |
2645 + | |
2646 + u_i = x[1].imag; | |
2647 + vi_r = u_i - x[3].imag; | |
2648 + u_i += x[3].imag; | |
2649 + | |
2650 + yt_i = yt_r; | |
2651 + yt_i += u_r; | |
2652 + x[0].real = yt_i; | |
2653 + yt_r -= u_r; | |
2654 + x[2].real = yt_r; | |
2655 + yt_i = yb_r; | |
2656 + yt_i += vi_r; | |
2657 + x[1].real = yt_i; | |
2658 + yb_r -= vi_r; | |
2659 + x[3].real = yb_r; | |
2660 + | |
2661 + yt_i = x[0].imag; | |
2662 + yb_i = yt_i - x[2].imag; | |
2663 + yt_i += x[2].imag; | |
2664 + | |
2665 + yt_r = yt_i; | |
2666 + yt_r += u_i; | |
2667 + x[0].imag = yt_r; | |
2668 + yt_i -= u_i; | |
2669 + x[2].imag = yt_i; | |
2670 + yt_r = yb_i; | |
2671 + yt_r += vi_i; | |
2672 + x[1].imag = yt_r; | |
2673 + yb_i -= vi_i; | |
2674 + x[3].imag = yb_i; | |
2675 +} | |
2676 + | |
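fft_4 and fft_8 are fully unrolled small DFT kernels (forward transform with w = e^{-2*pi*j/N}, as the comments state), written to reuse registers aggressively. When touching kernels like these, the usual cross-check is a naive O(N^2) reference DFT; a sketch:

    #include <math.h>

    typedef struct { float real, imag; } cnum;

    /* Reference DFT: X[k] = sum_i x[i] * e^{-2*pi*j*i*k/n}; for
     * verifying the unrolled kernels only, far too slow otherwise. */
    static void dft_ref(const cnum *x, cnum *X, int n)
    {
        int i, k;
        for (k = 0; k < n; k++) {
            double re = 0.0, im = 0.0;
            for (i = 0; i < n; i++) {
                double a = -2.0 * M_PI * (double)(i * k) / n;
                re += x[i].real * cos(a) - x[i].imag * sin(a);
                im += x[i].real * sin(a) + x[i].imag * cos(a);
            }
            X[k].real = (float)re;
            X[k].imag = (float)im;
        }
    }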
2677 + | |
2678 +static void fft_8(complex_t *x) | |
2679 +{ | |
2680 + /* delta_p = diag{1, sqrt(i)} here */ | |
2681 + /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} | |
2682 + */ | |
2683 + register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i; | |
2684 + | |
2685 + wT1_r = x[1].real; | |
2686 + wT1_i = x[1].imag; | |
2687 + wB1_r = x[3].real; | |
2688 + wB1_i = x[3].imag; | |
2689 + | |
2690 + x[1] = x[2]; | |
2691 + x[2] = x[4]; | |
2692 + x[3] = x[6]; | |
2693 + fft_4(&x[0]); | |
2694 + | |
2695 + | |
2696 + /* x[0] x[4] */ | |
2697 + wT2_r = x[5].real; | |
2698 + wT2_r += x[7].real; | |
2699 + wT2_r += wT1_r; | |
2700 + wT2_r += wB1_r; | |
2701 + wT2_i = wT2_r; | |
2702 + wT2_r += x[0].real; | |
2703 + wT2_i = x[0].real - wT2_i; | |
2704 + x[0].real = wT2_r; | |
2705 + x[4].real = wT2_i; | |
2706 + | |
2707 + wT2_i = x[5].imag; | |
2708 + wT2_i += x[7].imag; | |
2709 + wT2_i += wT1_i; | |
2710 + wT2_i += wB1_i; | |
2711 + wT2_r = wT2_i; | |
2712 + wT2_r += x[0].imag; | |
2713 + wT2_i = x[0].imag - wT2_i; | |
2714 + x[0].imag = wT2_r; | |
2715 + x[4].imag = wT2_i; | |
2716 + | |
2717 + /* x[2] x[6] */ | |
2718 + wT2_r = x[5].imag; | |
2719 + wT2_r -= x[7].imag; | |
2720 + wT2_r += wT1_i; | |
2721 + wT2_r -= wB1_i; | |
2722 + wT2_i = wT2_r; | |
2723 + wT2_r += x[2].real; | |
2724 + wT2_i = x[2].real - wT2_i; | |
2725 + x[2].real = wT2_r; | |
2726 + x[6].real = wT2_i; | |
2727 + | |
2728 + wT2_i = x[5].real; | |
2729 + wT2_i -= x[7].real; | |
2730 + wT2_i += wT1_r; | |
2731 + wT2_i -= wB1_r; | |
2732 + wT2_r = wT2_i; | |
2733 + wT2_r += x[2].imag; | |
2734 + wT2_i = x[2].imag - wT2_i; | |
2735 + x[2].imag = wT2_i; | |
2736 + x[6].imag = wT2_r; | |
2737 + | |
2738 + | |
2739 + /* x[1] x[5] */ | |
2740 + wT2_r = wT1_r; | |
2741 + wT2_r += wB1_i; | |
2742 + wT2_r -= x[5].real; | |
2743 + wT2_r -= x[7].imag; | |
2744 + wT2_i = wT1_i; | |
2745 + wT2_i -= wB1_r; | |
2746 + wT2_i -= x[5].imag; | |
2747 + wT2_i += x[7].real; | |
2748 + | |
2749 + wB2_r = wT2_r; | |
2750 + wB2_r += wT2_i; | |
2751 + wT2_i -= wT2_r; | |
2752 + wB2_r *= HSQRT2; | |
2753 + wT2_i *= HSQRT2; | |
2754 + wT2_r = wB2_r; | |
2755 + wB2_r += x[1].real; | |
2756 + wT2_r = x[1].real - wT2_r; | |
2757 + | |
2758 + wB2_i = x[5].real; | |
2759 + x[1].real = wB2_r; | |
2760 + x[5].real = wT2_r; | |
2761 + | |
2762 + wT2_r = wT2_i; | |
2763 + wT2_r += x[1].imag; | |
2764 + wT2_i = x[1].imag - wT2_i; | |
2765 + wB2_r = x[5].imag; | |
2766 + x[1].imag = wT2_r; | |
2767 + x[5].imag = wT2_i; | |
2768 + | |
2769 + /* x[3] x[7] */ | |
2770 + wT1_r -= wB1_i; | |
2771 + wT1_i += wB1_r; | |
2772 + wB1_r = wB2_i - x[7].imag; | |
2773 + wB1_i = wB2_r + x[7].real; | |
2774 + wT1_r -= wB1_r; | |
2775 + wT1_i -= wB1_i; | |
2776 + wB1_r = wT1_r + wT1_i; | |
2777 + wB1_r *= HSQRT2; | |
2778 + wT1_i -= wT1_r; | |
2779 + wT1_i *= HSQRT2; | |
2780 + wB2_r = x[3].real; | |
2781 + wB2_i = wB2_r + wT1_i; | |
2782 + wB2_r -= wT1_i; | |
2783 + x[3].real = wB2_i; | |
2784 + x[7].real = wB2_r; | |
2785 + wB2_i = x[3].imag; | |
2786 + wB2_r = wB2_i + wB1_r; | |
2787 + wB2_i -= wB1_r; | |
2788 + x[3].imag = wB2_i; | |
2789 + x[7].imag = wB2_r; | |
2790 +} | |
2791 + | |
2792 + | |
2793 +static void fft_128p(complex_t *a) | |
2794 +{ | |
2795 + fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]); | |
2796 + fft_asmb16(&a[0], &a[8]); | |
2797 + | |
2798 + fft_8(&a[16]), fft_8(&a[24]); | |
2799 + fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]); | |
2800 + | |
2801 + fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]); | |
2802 + fft_asmb16(&a[32], &a[40]); | |
2803 + | |
2804 + fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]); | |
2805 + fft_asmb16(&a[48], &a[56]); | |
2806 + | |
2807 + fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]); | |
2808 + | |
2809 + fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]); | |
2810 + /* fft_16(&a[64]); */ | |
2811 + fft_asmb16(&a[64], &a[72]); | |
2812 + | |
2813 + fft_8(&a[80]); fft_8(&a[88]); | |
2814 + | |
2815 + /* fft_32(&a[64]); */ | |
2816 + fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]); | |
2817 + | |
2818 + fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]); | |
2819 + /* fft_16(&a[96]); */ | |
2820 + fft_asmb16(&a[96], &a[104]); | |
2821 + | |
2822 + fft_8(&a[112]), fft_8(&a[120]); | |
2823 + /* fft_32(&a[96]); */ | |
2824 + fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]); | |
2825 + | |
2826 + /* fft_128(&a[0]); */ | |
2827 + fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); | |
2828 +} | |
2829 + | |
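fft_128p flattens a split-radix plan: fft_8/fft_4 leaves, fft_asmb16 merges to 16 points, then fft_asmb widens 32 -> 64 -> 128 with the delta tables as twiddles. The recursion it unrolls has roughly this shape (sketch; merge() stands in for fft_asmb/fft_asmb16 and leaf() for the small kernels):

    typedef struct { float real, imag; } cnum;

    static void leaf(cnum *x, int n)              { (void)x; (void)n; }
    static void merge(int k, cnum *x, cnum *wTB)  { (void)k; (void)x; (void)wTB; }

    /* Split radix: even half at n/2, two odd quarters at n/4,
     * then one merge pass; fft_128p() is this, unrolled for n = 128. */
    static void split_radix(cnum *a, int n)
    {
        if (n <= 8) { leaf(a, n); return; }       /* fft_4 / fft_8 */
        split_radix(a, n / 2);
        split_radix(a + n / 2,     n / 4);
        split_radix(a + 3 * n / 4, n / 4);
        merge(n / 8, a, a + n / 2);               /* cf. fft_asmb(n/8, ...) */
    }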
2830 + | |
2831 + | |
2832 --- liba52/imdct_mlib.c 2005-03-22 19:59:35.000000000 +0100 | |
2833 +++ imdct_mlib.c 2004-03-19 01:15:51.000000000 +0100 | |
2834 @@ -23,11 +29,11 @@ | |
2835 | |
2836 #ifdef LIBA52_MLIB | |
2837 | |
2838 -#include <inttypes.h> | |
2839 -#include <string.h> | |
2840 #include <mlib_types.h> | |
2841 #include <mlib_status.h> | |
2842 #include <mlib_signal.h> | |
2843 +#include <string.h> | |
2844 +#include <inttypes.h> | |
2845 | |
2846 #include "a52.h" | |
2847 #include "a52_internal.h" | |
2848 @@ -42,7 +48,7 @@ | |
2849 sample_t *data_ptr; | |
2850 sample_t *delay_ptr; | |
2851 sample_t *window_ptr; | |
2852 - sample_t tmp[256] __attribute__ ((__aligned__ (16))); | |
2853 + sample_t tmp[256] __attribute__((aligned(16))); | |
2854 int i; | |
2855 | |
2856 memcpy(tmp, data, 256 * sizeof(sample_t)); | |
2857 @@ -91,7 +97,7 @@ | |
2858 sample_t *data_ptr; | |
2859 sample_t *delay_ptr; | |
2860 sample_t *window_ptr; | |
2861 - sample_t tmp[256] __attribute__ ((__aligned__ (16))); | |
2862 + sample_t tmp[256] __attribute__((aligned(16))); | |
2863 int i; | |
2864 | |
2865 memcpy(tmp, data, 256 * sizeof(sample_t)); | |
2866 --- include/mm_accel.h 2005-03-22 19:58:53.000000000 +0100 | |
2867 +++ mm_accel.h 2004-03-19 01:15:52.000000000 +0100 | |
2868 @@ -19,12 +25,22 @@ | |
2869 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
2870 */ | |
2871 | |
2872 +#ifndef MM_ACCEL_H | |
2873 +#define MM_ACCEL_H | |
2874 + | |
2875 /* generic accelerations */ | |
2876 #define MM_ACCEL_MLIB 0x00000001 | |
2877 | |
2878 /* x86 accelerations */ | |
2879 #define MM_ACCEL_X86_MMX 0x80000000 | |
2880 #define MM_ACCEL_X86_3DNOW 0x40000000 | |
2881 +#define MM_ACCEL_X86_3DNOWEXT 0x08000000 | |
2882 #define MM_ACCEL_X86_MMXEXT 0x20000000 | |
2883 +#define MM_ACCEL_X86_SSE 0x10000000 | |
2884 + | |
2885 +/* PPC accelerations */ | |
2886 +#define MM_ACCEL_PPC_ALTIVEC 0x00010000 | |
2887 | |
2888 uint32_t mm_accel (void); | |
2889 + | |
2890 +#endif /* MM_ACCEL_H */ | |
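The new MM_ACCEL_X86_SSE, MM_ACCEL_X86_3DNOWEXT and MM_ACCEL_PPC_ALTIVEC values follow the existing one-bit-per-capability scheme, so callers probe and strip them with plain bitwise operations, e.g. (sketch):

    #include <inttypes.h>
    #include "mm_accel.h"

    static int have_sse(void)
    {
        return (mm_accel() & MM_ACCEL_X86_SSE) != 0;
    }

    static uint32_t accel_without_sse(uint32_t accel)
    {
        return accel & ~MM_ACCEL_X86_SSE;  /* e.g. when buffers can't be aligned */
    }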
2891 --- liba52/parse.c 2005-03-22 19:59:35.000000000 +0100 | |
2892 +++ parse.c 2004-04-01 15:41:29.000000000 +0200 | |
2893 @@ -21,21 +27,19 @@ | |
2894 | |
2895 #include "config.h" | |
2896 | |
2897 -#include <inttypes.h> | |
2898 #include <stdlib.h> | |
2899 #include <string.h> | |
2900 +#include <inttypes.h> | |
2901 | |
2902 #include "a52.h" | |
2903 #include "a52_internal.h" | |
2904 #include "bitstream.h" | |
2905 #include "tables.h" | |
2906 +#include "mm_accel.h" | |
2907 | |
2908 #ifdef HAVE_MEMALIGN | |
2909 /* some systems have memalign() but no declaration for it */ | |
2910 void * memalign (size_t align, size_t size); | |
2911 -#else | |
2912 -/* assume malloc alignment is sufficient */ | |
2913 -#define memalign(align,size) malloc (size) | |
2914 #endif | |
2915 | |
2916 typedef struct { | |
2917 @@ -54,12 +58,28 @@ | |
2918 sample_t * samples; | |
2919 int i; | |
2920 | |
2921 - imdct_init (mm_accel); | |
2922 - | |
2923 samples = memalign (16, 256 * 12 * sizeof (sample_t)); | |
2924 +#if defined(__MINGW32__) && defined(HAVE_SSE) | |
2925 + for(i=0;i<10;i++){ | |
2926 + if((int)samples%16){ | |
2927 + sample_t* samplestmp=malloc(256 * 12 * sizeof (sample_t)); | |
2928 + free(samples); | |
2929 + samples = samplestmp; | |
2930 + } | |
2931 + else break; | |
2932 + } | |
2933 +#endif | |
2934 + if(((int)samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){ | |
2935 + mm_accel &=~MM_ACCEL_X86_SSE; | |
2936 + printf("liba52: unable to get 16-byte-aligned memory, disabling use of SSE instructions\n"); | |
2937 + } | |
2938 + | |
2939 if (samples == NULL) | |
2940 - return NULL; | |
2941 - | |
2942 + return NULL; | |
2943 + | |
2944 + imdct_init (mm_accel); | |
2945 + downmix_accel_init(mm_accel); | |
2946 + | |
2947 for (i = 0; i < 256 * 12; i++) | |
2948 samples[i] = 0; | |
2949 | |
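The __MINGW32__ block above compensates for the missing memalign by re-allocating up to ten times in the hope of drawing a 16-byte-aligned pointer, and if that still fails the SSE path is dropped rather than risk faulting movaps stores. A deterministic alternative is to over-allocate and round the pointer up, at the price of keeping the raw pointer around for free() (sketch only, not what the patch does):

    #include <stdlib.h>
    #include <stdint.h>

    /* Returns a 16-byte-aligned view into a block of size+15 bytes;
     * *raw receives the pointer that must eventually be free()d. */
    static void *alloc_aligned16(size_t size, void **raw)
    {
        uint8_t *p = malloc(size + 15);
        if (!p)
            return NULL;
        *raw = p;
        return (void *)(((uintptr_t)p + 15) & ~(uintptr_t)15);
    }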
2950 @@ -124,7 +144,7 @@ | |
2951 state->acmod = acmod = buf[6] >> 5; | |
2952 | |
2953 bitstream_set_ptr (buf + 6); | |
2954 - bitstream_get (3); /* skip acmod we already parsed */ | |
2955 + bitstream_skip (3); /* skip acmod we already parsed */ | |
2956 | |
2957 if ((acmod == 2) && (bitstream_get (2) == 2)) /* dsurmod */ | |
2958 acmod = A52_DOLBY; | |
2959 @@ -144,7 +164,7 @@ | |
2960 if (state->lfeon && (*flags & A52_LFE)) | |
2961 state->output |= A52_LFE; | |
2962 *flags = state->output; | |
2963 - // the 2* compensates for differences in imdct | |
2964 + /* the 2* compensates for differences in imdct */ | |
2965 state->dynrng = state->level = 2 * *level; | |
2966 state->bias = bias; | |
2967 state->dynrnge = 1; | |
2968 @@ -152,28 +172,28 @@ | |
2969 | |
2970 chaninfo = !acmod; | |
2971 do { | |
2972 - bitstream_get (5); /* dialnorm */ | |
2973 + bitstream_skip (5); /* dialnorm */ | |
2974 if (bitstream_get (1)) /* compre */ | |
2975 - bitstream_get (8); /* compr */ | |
2976 + bitstream_skip (8); /* compr */ | |
2977 if (bitstream_get (1)) /* langcode */ | |
2978 - bitstream_get (8); /* langcod */ | |
2979 + bitstream_skip (8); /* langcod */ | |
2980 if (bitstream_get (1)) /* audprodie */ | |
2981 - bitstream_get (7); /* mixlevel + roomtyp */ | |
2982 + bitstream_skip (7); /* mixlevel + roomtyp */ | |
2983 } while (chaninfo--); | |
2984 | |
2985 - bitstream_get (2); /* copyrightb + origbs */ | |
2986 + bitstream_skip (2); /* copyrightb + origbs */ | |
2987 | |
2988 if (bitstream_get (1)) /* timecod1e */ | |
2989 - bitstream_get (14); /* timecod1 */ | |
2990 + bitstream_skip (14); /* timecod1 */ | |
2991 if (bitstream_get (1)) /* timecod2e */ | |
2992 - bitstream_get (14); /* timecod2 */ | |
2993 + bitstream_skip (14); /* timecod2 */ | |
2994 | |
2995 if (bitstream_get (1)) { /* addbsie */ | |
2996 int addbsil; | |
2997 | |
2998 addbsil = bitstream_get (6); | |
2999 do { | |
3000 - bitstream_get (8); /* addbsi */ | |
3001 + bitstream_skip (8); /* addbsi */ | |
3002 } while (addbsil--); | |
3003 } | |
3004 | |
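The bitstream_get() -> bitstream_skip() substitutions throughout parse.c mark every field whose value is discarded; that states the intent and lets the reader advance without assembling a result. Semantically the two relate as in this model (hedged; the real bitstream_skip lives in bitstream.h and may be implemented more directly):

    #include <inttypes.h>

    uint32_t bitstream_get(uint32_t num_bits);   /* from bitstream.h */

    /* Consume num_bits and drop the value. */
    static inline void bitstream_skip_model(uint32_t num_bits)
    {
        (void) bitstream_get(num_bits);
    }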
3005 @@ -647,7 +667,7 @@ | |
3006 if (parse_exponents (chexpstr[i], nchgrps, state->fbw_exp[i][0], | |
3007 state->fbw_exp[i] + 1)) | |
3008 return 1; | |
3009 - bitstream_get (2); /* gainrng */ | |
3010 + bitstream_skip (2); /* gainrng */ | |
3011 } | |
3012 if (lfeexpstr != EXP_REUSE) { | |
3013 do_bit_alloc |= 32; | |
3014 @@ -729,7 +749,7 @@ | |
3015 if (bitstream_get (1)) { /* skiple */ | |
3016 i = bitstream_get (9); /* skipl */ | |
3017 while (i--) | |
3018 - bitstream_get (8); | |
3019 + bitstream_skip (8); | |
3020 } | |
3021 | |
3022 if (state->output & A52_LFE) | |
3023 |