Mercurial > mplayer.hg
view liba52/liba52_changes.diff @ 26625:5b89b42f6d50
Only compile and use the libmpeg2 AltiVec code when AltiVec is available. The
AltiVec code needs -maltivec to compile, but with that flag AltiVec instructions
also appear in other places of the code, causing MPlayer to crash with SIGILL.
Somehow upstream libmpeg2 manages not to crash with SIGILL under what appear to be
the same circumstances. Enlightenment welcome.
author | diego |
---|---|
date | Sat, 03 May 2008 15:23:22 +0000 |
parents | 236ab58453f7 |
children | 2aadf9302854 |
line wrap: on
line source
--- include/a52.h 2006-06-12 15:04:57.000000000 +0200 +++ liba52/a52.h 2006-06-05 02:23:02.000000000 +0200 @@ -59,4 +63,9 @@ int a52_block (a52_state_t * state); void a52_free (a52_state_t * state); +void* a52_resample_init(uint32_t mm_accel,int flags,int chans); +extern int (* a52_resample) (float * _f, int16_t * s16); + +uint16_t crc16_block(uint8_t *data,uint32_t num_bytes); + #endif /* A52_H */ --- liba52/a52_internal.h 2006-06-12 15:05:07.000000000 +0200 +++ liba52/a52_internal.h 2006-06-05 02:23:02.000000000 +0200 @@ -103,18 +107,34 @@ #define DELTA_BIT_NONE (2) #define DELTA_BIT_RESERVED (3) +#ifdef ARCH_X86_64 +# define REG_a "rax" +# define REG_d "rdx" +# define REG_S "rsi" +# define REG_D "rdi" +# define REG_BP "rbp" +#else +# define REG_a "eax" +# define REG_d "edx" +# define REG_S "esi" +# define REG_D "edi" +# define REG_BP "ebp" +#endif + void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart, int start, int end, int fastleak, int slowleak, expbap_t * expbap); int a52_downmix_init (int input, int flags, sample_t * level, sample_t clev, sample_t slev); +void downmix_accel_init(uint32_t mm_accel); int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level, sample_t clev, sample_t slev); -void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias, +extern void (*a52_downmix) (sample_t * samples, int acmod, int output, sample_t bias, sample_t clev, sample_t slev); -void a52_upmix (sample_t * samples, int acmod, int output); +extern void (*a52_upmix) (sample_t * samples, int acmod, int output); void a52_imdct_init (uint32_t mm_accel); void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias); -void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias); +extern void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias); +void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias); --- liba52/bitstream.c 2006-06-12 15:05:07.000000000 +0200 +++ liba52/bitstream.c 
2006-06-05 02:23:02.000000000 +0200 @@ -31,6 +35,10 @@ #define BUFFER_SIZE 4096 +#ifdef ALT_BITSTREAM_READER +int indx=0; +#endif + void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf) { int align; @@ -38,6 +46,9 @@ align = (long)buf & 3; state->buffer_start = (uint32_t *) (buf - align); state->bits_left = 0; +#ifdef ALT_BITSTREAM_READER + indx=0; +#endif bitstream_get (state, align * 8); } --- liba52/bitstream.h 2006-06-12 15:05:07.000000000 +0200 +++ liba52/bitstream.h 2006-06-05 02:23:02.000000000 +0200 @@ -21,6 +25,48 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +/* code from ffmpeg/libavcodec */ +#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC_ == 3 && __GNUC_MINOR__ > 0) +# define always_inline __attribute__((always_inline)) inline +#else +# define always_inline inline +#endif + +#if defined(__sparc__) || defined(hpux) +/* + * the alt bitstream reader performs unaligned memory accesses; that doesn't work + * on sparc/hpux. For now, disable ALT_BITSTREAM_READER. + */ +#undef ALT_BITSTREAM_READER +#else +// alternative (faster) bitstram reader (reades upto 3 bytes over the end of the input) +#define ALT_BITSTREAM_READER + +/* used to avoid misaligned exceptions on some archs (alpha, ...) 
*/ +#if defined (ARCH_X86) || defined(ARCH_ARMV4L) +# define unaligned32(a) (*(uint32_t*)(a)) +#else +# ifdef __GNUC__ +static always_inline uint32_t unaligned32(const void *v) { + struct Unaligned { + uint32_t i; + } __attribute__((packed)); + + return ((const struct Unaligned *) v)->i; +} +# elif defined(__DECC) +static inline uint32_t unaligned32(const void *v) { + return *(const __unaligned uint32_t *) v; +} +# else +static inline uint32_t unaligned32(const void *v) { + return *(const uint32_t *) v; +} +# endif +#endif //!ARCH_X86 + +#endif + /* (stolen from the kernel) */ #ifdef WORDS_BIGENDIAN @@ -28,7 +74,7 @@ #else -# if 0 && defined (__i386__) +# if defined (__i386__) # define swab32(x) __i386_swab32(x) static inline const uint32_t __i386_swab32(uint32_t x) @@ -39,19 +85,34 @@ # else -# define swab32(x)\ -((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) | \ - (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3])) - +# define swab32(x) __generic_swab32(x) + static always_inline const uint32_t __generic_swab32(uint32_t x) + { + return ((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) | + (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3])); + } # endif #endif +#ifdef ALT_BITSTREAM_READER +extern int indx; +#endif + void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf); uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits); int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits); static inline uint32_t bitstream_get (a52_state_t * state, uint32_t num_bits) { +#ifdef ALT_BITSTREAM_READER + uint32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) ); + + result<<= (indx&0x07); + result>>= 32 - num_bits; + indx+= num_bits; + + return result; +#else uint32_t result; if (num_bits < state->bits_left) { @@ -61,10 +122,29 @@ } return a52_bitstream_get_bh (state, num_bits); +#endif +} + +static inline void bitstream_skip(a52_state_t * state, int num_bits) +{ +#ifdef ALT_BITSTREAM_READER + indx+= 
num_bits; +#else + bitstream_get(state, num_bits); +#endif } static inline int32_t bitstream_get_2 (a52_state_t * state, uint32_t num_bits) { +#ifdef ALT_BITSTREAM_READER + int32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) ); + + result<<= (indx&0x07); + result>>= 32 - num_bits; + indx+= num_bits; + + return result; +#else int32_t result; if (num_bits < state->bits_left) { @@ -74,4 +154,5 @@ } return a52_bitstream_get_bh_2 (state, num_bits); +#endif } --- liba52/downmix.c 2006-06-12 15:17:53.000000000 +0200 +++ liba52/downmix.c 2006-06-05 02:23:02.000000000 +0200 @@ -19,18 +23,46 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) */ #include "config.h" #include <string.h> #include <inttypes.h> #include "a52.h" #include "a52_internal.h" +#include "mm_accel.h" #define CONVERT(acmod,output) (((output) << 3) + (acmod)) + +void (*a52_downmix)(sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev)= NULL; +void (*a52_upmix)(sample_t * samples, int acmod, int output)= NULL; + +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev); +static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev); +static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev); +static void upmix_MMX (sample_t * samples, int acmod, int output); +static void upmix_C (sample_t * samples, int acmod, int output); + +void downmix_accel_init(uint32_t mm_accel) +{ + a52_upmix= upmix_C; + a52_downmix= downmix_C; +#if defined(ARCH_X86) || defined(ARCH_X86_64) + if(mm_accel & MM_ACCEL_X86_MMX) a52_upmix= upmix_MMX; + if(mm_accel & MM_ACCEL_X86_SSE) 
a52_downmix= downmix_SSE; + if(mm_accel & MM_ACCEL_X86_3DNOW) a52_downmix= downmix_3dnow; +#endif +} + int a52_downmix_init (int input, int flags, sample_t * level, sample_t clev, sample_t slev) { @@ -447,7 +479,7 @@ samples[i] = 0; } -void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias, +void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, sample_t clev, sample_t slev) { switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { @@ -559,7 +591,7 @@ break; case CONVERT (A52_3F2R, A52_2F1R): - mix3to2 (samples, bias); + mix3to2 (samples, bias); //FIXME possible bug? (output doesnt seem to be used) move2to1 (samples + 768, samples + 512, bias); break; @@ -583,12 +615,12 @@ break; case CONVERT (A52_3F1R, A52_3F2R): - memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t)); + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); break; } } -void a52_upmix (sample_t * samples, int acmod, int output) +void upmix_C (sample_t * samples, int acmod, int output) { switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { @@ -653,3 +685,1137 @@ goto mix_31to21; } } + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias) +{ + asm volatile( + "movlps %2, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" + "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" + "addps (%1, %%"REG_S"), %%xmm0 \n\t" + "addps 16(%1, %%"REG_S"), %%xmm1\n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps %%xmm7, %%xmm1 \n\t" + "movaps %%xmm0, (%1, %%"REG_S") \n\t" + "movaps %%xmm1, 16(%1, %%"REG_S")\n\t" + "add $32, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (src+256), "r" (dest+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix3to1_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: 
\n\t" + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" + "addps %%xmm7, %%xmm1 \n\t" + "addps %%xmm1, %%xmm0 \n\t" + "movaps %%xmm0, (%0, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix4to1_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" + "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps %%xmm1, %%xmm0 \n\t" + "movaps %%xmm0, (%0, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix5to1_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" + "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" + "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" + "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" + "addps %%xmm1, %%xmm0 \n\t" + "movaps %%xmm0, (%0, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix3to2_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" + "addps %%xmm7, %%xmm0 \n\t" //common + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" + "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%"REG_S") \n\t" + "movaps 
%%xmm2, 1024(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias) +{ + asm volatile( + "movlps %2, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t" + "addps %%xmm7, %%xmm0 \n\t" //common + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" + "movaps (%1, %%"REG_S"), %%xmm2 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%"REG_S") \n\t" + "movaps %%xmm2, (%1, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (left+256), "r" (right+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix21toS_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" // surround + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" + "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm7, %%xmm1 \n\t" + "addps %%xmm7, %%xmm2 \n\t" + "subps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%"REG_S") \n\t" + "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix31to2_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" + "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" + "addps %%xmm7, %%xmm0 \n\t" // common + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" + "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%"REG_S") \n\t" + "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" 
\n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix31toS_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" + "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround + "addps %%xmm7, %%xmm0 \n\t" // common + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" + "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "subps %%xmm3, %%xmm1 \n\t" + "addps %%xmm3, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%"REG_S") \n\t" + "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix22toS_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" + "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" + "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm7, %%xmm1 \n\t" + "addps %%xmm7, %%xmm2 \n\t" + "subps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%"REG_S") \n\t" + "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix32to2_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" + "addps %%xmm7, %%xmm0 \n\t" // common + "movaps %%xmm0, %%xmm1 \n\t" // common + "addps (%0, %%"REG_S"), %%xmm0 \n\t" + "addps 2048(%0, %%"REG_S"), %%xmm1\n\t" + "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" + "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" + 
"movaps %%xmm0, (%0, %%"REG_S") \n\t" + "movaps %%xmm1, 1024(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix32toS_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" + "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm7, %%xmm0 \n\t" // common + "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" + "movaps 2048(%0, %%"REG_S"), %%xmm3\n\t" + "subps %%xmm2, %%xmm1 \n\t" + "addps %%xmm2, %%xmm3 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm3 \n\t" + "movaps %%xmm1, (%0, %%"REG_S") \n\t" + "movaps %%xmm3, 1024(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias) +{ + asm volatile( + "movlps %2, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" + "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" + "addps 1024(%0, %%"REG_S"), %%xmm0\n\t" + "addps 1040(%0, %%"REG_S"), %%xmm1\n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps %%xmm7, %%xmm1 \n\t" + "movaps %%xmm0, (%1, %%"REG_S") \n\t" + "movaps %%xmm1, 16(%1, %%"REG_S")\n\t" + "add $32, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (src+256), "r" (dest+256), "m" (bias) + : "%"REG_S + ); +} + +static void zero_MMX(sample_t * samples) +{ + asm volatile( + "mov $-1024, %%"REG_S" \n\t" + "pxor %%mm0, %%mm0 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq %%mm0, (%0, %%"REG_S") \n\t" + "movq %%mm0, 8(%0, %%"REG_S") \n\t" + "movq %%mm0, 16(%0, %%"REG_S") \n\t" + "movq %%mm0, 24(%0, %%"REG_S") \n\t" + "add $32, %%"REG_S" \n\t" + " jnz 1b \n\t" + "emms" + :: "r" (samples+256) + : "%"REG_S + ); +} + +/* + I 
hope dest and src will be at least 8 byte aligned and size + will devide on 8 without remain + Note: untested and unused. +*/ +static void copy_MMX(void *dest,const void *src,unsigned size) +{ + unsigned i; + size /= 64; + for(i=0;i<size;i++) + { + __asm __volatile( + "movq %0, %%mm0\n\t" + "movq 8%0, %%mm1\n\t" + "movq 16%0, %%mm2\n\t" + "movq 24%0, %%mm3\n\t" + "movq 32%0, %%mm4\n\t" + "movq 40%0, %%mm5\n\t" + "movq 48%0, %%mm6\n\t" + "movq 56%0, %%mm7\n\t" + "movq %%mm0, %1\n\t" + "movq %%mm1, 8%1\n\t" + "movq %%mm2, 16%1\n\t" + "movq %%mm3, 24%1\n\t" + "movq %%mm4, 32%1\n\t" + "movq %%mm5, 40%1\n\t" + "movq %%mm6, 48%1\n\t" + "movq %%mm7, 56%1\n\t" + : + :"m"(src),"m"(dest)); + } +} + +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev) +{ + switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { + + case CONVERT (A52_CHANNEL, A52_CHANNEL2): + memcpy (samples, samples + 256, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_CHANNEL, A52_MONO): + case CONVERT (A52_STEREO, A52_MONO): + mix_2to1_SSE: + mix2to1_SSE (samples, samples + 256, bias); + break; + + case CONVERT (A52_2F1R, A52_MONO): + if (slev == 0) + goto mix_2to1_SSE; + case CONVERT (A52_3F, A52_MONO): + mix_3to1_SSE: + mix3to1_SSE (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_MONO): + if (slev == 0) + goto mix_3to1_SSE; + case CONVERT (A52_2F2R, A52_MONO): + if (slev == 0) + goto mix_2to1_SSE; + mix4to1_SSE (samples, bias); + break; + + case CONVERT (A52_3F2R, A52_MONO): + if (slev == 0) + goto mix_3to1_SSE; + mix5to1_SSE (samples, bias); + break; + + case CONVERT (A52_MONO, A52_DOLBY): + memcpy (samples + 256, samples, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F, A52_STEREO): + case CONVERT (A52_3F, A52_DOLBY): + mix_3to2_SSE: + mix3to2_SSE (samples, bias); + break; + + case CONVERT (A52_2F1R, A52_STEREO): + if (slev == 0) + break; + mix21to2_SSE (samples, samples + 256, bias); + break; + + case CONVERT 
(A52_2F1R, A52_DOLBY): + mix21toS_SSE (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_STEREO): + if (slev == 0) + goto mix_3to2_SSE; + mix31to2_SSE (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_DOLBY): + mix31toS_SSE (samples, bias); + break; + + case CONVERT (A52_2F2R, A52_STEREO): + if (slev == 0) + break; + mix2to1_SSE (samples, samples + 512, bias); + mix2to1_SSE (samples + 256, samples + 768, bias); + break; + + case CONVERT (A52_2F2R, A52_DOLBY): + mix22toS_SSE (samples, bias); + break; + + case CONVERT (A52_3F2R, A52_STEREO): + if (slev == 0) + goto mix_3to2_SSE; + mix32to2_SSE (samples, bias); + break; + + case CONVERT (A52_3F2R, A52_DOLBY): + mix32toS_SSE (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_3F): + if (slev == 0) + break; + mix21to2_SSE (samples, samples + 512, bias); + break; + + case CONVERT (A52_3F2R, A52_3F): + if (slev == 0) + break; + mix2to1_SSE (samples, samples + 768, bias); + mix2to1_SSE (samples + 512, samples + 1024, bias); + break; + + case CONVERT (A52_3F1R, A52_2F1R): + mix3to2_SSE (samples, bias); + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_2F2R, A52_2F1R): + mix2to1_SSE (samples + 512, samples + 768, bias); + break; + + case CONVERT (A52_3F2R, A52_2F1R): + mix3to2_SSE (samples, bias); //FIXME possible bug? 
(output doesnt seem to be used) + move2to1_SSE (samples + 768, samples + 512, bias); + break; + + case CONVERT (A52_3F2R, A52_3F1R): + mix2to1_SSE (samples + 768, samples + 1024, bias); + break; + + case CONVERT (A52_2F1R, A52_2F2R): + memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F1R, A52_2F2R): + mix3to2_SSE (samples, bias); + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F2R, A52_2F2R): + mix3to2_SSE (samples, bias); + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); + memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F1R, A52_3F2R): + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); + break; + } +} + +static void upmix_MMX (sample_t * samples, int acmod, int output) +{ + switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { + + case CONVERT (A52_CHANNEL, A52_CHANNEL2): + memcpy (samples + 256, samples, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F2R, A52_MONO): + zero_MMX (samples + 1024); + case CONVERT (A52_3F1R, A52_MONO): + case CONVERT (A52_2F2R, A52_MONO): + zero_MMX (samples + 768); + case CONVERT (A52_3F, A52_MONO): + case CONVERT (A52_2F1R, A52_MONO): + zero_MMX (samples + 512); + case CONVERT (A52_CHANNEL, A52_MONO): + case CONVERT (A52_STEREO, A52_MONO): + zero_MMX (samples + 256); + break; + + case CONVERT (A52_3F2R, A52_STEREO): + case CONVERT (A52_3F2R, A52_DOLBY): + zero_MMX (samples + 1024); + case CONVERT (A52_3F1R, A52_STEREO): + case CONVERT (A52_3F1R, A52_DOLBY): + zero_MMX (samples + 768); + case CONVERT (A52_3F, A52_STEREO): + case CONVERT (A52_3F, A52_DOLBY): + mix_3to2_MMX: + memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t)); + zero_MMX (samples + 256); + break; + + case CONVERT (A52_2F2R, A52_STEREO): + case CONVERT (A52_2F2R, A52_DOLBY): + zero_MMX (samples + 768); + case CONVERT (A52_2F1R, A52_STEREO): + case CONVERT (A52_2F1R, 
A52_DOLBY): + zero_MMX (samples + 512); + break; + + case CONVERT (A52_3F2R, A52_3F): + zero_MMX (samples + 1024); + case CONVERT (A52_3F1R, A52_3F): + case CONVERT (A52_2F2R, A52_2F1R): + zero_MMX (samples + 768); + break; + + case CONVERT (A52_3F2R, A52_3F1R): + zero_MMX (samples + 1024); + break; + + case CONVERT (A52_3F2R, A52_2F1R): + zero_MMX (samples + 1024); + case CONVERT (A52_3F1R, A52_2F1R): + mix_31to21_MMX: + memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); + goto mix_3to2_MMX; + + case CONVERT (A52_3F2R, A52_2F2R): + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); + goto mix_31to21_MMX; + } +} + +static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias) +{ + asm volatile( + "movd %2, %%mm7 \n\t" + "punpckldq %2, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %%"REG_S"), %%mm0 \n\t" + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" + "movq 16(%0, %%"REG_S"), %%mm2 \n\t" + "movq 24(%0, %%"REG_S"), %%mm3 \n\t" + "pfadd (%1, %%"REG_S"), %%mm0 \n\t" + "pfadd 8(%1, %%"REG_S"), %%mm1 \n\t" + "pfadd 16(%1, %%"REG_S"), %%mm2 \n\t" + "pfadd 24(%1, %%"REG_S"), %%mm3 \n\t" + "pfadd %%mm7, %%mm0 \n\t" + "pfadd %%mm7, %%mm1 \n\t" + "pfadd %%mm7, %%mm2 \n\t" + "pfadd %%mm7, %%mm3 \n\t" + "movq %%mm0, (%1, %%"REG_S") \n\t" + "movq %%mm1, 8(%1, %%"REG_S") \n\t" + "movq %%mm2, 16(%1, %%"REG_S") \n\t" + "movq %%mm3, 24(%1, %%"REG_S") \n\t" + "add $32, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (src+256), "r" (dest+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix3to1_3dnow (sample_t * samples, sample_t bias) +{ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %%"REG_S"), %%mm0 \n\t" + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" + "movq 1024(%0, %%"REG_S"), %%mm2\n\t" + "movq 1032(%0, %%"REG_S"), %%mm3\n\t" + "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" + "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" + "pfadd %%mm7, %%mm0 
\n\t" + "pfadd %%mm7, %%mm1 \n\t" + "pfadd %%mm2, %%mm0 \n\t" + "pfadd %%mm3, %%mm1 \n\t" + "movq %%mm0, (%0, %%"REG_S") \n\t" + "movq %%mm1, 8(%0, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix4to1_3dnow (sample_t * samples, sample_t bias) +{ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %%"REG_S"), %%mm0 \n\t" + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" + "movq 1024(%0, %%"REG_S"), %%mm2\n\t" + "movq 1032(%0, %%"REG_S"), %%mm3\n\t" + "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" + "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" + "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" + "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t" + "pfadd %%mm7, %%mm0 \n\t" + "pfadd %%mm7, %%mm1 \n\t" + "pfadd %%mm2, %%mm0 \n\t" + "pfadd %%mm3, %%mm1 \n\t" + "movq %%mm0, (%0, %%"REG_S") \n\t" + "movq %%mm1, 8(%0, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix5to1_3dnow (sample_t * samples, sample_t bias) +{ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %%"REG_S"), %%mm0 \n\t" + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" + "movq 1024(%0, %%"REG_S"), %%mm2\n\t" + "movq 1032(%0, %%"REG_S"), %%mm3\n\t" + "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" + "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" + "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" + "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t" + "pfadd %%mm7, %%mm0 \n\t" + "pfadd %%mm7, %%mm1 \n\t" + "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t" + "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t" + "pfadd %%mm2, %%mm0 \n\t" + "pfadd %%mm3, %%mm1 \n\t" + "movq %%mm0, (%0, %%"REG_S") \n\t" + "movq %%mm1, 8(%0, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix3to2_3dnow (sample_t * samples, 
sample_t bias) +{ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" + "pfadd %%mm7, %%mm0 \n\t" //common + "pfadd %%mm7, %%mm1 \n\t" //common + "movq (%0, %%"REG_S"), %%mm2 \n\t" + "movq 8(%0, %%"REG_S"), %%mm3 \n\t" + "movq 2048(%0, %%"REG_S"), %%mm4\n\t" + "movq 2056(%0, %%"REG_S"), %%mm5\n\t" + "pfadd %%mm0, %%mm2 \n\t" + "pfadd %%mm1, %%mm3 \n\t" + "pfadd %%mm0, %%mm4 \n\t" + "pfadd %%mm1, %%mm5 \n\t" + "movq %%mm2, (%0, %%"REG_S") \n\t" + "movq %%mm3, 8(%0, %%"REG_S") \n\t" + "movq %%mm4, 1024(%0, %%"REG_S")\n\t" + "movq %%mm5, 1032(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias) +{ + asm volatile( + "movd %2, %%mm7 \n\t" + "punpckldq %2, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq 1024(%1, %%"REG_S"), %%mm0\n\t" + "movq 1032(%1, %%"REG_S"), %%mm1\n\t" + "pfadd %%mm7, %%mm0 \n\t" //common + "pfadd %%mm7, %%mm1 \n\t" //common + "movq (%0, %%"REG_S"), %%mm2 \n\t" + "movq 8(%0, %%"REG_S"), %%mm3 \n\t" + "movq (%1, %%"REG_S"), %%mm4 \n\t" + "movq 8(%1, %%"REG_S"), %%mm5 \n\t" + "pfadd %%mm0, %%mm2 \n\t" + "pfadd %%mm1, %%mm3 \n\t" + "pfadd %%mm0, %%mm4 \n\t" + "pfadd %%mm1, %%mm5 \n\t" + "movq %%mm2, (%0, %%"REG_S") \n\t" + "movq %%mm3, 8(%0, %%"REG_S") \n\t" + "movq %%mm4, (%1, %%"REG_S") \n\t" + "movq %%mm5, 8(%1, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (left+256), "r" (right+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix21toS_3dnow (sample_t * samples, sample_t bias) +{ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq 2048(%0, %%"REG_S"), %%mm0\n\t" // surround + "movq 2056(%0, %%"REG_S"), %%mm1\n\t" 
// surround + "movq (%0, %%"REG_S"), %%mm2 \n\t" + "movq 8(%0, %%"REG_S"), %%mm3 \n\t" + "movq 1024(%0, %%"REG_S"), %%mm4\n\t" + "movq 1032(%0, %%"REG_S"), %%mm5\n\t" + "pfadd %%mm7, %%mm2 \n\t" + "pfadd %%mm7, %%mm3 \n\t" + "pfadd %%mm7, %%mm4 \n\t" + "pfadd %%mm7, %%mm5 \n\t" + "pfsub %%mm0, %%mm2 \n\t" + "pfsub %%mm1, %%mm3 \n\t" + "pfadd %%mm0, %%mm4 \n\t" + "pfadd %%mm1, %%mm5 \n\t" + "movq %%mm2, (%0, %%"REG_S") \n\t" + "movq %%mm3, 8(%0, %%"REG_S") \n\t" + "movq %%mm4, 1024(%0, %%"REG_S")\n\t" + "movq %%mm5, 1032(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix31to2_3dnow (sample_t * samples, sample_t bias) +{ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" + "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" + "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" + "pfadd %%mm7, %%mm0 \n\t" // common + "pfadd %%mm7, %%mm1 \n\t" // common + "movq (%0, %%"REG_S"), %%mm2 \n\t" + "movq 8(%0, %%"REG_S"), %%mm3 \n\t" + "movq 2048(%0, %%"REG_S"), %%mm4\n\t" + "movq 2056(%0, %%"REG_S"), %%mm5\n\t" + "pfadd %%mm0, %%mm2 \n\t" + "pfadd %%mm1, %%mm3 \n\t" + "pfadd %%mm0, %%mm4 \n\t" + "pfadd %%mm1, %%mm5 \n\t" + "movq %%mm2, (%0, %%"REG_S") \n\t" + "movq %%mm3, 8(%0, %%"REG_S") \n\t" + "movq %%mm4, 1024(%0, %%"REG_S")\n\t" + "movq %%mm5, 1032(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix31toS_3dnow (sample_t * samples, sample_t bias) +{ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" + "pfadd %%mm7, %%mm0 \n\t" // common + "pfadd %%mm7, %%mm1 \n\t" // common + "movq (%0, %%"REG_S"), %%mm2 \n\t" + "movq 
8(%0, %%"REG_S"), %%mm3 \n\t" + "movq 2048(%0, %%"REG_S"), %%mm4\n\t" + "movq 2056(%0, %%"REG_S"), %%mm5\n\t" + "pfadd %%mm0, %%mm2 \n\t" + "pfadd %%mm1, %%mm3 \n\t" + "pfadd %%mm0, %%mm4 \n\t" + "pfadd %%mm1, %%mm5 \n\t" + "movq 3072(%0, %%"REG_S"), %%mm0\n\t" // surround + "movq 3080(%0, %%"REG_S"), %%mm1\n\t" // surround + "pfsub %%mm0, %%mm2 \n\t" + "pfsub %%mm1, %%mm3 \n\t" + "pfadd %%mm0, %%mm4 \n\t" + "pfadd %%mm1, %%mm5 \n\t" + "movq %%mm2, (%0, %%"REG_S") \n\t" + "movq %%mm3, 8(%0, %%"REG_S") \n\t" + "movq %%mm4, 1024(%0, %%"REG_S")\n\t" + "movq %%mm5, 1032(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix22toS_3dnow (sample_t * samples, sample_t bias) +{ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq 2048(%0, %%"REG_S"), %%mm0\n\t" + "movq 2056(%0, %%"REG_S"), %%mm1\n\t" + "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround + "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround + "movq (%0, %%"REG_S"), %%mm2 \n\t" + "movq 8(%0, %%"REG_S"), %%mm3 \n\t" + "movq 1024(%0, %%"REG_S"), %%mm4\n\t" + "movq 1032(%0, %%"REG_S"), %%mm5\n\t" + "pfadd %%mm7, %%mm2 \n\t" + "pfadd %%mm7, %%mm3 \n\t" + "pfadd %%mm7, %%mm4 \n\t" + "pfadd %%mm7, %%mm5 \n\t" + "pfsub %%mm0, %%mm2 \n\t" + "pfsub %%mm1, %%mm3 \n\t" + "pfadd %%mm0, %%mm4 \n\t" + "pfadd %%mm1, %%mm5 \n\t" + "movq %%mm2, (%0, %%"REG_S") \n\t" + "movq %%mm3, 8(%0, %%"REG_S") \n\t" + "movq %%mm4, 1024(%0, %%"REG_S")\n\t" + "movq %%mm5, 1032(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void mix32to2_3dnow (sample_t * samples, sample_t bias) +{ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" + "pfadd 
%%mm7, %%mm0 \n\t" // common + "pfadd %%mm7, %%mm1 \n\t" // common + "movq %%mm0, %%mm2 \n\t" // common + "movq %%mm1, %%mm3 \n\t" // common + "pfadd (%0, %%"REG_S"), %%mm0 \n\t" + "pfadd 8(%0, %%"REG_S"), %%mm1 \n\t" + "pfadd 2048(%0, %%"REG_S"), %%mm2\n\t" + "pfadd 2056(%0, %%"REG_S"), %%mm3\n\t" + "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" + "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" + "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t" + "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t" + "movq %%mm0, (%0, %%"REG_S") \n\t" + "movq %%mm1, 8(%0, %%"REG_S") \n\t" + "movq %%mm2, 1024(%0, %%"REG_S")\n\t" + "movq %%mm3, 1032(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +/* todo: should be optimized better */ +static void mix32toS_3dnow (sample_t * samples, sample_t bias) +{ + asm volatile( + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" + "movq 1024(%0, %%"REG_S"), %%mm0\n\t" + "movq 1032(%0, %%"REG_S"), %%mm1\n\t" + "movq 3072(%0, %%"REG_S"), %%mm4\n\t" + "movq 3080(%0, %%"REG_S"), %%mm5\n\t" + "pfadd %%mm7, %%mm0 \n\t" // common + "pfadd %%mm7, %%mm1 \n\t" // common + "pfadd 4096(%0, %%"REG_S"), %%mm4\n\t" // surround + "pfadd 4104(%0, %%"REG_S"), %%mm5\n\t" // surround + "movq (%0, %%"REG_S"), %%mm2 \n\t" + "movq 8(%0, %%"REG_S"), %%mm3 \n\t" + "movq 2048(%0, %%"REG_S"), %%mm6\n\t" + "movq 2056(%0, %%"REG_S"), %%mm7\n\t" + "pfsub %%mm4, %%mm2 \n\t" + "pfsub %%mm5, %%mm3 \n\t" + "pfadd %%mm4, %%mm6 \n\t" + "pfadd %%mm5, %%mm7 \n\t" + "pfadd %%mm0, %%mm2 \n\t" + "pfadd %%mm1, %%mm3 \n\t" + "pfadd %%mm0, %%mm6 \n\t" + "pfadd %%mm1, %%mm7 \n\t" + "movq %%mm2, (%0, %%"REG_S") \n\t" + "movq %%mm3, 8(%0, %%"REG_S") \n\t" + "movq %%mm6, 1024(%0, %%"REG_S")\n\t" + "movq %%mm7, 1032(%0, %%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%"REG_S + ); +} + +static void move2to1_3dnow (sample_t * src, sample_t * dest, 
sample_t bias) +{ + asm volatile( + "movd %2, %%mm7 \n\t" + "punpckldq %2, %%mm7 \n\t" + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %%"REG_S"), %%mm0 \n\t" + "movq 8(%0, %%"REG_S"), %%mm1 \n\t" + "movq 16(%0, %%"REG_S"), %%mm2 \n\t" + "movq 24(%0, %%"REG_S"), %%mm3 \n\t" + "pfadd 1024(%0, %%"REG_S"), %%mm0\n\t" + "pfadd 1032(%0, %%"REG_S"), %%mm1\n\t" + "pfadd 1040(%0, %%"REG_S"), %%mm2\n\t" + "pfadd 1048(%0, %%"REG_S"), %%mm3\n\t" + "pfadd %%mm7, %%mm0 \n\t" + "pfadd %%mm7, %%mm1 \n\t" + "pfadd %%mm7, %%mm2 \n\t" + "pfadd %%mm7, %%mm3 \n\t" + "movq %%mm0, (%1, %%"REG_S") \n\t" + "movq %%mm1, 8(%1, %%"REG_S") \n\t" + "movq %%mm2, 16(%1, %%"REG_S") \n\t" + "movq %%mm3, 24(%1, %%"REG_S") \n\t" + "add $32, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (src+256), "r" (dest+256), "m" (bias) + : "%"REG_S + ); +} + +static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev) +{ + switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { + + case CONVERT (A52_CHANNEL, A52_CHANNEL2): + memcpy (samples, samples + 256, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_CHANNEL, A52_MONO): + case CONVERT (A52_STEREO, A52_MONO): + mix_2to1_3dnow: + mix2to1_3dnow (samples, samples + 256, bias); + break; + + case CONVERT (A52_2F1R, A52_MONO): + if (slev == 0) + goto mix_2to1_3dnow; + case CONVERT (A52_3F, A52_MONO): + mix_3to1_3dnow: + mix3to1_3dnow (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_MONO): + if (slev == 0) + goto mix_3to1_3dnow; + case CONVERT (A52_2F2R, A52_MONO): + if (slev == 0) + goto mix_2to1_3dnow; + mix4to1_3dnow (samples, bias); + break; + + case CONVERT (A52_3F2R, A52_MONO): + if (slev == 0) + goto mix_3to1_3dnow; + mix5to1_3dnow (samples, bias); + break; + + case CONVERT (A52_MONO, A52_DOLBY): + memcpy (samples + 256, samples, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F, A52_STEREO): + case CONVERT (A52_3F, A52_DOLBY): + mix_3to2_3dnow: + mix3to2_3dnow 
(samples, bias); + break; + + case CONVERT (A52_2F1R, A52_STEREO): + if (slev == 0) + break; + mix21to2_3dnow (samples, samples + 256, bias); + break; + + case CONVERT (A52_2F1R, A52_DOLBY): + mix21toS_3dnow (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_STEREO): + if (slev == 0) + goto mix_3to2_3dnow; + mix31to2_3dnow (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_DOLBY): + mix31toS_3dnow (samples, bias); + break; + + case CONVERT (A52_2F2R, A52_STEREO): + if (slev == 0) + break; + mix2to1_3dnow (samples, samples + 512, bias); + mix2to1_3dnow (samples + 256, samples + 768, bias); + break; + + case CONVERT (A52_2F2R, A52_DOLBY): + mix22toS_3dnow (samples, bias); + break; + + case CONVERT (A52_3F2R, A52_STEREO): + if (slev == 0) + goto mix_3to2_3dnow; + mix32to2_3dnow (samples, bias); + break; + + case CONVERT (A52_3F2R, A52_DOLBY): + mix32toS_3dnow (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_3F): + if (slev == 0) + break; + mix21to2_3dnow (samples, samples + 512, bias); + break; + + case CONVERT (A52_3F2R, A52_3F): + if (slev == 0) + break; + mix2to1_3dnow (samples, samples + 768, bias); + mix2to1_3dnow (samples + 512, samples + 1024, bias); + break; + + case CONVERT (A52_3F1R, A52_2F1R): + mix3to2_3dnow (samples, bias); + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_2F2R, A52_2F1R): + mix2to1_3dnow (samples + 512, samples + 768, bias); + break; + + case CONVERT (A52_3F2R, A52_2F1R): + mix3to2_3dnow (samples, bias); //FIXME possible bug? 
(output doesnt seem to be used) + move2to1_3dnow (samples + 768, samples + 512, bias); + break; + + case CONVERT (A52_3F2R, A52_3F1R): + mix2to1_3dnow (samples + 768, samples + 1024, bias); + break; + + case CONVERT (A52_2F1R, A52_2F2R): + memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F1R, A52_2F2R): + mix3to2_3dnow (samples, bias); + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F2R, A52_2F2R): + mix3to2_3dnow (samples, bias); + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); + memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F1R, A52_3F2R): + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); + break; + } + __asm __volatile("femms":::"memory"); +} + +#endif // ARCH_X86 || ARCH_X86_64 --- liba52/imdct.c 2008-02-19 00:18:33.000000000 +0100 +++ liba52/imdct.c 2008-02-19 00:16:40.000000000 +0100 @@ -22,6 +26,11 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) + * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru> + * michael did port them from libac3 (untested, perhaps totally broken) + * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org) */ #include "config.h" @@ -39,12 +48,49 @@ #include "a52.h" #include "a52_internal.h" #include "mm_accel.h" +#include "mangle.h" + +void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias); + +#ifdef RUNTIME_CPUDETECT +#undef HAVE_3DNOWEX +#endif typedef struct complex_s { sample_t real; sample_t imag; } complex_t; +static const int pm128[128] attribute_used __attribute__((aligned(16))) = +{ + 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, + 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 
44, 60, 76, 92, 108, 124, + 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, + 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, + 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, + 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, + 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, + 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 +}; + +static uint8_t attribute_used bit_reverse_512[] = { + 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70, + 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78, + 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74, + 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c, + 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72, + 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a, + 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76, + 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e, + 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71, + 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79, + 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75, + 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d, + 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73, + 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b, + 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77, + 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f}; + static uint8_t fftorder[] = { 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176, 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88, @@ -56,6 +102,40 @@ 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86 }; +static complex_t __attribute__((aligned(16))) buf[128]; + +/* Twiddle factor LUT */ +static complex_t __attribute__((aligned(16))) w_1[1]; +static complex_t __attribute__((aligned(16))) w_2[2]; +static complex_t __attribute__((aligned(16))) w_4[4]; +static complex_t __attribute__((aligned(16))) w_8[8]; +static complex_t __attribute__((aligned(16))) w_16[16]; +static complex_t __attribute__((aligned(16))) w_32[32]; +static complex_t 
__attribute__((aligned(16))) w_64[64]; +static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; + +/* Twiddle factors for IMDCT */ +static sample_t __attribute__((aligned(16))) xcos1[128]; +static sample_t __attribute__((aligned(16))) xsin1[128]; + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +// NOTE: SSE needs 16byte alignment or it will segfault +// +static float __attribute__((aligned(16))) sseSinCos1c[256]; +static float __attribute__((aligned(16))) sseSinCos1d[256]; +static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; +//static float __attribute__((aligned(16))) sseW0[4]; +static float __attribute__((aligned(16))) sseW1[8]; +static float __attribute__((aligned(16))) sseW2[16]; +static float __attribute__((aligned(16))) sseW3[32]; +static float __attribute__((aligned(16))) sseW4[64]; +static float __attribute__((aligned(16))) sseW5[128]; +static float __attribute__((aligned(16))) sseW6[256]; +static float __attribute__((aligned(16))) *sseW[7]= + {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; +static float __attribute__((aligned(16))) sseWindow[512]; +#endif + /* Root values for IFFT */ static sample_t roots16[3]; static sample_t roots32[7]; @@ -241,7 +321,7 @@ ifft_pass (buf, roots128 - 32, 32); } -void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias) +void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias) { int i, k; sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2; @@ -285,6 +365,707 @@ } } +#ifdef HAVE_ALTIVEC + +#ifdef HAVE_ALTIVEC_H +#include <altivec.h> +#endif + +// used to build registers permutation vectors (vcprm) +// the 's' are for words in the _s_econd vector +#define WORD_0 0x00,0x01,0x02,0x03 +#define WORD_1 0x04,0x05,0x06,0x07 +#define WORD_2 0x08,0x09,0x0a,0x0b +#define WORD_3 0x0c,0x0d,0x0e,0x0f +#define WORD_s0 0x10,0x11,0x12,0x13 +#define WORD_s1 0x14,0x15,0x16,0x17 +#define WORD_s2 0x18,0x19,0x1a,0x1b +#define WORD_s3 
0x1c,0x1d,0x1e,0x1f + +#ifdef __APPLE_CC__ +#define AVV(x...) (x) +#else +#define AVV(x...) {x} +#endif + +#define vcprm(a,b,c,d) (const vector unsigned char)AVV(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) +#define vcii(a,b,c,d) (const vector float)AVV(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) + +#define FOUROF(a) AVV(a,a,a,a) + +// vcprmle is used to keep the same index as in the SSE version. +// it's the same as vcprm, with the index inversed +// ('le' is Little Endian) +#define vcprmle(a,b,c,d) vcprm(d,c,b,a) + +// used to build inverse/identity vectors (vcii) +// n is _n_egative, p is _p_ositive +#define FLOAT_n -1. +#define FLOAT_p 1. + + +void +imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias) +{ + int i; + int k; + int p,q; + int m; + long two_m; + long two_m_plus_one; + + sample_t tmp_b_i; + sample_t tmp_b_r; + sample_t tmp_a_i; + sample_t tmp_a_r; + + sample_t *data_ptr; + sample_t *delay_ptr; + sample_t *window_ptr; + + /* 512 IMDCT with source and dest data in 'data' */ + + /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ + for( i=0; i < 128; i++) { + /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ + int j= bit_reverse_512[i]; + buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); + buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); + } + + /* 1. iteration */ + for(i = 0; i < 128; i += 2) { +#if 0 + tmp_a_r = buf[i].real; + tmp_a_i = buf[i].imag; + tmp_b_r = buf[i+1].real; + tmp_b_i = buf[i+1].imag; + buf[i].real = tmp_a_r + tmp_b_r; + buf[i].imag = tmp_a_i + tmp_b_i; + buf[i+1].real = tmp_a_r - tmp_b_r; + buf[i+1].imag = tmp_a_i - tmp_b_i; +#else + vector float temp, bufv; + + bufv = vec_ld(i << 3, (float*)buf); + temp = vec_perm(bufv, bufv, vcprm(2,3,0,1)); + bufv = vec_madd(bufv, vcii(p,p,n,n), temp); + vec_st(bufv, i << 3, (float*)buf); +#endif + } + + /* 2. 
iteration */ + // Note w[1]={{1,0}, {0,-1}} + for(i = 0; i < 128; i += 4) { +#if 0 + tmp_a_r = buf[i].real; + tmp_a_i = buf[i].imag; + tmp_b_r = buf[i+2].real; + tmp_b_i = buf[i+2].imag; + buf[i].real = tmp_a_r + tmp_b_r; + buf[i].imag = tmp_a_i + tmp_b_i; + buf[i+2].real = tmp_a_r - tmp_b_r; + buf[i+2].imag = tmp_a_i - tmp_b_i; + tmp_a_r = buf[i+1].real; + tmp_a_i = buf[i+1].imag; + /* WARNING: im <-> re here ! */ + tmp_b_r = buf[i+3].imag; + tmp_b_i = buf[i+3].real; + buf[i+1].real = tmp_a_r + tmp_b_r; + buf[i+1].imag = tmp_a_i - tmp_b_i; + buf[i+3].real = tmp_a_r - tmp_b_r; + buf[i+3].imag = tmp_a_i + tmp_b_i; +#else + vector float buf01, buf23, temp1, temp2; + + buf01 = vec_ld((i + 0) << 3, (float*)buf); + buf23 = vec_ld((i + 2) << 3, (float*)buf); + buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2)); + + temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01); + temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01); + + vec_st(temp1, (i + 0) << 3, (float*)buf); + vec_st(temp2, (i + 2) << 3, (float*)buf); +#endif + } + + /* 3. 
iteration */ + for(i = 0; i < 128; i += 8) { +#if 0 + tmp_a_r = buf[i].real; + tmp_a_i = buf[i].imag; + tmp_b_r = buf[i+4].real; + tmp_b_i = buf[i+4].imag; + buf[i].real = tmp_a_r + tmp_b_r; + buf[i].imag = tmp_a_i + tmp_b_i; + buf[i+4].real = tmp_a_r - tmp_b_r; + buf[i+4].imag = tmp_a_i - tmp_b_i; + tmp_a_r = buf[1+i].real; + tmp_a_i = buf[1+i].imag; + tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; + tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; + buf[1+i].real = tmp_a_r + tmp_b_r; + buf[1+i].imag = tmp_a_i + tmp_b_i; + buf[i+5].real = tmp_a_r - tmp_b_r; + buf[i+5].imag = tmp_a_i - tmp_b_i; + tmp_a_r = buf[i+2].real; + tmp_a_i = buf[i+2].imag; + /* WARNING re <-> im & sign */ + tmp_b_r = buf[i+6].imag; + tmp_b_i = - buf[i+6].real; + buf[i+2].real = tmp_a_r + tmp_b_r; + buf[i+2].imag = tmp_a_i + tmp_b_i; + buf[i+6].real = tmp_a_r - tmp_b_r; + buf[i+6].imag = tmp_a_i - tmp_b_i; + tmp_a_r = buf[i+3].real; + tmp_a_i = buf[i+3].imag; + tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; + tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; + buf[i+3].real = tmp_a_r + tmp_b_r; + buf[i+3].imag = tmp_a_i + tmp_b_i; + buf[i+7].real = tmp_a_r - tmp_b_r; + buf[i+7].imag = tmp_a_i - tmp_b_i; +#else + vector float buf01, buf23, buf45, buf67; + + buf01 = vec_ld((i + 0) << 3, (float*)buf); + buf23 = vec_ld((i + 2) << 3, (float*)buf); + + tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; + tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; + buf[i+5].real = tmp_b_r; + buf[i+5].imag = tmp_b_i; + tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; + tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; + buf[i+7].real = tmp_b_r; + buf[i+7].imag = tmp_b_i; + + buf23 = vec_ld((i + 2) << 3, (float*)buf); + buf45 = vec_ld((i + 4) << 3, (float*)buf); + buf67 = vec_ld((i + 6) << 3, (float*)buf); + buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3)); + + vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf); + 
vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf); + vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf); + vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf); +#endif + } + + /* 4-7. iterations */ + for (m=3; m < 7; m++) { + two_m = (1 << m); + + two_m_plus_one = two_m<<1; + + for(i = 0; i < 128; i += two_m_plus_one) { + for(k = 0; k < two_m; k+=2) { +#if 0 + int p = k + i; + int q = p + two_m; + tmp_a_r = buf[p].real; + tmp_a_i = buf[p].imag; + tmp_b_r = + buf[q].real * w[m][k].real - + buf[q].imag * w[m][k].imag; + tmp_b_i = + buf[q].imag * w[m][k].real + + buf[q].real * w[m][k].imag; + buf[p].real = tmp_a_r + tmp_b_r; + buf[p].imag = tmp_a_i + tmp_b_i; + buf[q].real = tmp_a_r - tmp_b_r; + buf[q].imag = tmp_a_i - tmp_b_i; + + tmp_a_r = buf[(p + 1)].real; + tmp_a_i = buf[(p + 1)].imag; + tmp_b_r = + buf[(q + 1)].real * w[m][(k + 1)].real - + buf[(q + 1)].imag * w[m][(k + 1)].imag; + tmp_b_i = + buf[(q + 1)].imag * w[m][(k + 1)].real + + buf[(q + 1)].real * w[m][(k + 1)].imag; + buf[(p + 1)].real = tmp_a_r + tmp_b_r; + buf[(p + 1)].imag = tmp_a_i + tmp_b_i; + buf[(q + 1)].real = tmp_a_r - tmp_b_r; + buf[(q + 1)].imag = tmp_a_i - tmp_b_i; +#else + int p = k + i; + int q = p + two_m; + vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4; + const vector float vczero = (const vector float)FOUROF(0.); + // first compute buf[q] and buf[q+1] + vecq = vec_ld(q << 3, (float*)buf); + vecw = vec_ld(0, (float*)&(w[m][k])); + temp1 = vec_madd(vecq, vecw, vczero); + temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2)); + temp2 = vec_madd(temp2, vecw, vczero); + temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2)); + temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3)); + vecq = vec_madd(temp4, vcii(n,p,n,p), temp3); + // then butterfly with buf[p] and buf[p+1] + vecp = vec_ld(p << 3, (float*)buf); + + temp1 = vec_add(vecp, vecq); + temp2 = vec_sub(vecp, vecq); + + vec_st(temp1, p << 3, (float*)buf); + vec_st(temp2, q << 3, (float*)buf); 
+#endif + } + } + } + + /* Post IFFT complex multiply plus IFFT complex conjugate*/ + for( i=0; i < 128; i+=4) { + /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ +#if 0 + tmp_a_r = buf[(i + 0)].real; + tmp_a_i = -1.0 * buf[(i + 0)].imag; + buf[(i + 0)].real = + (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]); + buf[(i + 0)].imag = + (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]); + + tmp_a_r = buf[(i + 1)].real; + tmp_a_i = -1.0 * buf[(i + 1)].imag; + buf[(i + 1)].real = + (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]); + buf[(i + 1)].imag = + (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]); + + tmp_a_r = buf[(i + 2)].real; + tmp_a_i = -1.0 * buf[(i + 2)].imag; + buf[(i + 2)].real = + (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]); + buf[(i + 2)].imag = + (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]); + + tmp_a_r = buf[(i + 3)].real; + tmp_a_i = -1.0 * buf[(i + 3)].imag; + buf[(i + 3)].real = + (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]); + buf[(i + 3)].imag = + (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]); +#else + vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2; + vector float temp0022, temp1133, tempCS01; + const vector float vczero = (const vector float)FOUROF(0.); + + bufv_0 = vec_ld((i + 0) << 3, (float*)buf); + bufv_2 = vec_ld((i + 2) << 3, (float*)buf); + + cosv = vec_ld(i << 2, xcos1); + sinv = vec_ld(i << 2, xsin1); + + temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2)); + temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3)); + tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1)); + temp1 = vec_madd(temp0022, tempCS01, vczero); + tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1)); + temp2 = vec_madd(temp1133, tempCS01, vczero); + bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1); + + vec_st(bufv_0, (i + 0) << 3, (float*)buf); + + /* idem with bufv_2 and high-order cosv/sinv */ + + temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2)); + temp1133 = vec_perm(bufv_2, bufv_2, 
vcprm(1,1,3,3)); + tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3)); + temp1 = vec_madd(temp0022, tempCS01, vczero); + tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3)); + temp2 = vec_madd(temp1133, tempCS01, vczero); + bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1); + + vec_st(bufv_2, (i + 2) << 3, (float*)buf); + +#endif + } + + data_ptr = data; + delay_ptr = delay; + window_ptr = a52_imdct_window; + + /* Window and convert to real valued signal */ + for(i=0; i< 64; i++) { + *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; + *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; + } + + for(i=0; i< 64; i++) { + *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; + *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; + } + + /* The trailing edge of the window goes into the delay line */ + delay_ptr = delay; + + for(i=0; i< 64; i++) { + *delay_ptr++ = -buf[64+i].real * *--window_ptr; + *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; + } + + for(i=0; i<64; i++) { + *delay_ptr++ = buf[i].imag * *--window_ptr; + *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; + } +} +#endif + + +// Stuff below this line is borrowed from libac3 +#include "srfftp.h" +#if defined(ARCH_X86) || defined(ARCH_X86_64) +#ifndef HAVE_3DNOW +#define HAVE_3DNOW 1 +#endif +#include "srfftp_3dnow.h" + +const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }}; +const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }}; +const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 }; + +#undef HAVE_3DNOWEX +#include "imdct_3dnow.h" +#define HAVE_3DNOWEX +#include "imdct_3dnow.h" + +void +imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias) +{ +/* int i,k; + int p,q;*/ + int m; + long two_m; + long two_m_plus_one; + long two_m_plus_one_shl3; + complex_t *buf_offset; + +/* sample_t tmp_a_i; + sample_t 
tmp_a_r; + sample_t tmp_b_i; + sample_t tmp_b_r;*/ + + sample_t *data_ptr; + sample_t *delay_ptr; + sample_t *window_ptr; + + /* 512 IMDCT with source and dest data in 'data' */ + /* see the c version (dct_do_512()), its allmost identical, just in C */ + + /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ + /* Bit reversed shuffling */ + asm volatile( + "xor %%"REG_S", %%"REG_S" \n\t" + "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t" + "mov $1008, %%"REG_D" \n\t" + "push %%"REG_BP" \n\t" //use ebp without telling gcc + ASMALIGN(4) + "1: \n\t" + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI + "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi + "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi + "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR + "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t" + "mulps %%xmm0, %%xmm2 \n\t" + "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI + "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t" + "subps %%xmm0, %%xmm2 \n\t" + "movzb (%%"REG_a"), %%"REG_d" \n\t" + "movzb 1(%%"REG_a"), %%"REG_BP" \n\t" + "movlps %%xmm2, (%1, %%"REG_d", 8) \n\t" + "movhps %%xmm2, (%1, %%"REG_BP", 8) \n\t" + "add $16, %%"REG_S" \n\t" + "add $2, %%"REG_a" \n\t" // avoid complex addressing for P4 crap + "sub $16, %%"REG_D" \n\t" + "jnc 1b \n\t" + "pop %%"REG_BP" \n\t"//no we didnt touch ebp *g* + :: "b" (data), "c" (buf) + : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d + ); + + + /* FFT Merge */ +/* unoptimized variant + for (m=1; m < 7; m++) { + if(m) + two_m = (1 << m); + else + two_m = 1; + + two_m_plus_one = (1 << (m+1)); + + for(i = 0; i < 128; i += two_m_plus_one) { + for(k = 0; k < two_m; k++) { + p = k + i; + q = p + two_m; + tmp_a_r = buf[p].real; + tmp_a_i = buf[p].imag; + tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; + tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; + buf[p].real = tmp_a_r + tmp_b_r; + buf[p].imag = tmp_a_i + tmp_b_i; + buf[q].real = tmp_a_r - tmp_b_r; 
+ buf[q].imag = tmp_a_i - tmp_b_i; + } + } + } +*/ + + /* 1. iteration */ + // Note w[0][0]={1,0} + asm volatile( + "xorps %%xmm1, %%xmm1 \n\t" + "xorps %%xmm2, %%xmm2 \n\t" + "mov %0, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p] + "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q] + "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p] + "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q] + "addps %%xmm1, %%xmm0 \n\t" + "subps %%xmm2, %%xmm0 \n\t" + "movaps %%xmm0, (%%"REG_S")\n\t" + "add $16, %%"REG_S" \n\t" + "cmp %1, %%"REG_S" \n\t" + " jb 1b \n\t" + :: "g" (buf), "r" (buf + 128) + : "%"REG_S + ); + + /* 2. iteration */ + // Note w[1]={{1,0}, {0,-1}} + asm volatile( + "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 + "mov %0, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3 + "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 + "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 + "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1 + "movaps (%%"REG_S"), %%xmm1 \n\t" //r0,i0,r1,i1 + "addps %%xmm2, %%xmm0 \n\t" + "subps %%xmm2, %%xmm1 \n\t" + "movaps %%xmm0, (%%"REG_S") \n\t" + "movaps %%xmm1, 16(%%"REG_S") \n\t" + "add $32, %%"REG_S" \n\t" + "cmp %1, %%"REG_S" \n\t" + " jb 1b \n\t" + :: "g" (buf), "r" (buf + 128) + : "%"REG_S + ); + + /* 3. 
iteration */ +/* + Note sseW2+0={1,1,sqrt(2),sqrt(2)) + Note sseW2+16={0,0,sqrt(2),-sqrt(2)) + Note sseW2+32={0,0,-sqrt(2),-sqrt(2)) + Note sseW2+48={1,-1,sqrt(2),-sqrt(2)) +*/ + asm volatile( + "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" + "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" + "xorps %%xmm5, %%xmm5 \n\t" + "xorps %%xmm2, %%xmm2 \n\t" + "mov %0, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5 + "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7 + "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 + "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 + "mulps %%xmm2, %%xmm4 \n\t" + "mulps %%xmm3, %%xmm5 \n\t" + "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 + "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 + "mulps %%xmm6, %%xmm3 \n\t" + "mulps %%xmm7, %%xmm2 \n\t" + "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1 + "movaps 16(%%"REG_S"), %%xmm1 \n\t" //r2,i2,r3,i3 + "addps %%xmm4, %%xmm2 \n\t" + "addps %%xmm5, %%xmm3 \n\t" + "movaps %%xmm2, %%xmm4 \n\t" + "movaps %%xmm3, %%xmm5 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "addps %%xmm1, %%xmm3 \n\t" + "subps %%xmm4, %%xmm0 \n\t" + "subps %%xmm5, %%xmm1 \n\t" + "movaps %%xmm2, (%%"REG_S") \n\t" + "movaps %%xmm3, 16(%%"REG_S") \n\t" + "movaps %%xmm0, 32(%%"REG_S") \n\t" + "movaps %%xmm1, 48(%%"REG_S") \n\t" + "add $64, %%"REG_S" \n\t" + "cmp %1, %%"REG_S" \n\t" + " jb 1b \n\t" + :: "g" (buf), "r" (buf + 128) + : "%"REG_S + ); + + /* 4-7. 
iterations */ + for (m=3; m < 7; m++) { + two_m = (1 << m); + two_m_plus_one = two_m<<1; + two_m_plus_one_shl3 = (two_m_plus_one<<3); + buf_offset = buf+128; + asm volatile( + "mov %0, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "xor %%"REG_D", %%"REG_D" \n\t" // k + "lea (%%"REG_S", %3), %%"REG_d" \n\t" + "2: \n\t" + "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t" + "movaps (%4, %%"REG_D", 2), %%xmm2 \n\t" + "mulps %%xmm1, %%xmm2 \n\t" + "shufps $0xB1, %%xmm1, %%xmm1 \n\t" + "mulps 16(%4, %%"REG_D", 2), %%xmm1 \n\t" + "movaps (%%"REG_S", %%"REG_D"), %%xmm0 \n\t" + "addps %%xmm2, %%xmm1 \n\t" + "movaps %%xmm1, %%xmm2 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "subps %%xmm2, %%xmm0 \n\t" + "movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t" + "movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t" + "add $16, %%"REG_D" \n\t" + "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0 + "jb 2b \n\t" + "add %2, %%"REG_S" \n\t" + "cmp %1, %%"REG_S" \n\t" + " jb 1b \n\t" + :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3), + "r" (sseW[m]) + : "%"REG_S, "%"REG_D, "%"REG_d + ); + } + + /* Post IFFT complex multiply plus IFFT complex conjugate*/ + asm volatile( + "mov $-1024, %%"REG_S" \n\t" + ASMALIGN(4) + "1: \n\t" + "movaps (%0, %%"REG_S"), %%xmm0 \n\t" + "movaps (%0, %%"REG_S"), %%xmm1 \n\t" + "shufps $0xB1, %%xmm0, %%xmm0 \n\t" + "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t" + "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t" + "addps %%xmm1, %%xmm0 \n\t" + "movaps %%xmm0, (%0, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (buf+128) + : "%"REG_S + ); + + + data_ptr = data; + delay_ptr = delay; + window_ptr = a52_imdct_window; + + /* Window and convert to real valued signal */ + asm volatile( + "xor %%"REG_D", %%"REG_D" \n\t" // 0 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 + "movss %3, %%xmm2 \n\t" // bias + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... + ASMALIGN(4) + "1: \n\t" + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? 
A ? + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? + "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A + "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" + "addps (%2, %%"REG_S"), %%xmm0 \n\t" + "addps %%xmm2, %%xmm0 \n\t" + "movaps %%xmm0, (%1, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + "sub $16, %%"REG_D" \n\t" + "cmp $512, %%"REG_S" \n\t" + " jb 1b \n\t" + :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) + : "%"REG_S, "%"REG_D + ); + data_ptr+=128; + delay_ptr+=128; +// window_ptr+=128; + + asm volatile( + "mov $1024, %%"REG_D" \n\t" // 512 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 + "movss %3, %%xmm2 \n\t" // bias + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... + ASMALIGN(4) + "1: \n\t" + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A + "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A + "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" + "addps (%2, %%"REG_S"), %%xmm0 \n\t" + "addps %%xmm2, %%xmm0 \n\t" + "movaps %%xmm0, (%1, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + "sub $16, %%"REG_D" \n\t" + "cmp $512, %%"REG_S" \n\t" + " jb 1b \n\t" + :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) + : "%"REG_S, "%"REG_D + ); + data_ptr+=128; +// window_ptr+=128; + + /* The trailing edge of the window goes into the delay line */ + delay_ptr = delay; + + asm volatile( + "xor %%"REG_D", %%"REG_D" \n\t" // 0 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 + ASMALIGN(4) + "1: \n\t" + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? 
A + "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A + "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" + "movaps %%xmm0, (%1, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + "sub $16, %%"REG_D" \n\t" + "cmp $512, %%"REG_S" \n\t" + " jb 1b \n\t" + :: "r" (buf+64), "r" (delay_ptr) + : "%"REG_S, "%"REG_D + ); + delay_ptr+=128; +// window_ptr-=128; + + asm volatile( + "mov $1024, %%"REG_D" \n\t" // 1024 + "xor %%"REG_S", %%"REG_S" \n\t" // 0 + ASMALIGN(4) + "1: \n\t" + "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? + "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? + "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? + "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? + "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A + "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" + "movaps %%xmm0, (%1, %%"REG_S") \n\t" + "add $16, %%"REG_S" \n\t" + "sub $16, %%"REG_D" \n\t" + "cmp $512, %%"REG_S" \n\t" + " jb 1b \n\t" + :: "r" (buf), "r" (delay_ptr) + : "%"REG_S, "%"REG_D + ); +} +#endif // ARCH_X86 || ARCH_X86_64 + void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias) { int i, k; @@ -364,7 +1145,7 @@ void a52_imdct_init (uint32_t mm_accel) { - int i, k; + int i, j, k; double sum; /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */ @@ -416,6 +1197,99 @@ post2[i].real = cos ((M_PI / 128) * (i + 0.5)); post2[i].imag = sin ((M_PI / 128) * (i + 0.5)); } + for (i = 0; i < 128; i++) { + xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); + xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); + } + for (i = 0; i < 7; i++) { + j = 1 << i; + for (k = 0; k < j; k++) { + w[i][k].real = cos (-M_PI * k / j); + w[i][k].imag = sin (-M_PI * k / j); + } + } +#if defined(ARCH_X86) || defined(ARCH_X86_64) + for (i = 0; i < 128; i++) { + sseSinCos1c[2*i+0]= xcos1[i]; + sseSinCos1c[2*i+1]= -xcos1[i]; + sseSinCos1d[2*i+0]= xsin1[i]; + sseSinCos1d[2*i+1]= xsin1[i]; + } + for (i = 1; i < 7; i++) { + j = 1 << i; + for (k = 0; k < j; k+=2) { + + sseW[i][4*k + 0] = 
w[i][k+0].real; + sseW[i][4*k + 1] = w[i][k+0].real; + sseW[i][4*k + 2] = w[i][k+1].real; + sseW[i][4*k + 3] = w[i][k+1].real; + + sseW[i][4*k + 4] = -w[i][k+0].imag; + sseW[i][4*k + 5] = w[i][k+0].imag; + sseW[i][4*k + 6] = -w[i][k+1].imag; + sseW[i][4*k + 7] = w[i][k+1].imag; + + //we multiply more or less uninitalized numbers so we need to use exactly 0.0 + if(k==0) + { +// sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0; + sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0; + } + + if(2*k == j) + { + sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0; +// sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0); + } + } + } + + for(i=0; i<128; i++) + { + sseWindow[2*i+0]= -a52_imdct_window[2*i+0]; + sseWindow[2*i+1]= a52_imdct_window[2*i+1]; + } + + for(i=0; i<64; i++) + { + sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1]; + sseWindow[256 + 2*i+1]= a52_imdct_window[254 - 2*i+0]; + sseWindow[384 + 2*i+0]= a52_imdct_window[126 - 2*i+1]; + sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0]; + } +#endif + a52_imdct_512 = imdct_do_512; + ifft128 = ifft128_c; + ifft64 = ifft64_c; + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + if(mm_accel & MM_ACCEL_X86_SSE) + { + fprintf (stderr, "Using SSE optimized IMDCT transform\n"); + a52_imdct_512 = imdct_do_512_sse; + } + else + if(mm_accel & MM_ACCEL_X86_3DNOWEXT) + { + fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n"); + a52_imdct_512 = imdct_do_512_3dnowex; + } + else + if(mm_accel & MM_ACCEL_X86_3DNOW) + { + fprintf (stderr, "Using 3DNow optimized IMDCT transform\n"); + a52_imdct_512 = imdct_do_512_3dnow; + } + else +#endif // ARCH_X86 || ARCH_X86_64 +#ifdef HAVE_ALTIVEC + if (mm_accel & MM_ACCEL_PPC_ALTIVEC) + { + fprintf(stderr, "Using AltiVec optimized IMDCT transform\n"); + a52_imdct_512 = imdct_do_512_altivec; + } + else +#endif #ifdef LIBA52_DJBFFT if (mm_accel & MM_ACCEL_DJBFFT) { @@ -426,7 +1300,5 @@ #endif { fprintf (stderr, "No accelerated IMDCT transform found\n"); - ifft128 = ifft128_c; - ifft64 = ifft64_c; } } --- 
include/mm_accel.h 2006-06-12 15:05:00.000000000 +0200 +++ liba52/mm_accel.h 2006-06-05 02:23:04.000000000 +0200 @@ -30,7 +34,12 @@ /* x86 accelerations */ #define MM_ACCEL_X86_MMX 0x80000000 #define MM_ACCEL_X86_3DNOW 0x40000000 +#define MM_ACCEL_X86_3DNOWEXT 0x08000000 #define MM_ACCEL_X86_MMXEXT 0x20000000 +#define MM_ACCEL_X86_SSE 0x10000000 + +/* PPC accelerations */ +#define MM_ACCEL_PPC_ALTIVEC 0x00010000 uint32_t mm_accel (void); --- liba52/parse.c 2006-12-05 08:08:01.000000000 +0100 +++ liba52/parse.c 2006-12-05 08:08:44.000000000 +0100 @@ -28,6 +28,7 @@ #include "config.h" #include <stdlib.h> +#include <stdio.h> #include <string.h> #include <inttypes.h> @@ -35,13 +36,12 @@ #include "a52_internal.h" #include "bitstream.h" #include "tables.h" +#include "mm_accel.h" +#include "libavutil/avutil.h" #ifdef HAVE_MEMALIGN /* some systems have memalign() but no declaration for it */ void * memalign (size_t align, size_t size); -#else -/* assume malloc alignment is sufficient */ -#define memalign(align,size) malloc (size) #endif typedef struct { @@ -64,7 +64,16 @@ if (state == NULL) return NULL; +#if defined(__MINGW32__) && defined(HAVE_SSE) + state->samples = av_malloc(256 * 12 * sizeof (sample_t)); +#else state->samples = memalign (16, 256 * 12 * sizeof (sample_t)); +#endif + if(((int)state->samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){ + mm_accel &=~MM_ACCEL_X86_SSE; + fprintf(stderr, "liba52: unable to get 16 byte aligned memory disabling usage of SSE instructions\n"); + } + if (state->samples == NULL) { free (state); return NULL; @@ -78,6 +87,7 @@ state->lfsr_state = 1; a52_imdct_init (mm_accel); + downmix_accel_init(mm_accel); return state; } @@ -145,7 +155,7 @@ state->acmod = acmod = buf[6] >> 5; a52_bitstream_set_ptr (state, buf + 6); - bitstream_get (state, 3); /* skip acmod we already parsed */ + bitstream_skip (state, 3); /* skip acmod we already parsed */ if ((acmod == 2) && (bitstream_get (state, 2) == 2)) /* dsurmod */ acmod = A52_DOLBY; @@ -176,28 
+186,28 @@ chaninfo = !acmod; do { - bitstream_get (state, 5); /* dialnorm */ + bitstream_skip (state, 5); /* dialnorm */ if (bitstream_get (state, 1)) /* compre */ - bitstream_get (state, 8); /* compr */ + bitstream_skip (state, 8); /* compr */ if (bitstream_get (state, 1)) /* langcode */ - bitstream_get (state, 8); /* langcod */ + bitstream_skip (state, 8); /* langcod */ if (bitstream_get (state, 1)) /* audprodie */ - bitstream_get (state, 7); /* mixlevel + roomtyp */ + bitstream_skip (state, 7); /* mixlevel + roomtyp */ } while (chaninfo--); - bitstream_get (state, 2); /* copyrightb + origbs */ + bitstream_skip (state, 2); /* copyrightb + origbs */ if (bitstream_get (state, 1)) /* timecod1e */ - bitstream_get (state, 14); /* timecod1 */ + bitstream_skip (state, 14); /* timecod1 */ if (bitstream_get (state, 1)) /* timecod2e */ - bitstream_get (state, 14); /* timecod2 */ + bitstream_skip (state, 14); /* timecod2 */ if (bitstream_get (state, 1)) { /* addbsie */ int addbsil; addbsil = bitstream_get (state, 6); do { - bitstream_get (state, 8); /* addbsi */ + bitstream_skip (state, 8); /* addbsi */ } while (addbsil--); } @@ -684,7 +694,7 @@ state->fbw_expbap[i].exp[0], state->fbw_expbap[i].exp + 1)) return 1; - bitstream_get (state, 2); /* gainrng */ + bitstream_skip (state, 2); /* gainrng */ } if (lfeexpstr != EXP_REUSE) { do_bit_alloc |= 32; @@ -759,7 +769,7 @@ if (bitstream_get (state, 1)) { /* skiple */ i = bitstream_get (state, 9); /* skipl */ while (i--) - bitstream_get (state, 8); + bitstream_skip (state, 8); } samples = state->samples; @@ -900,6 +910,10 @@ void a52_free (a52_state_t * state) { - free (state->samples); +#if defined(__MINGW32__) && defined(HAVE_SSE) + av_free (state->samples); +#else + free (state->samples); +#endif free (state); }