view liba52/liba52_changes.diff @ 30251:fbb33d643fe6

Remove hackish and non-functional fallback code that tried to let vidix compile with compilers lacking inline assembly. Almost no OS provides inb() etc. in its libraries, and removing the broken fallback code makes it compile on Solaris with the Sun C compiler.
author reimar
date Tue, 12 Jan 2010 20:20:49 +0000

--- include/a52.h	2006-06-12 15:04:57.000000000 +0200
+++ liba52/a52.h	2006-06-05 02:23:02.000000000 +0200
@@ -59,4 +66,9 @@
 int a52_block (a52_state_t * state);
 void a52_free (a52_state_t * state);

+void* a52_resample_init(uint32_t mm_accel,int flags,int chans);
+extern int (* a52_resample) (float * _f, int16_t * s16);
+
+uint16_t crc16_block(uint8_t *data,uint32_t num_bytes);
+
 #endif /* A52_H */
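For reference, a minimal usage sketch (not part of the patch) of the MPlayer-specific entry points added above. The 256-samples-per-channel block size, the 6-channel ceiling and the meaning of a52_resample()'s return value are assumptions for illustration; accel takes MM_ACCEL_* flags from mm_accel.h.

#include <inttypes.h>
#include "a52.h"

static int16_t pcm[256 * 6];     /* one block, assuming at most 6 channels */

/* choose a resampler implementation for the given acceleration flags
   (e.g. MM_ACCEL_X86_MMX) and output channel layout */
static void pick_resampler(uint32_t accel, int flags, int chans)
{
    a52_resample_init(accel, flags, chans);
}

/* convert one decoded block of floats to interleaved int16 PCM;
   the return value is assumed to be the number of samples written */
static int convert_block(float *samples)
{
    return a52_resample(samples, pcm);
}
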
--- liba52/a52_internal.h	2006-06-12 15:05:07.000000000 +0200
+++ liba52/a52_internal.h	2006-06-05 02:23:02.000000000 +0200
@@ -103,18 +107,34 @@
 #define DELTA_BIT_NONE (2)
 #define DELTA_BIT_RESERVED (3)

+#if ARCH_X86_64
+# define REG_a "rax"
+# define REG_d "rdx"
+# define REG_S "rsi"
+# define REG_D "rdi"
+# define REG_BP "rbp"
+#else
+# define REG_a "eax"
+# define REG_d "edx"
+# define REG_S "esi"
+# define REG_D "edi"
+# define REG_BP "ebp"
+#endif
+
 void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart,
 		       int start, int end, int fastleak, int slowleak,
 		       expbap_t * expbap);

 int a52_downmix_init (int input, int flags, sample_t * level,
 		      sample_t clev, sample_t slev);
+void downmix_accel_init(uint32_t mm_accel);
 int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
 		       sample_t clev, sample_t slev);
-void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias,
+extern void (*a52_downmix) (sample_t * samples, int acmod, int output, sample_t bias,
 		  sample_t clev, sample_t slev);
-void a52_upmix (sample_t * samples, int acmod, int output);
+extern void (*a52_upmix) (sample_t * samples, int acmod, int output);

 void a52_imdct_init (uint32_t mm_accel);
 void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias);
-void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias);
+extern void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
+void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias);
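The REG_* macros above let one inline-asm template build on both 32-bit and 64-bit x86. A minimal sketch of the pattern (GCC-style inline assembly; the function itself is illustrative, not part of liba52):

/* Clears 256 floats with the same negative-byte-index loop idiom used by the
   SSE/3DNow! routines later in this patch; "REG_S" expands to "esi" or "rsi",
   so string concatenation yields valid asm for either word size. */
static void zero256(float *p)
{
    __asm__ volatile(
        "mov $-1024, %%"REG_S"          \n\t"   /* byte offset runs from -1024 up to 0 */
        "1:                             \n\t"
        "movl $0, (%0, %%"REG_S")       \n\t"   /* store 4 zero bytes */
        "add $4, %%"REG_S"              \n\t"
        "jnz 1b                         \n\t"
        :: "r" (p + 256)                        /* base points one block past the start */
        : "%"REG_S, "memory"
    );
}
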
--- liba52/bitstream.c	2006-06-12 15:05:07.000000000 +0200
+++ liba52/bitstream.c	2006-06-05 02:23:02.000000000 +0200
@@ -31,6 +35,10 @@

 #define BUFFER_SIZE 4096

+#ifdef ALT_BITSTREAM_READER
+int indx=0;
+#endif
+
 void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf)
 {
     int align;
@@ -38,6 +46,9 @@
     align = (long)buf & 3;
     state->buffer_start = (uint32_t *) (buf - align);
     state->bits_left = 0;
+#ifdef ALT_BITSTREAM_READER
+    indx=0;
+#endif
     bitstream_get (state, align * 8);
 }
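A consequence of the hunk above: with ALT_BITSTREAM_READER the read position is the global indx rather than part of a52_state_t, so one decoder instance cannot be used from two threads at once. Hypothetical usage sketch ("frame" is an illustrative caller-provided buffer):

static int frame_looks_valid(a52_state_t *state, uint8_t *frame)
{
    a52_bitstream_set_ptr(state, frame);        /* resets indx and consumes alignment bits */
    return bitstream_get(state, 16) == 0x0b77;  /* AC-3 syncword */
}
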

--- liba52/bitstream.h	2006-06-12 15:05:07.000000000 +0200
+++ liba52/bitstream.h	2006-06-05 02:23:02.000000000 +0200
@@ -21,6 +25,42 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */

+/* code from ffmpeg/libavcodec */
+#if defined(__sparc__) || defined(hpux)
+/*
+ * the alt bitstream reader performs unaligned memory accesses; that doesn't work
+ * on sparc/hpux.  For now, disable ALT_BITSTREAM_READER.
+ */
+#undef	ALT_BITSTREAM_READER
+#else
+// alternative (faster) bitstream reader (reads up to 3 bytes past the end of the input)
+#define ALT_BITSTREAM_READER
+
+/* used to avoid misaligned exceptions on some archs (alpha, ...) */
+#if ARCH_X86 || HAVE_ARMV6
+#    define unaligned32(a) (*(uint32_t*)(a))
+#else
+#    ifdef __GNUC__
+static inline uint32_t unaligned32(const void *v) {
+    struct Unaligned {
+	uint32_t i;
+    } __attribute__((packed));
+
+    return ((const struct Unaligned *) v)->i;
+}
+#    elif defined(__DECC)
+static inline uint32_t unaligned32(const void *v) {
+    return *(const __unaligned uint32_t *) v;
+}
+#    else
+static inline uint32_t unaligned32(const void *v) {
+    return *(const uint32_t *) v;
+}
+#    endif
+#endif // !(ARCH_X86 || HAVE_ARMV6)
+
+#endif
+
 /* (stolen from the kernel) */
 #if HAVE_BIGENDIAN

@@ -28,7 +68,7 @@

 #else

-#	if 0 && defined (__i386__)
+#	if defined (__i386__)

 #	define swab32(x) __i386_swab32(x)
 	static inline const uint32_t __i386_swab32(uint32_t x)
@@ -39,19 +79,34 @@

 #	else

-#	define swab32(x)\
-((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) |  \
- (((uint8_t*)&x)[2] << 8)  | (((uint8_t*)&x)[3]))
-
+#	define swab32(x) __generic_swab32(x)
+	static inline const uint32_t __generic_swab32(uint32_t x)
+	{
+		return ((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) |
+		 (((uint8_t*)&x)[2] << 8)  | (((uint8_t*)&x)[3]));
+	}
 #	endif
 #endif

+#ifdef ALT_BITSTREAM_READER
+extern int indx;
+#endif
+
 void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf);
 uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits);
 int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits);

 static inline uint32_t bitstream_get (a52_state_t * state, uint32_t num_bits)
 {
+#ifdef ALT_BITSTREAM_READER
+    uint32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) );
+
+    result<<= (indx&0x07);
+    result>>= 32 - num_bits;
+    indx+= num_bits;
+
+    return result;
+#else
     uint32_t result;

     if (num_bits < state->bits_left) {
@@ -61,10 +116,29 @@
     }

     return a52_bitstream_get_bh (state, num_bits);
+#endif
+}
+
+static inline void bitstream_skip(a52_state_t * state, int num_bits)
+{
+#ifdef ALT_BITSTREAM_READER
+	indx+= num_bits;
+#else
+	bitstream_get(state, num_bits);
+#endif
 }

 static inline int32_t bitstream_get_2 (a52_state_t * state, uint32_t num_bits)
 {
+#ifdef ALT_BITSTREAM_READER
+    int32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) );
+
+    result<<= (indx&0x07);
+    result>>= 32 - num_bits;
+    indx+= num_bits;
+
+    return result;
+#else
     int32_t result;

     if (num_bits < state->bits_left) {
@@ -74,4 +148,5 @@
     }

     return a52_bitstream_get_bh_2 (state, num_bits);
+#endif
 }
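The ALT_BITSTREAM_READER path above keeps only a bit index instead of a cached word. An equivalent standalone sketch (not part of the patch), reusing swab32()/unaligned32() from this header and assuming 1 <= num_bits <= 32, since a shift by 32 would be undefined:

static uint32_t peek_bits(const uint8_t *buf, int bitpos, uint32_t num_bits)
{
    /* load 32 bits of the stream, most significant bit first, starting at
       the byte that contains bitpos (may read up to 3 bytes past the end) */
    uint32_t v = swab32(unaligned32(buf + (bitpos >> 3)));
    v <<= bitpos & 0x07;            /* drop the bits already consumed in that byte */
    return v >> (32 - num_bits);    /* keep only the requested bits */
}
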
--- liba52/downmix.c	2006-06-12 15:17:53.000000000 +0200
+++ liba52/downmix.c	2006-06-05 02:23:02.000000000 +0200
@@ -19,18 +23,46 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
  */

 #include "config.h"

 #include <string.h>
 #include <inttypes.h>

 #include "a52.h"
 #include "a52_internal.h"
+#include "mm_accel.h"

 #define CONVERT(acmod,output) (((output) << 3) + (acmod))

+
+void (*a52_downmix)(sample_t * samples, int acmod, int output, sample_t bias,
+	      sample_t clev, sample_t slev)= NULL;
+void (*a52_upmix)(sample_t * samples, int acmod, int output)= NULL;
+
+static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
+	      sample_t clev, sample_t slev);
+static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias,
+	      sample_t clev, sample_t slev);
+static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
+	      sample_t clev, sample_t slev);
+static void upmix_MMX (sample_t * samples, int acmod, int output);
+static void upmix_C (sample_t * samples, int acmod, int output);
+
+void downmix_accel_init(uint32_t mm_accel)
+{
+    a52_upmix= upmix_C;
+    a52_downmix= downmix_C;
+#if ARCH_X86 || ARCH_X86_64
+    if(mm_accel & MM_ACCEL_X86_MMX) a52_upmix= upmix_MMX;
+    if(mm_accel & MM_ACCEL_X86_SSE) a52_downmix= downmix_SSE;
+    if(mm_accel & MM_ACCEL_X86_3DNOW) a52_downmix= downmix_3dnow;
+#endif
+}
+
 int a52_downmix_init (int input, int flags, sample_t * level,
 		      sample_t clev, sample_t slev)
 {
@@ -447,7 +479,7 @@
 	samples[i] = 0;
 }

-void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias,
+void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
 		  sample_t clev, sample_t slev)
 {
     switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
@@ -559,7 +591,7 @@
 	break;

     case CONVERT (A52_3F2R, A52_2F1R):
-	mix3to2 (samples, bias);
+	mix3to2 (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
 	move2to1 (samples + 768, samples + 512, bias);
 	break;

@@ -583,12 +615,12 @@
 	break;

     case CONVERT (A52_3F1R, A52_3F2R):
-	memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t));
+	memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
 	break;
     }
 }

-void a52_upmix (sample_t * samples, int acmod, int output)
+void upmix_C (sample_t * samples, int acmod, int output)
 {
     switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

@@ -653,3 +685,1104 @@
 	goto mix_31to21;
     }
 }
+
+#if ARCH_X86 || ARCH_X86_64
+static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias)
+{
+	__asm__ volatile(
+	"movlps %2, %%xmm7		\n\t"
+	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+	"mov $-1024, %%"REG_S"		\n\t"
+	ASMALIGN(4)
+	"1:				\n\t"
+	"movaps (%0, %%"REG_S"), %%xmm0	\n\t"
+	"movaps 16(%0, %%"REG_S"), %%xmm1\n\t"
+	"addps (%1, %%"REG_S"), %%xmm0	\n\t"
+	"addps 16(%1, %%"REG_S"), %%xmm1\n\t"
+	"addps %%xmm7, %%xmm0		\n\t"
+	"addps %%xmm7, %%xmm1		\n\t"
+	"movaps %%xmm0, (%1, %%"REG_S")	\n\t"
+	"movaps %%xmm1, 16(%1, %%"REG_S")\n\t"
+	"add $32, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (src+256), "r" (dest+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix3to1_SSE (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movlps %1, %%xmm7		\n\t"
+	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+	"mov $-1024, %%"REG_S"		\n\t"
+	ASMALIGN(4)
+	"1:				\n\t"
+	"movaps (%0, %%"REG_S"), %%xmm0	\n\t"
+	"movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
+	"addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
+	"addps %%xmm7, %%xmm1		\n\t"
+	"addps %%xmm1, %%xmm0		\n\t"
+	"movaps %%xmm0, (%0, %%"REG_S")	\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix4to1_SSE (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movlps %1, %%xmm7		\n\t"
+	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+	"mov $-1024, %%"REG_S"		\n\t"
+	ASMALIGN(4)
+	"1:				\n\t"
+	"movaps (%0, %%"REG_S"), %%xmm0	\n\t"
+	"movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
+	"addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
+	"addps 3072(%0, %%"REG_S"), %%xmm1\n\t"
+	"addps %%xmm7, %%xmm0		\n\t"
+	"addps %%xmm1, %%xmm0		\n\t"
+	"movaps %%xmm0, (%0, %%"REG_S")	\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix5to1_SSE (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movlps %1, %%xmm7		\n\t"
+	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+	"mov $-1024, %%"REG_S"		\n\t"
+	ASMALIGN(4)
+	"1:				\n\t"
+	"movaps (%0, %%"REG_S"), %%xmm0	\n\t"
+	"movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
+	"addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
+	"addps 3072(%0, %%"REG_S"), %%xmm1\n\t"
+	"addps %%xmm7, %%xmm0		\n\t"
+	"addps 4096(%0, %%"REG_S"), %%xmm1\n\t"
+	"addps %%xmm1, %%xmm0		\n\t"
+	"movaps %%xmm0, (%0, %%"REG_S")	\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix3to2_SSE (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movlps %1, %%xmm7		\n\t"
+	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+	"mov $-1024, %%"REG_S"		\n\t"
+	ASMALIGN(4)
+	"1:				\n\t"
+	"movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
+	"addps %%xmm7, %%xmm0		\n\t" //common
+	"movaps (%0, %%"REG_S"), %%xmm1	\n\t"
+	"movaps 2048(%0, %%"REG_S"), %%xmm2\n\t"
+	"addps %%xmm0, %%xmm1		\n\t"
+	"addps %%xmm0, %%xmm2		\n\t"
+	"movaps %%xmm1, (%0, %%"REG_S")	\n\t"
+	"movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias)
+{
+	__asm__ volatile(
+		"movlps %2, %%xmm7		\n\t"
+		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+		"mov $-1024, %%"REG_S"		\n\t"
+		ASMALIGN(4)
+		"1:				\n\t"
+		"movaps 1024(%1, %%"REG_S"), %%xmm0\n\t"
+		"addps %%xmm7, %%xmm0		\n\t" //common
+		"movaps (%0, %%"REG_S"), %%xmm1	\n\t"
+		"movaps (%1, %%"REG_S"), %%xmm2	\n\t"
+		"addps %%xmm0, %%xmm1		\n\t"
+		"addps %%xmm0, %%xmm2		\n\t"
+		"movaps %%xmm1, (%0, %%"REG_S")	\n\t"
+		"movaps %%xmm2, (%1, %%"REG_S")	\n\t"
+		"add $16, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (left+256), "r" (right+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix21toS_SSE (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+		"movlps %1, %%xmm7		\n\t"
+		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+		"mov $-1024, %%"REG_S"		\n\t"
+		ASMALIGN(4)
+		"1:				\n\t"
+		"movaps 2048(%0, %%"REG_S"), %%xmm0\n\t"  // surround
+		"movaps (%0, %%"REG_S"), %%xmm1	\n\t"
+		"movaps 1024(%0, %%"REG_S"), %%xmm2\n\t"
+		"addps %%xmm7, %%xmm1		\n\t"
+		"addps %%xmm7, %%xmm2		\n\t"
+		"subps %%xmm0, %%xmm1		\n\t"
+		"addps %%xmm0, %%xmm2		\n\t"
+		"movaps %%xmm1, (%0, %%"REG_S")	\n\t"
+		"movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
+		"add $16, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix31to2_SSE (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+		"movlps %1, %%xmm7		\n\t"
+		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+		"mov $-1024, %%"REG_S"		\n\t"
+		ASMALIGN(4)
+		"1:				\n\t"
+		"movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
+		"addps 3072(%0, %%"REG_S"), %%xmm0\n\t"
+		"addps %%xmm7, %%xmm0		\n\t" // common
+		"movaps (%0, %%"REG_S"), %%xmm1	\n\t"
+		"movaps 2048(%0, %%"REG_S"), %%xmm2\n\t"
+		"addps %%xmm0, %%xmm1		\n\t"
+		"addps %%xmm0, %%xmm2		\n\t"
+		"movaps %%xmm1, (%0, %%"REG_S")	\n\t"
+		"movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
+		"add $16, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix31toS_SSE (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+		"movlps %1, %%xmm7		\n\t"
+		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+		"mov $-1024, %%"REG_S"		\n\t"
+		ASMALIGN(4)
+		"1:				\n\t"
+		"movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
+		"movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround
+		"addps %%xmm7, %%xmm0		\n\t" // common
+		"movaps (%0, %%"REG_S"), %%xmm1	\n\t"
+		"movaps 2048(%0, %%"REG_S"), %%xmm2\n\t"
+		"addps %%xmm0, %%xmm1		\n\t"
+		"addps %%xmm0, %%xmm2		\n\t"
+		"subps %%xmm3, %%xmm1		\n\t"
+		"addps %%xmm3, %%xmm2		\n\t"
+		"movaps %%xmm1, (%0, %%"REG_S")	\n\t"
+		"movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
+		"add $16, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix22toS_SSE (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+		"movlps %1, %%xmm7		\n\t"
+		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+		"mov $-1024, %%"REG_S"		\n\t"
+		ASMALIGN(4)
+		"1:				\n\t"
+		"movaps 2048(%0, %%"REG_S"), %%xmm0\n\t"
+		"addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround
+		"movaps (%0, %%"REG_S"), %%xmm1	\n\t"
+		"movaps 1024(%0, %%"REG_S"), %%xmm2\n\t"
+		"addps %%xmm7, %%xmm1		\n\t"
+		"addps %%xmm7, %%xmm2		\n\t"
+		"subps %%xmm0, %%xmm1		\n\t"
+		"addps %%xmm0, %%xmm2		\n\t"
+		"movaps %%xmm1, (%0, %%"REG_S")	\n\t"
+		"movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
+		"add $16, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix32to2_SSE (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movlps %1, %%xmm7		\n\t"
+	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+	"mov $-1024, %%"REG_S"		\n\t"
+	ASMALIGN(4)
+	"1:				\n\t"
+	"movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
+	"addps %%xmm7, %%xmm0		\n\t" // common
+	"movaps %%xmm0, %%xmm1		\n\t" // common
+	"addps (%0, %%"REG_S"), %%xmm0	\n\t"
+	"addps 2048(%0, %%"REG_S"), %%xmm1\n\t"
+	"addps 3072(%0, %%"REG_S"), %%xmm0\n\t"
+	"addps 4096(%0, %%"REG_S"), %%xmm1\n\t"
+	"movaps %%xmm0, (%0, %%"REG_S")	\n\t"
+	"movaps %%xmm1, 1024(%0, %%"REG_S")\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix32toS_SSE (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movlps %1, %%xmm7		\n\t"
+	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+	"mov $-1024, %%"REG_S"		\n\t"
+	ASMALIGN(4)
+	"1:				\n\t"
+	"movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
+	"movaps 3072(%0, %%"REG_S"), %%xmm2\n\t"
+	"addps %%xmm7, %%xmm0		\n\t" // common
+	"addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround
+	"movaps (%0, %%"REG_S"), %%xmm1	\n\t"
+	"movaps 2048(%0, %%"REG_S"), %%xmm3\n\t"
+	"subps %%xmm2, %%xmm1		\n\t"
+	"addps %%xmm2, %%xmm3		\n\t"
+	"addps %%xmm0, %%xmm1		\n\t"
+	"addps %%xmm0, %%xmm3		\n\t"
+	"movaps %%xmm1, (%0, %%"REG_S")	\n\t"
+	"movaps %%xmm3, 1024(%0, %%"REG_S")\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias)
+{
+	__asm__ volatile(
+		"movlps %2, %%xmm7		\n\t"
+		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
+		"mov $-1024, %%"REG_S"		\n\t"
+		ASMALIGN(4)
+		"1:				\n\t"
+		"movaps (%0, %%"REG_S"), %%xmm0	\n\t"
+		"movaps 16(%0, %%"REG_S"), %%xmm1\n\t"
+		"addps 1024(%0, %%"REG_S"), %%xmm0\n\t"
+		"addps 1040(%0, %%"REG_S"), %%xmm1\n\t"
+		"addps %%xmm7, %%xmm0		\n\t"
+		"addps %%xmm7, %%xmm1		\n\t"
+		"movaps %%xmm0, (%1, %%"REG_S")	\n\t"
+		"movaps %%xmm1, 16(%1, %%"REG_S")\n\t"
+		"add $32, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (src+256), "r" (dest+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void zero_MMX(sample_t * samples)
+{
+	__asm__ volatile(
+		"mov $-1024, %%"REG_S"		\n\t"
+		"pxor %%mm0, %%mm0		\n\t"
+		ASMALIGN(4)
+		"1:				\n\t"
+		"movq %%mm0, (%0, %%"REG_S")	\n\t"
+		"movq %%mm0, 8(%0, %%"REG_S")	\n\t"
+		"movq %%mm0, 16(%0, %%"REG_S")	\n\t"
+		"movq %%mm0, 24(%0, %%"REG_S")	\n\t"
+		"add $32, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+		"emms"
+	:: "r" (samples+256)
+	: "%"REG_S
+	);
+}
+
+static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
+	      sample_t clev, sample_t slev)
+{
+    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
+
+    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
+	memcpy (samples, samples + 256, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_CHANNEL, A52_MONO):
+    case CONVERT (A52_STEREO, A52_MONO):
+    mix_2to1_SSE:
+	mix2to1_SSE (samples, samples + 256, bias);
+	break;
+
+    case CONVERT (A52_2F1R, A52_MONO):
+	if (slev == 0)
+	    goto mix_2to1_SSE;
+    case CONVERT (A52_3F, A52_MONO):
+    mix_3to1_SSE:
+	mix3to1_SSE (samples, bias);
+	break;
+
+    case CONVERT (A52_3F1R, A52_MONO):
+	if (slev == 0)
+	    goto mix_3to1_SSE;
+    case CONVERT (A52_2F2R, A52_MONO):
+	if (slev == 0)
+	    goto mix_2to1_SSE;
+	mix4to1_SSE (samples, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_MONO):
+	if (slev == 0)
+	    goto mix_3to1_SSE;
+	mix5to1_SSE (samples, bias);
+	break;
+
+    case CONVERT (A52_MONO, A52_DOLBY):
+	memcpy (samples + 256, samples, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_3F, A52_STEREO):
+    case CONVERT (A52_3F, A52_DOLBY):
+    mix_3to2_SSE:
+	mix3to2_SSE (samples, bias);
+	break;
+
+    case CONVERT (A52_2F1R, A52_STEREO):
+	if (slev == 0)
+	    break;
+	mix21to2_SSE (samples, samples + 256, bias);
+	break;
+
+    case CONVERT (A52_2F1R, A52_DOLBY):
+	mix21toS_SSE (samples, bias);
+	break;
+
+    case CONVERT (A52_3F1R, A52_STEREO):
+	if (slev == 0)
+	    goto mix_3to2_SSE;
+	mix31to2_SSE (samples, bias);
+	break;
+
+    case CONVERT (A52_3F1R, A52_DOLBY):
+	mix31toS_SSE (samples, bias);
+	break;
+
+    case CONVERT (A52_2F2R, A52_STEREO):
+	if (slev == 0)
+	    break;
+	mix2to1_SSE (samples, samples + 512, bias);
+	mix2to1_SSE (samples + 256, samples + 768, bias);
+	break;
+
+    case CONVERT (A52_2F2R, A52_DOLBY):
+	mix22toS_SSE (samples, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_STEREO):
+	if (slev == 0)
+	    goto mix_3to2_SSE;
+	mix32to2_SSE (samples, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_DOLBY):
+	mix32toS_SSE (samples, bias);
+	break;
+
+    case CONVERT (A52_3F1R, A52_3F):
+	if (slev == 0)
+	    break;
+	mix21to2_SSE (samples, samples + 512, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_3F):
+	if (slev == 0)
+	    break;
+	mix2to1_SSE (samples, samples + 768, bias);
+	mix2to1_SSE (samples + 512, samples + 1024, bias);
+	break;
+
+    case CONVERT (A52_3F1R, A52_2F1R):
+	mix3to2_SSE (samples, bias);
+	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_2F2R, A52_2F1R):
+	mix2to1_SSE (samples + 512, samples + 768, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_2F1R):
+	mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
+	move2to1_SSE (samples + 768, samples + 512, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_3F1R):
+	mix2to1_SSE (samples + 768, samples + 1024, bias);
+	break;
+
+    case CONVERT (A52_2F1R, A52_2F2R):
+	memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_3F1R, A52_2F2R):
+	mix3to2_SSE (samples, bias);
+	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_3F2R, A52_2F2R):
+	mix3to2_SSE (samples, bias);
+	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+	memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_3F1R, A52_3F2R):
+	memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
+	break;
+    }
+}
+
+static void upmix_MMX (sample_t * samples, int acmod, int output)
+{
+    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
+
+    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
+	memcpy (samples + 256, samples, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_3F2R, A52_MONO):
+	zero_MMX (samples + 1024);
+    case CONVERT (A52_3F1R, A52_MONO):
+    case CONVERT (A52_2F2R, A52_MONO):
+	zero_MMX (samples + 768);
+    case CONVERT (A52_3F, A52_MONO):
+    case CONVERT (A52_2F1R, A52_MONO):
+	zero_MMX (samples + 512);
+    case CONVERT (A52_CHANNEL, A52_MONO):
+    case CONVERT (A52_STEREO, A52_MONO):
+	zero_MMX (samples + 256);
+	break;
+
+    case CONVERT (A52_3F2R, A52_STEREO):
+    case CONVERT (A52_3F2R, A52_DOLBY):
+	zero_MMX (samples + 1024);
+    case CONVERT (A52_3F1R, A52_STEREO):
+    case CONVERT (A52_3F1R, A52_DOLBY):
+	zero_MMX (samples + 768);
+    case CONVERT (A52_3F, A52_STEREO):
+    case CONVERT (A52_3F, A52_DOLBY):
+    mix_3to2_MMX:
+	memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t));
+	zero_MMX (samples + 256);
+	break;
+
+    case CONVERT (A52_2F2R, A52_STEREO):
+    case CONVERT (A52_2F2R, A52_DOLBY):
+	zero_MMX (samples + 768);
+    case CONVERT (A52_2F1R, A52_STEREO):
+    case CONVERT (A52_2F1R, A52_DOLBY):
+	zero_MMX (samples + 512);
+	break;
+
+    case CONVERT (A52_3F2R, A52_3F):
+	zero_MMX (samples + 1024);
+    case CONVERT (A52_3F1R, A52_3F):
+    case CONVERT (A52_2F2R, A52_2F1R):
+	zero_MMX (samples + 768);
+	break;
+
+    case CONVERT (A52_3F2R, A52_3F1R):
+	zero_MMX (samples + 1024);
+	break;
+
+    case CONVERT (A52_3F2R, A52_2F1R):
+	zero_MMX (samples + 1024);
+    case CONVERT (A52_3F1R, A52_2F1R):
+    mix_31to21_MMX:
+	memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
+	goto mix_3to2_MMX;
+
+    case CONVERT (A52_3F2R, A52_2F2R):
+	memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
+	goto mix_31to21_MMX;
+    }
+}
+
+static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias)
+{
+	__asm__ volatile(
+	"movd  %2, %%mm7	\n\t"
+	"punpckldq %2, %%mm7	\n\t"
+	"mov $-1024, %%"REG_S"	\n\t"
+	ASMALIGN(4)
+	"1:			\n\t"
+	"movq  (%0, %%"REG_S"), %%mm0	\n\t"
+	"movq  8(%0, %%"REG_S"), %%mm1	\n\t"
+	"movq  16(%0, %%"REG_S"), %%mm2	\n\t"
+	"movq  24(%0, %%"REG_S"), %%mm3	\n\t"
+	"pfadd (%1, %%"REG_S"), %%mm0	\n\t"
+	"pfadd 8(%1, %%"REG_S"), %%mm1	\n\t"
+	"pfadd 16(%1, %%"REG_S"), %%mm2	\n\t"
+	"pfadd 24(%1, %%"REG_S"), %%mm3	\n\t"
+	"pfadd %%mm7, %%mm0		\n\t"
+	"pfadd %%mm7, %%mm1		\n\t"
+	"pfadd %%mm7, %%mm2		\n\t"
+	"pfadd %%mm7, %%mm3		\n\t"
+	"movq  %%mm0, (%1, %%"REG_S")	\n\t"
+	"movq  %%mm1, 8(%1, %%"REG_S")	\n\t"
+	"movq  %%mm2, 16(%1, %%"REG_S")	\n\t"
+	"movq  %%mm3, 24(%1, %%"REG_S")	\n\t"
+	"add $32, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (src+256), "r" (dest+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix3to1_3dnow (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movd  %1, %%mm7	\n\t"
+	"punpckldq %1, %%mm7	\n\t"
+	"mov $-1024, %%"REG_S"	\n\t"
+	ASMALIGN(4)
+	"1:			\n\t"
+	"movq  (%0, %%"REG_S"), %%mm0	\n\t"
+	"movq  8(%0, %%"REG_S"), %%mm1	\n\t"
+	"movq  1024(%0, %%"REG_S"), %%mm2\n\t"
+	"movq  1032(%0, %%"REG_S"), %%mm3\n\t"
+	"pfadd 2048(%0, %%"REG_S"), %%mm0\n\t"
+	"pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
+	"pfadd %%mm7, %%mm0		\n\t"
+	"pfadd %%mm7, %%mm1		\n\t"
+	"pfadd %%mm2, %%mm0		\n\t"
+	"pfadd %%mm3, %%mm1		\n\t"
+	"movq  %%mm0, (%0, %%"REG_S")	\n\t"
+	"movq  %%mm1, 8(%0, %%"REG_S")	\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix4to1_3dnow (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movd  %1, %%mm7	\n\t"
+	"punpckldq %1, %%mm7	\n\t"
+	"mov $-1024, %%"REG_S"	\n\t"
+	ASMALIGN(4)
+	"1:			\n\t"
+	"movq  (%0, %%"REG_S"), %%mm0	\n\t"
+	"movq  8(%0, %%"REG_S"), %%mm1	\n\t"
+	"movq  1024(%0, %%"REG_S"), %%mm2\n\t"
+	"movq  1032(%0, %%"REG_S"), %%mm3\n\t"
+	"pfadd 2048(%0, %%"REG_S"), %%mm0\n\t"
+	"pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
+	"pfadd 3072(%0, %%"REG_S"), %%mm2\n\t"
+	"pfadd 3080(%0, %%"REG_S"), %%mm3\n\t"
+	"pfadd %%mm7, %%mm0		\n\t"
+	"pfadd %%mm7, %%mm1		\n\t"
+	"pfadd %%mm2, %%mm0		\n\t"
+	"pfadd %%mm3, %%mm1		\n\t"
+	"movq  %%mm0, (%0, %%"REG_S")	\n\t"
+	"movq  %%mm1, 8(%0, %%"REG_S")	\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix5to1_3dnow (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movd  %1, %%mm7	\n\t"
+	"punpckldq %1, %%mm7	\n\t"
+	"mov $-1024, %%"REG_S"	\n\t"
+	ASMALIGN(4)
+	"1:			\n\t"
+	"movq  (%0, %%"REG_S"), %%mm0	\n\t"
+	"movq  8(%0, %%"REG_S"), %%mm1	\n\t"
+	"movq  1024(%0, %%"REG_S"), %%mm2\n\t"
+	"movq  1032(%0, %%"REG_S"), %%mm3\n\t"
+	"pfadd 2048(%0, %%"REG_S"), %%mm0\n\t"
+	"pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
+	"pfadd 3072(%0, %%"REG_S"), %%mm2\n\t"
+	"pfadd 3080(%0, %%"REG_S"), %%mm3\n\t"
+	"pfadd %%mm7, %%mm0		\n\t"
+	"pfadd %%mm7, %%mm1		\n\t"
+	"pfadd 4096(%0, %%"REG_S"), %%mm2\n\t"
+	"pfadd 4104(%0, %%"REG_S"), %%mm3\n\t"
+	"pfadd %%mm2, %%mm0		\n\t"
+	"pfadd %%mm3, %%mm1		\n\t"
+	"movq  %%mm0, (%0, %%"REG_S")	\n\t"
+	"movq  %%mm1, 8(%0, %%"REG_S")	\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix3to2_3dnow (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movd  %1, %%mm7	\n\t"
+	"punpckldq %1, %%mm7	\n\t"
+	"mov $-1024, %%"REG_S"	\n\t"
+	ASMALIGN(4)
+	"1:			\n\t"
+	"movq   1024(%0, %%"REG_S"), %%mm0\n\t"
+	"movq   1032(%0, %%"REG_S"), %%mm1\n\t"
+	"pfadd  %%mm7, %%mm0		\n\t" //common
+	"pfadd  %%mm7, %%mm1		\n\t" //common
+	"movq   (%0, %%"REG_S"), %%mm2	\n\t"
+	"movq   8(%0, %%"REG_S"), %%mm3	\n\t"
+	"movq   2048(%0, %%"REG_S"), %%mm4\n\t"
+	"movq   2056(%0, %%"REG_S"), %%mm5\n\t"
+	"pfadd  %%mm0, %%mm2		\n\t"
+	"pfadd  %%mm1, %%mm3		\n\t"
+	"pfadd  %%mm0, %%mm4		\n\t"
+	"pfadd  %%mm1, %%mm5		\n\t"
+	"movq   %%mm2, (%0, %%"REG_S")	\n\t"
+	"movq   %%mm3, 8(%0, %%"REG_S")	\n\t"
+	"movq   %%mm4, 1024(%0, %%"REG_S")\n\t"
+	"movq   %%mm5, 1032(%0, %%"REG_S")\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias)
+{
+	__asm__ volatile(
+		"movd  %2, %%mm7	\n\t"
+		"punpckldq %2, %%mm7	\n\t"
+		"mov $-1024, %%"REG_S"	\n\t"
+		ASMALIGN(4)
+		"1:			\n\t"
+		"movq  1024(%1, %%"REG_S"), %%mm0\n\t"
+		"movq  1032(%1, %%"REG_S"), %%mm1\n\t"
+		"pfadd %%mm7, %%mm0		\n\t" //common
+		"pfadd %%mm7, %%mm1		\n\t" //common
+		"movq  (%0, %%"REG_S"), %%mm2	\n\t"
+		"movq  8(%0, %%"REG_S"), %%mm3	\n\t"
+		"movq  (%1, %%"REG_S"), %%mm4	\n\t"
+		"movq  8(%1, %%"REG_S"), %%mm5	\n\t"
+		"pfadd %%mm0, %%mm2		\n\t"
+		"pfadd %%mm1, %%mm3		\n\t"
+		"pfadd %%mm0, %%mm4		\n\t"
+		"pfadd %%mm1, %%mm5		\n\t"
+		"movq  %%mm2, (%0, %%"REG_S")	\n\t"
+		"movq  %%mm3, 8(%0, %%"REG_S")	\n\t"
+		"movq  %%mm4, (%1, %%"REG_S")	\n\t"
+		"movq  %%mm5, 8(%1, %%"REG_S")	\n\t"
+		"add $16, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (left+256), "r" (right+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix21toS_3dnow (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+		"movd  %1, %%mm7	\n\t"
+		"punpckldq %1, %%mm7	\n\t"
+		"mov $-1024, %%"REG_S"	\n\t"
+		ASMALIGN(4)
+		"1:			\n\t"
+		"movq  2048(%0, %%"REG_S"), %%mm0\n\t"  // surround
+		"movq  2056(%0, %%"REG_S"), %%mm1\n\t"  // surround
+		"movq  (%0, %%"REG_S"), %%mm2	\n\t"
+		"movq  8(%0, %%"REG_S"), %%mm3	\n\t"
+		"movq  1024(%0, %%"REG_S"), %%mm4\n\t"
+		"movq  1032(%0, %%"REG_S"), %%mm5\n\t"
+		"pfadd %%mm7, %%mm2		\n\t"
+		"pfadd %%mm7, %%mm3		\n\t"
+		"pfadd %%mm7, %%mm4		\n\t"
+		"pfadd %%mm7, %%mm5		\n\t"
+		"pfsub %%mm0, %%mm2		\n\t"
+		"pfsub %%mm1, %%mm3		\n\t"
+		"pfadd %%mm0, %%mm4		\n\t"
+		"pfadd %%mm1, %%mm5		\n\t"
+		"movq  %%mm2, (%0, %%"REG_S")	\n\t"
+		"movq  %%mm3, 8(%0, %%"REG_S")	\n\t"
+		"movq  %%mm4, 1024(%0, %%"REG_S")\n\t"
+		"movq  %%mm5, 1032(%0, %%"REG_S")\n\t"
+		"add $16, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix31to2_3dnow (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+		"movd  %1, %%mm7	\n\t"
+		"punpckldq %1, %%mm7	\n\t"
+		"mov $-1024, %%"REG_S"	\n\t"
+		ASMALIGN(4)
+		"1:			\n\t"
+		"movq  1024(%0, %%"REG_S"), %%mm0\n\t"
+		"movq  1032(%0, %%"REG_S"), %%mm1\n\t"
+		"pfadd 3072(%0, %%"REG_S"), %%mm0\n\t"
+		"pfadd 3080(%0, %%"REG_S"), %%mm1\n\t"
+		"pfadd %%mm7, %%mm0		\n\t" // common
+		"pfadd %%mm7, %%mm1		\n\t" // common
+		"movq  (%0, %%"REG_S"), %%mm2	\n\t"
+		"movq  8(%0, %%"REG_S"), %%mm3	\n\t"
+		"movq  2048(%0, %%"REG_S"), %%mm4\n\t"
+		"movq  2056(%0, %%"REG_S"), %%mm5\n\t"
+		"pfadd %%mm0, %%mm2		\n\t"
+		"pfadd %%mm1, %%mm3		\n\t"
+		"pfadd %%mm0, %%mm4		\n\t"
+		"pfadd %%mm1, %%mm5		\n\t"
+		"movq  %%mm2, (%0, %%"REG_S")	\n\t"
+		"movq  %%mm3, 8(%0, %%"REG_S")	\n\t"
+		"movq  %%mm4, 1024(%0, %%"REG_S")\n\t"
+		"movq  %%mm5, 1032(%0, %%"REG_S")\n\t"
+		"add $16, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix31toS_3dnow (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+		"movd  %1, %%mm7	\n\t"
+		"punpckldq %1, %%mm7	\n\t"
+		"mov $-1024, %%"REG_S"	\n\t"
+		ASMALIGN(4)
+		"1:			\n\t"
+		"movq   1024(%0, %%"REG_S"), %%mm0\n\t"
+		"movq   1032(%0, %%"REG_S"), %%mm1\n\t"
+		"pfadd  %%mm7, %%mm0		\n\t" // common
+		"pfadd  %%mm7, %%mm1		\n\t" // common
+		"movq   (%0, %%"REG_S"), %%mm2	\n\t"
+		"movq   8(%0, %%"REG_S"), %%mm3	\n\t"
+		"movq   2048(%0, %%"REG_S"), %%mm4\n\t"
+		"movq   2056(%0, %%"REG_S"), %%mm5\n\t"
+		"pfadd  %%mm0, %%mm2		\n\t"
+		"pfadd  %%mm1, %%mm3		\n\t"
+		"pfadd  %%mm0, %%mm4		\n\t"
+		"pfadd  %%mm1, %%mm5		\n\t"
+		"movq   3072(%0, %%"REG_S"), %%mm0\n\t" // surround
+		"movq   3080(%0, %%"REG_S"), %%mm1\n\t" // surround
+		"pfsub  %%mm0, %%mm2		\n\t"
+		"pfsub  %%mm1, %%mm3		\n\t"
+		"pfadd  %%mm0, %%mm4		\n\t"
+		"pfadd  %%mm1, %%mm5		\n\t"
+		"movq   %%mm2, (%0, %%"REG_S")	\n\t"
+		"movq   %%mm3, 8(%0, %%"REG_S")	\n\t"
+		"movq   %%mm4, 1024(%0, %%"REG_S")\n\t"
+		"movq   %%mm5, 1032(%0, %%"REG_S")\n\t"
+		"add $16, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix22toS_3dnow (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+		"movd  %1, %%mm7	\n\t"
+		"punpckldq %1, %%mm7	\n\t"
+		"mov $-1024, %%"REG_S"	\n\t"
+		ASMALIGN(4)
+		"1:			\n\t"
+		"movq  2048(%0, %%"REG_S"), %%mm0\n\t"
+		"movq  2056(%0, %%"REG_S"), %%mm1\n\t"
+		"pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround
+		"pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround
+		"movq  (%0, %%"REG_S"), %%mm2	\n\t"
+		"movq  8(%0, %%"REG_S"), %%mm3	\n\t"
+		"movq  1024(%0, %%"REG_S"), %%mm4\n\t"
+		"movq  1032(%0, %%"REG_S"), %%mm5\n\t"
+		"pfadd %%mm7, %%mm2		\n\t"
+		"pfadd %%mm7, %%mm3		\n\t"
+		"pfadd %%mm7, %%mm4		\n\t"
+		"pfadd %%mm7, %%mm5		\n\t"
+		"pfsub %%mm0, %%mm2		\n\t"
+		"pfsub %%mm1, %%mm3		\n\t"
+		"pfadd %%mm0, %%mm4		\n\t"
+		"pfadd %%mm1, %%mm5		\n\t"
+		"movq  %%mm2, (%0, %%"REG_S")	\n\t"
+		"movq  %%mm3, 8(%0, %%"REG_S")	\n\t"
+		"movq  %%mm4, 1024(%0, %%"REG_S")\n\t"
+		"movq  %%mm5, 1032(%0, %%"REG_S")\n\t"
+		"add $16, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void mix32to2_3dnow (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"movd  %1, %%mm7	\n\t"
+	"punpckldq %1, %%mm7	\n\t"
+	"mov $-1024, %%"REG_S"	\n\t"
+	ASMALIGN(4)
+	"1:			\n\t"
+	"movq   1024(%0, %%"REG_S"), %%mm0\n\t"
+	"movq   1032(%0, %%"REG_S"), %%mm1\n\t"
+	"pfadd  %%mm7, %%mm0		\n\t" // common
+	"pfadd  %%mm7, %%mm1		\n\t" // common
+	"movq   %%mm0, %%mm2		\n\t" // common
+	"movq   %%mm1, %%mm3		\n\t" // common
+	"pfadd  (%0, %%"REG_S"), %%mm0	\n\t"
+	"pfadd  8(%0, %%"REG_S"), %%mm1	\n\t"
+	"pfadd  2048(%0, %%"REG_S"), %%mm2\n\t"
+	"pfadd  2056(%0, %%"REG_S"), %%mm3\n\t"
+	"pfadd  3072(%0, %%"REG_S"), %%mm0\n\t"
+	"pfadd  3080(%0, %%"REG_S"), %%mm1\n\t"
+	"pfadd  4096(%0, %%"REG_S"), %%mm2\n\t"
+	"pfadd  4104(%0, %%"REG_S"), %%mm3\n\t"
+	"movq   %%mm0, (%0, %%"REG_S")	\n\t"
+	"movq   %%mm1, 8(%0, %%"REG_S")	\n\t"
+	"movq   %%mm2, 1024(%0, %%"REG_S")\n\t"
+	"movq   %%mm3, 1032(%0, %%"REG_S")\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+/* TODO: should be optimized further */
+static void mix32toS_3dnow (sample_t * samples, sample_t bias)
+{
+	__asm__ volatile(
+	"mov $-1024, %%"REG_S"		\n\t"
+	ASMALIGN(4)
+	"1:			\n\t"
+	"movd  %1, %%mm7		\n\t"
+	"punpckldq %1, %%mm7		\n\t"
+	"movq  1024(%0, %%"REG_S"), %%mm0\n\t"
+	"movq  1032(%0, %%"REG_S"), %%mm1\n\t"
+	"movq  3072(%0, %%"REG_S"), %%mm4\n\t"
+	"movq  3080(%0, %%"REG_S"), %%mm5\n\t"
+	"pfadd %%mm7, %%mm0		\n\t" // common
+	"pfadd %%mm7, %%mm1		\n\t" // common
+	"pfadd 4096(%0, %%"REG_S"), %%mm4\n\t" // surround
+	"pfadd 4104(%0, %%"REG_S"), %%mm5\n\t" // surround
+	"movq  (%0, %%"REG_S"), %%mm2	\n\t"
+	"movq  8(%0, %%"REG_S"), %%mm3	\n\t"
+	"movq  2048(%0, %%"REG_S"), %%mm6\n\t"
+	"movq  2056(%0, %%"REG_S"), %%mm7\n\t"
+	"pfsub %%mm4, %%mm2		\n\t"
+	"pfsub %%mm5, %%mm3		\n\t"
+	"pfadd %%mm4, %%mm6		\n\t"
+	"pfadd %%mm5, %%mm7		\n\t"
+	"pfadd %%mm0, %%mm2		\n\t"
+	"pfadd %%mm1, %%mm3		\n\t"
+	"pfadd %%mm0, %%mm6		\n\t"
+	"pfadd %%mm1, %%mm7		\n\t"
+	"movq  %%mm2, (%0, %%"REG_S")	\n\t"
+	"movq  %%mm3, 8(%0, %%"REG_S")	\n\t"
+	"movq  %%mm6, 1024(%0, %%"REG_S")\n\t"
+	"movq  %%mm7, 1032(%0, %%"REG_S")\n\t"
+	"add $16, %%"REG_S"		\n\t"
+	" jnz 1b			\n\t"
+	:: "r" (samples+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void move2to1_3dnow (sample_t * src, sample_t * dest, sample_t bias)
+{
+	__asm__ volatile(
+		"movd  %2, %%mm7	\n\t"
+		"punpckldq %2, %%mm7	\n\t"
+		"mov $-1024, %%"REG_S"	\n\t"
+		ASMALIGN(4)
+		"1:			\n\t"
+		"movq  (%0, %%"REG_S"), %%mm0	\n\t"
+		"movq  8(%0, %%"REG_S"), %%mm1	\n\t"
+		"movq  16(%0, %%"REG_S"), %%mm2	\n\t"
+		"movq  24(%0, %%"REG_S"), %%mm3	\n\t"
+		"pfadd 1024(%0, %%"REG_S"), %%mm0\n\t"
+		"pfadd 1032(%0, %%"REG_S"), %%mm1\n\t"
+		"pfadd 1040(%0, %%"REG_S"), %%mm2\n\t"
+		"pfadd 1048(%0, %%"REG_S"), %%mm3\n\t"
+		"pfadd %%mm7, %%mm0		\n\t"
+		"pfadd %%mm7, %%mm1		\n\t"
+		"pfadd %%mm7, %%mm2		\n\t"
+		"pfadd %%mm7, %%mm3		\n\t"
+		"movq  %%mm0, (%1, %%"REG_S")	\n\t"
+		"movq  %%mm1, 8(%1, %%"REG_S")	\n\t"
+		"movq  %%mm2, 16(%1, %%"REG_S")	\n\t"
+		"movq  %%mm3, 24(%1, %%"REG_S")	\n\t"
+		"add $32, %%"REG_S"		\n\t"
+		" jnz 1b			\n\t"
+	:: "r" (src+256), "r" (dest+256), "m" (bias)
+	: "%"REG_S
+	);
+}
+
+static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias,
+	      sample_t clev, sample_t slev)
+{
+    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
+
+    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
+	memcpy (samples, samples + 256, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_CHANNEL, A52_MONO):
+    case CONVERT (A52_STEREO, A52_MONO):
+    mix_2to1_3dnow:
+	mix2to1_3dnow (samples, samples + 256, bias);
+	break;
+
+    case CONVERT (A52_2F1R, A52_MONO):
+	if (slev == 0)
+	    goto mix_2to1_3dnow;
+    case CONVERT (A52_3F, A52_MONO):
+    mix_3to1_3dnow:
+	mix3to1_3dnow (samples, bias);
+	break;
+
+    case CONVERT (A52_3F1R, A52_MONO):
+	if (slev == 0)
+	    goto mix_3to1_3dnow;
+    case CONVERT (A52_2F2R, A52_MONO):
+	if (slev == 0)
+	    goto mix_2to1_3dnow;
+	mix4to1_3dnow (samples, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_MONO):
+	if (slev == 0)
+	    goto mix_3to1_3dnow;
+	mix5to1_3dnow (samples, bias);
+	break;
+
+    case CONVERT (A52_MONO, A52_DOLBY):
+	memcpy (samples + 256, samples, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_3F, A52_STEREO):
+    case CONVERT (A52_3F, A52_DOLBY):
+    mix_3to2_3dnow:
+	mix3to2_3dnow (samples, bias);
+	break;
+
+    case CONVERT (A52_2F1R, A52_STEREO):
+	if (slev == 0)
+	    break;
+	mix21to2_3dnow (samples, samples + 256, bias);
+	break;
+
+    case CONVERT (A52_2F1R, A52_DOLBY):
+	mix21toS_3dnow (samples, bias);
+	break;
+
+    case CONVERT (A52_3F1R, A52_STEREO):
+	if (slev == 0)
+	    goto mix_3to2_3dnow;
+	mix31to2_3dnow (samples, bias);
+	break;
+
+    case CONVERT (A52_3F1R, A52_DOLBY):
+	mix31toS_3dnow (samples, bias);
+	break;
+
+    case CONVERT (A52_2F2R, A52_STEREO):
+	if (slev == 0)
+	    break;
+	mix2to1_3dnow (samples, samples + 512, bias);
+	mix2to1_3dnow (samples + 256, samples + 768, bias);
+	break;
+
+    case CONVERT (A52_2F2R, A52_DOLBY):
+	mix22toS_3dnow (samples, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_STEREO):
+	if (slev == 0)
+	    goto mix_3to2_3dnow;
+	mix32to2_3dnow (samples, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_DOLBY):
+	mix32toS_3dnow (samples, bias);
+	break;
+
+    case CONVERT (A52_3F1R, A52_3F):
+	if (slev == 0)
+	    break;
+	mix21to2_3dnow (samples, samples + 512, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_3F):
+	if (slev == 0)
+	    break;
+	mix2to1_3dnow (samples, samples + 768, bias);
+	mix2to1_3dnow (samples + 512, samples + 1024, bias);
+	break;
+
+    case CONVERT (A52_3F1R, A52_2F1R):
+	mix3to2_3dnow (samples, bias);
+	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_2F2R, A52_2F1R):
+	mix2to1_3dnow (samples + 512, samples + 768, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_2F1R):
+	mix3to2_3dnow (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
+	move2to1_3dnow (samples + 768, samples + 512, bias);
+	break;
+
+    case CONVERT (A52_3F2R, A52_3F1R):
+	mix2to1_3dnow (samples + 768, samples + 1024, bias);
+	break;
+
+    case CONVERT (A52_2F1R, A52_2F2R):
+	memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_3F1R, A52_2F2R):
+	mix3to2_3dnow (samples, bias);
+	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_3F2R, A52_2F2R):
+	mix3to2_3dnow (samples, bias);
+	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+	memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
+	break;
+
+    case CONVERT (A52_3F1R, A52_3F2R):
+	memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
+	break;
+    }
+    __asm__ volatile("femms":::"memory");
+}
+
+#endif // ARCH_X86 || ARCH_X86_64
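For reference, a scalar sketch of what the mix3to2_SSE / mix3to2_3dnow loops above vectorize, using the usual liba52 layout of 256 samples per channel; it mirrors the C mix3to2() in upstream downmix.c:

static void mix3to2_ref(sample_t *samples, sample_t bias)
{
    int i;
    for (i = 0; i < 256; i++) {
        sample_t common = samples[i + 256] + bias;      /* centre + bias */
        samples[i]       += common;                     /* left gets the centre folded in */
        samples[i + 256]  = samples[i + 512] + common;  /* right moves down, centre folded in */
    }
}
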
--- liba52/imdct.c	2008-02-19 00:18:33.000000000 +0100
+++ liba52/imdct.c	2008-02-19 00:16:40.000000000 +0100
@@ -22,6 +26,11 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
+ * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
+ *   Michael ported them from libac3 (untested, perhaps totally broken)
+ * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
  */

 #include "config.h"
@@ -39,12 +48,50 @@
 #include "a52.h"
 #include "a52_internal.h"
 #include "mm_accel.h"
+#include "mangle.h"
+
+void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
+
+#if CONFIG_RUNTIME_CPUDETECT
+#undef HAVE_AMD3DNOWEXT
+#define HAVE_AMD3DNOWEXT 0
+#endif

 typedef struct complex_s {
     sample_t real;
     sample_t imag;
 } complex_t;

+static const int pm128[128] attribute_used __attribute__((aligned(16))) =
+{
+	0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
+	4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
+	2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
+	6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
+	1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
+	5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
+	3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
+	7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
+};
+
+static uint8_t attribute_used bit_reverse_512[] = {
+	0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
+	0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
+	0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
+	0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
+	0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
+	0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
+	0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
+	0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
+	0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
+	0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
+	0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
+	0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
+	0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
+	0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
+	0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
+	0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
+
 static uint8_t fftorder[] = {
       0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
       8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
@@ -56,6 +103,40 @@
       6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
 };

+static complex_t __attribute__((aligned(16))) buf[128];
+
+/* Twiddle factor LUT */
+static complex_t __attribute__((aligned(16))) w_1[1];
+static complex_t __attribute__((aligned(16))) w_2[2];
+static complex_t __attribute__((aligned(16))) w_4[4];
+static complex_t __attribute__((aligned(16))) w_8[8];
+static complex_t __attribute__((aligned(16))) w_16[16];
+static complex_t __attribute__((aligned(16))) w_32[32];
+static complex_t __attribute__((aligned(16))) w_64[64];
+static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
+
+/* Twiddle factors for IMDCT */
+static sample_t __attribute__((aligned(16))) xcos1[128];
+static sample_t __attribute__((aligned(16))) xsin1[128];
+
+#if ARCH_X86 || ARCH_X86_64
+// NOTE: SSE needs 16-byte alignment or it will segfault
+//
+static float __attribute__((aligned(16))) sseSinCos1c[256];
+static float __attribute__((aligned(16))) sseSinCos1d[256];
+static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
+//static float __attribute__((aligned(16))) sseW0[4];
+static float __attribute__((aligned(16))) sseW1[8];
+static float __attribute__((aligned(16))) sseW2[16];
+static float __attribute__((aligned(16))) sseW3[32];
+static float __attribute__((aligned(16))) sseW4[64];
+static float __attribute__((aligned(16))) sseW5[128];
+static float __attribute__((aligned(16))) sseW6[256];
+static float __attribute__((aligned(16))) *sseW[7]=
+	{NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
+static float __attribute__((aligned(16))) sseWindow[512];
+#endif
+
 /* Root values for IFFT */
 static sample_t roots16[3];
 static sample_t roots32[7];
@@ -241,7 +322,7 @@
     ifft_pass (buf, roots128 - 32, 32);
 }

-void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias)
+void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias)
 {
     int i, k;
     sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
@@ -285,6 +366,704 @@
     }
 }

+#if HAVE_ALTIVEC
+
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+// used to build register permutation vectors (vcprm)
+// the 's' are for words in the _s_econd vector
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+
+#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
+#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
+
+#define FOUROF(a) {a,a,a,a}
+
+// vcprmle is used to keep the same index as in the SSE version.
+// it's the same as vcprm, but with the indices reversed
+// ('le' is Little Endian)
+#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
+
+// used to build inverse/identity vectors (vcii)
+// n is _n_egative, p is _p_ositive
+#define FLOAT_n -1.
+#define FLOAT_p 1.
+
+
+void
+imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
+{
+  int i;
+  int k;
+  int p,q;
+  int m;
+  long two_m;
+  long two_m_plus_one;
+
+  sample_t tmp_b_i;
+  sample_t tmp_b_r;
+  sample_t tmp_a_i;
+  sample_t tmp_a_r;
+
+  sample_t *data_ptr;
+  sample_t *delay_ptr;
+  sample_t *window_ptr;
+
+  /* 512 IMDCT with source and dest data in 'data' */
+
+  /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
+  for( i=0; i < 128; i++) {
+    /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
+    int j= bit_reverse_512[i];
+    buf[i].real =         (data[256-2*j-1] * xcos1[j])  -  (data[2*j]       * xsin1[j]);
+    buf[i].imag = -1.0 * ((data[2*j]       * xcos1[j])  +  (data[256-2*j-1] * xsin1[j]));
+  }
+
+  /* 1. iteration */
+  for(i = 0; i < 128; i += 2) {
+#if 0
+    tmp_a_r = buf[i].real;
+    tmp_a_i = buf[i].imag;
+    tmp_b_r = buf[i+1].real;
+    tmp_b_i = buf[i+1].imag;
+    buf[i].real = tmp_a_r + tmp_b_r;
+    buf[i].imag =  tmp_a_i + tmp_b_i;
+    buf[i+1].real = tmp_a_r - tmp_b_r;
+    buf[i+1].imag =  tmp_a_i - tmp_b_i;
+#else
+    vector float temp, bufv;
+
+    bufv = vec_ld(i << 3, (float*)buf);
+    temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
+    bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
+    vec_st(bufv, i << 3, (float*)buf);
+#endif
+  }
+
+  /* 2. iteration */
+  // Note w[1]={{1,0}, {0,-1}}
+  for(i = 0; i < 128; i += 4) {
+#if 0
+    tmp_a_r = buf[i].real;
+    tmp_a_i = buf[i].imag;
+    tmp_b_r = buf[i+2].real;
+    tmp_b_i = buf[i+2].imag;
+    buf[i].real = tmp_a_r + tmp_b_r;
+    buf[i].imag =  tmp_a_i + tmp_b_i;
+    buf[i+2].real = tmp_a_r - tmp_b_r;
+    buf[i+2].imag =  tmp_a_i - tmp_b_i;
+    tmp_a_r = buf[i+1].real;
+    tmp_a_i = buf[i+1].imag;
+    /* WARNING: im <-> re here ! */
+    tmp_b_r = buf[i+3].imag;
+    tmp_b_i = buf[i+3].real;
+    buf[i+1].real = tmp_a_r + tmp_b_r;
+    buf[i+1].imag =  tmp_a_i - tmp_b_i;
+    buf[i+3].real = tmp_a_r - tmp_b_r;
+    buf[i+3].imag =  tmp_a_i + tmp_b_i;
+#else
+    vector float buf01, buf23, temp1, temp2;
+
+    buf01 = vec_ld((i + 0) << 3, (float*)buf);
+    buf23 = vec_ld((i + 2) << 3, (float*)buf);
+    buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
+
+    temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
+    temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
+
+    vec_st(temp1, (i + 0) << 3, (float*)buf);
+    vec_st(temp2, (i + 2) << 3, (float*)buf);
+#endif
+  }
+
+  /* 3. iteration */
+  for(i = 0; i < 128; i += 8) {
+#if 0
+    tmp_a_r = buf[i].real;
+    tmp_a_i = buf[i].imag;
+    tmp_b_r = buf[i+4].real;
+    tmp_b_i = buf[i+4].imag;
+    buf[i].real = tmp_a_r + tmp_b_r;
+    buf[i].imag =  tmp_a_i + tmp_b_i;
+    buf[i+4].real = tmp_a_r - tmp_b_r;
+    buf[i+4].imag =  tmp_a_i - tmp_b_i;
+    tmp_a_r = buf[1+i].real;
+    tmp_a_i = buf[1+i].imag;
+    tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
+    tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
+    buf[1+i].real = tmp_a_r + tmp_b_r;
+    buf[1+i].imag =  tmp_a_i + tmp_b_i;
+    buf[i+5].real = tmp_a_r - tmp_b_r;
+    buf[i+5].imag =  tmp_a_i - tmp_b_i;
+    tmp_a_r = buf[i+2].real;
+    tmp_a_i = buf[i+2].imag;
+    /* WARNING re <-> im & sign */
+    tmp_b_r = buf[i+6].imag;
+    tmp_b_i = - buf[i+6].real;
+    buf[i+2].real = tmp_a_r + tmp_b_r;
+    buf[i+2].imag =  tmp_a_i + tmp_b_i;
+    buf[i+6].real = tmp_a_r - tmp_b_r;
+    buf[i+6].imag =  tmp_a_i - tmp_b_i;
+    tmp_a_r = buf[i+3].real;
+    tmp_a_i = buf[i+3].imag;
+    tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
+    tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
+    buf[i+3].real = tmp_a_r + tmp_b_r;
+    buf[i+3].imag =  tmp_a_i + tmp_b_i;
+    buf[i+7].real = tmp_a_r - tmp_b_r;
+    buf[i+7].imag =  tmp_a_i - tmp_b_i;
+#else
+    vector float buf01, buf23, buf45, buf67;
+
+    buf01 = vec_ld((i + 0) << 3, (float*)buf);
+    buf23 = vec_ld((i + 2) << 3, (float*)buf);
+
+    tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
+    tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
+    buf[i+5].real = tmp_b_r;
+    buf[i+5].imag = tmp_b_i;
+    tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
+    tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
+    buf[i+7].real = tmp_b_r;
+    buf[i+7].imag = tmp_b_i;
+
+    buf23 = vec_ld((i + 2) << 3, (float*)buf);
+    buf45 = vec_ld((i + 4) << 3, (float*)buf);
+    buf67 = vec_ld((i + 6) << 3, (float*)buf);
+    buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
+
+    vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
+    vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
+    vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
+    vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
+#endif
+  }
+
+  /* 4-7. iterations */
+  for (m=3; m < 7; m++) {
+    two_m = (1 << m);
+
+    two_m_plus_one = two_m<<1;
+
+    for(i = 0; i < 128; i += two_m_plus_one) {
+      for(k = 0; k < two_m; k+=2) {
+#if 0
+        int p = k + i;
+        int q = p + two_m;
+        tmp_a_r = buf[p].real;
+        tmp_a_i = buf[p].imag;
+        tmp_b_r =
+          buf[q].real * w[m][k].real -
+          buf[q].imag * w[m][k].imag;
+        tmp_b_i =
+          buf[q].imag * w[m][k].real +
+          buf[q].real * w[m][k].imag;
+        buf[p].real = tmp_a_r + tmp_b_r;
+        buf[p].imag =  tmp_a_i + tmp_b_i;
+        buf[q].real = tmp_a_r - tmp_b_r;
+        buf[q].imag =  tmp_a_i - tmp_b_i;
+
+        tmp_a_r = buf[(p + 1)].real;
+        tmp_a_i = buf[(p + 1)].imag;
+        tmp_b_r =
+          buf[(q + 1)].real * w[m][(k + 1)].real -
+          buf[(q + 1)].imag * w[m][(k + 1)].imag;
+        tmp_b_i =
+          buf[(q + 1)].imag * w[m][(k + 1)].real +
+          buf[(q + 1)].real * w[m][(k + 1)].imag;
+        buf[(p + 1)].real = tmp_a_r + tmp_b_r;
+        buf[(p + 1)].imag =  tmp_a_i + tmp_b_i;
+        buf[(q + 1)].real = tmp_a_r - tmp_b_r;
+        buf[(q + 1)].imag =  tmp_a_i - tmp_b_i;
+#else
+        int p = k + i;
+        int q = p + two_m;
+        vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
+        const vector float vczero = (const vector float)FOUROF(0.);
+        // first compute buf[q] and buf[q+1]
+        vecq = vec_ld(q << 3, (float*)buf);
+        vecw = vec_ld(0, (float*)&(w[m][k]));
+        temp1 = vec_madd(vecq, vecw, vczero);
+        temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
+        temp2 = vec_madd(temp2, vecw, vczero);
+        temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
+        temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
+        vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
+        // then butterfly with buf[p] and buf[p+1]
+        vecp = vec_ld(p << 3, (float*)buf);
+
+        temp1 = vec_add(vecp, vecq);
+        temp2 = vec_sub(vecp, vecq);
+
+        vec_st(temp1, p << 3, (float*)buf);
+        vec_st(temp2, q << 3, (float*)buf);
+#endif
+      }
+    }
+  }
+
+  /* Post IFFT complex multiply  plus IFFT complex conjugate*/
+  for( i=0; i < 128; i+=4) {
+    /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
+#if 0
+    tmp_a_r =        buf[(i + 0)].real;
+    tmp_a_i = -1.0 * buf[(i + 0)].imag;
+    buf[(i + 0)].real =
+      (tmp_a_r * xcos1[(i + 0)])  -  (tmp_a_i  * xsin1[(i + 0)]);
+    buf[(i + 0)].imag =
+      (tmp_a_r * xsin1[(i + 0)])  +  (tmp_a_i  * xcos1[(i + 0)]);
+
+    tmp_a_r =        buf[(i + 1)].real;
+    tmp_a_i = -1.0 * buf[(i + 1)].imag;
+    buf[(i + 1)].real =
+      (tmp_a_r * xcos1[(i + 1)])  -  (tmp_a_i  * xsin1[(i + 1)]);
+    buf[(i + 1)].imag =
+      (tmp_a_r * xsin1[(i + 1)])  +  (tmp_a_i  * xcos1[(i + 1)]);
+
+    tmp_a_r =        buf[(i + 2)].real;
+    tmp_a_i = -1.0 * buf[(i + 2)].imag;
+    buf[(i + 2)].real =
+      (tmp_a_r * xcos1[(i + 2)])  -  (tmp_a_i  * xsin1[(i + 2)]);
+    buf[(i + 2)].imag =
+      (tmp_a_r * xsin1[(i + 2)])  +  (tmp_a_i  * xcos1[(i + 2)]);
+
+    tmp_a_r =        buf[(i + 3)].real;
+    tmp_a_i = -1.0 * buf[(i + 3)].imag;
+    buf[(i + 3)].real =
+      (tmp_a_r * xcos1[(i + 3)])  -  (tmp_a_i  * xsin1[(i + 3)]);
+    buf[(i + 3)].imag =
+      (tmp_a_r * xsin1[(i + 3)])  +  (tmp_a_i  * xcos1[(i + 3)]);
+#else
+    vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
+    vector float temp0022, temp1133, tempCS01;
+    const vector float vczero = (const vector float)FOUROF(0.);
+
+    bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
+    bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
+
+    cosv = vec_ld(i << 2, xcos1);
+    sinv = vec_ld(i << 2, xsin1);
+
+    temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
+    temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
+    tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
+    temp1 = vec_madd(temp0022, tempCS01, vczero);
+    tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
+    temp2 = vec_madd(temp1133, tempCS01, vczero);
+    bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
+
+    vec_st(bufv_0, (i + 0) << 3, (float*)buf);
+
+    /* idem with bufv_2 and high-order cosv/sinv */
+
+    temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
+    temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
+    tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
+    temp1 = vec_madd(temp0022, tempCS01, vczero);
+    tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
+    temp2 = vec_madd(temp1133, tempCS01, vczero);
+    bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
+
+    vec_st(bufv_2, (i + 2) << 3, (float*)buf);
+
+#endif
+  }
+
+  data_ptr = data;
+  delay_ptr = delay;
+  window_ptr = a52_imdct_window;
+
+  /* Window and convert to real valued signal */
+  for(i=0; i< 64; i++) {
+    *data_ptr++   = -buf[64+i].imag   * *window_ptr++ + *delay_ptr++ + bias;
+    *data_ptr++   =  buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
+  }
+
+  for(i=0; i< 64; i++) {
+    *data_ptr++  = -buf[i].real       * *window_ptr++ + *delay_ptr++ + bias;
+    *data_ptr++  =  buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
+  }
+
+  /* The trailing edge of the window goes into the delay line */
+  delay_ptr = delay;
+
+  for(i=0; i< 64; i++) {
+    *delay_ptr++  = -buf[64+i].real   * *--window_ptr;
+    *delay_ptr++  =  buf[64-i-1].imag * *--window_ptr;
+  }
+
+  for(i=0; i<64; i++) {
+    *delay_ptr++  =  buf[i].imag       * *--window_ptr;
+    *delay_ptr++  = -buf[128-i-1].real * *--window_ptr;
+  }
+}
+#endif
+
+
+// Stuff below this line is borrowed from libac3
+#include "srfftp.h"
+#if ARCH_X86 || ARCH_X86_64
+#undef HAVE_AMD3DNOW
+#define HAVE_AMD3DNOW 1
+#include "srfftp_3dnow.h"
+
+const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
+const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
+const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
+
+#undef HAVE_AMD3DNOWEXT
+#define HAVE_AMD3DNOWEXT 0
+#include "imdct_3dnow.h"
+#undef HAVE_AMD3DNOWEXT
+#define HAVE_AMD3DNOWEXT 1
+#include "imdct_3dnow.h"
+
+#if !ARCH_X86_64 || !defined(PIC)
+void
+imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
+{
+/*	int i,k;
+    int p,q;*/
+    int m;
+    long two_m;
+    long two_m_plus_one;
+    long two_m_plus_one_shl3;
+    complex_t *buf_offset;
+
+/*  sample_t tmp_a_i;
+    sample_t tmp_a_r;
+    sample_t tmp_b_i;
+    sample_t tmp_b_r;*/
+
+    sample_t *data_ptr;
+    sample_t *delay_ptr;
+    sample_t *window_ptr;
+
+    /* 512 IMDCT with source and dest data in 'data' */
+    /* see the C version (imdct_do_512()); it's almost identical, just in C */
+
+    /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
+    /* Bit reversed shuffling */
+	__asm__ volatile(
+		"xor %%"REG_S", %%"REG_S"		\n\t"
+		"lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
+		"mov $1008, %%"REG_D"			\n\t"
+		"push %%"REG_BP"			\n\t" //use ebp without telling gcc
+		ASMALIGN(4)
+		"1:					\n\t"
+		"movlps (%0, %%"REG_S"), %%xmm0	\n\t" // XXXI
+		"movhps 8(%0, %%"REG_D"), %%xmm0	\n\t" // RXXI
+		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // XXXi
+		"movhps (%0, %%"REG_D"), %%xmm1	\n\t" // rXXi
+		"shufps $0x33, %%xmm1, %%xmm0		\n\t" // irIR
+		"movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
+		"mulps %%xmm0, %%xmm2			\n\t"
+		"shufps $0xB1, %%xmm0, %%xmm0		\n\t" // riRI
+		"mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
+		"subps %%xmm0, %%xmm2			\n\t"
+		"movzb (%%"REG_a"), %%"REG_d"		\n\t"
+		"movzb 1(%%"REG_a"), %%"REG_BP"		\n\t"
+		"movlps %%xmm2, (%1, %%"REG_d", 8)	\n\t"
+		"movhps %%xmm2, (%1, %%"REG_BP", 8)	\n\t"
+		"add $16, %%"REG_S"			\n\t"
+		"add $2, %%"REG_a"			\n\t" // avoid complex addressing for P4 crap
+		"sub $16, %%"REG_D"			\n\t"
+		"jnc 1b				 	\n\t"
+		"pop %%"REG_BP"				\n\t" // no, we didn't touch ebp *g*
+		:: "b" (data), "c" (buf)
+		: "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
+	);
+
+
+    /* FFT Merge */
+/* unoptimized variant
+    for (m=1; m < 7; m++) {
+	if(m)
+	    two_m = (1 << m);
+	else
+	    two_m = 1;
+
+	two_m_plus_one = (1 << (m+1));
+
+	for(i = 0; i < 128; i += two_m_plus_one) {
+	    for(k = 0; k < two_m; k++) {
+		p = k + i;
+		q = p + two_m;
+		tmp_a_r = buf[p].real;
+		tmp_a_i = buf[p].imag;
+		tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
+		tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
+		buf[p].real = tmp_a_r + tmp_b_r;
+		buf[p].imag =  tmp_a_i + tmp_b_i;
+		buf[q].real = tmp_a_r - tmp_b_r;
+		buf[q].imag =  tmp_a_i - tmp_b_i;
+	    }
+	}
+    }
+*/
+
+    /* 1. iteration */
+	// Note w[0][0]={1,0}
+	__asm__ volatile(
+		"xorps %%xmm1, %%xmm1	\n\t"
+		"xorps %%xmm2, %%xmm2	\n\t"
+		"mov %0, %%"REG_S"	\n\t"
+		ASMALIGN(4)
+		"1:			\n\t"
+		"movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
+		"movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
+		"movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
+		"movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
+		"addps %%xmm1, %%xmm0	\n\t"
+		"subps %%xmm2, %%xmm0	\n\t"
+		"movaps %%xmm0, (%%"REG_S")\n\t"
+		"add $16, %%"REG_S"	\n\t"
+		"cmp %1, %%"REG_S"	\n\t"
+		" jb 1b			\n\t"
+		:: "g" (buf), "r" (buf + 128)
+		: "%"REG_S
+	);
+
+    /* 2. iteration */
+	// Note w[1]={{1,0}, {0,-1}}
+	__asm__ volatile(
+		"movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
+		"mov %0, %%"REG_S"		\n\t"
+		ASMALIGN(4)
+		"1:				\n\t"
+		"movaps 16(%%"REG_S"), %%xmm2	\n\t" //r2,i2,r3,i3
+		"shufps $0xB4, %%xmm2, %%xmm2	\n\t" //r2,i2,i3,r3
+		"mulps %%xmm7, %%xmm2		\n\t" //r2,i2,i3,-r3
+		"movaps (%%"REG_S"), %%xmm0	\n\t" //r0,i0,r1,i1
+		"movaps (%%"REG_S"), %%xmm1	\n\t" //r0,i0,r1,i1
+		"addps %%xmm2, %%xmm0		\n\t"
+		"subps %%xmm2, %%xmm1		\n\t"
+		"movaps %%xmm0, (%%"REG_S")	\n\t"
+		"movaps %%xmm1, 16(%%"REG_S")	\n\t"
+		"add $32, %%"REG_S"	\n\t"
+		"cmp %1, %%"REG_S"	\n\t"
+		" jb 1b			\n\t"
+		:: "g" (buf), "r" (buf + 128)
+		: "%"REG_S
+	);
+
+    /* 3. iteration */
+/*
+ Note sseW2+0 ={1, 1, 1/sqrt(2), 1/sqrt(2)}
+ Note sseW2+16={0, 0, 1/sqrt(2),-1/sqrt(2)}
+ Note sseW2+32={0, 0,-1/sqrt(2),-1/sqrt(2)}
+ Note sseW2+48={1,-1, 1/sqrt(2),-1/sqrt(2)}
+*/
+	__asm__ volatile(
+		"movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
+		"movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
+		"xorps %%xmm5, %%xmm5		\n\t"
+		"xorps %%xmm2, %%xmm2		\n\t"
+		"mov %0, %%"REG_S"		\n\t"
+		ASMALIGN(4)
+		"1:				\n\t"
+		"movaps 32(%%"REG_S"), %%xmm2	\n\t" //r4,i4,r5,i5
+		"movaps 48(%%"REG_S"), %%xmm3	\n\t" //r6,i6,r7,i7
+		"movaps "MANGLE(sseW2)", %%xmm4	\n\t" // twiddles sseW2+0
+		"movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" // twiddles sseW2+32
+		"mulps %%xmm2, %%xmm4		\n\t"
+		"mulps %%xmm3, %%xmm5		\n\t"
+		"shufps $0xB1, %%xmm2, %%xmm2	\n\t" //i4,r4,i5,r5
+		"shufps $0xB1, %%xmm3, %%xmm3	\n\t" //i6,r6,i7,r7
+		"mulps %%xmm6, %%xmm3		\n\t"
+		"mulps %%xmm7, %%xmm2		\n\t"
+		"movaps (%%"REG_S"), %%xmm0	\n\t" //r0,i0,r1,i1
+		"movaps 16(%%"REG_S"), %%xmm1	\n\t" //r2,i2,r3,i3
+		"addps %%xmm4, %%xmm2		\n\t"
+		"addps %%xmm5, %%xmm3		\n\t"
+		"movaps %%xmm2, %%xmm4		\n\t"
+		"movaps %%xmm3, %%xmm5		\n\t"
+		"addps %%xmm0, %%xmm2		\n\t"
+		"addps %%xmm1, %%xmm3		\n\t"
+		"subps %%xmm4, %%xmm0		\n\t"
+		"subps %%xmm5, %%xmm1		\n\t"
+		"movaps %%xmm2, (%%"REG_S")	\n\t"
+		"movaps %%xmm3, 16(%%"REG_S")	\n\t"
+		"movaps %%xmm0, 32(%%"REG_S")	\n\t"
+		"movaps %%xmm1, 48(%%"REG_S")	\n\t"
+		"add $64, %%"REG_S"	\n\t"
+		"cmp %1, %%"REG_S"	\n\t"
+		" jb 1b			\n\t"
+		:: "g" (buf), "r" (buf + 128)
+		: "%"REG_S
+	);
+
+    /* 4-7. iterations */
+    for (m=3; m < 7; m++) {
+	two_m = (1 << m);
+	two_m_plus_one = two_m<<1;
+	two_m_plus_one_shl3 = (two_m_plus_one<<3);
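+	/* two_m_plus_one_shl3 is the byte stride between butterfly groups
+	   (two_m_plus_one complex_t values of 8 bytes each) */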
+	buf_offset = buf+128;
+	__asm__ volatile(
+		"mov %0, %%"REG_S"			\n\t"
+		ASMALIGN(4)
+		"1:					\n\t"
+		"xor %%"REG_D", %%"REG_D"		\n\t" // k
+		"lea (%%"REG_S", %3), %%"REG_d"		\n\t"
+		"2:					\n\t"
+		"movaps (%%"REG_d", %%"REG_D"), %%xmm1	\n\t"
+		"movaps (%4, %%"REG_D", 2), %%xmm2	\n\t"
+		"mulps %%xmm1, %%xmm2			\n\t"
+		"shufps $0xB1, %%xmm1, %%xmm1		\n\t"
+		"mulps 16(%4, %%"REG_D", 2), %%xmm1	\n\t"
+		"movaps (%%"REG_S", %%"REG_D"), %%xmm0	\n\t"
+		"addps %%xmm2, %%xmm1			\n\t"
+		"movaps %%xmm1, %%xmm2			\n\t"
+		"addps %%xmm0, %%xmm1			\n\t"
+		"subps %%xmm2, %%xmm0			\n\t"
+		"movaps %%xmm1, (%%"REG_S", %%"REG_D")	\n\t"
+		"movaps %%xmm0, (%%"REG_d", %%"REG_D")	\n\t"
+		"add $16, %%"REG_D"			\n\t"
+		"cmp %3, %%"REG_D"			\n\t" //FIXME (opt) count against 0
+		"jb 2b					\n\t"
+		"add %2, %%"REG_S"			\n\t"
+		"cmp %1, %%"REG_S"			\n\t"
+		" jb 1b					\n\t"
+		:: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
+		   "r" (sseW[m])
+		: "%"REG_S, "%"REG_D, "%"REG_d
+	);
+    }
+
+    /* Post IFFT complex multiply  plus IFFT complex conjugate*/
+	__asm__ volatile(
+		"mov $-1024, %%"REG_S"			\n\t"
+		ASMALIGN(4)
+		"1:					\n\t"
+		"movaps (%0, %%"REG_S"), %%xmm0		\n\t"
+		"movaps (%0, %%"REG_S"), %%xmm1		\n\t"
+		"shufps $0xB1, %%xmm0, %%xmm0		\n\t"
+		"mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
+		"mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
+		"addps %%xmm1, %%xmm0			\n\t"
+		"movaps %%xmm0, (%0, %%"REG_S")		\n\t"
+		"add $16, %%"REG_S"			\n\t"
+		" jnz 1b				\n\t"
+		:: "r" (buf+128)
+		: "%"REG_S
+	);
+
+
+    data_ptr = data;
+    delay_ptr = delay;
+    window_ptr = a52_imdct_window;
+
+    /* Window and convert to real valued signal */
+	__asm__ volatile(
+		"xor %%"REG_D", %%"REG_D"		\n\t"  // 0
+		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
+		"movss %3, %%xmm2			\n\t"  // bias
+		"shufps $0x00, %%xmm2, %%xmm2		\n\t"  // bias, bias, ...
+		ASMALIGN(4)
+		"1:					\n\t"
+		"movlps (%0, %%"REG_S"), %%xmm0		\n\t" // ? ? A ?
+		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? C ?
+		"movhps -16(%0, %%"REG_D"), %%xmm1	\n\t" // ? D C ?
+		"movhps -8(%0, %%"REG_D"), %%xmm0	\n\t" // ? B A ?
+		"shufps $0x99, %%xmm1, %%xmm0		\n\t" // D C B A
+		"mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
+		"addps (%2, %%"REG_S"), %%xmm0		\n\t"
+		"addps %%xmm2, %%xmm0			\n\t"
+		"movaps %%xmm0, (%1, %%"REG_S")		\n\t"
+		"add  $16, %%"REG_S"			\n\t"
+		"sub  $16, %%"REG_D"			\n\t"
+		"cmp  $512, %%"REG_S"			\n\t"
+		" jb 1b					\n\t"
+		:: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
+		: "%"REG_S, "%"REG_D
+	);
+	data_ptr+=128;
+	delay_ptr+=128;
+//	window_ptr+=128;
+
+	__asm__ volatile(
+		"mov $1024, %%"REG_D"			\n\t"  // 1024
+		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
+		"movss %3, %%xmm2			\n\t"  // bias
+		"shufps $0x00, %%xmm2, %%xmm2		\n\t"  // bias, bias, ...
+		ASMALIGN(4)
+		"1:					\n\t"
+		"movlps (%0, %%"REG_S"), %%xmm0		\n\t" // ? ? ? A
+		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? ? C
+		"movhps -16(%0, %%"REG_D"), %%xmm1	\n\t" // D ? ? C
+		"movhps -8(%0, %%"REG_D"), %%xmm0	\n\t" // B ? ? A
+		"shufps $0xCC, %%xmm1, %%xmm0		\n\t" // D C B A
+		"mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
+		"addps (%2, %%"REG_S"), %%xmm0		\n\t"
+		"addps %%xmm2, %%xmm0			\n\t"
+		"movaps %%xmm0, (%1, %%"REG_S")		\n\t"
+		"add $16, %%"REG_S"			\n\t"
+		"sub $16, %%"REG_D"			\n\t"
+		"cmp $512, %%"REG_S"			\n\t"
+		" jb 1b					\n\t"
+		:: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
+		: "%"REG_S, "%"REG_D
+	);
+	data_ptr+=128;
+//	window_ptr+=128;
+
+    /* The trailing edge of the window goes into the delay line */
+    delay_ptr = delay;
+
+	__asm__ volatile(
+		"xor %%"REG_D", %%"REG_D"		\n\t"  // 0
+		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
+		ASMALIGN(4)
+		"1:					\n\t"
+		"movlps (%0, %%"REG_S"), %%xmm0		\n\t" // ? ? ? A
+		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? ? C
+		"movhps -16(%0, %%"REG_D"), %%xmm1	\n\t" // D ? ? C
+		"movhps -8(%0, %%"REG_D"), %%xmm0	\n\t" // B ? ? A
+		"shufps $0xCC, %%xmm1, %%xmm0		\n\t" // D C B A
+		"mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
+		"movaps %%xmm0, (%1, %%"REG_S")		\n\t"
+		"add $16, %%"REG_S"			\n\t"
+		"sub $16, %%"REG_D"			\n\t"
+		"cmp $512, %%"REG_S"			\n\t"
+		" jb 1b					\n\t"
+		:: "r" (buf+64), "r" (delay_ptr)
+		: "%"REG_S, "%"REG_D
+	);
+	delay_ptr+=128;
+//	window_ptr-=128;
+
+	__asm__ volatile(
+		"mov $1024, %%"REG_D"			\n\t"  // 1024
+		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
+		ASMALIGN(4)
+		"1:					\n\t"
+		"movlps (%0, %%"REG_S"), %%xmm0	\n\t" // ? ? A ?
+		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? C ?
+		"movhps -16(%0, %%"REG_D"), %%xmm1	\n\t" // ? D C ?
+		"movhps -8(%0, %%"REG_D"), %%xmm0	\n\t" // ? B A ?
+		"shufps $0x99, %%xmm1, %%xmm0		\n\t" // D C B A
+		"mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
+		"movaps %%xmm0, (%1, %%"REG_S")		\n\t"
+		"add $16, %%"REG_S"			\n\t"
+		"sub $16, %%"REG_D"			\n\t"
+		"cmp $512, %%"REG_S"			\n\t"
+		" jb 1b					\n\t"
+		:: "r" (buf), "r" (delay_ptr)
+		: "%"REG_S, "%"REG_D
+	);
+}
+#endif
+#endif // ARCH_X86 || ARCH_X86_64
+
 void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias)
 {
     int i, k;
@@ -364,7 +1143,7 @@

 void a52_imdct_init (uint32_t mm_accel)
 {
-    int i, k;
+    int i, j, k;
     double sum;

     /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
@@ -416,6 +1195,101 @@
 	post2[i].real = cos ((M_PI / 128) * (i + 0.5));
 	post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
     }
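+    /* xcos1/xsin1 are the pre-/post-IFFT complex multiply factors and
+       w[m][k] = exp(-i*pi*k/2^m) the radix-2 FFT merge twiddles */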
+    for (i = 0; i < 128; i++) {
+	xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
+	xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
+    }
+    for (i = 0; i < 7; i++) {
+	j = 1 << i;
+	for (k = 0; k < j; k++) {
+	    w[i][k].real = cos (-M_PI * k / j);
+	    w[i][k].imag = sin (-M_PI * k / j);
+	}
+    }
+#if ARCH_X86 || ARCH_X86_64
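+	/* sseSinCos1c/d duplicate xcos1/xsin1 with the sign pattern the SSE
+	   pre-IFFT complex multiply expects, so that stage needs only two
+	   mulps and one subps per pair of complex values */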
+	for (i = 0; i < 128; i++) {
+	    sseSinCos1c[2*i+0]= xcos1[i];
+	    sseSinCos1c[2*i+1]= -xcos1[i];
+	    sseSinCos1d[2*i+0]= xsin1[i];
+	    sseSinCos1d[2*i+1]= xsin1[i];
+	}
+	for (i = 1; i < 7; i++) {
+	    j = 1 << i;
+	    for (k = 0; k < j; k+=2) {
+
+	    	sseW[i][4*k + 0] = w[i][k+0].real;
+	    	sseW[i][4*k + 1] = w[i][k+0].real;
+	    	sseW[i][4*k + 2] = w[i][k+1].real;
+	    	sseW[i][4*k + 3] = w[i][k+1].real;
+
+	    	sseW[i][4*k + 4] = -w[i][k+0].imag;
+	    	sseW[i][4*k + 5] = w[i][k+0].imag;
+	    	sseW[i][4*k + 6] = -w[i][k+1].imag;
+	    	sseW[i][4*k + 7] = w[i][k+1].imag;
+
+	// we multiply more or less uninitialized numbers, so we need to use exactly 0.0
+		if(k==0)
+		{
+//			sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
+			sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
+		}
+
+		if(2*k == j)
+		{
+			sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
+//			sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
+		}
+	    }
+	}
+
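+	/* sseWindow folds the sign flips and index reversals of the scalar
+	   windowing loops into one sequentially indexed table: entries 0-255
+	   feed the two output passes, 256-511 the two delay-line passes */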
+	for(i=0; i<128; i++)
+	{
+		sseWindow[2*i+0]= -a52_imdct_window[2*i+0];
+		sseWindow[2*i+1]=  a52_imdct_window[2*i+1];
+	}
+
+	for(i=0; i<64; i++)
+	{
+		sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1];
+		sseWindow[256 + 2*i+1]=  a52_imdct_window[254 - 2*i+0];
+		sseWindow[384 + 2*i+0]=  a52_imdct_window[126 - 2*i+1];
+		sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0];
+	}
+#endif
+	a52_imdct_512 = imdct_do_512;
+	ifft128 = ifft128_c;
+	ifft64 = ifft64_c;
+
+#if ARCH_X86 || ARCH_X86_64
+#if !ARCH_X86_64 || !defined(PIC)
+	if(mm_accel & MM_ACCEL_X86_SSE)
+	{
+	  fprintf (stderr, "Using SSE optimized IMDCT transform\n");
+	  a52_imdct_512 = imdct_do_512_sse;
+	}
+	else
+#endif
+	if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
+	{
+	  fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
+	  a52_imdct_512 = imdct_do_512_3dnowex;
+	}
+	else
+	if(mm_accel & MM_ACCEL_X86_3DNOW)
+	{
+	  fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
+	  a52_imdct_512 = imdct_do_512_3dnow;
+	}
+	else
+#endif // ARCH_X86 || ARCH_X86_64
+#if HAVE_ALTIVEC
+        if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
+	{
+	  fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
+          a52_imdct_512 = imdct_do_512_altivec;
+	}
+	else
+#endif

 #ifdef LIBA52_DJBFFT
     if (mm_accel & MM_ACCEL_DJBFFT) {
@@ -426,7 +1300,5 @@
 #endif
     {
 	fprintf (stderr, "No accelerated IMDCT transform found\n");
-	ifft128 = ifft128_c;
-	ifft64 = ifft64_c;
     }
 }
--- include/mm_accel.h	2006-06-12 15:05:00.000000000 +0200
+++ liba52/mm_accel.h	2006-06-05 02:23:04.000000000 +0200
@@ -30,7 +34,12 @@
 /* x86 accelerations */
 #define MM_ACCEL_X86_MMX	0x80000000
 #define MM_ACCEL_X86_3DNOW	0x40000000
+#define MM_ACCEL_X86_3DNOWEXT	0x08000000
 #define MM_ACCEL_X86_MMXEXT	0x20000000
+#define MM_ACCEL_X86_SSE	0x10000000
+
+/* PPC accelerations */
+#define MM_ACCEL_PPC_ALTIVEC	0x00010000

 uint32_t mm_accel (void);

--- liba52/parse.c	2006-12-05 08:08:01.000000000 +0100
+++ liba52/parse.c	2006-12-05 08:08:44.000000000 +0100
@@ -24,6 +28,7 @@
 #include "config.h"

 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
 #include <inttypes.h>

@@ -31,13 +36,12 @@
 #include "a52_internal.h"
 #include "bitstream.h"
 #include "tables.h"
+#include "mm_accel.h"
+#include "libavutil/avutil.h"

-#ifdef HAVE_MEMALIGN
+#if HAVE_MEMALIGN
 /* some systems have memalign() but no declaration for it */
 void * memalign (size_t align, size_t size);
-#else
-/* assume malloc alignment is sufficient */
-#define memalign(align,size) malloc (size)
 #endif

 typedef struct {
@@ -60,7 +64,16 @@
     if (state == NULL)
 	return NULL;

+#if defined(__MINGW32__) && defined(HAVE_SSE)
+    state->samples = av_malloc(256 * 12 * sizeof (sample_t));
+#else
     state->samples = memalign (16, 256 * 12 * sizeof (sample_t));
+#endif
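+    /* the SSE IMDCT uses movaps on buffers derived from state->samples, so
+       they must be 16-byte aligned; otherwise fall back to the non-SSE path */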
+    if(((int)state->samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){
+      mm_accel &=~MM_ACCEL_X86_SSE;
+      fprintf(stderr, "liba52: unable to get 16-byte aligned memory, disabling SSE instructions\n");
+    }
+
     if (state->samples == NULL) {
 	free (state);
 	return NULL;
@@ -74,6 +87,7 @@
     state->lfsr_state = 1;

     a52_imdct_init (mm_accel);
+    downmix_accel_init(mm_accel);

     return state;
 }
@@ -141,7 +155,7 @@
     state->acmod = acmod = buf[6] >> 5;

     a52_bitstream_set_ptr (state, buf + 6);
-    bitstream_get (state, 3);	/* skip acmod we already parsed */
+    bitstream_skip (state, 3);	/* skip acmod we already parsed */

     if ((acmod == 2) && (bitstream_get (state, 2) == 2))	/* dsurmod */
 	acmod = A52_DOLBY;
@@ -172,28 +186,28 @@

     chaninfo = !acmod;
     do {
-	bitstream_get (state, 5);	/* dialnorm */
+	bitstream_skip (state, 5);	/* dialnorm */
 	if (bitstream_get (state, 1))	/* compre */
-	    bitstream_get (state, 8);	/* compr */
+	    bitstream_skip (state, 8);	/* compr */
 	if (bitstream_get (state, 1))	/* langcode */
-	    bitstream_get (state, 8);	/* langcod */
+	    bitstream_skip (state, 8);	/* langcod */
 	if (bitstream_get (state, 1))	/* audprodie */
-	    bitstream_get (state, 7);	/* mixlevel + roomtyp */
+	    bitstream_skip (state, 7);	/* mixlevel + roomtyp */
     } while (chaninfo--);

-    bitstream_get (state, 2);		/* copyrightb + origbs */
+    bitstream_skip (state, 2);		/* copyrightb + origbs */

     if (bitstream_get (state, 1))	/* timecod1e */
-	bitstream_get (state, 14);	/* timecod1 */
+	bitstream_skip (state, 14);	/* timecod1 */
     if (bitstream_get (state, 1))	/* timecod2e */
-	bitstream_get (state, 14);	/* timecod2 */
+	bitstream_skip (state, 14);	/* timecod2 */

     if (bitstream_get (state, 1)) {	/* addbsie */
 	int addbsil;

 	addbsil = bitstream_get (state, 6);
 	do {
-	    bitstream_get (state, 8);	/* addbsi */
+	    bitstream_skip (state, 8);	/* addbsi */
 	} while (addbsil--);
     }

@@ -680,7 +694,7 @@
 				 state->fbw_expbap[i].exp[0],
 				 state->fbw_expbap[i].exp + 1))
 		return 1;
-	    bitstream_get (state, 2);	/* gainrng */
+	    bitstream_skip (state, 2);	/* gainrng */
 	}
     if (lfeexpstr != EXP_REUSE) {
 	do_bit_alloc |= 32;
@@ -755,7 +769,7 @@
     if (bitstream_get (state, 1)) {	/* skiple */
 	i = bitstream_get (state, 9);	/* skipl */
 	while (i--)
-	    bitstream_get (state, 8);
+	    bitstream_skip (state, 8);
     }

     samples = state->samples;
@@ -896,6 +910,10 @@

 void a52_free (a52_state_t * state)
 {
-    free (state->samples);
+#if defined(__MINGW32__) && defined(HAVE_SSE)
+    av_free (state->samples);
+#else
+    free (state->samples);
+#endif
     free (state);
 }