changeset 1245:03b7e2955a20

Added new MMX-optimized decoding core which speeds up decoding by at least 13% on any CPU.
author nick
date Fri, 29 Jun 2001 17:55:35 +0000
parents a2c71bf9a7d3
children 7f69c1dd1e91
files mp3lib/Makefile mp3lib/d_cpu.h mp3lib/d_cpu.s mp3lib/dct36.c mp3lib/dct64_3dnow.s mp3lib/dct64_MMX.s mp3lib/dct64_k7.s mp3lib/decod386.c mp3lib/decode_3dnow.s mp3lib/decode_MMX.s mp3lib/decode_k7.s mp3lib/decode_sse.s mp3lib/layer2.c mp3lib/layer3.c mp3lib/mpg123.h mp3lib/sr1.c mp3lib/tabinit.c mp3lib/tabinit_MMX.s mp3lib/test2.c
diffstat 19 files changed, 3170 insertions(+), 2274 deletions(-)
--- a/mp3lib/Makefile	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/Makefile	Fri Jun 29 17:55:35 2001 +0000
@@ -1,8 +1,10 @@
 
 include config.mak
 
-SRCS = sr1.c d_cpu.s decode_i586.s $(OPTIONAL_SRCS)
-OBJS = sr1.o d_cpu.o decode_i586.o $(OPTIONAL_OBJS)
+SRCS = sr1.c d_cpu.s decode_i586.s dct64_MMX.s decode_MMX.s tabinit_MMX.s\
+dct36_3dnow.s dct64_3dnow.s dct36_k7.s dct64_k7.s
+OBJS = sr1.o d_cpu.o decode_i586.o dct64_MMX.o decode_MMX.o tabinit_MMX.o\
+dct36_3dnow.o dct64_3dnow.o dct36_k7.o dct64_k7.o
 # OBJS = $(SRCS:.c,.s=.o)
 CFLAGS  = $(OPTFLAGS) $(EXTRA_INC)
 
--- a/mp3lib/d_cpu.h	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/d_cpu.h	Fri Jun 29 17:55:35 2001 +0000
@@ -9,9 +9,12 @@
 unsigned int _CpuID;
 unsigned int _i586;
 unsigned int _3dnow;
+unsigned int _isse;
+unsigned int _has_mmx;
 
 extern unsigned long CpuDetect( void );
 extern unsigned long ipentium( void );
+extern unsigned long isse( void );
 extern unsigned long a3dnow( void );
 
 #endif
--- a/mp3lib/d_cpu.s	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/d_cpu.s	Fri Jun 29 17:55:35 2001 +0000
@@ -9,6 +9,7 @@
 .globl CpuDetect
 .globl ipentium
 .globl a3dnow
+.globl isse
 
 / ---------------------------------------------------------------------------
 /  in C: unsigned long CpuDetect( void );
@@ -45,7 +46,9 @@
 
 / ---------------------------------------------------------------------------
 /  in C: unsigned long ipentium( void );
-/   return: 0 if the processor is not P5 or above else above 1.
+/  return: 0 if this processor is an i386 or i486
+/          1 if it is a Pentium or newer without MMX
+/          2 if it also supports MMX
 / ---------------------------------------------------------------------------
 ipentium:
         pushl  %ebx
@@ -63,10 +66,15 @@
         jz     no_cpuid
         movl   $1,%eax
         cpuid
-        shrl   $8,%eax
-        cmpl   $5,%eax
-        jb     no_cpuid
-        movl   $1,%eax
+	movl   %eax, %ecx
+	xorl   %eax, %eax
+        shrl   $8,%ecx
+        cmpl   $5,%ecx
+        jb     exit
+        incl   %eax
+	test   $0x00800000, %edx
+	jz     exit
+	incl   %eax
         jmp    exit
 no_cpuid:
         xorl   %eax,%eax
@@ -113,3 +121,33 @@
         popl   %edx
         popl   %ebx
         ret
+
+/ ---------------------------------------------------------------------------
+/  in C: unsigned long isse( void );
+/  return: 0 if this processor does not support SSE
+/          1 if it supports SSE
+/          2 if it also supports SSE2
+/ ---------------------------------------------------------------------------
+isse:
+        pushl  %ebx
+        pushl  %edx
+        pushl  %ecx
+
+        call   ipentium
+        testl  %eax,%eax
+        jz     exit3
+
+        movl   $1,%eax
+        cpuid
+	xorl   %eax, %eax
+        testl  $0x02000000,%edx
+        jz     exit3
+	incl   %eax
+        testl  $0x04000000,%edx
+        jz     exit3
+        incl   %eax
+exit3:
+        popl   %ecx
+        popl   %edx
+        popl   %ebx
+        ret
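
Note (illustrative, not part of the patch): the return-value conventions documented above for ipentium() and isse(), together with the new _has_mmx/_isse globals from d_cpu.h, suggest wiring along the following lines. The real call site lives in sr1.c, which this hunk does not show, so the body of this C sketch is an assumption; only the function and variable names come from the diff itself.

    /* hedged sketch -- hypothetical wiring, not the actual sr1.c code */
    #include "d_cpu.h"

    static void init_cpu_flags(void)
    {
        unsigned long p;

        _CpuID   = CpuDetect();      /* raw CPUID signature                  */
        p        = ipentium();       /* 0 = i386/i486, 1 = P5+, 2 = P5+ MMX  */
        _i586    = (p >= 1);
        _has_mmx = (p >= 2);
        _isse    = isse();           /* 0 = none, 1 = SSE, 2 = SSE2          */
        _3dnow   = a3dnow();         /* nonzero when 3DNow! is available     */
    }
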
--- a/mp3lib/dct36.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/dct36.c	Fri Jun 29 17:55:35 2001 +0000
@@ -193,7 +193,7 @@
     sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \
 	MACRO0(v); }
 
-    register const real *c = nCOS9;
+    register const real *c = COS9;
     register real *out2 = o2;
 	register real *w = wintab;
 	register real *out1 = o1;
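
Aside on the one-line dct36.c change above: it repoints the coefficient pointer at COS9, the 9-entry cosine table used by the 36-point IMDCT. Assuming mp3lib keeps mpg123's definition (the table itself is not visible in this changeset), it is normally filled once during layer-3 initialisation roughly like this:

    /* sketch of the usual mpg123-style setup; placement in init_layer3()
       is an assumption */
    #include <math.h>
    #include "mpg123.h"          /* for the 'real' type */

    static real COS9[9];         /* cos(k*pi/18), k = 0..8 */

    static void init_cos9(void)
    {
        int k;
        for (k = 0; k < 9; k++)
            COS9[k] = cos(M_PI / 18.0 * (double) k);
    }
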
--- a/mp3lib/dct64_3dnow.s	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/dct64_3dnow.s	Fri Jun 29 17:55:35 2001 +0000
@@ -1,706 +1,932 @@
-///
-/// Replacement of dct64() with AMD's 3DNow! SIMD operations support
-///
-/// Syuuhei Kashiyama <squash@mb.kcom.ne.jp>
-///
-/// The author of this program disclaim whole expressed or implied
-/// warranties with regard to this program, and in no event shall the
-/// author of this program liable to whatever resulted from the use of
-/// this program. Use it at your own risk.
-///
+# This code was taken from http://www.mpg123.org
+# See the ChangeLog of mpg123-0.59s-pre.1 for details
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+# Partial 3dnow! optimization by Nick Kurshev
+#
+# TODO: finish 3dnow! optimization at least in scalar mode
+#
 
-        .globl dct64_3dnow
-        .type    dct64_3dnow,@function
-dct64_3dnow:
-        subl $256,%esp
-        pushl %ebp
-        pushl %edi
-        pushl %esi
-        pushl %ebx
-        leal 16(%esp),%ebx
-        movl 284(%esp),%edi
-        movl 276(%esp),%ebp
-        movl 280(%esp),%edx
-        leal 128(%ebx),%esi
+.data
+	.align 8
+plus_minus_3dnow: .long 0x00000000, 0x80000000
+costab:
+	.long 1056974725
+	.long 1057056395
+	.long 1057223771
+	.long 1057485416
+	.long 1057855544
+	.long 1058356026
+	.long 1059019886
+	.long 1059897405
+	.long 1061067246
+	.long 1062657950
+	.long 1064892987
+	.long 1066774581
+	.long 1069414683
+	.long 1073984175
+	.long 1079645762
+	.long 1092815430
+	.long 1057005197
+	.long 1057342072
+	.long 1058087743
+	.long 1059427869
+	.long 1061799040
+	.long 1065862217
+	.long 1071413542
+	.long 1084439708
+	.long 1057128951
+	.long 1058664893
+	.long 1063675095
+	.long 1076102863
+	.long 1057655764
+	.long 1067924853
+	.long 1060439283
 
-        / femms
+.text
+
+	.align 16
+
+.globl dct64_MMX_3dnow
+dct64_MMX_3dnow:
+	pushl %ebx
+	pushl %esi
+	pushl %edi
+	subl $256,%esp
+	movl 280(%esp),%eax
 
-        // 1
-        movl pnts,%eax
-        movq 0(%edi),%mm0
-        movq %mm0,%mm1
-        movd 124(%edi),%mm2
-        punpckldq 120(%edi),%mm2
-        movq 0(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,0(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,124(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,120(%ebx)
-        movq 8(%edi),%mm4
-        movq %mm4,%mm5
-        movd 116(%edi),%mm6
-        punpckldq 112(%edi),%mm6
-        movq 8(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,8(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,116(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,112(%ebx)
-        movq 16(%edi),%mm0
-        movq %mm0,%mm1
-        movd 108(%edi),%mm2
-        punpckldq 104(%edi),%mm2
-        movq 16(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,16(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,108(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,104(%ebx)
-        movq 24(%edi),%mm4
-        movq %mm4,%mm5
-        movd 100(%edi),%mm6
-        punpckldq 96(%edi),%mm6
-        movq 24(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,24(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,100(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,96(%ebx)
-        movq 32(%edi),%mm0
-        movq %mm0,%mm1
-        movd 92(%edi),%mm2
-        punpckldq 88(%edi),%mm2
-        movq 32(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,32(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,92(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,88(%ebx)
-        movq 40(%edi),%mm4
-        movq %mm4,%mm5
-        movd 84(%edi),%mm6
-        punpckldq 80(%edi),%mm6
-        movq 40(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,40(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,84(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,80(%ebx)
-        movq 48(%edi),%mm0
-        movq %mm0,%mm1
-        movd 76(%edi),%mm2
-        punpckldq 72(%edi),%mm2
-        movq 48(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,48(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,76(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,72(%ebx)
-        movq 56(%edi),%mm4
-        movq %mm4,%mm5
-        movd 68(%edi),%mm6
-        punpckldq 64(%edi),%mm6
-        movq 56(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,56(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,68(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,64(%ebx)
+	leal 128(%esp),%edx
+	movl 272(%esp),%esi
+	movl 276(%esp),%edi
+	movl $costab,%ebx
+	orl %ecx,%ecx
+	movl %esp,%ecx
+	femms	
+/* Phase 1*/
+	movq	(%eax), %mm0
+	movq	8(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	120(%eax), %mm1
+	movq	112(%eax), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%edx)
+	movq	%mm4, 8(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	(%ebx), %mm3
+	pfmul	8(%ebx), %mm7
+	movd	%mm3, 124(%edx)
+	movd	%mm7, 116(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 120(%edx)
+	movd	%mm7, 112(%edx)
+
+	movq	16(%eax), %mm0
+	movq	24(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	104(%eax), %mm1
+	movq	96(%eax), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 16(%edx)
+	movq	%mm4, 24(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	16(%ebx), %mm3
+	pfmul	24(%ebx), %mm7
+	movd	%mm3, 108(%edx)
+	movd	%mm7, 100(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 104(%edx)
+	movd	%mm7, 96(%edx)
+
+	movq	32(%eax), %mm0
+	movq	40(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	88(%eax), %mm1
+	movq	80(%eax), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 32(%edx)
+	movq	%mm4, 40(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	32(%ebx), %mm3
+	pfmul	40(%ebx), %mm7
+	movd	%mm3, 92(%edx)
+	movd	%mm7, 84(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 88(%edx)
+	movd	%mm7, 80(%edx)
+
+	movq	48(%eax), %mm0
+	movq	56(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	72(%eax), %mm1
+	movq	64(%eax), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 48(%edx)
+	movq	%mm4, 56(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	48(%ebx), %mm3
+	pfmul	56(%ebx), %mm7
+	movd	%mm3, 76(%edx)
+	movd	%mm7, 68(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 72(%edx)
+	movd	%mm7, 64(%edx)
+
+/* Phase 2*/
 
-        // 2
-        movl pnts+4,%eax
-        / 0, 14
-        movq 0(%ebx),%mm0
-        movq %mm0,%mm1
-        movd 60(%ebx),%mm2
-        punpckldq 56(%ebx),%mm2
-        movq 0(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,0(%esi)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,60(%esi)
-        psrlq $32,%mm1
-        movd %mm1,56(%esi)
-        / 16, 30
-        movq 64(%ebx),%mm0
-        movq %mm0,%mm1
-        movd 124(%ebx),%mm2
-        punpckldq 120(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,64(%esi)
-        pfsubr %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,124(%esi)
-        psrlq $32,%mm1
-        movd %mm1,120(%esi)
-        movq 8(%ebx),%mm4
-        / 2, 12
-        movq %mm4,%mm5
-        movd 52(%ebx),%mm6
-        punpckldq 48(%ebx),%mm6
-        movq 8(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,8(%esi)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,52(%esi)
-        psrlq $32,%mm5
-        movd %mm5,48(%esi)
-        movq 72(%ebx),%mm4
-        / 18, 28
-        movq %mm4,%mm5
-        movd 116(%ebx),%mm6
-        punpckldq 112(%ebx),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,72(%esi)
-        pfsubr %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,116(%esi)
-        psrlq $32,%mm5
-        movd %mm5,112(%esi)
-        movq 16(%ebx),%mm0
-        / 4, 10
-        movq %mm0,%mm1
-        movd 44(%ebx),%mm2
-        punpckldq 40(%ebx),%mm2
-        movq 16(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,16(%esi)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,44(%esi)
-        psrlq $32,%mm1
-        movd %mm1,40(%esi)
-        movq 80(%ebx),%mm0
-        / 20, 26
-        movq %mm0,%mm1
-        movd 108(%ebx),%mm2
-        punpckldq 104(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,80(%esi)
-        pfsubr %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,108(%esi)
-        psrlq $32,%mm1
-        movd %mm1,104(%esi)
-        movq 24(%ebx),%mm4
-        / 6, 8
-        movq %mm4,%mm5
-        movd 36(%ebx),%mm6
-        punpckldq 32(%ebx),%mm6
-        movq 24(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,24(%esi)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,36(%esi)
-        psrlq $32,%mm5
-        movd %mm5,32(%esi)
-        movq 88(%ebx),%mm4
-        / 22, 24
-        movq %mm4,%mm5
-        movd 100(%ebx),%mm6
-        punpckldq 96(%ebx),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,88(%esi)
-        pfsubr %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,100(%esi)
-        psrlq $32,%mm5
-        movd %mm5,96(%esi)
+	movq	(%edx), %mm0
+	movq	8(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	56(%edx), %mm1
+	movq	48(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%ecx)
+	movq	%mm4, 8(%ecx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	64(%ebx), %mm3
+	pfmul	72(%ebx), %mm7
+	movd	%mm3, 60(%ecx)
+	movd	%mm7, 52(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 56(%ecx)
+	movd	%mm7, 48(%ecx)
+	
+	movq	16(%edx), %mm0
+	movq	24(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	40(%edx), %mm1
+	movq	32(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 16(%ecx)
+	movq	%mm4, 24(%ecx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	80(%ebx), %mm3
+	pfmul	88(%ebx), %mm7
+	movd	%mm3, 44(%ecx)
+	movd	%mm7, 36(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 40(%ecx)
+	movd	%mm7, 32(%ecx)
+
+/* Phase 3*/
+
+	movq	64(%edx), %mm0
+	movq	72(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	120(%edx), %mm1
+	movq	112(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 64(%ecx)
+	movq	%mm4, 72(%ecx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	64(%ebx), %mm3
+	pfmul	72(%ebx), %mm7
+	movd	%mm3, 124(%ecx)
+	movd	%mm7, 116(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 120(%ecx)
+	movd	%mm7, 112(%ecx)
+
+	movq	80(%edx), %mm0
+	movq	88(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	104(%edx), %mm1
+	movq	96(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 80(%ecx)
+	movq	%mm4, 88(%ecx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	80(%ebx), %mm3
+	pfmul	88(%ebx), %mm7
+	movd	%mm3, 108(%ecx)
+	movd	%mm7, 100(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 104(%ecx)
+	movd	%mm7, 96(%ecx)
+	
+/* Phase 4*/
 
-        // 3
-        movl pnts+8,%eax
-        movq 0(%eax),%mm0
-        movq 8(%eax),%mm1
-        movq 0(%esi),%mm2
-        / 0, 6
-        movq %mm2,%mm3
-        movd 28(%esi),%mm4
-        punpckldq 24(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsub %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,0(%ebx)
-        movd %mm3,28(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,24(%ebx)
-        movq 8(%esi),%mm5
-        / 2, 4
-        movq %mm5,%mm6
-        movd 20(%esi),%mm7
-        punpckldq 16(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsub %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,8(%ebx)
-        movd %mm6,20(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,16(%ebx)
-        movq 32(%esi),%mm2
-        / 8, 14
-        movq %mm2,%mm3
-        movd 60(%esi),%mm4
-        punpckldq 56(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsubr %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,32(%ebx)
-        movd %mm3,60(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,56(%ebx)
-        movq 40(%esi),%mm5
-        / 10, 12
-        movq %mm5,%mm6
-        movd 52(%esi),%mm7
-        punpckldq 48(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsubr %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,40(%ebx)
-        movd %mm6,52(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,48(%ebx)
-        movq 64(%esi),%mm2
-        / 16, 22
-        movq %mm2,%mm3
-        movd 92(%esi),%mm4
-        punpckldq 88(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsub %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,64(%ebx)
-        movd %mm3,92(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,88(%ebx)
-        movq 72(%esi),%mm5
-        / 18, 20
-        movq %mm5,%mm6
-        movd 84(%esi),%mm7
-        punpckldq 80(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsub %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,72(%ebx)
-        movd %mm6,84(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,80(%ebx)
-        movq 96(%esi),%mm2
-        / 24, 30
-        movq %mm2,%mm3
-        movd 124(%esi),%mm4
-        punpckldq 120(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsubr %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,96(%ebx)
-        movd %mm3,124(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,120(%ebx)
-        movq 104(%esi),%mm5
-        / 26, 28
-        movq %mm5,%mm6
-        movd 116(%esi),%mm7
-        punpckldq 112(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsubr %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,104(%ebx)
-        movd %mm6,116(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,112(%ebx)
+	movq	(%ecx), %mm0
+	movq	8(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	24(%ecx), %mm1
+	movq	16(%ecx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%edx)
+	movq	%mm4, 8(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	movd	%mm3, 28(%edx)
+	movd	%mm7, 20(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 24(%edx)
+	movd	%mm7, 16(%edx)
+
+	movq	32(%ecx), %mm0
+	movq	40(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	56(%ecx), %mm1
+	movq	48(%ecx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 32(%edx)
+	movq	%mm4, 40(%edx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	movd	%mm3, 60(%edx)
+	movd	%mm7, 52(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 56(%edx)
+	movd	%mm7, 48(%edx)
+
+	movq	64(%ecx), %mm0
+	movq	72(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	88(%ecx), %mm1
+	movq	80(%ecx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 64(%edx)
+	movq	%mm4, 72(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	movd	%mm3, 92(%edx)
+	movd	%mm7, 84(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 88(%edx)
+	movd	%mm7, 80(%edx)
+
+	movq	96(%ecx), %mm0
+	movq	104(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	120(%ecx), %mm1
+	movq	112(%ecx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 96(%edx)
+	movq	%mm4, 104(%edx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	movd	%mm3, 124(%edx)
+	movd	%mm7, 116(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 120(%edx)
+	movd	%mm7, 112(%edx)
+
+/* Phase 5 */
+
+	movq	(%edx), %mm0
+	movq	16(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	8(%edx), %mm1
+	movq	24(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%ecx)
+	movq	%mm4, 16(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	movd	%mm3, 12(%ecx)
+	movd	%mm7, 28(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 8(%ecx)
+	movd	%mm7, 24(%ecx)
 
-        // 4
-        movl pnts+12,%eax
-        movq 0(%eax),%mm0
-        movq 0(%ebx),%mm1
-        / 0
-        movq %mm1,%mm2
-        movd 12(%ebx),%mm3
-        punpckldq 8(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,0(%esi)
-        movd %mm2,12(%esi)
-        psrlq $32,%mm2
-        movd %mm2,8(%esi)
-        movq 16(%ebx),%mm4
-        / 4
-        movq %mm4,%mm5
-        movd 28(%ebx),%mm6
-        punpckldq 24(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,16(%esi)
-        movd %mm5,28(%esi)
-        psrlq $32,%mm5
-        movd %mm5,24(%esi)
-        movq 32(%ebx),%mm1
-        / 8
-        movq %mm1,%mm2
-        movd 44(%ebx),%mm3
-        punpckldq 40(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,32(%esi)
-        movd %mm2,44(%esi)
-        psrlq $32,%mm2
-        movd %mm2,40(%esi)
-        movq 48(%ebx),%mm4
-        / 12
-        movq %mm4,%mm5
-        movd 60(%ebx),%mm6
-        punpckldq 56(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,48(%esi)
-        movd %mm5,60(%esi)
-        psrlq $32,%mm5
-        movd %mm5,56(%esi)
-        movq 64(%ebx),%mm1
-        / 16
-        movq %mm1,%mm2
-        movd 76(%ebx),%mm3
-        punpckldq 72(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,64(%esi)
-        movd %mm2,76(%esi)
-        psrlq $32,%mm2
-        movd %mm2,72(%esi)
-        movq 80(%ebx),%mm4
-        / 20
-        movq %mm4,%mm5
-        movd 92(%ebx),%mm6
-        punpckldq 88(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,80(%esi)
-        movd %mm5,92(%esi)
-        psrlq $32,%mm5
-        movd %mm5,88(%esi)
-        movq 96(%ebx),%mm1
-        / 24
-        movq %mm1,%mm2
-        movd 108(%ebx),%mm3
-        punpckldq 104(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,96(%esi)
-        movd %mm2,108(%esi)
-        psrlq $32,%mm2
-        movd %mm2,104(%esi)
-        movq 112(%ebx),%mm4
-        / 28
-        movq %mm4,%mm5
-        movd 124(%ebx),%mm6
-        punpckldq 120(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,112(%esi)
-        movd %mm5,124(%esi)
-        psrlq $32,%mm5
-        movd %mm5,120(%esi)
+	movq	32(%edx), %mm0
+	movq	48(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	40(%edx), %mm1
+	movq	56(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 32(%ecx)
+	movq	%mm4, 48(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	movd	%mm3, 44(%ecx)
+	movd	%mm7, 60(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 40(%ecx)
+	movd	%mm7, 56(%ecx)
+
+	movq	64(%edx), %mm0
+	movq	80(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	72(%edx), %mm1
+	movq	88(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 64(%ecx)
+	movq	%mm4, 80(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	movd	%mm3, 76(%ecx)
+	movd	%mm7, 92(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 72(%ecx)
+	movd	%mm7, 88(%ecx)
+
+	movq	96(%edx), %mm0
+	movq	112(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	104(%edx), %mm1
+	movq	120(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 96(%ecx)
+	movq	%mm4, 112(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	movd	%mm3, 108(%ecx)
+	movd	%mm7, 124(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 104(%ecx)
+	movd	%mm7, 120(%ecx)
+	
+/* Phase 6. This is the end of easy road. */
+	movl	$1, %eax
+	movd	%eax, %mm7
+	pi2fd	%mm7, %mm7
+	movq	32(%ecx), %mm0
+	punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */	
+	movq	%mm0, %mm1
+	movq	plus_minus_3dnow, %mm6
+	/* n.b.: pfpnacc */
+	pxor	%mm6, %mm1
+	pfacc	%mm1, %mm0
+	/**/
+	pfmul	%mm7, %mm0
+	movq	%mm0, 32(%edx)
+	femms
+
+	flds   44(%ecx)
+	fsubs  40(%ecx)
+	fmuls 120(%ebx)
+
+	fsts   44(%edx)
+	fadds  40(%ecx) /* pfacc 40(ecx), 56(%ecx) */
+	fadds  44(%ecx)
+	fstps  40(%edx)
+
+	flds   48(%ecx)
+	fsubs  52(%ecx)
+	fmuls 120(%ebx)
+
+	flds   60(%ecx)
+	fsubs  56(%ecx)
+	fmuls 120(%ebx)
+
+	fld      %st(0)
+	fadds  56(%ecx)
+	fadds  60(%ecx)
+
+	fld      %st(0)
+	fadds  48(%ecx)
+	fadds  52(%ecx)
+	fstps  48(%edx)
+	fadd     %st(2)
+	fstps  56(%edx)
+	fsts   60(%edx)
+	faddp    %st(1)
+	fstps  52(%edx)
+/*---*/
+	flds   64(%ecx)
+	fadds  68(%ecx)
+	fstps  64(%edx)
+
+	flds   64(%ecx)
+	fsubs  68(%ecx)
+	fmuls 120(%ebx)
+	fstps  68(%edx)
+
+	flds   76(%ecx)
+	fsubs  72(%ecx)
+	fmuls 120(%ebx)
+	fsts   76(%edx)
+	fadds  72(%ecx)
+	fadds  76(%ecx)
+	fstps  72(%edx)
+
+	flds   92(%ecx)
+	fsubs  88(%ecx)
+	fmuls 120(%ebx)
+	fsts   92(%edx)
+	fadds  92(%ecx)
+	fadds  88(%ecx)
+
+	fld      %st(0)
+	fadds  80(%ecx)
+	fadds  84(%ecx)
+	fstps  80(%edx)
+
+	flds   80(%ecx)
+	fsubs  84(%ecx)
+	fmuls 120(%ebx)
+	fadd  %st(0), %st(1)
+	fadds 92(%edx)
+	fstps 84(%edx)
+	fstps 88(%edx)
+
+	flds   96(%ecx)
+	fadds 100(%ecx)
+	fstps  96(%edx)
+
+	flds   96(%ecx)
+	fsubs 100(%ecx)
+	fmuls 120(%ebx)
+	fstps 100(%edx)
+
+	flds  108(%ecx)
+	fsubs 104(%ecx)
+	fmuls 120(%ebx)
+	fsts  108(%edx)
+	fadds 104(%ecx)
+	fadds 108(%ecx)
+	fstps 104(%edx)
+
+	flds  124(%ecx)
+	fsubs 120(%ecx)
+	fmuls 120(%ebx)
+	fsts  124(%edx)
+	fadds 120(%ecx)
+	fadds 124(%ecx)
+
+	fld      %st(0)
+	fadds 112(%ecx)
+	fadds 116(%ecx)
+	fstps 112(%edx)
+
+	flds  112(%ecx)
+	fsubs 116(%ecx)
+	fmuls 120(%ebx)
+	fadd  %st(0),%st(1)
+	fadds 124(%edx)
+	fstps 116(%edx)
+	fstps 120(%edx)
+	jnz .L01
+	
+/* Phase 7*/
+
+	flds      (%ecx)
+	fadds    4(%ecx)
+	fstps 1024(%esi)
+
+	flds      (%ecx)
+	fsubs    4(%ecx)
+	fmuls  120(%ebx)
+	fsts      (%esi)
+	fstps     (%edi)
+
+	flds   12(%ecx)
+	fsubs   8(%ecx)
+	fmuls 120(%ebx)
+	fsts  512(%edi)
+	fadds  12(%ecx)
+	fadds   8(%ecx)
+	fstps 512(%esi)
+
+	flds   16(%ecx)
+	fsubs  20(%ecx)
+	fmuls 120(%ebx)
 
-        // 5
-        movl $-1,%eax
-        movd %eax,%mm1
-        movl $1,%eax
-        movd %eax,%mm0
-        / L | H
-        punpckldq %mm1,%mm0
-        pi2fd %mm0,%mm0
-        / 1.0 | -1.0
-        movd %eax,%mm1
-        pi2fd %mm1,%mm1
-        movl pnts+16,%eax
-        movd 0(%eax),%mm2
-        punpckldq %mm2,%mm1
-        / 1.0 | cos0
-        movq 0(%esi),%mm2
-        / 0
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,0(%ebx)
-        movq 8(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,8(%ebx)
-        movq 16(%esi),%mm2
-        / 4
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 24(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,16(%ebx)
-        movq %mm4,24(%ebx)
-        movq 32(%esi),%mm2
-        / 8
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,32(%ebx)
-        movq 40(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,40(%ebx)
-        movq 48(%esi),%mm2
-        / 12
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 56(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,48(%ebx)
-        movq %mm4,56(%ebx)
-        movq 64(%esi),%mm2
-        / 16
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,64(%ebx)
-        movq 72(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,72(%ebx)
-        movq 80(%esi),%mm2
-        / 20
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 88(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,80(%ebx)
-        movq %mm4,88(%ebx)
-        movq 96(%esi),%mm2
-        / 24
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,96(%ebx)
-        movq 104(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,104(%ebx)
-        movq 112(%esi),%mm2
-        / 28
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 120(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,112(%ebx)
-        movq %mm4,120(%ebx)
+	flds   28(%ecx)
+	fsubs  24(%ecx)
+	fmuls 120(%ebx)
+	fsts  768(%edi)
+	fld      %st(0)
+	fadds  24(%ecx)
+	fadds  28(%ecx)
+	fld      %st(0)
+	fadds  16(%ecx)
+	fadds  20(%ecx)
+	fstps 768(%esi)
+	fadd     %st(2)
+	fstps 256(%esi)
+	faddp    %st(1)
+	fstps 256(%edi)
+	
+/* Phase 8*/
+
+	flds   32(%edx)
+	fadds  48(%edx)
+	fstps 896(%esi)
+
+	flds   48(%edx)
+	fadds  40(%edx)
+	fstps 640(%esi)
+
+	flds   40(%edx)
+	fadds  56(%edx)
+	fstps 384(%esi)
+
+	flds   56(%edx)
+	fadds  36(%edx)
+	fstps 128(%esi)
+
+	flds   36(%edx)
+	fadds  52(%edx)
+	fstps 128(%edi)
+
+	flds   52(%edx)
+	fadds  44(%edx)
+	fstps 384(%edi)
+
+	flds   60(%edx)
+	fsts  896(%edi)
+	fadds  44(%edx)
+	fstps 640(%edi)
+
+	flds   96(%edx)
+	fadds 112(%edx)
+	fld      %st(0)
+	fadds  64(%edx)
+	fstps 960(%esi)
+	fadds  80(%edx)
+	fstps 832(%esi)
+
+	flds  112(%edx)
+	fadds 104(%edx)
+	fld      %st(0)
+	fadds  80(%edx)
+	fstps 704(%esi)
+	fadds  72(%edx)
+	fstps 576(%esi)
+
+	flds  104(%edx)
+	fadds 120(%edx)
+	fld      %st(0)
+	fadds  72(%edx)
+	fstps 448(%esi)
+	fadds  88(%edx)
+	fstps 320(%esi)
+
+	flds  120(%edx)
+	fadds 100(%edx)
+	fld      %st(0)
+	fadds  88(%edx)
+	fstps 192(%esi)
+	fadds  68(%edx)
+	fstps  64(%esi)
+
+	flds  100(%edx)
+	fadds 116(%edx)
+	fld      %st(0)
+	fadds  68(%edx)
+	fstps  64(%edi)
+	fadds  84(%edx)
+	fstps 192(%edi)
+
+	flds  116(%edx)
+	fadds 108(%edx)
+	fld      %st(0)
+	fadds  84(%edx)
+	fstps 320(%edi)
+	fadds  76(%edx)
+	fstps 448(%edi)
+
+	flds  108(%edx)
+	fadds 124(%edx)
+	fld      %st(0)
+	fadds  76(%edx)
+	fstps 576(%edi)
+	fadds  92(%edx)
+	fstps 704(%edi)
+
+	flds  124(%edx)
+	fsts  960(%edi)
+	fadds  92(%edx)
+	fstps 832(%edi)
+	jmp	.L_bye
+.L01:	
+/* Phase 9*/
+
+	flds      (%ecx)
+	fadds    4(%ecx)
+	fistp  512(%esi)
+
+	flds      (%ecx)
+	fsubs    4(%ecx)
+	fmuls  120(%ebx)
+
+	fistp     (%esi)
+
 
-        // Phase6
-        movl 0(%ebx),%eax
-        movl %eax,1024(%ebp)
-        movl 4(%ebx),%eax
-        movl %eax,0(%ebp)
-        movl %eax,0(%edx)
-        movl 8(%ebx),%eax
-        movl %eax,512(%ebp)
-        movl 12(%ebx),%eax
-        movl %eax,512(%edx)
+	flds    12(%ecx)
+	fsubs    8(%ecx)
+	fmuls  120(%ebx)
+	fist   256(%edi)
+	fadds   12(%ecx)
+	fadds    8(%ecx)
+	fistp  256(%esi)
 
-        movl 16(%ebx),%eax
-        movl %eax,768(%ebp)
-        movl 20(%ebx),%eax
-        movl %eax,256(%edx)
-
-        movl 24(%ebx),%eax
-        movl %eax,256(%ebp)
-        movl 28(%ebx),%eax
-        movl %eax,768(%edx)
+	flds   16(%ecx)
+	fsubs  20(%ecx)
+	fmuls 120(%ebx)
 
-        movq 32(%ebx),%mm0
-        movq 48(%ebx),%mm1
-        pfadd %mm1,%mm0
-        movd %mm0,896(%ebp)
-        psrlq $32,%mm0
-        movd %mm0,128(%edx)
-        movq 40(%ebx),%mm2
-        pfadd %mm2,%mm1
-        movd %mm1,640(%ebp)
-        psrlq $32,%mm1
-        movd %mm1,384(%edx)
+	flds   28(%ecx)
+	fsubs  24(%ecx)
+	fmuls 120(%ebx)
+	fist  384(%edi)
+	fld      %st(0)
+	fadds  24(%ecx)
+	fadds  28(%ecx)
+	fld      %st(0)
+	fadds  16(%ecx)
+	fadds  20(%ecx)
+	fistp  384(%esi)
+	fadd     %st(2)
+	fistp  128(%esi)
+	faddp    %st(1)
+	fistp  128(%edi)
+	
+/* Phase 10*/
+
+	flds    32(%edx)
+	fadds   48(%edx)
+	fistp  448(%esi)
 
-        movq 56(%ebx),%mm3
-        pfadd %mm3,%mm2
-        movd %mm2,384(%ebp)
-        psrlq $32,%mm2
-        movd %mm2,640(%edx)
+	flds   48(%edx)
+	fadds  40(%edx)
+	fistp 320(%esi)
+
+	flds   40(%edx)
+	fadds  56(%edx)
+	fistp 192(%esi)
+
+	flds   56(%edx)
+	fadds  36(%edx)
+	fistp  64(%esi)
 
-        movd 36(%ebx),%mm4
-        pfadd %mm4,%mm3
-        movd %mm3,128(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,896(%edx)
-        movq 96(%ebx),%mm0
-        movq 64(%ebx),%mm1
+	flds   36(%edx)
+	fadds  52(%edx)
+	fistp  64(%edi)
+
+	flds   52(%edx)
+	fadds  44(%edx)
+	fistp 192(%edi)
+
+	flds   60(%edx)
+	fist   448(%edi)
+	fadds  44(%edx)
+	fistp 320(%edi)
 
-        movq 112(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,%mm3
-        pfadd %mm1,%mm3
-        movd %mm3,960(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,64(%edx)
-        movq 80(%ebx),%mm1
-        pfadd %mm1,%mm0
-        movd %mm0,832(%ebp)
-        psrlq $32,%mm0
-        movd %mm0,192(%edx)
-        movq 104(%ebx),%mm3
-        pfadd %mm3,%mm2
-        movq %mm2,%mm4
-        pfadd %mm1,%mm4
-        movd %mm4,704(%ebp)
-        psrlq $32,%mm4
-        movd %mm4,320(%edx)
-        movq 72(%ebx),%mm1
-        pfadd %mm1,%mm2
-        movd %mm2,576(%ebp)
-        psrlq $32,%mm2
-        movd %mm2,448(%edx)
+	flds   96(%edx)
+	fadds 112(%edx)
+	fld      %st(0)
+	fadds  64(%edx)
+	fistp 480(%esi)
+	fadds  80(%edx)
+	fistp 416(%esi)
+
+	flds  112(%edx)
+	fadds 104(%edx)
+	fld      %st(0)
+	fadds  80(%edx)
+	fistp 352(%esi)
+	fadds  72(%edx)
+	fistp 288(%esi)
+
+	flds  104(%edx)
+	fadds 120(%edx)
+	fld      %st(0)
+	fadds  72(%edx)
+	fistp 224(%esi)
+	fadds  88(%edx)
+	fistp 160(%esi)
+
+	flds  120(%edx)
+	fadds 100(%edx)
+	fld      %st(0)
+	fadds  88(%edx)
+	fistp  96(%esi)
+	fadds  68(%edx)
+	fistp  32(%esi)
 
-        movq 120(%ebx),%mm4
-        pfadd %mm4,%mm3
-        movq %mm3,%mm5
-        pfadd %mm1,%mm5
-        movd %mm5,448(%ebp)
-        psrlq $32,%mm5
-        movd %mm5,576(%edx)
-        movq 88(%ebx),%mm1
-        pfadd %mm1,%mm3
-        movd %mm3,320(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,704(%edx)
+	flds  100(%edx)
+	fadds 116(%edx)
+	fld      %st(0)
+	fadds  68(%edx)
+	fistp  32(%edi)
+	fadds  84(%edx)
+	fistp  96(%edi)
+
+	flds  116(%edx)
+	fadds 108(%edx)
+	fld      %st(0)
+	fadds  84(%edx)
+	fistp 160(%edi)
+	fadds  76(%edx)
+	fistp 224(%edi)
 
-        movd 100(%ebx),%mm5
-        pfadd %mm5,%mm4
-        movq %mm4,%mm6
-        pfadd %mm1,%mm6
-        movd %mm6,192(%ebp)
-        psrlq $32,%mm6
-        movd %mm6,832(%edx)
-        movd 68(%ebx),%mm1
-        pfadd %mm1,%mm4
-        movd %mm4,64(%ebp)
-        psrlq $32,%mm4
-        movd %mm4,960(%edx)
+	flds  108(%edx)
+	fadds 124(%edx)
+	fld      %st(0)
+	fadds  76(%edx)
+	fistp 288(%edi)
+	fadds  92(%edx)
+	fistp 352(%edi)
 
-        / femms
+	flds  124(%edx)
+	fist  480(%edi)
+	fadds  92(%edx)
+	fistp 416(%edi)
+	movsw
+.L_bye:
+	addl $256,%esp
+	popl %edi
+	popl %esi
+	popl %ebx
+	ret
+	
 
-        popl %ebx
-        popl %esi
-        popl %edi
-        popl %ebp
-        addl $256,%esp
-
-        ret
-
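
A note on the "/* n.b.: pswapd*/" markers in the new dct64_MMX_3dnow above: base 3DNow! has no PSWAPD, so each marked movq/psrlq/punpckldq triple emulates it with plain MMX by swapping the two 32-bit halves of a 64-bit register (the K7 version in dct64_k7.s below uses the real instruction). As a scalar C sketch, the emulated operation is simply:

    /* what PSWAPD does to one MMX register: exchange its two dwords */
    #include <stdint.h>

    static uint64_t pswapd(uint64_t v)
    {
        return (v >> 32) | (v << 32);
    }
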
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mp3lib/dct64_MMX.s	Fri Jun 29 17:55:35 2001 +0000
@@ -0,0 +1,1028 @@
+# This code was taken from http://www.mpg123.org
+# See the ChangeLog of mpg123-0.59s-pre.1 for details
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+
+.data
+	.align 4
+costab:
+	.long 1056974725
+	.long 1057056395
+	.long 1057223771
+	.long 1057485416
+	.long 1057855544
+	.long 1058356026
+	.long 1059019886
+	.long 1059897405
+	.long 1061067246
+	.long 1062657950
+	.long 1064892987
+	.long 1066774581
+	.long 1069414683
+	.long 1073984175
+	.long 1079645762
+	.long 1092815430
+	.long 1057005197
+	.long 1057342072
+	.long 1058087743
+	.long 1059427869
+	.long 1061799040
+	.long 1065862217
+	.long 1071413542
+	.long 1084439708
+	.long 1057128951
+	.long 1058664893
+	.long 1063675095
+	.long 1076102863
+	.long 1057655764
+	.long 1067924853
+	.long 1060439283
+
+.text
+
+	.align 16
+
+.globl dct64_MMX
+dct64_MMX:
+	pushl %ebx
+	pushl %esi
+	pushl %edi
+	subl $256,%esp
+	movl 280(%esp),%eax
+/* Phase 1*/
+	flds     (%eax)
+	leal 128(%esp),%edx
+	fadds 124(%eax)
+	movl 272(%esp),%esi
+	fstps    (%edx)
+	movl 276(%esp),%edi
+
+	flds    4(%eax)
+	movl $costab,%ebx
+	fadds 120(%eax)
+	orl %ecx,%ecx
+	fstps   4(%edx)
+
+	flds     (%eax)
+	movl %esp,%ecx
+	fsubs 124(%eax)
+	fmuls    (%ebx)
+	fstps 124(%edx)
+
+	flds    4(%eax)
+	fsubs 120(%eax)
+	fmuls   4(%ebx)
+	fstps 120(%edx)
+
+	flds    8(%eax)
+	fadds 116(%eax)
+	fstps   8(%edx)
+
+	flds   12(%eax)
+	fadds 112(%eax)
+	fstps  12(%edx)
+
+	flds    8(%eax)
+	fsubs 116(%eax)
+	fmuls   8(%ebx)
+	fstps 116(%edx)
+
+	flds   12(%eax)
+	fsubs 112(%eax)
+	fmuls  12(%ebx)
+	fstps 112(%edx)
+
+	flds   16(%eax)
+	fadds 108(%eax)
+	fstps  16(%edx)
+
+	flds   20(%eax)
+	fadds 104(%eax)
+	fstps  20(%edx)
+
+	flds   16(%eax)
+	fsubs 108(%eax)
+	fmuls  16(%ebx)
+	fstps 108(%edx)
+
+	flds   20(%eax)
+	fsubs 104(%eax)
+	fmuls  20(%ebx)
+	fstps 104(%edx)
+
+	flds   24(%eax)
+	fadds 100(%eax)
+	fstps  24(%edx)
+
+	flds   28(%eax)
+	fadds  96(%eax)
+	fstps  28(%edx)
+
+	flds   24(%eax)
+	fsubs 100(%eax)
+	fmuls  24(%ebx)
+	fstps 100(%edx)
+
+	flds   28(%eax)
+	fsubs  96(%eax)
+	fmuls  28(%ebx)
+	fstps  96(%edx)
+
+	flds   32(%eax)
+	fadds  92(%eax)
+	fstps  32(%edx)
+
+	flds   36(%eax)
+	fadds  88(%eax)
+	fstps  36(%edx)
+
+	flds   32(%eax)
+	fsubs  92(%eax)
+	fmuls  32(%ebx)
+	fstps  92(%edx)
+
+	flds   36(%eax)
+	fsubs  88(%eax)
+	fmuls  36(%ebx)
+	fstps  88(%edx)
+
+	flds   40(%eax)
+	fadds  84(%eax)
+	fstps  40(%edx)
+
+	flds   44(%eax)
+	fadds  80(%eax)
+	fstps  44(%edx)
+
+	flds   40(%eax)
+	fsubs  84(%eax)
+	fmuls  40(%ebx)
+	fstps  84(%edx)
+
+	flds   44(%eax)
+	fsubs  80(%eax)
+	fmuls  44(%ebx)
+	fstps  80(%edx)
+
+	flds   48(%eax)
+	fadds  76(%eax)
+	fstps  48(%edx)
+
+	flds   52(%eax)
+	fadds  72(%eax)
+	fstps  52(%edx)
+
+	flds   48(%eax)
+	fsubs  76(%eax)
+	fmuls  48(%ebx)
+	fstps  76(%edx)
+
+	flds   52(%eax)
+	fsubs  72(%eax)
+	fmuls  52(%ebx)
+	fstps  72(%edx)
+
+	flds   56(%eax)
+	fadds  68(%eax)
+	fstps  56(%edx)
+
+	flds   60(%eax)
+	fadds  64(%eax)
+	fstps  60(%edx)
+
+	flds   56(%eax)
+	fsubs  68(%eax)
+	fmuls  56(%ebx)
+	fstps  68(%edx)
+
+	flds   60(%eax)
+	fsubs  64(%eax)
+	fmuls  60(%ebx)
+	fstps  64(%edx)
+	
+/* Phase 2*/
+
+	flds     (%edx)
+	fadds  60(%edx)
+	fstps    (%ecx)
+
+	flds    4(%edx)
+	fadds  56(%edx)
+	fstps   4(%ecx)
+
+	flds     (%edx)
+	fsubs  60(%edx)
+	fmuls  64(%ebx)
+	fstps  60(%ecx)
+
+	flds    4(%edx)
+	fsubs  56(%edx)
+	fmuls  68(%ebx)
+	fstps  56(%ecx)
+
+	flds    8(%edx)
+	fadds  52(%edx)
+	fstps   8(%ecx)
+
+	flds   12(%edx)
+	fadds  48(%edx)
+	fstps  12(%ecx)
+
+	flds    8(%edx)
+	fsubs  52(%edx)
+	fmuls  72(%ebx)
+	fstps  52(%ecx)
+
+	flds   12(%edx)
+	fsubs  48(%edx)
+	fmuls  76(%ebx)
+	fstps  48(%ecx)
+
+	flds   16(%edx)
+	fadds  44(%edx)
+	fstps  16(%ecx)
+
+	flds   20(%edx)
+	fadds  40(%edx)
+	fstps  20(%ecx)
+
+	flds   16(%edx)
+	fsubs  44(%edx)
+	fmuls  80(%ebx)
+	fstps  44(%ecx)
+
+	flds   20(%edx)
+	fsubs  40(%edx)
+	fmuls  84(%ebx)
+	fstps  40(%ecx)
+
+	flds   24(%edx)
+	fadds  36(%edx)
+	fstps  24(%ecx)
+
+	flds   28(%edx)
+	fadds  32(%edx)
+	fstps  28(%ecx)
+
+	flds   24(%edx)
+	fsubs  36(%edx)
+	fmuls  88(%ebx)
+	fstps  36(%ecx)
+
+	flds   28(%edx)
+	fsubs  32(%edx)
+	fmuls  92(%ebx)
+	fstps  32(%ecx)
+	
+/* Phase 3*/
+
+	flds   64(%edx)
+	fadds 124(%edx)
+	fstps  64(%ecx)
+
+	flds   68(%edx)
+	fadds 120(%edx)
+	fstps  68(%ecx)
+
+	flds  124(%edx)
+	fsubs  64(%edx)
+	fmuls  64(%ebx)
+	fstps 124(%ecx)
+
+	flds  120(%edx)
+	fsubs  68(%edx)
+	fmuls  68(%ebx)
+	fstps 120(%ecx)
+
+	flds   72(%edx)
+	fadds 116(%edx)
+	fstps  72(%ecx)
+
+	flds   76(%edx)
+	fadds 112(%edx)
+	fstps  76(%ecx)
+
+	flds  116(%edx)
+	fsubs  72(%edx)
+	fmuls  72(%ebx)
+	fstps 116(%ecx)
+
+	flds  112(%edx)
+	fsubs  76(%edx)
+	fmuls  76(%ebx)
+	fstps 112(%ecx)
+
+	flds   80(%edx)
+	fadds 108(%edx)
+	fstps  80(%ecx)
+
+	flds   84(%edx)
+	fadds 104(%edx)
+	fstps  84(%ecx)
+
+	flds  108(%edx)
+	fsubs  80(%edx)
+	fmuls  80(%ebx)
+	fstps 108(%ecx)
+
+	flds  104(%edx)
+	fsubs  84(%edx)
+	fmuls  84(%ebx)
+	fstps 104(%ecx)
+
+	flds   88(%edx)
+	fadds 100(%edx)
+	fstps  88(%ecx)
+
+	flds   92(%edx)
+	fadds  96(%edx)
+	fstps  92(%ecx)
+
+	flds  100(%edx)
+	fsubs  88(%edx)
+	fmuls  88(%ebx)
+	fstps 100(%ecx)
+
+	flds   96(%edx)
+	fsubs  92(%edx)
+	fmuls  92(%ebx)
+	fstps  96(%ecx)
+	
+/* Phase 4*/
+
+	flds     (%ecx)
+	fadds  28(%ecx)
+	fstps    (%edx)
+
+	flds     (%ecx)
+	fsubs  28(%ecx)
+	fmuls  96(%ebx)
+	fstps  28(%edx)
+
+	flds    4(%ecx)
+	fadds  24(%ecx)
+	fstps   4(%edx)
+
+	flds    4(%ecx)
+	fsubs  24(%ecx)
+	fmuls 100(%ebx)
+	fstps  24(%edx)
+
+	flds    8(%ecx)
+	fadds  20(%ecx)
+	fstps   8(%edx)
+
+	flds    8(%ecx)
+	fsubs  20(%ecx)
+	fmuls 104(%ebx)
+	fstps  20(%edx)
+
+	flds   12(%ecx)
+	fadds  16(%ecx)
+	fstps  12(%edx)
+
+	flds   12(%ecx)
+	fsubs  16(%ecx)
+	fmuls 108(%ebx)
+	fstps  16(%edx)
+
+	flds   32(%ecx)
+	fadds  60(%ecx)
+	fstps  32(%edx)
+
+	flds   60(%ecx)
+	fsubs  32(%ecx)
+	fmuls  96(%ebx)
+	fstps  60(%edx)
+
+	flds   36(%ecx)
+	fadds  56(%ecx)
+	fstps  36(%edx)
+
+	flds   56(%ecx)
+	fsubs  36(%ecx)
+	fmuls 100(%ebx)
+	fstps  56(%edx)
+
+	flds   40(%ecx)
+	fadds  52(%ecx)
+	fstps  40(%edx)
+
+	flds   52(%ecx)
+	fsubs  40(%ecx)
+	fmuls 104(%ebx)
+	fstps  52(%edx)
+
+	flds   44(%ecx)
+	fadds  48(%ecx)
+	fstps  44(%edx)
+
+	flds   48(%ecx)
+	fsubs  44(%ecx)
+	fmuls 108(%ebx)
+	fstps  48(%edx)
+
+	flds   64(%ecx)
+	fadds  92(%ecx)
+	fstps  64(%edx)
+
+	flds   64(%ecx)
+	fsubs  92(%ecx)
+	fmuls  96(%ebx)
+	fstps  92(%edx)
+
+	flds   68(%ecx)
+	fadds  88(%ecx)
+	fstps  68(%edx)
+
+	flds   68(%ecx)
+	fsubs  88(%ecx)
+	fmuls 100(%ebx)
+	fstps  88(%edx)
+
+	flds   72(%ecx)
+	fadds  84(%ecx)
+	fstps  72(%edx)
+
+	flds   72(%ecx)
+	fsubs  84(%ecx)
+	fmuls 104(%ebx)
+	fstps  84(%edx)
+
+	flds   76(%ecx)
+	fadds  80(%ecx)
+	fstps  76(%edx)
+
+	flds   76(%ecx)
+	fsubs  80(%ecx)
+	fmuls 108(%ebx)
+	fstps  80(%edx)
+
+	flds   96(%ecx)
+	fadds 124(%ecx)
+	fstps  96(%edx)
+
+	flds  124(%ecx)
+	fsubs  96(%ecx)
+	fmuls  96(%ebx)
+	fstps 124(%edx)
+
+	flds  100(%ecx)
+	fadds 120(%ecx)
+	fstps 100(%edx)
+
+	flds  120(%ecx)
+	fsubs 100(%ecx)
+	fmuls 100(%ebx)
+	fstps 120(%edx)
+
+	flds  104(%ecx)
+	fadds 116(%ecx)
+	fstps 104(%edx)
+
+	flds  116(%ecx)
+	fsubs 104(%ecx)
+	fmuls 104(%ebx)
+	fstps 116(%edx)
+
+	flds  108(%ecx)
+	fadds 112(%ecx)
+	fstps 108(%edx)
+
+	flds  112(%ecx)
+	fsubs 108(%ecx)
+	fmuls 108(%ebx)
+	fstps 112(%edx)
+
+	flds     (%edx)
+	fadds  12(%edx)
+	fstps    (%ecx)
+
+	flds     (%edx)
+	fsubs  12(%edx)
+	fmuls 112(%ebx)
+	fstps  12(%ecx)
+
+	flds    4(%edx)
+	fadds   8(%edx)
+	fstps   4(%ecx)
+
+	flds    4(%edx)
+	fsubs   8(%edx)
+	fmuls 116(%ebx)
+	fstps   8(%ecx)
+
+	flds   16(%edx)
+	fadds  28(%edx)
+	fstps  16(%ecx)
+
+	flds   28(%edx)
+	fsubs  16(%edx)
+	fmuls 112(%ebx)
+	fstps  28(%ecx)
+
+	flds   20(%edx)
+	fadds  24(%edx)
+	fstps  20(%ecx)
+
+	flds   24(%edx)
+	fsubs  20(%edx)
+	fmuls 116(%ebx)
+	fstps  24(%ecx)
+
+	flds   32(%edx)
+	fadds  44(%edx)
+	fstps  32(%ecx)
+
+	flds   32(%edx)
+	fsubs  44(%edx)
+	fmuls 112(%ebx)
+	fstps  44(%ecx)
+
+	flds   36(%edx)
+	fadds  40(%edx)
+	fstps  36(%ecx)
+
+	flds   36(%edx)
+	fsubs  40(%edx)
+	fmuls 116(%ebx)
+	fstps  40(%ecx)
+
+	flds   48(%edx)
+	fadds  60(%edx)
+	fstps  48(%ecx)
+
+	flds   60(%edx)
+	fsubs  48(%edx)
+	fmuls 112(%ebx)
+	fstps  60(%ecx)
+
+	flds   52(%edx)
+	fadds  56(%edx)
+	fstps  52(%ecx)
+
+	flds   56(%edx)
+	fsubs  52(%edx)
+	fmuls 116(%ebx)
+	fstps  56(%ecx)
+
+	flds   64(%edx)
+	fadds  76(%edx)
+	fstps  64(%ecx)
+
+	flds   64(%edx)
+	fsubs  76(%edx)
+	fmuls 112(%ebx)
+	fstps  76(%ecx)
+
+	flds   68(%edx)
+	fadds  72(%edx)
+	fstps  68(%ecx)
+
+	flds   68(%edx)
+	fsubs  72(%edx)
+	fmuls 116(%ebx)
+	fstps  72(%ecx)
+
+	flds   80(%edx)
+	fadds  92(%edx)
+	fstps  80(%ecx)
+
+	flds   92(%edx)
+	fsubs  80(%edx)
+	fmuls 112(%ebx)
+	fstps  92(%ecx)
+
+	flds   84(%edx)
+	fadds  88(%edx)
+	fstps  84(%ecx)
+
+	flds   88(%edx)
+	fsubs  84(%edx)
+	fmuls 116(%ebx)
+	fstps  88(%ecx)
+
+	flds   96(%edx)
+	fadds 108(%edx)
+	fstps  96(%ecx)
+
+	flds   96(%edx)
+	fsubs 108(%edx)
+	fmuls 112(%ebx)
+	fstps 108(%ecx)
+
+	flds  100(%edx)
+	fadds 104(%edx)
+	fstps 100(%ecx)
+
+	flds  100(%edx)
+	fsubs 104(%edx)
+	fmuls 116(%ebx)
+	fstps 104(%ecx)
+
+	flds  112(%edx)
+	fadds 124(%edx)
+	fstps 112(%ecx)
+
+	flds  124(%edx)
+	fsubs 112(%edx)
+	fmuls 112(%ebx)
+	fstps 124(%ecx)
+
+	flds  116(%edx)
+	fadds 120(%edx)
+	fstps 116(%ecx)
+
+	flds  120(%edx)
+	fsubs 116(%edx)
+	fmuls 116(%ebx)
+	fstps 120(%ecx)
+	
+/* Phase 5*/
+
+	flds   32(%ecx)
+	fadds  36(%ecx)
+	fstps  32(%edx)
+
+	flds   32(%ecx)
+	fsubs  36(%ecx)
+	fmuls 120(%ebx)
+	fstps  36(%edx)
+
+	flds   44(%ecx)
+	fsubs  40(%ecx)
+	fmuls 120(%ebx)
+	fsts   44(%edx)
+	fadds  40(%ecx)
+	fadds  44(%ecx)
+	fstps  40(%edx)
+
+	flds   48(%ecx)
+	fsubs  52(%ecx)
+	fmuls 120(%ebx)
+
+	flds   60(%ecx)
+	fsubs  56(%ecx)
+	fmuls 120(%ebx)
+	fld      %st(0)
+	fadds  56(%ecx)
+	fadds  60(%ecx)
+	fld      %st(0)
+	fadds  48(%ecx)
+	fadds  52(%ecx)
+	fstps  48(%edx)
+	fadd     %st(2)
+	fstps  56(%edx)
+	fsts   60(%edx)
+	faddp    %st(1)
+	fstps  52(%edx)
+
+	flds   64(%ecx)
+	fadds  68(%ecx)
+	fstps  64(%edx)
+
+	flds   64(%ecx)
+	fsubs  68(%ecx)
+	fmuls 120(%ebx)
+	fstps  68(%edx)
+
+	flds   76(%ecx)
+	fsubs  72(%ecx)
+	fmuls 120(%ebx)
+	fsts   76(%edx)
+	fadds  72(%ecx)
+	fadds  76(%ecx)
+	fstps  72(%edx)
+
+	flds   92(%ecx)
+	fsubs  88(%ecx)
+	fmuls 120(%ebx)
+	fsts   92(%edx)
+	fadds  92(%ecx)
+	fadds  88(%ecx)
+	fld      %st(0)
+	fadds  80(%ecx)
+	fadds  84(%ecx)
+	fstps  80(%edx)
+
+	flds   80(%ecx)
+	fsubs  84(%ecx)
+	fmuls 120(%ebx)
+	fadd  %st(0), %st(1)
+	fadds 92(%edx)
+	fstps 84(%edx)
+	fstps 88(%edx)
+
+	flds   96(%ecx)
+	fadds 100(%ecx)
+	fstps  96(%edx)
+
+	flds   96(%ecx)
+	fsubs 100(%ecx)
+	fmuls 120(%ebx)
+	fstps 100(%edx)
+
+	flds  108(%ecx)
+	fsubs 104(%ecx)
+	fmuls 120(%ebx)
+	fsts  108(%edx)
+	fadds 104(%ecx)
+	fadds 108(%ecx)
+	fstps 104(%edx)
+
+	flds  124(%ecx)
+	fsubs 120(%ecx)
+	fmuls 120(%ebx)
+	fsts  124(%edx)
+	fadds 120(%ecx)
+	fadds 124(%ecx)
+	fld      %st(0)
+	fadds 112(%ecx)
+	fadds 116(%ecx)
+	fstps 112(%edx)
+
+	flds  112(%ecx)
+	fsubs 116(%ecx)
+	fmuls 120(%ebx)
+	fadd  %st(0),%st(1)
+	fadds 124(%edx)
+	fstps 116(%edx)
+	fstps 120(%edx)
+	jnz .L01
+	
+/* Phase 6*/
+
+	flds      (%ecx)
+	fadds    4(%ecx)
+	fstps 1024(%esi)
+
+	flds      (%ecx)
+	fsubs    4(%ecx)
+	fmuls  120(%ebx)
+	fsts      (%esi)
+	fstps     (%edi)
+
+	flds   12(%ecx)
+	fsubs   8(%ecx)
+	fmuls 120(%ebx)
+	fsts  512(%edi)
+	fadds  12(%ecx)
+	fadds   8(%ecx)
+	fstps 512(%esi)
+
+	flds   16(%ecx)
+	fsubs  20(%ecx)
+	fmuls 120(%ebx)
+
+	flds   28(%ecx)
+	fsubs  24(%ecx)
+	fmuls 120(%ebx)
+	fsts  768(%edi)
+	fld      %st(0)
+	fadds  24(%ecx)
+	fadds  28(%ecx)
+	fld      %st(0)
+	fadds  16(%ecx)
+	fadds  20(%ecx)
+	fstps 768(%esi)
+	fadd     %st(2)
+	fstps 256(%esi)
+	faddp    %st(1)
+	fstps 256(%edi)
+	
+/* Phase 7*/
+
+	flds   32(%edx)
+	fadds  48(%edx)
+	fstps 896(%esi)
+
+	flds   48(%edx)
+	fadds  40(%edx)
+	fstps 640(%esi)
+
+	flds   40(%edx)
+	fadds  56(%edx)
+	fstps 384(%esi)
+
+	flds   56(%edx)
+	fadds  36(%edx)
+	fstps 128(%esi)
+
+	flds   36(%edx)
+	fadds  52(%edx)
+	fstps 128(%edi)
+
+	flds   52(%edx)
+	fadds  44(%edx)
+	fstps 384(%edi)
+
+	flds   60(%edx)
+	fsts  896(%edi)
+	fadds  44(%edx)
+	fstps 640(%edi)
+
+	flds   96(%edx)
+	fadds 112(%edx)
+	fld      %st(0)
+	fadds  64(%edx)
+	fstps 960(%esi)
+	fadds  80(%edx)
+	fstps 832(%esi)
+
+	flds  112(%edx)
+	fadds 104(%edx)
+	fld      %st(0)
+	fadds  80(%edx)
+	fstps 704(%esi)
+	fadds  72(%edx)
+	fstps 576(%esi)
+
+	flds  104(%edx)
+	fadds 120(%edx)
+	fld      %st(0)
+	fadds  72(%edx)
+	fstps 448(%esi)
+	fadds  88(%edx)
+	fstps 320(%esi)
+
+	flds  120(%edx)
+	fadds 100(%edx)
+	fld      %st(0)
+	fadds  88(%edx)
+	fstps 192(%esi)
+	fadds  68(%edx)
+	fstps  64(%esi)
+
+	flds  100(%edx)
+	fadds 116(%edx)
+	fld      %st(0)
+	fadds  68(%edx)
+	fstps  64(%edi)
+	fadds  84(%edx)
+	fstps 192(%edi)
+
+	flds  116(%edx)
+	fadds 108(%edx)
+	fld      %st(0)
+	fadds  84(%edx)
+	fstps 320(%edi)
+	fadds  76(%edx)
+	fstps 448(%edi)
+
+	flds  108(%edx)
+	fadds 124(%edx)
+	fld      %st(0)
+	fadds  76(%edx)
+	fstps 576(%edi)
+	fadds  92(%edx)
+	fstps 704(%edi)
+
+	flds  124(%edx)
+	fsts  960(%edi)
+	fadds  92(%edx)
+	fstps 832(%edi)
+	addl $256,%esp
+	popl %edi
+	popl %esi
+	popl %ebx
+	ret
+.L01:	
+/* Phase 8*/
+
+	flds      (%ecx)
+	fadds    4(%ecx)
+	fistp  512(%esi)
+
+	flds      (%ecx)
+	fsubs    4(%ecx)
+	fmuls  120(%ebx)
+
+	fistp     (%esi)
+
+
+	flds    12(%ecx)
+	fsubs    8(%ecx)
+	fmuls  120(%ebx)
+	fist   256(%edi)
+	fadds   12(%ecx)
+	fadds    8(%ecx)
+	fistp  256(%esi)
+
+	flds   16(%ecx)
+	fsubs  20(%ecx)
+	fmuls 120(%ebx)
+
+	flds   28(%ecx)
+	fsubs  24(%ecx)
+	fmuls 120(%ebx)
+	fist  384(%edi)
+	fld      %st(0)
+	fadds  24(%ecx)
+	fadds  28(%ecx)
+	fld      %st(0)
+	fadds  16(%ecx)
+	fadds  20(%ecx)
+	fistp  384(%esi)
+	fadd     %st(2)
+	fistp  128(%esi)
+	faddp    %st(1)
+	fistp  128(%edi)
+	
+/* Phase 9*/
+
+	flds    32(%edx)
+	fadds   48(%edx)
+	fistp  448(%esi)
+
+	flds   48(%edx)
+	fadds  40(%edx)
+	fistp 320(%esi)
+
+	flds   40(%edx)
+	fadds  56(%edx)
+	fistp 192(%esi)
+
+	flds   56(%edx)
+	fadds  36(%edx)
+	fistp  64(%esi)
+
+	flds   36(%edx)
+	fadds  52(%edx)
+	fistp  64(%edi)
+
+	flds   52(%edx)
+	fadds  44(%edx)
+	fistp 192(%edi)
+
+	flds   60(%edx)
+	fist   448(%edi)
+	fadds  44(%edx)
+	fistp 320(%edi)
+
+	flds   96(%edx)
+	fadds 112(%edx)
+	fld      %st(0)
+	fadds  64(%edx)
+	fistp 480(%esi)
+	fadds  80(%edx)
+	fistp 416(%esi)
+
+	flds  112(%edx)
+	fadds 104(%edx)
+	fld      %st(0)
+	fadds  80(%edx)
+	fistp 352(%esi)
+	fadds  72(%edx)
+	fistp 288(%esi)
+
+	flds  104(%edx)
+	fadds 120(%edx)
+	fld      %st(0)
+	fadds  72(%edx)
+	fistp 224(%esi)
+	fadds  88(%edx)
+	fistp 160(%esi)
+
+	flds  120(%edx)
+	fadds 100(%edx)
+	fld      %st(0)
+	fadds  88(%edx)
+	fistp  96(%esi)
+	fadds  68(%edx)
+	fistp  32(%esi)
+
+	flds  100(%edx)
+	fadds 116(%edx)
+	fld      %st(0)
+	fadds  68(%edx)
+	fistp  32(%edi)
+	fadds  84(%edx)
+	fistp  96(%edi)
+
+	flds  116(%edx)
+	fadds 108(%edx)
+	fld      %st(0)
+	fadds  84(%edx)
+	fistp 160(%edi)
+	fadds  76(%edx)
+	fistp 224(%edi)
+
+	flds  108(%edx)
+	fadds 124(%edx)
+	fld      %st(0)
+	fadds  76(%edx)
+	fistp 288(%edi)
+	fadds  92(%edx)
+	fistp 352(%edi)
+
+	flds  124(%edx)
+	fist  480(%edi)
+	fadds  92(%edx)
+	fistp 416(%edi)
+	movsw
+	addl $256,%esp
+	popl %edi
+	popl %esi
+	popl %ebx
+	ret
+	
+
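
The new dct64_MMX above (and its _3dnow/_3dnowex siblings) keeps mpg123's dct64 calling convention: two output pointers followed by the input sample pointer. The code that actually picks one of these entry points is in decod386.c/sr1.c, which this excerpt does not show, so the selector below is only a guess built from the detection flags added earlier; the function names and signature are taken from the diff, the dispatch logic is not.

    /* hypothetical dispatch sketch */
    #include "mpg123.h"   /* 'real' */
    #include "d_cpu.h"    /* _has_mmx, _3dnow */

    typedef void (*dct64_t)(real *out0, real *out1, real *samples);

    extern void dct64_MMX(real *, real *, real *);
    extern void dct64_MMX_3dnow(real *, real *, real *);
    extern void dct64_MMX_3dnowex(real *, real *, real *);

    static dct64_t dct64_func;          /* generic C dct64 would be the fallback */

    static void select_dct64(void)
    {
        if (_has_mmx)
            dct64_func = dct64_MMX;         /* MMX/FPU version      */
        if (_3dnow)
            dct64_func = dct64_MMX_3dnow;   /* partial 3DNow!       */
        /* dct64_MMX_3dnowex would be chosen on K7-class CPUs with the
           extended 3DNow! (DSP) instructions; that flag is not shown here. */
    }
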
--- a/mp3lib/dct64_k7.s	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/dct64_k7.s	Fri Jun 29 17:55:35 2001 +0000
@@ -1,677 +1,804 @@
-///
-/// Replacement of dct64() with AMD's 3DNowEx(DSP)! SIMD operations support
-///
-/// This code based 'dct64_3dnow.s' by Syuuhei Kashiyama
-/// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
-///
-///  - added new opcodes PSWAPD, PFPNACC
-///  - decreased number of opcodes (as it was suggested by k7 manual)
-///    (using memory reference as operand of instructions)
-///  - Phase 6 is rewritten with mixing of cpu and mmx opcodes
-///  - change function name for support 3DNowEx! automatic detect
-///  - negation of 3dnow reg was replaced with PXOR 0x800000000, MMi instead 
-///    of PFMUL as it was suggested by athlon manual. (Two not separated PFMUL
-///    can not be paired, but PXOR can be).
-///
-/// note: because K7 processors are an aggresive out-of-order three-way
-///       superscalar ones instruction order is not significand for them.
-///
-/// Modified by Nick Kurshev <nickols_k@mail.ru>
-///
-/// The author of this program disclaim whole expressed or implied
-/// warranties with regard to this program, and in no event shall the
-/// author of this program liable to whatever resulted from the use of
-/// this program. Use it at your own risk.
-///
+# This code was taken from http://www.mpg123.org
+# See the ChangeLog of mpg123-0.59s-pre.1 for details
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+# Partial 3dnowex-DSP! optimization by Nick Kurshev
+#
+# TODO: finish 3dnow! optimization at least in scalar mode
+#
 
 .data
-        .align 8
+	.align 8
 plus_minus_3dnow: .long 0x00000000, 0x80000000
+costab:
+	.long 1056974725
+	.long 1057056395
+	.long 1057223771
+	.long 1057485416
+	.long 1057855544
+	.long 1058356026
+	.long 1059019886
+	.long 1059897405
+	.long 1061067246
+	.long 1062657950
+	.long 1064892987
+	.long 1066774581
+	.long 1069414683
+	.long 1073984175
+	.long 1079645762
+	.long 1092815430
+	.long 1057005197
+	.long 1057342072
+	.long 1058087743
+	.long 1059427869
+	.long 1061799040
+	.long 1065862217
+	.long 1071413542
+	.long 1084439708
+	.long 1057128951
+	.long 1058664893
+	.long 1063675095
+	.long 1076102863
+	.long 1057655764
+	.long 1067924853
+	.long 1060439283
 
 .text
-        .globl dct64_3dnowex
-        .type    dct64_3dnowex,@function
+
+	.align 16
+
+.globl dct64_MMX_3dnowex
+dct64_MMX_3dnowex:
+	pushl %ebx
+	pushl %esi
+	pushl %edi
+	subl $256,%esp
+	movl 280(%esp),%eax
 
-/* Discrete Cosine Tansform (DCT) for subband synthesis */
-/* void dct64(real *a,real *b,real *c) */
-dct64_3dnowex:
-        subl $256,%esp
-        pushl %ebp
-        pushl %edi
-        pushl %esi
-        pushl %ebx
-        leal 16(%esp),%ebx   /* ebx -> real tmp1[32] */
-        movl 284(%esp),%edi  /* edi -> c */
-        movl 276(%esp),%ebp  /* ebp -> a */
-        movl 280(%esp),%edx  /* edx -> b */
-        leal 128(%ebx),%esi  /* esi -> real tmp2[32] */
-
-        / femms
-
-        // 1
-        movl pnts,%eax
+	leal 128(%esp),%edx
+	movl 272(%esp),%esi
+	movl 276(%esp),%edi
+	movl $costab,%ebx
+	orl %ecx,%ecx
+	movl %esp,%ecx
+	femms	
+/* Phase 1*/
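+/* Butterfly over the 32 inputs: sums go to the front of the scratch area,
+   differences are scaled by the matching costab entries and stored to the
+   mirrored positions (pswapd restores the element order within each pair). */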
+	movq	(%eax), %mm0
+	movq	8(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	120(%eax), %mm1
+	movq	112(%eax), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%edx)
+	movq	%mm4, 8(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	(%ebx), %mm3
+	pfmul	8(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 120(%edx)
+	movq	%mm7, 112(%edx)
 
-        movq 0(%edi),%mm0        /* mm0 = c[0x00] | c[0x01]*/
-        movq %mm0,%mm1           /* mm1 = mm0 */
-        movd 124(%edi),%mm2      /* mm2 = c[0x1f] */
-        punpckldq 120(%edi),%mm2 /* mm2 = c[0x1f] | c[0x1E] */
-        pfadd %mm2,%mm0          /* mm0 = c[0x00]+c[0x1F] | c[0x1E]+c[0x01] */
-        movq %mm0,0(%ebx)        /* tmp[0, 1] = mm0 */
-        pfsub %mm2,%mm1          /* c[0x00]-c[0x1f] | c[0x01]-c[0x1e] */
-        pfmul 0(%eax),%mm1       /* (c[0x00]-c[0x1f])*pnts[0]|(c[0x01]-c[0x1e])*pnts[1]*/
-        pswapd %mm1, %mm1        /* (c[0x01]-c[0x1e])*pnts[1]|(c[0x00]-c[0x1f])*pnts[0]*/
-        movq   %mm1, 120(%ebx)   /* tmp1[30, 31]=mm1 */
+	movq	16(%eax), %mm0
+	movq	24(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	104(%eax), %mm1
+	movq	96(%eax), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 16(%edx)
+	movq	%mm4, 24(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	16(%ebx), %mm3
+	pfmul	24(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 104(%edx)
+	movq	%mm7, 96(%edx)
 
-        movq 8(%edi),%mm4
-        movq %mm4,%mm5
-        movd 116(%edi),%mm6
-        punpckldq 112(%edi),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,8(%ebx)
-        pfsub %mm6,%mm5
-        pfmul 8(%eax),%mm5
-        pswapd %mm5, %mm5
-        movq   %mm5, 112(%ebx)
-
-        movq 16(%edi),%mm0
-        movq %mm0,%mm1
-        movd 108(%edi),%mm2
-        punpckldq 104(%edi),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,16(%ebx)
-        pfsub %mm2,%mm1
-        pfmul 16(%eax),%mm1
-        pswapd %mm1, %mm1
-        movq   %mm1, 104(%ebx)
+	movq	32(%eax), %mm0
+	movq	40(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	88(%eax), %mm1
+	movq	80(%eax), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 32(%edx)
+	movq	%mm4, 40(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	32(%ebx), %mm3
+	pfmul	40(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 88(%edx)
+	movq	%mm7, 80(%edx)
 
-        movq 24(%edi),%mm4
-        movq %mm4,%mm5
-        movd 100(%edi),%mm6
-        punpckldq 96(%edi),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,24(%ebx)
-        pfsub %mm6,%mm5
-        pfmul 24(%eax),%mm5
-        pswapd %mm5, %mm5
-        movq   %mm5, 96(%ebx)
+	movq	48(%eax), %mm0
+	movq	56(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	72(%eax), %mm1
+	movq	64(%eax), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 48(%edx)
+	movq	%mm4, 56(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	48(%ebx), %mm3
+	pfmul	56(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 72(%edx)
+	movq	%mm7, 64(%edx)
 
-        movq 32(%edi),%mm0
-        movq %mm0,%mm1
-        movd 92(%edi),%mm2
-        punpckldq 88(%edi),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,32(%ebx)
-        pfsub %mm2,%mm1
-        pfmul 32(%eax),%mm1
-        pswapd %mm1, %mm1
-        movq   %mm1, 88(%ebx)
+/* Phase 2*/
 
-        movq 40(%edi),%mm4
-        movq %mm4,%mm5
-        movd 84(%edi),%mm6
-        punpckldq 80(%edi),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,40(%ebx)
-        pfsub %mm6,%mm5
-        pfmul 40(%eax),%mm5
-        pswapd %mm5, %mm5
-        movq   %mm5, 80(%ebx)
+	movq	(%edx), %mm0
+	movq	8(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	56(%edx), %mm1
+	movq	48(%edx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%ecx)
+	movq	%mm4, 8(%ecx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	64(%ebx), %mm3
+	pfmul	72(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 56(%ecx)
+	movq	%mm7, 48(%ecx)
+	
+	movq	16(%edx), %mm0
+	movq	24(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	40(%edx), %mm1
+	movq	32(%edx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 16(%ecx)
+	movq	%mm4, 24(%ecx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	80(%ebx), %mm3
+	pfmul	88(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 40(%ecx)
+	movq	%mm7, 32(%ecx)
 
-        movq 48(%edi),%mm0
-        movq %mm0,%mm1
-        movd 76(%edi),%mm2
-        punpckldq 72(%edi),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,48(%ebx)
-        pfsub %mm2,%mm1
-        pfmul 48(%eax),%mm1
-        pswapd %mm1, %mm1
-        movq   %mm1, 72(%ebx)
+/* Phase 3*/
 
-        movq 56(%edi),%mm4
-        movq %mm4,%mm5
-        movd 68(%edi),%mm6
-        punpckldq 64(%edi),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,56(%ebx)
-        pfsub %mm6,%mm5
-        pfmul 56(%eax),%mm5
-        pswapd %mm5, %mm5
-        movq   %mm5, 64(%ebx)
+	movq	64(%edx), %mm0
+	movq	72(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	120(%edx), %mm1
+	movq	112(%edx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 64(%ecx)
+	movq	%mm4, 72(%ecx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	64(%ebx), %mm3
+	pfmul	72(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 120(%ecx)
+	movq	%mm7, 112(%ecx)
 
-        // 2
-        movl pnts+4,%eax
-        / 0, 14
-        movq 0(%ebx),%mm0            /* mm0 = tmp1[0] | tmp1[1] */
-        movq %mm0,%mm1
-        movd 60(%ebx),%mm2           /* mm2 = tmp1[0x0F] */
-        punpckldq 56(%ebx),%mm2      /* mm2 = tmp1[0x0E] | tmp1[0x0F] */
-        movq 0(%eax),%mm3            /* mm3 = pnts[0] | pnts[1] */
-        pfadd %mm2,%mm0              /* mm0 = tmp1[0]+tmp1[0x0F]|tmp1[1]+tmp1[0x0E]*/
-        movq %mm0,0(%esi)            /* tmp2[0, 1] = mm0 */
-        pfsub %mm2,%mm1              /* mm1 = tmp1[0]-tmp1[0x0F]|tmp1[1]-tmp1[0x0E]*/
-        pfmul %mm3,%mm1              /* mm1 = (tmp1[0]-tmp1[0x0F])*pnts[0]|(tmp1[1]-tmp1[0x0E])*pnts[1]*/
-        pswapd %mm1, %mm1            /* mm1 = (tmp1[1]-tmp1[0x0E])*pnts[1]|(tmp1[0]-tmp1[0x0F])*pnts[0]*/
-        movq   %mm1, 56(%esi)        /* tmp2[0x0E, 0x0F] = mm1 */
-        / 16, 30
-        movq 64(%ebx),%mm0
-        movq %mm0,%mm1
-        movd 124(%ebx),%mm2
-        punpckldq 120(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,64(%esi)
-        pfsubr %mm2,%mm1
-        pfmul %mm3,%mm1
-        pswapd %mm1, %mm1
-        movq   %mm1, 120(%esi)
-        movq 8(%ebx),%mm4
-        / 2, 12
-        movq %mm4,%mm5
-        movd 52(%ebx),%mm6
-        punpckldq 48(%ebx),%mm6
-        movq 8(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,8(%esi)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        pswapd %mm5, %mm5
-        movq   %mm5, 48(%esi)
-        movq 72(%ebx),%mm4
-        / 18, 28
-        movq %mm4,%mm5
-        movd 116(%ebx),%mm6
-        punpckldq 112(%ebx),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,72(%esi)
-        pfsubr %mm6,%mm5
-        pfmul %mm7,%mm5
-        pswapd %mm5, %mm5
-        movq   %mm5, 112(%esi)
-        movq 16(%ebx),%mm0
-        / 4, 10
-        movq %mm0,%mm1
-        movd 44(%ebx),%mm2
-        punpckldq 40(%ebx),%mm2
-        movq 16(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,16(%esi)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        pswapd %mm1, %mm1
-        movq   %mm1, 40(%esi)
-        movq 80(%ebx),%mm0
-        / 20, 26
-        movq %mm0,%mm1
-        movd 108(%ebx),%mm2
-        punpckldq 104(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,80(%esi)
-        pfsubr %mm2,%mm1
-        pfmul %mm3,%mm1
-        pswapd %mm1, %mm1
-        movq   %mm1, 104(%esi)
-        movq 24(%ebx),%mm4
-        / 6, 8
-        movq %mm4,%mm5
-        movd 36(%ebx),%mm6
-        punpckldq 32(%ebx),%mm6
-        movq 24(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,24(%esi)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        pswapd %mm5, %mm5
-        movq   %mm5, 32(%esi)
-        movq 88(%ebx),%mm4
-        / 22, 24
-        movq %mm4,%mm5
-        movd 100(%ebx),%mm6
-        punpckldq 96(%ebx),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,88(%esi)
-        pfsubr %mm6,%mm5
-        pfmul %mm7,%mm5
-        pswapd %mm5, %mm5
-        movq   %mm5, 96(%esi)
+	movq	80(%edx), %mm0
+	movq	88(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	104(%edx), %mm1
+	movq	96(%edx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 80(%ecx)
+	movq	%mm4, 88(%ecx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	80(%ebx), %mm3
+	pfmul	88(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 104(%ecx)
+	movq	%mm7, 96(%ecx)
+	
+/* Phase 4*/
+
+	movq	(%ecx), %mm0
+	movq	8(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	24(%ecx), %mm1
+	movq	16(%ecx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%edx)
+	movq	%mm4, 8(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 24(%edx)
+	movq	%mm7, 16(%edx)
+
+	movq	32(%ecx), %mm0
+	movq	40(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	56(%ecx), %mm1
+	movq	48(%ecx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 32(%edx)
+	movq	%mm4, 40(%edx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 56(%edx)
+	movq	%mm7, 48(%edx)
+
+	movq	64(%ecx), %mm0
+	movq	72(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	88(%ecx), %mm1
+	movq	80(%ecx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 64(%edx)
+	movq	%mm4, 72(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 88(%edx)
+	movq	%mm7, 80(%edx)
 
-        // 3
-        movl pnts+8,%eax
-        movq 0(%eax),%mm0
-        movq 8(%eax),%mm1
-        movq 0(%esi),%mm2
-        / 0, 6
-        movq %mm2,%mm3
-        movd 28(%esi),%mm4
-        punpckldq 24(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsub %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,0(%ebx)
-        pswapd %mm3, %mm3
-        movq   %mm3, 24(%ebx)
-        movq 8(%esi),%mm5
-        / 2, 4
-        movq %mm5,%mm6
-        movd 20(%esi),%mm7
-        punpckldq 16(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsub %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,8(%ebx)
-        pswapd %mm6, %mm6
-        movq   %mm6, 16(%ebx)
-        movq 32(%esi),%mm2
-        / 8, 14
-        movq %mm2,%mm3
-        movd 60(%esi),%mm4
-        punpckldq 56(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsubr %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,32(%ebx)
-        pswapd %mm3, %mm3
-        movq   %mm3, 56(%ebx)
-        movq 40(%esi),%mm5
-        / 10, 12
-        movq %mm5,%mm6
-        movd 52(%esi),%mm7
-        punpckldq 48(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsubr %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,40(%ebx)
-        pswapd %mm6, %mm6
-        movq   %mm6, 48(%ebx)
-        movq 64(%esi),%mm2
-        / 16, 22
-        movq %mm2,%mm3
-        movd 92(%esi),%mm4
-        punpckldq 88(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsub %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,64(%ebx)
-        pswapd %mm3, %mm3
-        movq   %mm3, 88(%ebx)
-        movq 72(%esi),%mm5
-        / 18, 20
-        movq %mm5,%mm6
-        movd 84(%esi),%mm7
-        punpckldq 80(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsub %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,72(%ebx)
-        pswapd %mm6, %mm6
-        movq   %mm6, 80(%ebx)
-        movq 96(%esi),%mm2
-        / 24, 30
-        movq %mm2,%mm3
-        movd 124(%esi),%mm4
-        punpckldq 120(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsubr %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,96(%ebx)
-        pswapd %mm3, %mm3
-        movq   %mm3, 120(%ebx)
-        movq 104(%esi),%mm5
-        / 26, 28
-        movq %mm5,%mm6
-        movd 116(%esi),%mm7
-        punpckldq 112(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsubr %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,104(%ebx)
-        pswapd %mm6, %mm6
-        movq   %mm6, 112(%ebx)
+	movq	96(%ecx), %mm0
+	movq	104(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	120(%ecx), %mm1
+	movq	112(%ecx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 96(%edx)
+	movq	%mm4, 104(%edx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 120(%edx)
+	movq	%mm7, 112(%edx)
+
+/* Phase 5 */
+
+	movq	(%edx), %mm0
+	movq	16(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	8(%edx), %mm1
+	movq	24(%edx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%ecx)
+	movq	%mm4, 16(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 8(%ecx)
+	movq	%mm7, 24(%ecx)
+
+	movq	32(%edx), %mm0
+	movq	48(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	40(%edx), %mm1
+	movq	56(%edx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 32(%ecx)
+	movq	%mm4, 48(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 40(%ecx)
+	movq	%mm7, 56(%ecx)
+
+	movq	64(%edx), %mm0
+	movq	80(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	72(%edx), %mm1
+	movq	88(%edx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 64(%ecx)
+	movq	%mm4, 80(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 72(%ecx)
+	movq	%mm7, 88(%ecx)
+
+	movq	96(%edx), %mm0
+	movq	112(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	104(%edx), %mm1
+	movq	120(%edx), %mm5
+	pswapd	%mm1, %mm1
+	pswapd	%mm5, %mm5
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 96(%ecx)
+	movq	%mm4, 112(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	pswapd	%mm3, %mm3
+	pswapd	%mm7, %mm7
+	movq	%mm3, 104(%ecx)
+	movq	%mm7, 120(%ecx)
+	
+/* Phase 6. This is the end of the easy road. */
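+/* The pair {1.0 | costab[30]} is built in %mm7; pxor with plus_minus_3dnow
+   flips the sign of the high float so the following pfacc yields
+   {a+b | a-b}, which is then scaled in a single pfmul. */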
+	movl	$1, %eax
+	movd	%eax, %mm7
+	pi2fd	%mm7, %mm7
+	movq	32(%ecx), %mm0
+	punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */	
+	movq	%mm0, %mm1
+	movq	plus_minus_3dnow, %mm6
+	/* n.b.: candidate for a single pfpnacc on 3DNow!Ex */
+	pxor	%mm6, %mm1
+	pfacc	%mm1, %mm0
+	/**/
+	pfmul	%mm7, %mm0
+	movq	%mm0, 32(%edx)
+	femms
 
-        // 4
-        movl pnts+12,%eax    
-        movq 0(%eax),%mm0      /* mm0 = pnts[3] | pnts[4] */
-        movq 0(%ebx),%mm1      /* mm1 = tmp1[0] | tmp1[1] */
-        / 0
-        movq %mm1,%mm2
-        movd 12(%ebx),%mm3     /* mm3 = tmp1[3] */
-        punpckldq 8(%ebx),%mm3 /* mm3 = tmp1[3] | tmp1[2] */
-        pfadd %mm3,%mm1        /* mm1 = tmp1[0]+tmp1[3] | tmp1[1]+tmp1[2]*/
-        pfsub %mm3,%mm2        /* mm2 = tmp1[0]-tmp1[3] | tmp1[0]-tmp1[2]*/
-        pfmul %mm0,%mm2        /* mm2 = tmp1[0]-tmp1[3]*pnts[3]|tmp1[0]-tmp1[2]*pnts[4]*/
-        movq %mm1,0(%esi)      /* tmp2[0, 1] = mm1 */
-        pswapd %mm2, %mm2      /* mm2 = tmp1[0]-tmp1[2]*pnts[4]|tmp1[0]-tmp1[3]*pnts[3] */
-        movq   %mm2, 8(%esi)   /* tmp2[2, 3] = mm2 */
-        movq 16(%ebx),%mm4
-        / 4
-        movq %mm4,%mm5
-        movd 28(%ebx),%mm6
-        punpckldq 24(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,16(%esi)
-        pswapd %mm5, %mm5
-        movq   %mm5, 24(%esi)
-        movq 32(%ebx),%mm1
-        / 8
-        movq %mm1,%mm2
-        movd 44(%ebx),%mm3
-        punpckldq 40(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,32(%esi)
-        pswapd %mm2, %mm2
-        movq   %mm2, 40(%esi)
-        movq 48(%ebx),%mm4
-        / 12
-        movq %mm4,%mm5
-        movd 60(%ebx),%mm6
-        punpckldq 56(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,48(%esi)
-        pswapd %mm5, %mm5
-        movq   %mm5, 56(%esi)
-        movq 64(%ebx),%mm1
-        / 16
-        movq %mm1,%mm2
-        movd 76(%ebx),%mm3
-        punpckldq 72(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,64(%esi)
-        pswapd %mm2, %mm2
-        movq   %mm2, 72(%esi)
-        movq 80(%ebx),%mm4
-        / 20
-        movq %mm4,%mm5
-        movd 92(%ebx),%mm6
-        punpckldq 88(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,80(%esi)
-        pswapd %mm5, %mm5
-        movq   %mm5, 88(%esi)
-        movq 96(%ebx),%mm1
-        / 24
-        movq %mm1,%mm2
-        movd 108(%ebx),%mm3
-        punpckldq 104(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,96(%esi)
-        pswapd %mm2, %mm2
-        movq   %mm2, 104(%esi)
-        movq 112(%ebx),%mm4
-        / 28
-        movq %mm4,%mm5
-        movd 124(%ebx),%mm6
-        punpckldq 120(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,112(%esi)
-        pswapd %mm5, %mm5
-        movq   %mm5, 120(%esi)
+	flds   44(%ecx)
+	fsubs  40(%ecx)
+	fmuls 120(%ebx)
+
+	fsts   44(%edx)
+	fadds  40(%ecx) /* pfacc 40(ecx), 56(%ecx) */
+	fadds  44(%ecx)
+	fstps  40(%edx)
+
+	flds   48(%ecx)
+	fsubs  52(%ecx)
+	fmuls 120(%ebx)
+
+	flds   60(%ecx)
+	fsubs  56(%ecx)
+	fmuls 120(%ebx)
+
+	fld      %st(0)
+	fadds  56(%ecx)
+	fadds  60(%ecx)
+
+	fld      %st(0)
+	fadds  48(%ecx)
+	fadds  52(%ecx)
+	fstps  48(%edx)
+	fadd     %st(2)
+	fstps  56(%edx)
+	fsts   60(%edx)
+	faddp    %st(1)
+	fstps  52(%edx)
+/*---*/
+	flds   64(%ecx)
+	fadds  68(%ecx)
+	fstps  64(%edx)
+
+	flds   64(%ecx)
+	fsubs  68(%ecx)
+	fmuls 120(%ebx)
+	fstps  68(%edx)
+
+	flds   76(%ecx)
+	fsubs  72(%ecx)
+	fmuls 120(%ebx)
+	fsts   76(%edx)
+	fadds  72(%ecx)
+	fadds  76(%ecx)
+	fstps  72(%edx)
+
+	flds   92(%ecx)
+	fsubs  88(%ecx)
+	fmuls 120(%ebx)
+	fsts   92(%edx)
+	fadds  92(%ecx)
+	fadds  88(%ecx)
+
+	fld      %st(0)
+	fadds  80(%ecx)
+	fadds  84(%ecx)
+	fstps  80(%edx)
+
+	flds   80(%ecx)
+	fsubs  84(%ecx)
+	fmuls 120(%ebx)
+	fadd  %st(0), %st(1)
+	fadds 92(%edx)
+	fstps 84(%edx)
+	fstps 88(%edx)
+
+	flds   96(%ecx)
+	fadds 100(%ecx)
+	fstps  96(%edx)
+
+	flds   96(%ecx)
+	fsubs 100(%ecx)
+	fmuls 120(%ebx)
+	fstps 100(%edx)
+
+	flds  108(%ecx)
+	fsubs 104(%ecx)
+	fmuls 120(%ebx)
+	fsts  108(%edx)
+	fadds 104(%ecx)
+	fadds 108(%ecx)
+	fstps 104(%edx)
+
+	flds  124(%ecx)
+	fsubs 120(%ecx)
+	fmuls 120(%ebx)
+	fsts  124(%edx)
+	fadds 120(%ecx)
+	fadds 124(%ecx)
 
-        // 5
-	movq plus_minus_3dnow, %mm0 /* mm0 = 1.0 | -1.0 */
-        movl $1,%eax
-        movd %eax,%mm1
-        pi2fd %mm1,%mm1
-        movl pnts+16,%eax
-        movd 0(%eax),%mm2
-        punpckldq %mm2,%mm1   /* mm1 = 1.0 | cos0 */
-        movq 0(%esi),%mm2     /* mm2 = tmp2[0] | tmp2[1] */
-        / 0
-	pfpnacc %mm2, %mm2
-	pswapd %mm2, %mm2     /* mm2 = tmp2[0]+tmp2[1]|tmp2[0]-tmp2[1]*/
-        pfmul %mm1,%mm2       /* mm2 = tmp2[0]+tmp2[1]|(tmp2[0]-tmp2[1])*cos0*/
-        movq %mm2,0(%ebx)     /* tmp1[0, 1] = mm2 */
-        movq 8(%esi),%mm4     /* mm4 = tmp2[2] | tmp2[3]*/
-	pfpnacc %mm4, %mm4
-	pswapd  %mm4, %mm4    /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/
-        pxor  %mm0,%mm4       /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/
-        pfmul %mm1,%mm4       /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/
-        movq %mm4,%mm5
-        psrlq $32,%mm5        /* mm5 = (tmp2[3]-tmp2[2])*cos0 */
-        pfacc %mm5,%mm4       /* mm4 = tmp2[2]+tmp2[3]+(tmp2[3]-tmp2[2])*cos0|(tmp2[3]-tmp2[2])*cos0*/
-        movq %mm4,8(%ebx)     /* tmp1[2, 3] = mm4 */
-        movq 16(%esi),%mm2
-        / 4
-	pfpnacc %mm2, %mm2
-	pswapd %mm2, %mm2
+	fld      %st(0)
+	fadds 112(%ecx)
+	fadds 116(%ecx)
+	fstps 112(%edx)
+
+	flds  112(%ecx)
+	fsubs 116(%ecx)
+	fmuls 120(%ebx)
+	fadd  %st(0),%st(1)
+	fadds 124(%edx)
+	fstps 116(%edx)
+	fstps 120(%edx)
+	jnz .L01
+	
+/* Phase 7*/
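+/* Float output path (taken when the zero flag from the entry 'orl %ecx,%ecx'
+   is still set): results are written to the two output buffers with
+   fsts/fstps. */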
+
+	flds      (%ecx)
+	fadds    4(%ecx)
+	fstps 1024(%esi)
+
+	flds      (%ecx)
+	fsubs    4(%ecx)
+	fmuls  120(%ebx)
+	fsts      (%esi)
+	fstps     (%edi)
+
+	flds   12(%ecx)
+	fsubs   8(%ecx)
+	fmuls 120(%ebx)
+	fsts  512(%edi)
+	fadds  12(%ecx)
+	fadds   8(%ecx)
+	fstps 512(%esi)
+
+	flds   16(%ecx)
+	fsubs  20(%ecx)
+	fmuls 120(%ebx)
 
-        pfmul %mm1,%mm2
-        movq 24(%esi),%mm4
-	pfpnacc %mm4, %mm4
-	pswapd  %mm4, %mm4
+	flds   28(%ecx)
+	fsubs  24(%ecx)
+	fmuls 120(%ebx)
+	fsts  768(%edi)
+	fld      %st(0)
+	fadds  24(%ecx)
+	fadds  28(%ecx)
+	fld      %st(0)
+	fadds  16(%ecx)
+	fadds  20(%ecx)
+	fstps 768(%esi)
+	fadd     %st(2)
+	fstps 256(%esi)
+	faddp    %st(1)
+	fstps 256(%edi)
+	
+/* Phase 8*/
+
+	flds   32(%edx)
+	fadds  48(%edx)
+	fstps 896(%esi)
+
+	flds   48(%edx)
+	fadds  40(%edx)
+	fstps 640(%esi)
 
-        pxor  %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,16(%ebx)
-        movq %mm4,24(%ebx)
-        movq 32(%esi),%mm2
-        / 8
-	pfpnacc %mm2, %mm2
-	pswapd %mm2, %mm2
+	flds   40(%edx)
+	fadds  56(%edx)
+	fstps 384(%esi)
+
+	flds   56(%edx)
+	fadds  36(%edx)
+	fstps 128(%esi)
+
+	flds   36(%edx)
+	fadds  52(%edx)
+	fstps 128(%edi)
+
+	flds   52(%edx)
+	fadds  44(%edx)
+	fstps 384(%edi)
+
+	flds   60(%edx)
+	fsts  896(%edi)
+	fadds  44(%edx)
+	fstps 640(%edi)
+
+	flds   96(%edx)
+	fadds 112(%edx)
+	fld      %st(0)
+	fadds  64(%edx)
+	fstps 960(%esi)
+	fadds  80(%edx)
+	fstps 832(%esi)
 
-        pfmul %mm1,%mm2
-        movq %mm2,32(%ebx)
-        movq 40(%esi),%mm4
-	pfpnacc %mm4, %mm4
-	pswapd  %mm4, %mm4
-        pxor  %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,40(%ebx)
-        movq 48(%esi),%mm2
-        / 12
-	pfpnacc %mm2, %mm2
-	pswapd %mm2, %mm2
-        pfmul %mm1,%mm2
-        movq 56(%esi),%mm4
-	pfpnacc %mm4, %mm4
-	pswapd  %mm4, %mm4
-        pxor  %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,48(%ebx)
-        movq %mm4,56(%ebx)
-        movq 64(%esi),%mm2
-        / 16
-	pfpnacc %mm2, %mm2
-	pswapd %mm2, %mm2
-        pfmul %mm1,%mm2
-        movq %mm2,64(%ebx)
-        movq 72(%esi),%mm4
-	pfpnacc %mm4, %mm4
-	pswapd  %mm4, %mm4
-        pxor  %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,72(%ebx)
-        movq 80(%esi),%mm2
-        / 20
-	pfpnacc %mm2, %mm2
-	pswapd %mm2, %mm2
-        pfmul %mm1,%mm2
-        movq 88(%esi),%mm4
-	pfpnacc %mm4, %mm4
-	pswapd  %mm4, %mm4
-        pxor  %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,80(%ebx)
-        movq %mm4,88(%ebx)
-        movq 96(%esi),%mm2
-        / 24
-	pfpnacc %mm2, %mm2
-	pswapd %mm2, %mm2
-        pfmul %mm1,%mm2
-        movq %mm2,96(%ebx)
-        movq 104(%esi),%mm4
-	pfpnacc %mm4, %mm4
-	pswapd  %mm4, %mm4
-        pxor  %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,104(%ebx)
-        movq 112(%esi),%mm2
-        / 28
-	pfpnacc %mm2, %mm2
-	pswapd %mm2, %mm2
-        pfmul %mm1,%mm2
-        movq 120(%esi),%mm4
-	pfpnacc %mm4, %mm4
-	pswapd  %mm4, %mm4
-        pxor  %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,112(%ebx)
-        movq %mm4,120(%ebx)
+	flds  112(%edx)
+	fadds 104(%edx)
+	fld      %st(0)
+	fadds  80(%edx)
+	fstps 704(%esi)
+	fadds  72(%edx)
+	fstps 576(%esi)
+
+	flds  104(%edx)
+	fadds 120(%edx)
+	fld      %st(0)
+	fadds  72(%edx)
+	fstps 448(%esi)
+	fadds  88(%edx)
+	fstps 320(%esi)
+
+	flds  120(%edx)
+	fadds 100(%edx)
+	fld      %st(0)
+	fadds  88(%edx)
+	fstps 192(%esi)
+	fadds  68(%edx)
+	fstps  64(%esi)
+
+	flds  100(%edx)
+	fadds 116(%edx)
+	fld      %st(0)
+	fadds  68(%edx)
+	fstps  64(%edi)
+	fadds  84(%edx)
+	fstps 192(%edi)
+
+	flds  116(%edx)
+	fadds 108(%edx)
+	fld      %st(0)
+	fadds  84(%edx)
+	fstps 320(%edi)
+	fadds  76(%edx)
+	fstps 448(%edi)
+
+	flds  108(%edx)
+	fadds 124(%edx)
+	fld      %st(0)
+	fadds  76(%edx)
+	fstps 576(%edi)
+	fadds  92(%edx)
+	fstps 704(%edi)
+
+	flds  124(%edx)
+	fsts  960(%edi)
+	fadds  92(%edx)
+	fstps 832(%edi)
+	jmp	.L_bye
+.L01:	
+/* Phase 9*/
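+/* Integer output path (entry %ecx was non-zero): same dataflow as phases
+   7-8, but the results are rounded and stored directly with fist/fistp. */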
+
+	flds      (%ecx)
+	fadds    4(%ecx)
+	fistp  512(%esi)
+
+	flds      (%ecx)
+	fsubs    4(%ecx)
+	fmuls  120(%ebx)
+
+	fistp     (%esi)
+
+
+	flds    12(%ecx)
+	fsubs    8(%ecx)
+	fmuls  120(%ebx)
+	fist   256(%edi)
+	fadds   12(%ecx)
+	fadds    8(%ecx)
+	fistp  256(%esi)
+
+	flds   16(%ecx)
+	fsubs  20(%ecx)
+	fmuls 120(%ebx)
+
+	flds   28(%ecx)
+	fsubs  24(%ecx)
+	fmuls 120(%ebx)
+	fist  384(%edi)
+	fld      %st(0)
+	fadds  24(%ecx)
+	fadds  28(%ecx)
+	fld      %st(0)
+	fadds  16(%ecx)
+	fadds  20(%ecx)
+	fistp  384(%esi)
+	fadd     %st(2)
+	fistp  128(%esi)
+	faddp    %st(1)
+	fistp  128(%edi)
+	
+/* Phase 10*/
 
-        // Phase6
-        movd 0(%ebx),%mm0
-        movd %mm0,1024(%ebp)
-        movl 4(%ebx),%eax
-        movl %eax,0(%ebp)
-        movl %eax,0(%edx)
-        movd 8(%ebx),%mm2
-        movd %mm2,512(%ebp)
-        movd 12(%ebx),%mm3
-        movd %mm3,512(%edx)
+	flds    32(%edx)
+	fadds   48(%edx)
+	fistp  448(%esi)
+
+	flds   48(%edx)
+	fadds  40(%edx)
+	fistp 320(%esi)
 
-        movl 16(%ebx),%eax
-        movl %eax,768(%ebp)
-        movd 20(%ebx),%mm5
-        movd %mm5,256(%edx)
+	flds   40(%edx)
+	fadds  56(%edx)
+	fistp 192(%esi)
 
-        movd 24(%ebx),%mm6
-        movd %mm6,256(%ebp)
-        movd 28(%ebx),%mm7
-        movd %mm7,768(%edx)
+	flds   56(%edx)
+	fadds  36(%edx)
+	fistp  64(%esi)
+
+	flds   36(%edx)
+	fadds  52(%edx)
+	fistp  64(%edi)
 
-        movq 32(%ebx),%mm0       /* mm0 = tmp1[8] | tmp1[9] */
-        movq 48(%ebx),%mm1       /* mm1 = tmp1[12] | tmp1[13] */
-        pfadd %mm1,%mm0          /* mm0 = tmp1[8]+tmp1[12]| tmp1[9]+tmp1[13]*/
-        movd %mm0,896(%ebp)      /* a[0xE0] = tmp1[8]+tmp1[12] */
-        psrlq $32,%mm0
-        movd %mm0,128(%edx)      /* a[0x20] = tmp1[9]+tmp1[13] */
-        movq 40(%ebx),%mm2
-        pfadd %mm2,%mm1
-        movd %mm1,640(%ebp)
-        psrlq $32,%mm1
-        movd %mm1,384(%edx)
+	flds   52(%edx)
+	fadds  44(%edx)
+	fistp 192(%edi)
+
+	flds   60(%edx)
+	fist   448(%edi)
+	fadds  44(%edx)
+	fistp 320(%edi)
 
-        movq 56(%ebx),%mm3
-        pfadd %mm3,%mm2
-        movd %mm2,384(%ebp)
-        psrlq $32,%mm2
-        movd %mm2,640(%edx)
+	flds   96(%edx)
+	fadds 112(%edx)
+	fld      %st(0)
+	fadds  64(%edx)
+	fistp 480(%esi)
+	fadds  80(%edx)
+	fistp 416(%esi)
 
-        movd 36(%ebx),%mm4
-        pfadd %mm4,%mm3
-        movd %mm3,128(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,896(%edx)
-        movq 96(%ebx),%mm0
-        movq 64(%ebx),%mm1
+	flds  112(%edx)
+	fadds 104(%edx)
+	fld      %st(0)
+	fadds  80(%edx)
+	fistp 352(%esi)
+	fadds  72(%edx)
+	fistp 288(%esi)
 
-        movq 112(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,%mm3
-        pfadd %mm1,%mm3
-        movd %mm3,960(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,64(%edx)
-        movq 80(%ebx),%mm1
-        pfadd %mm1,%mm0
-        movd %mm0,832(%ebp)
-        psrlq $32,%mm0
-        movd %mm0,192(%edx)
-        movq 104(%ebx),%mm3
-        pfadd %mm3,%mm2
-        movq %mm2,%mm4
-        pfadd %mm1,%mm4
-        movd %mm4,704(%ebp)
-        psrlq $32,%mm4
-        movd %mm4,320(%edx)
-        movq 72(%ebx),%mm1
-        pfadd %mm1,%mm2
-        movd %mm2,576(%ebp)
-        psrlq $32,%mm2
-        movd %mm2,448(%edx)
+	flds  104(%edx)
+	fadds 120(%edx)
+	fld      %st(0)
+	fadds  72(%edx)
+	fistp 224(%esi)
+	fadds  88(%edx)
+	fistp 160(%esi)
+
+	flds  120(%edx)
+	fadds 100(%edx)
+	fld      %st(0)
+	fadds  88(%edx)
+	fistp  96(%esi)
+	fadds  68(%edx)
+	fistp  32(%esi)
+
+	flds  100(%edx)
+	fadds 116(%edx)
+	fld      %st(0)
+	fadds  68(%edx)
+	fistp  32(%edi)
+	fadds  84(%edx)
+	fistp  96(%edi)
 
-        movq 120(%ebx),%mm4
-        pfadd %mm4,%mm3
-        movq %mm3,%mm5
-        pfadd %mm1,%mm5
-        movd %mm5,448(%ebp)
-        psrlq $32,%mm5
-        movd %mm5,576(%edx)
-        movq 88(%ebx),%mm1
-        pfadd %mm1,%mm3
-        movd %mm3,320(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,704(%edx)
+	flds  116(%edx)
+	fadds 108(%edx)
+	fld      %st(0)
+	fadds  84(%edx)
+	fistp 160(%edi)
+	fadds  76(%edx)
+	fistp 224(%edi)
 
-        movd 100(%ebx),%mm5
-        pfadd %mm5,%mm4
-        movq %mm4,%mm6
-        pfadd %mm1,%mm6
-        movd %mm6,192(%ebp)
-        psrlq $32,%mm6
-        movd %mm6,832(%edx)
-        movd 68(%ebx),%mm1
-        pfadd %mm1,%mm4
-        movd %mm4,64(%ebp)
-        psrlq $32,%mm4
-        movd %mm4,960(%edx)
+	flds  108(%edx)
+	fadds 124(%edx)
+	fld      %st(0)
+	fadds  76(%edx)
+	fistp 288(%edi)
+	fadds  92(%edx)
+	fistp 352(%edi)
 
-        / femms
+	flds  124(%edx)
+	fist  480(%edi)
+	fadds  92(%edx)
+	fistp 416(%edi)
+	movsw
+.L_bye:
+	addl $256,%esp
+	popl %edi
+	popl %esi
+	popl %ebx
+	ret
+	
 
-        popl %ebx
-        popl %esi
-        popl %edi
-        popl %ebp
-        addl $256,%esp
-
-        ret  $12
-
--- a/mp3lib/decod386.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/decod386.c	Fri Jun 29 17:55:35 2001 +0000
@@ -105,6 +105,15 @@
 }
 #endif
 
+synth_func_t synth_func;
+
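+/* Thin wrapper around the assembler synth.  buffs holds the per-channel
+   synthesis window history (2 channels x 2 x 0x110 shorts) and bo the
+   ring-buffer offset, both kept across calls.  synth_func is expected to
+   be set during init (sr1.c) to the variant matching the detected CPU. */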
+int synth_1to1_MMX( real *bandPtr,int channel,short * samples)
+{
+    static short buffs[2][2][0x110];
+    static int bo = 1;
+    synth_1to1_MMX_s(bandPtr, channel, samples, (short *) buffs, &bo); 
+    return 0;
+}
 
 static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
 {
@@ -117,40 +126,13 @@
   int clip = 0;
   int bo1;
 
-  #ifdef HAVE_SSE_MP3
-  //if ( _3dnow )
+  if ( synth_func )
    {
     int ret;
-    ret=synth_1to1_sse( bandPtr,channel,out+*pnt );
-    *pnt+=128;
-    return ret;
-   }
-  #endif
-  #ifdef HAVE_3DNOWEX
-  if ( _3dnow > 1 )
-   {
-    int ret;
-    ret=synth_1to1_3dnowex( bandPtr,channel,out+*pnt );
+    ret=(*synth_func)( bandPtr,channel,samples);
     *pnt+=128;
     return ret;
    }
-  #endif
-  #ifdef HAVE_3DNOW
-  if ( _3dnow )
-   {
-    int ret;
-    ret=synth_1to1_3dnow( bandPtr,channel,out+*pnt );
-    *pnt+=128;
-    return ret;
-   }
-  #endif
-  if ( _i586 )
-   {
-     int ret;
-     ret=synth_1to1_pent( bandPtr,channel,out+*pnt );
-     *pnt+=128;
-     return ret;
-   }
 
   if(!channel) {     /* channel=0 */
     bo--;
--- a/mp3lib/decode_3dnow.s	Fri Jun 29 10:54:41 2001 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,265 +0,0 @@
-/ synth_1to1_3dnow works the same way as the c version of
-/ synth_1to1. this assembler code based 'decode-i586.s'
-/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
-/ have been made:
-/ - use {MMX,3DNow!} instruction for reduce cpu
-/ - remove unused(?) local symbols
-/
-/ useful sources of information on optimizing 3DNow! code include:
-/ AMD 3DNow! Technology Manual (Publication #21928)
-/     English:  http://www.amd.com/K6/k6docs/pdf/21928d.pdf
-/    (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
-/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
-/     English:  http://www.amd.com/K6/k6docs/pdf/21924b.pdf
-/
-/ This code was tested only AMD-K6-2 processor Linux systems,
-/ please tell me:
-/ - whether this code works on other 3DNow! capable processors
-/  (ex.IDT-C6-2) or not
-/ - whether this code works on other OSes or not
-/
-/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
-/                    <kim@comtec.co.jp>               - after  1.Apr.1998
-
-/ Enhancments for q-word operation by Michael Hipp
-
-.bss
-        .comm   buffs,4352,4
-.data
-        .align 4
-bo:
-        .long 1
-.text
-.globl synth_1to1_3dnow
-synth_1to1_3dnow:
-        subl  $12,%esp
-        pushl %ebp
-        pushl %edi
-        pushl %esi
-        pushl %ebx
-        movl  32(%esp),%eax
-        movl  40(%esp),%esi
-        movl  $0,%edi
-        movl  bo,%ebp
-        cmpl  %edi,36(%esp)
-        jne   .L48
-        decl  %ebp
-        andl  $15,%ebp
-        movl  %ebp,bo
-        movl  $buffs,%ecx
-        jmp   .L49
-.L48:
-        addl  $2,%esi
-        movl  $buffs+2176,%ecx
-.L49:
-        testl $1,%ebp
-        je    .L50
-        movl  %ecx,%ebx
-        movl  %ebp,16(%esp)
-        pushl %eax
-        movl  20(%esp),%edx
-        leal  (%ebx,%edx,4),%eax
-        pushl %eax
-        movl  24(%esp),%eax
-        incl  %eax
-        andl  $15,%eax
-        leal  1088(,%eax,4),%eax
-        addl  %ebx,%eax
-        jmp   .L74
-.L50:
-        leal  1088(%ecx),%ebx
-        leal  1(%ebp),%edx
-        movl  %edx,16(%esp)
-        pushl %eax
-        leal  1092(%ecx,%ebp,4),%eax
-        pushl %eax
-        leal  (%ecx,%ebp,4),%eax
-.L74:
-        pushl %eax
-        call  dct64_3dnow
-        addl  $12,%esp
-        movl  16(%esp),%edx
-        leal  0(,%edx,4),%edx
-        movl  $decwin+64,%eax
-        movl  %eax,%ecx
-        subl  %edx,%ecx
-        movl  $16,%ebp
-
-.L55:
-        movq  (%ecx),%mm4
-        movq  (%ebx),%mm3
-        movq  8(%ecx),%mm0
-        movq  8(%ebx),%mm1
-        pfmul %mm3,%mm4
-
-        movq  16(%ecx),%mm2
-        pfmul %mm1,%mm0
-        movq  16(%ebx),%mm3
-        pfadd %mm0,%mm4
-
-        movq  24(%ecx),%mm0
-        pfmul %mm2,%mm3
-        movq  24(%ebx),%mm1
-        pfadd %mm3,%mm4
-
-        movq  32(%ecx),%mm2
-        pfmul %mm1,%mm0
-        movq  32(%ebx),%mm3
-        pfadd %mm0,%mm4
-
-        movq  40(%ecx),%mm0
-        pfmul %mm2,%mm3
-        movq  40(%ebx),%mm1
-        pfadd %mm3,%mm4
-
-        movq  48(%ecx),%mm2
-        pfmul %mm1,%mm0
-        movq  48(%ebx),%mm3
-        pfadd %mm0,%mm4
-
-        movq  56(%ecx),%mm0
-        pfmul %mm2,%mm3
-        movq  56(%ebx),%mm1
-        pfadd %mm3,%mm4
-
-        pfmul %mm1,%mm0
-        pfadd %mm0,%mm4
-
-        movq  %mm4,%mm0
-        psrlq $32,%mm0
-        pfsub %mm0,%mm4
-
-        pf2id %mm4,%mm4
-        movd  %mm4,%eax
-
-        sar   $16,%eax
-        movw  %ax,(%esi)
-
-        addl  $64,%ebx
-        subl  $-128,%ecx
-        addl  $4,%esi
-        decl  %ebp
-        jnz  .L55
-
-/ --- end of  loop 1 ---
-
-        movd  (%ecx),%mm2
-        movd  (%ebx),%mm1
-        pfmul %mm1,%mm2
-
-        movd  8(%ecx),%mm0
-        movd  8(%ebx),%mm1
-        pfmul %mm0,%mm1
-        pfadd %mm1,%mm2
-
-        movd  16(%ecx),%mm0
-        movd  16(%ebx),%mm1
-        pfmul %mm0,%mm1
-        pfadd %mm1,%mm2
-
-        movd  24(%ecx),%mm0
-        movd  24(%ebx),%mm1
-        pfmul %mm0,%mm1
-        pfadd %mm1,%mm2
-
-        movd  32(%ecx),%mm0
-        movd  32(%ebx),%mm1
-        pfmul %mm0,%mm1
-        pfadd %mm1,%mm2
-
-        movd  40(%ecx),%mm0
-        movd  40(%ebx),%mm1
-        pfmul %mm0,%mm1
-        pfadd %mm1,%mm2
-
-        movd  48(%ecx),%mm0
-        movd  48(%ebx),%mm1
-        pfmul %mm0,%mm1
-        pfadd %mm1,%mm2
-
-        movd  56(%ecx),%mm0
-        movd  56(%ebx),%mm1
-        pfmul %mm0,%mm1
-        pfadd %mm1,%mm2
-
-        pf2id %mm2,%mm2
-        movd  %mm2,%eax
-
-        sar   $16,%eax
-
-        movw  %ax,(%esi)
-
-        addl  $-64,%ebx
-        addl  $4,%esi
-        addl  $256,%ecx
-        movl  $15,%ebp
-
-.L68:
-        psubd %mm0,%mm0
-
-        movq  (%ebx),%mm1
-        movq  (%ecx),%mm2
-        pfmul %mm1,%mm2
-        pfsub %mm2,%mm0
-
-        movq  8(%ebx),%mm3
-        movq  8(%ecx),%mm4
-        pfmul %mm3,%mm4
-        pfsub %mm4,%mm0
-
-        movq  16(%ebx),%mm1
-        movq  16(%ecx),%mm2
-        pfmul %mm1,%mm2
-        pfsub %mm2,%mm0
-
-        movq  24(%ebx),%mm3
-        movq  24(%ecx),%mm4
-        pfmul %mm3,%mm4
-        pfsub %mm4,%mm0
-
-        movq  32(%ebx),%mm1
-        movq  32(%ecx),%mm2
-        pfmul %mm1,%mm2
-        pfsub %mm2,%mm0
-
-        movq  40(%ebx),%mm3
-        movq  40(%ecx),%mm4
-        pfmul %mm3,%mm4
-        pfsub %mm4,%mm0
-
-        movq  48(%ebx),%mm1
-        movq  48(%ecx),%mm2
-        pfmul %mm1,%mm2
-        pfsub %mm2,%mm0
-
-        movq  56(%ebx),%mm3
-        movq  56(%ecx),%mm4
-        pfmul %mm3,%mm4
-        pfsub %mm4,%mm0
-
-        pfacc %mm0,%mm0
-
-        pf2id %mm0,%mm0
-        movd  %mm0,%eax
-
-        sar   $16,%eax
-
-        movw  %ax,(%esi)
-
-        addl  $-64,%ebx
-        subl  $-128,%ecx
-        addl  $4,%esi
-        decl  %ebp
-        jnz   .L68
-
-/ --- end of loop 2
-
-        femms
-
-        movl  %edi,%eax
-        popl  %ebx
-        popl  %esi
-        popl  %edi
-        popl  %ebp
-        addl  $12,%esp
-        ret
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mp3lib/decode_MMX.s	Fri Jun 29 17:55:35 2001 +0000
@@ -0,0 +1,117 @@
+# this code comes under GPL
+# This code was taken from http://www.mpg123.org
+# See the ChangeLog of mpg123-0.59s-pre.1 for details
+# Adapted for MPlayer by Nick Kurshev <nickols_k@mail.ru>
+#
+# TODO: partially unroll the loops and remove the MOVW instruction.
+#
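+# Prototype (as declared in mpg123.h):
+#   int synth_1to1_MMX_s(real *bandPtr, int channel, short *samples,
+#                        short *buffs, int *bo);
+#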
+
+.text
+
+.globl synth_1to1_MMX_s
+
+synth_1to1_MMX_s:
+        pushl %ebp
+        pushl %edi
+        pushl %esi
+        pushl %ebx
+        movl 24(%esp),%ecx              
+        movl 28(%esp),%edi              
+        movl $15,%ebx
+        movl 36(%esp),%edx              
+        leal (%edi,%ecx,2),%edi
+	decl %ecx
+        movl 32(%esp),%esi              
+        movl (%edx),%eax                
+        jecxz .L1
+        decl %eax
+        andl %ebx,%eax                  
+        leal 1088(%esi),%esi                            
+        movl %eax,(%edx)                
+.L1:
+        leal (%esi,%eax,2),%edx         
+        movl %eax,%ebp                  
+        incl %eax                       
+        pushl 20(%esp)                  
+        andl %ebx,%eax                  
+        leal 544(%esi,%eax,2),%ecx      
+        incl %ebx                       
+	testl $1, %eax
+	jnz .L2                       
+        xchgl %edx,%ecx
+	incl %ebp
+        leal 544(%esi),%esi           
+.L2: 
+	emms
+        pushl %edx
+        pushl %ecx
+        call *dct64_MMX_func
+        addl $12,%esp
+	leal 1(%ebx), %ecx
+        subl %ebp,%ebx                
+
+	leal decwins(%ebx,%ebx,1), %edx
+.L3: 
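+# Each pass computes one output sample: 16 window*sample products via
+# pmaddwd, the two dword halves are folded together, scaled down by 13 bits
+# and saturated to a 16-bit PCM value.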
+        movq  (%edx),%mm0
+        pmaddwd (%esi),%mm0
+        movq  8(%edx),%mm1
+        pmaddwd 8(%esi),%mm1
+        movq  16(%edx),%mm2
+        pmaddwd 16(%esi),%mm2
+        movq  24(%edx),%mm3
+        pmaddwd 24(%esi),%mm3
+        paddd %mm1,%mm0
+        paddd %mm2,%mm0
+        paddd %mm3,%mm0
+        movq  %mm0,%mm1
+        psrlq $32,%mm1
+        paddd %mm1,%mm0
+        psrad $13,%mm0
+        packssdw %mm0,%mm0
+        movd %mm0,%eax
+	movw %ax, (%edi)
+
+        leal 32(%esi),%esi
+        leal 64(%edx),%edx
+        leal 4(%edi),%edi                
+	decl %ecx
+        jnz  .L3
+
+
+        subl $64,%esi                    
+        movl $15,%ecx
+.L4: 
+        movq  (%edx),%mm0
+        pmaddwd (%esi),%mm0
+        movq  8(%edx),%mm1
+        pmaddwd 8(%esi),%mm1
+        movq  16(%edx),%mm2
+        pmaddwd 16(%esi),%mm2
+        movq  24(%edx),%mm3
+        pmaddwd 24(%esi),%mm3
+        paddd %mm1,%mm0
+        paddd %mm2,%mm0
+        paddd %mm3,%mm0
+        movq  %mm0,%mm1
+        psrlq $32,%mm1
+        paddd %mm0,%mm1
+        psrad $13,%mm1
+        packssdw %mm1,%mm1
+        psubd %mm0,%mm0
+        psubsw %mm1,%mm0
+        movd %mm0,%eax
+	movw %ax,(%edi)
+
+        subl $32,%esi
+        addl $64,%edx
+        leal 4(%edi),%edi                
+        decl %ecx
+	jnz  .L4
+	emms
+        popl %ebx
+        popl %esi
+        popl %edi
+        popl %ebp
+        ret
+
+
--- a/mp3lib/decode_k7.s	Fri Jun 29 10:54:41 2001 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,364 +0,0 @@
-///
-/// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support
-///
-/// This code based 'decode_3dnow.s' by Syuuhei Kashiyama
-/// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
-///
-///  - Added new opcode PFNACC
-///  - decreased number of opcodes (as it was suggested by k7 manual)
-///    (using memory reference as operand of instructions)
-///  - added PREFETCHW opcode. It has different semantic on k7 than on k6-2
-///    and saves 15-25 cpu clocks for athlon.
-///  - partial unrolling loops for removing slower MOVW insns.
-///    (Note: probably same operation should be done for decode_3dnow.s)
-///  - change function name for support 3DNowEx! automatic detect
-///  - added loops alignment
-///
-/// note: because K7 processors are an aggresive out-of-order three-way
-///       superscalar ones instruction order is not significand for them.
-///
-/// Benchmark: measured by mplayer on Duron-700:
-///      3dNow! optimized code                              - 1.4% of cpu usage
-///      k7 optimized code (without partial loop unrolling) - 1.3% of cpu usage
-///      k7 optimized code                                  - 1.1% of cpu usage
-/// Note: K6-2 users have an chance with partial loops unrolling
-///
-/// Modified by Nick Kurshev <nickols_k@mail.ru>
-///
-/ synth_1to1_3dnow works the same way as the c version of
-/ synth_1to1. this assembler code based 'decode-i586.s'
-/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
-/ have been made:
-/ - use {MMX,3DNow!} instruction for reduce cpu
-/ - remove unused(?) local symbols
-/
-/ useful sources of information on optimizing 3DNow! code include:
-/ AMD 3DNow! Technology Manual (Publication #21928)
-/     English:  http://www.amd.com/K6/k6docs/pdf/21928d.pdf
-/    (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
-/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
-/     English:  http://www.amd.com/K6/k6docs/pdf/21924b.pdf
-/
-/ This code was tested only AMD-K6-2 processor Linux systems,
-/ please tell me:
-/ - whether this code works on other 3DNow! capable processors
-/  (ex.IDT-C6-2) or not
-/ - whether this code works on other OSes or not
-/
-/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
-/                    <kim@comtec.co.jp>               - after  1.Apr.1998
-
-/ Enhancments for q-word operation by Michael Hipp
-
-.bss
-        .comm   buffs,4352,4
-.data
-        .align 8
-null_one: .long 0x0000ffff, 0x0000ffff
-one_null: .long 0xffff0000, 0xffff0000
-bo:       .long 1
-.text
-/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */
-.globl synth_1to1_3dnowex
-synth_1to1_3dnowex:
-        subl  $12,%esp
-        pushl %ebp
-        pushl %edi
-        pushl %esi
-        pushl %ebx
-	
-        movl  32(%esp),%eax
-        movl  40(%esp),%esi
-        movl  $0,%edi
-        movl  bo,%ebp
-        cmpl  %edi,36(%esp)
-        jne   .L48
-        decl  %ebp
-        andl  $15,%ebp
-        movl  %ebp,bo
-        movl  $buffs,%ecx
-        jmp   .L49
-.L48:
-        addl  $2,%esi
-        movl  $buffs+2176,%ecx
-.L49:
-        testl $1,%ebp
-        je    .L50
-        movl  %ecx,%ebx
-        movl  %ebp,16(%esp)
-        pushl %eax
-        movl  20(%esp),%edx
-        leal  (%ebx,%edx,4),%eax
-        pushl %eax
-        movl  24(%esp),%eax
-        incl  %eax
-        andl  $15,%eax
-        leal  1088(,%eax,4),%eax
-        addl  %ebx,%eax
-        jmp   .L74
-.L50:
-        leal  1088(%ecx),%ebx
-        leal  1(%ebp),%edx
-        movl  %edx,16(%esp)
-        pushl %eax
-        leal  1092(%ecx,%ebp,4),%eax
-        pushl %eax
-        leal  (%ecx,%ebp,4),%eax
-.L74:
-        pushl %eax
-        call  dct64_3dnowex
-        movl  16(%esp),%edx
-        leal  0(,%edx,4),%edx
-        movl  $decwin+64,%eax
-        movl  %eax,%ecx            
-        subl  %edx,%ecx
-        movl  $8,%ebp
-	prefetchw (%esi)
-.align 16
-.L55:
-
-        movq  (%ecx),%mm0
-        pfmul (%ebx),%mm0
-        movq  128(%ecx),%mm4
-        pfmul 64(%ebx),%mm4
-
-        movq  8(%ecx),%mm1
-        pfmul 8(%ebx),%mm1
-        pfadd %mm1,%mm0
-        movq  136(%ecx),%mm5
-        pfmul 72(%ebx),%mm5
-        pfadd %mm5,%mm4
-
-        movq  16(%ebx),%mm2
-        pfmul 16(%ecx),%mm2
-        pfadd %mm2,%mm0
-        movq  80(%ebx),%mm6
-        pfmul 144(%ecx),%mm6
-        pfadd %mm6,%mm4
-
-        movq  24(%ecx),%mm3
-        pfmul 24(%ebx),%mm3
-        pfadd %mm3,%mm0
-        movq  152(%ecx),%mm7
-        pfmul 88(%ebx),%mm7
-        pfadd %mm7,%mm4
-
-        movq  32(%ebx),%mm1
-        pfmul 32(%ecx),%mm1
-        pfadd %mm1,%mm0
-        movq  96(%ebx),%mm5
-        pfmul 160(%ecx),%mm5
-        pfadd %mm5,%mm4
-
-        movq  40(%ecx),%mm2
-        pfmul 40(%ebx),%mm2
-	pfadd %mm2,%mm0
-        movq  168(%ecx),%mm6
-        pfmul 104(%ebx),%mm6
-	pfadd %mm6,%mm4
-
-        movq  48(%ebx),%mm3
-        pfmul 48(%ecx),%mm3
-        pfadd %mm3,%mm0
-        movq  112(%ebx),%mm7
-        pfmul 176(%ecx),%mm7
-        pfadd %mm7,%mm4
-
-        movq  56(%ecx),%mm1
-        pfmul 56(%ebx),%mm1
-        pfadd %mm1,%mm0
-        movq  184(%ecx),%mm5
-        pfmul 120(%ebx),%mm5
-        pfadd %mm5,%mm4
-
-	pfnacc %mm4, %mm0
-	movq   (%esi), %mm1
-	pf2id  %mm0, %mm0
-	pand   one_null, %mm1
-	psrld  $16,%mm0
-	pand   null_one, %mm0
-	por    %mm0, %mm1
-	movq   %mm1,(%esi)
-	
-        addl  $128,%ebx
-        addl  $256,%ecx
-        addl  $8,%esi
-        decl  %ebp
-        jnz  .L55
-
-/ --- end of  loop 1 ---
-
-	prefetchw (%esi)  /* prefetching for writing this block and next loop */
-
-        movd  (%ecx),%mm0
-        pfmul (%ebx),%mm0
-
-        movd  8(%ebx),%mm1
-        pfmul 8(%ecx),%mm1
-        pfadd %mm1,%mm0
-
-        movd  16(%ebx),%mm2
-        pfmul 16(%ecx),%mm2
-        pfadd %mm2,%mm0
-
-        movd  24(%ebx),%mm3
-        pfmul 24(%ecx),%mm3
-        pfadd %mm3,%mm0
-
-        movd  32(%ebx),%mm4
-        pfmul 32(%ecx),%mm4
-        pfadd %mm4,%mm0
-
-        movd  40(%ebx),%mm5
-        pfmul 40(%ecx),%mm5
-        pfadd %mm5,%mm0
-
-        movd  48(%ebx),%mm6
-        pfmul 48(%ecx),%mm6
-        pfadd %mm6,%mm0
-
-        movd  56(%ebx),%mm7
-        pfmul 56(%ecx),%mm7
-        pfadd %mm7,%mm0
-
-        pf2id %mm0,%mm0
-        movd  %mm0,%eax
-
-        sar   $16,%eax
-
-        movw  %ax,(%esi)
-
-        subl  $64,%ebx
-        addl  $4,%esi
-        addl  $256,%ecx
-        movl  $7,%ebp
-.align 16
-.L68:
-	pxor  %mm0, %mm0
-	pxor  %mm4, %mm4
-
-        movq  (%ecx),%mm1
-        pfmul (%ebx),%mm1
-        pfsub %mm1,%mm0
-        movq  128(%ecx),%mm5
-        pfmul -64(%ebx),%mm5
-        pfsub %mm5,%mm4
-
-        movq  8(%ecx),%mm2
-        pfmul 8(%ebx),%mm2
-        pfsub %mm2,%mm0
-        movq  136(%ecx),%mm6
-        pfmul -56(%ebx),%mm6
-        pfsub %mm6,%mm4
-
-        movq  16(%ecx),%mm3
-        pfmul 16(%ebx),%mm3
-        pfsub %mm3,%mm0
-        movq  144(%ecx),%mm7
-        pfmul -48(%ebx),%mm7
-        pfsub %mm7,%mm4
-
-        movq  24(%ecx),%mm1
-        pfmul 24(%ebx),%mm1
-        pfsub %mm1,%mm0
-        movq  152(%ecx),%mm5
-        pfmul -40(%ebx),%mm5
-        pfsub %mm5,%mm4
-
-        movq  32(%ecx),%mm2
-        pfmul 32(%ebx),%mm2
-        pfsub %mm2,%mm0
-        movq  160(%ecx),%mm6
-        pfmul -32(%ebx),%mm6
-        pfsub %mm6,%mm4
-
-        movq  40(%ecx),%mm3
-        pfmul 40(%ebx),%mm3
-        pfsub %mm3,%mm0
-        movq  168(%ecx),%mm7
-        pfmul -24(%ebx),%mm7
-        pfsub %mm7,%mm4
-
-        movq  48(%ecx),%mm1
-        pfmul 48(%ebx),%mm1
-        pfsub %mm1,%mm0
-        movq  176(%ecx),%mm5
-        pfmul -16(%ebx),%mm5
-        pfsub %mm5,%mm4
-
-        movq  56(%ecx),%mm2
-        pfmul 56(%ebx),%mm2
-        pfsub %mm2,%mm0
-        movq  184(%ecx),%mm6
-        pfmul -8(%ebx),%mm6
-        pfsub %mm6,%mm4
-
-        pfacc  %mm4,%mm0
-	movq   (%esi), %mm1
-	pf2id  %mm0, %mm0
-	pand   one_null, %mm1
-	psrld  $16,%mm0
-	pand   null_one, %mm0
-	por    %mm0, %mm1
-	movq   %mm1,(%esi)
-
-        subl  $128,%ebx
-        addl  $256,%ecx
-        addl  $8,%esi
-        decl  %ebp
-        jnz   .L68
-
-/ --- end of loop 2
-
-	pxor  %mm0, %mm0
-
-        movq  (%ecx),%mm1
-        pfmul (%ebx),%mm1
-        pfsub %mm1,%mm0
-
-        movq  8(%ecx),%mm2
-        pfmul 8(%ebx),%mm2
-        pfsub %mm2,%mm0
-
-        movq  16(%ecx),%mm3
-        pfmul 16(%ebx),%mm3
-        pfsub %mm3,%mm0
-
-        movq  24(%ecx),%mm4
-        pfmul 24(%ebx),%mm4
-        pfsub %mm4,%mm0
-
-        movq  32(%ecx),%mm5
-        pfmul 32(%ebx),%mm5
-        pfsub %mm5,%mm0
-
-        movq  40(%ecx),%mm6
-        pfmul 40(%ebx),%mm6
-        pfsub %mm6,%mm0
-
-        movq  48(%ecx),%mm7
-        pfmul 48(%ebx),%mm7
-        pfsub %mm7,%mm0
-
-        movq  56(%ecx),%mm1
-        pfmul 56(%ebx),%mm1
-        pfsub %mm1,%mm0
-
-        pfacc %mm0,%mm0
-
-        pf2id %mm0,%mm0
-        movd  %mm0,%eax
-
-        sar   $16,%eax
-
-        movw  %ax,(%esi)
-
-        femms
-
-        movl  %edi,%eax
-        popl  %ebx
-        popl  %esi
-        popl  %edi
-        popl  %ebp
-        addl  $12,%esp
-        ret
--- a/mp3lib/decode_sse.s	Fri Jun 29 10:54:41 2001 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,201 +0,0 @@
-///
-/// Replacement of synth_1to1() with Intel's SSE SIMD operations support
-///
-/// This code based 'decode_k7.s' by Nick Kurshev
-/// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
-///
-///  - SSE optimization
-///  - change function name for support SSE automatic detect
-///
-/// Modified by Nick Kurshev <nickols_k@mail.ru>
-///
-/ synth_1to1_3dnow works the same way as the c version of
-/ synth_1to1. this assembler code based 'decode-i586.s'
-/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
-/ have been made:
-/ - use {MMX,3DNow!} instruction for reduce cpu
-/ - remove unused(?) local symbols
-/
-/ useful sources of information on optimizing 3DNow! code include:
-/ AMD 3DNow! Technology Manual (Publication #21928)
-/     English:  http://www.amd.com/K6/k6docs/pdf/21928d.pdf
-/    (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
-/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
-/     English:  http://www.amd.com/K6/k6docs/pdf/21924b.pdf
-/
-/ This code was tested only AMD-K6-2 processor Linux systems,
-/ please tell me:
-/ - whether this code works on other 3DNow! capable processors
-/  (ex.IDT-C6-2) or not
-/ - whether this code works on other OSes or not
-/
-/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
-/                    <kim@comtec.co.jp>               - after  1.Apr.1998
-
-/ Enhancments for q-word operation by Michael Hipp
-
-.bss
-        .comm   buffs,4352,4
-.data
-        .align 4
-bo:
-        .long 1
-.text
-/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */
-.globl synth_1to1_sse
-synth_1to1_sse:
-        subl  $12,%esp
-        pushl %ebp
-        pushl %edi
-        pushl %esi
-        pushl %ebx
-	
-        movl  32(%esp),%eax
-        movl  40(%esp),%esi
-        movl  $0,%edi
-        movl  bo,%ebp
-        cmpl  %edi,36(%esp)
-        jne   .L48
-        decl  %ebp
-        andl  $15,%ebp
-        movl  %ebp,bo
-        movl  $buffs,%ecx
-        jmp   .L49
-.L48:
-        addl  $2,%esi
-        movl  $buffs+2176,%ecx
-.L49:
-        testl $1,%ebp
-        je    .L50
-        movl  %ecx,%ebx
-        movl  %ebp,16(%esp)
-        pushl %eax
-        movl  20(%esp),%edx
-        leal  (%ebx,%edx,4),%eax
-        pushl %eax
-        movl  24(%esp),%eax
-        incl  %eax
-        andl  $15,%eax
-        leal  1088(,%eax,4),%eax
-        addl  %ebx,%eax
-        jmp   .L74
-.L50:
-        leal  1088(%ecx),%ebx
-        leal  1(%ebp),%edx
-        movl  %edx,16(%esp)
-        pushl %eax
-        leal  1092(%ecx,%ebp,4),%eax
-        pushl %eax
-        leal  (%ecx,%ebp,4),%eax
-.L74:
-        pushl %eax
-        call  dct64
-	addl  $12, %esp
-        movl  16(%esp),%edx
-        leal  0(,%edx,4),%edx
-        movl  $decwin+64,%eax
-        movl  %eax,%ecx            
-        subl  %edx,%ecx
-        movl  $16,%ebp
-
-.L55:
-	movups (%ecx), %xmm0
-	mulps  (%ebx), %xmm0
-	movups 16(%ecx), %xmm1
-	mulps  16(%ebx), %xmm1
-	addps  %xmm1, %xmm0
-	movups 32(%ecx), %xmm1
-	mulps  32(%ebx), %xmm1
-	addps  %xmm1, %xmm0
-	movups 48(%ecx), %xmm1
-	mulps  48(%ebx), %xmm1
-	addps  %xmm1, %xmm0
-/* pfnacc ->  PFNACC mmreg1, mmreg2  performs the following operations: */
-/* temp = mmreg2 */
-/* mmreg1[31:0] = mmreg1[31:0] - mmreg1[63:32] */
-/* mmreg1[63:32]= temp [31:0] - temp[63:32] */
-/* save difference of mmreg1's low-word and high-word into mmreg1's low-word */
-/* save difference of mmreg2's low-word and high-word into mmreg1's high-word */
-	movhlps %xmm0, %xmm1
-	addps   %xmm1, %xmm0
-	movaps  %xmm0, %xmm1
-	shufps  $0x55, %xmm1, %xmm1 /* fake of pfnacc. 1|1|1|1 */
-
-	subss	%xmm1, %xmm0
-	cvtss2si %xmm0, %eax
-
-/        sar   $16,%eax
-        movw  %ax,(%esi)
-
-        addl  $64,%ebx
-        subl  $-128,%ecx
-        addl  $4,%esi
-        decl  %ebp
-        jnz  .L55
-
-/ --- end of  loop 1 ---
-
-	movups (%ecx), %xmm0
-	mulps  (%ebx), %xmm0
-	movups 16(%ecx), %xmm1
-	mulps  16(%ebx), %xmm1
-	addps  %xmm1, %xmm0
-	movups 32(%ecx), %xmm1
-	mulps  32(%ebx), %xmm1
-	addps  %xmm1, %xmm0
-	movups 48(%ecx), %xmm1
-	mulps  48(%ebx), %xmm1
-	addps  %xmm1, %xmm0
-	movhlps %xmm0, %xmm1	
-	addss	%xmm1, %xmm0
-	cvtss2si %xmm0, %eax
-
-/        sar   $16,%eax
-
-        movw  %ax,(%esi)
-
-        addl  $-64,%ebx
-        addl  $4,%esi
-        addl  $256,%ecx
-        movl  $15,%ebp
-
-.L68:
-	xorps %xmm0, %xmm0
-	movups (%ecx), %xmm1
-	mulps  (%ebx), %xmm1
-	subps  %xmm1, %xmm0
-	movups 16(%ecx), %xmm1
-	mulps  16(%ebx), %xmm1
-	subps  %xmm1, %xmm0
-	movups 32(%ecx), %xmm1
-	mulps  32(%ebx), %xmm1
-	subps  %xmm1, %xmm0
-	movups 48(%ecx), %xmm1
-	mulps  48(%ebx), %xmm1
-	subps  %xmm1, %xmm0
-	movhlps %xmm0, %xmm1
-	subps	%xmm1, %xmm0
-	movaps	%xmm0, %xmm1
-	shufps $0x55, %xmm1, %xmm1 /* fake of pfacc 1|1|1|1 */
-	addss  %xmm1, %xmm0
-	cvtss2si %xmm0, %eax
-
-/        sar   $16,%eax
-
-        movw  %ax,(%esi)
-
-        addl  $-64,%ebx
-        subl  $-128,%ecx
-        addl  $4,%esi
-        decl  %ebp
-        jnz   .L68
-
-/ --- end of loop 2
-
-        movl  %edi,%eax
-        popl  %ebx
-        popl  %esi
-        popl  %edi
-        popl  %ebp
-        addl  $12,%esp
-        ret
--- a/mp3lib/layer2.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/layer2.c	Fri Jun 29 17:55:35 2001 +0000
@@ -50,8 +50,16 @@
   {
     double m=mulmul[k];
     table = muls[k];
+    if(_has_mmx) 
+    {
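+        /* the MMX synth path works on 16-bit fixed-point data, hence the
+           extra 16384 (2^14) factor on this table */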
+        for(j=3,i=0;i<63;i++,j--)
+	  *table++ = 16384 * m * pow(2.0,(double) j / 3.0);
+    }
+    else
     for(j=3,i=0;i<63;i++,j--)
+    {
       *table++ = m * pow(2.0,(double) j / 3.0);
+    }
     *table++ = 0.0;
   }
 }
--- a/mp3lib/layer3.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/layer3.c	Fri Jun 29 17:55:35 2001 +0000
@@ -22,9 +22,9 @@
 #define GP2MAX (256+118+4)
 static real gainpow2[GP2MAX];
 
-static real nCOS9[9];
+real COS9[9];
 static real COS6_1,COS6_2;
-static real tfcos36[9];
+real tfcos36[9];
 static real tfcos12[3];
 #ifdef NEW_DCT9
 static real cos9[3],cos18[3];
@@ -111,8 +111,12 @@
   int i,j,k,l;
 
   for(i=-256;i<118+4;i++)
-    gainpow2[i+256] = pow((double)2.0,-0.25 * (double) (i+210) );
-
+  {
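+    /* as in layer2.c, the gain table is pre-scaled by 16384 (2^14) when the
+       16-bit fixed-point MMX synth is in use */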
+    if(_has_mmx)
+      gainpow2[i+256] = 16384.0 * pow((double)2.0,-0.25 * (double) (i+210) );
+    else
+      gainpow2[i+256] = pow((double)2.0,-0.25 * (double) (i+210) );
+  }
   for(i=0;i<8207;i++)
     ispow[i] = pow((double)i,(double)4.0/3.0);
 
@@ -139,7 +143,7 @@
   }
 
   for(i=0;i<9;i++)
-    nCOS9[i] = cos( M_PI / 18.0 * (double) i);
+    COS9[i] = cos( M_PI / 18.0 * (double) i);
 
   for(i=0;i<9;i++)
     tfcos36[i] = 0.5 / cos ( M_PI * (double) (i*2+1) / 36.0 );
@@ -1533,6 +1537,9 @@
 /*
  * III_hybrid
  */
+ 
+dct36_func_t dct36_func;
+  
 static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
    int ch,struct gr_info_s *gr_info)
 {
@@ -1553,8 +1560,8 @@
 
    if(gr_info->mixed_block_flag) {
      sb = 2;
-     dct36(fsIn[0],rawout1,rawout2,win[0],tspnt);
-     dct36(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1);
+     (*dct36_func)(fsIn[0],rawout1,rawout2,win[0],tspnt);
+     (*dct36_func)(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1);
      rawout1 += 36; rawout2 += 36; tspnt += 2;
    }
  
@@ -1567,8 +1574,8 @@
    }
    else {
      for (; sb<gr_info->maxb; sb+=2,tspnt+=2,rawout1+=36,rawout2+=36) {
-       dct36(fsIn[sb],rawout1,rawout2,win[bt],tspnt);
-       dct36(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1);
+       (*dct36_func)(fsIn[sb],rawout1,rawout2,win[bt],tspnt);
+       (*dct36_func)(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1);
      }
    }
 
--- a/mp3lib/mpg123.h	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/mpg123.h	Fri Jun 29 17:55:35 2001 +0000
@@ -104,33 +104,22 @@
 };
 
 static long freqs[9];
-#ifdef HAVE_3DNOW
-        real decwin[2*(512+32)];
-#else
-        real decwin[512+32];
-#endif
-       real *pnts[];
+extern real decwin[(512+32)];
+extern real *pnts[];
 
 static int do_layer2(struct frame *fr,int single);
 static int do_layer3(struct frame *fr,int single);
 static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt);
 
-extern int  synth_1to1_pent( real *,int,unsigned char * );
+extern int synth_1to1_pent( real *,int,short * );
+extern void make_decode_tables_MMX(long scaleval);
+extern int synth_1to1_MMX( real *,int,short * );
+extern int synth_1to1_MMX_s(real *, int, short *, short *, int *);
 extern void dct64(real *a,real *b,real *c);
 
-#ifdef HAVE_3DNOW
- extern void dct64_3dnow( real *,real *, real * );
- extern void dct36_3dnow(real *,real *,real *,real *,real *);
- extern int  synth_1to1_3dnow( real *,int,unsigned char * );
-#endif
-#ifdef HAVE_3DNOWEX
- extern void dct64_3dnowex( real *,real *, real * );
- extern void dct36_3dnowex(real *,real *,real *,real *,real *);
- extern int  synth_1to1_3dnowex( real *,int,unsigned char * );
-#endif
-#ifdef HAVE_SSE_MP3
-// extern void dct64_3dnow( real *,real *, real * );
-// extern void dct36_3dnow(real *,real *,real *,real *,real *);
- extern int  synth_1to1_sse( real *,int,unsigned char * );
-#endif
+extern void dct36_3dnow(real *,real *,real *,real *,real *);
+extern void dct36_3dnowex(real *,real *,real *,real *,real *);
+extern void dct36_sse(real *,real *,real *,real *,real *);
 
+typedef int (*synth_func_t)( real *,int,short * );
+typedef void (*dct36_func_t)(real *,real *,real *,real *,real *);
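The two typedefs added at the end are the plumbing for the runtime dispatch set up in sr1.c below. A hedged sketch of how a caller goes through synth_func_t; the variable name synth_func matches the patch, but the wrapper and its use here are illustrative only:

typedef float real;
typedef int (*synth_func_t)(real *, int, short *);

/* Set once at init time to synth_1to1_pent, synth_1to1_MMX, ... */
static synth_func_t synth_func;

/* Illustrative call site: one channel's band data is synthesized into
 * 16-bit PCM through whichever implementation was selected; the int
 * return value is passed back unchanged. */
static int do_synth(real *bandPtr, int channel, short *samples)
{
    return (*synth_func)(bandPtr, channel, samples);
}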
--- a/mp3lib/sr1.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/sr1.c	Fri Jun 29 17:55:35 2001 +0000
@@ -343,6 +343,12 @@
 
 static int tables_done_flag=0;
 
+/* These are implemented in assembler, so they are hidden from gcc. */
+extern void dct64_MMX( void );
+extern void dct64_MMX_3dnow( void );
+extern void dct64_MMX_3dnowex( void );
+void (*dct64_MMX_func)( void );
+
 // Init decoder tables.  Call first, once!
 #ifdef USE_FAKE_MONO
 void MP3_Init(int fakemono){
@@ -351,20 +357,41 @@
 #endif
     _CpuID=CpuDetect();
     _i586=ipentium();
-#ifdef HAVE_3DNOW
+#ifndef HAVE_MMX
+    _i586 &= 1;
+#endif
     _3dnow=a3dnow();
+#ifndef HAVE_3DNOW
+    _3dnow = 0;
 #endif
-
+#ifndef HAVE_3DNOWEX
+    _3dnow &= 1;
+#endif
+    _isse=isse();
+#ifndef HAVE_SSE
+    _isse = 0;
+#endif
+#ifndef HAVE_SSE2
+    _isse &= 1;
+#endif
+    _has_mmx=_i586>1||_3dnow||_isse;
     printf( "mp3lib: Processor ID: %x\n",_CpuID );
-    printf( "mp3lib: i586 processor %sdetected.\n",(_i586?"":"not ") );
-#ifdef HAVE_3DNOW
-    printf( "mp3lib: AMD 3dnow! extension %sdetected.\n",(_3dnow?"":"not ") );
-#endif
-#ifdef HAVE_3DNOWEX
-    printf( "mp3lib: AMD 3dnow-dsp! extension %sdetected.\n",(_3dnow>1?"":"not ") );
-#endif
+    if(_i586&&!_3dnow&&!_isse)
+      printf( "mp3lib: Using Pentium%s optimized decore.\n",(_i586>1?"-MMX":""));
+    else
+    if(_isse) 
+    /*
+       Note: this is fine, since the K8 will also have SSE2 support and
+       should be much faster than the P4 ;)
+     */
+      printf( "mp3lib: Using SSE%s! optimized decore.\n",(_isse>1?"2":""));
+    else
+    if(_3dnow)
+      printf( "mp3lib: Using AMD 3dnow%s! optimized decore.\n",(_3dnow>1?"-dsp(k7)":""));
 
-    make_decode_tables(outscale);
+/* Use the MMX-scaled decode tables on any MMX-capable cpu */
+   if(_has_mmx)	make_decode_tables_MMX(outscale);
+   else		make_decode_tables(outscale);
 #ifdef USE_FAKE_MONO
     if (fakemono == 1)
         fr.synth=synth_1to1_l;
@@ -381,6 +408,42 @@
     init_layer2();
     init_layer3(fr.down_sample_sblimit);
     tables_done_flag=1;
+
+    dct36_func=dct36;
+  if(_isse)
+  {
+    synth_func=synth_1to1_MMX;
+    dct64_MMX_func=dct64_MMX;
+  }    
+  else
+  if ( _3dnow > 1 )
+  {
+     synth_func=synth_1to1_MMX;
+     dct36_func=dct36_3dnowex;
+     dct64_MMX_func=dct64_MMX_3dnowex;
+  }
+  else
+  if ( _3dnow )
+  {
+    synth_func=synth_1to1_MMX;
+    dct36_func=dct36_3dnow;
+    dct64_MMX_func=dct64_MMX_3dnow;
+  }
+  else
+  if ( _i586 > 1)
+  {
+    synth_func=synth_1to1_MMX;
+    dct64_MMX_func=dct64_MMX;
+  }    
+  else
+  if ( _i586 )
+  {
+    synth_func=synth_1to1_pent;
+  }    
+  else
+  {
+    synth_func = NULL;
+  }
 }
 
 #if 0
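MP3_Init() now combines the runtime detection results (0 = none, 1 = base, 2 = extended, as documented in d_cpu.s) with the compile-time HAVE_* switches before walking the dispatch ladder. A compressed sketch of that gating logic; the flag names match the patch, but the helper function is illustrative only:

/* Runtime levels: 0 none, 1 base feature, 2 extended variant. */
static unsigned _i586, _3dnow, _isse, _has_mmx;

static void gate_features(unsigned i586, unsigned a3dnow, unsigned sse)
{
    _i586 = i586;            /* 2 means "has MMX" */
#ifndef HAVE_MMX
    _i586 &= 1;              /* compiled without MMX: keep only the i586 bit */
#endif
    _3dnow = a3dnow;
#ifndef HAVE_3DNOW
    _3dnow = 0;
#endif
#ifndef HAVE_3DNOWEX
    _3dnow &= 1;             /* drop the extended (K7) level */
#endif
    _isse = sse;
#ifndef HAVE_SSE
    _isse = 0;
#endif
#ifndef HAVE_SSE2
    _isse &= 1;
#endif
    /* Any of these implies usable MMX registers, hence the MMX tables. */
    _has_mmx = _i586 > 1 || _3dnow || _isse;
}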
--- a/mp3lib/tabinit.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/tabinit.c	Fri Jun 29 17:55:35 2001 +0000
@@ -1,20 +1,7 @@
-
+real decwin[(512+32)], cos64[32], cos32[16], cos16[8], cos8[4], cos4[2];
+real *pnts[]={ cos64,cos32,cos16,cos8,cos4 };
 
-#ifdef HAVE_3DNOW
-        real decwin[2*(512+32)] __attribute__((aligned(8)));
-        real cos64[32] __attribute__((aligned(8)));
-	real cos32[16] __attribute__((aligned(8)));
-	real cos16[8] __attribute__((aligned(8)));
-	real cos8[4] __attribute__((aligned(8)));
-	real cos4[2] __attribute__((aligned(8)));
-        real *pnts[]={ cos64,cos32,cos16,cos8,cos4 };
-#else
-        real decwin[512+32];
-        real cos64[16],cos32[8],cos16[4],cos8[2],cos4[1];
-        real *pnts[] = { cos64,cos32,cos16,cos8,cos4 };
-#endif
-
-long intwinbase[] = {
+static long intwinbase[] = {
      0,    -1,    -1,    -1,    -1,    -1,    -1,    -2,    -2,    -2,
     -2,    -3,    -3,    -4,    -4,    -5,    -5,    -6,    -7,    -7,
     -8,    -9,   -10,   -11,   -13,   -14,   -16,   -17,   -19,   -21,
@@ -42,7 +29,7 @@
  64019, 65290, 66494, 67629, 68692, 69679, 70590, 71420, 72169, 72835,
  73415, 73908, 74313, 74630, 74856, 74992, 75038 };
 
-       void make_decode_tables(long scaleval)
+void make_decode_tables(long scaleval)
 {
   int i,j,k,kr,divv;
   real *table,*costab;
@@ -53,17 +40,13 @@
     kr=0x10>>i; divv=0x40>>i;
     costab = pnts[i];
     for(k=0;k<kr;k++) costab[k] = 1.0 / (2.0 * cos(M_PI * ((double) k * 2.0 + 1.0) / (double) divv));
-    #ifdef HAVE_3DNOW
-     if ( _3dnow ) for(k=0;k<kr;k++) costab[k+kr]=-costab[k];
-    #endif
-
   }
 
   table = decwin;
   scaleval = -scaleval;
   for(i=0,j=0;i<256;i++,j++,table+=32)
   {
-         if(table < decwin+512+16)
+    if(table < decwin+512+16)
       table[16] = table[0] = (double) intwinbase[j] / 65536.0 * (double) scaleval;
     if(i % 32 == 31)
       table -= 1023;
@@ -80,14 +63,6 @@
     if(i % 64 == 63)
       scaleval = - scaleval;
   }
-  #ifdef HAVE_3DNOW
-   if ( _3dnow )
-    for(i=0;i<512+32;i++)
-     {
-      decwin[512+31-i]*=65536.0; // allows faster clipping in 3dnow code
-      decwin[512+32+i]=decwin[512+31-i];
-     }
-  #endif
 }
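make_decode_tables() itself is unchanged apart from losing the HAVE_3DNOW special cases; it still builds the cosine tables consumed by dct64(). For reference, a self-contained rendering of that loop (float stands in for 'real'; array sizes as in the new tabinit.c above):

#include <math.h>

/* pnts[i] gets 1/(2*cos(pi*(2k+1)/divv)) with kr = 16>>i entries,
 * exactly as in the hunk above. */
static void fill_costabs(float *pnts[5])
{
    for (int i = 0; i < 5; i++) {
        int kr = 0x10 >> i, divv = 0x40 >> i;
        for (int k = 0; k < kr; k++)
            pnts[i][k] = 1.0 / (2.0 * cos(M_PI * ((double)k * 2.0 + 1.0) / (double)divv));
    }
}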
 
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mp3lib/tabinit_MMX.s	Fri Jun 29 17:55:35 2001 +0000
@@ -0,0 +1,161 @@
+# This code was taken from http://www.mpg123.org
+# See the ChangeLog of mpg123-0.59s-pre.1 for details
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+.bss
+	.align 8
+    	.comm	decwin,2176,32
+	.align 8
+	.comm	decwins,2176,32
+.data
+	.align 8
+intwinbase_MMX:
+	.value      0,    -1,    -1,    -1,    -1,    -1,    -1,    -2
+	.value     -2,    -2,    -2,    -3,    -3,    -4,    -4,    -5
+	.value     -5,    -6,    -7,    -7,    -8,    -9,   -10,   -11
+	.value    -13,   -14,   -16,   -17,   -19,   -21,   -24,   -26
+	.value    -29,   -31,   -35,   -38,   -41,   -45,   -49,   -53
+	.value    -58,   -63,   -68,   -73,   -79,   -85,   -91,   -97
+	.value   -104,  -111,  -117,  -125,  -132,  -139,  -147,  -154
+	.value   -161,  -169,  -176,  -183,  -190,  -196,  -202,  -208
+	.value   -213,  -218,  -222,  -225,  -227,  -228,  -228,  -227
+	.value   -224,  -221,  -215,  -208,  -200,  -189,  -177,  -163
+	.value   -146,  -127,  -106,   -83,   -57,   -29,     2,    36
+	.value     72,   111,   153,   197,   244,   294,   347,   401
+	.value    459,   519,   581,   645,   711,   779,   848,   919
+	.value    991,  1064,  1137,  1210,  1283,  1356,  1428,  1498
+	.value   1567,  1634,  1698,  1759,  1817,  1870,  1919,  1962
+	.value   2001,  2032,  2057,  2075,  2085,  2087,  2080,  2063
+	.value   2037,  2000,  1952,  1893,  1822,  1739,  1644,  1535
+	.value   1414,  1280,  1131,   970,   794,   605,   402,   185
+	.value    -45,  -288,  -545,  -814, -1095, -1388, -1692, -2006
+	.value  -2330, -2663, -3004, -3351, -3705, -4063, -4425, -4788
+	.value  -5153, -5517, -5879, -6237, -6589, -6935, -7271, -7597
+	.value  -7910, -8209, -8491, -8755, -8998, -9219, -9416, -9585
+	.value  -9727, -9838, -9916, -9959, -9966, -9935, -9863, -9750
+	.value  -9592, -9389, -9139, -8840, -8492, -8092, -7640, -7134
+	.value  -6574, -5959, -5288, -4561, -3776, -2935, -2037, -1082
+	.value    -70,   998,  2122,  3300,  4533,  5818,  7154,  8540
+	.value   9975, 11455, 12980, 14548, 16155, 17799, 19478, 21189
+	.value  22929, 24694, 26482, 28289, 30112, 31947,-26209,-24360
+	.value -22511,-20664,-18824,-16994,-15179,-13383,-11610, -9863
+	.value  -8147, -6466, -4822, -3222, -1667,  -162,  1289,  2684
+	.value   4019,  5290,  6494,  7629,  8692,  9679, 10590, 11420
+	.value  12169, 12835, 13415, 13908, 14313, 14630, 14856, 14992
+	.value  15038
+
+intwindiv:
+	.long 0x47800000			# 65536.0
+.text
+	.align 32
+.globl make_decode_tables_MMX
+make_decode_tables_MMX:
+	pushl %edi
+	pushl %esi
+	pushl %ebx
+
+	xorl %ecx,%ecx
+	xorl %ebx,%ebx
+	movl $32,%esi
+	movl $intwinbase_MMX,%edi
+	negl 16(%esp)				# scaleval
+	pushl $2				# intwinbase step
+.L00:
+	cmpl $528,%ecx
+	jnc .L02
+	movswl (%edi),%eax
+	cmpl $intwinbase_MMX+444,%edi
+	jc .L01
+	addl $60000,%eax
+.L01:
+	pushl %eax
+	fildl (%esp)
+	fdivs intwindiv
+	fimull 24(%esp)
+	popl %eax
+	fsts  decwin(,%ecx,4)
+	fstps decwin+64(,%ecx,4)
+.L02:
+	leal -1(%esi),%edx
+	and %ebx,%edx
+	cmp $31,%edx
+	jnz .L03
+	addl $-1023,%ecx
+	test %esi,%ebx
+	jz  .L03
+	negl 20(%esp)
+.L03:
+	addl %esi,%ecx
+	addl (%esp),%edi
+	incl %ebx
+	cmpl $intwinbase_MMX,%edi
+	jz .L04
+	cmp $256,%ebx
+	jnz .L00
+	negl (%esp)
+	jmp .L00
+.L04:
+	popl %eax
+
+	xorl %ecx,%ecx
+	xorl %ebx,%ebx
+	pushl $2
+.L05:
+	cmpl $528,%ecx
+	jnc .L11
+	movswl (%edi),%eax
+	cmpl $intwinbase_MMX+444,%edi
+	jc .L06
+	addl $60000,%eax
+.L06:
+	cltd
+	imull 20(%esp)
+	shrdl $17,%edx,%eax
+	cmpl $32767,%eax
+	movl $1055,%edx
+	jle .L07
+	movl $32767,%eax
+	jmp .L08
+.L07:
+	cmpl $-32767,%eax
+	jge .L08
+	movl $-32767,%eax
+.L08:
+	cmpl $512,%ecx
+	jnc .L09
+	subl %ecx,%edx
+	movw %ax,decwins(,%edx,2)
+	movw %ax,decwins-32(,%edx,2)
+.L09:
+	testl $1,%ecx
+	jnz .L10
+	negl %eax
+.L10:
+	movw %ax,decwins(,%ecx,2)
+	movw %ax,decwins+32(,%ecx,2)
+.L11:
+	leal -1(%esi),%edx
+	and %ebx,%edx
+	cmp $31,%edx
+	jnz .L12
+	addl $-1023,%ecx
+	test %esi,%ebx
+	jz  .L12
+	negl 20(%esp)
+.L12:
+	addl %esi,%ecx
+	addl (%esp),%edi
+	incl %ebx
+	cmpl $intwinbase_MMX,%edi
+	jz .L13
+	cmp $256,%ebx
+	jnz .L05
+	negl (%esp)
+	jmp .L05
+.L13:
+	popl %eax
+	
+	popl %ebx
+	popl %esi
+	popl %edi
+	ret
+
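The second pass of make_decode_tables_MMX builds the 16-bit window table decwins used by the MMX synth; the interesting step is the saturation around .L07/.L08. A rough C equivalent of that step (an approximation of the asm, not code from mpg123):

#include <stdint.h>

/* After 'imull scaleval' the 64-bit product sits in edx:eax; shrdl $17
 * drops 17 fraction bits, then the value is clamped to +/-32767 before
 * being stored (twice, mirrored) into decwins. */
static int16_t window_clip(int32_t winval, int32_t scaleval)
{
    int64_t prod = (int64_t)winval * scaleval;     /* imull */
    int32_t v    = (int32_t)(prod >> 17);          /* shrdl $17,%edx,%eax */
    if (v >  32767) v =  32767;
    if (v < -32767) v = -32767;
    return (int16_t)v;
}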
--- a/mp3lib/test2.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/test2.c	Fri Jun 29 17:55:35 2001 +0000
@@ -1,5 +1,5 @@
 
-// gcc test.c -I.. -L. -lMP3 -lm -o test2 -O4
+//gcc test2.c -O2 -I.. -L. ../libvo/aclib.c -lMP3 -lm -o test2
 
 #include <stdio.h>
 #include <stdlib.h>