changeset 1271:2864e32cd267

Finished 3DNow! optimization (in scalar mode) and made minor improvements
author nick
date Wed, 04 Jul 2001 09:47:56 +0000
parents 8a9fa696b77d
children 89e9625b3c7d
files mp3lib/dct64_3dnow.s mp3lib/dct64_MMX.s mp3lib/dct64_k7.s mp3lib/decode_MMX.s
diffstat 4 files changed, 761 insertions(+), 785 deletions(-)
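This changeset completes the conversion of mp3lib's DCT64 routines from x87 stack arithmetic to 3DNow! code: Phases 6 through 10, previously written with flds/fadds/fstps and fistp, now use scalar 3DNow! instructions (movd, pfadd, pfsub, pfmul, pf2id); the private cosine tables duplicated in dct64_3dnow.s and dct64_MMX.s are replaced by a single costab_mmx exported from decode_MMX.s; and the femms barrier moves from function entry to the shared exit path. As an illustration (not part of the patch itself), the recurring translation pattern turns an x87 load/add/store triple into an operation on the low dword of an MMX register, leaving the high lane unused:

	# x87 original
	flds   32(%ecx)          # st(0) = x
	fadds  36(%ecx)          # st(0) += y
	fstps  32(%edx)          # store result, pop the stack
	# scalar 3DNow! replacement
	movd   32(%ecx), %mm0    # low dword of mm0 = x
	pfadd  36(%ecx), %mm0    # packed add; only the low lane carries data
	movd   %mm0, 32(%edx)    # store the low dword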
--- a/mp3lib/dct64_3dnow.s	Wed Jul 04 07:00:15 2001 +0000
+++ b/mp3lib/dct64_3dnow.s	Wed Jul 04 09:47:56 2001 +0000
@@ -3,45 +3,10 @@
 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
 # Partial 3dnow! optimization by Nick Kurshev
 #
-# TODO: finish 3dnow! optimization at least in scalar mode
+# TODO: optimize scalar 3dnow! code
+# Warning: Phases 7 & 8 are not tested
 #
 
-.data
-	.align 8
-plus_minus_3dnow: .long 0x00000000, 0x80000000
-costab:
-	.long 1056974725
-	.long 1057056395
-	.long 1057223771
-	.long 1057485416
-	.long 1057855544
-	.long 1058356026
-	.long 1059019886
-	.long 1059897405
-	.long 1061067246
-	.long 1062657950
-	.long 1064892987
-	.long 1066774581
-	.long 1069414683
-	.long 1073984175
-	.long 1079645762
-	.long 1092815430
-	.long 1057005197
-	.long 1057342072
-	.long 1058087743
-	.long 1059427869
-	.long 1061799040
-	.long 1065862217
-	.long 1071413542
-	.long 1084439708
-	.long 1057128951
-	.long 1058664893
-	.long 1063675095
-	.long 1076102863
-	.long 1057655764
-	.long 1067924853
-	.long 1060439283
-
 .text
 
 	.align 16
@@ -53,14 +18,13 @@
 	pushl %edi
 	subl $256,%esp
 	movl 280(%esp),%eax
-
 	leal 128(%esp),%edx
 	movl 272(%esp),%esi
 	movl 276(%esp),%edi
-	movl $costab,%ebx
+	movl $costab_mmx,%ebx
 	orl %ecx,%ecx
 	movl %esp,%ecx
-	femms	
+
 /* Phase 1*/
 	movq	(%eax), %mm0
 	movq	8(%eax), %mm4
@@ -535,395 +499,419 @@
 	movd	%mm7, 120(%ecx)
 	
 /* Phase 6. This is the end of easy road. */
-	movl	$1, %eax
-	movd	%eax, %mm7
-	pi2fd	%mm7, %mm7
-	movq	32(%ecx), %mm0
-	punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */	
-	movq	%mm0, %mm1
-	movq	plus_minus_3dnow, %mm6
-	/* n.b.: pfpnacc */
-	pxor	%mm6, %mm1
-	pfacc	%mm1, %mm0
-	/**/
-	pfmul	%mm7, %mm0
-	movq	%mm0, 32(%edx)
-	femms
+/* The code below runs in scalar mode and should be optimized */
+
+	movd   32(%ecx), %mm0
+	pfadd  36(%ecx), %mm0
+	movd   %mm0, 32(%edx)
+
+	movd   32(%ecx), %mm0
+	pfsub  36(%ecx), %mm0
+	pfmul  120(%ebx),%mm0
+	movd   %mm0, 36(%edx)
 
-	flds   44(%ecx)
-	fsubs  40(%ecx)
-	fmuls 120(%ebx)
+	movd   44(%ecx), %mm0
+	pfsub  40(%ecx), %mm0
+	pfmul  120(%ebx),%mm0
 
-	fsts   44(%edx)
-	fadds  40(%ecx) /* pfacc 40(ecx), 56(%ecx) */
-	fadds  44(%ecx)
-	fstps  40(%edx)
+	movd   %mm0, 44(%edx)
+	pfadd  40(%ecx), %mm0
+	pfadd  44(%ecx), %mm0
+	movd   %mm0, 40(%edx)
+
+	movd   48(%ecx), %mm3
+	pfsub  52(%ecx), %mm3
+	pfmul 120(%ebx), %mm3
 
-	flds   48(%ecx)
-	fsubs  52(%ecx)
-	fmuls 120(%ebx)
+	movd   60(%ecx), %mm2
+	pfsub  56(%ecx), %mm2
+	pfmul 120(%ebx), %mm2
+	movq	%mm2, %mm1
 
-	flds   60(%ecx)
-	fsubs  56(%ecx)
-	fmuls 120(%ebx)
-
-	fld      %st(0)
-	fadds  56(%ecx)
-	fadds  60(%ecx)
+	pfadd  56(%ecx), %mm1
+	pfadd  60(%ecx), %mm1
+	movq	%mm1, %mm0
 
-	fld      %st(0)
-	fadds  48(%ecx)
-	fadds  52(%ecx)
-	fstps  48(%edx)
-	fadd     %st(2)
-	fstps  56(%edx)
-	fsts   60(%edx)
-	faddp    %st(1)
-	fstps  52(%edx)
+	pfadd  48(%ecx), %mm0
+	pfadd  52(%ecx), %mm0
+	movd   %mm0, 48(%edx)
+	pfadd  %mm3, %mm1
+	movd   %mm1, 56(%edx)
+	movd   %mm2, 60(%edx)
+	pfadd  %mm3, %mm2
+	movd   %mm2, 52(%edx)
+
 /*---*/
-	flds   64(%ecx)
-	fadds  68(%ecx)
-	fstps  64(%edx)
+	movd   64(%ecx), %mm0
+	pfadd  68(%ecx), %mm0
+	movd   %mm0, 64(%edx)
 
-	flds   64(%ecx)
-	fsubs  68(%ecx)
-	fmuls 120(%ebx)
-	fstps  68(%edx)
+	movd   64(%ecx), %mm0
+	pfsub  68(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd   %mm0, 68(%edx)
 
-	flds   76(%ecx)
-	fsubs  72(%ecx)
-	fmuls 120(%ebx)
-	fsts   76(%edx)
-	fadds  72(%ecx)
-	fadds  76(%ecx)
-	fstps  72(%edx)
-
-	flds   92(%ecx)
-	fsubs  88(%ecx)
-	fmuls 120(%ebx)
-	fsts   92(%edx)
-	fadds  92(%ecx)
-	fadds  88(%ecx)
+	movd   76(%ecx), %mm0
+	pfsub  72(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd   %mm0, 76(%edx)
+	pfadd  72(%ecx), %mm0
+	pfadd  76(%ecx), %mm0
+	movd   %mm0, 72(%edx)
 
-	fld      %st(0)
-	fadds  80(%ecx)
-	fadds  84(%ecx)
-	fstps  80(%edx)
+	movd   92(%ecx), %mm0
+	pfsub  88(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd   %mm0, 92(%edx)
+	pfadd  92(%ecx), %mm0
+	pfadd  88(%ecx), %mm0
+	movq   %mm0, %mm1
+	
+	pfadd  80(%ecx), %mm0
+	pfadd  84(%ecx), %mm0
+	movd   %mm0, 80(%edx)
 
-	flds   80(%ecx)
-	fsubs  84(%ecx)
-	fmuls 120(%ebx)
-	fadd  %st(0), %st(1)
-	fadds 92(%edx)
-	fstps 84(%edx)
-	fstps 88(%edx)
+	movd   80(%ecx), %mm0
+	pfsub  84(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	pfadd  %mm0, %mm1
+	pfadd  92(%edx), %mm0
+	movd   %mm0, 84(%edx)
+	movd   %mm1, 88(%edx)
 
-	flds   96(%ecx)
-	fadds 100(%ecx)
-	fstps  96(%edx)
+	movd   96(%ecx), %mm0
+	pfadd 100(%ecx), %mm0
+	movd   %mm0, 96(%edx)
 
-	flds   96(%ecx)
-	fsubs 100(%ecx)
-	fmuls 120(%ebx)
-	fstps 100(%edx)
+	movd   96(%ecx), %mm0
+	pfsub 100(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd  %mm0, 100(%edx)
 
-	flds  108(%ecx)
-	fsubs 104(%ecx)
-	fmuls 120(%ebx)
-	fsts  108(%edx)
-	fadds 104(%ecx)
-	fadds 108(%ecx)
-	fstps 104(%edx)
+	movd  108(%ecx), %mm0
+	pfsub 104(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd  %mm0, 108(%edx)
+	pfadd 104(%ecx), %mm0
+	pfadd 108(%ecx), %mm0
+	movd  %mm0, 104(%edx)
 
-	flds  124(%ecx)
-	fsubs 120(%ecx)
-	fmuls 120(%ebx)
-	fsts  124(%edx)
-	fadds 120(%ecx)
-	fadds 124(%ecx)
+	movd  124(%ecx), %mm0
+	pfsub 120(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd  %mm0, 124(%edx)
+	pfadd 120(%ecx), %mm0
+	pfadd 124(%ecx), %mm0
+	movq  %mm0, %mm1
 
-	fld      %st(0)
-	fadds 112(%ecx)
-	fadds 116(%ecx)
-	fstps 112(%edx)
+	pfadd 112(%ecx), %mm0
+	pfadd 116(%ecx), %mm0
+	movd  %mm0, 112(%edx)
 
-	flds  112(%ecx)
-	fsubs 116(%ecx)
-	fmuls 120(%ebx)
-	fadd  %st(0),%st(1)
-	fadds 124(%edx)
-	fstps 116(%edx)
-	fstps 120(%edx)
+	movd  112(%ecx), %mm0
+	pfsub 116(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	pfadd %mm0,%mm1
+	pfadd 124(%edx), %mm0
+	movd  %mm0, 116(%edx)
+	movd  %mm1, 120(%edx)
+
 	jnz .L01
 	
 /* Phase 7*/
+/* The code below runs in scalar mode and should be optimized */
 
-	flds      (%ecx)
-	fadds    4(%ecx)
-	fstps 1024(%esi)
+	movd      (%ecx), %mm0
+	pfadd    4(%ecx), %mm0
+	movd     %mm0, 1024(%esi)
 
-	flds      (%ecx)
-	fsubs    4(%ecx)
-	fmuls  120(%ebx)
-	fsts      (%esi)
-	fstps     (%edi)
+	movd      (%ecx), %mm0
+	pfsub    4(%ecx), %mm0
+	pfmul  120(%ebx), %mm0
+	movd      %mm0, (%esi)
+	movd      %mm0, (%edi)
 
-	flds   12(%ecx)
-	fsubs   8(%ecx)
-	fmuls 120(%ebx)
-	fsts  512(%edi)
-	fadds  12(%ecx)
-	fadds   8(%ecx)
-	fstps 512(%esi)
+	movd   12(%ecx), %mm0
+	pfsub   8(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd    %mm0, 512(%edi)
+	pfadd   12(%ecx), %mm0
+	pfadd   8(%ecx), %mm0
+	movd    %mm0, 512(%esi)
 
-	flds   16(%ecx)
-	fsubs  20(%ecx)
-	fmuls 120(%ebx)
+	movd   16(%ecx), %mm0
+	pfsub  20(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movq	%mm0, %mm3
 
-	flds   28(%ecx)
-	fsubs  24(%ecx)
-	fmuls 120(%ebx)
-	fsts  768(%edi)
-	fld      %st(0)
-	fadds  24(%ecx)
-	fadds  28(%ecx)
-	fld      %st(0)
-	fadds  16(%ecx)
-	fadds  20(%ecx)
-	fstps 768(%esi)
-	fadd     %st(2)
-	fstps 256(%esi)
-	faddp    %st(1)
-	fstps 256(%edi)
+	movd   28(%ecx), %mm0
+	pfsub  24(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd    %mm0, 768(%edi)
+	movq	%mm0, %mm2
+	
+	pfadd  24(%ecx), %mm0
+	pfadd  28(%ecx), %mm0
+	movq	%mm0, %mm1
+
+	pfadd  16(%ecx), %mm0
+	pfadd  20(%ecx), %mm0
+	movd   %mm0, 768(%esi)
+	pfadd  %mm3, %mm1
+	movd   %mm1, 256(%esi)
+	pfadd  %mm3, %mm2
+	movd   %mm2, 256(%edi)
 	
 /* Phase 8*/
 
-	flds   32(%edx)
-	fadds  48(%edx)
-	fstps 896(%esi)
-
-	flds   48(%edx)
-	fadds  40(%edx)
-	fstps 640(%esi)
-
-	flds   40(%edx)
-	fadds  56(%edx)
-	fstps 384(%esi)
-
-	flds   56(%edx)
-	fadds  36(%edx)
-	fstps 128(%esi)
-
-	flds   36(%edx)
-	fadds  52(%edx)
-	fstps 128(%edi)
+	movq   32(%edx), %mm0
+	movq   48(%edx), %mm1
+	pfadd  48(%edx), %mm0
+	pfadd  40(%edx), %mm1
+	movd   %mm0, 896(%esi)
+	movd   %mm1, 640(%esi)
+	psrlq  $32, %mm0
+	psrlq  $32, %mm1
+	movd   %mm0, 128(%edi)
+	movd   %mm1, 384(%edi)
 
-	flds   52(%edx)
-	fadds  44(%edx)
-	fstps 384(%edi)
-
-	flds   60(%edx)
-	fsts  896(%edi)
-	fadds  44(%edx)
-	fstps 640(%edi)
+	movd   40(%edx), %mm0
+	pfadd  56(%edx), %mm0
+	movd   %mm0, 384(%esi)
 
-	flds   96(%edx)
-	fadds 112(%edx)
-	fld      %st(0)
-	fadds  64(%edx)
-	fstps 960(%esi)
-	fadds  80(%edx)
-	fstps 832(%esi)
+	movd   56(%edx), %mm0
+	pfadd  36(%edx), %mm0
+	movd   %mm0, 128(%esi)
 
-	flds  112(%edx)
-	fadds 104(%edx)
-	fld      %st(0)
-	fadds  80(%edx)
-	fstps 704(%esi)
-	fadds  72(%edx)
-	fstps 576(%esi)
+	movd   60(%edx), %mm0
+	movd   %mm0, 896(%edi)
+	pfadd  44(%edx), %mm0
+	movd   %mm0, 640(%edi)
 
-	flds  104(%edx)
-	fadds 120(%edx)
-	fld      %st(0)
-	fadds  72(%edx)
-	fstps 448(%esi)
-	fadds  88(%edx)
-	fstps 320(%esi)
-
-	flds  120(%edx)
-	fadds 100(%edx)
-	fld      %st(0)
-	fadds  88(%edx)
-	fstps 192(%esi)
-	fadds  68(%edx)
-	fstps  64(%esi)
+	movq   96(%edx), %mm0
+	movq   112(%edx), %mm2
+	movq   104(%edx), %mm4
+	pfadd  112(%edx), %mm0
+	pfadd  104(%edx), %mm2
+	pfadd  120(%edx), %mm4
+	movq   %mm0, %mm1
+	movq   %mm2, %mm3
+	movq   %mm4, %mm5
+	pfadd  64(%edx), %mm0
+	pfadd  80(%edx), %mm2
+	pfadd  72(%edx), %mm4
+	movd   %mm0, 960(%esi)
+	movd   %mm2, 704(%esi)
+	movd   %mm4, 448(%esi)
+	psrlq  $32, %mm0
+	psrlq  $32, %mm2
+	psrlq  $32, %mm4
+	movd   %mm0, 64(%edi)
+	movd   %mm2, 320(%edi)
+	movd   %mm4, 576(%edi)
+	pfadd  80(%edx), %mm1
+	pfadd  72(%edx), %mm3
+	pfadd  88(%edx), %mm5
+	movd   %mm1, 832(%esi)
+	movd   %mm3, 576(%esi)
+	movd   %mm5, 320(%esi)
+	psrlq  $32, %mm1
+	psrlq  $32, %mm3
+	psrlq  $32, %mm5
+	movd   %mm1, 192(%edi)
+	movd   %mm3, 448(%edi)
+	movd   %mm5, 704(%edi)
 
-	flds  100(%edx)
-	fadds 116(%edx)
-	fld      %st(0)
-	fadds  68(%edx)
-	fstps  64(%edi)
-	fadds  84(%edx)
-	fstps 192(%edi)
+	movd   120(%edx), %mm0
+	pfadd  100(%edx), %mm0
+	movq   %mm0, %mm1
+	pfadd  88(%edx), %mm0
+	movd   %mm0, 192(%esi)
+	pfadd  68(%edx), %mm1
+	movd   %mm1, 64(%esi)
 
-	flds  116(%edx)
-	fadds 108(%edx)
-	fld      %st(0)
-	fadds  84(%edx)
-	fstps 320(%edi)
-	fadds  76(%edx)
-	fstps 448(%edi)
+	movd  124(%edx), %mm0
+	movd  %mm0, 960(%edi)
+	pfadd  92(%edx), %mm0
+	movd  %mm0, 832(%edi)
 
-	flds  108(%edx)
-	fadds 124(%edx)
-	fld      %st(0)
-	fadds  76(%edx)
-	fstps 576(%edi)
-	fadds  92(%edx)
-	fstps 704(%edi)
-
-	flds  124(%edx)
-	fsts  960(%edi)
-	fadds  92(%edx)
-	fstps 832(%edi)
 	jmp	.L_bye
 .L01:	
 /* Phase 9*/
-
-	flds      (%ecx)
-	fadds    4(%ecx)
-	fistp  512(%esi)
-
-	flds      (%ecx)
-	fsubs    4(%ecx)
-	fmuls  120(%ebx)
+	movd      (%ecx), %mm0
+	pfadd    4(%ecx), %mm0
+	pf2id    %mm0, %mm0
+	movd	 %mm0, %eax
+	movw     %ax, 512(%esi)
 
-	fistp     (%esi)
-
+	movd      (%ecx), %mm0
+	pfsub    4(%ecx), %mm0
+	pfmul  120(%ebx), %mm0
+	pf2id    %mm0, %mm0
+	movd	 %mm0, %eax
+	movw     %ax, (%esi)
 
-	flds    12(%ecx)
-	fsubs    8(%ecx)
-	fmuls  120(%ebx)
-	fist   256(%edi)
-	fadds   12(%ecx)
-	fadds    8(%ecx)
-	fistp  256(%esi)
+	movd    12(%ecx), %mm0
+	pfsub    8(%ecx), %mm0
+	pfmul  120(%ebx), %mm0
+	pf2id    %mm0, %mm7
+	movd	 %mm7, %eax
+	movw     %ax, 256(%edi)
+	pfadd   12(%ecx), %mm0
+	pfadd    8(%ecx), %mm0
+	pf2id    %mm0, %mm0
+	movd	 %mm0, %eax
+	movw     %ax, 256(%esi)
 
-	flds   16(%ecx)
-	fsubs  20(%ecx)
-	fmuls 120(%ebx)
+	movd   16(%ecx), %mm0
+	pfsub  20(%ecx), %mm0
+	pfmul  120(%ebx), %mm0
+	movq   %mm0, %mm3
 
-	flds   28(%ecx)
-	fsubs  24(%ecx)
-	fmuls 120(%ebx)
-	fist  384(%edi)
-	fld      %st(0)
-	fadds  24(%ecx)
-	fadds  28(%ecx)
-	fld      %st(0)
-	fadds  16(%ecx)
-	fadds  20(%ecx)
-	fistp  384(%esi)
-	fadd     %st(2)
-	fistp  128(%esi)
-	faddp    %st(1)
-	fistp  128(%edi)
+	movd   28(%ecx), %mm0
+	pfsub  24(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	pf2id  %mm0, %mm7
+	movd   %mm7, %eax
+	movw   %ax, 384(%edi)
+	movq   %mm0, %mm2
+	
+	pfadd  24(%ecx), %mm0
+	pfadd  28(%ecx), %mm0
+	movq   %mm0, %mm1
+	pfadd  16(%ecx), %mm0
+	pfadd  20(%ecx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 384(%esi)
+	pfadd  %mm3, %mm1
+	pf2id  %mm1, %mm1
+	movd   %mm1, %eax
+	movw   %ax, 128(%esi)
+	pfadd  %mm3, %mm2
+	pf2id  %mm2, %mm2
+	movd   %mm2, %eax
+	movw   %ax, 128(%edi)
+	
 	
 /* Phase 10*/
 
-	flds    32(%edx)
-	fadds   48(%edx)
-	fistp  448(%esi)
-
-	flds   48(%edx)
-	fadds  40(%edx)
-	fistp 320(%esi)
-
-	flds   40(%edx)
-	fadds  56(%edx)
-	fistp 192(%esi)
-
-	flds   56(%edx)
-	fadds  36(%edx)
-	fistp  64(%esi)
-
-	flds   36(%edx)
-	fadds  52(%edx)
-	fistp  64(%edi)
+	movq    32(%edx), %mm0
+	movq    48(%edx), %mm1
+	pfadd   48(%edx), %mm0
+	pfadd   40(%edx), %mm1
+	pf2id   %mm0, %mm0
+	pf2id   %mm1, %mm1
+	movd	%mm0, %eax
+	movd	%mm1, %ecx
+	movw    %ax, 448(%esi)
+	movw    %cx, 320(%esi)
+	psrlq   $32, %mm0
+	psrlq   $32, %mm1
+	movd	%mm0, %eax
+	movd	%mm1, %ecx
+	movw    %ax, 64(%edi)
+	movw    %cx, 192(%edi)
 
-	flds   52(%edx)
-	fadds  44(%edx)
-	fistp 192(%edi)
-
-	flds   60(%edx)
-	fist   448(%edi)
-	fadds  44(%edx)
-	fistp 320(%edi)
+	movd   40(%edx), %mm0
+	pfadd  56(%edx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 192(%esi)
 
-	flds   96(%edx)
-	fadds 112(%edx)
-	fld      %st(0)
-	fadds  64(%edx)
-	fistp 480(%esi)
-	fadds  80(%edx)
-	fistp 416(%esi)
+	movd   56(%edx), %mm0
+	pfadd  36(%edx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 64(%esi)
 
-	flds  112(%edx)
-	fadds 104(%edx)
-	fld      %st(0)
-	fadds  80(%edx)
-	fistp 352(%esi)
-	fadds  72(%edx)
-	fistp 288(%esi)
+	movd   60(%edx), %mm0
+	pf2id  %mm0, %mm7
+	movd   %mm7, %eax
+	movw   %ax, 448(%edi)
+	pfadd  44(%edx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 320(%edi)
 
-	flds  104(%edx)
-	fadds 120(%edx)
-	fld      %st(0)
-	fadds  72(%edx)
-	fistp 224(%esi)
-	fadds  88(%edx)
-	fistp 160(%esi)
-
-	flds  120(%edx)
-	fadds 100(%edx)
-	fld      %st(0)
-	fadds  88(%edx)
-	fistp  96(%esi)
-	fadds  68(%edx)
-	fistp  32(%esi)
+	movq   96(%edx), %mm0
+	movq  112(%edx), %mm2
+	movq  104(%edx), %mm4
+	pfadd 112(%edx), %mm0
+	pfadd 104(%edx), %mm2
+	pfadd 120(%edx), %mm4
+	movq  %mm0, %mm1
+	movq  %mm2, %mm3
+	movq  %mm4, %mm5
+	pfadd  64(%edx), %mm0
+	pfadd  80(%edx), %mm2
+	pfadd  72(%edx), %mm4
+	pf2id  %mm0, %mm7
+	pf2id  %mm2, %mm6
+	pf2id  %mm4, %mm4
+	movd   %mm7, %eax
+	movd   %mm6, %ecx
+	movd   %mm4, %ebx
+	movw   %ax, 480(%esi)
+	movw   %cx, 352(%esi)
+	movw   %bx, 224(%esi)
+	psrlq  $32, %mm7
+	psrlq  $32, %mm6
+	psrlq  $32, %mm4
+	movd   %mm7, %eax
+	movd   %mm6, %ecx
+	movd   %mm4, %ebx
+	movw   %ax, 32(%edi)
+	movw   %cx, 160(%edi)
+	movw   %bx, 288(%edi)
+	pfadd  80(%edx), %mm1
+	pfadd  72(%edx), %mm3
+	pfadd  88(%edx), %mm5
+	pf2id  %mm1, %mm1
+	pf2id  %mm3, %mm3
+	pf2id  %mm5, %mm5
+	movd   %mm1, %eax
+	movd   %mm3, %ecx
+	movd   %mm5, %ebx
+	movw   %ax, 416(%esi)
+	movw   %cx, 288(%esi)
+	movw   %bx, 160(%esi)
+	psrlq  $32, %mm1
+	psrlq  $32, %mm3
+	psrlq  $32, %mm5
+	movd   %mm1, %eax
+	movd   %mm3, %ecx
+	movd   %mm5, %ebx
+	movw   %ax, 96(%edi)
+	movw   %cx, 224(%edi)
+	movw   %bx, 352(%edi)
 
-	flds  100(%edx)
-	fadds 116(%edx)
-	fld      %st(0)
-	fadds  68(%edx)
-	fistp  32(%edi)
-	fadds  84(%edx)
-	fistp  96(%edi)
+	movd  120(%edx), %mm0
+	pfadd 100(%edx), %mm0
+	movq  %mm0, %mm1
+	pfadd  88(%edx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 96(%esi)
+	pfadd  68(%edx), %mm1
+	pf2id  %mm1, %mm1
+	movd   %mm1, %eax
+	movw   %ax, 32(%esi)
 
-	flds  116(%edx)
-	fadds 108(%edx)
-	fld      %st(0)
-	fadds  84(%edx)
-	fistp 160(%edi)
-	fadds  76(%edx)
-	fistp 224(%edi)
+	movq  124(%edx), %mm0
+	pf2id  %mm0, %mm1
+	movd   %mm1, %eax
+	movw   %ax, 480(%edi)
+	pfadd  92(%edx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 416(%edi)
 
-	flds  108(%edx)
-	fadds 124(%edx)
-	fld      %st(0)
-	fadds  76(%edx)
-	fistp 288(%edi)
-	fadds  92(%edx)
-	fistp 352(%edi)
+	movsw
 
-	flds  124(%edx)
-	fist  480(%edi)
-	fadds  92(%edx)
-	fistp 416(%edi)
-	movsw
 .L_bye:
 	addl $256,%esp
+	femms
 	popl %edi
 	popl %esi
 	popl %ebx
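One behavioral note on the integer phases above (Phases 9 and 10, the 16-bit sample output path): the old code stored through fist/fistp, which rounds according to the x87 control word (round-to-nearest by default), whereas pf2id truncates toward zero, so converted samples can differ from the x87 results by one LSB. The new store pattern, with the offsets used at the top of Phase 9:

	movd   (%ecx), %mm0      # load one float into the low lane
	pfadd  4(%ecx), %mm0     # butterfly sum
	pf2id  %mm0, %mm0        # float -> 32-bit integer, truncating toward zero
	movd   %mm0, %eax        # move through a general-purpose register
	movw   %ax, 512(%esi)    # keep only the low 16 bits of the sample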
--- a/mp3lib/dct64_MMX.s	Wed Jul 04 07:00:15 2001 +0000
+++ b/mp3lib/dct64_MMX.s	Wed Jul 04 09:47:56 2001 +0000
@@ -1,41 +1,7 @@
 # This code was taken from http://www.mpg123.org
 # See ChangeLog of mpg123-0.59s-pre.1 for detail
 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
-
-.data
-	.align 4
-costab:
-	.long 1056974725
-	.long 1057056395
-	.long 1057223771
-	.long 1057485416
-	.long 1057855544
-	.long 1058356026
-	.long 1059019886
-	.long 1059897405
-	.long 1061067246
-	.long 1062657950
-	.long 1064892987
-	.long 1066774581
-	.long 1069414683
-	.long 1073984175
-	.long 1079645762
-	.long 1092815430
-	.long 1057005197
-	.long 1057342072
-	.long 1058087743
-	.long 1059427869
-	.long 1061799040
-	.long 1065862217
-	.long 1071413542
-	.long 1084439708
-	.long 1057128951
-	.long 1058664893
-	.long 1063675095
-	.long 1076102863
-	.long 1057655764
-	.long 1067924853
-	.long 1060439283
+#
 
 .text
 
@@ -57,7 +23,7 @@
 	movl 276(%esp),%edi
 
 	flds    4(%eax)
-	movl $costab,%ebx
+	movl $costab_mmx,%ebx
 	fadds 120(%eax)
 	orl %ecx,%ecx
 	fstps   4(%edx)
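dct64_MMX.s keeps its x87 arithmetic; apart from the header comment, its only change is the same table deduplication. Both DCT files previously carried identical private 31-entry cosine tables, which wastes cache and invites divergence, so the patch deletes both copies and points %ebx at the single table now exported from decode_MMX.s. The linkage pattern, condensed from the three files:

	# decode_MMX.s exports the one remaining copy
	.data
	.align 8
	.globl costab_mmx
	costab_mmx:
		.long 1056974725 # first of 31 entries
	# dct64_MMX.s / dct64_3dnow.s / dct64_k7.s reference it
	movl $costab_mmx,%ebx    # %ebx = table base for the whole routine
	fmuls 120(%ebx)          # last entry, byte offset 120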
--- a/mp3lib/dct64_k7.s	Wed Jul 04 07:00:15 2001 +0000
+++ b/mp3lib/dct64_k7.s	Wed Jul 04 09:47:56 2001 +0000
@@ -3,45 +3,10 @@
 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
 # Partial 3dnowex-DSP! optimization by Nick Kurshev
 #
-# TODO: finish 3dnow! optimization at least in scalar mode
+# TODO: optimize scalar 3dnow! code
+# Warning: Phases 7 & 8 are not tested
 #
 
-.data
-	.align 8
-plus_minus_3dnow: .long 0x00000000, 0x80000000
-costab:
-	.long 1056974725
-	.long 1057056395
-	.long 1057223771
-	.long 1057485416
-	.long 1057855544
-	.long 1058356026
-	.long 1059019886
-	.long 1059897405
-	.long 1061067246
-	.long 1062657950
-	.long 1064892987
-	.long 1066774581
-	.long 1069414683
-	.long 1073984175
-	.long 1079645762
-	.long 1092815430
-	.long 1057005197
-	.long 1057342072
-	.long 1058087743
-	.long 1059427869
-	.long 1061799040
-	.long 1065862217
-	.long 1071413542
-	.long 1084439708
-	.long 1057128951
-	.long 1058664893
-	.long 1063675095
-	.long 1076102863
-	.long 1057655764
-	.long 1067924853
-	.long 1060439283
-
 .text
 
 	.align 16
@@ -57,10 +22,10 @@
 	leal 128(%esp),%edx
 	movl 272(%esp),%esi
 	movl 276(%esp),%edi
-	movl $costab,%ebx
+	movl $costab_mmx,%ebx
 	orl %ecx,%ecx
 	movl %esp,%ecx
-	femms	
+
 /* Phase 1*/
 	movq	(%eax), %mm0
 	movq	8(%eax), %mm4
@@ -407,395 +372,419 @@
 	movq	%mm7, 120(%ecx)
 	
 /* Phase 6. This is the end of easy road. */
-	movl	$1, %eax
-	movd	%eax, %mm7
-	pi2fd	%mm7, %mm7
-	movq	32(%ecx), %mm0
-	punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */	
-	movq	%mm0, %mm1
-	movq	plus_minus_3dnow, %mm6
-	/* n.b.: pfpnacc */
-	pxor	%mm6, %mm1
-	pfacc	%mm1, %mm0
-	/**/
-	pfmul	%mm7, %mm0
-	movq	%mm0, 32(%edx)
-	femms
+/* The code below runs in scalar mode and should be optimized */
+
+	movd   32(%ecx), %mm0
+	pfadd  36(%ecx), %mm0
+	movd   %mm0, 32(%edx)
+
+	movd   32(%ecx), %mm0
+	pfsub  36(%ecx), %mm0
+	pfmul  120(%ebx),%mm0
+	movd   %mm0, 36(%edx)
 
-	flds   44(%ecx)
-	fsubs  40(%ecx)
-	fmuls 120(%ebx)
+	movd   44(%ecx), %mm0
+	pfsub  40(%ecx), %mm0
+	pfmul  120(%ebx),%mm0
 
-	fsts   44(%edx)
-	fadds  40(%ecx) /* pfacc 40(ecx), 56(%ecx) */
-	fadds  44(%ecx)
-	fstps  40(%edx)
+	movd   %mm0, 44(%edx)
+	pfadd  40(%ecx), %mm0
+	pfadd  44(%ecx), %mm0
+	movd   %mm0, 40(%edx)
+
+	movd   48(%ecx), %mm3
+	pfsub  52(%ecx), %mm3
+	pfmul 120(%ebx), %mm3
 
-	flds   48(%ecx)
-	fsubs  52(%ecx)
-	fmuls 120(%ebx)
+	movd   60(%ecx), %mm2
+	pfsub  56(%ecx), %mm2
+	pfmul 120(%ebx), %mm2
+	movq	%mm2, %mm1
 
-	flds   60(%ecx)
-	fsubs  56(%ecx)
-	fmuls 120(%ebx)
-
-	fld      %st(0)
-	fadds  56(%ecx)
-	fadds  60(%ecx)
+	pfadd  56(%ecx), %mm1
+	pfadd  60(%ecx), %mm1
+	movq	%mm1, %mm0
 
-	fld      %st(0)
-	fadds  48(%ecx)
-	fadds  52(%ecx)
-	fstps  48(%edx)
-	fadd     %st(2)
-	fstps  56(%edx)
-	fsts   60(%edx)
-	faddp    %st(1)
-	fstps  52(%edx)
+	pfadd  48(%ecx), %mm0
+	pfadd  52(%ecx), %mm0
+	movd   %mm0, 48(%edx)
+	pfadd  %mm3, %mm1
+	movd   %mm1, 56(%edx)
+	movd   %mm2, 60(%edx)
+	pfadd  %mm3, %mm2
+	movd   %mm2, 52(%edx)
+
 /*---*/
-	flds   64(%ecx)
-	fadds  68(%ecx)
-	fstps  64(%edx)
+	movd   64(%ecx), %mm0
+	pfadd  68(%ecx), %mm0
+	movd   %mm0, 64(%edx)
 
-	flds   64(%ecx)
-	fsubs  68(%ecx)
-	fmuls 120(%ebx)
-	fstps  68(%edx)
+	movd   64(%ecx), %mm0
+	pfsub  68(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd   %mm0, 68(%edx)
 
-	flds   76(%ecx)
-	fsubs  72(%ecx)
-	fmuls 120(%ebx)
-	fsts   76(%edx)
-	fadds  72(%ecx)
-	fadds  76(%ecx)
-	fstps  72(%edx)
-
-	flds   92(%ecx)
-	fsubs  88(%ecx)
-	fmuls 120(%ebx)
-	fsts   92(%edx)
-	fadds  92(%ecx)
-	fadds  88(%ecx)
+	movd   76(%ecx), %mm0
+	pfsub  72(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd   %mm0, 76(%edx)
+	pfadd  72(%ecx), %mm0
+	pfadd  76(%ecx), %mm0
+	movd   %mm0, 72(%edx)
 
-	fld      %st(0)
-	fadds  80(%ecx)
-	fadds  84(%ecx)
-	fstps  80(%edx)
+	movd   92(%ecx), %mm0
+	pfsub  88(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd   %mm0, 92(%edx)
+	pfadd  92(%ecx), %mm0
+	pfadd  88(%ecx), %mm0
+	movq   %mm0, %mm1
+	
+	pfadd  80(%ecx), %mm0
+	pfadd  84(%ecx), %mm0
+	movd   %mm0, 80(%edx)
 
-	flds   80(%ecx)
-	fsubs  84(%ecx)
-	fmuls 120(%ebx)
-	fadd  %st(0), %st(1)
-	fadds 92(%edx)
-	fstps 84(%edx)
-	fstps 88(%edx)
+	movd   80(%ecx), %mm0
+	pfsub  84(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	pfadd  %mm0, %mm1
+	pfadd  92(%edx), %mm0
+	movd   %mm0, 84(%edx)
+	movd   %mm1, 88(%edx)
 
-	flds   96(%ecx)
-	fadds 100(%ecx)
-	fstps  96(%edx)
+	movd   96(%ecx), %mm0
+	pfadd 100(%ecx), %mm0
+	movd   %mm0, 96(%edx)
 
-	flds   96(%ecx)
-	fsubs 100(%ecx)
-	fmuls 120(%ebx)
-	fstps 100(%edx)
+	movd   96(%ecx), %mm0
+	pfsub 100(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd  %mm0, 100(%edx)
 
-	flds  108(%ecx)
-	fsubs 104(%ecx)
-	fmuls 120(%ebx)
-	fsts  108(%edx)
-	fadds 104(%ecx)
-	fadds 108(%ecx)
-	fstps 104(%edx)
+	movd  108(%ecx), %mm0
+	pfsub 104(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd  %mm0, 108(%edx)
+	pfadd 104(%ecx), %mm0
+	pfadd 108(%ecx), %mm0
+	movd  %mm0, 104(%edx)
 
-	flds  124(%ecx)
-	fsubs 120(%ecx)
-	fmuls 120(%ebx)
-	fsts  124(%edx)
-	fadds 120(%ecx)
-	fadds 124(%ecx)
+	movd  124(%ecx), %mm0
+	pfsub 120(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd  %mm0, 124(%edx)
+	pfadd 120(%ecx), %mm0
+	pfadd 124(%ecx), %mm0
+	movq  %mm0, %mm1
 
-	fld      %st(0)
-	fadds 112(%ecx)
-	fadds 116(%ecx)
-	fstps 112(%edx)
+	pfadd 112(%ecx), %mm0
+	pfadd 116(%ecx), %mm0
+	movd  %mm0, 112(%edx)
 
-	flds  112(%ecx)
-	fsubs 116(%ecx)
-	fmuls 120(%ebx)
-	fadd  %st(0),%st(1)
-	fadds 124(%edx)
-	fstps 116(%edx)
-	fstps 120(%edx)
+	movd  112(%ecx), %mm0
+	pfsub 116(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	pfadd %mm0,%mm1
+	pfadd 124(%edx), %mm0
+	movd  %mm0, 116(%edx)
+	movd  %mm1, 120(%edx)
+
 	jnz .L01
 	
 /* Phase 7*/
+/* The code below runs in scalar mode and should be optimized */
 
-	flds      (%ecx)
-	fadds    4(%ecx)
-	fstps 1024(%esi)
+	movd      (%ecx), %mm0
+	pfadd    4(%ecx), %mm0
+	movd     %mm0, 1024(%esi)
 
-	flds      (%ecx)
-	fsubs    4(%ecx)
-	fmuls  120(%ebx)
-	fsts      (%esi)
-	fstps     (%edi)
+	movd      (%ecx), %mm0
+	pfsub    4(%ecx), %mm0
+	pfmul  120(%ebx), %mm0
+	movd      %mm0, (%esi)
+	movd      %mm0, (%edi)
 
-	flds   12(%ecx)
-	fsubs   8(%ecx)
-	fmuls 120(%ebx)
-	fsts  512(%edi)
-	fadds  12(%ecx)
-	fadds   8(%ecx)
-	fstps 512(%esi)
+	movd   12(%ecx), %mm0
+	pfsub   8(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd    %mm0, 512(%edi)
+	pfadd   12(%ecx), %mm0
+	pfadd   8(%ecx), %mm0
+	movd    %mm0, 512(%esi)
 
-	flds   16(%ecx)
-	fsubs  20(%ecx)
-	fmuls 120(%ebx)
+	movd   16(%ecx), %mm0
+	pfsub  20(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movq	%mm0, %mm3
 
-	flds   28(%ecx)
-	fsubs  24(%ecx)
-	fmuls 120(%ebx)
-	fsts  768(%edi)
-	fld      %st(0)
-	fadds  24(%ecx)
-	fadds  28(%ecx)
-	fld      %st(0)
-	fadds  16(%ecx)
-	fadds  20(%ecx)
-	fstps 768(%esi)
-	fadd     %st(2)
-	fstps 256(%esi)
-	faddp    %st(1)
-	fstps 256(%edi)
+	movd   28(%ecx), %mm0
+	pfsub  24(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	movd    %mm0, 768(%edi)
+	movq	%mm0, %mm2
+	
+	pfadd  24(%ecx), %mm0
+	pfadd  28(%ecx), %mm0
+	movq	%mm0, %mm1
+
+	pfadd  16(%ecx), %mm0
+	pfadd  20(%ecx), %mm0
+	movd   %mm0, 768(%esi)
+	pfadd  %mm3, %mm1
+	movd   %mm1, 256(%esi)
+	pfadd  %mm3, %mm2
+	movd   %mm2, 256(%edi)
 	
 /* Phase 8*/
 
-	flds   32(%edx)
-	fadds  48(%edx)
-	fstps 896(%esi)
-
-	flds   48(%edx)
-	fadds  40(%edx)
-	fstps 640(%esi)
-
-	flds   40(%edx)
-	fadds  56(%edx)
-	fstps 384(%esi)
-
-	flds   56(%edx)
-	fadds  36(%edx)
-	fstps 128(%esi)
-
-	flds   36(%edx)
-	fadds  52(%edx)
-	fstps 128(%edi)
+	movq   32(%edx), %mm0
+	movq   48(%edx), %mm1
+	pfadd  48(%edx), %mm0
+	pfadd  40(%edx), %mm1
+	movd   %mm0, 896(%esi)
+	movd   %mm1, 640(%esi)
+	psrlq  $32, %mm0
+	psrlq  $32, %mm1
+	movd   %mm0, 128(%edi)
+	movd   %mm1, 384(%edi)
 
-	flds   52(%edx)
-	fadds  44(%edx)
-	fstps 384(%edi)
-
-	flds   60(%edx)
-	fsts  896(%edi)
-	fadds  44(%edx)
-	fstps 640(%edi)
+	movd   40(%edx), %mm0
+	pfadd  56(%edx), %mm0
+	movd   %mm0, 384(%esi)
 
-	flds   96(%edx)
-	fadds 112(%edx)
-	fld      %st(0)
-	fadds  64(%edx)
-	fstps 960(%esi)
-	fadds  80(%edx)
-	fstps 832(%esi)
+	movd   56(%edx), %mm0
+	pfadd  36(%edx), %mm0
+	movd   %mm0, 128(%esi)
 
-	flds  112(%edx)
-	fadds 104(%edx)
-	fld      %st(0)
-	fadds  80(%edx)
-	fstps 704(%esi)
-	fadds  72(%edx)
-	fstps 576(%esi)
+	movd   60(%edx), %mm0
+	movd   %mm0, 896(%edi)
+	pfadd  44(%edx), %mm0
+	movd   %mm0, 640(%edi)
 
-	flds  104(%edx)
-	fadds 120(%edx)
-	fld      %st(0)
-	fadds  72(%edx)
-	fstps 448(%esi)
-	fadds  88(%edx)
-	fstps 320(%esi)
-
-	flds  120(%edx)
-	fadds 100(%edx)
-	fld      %st(0)
-	fadds  88(%edx)
-	fstps 192(%esi)
-	fadds  68(%edx)
-	fstps  64(%esi)
+	movq   96(%edx), %mm0
+	movq   112(%edx), %mm2
+	movq   104(%edx), %mm4
+	pfadd  112(%edx), %mm0
+	pfadd  104(%edx), %mm2
+	pfadd  120(%edx), %mm4
+	movq   %mm0, %mm1
+	movq   %mm2, %mm3
+	movq   %mm4, %mm5
+	pfadd  64(%edx), %mm0
+	pfadd  80(%edx), %mm2
+	pfadd  72(%edx), %mm4
+	movd   %mm0, 960(%esi)
+	movd   %mm2, 704(%esi)
+	movd   %mm4, 448(%esi)
+	psrlq  $32, %mm0
+	psrlq  $32, %mm2
+	psrlq  $32, %mm4
+	movd   %mm0, 64(%edi)
+	movd   %mm2, 320(%edi)
+	movd   %mm4, 576(%edi)
+	pfadd  80(%edx), %mm1
+	pfadd  72(%edx), %mm3
+	pfadd  88(%edx), %mm5
+	movd   %mm1, 832(%esi)
+	movd   %mm3, 576(%esi)
+	movd   %mm5, 320(%esi)
+	psrlq  $32, %mm1
+	psrlq  $32, %mm3
+	psrlq  $32, %mm5
+	movd   %mm1, 192(%edi)
+	movd   %mm3, 448(%edi)
+	movd   %mm5, 704(%edi)
 
-	flds  100(%edx)
-	fadds 116(%edx)
-	fld      %st(0)
-	fadds  68(%edx)
-	fstps  64(%edi)
-	fadds  84(%edx)
-	fstps 192(%edi)
+	movd   120(%edx), %mm0
+	pfadd  100(%edx), %mm0
+	movq   %mm0, %mm1
+	pfadd  88(%edx), %mm0
+	movd   %mm0, 192(%esi)
+	pfadd  68(%edx), %mm1
+	movd   %mm1, 64(%esi)
 
-	flds  116(%edx)
-	fadds 108(%edx)
-	fld      %st(0)
-	fadds  84(%edx)
-	fstps 320(%edi)
-	fadds  76(%edx)
-	fstps 448(%edi)
+	movd  124(%edx), %mm0
+	movd  %mm0, 960(%edi)
+	pfadd  92(%edx), %mm0
+	movd  %mm0, 832(%edi)
 
-	flds  108(%edx)
-	fadds 124(%edx)
-	fld      %st(0)
-	fadds  76(%edx)
-	fstps 576(%edi)
-	fadds  92(%edx)
-	fstps 704(%edi)
-
-	flds  124(%edx)
-	fsts  960(%edi)
-	fadds  92(%edx)
-	fstps 832(%edi)
 	jmp	.L_bye
 .L01:	
 /* Phase 9*/
-
-	flds      (%ecx)
-	fadds    4(%ecx)
-	fistp  512(%esi)
-
-	flds      (%ecx)
-	fsubs    4(%ecx)
-	fmuls  120(%ebx)
+	movd      (%ecx), %mm0
+	pfadd    4(%ecx), %mm0
+	pf2id    %mm0, %mm0
+	movd	 %mm0, %eax
+	movw     %ax, 512(%esi)
 
-	fistp     (%esi)
-
+	movd      (%ecx), %mm0
+	pfsub    4(%ecx), %mm0
+	pfmul  120(%ebx), %mm0
+	pf2id    %mm0, %mm0
+	movd	 %mm0, %eax
+	movw     %ax, (%esi)
 
-	flds    12(%ecx)
-	fsubs    8(%ecx)
-	fmuls  120(%ebx)
-	fist   256(%edi)
-	fadds   12(%ecx)
-	fadds    8(%ecx)
-	fistp  256(%esi)
+	movd    12(%ecx), %mm0
+	pfsub    8(%ecx), %mm0
+	pfmul  120(%ebx), %mm0
+	pf2id    %mm0, %mm7
+	movd	 %mm7, %eax
+	movw     %ax, 256(%edi)
+	pfadd   12(%ecx), %mm0
+	pfadd    8(%ecx), %mm0
+	pf2id    %mm0, %mm0
+	movd	 %mm0, %eax
+	movw     %ax, 256(%esi)
 
-	flds   16(%ecx)
-	fsubs  20(%ecx)
-	fmuls 120(%ebx)
+	movd   16(%ecx), %mm0
+	pfsub  20(%ecx), %mm0
+	pfmul  120(%ebx), %mm0
+	movq   %mm0, %mm3
 
-	flds   28(%ecx)
-	fsubs  24(%ecx)
-	fmuls 120(%ebx)
-	fist  384(%edi)
-	fld      %st(0)
-	fadds  24(%ecx)
-	fadds  28(%ecx)
-	fld      %st(0)
-	fadds  16(%ecx)
-	fadds  20(%ecx)
-	fistp  384(%esi)
-	fadd     %st(2)
-	fistp  128(%esi)
-	faddp    %st(1)
-	fistp  128(%edi)
+	movd   28(%ecx), %mm0
+	pfsub  24(%ecx), %mm0
+	pfmul 120(%ebx), %mm0
+	pf2id  %mm0, %mm7
+	movd   %mm7, %eax
+	movw   %ax, 384(%edi)
+	movq   %mm0, %mm2
+	
+	pfadd  24(%ecx), %mm0
+	pfadd  28(%ecx), %mm0
+	movq   %mm0, %mm1
+	pfadd  16(%ecx), %mm0
+	pfadd  20(%ecx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 384(%esi)
+	pfadd  %mm3, %mm1
+	pf2id  %mm1, %mm1
+	movd   %mm1, %eax
+	movw   %ax, 128(%esi)
+	pfadd  %mm3, %mm2
+	pf2id  %mm2, %mm2
+	movd   %mm2, %eax
+	movw   %ax, 128(%edi)
+	
 	
 /* Phase 10*/
 
-	flds    32(%edx)
-	fadds   48(%edx)
-	fistp  448(%esi)
-
-	flds   48(%edx)
-	fadds  40(%edx)
-	fistp 320(%esi)
-
-	flds   40(%edx)
-	fadds  56(%edx)
-	fistp 192(%esi)
-
-	flds   56(%edx)
-	fadds  36(%edx)
-	fistp  64(%esi)
-
-	flds   36(%edx)
-	fadds  52(%edx)
-	fistp  64(%edi)
+	movq    32(%edx), %mm0
+	movq    48(%edx), %mm1
+	pfadd   48(%edx), %mm0
+	pfadd   40(%edx), %mm1
+	pf2id   %mm0, %mm0
+	pf2id   %mm1, %mm1
+	movd	%mm0, %eax
+	movd	%mm1, %ecx
+	movw    %ax, 448(%esi)
+	movw    %cx, 320(%esi)
+	psrlq   $32, %mm0
+	psrlq   $32, %mm1
+	movd	%mm0, %eax
+	movd	%mm1, %ecx
+	movw    %ax, 64(%edi)
+	movw    %cx, 192(%edi)
 
-	flds   52(%edx)
-	fadds  44(%edx)
-	fistp 192(%edi)
-
-	flds   60(%edx)
-	fist   448(%edi)
-	fadds  44(%edx)
-	fistp 320(%edi)
+	movd   40(%edx), %mm0
+	pfadd  56(%edx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 192(%esi)
 
-	flds   96(%edx)
-	fadds 112(%edx)
-	fld      %st(0)
-	fadds  64(%edx)
-	fistp 480(%esi)
-	fadds  80(%edx)
-	fistp 416(%esi)
+	movd   56(%edx), %mm0
+	pfadd  36(%edx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 64(%esi)
 
-	flds  112(%edx)
-	fadds 104(%edx)
-	fld      %st(0)
-	fadds  80(%edx)
-	fistp 352(%esi)
-	fadds  72(%edx)
-	fistp 288(%esi)
+	movd   60(%edx), %mm0
+	pf2id  %mm0, %mm7
+	movd   %mm7, %eax
+	movw   %ax, 448(%edi)
+	pfadd  44(%edx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 320(%edi)
 
-	flds  104(%edx)
-	fadds 120(%edx)
-	fld      %st(0)
-	fadds  72(%edx)
-	fistp 224(%esi)
-	fadds  88(%edx)
-	fistp 160(%esi)
-
-	flds  120(%edx)
-	fadds 100(%edx)
-	fld      %st(0)
-	fadds  88(%edx)
-	fistp  96(%esi)
-	fadds  68(%edx)
-	fistp  32(%esi)
+	movq   96(%edx), %mm0
+	movq  112(%edx), %mm2
+	movq  104(%edx), %mm4
+	pfadd 112(%edx), %mm0
+	pfadd 104(%edx), %mm2
+	pfadd 120(%edx), %mm4
+	movq  %mm0, %mm1
+	movq  %mm2, %mm3
+	movq  %mm4, %mm5
+	pfadd  64(%edx), %mm0
+	pfadd  80(%edx), %mm2
+	pfadd  72(%edx), %mm4
+	pf2id  %mm0, %mm7
+	pf2id  %mm2, %mm6
+	pf2id  %mm4, %mm4
+	movd   %mm7, %eax
+	movd   %mm6, %ecx
+	movd   %mm4, %ebx
+	movw   %ax, 480(%esi)
+	movw   %cx, 352(%esi)
+	movw   %bx, 224(%esi)
+	psrlq  $32, %mm7
+	psrlq  $32, %mm6
+	psrlq  $32, %mm4
+	movd   %mm7, %eax
+	movd   %mm6, %ecx
+	movd   %mm4, %ebx
+	movw   %ax, 32(%edi)
+	movw   %cx, 160(%edi)
+	movw   %bx, 288(%edi)
+	pfadd  80(%edx), %mm1
+	pfadd  72(%edx), %mm3
+	pfadd  88(%edx), %mm5
+	pf2id  %mm1, %mm1
+	pf2id  %mm3, %mm3
+	pf2id  %mm5, %mm5
+	movd   %mm1, %eax
+	movd   %mm3, %ecx
+	movd   %mm5, %ebx
+	movw   %ax, 416(%esi)
+	movw   %cx, 288(%esi)
+	movw   %bx, 160(%esi)
+	psrlq  $32, %mm1
+	psrlq  $32, %mm3
+	psrlq  $32, %mm5
+	movd   %mm1, %eax
+	movd   %mm3, %ecx
+	movd   %mm5, %ebx
+	movw   %ax, 96(%edi)
+	movw   %cx, 224(%edi)
+	movw   %bx, 352(%edi)
 
-	flds  100(%edx)
-	fadds 116(%edx)
-	fld      %st(0)
-	fadds  68(%edx)
-	fistp  32(%edi)
-	fadds  84(%edx)
-	fistp  96(%edi)
+	movd  120(%edx), %mm0
+	pfadd 100(%edx), %mm0
+	movq  %mm0, %mm1
+	pfadd  88(%edx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 96(%esi)
+	pfadd  68(%edx), %mm1
+	pf2id  %mm1, %mm1
+	movd   %mm1, %eax
+	movw   %ax, 32(%esi)
 
-	flds  116(%edx)
-	fadds 108(%edx)
-	fld      %st(0)
-	fadds  84(%edx)
-	fistp 160(%edi)
-	fadds  76(%edx)
-	fistp 224(%edi)
+	movq  124(%edx), %mm0
+	pf2id  %mm0, %mm1
+	movd   %mm1, %eax
+	movw   %ax, 480(%edi)
+	pfadd  92(%edx), %mm0
+	pf2id  %mm0, %mm0
+	movd   %mm0, %eax
+	movw   %ax, 416(%edi)
 
-	flds  108(%edx)
-	fadds 124(%edx)
-	fld      %st(0)
-	fadds  76(%edx)
-	fistp 288(%edi)
-	fadds  92(%edx)
-	fistp 352(%edi)
+	movsw
 
-	flds  124(%edx)
-	fist  480(%edi)
-	fadds  92(%edx)
-	fistp 416(%edi)
-	movsw
 .L_bye:
 	addl $256,%esp
+	femms
 	popl %edi
 	popl %esi
 	popl %ebx
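dct64_k7.s receives the same rewrite as dct64_3dnow.s. In Phases 8 and 10 the new code goes beyond a 1:1 transliteration: movq loads two adjacent floats, one pfadd computes two butterfly sums at once, and psrlq $32 shifts the high result into the low dword so movd can store each half separately. With the offsets from the start of Phase 8:

	movq   32(%edx), %mm0    # two packed floats: { mem[36], mem[32] }
	pfadd  48(%edx), %mm0    # two sums in a single instruction
	movd   %mm0, 896(%esi)   # store the low-lane sum
	psrlq  $32, %mm0         # bring the high-lane sum down to the low dword
	movd   %mm0, 128(%edi)   # store it

Both files also drop the femms that used to sit before Phase 1 and instead issue a single femms in the shared .L_bye epilogue, clearing the MMX state once before control returns to x87-using callers.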
--- a/mp3lib/decode_MMX.s	Wed Jul 04 07:00:15 2001 +0000
+++ b/mp3lib/decode_MMX.s	Wed Jul 04 09:47:56 2001 +0000
@@ -11,6 +11,39 @@
 .align 8
 null_one: .long 0x0000ffff, 0x0000ffff
 one_null: .long 0xffff0000, 0xffff0000
+.globl costab_mmx
+costab_mmx:
+	.long 1056974725
+	.long 1057056395
+	.long 1057223771
+	.long 1057485416
+	.long 1057855544
+	.long 1058356026
+	.long 1059019886
+	.long 1059897405
+	.long 1061067246
+	.long 1062657950
+	.long 1064892987
+	.long 1066774581
+	.long 1069414683
+	.long 1073984175
+	.long 1079645762
+	.long 1092815430
+	.long 1057005197
+	.long 1057342072
+	.long 1058087743
+	.long 1059427869
+	.long 1061799040
+	.long 1065862217
+	.long 1071413542
+	.long 1084439708
+	.long 1057128951
+	.long 1058664893
+	.long 1063675095
+	.long 1076102863
+	.long 1057655764
+	.long 1067924853
+	.long 1060439283
 
 .text
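A note on the table contents: the entries are single-precision IEEE-754 values stored as raw bit patterns, presumably to stay independent of assembler float syntax. Decoding the last entry, the one read at byte offset 120 and used as the butterfly scale factor throughout the DCT files:

	# 1060439283 = 0x3F3504F3: sign 0, exponent 126, mantissa 0x3504F3
	# (1 + 3474675/2^23) * 2^(126-127) = 0.70710677, i.e. 1/sqrt(2)
	pfmul 120(%ebx), %mm0    # scale a butterfly difference by 1/sqrt(2)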