diff mp3lib/dct64_3dnow.s @ 1245:03b7e2955a20

Added the newest MMX-optimized decore which speeds up decoding by at least 13% on any CPU.
author nick
date Fri, 29 Jun 2001 17:55:35 +0000
parents 3b5f5d1c5041
children 2864e32cd267
--- a/mp3lib/dct64_3dnow.s	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/dct64_3dnow.s	Fri Jun 29 17:55:35 2001 +0000
@@ -1,706 +1,932 @@
-///
-/// Replacement of dct64() with AMD's 3DNow! SIMD operations support
-///
-/// Syuuhei Kashiyama <squash@mb.kcom.ne.jp>
-///
-/// The author of this program disclaim whole expressed or implied
-/// warranties with regard to this program, and in no event shall the
-/// author of this program liable to whatever resulted from the use of
-/// this program. Use it at your own risk.
-///
+# This code was taken from http://www.mpg123.org
+# See the ChangeLog of mpg123-0.59s-pre.1 for details
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+# Partial 3dnow! optimization by Nick Kurshev
+#
+# TODO: finish 3dnow! optimization at least in scalar mode
+#
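+# The routine below is mpg123's dct64 (the 32-point DCT pair of the polyphase
+# synthesis filterbank).  Phases 1-5 run the butterfly stages with packed
+# 3dnow! arithmetic on a 256-byte stack scratch area (%ecx and %edx point at
+# its two 128-byte halves); from Phase 6 on the recombination is finished in
+# scalar x87 code and the results are scattered into the two output windows.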
 
-        .globl dct64_3dnow
-        .type    dct64_3dnow,@function
-dct64_3dnow:
-        subl $256,%esp
-        pushl %ebp
-        pushl %edi
-        pushl %esi
-        pushl %ebx
-        leal 16(%esp),%ebx
-        movl 284(%esp),%edi
-        movl 276(%esp),%ebp
-        movl 280(%esp),%edx
-        leal 128(%ebx),%esi
+.data
+	.align 8
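+# pxor mask: flips the sign of the high packed float, leaves the low one unchanged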
+plus_minus_3dnow: .long 0x00000000, 0x80000000
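+# cosine tables of mpg123's dct64 -- 0.5/cos((2*i+1)*PI/64), then PI/32, PI/16,
+# PI/8, PI/4 (16+8+4+2+1 entries), stored as raw IEEE-754 bit patterns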
+costab:
+	.long 1056974725
+	.long 1057056395
+	.long 1057223771
+	.long 1057485416
+	.long 1057855544
+	.long 1058356026
+	.long 1059019886
+	.long 1059897405
+	.long 1061067246
+	.long 1062657950
+	.long 1064892987
+	.long 1066774581
+	.long 1069414683
+	.long 1073984175
+	.long 1079645762
+	.long 1092815430
+	.long 1057005197
+	.long 1057342072
+	.long 1058087743
+	.long 1059427869
+	.long 1061799040
+	.long 1065862217
+	.long 1071413542
+	.long 1084439708
+	.long 1057128951
+	.long 1058664893
+	.long 1063675095
+	.long 1076102863
+	.long 1057655764
+	.long 1067924853
+	.long 1060439283
 
-        / femms
+.text
+
+	.align 16
+
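+# dct64_MMX_3dnow(out0, out1, samples) -- cdecl; after the three pushes and the
+# 256-byte scratch frame the arguments sit at 272(%esp) (out0 -> %esi),
+# 276(%esp) (out1 -> %edi) and 280(%esp) (input samples -> %eax), with %ebx
+# pointing at costab.  The "orl %ecx,%ecx" below tests a value the caller
+# leaves in %ecx; nothing in between touches EFLAGS, so it is consumed only by
+# the "jnz .L01" far below, selecting the integer store path (Phases 9/10)
+# instead of the float path (Phases 7/8).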
+.globl dct64_MMX_3dnow
+dct64_MMX_3dnow:
+	pushl %ebx
+	pushl %esi
+	pushl %edi
+	subl $256,%esp
+	movl 280(%esp),%eax
 
-        // 1
-        movl pnts,%eax
-        movq 0(%edi),%mm0
-        movq %mm0,%mm1
-        movd 124(%edi),%mm2
-        punpckldq 120(%edi),%mm2
-        movq 0(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,0(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,124(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,120(%ebx)
-        movq 8(%edi),%mm4
-        movq %mm4,%mm5
-        movd 116(%edi),%mm6
-        punpckldq 112(%edi),%mm6
-        movq 8(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,8(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,116(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,112(%ebx)
-        movq 16(%edi),%mm0
-        movq %mm0,%mm1
-        movd 108(%edi),%mm2
-        punpckldq 104(%edi),%mm2
-        movq 16(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,16(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,108(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,104(%ebx)
-        movq 24(%edi),%mm4
-        movq %mm4,%mm5
-        movd 100(%edi),%mm6
-        punpckldq 96(%edi),%mm6
-        movq 24(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,24(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,100(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,96(%ebx)
-        movq 32(%edi),%mm0
-        movq %mm0,%mm1
-        movd 92(%edi),%mm2
-        punpckldq 88(%edi),%mm2
-        movq 32(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,32(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,92(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,88(%ebx)
-        movq 40(%edi),%mm4
-        movq %mm4,%mm5
-        movd 84(%edi),%mm6
-        punpckldq 80(%edi),%mm6
-        movq 40(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,40(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,84(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,80(%ebx)
-        movq 48(%edi),%mm0
-        movq %mm0,%mm1
-        movd 76(%edi),%mm2
-        punpckldq 72(%edi),%mm2
-        movq 48(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,48(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,76(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,72(%ebx)
-        movq 56(%edi),%mm4
-        movq %mm4,%mm5
-        movd 68(%edi),%mm6
-        punpckldq 64(%edi),%mm6
-        movq 56(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,56(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,68(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,64(%ebx)
+	leal 128(%esp),%edx
+	movl 272(%esp),%esi
+	movl 276(%esp),%edi
+	movl $costab,%ebx
+	orl %ecx,%ecx
+	movl %esp,%ecx
+	femms	
+/* Phase 1*/
+	movq	(%eax), %mm0
+	movq	8(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	120(%eax), %mm1
+	movq	112(%eax), %mm5
+	/* n.b.: pswapd*/	
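+	/* the next three ops swap the two dwords of %mm1 (and likewise %mm5
+	   below), i.e. reverse each float pair; pswapd from the extended
+	   3dnow! set would do this in one instruction -- the same idiom is
+	   used in every phase below */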
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%edx)
+	movq	%mm4, 8(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	(%ebx), %mm3
+	pfmul	8(%ebx), %mm7
+	movd	%mm3, 124(%edx)
+	movd	%mm7, 116(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 120(%edx)
+	movd	%mm7, 112(%edx)
+
+	movq	16(%eax), %mm0
+	movq	24(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	104(%eax), %mm1
+	movq	96(%eax), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 16(%edx)
+	movq	%mm4, 24(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	16(%ebx), %mm3
+	pfmul	24(%ebx), %mm7
+	movd	%mm3, 108(%edx)
+	movd	%mm7, 100(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 104(%edx)
+	movd	%mm7, 96(%edx)
+
+	movq	32(%eax), %mm0
+	movq	40(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	88(%eax), %mm1
+	movq	80(%eax), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 32(%edx)
+	movq	%mm4, 40(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	32(%ebx), %mm3
+	pfmul	40(%ebx), %mm7
+	movd	%mm3, 92(%edx)
+	movd	%mm7, 84(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 88(%edx)
+	movd	%mm7, 80(%edx)
+
+	movq	48(%eax), %mm0
+	movq	56(%eax), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	72(%eax), %mm1
+	movq	64(%eax), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 48(%edx)
+	movq	%mm4, 56(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	48(%ebx), %mm3
+	pfmul	56(%ebx), %mm7
+	movd	%mm3, 76(%edx)
+	movd	%mm7, 68(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 72(%edx)
+	movd	%mm7, 64(%edx)
+
+/* Phase 2*/
 
-        // 2
-        movl pnts+4,%eax
-        / 0, 14
-        movq 0(%ebx),%mm0
-        movq %mm0,%mm1
-        movd 60(%ebx),%mm2
-        punpckldq 56(%ebx),%mm2
-        movq 0(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,0(%esi)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,60(%esi)
-        psrlq $32,%mm1
-        movd %mm1,56(%esi)
-        / 16, 30
-        movq 64(%ebx),%mm0
-        movq %mm0,%mm1
-        movd 124(%ebx),%mm2
-        punpckldq 120(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,64(%esi)
-        pfsubr %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,124(%esi)
-        psrlq $32,%mm1
-        movd %mm1,120(%esi)
-        movq 8(%ebx),%mm4
-        / 2, 12
-        movq %mm4,%mm5
-        movd 52(%ebx),%mm6
-        punpckldq 48(%ebx),%mm6
-        movq 8(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,8(%esi)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,52(%esi)
-        psrlq $32,%mm5
-        movd %mm5,48(%esi)
-        movq 72(%ebx),%mm4
-        / 18, 28
-        movq %mm4,%mm5
-        movd 116(%ebx),%mm6
-        punpckldq 112(%ebx),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,72(%esi)
-        pfsubr %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,116(%esi)
-        psrlq $32,%mm5
-        movd %mm5,112(%esi)
-        movq 16(%ebx),%mm0
-        / 4, 10
-        movq %mm0,%mm1
-        movd 44(%ebx),%mm2
-        punpckldq 40(%ebx),%mm2
-        movq 16(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,16(%esi)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,44(%esi)
-        psrlq $32,%mm1
-        movd %mm1,40(%esi)
-        movq 80(%ebx),%mm0
-        / 20, 26
-        movq %mm0,%mm1
-        movd 108(%ebx),%mm2
-        punpckldq 104(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,80(%esi)
-        pfsubr %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,108(%esi)
-        psrlq $32,%mm1
-        movd %mm1,104(%esi)
-        movq 24(%ebx),%mm4
-        / 6, 8
-        movq %mm4,%mm5
-        movd 36(%ebx),%mm6
-        punpckldq 32(%ebx),%mm6
-        movq 24(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,24(%esi)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,36(%esi)
-        psrlq $32,%mm5
-        movd %mm5,32(%esi)
-        movq 88(%ebx),%mm4
-        / 22, 24
-        movq %mm4,%mm5
-        movd 100(%ebx),%mm6
-        punpckldq 96(%ebx),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,88(%esi)
-        pfsubr %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,100(%esi)
-        psrlq $32,%mm5
-        movd %mm5,96(%esi)
+	movq	(%edx), %mm0
+	movq	8(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	56(%edx), %mm1
+	movq	48(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%ecx)
+	movq	%mm4, 8(%ecx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	64(%ebx), %mm3
+	pfmul	72(%ebx), %mm7
+	movd	%mm3, 60(%ecx)
+	movd	%mm7, 52(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 56(%ecx)
+	movd	%mm7, 48(%ecx)
+	
+	movq	16(%edx), %mm0
+	movq	24(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	40(%edx), %mm1
+	movq	32(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 16(%ecx)
+	movq	%mm4, 24(%ecx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	80(%ebx), %mm3
+	pfmul	88(%ebx), %mm7
+	movd	%mm3, 44(%ecx)
+	movd	%mm7, 36(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 40(%ecx)
+	movd	%mm7, 32(%ecx)
+
+/* Phase 3*/
+
+	movq	64(%edx), %mm0
+	movq	72(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	120(%edx), %mm1
+	movq	112(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 64(%ecx)
+	movq	%mm4, 72(%ecx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	64(%ebx), %mm3
+	pfmul	72(%ebx), %mm7
+	movd	%mm3, 124(%ecx)
+	movd	%mm7, 116(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 120(%ecx)
+	movd	%mm7, 112(%ecx)
+
+	movq	80(%edx), %mm0
+	movq	88(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	104(%edx), %mm1
+	movq	96(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 80(%ecx)
+	movq	%mm4, 88(%ecx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	80(%ebx), %mm3
+	pfmul	88(%ebx), %mm7
+	movd	%mm3, 108(%ecx)
+	movd	%mm7, 100(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 104(%ecx)
+	movd	%mm7, 96(%ecx)
+	
+/* Phase 4*/
 
-        // 3
-        movl pnts+8,%eax
-        movq 0(%eax),%mm0
-        movq 8(%eax),%mm1
-        movq 0(%esi),%mm2
-        / 0, 6
-        movq %mm2,%mm3
-        movd 28(%esi),%mm4
-        punpckldq 24(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsub %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,0(%ebx)
-        movd %mm3,28(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,24(%ebx)
-        movq 8(%esi),%mm5
-        / 2, 4
-        movq %mm5,%mm6
-        movd 20(%esi),%mm7
-        punpckldq 16(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsub %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,8(%ebx)
-        movd %mm6,20(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,16(%ebx)
-        movq 32(%esi),%mm2
-        / 8, 14
-        movq %mm2,%mm3
-        movd 60(%esi),%mm4
-        punpckldq 56(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsubr %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,32(%ebx)
-        movd %mm3,60(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,56(%ebx)
-        movq 40(%esi),%mm5
-        / 10, 12
-        movq %mm5,%mm6
-        movd 52(%esi),%mm7
-        punpckldq 48(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsubr %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,40(%ebx)
-        movd %mm6,52(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,48(%ebx)
-        movq 64(%esi),%mm2
-        / 16, 22
-        movq %mm2,%mm3
-        movd 92(%esi),%mm4
-        punpckldq 88(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsub %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,64(%ebx)
-        movd %mm3,92(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,88(%ebx)
-        movq 72(%esi),%mm5
-        / 18, 20
-        movq %mm5,%mm6
-        movd 84(%esi),%mm7
-        punpckldq 80(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsub %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,72(%ebx)
-        movd %mm6,84(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,80(%ebx)
-        movq 96(%esi),%mm2
-        / 24, 30
-        movq %mm2,%mm3
-        movd 124(%esi),%mm4
-        punpckldq 120(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsubr %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,96(%ebx)
-        movd %mm3,124(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,120(%ebx)
-        movq 104(%esi),%mm5
-        / 26, 28
-        movq %mm5,%mm6
-        movd 116(%esi),%mm7
-        punpckldq 112(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsubr %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,104(%ebx)
-        movd %mm6,116(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,112(%ebx)
+	movq	(%ecx), %mm0
+	movq	8(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	24(%ecx), %mm1
+	movq	16(%ecx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%edx)
+	movq	%mm4, 8(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	movd	%mm3, 28(%edx)
+	movd	%mm7, 20(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 24(%edx)
+	movd	%mm7, 16(%edx)
+
+	movq	32(%ecx), %mm0
+	movq	40(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	56(%ecx), %mm1
+	movq	48(%ecx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 32(%edx)
+	movq	%mm4, 40(%edx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	movd	%mm3, 60(%edx)
+	movd	%mm7, 52(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 56(%edx)
+	movd	%mm7, 48(%edx)
+
+	movq	64(%ecx), %mm0
+	movq	72(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	88(%ecx), %mm1
+	movq	80(%ecx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 64(%edx)
+	movq	%mm4, 72(%edx)
+	pfsub	%mm1, %mm3
+	pfsub	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	movd	%mm3, 92(%edx)
+	movd	%mm7, 84(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 88(%edx)
+	movd	%mm7, 80(%edx)
+
+	movq	96(%ecx), %mm0
+	movq	104(%ecx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	120(%ecx), %mm1
+	movq	112(%ecx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 96(%edx)
+	movq	%mm4, 104(%edx)
+	pfsubr	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	96(%ebx), %mm3
+	pfmul	104(%ebx), %mm7
+	movd	%mm3, 124(%edx)
+	movd	%mm7, 116(%edx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 120(%edx)
+	movd	%mm7, 112(%edx)
+
+/* Phase 5 */
+
+	movq	(%edx), %mm0
+	movq	16(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	8(%edx), %mm1
+	movq	24(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, (%ecx)
+	movq	%mm4, 16(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	movd	%mm3, 12(%ecx)
+	movd	%mm7, 28(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 8(%ecx)
+	movd	%mm7, 24(%ecx)
 
-        // 4
-        movl pnts+12,%eax
-        movq 0(%eax),%mm0
-        movq 0(%ebx),%mm1
-        / 0
-        movq %mm1,%mm2
-        movd 12(%ebx),%mm3
-        punpckldq 8(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,0(%esi)
-        movd %mm2,12(%esi)
-        psrlq $32,%mm2
-        movd %mm2,8(%esi)
-        movq 16(%ebx),%mm4
-        / 4
-        movq %mm4,%mm5
-        movd 28(%ebx),%mm6
-        punpckldq 24(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,16(%esi)
-        movd %mm5,28(%esi)
-        psrlq $32,%mm5
-        movd %mm5,24(%esi)
-        movq 32(%ebx),%mm1
-        / 8
-        movq %mm1,%mm2
-        movd 44(%ebx),%mm3
-        punpckldq 40(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,32(%esi)
-        movd %mm2,44(%esi)
-        psrlq $32,%mm2
-        movd %mm2,40(%esi)
-        movq 48(%ebx),%mm4
-        / 12
-        movq %mm4,%mm5
-        movd 60(%ebx),%mm6
-        punpckldq 56(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,48(%esi)
-        movd %mm5,60(%esi)
-        psrlq $32,%mm5
-        movd %mm5,56(%esi)
-        movq 64(%ebx),%mm1
-        / 16
-        movq %mm1,%mm2
-        movd 76(%ebx),%mm3
-        punpckldq 72(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,64(%esi)
-        movd %mm2,76(%esi)
-        psrlq $32,%mm2
-        movd %mm2,72(%esi)
-        movq 80(%ebx),%mm4
-        / 20
-        movq %mm4,%mm5
-        movd 92(%ebx),%mm6
-        punpckldq 88(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,80(%esi)
-        movd %mm5,92(%esi)
-        psrlq $32,%mm5
-        movd %mm5,88(%esi)
-        movq 96(%ebx),%mm1
-        / 24
-        movq %mm1,%mm2
-        movd 108(%ebx),%mm3
-        punpckldq 104(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,96(%esi)
-        movd %mm2,108(%esi)
-        psrlq $32,%mm2
-        movd %mm2,104(%esi)
-        movq 112(%ebx),%mm4
-        / 28
-        movq %mm4,%mm5
-        movd 124(%ebx),%mm6
-        punpckldq 120(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,112(%esi)
-        movd %mm5,124(%esi)
-        psrlq $32,%mm5
-        movd %mm5,120(%esi)
+	movq	32(%edx), %mm0
+	movq	48(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	40(%edx), %mm1
+	movq	56(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 32(%ecx)
+	movq	%mm4, 48(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	movd	%mm3, 44(%ecx)
+	movd	%mm7, 60(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 40(%ecx)
+	movd	%mm7, 56(%ecx)
+
+	movq	64(%edx), %mm0
+	movq	80(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	72(%edx), %mm1
+	movq	88(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 64(%ecx)
+	movq	%mm4, 80(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	movd	%mm3, 76(%ecx)
+	movd	%mm7, 92(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 72(%ecx)
+	movd	%mm7, 88(%ecx)
+
+	movq	96(%edx), %mm0
+	movq	112(%edx), %mm4
+	movq	%mm0, %mm3
+	movq	%mm4, %mm7
+	movq	104(%edx), %mm1
+	movq	120(%edx), %mm5
+	/* n.b.: pswapd*/	
+	movq	%mm1, %mm2
+	movq	%mm5, %mm6
+	psrlq	$32, %mm1
+	psrlq	$32, %mm5
+	punpckldq %mm2, %mm1
+	punpckldq %mm6, %mm5
+	/**/
+	pfadd	%mm1, %mm0
+	pfadd	%mm5, %mm4
+	movq	%mm0, 96(%ecx)
+	movq	%mm4, 112(%ecx)
+	pfsub	%mm1, %mm3
+	pfsubr	%mm5, %mm7
+	pfmul	112(%ebx), %mm3
+	pfmul	112(%ebx), %mm7
+	movd	%mm3, 108(%ecx)
+	movd	%mm7, 124(%ecx)
+	psrlq	$32, %mm3
+	psrlq	$32, %mm7
+	movd	%mm3, 104(%ecx)
+	movd	%mm7, 120(%ecx)
+	
+/* Phase 6. This is the end of easy road. */
+	movl	$1, %eax
+	movd	%eax, %mm7
+	pi2fd	%mm7, %mm7
+	movq	32(%ecx), %mm0
+	punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */	
+	movq	%mm0, %mm1
+	movq	plus_minus_3dnow, %mm6
+	/* n.b.: pfpnacc */
+	pxor	%mm6, %mm1
+	pfacc	%mm1, %mm0
+	/**/
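+	/* %mm0 now holds (x0+x1 | x0-x1); the pfmul keeps the sum (*1.0) and
+	   scales the difference by the last costab entry */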
+	pfmul	%mm7, %mm0
+	movq	%mm0, 32(%edx)
+	femms
+
+	flds   44(%ecx)
+	fsubs  40(%ecx)
+	fmuls 120(%ebx)
+
+	fsts   44(%edx)
+	fadds  40(%ecx) /* pfacc 40(ecx), 56(%ecx) */
+	fadds  44(%ecx)
+	fstps  40(%edx)
+
+	flds   48(%ecx)
+	fsubs  52(%ecx)
+	fmuls 120(%ebx)
+
+	flds   60(%ecx)
+	fsubs  56(%ecx)
+	fmuls 120(%ebx)
+
+	fld      %st(0)
+	fadds  56(%ecx)
+	fadds  60(%ecx)
+
+	fld      %st(0)
+	fadds  48(%ecx)
+	fadds  52(%ecx)
+	fstps  48(%edx)
+	fadd     %st(2)
+	fstps  56(%edx)
+	fsts   60(%edx)
+	faddp    %st(1)
+	fstps  52(%edx)
+/*---*/
+	flds   64(%ecx)
+	fadds  68(%ecx)
+	fstps  64(%edx)
+
+	flds   64(%ecx)
+	fsubs  68(%ecx)
+	fmuls 120(%ebx)
+	fstps  68(%edx)
+
+	flds   76(%ecx)
+	fsubs  72(%ecx)
+	fmuls 120(%ebx)
+	fsts   76(%edx)
+	fadds  72(%ecx)
+	fadds  76(%ecx)
+	fstps  72(%edx)
+
+	flds   92(%ecx)
+	fsubs  88(%ecx)
+	fmuls 120(%ebx)
+	fsts   92(%edx)
+	fadds  92(%ecx)
+	fadds  88(%ecx)
+
+	fld      %st(0)
+	fadds  80(%ecx)
+	fadds  84(%ecx)
+	fstps  80(%edx)
+
+	flds   80(%ecx)
+	fsubs  84(%ecx)
+	fmuls 120(%ebx)
+	fadd  %st(0), %st(1)
+	fadds 92(%edx)
+	fstps 84(%edx)
+	fstps 88(%edx)
+
+	flds   96(%ecx)
+	fadds 100(%ecx)
+	fstps  96(%edx)
+
+	flds   96(%ecx)
+	fsubs 100(%ecx)
+	fmuls 120(%ebx)
+	fstps 100(%edx)
+
+	flds  108(%ecx)
+	fsubs 104(%ecx)
+	fmuls 120(%ebx)
+	fsts  108(%edx)
+	fadds 104(%ecx)
+	fadds 108(%ecx)
+	fstps 104(%edx)
+
+	flds  124(%ecx)
+	fsubs 120(%ecx)
+	fmuls 120(%ebx)
+	fsts  124(%edx)
+	fadds 120(%ecx)
+	fadds 124(%ecx)
+
+	fld      %st(0)
+	fadds 112(%ecx)
+	fadds 116(%ecx)
+	fstps 112(%edx)
+
+	flds  112(%ecx)
+	fsubs 116(%ecx)
+	fmuls 120(%ebx)
+	fadd  %st(0),%st(1)
+	fadds 124(%edx)
+	fstps 116(%edx)
+	fstps 120(%edx)
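+	/* ZF here is still the one set by "orl %ecx,%ecx" at entry -- none of
+	   the MMX/x87 code in between modifies EFLAGS -- so a non-zero %ecx
+	   selects the integer output path */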
+	jnz .L01
+	
+/* Phase 7*/
+
+	flds      (%ecx)
+	fadds    4(%ecx)
+	fstps 1024(%esi)
+
+	flds      (%ecx)
+	fsubs    4(%ecx)
+	fmuls  120(%ebx)
+	fsts      (%esi)
+	fstps     (%edi)
+
+	flds   12(%ecx)
+	fsubs   8(%ecx)
+	fmuls 120(%ebx)
+	fsts  512(%edi)
+	fadds  12(%ecx)
+	fadds   8(%ecx)
+	fstps 512(%esi)
+
+	flds   16(%ecx)
+	fsubs  20(%ecx)
+	fmuls 120(%ebx)
 
-        // 5
-        movl $-1,%eax
-        movd %eax,%mm1
-        movl $1,%eax
-        movd %eax,%mm0
-        / L | H
-        punpckldq %mm1,%mm0
-        pi2fd %mm0,%mm0
-        / 1.0 | -1.0
-        movd %eax,%mm1
-        pi2fd %mm1,%mm1
-        movl pnts+16,%eax
-        movd 0(%eax),%mm2
-        punpckldq %mm2,%mm1
-        / 1.0 | cos0
-        movq 0(%esi),%mm2
-        / 0
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,0(%ebx)
-        movq 8(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,8(%ebx)
-        movq 16(%esi),%mm2
-        / 4
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 24(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,16(%ebx)
-        movq %mm4,24(%ebx)
-        movq 32(%esi),%mm2
-        / 8
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,32(%ebx)
-        movq 40(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,40(%ebx)
-        movq 48(%esi),%mm2
-        / 12
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 56(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,48(%ebx)
-        movq %mm4,56(%ebx)
-        movq 64(%esi),%mm2
-        / 16
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,64(%ebx)
-        movq 72(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,72(%ebx)
-        movq 80(%esi),%mm2
-        / 20
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 88(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,80(%ebx)
-        movq %mm4,88(%ebx)
-        movq 96(%esi),%mm2
-        / 24
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,96(%ebx)
-        movq 104(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,104(%ebx)
-        movq 112(%esi),%mm2
-        / 28
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 120(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,112(%ebx)
-        movq %mm4,120(%ebx)
+	flds   28(%ecx)
+	fsubs  24(%ecx)
+	fmuls 120(%ebx)
+	fsts  768(%edi)
+	fld      %st(0)
+	fadds  24(%ecx)
+	fadds  28(%ecx)
+	fld      %st(0)
+	fadds  16(%ecx)
+	fadds  20(%ecx)
+	fstps 768(%esi)
+	fadd     %st(2)
+	fstps 256(%esi)
+	faddp    %st(1)
+	fstps 256(%edi)
+	
+/* Phase 8*/
+
+	flds   32(%edx)
+	fadds  48(%edx)
+	fstps 896(%esi)
+
+	flds   48(%edx)
+	fadds  40(%edx)
+	fstps 640(%esi)
+
+	flds   40(%edx)
+	fadds  56(%edx)
+	fstps 384(%esi)
+
+	flds   56(%edx)
+	fadds  36(%edx)
+	fstps 128(%esi)
+
+	flds   36(%edx)
+	fadds  52(%edx)
+	fstps 128(%edi)
+
+	flds   52(%edx)
+	fadds  44(%edx)
+	fstps 384(%edi)
+
+	flds   60(%edx)
+	fsts  896(%edi)
+	fadds  44(%edx)
+	fstps 640(%edi)
+
+	flds   96(%edx)
+	fadds 112(%edx)
+	fld      %st(0)
+	fadds  64(%edx)
+	fstps 960(%esi)
+	fadds  80(%edx)
+	fstps 832(%esi)
+
+	flds  112(%edx)
+	fadds 104(%edx)
+	fld      %st(0)
+	fadds  80(%edx)
+	fstps 704(%esi)
+	fadds  72(%edx)
+	fstps 576(%esi)
+
+	flds  104(%edx)
+	fadds 120(%edx)
+	fld      %st(0)
+	fadds  72(%edx)
+	fstps 448(%esi)
+	fadds  88(%edx)
+	fstps 320(%esi)
+
+	flds  120(%edx)
+	fadds 100(%edx)
+	fld      %st(0)
+	fadds  88(%edx)
+	fstps 192(%esi)
+	fadds  68(%edx)
+	fstps  64(%esi)
+
+	flds  100(%edx)
+	fadds 116(%edx)
+	fld      %st(0)
+	fadds  68(%edx)
+	fstps  64(%edi)
+	fadds  84(%edx)
+	fstps 192(%edi)
+
+	flds  116(%edx)
+	fadds 108(%edx)
+	fld      %st(0)
+	fadds  84(%edx)
+	fstps 320(%edi)
+	fadds  76(%edx)
+	fstps 448(%edi)
+
+	flds  108(%edx)
+	fadds 124(%edx)
+	fld      %st(0)
+	fadds  76(%edx)
+	fstps 576(%edi)
+	fadds  92(%edx)
+	fstps 704(%edi)
+
+	flds  124(%edx)
+	fsts  960(%edi)
+	fadds  92(%edx)
+	fstps 832(%edi)
+	jmp	.L_bye
+.L01:	
+/* Phase 9*/
+
+	flds      (%ecx)
+	fadds    4(%ecx)
+	fistp  512(%esi)
+
+	flds      (%ecx)
+	fsubs    4(%ecx)
+	fmuls  120(%ebx)
+
+	fistp     (%esi)
+
 
-        // Phase6
-        movl 0(%ebx),%eax
-        movl %eax,1024(%ebp)
-        movl 4(%ebx),%eax
-        movl %eax,0(%ebp)
-        movl %eax,0(%edx)
-        movl 8(%ebx),%eax
-        movl %eax,512(%ebp)
-        movl 12(%ebx),%eax
-        movl %eax,512(%edx)
+	flds    12(%ecx)
+	fsubs    8(%ecx)
+	fmuls  120(%ebx)
+	fist   256(%edi)
+	fadds   12(%ecx)
+	fadds    8(%ecx)
+	fistp  256(%esi)
 
-        movl 16(%ebx),%eax
-        movl %eax,768(%ebp)
-        movl 20(%ebx),%eax
-        movl %eax,256(%edx)
-
-        movl 24(%ebx),%eax
-        movl %eax,256(%ebp)
-        movl 28(%ebx),%eax
-        movl %eax,768(%edx)
+	flds   16(%ecx)
+	fsubs  20(%ecx)
+	fmuls 120(%ebx)
 
-        movq 32(%ebx),%mm0
-        movq 48(%ebx),%mm1
-        pfadd %mm1,%mm0
-        movd %mm0,896(%ebp)
-        psrlq $32,%mm0
-        movd %mm0,128(%edx)
-        movq 40(%ebx),%mm2
-        pfadd %mm2,%mm1
-        movd %mm1,640(%ebp)
-        psrlq $32,%mm1
-        movd %mm1,384(%edx)
+	flds   28(%ecx)
+	fsubs  24(%ecx)
+	fmuls 120(%ebx)
+	fist  384(%edi)
+	fld      %st(0)
+	fadds  24(%ecx)
+	fadds  28(%ecx)
+	fld      %st(0)
+	fadds  16(%ecx)
+	fadds  20(%ecx)
+	fistp  384(%esi)
+	fadd     %st(2)
+	fistp  128(%esi)
+	faddp    %st(1)
+	fistp  128(%edi)
+	
+/* Phase 10*/
+
+	flds    32(%edx)
+	fadds   48(%edx)
+	fistp  448(%esi)
 
-        movq 56(%ebx),%mm3
-        pfadd %mm3,%mm2
-        movd %mm2,384(%ebp)
-        psrlq $32,%mm2
-        movd %mm2,640(%edx)
+	flds   48(%edx)
+	fadds  40(%edx)
+	fistp 320(%esi)
+
+	flds   40(%edx)
+	fadds  56(%edx)
+	fistp 192(%esi)
+
+	flds   56(%edx)
+	fadds  36(%edx)
+	fistp  64(%esi)
 
-        movd 36(%ebx),%mm4
-        pfadd %mm4,%mm3
-        movd %mm3,128(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,896(%edx)
-        movq 96(%ebx),%mm0
-        movq 64(%ebx),%mm1
+	flds   36(%edx)
+	fadds  52(%edx)
+	fistp  64(%edi)
+
+	flds   52(%edx)
+	fadds  44(%edx)
+	fistp 192(%edi)
+
+	flds   60(%edx)
+	fist   448(%edi)
+	fadds  44(%edx)
+	fistp 320(%edi)
 
-        movq 112(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,%mm3
-        pfadd %mm1,%mm3
-        movd %mm3,960(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,64(%edx)
-        movq 80(%ebx),%mm1
-        pfadd %mm1,%mm0
-        movd %mm0,832(%ebp)
-        psrlq $32,%mm0
-        movd %mm0,192(%edx)
-        movq 104(%ebx),%mm3
-        pfadd %mm3,%mm2
-        movq %mm2,%mm4
-        pfadd %mm1,%mm4
-        movd %mm4,704(%ebp)
-        psrlq $32,%mm4
-        movd %mm4,320(%edx)
-        movq 72(%ebx),%mm1
-        pfadd %mm1,%mm2
-        movd %mm2,576(%ebp)
-        psrlq $32,%mm2
-        movd %mm2,448(%edx)
+	flds   96(%edx)
+	fadds 112(%edx)
+	fld      %st(0)
+	fadds  64(%edx)
+	fistp 480(%esi)
+	fadds  80(%edx)
+	fistp 416(%esi)
+
+	flds  112(%edx)
+	fadds 104(%edx)
+	fld      %st(0)
+	fadds  80(%edx)
+	fistp 352(%esi)
+	fadds  72(%edx)
+	fistp 288(%esi)
+
+	flds  104(%edx)
+	fadds 120(%edx)
+	fld      %st(0)
+	fadds  72(%edx)
+	fistp 224(%esi)
+	fadds  88(%edx)
+	fistp 160(%esi)
+
+	flds  120(%edx)
+	fadds 100(%edx)
+	fld      %st(0)
+	fadds  88(%edx)
+	fistp  96(%esi)
+	fadds  68(%edx)
+	fistp  32(%esi)
 
-        movq 120(%ebx),%mm4
-        pfadd %mm4,%mm3
-        movq %mm3,%mm5
-        pfadd %mm1,%mm5
-        movd %mm5,448(%ebp)
-        psrlq $32,%mm5
-        movd %mm5,576(%edx)
-        movq 88(%ebx),%mm1
-        pfadd %mm1,%mm3
-        movd %mm3,320(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,704(%edx)
+	flds  100(%edx)
+	fadds 116(%edx)
+	fld      %st(0)
+	fadds  68(%edx)
+	fistp  32(%edi)
+	fadds  84(%edx)
+	fistp  96(%edi)
+
+	flds  116(%edx)
+	fadds 108(%edx)
+	fld      %st(0)
+	fadds  84(%edx)
+	fistp 160(%edi)
+	fadds  76(%edx)
+	fistp 224(%edi)
 
-        movd 100(%ebx),%mm5
-        pfadd %mm5,%mm4
-        movq %mm4,%mm6
-        pfadd %mm1,%mm6
-        movd %mm6,192(%ebp)
-        psrlq $32,%mm6
-        movd %mm6,832(%edx)
-        movd 68(%ebx),%mm1
-        pfadd %mm1,%mm4
-        movd %mm4,64(%ebp)
-        psrlq $32,%mm4
-        movd %mm4,960(%edx)
+	flds  108(%edx)
+	fadds 124(%edx)
+	fld      %st(0)
+	fadds  76(%edx)
+	fistp 288(%edi)
+	fadds  92(%edx)
+	fistp 352(%edi)
 
-        / femms
+	flds  124(%edx)
+	fist  480(%edi)
+	fadds  92(%edx)
+	fistp 416(%edi)
+	movsw
+.L_bye:
+	addl $256,%esp
+	popl %edi
+	popl %esi
+	popl %ebx
+	ret
+	
 
-        popl %ebx
-        popl %esi
-        popl %edi
-        popl %ebp
-        addl $256,%esp
-
-        ret
-