mplayer.hg: changeset 1245:03b7e2955a20
Added the newest MMX-optimized decore, which speeds up decoding by at least 13% on any CPU.
author   | nick
date     | Fri, 29 Jun 2001 17:55:35 +0000
parents  | a2c71bf9a7d3
children | 7f69c1dd1e91
files    | mp3lib/Makefile mp3lib/d_cpu.h mp3lib/d_cpu.s mp3lib/dct36.c mp3lib/dct64_3dnow.s mp3lib/dct64_MMX.s mp3lib/dct64_k7.s mp3lib/decod386.c mp3lib/decode_3dnow.s mp3lib/decode_MMX.s mp3lib/decode_k7.s mp3lib/decode_sse.s mp3lib/layer2.c mp3lib/layer3.c mp3lib/mpg123.h mp3lib/sr1.c mp3lib/tabinit.c mp3lib/tabinit_MMX.s mp3lib/test2.c
diffstat | 19 files changed, 3170 insertions(+), 2274 deletions(-)
--- a/mp3lib/Makefile	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/Makefile	Fri Jun 29 17:55:35 2001 +0000
@@ -1,8 +1,10 @@
 include config.mak
-SRCS = sr1.c d_cpu.s decode_i586.s $(OPTIONAL_SRCS)
-OBJS = sr1.o d_cpu.o decode_i586.o $(OPTIONAL_OBJS)
+SRCS = sr1.c d_cpu.s decode_i586.s dct64_MMX.s decode_MMX.s tabinit_MMX.s\
+dct36_3dnow.s dct64_3dnow.s dct36_k7.s dct64_k7.s
+OBJS = sr1.o d_cpu.o decode_i586.o dct64_MMX.o decode_MMX.o tabinit_MMX.o\
+dct36_3dnow.o dct64_3dnow.o dct36_k7.o dct64_k7.o
 # OBJS = $(SRCS:.c,.s=.o)
 CFLAGS = $(OPTFLAGS) $(EXTRA_INC)
--- a/mp3lib/d_cpu.h	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/d_cpu.h	Fri Jun 29 17:55:35 2001 +0000
@@ -9,9 +9,12 @@
 unsigned int _CpuID;
 unsigned int _i586;
 unsigned int _3dnow;
+unsigned int _isse;
+unsigned int _has_mmx;
 extern unsigned long CpuDetect( void );
 extern unsigned long ipentium( void );
+extern unsigned long isse( void );
 extern unsigned long a3dnow( void );
 #endif
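The header now exposes two additional capability flags (`_isse`, `_has_mmx`) alongside the existing `_i586`/`_3dnow` globals, plus a prototype for the new `isse()` probe. A minimal sketch of how a caller might use them to choose a decode path at runtime follows; the flag and probe names come from this diff, while the enum, the wrapper function, and the selection policy are illustrative assumptions only.

```c
/* Hedged sketch: drive runtime dispatch from the d_cpu.h capability flags.
 * Globals and probe functions are the ones declared in this changeset;
 * everything else here is an assumption for illustration. */
#include "d_cpu.h"

typedef enum { DECODE_C, DECODE_MMX, DECODE_3DNOW, DECODE_SSE } decode_path;

decode_path pick_decode_path(void)
{
    _CpuID   = CpuDetect();       /* raw CPUID signature                    */
    _i586    = ipentium();        /* 0: i386/i486, 1: P5 or later, 2: + MMX */
    _3dnow   = a3dnow();          /* presumably non-zero when 3DNow! exists */
    _isse    = isse();            /* 0: none, 1: SSE, 2: SSE2               */
    _has_mmx = (_i586 == 2);      /* MMX is reported via ipentium() == 2    */

    if (_isse)    return DECODE_SSE;
    if (_3dnow)   return DECODE_3DNOW;
    if (_has_mmx) return DECODE_MMX;
    return DECODE_C;
}
```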
--- a/mp3lib/d_cpu.s	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/d_cpu.s	Fri Jun 29 17:55:35 2001 +0000
@@ -9,6 +9,7 @@
 .globl CpuDetect
 .globl ipentium
 .globl a3dnow
+.globl isse
 / ---------------------------------------------------------------------------
 /  in C: unsigned long CpuDetect( void );
@@ -45,7 +46,9 @@
 / ---------------------------------------------------------------------------
 /  in C: unsigled long ipentium( void );
-/  return: 0 if the processor is not P5 or above else above 1.
+/  return: 0 if this processor i386 or i486
+/          1 otherwise
+/          2 if this cpu supports mmx
 / ---------------------------------------------------------------------------
 ipentium:
        pushl   %ebx
@@ -63,10 +66,15 @@
        jz      no_cpuid
        movl    $1,%eax
        cpuid
-       shrl    $8,%eax
-       cmpl    $5,%eax
-       jb      no_cpuid
-       movl    $1,%eax
+       movl    %eax, %ecx
+       xorl    %eax, %eax
+       shrl    $8,%ecx
+       cmpl    $5,%ecx
+       jb      exit
+       incl    %eax
+       test    $0x00800000, %edx
+       jz      exit
+       incl    %eax
        jmp     exit
 no_cpuid:
        xorl    %eax,%eax
@@ -113,3 +121,33 @@
        popl    %edx
        popl    %ebx
        ret
+
+/ ---------------------------------------------------------------------------
+/  in C: unsigned long isse( void );
+/  return: 0 if this processor does not support sse
+/          1 otherwise
+/          2 if this cpu supports sse2 extension
+/ ---------------------------------------------------------------------------
+isse:
+       pushl   %ebx
+       pushl   %edx
+       pushl   %ecx
+
+       call    ipentium
+       testl   %eax,%eax
+       jz      exit3
+
+       movl    $1,%eax
+       cpuid
+       xorl    %eax, %eax
+       testl   $0x02000000,%edx
+       jz      exit3
+       incl    %eax
+       testl   $0x04000000,%edx
+       jz      exit3
+       incl    %eax
+exit3:
+       popl    %ecx
+       popl    %edx
+       popl    %ebx
+       ret
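For reference, the masks tested above are the standard CPUID(1) EDX feature bits: bit 23 (0x00800000) is MMX, bit 25 (0x02000000) is SSE, and bit 26 (0x04000000) is SSE2. A small C sketch of the same classification, operating on an EDX feature word obtained elsewhere (the helper names are illustrative, not part of the changeset):

```c
/* Sketch only: classify a CPUID(1) EDX feature word the same way the
 * new d_cpu.s code does.  The masks match those used in the assembly. */
#define CPUID_EDX_MMX  0x00800000u  /* bit 23 */
#define CPUID_EDX_SSE  0x02000000u  /* bit 25 */
#define CPUID_EDX_SSE2 0x04000000u  /* bit 26 */

/* Mirrors isse(): 0 = no SSE, 1 = SSE, 2 = SSE2. */
static unsigned long classify_sse(unsigned long edx_features)
{
    if (!(edx_features & CPUID_EDX_SSE))
        return 0;
    return (edx_features & CPUID_EDX_SSE2) ? 2 : 1;
}

/* Mirrors the tail of ipentium(): 0 for pre-P5 families,
 * 1 for a P5-class CPU without MMX, 2 when MMX is present. */
static unsigned long classify_mmx(unsigned long family, unsigned long edx_features)
{
    if (family < 5)
        return 0;
    return (edx_features & CPUID_EDX_MMX) ? 2 : 1;
}
```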
--- a/mp3lib/dct36.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/dct36.c	Fri Jun 29 17:55:35 2001 +0000
@@ -193,7 +193,7 @@
   sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \
         MACRO0(v); }
 
-  register const real *c = nCOS9;
+  register const real *c = COS9;
   register real *out2 = o2;
   register real *w = wintab;
   register real *out1 = o1;
--- a/mp3lib/dct64_3dnow.s Fri Jun 29 10:54:41 2001 +0000 +++ b/mp3lib/dct64_3dnow.s Fri Jun 29 17:55:35 2001 +0000 @@ -1,706 +1,932 @@ -/// -/// Replacement of dct64() with AMD's 3DNow! SIMD operations support -/// -/// Syuuhei Kashiyama <squash@mb.kcom.ne.jp> -/// -/// The author of this program disclaim whole expressed or implied -/// warranties with regard to this program, and in no event shall the -/// author of this program liable to whatever resulted from the use of -/// this program. Use it at your own risk. -/// +# This code was taken from http://www.mpg123.org +# See ChangeLog of mpg123-0.59s-pre.1 for detail +# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> +# Partial 3dnow! optimization by Nick Kurshev +# +# TODO: finish 3dnow! optimization at least in scalar mode +# - .globl dct64_3dnow - .type dct64_3dnow,@function -dct64_3dnow: - subl $256,%esp - pushl %ebp - pushl %edi - pushl %esi - pushl %ebx - leal 16(%esp),%ebx - movl 284(%esp),%edi - movl 276(%esp),%ebp - movl 280(%esp),%edx - leal 128(%ebx),%esi +.data + .align 8 +plus_minus_3dnow: .long 0x00000000, 0x80000000 +costab: + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 - / femms +.text + + .align 16 + +.globl dct64_MMX_3dnow +dct64_MMX_3dnow: + pushl %ebx + pushl %esi + pushl %edi + subl $256,%esp + movl 280(%esp),%eax - // 1 - movl pnts,%eax - movq 0(%edi),%mm0 - movq %mm0,%mm1 - movd 124(%edi),%mm2 - punpckldq 120(%edi),%mm2 - movq 0(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,0(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,124(%ebx) - psrlq $32,%mm1 - movd %mm1,120(%ebx) - movq 8(%edi),%mm4 - movq %mm4,%mm5 - movd 116(%edi),%mm6 - punpckldq 112(%edi),%mm6 - movq 8(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,8(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,116(%ebx) - psrlq $32,%mm5 - movd %mm5,112(%ebx) - movq 16(%edi),%mm0 - movq %mm0,%mm1 - movd 108(%edi),%mm2 - punpckldq 104(%edi),%mm2 - movq 16(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,16(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,108(%ebx) - psrlq $32,%mm1 - movd %mm1,104(%ebx) - movq 24(%edi),%mm4 - movq %mm4,%mm5 - movd 100(%edi),%mm6 - punpckldq 96(%edi),%mm6 - movq 24(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,24(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,100(%ebx) - psrlq $32,%mm5 - movd %mm5,96(%ebx) - movq 32(%edi),%mm0 - movq %mm0,%mm1 - movd 92(%edi),%mm2 - punpckldq 88(%edi),%mm2 - movq 32(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,32(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,92(%ebx) - psrlq $32,%mm1 - movd %mm1,88(%ebx) - movq 40(%edi),%mm4 - movq %mm4,%mm5 - movd 84(%edi),%mm6 - punpckldq 80(%edi),%mm6 - movq 40(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,40(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,84(%ebx) - psrlq $32,%mm5 - movd %mm5,80(%ebx) - movq 48(%edi),%mm0 - movq %mm0,%mm1 - movd 76(%edi),%mm2 - punpckldq 72(%edi),%mm2 - movq 48(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,48(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,76(%ebx) - psrlq $32,%mm1 - 
movd %mm1,72(%ebx) - movq 56(%edi),%mm4 - movq %mm4,%mm5 - movd 68(%edi),%mm6 - punpckldq 64(%edi),%mm6 - movq 56(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,56(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,68(%ebx) - psrlq $32,%mm5 - movd %mm5,64(%ebx) + leal 128(%esp),%edx + movl 272(%esp),%esi + movl 276(%esp),%edi + movl $costab,%ebx + orl %ecx,%ecx + movl %esp,%ecx + femms +/* Phase 1*/ + movq (%eax), %mm0 + movq 8(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%eax), %mm1 + movq 112(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%edx) + movq %mm4, 8(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul (%ebx), %mm3 + pfmul 8(%ebx), %mm7 + movd %mm3, 124(%edx) + movd %mm7, 116(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 120(%edx) + movd %mm7, 112(%edx) + + movq 16(%eax), %mm0 + movq 24(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%eax), %mm1 + movq 96(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 16(%edx) + movq %mm4, 24(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 16(%ebx), %mm3 + pfmul 24(%ebx), %mm7 + movd %mm3, 108(%edx) + movd %mm7, 100(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 104(%edx) + movd %mm7, 96(%edx) + + movq 32(%eax), %mm0 + movq 40(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 88(%eax), %mm1 + movq 80(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%edx) + movq %mm4, 40(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 32(%ebx), %mm3 + pfmul 40(%ebx), %mm7 + movd %mm3, 92(%edx) + movd %mm7, 84(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 88(%edx) + movd %mm7, 80(%edx) + + movq 48(%eax), %mm0 + movq 56(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 72(%eax), %mm1 + movq 64(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 48(%edx) + movq %mm4, 56(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 48(%ebx), %mm3 + pfmul 56(%ebx), %mm7 + movd %mm3, 76(%edx) + movd %mm7, 68(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 72(%edx) + movd %mm7, 64(%edx) + +/* Phase 2*/ - // 2 - movl pnts+4,%eax - / 0, 14 - movq 0(%ebx),%mm0 - movq %mm0,%mm1 - movd 60(%ebx),%mm2 - punpckldq 56(%ebx),%mm2 - movq 0(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,0(%esi) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,60(%esi) - psrlq $32,%mm1 - movd %mm1,56(%esi) - / 16, 30 - movq 64(%ebx),%mm0 - movq %mm0,%mm1 - movd 124(%ebx),%mm2 - punpckldq 120(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,64(%esi) - pfsubr %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,124(%esi) - psrlq $32,%mm1 - movd %mm1,120(%esi) - movq 8(%ebx),%mm4 - / 2, 12 - movq %mm4,%mm5 - movd 52(%ebx),%mm6 - punpckldq 48(%ebx),%mm6 - movq 8(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,8(%esi) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,52(%esi) - psrlq $32,%mm5 - movd %mm5,48(%esi) - movq 72(%ebx),%mm4 - / 18, 28 - movq %mm4,%mm5 - movd 116(%ebx),%mm6 - punpckldq 112(%ebx),%mm6 - pfadd %mm6,%mm4 - movq %mm4,72(%esi) - 
pfsubr %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,116(%esi) - psrlq $32,%mm5 - movd %mm5,112(%esi) - movq 16(%ebx),%mm0 - / 4, 10 - movq %mm0,%mm1 - movd 44(%ebx),%mm2 - punpckldq 40(%ebx),%mm2 - movq 16(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,16(%esi) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,44(%esi) - psrlq $32,%mm1 - movd %mm1,40(%esi) - movq 80(%ebx),%mm0 - / 20, 26 - movq %mm0,%mm1 - movd 108(%ebx),%mm2 - punpckldq 104(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,80(%esi) - pfsubr %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,108(%esi) - psrlq $32,%mm1 - movd %mm1,104(%esi) - movq 24(%ebx),%mm4 - / 6, 8 - movq %mm4,%mm5 - movd 36(%ebx),%mm6 - punpckldq 32(%ebx),%mm6 - movq 24(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,24(%esi) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,36(%esi) - psrlq $32,%mm5 - movd %mm5,32(%esi) - movq 88(%ebx),%mm4 - / 22, 24 - movq %mm4,%mm5 - movd 100(%ebx),%mm6 - punpckldq 96(%ebx),%mm6 - pfadd %mm6,%mm4 - movq %mm4,88(%esi) - pfsubr %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,100(%esi) - psrlq $32,%mm5 - movd %mm5,96(%esi) + movq (%edx), %mm0 + movq 8(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 56(%edx), %mm1 + movq 48(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%ecx) + movq %mm4, 8(%ecx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 64(%ebx), %mm3 + pfmul 72(%ebx), %mm7 + movd %mm3, 60(%ecx) + movd %mm7, 52(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 56(%ecx) + movd %mm7, 48(%ecx) + + movq 16(%edx), %mm0 + movq 24(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 40(%edx), %mm1 + movq 32(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 16(%ecx) + movq %mm4, 24(%ecx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 80(%ebx), %mm3 + pfmul 88(%ebx), %mm7 + movd %mm3, 44(%ecx) + movd %mm7, 36(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 40(%ecx) + movd %mm7, 32(%ecx) + +/* Phase 3*/ + + movq 64(%edx), %mm0 + movq 72(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%edx), %mm1 + movq 112(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%ecx) + movq %mm4, 72(%ecx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 64(%ebx), %mm3 + pfmul 72(%ebx), %mm7 + movd %mm3, 124(%ecx) + movd %mm7, 116(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 120(%ecx) + movd %mm7, 112(%ecx) + + movq 80(%edx), %mm0 + movq 88(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%edx), %mm1 + movq 96(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 80(%ecx) + movq %mm4, 88(%ecx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 80(%ebx), %mm3 + pfmul 88(%ebx), %mm7 + movd %mm3, 108(%ecx) + movd %mm7, 100(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 104(%ecx) + movd %mm7, 96(%ecx) + +/* Phase 4*/ - // 3 - movl pnts+8,%eax - movq 0(%eax),%mm0 - movq 8(%eax),%mm1 - movq 0(%esi),%mm2 - / 0, 6 - movq %mm2,%mm3 - movd 28(%esi),%mm4 - punpckldq 24(%esi),%mm4 - pfadd %mm4,%mm2 - pfsub %mm4,%mm3 - pfmul 
%mm0,%mm3 - movq %mm2,0(%ebx) - movd %mm3,28(%ebx) - psrlq $32,%mm3 - movd %mm3,24(%ebx) - movq 8(%esi),%mm5 - / 2, 4 - movq %mm5,%mm6 - movd 20(%esi),%mm7 - punpckldq 16(%esi),%mm7 - pfadd %mm7,%mm5 - pfsub %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,8(%ebx) - movd %mm6,20(%ebx) - psrlq $32,%mm6 - movd %mm6,16(%ebx) - movq 32(%esi),%mm2 - / 8, 14 - movq %mm2,%mm3 - movd 60(%esi),%mm4 - punpckldq 56(%esi),%mm4 - pfadd %mm4,%mm2 - pfsubr %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,32(%ebx) - movd %mm3,60(%ebx) - psrlq $32,%mm3 - movd %mm3,56(%ebx) - movq 40(%esi),%mm5 - / 10, 12 - movq %mm5,%mm6 - movd 52(%esi),%mm7 - punpckldq 48(%esi),%mm7 - pfadd %mm7,%mm5 - pfsubr %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,40(%ebx) - movd %mm6,52(%ebx) - psrlq $32,%mm6 - movd %mm6,48(%ebx) - movq 64(%esi),%mm2 - / 16, 22 - movq %mm2,%mm3 - movd 92(%esi),%mm4 - punpckldq 88(%esi),%mm4 - pfadd %mm4,%mm2 - pfsub %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,64(%ebx) - movd %mm3,92(%ebx) - psrlq $32,%mm3 - movd %mm3,88(%ebx) - movq 72(%esi),%mm5 - / 18, 20 - movq %mm5,%mm6 - movd 84(%esi),%mm7 - punpckldq 80(%esi),%mm7 - pfadd %mm7,%mm5 - pfsub %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,72(%ebx) - movd %mm6,84(%ebx) - psrlq $32,%mm6 - movd %mm6,80(%ebx) - movq 96(%esi),%mm2 - / 24, 30 - movq %mm2,%mm3 - movd 124(%esi),%mm4 - punpckldq 120(%esi),%mm4 - pfadd %mm4,%mm2 - pfsubr %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,96(%ebx) - movd %mm3,124(%ebx) - psrlq $32,%mm3 - movd %mm3,120(%ebx) - movq 104(%esi),%mm5 - / 26, 28 - movq %mm5,%mm6 - movd 116(%esi),%mm7 - punpckldq 112(%esi),%mm7 - pfadd %mm7,%mm5 - pfsubr %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,104(%ebx) - movd %mm6,116(%ebx) - psrlq $32,%mm6 - movd %mm6,112(%ebx) + movq (%ecx), %mm0 + movq 8(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 24(%ecx), %mm1 + movq 16(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%edx) + movq %mm4, 8(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 28(%edx) + movd %mm7, 20(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 24(%edx) + movd %mm7, 16(%edx) + + movq 32(%ecx), %mm0 + movq 40(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 56(%ecx), %mm1 + movq 48(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%edx) + movq %mm4, 40(%edx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 60(%edx) + movd %mm7, 52(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 56(%edx) + movd %mm7, 48(%edx) + + movq 64(%ecx), %mm0 + movq 72(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 88(%ecx), %mm1 + movq 80(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%edx) + movq %mm4, 72(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 92(%edx) + movd %mm7, 84(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 88(%edx) + movd %mm7, 80(%edx) + + movq 96(%ecx), %mm0 + movq 104(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%ecx), %mm1 + movq 112(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 
+ movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 96(%edx) + movq %mm4, 104(%edx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 124(%edx) + movd %mm7, 116(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 120(%edx) + movd %mm7, 112(%edx) + +/* Phase 5 */ + + movq (%edx), %mm0 + movq 16(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 8(%edx), %mm1 + movq 24(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%ecx) + movq %mm4, 16(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 12(%ecx) + movd %mm7, 28(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 8(%ecx) + movd %mm7, 24(%ecx) - // 4 - movl pnts+12,%eax - movq 0(%eax),%mm0 - movq 0(%ebx),%mm1 - / 0 - movq %mm1,%mm2 - movd 12(%ebx),%mm3 - punpckldq 8(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,0(%esi) - movd %mm2,12(%esi) - psrlq $32,%mm2 - movd %mm2,8(%esi) - movq 16(%ebx),%mm4 - / 4 - movq %mm4,%mm5 - movd 28(%ebx),%mm6 - punpckldq 24(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,16(%esi) - movd %mm5,28(%esi) - psrlq $32,%mm5 - movd %mm5,24(%esi) - movq 32(%ebx),%mm1 - / 8 - movq %mm1,%mm2 - movd 44(%ebx),%mm3 - punpckldq 40(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,32(%esi) - movd %mm2,44(%esi) - psrlq $32,%mm2 - movd %mm2,40(%esi) - movq 48(%ebx),%mm4 - / 12 - movq %mm4,%mm5 - movd 60(%ebx),%mm6 - punpckldq 56(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,48(%esi) - movd %mm5,60(%esi) - psrlq $32,%mm5 - movd %mm5,56(%esi) - movq 64(%ebx),%mm1 - / 16 - movq %mm1,%mm2 - movd 76(%ebx),%mm3 - punpckldq 72(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,64(%esi) - movd %mm2,76(%esi) - psrlq $32,%mm2 - movd %mm2,72(%esi) - movq 80(%ebx),%mm4 - / 20 - movq %mm4,%mm5 - movd 92(%ebx),%mm6 - punpckldq 88(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,80(%esi) - movd %mm5,92(%esi) - psrlq $32,%mm5 - movd %mm5,88(%esi) - movq 96(%ebx),%mm1 - / 24 - movq %mm1,%mm2 - movd 108(%ebx),%mm3 - punpckldq 104(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,96(%esi) - movd %mm2,108(%esi) - psrlq $32,%mm2 - movd %mm2,104(%esi) - movq 112(%ebx),%mm4 - / 28 - movq %mm4,%mm5 - movd 124(%ebx),%mm6 - punpckldq 120(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,112(%esi) - movd %mm5,124(%esi) - psrlq $32,%mm5 - movd %mm5,120(%esi) + movq 32(%edx), %mm0 + movq 48(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 40(%edx), %mm1 + movq 56(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%ecx) + movq %mm4, 48(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 44(%ecx) + movd %mm7, 60(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 40(%ecx) + movd %mm7, 56(%ecx) + + movq 64(%edx), %mm0 + movq 80(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 72(%edx), %mm1 + movq 88(%edx), %mm5 + /* n.b.: pswapd*/ + 
movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%ecx) + movq %mm4, 80(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 76(%ecx) + movd %mm7, 92(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 72(%ecx) + movd %mm7, 88(%ecx) + + movq 96(%edx), %mm0 + movq 112(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%edx), %mm1 + movq 120(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 96(%ecx) + movq %mm4, 112(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 108(%ecx) + movd %mm7, 124(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 104(%ecx) + movd %mm7, 120(%ecx) + +/* Phase 6. This is the end of easy road. */ + movl $1, %eax + movd %eax, %mm7 + pi2fd %mm7, %mm7 + movq 32(%ecx), %mm0 + punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */ + movq %mm0, %mm1 + movq plus_minus_3dnow, %mm6 + /* n.b.: pfpnacc */ + pxor %mm6, %mm1 + pfacc %mm1, %mm0 + /**/ + pfmul %mm7, %mm0 + movq %mm0, 32(%edx) + femms + + flds 44(%ecx) + fsubs 40(%ecx) + fmuls 120(%ebx) + + fsts 44(%edx) + fadds 40(%ecx) /* pfacc 40(ecx), 56(%ecx) */ + fadds 44(%ecx) + fstps 40(%edx) + + flds 48(%ecx) + fsubs 52(%ecx) + fmuls 120(%ebx) + + flds 60(%ecx) + fsubs 56(%ecx) + fmuls 120(%ebx) + + fld %st(0) + fadds 56(%ecx) + fadds 60(%ecx) + + fld %st(0) + fadds 48(%ecx) + fadds 52(%ecx) + fstps 48(%edx) + fadd %st(2) + fstps 56(%edx) + fsts 60(%edx) + faddp %st(1) + fstps 52(%edx) +/*---*/ + flds 64(%ecx) + fadds 68(%ecx) + fstps 64(%edx) + + flds 64(%ecx) + fsubs 68(%ecx) + fmuls 120(%ebx) + fstps 68(%edx) + + flds 76(%ecx) + fsubs 72(%ecx) + fmuls 120(%ebx) + fsts 76(%edx) + fadds 72(%ecx) + fadds 76(%ecx) + fstps 72(%edx) + + flds 92(%ecx) + fsubs 88(%ecx) + fmuls 120(%ebx) + fsts 92(%edx) + fadds 92(%ecx) + fadds 88(%ecx) + + fld %st(0) + fadds 80(%ecx) + fadds 84(%ecx) + fstps 80(%edx) + + flds 80(%ecx) + fsubs 84(%ecx) + fmuls 120(%ebx) + fadd %st(0), %st(1) + fadds 92(%edx) + fstps 84(%edx) + fstps 88(%edx) + + flds 96(%ecx) + fadds 100(%ecx) + fstps 96(%edx) + + flds 96(%ecx) + fsubs 100(%ecx) + fmuls 120(%ebx) + fstps 100(%edx) + + flds 108(%ecx) + fsubs 104(%ecx) + fmuls 120(%ebx) + fsts 108(%edx) + fadds 104(%ecx) + fadds 108(%ecx) + fstps 104(%edx) + + flds 124(%ecx) + fsubs 120(%ecx) + fmuls 120(%ebx) + fsts 124(%edx) + fadds 120(%ecx) + fadds 124(%ecx) + + fld %st(0) + fadds 112(%ecx) + fadds 116(%ecx) + fstps 112(%edx) + + flds 112(%ecx) + fsubs 116(%ecx) + fmuls 120(%ebx) + fadd %st(0),%st(1) + fadds 124(%edx) + fstps 116(%edx) + fstps 120(%edx) + jnz .L01 + +/* Phase 7*/ + + flds (%ecx) + fadds 4(%ecx) + fstps 1024(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + fsts (%esi) + fstps (%edi) + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fsts 512(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fstps 512(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) - // 5 - movl $-1,%eax - movd %eax,%mm1 - movl $1,%eax - movd %eax,%mm0 - / L | H - punpckldq %mm1,%mm0 - pi2fd %mm0,%mm0 - / 1.0 | -1.0 - movd %eax,%mm1 - pi2fd %mm1,%mm1 - movl pnts+16,%eax - movd 0(%eax),%mm2 - punpckldq %mm2,%mm1 - / 1.0 | cos0 - movq 0(%esi),%mm2 - / 0 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 
- movq %mm2,0(%ebx) - movq 8(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,8(%ebx) - movq 16(%esi),%mm2 - / 4 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 24(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,16(%ebx) - movq %mm4,24(%ebx) - movq 32(%esi),%mm2 - / 8 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,32(%ebx) - movq 40(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,40(%ebx) - movq 48(%esi),%mm2 - / 12 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 56(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,48(%ebx) - movq %mm4,56(%ebx) - movq 64(%esi),%mm2 - / 16 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,64(%ebx) - movq 72(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,72(%ebx) - movq 80(%esi),%mm2 - / 20 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 88(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,80(%ebx) - movq %mm4,88(%ebx) - movq 96(%esi),%mm2 - / 24 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,96(%ebx) - movq 104(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,104(%ebx) - movq 112(%esi),%mm2 - / 28 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 120(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,112(%ebx) - movq %mm4,120(%ebx) + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fsts 768(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fstps 768(%esi) + fadd %st(2) + fstps 256(%esi) + faddp %st(1) + fstps 256(%edi) + +/* Phase 8*/ + + flds 32(%edx) + fadds 48(%edx) + fstps 896(%esi) + + flds 48(%edx) + fadds 40(%edx) + fstps 640(%esi) + + flds 40(%edx) + fadds 56(%edx) + fstps 384(%esi) + + flds 56(%edx) + fadds 36(%edx) + fstps 128(%esi) + + flds 36(%edx) + fadds 52(%edx) + fstps 128(%edi) + + flds 52(%edx) + fadds 44(%edx) + fstps 384(%edi) + + flds 60(%edx) + fsts 896(%edi) + fadds 44(%edx) + fstps 640(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fstps 960(%esi) + fadds 80(%edx) + fstps 832(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fstps 704(%esi) + fadds 72(%edx) + fstps 576(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 
72(%edx) + fstps 448(%esi) + fadds 88(%edx) + fstps 320(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fstps 192(%esi) + fadds 68(%edx) + fstps 64(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fstps 64(%edi) + fadds 84(%edx) + fstps 192(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fstps 320(%edi) + fadds 76(%edx) + fstps 448(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fstps 576(%edi) + fadds 92(%edx) + fstps 704(%edi) + + flds 124(%edx) + fsts 960(%edi) + fadds 92(%edx) + fstps 832(%edi) + jmp .L_bye +.L01: +/* Phase 9*/ + + flds (%ecx) + fadds 4(%ecx) + fistp 512(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + + fistp (%esi) + - // Phase6 - movl 0(%ebx),%eax - movl %eax,1024(%ebp) - movl 4(%ebx),%eax - movl %eax,0(%ebp) - movl %eax,0(%edx) - movl 8(%ebx),%eax - movl %eax,512(%ebp) - movl 12(%ebx),%eax - movl %eax,512(%edx) + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fist 256(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fistp 256(%esi) - movl 16(%ebx),%eax - movl %eax,768(%ebp) - movl 20(%ebx),%eax - movl %eax,256(%edx) - - movl 24(%ebx),%eax - movl %eax,256(%ebp) - movl 28(%ebx),%eax - movl %eax,768(%edx) + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) - movq 32(%ebx),%mm0 - movq 48(%ebx),%mm1 - pfadd %mm1,%mm0 - movd %mm0,896(%ebp) - psrlq $32,%mm0 - movd %mm0,128(%edx) - movq 40(%ebx),%mm2 - pfadd %mm2,%mm1 - movd %mm1,640(%ebp) - psrlq $32,%mm1 - movd %mm1,384(%edx) + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fist 384(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fistp 384(%esi) + fadd %st(2) + fistp 128(%esi) + faddp %st(1) + fistp 128(%edi) + +/* Phase 10*/ + + flds 32(%edx) + fadds 48(%edx) + fistp 448(%esi) - movq 56(%ebx),%mm3 - pfadd %mm3,%mm2 - movd %mm2,384(%ebp) - psrlq $32,%mm2 - movd %mm2,640(%edx) + flds 48(%edx) + fadds 40(%edx) + fistp 320(%esi) + + flds 40(%edx) + fadds 56(%edx) + fistp 192(%esi) + + flds 56(%edx) + fadds 36(%edx) + fistp 64(%esi) - movd 36(%ebx),%mm4 - pfadd %mm4,%mm3 - movd %mm3,128(%ebp) - psrlq $32,%mm3 - movd %mm3,896(%edx) - movq 96(%ebx),%mm0 - movq 64(%ebx),%mm1 + flds 36(%edx) + fadds 52(%edx) + fistp 64(%edi) + + flds 52(%edx) + fadds 44(%edx) + fistp 192(%edi) + + flds 60(%edx) + fist 448(%edi) + fadds 44(%edx) + fistp 320(%edi) - movq 112(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,%mm3 - pfadd %mm1,%mm3 - movd %mm3,960(%ebp) - psrlq $32,%mm3 - movd %mm3,64(%edx) - movq 80(%ebx),%mm1 - pfadd %mm1,%mm0 - movd %mm0,832(%ebp) - psrlq $32,%mm0 - movd %mm0,192(%edx) - movq 104(%ebx),%mm3 - pfadd %mm3,%mm2 - movq %mm2,%mm4 - pfadd %mm1,%mm4 - movd %mm4,704(%ebp) - psrlq $32,%mm4 - movd %mm4,320(%edx) - movq 72(%ebx),%mm1 - pfadd %mm1,%mm2 - movd %mm2,576(%ebp) - psrlq $32,%mm2 - movd %mm2,448(%edx) + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fistp 480(%esi) + fadds 80(%edx) + fistp 416(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fistp 352(%esi) + fadds 72(%edx) + fistp 288(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fistp 224(%esi) + fadds 88(%edx) + fistp 160(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fistp 96(%esi) + fadds 68(%edx) + fistp 32(%esi) - movq 120(%ebx),%mm4 - pfadd %mm4,%mm3 - movq %mm3,%mm5 - pfadd %mm1,%mm5 - movd %mm5,448(%ebp) - psrlq $32,%mm5 - movd %mm5,576(%edx) - movq 88(%ebx),%mm1 - pfadd %mm1,%mm3 - movd 
%mm3,320(%ebp) - psrlq $32,%mm3 - movd %mm3,704(%edx) + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fistp 32(%edi) + fadds 84(%edx) + fistp 96(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fistp 160(%edi) + fadds 76(%edx) + fistp 224(%edi) - movd 100(%ebx),%mm5 - pfadd %mm5,%mm4 - movq %mm4,%mm6 - pfadd %mm1,%mm6 - movd %mm6,192(%ebp) - psrlq $32,%mm6 - movd %mm6,832(%edx) - movd 68(%ebx),%mm1 - pfadd %mm1,%mm4 - movd %mm4,64(%ebp) - psrlq $32,%mm4 - movd %mm4,960(%edx) + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fistp 288(%edi) + fadds 92(%edx) + fistp 352(%edi) - / femms + flds 124(%edx) + fist 480(%edi) + fadds 92(%edx) + fistp 416(%edi) + movsw +.L_bye: + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret + - popl %ebx - popl %esi - popl %edi - popl %ebp - addl $256,%esp - - ret -
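Throughout the rewritten dct64_MMX_3dnow above, the blocks marked `/* n.b.: pswapd*/` emulate the 3DNow!Ext `pswapd` instruction (swap the two 32-bit halves of an MMX register) with a `movq`/`psrlq $32`/`punpckldq` sequence, since the base 3DNow! instruction set does not include it; the dct64_k7.s version further below uses the real opcode instead. The effect of that three-instruction sequence, expressed as a C sketch (the function name is illustrative):

```c
#include <stdint.h>

/* Sketch of the pswapd emulation used above: exchange the low and high
 * 32-bit halves of a 64-bit MMX register value. */
static uint64_t pswapd_emulated(uint64_t mm1)
{
    uint64_t mm2 = mm1;                        /* movq  %mm1, %mm2 */
    mm1 >>= 32;                                /* psrlq $32, %mm1  */
    /* punpckldq %mm2, %mm1: low dword kept, high dword = low dword of mm2 */
    mm1 = (mm1 & 0xffffffffu) | (mm2 << 32);
    return mm1;                                /* halves swapped   */
}
```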
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mp3lib/dct64_MMX.s Fri Jun 29 17:55:35 2001 +0000 @@ -0,0 +1,1028 @@ +# This code was taken from http://www.mpg123.org +# See ChangeLog of mpg123-0.59s-pre.1 for detail +# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> + +.data + .align 4 +costab: + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 + +.text + + .align 16 + +.globl dct64_MMX +dct64_MMX: + pushl %ebx + pushl %esi + pushl %edi + subl $256,%esp + movl 280(%esp),%eax +/* Phase 1*/ + flds (%eax) + leal 128(%esp),%edx + fadds 124(%eax) + movl 272(%esp),%esi + fstps (%edx) + movl 276(%esp),%edi + + flds 4(%eax) + movl $costab,%ebx + fadds 120(%eax) + orl %ecx,%ecx + fstps 4(%edx) + + flds (%eax) + movl %esp,%ecx + fsubs 124(%eax) + fmuls (%ebx) + fstps 124(%edx) + + flds 4(%eax) + fsubs 120(%eax) + fmuls 4(%ebx) + fstps 120(%edx) + + flds 8(%eax) + fadds 116(%eax) + fstps 8(%edx) + + flds 12(%eax) + fadds 112(%eax) + fstps 12(%edx) + + flds 8(%eax) + fsubs 116(%eax) + fmuls 8(%ebx) + fstps 116(%edx) + + flds 12(%eax) + fsubs 112(%eax) + fmuls 12(%ebx) + fstps 112(%edx) + + flds 16(%eax) + fadds 108(%eax) + fstps 16(%edx) + + flds 20(%eax) + fadds 104(%eax) + fstps 20(%edx) + + flds 16(%eax) + fsubs 108(%eax) + fmuls 16(%ebx) + fstps 108(%edx) + + flds 20(%eax) + fsubs 104(%eax) + fmuls 20(%ebx) + fstps 104(%edx) + + flds 24(%eax) + fadds 100(%eax) + fstps 24(%edx) + + flds 28(%eax) + fadds 96(%eax) + fstps 28(%edx) + + flds 24(%eax) + fsubs 100(%eax) + fmuls 24(%ebx) + fstps 100(%edx) + + flds 28(%eax) + fsubs 96(%eax) + fmuls 28(%ebx) + fstps 96(%edx) + + flds 32(%eax) + fadds 92(%eax) + fstps 32(%edx) + + flds 36(%eax) + fadds 88(%eax) + fstps 36(%edx) + + flds 32(%eax) + fsubs 92(%eax) + fmuls 32(%ebx) + fstps 92(%edx) + + flds 36(%eax) + fsubs 88(%eax) + fmuls 36(%ebx) + fstps 88(%edx) + + flds 40(%eax) + fadds 84(%eax) + fstps 40(%edx) + + flds 44(%eax) + fadds 80(%eax) + fstps 44(%edx) + + flds 40(%eax) + fsubs 84(%eax) + fmuls 40(%ebx) + fstps 84(%edx) + + flds 44(%eax) + fsubs 80(%eax) + fmuls 44(%ebx) + fstps 80(%edx) + + flds 48(%eax) + fadds 76(%eax) + fstps 48(%edx) + + flds 52(%eax) + fadds 72(%eax) + fstps 52(%edx) + + flds 48(%eax) + fsubs 76(%eax) + fmuls 48(%ebx) + fstps 76(%edx) + + flds 52(%eax) + fsubs 72(%eax) + fmuls 52(%ebx) + fstps 72(%edx) + + flds 56(%eax) + fadds 68(%eax) + fstps 56(%edx) + + flds 60(%eax) + fadds 64(%eax) + fstps 60(%edx) + + flds 56(%eax) + fsubs 68(%eax) + fmuls 56(%ebx) + fstps 68(%edx) + + flds 60(%eax) + fsubs 64(%eax) + fmuls 60(%ebx) + fstps 64(%edx) + +/* Phase 2*/ + + flds (%edx) + fadds 60(%edx) + fstps (%ecx) + + flds 4(%edx) + fadds 56(%edx) + fstps 4(%ecx) + + flds (%edx) + fsubs 60(%edx) + fmuls 64(%ebx) + fstps 60(%ecx) + + flds 4(%edx) + fsubs 56(%edx) + fmuls 68(%ebx) + fstps 56(%ecx) + + flds 8(%edx) + fadds 52(%edx) + fstps 8(%ecx) + + flds 12(%edx) + fadds 48(%edx) + fstps 12(%ecx) + + flds 8(%edx) + fsubs 52(%edx) + fmuls 72(%ebx) + fstps 52(%ecx) + + flds 12(%edx) + fsubs 48(%edx) 
+ fmuls 76(%ebx) + fstps 48(%ecx) + + flds 16(%edx) + fadds 44(%edx) + fstps 16(%ecx) + + flds 20(%edx) + fadds 40(%edx) + fstps 20(%ecx) + + flds 16(%edx) + fsubs 44(%edx) + fmuls 80(%ebx) + fstps 44(%ecx) + + flds 20(%edx) + fsubs 40(%edx) + fmuls 84(%ebx) + fstps 40(%ecx) + + flds 24(%edx) + fadds 36(%edx) + fstps 24(%ecx) + + flds 28(%edx) + fadds 32(%edx) + fstps 28(%ecx) + + flds 24(%edx) + fsubs 36(%edx) + fmuls 88(%ebx) + fstps 36(%ecx) + + flds 28(%edx) + fsubs 32(%edx) + fmuls 92(%ebx) + fstps 32(%ecx) + +/* Phase 3*/ + + flds 64(%edx) + fadds 124(%edx) + fstps 64(%ecx) + + flds 68(%edx) + fadds 120(%edx) + fstps 68(%ecx) + + flds 124(%edx) + fsubs 64(%edx) + fmuls 64(%ebx) + fstps 124(%ecx) + + flds 120(%edx) + fsubs 68(%edx) + fmuls 68(%ebx) + fstps 120(%ecx) + + flds 72(%edx) + fadds 116(%edx) + fstps 72(%ecx) + + flds 76(%edx) + fadds 112(%edx) + fstps 76(%ecx) + + flds 116(%edx) + fsubs 72(%edx) + fmuls 72(%ebx) + fstps 116(%ecx) + + flds 112(%edx) + fsubs 76(%edx) + fmuls 76(%ebx) + fstps 112(%ecx) + + flds 80(%edx) + fadds 108(%edx) + fstps 80(%ecx) + + flds 84(%edx) + fadds 104(%edx) + fstps 84(%ecx) + + flds 108(%edx) + fsubs 80(%edx) + fmuls 80(%ebx) + fstps 108(%ecx) + + flds 104(%edx) + fsubs 84(%edx) + fmuls 84(%ebx) + fstps 104(%ecx) + + flds 88(%edx) + fadds 100(%edx) + fstps 88(%ecx) + + flds 92(%edx) + fadds 96(%edx) + fstps 92(%ecx) + + flds 100(%edx) + fsubs 88(%edx) + fmuls 88(%ebx) + fstps 100(%ecx) + + flds 96(%edx) + fsubs 92(%edx) + fmuls 92(%ebx) + fstps 96(%ecx) + +/* Phase 4*/ + + flds (%ecx) + fadds 28(%ecx) + fstps (%edx) + + flds (%ecx) + fsubs 28(%ecx) + fmuls 96(%ebx) + fstps 28(%edx) + + flds 4(%ecx) + fadds 24(%ecx) + fstps 4(%edx) + + flds 4(%ecx) + fsubs 24(%ecx) + fmuls 100(%ebx) + fstps 24(%edx) + + flds 8(%ecx) + fadds 20(%ecx) + fstps 8(%edx) + + flds 8(%ecx) + fsubs 20(%ecx) + fmuls 104(%ebx) + fstps 20(%edx) + + flds 12(%ecx) + fadds 16(%ecx) + fstps 12(%edx) + + flds 12(%ecx) + fsubs 16(%ecx) + fmuls 108(%ebx) + fstps 16(%edx) + + flds 32(%ecx) + fadds 60(%ecx) + fstps 32(%edx) + + flds 60(%ecx) + fsubs 32(%ecx) + fmuls 96(%ebx) + fstps 60(%edx) + + flds 36(%ecx) + fadds 56(%ecx) + fstps 36(%edx) + + flds 56(%ecx) + fsubs 36(%ecx) + fmuls 100(%ebx) + fstps 56(%edx) + + flds 40(%ecx) + fadds 52(%ecx) + fstps 40(%edx) + + flds 52(%ecx) + fsubs 40(%ecx) + fmuls 104(%ebx) + fstps 52(%edx) + + flds 44(%ecx) + fadds 48(%ecx) + fstps 44(%edx) + + flds 48(%ecx) + fsubs 44(%ecx) + fmuls 108(%ebx) + fstps 48(%edx) + + flds 64(%ecx) + fadds 92(%ecx) + fstps 64(%edx) + + flds 64(%ecx) + fsubs 92(%ecx) + fmuls 96(%ebx) + fstps 92(%edx) + + flds 68(%ecx) + fadds 88(%ecx) + fstps 68(%edx) + + flds 68(%ecx) + fsubs 88(%ecx) + fmuls 100(%ebx) + fstps 88(%edx) + + flds 72(%ecx) + fadds 84(%ecx) + fstps 72(%edx) + + flds 72(%ecx) + fsubs 84(%ecx) + fmuls 104(%ebx) + fstps 84(%edx) + + flds 76(%ecx) + fadds 80(%ecx) + fstps 76(%edx) + + flds 76(%ecx) + fsubs 80(%ecx) + fmuls 108(%ebx) + fstps 80(%edx) + + flds 96(%ecx) + fadds 124(%ecx) + fstps 96(%edx) + + flds 124(%ecx) + fsubs 96(%ecx) + fmuls 96(%ebx) + fstps 124(%edx) + + flds 100(%ecx) + fadds 120(%ecx) + fstps 100(%edx) + + flds 120(%ecx) + fsubs 100(%ecx) + fmuls 100(%ebx) + fstps 120(%edx) + + flds 104(%ecx) + fadds 116(%ecx) + fstps 104(%edx) + + flds 116(%ecx) + fsubs 104(%ecx) + fmuls 104(%ebx) + fstps 116(%edx) + + flds 108(%ecx) + fadds 112(%ecx) + fstps 108(%edx) + + flds 112(%ecx) + fsubs 108(%ecx) + fmuls 108(%ebx) + fstps 112(%edx) + + flds (%edx) + fadds 12(%edx) + fstps (%ecx) + + flds 
(%edx) + fsubs 12(%edx) + fmuls 112(%ebx) + fstps 12(%ecx) + + flds 4(%edx) + fadds 8(%edx) + fstps 4(%ecx) + + flds 4(%edx) + fsubs 8(%edx) + fmuls 116(%ebx) + fstps 8(%ecx) + + flds 16(%edx) + fadds 28(%edx) + fstps 16(%ecx) + + flds 28(%edx) + fsubs 16(%edx) + fmuls 112(%ebx) + fstps 28(%ecx) + + flds 20(%edx) + fadds 24(%edx) + fstps 20(%ecx) + + flds 24(%edx) + fsubs 20(%edx) + fmuls 116(%ebx) + fstps 24(%ecx) + + flds 32(%edx) + fadds 44(%edx) + fstps 32(%ecx) + + flds 32(%edx) + fsubs 44(%edx) + fmuls 112(%ebx) + fstps 44(%ecx) + + flds 36(%edx) + fadds 40(%edx) + fstps 36(%ecx) + + flds 36(%edx) + fsubs 40(%edx) + fmuls 116(%ebx) + fstps 40(%ecx) + + flds 48(%edx) + fadds 60(%edx) + fstps 48(%ecx) + + flds 60(%edx) + fsubs 48(%edx) + fmuls 112(%ebx) + fstps 60(%ecx) + + flds 52(%edx) + fadds 56(%edx) + fstps 52(%ecx) + + flds 56(%edx) + fsubs 52(%edx) + fmuls 116(%ebx) + fstps 56(%ecx) + + flds 64(%edx) + fadds 76(%edx) + fstps 64(%ecx) + + flds 64(%edx) + fsubs 76(%edx) + fmuls 112(%ebx) + fstps 76(%ecx) + + flds 68(%edx) + fadds 72(%edx) + fstps 68(%ecx) + + flds 68(%edx) + fsubs 72(%edx) + fmuls 116(%ebx) + fstps 72(%ecx) + + flds 80(%edx) + fadds 92(%edx) + fstps 80(%ecx) + + flds 92(%edx) + fsubs 80(%edx) + fmuls 112(%ebx) + fstps 92(%ecx) + + flds 84(%edx) + fadds 88(%edx) + fstps 84(%ecx) + + flds 88(%edx) + fsubs 84(%edx) + fmuls 116(%ebx) + fstps 88(%ecx) + + flds 96(%edx) + fadds 108(%edx) + fstps 96(%ecx) + + flds 96(%edx) + fsubs 108(%edx) + fmuls 112(%ebx) + fstps 108(%ecx) + + flds 100(%edx) + fadds 104(%edx) + fstps 100(%ecx) + + flds 100(%edx) + fsubs 104(%edx) + fmuls 116(%ebx) + fstps 104(%ecx) + + flds 112(%edx) + fadds 124(%edx) + fstps 112(%ecx) + + flds 124(%edx) + fsubs 112(%edx) + fmuls 112(%ebx) + fstps 124(%ecx) + + flds 116(%edx) + fadds 120(%edx) + fstps 116(%ecx) + + flds 120(%edx) + fsubs 116(%edx) + fmuls 116(%ebx) + fstps 120(%ecx) + +/* Phase 5*/ + + flds 32(%ecx) + fadds 36(%ecx) + fstps 32(%edx) + + flds 32(%ecx) + fsubs 36(%ecx) + fmuls 120(%ebx) + fstps 36(%edx) + + flds 44(%ecx) + fsubs 40(%ecx) + fmuls 120(%ebx) + fsts 44(%edx) + fadds 40(%ecx) + fadds 44(%ecx) + fstps 40(%edx) + + flds 48(%ecx) + fsubs 52(%ecx) + fmuls 120(%ebx) + + flds 60(%ecx) + fsubs 56(%ecx) + fmuls 120(%ebx) + fld %st(0) + fadds 56(%ecx) + fadds 60(%ecx) + fld %st(0) + fadds 48(%ecx) + fadds 52(%ecx) + fstps 48(%edx) + fadd %st(2) + fstps 56(%edx) + fsts 60(%edx) + faddp %st(1) + fstps 52(%edx) + + flds 64(%ecx) + fadds 68(%ecx) + fstps 64(%edx) + + flds 64(%ecx) + fsubs 68(%ecx) + fmuls 120(%ebx) + fstps 68(%edx) + + flds 76(%ecx) + fsubs 72(%ecx) + fmuls 120(%ebx) + fsts 76(%edx) + fadds 72(%ecx) + fadds 76(%ecx) + fstps 72(%edx) + + flds 92(%ecx) + fsubs 88(%ecx) + fmuls 120(%ebx) + fsts 92(%edx) + fadds 92(%ecx) + fadds 88(%ecx) + fld %st(0) + fadds 80(%ecx) + fadds 84(%ecx) + fstps 80(%edx) + + flds 80(%ecx) + fsubs 84(%ecx) + fmuls 120(%ebx) + fadd %st(0), %st(1) + fadds 92(%edx) + fstps 84(%edx) + fstps 88(%edx) + + flds 96(%ecx) + fadds 100(%ecx) + fstps 96(%edx) + + flds 96(%ecx) + fsubs 100(%ecx) + fmuls 120(%ebx) + fstps 100(%edx) + + flds 108(%ecx) + fsubs 104(%ecx) + fmuls 120(%ebx) + fsts 108(%edx) + fadds 104(%ecx) + fadds 108(%ecx) + fstps 104(%edx) + + flds 124(%ecx) + fsubs 120(%ecx) + fmuls 120(%ebx) + fsts 124(%edx) + fadds 120(%ecx) + fadds 124(%ecx) + fld %st(0) + fadds 112(%ecx) + fadds 116(%ecx) + fstps 112(%edx) + + flds 112(%ecx) + fsubs 116(%ecx) + fmuls 120(%ebx) + fadd %st(0),%st(1) + fadds 124(%edx) + fstps 116(%edx) + fstps 120(%edx) + jnz 
.L01 + +/* Phase 6*/ + + flds (%ecx) + fadds 4(%ecx) + fstps 1024(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + fsts (%esi) + fstps (%edi) + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fsts 512(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fstps 512(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fsts 768(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fstps 768(%esi) + fadd %st(2) + fstps 256(%esi) + faddp %st(1) + fstps 256(%edi) + +/* Phase 7*/ + + flds 32(%edx) + fadds 48(%edx) + fstps 896(%esi) + + flds 48(%edx) + fadds 40(%edx) + fstps 640(%esi) + + flds 40(%edx) + fadds 56(%edx) + fstps 384(%esi) + + flds 56(%edx) + fadds 36(%edx) + fstps 128(%esi) + + flds 36(%edx) + fadds 52(%edx) + fstps 128(%edi) + + flds 52(%edx) + fadds 44(%edx) + fstps 384(%edi) + + flds 60(%edx) + fsts 896(%edi) + fadds 44(%edx) + fstps 640(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fstps 960(%esi) + fadds 80(%edx) + fstps 832(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fstps 704(%esi) + fadds 72(%edx) + fstps 576(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fstps 448(%esi) + fadds 88(%edx) + fstps 320(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fstps 192(%esi) + fadds 68(%edx) + fstps 64(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fstps 64(%edi) + fadds 84(%edx) + fstps 192(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fstps 320(%edi) + fadds 76(%edx) + fstps 448(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fstps 576(%edi) + fadds 92(%edx) + fstps 704(%edi) + + flds 124(%edx) + fsts 960(%edi) + fadds 92(%edx) + fstps 832(%edi) + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret +.L01: +/* Phase 8*/ + + flds (%ecx) + fadds 4(%ecx) + fistp 512(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + + fistp (%esi) + + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fist 256(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fistp 256(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fist 384(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fistp 384(%esi) + fadd %st(2) + fistp 128(%esi) + faddp %st(1) + fistp 128(%edi) + +/* Phase 9*/ + + flds 32(%edx) + fadds 48(%edx) + fistp 448(%esi) + + flds 48(%edx) + fadds 40(%edx) + fistp 320(%esi) + + flds 40(%edx) + fadds 56(%edx) + fistp 192(%esi) + + flds 56(%edx) + fadds 36(%edx) + fistp 64(%esi) + + flds 36(%edx) + fadds 52(%edx) + fistp 64(%edi) + + flds 52(%edx) + fadds 44(%edx) + fistp 192(%edi) + + flds 60(%edx) + fist 448(%edi) + fadds 44(%edx) + fistp 320(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fistp 480(%esi) + fadds 80(%edx) + fistp 416(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fistp 352(%esi) + fadds 72(%edx) + fistp 288(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fistp 224(%esi) + fadds 88(%edx) + fistp 160(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fistp 96(%esi) + fadds 68(%edx) + fistp 32(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fistp 32(%edi) + fadds 84(%edx) + fistp 96(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + 
fadds 84(%edx) + fistp 160(%edi) + fadds 76(%edx) + fistp 224(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fistp 288(%edi) + fadds 92(%edx) + fistp 352(%edi) + + flds 124(%edx) + fist 480(%edi) + fadds 92(%edx) + fistp 416(%edi) + movsw + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret + +
--- a/mp3lib/dct64_k7.s Fri Jun 29 10:54:41 2001 +0000 +++ b/mp3lib/dct64_k7.s Fri Jun 29 17:55:35 2001 +0000 @@ -1,677 +1,804 @@ -/// -/// Replacement of dct64() with AMD's 3DNowEx(DSP)! SIMD operations support -/// -/// This code based 'dct64_3dnow.s' by Syuuhei Kashiyama -/// <squash@mb.kcom.ne.jp>,only some types of changes have been made: -/// -/// - added new opcodes PSWAPD, PFPNACC -/// - decreased number of opcodes (as it was suggested by k7 manual) -/// (using memory reference as operand of instructions) -/// - Phase 6 is rewritten with mixing of cpu and mmx opcodes -/// - change function name for support 3DNowEx! automatic detect -/// - negation of 3dnow reg was replaced with PXOR 0x800000000, MMi instead -/// of PFMUL as it was suggested by athlon manual. (Two not separated PFMUL -/// can not be paired, but PXOR can be). -/// -/// note: because K7 processors are an aggresive out-of-order three-way -/// superscalar ones instruction order is not significand for them. -/// -/// Modified by Nick Kurshev <nickols_k@mail.ru> -/// -/// The author of this program disclaim whole expressed or implied -/// warranties with regard to this program, and in no event shall the -/// author of this program liable to whatever resulted from the use of -/// this program. Use it at your own risk. -/// +# This code was taken from http://www.mpg123.org +# See ChangeLog of mpg123-0.59s-pre.1 for detail +# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> +# Partial 3dnowex-DSP! optimization by Nick Kurshev +# +# TODO: finish 3dnow! optimization at least in scalar mode +# .data - .align 8 + .align 8 plus_minus_3dnow: .long 0x00000000, 0x80000000 +costab: + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 .text - .globl dct64_3dnowex - .type dct64_3dnowex,@function + + .align 16 + +.globl dct64_MMX_3dnowex +dct64_MMX_3dnowex: + pushl %ebx + pushl %esi + pushl %edi + subl $256,%esp + movl 280(%esp),%eax -/* Discrete Cosine Tansform (DCT) for subband synthesis */ -/* void dct64(real *a,real *b,real *c) */ -dct64_3dnowex: - subl $256,%esp - pushl %ebp - pushl %edi - pushl %esi - pushl %ebx - leal 16(%esp),%ebx /* ebx -> real tmp1[32] */ - movl 284(%esp),%edi /* edi -> c */ - movl 276(%esp),%ebp /* ebp -> a */ - movl 280(%esp),%edx /* edx -> b */ - leal 128(%ebx),%esi /* esi -> real tmp2[32] */ - - / femms - - // 1 - movl pnts,%eax + leal 128(%esp),%edx + movl 272(%esp),%esi + movl 276(%esp),%edi + movl $costab,%ebx + orl %ecx,%ecx + movl %esp,%ecx + femms +/* Phase 1*/ + movq (%eax), %mm0 + movq 8(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%eax), %mm1 + movq 112(%eax), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%edx) + movq %mm4, 8(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul (%ebx), %mm3 + pfmul 8(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 120(%edx) + movq %mm7, 112(%edx) - movq 0(%edi),%mm0 /* mm0 = c[0x00] | c[0x01]*/ - movq %mm0,%mm1 /* mm1 = mm0 */ - movd 124(%edi),%mm2 /* 
mm2 = c[0x1f] */ - punpckldq 120(%edi),%mm2 /* mm2 = c[0x1f] | c[0x1E] */ - pfadd %mm2,%mm0 /* mm0 = c[0x00]+c[0x1F] | c[0x1E]+c[0x01] */ - movq %mm0,0(%ebx) /* tmp[0, 1] = mm0 */ - pfsub %mm2,%mm1 /* c[0x00]-c[0x1f] | c[0x01]-c[0x1e] */ - pfmul 0(%eax),%mm1 /* (c[0x00]-c[0x1f])*pnts[0]|(c[0x01]-c[0x1e])*pnts[1]*/ - pswapd %mm1, %mm1 /* (c[0x01]-c[0x1e])*pnts[1]|(c[0x00]-c[0x1f])*pnts[0]*/ - movq %mm1, 120(%ebx) /* tmp1[30, 31]=mm1 */ + movq 16(%eax), %mm0 + movq 24(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%eax), %mm1 + movq 96(%eax), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 16(%edx) + movq %mm4, 24(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 16(%ebx), %mm3 + pfmul 24(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 104(%edx) + movq %mm7, 96(%edx) - movq 8(%edi),%mm4 - movq %mm4,%mm5 - movd 116(%edi),%mm6 - punpckldq 112(%edi),%mm6 - pfadd %mm6,%mm4 - movq %mm4,8(%ebx) - pfsub %mm6,%mm5 - pfmul 8(%eax),%mm5 - pswapd %mm5, %mm5 - movq %mm5, 112(%ebx) - - movq 16(%edi),%mm0 - movq %mm0,%mm1 - movd 108(%edi),%mm2 - punpckldq 104(%edi),%mm2 - pfadd %mm2,%mm0 - movq %mm0,16(%ebx) - pfsub %mm2,%mm1 - pfmul 16(%eax),%mm1 - pswapd %mm1, %mm1 - movq %mm1, 104(%ebx) + movq 32(%eax), %mm0 + movq 40(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 88(%eax), %mm1 + movq 80(%eax), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%edx) + movq %mm4, 40(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 32(%ebx), %mm3 + pfmul 40(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 88(%edx) + movq %mm7, 80(%edx) - movq 24(%edi),%mm4 - movq %mm4,%mm5 - movd 100(%edi),%mm6 - punpckldq 96(%edi),%mm6 - pfadd %mm6,%mm4 - movq %mm4,24(%ebx) - pfsub %mm6,%mm5 - pfmul 24(%eax),%mm5 - pswapd %mm5, %mm5 - movq %mm5, 96(%ebx) + movq 48(%eax), %mm0 + movq 56(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 72(%eax), %mm1 + movq 64(%eax), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 48(%edx) + movq %mm4, 56(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 48(%ebx), %mm3 + pfmul 56(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 72(%edx) + movq %mm7, 64(%edx) - movq 32(%edi),%mm0 - movq %mm0,%mm1 - movd 92(%edi),%mm2 - punpckldq 88(%edi),%mm2 - pfadd %mm2,%mm0 - movq %mm0,32(%ebx) - pfsub %mm2,%mm1 - pfmul 32(%eax),%mm1 - pswapd %mm1, %mm1 - movq %mm1, 88(%ebx) +/* Phase 2*/ - movq 40(%edi),%mm4 - movq %mm4,%mm5 - movd 84(%edi),%mm6 - punpckldq 80(%edi),%mm6 - pfadd %mm6,%mm4 - movq %mm4,40(%ebx) - pfsub %mm6,%mm5 - pfmul 40(%eax),%mm5 - pswapd %mm5, %mm5 - movq %mm5, 80(%ebx) + movq (%edx), %mm0 + movq 8(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 56(%edx), %mm1 + movq 48(%edx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%ecx) + movq %mm4, 8(%ecx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 64(%ebx), %mm3 + pfmul 72(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 56(%ecx) + movq %mm7, 48(%ecx) + + movq 16(%edx), %mm0 + movq 24(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 40(%edx), %mm1 + movq 32(%edx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 16(%ecx) + movq %mm4, 24(%ecx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 80(%ebx), %mm3 + pfmul 88(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 40(%ecx) + movq %mm7, 
32(%ecx) - movq 48(%edi),%mm0 - movq %mm0,%mm1 - movd 76(%edi),%mm2 - punpckldq 72(%edi),%mm2 - pfadd %mm2,%mm0 - movq %mm0,48(%ebx) - pfsub %mm2,%mm1 - pfmul 48(%eax),%mm1 - pswapd %mm1, %mm1 - movq %mm1, 72(%ebx) +/* Phase 3*/ - movq 56(%edi),%mm4 - movq %mm4,%mm5 - movd 68(%edi),%mm6 - punpckldq 64(%edi),%mm6 - pfadd %mm6,%mm4 - movq %mm4,56(%ebx) - pfsub %mm6,%mm5 - pfmul 56(%eax),%mm5 - pswapd %mm5, %mm5 - movq %mm5, 64(%ebx) + movq 64(%edx), %mm0 + movq 72(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%edx), %mm1 + movq 112(%edx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%ecx) + movq %mm4, 72(%ecx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 64(%ebx), %mm3 + pfmul 72(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 120(%ecx) + movq %mm7, 112(%ecx) - // 2 - movl pnts+4,%eax - / 0, 14 - movq 0(%ebx),%mm0 /* mm0 = tmp1[0] | tmp1[1] */ - movq %mm0,%mm1 - movd 60(%ebx),%mm2 /* mm2 = tmp1[0x0F] */ - punpckldq 56(%ebx),%mm2 /* mm2 = tmp1[0x0E] | tmp1[0x0F] */ - movq 0(%eax),%mm3 /* mm3 = pnts[0] | pnts[1] */ - pfadd %mm2,%mm0 /* mm0 = tmp1[0]+tmp1[0x0F]|tmp1[1]+tmp1[0x0E]*/ - movq %mm0,0(%esi) /* tmp2[0, 1] = mm0 */ - pfsub %mm2,%mm1 /* mm1 = tmp1[0]-tmp1[0x0F]|tmp1[1]-tmp1[0x0E]*/ - pfmul %mm3,%mm1 /* mm1 = (tmp1[0]-tmp1[0x0F])*pnts[0]|(tmp1[1]-tmp1[0x0E])*pnts[1]*/ - pswapd %mm1, %mm1 /* mm1 = (tmp1[1]-tmp1[0x0E])*pnts[1]|(tmp1[0]-tmp1[0x0F])*pnts[0]*/ - movq %mm1, 56(%esi) /* tmp2[0x0E, 0x0F] = mm1 */ - / 16, 30 - movq 64(%ebx),%mm0 - movq %mm0,%mm1 - movd 124(%ebx),%mm2 - punpckldq 120(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,64(%esi) - pfsubr %mm2,%mm1 - pfmul %mm3,%mm1 - pswapd %mm1, %mm1 - movq %mm1, 120(%esi) - movq 8(%ebx),%mm4 - / 2, 12 - movq %mm4,%mm5 - movd 52(%ebx),%mm6 - punpckldq 48(%ebx),%mm6 - movq 8(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,8(%esi) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - pswapd %mm5, %mm5 - movq %mm5, 48(%esi) - movq 72(%ebx),%mm4 - / 18, 28 - movq %mm4,%mm5 - movd 116(%ebx),%mm6 - punpckldq 112(%ebx),%mm6 - pfadd %mm6,%mm4 - movq %mm4,72(%esi) - pfsubr %mm6,%mm5 - pfmul %mm7,%mm5 - pswapd %mm5, %mm5 - movq %mm5, 112(%esi) - movq 16(%ebx),%mm0 - / 4, 10 - movq %mm0,%mm1 - movd 44(%ebx),%mm2 - punpckldq 40(%ebx),%mm2 - movq 16(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,16(%esi) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - pswapd %mm1, %mm1 - movq %mm1, 40(%esi) - movq 80(%ebx),%mm0 - / 20, 26 - movq %mm0,%mm1 - movd 108(%ebx),%mm2 - punpckldq 104(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,80(%esi) - pfsubr %mm2,%mm1 - pfmul %mm3,%mm1 - pswapd %mm1, %mm1 - movq %mm1, 104(%esi) - movq 24(%ebx),%mm4 - / 6, 8 - movq %mm4,%mm5 - movd 36(%ebx),%mm6 - punpckldq 32(%ebx),%mm6 - movq 24(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,24(%esi) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - pswapd %mm5, %mm5 - movq %mm5, 32(%esi) - movq 88(%ebx),%mm4 - / 22, 24 - movq %mm4,%mm5 - movd 100(%ebx),%mm6 - punpckldq 96(%ebx),%mm6 - pfadd %mm6,%mm4 - movq %mm4,88(%esi) - pfsubr %mm6,%mm5 - pfmul %mm7,%mm5 - pswapd %mm5, %mm5 - movq %mm5, 96(%esi) + movq 80(%edx), %mm0 + movq 88(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%edx), %mm1 + movq 96(%edx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 80(%ecx) + movq %mm4, 88(%ecx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 80(%ebx), %mm3 + pfmul 88(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 104(%ecx) + movq %mm7, 96(%ecx) + +/* Phase 4*/ + + movq (%ecx), %mm0 + movq 8(%ecx), 
%mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 24(%ecx), %mm1 + movq 16(%ecx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%edx) + movq %mm4, 8(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 24(%edx) + movq %mm7, 16(%edx) + + movq 32(%ecx), %mm0 + movq 40(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 56(%ecx), %mm1 + movq 48(%ecx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%edx) + movq %mm4, 40(%edx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 56(%edx) + movq %mm7, 48(%edx) + + movq 64(%ecx), %mm0 + movq 72(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 88(%ecx), %mm1 + movq 80(%ecx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%edx) + movq %mm4, 72(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 88(%edx) + movq %mm7, 80(%edx) - // 3 - movl pnts+8,%eax - movq 0(%eax),%mm0 - movq 8(%eax),%mm1 - movq 0(%esi),%mm2 - / 0, 6 - movq %mm2,%mm3 - movd 28(%esi),%mm4 - punpckldq 24(%esi),%mm4 - pfadd %mm4,%mm2 - pfsub %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,0(%ebx) - pswapd %mm3, %mm3 - movq %mm3, 24(%ebx) - movq 8(%esi),%mm5 - / 2, 4 - movq %mm5,%mm6 - movd 20(%esi),%mm7 - punpckldq 16(%esi),%mm7 - pfadd %mm7,%mm5 - pfsub %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,8(%ebx) - pswapd %mm6, %mm6 - movq %mm6, 16(%ebx) - movq 32(%esi),%mm2 - / 8, 14 - movq %mm2,%mm3 - movd 60(%esi),%mm4 - punpckldq 56(%esi),%mm4 - pfadd %mm4,%mm2 - pfsubr %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,32(%ebx) - pswapd %mm3, %mm3 - movq %mm3, 56(%ebx) - movq 40(%esi),%mm5 - / 10, 12 - movq %mm5,%mm6 - movd 52(%esi),%mm7 - punpckldq 48(%esi),%mm7 - pfadd %mm7,%mm5 - pfsubr %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,40(%ebx) - pswapd %mm6, %mm6 - movq %mm6, 48(%ebx) - movq 64(%esi),%mm2 - / 16, 22 - movq %mm2,%mm3 - movd 92(%esi),%mm4 - punpckldq 88(%esi),%mm4 - pfadd %mm4,%mm2 - pfsub %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,64(%ebx) - pswapd %mm3, %mm3 - movq %mm3, 88(%ebx) - movq 72(%esi),%mm5 - / 18, 20 - movq %mm5,%mm6 - movd 84(%esi),%mm7 - punpckldq 80(%esi),%mm7 - pfadd %mm7,%mm5 - pfsub %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,72(%ebx) - pswapd %mm6, %mm6 - movq %mm6, 80(%ebx) - movq 96(%esi),%mm2 - / 24, 30 - movq %mm2,%mm3 - movd 124(%esi),%mm4 - punpckldq 120(%esi),%mm4 - pfadd %mm4,%mm2 - pfsubr %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,96(%ebx) - pswapd %mm3, %mm3 - movq %mm3, 120(%ebx) - movq 104(%esi),%mm5 - / 26, 28 - movq %mm5,%mm6 - movd 116(%esi),%mm7 - punpckldq 112(%esi),%mm7 - pfadd %mm7,%mm5 - pfsubr %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,104(%ebx) - pswapd %mm6, %mm6 - movq %mm6, 112(%ebx) + movq 96(%ecx), %mm0 + movq 104(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%ecx), %mm1 + movq 112(%ecx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 96(%edx) + movq %mm4, 104(%edx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 120(%edx) + movq %mm7, 112(%edx) + +/* Phase 5 */ + + movq (%edx), %mm0 + movq 16(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 8(%edx), %mm1 + movq 24(%edx), %mm5 + pswapd 
%mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%ecx) + movq %mm4, 16(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 8(%ecx) + movq %mm7, 24(%ecx) + + movq 32(%edx), %mm0 + movq 48(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 40(%edx), %mm1 + movq 56(%edx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%ecx) + movq %mm4, 48(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 40(%ecx) + movq %mm7, 56(%ecx) + + movq 64(%edx), %mm0 + movq 80(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 72(%edx), %mm1 + movq 88(%edx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%ecx) + movq %mm4, 80(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 72(%ecx) + movq %mm7, 88(%ecx) + + movq 96(%edx), %mm0 + movq 112(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%edx), %mm1 + movq 120(%edx), %mm5 + pswapd %mm1, %mm1 + pswapd %mm5, %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 96(%ecx) + movq %mm4, 112(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 104(%ecx) + movq %mm7, 120(%ecx) + +/* Phase 6. This is the end of easy road. */ + movl $1, %eax + movd %eax, %mm7 + pi2fd %mm7, %mm7 + movq 32(%ecx), %mm0 + punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */ + movq %mm0, %mm1 + movq plus_minus_3dnow, %mm6 + /* n.b.: pfpnacc */ + pxor %mm6, %mm1 + pfacc %mm1, %mm0 + /**/ + pfmul %mm7, %mm0 + movq %mm0, 32(%edx) + femms - // 4 - movl pnts+12,%eax - movq 0(%eax),%mm0 /* mm0 = pnts[3] | pnts[4] */ - movq 0(%ebx),%mm1 /* mm1 = tmp1[0] | tmp1[1] */ - / 0 - movq %mm1,%mm2 - movd 12(%ebx),%mm3 /* mm3 = tmp1[3] */ - punpckldq 8(%ebx),%mm3 /* mm3 = tmp1[3] | tmp1[2] */ - pfadd %mm3,%mm1 /* mm1 = tmp1[0]+tmp1[3] | tmp1[1]+tmp1[2]*/ - pfsub %mm3,%mm2 /* mm2 = tmp1[0]-tmp1[3] | tmp1[0]-tmp1[2]*/ - pfmul %mm0,%mm2 /* mm2 = tmp1[0]-tmp1[3]*pnts[3]|tmp1[0]-tmp1[2]*pnts[4]*/ - movq %mm1,0(%esi) /* tmp2[0, 1] = mm1 */ - pswapd %mm2, %mm2 /* mm2 = tmp1[0]-tmp1[2]*pnts[4]|tmp1[0]-tmp1[3]*pnts[3] */ - movq %mm2, 8(%esi) /* tmp2[2, 3] = mm2 */ - movq 16(%ebx),%mm4 - / 4 - movq %mm4,%mm5 - movd 28(%ebx),%mm6 - punpckldq 24(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,16(%esi) - pswapd %mm5, %mm5 - movq %mm5, 24(%esi) - movq 32(%ebx),%mm1 - / 8 - movq %mm1,%mm2 - movd 44(%ebx),%mm3 - punpckldq 40(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,32(%esi) - pswapd %mm2, %mm2 - movq %mm2, 40(%esi) - movq 48(%ebx),%mm4 - / 12 - movq %mm4,%mm5 - movd 60(%ebx),%mm6 - punpckldq 56(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,48(%esi) - pswapd %mm5, %mm5 - movq %mm5, 56(%esi) - movq 64(%ebx),%mm1 - / 16 - movq %mm1,%mm2 - movd 76(%ebx),%mm3 - punpckldq 72(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,64(%esi) - pswapd %mm2, %mm2 - movq %mm2, 72(%esi) - movq 80(%ebx),%mm4 - / 20 - movq %mm4,%mm5 - movd 92(%ebx),%mm6 - punpckldq 88(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,80(%esi) - pswapd %mm5, %mm5 - movq %mm5, 88(%esi) - movq 
96(%ebx),%mm1 - / 24 - movq %mm1,%mm2 - movd 108(%ebx),%mm3 - punpckldq 104(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,96(%esi) - pswapd %mm2, %mm2 - movq %mm2, 104(%esi) - movq 112(%ebx),%mm4 - / 28 - movq %mm4,%mm5 - movd 124(%ebx),%mm6 - punpckldq 120(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,112(%esi) - pswapd %mm5, %mm5 - movq %mm5, 120(%esi) + flds 44(%ecx) + fsubs 40(%ecx) + fmuls 120(%ebx) + + fsts 44(%edx) + fadds 40(%ecx) /* pfacc 40(ecx), 56(%ecx) */ + fadds 44(%ecx) + fstps 40(%edx) + + flds 48(%ecx) + fsubs 52(%ecx) + fmuls 120(%ebx) + + flds 60(%ecx) + fsubs 56(%ecx) + fmuls 120(%ebx) + + fld %st(0) + fadds 56(%ecx) + fadds 60(%ecx) + + fld %st(0) + fadds 48(%ecx) + fadds 52(%ecx) + fstps 48(%edx) + fadd %st(2) + fstps 56(%edx) + fsts 60(%edx) + faddp %st(1) + fstps 52(%edx) +/*---*/ + flds 64(%ecx) + fadds 68(%ecx) + fstps 64(%edx) + + flds 64(%ecx) + fsubs 68(%ecx) + fmuls 120(%ebx) + fstps 68(%edx) + + flds 76(%ecx) + fsubs 72(%ecx) + fmuls 120(%ebx) + fsts 76(%edx) + fadds 72(%ecx) + fadds 76(%ecx) + fstps 72(%edx) + + flds 92(%ecx) + fsubs 88(%ecx) + fmuls 120(%ebx) + fsts 92(%edx) + fadds 92(%ecx) + fadds 88(%ecx) + + fld %st(0) + fadds 80(%ecx) + fadds 84(%ecx) + fstps 80(%edx) + + flds 80(%ecx) + fsubs 84(%ecx) + fmuls 120(%ebx) + fadd %st(0), %st(1) + fadds 92(%edx) + fstps 84(%edx) + fstps 88(%edx) + + flds 96(%ecx) + fadds 100(%ecx) + fstps 96(%edx) + + flds 96(%ecx) + fsubs 100(%ecx) + fmuls 120(%ebx) + fstps 100(%edx) + + flds 108(%ecx) + fsubs 104(%ecx) + fmuls 120(%ebx) + fsts 108(%edx) + fadds 104(%ecx) + fadds 108(%ecx) + fstps 104(%edx) + + flds 124(%ecx) + fsubs 120(%ecx) + fmuls 120(%ebx) + fsts 124(%edx) + fadds 120(%ecx) + fadds 124(%ecx) - // 5 - movq plus_minus_3dnow, %mm0 /* mm0 = 1.0 | -1.0 */ - movl $1,%eax - movd %eax,%mm1 - pi2fd %mm1,%mm1 - movl pnts+16,%eax - movd 0(%eax),%mm2 - punpckldq %mm2,%mm1 /* mm1 = 1.0 | cos0 */ - movq 0(%esi),%mm2 /* mm2 = tmp2[0] | tmp2[1] */ - / 0 - pfpnacc %mm2, %mm2 - pswapd %mm2, %mm2 /* mm2 = tmp2[0]+tmp2[1]|tmp2[0]-tmp2[1]*/ - pfmul %mm1,%mm2 /* mm2 = tmp2[0]+tmp2[1]|(tmp2[0]-tmp2[1])*cos0*/ - movq %mm2,0(%ebx) /* tmp1[0, 1] = mm2 */ - movq 8(%esi),%mm4 /* mm4 = tmp2[2] | tmp2[3]*/ - pfpnacc %mm4, %mm4 - pswapd %mm4, %mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/ - pxor %mm0,%mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/ - pfmul %mm1,%mm4 /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/ - movq %mm4,%mm5 - psrlq $32,%mm5 /* mm5 = (tmp2[3]-tmp2[2])*cos0 */ - pfacc %mm5,%mm4 /* mm4 = tmp2[2]+tmp2[3]+(tmp2[3]-tmp2[2])*cos0|(tmp2[3]-tmp2[2])*cos0*/ - movq %mm4,8(%ebx) /* tmp1[2, 3] = mm4 */ - movq 16(%esi),%mm2 - / 4 - pfpnacc %mm2, %mm2 - pswapd %mm2, %mm2 + fld %st(0) + fadds 112(%ecx) + fadds 116(%ecx) + fstps 112(%edx) + + flds 112(%ecx) + fsubs 116(%ecx) + fmuls 120(%ebx) + fadd %st(0),%st(1) + fadds 124(%edx) + fstps 116(%edx) + fstps 120(%edx) + jnz .L01 + +/* Phase 7*/ + + flds (%ecx) + fadds 4(%ecx) + fstps 1024(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + fsts (%esi) + fstps (%edi) + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fsts 512(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fstps 512(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) - pfmul %mm1,%mm2 - movq 24(%esi),%mm4 - pfpnacc %mm4, %mm4 - pswapd %mm4, %mm4 + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fsts 768(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fstps 768(%esi) + fadd %st(2) + 
fstps 256(%esi) + faddp %st(1) + fstps 256(%edi) + +/* Phase 8*/ + + flds 32(%edx) + fadds 48(%edx) + fstps 896(%esi) + + flds 48(%edx) + fadds 40(%edx) + fstps 640(%esi) - pxor %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,16(%ebx) - movq %mm4,24(%ebx) - movq 32(%esi),%mm2 - / 8 - pfpnacc %mm2, %mm2 - pswapd %mm2, %mm2 + flds 40(%edx) + fadds 56(%edx) + fstps 384(%esi) + + flds 56(%edx) + fadds 36(%edx) + fstps 128(%esi) + + flds 36(%edx) + fadds 52(%edx) + fstps 128(%edi) + + flds 52(%edx) + fadds 44(%edx) + fstps 384(%edi) + + flds 60(%edx) + fsts 896(%edi) + fadds 44(%edx) + fstps 640(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fstps 960(%esi) + fadds 80(%edx) + fstps 832(%esi) - pfmul %mm1,%mm2 - movq %mm2,32(%ebx) - movq 40(%esi),%mm4 - pfpnacc %mm4, %mm4 - pswapd %mm4, %mm4 - pxor %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,40(%ebx) - movq 48(%esi),%mm2 - / 12 - pfpnacc %mm2, %mm2 - pswapd %mm2, %mm2 - pfmul %mm1,%mm2 - movq 56(%esi),%mm4 - pfpnacc %mm4, %mm4 - pswapd %mm4, %mm4 - pxor %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,48(%ebx) - movq %mm4,56(%ebx) - movq 64(%esi),%mm2 - / 16 - pfpnacc %mm2, %mm2 - pswapd %mm2, %mm2 - pfmul %mm1,%mm2 - movq %mm2,64(%ebx) - movq 72(%esi),%mm4 - pfpnacc %mm4, %mm4 - pswapd %mm4, %mm4 - pxor %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,72(%ebx) - movq 80(%esi),%mm2 - / 20 - pfpnacc %mm2, %mm2 - pswapd %mm2, %mm2 - pfmul %mm1,%mm2 - movq 88(%esi),%mm4 - pfpnacc %mm4, %mm4 - pswapd %mm4, %mm4 - pxor %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,80(%ebx) - movq %mm4,88(%ebx) - movq 96(%esi),%mm2 - / 24 - pfpnacc %mm2, %mm2 - pswapd %mm2, %mm2 - pfmul %mm1,%mm2 - movq %mm2,96(%ebx) - movq 104(%esi),%mm4 - pfpnacc %mm4, %mm4 - pswapd %mm4, %mm4 - pxor %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,104(%ebx) - movq 112(%esi),%mm2 - / 28 - pfpnacc %mm2, %mm2 - pswapd %mm2, %mm2 - pfmul %mm1,%mm2 - movq 120(%esi),%mm4 - pfpnacc %mm4, %mm4 - pswapd %mm4, %mm4 - pxor %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,112(%ebx) - movq %mm4,120(%ebx) + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fstps 704(%esi) + fadds 72(%edx) + fstps 576(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fstps 448(%esi) + fadds 88(%edx) + fstps 320(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fstps 192(%esi) + fadds 68(%edx) + fstps 64(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fstps 64(%edi) + fadds 84(%edx) + fstps 192(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fstps 320(%edi) + fadds 76(%edx) + fstps 448(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fstps 576(%edi) + fadds 92(%edx) + fstps 704(%edi) + + flds 124(%edx) + fsts 960(%edi) + fadds 92(%edx) + fstps 832(%edi) + jmp .L_bye +.L01: +/* Phase 9*/ + + flds (%ecx) + fadds 4(%ecx) + fistp 512(%esi) + + flds (%ecx) + fsubs 4(%ecx) 
+ fmuls 120(%ebx) + + fistp (%esi) + + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fist 256(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fistp 256(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fist 384(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fistp 384(%esi) + fadd %st(2) + fistp 128(%esi) + faddp %st(1) + fistp 128(%edi) + +/* Phase 10*/ - // Phase6 - movd 0(%ebx),%mm0 - movd %mm0,1024(%ebp) - movl 4(%ebx),%eax - movl %eax,0(%ebp) - movl %eax,0(%edx) - movd 8(%ebx),%mm2 - movd %mm2,512(%ebp) - movd 12(%ebx),%mm3 - movd %mm3,512(%edx) + flds 32(%edx) + fadds 48(%edx) + fistp 448(%esi) + + flds 48(%edx) + fadds 40(%edx) + fistp 320(%esi) - movl 16(%ebx),%eax - movl %eax,768(%ebp) - movd 20(%ebx),%mm5 - movd %mm5,256(%edx) + flds 40(%edx) + fadds 56(%edx) + fistp 192(%esi) - movd 24(%ebx),%mm6 - movd %mm6,256(%ebp) - movd 28(%ebx),%mm7 - movd %mm7,768(%edx) + flds 56(%edx) + fadds 36(%edx) + fistp 64(%esi) + + flds 36(%edx) + fadds 52(%edx) + fistp 64(%edi) - movq 32(%ebx),%mm0 /* mm0 = tmp1[8] | tmp1[9] */ - movq 48(%ebx),%mm1 /* mm1 = tmp1[12] | tmp1[13] */ - pfadd %mm1,%mm0 /* mm0 = tmp1[8]+tmp1[12]| tmp1[9]+tmp1[13]*/ - movd %mm0,896(%ebp) /* a[0xE0] = tmp1[8]+tmp1[12] */ - psrlq $32,%mm0 - movd %mm0,128(%edx) /* a[0x20] = tmp1[9]+tmp1[13] */ - movq 40(%ebx),%mm2 - pfadd %mm2,%mm1 - movd %mm1,640(%ebp) - psrlq $32,%mm1 - movd %mm1,384(%edx) + flds 52(%edx) + fadds 44(%edx) + fistp 192(%edi) + + flds 60(%edx) + fist 448(%edi) + fadds 44(%edx) + fistp 320(%edi) - movq 56(%ebx),%mm3 - pfadd %mm3,%mm2 - movd %mm2,384(%ebp) - psrlq $32,%mm2 - movd %mm2,640(%edx) + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fistp 480(%esi) + fadds 80(%edx) + fistp 416(%esi) - movd 36(%ebx),%mm4 - pfadd %mm4,%mm3 - movd %mm3,128(%ebp) - psrlq $32,%mm3 - movd %mm3,896(%edx) - movq 96(%ebx),%mm0 - movq 64(%ebx),%mm1 + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fistp 352(%esi) + fadds 72(%edx) + fistp 288(%esi) - movq 112(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,%mm3 - pfadd %mm1,%mm3 - movd %mm3,960(%ebp) - psrlq $32,%mm3 - movd %mm3,64(%edx) - movq 80(%ebx),%mm1 - pfadd %mm1,%mm0 - movd %mm0,832(%ebp) - psrlq $32,%mm0 - movd %mm0,192(%edx) - movq 104(%ebx),%mm3 - pfadd %mm3,%mm2 - movq %mm2,%mm4 - pfadd %mm1,%mm4 - movd %mm4,704(%ebp) - psrlq $32,%mm4 - movd %mm4,320(%edx) - movq 72(%ebx),%mm1 - pfadd %mm1,%mm2 - movd %mm2,576(%ebp) - psrlq $32,%mm2 - movd %mm2,448(%edx) + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fistp 224(%esi) + fadds 88(%edx) + fistp 160(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fistp 96(%esi) + fadds 68(%edx) + fistp 32(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fistp 32(%edi) + fadds 84(%edx) + fistp 96(%edi) - movq 120(%ebx),%mm4 - pfadd %mm4,%mm3 - movq %mm3,%mm5 - pfadd %mm1,%mm5 - movd %mm5,448(%ebp) - psrlq $32,%mm5 - movd %mm5,576(%edx) - movq 88(%ebx),%mm1 - pfadd %mm1,%mm3 - movd %mm3,320(%ebp) - psrlq $32,%mm3 - movd %mm3,704(%edx) + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fistp 160(%edi) + fadds 76(%edx) + fistp 224(%edi) - movd 100(%ebx),%mm5 - pfadd %mm5,%mm4 - movq %mm4,%mm6 - pfadd %mm1,%mm6 - movd %mm6,192(%ebp) - psrlq $32,%mm6 - movd %mm6,832(%edx) - movd 68(%ebx),%mm1 - pfadd %mm1,%mm4 - movd %mm4,64(%ebp) - psrlq $32,%mm4 - movd %mm4,960(%edx) + flds 108(%edx) + fadds 124(%edx) + fld 
%st(0) + fadds 76(%edx) + fistp 288(%edi) + fadds 92(%edx) + fistp 352(%edi) - / femms + flds 124(%edx) + fist 480(%edi) + fadds 92(%edx) + fistp 416(%edi) + movsw +.L_bye: + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret + - popl %ebx - popl %esi - popl %edi - popl %ebp - addl $256,%esp - - ret $12 -
--- a/mp3lib/decod386.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/decod386.c	Fri Jun 29 17:55:35 2001 +0000
@@ -105,6 +105,15 @@
 }
 #endif
 
+synth_func_t synth_func;
+
+int synth_1to1_MMX( real *bandPtr,int channel,short * samples)
+{
+ static short buffs[2][2][0x110];
+ static int bo = 1;
+ synth_1to1_MMX_s(bandPtr, channel, samples, (short *) buffs, &bo);
+ return 0;
+ }
 
 static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
 {
@@ -117,40 +126,13 @@
   int clip = 0;
   int bo1;
 
-  #ifdef HAVE_SSE_MP3
-  //if ( _3dnow )
+  if ( synth_func )
   {
     int ret;
-    ret=synth_1to1_sse( bandPtr,channel,out+*pnt );
-    *pnt+=128;
-    return ret;
-  }
-  #endif
-  #ifdef HAVE_3DNOWEX
-  if ( _3dnow > 1 )
-  {
-    int ret;
-    ret=synth_1to1_3dnowex( bandPtr,channel,out+*pnt );
+    ret=(*synth_func)( bandPtr,channel,samples);
     *pnt+=128;
     return ret;
   }
-  #endif
-  #ifdef HAVE_3DNOW
-  if ( _3dnow )
-  {
-    int ret;
-    ret=synth_1to1_3dnow( bandPtr,channel,out+*pnt );
-    *pnt+=128;
-    return ret;
-  }
-  #endif
-  if ( _i586 )
-  {
-    int ret;
-    ret=synth_1to1_pent( bandPtr,channel,out+*pnt );
-    *pnt+=128;
-    return ret;
-  }
   if(!channel) { /* channel=0 */
     bo--;
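
The decod386.c hunk above replaces the old cascade of #ifdef'ed per-CPU calls inside synth_1to1() with a single runtime-selected function pointer; synth_1to1_MMX() is only a thin wrapper that owns the static ring buffers and the rotating offset expected by the assembler routine synth_1to1_MMX_s. A minimal C sketch of the same pattern (names follow the diff; 'real' is assumed to be float and the assembler routine is replaced by a stub so the sketch stands alone):

    /* Sketch only: mirrors the shape of the hunk above. */
    typedef float real;
    typedef int (*synth_func_t)(real *, int, short *);

    /* Stub standing in for the assembler entry point in decode_MMX.s. */
    static void synth_1to1_MMX_s(real *b, int ch, short *out,
                                 short *buffers, int *bo)
    { (void)b; (void)ch; (void)out; (void)buffers; (void)bo; }

    static synth_func_t synth_func;      /* chosen once at init time */

    static int synth_1to1_MMX(real *bandPtr, int channel, short *samples)
    {
        static short buffs[2][2][0x110]; /* two ring buffers per channel */
        static int bo = 1;               /* rotating buffer offset */
        synth_1to1_MMX_s(bandPtr, channel, samples, (short *)buffs, &bo);
        return 0;
    }

    /* Call site in synth_1to1(): one indirect call replaces the old
       cascade of #ifdef blocks:
           if ( synth_func ) return (*synth_func)(bandPtr, channel, samples);
    */

Adding another optimized synth then only means assigning a different routine to synth_func during initialization.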
--- a/mp3lib/decode_3dnow.s Fri Jun 29 10:54:41 2001 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,265 +0,0 @@ -/ synth_1to1_3dnow works the same way as the c version of -/ synth_1to1. this assembler code based 'decode-i586.s' -/ (by Stefan Bieschewski <stb@acm.org>), two types of changes -/ have been made: -/ - use {MMX,3DNow!} instruction for reduce cpu -/ - remove unused(?) local symbols -/ -/ useful sources of information on optimizing 3DNow! code include: -/ AMD 3DNow! Technology Manual (Publication #21928) -/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf -/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) -/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) -/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf -/ -/ This code was tested only AMD-K6-2 processor Linux systems, -/ please tell me: -/ - whether this code works on other 3DNow! capable processors -/ (ex.IDT-C6-2) or not -/ - whether this code works on other OSes or not -/ -/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 -/ <kim@comtec.co.jp> - after 1.Apr.1998 - -/ Enhancments for q-word operation by Michael Hipp - -.bss - .comm buffs,4352,4 -.data - .align 4 -bo: - .long 1 -.text -.globl synth_1to1_3dnow -synth_1to1_3dnow: - subl $12,%esp - pushl %ebp - pushl %edi - pushl %esi - pushl %ebx - movl 32(%esp),%eax - movl 40(%esp),%esi - movl $0,%edi - movl bo,%ebp - cmpl %edi,36(%esp) - jne .L48 - decl %ebp - andl $15,%ebp - movl %ebp,bo - movl $buffs,%ecx - jmp .L49 -.L48: - addl $2,%esi - movl $buffs+2176,%ecx -.L49: - testl $1,%ebp - je .L50 - movl %ecx,%ebx - movl %ebp,16(%esp) - pushl %eax - movl 20(%esp),%edx - leal (%ebx,%edx,4),%eax - pushl %eax - movl 24(%esp),%eax - incl %eax - andl $15,%eax - leal 1088(,%eax,4),%eax - addl %ebx,%eax - jmp .L74 -.L50: - leal 1088(%ecx),%ebx - leal 1(%ebp),%edx - movl %edx,16(%esp) - pushl %eax - leal 1092(%ecx,%ebp,4),%eax - pushl %eax - leal (%ecx,%ebp,4),%eax -.L74: - pushl %eax - call dct64_3dnow - addl $12,%esp - movl 16(%esp),%edx - leal 0(,%edx,4),%edx - movl $decwin+64,%eax - movl %eax,%ecx - subl %edx,%ecx - movl $16,%ebp - -.L55: - movq (%ecx),%mm4 - movq (%ebx),%mm3 - movq 8(%ecx),%mm0 - movq 8(%ebx),%mm1 - pfmul %mm3,%mm4 - - movq 16(%ecx),%mm2 - pfmul %mm1,%mm0 - movq 16(%ebx),%mm3 - pfadd %mm0,%mm4 - - movq 24(%ecx),%mm0 - pfmul %mm2,%mm3 - movq 24(%ebx),%mm1 - pfadd %mm3,%mm4 - - movq 32(%ecx),%mm2 - pfmul %mm1,%mm0 - movq 32(%ebx),%mm3 - pfadd %mm0,%mm4 - - movq 40(%ecx),%mm0 - pfmul %mm2,%mm3 - movq 40(%ebx),%mm1 - pfadd %mm3,%mm4 - - movq 48(%ecx),%mm2 - pfmul %mm1,%mm0 - movq 48(%ebx),%mm3 - pfadd %mm0,%mm4 - - movq 56(%ecx),%mm0 - pfmul %mm2,%mm3 - movq 56(%ebx),%mm1 - pfadd %mm3,%mm4 - - pfmul %mm1,%mm0 - pfadd %mm0,%mm4 - - movq %mm4,%mm0 - psrlq $32,%mm0 - pfsub %mm0,%mm4 - - pf2id %mm4,%mm4 - movd %mm4,%eax - - sar $16,%eax - movw %ax,(%esi) - - addl $64,%ebx - subl $-128,%ecx - addl $4,%esi - decl %ebp - jnz .L55 - -/ --- end of loop 1 --- - - movd (%ecx),%mm2 - movd (%ebx),%mm1 - pfmul %mm1,%mm2 - - movd 8(%ecx),%mm0 - movd 8(%ebx),%mm1 - pfmul %mm0,%mm1 - pfadd %mm1,%mm2 - - movd 16(%ecx),%mm0 - movd 16(%ebx),%mm1 - pfmul %mm0,%mm1 - pfadd %mm1,%mm2 - - movd 24(%ecx),%mm0 - movd 24(%ebx),%mm1 - pfmul %mm0,%mm1 - pfadd %mm1,%mm2 - - movd 32(%ecx),%mm0 - movd 32(%ebx),%mm1 - pfmul %mm0,%mm1 - pfadd %mm1,%mm2 - - movd 40(%ecx),%mm0 - movd 40(%ebx),%mm1 - pfmul %mm0,%mm1 - pfadd %mm1,%mm2 - - movd 48(%ecx),%mm0 - movd 48(%ebx),%mm1 - pfmul %mm0,%mm1 - pfadd %mm1,%mm2 - - movd 
56(%ecx),%mm0 - movd 56(%ebx),%mm1 - pfmul %mm0,%mm1 - pfadd %mm1,%mm2 - - pf2id %mm2,%mm2 - movd %mm2,%eax - - sar $16,%eax - - movw %ax,(%esi) - - addl $-64,%ebx - addl $4,%esi - addl $256,%ecx - movl $15,%ebp - -.L68: - psubd %mm0,%mm0 - - movq (%ebx),%mm1 - movq (%ecx),%mm2 - pfmul %mm1,%mm2 - pfsub %mm2,%mm0 - - movq 8(%ebx),%mm3 - movq 8(%ecx),%mm4 - pfmul %mm3,%mm4 - pfsub %mm4,%mm0 - - movq 16(%ebx),%mm1 - movq 16(%ecx),%mm2 - pfmul %mm1,%mm2 - pfsub %mm2,%mm0 - - movq 24(%ebx),%mm3 - movq 24(%ecx),%mm4 - pfmul %mm3,%mm4 - pfsub %mm4,%mm0 - - movq 32(%ebx),%mm1 - movq 32(%ecx),%mm2 - pfmul %mm1,%mm2 - pfsub %mm2,%mm0 - - movq 40(%ebx),%mm3 - movq 40(%ecx),%mm4 - pfmul %mm3,%mm4 - pfsub %mm4,%mm0 - - movq 48(%ebx),%mm1 - movq 48(%ecx),%mm2 - pfmul %mm1,%mm2 - pfsub %mm2,%mm0 - - movq 56(%ebx),%mm3 - movq 56(%ecx),%mm4 - pfmul %mm3,%mm4 - pfsub %mm4,%mm0 - - pfacc %mm0,%mm0 - - pf2id %mm0,%mm0 - movd %mm0,%eax - - sar $16,%eax - - movw %ax,(%esi) - - addl $-64,%ebx - subl $-128,%ecx - addl $4,%esi - decl %ebp - jnz .L68 - -/ --- end of loop 2 - - femms - - movl %edi,%eax - popl %ebx - popl %esi - popl %edi - popl %ebp - addl $12,%esp - ret
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mp3lib/decode_MMX.s Fri Jun 29 17:55:35 2001 +0000 @@ -0,0 +1,117 @@ +# this code comes under GPL +# This code was taken from http://www.mpg123.org +# See ChangeLog of mpg123-0.59s-pre.1 for detail +# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> +# +# TODO: Partial loops unrolling and removing MOVW insn. +# + +.text + +.globl synth_1to1_MMX_s + +synth_1to1_MMX_s: + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + movl 24(%esp),%ecx + movl 28(%esp),%edi + movl $15,%ebx + movl 36(%esp),%edx + leal (%edi,%ecx,2),%edi + decl %ecx + movl 32(%esp),%esi + movl (%edx),%eax + jecxz .L1 + decl %eax + andl %ebx,%eax + leal 1088(%esi),%esi + movl %eax,(%edx) +.L1: + leal (%esi,%eax,2),%edx + movl %eax,%ebp + incl %eax + pushl 20(%esp) + andl %ebx,%eax + leal 544(%esi,%eax,2),%ecx + incl %ebx + testl $1, %eax + jnz .L2 + xchgl %edx,%ecx + incl %ebp + leal 544(%esi),%esi +.L2: + emms + pushl %edx + pushl %ecx + call *dct64_MMX_func + addl $12,%esp + leal 1(%ebx), %ecx + subl %ebp,%ebx + + leal decwins(%ebx,%ebx,1), %edx +.L3: + movq (%edx),%mm0 + pmaddwd (%esi),%mm0 + movq 8(%edx),%mm1 + pmaddwd 8(%esi),%mm1 + movq 16(%edx),%mm2 + pmaddwd 16(%esi),%mm2 + movq 24(%edx),%mm3 + pmaddwd 24(%esi),%mm3 + paddd %mm1,%mm0 + paddd %mm2,%mm0 + paddd %mm3,%mm0 + movq %mm0,%mm1 + psrlq $32,%mm1 + paddd %mm1,%mm0 + psrad $13,%mm0 + packssdw %mm0,%mm0 + movd %mm0,%eax + movw %ax, (%edi) + + leal 32(%esi),%esi + leal 64(%edx),%edx + leal 4(%edi),%edi + decl %ecx + jnz .L3 + + + subl $64,%esi + movl $15,%ecx +.L4: + movq (%edx),%mm0 + pmaddwd (%esi),%mm0 + movq 8(%edx),%mm1 + pmaddwd 8(%esi),%mm1 + movq 16(%edx),%mm2 + pmaddwd 16(%esi),%mm2 + movq 24(%edx),%mm3 + pmaddwd 24(%esi),%mm3 + paddd %mm1,%mm0 + paddd %mm2,%mm0 + paddd %mm3,%mm0 + movq %mm0,%mm1 + psrlq $32,%mm1 + paddd %mm0,%mm1 + psrad $13,%mm1 + packssdw %mm1,%mm1 + psubd %mm0,%mm0 + psubsw %mm1,%mm0 + movd %mm0,%eax + movw %ax,(%edi) + + subl $32,%esi + addl $64,%edx + leal 4(%edi),%edi + decl %ecx + jnz .L4 + emms + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + +
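
The new decode_MMX.s above performs the synthesis windowing in 16-bit fixed point: each pmaddwd multiplies four window/sample word pairs into 32-bit partial sums, the partial sums are added and folded, shifted right by 13 and saturated to a 16-bit PCM value by packssdw. A hypothetical scalar C equivalent of one output sample of loop .L3, assuming 16 taps per sample as implied by the four movq/pmaddwd pairs:

    /* Hypothetical scalar version of one .L3 iteration; 'win' points into
       the 16-bit decwins table, 'b' into the dct64_MMX output. */
    static short synth_sample_fixed(const short *win, const short *b)
    {
        long acc = 0;
        int i;
        for (i = 0; i < 16; i++)          /* 4 x (movq + pmaddwd) = 16 words */
            acc += (long)win[i] * b[i];
        acc >>= 13;                        /* psrad $13 */
        if (acc >  32767) acc =  32767;    /* packssdw saturation */
        if (acc < -32768) acc = -32768;
        return (short)acc;
    }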
--- a/mp3lib/decode_k7.s Fri Jun 29 10:54:41 2001 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,364 +0,0 @@ -/// -/// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support -/// -/// This code based 'decode_3dnow.s' by Syuuhei Kashiyama -/// <squash@mb.kcom.ne.jp>,only some types of changes have been made: -/// -/// - Added new opcode PFNACC -/// - decreased number of opcodes (as it was suggested by k7 manual) -/// (using memory reference as operand of instructions) -/// - added PREFETCHW opcode. It has different semantic on k7 than on k6-2 -/// and saves 15-25 cpu clocks for athlon. -/// - partial unrolling loops for removing slower MOVW insns. -/// (Note: probably same operation should be done for decode_3dnow.s) -/// - change function name for support 3DNowEx! automatic detect -/// - added loops alignment -/// -/// note: because K7 processors are an aggresive out-of-order three-way -/// superscalar ones instruction order is not significand for them. -/// -/// Benchmark: measured by mplayer on Duron-700: -/// 3dNow! optimized code - 1.4% of cpu usage -/// k7 optimized code (without partial loop unrolling) - 1.3% of cpu usage -/// k7 optimized code - 1.1% of cpu usage -/// Note: K6-2 users have an chance with partial loops unrolling -/// -/// Modified by Nick Kurshev <nickols_k@mail.ru> -/// -/ synth_1to1_3dnow works the same way as the c version of -/ synth_1to1. this assembler code based 'decode-i586.s' -/ (by Stefan Bieschewski <stb@acm.org>), two types of changes -/ have been made: -/ - use {MMX,3DNow!} instruction for reduce cpu -/ - remove unused(?) local symbols -/ -/ useful sources of information on optimizing 3DNow! code include: -/ AMD 3DNow! Technology Manual (Publication #21928) -/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf -/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) -/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) -/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf -/ -/ This code was tested only AMD-K6-2 processor Linux systems, -/ please tell me: -/ - whether this code works on other 3DNow! 
capable processors -/ (ex.IDT-C6-2) or not -/ - whether this code works on other OSes or not -/ -/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 -/ <kim@comtec.co.jp> - after 1.Apr.1998 - -/ Enhancments for q-word operation by Michael Hipp - -.bss - .comm buffs,4352,4 -.data - .align 8 -null_one: .long 0x0000ffff, 0x0000ffff -one_null: .long 0xffff0000, 0xffff0000 -bo: .long 1 -.text -/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */ -.globl synth_1to1_3dnowex -synth_1to1_3dnowex: - subl $12,%esp - pushl %ebp - pushl %edi - pushl %esi - pushl %ebx - - movl 32(%esp),%eax - movl 40(%esp),%esi - movl $0,%edi - movl bo,%ebp - cmpl %edi,36(%esp) - jne .L48 - decl %ebp - andl $15,%ebp - movl %ebp,bo - movl $buffs,%ecx - jmp .L49 -.L48: - addl $2,%esi - movl $buffs+2176,%ecx -.L49: - testl $1,%ebp - je .L50 - movl %ecx,%ebx - movl %ebp,16(%esp) - pushl %eax - movl 20(%esp),%edx - leal (%ebx,%edx,4),%eax - pushl %eax - movl 24(%esp),%eax - incl %eax - andl $15,%eax - leal 1088(,%eax,4),%eax - addl %ebx,%eax - jmp .L74 -.L50: - leal 1088(%ecx),%ebx - leal 1(%ebp),%edx - movl %edx,16(%esp) - pushl %eax - leal 1092(%ecx,%ebp,4),%eax - pushl %eax - leal (%ecx,%ebp,4),%eax -.L74: - pushl %eax - call dct64_3dnowex - movl 16(%esp),%edx - leal 0(,%edx,4),%edx - movl $decwin+64,%eax - movl %eax,%ecx - subl %edx,%ecx - movl $8,%ebp - prefetchw (%esi) -.align 16 -.L55: - - movq (%ecx),%mm0 - pfmul (%ebx),%mm0 - movq 128(%ecx),%mm4 - pfmul 64(%ebx),%mm4 - - movq 8(%ecx),%mm1 - pfmul 8(%ebx),%mm1 - pfadd %mm1,%mm0 - movq 136(%ecx),%mm5 - pfmul 72(%ebx),%mm5 - pfadd %mm5,%mm4 - - movq 16(%ebx),%mm2 - pfmul 16(%ecx),%mm2 - pfadd %mm2,%mm0 - movq 80(%ebx),%mm6 - pfmul 144(%ecx),%mm6 - pfadd %mm6,%mm4 - - movq 24(%ecx),%mm3 - pfmul 24(%ebx),%mm3 - pfadd %mm3,%mm0 - movq 152(%ecx),%mm7 - pfmul 88(%ebx),%mm7 - pfadd %mm7,%mm4 - - movq 32(%ebx),%mm1 - pfmul 32(%ecx),%mm1 - pfadd %mm1,%mm0 - movq 96(%ebx),%mm5 - pfmul 160(%ecx),%mm5 - pfadd %mm5,%mm4 - - movq 40(%ecx),%mm2 - pfmul 40(%ebx),%mm2 - pfadd %mm2,%mm0 - movq 168(%ecx),%mm6 - pfmul 104(%ebx),%mm6 - pfadd %mm6,%mm4 - - movq 48(%ebx),%mm3 - pfmul 48(%ecx),%mm3 - pfadd %mm3,%mm0 - movq 112(%ebx),%mm7 - pfmul 176(%ecx),%mm7 - pfadd %mm7,%mm4 - - movq 56(%ecx),%mm1 - pfmul 56(%ebx),%mm1 - pfadd %mm1,%mm0 - movq 184(%ecx),%mm5 - pfmul 120(%ebx),%mm5 - pfadd %mm5,%mm4 - - pfnacc %mm4, %mm0 - movq (%esi), %mm1 - pf2id %mm0, %mm0 - pand one_null, %mm1 - psrld $16,%mm0 - pand null_one, %mm0 - por %mm0, %mm1 - movq %mm1,(%esi) - - addl $128,%ebx - addl $256,%ecx - addl $8,%esi - decl %ebp - jnz .L55 - -/ --- end of loop 1 --- - - prefetchw (%esi) /* prefetching for writing this block and next loop */ - - movd (%ecx),%mm0 - pfmul (%ebx),%mm0 - - movd 8(%ebx),%mm1 - pfmul 8(%ecx),%mm1 - pfadd %mm1,%mm0 - - movd 16(%ebx),%mm2 - pfmul 16(%ecx),%mm2 - pfadd %mm2,%mm0 - - movd 24(%ebx),%mm3 - pfmul 24(%ecx),%mm3 - pfadd %mm3,%mm0 - - movd 32(%ebx),%mm4 - pfmul 32(%ecx),%mm4 - pfadd %mm4,%mm0 - - movd 40(%ebx),%mm5 - pfmul 40(%ecx),%mm5 - pfadd %mm5,%mm0 - - movd 48(%ebx),%mm6 - pfmul 48(%ecx),%mm6 - pfadd %mm6,%mm0 - - movd 56(%ebx),%mm7 - pfmul 56(%ecx),%mm7 - pfadd %mm7,%mm0 - - pf2id %mm0,%mm0 - movd %mm0,%eax - - sar $16,%eax - - movw %ax,(%esi) - - subl $64,%ebx - addl $4,%esi - addl $256,%ecx - movl $7,%ebp -.align 16 -.L68: - pxor %mm0, %mm0 - pxor %mm4, %mm4 - - movq (%ecx),%mm1 - pfmul (%ebx),%mm1 - pfsub %mm1,%mm0 - movq 128(%ecx),%mm5 - pfmul -64(%ebx),%mm5 - pfsub %mm5,%mm4 - - movq 8(%ecx),%mm2 - pfmul 8(%ebx),%mm2 - 
pfsub %mm2,%mm0 - movq 136(%ecx),%mm6 - pfmul -56(%ebx),%mm6 - pfsub %mm6,%mm4 - - movq 16(%ecx),%mm3 - pfmul 16(%ebx),%mm3 - pfsub %mm3,%mm0 - movq 144(%ecx),%mm7 - pfmul -48(%ebx),%mm7 - pfsub %mm7,%mm4 - - movq 24(%ecx),%mm1 - pfmul 24(%ebx),%mm1 - pfsub %mm1,%mm0 - movq 152(%ecx),%mm5 - pfmul -40(%ebx),%mm5 - pfsub %mm5,%mm4 - - movq 32(%ecx),%mm2 - pfmul 32(%ebx),%mm2 - pfsub %mm2,%mm0 - movq 160(%ecx),%mm6 - pfmul -32(%ebx),%mm6 - pfsub %mm6,%mm4 - - movq 40(%ecx),%mm3 - pfmul 40(%ebx),%mm3 - pfsub %mm3,%mm0 - movq 168(%ecx),%mm7 - pfmul -24(%ebx),%mm7 - pfsub %mm7,%mm4 - - movq 48(%ecx),%mm1 - pfmul 48(%ebx),%mm1 - pfsub %mm1,%mm0 - movq 176(%ecx),%mm5 - pfmul -16(%ebx),%mm5 - pfsub %mm5,%mm4 - - movq 56(%ecx),%mm2 - pfmul 56(%ebx),%mm2 - pfsub %mm2,%mm0 - movq 184(%ecx),%mm6 - pfmul -8(%ebx),%mm6 - pfsub %mm6,%mm4 - - pfacc %mm4,%mm0 - movq (%esi), %mm1 - pf2id %mm0, %mm0 - pand one_null, %mm1 - psrld $16,%mm0 - pand null_one, %mm0 - por %mm0, %mm1 - movq %mm1,(%esi) - - subl $128,%ebx - addl $256,%ecx - addl $8,%esi - decl %ebp - jnz .L68 - -/ --- end of loop 2 - - pxor %mm0, %mm0 - - movq (%ecx),%mm1 - pfmul (%ebx),%mm1 - pfsub %mm1,%mm0 - - movq 8(%ecx),%mm2 - pfmul 8(%ebx),%mm2 - pfsub %mm2,%mm0 - - movq 16(%ecx),%mm3 - pfmul 16(%ebx),%mm3 - pfsub %mm3,%mm0 - - movq 24(%ecx),%mm4 - pfmul 24(%ebx),%mm4 - pfsub %mm4,%mm0 - - movq 32(%ecx),%mm5 - pfmul 32(%ebx),%mm5 - pfsub %mm5,%mm0 - - movq 40(%ecx),%mm6 - pfmul 40(%ebx),%mm6 - pfsub %mm6,%mm0 - - movq 48(%ecx),%mm7 - pfmul 48(%ebx),%mm7 - pfsub %mm7,%mm0 - - movq 56(%ecx),%mm1 - pfmul 56(%ebx),%mm1 - pfsub %mm1,%mm0 - - pfacc %mm0,%mm0 - - pf2id %mm0,%mm0 - movd %mm0,%eax - - sar $16,%eax - - movw %ax,(%esi) - - femms - - movl %edi,%eax - popl %ebx - popl %esi - popl %edi - popl %ebp - addl $12,%esp - ret
--- a/mp3lib/decode_sse.s Fri Jun 29 10:54:41 2001 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,201 +0,0 @@ -/// -/// Replacement of synth_1to1() with Intel's SSE SIMD operations support -/// -/// This code based 'decode_k7.s' by Nick Kurshev -/// <squash@mb.kcom.ne.jp>,only some types of changes have been made: -/// -/// - SSE optimization -/// - change function name for support SSE automatic detect -/// -/// Modified by Nick Kurshev <nickols_k@mail.ru> -/// -/ synth_1to1_3dnow works the same way as the c version of -/ synth_1to1. this assembler code based 'decode-i586.s' -/ (by Stefan Bieschewski <stb@acm.org>), two types of changes -/ have been made: -/ - use {MMX,3DNow!} instruction for reduce cpu -/ - remove unused(?) local symbols -/ -/ useful sources of information on optimizing 3DNow! code include: -/ AMD 3DNow! Technology Manual (Publication #21928) -/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf -/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) -/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) -/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf -/ -/ This code was tested only AMD-K6-2 processor Linux systems, -/ please tell me: -/ - whether this code works on other 3DNow! capable processors -/ (ex.IDT-C6-2) or not -/ - whether this code works on other OSes or not -/ -/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 -/ <kim@comtec.co.jp> - after 1.Apr.1998 - -/ Enhancments for q-word operation by Michael Hipp - -.bss - .comm buffs,4352,4 -.data - .align 4 -bo: - .long 1 -.text -/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */ -.globl synth_1to1_sse -synth_1to1_sse: - subl $12,%esp - pushl %ebp - pushl %edi - pushl %esi - pushl %ebx - - movl 32(%esp),%eax - movl 40(%esp),%esi - movl $0,%edi - movl bo,%ebp - cmpl %edi,36(%esp) - jne .L48 - decl %ebp - andl $15,%ebp - movl %ebp,bo - movl $buffs,%ecx - jmp .L49 -.L48: - addl $2,%esi - movl $buffs+2176,%ecx -.L49: - testl $1,%ebp - je .L50 - movl %ecx,%ebx - movl %ebp,16(%esp) - pushl %eax - movl 20(%esp),%edx - leal (%ebx,%edx,4),%eax - pushl %eax - movl 24(%esp),%eax - incl %eax - andl $15,%eax - leal 1088(,%eax,4),%eax - addl %ebx,%eax - jmp .L74 -.L50: - leal 1088(%ecx),%ebx - leal 1(%ebp),%edx - movl %edx,16(%esp) - pushl %eax - leal 1092(%ecx,%ebp,4),%eax - pushl %eax - leal (%ecx,%ebp,4),%eax -.L74: - pushl %eax - call dct64 - addl $12, %esp - movl 16(%esp),%edx - leal 0(,%edx,4),%edx - movl $decwin+64,%eax - movl %eax,%ecx - subl %edx,%ecx - movl $16,%ebp - -.L55: - movups (%ecx), %xmm0 - mulps (%ebx), %xmm0 - movups 16(%ecx), %xmm1 - mulps 16(%ebx), %xmm1 - addps %xmm1, %xmm0 - movups 32(%ecx), %xmm1 - mulps 32(%ebx), %xmm1 - addps %xmm1, %xmm0 - movups 48(%ecx), %xmm1 - mulps 48(%ebx), %xmm1 - addps %xmm1, %xmm0 -/* pfnacc -> PFNACC mmreg1, mmreg2 performs the following operations: */ -/* temp = mmreg2 */ -/* mmreg1[31:0] = mmreg1[31:0] - mmreg1[63:32] */ -/* mmreg1[63:32]= temp [31:0] - temp[63:32] */ -/* save difference of mmreg1's low-word and high-word into mmreg1's low-word */ -/* save difference of mmreg2's low-word and high-word into mmreg1's high-word */ - movhlps %xmm0, %xmm1 - addps %xmm1, %xmm0 - movaps %xmm0, %xmm1 - shufps $0x55, %xmm1, %xmm1 /* fake of pfnacc. 
1|1|1|1 */ - - subss %xmm1, %xmm0 - cvtss2si %xmm0, %eax - -/ sar $16,%eax - movw %ax,(%esi) - - addl $64,%ebx - subl $-128,%ecx - addl $4,%esi - decl %ebp - jnz .L55 - -/ --- end of loop 1 --- - - movups (%ecx), %xmm0 - mulps (%ebx), %xmm0 - movups 16(%ecx), %xmm1 - mulps 16(%ebx), %xmm1 - addps %xmm1, %xmm0 - movups 32(%ecx), %xmm1 - mulps 32(%ebx), %xmm1 - addps %xmm1, %xmm0 - movups 48(%ecx), %xmm1 - mulps 48(%ebx), %xmm1 - addps %xmm1, %xmm0 - movhlps %xmm0, %xmm1 - addss %xmm1, %xmm0 - cvtss2si %xmm0, %eax - -/ sar $16,%eax - - movw %ax,(%esi) - - addl $-64,%ebx - addl $4,%esi - addl $256,%ecx - movl $15,%ebp - -.L68: - xorps %xmm0, %xmm0 - movups (%ecx), %xmm1 - mulps (%ebx), %xmm1 - subps %xmm1, %xmm0 - movups 16(%ecx), %xmm1 - mulps 16(%ebx), %xmm1 - subps %xmm1, %xmm0 - movups 32(%ecx), %xmm1 - mulps 32(%ebx), %xmm1 - subps %xmm1, %xmm0 - movups 48(%ecx), %xmm1 - mulps 48(%ebx), %xmm1 - subps %xmm1, %xmm0 - movhlps %xmm0, %xmm1 - subps %xmm1, %xmm0 - movaps %xmm0, %xmm1 - shufps $0x55, %xmm1, %xmm1 /* fake of pfacc 1|1|1|1 */ - addss %xmm1, %xmm0 - cvtss2si %xmm0, %eax - -/ sar $16,%eax - - movw %ax,(%esi) - - addl $-64,%ebx - subl $-128,%ecx - addl $4,%esi - decl %ebp - jnz .L68 - -/ --- end of loop 2 - - movl %edi,%eax - popl %ebx - popl %esi - popl %edi - popl %ebp - addl $12,%esp - ret
--- a/mp3lib/layer2.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/layer2.c	Fri Jun 29 17:55:35 2001 +0000
@@ -50,8 +50,16 @@
   {
     double m=mulmul[k];
     table = muls[k];
+    if(_has_mmx)
+    {
+      for(j=3,i=0;i<63;i++,j--)
+        *table++ = 16384 * m * pow(2.0,(double) j / 3.0);
+    }
+    else
     for(j=3,i=0;i<63;i++,j--)
+    {
      *table++ = m * pow(2.0,(double) j / 3.0);
+    }
     *table++ = 0.0;
   }
 }
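
The added branch pre-scales the layer-2 multiplier tables by 16384 (2^14) whenever _has_mmx is set, because the MMX synthesis path works on 16-bit integer windows and the extra headroom is folded into the tables instead of into each sample. A hedged illustration of the two variants of one table entry:

    #include <math.h>
    /* Illustration only: m and j are the loop variables of init_layer2(). */
    static double muls_entry(double m, int j, int has_mmx)
    {
        double v = m * pow(2.0, (double)j / 3.0);
        return has_mmx ? 16384.0 * v : v;   /* 16384 = 2^14 fixed-point scale */
    }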
--- a/mp3lib/layer3.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/layer3.c	Fri Jun 29 17:55:35 2001 +0000
@@ -22,9 +22,9 @@
 #define GP2MAX (256+118+4)
 static real gainpow2[GP2MAX];
-static real nCOS9[9];
+real COS9[9];
 static real COS6_1,COS6_2;
-static real tfcos36[9];
+real tfcos36[9];
 static real tfcos12[3];
 #ifdef NEW_DCT9
 static real cos9[3],cos18[3];
@@ -111,8 +111,12 @@
   int i,j,k,l;
 
   for(i=-256;i<118+4;i++)
-    gainpow2[i+256] = pow((double)2.0,-0.25 * (double) (i+210) );
-
+  {
+    if(_has_mmx)
+      gainpow2[i+256] = 16384.0 * pow((double)2.0,-0.25 * (double) (i+210) );
+    else
+      gainpow2[i+256] = pow((double)2.0,-0.25 * (double) (i+210) );
+  }
   for(i=0;i<8207;i++)
     ispow[i] = pow((double)i,(double)4.0/3.0);
 
@@ -139,7 +143,7 @@
   }
 
   for(i=0;i<9;i++)
-    nCOS9[i] = cos( M_PI / 18.0 * (double) i);
+    COS9[i] = cos( M_PI / 18.0 * (double) i);
 
   for(i=0;i<9;i++)
     tfcos36[i] = 0.5 / cos ( M_PI * (double) (i*2+1) / 36.0 );
@@ -1533,6 +1537,9 @@
 /*
  * III_hybrid
  */
+
+dct36_func_t dct36_func;
+
 static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
    int ch,struct gr_info_s *gr_info)
 {
@@ -1553,8 +1560,8 @@
    if(gr_info->mixed_block_flag) {
      sb = 2;
-     dct36(fsIn[0],rawout1,rawout2,win[0],tspnt);
-     dct36(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1);
+     (*dct36_func)(fsIn[0],rawout1,rawout2,win[0],tspnt);
+     (*dct36_func)(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1);
      rawout1 += 36; rawout2 += 36; tspnt += 2;
    }
@@ -1567,8 +1574,8 @@
    }
    else {
      for (; sb<gr_info->maxb; sb+=2,tspnt+=2,rawout1+=36,rawout2+=36) {
-       dct36(fsIn[sb],rawout1,rawout2,win[bt],tspnt);
-       dct36(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1);
+       (*dct36_func)(fsIn[sb],rawout1,rawout2,win[bt],tspnt);
+       (*dct36_func)(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1);
      }
    }
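
Layer 3 gets the same treatment: gainpow2[] absorbs the 2^14 factor when the fixed-point synth is active, COS9 and tfcos36 are exported (no longer static), presumably so the assembler dct36 replacements can reach them, and III_hybrid() calls through the dct36_func pointer. A small sketch of the scaled table init, mirroring the hunk above ('real' again assumed to be float):

    #include <math.h>
    #define GP2MAX (256+118+4)
    static float gainpow2[GP2MAX];

    static void init_gainpow2(int has_mmx)
    {
        int i;
        for (i = -256; i < 118+4; i++)
            gainpow2[i+256] = (has_mmx ? 16384.0 : 1.0)
                              * pow(2.0, -0.25 * (double)(i+210));
    }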
--- a/mp3lib/mpg123.h	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/mpg123.h	Fri Jun 29 17:55:35 2001 +0000
@@ -104,33 +104,22 @@
 };
 
 static long freqs[9];
 
-#ifdef HAVE_3DNOW
-  real decwin[2*(512+32)];
-#else
-  real decwin[512+32];
-#endif
-  real *pnts[];
+extern real decwin[(512+32)];
+extern real *pnts[];
 
 static int do_layer2(struct frame *fr,int single);
 static int do_layer3(struct frame *fr,int single);
 static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt);
-extern int synth_1to1_pent( real *,int,unsigned char * );
+extern int synth_1to1_pent( real *,int,short * );
+extern void make_decode_tables_MMX(long scaleval);
+extern int synth_1to1_MMX( real *,int,short * );
+extern int synth_1to1_MMX_s(real *, int, short *, short *, int *);
 extern void dct64(real *a,real *b,real *c);
-#ifdef HAVE_3DNOW
- extern void dct64_3dnow( real *,real *, real * );
- extern void dct36_3dnow(real *,real *,real *,real *,real *);
- extern int synth_1to1_3dnow( real *,int,unsigned char * );
-#endif
-#ifdef HAVE_3DNOWEX
- extern void dct64_3dnowex( real *,real *, real * );
- extern void dct36_3dnowex(real *,real *,real *,real *,real *);
- extern int synth_1to1_3dnowex( real *,int,unsigned char * );
-#endif
-#ifdef HAVE_SSE_MP3
-// extern void dct64_3dnow( real *,real *, real * );
-// extern void dct36_3dnow(real *,real *,real *,real *,real *);
- extern int synth_1to1_sse( real *,int,unsigned char * );
-#endif
+extern void dct36_3dnow(real *,real *,real *,real *,real *);
+extern void dct36_3dnowex(real *,real *,real *,real *,real *);
+extern void dct36_sse(real *,real *,real *,real *,real *);
+typedef int (*synth_func_t)( real *,int,short * );
+typedef void (*dct36_func_t)(real *,real *,real *,real *,real *);
--- a/mp3lib/sr1.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/sr1.c	Fri Jun 29 17:55:35 2001 +0000
@@ -343,6 +343,12 @@
 
 static int tables_done_flag=0;
 
+/* These are defined in assembler, hence hidden from gcc */
+extern void dct64_MMX( void );
+extern void dct64_MMX_3dnow( void );
+extern void dct64_MMX_3dnowex( void );
+void (*dct64_MMX_func)( void );
+
 // Init decoder tables. Call first, once!
 #ifdef USE_FAKE_MONO
 void MP3_Init(int fakemono){
@@ -351,20 +357,41 @@
 #endif
     _CpuID=CpuDetect();
     _i586=ipentium();
-#ifdef HAVE_3DNOW
+#ifndef HAVE_MMX
+    _i586 &= 1;
+#endif
     _3dnow=a3dnow();
+#ifndef HAVE_3DNOW
+    _3dnow = 0;
 #endif
-
+#ifndef HAVE_3DNOWEX
+    _3dnow &= 1;
+#endif
+    _isse=isse();
+#ifndef HAVE_SSE
+    _isse = 0;
+#endif
+#ifndef HAVE_SSE2
+    _isse &= 1;
+#endif
+    _has_mmx=_i586>1||_3dnow||_isse;
     printf( "mp3lib: Processor ID: %x\n",_CpuID );
-    printf( "mp3lib: i586 processor %sdetected.\n",(_i586?"":"not ") );
-#ifdef HAVE_3DNOW
-    printf( "mp3lib: AMD 3dnow! extension %sdetected.\n",(_3dnow?"":"not ") );
-#endif
-#ifdef HAVE_3DNOWEX
-    printf( "mp3lib: AMD 3dnow-dsp! extension %sdetected.\n",(_3dnow>1?"":"not ") );
-#endif
+    if(_i586&&!_3dnow&&!_isse)
+      printf( "mp3lib: Using Pentium%s optimized decore.\n",(_i586>1?"-MMX":""));
+    else
+    if(_isse)
+      /*
+         Note: this is fine, since the K8 will have SSE2 support and will be
+         much faster than the P4 ;)
+      */
+      printf( "mp3lib: Using SSE%s! optimized decore.\n",(_isse>1?"2":""));
+    else
+    if(_3dnow)
+      printf( "mp3lib: Using AMD 3dnow%s! optimized decore.\n",(_3dnow>1?"-dsp(k7)":""));
 
-    make_decode_tables(outscale);
+/* Use it for any MMX cpu */
+    if(_has_mmx) make_decode_tables_MMX(outscale);
+    else         make_decode_tables(outscale);
 #ifdef USE_FAKE_MONO
     if (fakemono == 1)
       fr.synth=synth_1to1_l;
@@ -381,6 +408,42 @@
     init_layer2();
     init_layer3(fr.down_sample_sblimit);
     tables_done_flag=1;
+
+    dct36_func=dct36;
+    if(_isse)
+    {
+      synth_func=synth_1to1_MMX;
+      dct64_MMX_func=dct64_MMX;
+    }
+    else
+    if ( _3dnow > 1 )
+    {
+      synth_func=synth_1to1_MMX;
+      dct36_func=dct36_3dnowex;
+      dct64_MMX_func=dct64_MMX_3dnowex;
+    }
+    else
+    if ( _3dnow )
+    {
+      synth_func=synth_1to1_MMX;
+      dct36_func=dct36_3dnow;
+      dct64_MMX_func=dct64_MMX_3dnow;
+    }
+    else
+    if ( _i586 > 1)
+    {
+      synth_func=synth_1to1_MMX;
+      dct64_MMX_func=dct64_MMX;
+    }
+    else
+    if ( _i586 )
+    {
+      synth_func=synth_1to1_pent;
+    }
+    else
+    {
+      synth_func = NULL;
+    }
 }
 
 #if 0
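
MP3_Init() above first masks the runtime probes with the compile-time HAVE_* switches and then picks the decore in priority order SSE, 3DNow!-DSP, 3DNow!, Pentium-MMX, plain Pentium. Since the probes return 0, 1 or 2 for "absent / present / extended", the MMX test collapses into one expression; a sketch of that capability logic under those assumptions:

    /* Sketch of the _has_mmx expression above: i586 > 1 means CPUID reports
       MMX, and both 3DNow! and SSE capable parts also implement MMX. */
    static int has_mmx(unsigned i586, unsigned amd3dnow, unsigned sse)
    {
        return i586 > 1 || amd3dnow != 0 || sse != 0;
    }

Every CPU that passes this test also gets the 16-bit tables from make_decode_tables_MMX(), which is what ties the table scaling in layer2.c and layer3.c to this selection.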
--- a/mp3lib/tabinit.c	Fri Jun 29 10:54:41 2001 +0000
+++ b/mp3lib/tabinit.c	Fri Jun 29 17:55:35 2001 +0000
@@ -1,20 +1,7 @@
-
+real decwin[(512+32)], cos64[32], cos32[16], cos16[8], cos8[4], cos4[2];
+real *pnts[]={ cos64,cos32,cos16,cos8,cos4 };
-#ifdef HAVE_3DNOW
-  real decwin[2*(512+32)] __attribute__((aligned(8)));
-  real cos64[32] __attribute__((aligned(8)));
-  real cos32[16] __attribute__((aligned(8)));
-  real cos16[8] __attribute__((aligned(8)));
-  real cos8[4] __attribute__((aligned(8)));
-  real cos4[2] __attribute__((aligned(8)));
-  real *pnts[]={ cos64,cos32,cos16,cos8,cos4 };
-#else
-  real decwin[512+32];
-  real cos64[16],cos32[8],cos16[4],cos8[2],cos4[1];
-  real *pnts[] = { cos64,cos32,cos16,cos8,cos4 };
-#endif
-
-long intwinbase[] = {
+static long intwinbase[] = {
       0,    -1,    -1,    -1,    -1,    -1,    -1,    -2,
      -2,    -2,    -2,    -3,    -3,    -4,    -4,    -5,
      -5,    -6,    -7,    -7,    -8,    -9,   -10,   -11,
     -13,   -14,   -16,   -17,   -19,   -21,   -24,   -26,
@@ -42,7 +29,7 @@
   64019, 65290, 66494, 67629, 68692, 69679, 70590, 71420,
   72169, 72835, 73415, 73908, 74313, 74630, 74856, 74992,
   75038 };
-  void make_decode_tables(long scaleval)
+void make_decode_tables(long scaleval)
 {
   int i,j,k,kr,divv;
   real *table,*costab;
@@ -53,17 +40,13 @@
     kr=0x10>>i; divv=0x40>>i;
     costab = pnts[i];
     for(k=0;k<kr;k++)
      costab[k] = 1.0 / (2.0 * cos(M_PI * ((double) k * 2.0 + 1.0) / (double) divv));
-    #ifdef HAVE_3DNOW
-    if ( _3dnow ) for(k=0;k<kr;k++) costab[k+kr]=-costab[k];
-    #endif
-
   }
 
   table = decwin;
  scaleval = -scaleval;
  for(i=0,j=0;i<256;i++,j++,table+=32)
  {
-   if(table < decwin+512+16)
+   if(table < decwin+512+16)
      table[16] = table[0] = (double) intwinbase[j] / 65536.0 * (double) scaleval;
    if(i % 32 == 31)
      table -= 1023;
@@ -80,14 +63,6 @@
    if(i % 64 == 63)
      scaleval = - scaleval;
  }
- #ifdef HAVE_3DNOW
- if ( _3dnow )
-  for(i=0;i<512+32;i++)
-  {
-   decwin[512+31-i]*=65536.0; // allows faster clipping in 3dnow code
-   decwin[512+32+i]=decwin[512+31-i];
-  }
- #endif
 }
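
For comparison with the MMX table generator that follows, one float decwin entry written by the loop above is simply the integer window value divided by 65536 and scaled; a sketch (note that scaleval has already been negated earlier in make_decode_tables()):

    /* Sketch of one decwin entry as computed above; intwin is an
       intwinbase[] value, scaleval the already-negated scale factor. */
    static double decwin_entry(long intwin, long scaleval)
    {
        return (double)intwin / 65536.0 * (double)scaleval;
    }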
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mp3lib/tabinit_MMX.s Fri Jun 29 17:55:35 2001 +0000 @@ -0,0 +1,161 @@ +# This code was taken from http://www.mpg123.org +# See ChangeLog of mpg123-0.59s-pre.1 for detail +# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> +.bss + .align 8 + .comm decwin,2176,32 + .align 8 + .comm decwins,2176,32 +.data + .align 8 +intwinbase_MMX: + .value 0, -1, -1, -1, -1, -1, -1, -2 + .value -2, -2, -2, -3, -3, -4, -4, -5 + .value -5, -6, -7, -7, -8, -9, -10, -11 + .value -13, -14, -16, -17, -19, -21, -24, -26 + .value -29, -31, -35, -38, -41, -45, -49, -53 + .value -58, -63, -68, -73, -79, -85, -91, -97 + .value -104, -111, -117, -125, -132, -139, -147, -154 + .value -161, -169, -176, -183, -190, -196, -202, -208 + .value -213, -218, -222, -225, -227, -228, -228, -227 + .value -224, -221, -215, -208, -200, -189, -177, -163 + .value -146, -127, -106, -83, -57, -29, 2, 36 + .value 72, 111, 153, 197, 244, 294, 347, 401 + .value 459, 519, 581, 645, 711, 779, 848, 919 + .value 991, 1064, 1137, 1210, 1283, 1356, 1428, 1498 + .value 1567, 1634, 1698, 1759, 1817, 1870, 1919, 1962 + .value 2001, 2032, 2057, 2075, 2085, 2087, 2080, 2063 + .value 2037, 2000, 1952, 1893, 1822, 1739, 1644, 1535 + .value 1414, 1280, 1131, 970, 794, 605, 402, 185 + .value -45, -288, -545, -814, -1095, -1388, -1692, -2006 + .value -2330, -2663, -3004, -3351, -3705, -4063, -4425, -4788 + .value -5153, -5517, -5879, -6237, -6589, -6935, -7271, -7597 + .value -7910, -8209, -8491, -8755, -8998, -9219, -9416, -9585 + .value -9727, -9838, -9916, -9959, -9966, -9935, -9863, -9750 + .value -9592, -9389, -9139, -8840, -8492, -8092, -7640, -7134 + .value -6574, -5959, -5288, -4561, -3776, -2935, -2037, -1082 + .value -70, 998, 2122, 3300, 4533, 5818, 7154, 8540 + .value 9975, 11455, 12980, 14548, 16155, 17799, 19478, 21189 + .value 22929, 24694, 26482, 28289, 30112, 31947,-26209,-24360 + .value -22511,-20664,-18824,-16994,-15179,-13383,-11610, -9863 + .value -8147, -6466, -4822, -3222, -1667, -162, 1289, 2684 + .value 4019, 5290, 6494, 7629, 8692, 9679, 10590, 11420 + .value 12169, 12835, 13415, 13908, 14313, 14630, 14856, 14992 + .value 15038 + +intwindiv: + .long 0x47800000 # 65536.0 +.text + .align 32 +.globl make_decode_tables_MMX +make_decode_tables_MMX: + pushl %edi + pushl %esi + pushl %ebx + + xorl %ecx,%ecx + xorl %ebx,%ebx + movl $32,%esi + movl $intwinbase_MMX,%edi + negl 16(%esp) # scaleval + pushl $2 # intwinbase step +.L00: + cmpl $528,%ecx + jnc .L02 + movswl (%edi),%eax + cmpl $intwinbase_MMX+444,%edi + jc .L01 + addl $60000,%eax +.L01: + pushl %eax + fildl (%esp) + fdivs intwindiv + fimull 24(%esp) + popl %eax + fsts decwin(,%ecx,4) + fstps decwin+64(,%ecx,4) +.L02: + leal -1(%esi),%edx + and %ebx,%edx + cmp $31,%edx + jnz .L03 + addl $-1023,%ecx + test %esi,%ebx + jz .L03 + negl 20(%esp) +.L03: + addl %esi,%ecx + addl (%esp),%edi + incl %ebx + cmpl $intwinbase_MMX,%edi + jz .L04 + cmp $256,%ebx + jnz .L00 + negl (%esp) + jmp .L00 +.L04: + popl %eax + + xorl %ecx,%ecx + xorl %ebx,%ebx + pushl $2 +.L05: + cmpl $528,%ecx + jnc .L11 + movswl (%edi),%eax + cmpl $intwinbase_MMX+444,%edi + jc .L06 + addl $60000,%eax +.L06: + cltd + imull 20(%esp) + shrdl $17,%edx,%eax + cmpl $32767,%eax + movl $1055,%edx + jle .L07 + movl $32767,%eax + jmp .L08 +.L07: + cmpl $-32767,%eax + jge .L08 + movl $-32767,%eax +.L08: + cmpl $512,%ecx + jnc .L09 + subl %ecx,%edx + movw %ax,decwins(,%edx,2) + movw %ax,decwins-32(,%edx,2) +.L09: + testl $1,%ecx + jnz .L10 + negl %eax +.L10: + movw 
%ax,decwins(,%ecx,2) + movw %ax,decwins+32(,%ecx,2) +.L11: + leal -1(%esi),%edx + and %ebx,%edx + cmp $31,%edx + jnz .L12 + addl $-1023,%ecx + test %esi,%ebx + jz .L12 + negl 20(%esp) +.L12: + addl %esi,%ecx + addl (%esp),%edi + incl %ebx + cmpl $intwinbase_MMX,%edi + jz .L13 + cmp $256,%ebx + jnz .L05 + negl (%esp) + jmp .L05 +.L13: + popl %eax + + popl %ebx + popl %esi + popl %edi + ret +
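
make_decode_tables_MMX above fills two tables: the float decwin (first loop, via the x87 divide by intwindiv = 65536.0 and multiply by the scale value) and the 16-bit decwins consumed by decode_MMX.s (second loop). In the 16-bit pass each window value is multiplied by the negated scale, the 64-bit product is shifted right by 17 and clipped to +/-32767, with the sign flipped on alternating entries. A hedged C sketch of that per-entry conversion:

    /* Hypothetical equivalent of the .L06-.L08 sequence above: w is the
       intwinbase_MMX word (with 60000 added back for the large entries near
       the end of the table), scale the negated scale value. */
    static short decwins_entry(long w, long scale)
    {
        long long v = (long long)w * scale;  /* imull -> 64-bit edx:eax */
        v >>= 17;                            /* shrdl $17,%edx,%eax */
        if (v >  32767) v =  32767;          /* clip to 16-bit range */
        if (v < -32767) v = -32767;
        return (short)v;
    }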