Mercurial repository: mplayer.hg
changeset 11241:c8b84bb55089
removed obsoleted (or never ending) code
| author   | alex |
|----------|------|
| date     | Wed, 22 Oct 2003 21:08:46 +0000 |
| parents  | 5403367c7032 |
| children | 34770e3d9654 |
| files    | mp3lib/Makefile mp3lib/dct64_sse.s mp3lib/mp3lib_objfix.sh mp3lib/sr1.c |
| diffstat | 4 files changed, 1 insertions(+), 2255 deletions(-) |
--- a/mp3lib/Makefile	Wed Oct 22 20:58:46 2003 +0000
+++ b/mp3lib/Makefile	Wed Oct 22 21:08:46 2003 +0000
@@ -13,10 +13,6 @@
 OBJS += decode_i586.o
 SRCS += decode_MMX.c dct64_MMX.c tabinit_MMX.c
 OBJS += decode_MMX.o dct64_MMX.o tabinit_MMX.o
-#ifeq ($(TARGET_SSE),yes)
-#SRCS += dct64_sse.s
-#OBJS += dct64_sse.o
-#endif
 SRCS += dct36_3dnow.c dct64_3dnow.c
 OBJS += dct36_3dnow.o dct64_3dnow.o
 SRCS += dct36_k7.c dct64_k7.c
@@ -67,7 +63,7 @@
     $(CC) $(CFLAGS) test2.c ../libvo/aclib.c -o test2 -I.. ./libMP3.a ../mp_msg-mencoder.o ../cpudetect.o -lm
 
 clean:
-    rm -f *~ *.o *.a
+    rm -f *~ *.o *.a test1 test2
 
 distclean:
     rm -f *~ *.o *.a Makefile.bak .depend
--- a/mp3lib/dct64_sse.s	Wed Oct 22 20:58:46 2003 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2221 +0,0 @@
-/ This code is a translation of dct64_k7.s from MPlayer.
-/ Coded by Felix Buenemann <atmosfear at users.sourceforge.net>
-/
-/ TODO: - fix phases 4 and 5 (sse)
-/       - optimize scalar FPU code? (interleave with sse code)
-/       - fix alignment (prohibits finishing this code)
-/       - then use faster insns for aligned data
-/
-/ Note: currently code is disabled as I couldn't get input data aligned!
-/
-
-//.data
-//    .align 8
-//x_plus_minus_3dnow: .long 0x00000000, 0x80000000
-//plus_1f: .float 1.0
-
-.text
-
-    .align 16
-
-    .global dct64_MMX_sse
-
-dct64_MMX_sse:
-    pushl %ebx
-    pushl %esi
-    pushl %edi
-    subl $256,%esp
-    movl 280(%esp),%eax
-
-    leal 128(%esp),%edx
-    movl 272(%esp),%esi
-    movl 276(%esp),%edi
-    movl $costab_mmx,%ebx
-    orl %ecx,%ecx
-    movl %esp,%ecx
-
-/* Phase 1 (complete, worx) */
-
-// [1] Process Block A1 (16 Bytes)
-/ movq (%eax), %mm0
-/ movq 8(%eax), %mm4
-    movups (%eax), %xmm0
-
-// Copy A1 to another register A2
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-// Process Block B1 (last 16 bytes)
-/ movq 120(%eax), %mm1
-/ movq 112(%eax), %mm5
-    movups 112(%eax), %xmm1
-
-/* The PSWAPD instruction swaps or reverses the upper and lower
- * doublewords of the source operand. PSWAPD mmreg1, mmreg2
- * performs the following operations:
- * temp = mmreg2
- * mmreg1[63:32] = temp[31:0 ]
- * mmreg1[31:0 ] = temp[63:32]
- */
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-// shufps here exchanges a,b,c,d to b,a,d,c in xmm1 (desc ia32-ref p.752)
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-// Add B1 to A1
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-// Save Block A1
-/ movq %mm0, (%edx)
-/ movq %mm4, 8(%edx)
-    movups %xmm0, (%edx)
-
-// Sub B1 from A2
-/ pfsub %mm1, %mm3
-/ pfsub %mm5, %mm7
-    subps %xmm1, %xmm2
-
-// Mul mem with A2
-/ pfmul (%ebx), %mm3
-/ pfmul 8(%ebx), %mm7
-    movups (%ebx), %xmm7
-    mulps %xmm7, %xmm2
-
-// Shuffle A2
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-// I do a,b,c,d -> d,c,b,a to suit order when writing to mem (saves one shufps)
-    shufps $27, %xmm2, %xmm2
-
-// Save A2 to mem (end)
-/ movq %mm3, 120(%edx)
-/ movq %mm7, 112(%edx)
-    movups %xmm2, 112(%edx)
-
-// [2] Process next data block
-/ movq 16(%eax), %mm0
-/ movq 24(%eax), %mm4
-    movups 16(%eax), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 104(%eax), %mm1
-/ movq 96(%eax), %mm5
-    movups 96(%eax), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 16(%edx)
-/ movq %mm4, 24(%edx)
-    movups %xmm0, 16(%edx)
-
-/ pfsub %mm1, %mm3
-/ pfsub %mm5, %mm7
-    subps %xmm1, %xmm2
-
-/ pfmul 16(%ebx), %mm3
-/ pfmul 24(%ebx), %mm7
-    movups 16(%ebx), %xmm7
-    mulps %xmm7, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm2, %xmm2
-
-/ movq %mm3, 104(%edx)
-/ movq %mm7, 96(%edx)
-    movups %xmm2, 96(%edx)
-
-// [3]
-/ movq 32(%eax), %mm0
-/ movq 40(%eax), %mm4
-    movups 32(%eax), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 88(%eax), %mm1
-/ movq 80(%eax), %mm5
-    movups 80(%eax), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 32(%edx)
-/ movq %mm4, 40(%edx)
-    movups %xmm0, 32(%edx)
-
-/ pfsub %mm1, %mm3
-/ pfsub %mm5, %mm7
-    subps %xmm1, %xmm2
-
-/ pfmul 32(%ebx), %mm3
-/ pfmul 40(%ebx), %mm7
-    movups 32(%ebx), %xmm7
-    mulps %xmm7, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm2, %xmm2
-
-/ movq %mm3, 88(%edx)
-/ movq %mm7, 80(%edx)
-    movups %xmm2, 80(%edx)
-
-// [4]
-/ movq 48(%eax), %mm0
-/ movq 56(%eax), %mm4
-    movups 48(%eax), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 72(%eax), %mm1
-/ movq 64(%eax), %mm5
-    movups 64(%eax), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 48(%edx)
-/ movq %mm4, 56(%edx)
-    movups %xmm0, 48(%edx)
-
-/ pfsub %mm1, %mm3
-/ pfsub %mm5, %mm7
-    subps %xmm1, %xmm2
-
-/ pfmul 48(%ebx), %mm3
-/ pfmul 56(%ebx), %mm7
-    movups 48(%ebx), %xmm7
-    mulps %xmm7, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm2, %xmm2
-
-/ movq %mm3, 72(%edx)
-/ movq %mm7, 64(%edx)
-    movups %xmm2, 64(%edx)
-
-
-// phase 1 fpu code
-/* Phase 1*/
-/*
-    flds (%eax)
-    leal 128(%esp),%edx
-    fadds 124(%eax)
-    movl 272(%esp),%esi
-    fstps (%edx)
-    movl 276(%esp),%edi
-
-    flds 4(%eax)
-    movl $costab_mmx,%ebx
-    fadds 120(%eax)
-    orl %ecx,%ecx
-    fstps 4(%edx)
-
-    flds (%eax)
-    movl %esp,%ecx
-    fsubs 124(%eax)
-    fmuls (%ebx)
-    fstps 124(%edx)
-
-    flds 4(%eax)
-    fsubs 120(%eax)
-    fmuls 4(%ebx)
-    fstps 120(%edx)
-
-    flds 8(%eax)
-    fadds 116(%eax)
-    fstps 8(%edx)
-
-    flds 12(%eax)
-    fadds 112(%eax)
-    fstps 12(%edx)
-
-    flds 8(%eax)
-    fsubs 116(%eax)
-    fmuls 8(%ebx)
-    fstps 116(%edx)
-
-    flds 12(%eax)
-    fsubs 112(%eax)
-    fmuls 12(%ebx)
-    fstps 112(%edx)
-
-    flds 16(%eax)
-    fadds 108(%eax)
-    fstps 16(%edx)
-
-    flds 20(%eax)
-    fadds 104(%eax)
-    fstps 20(%edx)
-
-    flds 16(%eax)
-    fsubs 108(%eax)
-    fmuls 16(%ebx)
-    fstps 108(%edx)
-
-    flds 20(%eax)
-    fsubs 104(%eax)
-    fmuls 20(%ebx)
-    fstps 104(%edx)
-
-    flds 24(%eax)
-    fadds 100(%eax)
-    fstps 24(%edx)
-
-    flds 28(%eax)
-    fadds 96(%eax)
-    fstps 28(%edx)
-
-    flds 24(%eax)
-    fsubs 100(%eax)
-    fmuls 24(%ebx)
-    fstps 100(%edx)
-
-    flds 28(%eax)
-    fsubs 96(%eax)
-    fmuls 28(%ebx)
-    fstps 96(%edx)
-
-    flds 32(%eax)
-    fadds 92(%eax)
-    fstps 32(%edx)
-
-    flds 36(%eax)
-    fadds 88(%eax)
-    fstps 36(%edx)
-
-    flds 32(%eax)
-    fsubs 92(%eax)
-    fmuls 32(%ebx)
-    fstps 92(%edx)
-
-    flds 36(%eax)
-    fsubs 88(%eax)
-    fmuls 36(%ebx)
-    fstps 88(%edx)
-
-    flds 40(%eax)
-    fadds 84(%eax)
-    fstps 40(%edx)
-
-    flds 44(%eax)
-    fadds 80(%eax)
-    fstps 44(%edx)
-
-    flds 40(%eax)
-    fsubs 84(%eax)
-    fmuls 40(%ebx)
-    fstps 84(%edx)
-
-    flds 44(%eax)
-    fsubs 80(%eax)
-    fmuls 44(%ebx)
-    fstps 80(%edx)
-
-    flds 48(%eax)
-    fadds 76(%eax)
-    fstps 48(%edx)
-
-    flds 52(%eax)
-    fadds 72(%eax)
-    fstps 52(%edx)
-
-    flds 48(%eax)
-    fsubs 76(%eax)
-    fmuls 48(%ebx)
-    fstps 76(%edx)
-
-    flds 52(%eax)
-    fsubs 72(%eax)
-    fmuls 52(%ebx)
-    fstps 72(%edx)
-
-    flds 56(%eax)
-    fadds 68(%eax)
-    fstps 56(%edx)
-
-    flds 60(%eax)
-    fadds 64(%eax)
-    fstps 60(%edx)
-
-    flds 56(%eax)
-    fsubs 68(%eax)
-    fmuls 56(%ebx)
-    fstps 68(%edx)
-
-    flds 60(%eax)
-    fsubs 64(%eax)
-    fmuls 60(%ebx)
-    fstps 64(%edx)
-*/
-// end phase 1 fpu code
-
-/* Phase 2 (completed, worx) */
-
-/ movq (%edx), %mm0
-/ movq 8(%edx), %mm4
-    movups (%edx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 56(%edx), %mm1
-/ movq 48(%edx), %mm5
-    movups 48(%edx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, (%ecx)
-/ movq %mm4, 8(%ecx)
-    movups %xmm0, (%ecx)
-
-/ pfsub %mm1, %mm3
-/ pfsub %mm5, %mm7
-    subps %xmm1, %xmm2
-
-/ pfmul 64(%ebx), %mm3
-/ pfmul 72(%ebx), %mm7
-    movups 64(%ebx), %xmm7
-    mulps %xmm7, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm2, %xmm2
-
-/ movq %mm3, 56(%ecx)
-/ movq %mm7, 48(%ecx)
-    movups %xmm2, 48(%ecx)
-
-/ movq 16(%edx), %mm0
-/ movq 24(%edx), %mm4
-    movups 16(%edx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 40(%edx), %mm1
-/ movq 32(%edx), %mm5
-    movups 32(%edx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 16(%ecx)
-/ movq %mm4, 24(%ecx)
-    movups %xmm0, 16(%ecx)
-
-/ pfsub %mm1, %mm3
-/ pfsub %mm5, %mm7
-    subps %xmm1, %xmm2
-
-/ pfmul 80(%ebx), %mm3
-/ pfmul 88(%ebx), %mm7
-    movups 80(%ebx), %xmm7
-    mulps %xmm7, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm2, %xmm2
-
-/ movq %mm3, 40(%ecx)
-/ movq %mm7, 32(%ecx)
-    movups %xmm2, 32(%ecx)
-
-
-// phase 2 fpu
-/* Phase 2*/
-/*
-    flds (%edx)
-    fadds 60(%edx)
-    fstps (%ecx)
-
-    flds 4(%edx)
-    fadds 56(%edx)
-    fstps 4(%ecx)
-
-    flds (%edx)
-    fsubs 60(%edx)
-    fmuls 64(%ebx)
-    fstps 60(%ecx)
-
-    flds 4(%edx)
-    fsubs 56(%edx)
-    fmuls 68(%ebx)
-    fstps 56(%ecx)
-
-    flds 8(%edx)
-    fadds 52(%edx)
-    fstps 8(%ecx)
-
-    flds 12(%edx)
-    fadds 48(%edx)
-    fstps 12(%ecx)
-
-    flds 8(%edx)
-    fsubs 52(%edx)
-    fmuls 72(%ebx)
-    fstps 52(%ecx)
-
-    flds 12(%edx)
-    fsubs 48(%edx)
-    fmuls 76(%ebx)
-    fstps 48(%ecx)
-
-    flds 16(%edx)
-    fadds 44(%edx)
-    fstps 16(%ecx)
-
-    flds 20(%edx)
-    fadds 40(%edx)
-    fstps 20(%ecx)
-
-    flds 16(%edx)
-    fsubs 44(%edx)
-    fmuls 80(%ebx)
-    fstps 44(%ecx)
-
-    flds 20(%edx)
-    fsubs 40(%edx)
-    fmuls 84(%ebx)
-    fstps 40(%ecx)
-
-    flds 24(%edx)
-    fadds 36(%edx)
-    fstps 24(%ecx)
-
-    flds 28(%edx)
-    fadds 32(%edx)
-    fstps 28(%ecx)
-
-    flds 24(%edx)
-    fsubs 36(%edx)
-    fmuls 88(%ebx)
-    fstps 36(%ecx)
-
-    flds 28(%edx)
-    fsubs 32(%edx)
-    fmuls 92(%ebx)
-    fstps 32(%ecx)
-*/
-// end phase 2 fpu
-
-/* Phase 3 (completed, working) */
-
-/ movq 64(%edx), %mm0
-/ movq 72(%edx), %mm4
-    movups 64(%edx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 120(%edx), %mm1
-/ movq 112(%edx), %mm5
-    movups 112(%edx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 64(%ecx)
-/ movq %mm4, 72(%ecx)
-    movups %xmm0, 64(%ecx)
-
-/ pfsubr %mm1, %mm3
-/ pfsubr %mm5, %mm7
-// optimized (xmm1<->xmm2)
-    subps %xmm2, %xmm1
-
-/ pfmul 64(%ebx), %mm3
-/ pfmul 72(%ebx), %mm7
-    movups 64(%ebx), %xmm7
-    mulps %xmm7, %xmm1
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm1, %xmm1
-
-/ movq %mm3, 120(%ecx)
-/ movq %mm7, 112(%ecx)
-    movups %xmm1, 112(%ecx)
-
-
-/ movq 80(%edx), %mm0
-/ movq 88(%edx), %mm4
-    movups 80(%edx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 104(%edx), %mm1
-/ movq 96(%edx), %mm5
-    movups 96(%edx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 80(%ecx)
-/ movq %mm4, 88(%ecx)
-    movups %xmm0, 80(%ecx)
-
-/ pfsubr %mm1, %mm3
-/ pfsubr %mm5, %mm7
-// optimized (xmm1<->xmm2)
-    subps %xmm2, %xmm1
-
-/ pfmul 80(%ebx), %mm3
-/ pfmul 88(%ebx), %mm7
-    movups 80(%ebx), %xmm7
-    mulps %xmm7, %xmm1
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm1, %xmm1
-
-/ movq %mm3, 104(%ecx)
-/ movq %mm7, 96(%ecx)
-    movups %xmm1, 96(%ecx)
-
-
-// phase 3 fpu
-/* Phase 3*/
-/*
-    flds 64(%edx)
-    fadds 124(%edx)
-    fstps 64(%ecx)
-
-    flds 68(%edx)
-    fadds 120(%edx)
-    fstps 68(%ecx)
-
-    flds 124(%edx)
-    fsubs 64(%edx)
-    fmuls 64(%ebx)
-    fstps 124(%ecx)
-
-    flds 120(%edx)
-    fsubs 68(%edx)
-    fmuls 68(%ebx)
-    fstps 120(%ecx)
-
-    flds 72(%edx)
-    fadds 116(%edx)
-    fstps 72(%ecx)
-
-    flds 76(%edx)
-    fadds 112(%edx)
-    fstps 76(%ecx)
-
-    flds 116(%edx)
-    fsubs 72(%edx)
-    fmuls 72(%ebx)
-    fstps 116(%ecx)
-
-    flds 112(%edx)
-    fsubs 76(%edx)
-    fmuls 76(%ebx)
-    fstps 112(%ecx)
-
-    flds 80(%edx)
-    fadds 108(%edx)
-    fstps 80(%ecx)
-
-    flds 84(%edx)
-    fadds 104(%edx)
-    fstps 84(%ecx)
-
-    flds 108(%edx)
-    fsubs 80(%edx)
-    fmuls 80(%ebx)
-    fstps 108(%ecx)
-
-    flds 104(%edx)
-    fsubs 84(%edx)
-    fmuls 84(%ebx)
-    fstps 104(%ecx)
-
-    flds 88(%edx)
-    fadds 100(%edx)
-    fstps 88(%ecx)
-
-    flds 92(%edx)
-    fadds 96(%edx)
-    fstps 92(%ecx)
-
-    flds 100(%edx)
-    fsubs 88(%edx)
-    fmuls 88(%ebx)
-    fstps 100(%ecx)
-
-    flds 96(%edx)
-    fsubs 92(%edx)
-    fmuls 92(%ebx)
-    fstps 96(%ecx)
-*/
-// end phase 3 fpu
-
-
-/* Phase 4 (completed, buggy) */
-/*
-/ movq 96(%ebx), %mm2
-/ movq 104(%ebx), %mm6
-    movups 96(%ebx), %xmm4
-
-
-/ movq (%ecx), %mm0
-/ movq 8(%ecx), %mm4
-    movups (%ecx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 24(%ecx), %mm1
-/ movq 16(%ecx), %mm5
-    movups 16(%ecx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, (%edx)
-/ movq %mm4, 8(%edx)
-    movups %xmm0, (%edx)
-
-/ pfsub %mm1, %mm3
-/ pfsub %mm5, %mm7
-    subps %xmm1, %xmm2
-
-/ pfmul %mm2, %mm3
-/ pfmul %mm6, %mm7
-    mulps %xmm4, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm2, %xmm2
-
-/ movq %mm3, 24(%edx)
-/ movq %mm7, 16(%edx)
-    movups %xmm2, 16(%edx)
-
-/ movq 32(%ecx), %mm0
-/ movq 40(%ecx), %mm4
-    movups 32(%ecx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 56(%ecx), %mm1
-/ movq 48(%ecx), %mm5
-    movups 48(%ecx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 32(%edx)
-/ movq %mm4, 40(%edx)
-    movups %xmm0, 32(%edx)
-
-/ pfsubr %mm1, %mm3
-/ pfsubr %mm5, %mm7
-// Luckily we can swap this (xmm1<->xmm2)
-    subps %xmm2, %xmm1
-
-/ pfmul %mm2, %mm3
-/ pfmul %mm6, %mm7
-    mulps %xmm4, %xmm1
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm1, %xmm1
-
-/ movq %mm3, 56(%edx)
-/ movq %mm7, 48(%edx)
-    movups %xmm1, 48(%edx)
-
-
-/ movq 64(%ecx), %mm0
-/ movq 72(%ecx), %mm4
-    movups 64(%ecx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 88(%ecx), %mm1
-/ movq 80(%ecx), %mm5
-    movups 80(%ecx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 64(%edx)
-/ movq %mm4, 72(%edx)
-    movups %xmm0, 64(%edx)
-
-/ pfsub %mm1, %mm3
-/ pfsub %mm5, %mm7
-    subps %xmm1, %xmm2
-
-/ pfmul %mm2, %mm3
-/ pfmul %mm6, %mm7
-    mulps %xmm4, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm2, %xmm2
-
-/ movq %mm3, 88(%edx)
-/ movq %mm7, 80(%edx)
-    movups %xmm2, 80(%edx)
-
-
-/ movq 96(%ecx), %mm0
-/ movq 104(%ecx), %mm4
-    movups 96(%ecx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 120(%ecx), %mm1
-/ movq 112(%ecx), %mm5
-    movups 112(%ecx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-//// shufps $177, %xmm1, %xmm1
-    shufps $27, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 96(%edx)
-/ movq %mm4, 104(%edx)
-    movups %xmm0, 96(%edx)
-
-/ pfsubr %mm1, %mm3
-/ pfsubr %mm5, %mm7
-// This is already optimized, so xmm2 must be swapped with xmm1 for rest of phase
-    subps %xmm2, %xmm1
-
-/ pfmul %mm2, %mm3
-/ pfmul %mm6, %mm7
-    mulps %xmm4, %xmm1
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $27, %xmm1, %xmm1
-
-/ movq %mm3, 120(%edx)
-/ movq %mm7, 112(%edx)
-    movups %xmm1, 112(%edx)
-*/
-
-// phase 4 fpu code
-/* Phase 4*/
-
-    flds (%ecx)
-    fadds 28(%ecx)
-    fstps (%edx)
-
-    flds (%ecx)
-    fsubs 28(%ecx)
-    fmuls 96(%ebx)
-    fstps 28(%edx)
-
-    flds 4(%ecx)
-    fadds 24(%ecx)
-    fstps 4(%edx)
-
-    flds 4(%ecx)
-    fsubs 24(%ecx)
-    fmuls 100(%ebx)
-    fstps 24(%edx)
-
-    flds 8(%ecx)
-    fadds 20(%ecx)
-    fstps 8(%edx)
-
-    flds 8(%ecx)
-    fsubs 20(%ecx)
-    fmuls 104(%ebx)
-    fstps 20(%edx)
-
-    flds 12(%ecx)
-    fadds 16(%ecx)
-    fstps 12(%edx)
-
-    flds 12(%ecx)
-    fsubs 16(%ecx)
-    fmuls 108(%ebx)
-    fstps 16(%edx)
-
-    flds 32(%ecx)
-    fadds 60(%ecx)
-    fstps 32(%edx)
-
-    flds 60(%ecx)
-    fsubs 32(%ecx)
-    fmuls 96(%ebx)
-    fstps 60(%edx)
-
-    flds 36(%ecx)
-    fadds 56(%ecx)
-    fstps 36(%edx)
-
-    flds 56(%ecx)
-    fsubs 36(%ecx)
-    fmuls 100(%ebx)
-    fstps 56(%edx)
-
-    flds 40(%ecx)
-    fadds 52(%ecx)
-    fstps 40(%edx)
-
-    flds 52(%ecx)
-    fsubs 40(%ecx)
-    fmuls 104(%ebx)
-    fstps 52(%edx)
-
-    flds 44(%ecx)
-    fadds 48(%ecx)
-    fstps 44(%edx)
-
-    flds 48(%ecx)
-    fsubs 44(%ecx)
-    fmuls 108(%ebx)
-    fstps 48(%edx)
-
-    flds 64(%ecx)
-    fadds 92(%ecx)
-    fstps 64(%edx)
-
-    flds 64(%ecx)
-    fsubs 92(%ecx)
-    fmuls 96(%ebx)
-    fstps 92(%edx)
-
-    flds 68(%ecx)
-    fadds 88(%ecx)
-    fstps 68(%edx)
-
-    flds 68(%ecx)
-    fsubs 88(%ecx)
-    fmuls 100(%ebx)
-    fstps 88(%edx)
-
-    flds 72(%ecx)
-    fadds 84(%ecx)
-    fstps 72(%edx)
-
-    flds 72(%ecx)
-    fsubs 84(%ecx)
-    fmuls 104(%ebx)
-    fstps 84(%edx)
-
-    flds 76(%ecx)
-    fadds 80(%ecx)
-    fstps 76(%edx)
-
-    flds 76(%ecx)
-    fsubs 80(%ecx)
-    fmuls 108(%ebx)
-    fstps 80(%edx)
-
-    flds 96(%ecx)
-    fadds 124(%ecx)
-    fstps 96(%edx)
-
-    flds 124(%ecx)
-    fsubs 96(%ecx)
-    fmuls 96(%ebx)
-    fstps 124(%edx)
-
-    flds 100(%ecx)
-    fadds 120(%ecx)
-    fstps 100(%edx)
-
-    flds 120(%ecx)
-    fsubs 100(%ecx)
-    fmuls 100(%ebx)
-    fstps 120(%edx)
-
-    flds 104(%ecx)
-    fadds 116(%ecx)
-    fstps 104(%edx)
-
-    flds 116(%ecx)
-    fsubs 104(%ecx)
-    fmuls 104(%ebx)
-    fstps 116(%edx)
-
-    flds 108(%ecx)
-    fadds 112(%ecx)
-    fstps 108(%edx)
-
-    flds 112(%ecx)
-    fsubs 108(%ecx)
-    fmuls 108(%ebx)
-    fstps 112(%edx)
-
-    flds (%edx)
-    fadds 12(%edx)
-    fstps (%ecx)
-
-    flds (%edx)
-    fsubs 12(%edx)
-    fmuls 112(%ebx)
-    fstps 12(%ecx)
-
-    flds 4(%edx)
-    fadds 8(%edx)
-    fstps 4(%ecx)
-
-    flds 4(%edx)
-    fsubs 8(%edx)
-    fmuls 116(%ebx)
-    fstps 8(%ecx)
-
-    flds 16(%edx)
-    fadds 28(%edx)
-    fstps 16(%ecx)
-
-    flds 28(%edx)
-    fsubs 16(%edx)
-    fmuls 112(%ebx)
-    fstps 28(%ecx)
-
-    flds 20(%edx)
-    fadds 24(%edx)
-    fstps 20(%ecx)
-
-    flds 24(%edx)
-    fsubs 20(%edx)
-    fmuls 116(%ebx)
-    fstps 24(%ecx)
-
-    flds 32(%edx)
-    fadds 44(%edx)
-    fstps 32(%ecx)
-
-    flds 32(%edx)
-    fsubs 44(%edx)
-    fmuls 112(%ebx)
-    fstps 44(%ecx)
-
-    flds 36(%edx)
-    fadds 40(%edx)
-    fstps 36(%ecx)
-
-    flds 36(%edx)
-    fsubs 40(%edx)
-    fmuls 116(%ebx)
-    fstps 40(%ecx)
-
-    flds 48(%edx)
-    fadds 60(%edx)
-    fstps 48(%ecx)
-
-    flds 60(%edx)
-    fsubs 48(%edx)
-    fmuls 112(%ebx)
-    fstps 60(%ecx)
-
-    flds 52(%edx)
-    fadds 56(%edx)
-    fstps 52(%ecx)
-
-    flds 56(%edx)
-    fsubs 52(%edx)
-    fmuls 116(%ebx)
-    fstps 56(%ecx)
-
-    flds 64(%edx)
-    fadds 76(%edx)
-    fstps 64(%ecx)
-
-    flds 64(%edx)
-    fsubs 76(%edx)
-    fmuls 112(%ebx)
-    fstps 76(%ecx)
-
-    flds 68(%edx)
-    fadds 72(%edx)
-    fstps 68(%ecx)
-
-    flds 68(%edx)
-    fsubs 72(%edx)
-    fmuls 116(%ebx)
-    fstps 72(%ecx)
-
-    flds 80(%edx)
-    fadds 92(%edx)
-    fstps 80(%ecx)
-
-    flds 92(%edx)
-    fsubs 80(%edx)
-    fmuls 112(%ebx)
-    fstps 92(%ecx)
-
-    flds 84(%edx)
-    fadds 88(%edx)
-    fstps 84(%ecx)
-
-    flds 88(%edx)
-    fsubs 84(%edx)
-    fmuls 116(%ebx)
-    fstps 88(%ecx)
-
-    flds 96(%edx)
-    fadds 108(%edx)
-    fstps 96(%ecx)
-
-    flds 96(%edx)
-    fsubs 108(%edx)
-    fmuls 112(%ebx)
-    fstps 108(%ecx)
-
-    flds 100(%edx)
-    fadds 104(%edx)
-    fstps 100(%ecx)
-
-    flds 100(%edx)
-    fsubs 104(%edx)
-    fmuls 116(%ebx)
-    fstps 104(%ecx)
-
-    flds 112(%edx)
-    fadds 124(%edx)
-    fstps 112(%ecx)
-
-    flds 124(%edx)
-    fsubs 112(%edx)
-    fmuls 112(%ebx)
-    fstps 124(%ecx)
-
-    flds 116(%edx)
-    fadds 120(%edx)
-    fstps 116(%ecx)
-
-    flds 120(%edx)
-    fsubs 116(%edx)
-    fmuls 116(%ebx)
-    fstps 120(%ecx)
-
-// end of phase 4 fpu
-
-// below stuff needs to be finished I use FPU code for first
-/* Phase 5 (completed, crashing) */
-/*
-/ movq 112(%ebx), %mm2
-    // move 8 byte data to (low)high quadword - check this! atmos
-    movlps 112(%ebx), %xmm4
-    // maybe I need movhlps too to get data into correct quadword
-    movlhps %xmm4, %xmm4
-
-/ movq (%edx), %mm0
-/ movq 16(%edx), %mm4
-    movups (%edx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-// hmm? this is strange
-/ movq 8(%edx), %mm1
-/ movq 24(%edx), %mm5
-    movlps 8(%edx), %xmm1
-    movhps 24(%edx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-    pshufd $177, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, (%ecx)
-/ movq %mm4, 16(%ecx)
-    movlps %xmm0, (%ecx)
-    movhps %xmm0, 16(%ecx)
-
-/ pfsub %mm1, %mm3
-/ pfsubr %mm5, %mm7
-// I need to emulate pfsubr here
-    movaps %xmm1, %xmm3
-    subps %xmm2, %xmm3
-    subps %xmm1, %xmm2
-// now move correct quadword from reverse substration in xmm3 to correct
-// quadword in xmm2 and leave other quadword with non-reversed substration untouched
-/// shufpd $2, %xmm3, %xmm2
-// (or $1?) (see ia32-ref p.749)
-// optimize
-    movq %xmm2, %xmm3
-    movaps %xmm3, %xmm2
-
-/ pfmul %mm2, %mm3
-/ pfmul %mm2, %mm7
-    mulps %xmm4, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $177, %xmm2, %xmm2
-
-/ movq %mm3, 8(%ecx)
-/ movq %mm7, 24(%ecx)
-    movlps %xmm2, 8(%ecx)
-    movhps %xmm2, 24(%ecx)
-
-/ movq 32(%edx), %mm0
-/ movq 48(%edx), %mm4
-    movlps 32(%edx), %xmm0
-    movhps 48(%edx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 40(%edx), %mm1
-/ movq 56(%edx), %mm5
-    movlps 40(%edx), %xmm1
-    movhps 56(%edx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-    shufps $177, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 32(%ecx)
-/ movq %mm4, 48(%ecx)
-    movlps %xmm0, 32(%ecx)
-    movhps %xmm0, 48(%ecx)
-
-/ pfsub %mm1, %mm3
-/ pfsubr %mm5, %mm7
-    movaps %xmm1, %xmm3
-    subps %xmm2, %xmm3
-    subps %xmm1, %xmm2
-/// shufpd $2, %xmm3, %xmm2
-// (or $1?)
-// optimize
-    movq %xmm2, %xmm3
-    movaps %xmm3, %xmm2
-
-/ pfmul %mm2, %mm3
-/ pfmul %mm2, %mm7
-    mulps %xmm4, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $177, %xmm2, %xmm2
-
-/ movq %mm3, 40(%ecx)
-/ movq %mm7, 56(%ecx)
-    movlps %xmm2, 40(%ecx)
-    movhps %xmm2, 56(%ecx)
-
-
-/ movq 64(%edx), %mm0
-/ movq 80(%edx), %mm4
-    movlps 64(%edx), %xmm0
-    movhps 80(%edx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 72(%edx), %mm1
-/ movq 88(%edx), %mm5
-    movlps 72(%edx), %xmm1
-    movhps 88(%edx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-    shufps $177, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 64(%ecx)
-/ movq %mm4, 80(%ecx)
-    movlps %xmm0, 64(%ecx)
-    movhps %xmm0, 80(%ecx)
-
-/ pfsub %mm1, %mm3
-/ pfsubr %mm5, %mm7
-    movaps %xmm1, %xmm3
-    subps %xmm2, %xmm3
-    subps %xmm1, %xmm2
-/// shufpd $2, %xmm3, %xmm2
-// (or $1?)
-// optimize
-    movq %xmm2, %xmm3
-    movaps %xmm3, %xmm2
-
-/ pfmul %mm2, %mm3
-/ pfmul %mm2, %mm7
-    mulps %xmm4, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $177, %xmm2, %xmm2
-
-/ movq %mm3, 72(%ecx)
-/ movq %mm7, 88(%ecx)
-    movlps %xmm2, 72(%ecx)
-    movhps %xmm2, 88(%ecx)
-
-/ movq 96(%edx), %mm0
-/ movq 112(%edx), %mm4
-    movups 96(%edx), %xmm0
-
-/ movq %mm0, %mm3
-/ movq %mm4, %mm7
-    movaps %xmm0, %xmm2
-
-/ movq 104(%edx), %mm1
-/ movq 120(%edx), %mm5
-    movlps 104(%edx), %xmm1
-    movhps 120(%edx), %xmm1
-
-/ pswapd %mm1, %mm1
-/ pswapd %mm5, %mm5
-    shufps $177, %xmm1, %xmm1
-
-/ pfadd %mm1, %mm0
-/ pfadd %mm5, %mm4
-    addps %xmm1, %xmm0
-
-/ movq %mm0, 96(%ecx)
-/ movq %mm4, 112(%ecx)
-    movups %xmm0, 96(%ecx)
-
-/ pfsub %mm1, %mm3
-/ pfsubr %mm5, %mm7
-    movaps %xmm1, %xmm3
-    subps %xmm2, %xmm3
-    subps %xmm1, %xmm2
-/// shufpd $2, %xmm3, %xmm2
-// (or $1?)
-// optimize
-    movq %xmm2, %xmm3
-    movaps %xmm3, %xmm2
-
-/ pfmul %mm2, %mm3
-/ pfmul %mm2, %mm7
-    mulps %xmm4, %xmm2
-
-/ pswapd %mm3, %mm3
-/ pswapd %mm7, %mm7
-    shufps $177, %xmm2, %xmm2
-
-/ movq %mm3, 104(%ecx)
-/ movq %mm7, 120(%ecx)
-    movlps %xmm2, 104(%ecx)
-    movhps %xmm2, 120(%ecx)
-*/
-
-
-/* Phase 6. This is the end of easy road. */
-/* Code below is coded in scalar mode. Should be optimized */
-//
-// movd plus_1f, %mm6
-// punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/
-// movq x_plus_minus_3dnow, %mm7 /* mm7 = +1 | -1 */
-/*
-    movq 32(%ecx), %mm0
-    movq 64(%ecx), %mm2
-    movq %mm0, %mm1
-    movq %mm2, %mm3
-    pxor %mm7, %mm1
-    pxor %mm7, %mm3
-    pfacc %mm1, %mm0
-    pfacc %mm3, %mm2
-    pfmul %mm6, %mm0
-    pfmul %mm6, %mm2
-    movq %mm0, 32(%edx)
-    movq %mm2, 64(%edx)
-
-    movd 44(%ecx), %mm0
-    movd 40(%ecx), %mm2
-    movd 120(%ebx), %mm3
-    punpckldq 76(%ecx), %mm0
-    punpckldq 72(%ecx), %mm2
-    punpckldq %mm3, %mm3
-    movq %mm0, %mm4
-    movq %mm2, %mm5
-    pfsub %mm2, %mm0
-    pfmul %mm3, %mm0
-    movq %mm0, %mm1
-    pfadd %mm5, %mm0
-    pfadd %mm4, %mm0
-    movq %mm0, %mm2
-    punpckldq %mm1, %mm0
-    punpckhdq %mm1, %mm2
-    movq %mm0, 40(%edx)
-    movq %mm2, 72(%edx)
-
-    movd 48(%ecx), %mm3
-    movd 60(%ecx), %mm2
-    pfsub 52(%ecx), %mm3
-    pfsub 56(%ecx), %mm2
-    pfmul 120(%ebx), %mm3
-    pfmul 120(%ebx), %mm2
-    movq %mm2, %mm1
-
-    pfadd 56(%ecx), %mm1
-    pfadd 60(%ecx), %mm1
-    movq %mm1, %mm0
-
-    pfadd 48(%ecx), %mm0
-    pfadd 52(%ecx), %mm0
-    pfadd %mm3, %mm1
-    punpckldq %mm2, %mm1
-    pfadd %mm3, %mm2
-    punpckldq %mm2, %mm0
-    movq %mm1, 56(%edx)
-    movq %mm0, 48(%edx)
-*/
-/*---*/
-/*
-    movd 92(%ecx), %mm1
-    pfsub 88(%ecx), %mm1
-    pfmul 120(%ebx), %mm1
-    movd %mm1, 92(%edx)
-    pfadd 92(%ecx), %mm1
-    pfadd 88(%ecx), %mm1
-    movq %mm1, %mm0
-
-    pfadd 80(%ecx), %mm0
-    pfadd 84(%ecx), %mm0
-    movd %mm0, 80(%edx)
-
-    movd 80(%ecx), %mm0
-    pfsub 84(%ecx), %mm0
-    pfmul 120(%ebx), %mm0
-    pfadd %mm0, %mm1
-    pfadd 92(%edx), %mm0
-    punpckldq %mm1, %mm0
-    movq %mm0, 84(%edx)
-
-    movq 96(%ecx), %mm0
-    movq %mm0, %mm1
-    pxor %mm7, %mm1
-    pfacc %mm1, %mm0
-    pfmul %mm6, %mm0
-    movq %mm0, 96(%edx)
-
-    movd 108(%ecx), %mm0
-    pfsub 104(%ecx), %mm0
-    pfmul 120(%ebx), %mm0
-    movd %mm0, 108(%edx)
-    pfadd 104(%ecx), %mm0
-    pfadd 108(%ecx), %mm0
-    movd %mm0, 104(%edx)
-
-    movd 124(%ecx), %mm1
-    pfsub 120(%ecx), %mm1
-    pfmul 120(%ebx), %mm1
-    movd %mm1, 124(%edx)
-    pfadd 120(%ecx), %mm1
-    pfadd 124(%ecx), %mm1
-    movq %mm1, %mm0
-
-    pfadd 112(%ecx), %mm0
-    pfadd 116(%ecx), %mm0
-    movd %mm0, 112(%edx)
-
-    movd 112(%ecx), %mm0
-    pfsub 116(%ecx), %mm0
-    pfmul 120(%ebx), %mm0
-    pfadd %mm0,%mm1
-    pfadd 124(%edx), %mm0
-    punpckldq %mm1, %mm0
-    movq %mm0, 116(%edx)
-
-    jnz .L01
-*/
-
-
-/* Phase 7*/
-/* Code below is coded in scalar mode. Should be optimized */
-/*
-    movd (%ecx), %mm0
-    pfadd 4(%ecx), %mm0
-    movd %mm0, 1024(%esi)
-
-    movd (%ecx), %mm0
-    pfsub 4(%ecx), %mm0
-    pfmul 120(%ebx), %mm0
-    movd %mm0, (%esi)
-    movd %mm0, (%edi)
-
-    movd 12(%ecx), %mm0
-    pfsub 8(%ecx), %mm0
-    pfmul 120(%ebx), %mm0
-    movd %mm0, 512(%edi)
-    pfadd 12(%ecx), %mm0
-    pfadd 8(%ecx), %mm0
-    movd %mm0, 512(%esi)
-
-    movd 16(%ecx), %mm0
-    pfsub 20(%ecx), %mm0
-    pfmul 120(%ebx), %mm0
-    movq %mm0, %mm3
-
-    movd 28(%ecx), %mm0
-    pfsub 24(%ecx), %mm0
-    pfmul 120(%ebx), %mm0
-    movd %mm0, 768(%edi)
-    movq %mm0, %mm2
-
-    pfadd 24(%ecx), %mm0
-    pfadd 28(%ecx), %mm0
-    movq %mm0, %mm1
-
-    pfadd 16(%ecx), %mm0
-    pfadd 20(%ecx), %mm0
-    movd %mm0, 768(%esi)
-    pfadd %mm3, %mm1
-    movd %mm1, 256(%esi)
-    pfadd %mm3, %mm2
-    movd %mm2, 256(%edi)
-*/
-
-
-/* Phase 8*/
-/*
-    movq 32(%edx), %mm0
-    movq 48(%edx), %mm1
-    pfadd 48(%edx), %mm0
-    pfadd 40(%edx), %mm1
-    movd %mm0, 896(%esi)
-    movd %mm1, 640(%esi)
-    psrlq $32, %mm0
-    psrlq $32, %mm1
-    movd %mm0, 128(%edi)
-    movd %mm1, 384(%edi)
-
-    movd 40(%edx), %mm0
-    pfadd 56(%edx), %mm0
-    movd %mm0, 384(%esi)
-
-    movd 56(%edx), %mm0
-    pfadd 36(%edx), %mm0
-    movd %mm0, 128(%esi)
-
-    movd 60(%edx), %mm0
-    movd %mm0, 896(%edi)
-    pfadd 44(%edx), %mm0
-    movd %mm0, 640(%edi)
-
-    movq 96(%edx), %mm0
-    movq 112(%edx), %mm2
-    movq 104(%edx), %mm4
-    pfadd 112(%edx), %mm0
-    pfadd 104(%edx), %mm2
-    pfadd 120(%edx), %mm4
-    movq %mm0, %mm1
-    movq %mm2, %mm3
-    movq %mm4, %mm5
-    pfadd 64(%edx), %mm0
-    pfadd 80(%edx), %mm2
-    pfadd 72(%edx), %mm4
-    movd %mm0, 960(%esi)
-    movd %mm2, 704(%esi)
-    movd %mm4, 448(%esi)
-    psrlq $32, %mm0
-    psrlq $32, %mm2
-    psrlq $32, %mm4
-    movd %mm0, 64(%edi)
-    movd %mm2, 320(%edi)
-    movd %mm4, 576(%edi)
-    pfadd 80(%edx), %mm1
-    pfadd 72(%edx), %mm3
-    pfadd 88(%edx), %mm5
-    movd %mm1, 832(%esi)
-    movd %mm3, 576(%esi)
-    movd %mm5, 320(%esi)
-    psrlq $32, %mm1
-    psrlq $32, %mm3
-    psrlq $32, %mm5
-    movd %mm1, 192(%edi)
-    movd %mm3, 448(%edi)
-    movd %mm5, 704(%edi)
-
-    movd 120(%edx), %mm0
-    pfadd 100(%edx), %mm0
-    movq %mm0, %mm1
-    pfadd 88(%edx), %mm0
-    movd %mm0, 192(%esi)
-    pfadd 68(%edx), %mm1
-    movd %mm1, 64(%esi)
-
-    movd 124(%edx), %mm0
-    movd %mm0, 960(%edi)
-    pfadd 92(%edx), %mm0
-    movd %mm0, 832(%edi)
-
-    jmp .L_bye
-.L01:
-*/
-
-
-/* Phase 9*/
-/*
-    movq (%ecx), %mm0
-    movq %mm0, %mm1
-    pxor %mm7, %mm1
-    pfacc %mm1, %mm0
-    pfmul %mm6, %mm0
-    pf2id %mm0, %mm0
-    movd %mm0, %eax
-    movw %ax, 512(%esi)
-    psrlq $32, %mm0
-    movd %mm0, %eax
-    movw %ax, (%esi)
-
-    movd 12(%ecx), %mm0
-    pfsub 8(%ecx), %mm0
-    pfmul 120(%ebx), %mm0
-    pf2id %mm0, %mm7
-    movd %mm7, %eax
-    movw %ax, 256(%edi)
-    pfadd 12(%ecx), %mm0
-    pfadd 8(%ecx), %mm0
-    pf2id %mm0, %mm0
-    movd %mm0, %eax
-    movw %ax, 256(%esi)
-
-    movd 16(%ecx), %mm3
-    pfsub 20(%ecx), %mm3
-    pfmul 120(%ebx), %mm3
-    movq %mm3, %mm2
-
-    movd 28(%ecx), %mm2
-    pfsub 24(%ecx), %mm2
-    pfmul 120(%ebx), %mm2
-    movq %mm2, %mm1
-
-    pf2id %mm2, %mm7
-    movd %mm7, %eax
-    movw %ax, 384(%edi)
-
-    pfadd 24(%ecx), %mm1
-    pfadd 28(%ecx), %mm1
-    movq %mm1, %mm0
-
-    pfadd 16(%ecx), %mm0
-    pfadd 20(%ecx), %mm0
-    pf2id %mm0, %mm0
-    movd %mm0, %eax
-    movw %ax, 384(%esi)
-    pfadd %mm3, %mm1
-    pf2id %mm1, %mm1
-    movd %mm1, %eax
-    movw %ax, 128(%esi)
-    pfadd %mm3, %mm2
-    pf2id %mm2, %mm2
-    movd %mm2, %eax
-    movw %ax, 128(%edi)
-*/
-
-
-/* Phase 10*/
-/*
-    movq 32(%edx), %mm0
-    movq 48(%edx), %mm1
-    pfadd 48(%edx), %mm0
-    pfadd 40(%edx), %mm1
-    pf2id %mm0, %mm0
-    pf2id %mm1, %mm1
-    movd %mm0, %eax
-    movd %mm1, %ecx
-    movw %ax, 448(%esi)
-    movw %cx, 320(%esi)
-    psrlq $32, %mm0
-    psrlq $32, %mm1
-    movd %mm0, %eax
-    movd %mm1, %ecx
-    movw %ax, 64(%edi)
-    movw %cx, 192(%edi)
-
-    movd 40(%edx), %mm3
-    movd 56(%edx), %mm4
-    movd 60(%edx), %mm0
-    movd 44(%edx), %mm2
-    movd 120(%edx), %mm5
-    punpckldq %mm4, %mm3
-    punpckldq 124(%edx), %mm0
-    pfadd 100(%edx), %mm5
-    punpckldq 36(%edx), %mm4
-    punpckldq 92(%edx), %mm2
-    movq %mm5, %mm6
-    pfadd %mm4, %mm3
-    pf2id %mm0, %mm1
-    pf2id %mm3, %mm3
-    pfadd 88(%edx), %mm5
-    movd %mm1, %eax
-    movd %mm3, %ecx
-    movw %ax, 448(%edi)
-    movw %cx, 192(%esi)
-    pf2id %mm5, %mm5
-    psrlq $32, %mm1
-    psrlq $32, %mm3
-    movd %mm5, %ebx
-    movd %mm1, %eax
-    movd %mm3, %ecx
-    movw %bx, 96(%esi)
-    movw %ax, 480(%edi)
-    movw %cx, 64(%esi)
-    pfadd %mm2, %mm0
-    pf2id %mm0, %mm0
-    movd %mm0, %eax
-    pfadd 68(%edx), %mm6
-    movw %ax, 320(%edi)
-    psrlq $32, %mm0
-    pf2id %mm6, %mm6
-    movd %mm0, %eax
-    movd %mm6, %ebx
-    movw %ax, 416(%edi)
-    movw %bx, 32(%esi)
-
-    movq 96(%edx), %mm0
-    movq 112(%edx), %mm2
-    movq 104(%edx), %mm4
-    pfadd %mm2, %mm0
-    pfadd %mm4, %mm2
-    pfadd 120(%edx), %mm4
-    movq %mm0, %mm1
-    movq %mm2, %mm3
-    movq %mm4, %mm5
-    pfadd 64(%edx), %mm0
-    pfadd 80(%edx), %mm2
-    pfadd 72(%edx), %mm4
-    pf2id %mm0, %mm0
-    pf2id %mm2, %mm2
-    pf2id %mm4, %mm4
-    movd %mm0, %eax
-    movd %mm2, %ecx
-    movd %mm4, %ebx
-    movw %ax, 480(%esi)
-    movw %cx, 352(%esi)
-    movw %bx, 224(%esi)
-    psrlq $32, %mm0
-    psrlq $32, %mm2
-    psrlq $32, %mm4
-    movd %mm0, %eax
-    movd %mm2, %ecx
-    movd %mm4, %ebx
-    movw %ax, 32(%edi)
-    movw %cx, 160(%edi)
-    movw %bx, 288(%edi)
-    pfadd 80(%edx), %mm1
-    pfadd 72(%edx), %mm3
-    pfadd 88(%edx), %mm5
-    pf2id %mm1, %mm1
-    pf2id %mm3, %mm3
-    pf2id %mm5, %mm5
-    movd %mm1, %eax
-    movd %mm3, %ecx
-    movd %mm5, %ebx
-    movw %ax, 416(%esi)
-    movw %cx, 288(%esi)
-    movw %bx, 160(%esi)
-    psrlq $32, %mm1
-    psrlq $32, %mm3
-    psrlq $32, %mm5
-    movd %mm1, %eax
-    movd %mm3, %ecx
-    movd %mm5, %ebx
-    movw %ax, 96(%edi)
-    movw %cx, 224(%edi)
-    movw %bx, 352(%edi)
-
-    movsw
-
-.L_bye:
-    addl $256,%esp
-/   femms
-    emms
-    popl %edi
-    popl %esi
-    popl %ebx
-    ret $12
-*/
-
-// here comes old fashioned FPU code for the tough parts
-
-/* Phase 5*/
-
-    flds 32(%ecx)
-    fadds 36(%ecx)
-    fstps 32(%edx)
-
-    flds 32(%ecx)
-    fsubs 36(%ecx)
-    fmuls 120(%ebx)
-    fstps 36(%edx)
-
-    flds 44(%ecx)
-    fsubs 40(%ecx)
-    fmuls 120(%ebx)
-    fsts 44(%edx)
-    fadds 40(%ecx)
-    fadds 44(%ecx)
-    fstps 40(%edx)
-
-    flds 48(%ecx)
-    fsubs 52(%ecx)
-    fmuls 120(%ebx)
-
-    flds 60(%ecx)
-    fsubs 56(%ecx)
-    fmuls 120(%ebx)
-    fld %st(0)
-    fadds 56(%ecx)
-    fadds 60(%ecx)
-    fld %st(0)
-    fadds 48(%ecx)
-    fadds 52(%ecx)
-    fstps 48(%edx)
-    fadd %st(2)
-    fstps 56(%edx)
-    fsts 60(%edx)
-    faddp %st(1)
-    fstps 52(%edx)
-
-    flds 64(%ecx)
-    fadds 68(%ecx)
-    fstps 64(%edx)
-
-    flds 64(%ecx)
-    fsubs 68(%ecx)
-    fmuls 120(%ebx)
-    fstps 68(%edx)
-
-    flds 76(%ecx)
-    fsubs 72(%ecx)
-    fmuls 120(%ebx)
-    fsts 76(%edx)
-    fadds 72(%ecx)
-    fadds 76(%ecx)
-    fstps 72(%edx)
-
-    flds 92(%ecx)
-    fsubs 88(%ecx)
-    fmuls 120(%ebx)
-    fsts 92(%edx)
-    fadds 92(%ecx)
-    fadds 88(%ecx)
-    fld %st(0)
-    fadds 80(%ecx)
-    fadds 84(%ecx)
-    fstps 80(%edx)
-
-    flds 80(%ecx)
-    fsubs 84(%ecx)
-    fmuls 120(%ebx)
-    fadd %st(0), %st(1)
-    fadds 92(%edx)
-    fstps 84(%edx)
-    fstps 88(%edx)
-
-    flds 96(%ecx)
-    fadds 100(%ecx)
-    fstps 96(%edx)
-
-    flds 96(%ecx)
-    fsubs 100(%ecx)
-    fmuls 120(%ebx)
-    fstps 100(%edx)
-
-    flds 108(%ecx)
-    fsubs 104(%ecx)
-    fmuls 120(%ebx)
-    fsts 108(%edx)
-    fadds 104(%ecx)
-    fadds 108(%ecx)
-    fstps 104(%edx)
-
-    flds 124(%ecx)
-    fsubs 120(%ecx)
-    fmuls 120(%ebx)
-    fsts 124(%edx)
-    fadds 120(%ecx)
-    fadds 124(%ecx)
-    fld %st(0)
-    fadds 112(%ecx)
-    fadds 116(%ecx)
-    fstps 112(%edx)
-
-    flds 112(%ecx)
-    fsubs 116(%ecx)
-    fmuls 120(%ebx)
-    fadd %st(0),%st(1)
-    fadds 124(%edx)
-    fstps 116(%edx)
-    fstps 120(%edx)
-    jnz .L01
-
-
-/* Phase 6*/
-
-    flds (%ecx)
-    fadds 4(%ecx)
-    fstps 1024(%esi)
-
-    flds (%ecx)
-    fsubs 4(%ecx)
-    fmuls 120(%ebx)
-    fsts (%esi)
-    fstps (%edi)
-
-    flds 12(%ecx)
-    fsubs 8(%ecx)
-    fmuls 120(%ebx)
-    fsts 512(%edi)
-    fadds 12(%ecx)
-    fadds 8(%ecx)
-    fstps 512(%esi)
-
-    flds 16(%ecx)
-    fsubs 20(%ecx)
-    fmuls 120(%ebx)
-
-    flds 28(%ecx)
-    fsubs 24(%ecx)
-    fmuls 120(%ebx)
-    fsts 768(%edi)
-    fld %st(0)
-    fadds 24(%ecx)
-    fadds 28(%ecx)
-    fld %st(0)
-    fadds 16(%ecx)
-    fadds 20(%ecx)
-    fstps 768(%esi)
-    fadd %st(2)
-    fstps 256(%esi)
-    faddp %st(1)
-    fstps 256(%edi)
-
-/* Phase 7*/
-
-    flds 32(%edx)
-    fadds 48(%edx)
-    fstps 896(%esi)
-
-    flds 48(%edx)
-    fadds 40(%edx)
-    fstps 640(%esi)
-
-    flds 40(%edx)
-    fadds 56(%edx)
-    fstps 384(%esi)
-
-    flds 56(%edx)
-    fadds 36(%edx)
-    fstps 128(%esi)
-
-    flds 36(%edx)
-    fadds 52(%edx)
-    fstps 128(%edi)
-
-    flds 52(%edx)
-    fadds 44(%edx)
-    fstps 384(%edi)
-
-    flds 60(%edx)
-    fsts 896(%edi)
-    fadds 44(%edx)
-    fstps 640(%edi)
-
-    flds 96(%edx)
-    fadds 112(%edx)
-    fld %st(0)
-    fadds 64(%edx)
-    fstps 960(%esi)
-    fadds 80(%edx)
-    fstps 832(%esi)
-
-    flds 112(%edx)
-    fadds 104(%edx)
-    fld %st(0)
-    fadds 80(%edx)
-    fstps 704(%esi)
-    fadds 72(%edx)
-    fstps 576(%esi)
-
-    flds 104(%edx)
-    fadds 120(%edx)
-    fld %st(0)
-    fadds 72(%edx)
-    fstps 448(%esi)
-    fadds 88(%edx)
-    fstps 320(%esi)
-
-    flds 120(%edx)
-    fadds 100(%edx)
-    fld %st(0)
-    fadds 88(%edx)
-    fstps 192(%esi)
-    fadds 68(%edx)
-    fstps 64(%esi)
-
-    flds 100(%edx)
-    fadds 116(%edx)
-    fld %st(0)
-    fadds 68(%edx)
-    fstps 64(%edi)
-    fadds 84(%edx)
-    fstps 192(%edi)
-
-    flds 116(%edx)
-    fadds 108(%edx)
-    fld %st(0)
-    fadds 84(%edx)
-    fstps 320(%edi)
-    fadds 76(%edx)
-    fstps 448(%edi)
-
-    flds 108(%edx)
-    fadds 124(%edx)
-    fld %st(0)
-    fadds 76(%edx)
-    fstps 576(%edi)
-    fadds 92(%edx)
-    fstps 704(%edi)
-
-    flds 124(%edx)
-    fsts 960(%edi)
-    fadds 92(%edx)
-    fstps 832(%edi)
-    addl $256,%esp
-    popl %edi
-    popl %esi
-    popl %ebx
-    ret
-.L01:
-/* Phase 8*/
-
-    flds (%ecx)
-    fadds 4(%ecx)
-    fistp 512(%esi)
-
-    flds (%ecx)
-    fsubs 4(%ecx)
-    fmuls 120(%ebx)
-
-    fistp (%esi)
-
-
-    flds 12(%ecx)
-    fsubs 8(%ecx)
-    fmuls 120(%ebx)
-    fist 256(%edi)
-    fadds 12(%ecx)
-    fadds 8(%ecx)
-    fistp 256(%esi)
-
-    flds 16(%ecx)
-    fsubs 20(%ecx)
-    fmuls 120(%ebx)
-
-    flds 28(%ecx)
-    fsubs 24(%ecx)
-    fmuls 120(%ebx)
-    fist 384(%edi)
-    fld %st(0)
-    fadds 24(%ecx)
-    fadds 28(%ecx)
-    fld %st(0)
-    fadds 16(%ecx)
-    fadds 20(%ecx)
-    fistp 384(%esi)
-    fadd %st(2)
-    fistp 128(%esi)
-    faddp %st(1)
-    fistp 128(%edi)
-
-/* Phase 9*/
-
-    flds 32(%edx)
-    fadds 48(%edx)
-    fistp 448(%esi)
-
-    flds 48(%edx)
-    fadds 40(%edx)
-    fistp 320(%esi)
-
-    flds 40(%edx)
-    fadds 56(%edx)
-    fistp 192(%esi)
-
-    flds 56(%edx)
-    fadds 36(%edx)
-    fistp 64(%esi)
-
-    flds 36(%edx)
-    fadds 52(%edx)
-    fistp 64(%edi)
-
-    flds 52(%edx)
-    fadds 44(%edx)
-    fistp 192(%edi)
-
-    flds 60(%edx)
-    fist 448(%edi)
-    fadds 44(%edx)
-    fistp 320(%edi)
-
-    flds 96(%edx)
-    fadds 112(%edx)
-    fld %st(0)
-    fadds 64(%edx)
-    fistp 480(%esi)
-    fadds 80(%edx)
-    fistp 416(%esi)
-
-    flds 112(%edx)
-    fadds 104(%edx)
-    fld %st(0)
-    fadds 80(%edx)
-    fistp 352(%esi)
-    fadds 72(%edx)
-    fistp 288(%esi)
-
-    flds 104(%edx)
-    fadds 120(%edx)
-    fld %st(0)
-    fadds 72(%edx)
-    fistp 224(%esi)
-    fadds 88(%edx)
-    fistp 160(%esi)
-
-    flds 120(%edx)
-    fadds 100(%edx)
-    fld %st(0)
-    fadds 88(%edx)
-    fistp 96(%esi)
-    fadds 68(%edx)
-    fistp 32(%esi)
-
-    flds 100(%edx)
-    fadds 116(%edx)
-    fld %st(0)
-    fadds 68(%edx)
-    fistp 32(%edi)
-    fadds 84(%edx)
-    fistp 96(%edi)
-
-    flds 116(%edx)
-    fadds 108(%edx)
-    fld %st(0)
-    fadds 84(%edx)
-    fistp 160(%edi)
-    fadds 76(%edx)
-    fistp 224(%edi)
-
-    flds 108(%edx)
-    fadds 124(%edx)
-    fld %st(0)
-    fadds 76(%edx)
-    fistp 288(%edi)
-    fadds 92(%edx)
-    fistp 352(%edi)
-
-    flds 124(%edx)
-    fist 480(%edi)
-    fadds 92(%edx)
-    fistp 416(%edi)
-    movsw
-    addl $256,%esp
-    popl %edi
-    popl %esi
-    popl %ebx
-    ret $12
-
-// end of FPU stuff
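The removed file's recurring pattern is worth a gloss: each pair of 3DNow! instructions working on 2 floats (`movq`/`pfadd`/`pfsub`/`pfmul` plus `pswapd`) is replaced by one SSE instruction working on 4 floats, with `shufps $27` reversing element order (27 = 0b00011011, selecting elements 3,2,1,0). Below is a minimal C sketch of that butterfly step using SSE intrinsics; it is an illustration only, and the function and buffer names are invented here, not taken from mp3lib:

```c
#include <xmmintrin.h>

/* One 4-float DCT64 butterfly step, as in the removed phase-1 code:
 * sums[i]  = a[i] + b[3-i]
 * diffs    = (a - reversed b) * costab, stored back in reversed order
 * (the two shufps $27 in the .s file). Hypothetical helper, not mp3lib API. */
static void butterfly4(const float *a, const float *b,
                       const float *costab, float *sums, float *diffs)
{
    __m128 va = _mm_loadu_ps(a);                           /* movups (%eax), %xmm0    */
    __m128 vb = _mm_loadu_ps(b);                           /* movups 112(%eax), %xmm1 */
    vb = _mm_shuffle_ps(vb, vb, _MM_SHUFFLE(0, 1, 2, 3));  /* shufps $27: reverse     */
    _mm_storeu_ps(sums, _mm_add_ps(va, vb));               /* addps + movups          */
    __m128 vd = _mm_mul_ps(_mm_sub_ps(va, vb),
                           _mm_loadu_ps(costab));          /* subps, mulps            */
    vd = _mm_shuffle_ps(vd, vd, _MM_SHUFFLE(0, 1, 2, 3));  /* reverse before the mirrored store */
    _mm_storeu_ps(diffs, vd);
}
```

The author's TODO also explains the `movups` everywhere: with the input unaligned, the faster `movaps` form (and hence much of the expected speedup) was out of reach, which is consistent with the branch staying disabled until this removal.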
--- a/mp3lib/mp3lib_objfix.sh	Wed Oct 22 20:58:46 2003 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-#!/bin/sh
-# This script fixes up symbol mangling in GNU as code of mp3lib.
-# (c)2001-2002 by Felix Buenemann <atmosfear at users.sourceforge.net>
-# This file is licensed under the GPL, more info at http://www.fsf.org/
-for i in \
-  "CpuDetect" \
-  "ipentium" \
-  "a3dnow" \
-  "isse" \
-  "dct36_3dnowex" \
-  "dct36_3dnow" \
-  "x_plus_minus_3dnow" \
-  "tfcos36" \
-  "COS9"
-do
-echo "fixing: $i=_$i"
-objcopy --redefine-sym "$i=_$i" libMP3.a
-done
-
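For context on why the deleted helper existed at all: on targets whose C ABI prefixes external symbols with an underscore, a C `extern` declaration resolves to `_name` at the object level, while hand-written GNU as code defines plain `name`; the script renamed the assembly-defined symbols inside `libMP3.a` so the two sides match. A hypothetical alternative from the C side is GCC's asm-label extension, shown here for `tfcos36` from the script's list (the array type is assumed for illustration):

```c
/* Bind the C declaration to the exact, unprefixed assembler name,
 * sidestepping the objcopy --redefine-sym rename. Illustration only. */
extern float tfcos36[9] asm("tfcos36");
```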
--- a/mp3lib/sr1.c	Wed Oct 22 20:58:46 2003 +0000
+++ b/mp3lib/sr1.c	Wed Oct 22 21:08:46 2003 +0000
@@ -386,7 +386,6 @@
 extern void dct64_MMX(real *, real *, real *);
 extern void dct64_MMX_3dnow(real *, real *, real *);
 extern void dct64_MMX_3dnowex(real *, real *, real *);
-extern void dct64_MMX_sse(real *, real *, real *);
 void (*dct64_MMX_func)(real *, real *, real *);
 
 #include "../cpudetect.h"
@@ -415,15 +414,6 @@
     synth_func = synth_1to1_MMX;
     }
 
-#if 0
-    if(gCpuCaps.hasSSE)
-    {
-        /* SSE version is buggy */
-        dct64_MMX_func = dct64_MMX_sse;
-        mp_msg(MSGT_DECAUDIO,MSGL_V,"mp3lib: using SSE optimized decore!\n");
-    }
-    else
-#endif
     if (gCpuCaps.has3DNowExt)
     {
         dct36_func=dct36_3dnowex;
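The sr1.c hunk prunes the permanently disabled SSE branch from the decoder's runtime dispatch. What survives is the usual pattern of a function pointer bound once after CPU detection; the sketch below restates it in self-contained form, with `real` and the caps struct as stand-ins for mp3lib's own definitions (from sr1.c and ../cpudetect.h), and `pick_dct64` as a hypothetical init helper, not a function in the tree:

```c
typedef float real;                        /* mp3lib builds with real == float */
struct CpuCaps { int hasMMX, has3DNow, has3DNowExt; };
extern struct CpuCaps gCpuCaps;            /* filled in by the cpudetect code */

extern void dct64_MMX(real *, real *, real *);
extern void dct64_MMX_3dnow(real *, real *, real *);
extern void dct64_MMX_3dnowex(real *, real *, real *);
void (*dct64_MMX_func)(real *, real *, real *);

/* Hypothetical init helper mirroring the if/else chain the diff edits. */
static void pick_dct64(void)
{
    if (gCpuCaps.has3DNowExt)
        dct64_MMX_func = dct64_MMX_3dnowex;  /* K7 / extended-3DNow! path */
    else if (gCpuCaps.has3DNow)
        dct64_MMX_func = dct64_MMX_3dnow;    /* plain 3DNow! path */
    else
        dct64_MMX_func = dct64_MMX;          /* MMX fallback */
}
```

With the `#if 0` block gone, no caller could ever reach `dct64_MMX_sse`, which is what lets this changeset delete the extern declaration and the whole assembly file in one step.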