Mercurial > mplayer.hg
view mp3lib/dct64_sse.s @ 7429:7a221aaf7012
Enable the LIVE lib only if the network layer (STREAMING) is enabled.
Fixed the bug where, if streaming was disabled and live was enabled, the live config
test would re-enable the network layer.
author | bertrand |
---|---|
date | Tue, 17 Sep 2002 19:47:55 +0000 |
parents | 8312f4bc8dab |
children |
line wrap: on
line source
/ This code is a translation of dct64_k7.s from MPlayer.
/ Coded by Felix Buenemann <atmosfear at users.sourceforge.net>
/
/ TODO: - fix phases 4 and 5 (sse)
/       - optimize scalar FPU code? (interleave with sse code)
/       - fix alignment (prohibits finishing this code)
/       - then use faster insns for aligned data
/
/ Note: currently code is disabled as I couldn't get input data aligned!
/
// ---------------------------------------------------------------------
// dct64_MMX_sse
//
// Syntax: GNU as, AT&T operand order (op src, dst), 32-bit x86.
// Comment conventions used below:
//   "/ insn"   the 3DNow! instruction(s) from dct64_k7.s that the
//              following SSE instruction replaces
//   "// text"  notes
//   "/* */"    phase headers and disabled reference code
//
// Stack frame after the prologue (3 pushes = 12 bytes, 256 bytes of
// locals, 4 bytes of return address):
//     0(%esp) .. 127(%esp)   scratch buffer, addressed through %ecx
//   128(%esp) .. 255(%esp)   scratch buffer, addressed through %edx
//   272(%esp)                1st stack argument -> %esi (output base)
//   276(%esp)                2nd stack argument -> %edi (output base)
//   280(%esp)                3rd stack argument -> %eax (input; offsets
//                            0..124 are read as single-precision floats)
//
// Register roles for the whole routine:
//   %eax  input pointer (phase 1 only)
//   %ebx  address of costab_mmx (defined outside this file); offsets
//         0..120 are read as single-precision multipliers
//   %ecx  scratch buffer at 0(%esp)
//   %edx  scratch buffer at 128(%esp)
//   %esi, %edi  output pointers
//   xmm0, xmm1, xmm2, xmm7  SSE temporaries (phases 1-3)
//   x87 stack: used from phase 4 on; every group below pushes and pops
//   the same number of values.
//
// Implicit input: the value of %ecx on entry. It is tested by
// "orl %ecx,%ecx" in the prologue and the resulting zero flag selects
// the output format at "jnz .L01":
//   %ecx == 0  -> phases 6-7, results stored with fstps (32-bit floats),
//                 function leaves with "ret"
//   %ecx != 0  -> phases 8-9, results stored with fistp (integers),
//                 function leaves with "ret $12"
// NOTE(review): the two exits disagree on who removes the 12 bytes of
// arguments ("ret" vs "ret $12"). The caller is not visible here --
// confirm which convention it expects before enabling this code.
// NOTE(review): the unsuffixed fist/fistp are presumably 16-bit stores
// (the disabled 3DNow! reference writes the same offsets with movw and
// the integer path ends with movsw) -- confirm the assembler default.
// ---------------------------------------------------------------------
//.data
//	.align 8
//x_plus_minus_3dnow: .long 0x00000000, 0x80000000
//plus_1f: .float 1.0
.text
	.align 16
.global dct64_MMX_sse
dct64_MMX_sse:
	// Save the registers this routine overwrites and reserve 256 bytes
	// for the two scratch buffers.
	pushl %ebx
	pushl %esi
	pushl %edi
	subl $256,%esp
	movl 280(%esp),%eax
	leal 128(%esp),%edx
	movl 272(%esp),%esi
	movl 276(%esp),%edi
	movl $costab_mmx,%ebx
	// Sets ZF from the caller-supplied %ecx. The flag is consumed by
	// "jnz .L01" at the end of phase 5.
	// NOTE(review): this relies on no instruction in between writing
	// EFLAGS; the code in between is mov/lea, SSE and x87 arithmetic only.
	// Re-check if integer arithmetic is ever inserted.
	orl %ecx,%ecx
	movl %esp,%ecx

/* Phase 1 (complete, worx) */
// Input (%eax) -> buffer (%edx), coefficients 0..60(%ebx).
// For each mirrored pair (i, 31-i):
//   edx[i]    = in[i] + in[31-i]
//   edx[31-i] = (in[i] - in[31-i]) * costab[i]
// Four floats are handled per block; the upper operand is loaded and
// lane-reversed with shufps $27 so that lanes line up.

// [1] Process Block A1 (16 Bytes)
/ movq (%eax), %mm0 / movq 8(%eax), %mm4
	movups (%eax), %xmm0
// Copy A1 to another register A2
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
// Process Block B1 (last 16 bytes)
/ movq 120(%eax), %mm1 / movq 112(%eax), %mm5
	movups 112(%eax), %xmm1
/* The PSWAPD instruction swaps or reverses the upper and lower
 * doublewords of the source operand.  PSWAPD mmreg1, mmreg2
 * performs the following operations:
 * temp = mmreg2
 * mmreg1[63:32] = temp[31:0 ]
 * mmreg1[31:0 ] = temp[63:32]
 */
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
// shufps here exchanges a,b,c,d to b,a,d,c in xmm1 (desc ia32-ref p.752)
// (that sentence describes the disabled $177 form on the next line; the
// live $27 form is the full a,b,c,d -> d,c,b,a reversal mentioned at
// "Shuffle A2" below)
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
// Add B1 to A1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
// Save Block A1
/ movq %mm0, (%edx) / movq %mm4, 8(%edx)
	movups %xmm0, (%edx)
// Sub B1 from A2
/ pfsub %mm1, %mm3 / pfsub %mm5, %mm7
	subps %xmm1, %xmm2
// Mul mem with A2
/ pfmul (%ebx), %mm3 / pfmul 8(%ebx), %mm7
	// Coefficients go through xmm7 with an unaligned load instead of
	// being used as a memory operand of mulps.
	movups (%ebx), %xmm7
	mulps %xmm7, %xmm2
// Shuffle A2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
// I do a,b,c,d -> d,c,b,a to suit order when writing to mem (saves one shufps)
	shufps $27, %xmm2, %xmm2
// Save A2 to mem (end)
/ movq %mm3, 120(%edx) / movq %mm7, 112(%edx)
	movups %xmm2, 112(%edx)

// [2] Process next data block
/ movq 16(%eax), %mm0 / movq 24(%eax), %mm4
	movups 16(%eax), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 104(%eax), %mm1 / movq 96(%eax), %mm5
	movups 96(%eax), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 16(%edx) / movq %mm4, 24(%edx)
	movups %xmm0, 16(%edx)
/ pfsub %mm1, %mm3 / pfsub %mm5, %mm7
	subps %xmm1, %xmm2
/ pfmul 16(%ebx), %mm3 / pfmul 24(%ebx), %mm7
	movups 16(%ebx), %xmm7
	mulps %xmm7, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm2, %xmm2
/ movq %mm3, 104(%edx) / movq %mm7, 96(%edx)
	movups %xmm2, 96(%edx)

// [3]
/ movq 32(%eax), %mm0 / movq 40(%eax), %mm4
	movups 32(%eax), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 88(%eax), %mm1 / movq 80(%eax), %mm5
	movups 80(%eax), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 32(%edx) / movq %mm4, 40(%edx)
	movups %xmm0, 32(%edx)
/ pfsub %mm1, %mm3 / pfsub %mm5, %mm7
	subps %xmm1, %xmm2
/ pfmul 32(%ebx), %mm3 / pfmul 40(%ebx), %mm7
	movups 32(%ebx), %xmm7
	mulps %xmm7, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm2, %xmm2
/ movq %mm3, 88(%edx) / movq %mm7, 80(%edx)
	movups %xmm2, 80(%edx)

// [4]
/ movq 48(%eax), %mm0 / movq 56(%eax), %mm4
	movups 48(%eax), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 72(%eax), %mm1 / movq 64(%eax), %mm5
	movups 64(%eax), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 48(%edx) / movq %mm4, 56(%edx)
	movups %xmm0, 48(%edx)
/ pfsub %mm1, %mm3 / pfsub %mm5, %mm7
	subps %xmm1, %xmm2
/ pfmul 48(%ebx), %mm3 / pfmul 56(%ebx), %mm7
	movups 48(%ebx), %xmm7
	mulps %xmm7, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm2, %xmm2
/ movq %mm3, 72(%edx) / movq %mm7, 64(%edx)
	movups %xmm2, 64(%edx)

// phase 1 fpu code
// Disabled scalar x87 reference for phase 1 (one butterfly per row; the
// function prologue is interleaved into the first rows).
/* Phase 1*/
/*
	flds (%eax); leal 128(%esp),%edx; fadds 124(%eax); movl 272(%esp),%esi; fstps (%edx); movl 276(%esp),%edi
	flds 4(%eax); movl $costab_mmx,%ebx; fadds 120(%eax); orl %ecx,%ecx; fstps 4(%edx)
	flds (%eax); movl %esp,%ecx; fsubs 124(%eax); fmuls (%ebx); fstps 124(%edx)
	flds 4(%eax); fsubs 120(%eax); fmuls 4(%ebx); fstps 120(%edx)
	flds 8(%eax); fadds 116(%eax); fstps 8(%edx)
	flds 12(%eax); fadds 112(%eax); fstps 12(%edx)
	flds 8(%eax); fsubs 116(%eax); fmuls 8(%ebx); fstps 116(%edx)
	flds 12(%eax); fsubs 112(%eax); fmuls 12(%ebx); fstps 112(%edx)
	flds 16(%eax); fadds 108(%eax); fstps 16(%edx)
	flds 20(%eax); fadds 104(%eax); fstps 20(%edx)
	flds 16(%eax); fsubs 108(%eax); fmuls 16(%ebx); fstps 108(%edx)
	flds 20(%eax); fsubs 104(%eax); fmuls 20(%ebx); fstps 104(%edx)
	flds 24(%eax); fadds 100(%eax); fstps 24(%edx)
	flds 28(%eax); fadds 96(%eax); fstps 28(%edx)
	flds 24(%eax); fsubs 100(%eax); fmuls 24(%ebx); fstps 100(%edx)
	flds 28(%eax); fsubs 96(%eax); fmuls 28(%ebx); fstps 96(%edx)
	flds 32(%eax); fadds 92(%eax); fstps 32(%edx)
	flds 36(%eax); fadds 88(%eax); fstps 36(%edx)
	flds 32(%eax); fsubs 92(%eax); fmuls 32(%ebx); fstps 92(%edx)
	flds 36(%eax); fsubs 88(%eax); fmuls 36(%ebx); fstps 88(%edx)
	flds 40(%eax); fadds 84(%eax); fstps 40(%edx)
	flds 44(%eax); fadds 80(%eax); fstps 44(%edx)
	flds 40(%eax); fsubs 84(%eax); fmuls 40(%ebx); fstps 84(%edx)
	flds 44(%eax); fsubs 80(%eax); fmuls 44(%ebx); fstps 80(%edx)
	flds 48(%eax); fadds 76(%eax); fstps 48(%edx)
	flds 52(%eax); fadds 72(%eax); fstps 52(%edx)
	flds 48(%eax); fsubs 76(%eax); fmuls 48(%ebx); fstps 76(%edx)
	flds 52(%eax); fsubs 72(%eax); fmuls 52(%ebx); fstps 72(%edx)
	flds 56(%eax); fadds 68(%eax); fstps 56(%edx)
	flds 60(%eax); fadds 64(%eax); fstps 60(%edx)
	flds 56(%eax); fsubs 68(%eax); fmuls 56(%ebx); fstps 68(%edx)
	flds 60(%eax); fsubs 64(%eax); fmuls 60(%ebx); fstps 64(%edx)
*/
// end phase 1 fpu code

/* Phase 2 (completed, worx) */
// Lower half of (%edx) (offsets 0..60) -> lower half of (%ecx),
// coefficients 64..92(%ebx). Same sum / scaled-difference butterfly as
// phase 1, now over mirrored pairs (i, 15-i).
/ movq (%edx), %mm0 / movq 8(%edx), %mm4
	movups (%edx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 56(%edx), %mm1 / movq 48(%edx), %mm5
	movups 48(%edx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, (%ecx) / movq %mm4, 8(%ecx)
	movups %xmm0, (%ecx)
/ pfsub %mm1, %mm3 / pfsub %mm5, %mm7
	subps %xmm1, %xmm2
/ pfmul 64(%ebx), %mm3 / pfmul 72(%ebx), %mm7
	movups 64(%ebx), %xmm7
	mulps %xmm7, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm2, %xmm2
/ movq %mm3, 56(%ecx) / movq %mm7, 48(%ecx)
	movups %xmm2, 48(%ecx)

/ movq 16(%edx), %mm0 / movq 24(%edx), %mm4
	movups 16(%edx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 40(%edx), %mm1 / movq 32(%edx), %mm5
	movups 32(%edx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 16(%ecx) / movq %mm4, 24(%ecx)
	movups %xmm0, 16(%ecx)
/ pfsub %mm1, %mm3 / pfsub %mm5, %mm7
	subps %xmm1, %xmm2
/ pfmul 80(%ebx), %mm3 / pfmul 88(%ebx), %mm7
	movups 80(%ebx), %xmm7
	mulps %xmm7, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm2, %xmm2
/ movq %mm3, 40(%ecx) / movq %mm7, 32(%ecx)
	movups %xmm2, 32(%ecx)

// phase 2 fpu
// Disabled scalar x87 reference for phase 2 (one butterfly per row).
/* Phase 2*/
/*
	flds (%edx); fadds 60(%edx); fstps (%ecx)
	flds 4(%edx); fadds 56(%edx); fstps 4(%ecx)
	flds (%edx); fsubs 60(%edx); fmuls 64(%ebx); fstps 60(%ecx)
	flds 4(%edx); fsubs 56(%edx); fmuls 68(%ebx); fstps 56(%ecx)
	flds 8(%edx); fadds 52(%edx); fstps 8(%ecx)
	flds 12(%edx); fadds 48(%edx); fstps 12(%ecx)
	flds 8(%edx); fsubs 52(%edx); fmuls 72(%ebx); fstps 52(%ecx)
	flds 12(%edx); fsubs 48(%edx); fmuls 76(%ebx); fstps 48(%ecx)
	flds 16(%edx); fadds 44(%edx); fstps 16(%ecx)
	flds 20(%edx); fadds 40(%edx); fstps 20(%ecx)
	flds 16(%edx); fsubs 44(%edx); fmuls 80(%ebx); fstps 44(%ecx)
	flds 20(%edx); fsubs 40(%edx); fmuls 84(%ebx); fstps 40(%ecx)
	flds 24(%edx); fadds 36(%edx); fstps 24(%ecx)
	flds 28(%edx); fadds 32(%edx); fstps 28(%ecx)
	flds 24(%edx); fsubs 36(%edx); fmuls 88(%ebx); fstps 36(%ecx)
	flds 28(%edx); fsubs 32(%edx); fmuls 92(%ebx); fstps 32(%ecx)
*/
// end phase 2 fpu

/* Phase 3 (completed, working) */
// Upper half of (%edx) (offsets 64..124) -> upper half of (%ecx), same
// coefficients 64..92(%ebx) as phase 2. The difference is taken the
// other way round here (upper - lower, pfsubr in the 3DNow! original);
// the SSE code gets that by subtracting xmm2 from the already reversed
// xmm1 and carrying on with xmm1.
/ movq 64(%edx), %mm0 / movq 72(%edx), %mm4
	movups 64(%edx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 120(%edx), %mm1 / movq 112(%edx), %mm5
	movups 112(%edx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 64(%ecx) / movq %mm4, 72(%ecx)
	movups %xmm0, 64(%ecx)
/ pfsubr %mm1, %mm3 / pfsubr %mm5, %mm7
// optimized (xmm1<->xmm2)
	subps %xmm2, %xmm1
/ pfmul 64(%ebx), %mm3 / pfmul 72(%ebx), %mm7
	movups 64(%ebx), %xmm7
	mulps %xmm7, %xmm1
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm1, %xmm1
/ movq %mm3, 120(%ecx) / movq %mm7, 112(%ecx)
	movups %xmm1, 112(%ecx)

/ movq 80(%edx), %mm0 / movq 88(%edx), %mm4
	movups 80(%edx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 104(%edx), %mm1 / movq 96(%edx), %mm5
	movups 96(%edx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 80(%ecx) / movq %mm4, 88(%ecx)
	movups %xmm0, 80(%ecx)
/ pfsubr %mm1, %mm3 / pfsubr %mm5, %mm7
// optimized (xmm1<->xmm2)
	subps %xmm2, %xmm1
/ pfmul 80(%ebx), %mm3 / pfmul 88(%ebx), %mm7
	movups 80(%ebx), %xmm7
	mulps %xmm7, %xmm1
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm1, %xmm1
/ movq %mm3, 104(%ecx) / movq %mm7, 96(%ecx)
	movups %xmm1, 96(%ecx)

// phase 3 fpu
// Disabled scalar x87 reference for phase 3 (one butterfly per row).
/* Phase 3*/
/*
	flds 64(%edx); fadds 124(%edx); fstps 64(%ecx)
	flds 68(%edx); fadds 120(%edx); fstps 68(%ecx)
	flds 124(%edx); fsubs 64(%edx); fmuls 64(%ebx); fstps 124(%ecx)
	flds 120(%edx); fsubs 68(%edx); fmuls 68(%ebx); fstps 120(%ecx)
	flds 72(%edx); fadds 116(%edx); fstps 72(%ecx)
	flds 76(%edx); fadds 112(%edx); fstps 76(%ecx)
	flds 116(%edx); fsubs 72(%edx); fmuls 72(%ebx); fstps 116(%ecx)
	flds 112(%edx); fsubs 76(%edx); fmuls 76(%ebx); fstps 112(%ecx)
	flds 80(%edx); fadds 108(%edx); fstps 80(%ecx)
	flds 84(%edx); fadds 104(%edx); fstps 84(%ecx)
	flds 108(%edx); fsubs 80(%edx); fmuls 80(%ebx); fstps 108(%ecx)
	flds 104(%edx); fsubs 84(%edx); fmuls 84(%ebx); fstps 104(%ecx)
	flds 88(%edx); fadds 100(%edx); fstps 88(%ecx)
	flds 92(%edx); fadds 96(%edx); fstps 92(%ecx)
	flds 100(%edx); fsubs 88(%edx); fmuls 88(%ebx); fstps 100(%ecx)
	flds 96(%edx); fsubs 92(%edx); fmuls 92(%ebx); fstps 96(%ecx)
*/
// end phase 3 fpu

/* Phase 4 (completed, buggy) */
// Disabled SSE translation of phase 4. The x87 code after this block is
// what actually runs.
/*
/ movq 96(%ebx), %mm2 / movq 104(%ebx), %mm6
	movups 96(%ebx), %xmm4
/ movq (%ecx), %mm0 / movq 8(%ecx), %mm4
	movups (%ecx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 24(%ecx), %mm1 / movq 16(%ecx), %mm5
	movups 16(%ecx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, (%edx) / movq %mm4, 8(%edx)
	movups %xmm0, (%edx)
/ pfsub %mm1, %mm3 / pfsub %mm5, %mm7
	subps %xmm1, %xmm2
/ pfmul %mm2, %mm3 / pfmul %mm6, %mm7
	mulps %xmm4, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm2, %xmm2
/ movq %mm3, 24(%edx) / movq %mm7, 16(%edx)
	movups %xmm2, 16(%edx)

/ movq 32(%ecx), %mm0 / movq 40(%ecx), %mm4
	movups 32(%ecx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 56(%ecx), %mm1 / movq 48(%ecx), %mm5
	movups 48(%ecx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 32(%edx) / movq %mm4, 40(%edx)
	movups %xmm0, 32(%edx)
/ pfsubr %mm1, %mm3 / pfsubr %mm5, %mm7
// Luckily we can swap this (xmm1<->xmm2)
	subps %xmm2, %xmm1
/ pfmul %mm2, %mm3 / pfmul %mm6, %mm7
	mulps %xmm4, %xmm1
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm1, %xmm1
/ movq %mm3, 56(%edx) / movq %mm7, 48(%edx)
	movups %xmm1, 48(%edx)

/ movq 64(%ecx), %mm0 / movq 72(%ecx), %mm4
	movups 64(%ecx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 88(%ecx), %mm1 / movq 80(%ecx), %mm5
	movups 80(%ecx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 64(%edx) / movq %mm4, 72(%edx)
	movups %xmm0, 64(%edx)
/ pfsub %mm1, %mm3 / pfsub %mm5, %mm7
	subps %xmm1, %xmm2
/ pfmul %mm2, %mm3 / pfmul %mm6, %mm7
	mulps %xmm4, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm2, %xmm2
/ movq %mm3, 88(%edx) / movq %mm7, 80(%edx)
	movups %xmm2, 80(%edx)

/ movq 96(%ecx), %mm0 / movq 104(%ecx), %mm4
	movups 96(%ecx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 120(%ecx), %mm1 / movq 112(%ecx), %mm5
	movups 112(%ecx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
//// shufps $177, %xmm1, %xmm1
	shufps $27, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 96(%edx) / movq %mm4, 104(%edx)
	movups %xmm0, 96(%edx)
/ pfsubr %mm1, %mm3 / pfsubr %mm5, %mm7
// This is already optimized, so xmm2 must be swapped with xmm1 for rest of phase
	subps %xmm2, %xmm1
/ pfmul %mm2, %mm3 / pfmul %mm6, %mm7
	mulps %xmm4, %xmm1
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $27, %xmm1, %xmm1
/ movq %mm3, 120(%edx) / movq %mm7, 112(%edx)
	movups %xmm1, 112(%edx)
*/

// phase 4 fpu code
/* Phase 4*/
// Live scalar x87 code. This block performs TWO butterfly stages, which
// the SSE/3DNow! comments in this file call phase 4 and phase 5:
//   stage A: (%ecx) -> (%edx), groups of 8 floats, coefficients
//            96..108(%ebx); pair (i, 7-i) inside each group
//   stage B: (%edx) -> (%ecx), groups of 4 floats, coefficients
//            112 and 116(%ebx); pair (i, 3-i) inside each group
// In every second group the difference is taken as upper - lower
// instead of lower - upper.
// Each 3- or 4-instruction run below loads one value, combines it with
// its partner and pops the result, so the x87 stack is empty between runs.

	// --- stage A, group at offsets 0..28: diff = lower - upper ---
	flds (%ecx)
	fadds 28(%ecx)
	fstps (%edx)
	flds (%ecx)
	fsubs 28(%ecx)
	fmuls 96(%ebx)
	fstps 28(%edx)
	flds 4(%ecx)
	fadds 24(%ecx)
	fstps 4(%edx)
	flds 4(%ecx)
	fsubs 24(%ecx)
	fmuls 100(%ebx)
	fstps 24(%edx)
	flds 8(%ecx)
	fadds 20(%ecx)
	fstps 8(%edx)
	flds 8(%ecx)
	fsubs 20(%ecx)
	fmuls 104(%ebx)
	fstps 20(%edx)
	flds 12(%ecx)
	fadds 16(%ecx)
	fstps 12(%edx)
	flds 12(%ecx)
	fsubs 16(%ecx)
	fmuls 108(%ebx)
	fstps 16(%edx)

	// --- stage A, group at offsets 32..60: diff = upper - lower ---
	flds 32(%ecx)
	fadds 60(%ecx)
	fstps 32(%edx)
	flds 60(%ecx)
	fsubs 32(%ecx)
	fmuls 96(%ebx)
	fstps 60(%edx)
	flds 36(%ecx)
	fadds 56(%ecx)
	fstps 36(%edx)
	flds 56(%ecx)
	fsubs 36(%ecx)
	fmuls 100(%ebx)
	fstps 56(%edx)
	flds 40(%ecx)
	fadds 52(%ecx)
	fstps 40(%edx)
	flds 52(%ecx)
	fsubs 40(%ecx)
	fmuls 104(%ebx)
	fstps 52(%edx)
	flds 44(%ecx)
	fadds 48(%ecx)
	fstps 44(%edx)
	flds 48(%ecx)
	fsubs 44(%ecx)
	fmuls 108(%ebx)
	fstps 48(%edx)

	// --- stage A, group at offsets 64..92: diff = lower - upper ---
	flds 64(%ecx)
	fadds 92(%ecx)
	fstps 64(%edx)
	flds 64(%ecx)
	fsubs 92(%ecx)
	fmuls 96(%ebx)
	fstps 92(%edx)
	flds 68(%ecx)
	fadds 88(%ecx)
	fstps 68(%edx)
	flds 68(%ecx)
	fsubs 88(%ecx)
	fmuls 100(%ebx)
	fstps 88(%edx)
	flds 72(%ecx)
	fadds 84(%ecx)
	fstps 72(%edx)
	flds 72(%ecx)
	fsubs 84(%ecx)
	fmuls 104(%ebx)
	fstps 84(%edx)
	flds 76(%ecx)
	fadds 80(%ecx)
	fstps 76(%edx)
	flds 76(%ecx)
	fsubs 80(%ecx)
	fmuls 108(%ebx)
	fstps 80(%edx)

	// --- stage A, group at offsets 96..124: diff = upper - lower ---
	flds 96(%ecx)
	fadds 124(%ecx)
	fstps 96(%edx)
	flds 124(%ecx)
	fsubs 96(%ecx)
	fmuls 96(%ebx)
	fstps 124(%edx)
	flds 100(%ecx)
	fadds 120(%ecx)
	fstps 100(%edx)
	flds 120(%ecx)
	fsubs 100(%ecx)
	fmuls 100(%ebx)
	fstps 120(%edx)
	flds 104(%ecx)
	fadds 116(%ecx)
	fstps 104(%edx)
	flds 116(%ecx)
	fsubs 104(%ecx)
	fmuls 104(%ebx)
	fstps 116(%edx)
	flds 108(%ecx)
	fadds 112(%ecx)
	fstps 108(%edx)
	flds 112(%ecx)
	fsubs 108(%ecx)
	fmuls 108(%ebx)
	fstps 112(%edx)

	// --- stage B, group at offsets 0..12: diff = lower - upper ---
	flds (%edx)
	fadds 12(%edx)
	fstps (%ecx)
	flds (%edx)
	fsubs 12(%edx)
	fmuls 112(%ebx)
	fstps 12(%ecx)
	flds 4(%edx)
	fadds 8(%edx)
	fstps 4(%ecx)
	flds 4(%edx)
	fsubs 8(%edx)
	fmuls 116(%ebx)
	fstps 8(%ecx)

	// --- stage B, group at offsets 16..28: diff = upper - lower ---
	flds 16(%edx)
	fadds 28(%edx)
	fstps 16(%ecx)
	flds 28(%edx)
	fsubs 16(%edx)
	fmuls 112(%ebx)
	fstps 28(%ecx)
	flds 20(%edx)
	fadds 24(%edx)
	fstps 20(%ecx)
	flds 24(%edx)
	fsubs 20(%edx)
	fmuls 116(%ebx)
	fstps 24(%ecx)

	// --- stage B, group at offsets 32..44: diff = lower - upper ---
	flds 32(%edx)
	fadds 44(%edx)
	fstps 32(%ecx)
	flds 32(%edx)
	fsubs 44(%edx)
	fmuls 112(%ebx)
	fstps 44(%ecx)
	flds 36(%edx)
	fadds 40(%edx)
	fstps 36(%ecx)
	flds 36(%edx)
	fsubs 40(%edx)
	fmuls 116(%ebx)
	fstps 40(%ecx)

	// --- stage B, group at offsets 48..60: diff = upper - lower ---
	flds 48(%edx)
	fadds 60(%edx)
	fstps 48(%ecx)
	flds 60(%edx)
	fsubs 48(%edx)
	fmuls 112(%ebx)
	fstps 60(%ecx)
	flds 52(%edx)
	fadds 56(%edx)
	fstps 52(%ecx)
	flds 56(%edx)
	fsubs 52(%edx)
	fmuls 116(%ebx)
	fstps 56(%ecx)

	// --- stage B, group at offsets 64..76: diff = lower - upper ---
	flds 64(%edx)
	fadds 76(%edx)
	fstps 64(%ecx)
	flds 64(%edx)
	fsubs 76(%edx)
	fmuls 112(%ebx)
	fstps 76(%ecx)
	flds 68(%edx)
	fadds 72(%edx)
	fstps 68(%ecx)
	flds 68(%edx)
	fsubs 72(%edx)
	fmuls 116(%ebx)
	fstps 72(%ecx)

	// --- stage B, group at offsets 80..92: diff = upper - lower ---
	flds 80(%edx)
	fadds 92(%edx)
	fstps 80(%ecx)
	flds 92(%edx)
	fsubs 80(%edx)
	fmuls 112(%ebx)
	fstps 92(%ecx)
	flds 84(%edx)
	fadds 88(%edx)
	fstps 84(%ecx)
	flds 88(%edx)
	fsubs 84(%edx)
	fmuls 116(%ebx)
	fstps 88(%ecx)

	// --- stage B, group at offsets 96..108: diff = lower - upper ---
	flds 96(%edx)
	fadds 108(%edx)
	fstps 96(%ecx)
	flds 96(%edx)
	fsubs 108(%edx)
	fmuls 112(%ebx)
	fstps 108(%ecx)
	flds 100(%edx)
	fadds 104(%edx)
	fstps 100(%ecx)
	flds 100(%edx)
	fsubs 104(%edx)
	fmuls 116(%ebx)
	fstps 104(%ecx)

	// --- stage B, group at offsets 112..124: diff = upper - lower ---
	flds 112(%edx)
	fadds 124(%edx)
	fstps 112(%ecx)
	flds 124(%edx)
	fsubs 112(%edx)
	fmuls 112(%ebx)
	fstps 124(%ecx)
	flds 116(%edx)
	fadds 120(%edx)
	fstps 116(%ecx)
	flds 120(%edx)
	fsubs 116(%edx)
	fmuls 116(%ebx)
	fstps 120(%ecx)
// end of phase 4 fpu

// below stuff needs to be finished I use FPU code for first
/* Phase 5 (completed, crashing) */
// Disabled SSE translation of stage B above.
// NOTE(review): the first and last block load and store through movups
// (16 contiguous bytes) where the 3DNow! reference touches the two
// quadwords at +0 and +16; the two middle blocks use movlps/movhps for
// the same access pattern. The first block also uses pshufd where the
// others use shufps. Worth checking when this phase is revisited.
/*
/ movq 112(%ebx), %mm2
// move 8 byte data to (low)high quadword - check this! atmos
	movlps 112(%ebx), %xmm4
// maybe I need movhlps too to get data into correct quadword
	movlhps %xmm4, %xmm4
/ movq (%edx), %mm0 / movq 16(%edx), %mm4
	movups (%edx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
// hmm? this is strange
/ movq 8(%edx), %mm1 / movq 24(%edx), %mm5
	movlps 8(%edx), %xmm1
	movhps 24(%edx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
	pshufd $177, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, (%ecx) / movq %mm4, 16(%ecx)
	movlps %xmm0, (%ecx)
	movhps %xmm0, 16(%ecx)
/ pfsub %mm1, %mm3 / pfsubr %mm5, %mm7
// I need to emulate pfsubr here
	movaps %xmm1, %xmm3
	subps %xmm2, %xmm3
	subps %xmm1, %xmm2
// now move correct quadword from reverse substration in xmm3 to correct
// quadword in xmm2 and leave other quadword with non-reversed substration untouched
/// shufpd $2, %xmm3, %xmm2 // (or $1?) (see ia32-ref p.749)
// optimize
	movq %xmm2, %xmm3
	movaps %xmm3, %xmm2
/ pfmul %mm2, %mm3 / pfmul %mm2, %mm7
	mulps %xmm4, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $177, %xmm2, %xmm2
/ movq %mm3, 8(%ecx) / movq %mm7, 24(%ecx)
	movlps %xmm2, 8(%ecx)
	movhps %xmm2, 24(%ecx)

/ movq 32(%edx), %mm0 / movq 48(%edx), %mm4
	movlps 32(%edx), %xmm0
	movhps 48(%edx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 40(%edx), %mm1 / movq 56(%edx), %mm5
	movlps 40(%edx), %xmm1
	movhps 56(%edx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
	shufps $177, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 32(%ecx) / movq %mm4, 48(%ecx)
	movlps %xmm0, 32(%ecx)
	movhps %xmm0, 48(%ecx)
/ pfsub %mm1, %mm3 / pfsubr %mm5, %mm7
	movaps %xmm1, %xmm3
	subps %xmm2, %xmm3
	subps %xmm1, %xmm2
/// shufpd $2, %xmm3, %xmm2 // (or $1?)
// optimize
	movq %xmm2, %xmm3
	movaps %xmm3, %xmm2
/ pfmul %mm2, %mm3 / pfmul %mm2, %mm7
	mulps %xmm4, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $177, %xmm2, %xmm2
/ movq %mm3, 40(%ecx) / movq %mm7, 56(%ecx)
	movlps %xmm2, 40(%ecx)
	movhps %xmm2, 56(%ecx)

/ movq 64(%edx), %mm0 / movq 80(%edx), %mm4
	movlps 64(%edx), %xmm0
	movhps 80(%edx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 72(%edx), %mm1 / movq 88(%edx), %mm5
	movlps 72(%edx), %xmm1
	movhps 88(%edx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
	shufps $177, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 64(%ecx) / movq %mm4, 80(%ecx)
	movlps %xmm0, 64(%ecx)
	movhps %xmm0, 80(%ecx)
/ pfsub %mm1, %mm3 / pfsubr %mm5, %mm7
	movaps %xmm1, %xmm3
	subps %xmm2, %xmm3
	subps %xmm1, %xmm2
/// shufpd $2, %xmm3, %xmm2 // (or $1?)
// optimize
	movq %xmm2, %xmm3
	movaps %xmm3, %xmm2
/ pfmul %mm2, %mm3 / pfmul %mm2, %mm7
	mulps %xmm4, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $177, %xmm2, %xmm2
/ movq %mm3, 72(%ecx) / movq %mm7, 88(%ecx)
	movlps %xmm2, 72(%ecx)
	movhps %xmm2, 88(%ecx)

/ movq 96(%edx), %mm0 / movq 112(%edx), %mm4
	movups 96(%edx), %xmm0
/ movq %mm0, %mm3 / movq %mm4, %mm7
	movaps %xmm0, %xmm2
/ movq 104(%edx), %mm1 / movq 120(%edx), %mm5
	movlps 104(%edx), %xmm1
	movhps 120(%edx), %xmm1
/ pswapd %mm1, %mm1 / pswapd %mm5, %mm5
	shufps $177, %xmm1, %xmm1
/ pfadd %mm1, %mm0 / pfadd %mm5, %mm4
	addps %xmm1, %xmm0
/ movq %mm0, 96(%ecx) / movq %mm4, 112(%ecx)
	movups %xmm0, 96(%ecx)
/ pfsub %mm1, %mm3 / pfsubr %mm5, %mm7
	movaps %xmm1, %xmm3
	subps %xmm2, %xmm3
	subps %xmm1, %xmm2
/// shufpd $2, %xmm3, %xmm2 // (or $1?)
// optimize
	movq %xmm2, %xmm3
	movaps %xmm3, %xmm2
/ pfmul %mm2, %mm3 / pfmul %mm2, %mm7
	mulps %xmm4, %xmm2
/ pswapd %mm3, %mm3 / pswapd %mm7, %mm7
	shufps $177, %xmm2, %xmm2
/ movq %mm3, 104(%ecx) / movq %mm7, 120(%ecx)
	movlps %xmm2, 104(%ecx)
	movhps %xmm2, 120(%ecx)
*/

// ---------------------------------------------------------------------
// Disabled 3DNow! reference code for the remaining phases, kept from
// dct64_k7.s. Its phase numbers are one higher than the numbers of the
// live x87 code further down: 3DNow! phases 6, 7, 8, 9, 10 read and
// write the same offsets as live phases 5, 6, 7, 8, 9.
// ---------------------------------------------------------------------
/* Phase 6. This is the end of easy road. */
/* Code below is coded in scalar mode. Should be optimized */
//
//	movd plus_1f, %mm6
//	punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/
//	movq x_plus_minus_3dnow, %mm7 /* mm7 = +1 | -1 */
/*
	movq 32(%ecx), %mm0; movq 64(%ecx), %mm2; movq %mm0, %mm1; movq %mm2, %mm3
	pxor %mm7, %mm1; pxor %mm7, %mm3; pfacc %mm1, %mm0; pfacc %mm3, %mm2
	pfmul %mm6, %mm0; pfmul %mm6, %mm2; movq %mm0, 32(%edx); movq %mm2, 64(%edx)

	movd 44(%ecx), %mm0; movd 40(%ecx), %mm2; movd 120(%ebx), %mm3
	punpckldq 76(%ecx), %mm0; punpckldq 72(%ecx), %mm2; punpckldq %mm3, %mm3
	movq %mm0, %mm4; movq %mm2, %mm5; pfsub %mm2, %mm0; pfmul %mm3, %mm0
	movq %mm0, %mm1; pfadd %mm5, %mm0; pfadd %mm4, %mm0; movq %mm0, %mm2
	punpckldq %mm1, %mm0; punpckhdq %mm1, %mm2; movq %mm0, 40(%edx); movq %mm2, 72(%edx)

	movd 48(%ecx), %mm3; movd 60(%ecx), %mm2; pfsub 52(%ecx), %mm3; pfsub 56(%ecx), %mm2
	pfmul 120(%ebx), %mm3; pfmul 120(%ebx), %mm2; movq %mm2, %mm1
	pfadd 56(%ecx), %mm1; pfadd 60(%ecx), %mm1; movq %mm1, %mm0
	pfadd 48(%ecx), %mm0; pfadd 52(%ecx), %mm0; pfadd %mm3, %mm1
	punpckldq %mm2, %mm1; pfadd %mm3, %mm2; punpckldq %mm2, %mm0
	movq %mm1, 56(%edx); movq %mm0, 48(%edx)
*/
/*---*/
/*
	movd 92(%ecx), %mm1; pfsub 88(%ecx), %mm1; pfmul 120(%ebx), %mm1; movd %mm1, 92(%edx)
	pfadd 92(%ecx), %mm1; pfadd 88(%ecx), %mm1; movq %mm1, %mm0
	pfadd 80(%ecx), %mm0; pfadd 84(%ecx), %mm0; movd %mm0, 80(%edx)
	movd 80(%ecx), %mm0; pfsub 84(%ecx), %mm0; pfmul 120(%ebx), %mm0
	pfadd %mm0, %mm1; pfadd 92(%edx), %mm0; punpckldq %mm1, %mm0; movq %mm0, 84(%edx)

	movq 96(%ecx), %mm0; movq %mm0, %mm1; pxor %mm7, %mm1; pfacc %mm1, %mm0
	pfmul %mm6, %mm0; movq %mm0, 96(%edx)

	movd 108(%ecx), %mm0; pfsub 104(%ecx), %mm0; pfmul 120(%ebx), %mm0; movd %mm0, 108(%edx)
	pfadd 104(%ecx), %mm0; pfadd 108(%ecx), %mm0; movd %mm0, 104(%edx)

	movd 124(%ecx), %mm1; pfsub 120(%ecx), %mm1; pfmul 120(%ebx), %mm1; movd %mm1, 124(%edx)
	pfadd 120(%ecx), %mm1; pfadd 124(%ecx), %mm1; movq %mm1, %mm0
	pfadd 112(%ecx), %mm0; pfadd 116(%ecx), %mm0; movd %mm0, 112(%edx)
	movd 112(%ecx), %mm0; pfsub 116(%ecx), %mm0; pfmul 120(%ebx), %mm0
	pfadd %mm0,%mm1; pfadd 124(%edx), %mm0; punpckldq %mm1, %mm0; movq %mm0, 116(%edx)

	jnz .L01
*/

/* Phase 7*/
/* Code below is coded in scalar mode. Should be optimized */
/*
	movd (%ecx), %mm0; pfadd 4(%ecx), %mm0; movd %mm0, 1024(%esi)
	movd (%ecx), %mm0; pfsub 4(%ecx), %mm0; pfmul 120(%ebx), %mm0
	movd %mm0, (%esi); movd %mm0, (%edi)

	movd 12(%ecx), %mm0; pfsub 8(%ecx), %mm0; pfmul 120(%ebx), %mm0; movd %mm0, 512(%edi)
	pfadd 12(%ecx), %mm0; pfadd 8(%ecx), %mm0; movd %mm0, 512(%esi)

	movd 16(%ecx), %mm0; pfsub 20(%ecx), %mm0; pfmul 120(%ebx), %mm0; movq %mm0, %mm3

	movd 28(%ecx), %mm0; pfsub 24(%ecx), %mm0; pfmul 120(%ebx), %mm0; movd %mm0, 768(%edi)
	movq %mm0, %mm2; pfadd 24(%ecx), %mm0; pfadd 28(%ecx), %mm0; movq %mm0, %mm1
	pfadd 16(%ecx), %mm0; pfadd 20(%ecx), %mm0; movd %mm0, 768(%esi)
	pfadd %mm3, %mm1; movd %mm1, 256(%esi); pfadd %mm3, %mm2; movd %mm2, 256(%edi)
*/

/* Phase 8*/
/*
	movq 32(%edx), %mm0; movq 48(%edx), %mm1; pfadd 48(%edx), %mm0; pfadd 40(%edx), %mm1
	movd %mm0, 896(%esi); movd %mm1, 640(%esi); psrlq $32, %mm0; psrlq $32, %mm1
	movd %mm0, 128(%edi); movd %mm1, 384(%edi)

	movd 40(%edx), %mm0; pfadd 56(%edx), %mm0; movd %mm0, 384(%esi)
	movd 56(%edx), %mm0; pfadd 36(%edx), %mm0; movd %mm0, 128(%esi)
	movd 60(%edx), %mm0; movd %mm0, 896(%edi); pfadd 44(%edx), %mm0; movd %mm0, 640(%edi)

	movq 96(%edx), %mm0; movq 112(%edx), %mm2; movq 104(%edx), %mm4
	pfadd 112(%edx), %mm0; pfadd 104(%edx), %mm2; pfadd 120(%edx), %mm4
	movq %mm0, %mm1; movq %mm2, %mm3; movq %mm4, %mm5
	pfadd 64(%edx), %mm0; pfadd 80(%edx), %mm2; pfadd 72(%edx), %mm4
	movd %mm0, 960(%esi); movd %mm2, 704(%esi); movd %mm4, 448(%esi)
	psrlq $32, %mm0; psrlq $32, %mm2; psrlq $32, %mm4
	movd %mm0, 64(%edi); movd %mm2, 320(%edi); movd %mm4, 576(%edi)
	pfadd 80(%edx), %mm1; pfadd 72(%edx), %mm3; pfadd 88(%edx), %mm5
	movd %mm1, 832(%esi); movd %mm3, 576(%esi); movd %mm5, 320(%esi)
	psrlq $32, %mm1; psrlq $32, %mm3; psrlq $32, %mm5
	movd %mm1, 192(%edi); movd %mm3, 448(%edi); movd %mm5, 704(%edi)

	movd 120(%edx), %mm0; pfadd 100(%edx), %mm0; movq %mm0, %mm1
	pfadd 88(%edx), %mm0; movd %mm0, 192(%esi); pfadd 68(%edx), %mm1; movd %mm1, 64(%esi)
	movd 124(%edx), %mm0; movd %mm0, 960(%edi); pfadd 92(%edx), %mm0; movd %mm0, 832(%edi)

	jmp .L_bye
.L01:
*/

/* Phase 9*/
/*
	movq (%ecx), %mm0; movq %mm0, %mm1; pxor %mm7, %mm1; pfacc %mm1, %mm0
	pfmul %mm6, %mm0; pf2id %mm0, %mm0; movd %mm0, %eax; movw %ax, 512(%esi)
	psrlq $32, %mm0; movd %mm0, %eax; movw %ax, (%esi)

	movd 12(%ecx), %mm0; pfsub 8(%ecx), %mm0; pfmul 120(%ebx), %mm0
	pf2id %mm0, %mm7; movd %mm7, %eax; movw %ax, 256(%edi)
	pfadd 12(%ecx), %mm0; pfadd 8(%ecx), %mm0; pf2id %mm0, %mm0
	movd %mm0, %eax; movw %ax, 256(%esi)

	movd 16(%ecx), %mm3; pfsub 20(%ecx), %mm3; pfmul 120(%ebx), %mm3; movq %mm3, %mm2

	movd 28(%ecx), %mm2; pfsub 24(%ecx), %mm2; pfmul 120(%ebx), %mm2; movq %mm2, %mm1
	pf2id %mm2, %mm7; movd %mm7, %eax; movw %ax, 384(%edi)
	pfadd 24(%ecx), %mm1; pfadd 28(%ecx), %mm1; movq %mm1, %mm0
	pfadd 16(%ecx), %mm0; pfadd 20(%ecx), %mm0; pf2id %mm0, %mm0
	movd %mm0, %eax; movw %ax, 384(%esi)
	pfadd %mm3, %mm1; pf2id %mm1, %mm1; movd %mm1, %eax; movw %ax, 128(%esi)
	pfadd %mm3, %mm2; pf2id %mm2, %mm2; movd %mm2, %eax; movw %ax, 128(%edi)
*/

/* Phase 10*/
/*
	movq 32(%edx), %mm0; movq 48(%edx), %mm1; pfadd 48(%edx), %mm0; pfadd 40(%edx), %mm1
	pf2id %mm0, %mm0; pf2id %mm1, %mm1; movd %mm0, %eax; movd %mm1, %ecx
	movw %ax, 448(%esi); movw %cx, 320(%esi); psrlq $32, %mm0; psrlq $32, %mm1
	movd %mm0, %eax; movd %mm1, %ecx; movw %ax, 64(%edi); movw %cx, 192(%edi)

	movd 40(%edx), %mm3; movd 56(%edx), %mm4; movd 60(%edx), %mm0; movd 44(%edx), %mm2
	movd 120(%edx), %mm5; punpckldq %mm4, %mm3; punpckldq 124(%edx), %mm0
	pfadd 100(%edx), %mm5; punpckldq 36(%edx), %mm4; punpckldq 92(%edx), %mm2
	movq %mm5, %mm6; pfadd %mm4, %mm3; pf2id %mm0, %mm1; pf2id %mm3, %mm3
	pfadd 88(%edx), %mm5; movd %mm1, %eax; movd %mm3, %ecx
	movw %ax, 448(%edi); movw %cx, 192(%esi); pf2id %mm5, %mm5
	psrlq $32, %mm1; psrlq $32, %mm3; movd %mm5, %ebx; movd %mm1, %eax; movd %mm3, %ecx
	movw %bx, 96(%esi); movw %ax, 480(%edi); movw %cx, 64(%esi)
	pfadd %mm2, %mm0; pf2id %mm0, %mm0; movd %mm0, %eax; pfadd 68(%edx), %mm6
	movw %ax, 320(%edi); psrlq $32, %mm0; pf2id %mm6, %mm6
	movd %mm0, %eax; movd %mm6, %ebx; movw %ax, 416(%edi); movw %bx, 32(%esi)

	movq 96(%edx), %mm0; movq 112(%edx), %mm2; movq 104(%edx), %mm4
	pfadd %mm2, %mm0; pfadd %mm4, %mm2; pfadd 120(%edx), %mm4
	movq %mm0, %mm1; movq %mm2, %mm3; movq %mm4, %mm5
	pfadd 64(%edx), %mm0; pfadd 80(%edx), %mm2; pfadd 72(%edx), %mm4
	pf2id %mm0, %mm0; pf2id %mm2, %mm2; pf2id %mm4, %mm4
	movd %mm0, %eax; movd %mm2, %ecx; movd %mm4, %ebx
	movw %ax, 480(%esi); movw %cx, 352(%esi); movw %bx, 224(%esi)
	psrlq $32, %mm0; psrlq $32, %mm2; psrlq $32, %mm4
	movd %mm0, %eax; movd %mm2, %ecx; movd %mm4, %ebx
	movw %ax, 32(%edi); movw %cx, 160(%edi); movw %bx, 288(%edi)
	pfadd 80(%edx), %mm1; pfadd 72(%edx), %mm3; pfadd 88(%edx), %mm5
	pf2id %mm1, %mm1; pf2id %mm3, %mm3; pf2id %mm5, %mm5
	movd %mm1, %eax; movd %mm3, %ecx; movd %mm5, %ebx
	movw %ax, 416(%esi); movw %cx, 288(%esi); movw %bx, 160(%esi)
	psrlq $32, %mm1; psrlq $32, %mm3; psrlq $32, %mm5
	movd %mm1, %eax; movd %mm3, %ecx; movd %mm5, %ebx
	movw %ax, 96(%edi); movw %cx, 224(%edi); movw %bx, 352(%edi)

	movsw

.L_bye:
	addl $256,%esp
/	femms
	emms
	popl %edi
	popl %esi
	popl %ebx
	ret $12
*/

// here comes old fashioned FPU code for the tough parts

/* Phase 5*/
// (%ecx) offsets 32..124 -> (%edx) offsets 32..124, single coefficient
// 120(%ebx). Offsets 0..28 of (%ecx) are not touched here; they are
// consumed directly by phase 6 or phase 8.
// x87 stack depth is noted where more than one value is live.
	flds 32(%ecx)
	fadds 36(%ecx)
	fstps 32(%edx)

	flds 32(%ecx)
	fsubs 36(%ecx)
	fmuls 120(%ebx)
	fstps 36(%edx)

	flds 44(%ecx)
	fsubs 40(%ecx)
	fmuls 120(%ebx)
	fsts 44(%edx)			// store without popping, value is reused
	fadds 40(%ecx)
	fadds 44(%ecx)
	fstps 40(%edx)

	flds 48(%ecx)
	fsubs 52(%ecx)
	fmuls 120(%ebx)			// depth 1: stays on the stack below

	flds 60(%ecx)
	fsubs 56(%ecx)
	fmuls 120(%ebx)			// depth 2
	fld %st(0)			// depth 3
	fadds 56(%ecx)
	fadds 60(%ecx)
	fld %st(0)			// depth 4
	fadds 48(%ecx)
	fadds 52(%ecx)
	fstps 48(%edx)			// depth 3
	fadd %st(2)
	fstps 56(%edx)			// depth 2
	fsts 60(%edx)
	faddp %st(1)			// depth 1
	fstps 52(%edx)			// depth 0

	flds 64(%ecx)
	fadds 68(%ecx)
	fstps 64(%edx)

	flds 64(%ecx)
	fsubs 68(%ecx)
	fmuls 120(%ebx)
	fstps 68(%edx)

	flds 76(%ecx)
	fsubs 72(%ecx)
	fmuls 120(%ebx)
	fsts 76(%edx)
	fadds 72(%ecx)
	fadds 76(%ecx)
	fstps 72(%edx)

	flds 92(%ecx)
	fsubs 88(%ecx)
	fmuls 120(%ebx)
	fsts 92(%edx)
	fadds 92(%ecx)
	fadds 88(%ecx)			// depth 1: stays on the stack below
	fld %st(0)			// depth 2
	fadds 80(%ecx)
	fadds 84(%ecx)
	fstps 80(%edx)			// depth 1

	flds 80(%ecx)			// depth 2
	fsubs 84(%ecx)
	fmuls 120(%ebx)
	fadd %st(0), %st(1)
	fadds 92(%edx)			// reads back the value stored by fsts above
	fstps 84(%edx)			// depth 1
	fstps 88(%edx)			// depth 0

	flds 96(%ecx)
	fadds 100(%ecx)
	fstps 96(%edx)

	flds 96(%ecx)
	fsubs 100(%ecx)
	fmuls 120(%ebx)
	fstps 100(%edx)

	flds 108(%ecx)
	fsubs 104(%ecx)
	fmuls 120(%ebx)
	fsts 108(%edx)
	fadds 104(%ecx)
	fadds 108(%ecx)
	fstps 104(%edx)

	flds 124(%ecx)
	fsubs 120(%ecx)
	fmuls 120(%ebx)
	fsts 124(%edx)
	fadds 120(%ecx)
	fadds 124(%ecx)			// depth 1: stays on the stack below
	fld %st(0)			// depth 2
	fadds 112(%ecx)
	fadds 116(%ecx)
	fstps 112(%edx)			// depth 1

	flds 112(%ecx)			// depth 2
	fsubs 116(%ecx)
	fmuls 120(%ebx)
	fadd %st(0),%st(1)
	fadds 124(%edx)			// reads back the value stored by fsts above
	fstps 116(%edx)			// depth 1
	fstps 120(%edx)			// depth 0
	// Branch on the zero flag produced by "orl %ecx,%ecx" in the
	// prologue: entry %ecx != 0 selects the integer output path.
	jnz .L01

/* Phase 6*/
// Float output path, part 1: (%ecx) offsets 0..28 -> outputs.
// Stores use fsts/fstps, i.e. 32-bit floats, at a 256-byte spacing.
	flds (%ecx)
	fadds 4(%ecx)
	fstps 1024(%esi)

	flds (%ecx)
	fsubs 4(%ecx)
	fmuls 120(%ebx)
	fsts (%esi)			// same value goes to both output bases
	fstps (%edi)

	flds 12(%ecx)
	fsubs 8(%ecx)
	fmuls 120(%ebx)
	fsts 512(%edi)
	fadds 12(%ecx)
	fadds 8(%ecx)
	fstps 512(%esi)

	flds 16(%ecx)
	fsubs 20(%ecx)
	fmuls 120(%ebx)			// depth 1: stays on the stack below

	flds 28(%ecx)
	fsubs 24(%ecx)
	fmuls 120(%ebx)			// depth 2
	fsts 768(%edi)
	fld %st(0)			// depth 3
	fadds 24(%ecx)
	fadds 28(%ecx)
	fld %st(0)			// depth 4
	fadds 16(%ecx)
	fadds 20(%ecx)
	fstps 768(%esi)			// depth 3
	fadd %st(2)
	fstps 256(%esi)			// depth 2
	faddp %st(1)			// depth 1
	fstps 256(%edi)			// depth 0

/* Phase 7*/
// Float output path, part 2: sums of (%edx) offsets 32..124 -> outputs.
	flds 32(%edx)
	fadds 48(%edx)
	fstps 896(%esi)

	flds 48(%edx)
	fadds 40(%edx)
	fstps 640(%esi)

	flds 40(%edx)
	fadds 56(%edx)
	fstps 384(%esi)

	flds 56(%edx)
	fadds 36(%edx)
	fstps 128(%esi)

	flds 36(%edx)
	fadds 52(%edx)
	fstps 128(%edi)

	flds 52(%edx)
	fadds 44(%edx)
	fstps 384(%edi)

	flds 60(%edx)
	fsts 896(%edi)
	fadds 44(%edx)
	fstps 640(%edi)

	// From here each group forms one shared partial sum, duplicates it
	// with fld %st(0) and finishes two outputs from it.
	flds 96(%edx)
	fadds 112(%edx)
	fld %st(0)
	fadds 64(%edx)
	fstps 960(%esi)
	fadds 80(%edx)
	fstps 832(%esi)

	flds 112(%edx)
	fadds 104(%edx)
	fld %st(0)
	fadds 80(%edx)
	fstps 704(%esi)
	fadds 72(%edx)
	fstps 576(%esi)

	flds 104(%edx)
	fadds 120(%edx)
	fld %st(0)
	fadds 72(%edx)
	fstps 448(%esi)
	fadds 88(%edx)
	fstps 320(%esi)

	flds 120(%edx)
	fadds 100(%edx)
	fld %st(0)
	fadds 88(%edx)
	fstps 192(%esi)
	fadds 68(%edx)
	fstps 64(%esi)

	flds 100(%edx)
	fadds 116(%edx)
	fld %st(0)
	fadds 68(%edx)
	fstps 64(%edi)
	fadds 84(%edx)
	fstps 192(%edi)

	flds 116(%edx)
	fadds 108(%edx)
	fld %st(0)
	fadds 84(%edx)
	fstps 320(%edi)
	fadds 76(%edx)
	fstps 448(%edi)

	flds 108(%edx)
	fadds 124(%edx)
	fld %st(0)
	fadds 76(%edx)
	fstps 576(%edi)
	fadds 92(%edx)
	fstps 704(%edi)

	flds 124(%edx)
	fsts 960(%edi)
	fadds 92(%edx)
	fstps 832(%edi)

	// Float-path epilogue: release the scratch buffers, restore the
	// saved registers in reverse push order.
	// NOTE(review): plain "ret" here, "ret $12" on the integer path.
	addl $256,%esp
	popl %edi
	popl %esi
	popl %ebx
	ret

.L01:
/* Phase 8*/
// Integer output path, part 1. Same arithmetic as phase 6, but results
// are stored with fist/fistp and the output offsets are half of those
// used on the float path (512 instead of 1024, 256 instead of 512, ...).
// Unlike phase 6 nothing is written to (%edi) here; the movsw before
// the final epilogue copies (%esi) to (%edi) instead.
	flds (%ecx)
	fadds 4(%ecx)
	fistp 512(%esi)

	flds (%ecx)
	fsubs 4(%ecx)
	fmuls 120(%ebx)
	fistp (%esi)

	flds 12(%ecx)
	fsubs 8(%ecx)
	fmuls 120(%ebx)
	fist 256(%edi)
	fadds 12(%ecx)
	fadds 8(%ecx)
	fistp 256(%esi)

	flds 16(%ecx)
	fsubs 20(%ecx)
	fmuls 120(%ebx)			// depth 1: stays on the stack below

	flds 28(%ecx)
	fsubs 24(%ecx)
	fmuls 120(%ebx)			// depth 2
	fist 384(%edi)
	fld %st(0)			// depth 3
	fadds 24(%ecx)
	fadds 28(%ecx)
	fld %st(0)			// depth 4
	fadds 16(%ecx)
	fadds 20(%ecx)
	fistp 384(%esi)			// depth 3
	fadd %st(2)
	fistp 128(%esi)			// depth 2
	faddp %st(1)			// depth 1
	fistp 128(%edi)			// depth 0

/* Phase 9*/
// Integer output path, part 2. Same sums as phase 7, stored with
// fist/fistp at half the float-path offsets.
	flds 32(%edx)
	fadds 48(%edx)
	fistp 448(%esi)

	flds 48(%edx)
	fadds 40(%edx)
	fistp 320(%esi)

	flds 40(%edx)
	fadds 56(%edx)
	fistp 192(%esi)

	flds 56(%edx)
	fadds 36(%edx)
	fistp 64(%esi)

	flds 36(%edx)
	fadds 52(%edx)
	fistp 64(%edi)

	flds 52(%edx)
	fadds 44(%edx)
	fistp 192(%edi)

	flds 60(%edx)
	fist 448(%edi)
	fadds 44(%edx)
	fistp 320(%edi)

	flds 96(%edx)
	fadds 112(%edx)
	fld %st(0)
	fadds 64(%edx)
	fistp 480(%esi)
	fadds 80(%edx)
	fistp 416(%esi)

	flds 112(%edx)
	fadds 104(%edx)
	fld %st(0)
	fadds 80(%edx)
	fistp 352(%esi)
	fadds 72(%edx)
	fistp 288(%esi)

	flds 104(%edx)
	fadds 120(%edx)
	fld %st(0)
	fadds 72(%edx)
	fistp 224(%esi)
	fadds 88(%edx)
	fistp 160(%esi)

	flds 120(%edx)
	fadds 100(%edx)
	fld %st(0)
	fadds 88(%edx)
	fistp 96(%esi)
	fadds 68(%edx)
	fistp 32(%esi)

	flds 100(%edx)
	fadds 116(%edx)
	fld %st(0)
	fadds 68(%edx)
	fistp 32(%edi)
	fadds 84(%edx)
	fistp 96(%edi)

	flds 116(%edx)
	fadds 108(%edx)
	fld %st(0)
	fadds 84(%edx)
	fistp 160(%edi)
	fadds 76(%edx)
	fistp 224(%edi)

	flds 108(%edx)
	fadds 124(%edx)
	fld %st(0)
	fadds 76(%edx)
	fistp 288(%edi)
	fadds 92(%edx)
	fistp 352(%edi)

	flds 124(%edx)
	fist 480(%edi)
	fadds 92(%edx)
	fistp 416(%edi)

	// Copy the 16-bit word at (%esi) to (%edi), the integer-path
	// counterpart of the "fsts (%esi) / fstps (%edi)" pair in phase 6.
	// movsw also steps %esi and %edi; both are restored right below.
	movsw
	// Integer-path epilogue.
	// NOTE(review): "ret $12" also pops the three stack arguments; the
	// float-path exit above does not. Confirm against the caller.
	addl $256,%esp
	popl %edi
	popl %esi
	popl %ebx
	ret $12
// end of FPU stuff