Mercurial > mplayer.hg
diff mp3lib/decode_i586.s @ 1:3b5f5d1c5041
Initial revision
author | arpi_esp |
---|---|
date | Sat, 24 Feb 2001 20:28:24 +0000 |
parents | |
children |
line wrap: on
line diff
/ Origin: mp3lib/decode_i586.s, reconstructed from the diff that adds the file.
/
/ Syntax : GNU as, AT&T operand order (source first, destination last).
/ Target : 32-bit x86 with x87 FPU. Arguments are read from the stack and
/          the result is returned in %eax.
/
/ Original author note (Stefan Bieschewski <stb@acm.org>):
/
/   mpg123_synth_1to1 works the same way as the C version of this file.
/   Only two types of changes have been made:
/     - reordered floating point instructions to prevent pipeline stalls
/     - made WRITE_SAMPLE use integer instead of (slower) floating point
/   All kinds of x86 processors should benefit from these modifications.
/
/ Useful sources of information on optimizing x86 code include:
/
/   Intel Architecture Optimization Manual
/   http://www.intel.com/design/pentium/manuals/242816.htm
/
/   Cyrix 6x86 Instruction Set Summary
/   ftp://ftp.cyrix.com/6x86/6x-dbch6.pdf
/
/   AMD-K5 Processor Software Development
/   http://www.amd.com/products/cpg/techdocs/appnotes/20007e.pdf
/
/ $Id$
/
/ NOTE(review): the instruction ORDER inside the three window sections is
/ hand-scheduled for the FPU pipeline. The macros below only fold repeated
/ text; they expand to the same instruction sequence. Do not reorder.

/ ---------------------------------------------------------------------
/ Named constants
/ ---------------------------------------------------------------------
        .equ    BANK_BYTES, 1088        # 272 four-byte floats; buffs (4352 bytes) = 4 banks
        .equ    GROUP_BYTES, 2176       # two banks; presumably one group per channel - confirm
        .equ    SAMPLE_MAX, 32767       # upper clamp for a stored 16-bit sample
        .equ    SAMPLE_MIN, -32768      # lower clamp for a stored 16-bit sample

/ ---------------------------------------------------------------------
/ WRITE_SAMPLE
/   In   : st(0) = finished sum, (%esp) = 4-byte slot already reserved
/          by an earlier subl $4,%esp, %esi = destination of the sample.
/   Does : converts st(0) to a 32-bit integer (pops the FPU stack), pops
/          it into %eax, clamps to [SAMPLE_MIN, SAMPLE_MAX] and stores the
/          low 16 bits at (%esi). %edi is incremented only when clamping
/          happened.
/   Clobb: %eax, flags. Uses numeric local labels 1-4.
/ ---------------------------------------------------------------------
.macro WRITE_SAMPLE
        fistpl  (%esp)
        popl    %eax
        cmpl    $SAMPLE_MAX, %eax
        jg      1f
        cmpl    $SAMPLE_MIN, %eax
        jl      2f
        movw    %ax, (%esi)
        jmp     4f
1:      movw    $SAMPLE_MAX, (%esi)
        jmp     3f
2:      movw    $SAMPLE_MIN, (%esi)
3:      incl    %edi
4:
.endm

/ ---------------------------------------------------------------------
/ TAP woff, boff, combine
/   Loads the float at woff(%ecx), multiplies it by the float at
/   boff(%ebx), exchanges the new product with st(2) and then folds
/   st(0) into st(1) with the popping instruction named by combine
/   (faddp or fsubrp). FPU stack depth is the same before and after.
/   NOTE(review): GNU as documents a historical operand quirk for the
/   AT&T register forms of fsub/fsubr; the mnemonics here are exactly
/   the ones the original file used, so the encoding is unchanged.
/ ---------------------------------------------------------------------
.macro TAP woff, boff, combine
        flds    \woff(%ecx)
        fmuls   \boff(%ebx)
        fxch    %st(2)
        \combine %st, %st(1)
.endm

/ ---------------------------------------------------------------------
/ Data
/ ---------------------------------------------------------------------
        .bss
        .comm   buffs, 4352, 4          # 4 banks of BANK_BYTES, 4-byte aligned

        .data
        .align  4
bo:
        .long   1                       # rolling index, kept in the range 0..15 by the code below

        .section .rodata
        .align  8
/ The two 8-byte constants below are not referenced by any instruction in
/ this file. Read as little-endian IEEE doubles they are 32767.0 and
/ -32768.0 (presumably left over from a floating point clamp - confirm).
.Ldbl_sample_max:
        .long   0x0, 0x40dfffc0
        .align  8
.Ldbl_sample_min:
        .long   0x0, 0xc0e00000
        .align  8

/ ---------------------------------------------------------------------
/ synth_1to1_pent
/
/ Stack arguments (offsets at entry, before the prologue):
/    4(%esp)  arg1 - pointer handed unchanged to dct64 as its third
/                    argument (presumably the subband input - confirm)
/    8(%esp)  arg2 - zero / non-zero selector (presumably the channel)
/   12(%esp)  arg3 - output pointer; 16-bit samples are stored every
/                    4 bytes, starting 2 bytes later when arg2 != 0
/
/ Returns : %eax = number of samples that had to be clamped.
/ Writes  : 32 samples (16 + 1 + 15) through arg3; updates bo when
/           arg2 == 0; buffs is passed to dct64.
/ Saves   : %ebx, %esi, %edi, %ebp. Clobbers %eax, %ecx, %edx, x87 stack.
/ Relies on dct64 preserving %ebx, %esi and %edi across the call.
/
/ Frame after the prologue:
/    0 ebx | 4 esi | 8 edi | 12 ebp | 16..27 scratch | 28 ret | 32 arg1
/   36 arg2 | 40 arg3
/
/ Register roles after the call to dct64:
/   %ebx bank pointer, %ecx window pointer (into decwin), %esi output
/   cursor, %edi clamp counter, %ebp loop counter.
/ ---------------------------------------------------------------------
        .text
        .globl  synth_1to1_pent
synth_1to1_pent:
        subl    $12, %esp               # scratch area; only 16(%esp) is used
        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx

        movl    32(%esp), %eax          # eax = arg1
        movl    40(%esp), %esi          # esi = arg3 (output cursor)
        xorl    %edi, %edi              # clamp counter = 0
        movl    bo, %ebp
        cmpl    %edi, 36(%esp)          # arg2 == 0 ?
        jne     .Larg2_nonzero

        decl    %ebp                    # arg2 == 0: bo = (bo - 1) & 15, stored back
        andl    $15, %ebp
        movl    %ebp, bo
        movl    $buffs, %ecx            # first bank group
        jmp     .Lgroup_selected

.Larg2_nonzero:
        addl    $2, %esi                # start one 16-bit slot further into the output
        movl    $buffs+GROUP_BYTES, %ecx # second bank group; bo is used as is

.Lgroup_selected:
        testl   $1, %ebp
        je      .Lbo_even

/ bo odd: read bank = group + 0, scratch = bo
/ dct64(group + BANK_BYTES + 4*((bo+1)&15), group + 4*bo, arg1)
        movl    %ecx, %ebx
        movl    %ebp, 16(%esp)
        pushl   %eax                    # third dct64 argument
        movl    20(%esp), %edx          # the scratch slot, one push deeper
        leal    (%ebx,%edx,4), %eax
        pushl   %eax                    # second dct64 argument
        movl    24(%esp), %eax          # the scratch slot, two pushes deeper
        incl    %eax
        andl    $15, %eax
        leal    BANK_BYTES(,%eax,4), %eax
        addl    %ebx, %eax
        jmp     .Lcall_dct64

/ bo even: read bank = group + BANK_BYTES, scratch = bo + 1
/ dct64(group + 4*bo, group + BANK_BYTES + 4*(bo+1), arg1)
.Lbo_even:
        leal    BANK_BYTES(%ecx), %ebx
        leal    1(%ebp), %edx
        movl    %edx, 16(%esp)
        pushl   %eax                    # third dct64 argument
        leal    BANK_BYTES+4(%ecx,%ebp,4), %eax
        pushl   %eax                    # second dct64 argument
        leal    (%ecx,%ebp,4), %eax

.Lcall_dct64:
        pushl   %eax                    # first dct64 argument
        call    dct64
        addl    $12, %esp               # drop the three arguments

        movl    16(%esp), %edx          # scratch value saved above
        leal    0(,%edx,4), %edx
        movl    $decwin+64, %eax
        movl    %eax, %ecx
        subl    %edx, %ecx              # ecx = decwin + 64 - 4*scratch
        movl    $16, %ebp               # 16 samples in the first section

/ ---- section 1: 16 samples, 16 taps each, combine alternates ----------
.Lwindow_first:
        flds    (%ecx)
        fmuls   (%ebx)
        flds    4(%ecx)
        fmuls   4(%ebx)
        fxch    %st(1)
        TAP     8, 8, fsubrp
        TAP     12, 12, faddp
        TAP     16, 16, fsubrp
        TAP     20, 20, faddp
        TAP     24, 24, fsubrp
        TAP     28, 28, faddp
        TAP     32, 32, fsubrp
        TAP     36, 36, faddp
        TAP     40, 40, fsubrp
        TAP     44, 44, faddp
        TAP     48, 48, fsubrp
        TAP     52, 52, faddp
        TAP     56, 56, fsubrp
        flds    60(%ecx)
        fmuls   60(%ebx)
        fxch    %st(2)
        subl    $4, %esp                # reserve the slot WRITE_SAMPLE pops
        faddp   %st, %st(1)
        fxch    %st(1)
        fsubrp  %st, %st(1)
        WRITE_SAMPLE

        addl    $64, %ebx               # next 16 floats of the bank
        subl    $-128, %ecx             # ecx += 128 (written as a subtraction of -128)
        addl    $4, %esi
        decl    %ebp
        jnz     .Lwindow_first

/ ---- section 2: one sample, every second tap, all added ---------------
        flds    (%ecx)
        fmuls   (%ebx)
        flds    8(%ecx)
        fmuls   8(%ebx)
        TAP     16, 16, faddp
        TAP     24, 24, faddp
        TAP     32, 32, faddp
        TAP     40, 40, faddp
        TAP     48, 48, faddp
        flds    56(%ecx)
        fmuls   56(%ebx)
        fxch    %st(2)
        subl    $4, %esp                # reserve the slot WRITE_SAMPLE pops
        faddp   %st, %st(1)
        fxch    %st(1)
        faddp   %st, %st(1)
        WRITE_SAMPLE

        addl    $-64, %ebx              # bank pointer now walks backwards
        addl    $4, %esi
        movl    16(%esp), %edx          # scratch value again
        leal    -128(%ecx,%edx,8), %ecx # ecx = ecx - 128 + 8*scratch
        movl    $15, %ebp               # 15 samples in the last section

/ ---- section 3: 15 samples, window read downwards, first term negated -
.Lwindow_last:
        flds    -4(%ecx)
        fchs
        fmuls   (%ebx)
        flds    -8(%ecx)
        fmuls   4(%ebx)
        fxch    %st(1)
        TAP     -12, 8, fsubrp
        TAP     -16, 12, fsubrp
        TAP     -20, 16, fsubrp
        TAP     -24, 20, fsubrp
        TAP     -28, 24, fsubrp
        TAP     -32, 28, fsubrp
        TAP     -36, 32, fsubrp
        TAP     -40, 36, fsubrp
        TAP     -44, 40, fsubrp
        TAP     -48, 44, fsubrp
        TAP     -52, 48, fsubrp
        TAP     -56, 52, fsubrp
        TAP     -60, 56, fsubrp
        flds    (%ecx)
        fmuls   60(%ebx)
        fxch    %st(2)
        subl    $4, %esp                # reserve the slot WRITE_SAMPLE pops
        fsubrp  %st, %st(1)
        fxch    %st(1)
        fsubrp  %st, %st(1)
        WRITE_SAMPLE

        addl    $-64, %ebx
        addl    $-128, %ecx
        addl    $4, %esi
        decl    %ebp
        jnz     .Lwindow_last

        movl    %edi, %eax              # return the clamp count
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        addl    $12, %esp
        ret