diff mp3lib/decode_3dnow.s @ 1:3b5f5d1c5041
Initial revision
author    arpi_esp
date      Sat, 24 Feb 2001 20:28:24 +0000
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mp3lib/decode_3dnow.s	Sat Feb 24 20:28:24 2001 +0000
@@ -0,0 +1,265 @@
+/ synth_1to1_3dnow works the same way as the C version of
+/ synth_1to1. This assembler code is based on 'decode-i586.s'
+/ (by Stefan Bieschewski <stb@acm.org>); two types of changes
+/ have been made:
+/ - use {MMX,3DNow!} instructions to reduce CPU load
+/ - remove unused(?) local symbols
+/
+/ Useful sources of information on optimizing 3DNow! code include:
+/ AMD 3DNow! Technology Manual (Publication #21928)
+/   English:   http://www.amd.com/K6/k6docs/pdf/21928d.pdf
+/   (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
+/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
+/   English:   http://www.amd.com/K6/k6docs/pdf/21924b.pdf
+/
+/ This code has been tested only on AMD-K6-2 processors under Linux.
+/ Please tell me:
+/ - whether this code works on other 3DNow!-capable processors
+/   (e.g. IDT-C6-2) or not
+/ - whether this code works on other OSes or not
+/
+/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
+/                    <kim@comtec.co.jp>               - after   1.Apr.1998
+
+/ Enhancements for q-word operation by Michael Hipp
+
+.bss
+        .comm buffs,4352,4
+.data
+        .align 4
+bo:
+        .long 1
+.text
+.globl synth_1to1_3dnow
+synth_1to1_3dnow:
+        subl $12,%esp
+        pushl %ebp
+        pushl %edi
+        pushl %esi
+        pushl %ebx
+/ after the pushes: 32(%esp)=bandPtr, 36(%esp)=channel, 40(%esp)=samples
+        movl 32(%esp),%eax
+        movl 40(%esp),%esi
+        movl $0,%edi
+        movl bo,%ebp
+        cmpl %edi,36(%esp)
+        jne .L48
+        decl %ebp
+        andl $15,%ebp
+        movl %ebp,bo
+        movl $buffs,%ecx
+        jmp .L49
+.L48:
+        addl $2,%esi
+        movl $buffs+2176,%ecx
+.L49:
+        testl $1,%ebp
+        je .L50
+        movl %ecx,%ebx
+        movl %ebp,16(%esp)
+        pushl %eax
+        movl 20(%esp),%edx
+        leal (%ebx,%edx,4),%eax
+        pushl %eax
+        movl 24(%esp),%eax
+        incl %eax
+        andl $15,%eax
+        leal 1088(,%eax,4),%eax
+        addl %ebx,%eax
+        jmp .L74
+.L50:
+        leal 1088(%ecx),%ebx
+        leal 1(%ebp),%edx
+        movl %edx,16(%esp)
+        pushl %eax
+        leal 1092(%ecx,%ebp,4),%eax
+        pushl %eax
+        leal (%ecx,%ebp,4),%eax
+.L74:
+        pushl %eax
+        call dct64_3dnow
+        addl $12,%esp
+        movl 16(%esp),%edx
+        leal 0(,%edx,4),%edx
+        movl $decwin+64,%eax
+        movl %eax,%ecx
+        subl %edx,%ecx
+        movl $16,%ebp
+
+/ loop 1: 16 output samples, each a 16-tap dot product with alternating signs
+.L55:
+        movq (%ecx),%mm4
+        movq (%ebx),%mm3
+        movq 8(%ecx),%mm0
+        movq 8(%ebx),%mm1
+        pfmul %mm3,%mm4
+
+        movq 16(%ecx),%mm2
+        pfmul %mm1,%mm0
+        movq 16(%ebx),%mm3
+        pfadd %mm0,%mm4
+
+        movq 24(%ecx),%mm0
+        pfmul %mm2,%mm3
+        movq 24(%ebx),%mm1
+        pfadd %mm3,%mm4
+
+        movq 32(%ecx),%mm2
+        pfmul %mm1,%mm0
+        movq 32(%ebx),%mm3
+        pfadd %mm0,%mm4
+
+        movq 40(%ecx),%mm0
+        pfmul %mm2,%mm3
+        movq 40(%ebx),%mm1
+        pfadd %mm3,%mm4
+
+        movq 48(%ecx),%mm2
+        pfmul %mm1,%mm0
+        movq 48(%ebx),%mm3
+        pfadd %mm0,%mm4
+
+        movq 56(%ecx),%mm0
+        pfmul %mm2,%mm3
+        movq 56(%ebx),%mm1
+        pfadd %mm3,%mm4
+
+        pfmul %mm1,%mm0
+        pfadd %mm0,%mm4
+
+/ mm4 holds [even-tap sum, odd-tap sum]; take their difference
+        movq %mm4,%mm0
+        psrlq $32,%mm0
+        pfsub %mm0,%mm4
+
+        pf2id %mm4,%mm4
+        movd %mm4,%eax
+
+        sar $16,%eax
+        movw %ax,(%esi)
+
+        addl $64,%ebx
+        subl $-128,%ecx
+        addl $4,%esi
+        decl %ebp
+        jnz .L55
+
+/ --- end of loop 1 ---
+
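+/ The next block computes the single middle output sample: the same
+/ 16-tap dot product, but over only the even-indexed taps, hence the
+/ scalar movd loads with a stride of 8 bytes (two floats) instead of
+/ the packed movq loads above.
+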
+        movd (%ecx),%mm2
+        movd (%ebx),%mm1
+        pfmul %mm1,%mm2
+
+        movd 8(%ecx),%mm0
+        movd 8(%ebx),%mm1
+        pfmul %mm0,%mm1
+        pfadd %mm1,%mm2
+
+        movd 16(%ecx),%mm0
+        movd 16(%ebx),%mm1
+        pfmul %mm0,%mm1
+        pfadd %mm1,%mm2
+
+        movd 24(%ecx),%mm0
+        movd 24(%ebx),%mm1
+        pfmul %mm0,%mm1
+        pfadd %mm1,%mm2
+
+        movd 32(%ecx),%mm0
+        movd 32(%ebx),%mm1
+        pfmul %mm0,%mm1
+        pfadd %mm1,%mm2
+
+        movd 40(%ecx),%mm0
+        movd 40(%ebx),%mm1
+        pfmul %mm0,%mm1
+        pfadd %mm1,%mm2
+
+        movd 48(%ecx),%mm0
+        movd 48(%ebx),%mm1
+        pfmul %mm0,%mm1
+        pfadd %mm1,%mm2
+
+        movd 56(%ecx),%mm0
+        movd 56(%ebx),%mm1
+        pfmul %mm0,%mm1
+        pfadd %mm1,%mm2
+
+        pf2id %mm2,%mm2
+        movd %mm2,%eax
+
+        sar $16,%eax
+
+        movw %ax,(%esi)
+
+        addl $-64,%ebx
+        addl $4,%esi
+        addl $256,%ecx
+        movl $15,%ebp
+
+/ loop 2: 15 output samples; all 16 products accumulated with negative sign
+.L68:
+        psubd %mm0,%mm0
+
+        movq (%ebx),%mm1
+        movq (%ecx),%mm2
+        pfmul %mm1,%mm2
+        pfsub %mm2,%mm0
+
+        movq 8(%ebx),%mm3
+        movq 8(%ecx),%mm4
+        pfmul %mm3,%mm4
+        pfsub %mm4,%mm0
+
+        movq 16(%ebx),%mm1
+        movq 16(%ecx),%mm2
+        pfmul %mm1,%mm2
+        pfsub %mm2,%mm0
+
+        movq 24(%ebx),%mm3
+        movq 24(%ecx),%mm4
+        pfmul %mm3,%mm4
+        pfsub %mm4,%mm0
+
+        movq 32(%ebx),%mm1
+        movq 32(%ecx),%mm2
+        pfmul %mm1,%mm2
+        pfsub %mm2,%mm0
+
+        movq 40(%ebx),%mm3
+        movq 40(%ecx),%mm4
+        pfmul %mm3,%mm4
+        pfsub %mm4,%mm0
+
+        movq 48(%ebx),%mm1
+        movq 48(%ecx),%mm2
+        pfmul %mm1,%mm2
+        pfsub %mm2,%mm0
+
+        movq 56(%ebx),%mm3
+        movq 56(%ecx),%mm4
+        pfmul %mm3,%mm4
+        pfsub %mm4,%mm0
+
+        pfacc %mm0,%mm0
+
+        pf2id %mm0,%mm0
+        movd %mm0,%eax
+
+        sar $16,%eax
+
+        movw %ax,(%esi)
+
+        addl $-64,%ebx
+        subl $-128,%ecx
+        addl $4,%esi
+        decl %ebp
+        jnz .L68
+
+/ --- end of loop 2 ---
+
+/ femms: clear MMX state before returning to FPU code
+        femms
+
+        movl %edi,%eax
+        popl %ebx
+        popl %esi
+        popl %edi
+        popl %ebp
+        addl $12,%esp
+        ret
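
For readers who would rather not trace the MMX register traffic, here is a rough C sketch of the computation the routine performs, modeled on the classic mpg123 synth_1to1 that the header comment cites. The helper names (dct64, decwin), the 65536 pre-scaling of decwin, and the function signature are assumptions drawn from that C version, not definitions found in this diff. Note also that, unlike the clipping C version, this 3DNow! path simply truncates via pf2id/sar $16.

    /* Hypothetical C sketch of the algorithm implemented above; modeled on
     * the classic mpg123 synth_1to1.  dct64(), decwin and the 65536 window
     * scaling are assumptions, not part of this diff. */
    typedef float real;

    extern void dct64(real *out0, real *out1, real *bandPtr); /* assumed */
    extern real decwin[512 + 32];   /* window table, assumed pre-scaled by 65536 */

    static real buffs[2][2][0x110]; /* 2*2*272*4 = 4352 bytes, matches .comm buffs */
    static int  bo = 1;             /* ring-buffer offset, matches the .data word  */

    int synth_1to1_sketch(real *bandPtr, int channel, short *samples)
    {
        real *b0, (*buf)[0x110];
        int bo1, i, j;

        if (!channel) {             /* left channel advances the ring offset */
            bo--;
            bo &= 0xf;
            buf = buffs[0];
        } else {                    /* right channel: interleaved output */
            samples++;
            buf = buffs[1];
        }

        if (bo & 0x1) {             /* pick buffers so the dct64 output lands */
            b0  = buf[0];           /* where the windowing loops expect it    */
            bo1 = bo;
            dct64(buf[1] + ((bo + 1) & 0xf), buf[0] + bo, bandPtr);
        } else {
            b0  = buf[1];
            bo1 = bo + 1;
            dct64(buf[0] + bo, buf[1] + bo + 1, bandPtr);
        }

        {
            real *window = decwin + 16 - bo1;

            /* loop 1 (.L55): 16 samples, alternating-sign 16-tap dot product */
            for (j = 16; j; j--, b0 += 0x10, window += 0x20, samples += 2) {
                real sum = 0.0;
                for (i = 0; i < 16; i += 2)
                    sum += window[i] * b0[i] - window[i + 1] * b0[i + 1];
                /* pf2id + sar $16: truncate to int, keep the high 16 bits */
                *samples = (short)((int)sum >> 16);
            }

            /* middle sample: even-indexed taps only (the scalar movd block) */
            {
                real sum = 0.0;
                for (i = 0; i < 16; i += 2)
                    sum += window[i] * b0[i];
                *samples = (short)((int)sum >> 16);
                b0 -= 0x10;
                window -= 0x20;
                samples += 2;
            }
            window += bo1 << 1;

            /* loop 2 (.L68): 15 samples, all 16 products negated.  The C
             * original walks the window backwards here; the assembly walks
             * it forwards instead, exploiting the table's symmetry. */
            for (j = 15; j; j--, b0 -= 0x10, window -= 0x20, samples += 2) {
                real sum = 0.0;
                for (i = 0; i < 16; i++)
                    sum -= window[-1 - i] * b0[i];
                *samples = (short)((int)sum >> 16);
            }
        }
        return 0;                   /* the assembly returns %edi == 0 */
    }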