diff mp3lib/decode_sse.s @ 787:9bc104531aec

mp3lib sse support - disabled by default
author arpi_esp
date Sun, 13 May 2001 18:30:53 +0000
parents
children 290801346d62
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mp3lib/decode_sse.s	Sun May 13 18:30:53 2001 +0000
@@ -0,0 +1,201 @@
+///
+/// Replacement of synth_1to1() with Intel's SSE SIMD operations support
+///
+/// This code based 'decode_k7.s' by Nick Kurshev
+/// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
+///
+///  - SSE optimization
+///  - change function name for support SSE automatic detect
+///
+/// Modified by Nick Kurshev <nickols_k@mail.ru>
+///
+/ synth_1to1_3dnow works the same way as the c version of
+/ synth_1to1. this assembler code based 'decode-i586.s'
+/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
+/ have been made:
+/ - use {MMX,3DNow!} instruction for reduce cpu
+/ - remove unused(?) local symbols
+/
+/ useful sources of information on optimizing 3DNow! code include:
+/ AMD 3DNow! Technology Manual (Publication #21928)
+/     English:  http://www.amd.com/K6/k6docs/pdf/21928d.pdf
+/    (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
+/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
+/     English:  http://www.amd.com/K6/k6docs/pdf/21924b.pdf
+/
+/ This code was tested only AMD-K6-2 processor Linux systems,
+/ please tell me:
+/ - whether this code works on other 3DNow! capable processors
+/  (ex.IDT-C6-2) or not
+/ - whether this code works on other OSes or not
+/
+/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
+/                    <kim@comtec.co.jp>               - after  1.Apr.1998
+
+/ Enhancments for q-word operation by Michael Hipp
+
+.bss
+        .comm   buffs,4352,4
+.data
+        .align 4
+bo:
+        .long 1
+.text
+/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */
+.globl synth_1to1_sse
+synth_1to1_sse:
+        subl  $12,%esp
+        pushl %ebp
+        pushl %edi
+        pushl %esi
+        pushl %ebx
+	
+        movl  32(%esp),%eax
+        movl  40(%esp),%esi
+        movl  $0,%edi
+        movl  bo,%ebp
+        cmpl  %edi,36(%esp)
+        jne   .L48
+        decl  %ebp
+        andl  $15,%ebp
+        movl  %ebp,bo
+        movl  $buffs,%ecx
+        jmp   .L49
+.L48:
+        addl  $2,%esi
+        movl  $buffs+2176,%ecx
+.L49:
+        testl $1,%ebp
+        je    .L50
+        movl  %ecx,%ebx
+        movl  %ebp,16(%esp)
+        pushl %eax
+        movl  20(%esp),%edx
+        leal  (%ebx,%edx,4),%eax
+        pushl %eax
+        movl  24(%esp),%eax
+        incl  %eax
+        andl  $15,%eax
+        leal  1088(,%eax,4),%eax
+        addl  %ebx,%eax
+        jmp   .L74
+.L50:
+        leal  1088(%ecx),%ebx
+        leal  1(%ebp),%edx
+        movl  %edx,16(%esp)
+        pushl %eax
+        leal  1092(%ecx,%ebp,4),%eax
+        pushl %eax
+        leal  (%ecx,%ebp,4),%eax
+.L74:
+        pushl %eax
+        call  dct64
+        addl  $12,%esp
+        movl  16(%esp),%edx
+        leal  0(,%edx,4),%edx
+        movl  $decwin+64,%eax
+        movl  %eax,%ecx            
+        subl  %edx,%ecx
+        movl  $16,%ebp
+
+.L55:
+	movups	(%ecx), %xmm4
+	mulps	(%ebx), %xmm4
+	movups	16(%ecx), %xmm0
+	mulps	16(%ebx), %xmm0
+	addps	%xmm0, %xmm4
+	movups	32(%ecx), %xmm1
+	mulps	32(%ebx), %xmm1
+	addps	%xmm1, %xmm4
+	movups	48(%ecx), %xmm0
+	mulps	48(%ebx), %xmm0
+	addps	%xmm0, %xmm4
+	shufps	$0xDD, %xmm4, %xmm1 /* fake of pfacc. 3|2|3|2 */
+	addps	%xmm1, %xmm4
+	shufps	$0x55, %xmm4, %xmm1 /* fake of pfnacc. 1|1|1|1 */
+	subps	%xmm1, %xmm4
+	cvtps2pi %xmm4, %mm4
+
+        movd	%mm4,%eax
+
+        sar	$16,%eax
+        movw	%ax,(%esi)
+
+        addl  $64,%ebx
+        subl  $-128,%ecx
+        addl  $4,%esi
+        decl  %ebp
+        jnz  .L55
+
+/ --- end of  loop 1 ---
+
+	movups	(%ecx), %xmm4
+	mulps	(%ebx), %xmm4
+	movups	16(%ecx), %xmm0
+	mulps	16(%ebx), %xmm0
+	addps	%xmm0, %xmm4
+	movups	32(%ecx), %xmm1
+	mulps	32(%ebx), %xmm1
+	addps	%xmm1, %xmm4
+	movups	48(%ecx), %xmm0
+	mulps	48(%ebx), %xmm0
+	addps	%xmm0, %xmm4
+	shufps	$0xDD, %xmm4, %xmm1 /* 3|2|3|2 */
+	addps	%xmm1, %xmm4
+	cvtps2pi %xmm4, %mm4
+
+	movd	%mm4, %eax
+
+        sar	$16,%eax
+
+        movw	%ax,(%esi)
+
+        addl  $-64,%ebx
+        addl  $4,%esi
+        addl  $256,%ecx
+        movl  $15,%ebp
+
+.L68:
+	xorps	%xmm3, %xmm3
+
+	movups	(%ecx), %xmm4
+	mulps	(%ebx), %xmm4
+	subps	%xmm4, %xmm3
+	movups	16(%ecx), %xmm0
+	mulps	16(%ebx), %xmm0
+	subps	%xmm0, %xmm3
+	movups	32(%ecx), %xmm1
+	mulps	32(%ebx), %xmm1
+	subps	%xmm1, %xmm3
+	movups	48(%ecx), %xmm0
+	mulps	48(%ebx), %xmm0
+	subps	%xmm0, %xmm3
+	shufps	$0xDD, %xmm3, %xmm1 /* 3|2|3|2 */
+	addps	%xmm1, %xmm3
+	shufps	$0x55, %xmm3, %xmm1 /* fake of pfacc 1|1|1|1 */
+	addps	%xmm1, %xmm3
+	cvtps2pi %xmm3, %mm0
+
+        movd	%mm0,%eax
+
+        sar	$16,%eax
+
+        movw	%ax,(%esi)
+
+        addl  $-64,%ebx
+        subl  $-128,%ecx
+        addl  $4,%esi
+        decl  %ebp
+        jnz   .L68
+
+/ --- end of loop 2
+
+        emms
+
+        movl  %edi,%eax
+        popl  %ebx
+        popl  %esi
+        popl  %edi
+        popl  %ebp
+        addl  $12,%esp
+        ret