changeset 787:9bc104531aec

mp3lib sse support - disabled by default
author arpi_esp
date Sun, 13 May 2001 18:30:53 +0000
parents 4b6dc49b0cb8
children 214ea3f02d13
files configure mp3lib/decod386.c mp3lib/decode_sse.s mp3lib/mpg123.h
diffstat 4 files changed, 229 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/configure	Sun May 13 17:21:55 2001 +0000
+++ b/configure	Sun May 13 18:30:53 2001 +0000
@@ -1181,15 +1181,20 @@
 echo "Creating mp3lib/config.mak"
 
 if [ $_3dnowex = yes ]; then
- _3dnowobjectsrcs='dct36_k7.s dct64_k7.s decode_k7.s dct36_3dnow.s dct64_3dnow.s decode_3dnow.s'
- _3dnowobjectobjs='dct36_k7.o dct64_k7.o decode_k7.o dct36_3dnow.o dct64_3dnow.o decode_3dnow.o'
+ _mp3libobjectsrcs='dct36_k7.s dct64_k7.s decode_k7.s dct36_3dnow.s dct64_3dnow.s decode_3dnow.s'  # K7 and plain 3DNow! sources
+ _mp3libobjectobjs='dct36_k7.o dct64_k7.o decode_k7.o dct36_3dnow.o dct64_3dnow.o decode_3dnow.o'
 else
 if [ $_3dnow = yes ]; then
- _3dnowobjectsrcs='dct36_3dnow.s dct64_3dnow.s decode_3dnow.s'
- _3dnowobjectobjs='dct36_3dnow.o dct64_3dnow.o decode_3dnow.o'
+ _mp3libobjectsrcs='dct36_3dnow.s dct64_3dnow.s decode_3dnow.s'  # plain 3DNow! sources only
+ _mp3libobjectobjs='dct36_3dnow.o dct64_3dnow.o decode_3dnow.o'
 else
- _3dnowobjectsrcs=
- _3dnowobjectobjs=
+if [ $_sse = yes ]; then  # only reached when neither 3DNow! variant is enabled
+ _mp3libobjectsrcs='decode_sse.s'  # the SSE build adds just the synth routine
+ _mp3libobjectobjs='decode_sse.o'
+else  # no optimized assembly objects are added to mp3lib
+ _mp3libobjectsrcs=
+ _mp3libobjectobjs=
+fi
 fi
 fi
 
@@ -1197,8 +1202,8 @@
 
 include ../config.mak
 
-OPTIONAL_SRCS = $_3dnowobjectsrcs
-OPTIONAL_OBJS = $_3dnowobjectobjs
+OPTIONAL_SRCS = $_mp3libobjectsrcs
+OPTIONAL_OBJS = $_mp3libobjectobjs
 
 EOF
 
--- a/mp3lib/decod386.c	Sun May 13 17:21:55 2001 +0000
+++ b/mp3lib/decod386.c	Sun May 13 18:30:53 2001 +0000
@@ -117,6 +117,15 @@
   int clip = 0;
   int bo1;
 
+  #ifdef HAVE_SSE_MP3
+  // Unconditional when compiled in: the "if ( _3dnow )" style runtime check is disabled, so the code after this block is never reached.
+   {
+    int ret;
+    ret=synth_1to1_sse( bandPtr,channel,out+*pnt );
+    *pnt+=128;  // advance the caller's output offset past the span written by this call
+    return ret;
+   }
+  #endif
   #ifdef HAVE_3DNOWEX
   if ( _3dnow > 1 )
    {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mp3lib/decode_sse.s	Sun May 13 18:30:53 2001 +0000
@@ -0,0 +1,244 @@
+///
+/// Replacement of synth_1to1() with Intel's SSE SIMD operations support
+///
+/// This code is based on 'decode_k7.s' by Nick Kurshev
+/// <squash@mb.kcom.ne.jp>; only these kinds of changes have been made:
+///
+///  - SSE optimization
+///  - changed the function name to support automatic SSE detection
+///
+/// Modified by Nick Kurshev <nickols_k@mail.ru>
+///
+/ synth_1to1_3dnow works the same way as the c version of
+/ synth_1to1. this assembler code is based on 'decode-i586.s'
+/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
+/ have been made:
+/ - use {MMX,3DNow!} instructions to reduce cpu usage
+/ - remove unused(?) local symbols
+/
+/ useful sources of information on optimizing 3DNow! code include:
+/ AMD 3DNow! Technology Manual (Publication #21928)
+/     English:  http://www.amd.com/K6/k6docs/pdf/21928d.pdf
+/    (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
+/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
+/     English:  http://www.amd.com/K6/k6docs/pdf/21924b.pdf
+/
+/ This code was tested only on AMD-K6-2 processor Linux systems,
+/ please tell me:
+/ - whether this code works on other 3DNow! capable processors
+/  (ex.IDT-C6-2) or not
+/ - whether this code works on other OSes or not
+/
+/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
+/                    <kim@comtec.co.jp>               - after  1.Apr.1998
+
+/ Enhancements for q-word operation by Michael Hipp
+
+.bss
+        /* 16-byte alignment: the loops below use buffs-relative addresses as
+           mulps memory operands, which fault unless they are 16-byte aligned. */
+        .comm   buffs,4352,16
+.data
+        .align 4
+bo:
+        .long 1
+.text
+/*
+ * int synth_1to1_sse(real *bandPtr, int channel, unsigned char *out)
+ *
+ * SSE variant of synth_1to1(): calls dct64 on bandPtr with the buffs
+ * work area as output, multiplies the result with the decwin window and
+ * stores 32 16-bit samples through out, one every 4 bytes (for
+ * channel != 0 the first sample goes to out + 2).
+ *
+ * ABI: 32-bit, arguments on the stack, result in eax (always 0 here).
+ *
+ * Stack after the prologue:
+ *    0..12(%esp)  saved ebx, esi, edi, ebp
+ *   16(%esp)      bo1 (local; the slots at 20 and 24 are unused)
+ *   28(%esp)      return address
+ *   32(%esp)      bandPtr
+ *   36(%esp)      channel
+ *   40(%esp)      out
+ *
+ * Registers in the loops: ebx = dct64 output row, ecx = window pointer,
+ * esi = output pointer, ebp = iteration counter, edi = return value.
+ * Clobbers eax, ecx, edx, xmm0, xmm1, xmm3, xmm4, mm0, mm4 and the flags;
+ * emms is executed before returning.
+ * NOTE(review): assumes dct64 preserves ebx, esi and edi - confirm.
+ */
+.globl synth_1to1_sse
+synth_1to1_sse:
+        subl  $12,%esp             /* room for the bo1 local */
+        pushl %ebp
+        pushl %edi
+        pushl %esi
+        pushl %ebx
+
+        movl  32(%esp),%eax        /* eax = bandPtr */
+        movl  40(%esp),%esi        /* esi = out */
+        movl  $0,%edi              /* edi = return value, stays 0 */
+        movl  bo,%ebp
+        cmpl  %edi,36(%esp)        /* channel == 0 ? */
+        jne   .L48
+        decl  %ebp                 /* channel 0: bo = (bo - 1) & 15 */
+        andl  $15,%ebp
+        movl  %ebp,bo
+        movl  $buffs,%ecx          /* first half of buffs */
+        jmp   .L49
+.L48:
+        addl  $2,%esi              /* other channel: samples start 2 bytes in */
+        movl  $buffs+2176,%ecx     /* second half of buffs */
+.L49:
+        testl $1,%ebp
+        je    .L50
+        /* bo odd: bo1 = bo, the loops read rows starting at ecx */
+        movl  %ecx,%ebx
+        movl  %ebp,16(%esp)
+        pushl %eax                 /* dct64 arg 3: bandPtr */
+        movl  20(%esp),%edx        /* bo1 (slot offset moved by the push) */
+        leal  (%ebx,%edx,4),%eax
+        pushl %eax                 /* dct64 arg 2: ecx + 4*bo */
+        movl  24(%esp),%eax
+        incl  %eax
+        andl  $15,%eax
+        leal  1088(,%eax,4),%eax
+        addl  %ebx,%eax            /* dct64 arg 1: ecx + 1088 + 4*((bo+1)&15) */
+        jmp   .L74
+.L50:
+        /* bo even: bo1 = bo + 1, the loops read rows starting at ecx + 1088 */
+        leal  1088(%ecx),%ebx
+        leal  1(%ebp),%edx
+        movl  %edx,16(%esp)
+        pushl %eax                 /* dct64 arg 3: bandPtr */
+        leal  1092(%ecx,%ebp,4),%eax
+        pushl %eax                 /* dct64 arg 2: ecx + 1088 + 4*(bo+1) */
+        leal  (%ecx,%ebp,4),%eax   /* dct64 arg 1: ecx + 4*bo */
+.L74:
+        pushl %eax
+        call  dct64
+        addl  $12,%esp             /* drop the three arguments */
+        movl  16(%esp),%edx
+        leal  0(,%edx,4),%edx
+        movl  $decwin+64,%eax
+        movl  %eax,%ecx
+        subl  %edx,%ecx            /* ecx = decwin + 64 - 4*bo1 */
+        movl  $16,%ebp
+
+        /* loop 1: 16 samples. Each sample sums 16 window*row products,
+           even-indexed terms added and odd-indexed terms subtracted. */
+.L55:
+        movups  (%ecx), %xmm4
+        mulps   (%ebx), %xmm4
+        movups  16(%ecx), %xmm0
+        mulps   16(%ebx), %xmm0
+        addps   %xmm0, %xmm4
+        movups  32(%ecx), %xmm1
+        mulps   32(%ebx), %xmm1
+        addps   %xmm1, %xmm4
+        movups  48(%ecx), %xmm0
+        mulps   48(%ebx), %xmm0
+        addps   %xmm0, %xmm4       /* xmm4 = a3|a2|a1|a0 partial sums */
+        /* shufps takes its two low result lanes from the destination, so
+           the destination is loaded with the sums before each shuffle. */
+        movaps  %xmm4, %xmm1
+        shufps  $0xEE, %xmm4, %xmm1 /* fake of pfacc. 3|2|3|2 */
+        addps   %xmm1, %xmm4       /* lane 0 = a0+a2, lane 1 = a1+a3 */
+        movaps  %xmm4, %xmm1
+        shufps  $0x55, %xmm4, %xmm1 /* fake of pfnacc. 1|1|1|1 */
+        subps   %xmm1, %xmm4       /* lane 0 = (a0+a2) - (a1+a3) */
+        cvtps2pi %xmm4, %mm4
+
+        movd    %mm4,%eax
+
+        /* NOTE(review): cvtps2pi rounds per MXCSR and does not saturate
+           (out-of-range input converts to 0x80000000); the shift keeps the
+           upper 16 bits. Confirm the window scaling and clipping needs. */
+        sar     $16,%eax
+        movw    %ax,(%esi)
+
+        addl  $64,%ebx             /* next row */
+        subl  $-128,%ecx           /* window pointer += 128 */
+        addl  $4,%esi
+        decl  %ebp
+        jnz  .L55
+
+/ --- end of  loop 1 ---
+
+        /* middle sample: only the even-indexed products (lanes 0 and 2)
+           are summed. */
+        movups  (%ecx), %xmm4
+        mulps   (%ebx), %xmm4
+        movups  16(%ecx), %xmm0
+        mulps   16(%ebx), %xmm0
+        addps   %xmm0, %xmm4
+        movups  32(%ecx), %xmm1
+        mulps   32(%ebx), %xmm1
+        addps   %xmm1, %xmm4
+        movups  48(%ecx), %xmm0
+        mulps   48(%ebx), %xmm0
+        addps   %xmm0, %xmm4
+        movaps  %xmm4, %xmm1
+        shufps  $0xEE, %xmm4, %xmm1 /* 3|2|3|2 */
+        addps   %xmm1, %xmm4       /* lane 0 = a0+a2 */
+        cvtps2pi %xmm4, %mm4
+
+        movd    %mm4, %eax
+
+        sar     $16,%eax
+
+        movw    %ax,(%esi)
+
+        addl  $-64,%ebx            /* rows are walked backwards from here */
+        addl  $4,%esi
+        addl  $256,%ecx
+        movl  $15,%ebp
+
+        /* loop 2: 15 samples. Each sample is the negated sum of all 16
+           window*row products. */
+.L68:
+        xorps   %xmm3, %xmm3
+
+        movups  (%ecx), %xmm4
+        mulps   (%ebx), %xmm4
+        subps   %xmm4, %xmm3
+        movups  16(%ecx), %xmm0
+        mulps   16(%ebx), %xmm0
+        subps   %xmm0, %xmm3
+        movups  32(%ecx), %xmm1
+        mulps   32(%ebx), %xmm1
+        subps   %xmm1, %xmm3
+        movups  48(%ecx), %xmm0
+        mulps   48(%ebx), %xmm0
+        subps   %xmm0, %xmm3
+        movaps  %xmm3, %xmm1
+        shufps  $0xEE, %xmm3, %xmm1 /* 3|2|3|2 */
+        addps   %xmm1, %xmm3       /* lane 0 = s0+s2, lane 1 = s1+s3 */
+        movaps  %xmm3, %xmm1
+        shufps  $0x55, %xmm3, %xmm1 /* fake of pfacc 1|1|1|1 */
+        addps   %xmm1, %xmm3       /* lane 0 = s0+s1+s2+s3 */
+        cvtps2pi %xmm3, %mm0
+
+        movd    %mm0,%eax
+
+        sar     $16,%eax
+
+        movw    %ax,(%esi)
+
+        addl  $-64,%ebx
+        subl  $-128,%ecx
+        addl  $4,%esi
+        decl  %ebp
+        jnz   .L68
+
+/ --- end of loop 2
+
+        emms                       /* cvtps2pi wrote MMX registers */
+
+        movl  %edi,%eax
+        popl  %ebx
+        popl  %esi
+        popl  %edi
+        popl  %ebp
+        addl  $12,%esp
+        ret
--- a/mp3lib/mpg123.h	Sun May 13 17:21:55 2001 +0000
+++ b/mp3lib/mpg123.h	Sun May 13 18:30:53 2001 +0000
@@ -128,3 +128,9 @@
  extern void dct36_3dnowex(real *,real *,real *,real *,real *);
  extern int  synth_1to1_3dnowex( real *,int,unsigned char * );
 #endif
+#ifdef HAVE_SSE_MP3
+// SSE replacement for synth_1to1(), implemented in decode_sse.s.
+// Arguments are bandPtr, channel and out; the assembly always returns 0.
+ extern int  synth_1to1_sse( real *,int,unsigned char * );
+#endif
+