# HG changeset patch # User arpi_esp # Date 989778653 0 # Node ID 9bc104531aec5815fdd544784b4c28e9f9b422bd # Parent 4b6dc49b0cb8e01b2be7d9dc782e70472d3c4b80 mp3lib sse support - disabled by default diff -r 4b6dc49b0cb8 -r 9bc104531aec configure --- a/configure Sun May 13 17:21:55 2001 +0000 +++ b/configure Sun May 13 18:30:53 2001 +0000 @@ -1181,15 +1181,20 @@ echo "Creating mp3lib/config.mak" if [ $_3dnowex = yes ]; then - _3dnowobjectsrcs='dct36_k7.s dct64_k7.s decode_k7.s dct36_3dnow.s dct64_3dnow.s decode_3dnow.s' - _3dnowobjectobjs='dct36_k7.o dct64_k7.o decode_k7.o dct36_3dnow.o dct64_3dnow.o decode_3dnow.o' + _mp3libobjectsrcs='dct36_k7.s dct64_k7.s decode_k7.s dct36_3dnow.s dct64_3dnow.s decode_3dnow.s' + _mp3libobjectobjs='dct36_k7.o dct64_k7.o decode_k7.o dct36_3dnow.o dct64_3dnow.o decode_3dnow.o' else if [ $_3dnow = yes ]; then - _3dnowobjectsrcs='dct36_3dnow.s dct64_3dnow.s decode_3dnow.s' - _3dnowobjectobjs='dct36_3dnow.o dct64_3dnow.o decode_3dnow.o' + _mp3libobjectsrcs='dct36_3dnow.s dct64_3dnow.s decode_3dnow.s' + _mp3libobjectobjs='dct36_3dnow.o dct64_3dnow.o decode_3dnow.o' else - _3dnowobjectsrcs= - _3dnowobjectobjs= +if [ $_sse = yes ]; then + _mp3libobjectsrcs='decode_sse.s' + _mp3libobjectobjs='decode_sse.o' +else + _mp3libobjectsrcs= + _mp3libobjectobjs= +fi fi fi @@ -1197,8 +1202,8 @@ include ../config.mak -OPTIONAL_SRCS = $_3dnowobjectsrcs -OPTIONAL_OBJS = $_3dnowobjectobjs +OPTIONAL_SRCS = $_mp3libobjectsrcs +OPTIONAL_OBJS = $_mp3libobjectobjs EOF diff -r 4b6dc49b0cb8 -r 9bc104531aec mp3lib/decod386.c --- a/mp3lib/decod386.c Sun May 13 17:21:55 2001 +0000 +++ b/mp3lib/decod386.c Sun May 13 18:30:53 2001 +0000 @@ -117,6 +117,15 @@ int clip = 0; int bo1; + #ifdef HAVE_SSE_MP3 + //if ( _3dnow ) + { + int ret; + ret=synth_1to1_sse( bandPtr,channel,out+*pnt ); + *pnt+=128; + return ret; + } + #endif #ifdef HAVE_3DNOWEX if ( _3dnow > 1 ) { diff -r 4b6dc49b0cb8 -r 9bc104531aec mp3lib/decode_sse.s --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mp3lib/decode_sse.s Sun May 13 18:30:53 2001 +0000 @@ -0,0 +1,201 @@ +/// +/// Replacement of synth_1to1() with Intel's SSE SIMD operations support +/// +/// This code based 'decode_k7.s' by Nick Kurshev +/// ,only some types of changes have been made: +/// +/// - SSE optimization +/// - change function name for support SSE automatic detect +/// +/// Modified by Nick Kurshev +/// +/ synth_1to1_3dnow works the same way as the c version of +/ synth_1to1. this assembler code based 'decode-i586.s' +/ (by Stefan Bieschewski ), two types of changes +/ have been made: +/ - use {MMX,3DNow!} instruction for reduce cpu +/ - remove unused(?) local symbols +/ +/ useful sources of information on optimizing 3DNow! code include: +/ AMD 3DNow! Technology Manual (Publication #21928) +/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf +/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) +/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) +/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf +/ +/ This code was tested only AMD-K6-2 processor Linux systems, +/ please tell me: +/ - whether this code works on other 3DNow! capable processors +/ (ex.IDT-C6-2) or not +/ - whether this code works on other OSes or not +/ +/ by KIMURA Takuhiro - until 31.Mar.1998 +/ - after 1.Apr.1998 + +/ Enhancments for q-word operation by Michael Hipp + +.bss + .comm buffs,4352,4 +.data + .align 4 +bo: + .long 1 +.text +/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */ +.globl synth_1to1_sse +synth_1to1_sse: + subl $12,%esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl 32(%esp),%eax + movl 40(%esp),%esi + movl $0,%edi + movl bo,%ebp + cmpl %edi,36(%esp) + jne .L48 + decl %ebp + andl $15,%ebp + movl %ebp,bo + movl $buffs,%ecx + jmp .L49 +.L48: + addl $2,%esi + movl $buffs+2176,%ecx +.L49: + testl $1,%ebp + je .L50 + movl %ecx,%ebx + movl %ebp,16(%esp) + pushl %eax + movl 20(%esp),%edx + leal (%ebx,%edx,4),%eax + pushl %eax + movl 24(%esp),%eax + incl %eax + andl $15,%eax + leal 1088(,%eax,4),%eax + addl %ebx,%eax + jmp .L74 +.L50: + leal 1088(%ecx),%ebx + leal 1(%ebp),%edx + movl %edx,16(%esp) + pushl %eax + leal 1092(%ecx,%ebp,4),%eax + pushl %eax + leal (%ecx,%ebp,4),%eax +.L74: + pushl %eax + call dct64 + addl $12,%esp + movl 16(%esp),%edx + leal 0(,%edx,4),%edx + movl $decwin+64,%eax + movl %eax,%ecx + subl %edx,%ecx + movl $16,%ebp + +.L55: + movups (%ecx), %xmm4 + mulps (%ebx), %xmm4 + movups 16(%ecx), %xmm0 + mulps 16(%ebx), %xmm0 + addps %xmm0, %xmm4 + movups 32(%ecx), %xmm1 + mulps 32(%ebx), %xmm1 + addps %xmm1, %xmm4 + movups 48(%ecx), %xmm0 + mulps 48(%ebx), %xmm0 + addps %xmm0, %xmm4 + shufps $0xDD, %xmm4, %xmm1 /* fake of pfacc. 3|2|3|2 */ + addps %xmm1, %xmm4 + shufps $0x55, %xmm4, %xmm1 /* fake of pfnacc. 1|1|1|1 */ + subps %xmm1, %xmm4 + cvtps2pi %xmm4, %mm4 + + movd %mm4,%eax + + sar $16,%eax + movw %ax,(%esi) + + addl $64,%ebx + subl $-128,%ecx + addl $4,%esi + decl %ebp + jnz .L55 + +/ --- end of loop 1 --- + + movups (%ecx), %xmm4 + mulps (%ebx), %xmm4 + movups 16(%ecx), %xmm0 + mulps 16(%ebx), %xmm0 + addps %xmm0, %xmm4 + movups 32(%ecx), %xmm1 + mulps 32(%ebx), %xmm1 + addps %xmm1, %xmm4 + movups 48(%ecx), %xmm0 + mulps 48(%ebx), %xmm0 + addps %xmm0, %xmm4 + shufps $0xDD, %xmm4, %xmm1 /* 3|2|3|2 */ + addps %xmm1, %xmm4 + cvtps2pi %xmm4, %mm4 + + movd %mm4, %eax + + sar $16,%eax + + movw %ax,(%esi) + + addl $-64,%ebx + addl $4,%esi + addl $256,%ecx + movl $15,%ebp + +.L68: + xorps %xmm3, %xmm3 + + movups (%ecx), %xmm4 + mulps (%ebx), %xmm4 + subps %xmm4, %xmm3 + movups 16(%ecx), %xmm0 + mulps 16(%ebx), %xmm0 + subps %xmm0, %xmm3 + movups 32(%ecx), %xmm1 + mulps 32(%ebx), %xmm1 + subps %xmm1, %xmm3 + movups 48(%ecx), %xmm0 + mulps 48(%ebx), %xmm0 + subps %xmm0, %xmm3 + shufps $0xDD, %xmm3, %xmm1 /* 3|2|3|2 */ + addps %xmm1, %xmm3 + shufps $0x55, %xmm3, %xmm1 /* fake of pfacc 1|1|1|1 */ + addps %xmm1, %xmm3 + cvtps2pi %xmm3, %mm0 + + movd %mm0,%eax + + sar $16,%eax + + movw %ax,(%esi) + + addl $-64,%ebx + subl $-128,%ecx + addl $4,%esi + decl %ebp + jnz .L68 + +/ --- end of loop 2 + + emms + + movl %edi,%eax + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $12,%esp + ret diff -r 4b6dc49b0cb8 -r 9bc104531aec mp3lib/mpg123.h --- a/mp3lib/mpg123.h Sun May 13 17:21:55 2001 +0000 +++ b/mp3lib/mpg123.h Sun May 13 18:30:53 2001 +0000 @@ -128,3 +128,9 @@ extern void dct36_3dnowex(real *,real *,real *,real *,real *); extern int synth_1to1_3dnowex( real *,int,unsigned char * ); #endif +#ifdef HAVE_SSE_MP3 +// extern void dct64_3dnow( real *,real *, real * ); +// extern void dct36_3dnow(real *,real *,real *,real *,real *); + extern int synth_1to1_sse( real *,int,unsigned char * ); +#endif +