Mercurial > mplayer.hg
changeset 3904:848d848521b9
runtime cpudetect
author | michael |
---|---|
date | Sun, 30 Dec 2001 19:57:14 +0000 |
parents | 16c96a2353e8 |
children | 91e5c563fce5 |
files | liba52/a52_internal.h liba52/downmix.c |
diffstat | 2 files changed, 549 insertions(+), 303 deletions(-) [+] |
line wrap: on
line diff
--- a/liba52/a52_internal.h Sun Dec 30 19:38:28 2001 +0000 +++ b/liba52/a52_internal.h Sun Dec 30 19:57:14 2001 +0000 @@ -45,9 +45,9 @@ sample_t clev, sample_t slev); int downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level, sample_t clev, sample_t slev); -void downmix (sample_t * samples, int acmod, int output, sample_t bias, +extern void (*downmix) (sample_t * samples, int acmod, int output, sample_t bias, sample_t clev, sample_t slev); -void upmix (sample_t * samples, int acmod, int output); +extern void (*upmix) (sample_t * samples, int acmod, int output); void imdct_init (uint32_t mm_accel); extern void (* imdct_256) (sample_t * data, sample_t * delay, sample_t bias);
--- a/liba52/downmix.c Sun Dec 30 19:38:28 2001 +0000 +++ b/liba52/downmix.c Sun Dec 30 19:57:14 2001 +0000 @@ -24,6 +24,7 @@ */ #include "config.h" +#include "../cpudetect.h" #include <string.h> #include <inttypes.h> @@ -33,6 +34,20 @@ #define CONVERT(acmod,output) (((output) << 3) + (acmod)) +//#undef HAVE_SSE +//#undef HAVE_MMX + +void (*downmix)(sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev)= NULL; +void (*upmix)(sample_t * samples, int acmod, int output)= NULL; + +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev); +static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev); +static void upmix_MMX (sample_t * samples, int acmod, int output); +static void upmix_C (sample_t * samples, int acmod, int output); + int downmix_init (int input, int flags, sample_t * level, sample_t clev, sample_t slev) { @@ -62,6 +77,13 @@ }; int output; + upmix= upmix_C; + downmix= downmix_C; +#ifdef ARCH_X86 + if(gCpuCaps.hasMMX) upmix= upmix_MMX; + if(gCpuCaps.hasSSE) downmix= downmix_SSE; +#endif + output = flags & A52_CHANNEL_MASK; if (output > A52_DOLBY) return -1; @@ -305,115 +327,34 @@ { int i; -#ifdef HAVE_SSE - asm volatile( - "movlps %2, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps (%0, %%esi), %%xmm0 \n\t" - "movaps 16(%0, %%esi), %%xmm1 \n\t" - "addps (%1, %%esi), %%xmm0 \n\t" - "addps 16(%1, %%esi), %%xmm1 \n\t" - "addps %%xmm7, %%xmm0 \n\t" - "addps %%xmm7, %%xmm1 \n\t" - "movaps %%xmm0, (%1, %%esi) \n\t" - "movaps %%xmm1, 16(%1, %%esi) \n\t" - "addl $32, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (src+256), "r" (dest+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) dest[i] += src[i] + bias; -#endif } static void mix3to1 (sample_t * samples, sample_t bias) { int i; -#ifdef HAVE_SSE - asm volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps (%0, %%esi), %%xmm0 \n\t" - "movaps 1024(%0, %%esi), %%xmm1 \n\t" - "addps 2048(%0, %%esi), %%xmm0 \n\t" - "addps %%xmm7, %%xmm1 \n\t" - "addps %%xmm1, %%xmm0 \n\t" - "movaps %%xmm0, (%0, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) samples[i] += samples[i + 256] + samples[i + 512] + bias; -#endif } static void mix4to1 (sample_t * samples, sample_t bias) { int i; -#ifdef HAVE_SSE - asm volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps (%0, %%esi), %%xmm0 \n\t" - "movaps 1024(%0, %%esi), %%xmm1 \n\t" - "addps 2048(%0, %%esi), %%xmm0 \n\t" - "addps 3072(%0, %%esi), %%xmm1 \n\t" - "addps %%xmm7, %%xmm0 \n\t" - "addps %%xmm1, %%xmm0 \n\t" - "movaps %%xmm0, (%0, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) samples[i] += (samples[i + 256] + samples[i + 512] + samples[i + 768] + bias); -#endif } static void mix5to1 (sample_t * samples, sample_t bias) { int i; -#ifdef HAVE_SSE - asm volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps (%0, %%esi), %%xmm0 \n\t" - "movaps 1024(%0, %%esi), %%xmm1 \n\t" - "addps 2048(%0, %%esi), %%xmm0 \n\t" - "addps 3072(%0, %%esi), %%xmm1 \n\t" - "addps %%xmm7, %%xmm0 \n\t" - "addps 4096(%0, %%esi), %%xmm1 \n\t" - "addps %%xmm1, %%xmm0 \n\t" - "movaps %%xmm0, (%0, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) samples[i] += (samples[i + 256] + samples[i + 512] + samples[i + 768] + samples[i + 1024] + bias); -#endif } static void mix3to2 (sample_t * samples, sample_t bias) @@ -421,32 +362,11 @@ int i; sample_t common; -#ifdef HAVE_SSE - asm volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps 1024(%0, %%esi), %%xmm0 \n\t" - "addps %%xmm7, %%xmm0 \n\t" //common - "movaps (%0, %%esi), %%xmm1 \n\t" - "movaps 2048(%0, %%esi), %%xmm2 \n\t" - "addps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%esi) \n\t" - "movaps %%xmm2, 1024(%0, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) { common = samples[i + 256] + bias; samples[i] += common; samples[i + 256] = samples[i + 512] + common; } -#endif } static void mix21to2 (sample_t * left, sample_t * right, sample_t bias) @@ -454,32 +374,11 @@ int i; sample_t common; -#ifdef HAVE_SSE - asm volatile( - "movlps %2, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps 1024(%1, %%esi), %%xmm0 \n\t" - "addps %%xmm7, %%xmm0 \n\t" //common - "movaps (%0, %%esi), %%xmm1 \n\t" - "movaps (%1, %%esi), %%xmm2 \n\t" - "addps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%esi) \n\t" - "movaps %%xmm2, (%1, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (left+256), "r" (right+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) { common = right[i + 256] + bias; left[i] += common; right[i] += common; } -#endif } static void mix21toS (sample_t * samples, sample_t bias) @@ -487,33 +386,11 @@ int i; sample_t surround; -#ifdef HAVE_SSE - asm volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps 2048(%0, %%esi), %%xmm0 \n\t" // surround - "movaps (%0, %%esi), %%xmm1 \n\t" - "movaps 1024(%0, %%esi), %%xmm2 \n\t" - "addps %%xmm7, %%xmm1 \n\t" - "addps %%xmm7, %%xmm2 \n\t" - "subps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%esi) \n\t" - "movaps %%xmm2, 1024(%0, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) { surround = samples[i + 512]; samples[i] += bias - surround; samples[i + 256] += bias + surround; } -#endif } static void mix31to2 (sample_t * samples, sample_t bias) @@ -521,33 +398,11 @@ int i; sample_t common; -#ifdef HAVE_SSE - asm volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps 1024(%0, %%esi), %%xmm0 \n\t" - "addps 3072(%0, %%esi), %%xmm0 \n\t" - "addps %%xmm7, %%xmm0 \n\t" // common - "movaps (%0, %%esi), %%xmm1 \n\t" - "movaps 2048(%0, %%esi), %%xmm2 \n\t" - "addps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%esi) \n\t" - "movaps %%xmm2, 1024(%0, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) { common = samples[i + 256] + samples[i + 768] + bias; samples[i] += common; samples[i + 256] = samples[i + 512] + common; } -#endif } static void mix31toS (sample_t * samples, sample_t bias) @@ -555,36 +410,12 @@ int i; sample_t common, surround; -#ifdef HAVE_SSE - asm volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps 1024(%0, %%esi), %%xmm0 \n\t" - "movaps 3072(%0, %%esi), %%xmm3 \n\t" // surround - "addps %%xmm7, %%xmm0 \n\t" // common - "movaps (%0, %%esi), %%xmm1 \n\t" - "movaps 2048(%0, %%esi), %%xmm2 \n\t" - "addps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "subps %%xmm3, %%xmm1 \n\t" - "addps %%xmm3, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%esi) \n\t" - "movaps %%xmm2, 1024(%0, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) { common = samples[i + 256] + bias; surround = samples[i + 768]; samples[i] += common - surround; samples[i + 256] = samples[i + 512] + common + surround; } -#endif } static void mix22toS (sample_t * samples, sample_t bias) @@ -592,34 +423,11 @@ int i; sample_t surround; -#ifdef HAVE_SSE - asm volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps 2048(%0, %%esi), %%xmm0 \n\t" - "addps 3072(%0, %%esi), %%xmm0 \n\t" // surround - "movaps (%0, %%esi), %%xmm1 \n\t" - "movaps 1024(%0, %%esi), %%xmm2 \n\t" - "addps %%xmm7, %%xmm1 \n\t" - "addps %%xmm7, %%xmm2 \n\t" - "subps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, (%0, %%esi) \n\t" - "movaps %%xmm2, 1024(%0, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) { surround = samples[i + 512] + samples[i + 768]; samples[i] += bias - surround; samples[i + 256] += bias + surround; } -#endif } static void mix32to2 (sample_t * samples, sample_t bias) @@ -627,33 +435,11 @@ int i; sample_t common; -#ifdef HAVE_SSE - asm volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps 1024(%0, %%esi), %%xmm0 \n\t" - "addps %%xmm7, %%xmm0 \n\t" // common - "movaps %%xmm0, %%xmm1 \n\t" // common - "addps (%0, %%esi), %%xmm0 \n\t" - "addps 2048(%0, %%esi), %%xmm1 \n\t" - "addps 3072(%0, %%esi), %%xmm0 \n\t" - "addps 4096(%0, %%esi), %%xmm1 \n\t" - "movaps %%xmm0, (%0, %%esi) \n\t" - "movaps %%xmm1, 1024(%0, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) { common = samples[i + 256] + bias; samples[i] += common + samples[i + 768]; samples[i + 256] = common + samples[i + 512] + samples[i + 1024]; } -#endif } static void mix32toS (sample_t * samples, sample_t bias) @@ -661,93 +447,30 @@ int i; sample_t common, surround; -#ifdef HAVE_SSE - asm volatile( - "movlps %1, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps 1024(%0, %%esi), %%xmm0 \n\t" - "movaps 3072(%0, %%esi), %%xmm2 \n\t" - "addps %%xmm7, %%xmm0 \n\t" // common - "addps 4096(%0, %%esi), %%xmm2 \n\t" // surround - "movaps (%0, %%esi), %%xmm1 \n\t" - "movaps 2048(%0, %%esi), %%xmm3 \n\t" - "subps %%xmm2, %%xmm1 \n\t" - "addps %%xmm2, %%xmm3 \n\t" - "addps %%xmm0, %%xmm1 \n\t" - "addps %%xmm0, %%xmm3 \n\t" - "movaps %%xmm1, (%0, %%esi) \n\t" - "movaps %%xmm3, 1024(%0, %%esi) \n\t" - "addl $16, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (samples+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) { common = samples[i + 256] + bias; surround = samples[i + 768] + samples[i + 1024]; samples[i] += common - surround; samples[i + 256] = samples[i + 512] + common + surround; } -#endif } static void move2to1 (sample_t * src, sample_t * dest, sample_t bias) { int i; -#ifdef HAVE_SSE - asm volatile( - "movlps %2, %%xmm7 \n\t" - "shufps $0x00, %%xmm7, %%xmm7 \n\t" - "movl $-1024, %%esi \n\t" - "1: \n\t" - "movaps (%0, %%esi), %%xmm0 \n\t" - "movaps 16(%0, %%esi), %%xmm1 \n\t" - "addps 1024(%0, %%esi), %%xmm0 \n\t" - "addps 1040(%0, %%esi), %%xmm1 \n\t" - "addps %%xmm7, %%xmm0 \n\t" - "addps %%xmm7, %%xmm1 \n\t" - "movaps %%xmm0, (%1, %%esi) \n\t" - "movaps %%xmm1, 16(%1, %%esi) \n\t" - "addl $32, %%esi \n\t" - " jnz 1b \n\t" - :: "r" (src+256), "r" (dest+256), "m" (bias) - : "%esi" - ); -#else for (i = 0; i < 256; i++) dest[i] = src[i] + src[i + 256] + bias; -#endif } static void zero (sample_t * samples) { int i; -#ifdef HAVE_MMX - asm volatile( - "movl $-1024, %%esi \n\t" - "pxor %%mm0, %%mm0 \n\t" - "1: \n\t" - "movq %%mm0, (%0, %%esi) \n\t" - "movq %%mm0, 8(%0, %%esi) \n\t" - "movq %%mm0, 16(%0, %%esi) \n\t" - "movq %%mm0, 24(%0, %%esi) \n\t" - "addl $32, %%esi \n\t" - " jnz 1b \n\t" - "emms" - :: "r" (samples+256) - : "%esi" - ); -#else for (i = 0; i < 256; i++) samples[i] = 0; -#endif } -void downmix (sample_t * samples, int acmod, int output, sample_t bias, +static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, sample_t clev, sample_t slev) { switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { @@ -888,7 +611,7 @@ } } -void upmix (sample_t * samples, int acmod, int output) +static void upmix_C (sample_t * samples, int acmod, int output) { switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { @@ -953,3 +676,526 @@ goto mix_31to21; } } + +#ifdef ARCH_X86 +static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias) +{ + asm volatile( + "movlps %2, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps (%0, %%esi), %%xmm0 \n\t" + "movaps 16(%0, %%esi), %%xmm1 \n\t" + "addps (%1, %%esi), %%xmm0 \n\t" + "addps 16(%1, %%esi), %%xmm1 \n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps %%xmm7, %%xmm1 \n\t" + "movaps %%xmm0, (%1, %%esi) \n\t" + "movaps %%xmm1, 16(%1, %%esi) \n\t" + "addl $32, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (src+256), "r" (dest+256), "m" (bias) + : "%esi" + ); +} + +static void mix3to1_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps (%0, %%esi), %%xmm0 \n\t" + "movaps 1024(%0, %%esi), %%xmm1 \n\t" + "addps 2048(%0, %%esi), %%xmm0 \n\t" + "addps %%xmm7, %%xmm1 \n\t" + "addps %%xmm1, %%xmm0 \n\t" + "movaps %%xmm0, (%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%esi" + ); +} + +static void mix4to1_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps (%0, %%esi), %%xmm0 \n\t" + "movaps 1024(%0, %%esi), %%xmm1 \n\t" + "addps 2048(%0, %%esi), %%xmm0 \n\t" + "addps 3072(%0, %%esi), %%xmm1 \n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps %%xmm1, %%xmm0 \n\t" + "movaps %%xmm0, (%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%esi" + ); +} + +static void mix5to1_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps (%0, %%esi), %%xmm0 \n\t" + "movaps 1024(%0, %%esi), %%xmm1 \n\t" + "addps 2048(%0, %%esi), %%xmm0 \n\t" + "addps 3072(%0, %%esi), %%xmm1 \n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps 4096(%0, %%esi), %%xmm1 \n\t" + "addps %%xmm1, %%xmm0 \n\t" + "movaps %%xmm0, (%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%esi" + ); +} + +static void mix3to2_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps 1024(%0, %%esi), %%xmm0 \n\t" + "addps %%xmm7, %%xmm0 \n\t" //common + "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 2048(%0, %%esi), %%xmm2 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%esi) \n\t" + "movaps %%xmm2, 1024(%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%esi" + ); +} + +static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias) +{ + asm volatile( + "movlps %2, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps 1024(%1, %%esi), %%xmm0 \n\t" + "addps %%xmm7, %%xmm0 \n\t" //common + "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps (%1, %%esi), %%xmm2 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%esi) \n\t" + "movaps %%xmm2, (%1, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (left+256), "r" (right+256), "m" (bias) + : "%esi" + ); +} + +static void mix21toS_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps 2048(%0, %%esi), %%xmm0 \n\t" // surround + "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 1024(%0, %%esi), %%xmm2 \n\t" + "addps %%xmm7, %%xmm1 \n\t" + "addps %%xmm7, %%xmm2 \n\t" + "subps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%esi) \n\t" + "movaps %%xmm2, 1024(%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%esi" + ); +} + +static void mix31to2_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps 1024(%0, %%esi), %%xmm0 \n\t" + "addps 3072(%0, %%esi), %%xmm0 \n\t" + "addps %%xmm7, %%xmm0 \n\t" // common + "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 2048(%0, %%esi), %%xmm2 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%esi) \n\t" + "movaps %%xmm2, 1024(%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%esi" + ); +} + +static void mix31toS_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps 1024(%0, %%esi), %%xmm0 \n\t" + "movaps 3072(%0, %%esi), %%xmm3 \n\t" // surround + "addps %%xmm7, %%xmm0 \n\t" // common + "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 2048(%0, %%esi), %%xmm2 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "subps %%xmm3, %%xmm1 \n\t" + "addps %%xmm3, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%esi) \n\t" + "movaps %%xmm2, 1024(%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%esi" + ); +} + +static void mix22toS_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps 2048(%0, %%esi), %%xmm0 \n\t" + "addps 3072(%0, %%esi), %%xmm0 \n\t" // surround + "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 1024(%0, %%esi), %%xmm2 \n\t" + "addps %%xmm7, %%xmm1 \n\t" + "addps %%xmm7, %%xmm2 \n\t" + "subps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "movaps %%xmm1, (%0, %%esi) \n\t" + "movaps %%xmm2, 1024(%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%esi" + ); +} + +static void mix32to2_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps 1024(%0, %%esi), %%xmm0 \n\t" + "addps %%xmm7, %%xmm0 \n\t" // common + "movaps %%xmm0, %%xmm1 \n\t" // common + "addps (%0, %%esi), %%xmm0 \n\t" + "addps 2048(%0, %%esi), %%xmm1 \n\t" + "addps 3072(%0, %%esi), %%xmm0 \n\t" + "addps 4096(%0, %%esi), %%xmm1 \n\t" + "movaps %%xmm0, (%0, %%esi) \n\t" + "movaps %%xmm1, 1024(%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%esi" + ); +} + +static void mix32toS_SSE (sample_t * samples, sample_t bias) +{ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps 1024(%0, %%esi), %%xmm0 \n\t" + "movaps 3072(%0, %%esi), %%xmm2 \n\t" + "addps %%xmm7, %%xmm0 \n\t" // common + "addps 4096(%0, %%esi), %%xmm2 \n\t" // surround + "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 2048(%0, %%esi), %%xmm3 \n\t" + "subps %%xmm2, %%xmm1 \n\t" + "addps %%xmm2, %%xmm3 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm3 \n\t" + "movaps %%xmm1, (%0, %%esi) \n\t" + "movaps %%xmm3, 1024(%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) + : "%esi" + ); +} + +static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias) +{ + asm volatile( + "movlps %2, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" + "movl $-1024, %%esi \n\t" + "1: \n\t" + "movaps (%0, %%esi), %%xmm0 \n\t" + "movaps 16(%0, %%esi), %%xmm1 \n\t" + "addps 1024(%0, %%esi), %%xmm0 \n\t" + "addps 1040(%0, %%esi), %%xmm1 \n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps %%xmm7, %%xmm1 \n\t" + "movaps %%xmm0, (%1, %%esi) \n\t" + "movaps %%xmm1, 16(%1, %%esi) \n\t" + "addl $32, %%esi \n\t" + " jnz 1b \n\t" + :: "r" (src+256), "r" (dest+256), "m" (bias) + : "%esi" + ); +} + +static void zero_MMX(sample_t * samples) +{ + asm volatile( + "movl $-1024, %%esi \n\t" + "pxor %%mm0, %%mm0 \n\t" + "1: \n\t" + "movq %%mm0, (%0, %%esi) \n\t" + "movq %%mm0, 8(%0, %%esi) \n\t" + "movq %%mm0, 16(%0, %%esi) \n\t" + "movq %%mm0, 24(%0, %%esi) \n\t" + "addl $32, %%esi \n\t" + " jnz 1b \n\t" + "emms" + :: "r" (samples+256) + : "%esi" + ); +} + + +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev) +{ + switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { + + case CONVERT (A52_CHANNEL, A52_CHANNEL2): + memcpy (samples, samples + 256, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_CHANNEL, A52_MONO): + case CONVERT (A52_STEREO, A52_MONO): + mix_2to1_SSE: + mix2to1_SSE (samples, samples + 256, bias); + break; + + case CONVERT (A52_2F1R, A52_MONO): + if (slev == 0) + goto mix_2to1_SSE; + case CONVERT (A52_3F, A52_MONO): + mix_3to1_SSE: + mix3to1_SSE (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_MONO): + if (slev == 0) + goto mix_3to1_SSE; + case CONVERT (A52_2F2R, A52_MONO): + if (slev == 0) + goto mix_2to1_SSE; + mix4to1_SSE (samples, bias); + break; + + case CONVERT (A52_3F2R, A52_MONO): + if (slev == 0) + goto mix_3to1_SSE; + mix5to1_SSE (samples, bias); + break; + + case CONVERT (A52_MONO, A52_DOLBY): + memcpy (samples + 256, samples, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F, A52_STEREO): + case CONVERT (A52_3F, A52_DOLBY): + mix_3to2_SSE: + mix3to2_SSE (samples, bias); + break; + + case CONVERT (A52_2F1R, A52_STEREO): + if (slev == 0) + break; + mix21to2_SSE (samples, samples + 256, bias); + break; + + case CONVERT (A52_2F1R, A52_DOLBY): + mix21toS_SSE (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_STEREO): + if (slev == 0) + goto mix_3to2_SSE; + mix31to2_SSE (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_DOLBY): + mix31toS_SSE (samples, bias); + break; + + case CONVERT (A52_2F2R, A52_STEREO): + if (slev == 0) + break; + mix2to1_SSE (samples, samples + 512, bias); + mix2to1_SSE (samples + 256, samples + 768, bias); + break; + + case CONVERT (A52_2F2R, A52_DOLBY): + mix22toS_SSE (samples, bias); + break; + + case CONVERT (A52_3F2R, A52_STEREO): + if (slev == 0) + goto mix_3to2_SSE; + mix32to2_SSE (samples, bias); + break; + + case CONVERT (A52_3F2R, A52_DOLBY): + mix32toS_SSE (samples, bias); + break; + + case CONVERT (A52_3F1R, A52_3F): + if (slev == 0) + break; + mix21to2_SSE (samples, samples + 512, bias); + break; + + case CONVERT (A52_3F2R, A52_3F): + if (slev == 0) + break; + mix2to1_SSE (samples, samples + 768, bias); + mix2to1_SSE (samples + 512, samples + 1024, bias); + break; + + case CONVERT (A52_3F1R, A52_2F1R): + mix3to2_SSE (samples, bias); + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_2F2R, A52_2F1R): + mix2to1_SSE (samples + 512, samples + 768, bias); + break; + + case CONVERT (A52_3F2R, A52_2F1R): + mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesnt seem to be used) + move2to1_SSE (samples + 768, samples + 512, bias); + break; + + case CONVERT (A52_3F2R, A52_3F1R): + mix2to1_SSE (samples + 768, samples + 1024, bias); + break; + + case CONVERT (A52_2F1R, A52_2F2R): + memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F1R, A52_2F2R): + mix3to2_SSE (samples, bias); + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F2R, A52_2F2R): + mix3to2_SSE (samples, bias); + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); + memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F1R, A52_3F2R): + memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t)); + break; + } +} + +static void upmix_MMX (sample_t * samples, int acmod, int output) +{ + switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { + + case CONVERT (A52_CHANNEL, A52_CHANNEL2): + memcpy (samples + 256, samples, 256 * sizeof (sample_t)); + break; + + case CONVERT (A52_3F2R, A52_MONO): + zero_MMX (samples + 1024); + case CONVERT (A52_3F1R, A52_MONO): + case CONVERT (A52_2F2R, A52_MONO): + zero_MMX (samples + 768); + case CONVERT (A52_3F, A52_MONO): + case CONVERT (A52_2F1R, A52_MONO): + zero_MMX (samples + 512); + case CONVERT (A52_CHANNEL, A52_MONO): + case CONVERT (A52_STEREO, A52_MONO): + zero_MMX (samples + 256); + break; + + case CONVERT (A52_3F2R, A52_STEREO): + case CONVERT (A52_3F2R, A52_DOLBY): + zero_MMX (samples + 1024); + case CONVERT (A52_3F1R, A52_STEREO): + case CONVERT (A52_3F1R, A52_DOLBY): + zero_MMX (samples + 768); + case CONVERT (A52_3F, A52_STEREO): + case CONVERT (A52_3F, A52_DOLBY): + mix_3to2_MMX: + memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t)); + zero_MMX (samples + 256); + break; + + case CONVERT (A52_2F2R, A52_STEREO): + case CONVERT (A52_2F2R, A52_DOLBY): + zero_MMX (samples + 768); + case CONVERT (A52_2F1R, A52_STEREO): + case CONVERT (A52_2F1R, A52_DOLBY): + zero_MMX (samples + 512); + break; + + case CONVERT (A52_3F2R, A52_3F): + zero_MMX (samples + 1024); + case CONVERT (A52_3F1R, A52_3F): + case CONVERT (A52_2F2R, A52_2F1R): + zero_MMX (samples + 768); + break; + + case CONVERT (A52_3F2R, A52_3F1R): + zero_MMX (samples + 1024); + break; + + case CONVERT (A52_3F2R, A52_2F1R): + zero_MMX (samples + 1024); + case CONVERT (A52_3F1R, A52_2F1R): + mix_31to21_MMX: + memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); + goto mix_3to2_MMX; + + case CONVERT (A52_3F2R, A52_2F2R): + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); + goto mix_31to21_MMX; + } +} +#endif //ARCH_X86