# HG changeset patch
# User lorenm
# Date 1218670540 0
# Node ID 7cf79395487199b949eaefdd05dac6728a24a745
# Parent 8390efaa0c03d88d75f0ffef1ebe9ea26adb82f8
simd int->float 20% faster ac3 if downmixing, 15% if not

diff -r 8390efaa0c03 -r 7cf793954871 ac3dec.c
--- a/ac3dec.c Wed Aug 13 23:33:48 2008 +0000
+++ b/ac3dec.c Wed Aug 13 23:35:40 2008 +0000
@@ -994,9 +994,7 @@
         } else {
             gain *= s->dynamic_range[0];
         }
-        for(i=0; i<256; i++) {
-            s->transform_coeffs[ch][i] = s->fixed_coeffs[ch][i] * gain;
-        }
+        s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
     }
 
     /* downmix and MDCT. order depends on whether block switching is used for
diff -r 8390efaa0c03 -r 7cf793954871 ac3dec.h
--- a/ac3dec.h Wed Aug 13 23:33:48 2008 +0000
+++ b/ac3dec.h Wed Aug 13 23:35:40 2008 +0000
@@ -158,7 +158,7 @@
     float mul_bias;                         ///< scaling for float_to_int16 conversion
 ///@}
 
-    int fixed_coeffs[AC3_MAX_CHANNELS][AC3_MAX_COEFS];       ///> fixed-point transform coefficients
+    DECLARE_ALIGNED_16(int, fixed_coeffs[AC3_MAX_CHANNELS][AC3_MAX_COEFS]);       ///> fixed-point transform coefficients
 
 ///@defgroup arrays aligned arrays
     DECLARE_ALIGNED_16(float, transform_coeffs[AC3_MAX_CHANNELS][AC3_MAX_COEFS]);   ///< transform coefficients
diff -r 8390efaa0c03 -r 7cf793954871 dsputil.c
--- a/dsputil.c Wed Aug 13 23:33:48 2008 +0000
+++ b/dsputil.c Wed Aug 13 23:35:40 2008 +0000
@@ -3948,6 +3948,12 @@
     }
 }
 
+static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
+    int i;
+    for(i=0; i<len; i++)
+        dst[i] = src[i] * mul;
+}
+
@@ ... @@
     c->vector_fmul_reverse = vector_fmul_reverse_c;
     c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
     c->vector_fmul_window = ff_vector_fmul_window_c;
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
     c->float_to_int16 = ff_float_to_int16_c;
     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
     c->add_int16 = add_int16_c;
diff -r 8390efaa0c03 -r 7cf793954871 dsputil.h
--- a/dsputil.h Wed Aug 13 23:33:48 2008 +0000
+++ b/dsputil.h Wed Aug 13 23:35:40 2008 +0000
@@ -370,6 +370,8 @@
     void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);
     /* assume len is a multiple of 4, and arrays are 16-byte aligned */
     void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
+    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
+    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
     /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
      * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
diff -r 8390efaa0c03 -r 7cf793954871 i386/dsputil_mmx.c
--- a/i386/dsputil_mmx.c Wed Aug 13 23:33:48 2008 +0000
+++ b/i386/dsputil_mmx.c Wed Aug 13 23:35:40 2008 +0000
@@ -2192,6 +2192,50 @@
         ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
 }
 
+static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
+{
+    x86_reg i = -4*len;
+    asm volatile(
+        "movss  %3, %%xmm4          \n"
+        "shufps $0, %%xmm4, %%xmm4  \n"
+        "1:                         \n"
+        "cvtpi2ps   (%2,%0), %%xmm0 \n"
+        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
+        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
+        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
+        "movlhps  %%xmm1,    %%xmm0 \n"
+        "movlhps  %%xmm3,    %%xmm2 \n"
+        "mulps    %%xmm4,    %%xmm0 \n"
+        "mulps    %%xmm4,    %%xmm2 \n"
+        "movaps   %%xmm0,   (%1,%0) \n"
+        "movaps   %%xmm2, 16(%1,%0) \n"
+        "add $32, %0                \n"
+        "jl 1b                      \n"
+        :"+r"(i)
+        :"r"(dst+len), "r"(src+len), "xm"(mul)
+    );
+}
+
+static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
+{
+    x86_reg i = -4*len;
+    asm volatile(
+        "movss  %3, %%xmm4          \n"
+        "shufps $0, %%xmm4, %%xmm4  \n"
+        "1:                         \n"
+        "cvtdq2ps   (%2,%0), %%xmm0 \n"
+        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
+        "mulps    %%xmm4,    %%xmm0 \n"
+        "mulps    %%xmm4,    %%xmm1 \n"
+        "movaps   %%xmm0,   (%1,%0) \n"
+        "movaps   %%xmm1, 16(%1,%0) \n"
+        "add $32, %0                \n"
+        "jl 1b                      \n"
+        :"+r"(i)
+        :"r"(dst+len), "r"(src+len), "xm"(mul)
+    );
+}
+
 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
     // not bit-exact: pf2id uses different rounding than C and SSE
     asm volatile(
@@ -2786,12 +2830,14 @@
             c->vector_fmul_reverse = vector_fmul_reverse_sse;
             c->vector_fmul_add_add = vector_fmul_add_add_sse;
             c->vector_fmul_window = vector_fmul_window_sse;
+            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
             c->float_to_int16 = float_to_int16_sse;
             c->float_to_int16_interleave = float_to_int16_interleave_sse;
         }
         if(mm_flags & MM_3DNOW)
             c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
         if(mm_flags & MM_SSE2){
+            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
            c->float_to_int16 = float_to_int16_sse2;
            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
            c->add_int16 = add_int16_sse2;
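
For reference, the new DSPContext entry point has the semantics of the scalar fallback registered in dsputil.c: each int32 coefficient is converted to float and multiplied by a constant gain, which is what ac3dec.c now calls through s->dsp.int32_to_float_fmul_scalar(). The sketch below shows only that reference behavior as a standalone C program; the _ref helper name, the gain value, and the test data are made up for illustration, and the SIMD versions in the patch additionally require len to be a multiple of 8 and 16-byte-aligned buffers.

#include <stdio.h>

/* Scalar reference for int32_to_float_fmul_scalar(): convert each int32
 * coefficient to float and multiply by a constant gain.  This mirrors the
 * C fallback in dsputil.c; the SSE/SSE2 versions assume len is a multiple
 * of 8 and that dst/src are 16-byte aligned. */
static void int32_to_float_fmul_scalar_ref(float *dst, const int *src,
                                           float mul, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

int main(void)
{
    /* 256 coefficients, as in the ac3dec.c call site; the input values and
     * the gain below are illustrative only. */
    static int   fixed_coeffs[256];
    static float transform_coeffs[256];
    float gain = 1.0f / (1 << 24);
    int i;

    for (i = 0; i < 256; i++)
        fixed_coeffs[i] = (i - 128) * 65536;

    int32_to_float_fmul_scalar_ref(transform_coeffs, fixed_coeffs, gain, 256);

    printf("%f %f\n", transform_coeffs[0], transform_coeffs[255]);
    return 0;
}

The DECLARE_ALIGNED_16 change in ac3dec.h gives the integer input the 16-byte alignment that the SSE2 version's cvtdq2ps memory operands and the aligned movaps stores rely on.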