# HG changeset patch # User lorenm # Date 1215960961 0 # Node ID 032a49f033e8e89a0760a45ee0c13e098e72d05a # Parent 3ec34b551aae611441a709d0cc7d8127920a06c4 simplify vorbis windowing diff -r 3ec34b551aae -r 032a49f033e8 dsputil.c --- a/dsputil.c Sun Jul 13 14:27:48 2008 +0000 +++ b/dsputil.c Sun Jul 13 14:56:01 2008 +0000 @@ -3930,17 +3930,40 @@ dst[i*step] = src0[i] * src1[i] + src2[i] + src3; } +void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){ + int i; + for(i=0; i>31; + // is this faster on some gcc/cpu combinations? +// if(tmp > 0x43c0ffff) tmp = 0xFFFF; +// else tmp = 0; + } + return tmp - 0x8000; +} + void ff_float_to_int16_c(int16_t *dst, const float *src, long len){ int i; - for(i=0; i>31; - // is this faster on some gcc/cpu combinations? -// if(tmp > 0x43c0ffff) tmp = 0xFFFF; -// else tmp = 0; + for(i=0; ivector_fmul = vector_fmul_c; c->vector_fmul_reverse = vector_fmul_reverse_c; c->vector_fmul_add_add = ff_vector_fmul_add_add_c; + c->vector_fmul_window = ff_vector_fmul_window_c; c->float_to_int16 = ff_float_to_int16_c; + c->float_to_int16_interleave = ff_float_to_int16_interleave_c; c->add_int16 = add_int16_c; c->sub_int16 = sub_int16_c; c->scalarproduct_int16 = scalarproduct_int16_c; diff -r 3ec34b551aae -r 032a49f033e8 dsputil.h --- a/dsputil.h Sun Jul 13 14:27:48 2008 +0000 +++ b/dsputil.h Sun Jul 13 14:56:01 2008 +0000 @@ -63,6 +63,8 @@ void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int blocksize, int step); +void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, + const float *win, float add_bias, int len); void ff_float_to_int16_c(int16_t *dst, const float *src, long len); /* encoding scans */ @@ -364,10 +366,13 @@ void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len); /* assume len is a multiple of 8, and src arrays are 16-byte aligned */ void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step); + /* assume len is a multiple of 4, and arrays are 16-byte aligned */ + void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len); /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767] * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */ void (*float_to_int16)(int16_t *dst, const float *src, long len); + void (*float_to_int16_interleave)(int16_t *dst, const float *src, long len, int channels); /* (I)DCT */ void (*fdct)(DCTELEM *block/* align 16*/); diff -r 3ec34b551aae -r 032a49f033e8 i386/dsputil_mmx.c --- a/i386/dsputil_mmx.c Sun Jul 13 14:27:48 2008 +0000 +++ b/i386/dsputil_mmx.c Sun Jul 13 14:56:01 2008 +0000 @@ -2022,6 +2022,39 @@ ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); } +static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, + const float *win, float add_bias, int len){ +#ifdef HAVE_6REGS + if(add_bias == 0){ + x86_reg i = -len*2; + x86_reg j = len*2-16; + asm volatile( + "1: \n" + "movaps (%5,%0), %%xmm0 \n" + "movaps (%5,%1), %%xmm1 \n" + "movaps %%xmm0, %%xmm2 \n" + "movaps %%xmm1, %%xmm3 \n" + "shufps $0x1b, %%xmm2, %%xmm2 \n" + "shufps $0x1b, %%xmm3, %%xmm3 \n" + "mulps (%4,%0), %%xmm0 \n" + "mulps (%4,%1), %%xmm1 \n" + "mulps (%3,%0), %%xmm3 \n" + "mulps (%3,%1), %%xmm2 \n" + "addps %%xmm3, %%xmm0 \n" + "addps %%xmm2, %%xmm1 \n" + "movaps %%xmm0, (%2,%0) \n" + "movaps %%xmm1, (%2,%1) \n" + "sub $16, %1 \n" + "add $16, %0 \n" + "jl 1b \n" + :"+r"(i), "+r"(j) + :"r"(dst+len/2), "r"(src0+len/2), "r"(src1+len/2), "r"(win+len/2) + ); + }else +#endif + ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); +} + static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ // not bit-exact: pf2id uses different rounding than C and SSE asm volatile( @@ -2083,6 +2116,87 @@ ); } +#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ +/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ +static av_noinline void float_to_int16_interleave2_##cpu(int16_t *dst, const float *src, long len, int channels){\ + DECLARE_ALIGNED_16(int16_t, tmp[len*channels]);\ + int i,j,c;\ + float_to_int16_##cpu(tmp, src, len*channels);\ + for(c=0; c2)\ + float_to_int16_interleave2_##cpu(dst, src, len, channels);\ + else{\ + float *src1;\ + asm volatile(\ + "shl $2, %0 \n"\ + "add %0, %1 \n"\ + "add %0, %2 \n"\ + "lea (%2,%0), %3 \n"\ + "neg %0 \n"\ + body\ + :"+r"(len), "+r"(dst), "+r"(src), "=r"(src1)\ + );\ + }\ +} + +FLOAT_TO_INT16_INTERLEAVE(3dnow, + "1: \n" + "pf2id (%2,%0), %%mm0 \n" + "pf2id 8(%2,%0), %%mm1 \n" + "pf2id (%3,%0), %%mm2 \n" + "pf2id 8(%3,%0), %%mm3 \n" + "packssdw %%mm1, %%mm0 \n" + "packssdw %%mm3, %%mm2 \n" + "movq %%mm0, %%mm1 \n" + "punpcklwd %%mm2, %%mm0 \n" + "punpckhwd %%mm2, %%mm1 \n" + "movq %%mm0, (%1,%0)\n" + "movq %%mm0, 8(%1,%0)\n" + "add $16, %0 \n" + "js 1b \n" + "femms \n" +) + +FLOAT_TO_INT16_INTERLEAVE(sse, + "1: \n" + "cvtps2pi (%2,%0), %%mm0 \n" + "cvtps2pi 8(%2,%0), %%mm1 \n" + "cvtps2pi (%3,%0), %%mm2 \n" + "cvtps2pi 8(%3,%0), %%mm3 \n" + "packssdw %%mm1, %%mm0 \n" + "packssdw %%mm3, %%mm2 \n" + "movq %%mm0, %%mm1 \n" + "punpcklwd %%mm2, %%mm0 \n" + "punpckhwd %%mm2, %%mm1 \n" + "movq %%mm0, (%1,%0)\n" + "movq %%mm0, 8(%1,%0)\n" + "add $16, %0 \n" + "js 1b \n" + "emms \n" +) + +FLOAT_TO_INT16_INTERLEAVE(sse2, + "1: \n" + "cvtps2dq (%2,%0), %%xmm0 \n" + "cvtps2dq (%3,%0), %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "movhlps %%xmm0, %%xmm1 \n" + "punpcklwd %%xmm1, %%xmm0 \n" + "movdqa %%xmm0, (%1,%0) \n" + "add $16, %0 \n" + "js 1b \n" +) + + extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width); extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width); extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); @@ -2519,8 +2633,10 @@ if(mm_flags & MM_3DNOW){ c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; c->vector_fmul = vector_fmul_3dnow; - if(!(avctx->flags & CODEC_FLAG_BITEXACT)) + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->float_to_int16 = float_to_int16_3dnow; + c->float_to_int16_interleave = float_to_int16_interleave_3dnow; + } } if(mm_flags & MM_3DNOWEXT) c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; @@ -2528,11 +2644,14 @@ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; c->vector_fmul = vector_fmul_sse; c->float_to_int16 = float_to_int16_sse; + c->float_to_int16_interleave = float_to_int16_interleave_sse; c->vector_fmul_reverse = vector_fmul_reverse_sse; c->vector_fmul_add_add = vector_fmul_add_add_sse; + c->vector_fmul_window = vector_fmul_window_sse; } if(mm_flags & MM_SSE2){ c->float_to_int16 = float_to_int16_sse2; + c->float_to_int16_interleave = float_to_int16_interleave_sse2; } if(mm_flags & MM_3DNOW) c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse diff -r 3ec34b551aae -r 032a49f033e8 vorbis_dec.c --- a/vorbis_dec.c Sun Jul 13 14:27:48 2008 +0000 +++ b/vorbis_dec.c Sun Jul 13 14:56:01 2008 +0000 @@ -149,10 +149,10 @@ uint_fast8_t mode_count; vorbis_mode *modes; uint_fast8_t mode_number; // mode number for the current packet + uint_fast8_t previous_window; float *channel_residues; float *channel_floors; float *saved; - uint_fast16_t saved_start; float *ret; float *buf; float *buf_tmp; @@ -903,7 +903,7 @@ vc->ret = av_malloc((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float)); vc->buf = av_malloc( vc->blocksize[1] * sizeof(float)); vc->buf_tmp = av_malloc( vc->blocksize[1] * sizeof(float)); - vc->saved_start=0; + vc->previous_window=0; ff_mdct_init(&vc->mdct[0], bl0, 1); ff_mdct_init(&vc->mdct[1], bl1, 1); @@ -1394,13 +1394,26 @@ } } +static void copy_normalize(float *dst, float *src, int len, int exp_bias, float add_bias) +{ + int i; + if(exp_bias) { + for(i=0; igb; - uint_fast8_t previous_window=0,next_window=0; + uint_fast8_t previous_window=vc->previous_window; uint_fast8_t mode_number; + uint_fast8_t blockflag; uint_fast16_t blocksize; int_fast32_t i,j; uint_fast8_t no_residue[vc->audio_channels]; @@ -1411,7 +1424,6 @@ uint_fast8_t res_chan[vc->audio_channels]; uint_fast8_t res_num=0; int_fast16_t retlen=0; - uint_fast16_t saved_start=0; float fadd_bias = vc->add_bias; if (get_bits1(gb)) { @@ -1429,12 +1441,12 @@ AV_DEBUG(" Mode number: %d , mapping: %d , blocktype %d \n", mode_number, vc->modes[mode_number].mapping, vc->modes[mode_number].blockflag); - if (vc->modes[mode_number].blockflag) { - previous_window=get_bits1(gb); - next_window=get_bits1(gb); + blockflag=vc->modes[mode_number].blockflag; + blocksize=vc->blocksize[blockflag]; + if (blockflag) { + skip_bits(gb, 2); // previous_window, next_window } - blocksize=vc->blocksize[vc->modes[mode_number].blockflag]; memset(ch_res_ptr, 0, sizeof(float)*vc->audio_channels*blocksize/2); //FIXME can this be removed ? memset(ch_floor_ptr, 0, sizeof(float)*vc->audio_channels*blocksize/2); //FIXME can this be removed ? @@ -1504,76 +1516,31 @@ // MDCT, overlap/add, save data for next overlapping FPMATH + retlen = (blocksize + vc->blocksize[previous_window])/4; for(j=0;jaudio_channels;++j) { - uint_fast8_t step=vc->audio_channels; - uint_fast16_t k; - float *saved=vc->saved+j*vc->blocksize[1]/2; - float *ret=vc->ret; - const float *lwin=vc->win[1]; - const float *swin=vc->win[0]; + uint_fast16_t bs0=vc->blocksize[0]; + uint_fast16_t bs1=vc->blocksize[1]; + float *saved=vc->saved+j*bs1/2; + float *ret=vc->ret+j*retlen; float *buf=vc->buf; - float *buf_tmp=vc->buf_tmp; - - ch_floor_ptr=vc->channel_floors+j*blocksize/2; + const float *win=vc->win[blockflag&previous_window]; - saved_start=vc->saved_start; - - vc->mdct[0].fft.imdct_calc(&vc->mdct[vc->modes[mode_number].blockflag], buf, ch_floor_ptr, buf_tmp); + vc->mdct[0].fft.imdct_calc(&vc->mdct[blockflag], buf, vc->channel_floors+j*blocksize/2, vc->buf_tmp); - //FIXME process channels together, to allow faster simd vector_fmul_add_add? - if (vc->modes[mode_number].blockflag) { - // -- overlap/add - if (previous_window) { - vc->dsp.vector_fmul_add_add(ret+j, buf, lwin, saved, vc->add_bias, vc->blocksize[1]/2, step); - retlen=vc->blocksize[1]/2; - } else { - int len = (vc->blocksize[1]-vc->blocksize[0])/4; - buf += len; - vc->dsp.vector_fmul_add_add(ret+j, buf, swin, saved, vc->add_bias, vc->blocksize[0]/2, step); - k = vc->blocksize[0]/2*step + j; - buf += vc->blocksize[0]/2; - if(vc->exp_bias){ - for(i=0; iexp_bias; // ret[k]=buf[i]*(1<buf; - retlen=vc->blocksize[0]/2+len; - } - // -- save - if (next_window) { - buf += vc->blocksize[1]/2; - vc->dsp.vector_fmul_reverse(saved, buf, lwin, vc->blocksize[1]/2); - saved_start=0; - } else { - saved_start=(vc->blocksize[1]-vc->blocksize[0])/4; - buf += vc->blocksize[1]/2; - for(i=0; iexp_bias; - vc->dsp.vector_fmul_reverse(saved+saved_start, buf+saved_start, swin, vc->blocksize[0]/2); - } + if(blockflag == previous_window) { + vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, blocksize/2); + } else if(blockflag > previous_window) { + vc->dsp.vector_fmul_window(ret, saved, buf+(bs1-bs0)/4, win, fadd_bias, bs0/2); + copy_normalize(ret+bs0/2, buf+(bs1+bs0)/4, (bs1-bs0)/4, vc->exp_bias, fadd_bias); } else { - // --overlap/add - if(vc->add_bias) { - for(k=j, i=0;idsp.vector_fmul_add_add(ret+k, buf, swin, saved+saved_start, vc->add_bias, vc->blocksize[0]/2, step); - retlen=saved_start+vc->blocksize[0]/2; - // -- save - buf += vc->blocksize[0]/2; - vc->dsp.vector_fmul_reverse(saved, buf, swin, vc->blocksize[0]/2); - saved_start=0; + copy_normalize(ret, saved, (bs1-bs0)/4, vc->exp_bias, fadd_bias); + vc->dsp.vector_fmul_window(ret+(bs1-bs0)/4, saved+(bs1-bs0)/4, buf, win, fadd_bias, bs0/2); } + memcpy(saved, buf+blocksize/2, blocksize/2*sizeof(float)); } - vc->saved_start=saved_start; - return retlen*vc->audio_channels; + vc->previous_window = blockflag; + return retlen; } // Return the decoded audio packet through the standard api @@ -1610,8 +1577,8 @@ AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len); - vc->dsp.float_to_int16(data, vc->ret, len); - *data_size=len*2; + vc->dsp.float_to_int16_interleave(data, vc->ret, len, vc->audio_channels); + *data_size=len*2*vc->audio_channels; return buf_size ; }