Mercurial > libavcodec.hg
changeset 3568:945caa35ee9a libavcodec
sse and 3dnow implementations of float->int conversion and mdct windowing.
15% faster vorbis.
author | lorenm |
---|---|
date | Thu, 10 Aug 2006 19:06:25 +0000 |
parents | 1f8730f62765 |
children | c42c03f3b402 |
files | dsputil.c dsputil.h i386/dsputil_mmx.c vorbis.c vorbis.h |
diffstat | 5 files changed, 317 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- a/dsputil.c Thu Aug 10 18:49:47 2006 +0000 +++ b/dsputil.c Thu Aug 10 19:06:25 2006 +0000 @@ -3753,6 +3753,39 @@ WARPER8_16_SQ(rd8x8_c, rd16_c) WARPER8_16_SQ(bit8x8_c, bit16_c) +static void vector_fmul_c(float *dst, const float *src, int len){ + int i; + for(i=0; i<len; i++) + dst[i] *= src[i]; +} + +static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){ + int i; + src1 += len-1; + for(i=0; i<len; i++) + dst[i] = src0[i] * src1[-i]; +} + +void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){ + int i; + for(i=0; i<len; i++) + dst[i*step] = src0[i] * src1[i] + src2[i] + src3; +} + +void ff_float_to_int16_c(int16_t *dst, const float *src, int len){ + int i; + for(i=0; i<len; i++) { + int_fast32_t tmp = ((int32_t*)src)[i]; + if(tmp & 0xf0000){ + tmp = (0x43c0ffff - tmp)>>31; + // is this faster on some gcc/cpu combinations? +// if(tmp > 0x43c0ffff) tmp = 0xFFFF; +// else tmp = 0; + } + dst[i] = tmp - 0x8000; + } +} + /* XXX: those functions should be suppressed ASAP when all IDCTs are converted */ static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block) @@ -4096,6 +4129,10 @@ #ifdef CONFIG_VORBIS_DECODER c->vorbis_inverse_coupling = vorbis_inverse_coupling; #endif + c->vector_fmul = vector_fmul_c; + c->vector_fmul_reverse = vector_fmul_reverse_c; + c->vector_fmul_add_add = ff_vector_fmul_add_add_c; + c->float_to_int16 = ff_float_to_int16_c; c->shrink[0]= ff_img_copy_plane; c->shrink[1]= ff_shrink22;
--- a/dsputil.h Thu Aug 10 18:49:47 2006 +0000 +++ b/dsputil.h Thu Aug 10 19:06:25 2006 +0000 @@ -58,6 +58,10 @@ void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); +void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, + const float *src2, int src3, int blocksize, int step); +void ff_float_to_int16_c(int16_t *dst, const float *src, int len); + /* encoding scans */ extern const uint8_t ff_alternate_horizontal_scan[64]; extern const uint8_t ff_alternate_vertical_scan[64]; @@ -307,7 +311,17 @@ void (*h261_loop_filter)(uint8_t *src, int stride); + /* assume len is a multiple of 4, and arrays are 16-byte aligned */ void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize); + void (*vector_fmul)(float *dst, const float *src, int len); + /* assume len is a multiple of 8, and arrays are 16-byte aligned */ + void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len); + /* assume len is a multiple of 8, and src arrays are 16-byte aligned */ + void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step); + + /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767] + * asm versions: convert floats from [-32768.0,32767.0] without rescaling */ + void (*float_to_int16)(int16_t *dst, const float *src, int len); /* (I)DCT */ void (*fdct)(DCTELEM *block/* align 16*/);
--- a/i386/dsputil_mmx.c Thu Aug 10 18:49:47 2006 +0000 +++ b/i386/dsputil_mmx.c Thu Aug 10 19:06:25 2006 +0000 @@ -2772,6 +2772,198 @@ } } +static void vector_fmul_3dnow(float *dst, const float *src, int len){ + long i; + len >>= 1; + for(i=0; i<len; i++) { + asm volatile( + "movq %0, %%mm0 \n\t" + "pfmul %1, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + :"+m"(dst[i*2]) + :"m"(src[i*2]) + :"memory" + ); + } + asm volatile("femms"); +} +static void vector_fmul_sse(float *dst, const float *src, int len){ + long i; + len >>= 2; + for(i=0; i<len; i++) { + asm volatile( + "movaps %0, %%xmm0 \n\t" + "mulps %1, %%xmm0 \n\t" + "movaps %%xmm0, %0 \n\t" + :"+m"(dst[i*4]) + :"m"(src[i*4]) + :"memory" + ); + } +} + +static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){ + long i = len*4-16; + asm volatile( + "1: \n\t" + "pswapd 8(%1), %%mm0 \n\t" + "pswapd (%1), %%mm1 \n\t" + "pfmul (%3,%0), %%mm0 \n\t" + "pfmul 8(%3,%0), %%mm1 \n\t" + "movq %%mm0, (%2,%0) \n\t" + "movq %%mm1, 8(%2,%0) \n\t" + "add $16, %1 \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(src1) + :"r"(dst), "r"(src0) + ); + asm volatile("femms"); +} +static void vector_fmul_reverse_sse2(float *dst, const float *src0, const float *src1, int len){ + long i = len*4-32; + asm volatile( + "1: \n\t" + "pshufd $0x1b, 16(%1), %%xmm0 \n\t" + "pshufd $0x1b, (%1), %%xmm1 \n\t" + "mulps (%3,%0), %%xmm0 \n\t" + "mulps 16(%3,%0), %%xmm1 \n\t" + "movaps %%xmm0, (%2,%0) \n\t" + "movaps %%xmm1, 16(%2,%0) \n\t" + "add $32, %1 \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(src1) + :"r"(dst), "r"(src0) + ); +} + +static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1, + const float *src2, int src3, int len, int step){ + long i; + if(step == 2 && src3 == 0){ + i = (len-4)*4; + dst += (len-4)*2; + asm volatile( + "1: \n\t" + "movq (%2,%0), %%mm0 \n\t" + "movq 8(%2,%0), %%mm1 \n\t" + "pfmul (%3,%0), %%mm0 \n\t" + "pfmul 8(%3,%0), %%mm1 \n\t" + "pfadd (%4,%0), %%mm0 \n\t" + "pfadd 8(%4,%0), %%mm1 \n\t" + "movd %%mm0, (%1) \n\t" + "movd %%mm1, 16(%1) \n\t" + "psrlq $32, %%mm0 \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm0, 8(%1) \n\t" + "movd %%mm1, 24(%1) \n\t" + "sub $32, %1 \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(dst) + :"r"(src0), "r"(src1), "r"(src2) + :"memory" + ); + } + else if(step == 1 && src3 == 0){ + for(i=0; i<len; i+=2){ + asm volatile( + "movq %1, %%mm0 \n\t" + "pfmul %2, %%mm0 \n\t" + "pfadd %3, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + :"=m"(dst[i]) + :"m"(src0[i]), "m"(src1[i]), "m"(src2[i]) + ); + } + } + else + ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); + asm volatile("femms"); +} +static void vector_fmul_add_add_sse2(float *dst, const float *src0, const float *src1, + const float *src2, float src3, int len, int step){ + long i; + if(step == 2 && src3 == 0){ + i = (len-8)*4; + dst += (len-8)*2; + asm volatile( + "1: \n\t" + "movaps (%2,%0), %%xmm0 \n\t" + "movaps 16(%2,%0), %%xmm1 \n\t" + "mulps (%3,%0), %%xmm0 \n\t" + "mulps 16(%3,%0), %%xmm1 \n\t" + "addps (%4,%0), %%xmm0 \n\t" + "addps 16(%4,%0), %%xmm1 \n\t" + "movd %%xmm0, (%1) \n\t" + "movd %%xmm1, 32(%1) \n\t" + "psrldq $4, %%xmm0 \n\t" + "psrldq $4, %%xmm1 \n\t" + "movd %%xmm0, 8(%1) \n\t" + "movd %%xmm1, 40(%1) \n\t" + "psrldq $4, %%xmm0 \n\t" + "psrldq $4, %%xmm1 \n\t" + "movd %%xmm0, 16(%1) \n\t" + "movd %%xmm1, 48(%1) \n\t" + "psrldq $4, %%xmm0 \n\t" + "psrldq $4, %%xmm1 \n\t" + "movd %%xmm0, 24(%1) \n\t" + "movd %%xmm1, 56(%1) \n\t" + "sub $64, %1 \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(dst) + :"r"(src0), "r"(src1), "r"(src2) + :"memory" + ); + } + else if(step == 1 && src3 == 0){ + for(i=0; i<len; i+=4){ + asm volatile( + "movaps %1, %%xmm0 \n\t" + "mulps %2, %%xmm0 \n\t" + "addps %3, %%xmm0 \n\t" + "movaps %%xmm0, %0 \n\t" + :"=m"(dst[i]) + :"m"(src0[i]), "m"(src1[i]), "m"(src2[i]) + ); + } + } + else + ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); +} + +void float_to_int16_3dnow(int16_t *dst, const float *src, int len){ + // not bit-exact: pf2id uses different rounding than C and SSE + int i; + for(i=0; i<len; i+=4) { + asm volatile( + "pf2id %1, %%mm0 \n\t" + "pf2id %2, %%mm1 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + :"=m"(dst[i]) + :"m"(src[i]), "m"(src[i+2]) + ); + } + asm volatile("femms"); +} +void float_to_int16_sse(int16_t *dst, const float *src, int len){ + int i; + for(i=0; i<len; i+=4) { + asm volatile( + "cvtps2pi %1, %%mm0 \n\t" + "cvtps2pi %2, %%mm1 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + :"=m"(dst[i]) + :"m"(src[i]), "m"(src[i+2]) + ); + } + asm volatile("emms"); +} + #ifdef CONFIG_SNOW_ENCODER extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width); extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width); @@ -3199,10 +3391,25 @@ } #endif - if(mm_flags & MM_SSE) + if(mm_flags & MM_3DNOW){ + c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; + c->vector_fmul = vector_fmul_3dnow; + if(!(avctx->flags & CODEC_FLAG_BITEXACT)) + c->float_to_int16 = float_to_int16_3dnow; + } + if(mm_flags & MM_3DNOWEXT) + c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; + if(mm_flags & MM_SSE){ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; - else if(mm_flags & MM_3DNOW) - c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; + c->vector_fmul = vector_fmul_sse; + c->float_to_int16 = float_to_int16_sse; + } + if(mm_flags & MM_SSE2){ + c->vector_fmul_reverse = vector_fmul_reverse_sse2; + c->vector_fmul_add_add = vector_fmul_add_add_sse2; + } + if(mm_flags & MM_3DNOW) + c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse2 } #ifdef CONFIG_ENCODERS
--- a/vorbis.c Thu Aug 10 18:49:47 2006 +0000 +++ b/vorbis.c Thu Aug 10 19:06:25 2006 +0000 @@ -192,6 +192,11 @@ av_free(vc->mappings[i].mux); } av_freep(&vc->mappings); + + if(vc->exp_bias){ + av_freep(&vc->swin); + av_freep(&vc->lwin); + } } // Parse setup header ------------------------------------------------- @@ -888,6 +893,19 @@ vc->swin=vwin[bl0-6]; vc->lwin=vwin[bl1-6]; + if(vc->exp_bias){ + int i; + float *win; + win = av_malloc(vc->blocksize_0/2 * sizeof(float)); + for(i=0; i<vc->blocksize_0/2; i++) + win[i] = vc->swin[i] * (1<<15); + vc->swin = win; + win = av_malloc(vc->blocksize_1/2 * sizeof(float)); + for(i=0; i<vc->blocksize_1/2; i++) + win[i] = vc->lwin[i] * (1<<15); + vc->lwin = win; + } + if ((get_bits1(gb)) == 0) { av_log(vc->avccontext, AV_LOG_ERROR, " Vorbis id header packet corrupt (framing flag not set). \n"); return 2; @@ -931,6 +949,14 @@ vc->avccontext = avccontext; dsputil_init(&vc->dsp, avccontext); + if(vc->dsp.float_to_int16 == ff_float_to_int16_c) { + vc->add_bias = 385; + vc->exp_bias = 0; + } else { + vc->add_bias = 0; + vc->exp_bias = 15<<23; + } + if (!headers_len) { av_log(avccontext, AV_LOG_ERROR, "Extradata corrupt.\n"); return -1; @@ -1472,7 +1498,6 @@ } // Decode the audio packet using the functions above -#define BIAS 385 static int vorbis_parse_audio_packet(vorbis_context *vc) { GetBitContext *gb=&vc->gb; @@ -1490,6 +1515,7 @@ uint_fast8_t res_num=0; int_fast16_t retlen=0; uint_fast16_t saved_start=0; + float fadd_bias = vc->add_bias; if (get_bits1(gb)) { av_log(vc->avccontext, AV_LOG_ERROR, "Not a Vorbis I audio packet.\n"); @@ -1576,10 +1602,7 @@ for(j=0, ch_floor_ptr=vc->channel_floors;j<vc->audio_channels;++j,ch_floor_ptr+=blocksize/2) { ch_res_ptr=vc->channel_residues+res_chan[j]*blocksize/2; - - for(i=0;i<blocksize/2;++i) { - ch_floor_ptr[i]*=ch_res_ptr[i]; //FPMATH - } + vc->dsp.vector_fmul(ch_floor_ptr, ch_res_ptr, blocksize/2); } // MDCT, overlap/add, save data for next overlapping FPMATH @@ -1600,59 +1623,54 @@ vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp); + //FIXME process channels together, to allow faster simd vector_fmul_add_add? if (vc->modes[mode_number].blockflag) { // -- overlap/add if (previous_window) { - for(k=j, i=0;i<vc->blocksize_1/2;++i, k+=step) { - ret[k]=saved[i]+buf[i]*lwin[i]+BIAS; - } + vc->dsp.vector_fmul_add_add(ret+j, buf, lwin, saved, vc->add_bias, vc->blocksize_1/2, step); retlen=vc->blocksize_1/2; } else { - buf += (vc->blocksize_1-vc->blocksize_0)/4; - for(k=j, i=0;i<vc->blocksize_0/2;++i, k+=step) { - ret[k]=saved[i]+buf[i]*swin[i]+BIAS; - } + int len = (vc->blocksize_1-vc->blocksize_0)/4; + buf += len; + vc->dsp.vector_fmul_add_add(ret+j, buf, swin, saved, vc->add_bias, vc->blocksize_0/2, step); + k = vc->blocksize_0/2*step + j; buf += vc->blocksize_0/2; - for(i=0;i<(vc->blocksize_1-vc->blocksize_0)/4;++i, k+=step) { - ret[k]=buf[i]+BIAS; + if(vc->exp_bias){ + for(i=0; i<len; i++, k+=step) + ((uint32_t*)ret)[k] = ((uint32_t*)buf)[i] + vc->exp_bias; // ret[k]=buf[i]*(1<<bias) + } else { + for(i=0; i<len; i++, k+=step) + ret[k] = buf[i] + fadd_bias; } buf=vc->buf; - retlen=vc->blocksize_0/2+(vc->blocksize_1-vc->blocksize_0)/4; + retlen=vc->blocksize_0/2+len; } // -- save if (next_window) { buf += vc->blocksize_1/2; - lwin += vc->blocksize_1/2-1; - for(i=0;i<vc->blocksize_1/2;++i) { - saved[i]=buf[i]*lwin[-i]; - } + vc->dsp.vector_fmul_reverse(saved, buf, lwin, vc->blocksize_1/2); saved_start=0; } else { saved_start=(vc->blocksize_1-vc->blocksize_0)/4; buf += vc->blocksize_1/2; - for(i=0;i<saved_start;++i) { - saved[i]=buf[i]; - } - swin += vc->blocksize_0/2-1; - for(i=0;i<vc->blocksize_0/2;++i) { - saved[saved_start+i]=buf[saved_start+i]*swin[-i]; - } + for(i=0; i<saved_start; i++) + ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + vc->exp_bias; + vc->dsp.vector_fmul_reverse(saved+saved_start, buf+saved_start, swin, vc->blocksize_0/2); } } else { // --overlap/add - for(k=j, i=0;i<saved_start;++i, k+=step) { - ret[k]=saved[i]+BIAS; + if(vc->add_bias) { + for(k=j, i=0;i<saved_start;++i, k+=step) + ret[k] = saved[i] + fadd_bias; + } else { + for(k=j, i=0;i<saved_start;++i, k+=step) + ret[k] = saved[i]; } - for(i=0;i<vc->blocksize_0/2;++i, k+=step) { - ret[k]=saved[saved_start+i]+buf[i]*swin[i]+BIAS; - } + vc->dsp.vector_fmul_add_add(ret+k, buf, swin, saved+saved_start, vc->add_bias, vc->blocksize_0/2, step); retlen=saved_start+vc->blocksize_0/2; // -- save buf += vc->blocksize_0/2; - swin += vc->blocksize_0/2-1; - for(i=0;i<vc->blocksize_0/2;++i) { - saved[i]=buf[i]*swin[-i]; - } + vc->dsp.vector_fmul_reverse(saved, buf, swin, vc->blocksize_0/2); saved_start=0; } } @@ -1695,16 +1713,7 @@ AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len); - for(i=0;i<len;++i) { - int_fast32_t tmp= ((int32_t*)vc->ret)[i]; - if(tmp & 0xf0000){ -// tmp= (0x43c0ffff - tmp)>>31; //ask gcc devs why this is slower - if(tmp > 0x43c0ffff) tmp= 0xFFFF; - else tmp= 0; - } - ((int16_t*)data)[i]=tmp - 0x8000; - } - + vc->dsp.float_to_int16(data, vc->ret, len); *data_size=len*2; return buf_size ;