Mercurial > libavcodec.hg
diff i386/dsputil_mmx.c @ 7261:032a49f033e8 libavcodec
simplify vorbis windowing
author | lorenm |
---|---|
date | Sun, 13 Jul 2008 14:56:01 +0000 |
parents | 08cc6e202aa6 |
children | fc843d00867c |
line wrap: on
line diff
--- a/i386/dsputil_mmx.c	Sun Jul 13 14:27:48 2008 +0000
+++ b/i386/dsputil_mmx.c	Sun Jul 13 14:56:01 2008 +0000
@@ -2022,6 +2022,54 @@
     ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
 }
 
+/**
+ * SSE implementation of vector_fmul_window for add_bias == 0.
+ *
+ * Each iteration handles one group of 4 floats at byte offset i (counting
+ * up) and one at byte offset j (counting down), both relative to the
+ * pointers advanced by len/2 elements:
+ *   dst[i] = win[i]*src1[i] + reverse(win[j])*src0[i]
+ *   dst[j] = win[j]*src1[j] + reverse(win[i])*src0[j]
+ * A non-zero add_bias, or a build without HAVE_6REGS, falls back to
+ * ff_vector_fmul_window_c().
+ *
+ * NOTE(review): every access is movaps, so the buffers presumably have to
+ * be 16-byte aligned and len a multiple of 8 -- confirm with the callers.
+ */
+static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
+                                   const float *win, float add_bias, int len){
+#ifdef HAVE_6REGS
+    if(add_bias == 0){
+        x86_reg i = -len*2;
+        x86_reg j = len*2-16;
+        asm volatile(
+            "1:                             \n"
+            "movaps       (%5,%0), %%xmm0   \n" /* xmm0 = win[i] */
+            "movaps       (%5,%1), %%xmm1   \n" /* xmm1 = win[j] */
+            "movaps        %%xmm0, %%xmm2   \n"
+            "movaps        %%xmm1, %%xmm3   \n"
+            "shufps $0x1b, %%xmm2, %%xmm2   \n" /* xmm2 = reverse(win[i]) */
+            "shufps $0x1b, %%xmm3, %%xmm3   \n" /* xmm3 = reverse(win[j]) */
+            "mulps        (%4,%0), %%xmm0   \n" /* *= src1[i] */
+            "mulps        (%4,%1), %%xmm1   \n" /* *= src1[j] */
+            "mulps        (%3,%0), %%xmm3   \n" /* *= src0[i] */
+            "mulps        (%3,%1), %%xmm2   \n" /* *= src0[j] */
+            "addps         %%xmm3, %%xmm0   \n"
+            "addps         %%xmm2, %%xmm1   \n"
+            "movaps        %%xmm0, (%2,%0)  \n" /* dst[i] */
+            "movaps        %%xmm1, (%2,%1)  \n" /* dst[j] */
+            "sub $16, %1                    \n"
+            "add $16, %0                    \n"
+            "jl 1b                          \n"
+            :"+r"(i), "+r"(j)
+            :"r"(dst+len/2), "r"(src0+len/2), "r"(src1+len/2), "r"(win+len/2)
+            :"memory" /* dst is written through a pointer held in a register operand */
+        );
+    }else
+#endif
+        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
+}
+
 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
     // not bit-exact: pf2id uses different rounding than C and SSE
     asm volatile(
@@ -2083,6 +2131,98 @@
     );
 }
 
+/**
+ * Generate float_to_int16_interleave_<cpu>() plus its generic helper.
+ *
+ * channels == 1 forwards to float_to_int16_<cpu>(); channels > 2 converts
+ * everything into a temporary buffer and interleaves it in C; any other
+ * value (intended for channels == 2) runs the asm loop passed as 'body'.
+ * In 'body', %0 is a negative byte offset counting up to zero, %1 the end
+ * of the output, %2 the end of the first channel and %3 the end of the
+ * second channel, which is taken to directly follow the first in src.
+ */
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
+static av_noinline void float_to_int16_interleave2_##cpu(int16_t *dst, const float *src, long len, int channels){\
+    DECLARE_ALIGNED_16(int16_t, tmp[len*channels]);\
+    int i,j,c;\
+    float_to_int16_##cpu(tmp, src, len*channels);\
+    for(c=0; c<channels; c++){\
+        int16_t *ptmp = tmp+c*len;\
+        for(i=0, j=c; i<len; i++, j+=channels)\
+            dst[j] = ptmp[i];\
+    }\
+}\
+\
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float *src, long len, int channels){\
+    if(channels==1)\
+        float_to_int16_##cpu(dst, src, len);\
+    else if(channels>2)\
+        float_to_int16_interleave2_##cpu(dst, src, len, channels);\
+    else{\
+        float *src1;\
+        asm volatile(\
+            "shl $2, %0             \n"\
+            "add %0, %1             \n"\
+            "add %0, %2             \n"\
+            "lea (%2,%0), %3        \n"\
+            "neg %0                 \n"\
+            body\
+            :"+r"(len), "+r"(dst), "+r"(src), "=r"(src1)\
+            ::"memory"\
+        );\
+    }\
+}
+
+FLOAT_TO_INT16_INTERLEAVE(3dnow,
+    "1:                         \n"
+    "pf2id     (%2,%0), %%mm0   \n"
+    "pf2id    8(%2,%0), %%mm1   \n"
+    "pf2id     (%3,%0), %%mm2   \n"
+    "pf2id    8(%3,%0), %%mm3   \n"
+    "packssdw    %%mm1, %%mm0   \n"
+    "packssdw    %%mm3, %%mm2   \n"
+    "movq        %%mm0, %%mm1   \n"
+    "punpcklwd   %%mm2, %%mm0   \n"
+    "punpckhwd   %%mm2, %%mm1   \n"
+    "movq        %%mm0,  (%1,%0)\n" /* low-word interleave from punpcklwd */
+    "movq        %%mm1, 8(%1,%0)\n" /* high-word interleave from punpckhwd */
+    "add $16, %0                \n"
+    "js 1b                      \n"
+    "femms                      \n"
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse,
+    "1:                         \n"
+    "cvtps2pi  (%2,%0), %%mm0   \n"
+    "cvtps2pi 8(%2,%0), %%mm1   \n"
+    "cvtps2pi  (%3,%0), %%mm2   \n"
+    "cvtps2pi 8(%3,%0), %%mm3   \n"
+    "packssdw    %%mm1, %%mm0   \n"
+    "packssdw    %%mm3, %%mm2   \n"
+    "movq        %%mm0, %%mm1   \n"
+    "punpcklwd   %%mm2, %%mm0   \n"
+    "punpckhwd   %%mm2, %%mm1   \n"
+    "movq        %%mm0,  (%1,%0)\n" /* low-word interleave from punpcklwd */
+    "movq        %%mm1, 8(%1,%0)\n" /* high-word interleave from punpckhwd */
+    "add $16, %0                \n"
+    "js 1b                      \n"
+    "emms                       \n"
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse2,
+    "1:                         \n"
+    "cvtps2dq  (%2,%0), %%xmm0  \n"
+    "cvtps2dq  (%3,%0), %%xmm1  \n"
+    "packssdw   %%xmm1, %%xmm0  \n"
+    "movhlps    %%xmm0, %%xmm1  \n"
+    "punpcklwd  %%xmm1, %%xmm0  \n"
+    "movdqa     %%xmm0, (%1,%0) \n"
+    "add $16, %0                \n"
+    "js 1b                      \n"
+)
+
+
 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
@@ -2519,8 +2659,10 @@
         if(mm_flags & MM_3DNOW){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
             c->vector_fmul = vector_fmul_3dnow;
-            if(!(avctx->flags & CODEC_FLAG_BITEXACT))
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->float_to_int16 = float_to_int16_3dnow;
+                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
+            }
         }
         if(mm_flags & MM_3DNOWEXT)
             c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
@@ -2528,11 +2670,14 @@
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
             c->vector_fmul = vector_fmul_sse;
             c->float_to_int16 = float_to_int16_sse;
+            c->float_to_int16_interleave = float_to_int16_interleave_sse;
             c->vector_fmul_reverse = vector_fmul_reverse_sse;
             c->vector_fmul_add_add = vector_fmul_add_add_sse;
+            c->vector_fmul_window = vector_fmul_window_sse;
         }
         if(mm_flags & MM_SSE2){
             c->float_to_int16 = float_to_int16_sse2;
+            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
         }
         if(mm_flags & MM_3DNOW)
             c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse