Mercurial > libavcodec.hg
diff i386/fft_3dn2.c @ 7263:fc843d00867c libavcodec
exploit mdct symmetry
2% faster vorbis on conroe, k8. 7% on celeron.
author | lorenm |
---|---|
date | Sun, 13 Jul 2008 15:03:58 +0000 |
parents | f7cbb7733146 |
children | a8a8205a9081 |
line wrap: on
line diff
--- a/i386/fft_3dn2.c Sun Jul 13 14:59:39 2008 +0000
+++ b/i386/fft_3dn2.c Sun Jul 13 15:03:58 2008 +0000
@@ -124,10 +124,9 @@
     asm volatile("femms");
 }
 
-void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
-                        const FFTSample *input, FFTSample *tmp)
+static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
 {
-    long n8, n4, n2, n;
+    long n4, n2, n;
     x86_reg k;
     const uint16_t *revtab = s->fft.revtab;
     const FFTSample *tcos = s->tcos;
@@ -138,7 +137,6 @@
     n = 1 << s->nbits;
     n2 = n >> 1;
     n4 = n >> 2;
-    n8 = n >> 3;
 
     /* pre rotation */
     in1 = input;
@@ -182,6 +180,20 @@
             :"m"(tcos[k]), "m"(tsin[k])
         );
     }
+}
+
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp)
+{
+    x86_reg k;
+    long n8, n2, n;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n8 = n >> 3;
+
+    imdct_3dn2(s, input, tmp);
 
     k = n-8;
     asm volatile("movd %0, %%mm7" ::"r"(1<<31));
@@ -212,3 +224,40 @@
     asm volatile("femms");
 }
 
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp)
+{
+    x86_reg j, k;
+    long n8, n4, n;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    imdct_3dn2(s, input, tmp);
+
+    j = -n;
+    k = n-8;
+    asm volatile("movd %0, %%mm7" ::"r"(1<<31));
+    asm volatile(
+        "1: \n\t"
+        "movq (%3,%1), %%mm0 \n\t" // z[n8+k]
+        "pswapd (%3,%0), %%mm1 \n\t" // z[n8-1-k]
+        "movq %%mm0, %%mm2 \n\t"
+        "punpckldq %%mm1, %%mm0 \n\t"
+        "punpckhdq %%mm2, %%mm1 \n\t"
+        "pxor %%mm7, %%mm0 \n\t"
+        "pxor %%mm7, %%mm1 \n\t"
+        "movq %%mm0, (%2,%1) \n\t" // output[n4+2*k] = { -z[n8+k].re, z[n8-1-k].im }
+        "movq %%mm1, (%2,%0) \n\t" // output[n4-2-2*k] = { -z[n8-1-k].re, z[n8+k].im }
+        "sub $8, %1 \n\t"
+        "add $8, %0 \n\t"
+        "jl 1b \n\t"
+        :"+r"(j), "+r"(k)
+        :"r"(output+n4), "r"(z+n8)
+        :"memory"
+    );
+    asm volatile("femms");
+}
+