Mercurial > libavcodec.hg
changeset 7544:ee1cb5ab9f99 libavcodec
optimize imdct_half:
remove tmp buffer.
skip fft reinterleave pass, leaving data in a format more convenient for simd.
merge post-rotate with post-reorder.
author | lorenm |
---|---|
date | Tue, 12 Aug 2008 00:33:34 +0000 |
parents | f04ff5a6fb55 |
children | 2dca9201c400 |
files | dsputil.h fft.c i386/fft_3dn2.c i386/fft_sse.c mdct.c vorbis_dec.c |
diffstat | 6 files changed, 256 insertions(+), 367 deletions(-) [+] |
line wrap: on
line diff
--- a/dsputil.h Tue Aug 12 00:27:21 2008 +0000 +++ b/dsputil.h Tue Aug 12 00:33:34 2008 +0000 @@ -645,7 +645,7 @@ void (*imdct_calc)(struct MDCTContext *s, FFTSample *output, const FFTSample *input, FFTSample *tmp); void (*imdct_half)(struct MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp); + const FFTSample *input); } FFTContext; int ff_fft_init(FFTContext *s, int nbits, int inverse); @@ -696,16 +696,16 @@ int ff_mdct_init(MDCTContext *s, int nbits, int inverse); void ff_imdct_calc(MDCTContext *s, FFTSample *output, const FFTSample *input, FFTSample *tmp); -void ff_imdct_half(MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp); +void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_calc_3dn(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp); +void ff_imdct_half_3dn(MDCTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input, FFTSample *tmp); -void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp); +void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input, FFTSample *tmp); -void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp); +void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input); void ff_mdct_calc(MDCTContext *s, FFTSample *out, const FFTSample *input, FFTSample *tmp); void ff_mdct_end(MDCTContext *s);
--- a/fft.c Tue Aug 12 00:27:21 2008 +0000 +++ b/fft.c Tue Aug 12 00:33:34 2008 +0000 @@ -106,6 +106,8 @@ s->fft_calc = ff_fft_calc_3dn2; } else if (has_vectors & MM_3DNOW) { /* 3DNow! for K6-2/3 */ + s->imdct_calc = ff_imdct_calc_3dn; + s->imdct_half = ff_imdct_half_3dn; s->fft_calc = ff_fft_calc_3dn; } #elif defined HAVE_ALTIVEC && !defined ALTIVEC_USE_REFERENCE_C_CODE
--- a/i386/fft_3dn2.c Tue Aug 12 00:27:21 2008 +0000 +++ b/i386/fft_3dn2.c Tue Aug 12 00:33:34 2008 +0000 @@ -1,7 +1,6 @@ /* * FFT/MDCT transform with Extended 3DNow! optimizations - * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt - * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. + * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt * * This file is part of FFmpeg. * @@ -23,16 +22,23 @@ #include "libavutil/x86_cpu.h" #include "libavcodec/dsputil.h" +DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 }; + #ifdef EMULATE_3DNOWEXT +#define PSWAPD(s,d)\ + "movq "#s","#d"\n"\ + "psrlq $32,"#d"\n"\ + "punpckldq "#s","#d"\n" #define ff_fft_calc_3dn2 ff_fft_calc_3dn #define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn #define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn #define ff_imdct_calc_3dn2 ff_imdct_calc_3dn #define ff_imdct_half_3dn2 ff_imdct_half_3dn +#else +#define PSWAPD(s,d) "pswapd "#s","#d"\n" #endif void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); -void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) { @@ -45,35 +51,45 @@ FFSWAP(FFTSample, z[i].im, z[i+1].re); } -static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp) +void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input) { - long n4, n2, n; - x86_reg k; + x86_reg j, k; + long n = 1 << s->nbits; + long n2 = n >> 1; + long n4 = n >> 2; + long n8 = n >> 3; const uint16_t *revtab = s->fft.revtab; const FFTSample *tcos = s->tcos; const FFTSample *tsin = s->tsin; const FFTSample *in1, *in2; - FFTComplex *z = (FFTComplex *)tmp; - - n = 1 << s->nbits; - n2 = n >> 1; - n4 = n >> 2; + FFTComplex *z = (FFTComplex *)output; /* pre rotation */ in1 = input; in2 = input + n2 - 1; +#ifdef EMULATE_3DNOWEXT + asm volatile("movd %0, %%mm7" ::"r"(1<<31)); +#endif for(k = 0; k < n4; k++) { // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it asm volatile( - "movd %0, %%mm0 \n\t" - "movd %2, %%mm1 \n\t" - "punpckldq %1, %%mm0 \n\t" - "punpckldq %3, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "pfmul %%mm1, %%mm0 \n\t" - "pswapd %%mm1, %%mm1 \n\t" - "pfmul %%mm1, %%mm2 \n\t" - "pfpnacc %%mm2, %%mm0 \n\t" + "movd %0, %%mm0 \n" + "movd %2, %%mm1 \n" + "punpckldq %1, %%mm0 \n" + "punpckldq %3, %%mm1 \n" + "movq %%mm0, %%mm2 \n" + PSWAPD( %%mm1, %%mm3 ) + "pfmul %%mm1, %%mm0 \n" + "pfmul %%mm3, %%mm2 \n" +#ifdef EMULATE_3DNOWEXT + "movq %%mm0, %%mm1 \n" + "punpckhdq %%mm2, %%mm0 \n" + "punpckldq %%mm2, %%mm1 \n" + "pxor %%mm7, %%mm0 \n" + "pfadd %%mm1, %%mm0 \n" +#else + "pfpnacc %%mm2, %%mm0 \n" +#endif ::"m"(in2[-2*k]), "m"(in1[2*k]), "m"(tcos[k]), "m"(tsin[k]) ); @@ -83,102 +99,75 @@ ); } - ff_fft_calc_3dn2(&s->fft, z); + ff_fft_dispatch_3dn2(z, s->fft.nbits); + +#define CMUL(j,mm0,mm1)\ + "movq (%2,"#j",2), %%mm6 \n"\ + "movq 8(%2,"#j",2), "#mm0"\n"\ + "movq %%mm6, "#mm1"\n"\ + "movq "#mm0",%%mm7 \n"\ + "pfmul (%3,"#j"), %%mm6 \n"\ + "pfmul (%4,"#j"), "#mm0"\n"\ + "pfmul (%4,"#j"), "#mm1"\n"\ + "pfmul (%3,"#j"), %%mm7 \n"\ + "pfsub %%mm6, "#mm0"\n"\ + "pfadd %%mm7, "#mm1"\n" - /* post rotation + reordering */ - for(k = 0; k < n4; k++) { - asm volatile( - "movq %0, %%mm0 \n\t" - "movd %1, %%mm1 \n\t" - "punpckldq %2, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "pfmul %%mm1, %%mm0 \n\t" - "pswapd %%mm1, %%mm1 \n\t" - "pfmul %%mm1, %%mm2 \n\t" - "pfpnacc %%mm2, %%mm0 \n\t" - "movq %%mm0, %0 \n\t" - :"+m"(z[k]) - :"m"(tcos[k]), "m"(tsin[k]) - ); - } + /* post rotation */ + j = -n2; + k = n2-8; + asm volatile( + "1: \n" + CMUL(%0, %%mm0, %%mm1) + CMUL(%1, %%mm2, %%mm3) + "movd %%mm0, (%2,%0,2) \n" + "movd %%mm1,12(%2,%1,2) \n" + "movd %%mm2, (%2,%1,2) \n" + "movd %%mm3,12(%2,%0,2) \n" + "psrlq $32, %%mm0 \n" + "psrlq $32, %%mm1 \n" + "psrlq $32, %%mm2 \n" + "psrlq $32, %%mm3 \n" + "movd %%mm0, 8(%2,%0,2) \n" + "movd %%mm1, 4(%2,%1,2) \n" + "movd %%mm2, 8(%2,%1,2) \n" + "movd %%mm3, 4(%2,%0,2) \n" + "sub $8, %1 \n" + "add $8, %0 \n" + "jl 1b \n" + :"+r"(j), "+r"(k) + :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) + :"memory" + ); + asm volatile("femms"); } void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input, FFTSample *tmp) { - x86_reg k; - long n8, n2, n; - FFTComplex *z = (FFTComplex *)tmp; + x86_reg j, k; + long n = 1 << s->nbits; + long n4 = n >> 2; - n = 1 << s->nbits; - n2 = n >> 1; - n8 = n >> 3; + ff_imdct_half_3dn2(s, output+n4, input); - imdct_3dn2(s, input, tmp); - + j = -n; k = n-8; - asm volatile("movd %0, %%mm7" ::"r"(1<<31)); asm volatile( - "1: \n\t" - "movq (%4,%0), %%mm0 \n\t" // z[n8+k] - "neg %0 \n\t" - "pswapd -8(%4,%0), %%mm1 \n\t" // z[n8-1-k] - "movq %%mm0, %%mm2 \n\t" - "pxor %%mm7, %%mm2 \n\t" - "punpckldq %%mm1, %%mm2 \n\t" - "pswapd %%mm2, %%mm3 \n\t" - "punpckhdq %%mm1, %%mm0 \n\t" - "pswapd %%mm0, %%mm4 \n\t" - "pxor %%mm7, %%mm0 \n\t" - "pxor %%mm7, %%mm4 \n\t" - "movq %%mm3, -8(%3,%0) \n\t" // output[n-2-2*k] = { z[n8-1-k].im, -z[n8+k].re } - "movq %%mm4, -8(%2,%0) \n\t" // output[n2-2-2*k]= { -z[n8-1-k].re, z[n8+k].im } - "neg %0 \n\t" - "movq %%mm0, (%1,%0) \n\t" // output[2*k] = { -z[n8+k].im, z[n8-1-k].re } - "movq %%mm2, (%2,%0) \n\t" // output[n2+2*k] = { -z[n8+k].re, z[n8-1-k].im } - "sub $8, %0 \n\t" - "jge 1b \n\t" - :"+r"(k) - :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8) - :"memory" + "movq %4, %%mm7 \n" + "1: \n" + PSWAPD((%2,%1), %%mm0) + PSWAPD((%3,%0), %%mm1) + "pxor %%mm7, %%mm0 \n" + "movq %%mm1, (%3,%1) \n" + "movq %%mm0, (%2,%0) \n" + "sub $8, %1 \n" + "add $8, %0 \n" + "jl 1b \n" + :"+r"(j), "+r"(k) + :"r"(output+n4), "r"(output+n4*3), + "m"(*m1m1) ); asm volatile("femms"); } -void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp) -{ - x86_reg j, k; - long n8, n4, n; - FFTComplex *z = (FFTComplex *)tmp; - - n = 1 << s->nbits; - n4 = n >> 2; - n8 = n >> 3; - - imdct_3dn2(s, input, tmp); - - j = -n; - k = n-8; - asm volatile("movd %0, %%mm7" ::"r"(1<<31)); - asm volatile( - "1: \n\t" - "movq (%3,%1), %%mm0 \n\t" // z[n8+k] - "pswapd (%3,%0), %%mm1 \n\t" // z[n8-1-k] - "movq %%mm0, %%mm2 \n\t" - "punpckldq %%mm1, %%mm0 \n\t" - "punpckhdq %%mm2, %%mm1 \n\t" - "pxor %%mm7, %%mm0 \n\t" - "pxor %%mm7, %%mm1 \n\t" - "movq %%mm0, (%2,%1) \n\t" // output[n4+2*k] = { -z[n8+k].re, z[n8-1-k].im } - "movq %%mm1, (%2,%0) \n\t" // output[n4-2-2*k] = { -z[n8-1-k].re, z[n8+k].im } - "sub $8, %1 \n\t" - "add $8, %0 \n\t" - "jl 1b \n\t" - :"+r"(j), "+r"(k) - :"r"(output+n4), "r"(z+n8) - :"memory" - ); - asm volatile("femms"); -} -
--- a/i386/fft_sse.c Tue Aug 12 00:27:21 2008 +0000 +++ b/i386/fft_sse.c Tue Aug 12 00:33:34 2008 +0000 @@ -1,6 +1,6 @@ /* * FFT/MDCT transform with SSE optimizations - * Copyright (c) 2002 Fabrice Bellard. + * Copyright (c) 2008 Loren Merritt * * This file is part of FFmpeg. * @@ -22,9 +22,6 @@ #include "libavutil/x86_cpu.h" #include "libavcodec/dsputil.h" -static const int p1m1p1m1[4] __attribute__((aligned(16))) = - { 0, 1 << 31, 0, 1 << 31 }; - static const int m1m1m1m1[4] __attribute__((aligned(16))) = { 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; @@ -73,212 +70,134 @@ memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); } -static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp) +void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input) { - x86_reg k; - long n4, n2, n; - const uint16_t *revtab = s->fft.revtab; + av_unused x86_reg i, j, k, l; + long n = 1 << s->nbits; + long n2 = n >> 1; + long n4 = n >> 2; + long n8 = n >> 3; + const uint16_t *revtab = s->fft.revtab + n8; const FFTSample *tcos = s->tcos; const FFTSample *tsin = s->tsin; - const FFTSample *in1, *in2; - FFTComplex *z = (FFTComplex *)tmp; - - n = 1 << s->nbits; - n2 = n >> 1; - n4 = n >> 2; - -#ifdef ARCH_X86_64 - asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1)); -#define P1M1P1M1 "%%xmm8" -#else -#define P1M1P1M1 "%4" -#endif + FFTComplex *z = (FFTComplex *)output; /* pre rotation */ - in1 = input; - in2 = input + n2 - 4; - - /* Complex multiplication */ - for (k = 0; k < n4; k += 4) { - asm volatile ( - "movaps %0, %%xmm0 \n\t" // xmm0 = r0 X r1 X : in2 - "movaps %1, %%xmm3 \n\t" // xmm3 = X i1 X i0: in1 - "movaps -16+1*%0, %%xmm4 \n\t" // xmm4 = r0 X r1 X : in2 - "movaps 16+1*%1, %%xmm7 \n\t" // xmm7 = X i1 X i0: in1 - "movlps %2, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos - "movlps %3, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin - "movlps 8+1*%2, %%xmm5 \n\t" // xmm5 = X X R1 R0: tcos - "movlps 8+1*%3, %%xmm6 \n\t" // xmm6 = X X I1 I0: tsin - "shufps $95, %%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0 - "shufps $160,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0 - "shufps $95, %%xmm4, %%xmm4 \n\t" // xmm4 = r1 r1 r0 r0 - "shufps $160,%%xmm7, %%xmm7 \n\t" // xmm7 = i1 i1 i0 i0 - "unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0 - "unpcklps %%xmm6, %%xmm5 \n\t" // xmm5 = I1 R1 I0 R0 - "movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0 - "movaps %%xmm5, %%xmm6 \n\t" // xmm6 = I1 R1 I0 R0 - "xorps "P1M1P1M1", %%xmm2 \n\t" // xmm2 = -I1 R1 -I0 R0 - "xorps "P1M1P1M1", %%xmm6 \n\t" // xmm6 = -I1 R1 -I0 R0 - "mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR - "mulps %%xmm5, %%xmm4 \n\t" // xmm4 = rI rR rI rR - "shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 -I1 R0 -I0 - "shufps $177,%%xmm6, %%xmm6 \n\t" // xmm6 = R1 -I1 R0 -I0 - "mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri -Ii Ri -Ii - "mulps %%xmm6, %%xmm7 \n\t" // xmm7 = Ri -Ii Ri -Ii - "addps %%xmm3, %%xmm0 \n\t" // xmm0 = result - "addps %%xmm7, %%xmm4 \n\t" // xmm4 = result - ::"m"(in2[-2*k]), "m"(in1[2*k]), - "m"(tcos[k]), "m"(tsin[k]) -#ifndef ARCH_X86_64 - ,"m"(*p1m1p1m1) + for(k=n8-2; k>=0; k-=2) { + asm volatile( + "movaps (%2,%1,2), %%xmm0 \n" // { z[k].re, z[k].im, z[k+1].re, z[k+1].im } + "movaps -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } + "movaps %%xmm0, %%xmm2 \n" + "shufps $0x88, %%xmm1, %%xmm0 \n" // { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } + "shufps $0x77, %%xmm2, %%xmm1 \n" // { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } + "movlps (%3,%1), %%xmm4 \n" + "movlps (%4,%1), %%xmm5 \n" + "movhps -8(%3,%0), %%xmm4 \n" // { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } + "movhps -8(%4,%0), %%xmm5 \n" // { sin[k], sin[k+1], sin[-k-2], sin[-k-1] } + "movaps %%xmm0, %%xmm2 \n" + "movaps %%xmm1, %%xmm3 \n" + "mulps %%xmm5, %%xmm0 \n" // re*sin + "mulps %%xmm4, %%xmm1 \n" // im*cos + "mulps %%xmm4, %%xmm2 \n" // re*cos + "mulps %%xmm5, %%xmm3 \n" // im*sin + "subps %%xmm0, %%xmm1 \n" // -> re + "addps %%xmm3, %%xmm2 \n" // -> im + "movaps %%xmm1, %%xmm0 \n" + "unpcklps %%xmm2, %%xmm1 \n" // { z[k], z[k+1] } + "unpckhps %%xmm2, %%xmm0 \n" // { z[-k-2], z[-k-1] } + ::"r"(-4*k), "r"(4*k), + "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8) + ); +#ifdef ARCH_X86_64 + // if we have enough regs, don't let gcc make the luts latency-bound + // but if not, latency is faster than spilling + asm("movlps %%xmm0, %0 \n" + "movhps %%xmm0, %1 \n" + "movlps %%xmm1, %2 \n" + "movhps %%xmm1, %3 \n" + :"=m"(z[revtab[-k-2]]), + "=m"(z[revtab[-k-1]]), + "=m"(z[revtab[ k ]]), + "=m"(z[revtab[ k+1]]) + ); +#else + asm("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]])); + asm("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]])); + asm("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]])); + asm("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]])); #endif - ); - /* Should be in the same block, hack for gcc2.95 & gcc3 */ - asm ( - "movlps %%xmm0, %0 \n\t" - "movhps %%xmm0, %1 \n\t" - "movlps %%xmm4, %2 \n\t" - "movhps %%xmm4, %3 \n\t" - :"=m"(z[revtab[k]]), "=m"(z[revtab[k + 1]]), - "=m"(z[revtab[k + 2]]), "=m"(z[revtab[k + 3]]) - ); } - ff_fft_calc_sse(&s->fft, z); + ff_fft_dispatch_sse(z, s->fft.nbits); + + /* post rotation + reinterleave + reorder */ -#ifndef ARCH_X86_64 -#undef P1M1P1M1 -#define P1M1P1M1 "%3" -#endif +#define CMUL(j,xmm0,xmm1)\ + "movaps (%2,"#j",2), %%xmm6 \n"\ + "movaps 16(%2,"#j",2), "#xmm0"\n"\ + "movaps %%xmm6, "#xmm1"\n"\ + "movaps "#xmm0",%%xmm7 \n"\ + "mulps (%3,"#j"), %%xmm6 \n"\ + "mulps (%4,"#j"), "#xmm0"\n"\ + "mulps (%4,"#j"), "#xmm1"\n"\ + "mulps (%3,"#j"), %%xmm7 \n"\ + "subps %%xmm6, "#xmm0"\n"\ + "addps %%xmm7, "#xmm1"\n" - /* post rotation + reordering */ - for (k = 0; k < n4; k += 4) { - asm ( - "movaps %0, %%xmm0 \n\t" // xmm0 = i1 r1 i0 r0: z - "movaps 16+1*%0, %%xmm4 \n\t" // xmm4 = i1 r1 i0 r0: z - "movlps %1, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos - "movlps 8+1*%1, %%xmm5 \n\t" // xmm5 = X X R1 R0: tcos - "movaps %%xmm0, %%xmm3 \n\t" // xmm3 = i1 r1 i0 r0 - "movaps %%xmm4, %%xmm7 \n\t" // xmm7 = i1 r1 i0 r0 - "movlps %2, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin - "movlps 8+1*%2, %%xmm6 \n\t" // xmm6 = X X I1 I0: tsin - "shufps $160,%%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0 - "shufps $245,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0 - "shufps $160,%%xmm4, %%xmm4 \n\t" // xmm4 = r1 r1 r0 r0 - "shufps $245,%%xmm7, %%xmm7 \n\t" // xmm7 = i1 i1 i0 i0 - "unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0 - "unpcklps %%xmm6, %%xmm5 \n\t" // xmm5 = I1 R1 I0 R0 - "movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0 - "movaps %%xmm5, %%xmm6 \n\t" // xmm6 = I1 R1 I0 R0 - "xorps "P1M1P1M1", %%xmm2 \n\t" // xmm2 = -I1 R1 -I0 R0 - "mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR - "xorps "P1M1P1M1", %%xmm6 \n\t" // xmm6 = -I1 R1 -I0 R0 - "mulps %%xmm5, %%xmm4 \n\t" // xmm4 = rI rR rI rR - "shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 -I1 R0 -I0 - "shufps $177,%%xmm6, %%xmm6 \n\t" // xmm6 = R1 -I1 R0 -I0 - "mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri -Ii Ri -Ii - "mulps %%xmm6, %%xmm7 \n\t" // xmm7 = Ri -Ii Ri -Ii - "addps %%xmm3, %%xmm0 \n\t" // xmm0 = result - "addps %%xmm7, %%xmm4 \n\t" // xmm4 = result - "movaps %%xmm0, %0 \n\t" - "movaps %%xmm4, 16+1*%0\n\t" - :"+m"(z[k]) - :"m"(tcos[k]), "m"(tsin[k]) -#ifndef ARCH_X86_64 - ,"m"(*p1m1p1m1) -#endif - ); - } + j = -n2; + k = n2-16; + asm volatile( + "1: \n" + CMUL(%0, %%xmm0, %%xmm1) + CMUL(%1, %%xmm4, %%xmm5) + "shufps $0x1b, %%xmm1, %%xmm1 \n" + "shufps $0x1b, %%xmm5, %%xmm5 \n" + "movaps %%xmm4, %%xmm6 \n" + "unpckhps %%xmm1, %%xmm4 \n" + "unpcklps %%xmm1, %%xmm6 \n" + "movaps %%xmm0, %%xmm2 \n" + "unpcklps %%xmm5, %%xmm0 \n" + "unpckhps %%xmm5, %%xmm2 \n" + "movaps %%xmm6, (%2,%1,2) \n" + "movaps %%xmm4, 16(%2,%1,2) \n" + "movaps %%xmm0, (%2,%0,2) \n" + "movaps %%xmm2, 16(%2,%0,2) \n" + "sub $16, %1 \n" + "add $16, %0 \n" + "jl 1b \n" + :"+&r"(j), "+&r"(k) + :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) + :"memory" + ); } void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input, FFTSample *tmp) { - x86_reg k; - long n8, n2, n; - FFTComplex *z = (FFTComplex *)tmp; - - n = 1 << s->nbits; - n2 = n >> 1; - n8 = n >> 3; - - imdct_sse(s, input, tmp); + x86_reg j, k; + long n = 1 << s->nbits; + long n4 = n >> 2; - /* - Mnemonics: - 0 = z[k].re - 1 = z[k].im - 2 = z[k + 1].re - 3 = z[k + 1].im - 4 = z[-k - 2].re - 5 = z[-k - 2].im - 6 = z[-k - 1].re - 7 = z[-k - 1].im - */ - k = 16-n; - asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1)); + ff_imdct_half_sse(s, output+n4, input); + + j = -n; + k = n-16; asm volatile( - "1: \n\t" - "movaps -16(%4,%0), %%xmm1 \n\t" // xmm1 = 4 5 6 7 = z[-2-k] - "neg %0 \n\t" - "movaps (%4,%0), %%xmm0 \n\t" // xmm0 = 0 1 2 3 = z[k] - "xorps %%xmm7, %%xmm0 \n\t" // xmm0 = -0 -1 -2 -3 - "movaps %%xmm0, %%xmm2 \n\t" // xmm2 = -0 -1 -2 -3 - "shufps $141,%%xmm1, %%xmm0 \n\t" // xmm0 = -1 -3 4 6 - "shufps $216,%%xmm1, %%xmm2 \n\t" // xmm2 = -0 -2 5 7 - "shufps $156,%%xmm0, %%xmm0 \n\t" // xmm0 = -1 6 -3 4 ! - "shufps $156,%%xmm2, %%xmm2 \n\t" // xmm2 = -0 7 -2 5 ! - "movaps %%xmm0, (%1,%0) \n\t" // output[2*k] - "movaps %%xmm2, (%2,%0) \n\t" // output[n2+2*k] - "neg %0 \n\t" - "shufps $27, %%xmm0, %%xmm0 \n\t" // xmm0 = 4 -3 6 -1 - "xorps %%xmm7, %%xmm0 \n\t" // xmm0 = -4 3 -6 1 ! - "shufps $27, %%xmm2, %%xmm2 \n\t" // xmm2 = 5 -2 7 -0 ! - "movaps %%xmm0, -16(%2,%0) \n\t" // output[n2-4-2*k] - "movaps %%xmm2, -16(%3,%0) \n\t" // output[n-4-2*k] - "add $16, %0 \n\t" - "jle 1b \n\t" - :"+r"(k) - :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8) - :"memory" + "movaps %4, %%xmm7 \n" + "1: \n" + "movaps (%2,%1), %%xmm0 \n" + "movaps (%3,%0), %%xmm1 \n" + "shufps $0x1b, %%xmm0, %%xmm0 \n" + "shufps $0x1b, %%xmm1, %%xmm1 \n" + "xorps %%xmm7, %%xmm0 \n" + "movaps %%xmm1, (%3,%1) \n" + "movaps %%xmm0, (%2,%0) \n" + "sub $16, %1 \n" + "add $16, %0 \n" + "jl 1b \n" + :"+r"(j), "+r"(k) + :"r"(output+n4), "r"(output+n4*3), + "m"(*m1m1m1m1) ); } -void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp) -{ - x86_reg j, k; - long n8, n4, n; - FFTComplex *z = (FFTComplex *)tmp; - - n = 1 << s->nbits; - n4 = n >> 2; - n8 = n >> 3; - - imdct_sse(s, input, tmp); - - j = -n; - k = n-16; - asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1)); - asm volatile( - "1: \n\t" - "movaps (%3,%1), %%xmm0 \n\t" - "movaps (%3,%0), %%xmm1 \n\t" - "xorps %%xmm7, %%xmm0 \n\t" - "movaps %%xmm0, %%xmm2 \n\t" - "shufps $141,%%xmm1, %%xmm0 \n\t" - "shufps $216,%%xmm1, %%xmm2 \n\t" - "shufps $54, %%xmm0, %%xmm0 \n\t" - "shufps $156,%%xmm2, %%xmm2 \n\t" - "xorps %%xmm7, %%xmm0 \n\t" - "movaps %%xmm2, (%2,%1) \n\t" - "movaps %%xmm0, (%2,%0) \n\t" - "sub $16, %1 \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - :"+r"(j), "+r"(k) - :"r"(output+n4), "r"(z+n8) - :"memory" - ); -} -
--- a/mdct.c Tue Aug 12 00:27:21 2008 +0000 +++ b/mdct.c Tue Aug 12 00:33:34 2008 +0000 @@ -100,18 +100,25 @@ (pim) = _are * _bim + _aim * _bre;\ } -static void imdct_c(MDCTContext *s, const FFTSample *input, FFTSample *tmp) +/** + * Compute the middle half of the inverse MDCT of size N = 2^nbits, + * thus excluding the parts that can be derived by symmetry + * @param output N/2 samples + * @param input N/2 samples + */ +void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSample *input) { - int k, n4, n2, n, j; + int k, n8, n4, n2, n, j; const uint16_t *revtab = s->fft.revtab; const FFTSample *tcos = s->tcos; const FFTSample *tsin = s->tsin; const FFTSample *in1, *in2; - FFTComplex *z = (FFTComplex *)tmp; + FFTComplex *z = (FFTComplex *)output; n = 1 << s->nbits; n2 = n >> 1; n4 = n >> 2; + n8 = n >> 3; /* pre rotation */ in1 = input; @@ -125,9 +132,15 @@ ff_fft_calc(&s->fft, z); /* post rotation + reordering */ - /* XXX: optimize */ - for(k = 0; k < n4; k++) { - CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]); + output += n4; + for(k = 0; k < n8; k++) { + FFTSample r0, i0, r1, i1; + CMUL(r0, i1, z[n8-k-1].im, z[n8-k-1].re, tsin[n8-k-1], tcos[n8-k-1]); + CMUL(r1, i0, z[n8+k ].im, z[n8+k ].re, tsin[n8+k ], tcos[n8+k ]); + z[n8-k-1].re = r0; + z[n8-k-1].im = i0; + z[n8+k ].re = r1; + z[n8+k ].im = i1; } } @@ -140,52 +153,16 @@ void ff_imdct_calc(MDCTContext *s, FFTSample *output, const FFTSample *input, FFTSample *tmp) { - int k, n8, n2, n; - FFTComplex *z = (FFTComplex *)tmp; - n = 1 << s->nbits; - n2 = n >> 1; - n8 = n >> 3; - - imdct_c(s, input, tmp); - - for(k = 0; k < n8; k++) { - output[2*k] = -z[n8 + k].im; - output[n2-1-2*k] = z[n8 + k].im; - - output[2*k+1] = z[n8-1-k].re; - output[n2-1-2*k-1] = -z[n8-1-k].re; - - output[n2 + 2*k]=-z[k+n8].re; - output[n-1- 2*k]=-z[k+n8].re; - - output[n2 + 2*k+1]=z[n8-k-1].im; - output[n-2 - 2 * k] = z[n8-k-1].im; - } -} + int k; + int n = 1 << s->nbits; + int n2 = n >> 1; + int n4 = n >> 2; -/** - * Compute the middle half of the inverse MDCT of size N = 2^nbits, - * thus excluding the parts that can be derived by symmetry - * @param output N/2 samples - * @param input N/2 samples - * @param tmp N/2 samples - */ -void ff_imdct_half(MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp) -{ - int k, n8, n4, n; - FFTComplex *z = (FFTComplex *)tmp; - n = 1 << s->nbits; - n4 = n >> 2; - n8 = n >> 3; + ff_imdct_half(s, output+n4, input); - imdct_c(s, input, tmp); - - for(k = 0; k < n8; k++) { - output[n4-1-2*k] = z[n8+k].im; - output[n4-1-2*k-1] = -z[n8-k-1].re; - output[n4 + 2*k] = -z[n8+k].re; - output[n4 + 2*k+1] = z[n8-k-1].im; + for(k = 0; k < n4; k++) { + output[k] = -output[n2-k-1]; + output[n-k-1] = output[n2+k]; } } @@ -203,7 +180,7 @@ const uint16_t *revtab = s->fft.revtab; const FFTSample *tcos = s->tcos; const FFTSample *tsin = s->tsin; - FFTComplex *x = (FFTComplex *)tmp; + FFTComplex *x = (FFTComplex *)out; n = 1 << s->nbits; n2 = n >> 1; @@ -227,12 +204,14 @@ ff_fft_calc(&s->fft, x); /* post rotation */ - for(i=0;i<n4;i++) { - re = x[i].re; - im = x[i].im; - CMUL(re1, im1, re, im, -tsin[i], -tcos[i]); - out[2*i] = im1; - out[n2-1-2*i] = re1; + for(i=0;i<n8;i++) { + FFTSample r0, i0, r1, i1; + CMUL(i1, r0, x[n8-i-1].re, x[n8-i-1].im, -tsin[n8-i-1], -tcos[n8-i-1]); + CMUL(i0, r1, x[n8+i ].re, x[n8+i ].im, -tsin[n8+i ], -tcos[n8+i ]); + x[n8-i-1].re = r0; + x[n8-i-1].im = i0; + x[n8+i ].re = r1; + x[n8+i ].im = i1; } }
--- a/vorbis_dec.c Tue Aug 12 00:27:21 2008 +0000 +++ b/vorbis_dec.c Tue Aug 12 00:33:34 2008 +0000 @@ -1517,18 +1517,18 @@ // MDCT, overlap/add, save data for next overlapping FPMATH retlen = (blocksize + vc->blocksize[previous_window])/4; - dir = retlen <= blocksize/2; // pick an order so that ret[] can reuse residues[] without stepping on any data we need + dir = retlen <= blocksize/2; // pick an order so that ret[] can reuse floors[] without stepping on any data we need for(j=dir?0:vc->audio_channels-1; (unsigned)j<vc->audio_channels; j+=dir*2-1) { uint_fast16_t bs0=vc->blocksize[0]; uint_fast16_t bs1=vc->blocksize[1]; float *residue=vc->channel_residues+res_chan[j]*blocksize/2; float *floor=vc->channel_floors+j*blocksize/2; float *saved=vc->saved+j*bs1/4; - float *ret=vc->channel_residues+j*retlen; - float *buf=floor; + float *ret=vc->channel_floors+j*retlen; + float *buf=residue; const float *win=vc->win[blockflag&previous_window]; - vc->mdct[0].fft.imdct_half(&vc->mdct[blockflag], buf, floor, residue); + vc->mdct[0].fft.imdct_half(&vc->mdct[blockflag], buf, floor); if(blockflag == previous_window) { vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, blocksize/4); @@ -1583,7 +1583,7 @@ AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len); for(i=0; i<vc->audio_channels; i++) - channel_ptrs[i] = vc->channel_residues+i*len; + channel_ptrs[i] = vc->channel_floors+i*len; vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels); *data_size=len*2*vc->audio_channels;