# HG changeset patch
# User lorenm
# Date 1155009664 0
# Node ID 5ea82888103e412029d9d81e969c4932f668b626
# Parent ce5554dd79ce1f4d18b4acb90311a89cce067b7f
3dnow2 implementation of imdct.
6% faster vorbis and wma.

diff -r ce5554dd79ce -r 5ea82888103e dsputil.h
--- a/dsputil.h Sun Aug 06 23:15:32 2006 +0000
+++ b/dsputil.h Tue Aug 08 04:01:04 2006 +0000
@@ -594,6 +594,8 @@
    FFTSample type */
 typedef float FFTSample;
 
+struct MDCTContext;
+
 typedef struct FFTComplex {
     FFTSample re, im;
 } FFTComplex;
@@ -605,6 +607,8 @@
     FFTComplex *exptab;
     FFTComplex *exptab1; /* only used by SSE code */
     void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
+    void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
+                       const FFTSample *input, FFTSample *tmp);
 } FFTContext;
 
 int ff_fft_init(FFTContext *s, int nbits, int inverse);
@@ -635,6 +639,8 @@
 int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
 void ff_imdct_calc(MDCTContext *s, FFTSample *output,
                 const FFTSample *input, FFTSample *tmp);
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp);
 void ff_mdct_calc(MDCTContext *s, FFTSample *out,
                const FFTSample *input, FFTSample *tmp);
 void ff_mdct_end(MDCTContext *s);
diff -r ce5554dd79ce -r 5ea82888103e fft.c
--- a/fft.c Sun Aug 06 23:15:32 2006 +0000
+++ b/fft.c Tue Aug 08 04:01:04 2006 +0000
@@ -54,6 +54,7 @@
         s->exptab[i].im = s1;
     }
     s->fft_calc = ff_fft_calc_c;
+    s->imdct_calc = ff_imdct_calc;
     s->exptab1 = NULL;
 
     /* compute constant table for HAVE_SSE version */
@@ -62,11 +63,7 @@
         int has_vectors = 0;
 
 #if defined(HAVE_MMX)
-#ifdef HAVE_MM3DNOW
         has_vectors = mm_support() & (MM_3DNOW | MM_3DNOWEXT | MM_SSE | MM_SSE2);
-#else
-        has_vectors = mm_support() & (MM_SSE | MM_SSE2);
-#endif
 #endif
 #if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE)
         has_vectors = mm_support() & MM_ALTIVEC;
@@ -98,6 +95,8 @@
                 } while (nblocks != 0);
                 av_freep(&s->exptab);
 #if defined(HAVE_MMX)
+                if (has_vectors & MM_3DNOWEXT)
+                    s->imdct_calc = ff_imdct_calc_3dn2;
 #ifdef HAVE_MM3DNOW
                 if (has_vectors & MM_3DNOWEXT)
                     /* 3DNowEx for Athlon(XP) */
diff -r ce5554dd79ce -r 5ea82888103e i386/fft_3dn2.c
--- a/i386/fft_3dn2.c Sun Aug 06 23:15:32 2006 +0000
+++ b/i386/fft_3dn2.c Tue Aug 08 04:01:04 2006 +0000
@@ -1,6 +1,6 @@
 /*
  * FFT/MDCT transform with Extended 3DNow! optimizations
- * Copyright (c) 2006 Zuxy MENG Jie.
+ * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
  * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
  *
  * This library is free software; you can redistribute it and/or
@@ -134,3 +134,102 @@
 }
 
 #endif
+
+/**
+ * Inverse MDCT implemented with Extended 3DNow! inline assembly.
+ *
+ * With n = 1 << s->nbits, the pre-rotation reads input[0 .. n/2-1] and the
+ * final pass stores to output[0 .. n-1].
+ *
+ * @param s      MDCT context: supplies nbits, the tcos/tsin tables and the
+ *               FFT context (revtab permutation and FFT routine)
+ * @param output destination buffer, n samples
+ * @param input  source coefficients, n/2 samples
+ * @param tmp    scratch buffer, accessed as an FFTComplex array
+ */
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp)
+{
+    int k, n8, n4, n2, n;
+    const uint16_t *revtab = s->fft.revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation:
+     * z[revtab[k]] = (in2[-2k] + i*in1[2k]) * (tcos[k] + i*tsin[k]),
+     * in1 stepping up from input[0], in2 stepping down from input[n2-1] */
+    in1 = input;
+    in2 = input + n2 - 1;
+    for(k = 0; k < n4; k++) {
+        asm volatile(
+            "movd       %1, %%mm0 \n\t"
+            "movd       %3, %%mm1 \n\t"
+            "punpckldq  %2, %%mm0 \n\t"
+            "punpckldq  %4, %%mm1 \n\t"
+            "movq    %%mm0, %%mm2 \n\t"
+            "pfmul   %%mm1, %%mm0 \n\t"
+            "pswapd  %%mm1, %%mm1 \n\t"
+            "pfmul   %%mm1, %%mm2 \n\t"
+            "pfpnacc %%mm2, %%mm0 \n\t"
+            "movq    %%mm0, %0    \n\t"
+            :"=m"(z[revtab[k]])
+            :"m"(in2[-2*k]), "m"(in1[2*k]),
+             "m"(tcos[k]), "m"(tsin[k])
+        );
+    }
+
+    ff_fft_calc(&s->fft, z);
+
+    /* post rotation + reordering:
+     * first z[k] *= tcos[k] + i*tsin[k], in place */
+    for(k = 0; k < n4; k++) {
+        asm volatile(
+            "movq       %0, %%mm0 \n\t"
+            "movd       %1, %%mm1 \n\t"
+            "punpckldq  %2, %%mm1 \n\t"
+            "movq    %%mm0, %%mm2 \n\t"
+            "pfmul   %%mm1, %%mm0 \n\t"
+            "pswapd  %%mm1, %%mm1 \n\t"
+            "pfmul   %%mm1, %%mm2 \n\t"
+            "pfpnacc %%mm2, %%mm0 \n\t"
+            "movq    %%mm0, %0    \n\t"
+            :"+m"(z[k])
+            :"m"(tcos[k]), "m"(tsin[k])
+        );
+    }
+
+    /* mm7 = sign-bit mask for the low float; it is loaded once and read by
+     * every iteration of the loop below. 1U, not 1: shifting a signed 1
+     * into bit 31 is undefined behaviour. */
+    asm volatile("movd %0, %%mm7" ::"r"(1U<<31));
+    for(k = 0; k < n8; k++) {
+        asm volatile(
+            "movq       %4, %%mm0 \n\t"
+            "pswapd     %5, %%mm1 \n\t"
+            "movq    %%mm0, %%mm2 \n\t"
+            "pxor    %%mm7, %%mm2 \n\t"
+            "punpckldq %%mm1, %%mm2 \n\t"
+            "pswapd  %%mm2, %%mm3 \n\t"
+            "punpckhdq %%mm1, %%mm0 \n\t"
+            "pswapd  %%mm0, %%mm4 \n\t"
+            "pxor    %%mm7, %%mm0 \n\t"
+            "pxor    %%mm7, %%mm4 \n\t"
+            "movq    %%mm0, %0    \n\t" // { -z[n8+k].im, z[n8-1-k].re }
+            "movq    %%mm4, %1    \n\t" // { -z[n8-1-k].re, z[n8+k].im }
+            "movq    %%mm2, %2    \n\t" // { -z[n8+k].re, z[n8-1-k].im }
+            "movq    %%mm3, %3    \n\t" // { z[n8-1-k].im, -z[n8+k].re }
+            :"=m"(output[2*k]), "=m"(output[n2-2-2*k]),
+             "=m"(output[n2+2*k]), "=m"(output[n-2-2*k])
+            :"m"(z[n8+k]), "m"(z[n8-1-k])
+            :"memory" // each movq stores two floats, one more than its "=m" operand names
+        );
+    }
+    asm volatile("emms");
+}
diff -r ce5554dd79ce -r 5ea82888103e vorbis.c
--- a/vorbis.c Sun Aug 06 23:15:32 2006 +0000
+++ b/vorbis.c Tue Aug 08 04:01:04 2006 +0000
@@ -1598,7 +1598,7 @@
 
         saved_start=vc->saved_start;
 
-        ff_imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
+        vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
 
         if (vc->modes[mode_number].blockflag) {
             // -- overlap/add
diff -r ce5554dd79ce -r 5ea82888103e wmadec.c
--- a/wmadec.c Sun Aug 06 23:15:32 2006 +0000
+++ b/wmadec.c Tue Aug 08 04:01:04 2006 +0000
@@ -1113,7 +1113,7 @@
 
             n = s->block_len;
             n4 = s->block_len / 2;
-            ff_imdct_calc(&s->mdct_ctx[bsize],
+            s->mdct_ctx[bsize].fft.imdct_calc(&s->mdct_ctx[bsize],
                           output, s->coefs[ch], s->mdct_tmp);
 
             /* XXX: optimize all that by build the window and