Mercurial > libavcodec.hg
changeset 781:6f5e87957bcb libavcodec
new generic FFT/MDCT code for audio codecs
author | bellard |
---|---|
date | Mon, 28 Oct 2002 00:34:08 +0000 |
parents | a48bb8bc63dd |
children | dd7d5748d064 |
files | dsputil.h fft-test.c fft.c i386/fft_sse.c mdct.c |
diffstat | 5 files changed, 869 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/dsputil.h Sun Oct 27 21:02:47 2002 +0000 +++ b/dsputil.h Mon Oct 28 00:34:08 2002 +0000 @@ -221,5 +221,52 @@ void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3], int orig_linesize[3], int coded_linesize, AVCodecContext *avctx); - + +/* FFT computation */ + +/* NOTE: soon integer code will be added, so you must use the + FFTSample type */ +typedef float FFTSample; + +typedef struct FFTComplex { + FFTSample re, im; +} FFTComplex; + +typedef struct FFTContext { + int nbits; + int inverse; + uint16_t *revtab; + FFTComplex *exptab; + FFTComplex *exptab1; /* only used by SSE code */ + void (*fft_calc)(struct FFTContext *s, FFTComplex *z); +} FFTContext; + +int fft_init(FFTContext *s, int nbits, int inverse); +void fft_permute(FFTContext *s, FFTComplex *z); +void fft_calc_c(FFTContext *s, FFTComplex *z); +void fft_calc_sse(FFTContext *s, FFTComplex *z); +static inline void fft_calc(FFTContext *s, FFTComplex *z) +{ + s->fft_calc(s, z); +} +void fft_end(FFTContext *s); + +/* MDCT computation */ + +typedef struct MDCTContext { + int n; /* size of MDCT (i.e. number of input data * 2) */ + int nbits; /* n = 2^nbits */ + /* pre/post rotation tables */ + FFTSample *tcos; + FFTSample *tsin; + FFTContext fft; +} MDCTContext; + +int mdct_init(MDCTContext *s, int nbits, int inverse); +void imdct_calc(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp); +void mdct_calc(MDCTContext *s, FFTSample *out, + const FFTSample *input, FFTSample *tmp); +void mdct_end(MDCTContext *s); + #endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft-test.c Mon Oct 28 00:34:08 2002 +0000 @@ -0,0 +1,294 @@ +/* FFT and MDCT tests */ +#include "dsputil.h" +#include <math.h> +#include <getopt.h> +#include <sys/time.h> + +int mm_flags; + +void *av_malloc(int size) +{ + void *ptr; + ptr = malloc(size); + return ptr; +} + +void av_free(void *ptr) +{ + /* XXX: this test should not be needed on most libcs */ + if (ptr) + free(ptr); +} + +/* cannot call it directly because of 'void **' casting is not automatic */ +void __av_freep(void **ptr) +{ + av_free(*ptr); + *ptr = NULL; +} + +/* reference fft */ + +#define MUL16(a,b) ((a) * (b)) + +#define CMAC(pre, pim, are, aim, bre, bim) \ +{\ + pre += (MUL16(are, bre) - MUL16(aim, bim));\ + pim += (MUL16(are, bim) + MUL16(bre, aim));\ +} + +FFTComplex *exptab; + +void fft_ref_init(int nbits, int inverse) +{ + int n, i; + float c1, s1, alpha; + + n = 1 << nbits; + exptab = av_malloc((n / 2) * sizeof(FFTComplex)); + + for(i=0;i<(n/2);i++) { + alpha = 2 * M_PI * (float)i / (float)n; + c1 = cos(alpha); + s1 = sin(alpha); + if (!inverse) + s1 = -s1; + exptab[i].re = c1; + exptab[i].im = s1; + } +} + +void fft_ref(FFTComplex *tabr, FFTComplex *tab, int nbits) +{ + int n, i, j, k, n2; + float tmp_re, tmp_im, s, c; + FFTComplex *q; + + n = 1 << nbits; + n2 = n >> 1; + for(i=0;i<n;i++) { + tmp_re = 0; + tmp_im = 0; + q = tab; + for(j=0;j<n;j++) { + k = (i * j) & (n - 1); + if (k >= n2) { + c = -exptab[k - n2].re; + s = -exptab[k - n2].im; + } else { + c = exptab[k].re; + s = exptab[k].im; + } + CMAC(tmp_re, tmp_im, c, s, q->re, q->im); + q++; + } + tabr[i].re = tmp_re; + tabr[i].im = tmp_im; + } +} + +void imdct_ref(float *out, float *in, int n) +{ + int k, i, a; + float sum, f; + + for(i=0;i<n;i++) { + sum = 0; + for(k=0;k<n/2;k++) { + a = (2 * i + 1 + (n / 2)) * (2 * k + 1); + f = cos(M_PI * a / (double)(2 * n)); + sum += f * in[k]; + } + out[i] = -sum; + } +} + +/* NOTE: no normalisation by 1 / N is done */ +void mdct_ref(float *output, float *input, int n) +{ + int k, i; + float a, s; + + /* do it by hand */ + for(k=0;k<n/2;k++) { + s = 0; + for(i=0;i<n;i++) { + a = (2*M_PI*(2*i+1+n/2)*(2*k+1) / (4 * n)); + s += input[i] * cos(a); + } + output[k] = s; + } +} + + +float frandom(void) +{ + return (float)((random() & 0xffff) - 32768) / 32768.0; +} + +INT64 gettime(void) +{ + struct timeval tv; + gettimeofday(&tv,NULL); + return (INT64)tv.tv_sec * 1000000 + tv.tv_usec; +} + +void check_diff(float *tab1, float *tab2, int n) +{ + int i; + + for(i=0;i<n;i++) { + if (fabsf(tab1[i] - tab2[i]) >= 1e-3) { + printf("ERROR %d: %f %f\n", + i, tab1[i], tab2[i]); + } + } +} + + +void help(void) +{ + printf("usage: fft-test [-h] [-s] [-i] [-n b]\n" + "-h print this help\n" + "-s speed test\n" + "-m (I)MDCT test\n" + "-i inverse transform test\n" + "-n b set the transform size to 2^b\n" + ); + exit(1); +} + + + +int main(int argc, char **argv) +{ + FFTComplex *tab, *tab1, *tab_ref; + FFTSample *tabtmp, *tab2; + int it, i, c; + int do_speed = 0; + int do_mdct = 0; + int do_inverse = 0; + FFTContext s1, *s = &s1; + MDCTContext m1, *m = &m1; + int fft_nbits, fft_size; + + mm_flags = 0; + fft_nbits = 9; + for(;;) { + c = getopt(argc, argv, "hsimn:"); + if (c == -1) + break; + switch(c) { + case 'h': + help(); + break; + case 's': + do_speed = 1; + break; + case 'i': + do_inverse = 1; + break; + case 'm': + do_mdct = 1; + break; + case 'n': + fft_nbits = atoi(optarg); + break; + } + } + + fft_size = 1 << fft_nbits; + tab = av_malloc(fft_size * sizeof(FFTComplex)); + tab1 = av_malloc(fft_size * sizeof(FFTComplex)); + tab_ref = av_malloc(fft_size * sizeof(FFTComplex)); + tabtmp = av_malloc(fft_size / 2 * sizeof(FFTSample)); + tab2 = av_malloc(fft_size * sizeof(FFTSample)); + + if (do_mdct) { + if (do_inverse) + printf("IMDCT"); + else + printf("MDCT"); + mdct_init(m, fft_nbits, do_inverse); + } else { + if (do_inverse) + printf("IFFT"); + else + printf("FFT"); + fft_init(s, fft_nbits, do_inverse); + fft_ref_init(fft_nbits, do_inverse); + } + printf(" %d test\n", fft_size); + + /* generate random data */ + + for(i=0;i<fft_size;i++) { + tab1[i].re = frandom(); + tab1[i].im = frandom(); + } + + /* checking result */ + printf("Checking...\n"); + + if (do_mdct) { + if (do_inverse) { + imdct_ref((float *)tab_ref, (float *)tab1, fft_size); + imdct_calc(m, tab2, (float *)tab1, tabtmp); + check_diff((float *)tab_ref, tab2, fft_size); + } else { + mdct_ref((float *)tab_ref, (float *)tab1, fft_size); + + mdct_calc(m, tab2, (float *)tab1, tabtmp); + + check_diff((float *)tab_ref, tab2, fft_size / 2); + } + } else { + memcpy(tab, tab1, fft_size * sizeof(FFTComplex)); + fft_permute(s, tab); + fft_calc(s, tab); + + fft_ref(tab_ref, tab1, fft_nbits); + check_diff((float *)tab_ref, (float *)tab, fft_size * 2); + } + + /* do a speed test */ + + if (do_speed) { + INT64 time_start, duration; + int nb_its; + + printf("Speed test...\n"); + /* we measure during about 1 seconds */ + nb_its = 1; + for(;;) { + time_start = gettime(); + for(it=0;it<nb_its;it++) { + if (do_mdct) { + if (do_inverse) { + imdct_calc(m, (float *)tab, (float *)tab1, tabtmp); + } else { + mdct_calc(m, (float *)tab, (float *)tab1, tabtmp); + } + } else { + memcpy(tab, tab1, fft_size * sizeof(FFTComplex)); + fft_calc(s, tab); + } + } + duration = gettime() - time_start; + if (duration >= 1000000) + break; + nb_its *= 2; + } + printf("time: %0.1f us/transform [total time=%0.2f s its=%d]\n", + (double)duration / nb_its, + (double)duration / 1000000.0, + nb_its); + } + + if (do_mdct) { + mdct_end(m); + } else { + fft_end(s); + } + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft.c Mon Oct 28 00:34:08 2002 +0000 @@ -0,0 +1,229 @@ +/* + * FFT/IFFT transforms + * Copyright (c) 2002 Fabrice Bellard. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "dsputil.h" + +/** + * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is + * done + */ +int fft_init(FFTContext *s, int nbits, int inverse) +{ + int i, j, m, n; + float alpha, c1, s1, s2; + + s->nbits = nbits; + n = 1 << nbits; + + s->exptab = av_malloc((n / 2) * sizeof(FFTComplex)); + if (!s->exptab) + goto fail; + s->revtab = av_malloc(n * sizeof(uint16_t)); + if (!s->revtab) + goto fail; + s->inverse = inverse; + + s2 = inverse ? 1.0 : -1.0; + + for(i=0;i<(n/2);i++) { + alpha = 2 * M_PI * (float)i / (float)n; + c1 = cos(alpha); + s1 = sin(alpha) * s2; + s->exptab[i].re = c1; + s->exptab[i].im = s1; + } + s->fft_calc = fft_calc_c; + s->exptab1 = NULL; + + /* compute constant table for HAVE_SSE version */ +#if defined(HAVE_MMX) && 0 + if (mm_flags & MM_SSE) { + int np, nblocks, np2, l; + FFTComplex *q; + + np = 1 << nbits; + nblocks = np >> 3; + np2 = np >> 1; + s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex)); + if (!s->exptab1) + goto fail; + q = s->exptab1; + do { + for(l = 0; l < np2; l += 2 * nblocks) { + *q++ = s->exptab[l]; + *q++ = s->exptab[l + nblocks]; + + q->re = -s->exptab[l].im; + q->im = s->exptab[l].re; + q++; + q->re = -s->exptab[l + nblocks].im; + q->im = s->exptab[l + nblocks].re; + q++; + } + nblocks = nblocks >> 1; + } while (nblocks != 0); + av_freep(&s->exptab); + } +#endif + + /* compute bit reverse table */ + + for(i=0;i<n;i++) { + m=0; + for(j=0;j<nbits;j++) { + m |= ((i >> j) & 1) << (nbits-j-1); + } + s->revtab[i]=m; + } + return 0; + fail: + av_freep(&s->revtab); + av_freep(&s->exptab); + av_freep(&s->exptab1); + return -1; +} + +/* butter fly op */ +#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ +{\ + FFTSample ax, ay, bx, by;\ + bx=pre1;\ + by=pim1;\ + ax=qre1;\ + ay=qim1;\ + pre = (bx + ax);\ + pim = (by + ay);\ + qre = (bx - ax);\ + qim = (by - ay);\ +} + +#define MUL16(a,b) ((a) * (b)) + +#define CMUL(pre, pim, are, aim, bre, bim) \ +{\ + pre = (MUL16(are, bre) - MUL16(aim, bim));\ + pim = (MUL16(are, bim) + MUL16(bre, aim));\ +} + +/** + * Do a complex FFT with the parameters defined in fft_init(). The + * input data must be permuted before with s->revtab table. No + * 1.0/sqrt(n) normalization is done. + */ +void fft_calc_c(FFTContext *s, FFTComplex *z) +{ + int ln = s->nbits; + int j, np, np2; + int nblocks, nloops; + register FFTComplex *p, *q; + FFTComplex *exptab = s->exptab; + int l; + FFTSample tmp_re, tmp_im; + + np = 1 << ln; + + /* pass 0 */ + + p=&z[0]; + j=(np >> 1); + do { + BF(p[0].re, p[0].im, p[1].re, p[1].im, + p[0].re, p[0].im, p[1].re, p[1].im); + p+=2; + } while (--j != 0); + + /* pass 1 */ + + + p=&z[0]; + j=np >> 2; + if (s->inverse) { + do { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, -p[3].im, p[3].re); + p+=4; + } while (--j != 0); + } else { + do { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, p[3].im, -p[3].re); + p+=4; + } while (--j != 0); + } + /* pass 2 .. ln-1 */ + + nblocks = np >> 3; + nloops = 1 << 2; + np2 = np >> 1; + do { + p = z; + q = z + nloops; + for (j = 0; j < nblocks; ++j) { + BF(p->re, p->im, q->re, q->im, + p->re, p->im, q->re, q->im); + + p++; + q++; + for(l = nblocks; l < np2; l += nblocks) { + CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); + BF(p->re, p->im, q->re, q->im, + p->re, p->im, tmp_re, tmp_im); + p++; + q++; + } + + p += nloops; + q += nloops; + } + nblocks = nblocks >> 1; + nloops = nloops << 1; + } while (nblocks != 0); +} + +/** + * Do the permutation needed BEFORE calling fft_calc() + */ +void fft_permute(FFTContext *s, FFTComplex *z) +{ + int j, k, np; + FFTComplex tmp; + const uint16_t *revtab = s->revtab; + + /* reverse */ + np = 1 << s->nbits; + for(j=0;j<np;j++) { + k = revtab[j]; + if (k < j) { + tmp = z[k]; + z[k] = z[j]; + z[j] = tmp; + } + } +} + +void fft_end(FFTContext *s) +{ + av_freep(&s->revtab); + av_freep(&s->exptab); + av_freep(&s->exptab1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/i386/fft_sse.c Mon Oct 28 00:34:08 2002 +0000 @@ -0,0 +1,128 @@ +/* + * FFT/MDCT transform with SSE optimizations + * Copyright (c) 2002 Fabrice Bellard. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "../dsputil.h" +#include <math.h> + +#include <xmmintrin.h> + +static const float p1p1p1m1[4] __attribute__((aligned(16))) = + { 1.0, 1.0, 1.0, -1.0 }; + +static const float p1p1m1m1[4] __attribute__((aligned(16))) = + { 1.0, 1.0, -1.0, -1.0 }; + +#if 0 +static void print_v4sf(const char *str, __m128 a) +{ + float *p = (float *)&a; + printf("%s: %f %f %f %f\n", + str, p[0], p[1], p[2], p[3]); +} +#endif + +/* XXX: handle reverse case */ +void fft_calc_sse(FFTContext *s, FFTComplex *z) +{ + int ln = s->nbits; + int j, np, np2; + int nblocks, nloops; + register FFTComplex *p, *q; + FFTComplex *cptr, *cptr1; + int k; + + np = 1 << ln; + + { + __m128 *r, a, b, a1, c1, c2; + + r = (__m128 *)&z[0]; + c1 = *(__m128 *)p1p1m1m1; + c2 = *(__m128 *)p1p1p1m1; + j = (np >> 2); + do { + a = r[0]; + b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); + a = _mm_mul_ps(a, c1); + /* do the pass 0 butterfly */ + a = _mm_add_ps(a, b); + + a1 = r[1]; + b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); + a1 = _mm_mul_ps(a1, c1); + /* do the pass 0 butterfly */ + b = _mm_add_ps(a1, b); + + /* multiply third by -i */ + b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); + b = _mm_mul_ps(b, c2); + + /* do the pass 1 butterfly */ + r[0] = _mm_add_ps(a, b); + r[1] = _mm_sub_ps(a, b); + r += 2; + } while (--j != 0); + } + /* pass 2 .. ln-1 */ + + nblocks = np >> 3; + nloops = 1 << 2; + np2 = np >> 1; + + cptr1 = s->exptab1; + do { + p = z; + q = z + nloops; + j = nblocks; + do { + cptr = cptr1; + k = nloops >> 1; + do { + __m128 a, b, c, t1, t2; + + a = *(__m128 *)p; + b = *(__m128 *)q; + + /* complex mul */ + c = *(__m128 *)cptr; + /* cre*re cim*re */ + t1 = _mm_mul_ps(c, + _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); + c = *(__m128 *)(cptr + 2); + /* -cim*im cre*im */ + t2 = _mm_mul_ps(c, + _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); + b = _mm_add_ps(t1, t2); + + /* butterfly */ + *(__m128 *)p = _mm_add_ps(a, b); + *(__m128 *)q = _mm_sub_ps(a, b); + + p += 2; + q += 2; + cptr += 4; + } while (--k); + + p += nloops; + q += nloops; + } while (--j); + cptr1 += nloops * 2; + nblocks = nblocks >> 1; + nloops = nloops << 1; + } while (nblocks != 0); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mdct.c Mon Oct 28 00:34:08 2002 +0000 @@ -0,0 +1,170 @@ +/* + * MDCT/IMDCT transforms + * Copyright (c) 2002 Fabrice Bellard. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "dsputil.h" + +/* + * init MDCT or IMDCT computation + */ +int mdct_init(MDCTContext *s, int nbits, int inverse) +{ + int n, n4, i; + float alpha; + + memset(s, 0, sizeof(*s)); + n = 1 << nbits; + s->nbits = nbits; + s->n = n; + n4 = n >> 2; + s->tcos = malloc(n4 * sizeof(FFTSample)); + if (!s->tcos) + goto fail; + s->tsin = malloc(n4 * sizeof(FFTSample)); + if (!s->tsin) + goto fail; + + for(i=0;i<n4;i++) { + alpha = 2 * M_PI * (i + 1.0 / 8.0) / n; + s->tcos[i] = -cos(alpha); + s->tsin[i] = -sin(alpha); + } + if (fft_init(&s->fft, s->nbits - 2, inverse) < 0) + goto fail; + return 0; + fail: + av_freep(&s->tcos); + av_freep(&s->tsin); + return -1; +} + +/* complex multiplication: p = a * b */ +#define CMUL(pre, pim, are, aim, bre, bim) \ +{\ + float _are = (are);\ + float _aim = (aim);\ + float _bre = (bre);\ + float _bim = (bim);\ + (pre) = _are * _bre - _aim * _bim;\ + (pim) = _are * _bim + _aim * _bre;\ +} + +/** + * Compute inverse MDCT of size N = 2^nbits + * @param output N samples + * @param input N/2 samples + * @param tmp N/2 samples + */ +void imdct_calc(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + int k, n8, n4, n2, n, j; + const uint16_t *revtab = s->fft.revtab; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + const FFTSample *in1, *in2; + FFTComplex *z = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n2 = n >> 1; + n4 = n >> 2; + n8 = n >> 3; + + /* pre rotation */ + in1 = input; + in2 = input + n2 - 1; + for(k = 0; k < n4; k++) { + j=revtab[k]; + CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]); + in1 += 2; + in2 -= 2; + } + fft_calc(&s->fft, z); + + /* post rotation + reordering */ + /* XXX: optimize */ + for(k = 0; k < n4; k++) { + CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]); + } + for(k = 0; k < n8; k++) { + output[2*k] = -z[n8 + k].im; + output[n2-1-2*k] = z[n8 + k].im; + + output[2*k+1] = z[n8-1-k].re; + output[n2-1-2*k-1] = -z[n8-1-k].re; + + output[n2 + 2*k]=-z[k+n8].re; + output[n-1- 2*k]=-z[k+n8].re; + + output[n2 + 2*k+1]=z[n8-k-1].im; + output[n-2 - 2 * k] = z[n8-k-1].im; + } +} + +/** + * Compute MDCT of size N = 2^nbits + * @param input N samples + * @param out N/2 samples + * @param tmp temporary storage of N/2 samples + */ +void mdct_calc(MDCTContext *s, FFTSample *out, + const FFTSample *input, FFTSample *tmp) +{ + int i, j, n, n8, n4, n2, n3; + FFTSample re, im, re1, im1; + const uint16_t *revtab = s->fft.revtab; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + FFTComplex *x = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n2 = n >> 1; + n4 = n >> 2; + n8 = n >> 3; + n3 = 3 * n4; + + /* pre rotation */ + for(i=0;i<n8;i++) { + re = -input[2*i+3*n4] - input[n3-1-2*i]; + im = -input[n4+2*i] + input[n4-1-2*i]; + j = revtab[i]; + CMUL(x[j].re, x[j].im, re, im, -tcos[i], tsin[i]); + + re = input[2*i] - input[n2-1-2*i]; + im = -(input[n2+2*i] + input[n-1-2*i]); + j = revtab[n8 + i]; + CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]); + } + + fft_calc(&s->fft, x); + + /* post rotation */ + for(i=0;i<n4;i++) { + re = x[i].re; + im = x[i].im; + CMUL(re1, im1, re, im, -tsin[i], -tcos[i]); + out[2*i] = im1; + out[n2-1-2*i] = re1; + } +} + +void mdct_end(MDCTContext *s) +{ + av_freep(&s->tcos); + av_freep(&s->tsin); + fft_end(&s->fft); +}