# HG changeset patch
# User bellard
# Date 1035765248 0
# Node ID 6f5e87957bcb2d16694a2a9ad24876acd566c3d5
# Parent  a48bb8bc63ddf550730eedfc52dadc3236ac6186
new generic FFT/MDCT code for audio codecs

diff -r a48bb8bc63dd -r 6f5e87957bcb dsputil.h
--- a/dsputil.h	Sun Oct 27 21:02:47 2002 +0000
+++ b/dsputil.h	Mon Oct 28 00:34:08 2002 +0000
@@ -221,5 +221,52 @@
 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
               int orig_linesize[3], int coded_linesize,
               AVCodecContext *avctx);
-              
+
+/* FFT computation */
+
+/* NOTE: soon integer code will be added, so you must use the
+   FFTSample type */
+typedef float FFTSample;
+
+typedef struct FFTComplex {
+    FFTSample re, im;
+} FFTComplex;
+
+typedef struct FFTContext {
+    int nbits;
+    int inverse;
+    uint16_t *revtab;
+    FFTComplex *exptab;
+    FFTComplex *exptab1; /* only used by SSE code */
+    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
+} FFTContext;
+
+int fft_init(FFTContext *s, int nbits, int inverse);
+void fft_permute(FFTContext *s, FFTComplex *z);
+void fft_calc_c(FFTContext *s, FFTComplex *z);
+void fft_calc_sse(FFTContext *s, FFTComplex *z);
+static inline void fft_calc(FFTContext *s, FFTComplex *z)
+{
+    s->fft_calc(s, z);
+}
+void fft_end(FFTContext *s);
+
+/* MDCT computation */
+
+typedef struct MDCTContext {
+    int n;  /* size of MDCT (i.e. number of input data * 2) */
+    int nbits; /* n = 2^nbits */
+    /* pre/post rotation tables */
+    FFTSample *tcos;
+    FFTSample *tsin;
+    FFTContext fft;
+} MDCTContext;
+
+int mdct_init(MDCTContext *s, int nbits, int inverse);
+void imdct_calc(MDCTContext *s, FFTSample *output, 
+                const FFTSample *input, FFTSample *tmp);
+void mdct_calc(MDCTContext *s, FFTSample *out, 
+               const FFTSample *input, FFTSample *tmp);
+void mdct_end(MDCTContext *s);
+
 #endif
diff -r a48bb8bc63dd -r 6f5e87957bcb fft-test.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft-test.c	Mon Oct 28 00:34:08 2002 +0000
@@ -0,0 +1,294 @@
+/* FFT and MDCT tests */
+#include "dsputil.h"
+#include <math.h>
+#include <getopt.h>
+#include <sys/time.h>
+
+int mm_flags;
+
+void *av_malloc(int size)
+{
+    void *ptr;
+    ptr = malloc(size);
+    return ptr;
+}
+
+void av_free(void *ptr)
+{
+    /* XXX: this test should not be needed on most libcs */
+    if (ptr)
+        free(ptr);
+}
+
+/* cannot call it directly because of 'void **' casting is not automatic */
+void __av_freep(void **ptr)
+{
+    av_free(*ptr);
+    *ptr = NULL;
+}
+
+/* reference fft */
+
+#define MUL16(a,b) ((a) * (b))
+
+#define CMAC(pre, pim, are, aim, bre, bim) \
+{\
+   pre += (MUL16(are, bre) - MUL16(aim, bim));\
+   pim += (MUL16(are, bim) + MUL16(bre, aim));\
+}
+
+FFTComplex *exptab;
+
+void fft_ref_init(int nbits, int inverse)
+{
+    int n, i;
+    float c1, s1, alpha;
+
+    n = 1 << nbits;
+    exptab = av_malloc((n / 2) * sizeof(FFTComplex));
+
+    for(i=0;i<(n/2);i++) {
+        alpha = 2 * M_PI * (float)i / (float)n;
+        c1 = cos(alpha);
+        s1 = sin(alpha);
+        if (!inverse)
+            s1 = -s1;
+        exptab[i].re = c1;
+        exptab[i].im = s1;
+    }
+}
+
+void fft_ref(FFTComplex *tabr, FFTComplex *tab, int nbits)
+{
+    int n, i, j, k, n2;
+    float tmp_re, tmp_im, s, c;
+    FFTComplex *q;
+
+    n = 1 << nbits;
+    n2 = n >> 1;
+    for(i=0;i<n;i++) {
+        tmp_re = 0;
+        tmp_im = 0;
+        q = tab;
+        for(j=0;j<n;j++) {
+            k = (i * j) & (n - 1);
+            if (k >= n2) {
+                c = -exptab[k - n2].re;
+                s = -exptab[k - n2].im;
+            } else {
+                c = exptab[k].re;
+                s = exptab[k].im;
+            }
+            CMAC(tmp_re, tmp_im, c, s, q->re, q->im);
+            q++;
+        }
+        tabr[i].re = tmp_re;
+        tabr[i].im = tmp_im;
+    }
+}
+
+void imdct_ref(float *out, float *in, int n)
+{
+    int k, i, a;
+    float sum, f;
+
+    for(i=0;i<n;i++) {
+        sum = 0;
+        for(k=0;k<n/2;k++) {
+            a = (2 * i + 1 + (n / 2)) * (2 * k + 1);
+            f = cos(M_PI * a / (double)(2 * n));
+            sum += f * in[k];
+        }
+        out[i] = -sum;
+    }
+}
+
+/* NOTE: no normalisation by 1 / N is done */
+void mdct_ref(float *output, float *input, int n)
+{
+    int k, i;
+    float a, s;
+
+    /* do it by hand */
+    for(k=0;k<n/2;k++) {
+        s = 0;
+        for(i=0;i<n;i++) {
+            a = (2*M_PI*(2*i+1+n/2)*(2*k+1) / (4 * n));
+            s += input[i] * cos(a);
+        }
+        output[k] = s;
+    }
+}
+
+
+float frandom(void)
+{
+    return (float)((random() & 0xffff) - 32768) / 32768.0;
+}
+
+INT64 gettime(void)
+{
+    struct timeval tv;
+    gettimeofday(&tv,NULL);
+    return (INT64)tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+void check_diff(float *tab1, float *tab2, int n)
+{
+    int i;
+
+    for(i=0;i<n;i++) {
+        if (fabsf(tab1[i] - tab2[i]) >= 1e-3) {
+            printf("ERROR %d: %f %f\n", 
+                   i, tab1[i], tab2[i]);
+        }
+    }
+}
+
+
+void help(void)
+{
+    printf("usage: fft-test [-h] [-s] [-i] [-n b]\n"
+           "-h     print this help\n"
+           "-s     speed test\n"
+           "-m     (I)MDCT test\n"
+           "-i     inverse transform test\n"
+           "-n b   set the transform size to 2^b\n"
+           );
+    exit(1);
+}
+
+
+
+int main(int argc, char **argv)
+{
+    FFTComplex *tab, *tab1, *tab_ref;
+    FFTSample *tabtmp, *tab2;
+    int it, i, c;
+    int do_speed = 0;
+    int do_mdct = 0;
+    int do_inverse = 0;
+    FFTContext s1, *s = &s1;
+    MDCTContext m1, *m = &m1;
+    int fft_nbits, fft_size;
+
+    mm_flags = 0;
+    fft_nbits = 9;
+    for(;;) {
+        c = getopt(argc, argv, "hsimn:");
+        if (c == -1)
+            break;
+        switch(c) {
+        case 'h':
+            help();
+            break;
+        case 's':
+            do_speed = 1;
+            break;
+        case 'i':
+            do_inverse = 1;
+            break;
+        case 'm':
+            do_mdct = 1;
+            break;
+        case 'n':
+            fft_nbits = atoi(optarg);
+            break;
+        }
+    }
+
+    fft_size = 1 << fft_nbits;
+    tab = av_malloc(fft_size * sizeof(FFTComplex));
+    tab1 = av_malloc(fft_size * sizeof(FFTComplex));
+    tab_ref = av_malloc(fft_size * sizeof(FFTComplex));
+    tabtmp = av_malloc(fft_size / 2 * sizeof(FFTSample));
+    tab2 = av_malloc(fft_size * sizeof(FFTSample));
+
+    if (do_mdct) {
+        if (do_inverse)
+            printf("IMDCT");
+        else
+            printf("MDCT");
+        mdct_init(m, fft_nbits, do_inverse);
+    } else {
+        if (do_inverse)
+            printf("IFFT");
+        else
+            printf("FFT");
+        fft_init(s, fft_nbits, do_inverse);
+        fft_ref_init(fft_nbits, do_inverse);
+    }
+    printf(" %d test\n", fft_size);
+
+    /* generate random data */
+
+    for(i=0;i<fft_size;i++) {
+        tab1[i].re = frandom();
+        tab1[i].im = frandom();
+    }
+
+    /* checking result */
+    printf("Checking...\n");
+
+    if (do_mdct) {
+        if (do_inverse) {
+            imdct_ref((float *)tab_ref, (float *)tab1, fft_size);
+            imdct_calc(m, tab2, (float *)tab1, tabtmp);
+            check_diff((float *)tab_ref, tab2, fft_size);
+        } else {
+            mdct_ref((float *)tab_ref, (float *)tab1, fft_size);
+            
+            mdct_calc(m, tab2, (float *)tab1, tabtmp);
+
+            check_diff((float *)tab_ref, tab2, fft_size / 2);
+        }
+    } else {
+        memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
+        fft_permute(s, tab);
+        fft_calc(s, tab);
+        
+        fft_ref(tab_ref, tab1, fft_nbits);
+        check_diff((float *)tab_ref, (float *)tab, fft_size * 2);
+    }
+
+    /* do a speed test */
+
+    if (do_speed) {
+        INT64 time_start, duration;
+        int nb_its;
+
+        printf("Speed test...\n");
+        /* we measure during about 1 seconds */
+        nb_its = 1;
+        for(;;) {
+            time_start = gettime();
+            for(it=0;it<nb_its;it++) {
+                if (do_mdct) {
+                    if (do_inverse) {
+                        imdct_calc(m, (float *)tab, (float *)tab1, tabtmp);
+                    } else {
+                        mdct_calc(m, (float *)tab, (float *)tab1, tabtmp);
+                    }
+                } else {
+                    memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
+                    fft_calc(s, tab);
+                }
+            }
+            duration = gettime() - time_start;
+            if (duration >= 1000000)
+                break;
+            nb_its *= 2;
+        }
+        printf("time: %0.1f us/transform [total time=%0.2f s its=%d]\n", 
+               (double)duration / nb_its, 
+               (double)duration / 1000000.0,
+               nb_its);
+    }
+    
+    if (do_mdct) {
+        mdct_end(m);
+    } else {
+        fft_end(s);
+    }
+    return 0;
+}
diff -r a48bb8bc63dd -r 6f5e87957bcb fft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft.c	Mon Oct 28 00:34:08 2002 +0000
@@ -0,0 +1,229 @@
+/*
+ * FFT/IFFT transforms
+ * Copyright (c) 2002 Fabrice Bellard.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "dsputil.h"
+
+/**
+ * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is
+ * done 
+ */
+int fft_init(FFTContext *s, int nbits, int inverse)
+{
+    int i, j, m, n;
+    float alpha, c1, s1, s2;
+    
+    s->nbits = nbits;
+    n = 1 << nbits;
+
+    s->exptab = av_malloc((n / 2) * sizeof(FFTComplex));
+    if (!s->exptab)
+        goto fail;
+    s->revtab = av_malloc(n * sizeof(uint16_t));
+    if (!s->revtab)
+        goto fail;
+    s->inverse = inverse;
+
+    s2 = inverse ? 1.0 : -1.0;
+        
+    for(i=0;i<(n/2);i++) {
+        alpha = 2 * M_PI * (float)i / (float)n;
+        c1 = cos(alpha);
+        s1 = sin(alpha) * s2;
+        s->exptab[i].re = c1;
+        s->exptab[i].im = s1;
+    }
+    s->fft_calc = fft_calc_c;
+    s->exptab1 = NULL;
+
+    /* compute constant table for HAVE_SSE version */
+#if defined(HAVE_MMX) && 0
+    if (mm_flags & MM_SSE) {
+        int np, nblocks, np2, l;
+        FFTComplex *q;
+
+        np = 1 << nbits;
+        nblocks = np >> 3;
+        np2 = np >> 1;
+        s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex));
+        if (!s->exptab1)
+            goto fail;
+        q = s->exptab1;
+        do {
+            for(l = 0; l < np2; l += 2 * nblocks) {
+                *q++ = s->exptab[l];
+                *q++ = s->exptab[l + nblocks];
+
+                q->re = -s->exptab[l].im;
+                q->im = s->exptab[l].re;
+                q++;
+                q->re = -s->exptab[l + nblocks].im;
+                q->im = s->exptab[l + nblocks].re;
+                q++;
+            }
+            nblocks = nblocks >> 1;
+        } while (nblocks != 0);
+        av_freep(&s->exptab);
+    }
+#endif
+
+    /* compute bit reverse table */
+
+    for(i=0;i<n;i++) {
+        m=0;
+        for(j=0;j<nbits;j++) {
+            m |= ((i >> j) & 1) << (nbits-j-1);
+        }
+        s->revtab[i]=m;
+    }
+    return 0;
+ fail:
+    av_freep(&s->revtab);
+    av_freep(&s->exptab);
+    av_freep(&s->exptab1);
+    return -1;
+}
+
+/* butter fly op */
+#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
+{\
+  FFTSample ax, ay, bx, by;\
+  bx=pre1;\
+  by=pim1;\
+  ax=qre1;\
+  ay=qim1;\
+  pre = (bx + ax);\
+  pim = (by + ay);\
+  qre = (bx - ax);\
+  qim = (by - ay);\
+}
+
+#define MUL16(a,b) ((a) * (b))
+
+#define CMUL(pre, pim, are, aim, bre, bim) \
+{\
+   pre = (MUL16(are, bre) - MUL16(aim, bim));\
+   pim = (MUL16(are, bim) + MUL16(bre, aim));\
+}
+
+/**
+ * Do a complex FFT with the parameters defined in fft_init(). The
+ * input data must be permuted before with s->revtab table. No
+ * 1.0/sqrt(n) normalization is done.  
+ */
+void fft_calc_c(FFTContext *s, FFTComplex *z)
+{
+    int ln = s->nbits;
+    int	j, np, np2;
+    int	nblocks, nloops;
+    register FFTComplex *p, *q;
+    FFTComplex *exptab = s->exptab;
+    int l;
+    FFTSample tmp_re, tmp_im;
+
+    np = 1 << ln;
+
+    /* pass 0 */
+
+    p=&z[0];
+    j=(np >> 1);
+    do {
+        BF(p[0].re, p[0].im, p[1].re, p[1].im, 
+           p[0].re, p[0].im, p[1].re, p[1].im);
+        p+=2;
+    } while (--j != 0);
+
+    /* pass 1 */
+
+    
+    p=&z[0];
+    j=np >> 2;
+    if (s->inverse) {
+        do {
+            BF(p[0].re, p[0].im, p[2].re, p[2].im, 
+               p[0].re, p[0].im, p[2].re, p[2].im);
+            BF(p[1].re, p[1].im, p[3].re, p[3].im, 
+               p[1].re, p[1].im, -p[3].im, p[3].re);
+            p+=4;
+        } while (--j != 0);
+    } else {
+        do {
+            BF(p[0].re, p[0].im, p[2].re, p[2].im, 
+               p[0].re, p[0].im, p[2].re, p[2].im);
+            BF(p[1].re, p[1].im, p[3].re, p[3].im, 
+               p[1].re, p[1].im, p[3].im, -p[3].re);
+            p+=4;
+        } while (--j != 0);
+    }
+    /* pass 2 .. ln-1 */
+
+    nblocks = np >> 3;
+    nloops = 1 << 2;
+    np2 = np >> 1;
+    do {
+        p = z;
+        q = z + nloops;
+        for (j = 0; j < nblocks; ++j) {
+            BF(p->re, p->im, q->re, q->im,
+               p->re, p->im, q->re, q->im);
+            
+            p++;
+            q++;
+            for(l = nblocks; l < np2; l += nblocks) {
+                CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
+                BF(p->re, p->im, q->re, q->im,
+                   p->re, p->im, tmp_re, tmp_im);
+                p++;
+                q++;
+            }
+
+            p += nloops;
+            q += nloops;
+        }
+        nblocks = nblocks >> 1;
+        nloops = nloops << 1;
+    } while (nblocks != 0);
+}
+
+/**
+ * Do the permutation needed BEFORE calling fft_calc()
+ */
+void fft_permute(FFTContext *s, FFTComplex *z)
+{
+    int j, k, np;
+    FFTComplex tmp;
+    const uint16_t *revtab = s->revtab;
+    
+    /* reverse */
+    np = 1 << s->nbits;
+    for(j=0;j<np;j++) {
+        k = revtab[j];
+        if (k < j) {
+            tmp = z[k];
+            z[k] = z[j];
+            z[j] = tmp;
+        }
+    }
+}
+
+void fft_end(FFTContext *s)
+{
+    av_freep(&s->revtab);
+    av_freep(&s->exptab);
+    av_freep(&s->exptab1);
+}
+
diff -r a48bb8bc63dd -r 6f5e87957bcb i386/fft_sse.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/i386/fft_sse.c	Mon Oct 28 00:34:08 2002 +0000
@@ -0,0 +1,128 @@
+/*
+ * FFT/MDCT transform with SSE optimizations
+ * Copyright (c) 2002 Fabrice Bellard.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "../dsputil.h"
+#include <math.h>
+
+#include <xmmintrin.h>
+
+static const float p1p1p1m1[4] __attribute__((aligned(16))) = 
+    { 1.0, 1.0, 1.0, -1.0 };
+
+static const float p1p1m1m1[4] __attribute__((aligned(16))) = 
+    { 1.0, 1.0, -1.0, -1.0 };
+
+#if 0
+static void print_v4sf(const char *str, __m128 a)
+{
+    float *p = (float *)&a;
+    printf("%s: %f %f %f %f\n",
+           str, p[0], p[1], p[2], p[3]);
+}
+#endif
+
+/* XXX: handle reverse case */
+void fft_calc_sse(FFTContext *s, FFTComplex *z)
+{
+    int ln = s->nbits;
+    int	j, np, np2;
+    int	nblocks, nloops;
+    register FFTComplex *p, *q;
+    FFTComplex *cptr, *cptr1;
+    int k;
+
+    np = 1 << ln;
+
+    {
+        __m128 *r, a, b, a1, c1, c2;
+
+        r = (__m128 *)&z[0];
+        c1 = *(__m128 *)p1p1m1m1;
+        c2 = *(__m128 *)p1p1p1m1;
+        j = (np >> 2);
+        do {
+            a = r[0];
+            b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
+            a = _mm_mul_ps(a, c1);
+            /* do the pass 0 butterfly */
+            a = _mm_add_ps(a, b);
+
+            a1 = r[1];
+            b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2));
+            a1 = _mm_mul_ps(a1, c1);
+            /* do the pass 0 butterfly */
+            b = _mm_add_ps(a1, b);
+
+            /* multiply third by -i */
+            b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0));
+            b = _mm_mul_ps(b, c2);
+
+            /* do the pass 1 butterfly */
+            r[0] = _mm_add_ps(a, b);
+            r[1] = _mm_sub_ps(a, b);
+            r += 2;
+        } while (--j != 0);
+    }
+    /* pass 2 .. ln-1 */
+
+    nblocks = np >> 3;
+    nloops = 1 << 2;
+    np2 = np >> 1;
+
+    cptr1 = s->exptab1;
+    do {
+        p = z;
+        q = z + nloops;
+        j = nblocks;
+        do {
+            cptr = cptr1;
+            k = nloops >> 1;
+            do {
+                __m128 a, b, c, t1, t2;
+
+                a = *(__m128 *)p;
+                b = *(__m128 *)q;
+                
+                /* complex mul */
+                c = *(__m128 *)cptr;
+                /*  cre*re cim*re */
+                t1 = _mm_mul_ps(c, 
+                                _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); 
+                c = *(__m128 *)(cptr + 2);
+                /*  -cim*im cre*im */
+                t2 = _mm_mul_ps(c,
+                                _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); 
+                b = _mm_add_ps(t1, t2);
+                
+                /* butterfly */
+                *(__m128 *)p = _mm_add_ps(a, b);
+                *(__m128 *)q = _mm_sub_ps(a, b);
+                
+                p += 2;
+                q += 2;
+                cptr += 4;
+            } while (--k);
+        
+            p += nloops;
+            q += nloops;
+        } while (--j);
+        cptr1 += nloops * 2;
+        nblocks = nblocks >> 1;
+        nloops = nloops << 1;
+    } while (nblocks != 0);
+}
diff -r a48bb8bc63dd -r 6f5e87957bcb mdct.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mdct.c	Mon Oct 28 00:34:08 2002 +0000
@@ -0,0 +1,170 @@
+/*
+ * MDCT/IMDCT transforms
+ * Copyright (c) 2002 Fabrice Bellard.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "dsputil.h"
+
+/*
+ * init MDCT or IMDCT computation
+ */
+int mdct_init(MDCTContext *s, int nbits, int inverse)
+{
+    int n, n4, i;
+    float alpha;
+
+    memset(s, 0, sizeof(*s));
+    n = 1 << nbits;
+    s->nbits = nbits;
+    s->n = n;
+    n4 = n >> 2;
+    s->tcos = malloc(n4 * sizeof(FFTSample));
+    if (!s->tcos)
+        goto fail;
+    s->tsin = malloc(n4 * sizeof(FFTSample));
+    if (!s->tsin)
+        goto fail;
+
+    for(i=0;i<n4;i++) {
+        alpha = 2 * M_PI * (i + 1.0 / 8.0) / n;
+        s->tcos[i] = -cos(alpha);
+        s->tsin[i] = -sin(alpha);
+    }
+    if (fft_init(&s->fft, s->nbits - 2, inverse) < 0)
+        goto fail;
+    return 0;
+ fail:
+    av_freep(&s->tcos);
+    av_freep(&s->tsin);
+    return -1;
+}
+
+/* complex multiplication: p = a * b */
+#define CMUL(pre, pim, are, aim, bre, bim) \
+{\
+    float _are = (are);\
+    float _aim = (aim);\
+    float _bre = (bre);\
+    float _bim = (bim);\
+    (pre) = _are * _bre - _aim * _bim;\
+    (pim) = _are * _bim + _aim * _bre;\
+}
+
+/**
+ * Compute inverse MDCT of size N = 2^nbits
+ * @param output N samples
+ * @param input N/2 samples
+ * @param tmp N/2 samples
+ */
+void imdct_calc(MDCTContext *s, FFTSample *output, 
+                const FFTSample *input, FFTSample *tmp)
+{
+    int k, n8, n4, n2, n, j;
+    const uint16_t *revtab = s->fft.revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation */
+    in1 = input;
+    in2 = input + n2 - 1;
+    for(k = 0; k < n4; k++) {
+        j=revtab[k];
+        CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
+        in1 += 2;
+        in2 -= 2;
+    }
+    fft_calc(&s->fft, z);
+
+    /* post rotation + reordering */
+    /* XXX: optimize */
+    for(k = 0; k < n4; k++) {
+        CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]);
+    }
+    for(k = 0; k < n8; k++) {
+        output[2*k] = -z[n8 + k].im;
+        output[n2-1-2*k] = z[n8 + k].im;
+
+        output[2*k+1] = z[n8-1-k].re;
+        output[n2-1-2*k-1] = -z[n8-1-k].re;
+
+        output[n2 + 2*k]=-z[k+n8].re;
+        output[n-1- 2*k]=-z[k+n8].re;
+
+        output[n2 + 2*k+1]=z[n8-k-1].im;
+        output[n-2 - 2 * k] = z[n8-k-1].im;
+    }
+}
+
+/**
+ * Compute MDCT of size N = 2^nbits
+ * @param input N samples
+ * @param out N/2 samples
+ * @param tmp temporary storage of N/2 samples
+ */
+void mdct_calc(MDCTContext *s, FFTSample *out, 
+               const FFTSample *input, FFTSample *tmp)
+{
+    int i, j, n, n8, n4, n2, n3;
+    FFTSample re, im, re1, im1;
+    const uint16_t *revtab = s->fft.revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    FFTComplex *x = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+    n3 = 3 * n4;
+
+    /* pre rotation */
+    for(i=0;i<n8;i++) {
+        re = -input[2*i+3*n4] - input[n3-1-2*i];
+        im = -input[n4+2*i] + input[n4-1-2*i];
+        j = revtab[i];
+        CMUL(x[j].re, x[j].im, re, im, -tcos[i], tsin[i]);
+
+        re = input[2*i] - input[n2-1-2*i];
+        im = -(input[n2+2*i] + input[n-1-2*i]);
+        j = revtab[n8 + i];
+        CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
+    }
+
+    fft_calc(&s->fft, x);
+  
+    /* post rotation */
+    for(i=0;i<n4;i++) {
+        re = x[i].re;
+        im = x[i].im;
+        CMUL(re1, im1, re, im, -tsin[i], -tcos[i]);
+        out[2*i] = im1;
+        out[n2-1-2*i] = re1;
+    }
+}
+
+void mdct_end(MDCTContext *s)
+{
+    av_freep(&s->tcos);
+    av_freep(&s->tsin);
+    fft_end(&s->fft);
+}