# HG changeset patch # User bellard # Date 1035765248 0 # Node ID 6f5e87957bcb2d16694a2a9ad24876acd566c3d5 # Parent a48bb8bc63ddf550730eedfc52dadc3236ac6186 new generic FFT/MDCT code for audio codecs diff -r a48bb8bc63dd -r 6f5e87957bcb dsputil.h --- a/dsputil.h Sun Oct 27 21:02:47 2002 +0000 +++ b/dsputil.h Mon Oct 28 00:34:08 2002 +0000 @@ -221,5 +221,52 @@ void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3], int orig_linesize[3], int coded_linesize, AVCodecContext *avctx); - + +/* FFT computation */ + +/* NOTE: soon integer code will be added, so you must use the + FFTSample type */ +typedef float FFTSample; + +typedef struct FFTComplex { + FFTSample re, im; +} FFTComplex; + +typedef struct FFTContext { + int nbits; + int inverse; + uint16_t *revtab; + FFTComplex *exptab; + FFTComplex *exptab1; /* only used by SSE code */ + void (*fft_calc)(struct FFTContext *s, FFTComplex *z); +} FFTContext; + +int fft_init(FFTContext *s, int nbits, int inverse); +void fft_permute(FFTContext *s, FFTComplex *z); +void fft_calc_c(FFTContext *s, FFTComplex *z); +void fft_calc_sse(FFTContext *s, FFTComplex *z); +static inline void fft_calc(FFTContext *s, FFTComplex *z) +{ + s->fft_calc(s, z); +} +void fft_end(FFTContext *s); + +/* MDCT computation */ + +typedef struct MDCTContext { + int n; /* size of MDCT (i.e. number of input data * 2) */ + int nbits; /* n = 2^nbits */ + /* pre/post rotation tables */ + FFTSample *tcos; + FFTSample *tsin; + FFTContext fft; +} MDCTContext; + +int mdct_init(MDCTContext *s, int nbits, int inverse); +void imdct_calc(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp); +void mdct_calc(MDCTContext *s, FFTSample *out, + const FFTSample *input, FFTSample *tmp); +void mdct_end(MDCTContext *s); + #endif diff -r a48bb8bc63dd -r 6f5e87957bcb fft-test.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft-test.c Mon Oct 28 00:34:08 2002 +0000 @@ -0,0 +1,294 @@ +/* FFT and MDCT tests */ +#include "dsputil.h" +#include +#include +#include + +int mm_flags; + +void *av_malloc(int size) +{ + void *ptr; + ptr = malloc(size); + return ptr; +} + +void av_free(void *ptr) +{ + /* XXX: this test should not be needed on most libcs */ + if (ptr) + free(ptr); +} + +/* cannot call it directly because of 'void **' casting is not automatic */ +void __av_freep(void **ptr) +{ + av_free(*ptr); + *ptr = NULL; +} + +/* reference fft */ + +#define MUL16(a,b) ((a) * (b)) + +#define CMAC(pre, pim, are, aim, bre, bim) \ +{\ + pre += (MUL16(are, bre) - MUL16(aim, bim));\ + pim += (MUL16(are, bim) + MUL16(bre, aim));\ +} + +FFTComplex *exptab; + +void fft_ref_init(int nbits, int inverse) +{ + int n, i; + float c1, s1, alpha; + + n = 1 << nbits; + exptab = av_malloc((n / 2) * sizeof(FFTComplex)); + + for(i=0;i<(n/2);i++) { + alpha = 2 * M_PI * (float)i / (float)n; + c1 = cos(alpha); + s1 = sin(alpha); + if (!inverse) + s1 = -s1; + exptab[i].re = c1; + exptab[i].im = s1; + } +} + +void fft_ref(FFTComplex *tabr, FFTComplex *tab, int nbits) +{ + int n, i, j, k, n2; + float tmp_re, tmp_im, s, c; + FFTComplex *q; + + n = 1 << nbits; + n2 = n >> 1; + for(i=0;i= n2) { + c = -exptab[k - n2].re; + s = -exptab[k - n2].im; + } else { + c = exptab[k].re; + s = exptab[k].im; + } + CMAC(tmp_re, tmp_im, c, s, q->re, q->im); + q++; + } + tabr[i].re = tmp_re; + tabr[i].im = tmp_im; + } +} + +void imdct_ref(float *out, float *in, int n) +{ + int k, i, a; + float sum, f; + + for(i=0;i= 1e-3) { + printf("ERROR %d: %f %f\n", + i, tab1[i], tab2[i]); + } + } +} + + +void help(void) +{ + printf("usage: fft-test [-h] [-s] [-i] [-n b]\n" + "-h print this help\n" + "-s speed test\n" + "-m (I)MDCT test\n" + "-i inverse transform test\n" + "-n b set the transform size to 2^b\n" + ); + exit(1); +} + + + +int main(int argc, char **argv) +{ + FFTComplex *tab, *tab1, *tab_ref; + FFTSample *tabtmp, *tab2; + int it, i, c; + int do_speed = 0; + int do_mdct = 0; + int do_inverse = 0; + FFTContext s1, *s = &s1; + MDCTContext m1, *m = &m1; + int fft_nbits, fft_size; + + mm_flags = 0; + fft_nbits = 9; + for(;;) { + c = getopt(argc, argv, "hsimn:"); + if (c == -1) + break; + switch(c) { + case 'h': + help(); + break; + case 's': + do_speed = 1; + break; + case 'i': + do_inverse = 1; + break; + case 'm': + do_mdct = 1; + break; + case 'n': + fft_nbits = atoi(optarg); + break; + } + } + + fft_size = 1 << fft_nbits; + tab = av_malloc(fft_size * sizeof(FFTComplex)); + tab1 = av_malloc(fft_size * sizeof(FFTComplex)); + tab_ref = av_malloc(fft_size * sizeof(FFTComplex)); + tabtmp = av_malloc(fft_size / 2 * sizeof(FFTSample)); + tab2 = av_malloc(fft_size * sizeof(FFTSample)); + + if (do_mdct) { + if (do_inverse) + printf("IMDCT"); + else + printf("MDCT"); + mdct_init(m, fft_nbits, do_inverse); + } else { + if (do_inverse) + printf("IFFT"); + else + printf("FFT"); + fft_init(s, fft_nbits, do_inverse); + fft_ref_init(fft_nbits, do_inverse); + } + printf(" %d test\n", fft_size); + + /* generate random data */ + + for(i=0;i= 1000000) + break; + nb_its *= 2; + } + printf("time: %0.1f us/transform [total time=%0.2f s its=%d]\n", + (double)duration / nb_its, + (double)duration / 1000000.0, + nb_its); + } + + if (do_mdct) { + mdct_end(m); + } else { + fft_end(s); + } + return 0; +} diff -r a48bb8bc63dd -r 6f5e87957bcb fft.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft.c Mon Oct 28 00:34:08 2002 +0000 @@ -0,0 +1,229 @@ +/* + * FFT/IFFT transforms + * Copyright (c) 2002 Fabrice Bellard. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "dsputil.h" + +/** + * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is + * done + */ +int fft_init(FFTContext *s, int nbits, int inverse) +{ + int i, j, m, n; + float alpha, c1, s1, s2; + + s->nbits = nbits; + n = 1 << nbits; + + s->exptab = av_malloc((n / 2) * sizeof(FFTComplex)); + if (!s->exptab) + goto fail; + s->revtab = av_malloc(n * sizeof(uint16_t)); + if (!s->revtab) + goto fail; + s->inverse = inverse; + + s2 = inverse ? 1.0 : -1.0; + + for(i=0;i<(n/2);i++) { + alpha = 2 * M_PI * (float)i / (float)n; + c1 = cos(alpha); + s1 = sin(alpha) * s2; + s->exptab[i].re = c1; + s->exptab[i].im = s1; + } + s->fft_calc = fft_calc_c; + s->exptab1 = NULL; + + /* compute constant table for HAVE_SSE version */ +#if defined(HAVE_MMX) && 0 + if (mm_flags & MM_SSE) { + int np, nblocks, np2, l; + FFTComplex *q; + + np = 1 << nbits; + nblocks = np >> 3; + np2 = np >> 1; + s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex)); + if (!s->exptab1) + goto fail; + q = s->exptab1; + do { + for(l = 0; l < np2; l += 2 * nblocks) { + *q++ = s->exptab[l]; + *q++ = s->exptab[l + nblocks]; + + q->re = -s->exptab[l].im; + q->im = s->exptab[l].re; + q++; + q->re = -s->exptab[l + nblocks].im; + q->im = s->exptab[l + nblocks].re; + q++; + } + nblocks = nblocks >> 1; + } while (nblocks != 0); + av_freep(&s->exptab); + } +#endif + + /* compute bit reverse table */ + + for(i=0;i> j) & 1) << (nbits-j-1); + } + s->revtab[i]=m; + } + return 0; + fail: + av_freep(&s->revtab); + av_freep(&s->exptab); + av_freep(&s->exptab1); + return -1; +} + +/* butter fly op */ +#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ +{\ + FFTSample ax, ay, bx, by;\ + bx=pre1;\ + by=pim1;\ + ax=qre1;\ + ay=qim1;\ + pre = (bx + ax);\ + pim = (by + ay);\ + qre = (bx - ax);\ + qim = (by - ay);\ +} + +#define MUL16(a,b) ((a) * (b)) + +#define CMUL(pre, pim, are, aim, bre, bim) \ +{\ + pre = (MUL16(are, bre) - MUL16(aim, bim));\ + pim = (MUL16(are, bim) + MUL16(bre, aim));\ +} + +/** + * Do a complex FFT with the parameters defined in fft_init(). The + * input data must be permuted before with s->revtab table. No + * 1.0/sqrt(n) normalization is done. + */ +void fft_calc_c(FFTContext *s, FFTComplex *z) +{ + int ln = s->nbits; + int j, np, np2; + int nblocks, nloops; + register FFTComplex *p, *q; + FFTComplex *exptab = s->exptab; + int l; + FFTSample tmp_re, tmp_im; + + np = 1 << ln; + + /* pass 0 */ + + p=&z[0]; + j=(np >> 1); + do { + BF(p[0].re, p[0].im, p[1].re, p[1].im, + p[0].re, p[0].im, p[1].re, p[1].im); + p+=2; + } while (--j != 0); + + /* pass 1 */ + + + p=&z[0]; + j=np >> 2; + if (s->inverse) { + do { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, -p[3].im, p[3].re); + p+=4; + } while (--j != 0); + } else { + do { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, p[3].im, -p[3].re); + p+=4; + } while (--j != 0); + } + /* pass 2 .. ln-1 */ + + nblocks = np >> 3; + nloops = 1 << 2; + np2 = np >> 1; + do { + p = z; + q = z + nloops; + for (j = 0; j < nblocks; ++j) { + BF(p->re, p->im, q->re, q->im, + p->re, p->im, q->re, q->im); + + p++; + q++; + for(l = nblocks; l < np2; l += nblocks) { + CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); + BF(p->re, p->im, q->re, q->im, + p->re, p->im, tmp_re, tmp_im); + p++; + q++; + } + + p += nloops; + q += nloops; + } + nblocks = nblocks >> 1; + nloops = nloops << 1; + } while (nblocks != 0); +} + +/** + * Do the permutation needed BEFORE calling fft_calc() + */ +void fft_permute(FFTContext *s, FFTComplex *z) +{ + int j, k, np; + FFTComplex tmp; + const uint16_t *revtab = s->revtab; + + /* reverse */ + np = 1 << s->nbits; + for(j=0;jrevtab); + av_freep(&s->exptab); + av_freep(&s->exptab1); +} + diff -r a48bb8bc63dd -r 6f5e87957bcb i386/fft_sse.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/i386/fft_sse.c Mon Oct 28 00:34:08 2002 +0000 @@ -0,0 +1,128 @@ +/* + * FFT/MDCT transform with SSE optimizations + * Copyright (c) 2002 Fabrice Bellard. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "../dsputil.h" +#include + +#include + +static const float p1p1p1m1[4] __attribute__((aligned(16))) = + { 1.0, 1.0, 1.0, -1.0 }; + +static const float p1p1m1m1[4] __attribute__((aligned(16))) = + { 1.0, 1.0, -1.0, -1.0 }; + +#if 0 +static void print_v4sf(const char *str, __m128 a) +{ + float *p = (float *)&a; + printf("%s: %f %f %f %f\n", + str, p[0], p[1], p[2], p[3]); +} +#endif + +/* XXX: handle reverse case */ +void fft_calc_sse(FFTContext *s, FFTComplex *z) +{ + int ln = s->nbits; + int j, np, np2; + int nblocks, nloops; + register FFTComplex *p, *q; + FFTComplex *cptr, *cptr1; + int k; + + np = 1 << ln; + + { + __m128 *r, a, b, a1, c1, c2; + + r = (__m128 *)&z[0]; + c1 = *(__m128 *)p1p1m1m1; + c2 = *(__m128 *)p1p1p1m1; + j = (np >> 2); + do { + a = r[0]; + b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); + a = _mm_mul_ps(a, c1); + /* do the pass 0 butterfly */ + a = _mm_add_ps(a, b); + + a1 = r[1]; + b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); + a1 = _mm_mul_ps(a1, c1); + /* do the pass 0 butterfly */ + b = _mm_add_ps(a1, b); + + /* multiply third by -i */ + b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); + b = _mm_mul_ps(b, c2); + + /* do the pass 1 butterfly */ + r[0] = _mm_add_ps(a, b); + r[1] = _mm_sub_ps(a, b); + r += 2; + } while (--j != 0); + } + /* pass 2 .. ln-1 */ + + nblocks = np >> 3; + nloops = 1 << 2; + np2 = np >> 1; + + cptr1 = s->exptab1; + do { + p = z; + q = z + nloops; + j = nblocks; + do { + cptr = cptr1; + k = nloops >> 1; + do { + __m128 a, b, c, t1, t2; + + a = *(__m128 *)p; + b = *(__m128 *)q; + + /* complex mul */ + c = *(__m128 *)cptr; + /* cre*re cim*re */ + t1 = _mm_mul_ps(c, + _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); + c = *(__m128 *)(cptr + 2); + /* -cim*im cre*im */ + t2 = _mm_mul_ps(c, + _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); + b = _mm_add_ps(t1, t2); + + /* butterfly */ + *(__m128 *)p = _mm_add_ps(a, b); + *(__m128 *)q = _mm_sub_ps(a, b); + + p += 2; + q += 2; + cptr += 4; + } while (--k); + + p += nloops; + q += nloops; + } while (--j); + cptr1 += nloops * 2; + nblocks = nblocks >> 1; + nloops = nloops << 1; + } while (nblocks != 0); +} diff -r a48bb8bc63dd -r 6f5e87957bcb mdct.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mdct.c Mon Oct 28 00:34:08 2002 +0000 @@ -0,0 +1,170 @@ +/* + * MDCT/IMDCT transforms + * Copyright (c) 2002 Fabrice Bellard. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "dsputil.h" + +/* + * init MDCT or IMDCT computation + */ +int mdct_init(MDCTContext *s, int nbits, int inverse) +{ + int n, n4, i; + float alpha; + + memset(s, 0, sizeof(*s)); + n = 1 << nbits; + s->nbits = nbits; + s->n = n; + n4 = n >> 2; + s->tcos = malloc(n4 * sizeof(FFTSample)); + if (!s->tcos) + goto fail; + s->tsin = malloc(n4 * sizeof(FFTSample)); + if (!s->tsin) + goto fail; + + for(i=0;itcos[i] = -cos(alpha); + s->tsin[i] = -sin(alpha); + } + if (fft_init(&s->fft, s->nbits - 2, inverse) < 0) + goto fail; + return 0; + fail: + av_freep(&s->tcos); + av_freep(&s->tsin); + return -1; +} + +/* complex multiplication: p = a * b */ +#define CMUL(pre, pim, are, aim, bre, bim) \ +{\ + float _are = (are);\ + float _aim = (aim);\ + float _bre = (bre);\ + float _bim = (bim);\ + (pre) = _are * _bre - _aim * _bim;\ + (pim) = _are * _bim + _aim * _bre;\ +} + +/** + * Compute inverse MDCT of size N = 2^nbits + * @param output N samples + * @param input N/2 samples + * @param tmp N/2 samples + */ +void imdct_calc(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + int k, n8, n4, n2, n, j; + const uint16_t *revtab = s->fft.revtab; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + const FFTSample *in1, *in2; + FFTComplex *z = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n2 = n >> 1; + n4 = n >> 2; + n8 = n >> 3; + + /* pre rotation */ + in1 = input; + in2 = input + n2 - 1; + for(k = 0; k < n4; k++) { + j=revtab[k]; + CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]); + in1 += 2; + in2 -= 2; + } + fft_calc(&s->fft, z); + + /* post rotation + reordering */ + /* XXX: optimize */ + for(k = 0; k < n4; k++) { + CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]); + } + for(k = 0; k < n8; k++) { + output[2*k] = -z[n8 + k].im; + output[n2-1-2*k] = z[n8 + k].im; + + output[2*k+1] = z[n8-1-k].re; + output[n2-1-2*k-1] = -z[n8-1-k].re; + + output[n2 + 2*k]=-z[k+n8].re; + output[n-1- 2*k]=-z[k+n8].re; + + output[n2 + 2*k+1]=z[n8-k-1].im; + output[n-2 - 2 * k] = z[n8-k-1].im; + } +} + +/** + * Compute MDCT of size N = 2^nbits + * @param input N samples + * @param out N/2 samples + * @param tmp temporary storage of N/2 samples + */ +void mdct_calc(MDCTContext *s, FFTSample *out, + const FFTSample *input, FFTSample *tmp) +{ + int i, j, n, n8, n4, n2, n3; + FFTSample re, im, re1, im1; + const uint16_t *revtab = s->fft.revtab; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + FFTComplex *x = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n2 = n >> 1; + n4 = n >> 2; + n8 = n >> 3; + n3 = 3 * n4; + + /* pre rotation */ + for(i=0;ifft, x); + + /* post rotation */ + for(i=0;itcos); + av_freep(&s->tsin); + fft_end(&s->fft); +}