libavcodec.hg: mdct.c comparison

comparison mdct.c @ 7544:ee1cb5ab9f99 libavcodec

optimize imdct_half: remove the tmp buffer; skip the FFT reinterleave pass, leaving the data in a format more convenient for SIMD; merge the post-rotation with the post-reorder pass.

author	lorenm
date	Tue, 12 Aug 2008 00:33:34 +0000
parents	fc843d00867c
children	2dca9201c400

comparison

equal deleted inserted replaced

-:f04ff5a6fb55
+:ee1cb5ab9f99
 double _bim = (bim);\
 (pre) = _are * _bre - _aim * _bim;\
 (pim) = _are * _bim + _aim * _bre;\
 }
-static void imdct_c(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
+/**
-{
+* Compute the middle half of the inverse MDCT of size N = 2^nbits,
-int k, n4, n2, n, j;
+* thus excluding the parts that can be derived by symmetry
+* @param output N/2 samples
+* @param input N/2 samples
+*/
+void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSample *input)
+{
+int k, n8, n4, n2, n, j;
 const uint16_t *revtab = s->fft.revtab;
 const FFTSample *tcos = s->tcos;
 const FFTSample *tsin = s->tsin;
 const FFTSample *in1, *in2;
-FFTComplex *z = (FFTComplex *)tmp;
+FFTComplex *z = (FFTComplex *)output;
 n = 1 << s->nbits;
 n2 = n >> 1;
 n4 = n >> 2;
+n8 = n >> 3;
 /* pre rotation */
 in1 = input;
 in2 = input + n2 - 1;
 for(k = 0; k < n4; k++) {
 in2 -= 2;
 }
 ff_fft_calc(&s->fft, z);
 /* post rotation + reordering */
-/* XXX: optimize */
+output += n4;
-for(k = 0; k < n4; k++) {
+for(k = 0; k < n8; k++) {
-CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]);
+FFTSample r0, i0, r1, i1;
+CMUL(r0, i1, z[n8-k-1].im, z[n8-k-1].re, tsin[n8-k-1], tcos[n8-k-1]);
+CMUL(r1, i0, z[n8+k  ].im, z[n8+k  ].re, tsin[n8+k  ], tcos[n8+k  ]);
+z[n8-k-1].re = r0;
+z[n8-k-1].im = i0;
+z[n8+k  ].re = r1;
+z[n8+k  ].im = i1;
 }
 }
 /**
 * Compute inverse MDCT of size N = 2^nbits
 * @param tmp N/2 samples
 */
 void ff_imdct_calc(MDCTContext *s, FFTSample *output,
 const FFTSample *input, FFTSample *tmp)
 {
-int k, n8, n2, n;
+int k;
-FFTComplex *z = (FFTComplex *)tmp;
+int n = 1 << s->nbits;
-n = 1 << s->nbits;
+int n2 = n >> 1;
-n2 = n >> 1;
+int n4 = n >> 2;
-n8 = n >> 3;
+ff_imdct_half(s, output+n4, input);
-imdct_c(s, input, tmp);
+for(k = 0; k < n4; k++) {
-for(k = 0; k < n8; k++) {
+output[k] = -output[n2-k-1];
-output[2*k] = -z[n8 + k].im;
+output[n-k-1] = output[n2+k];
-output[n2-1-2*k] = z[n8 + k].im;
-output[2*k+1] = z[n8-1-k].re;
-output[n2-1-2*k-1] = -z[n8-1-k].re;
-output[n2 + 2*k]=-z[k+n8].re;
-output[n-1- 2*k]=-z[k+n8].re;
-output[n2 + 2*k+1]=z[n8-k-1].im;
-output[n-2 - 2 * k] = z[n8-k-1].im;
-}
-}
-/**
-* Compute the middle half of the inverse MDCT of size N = 2^nbits,
-* thus excluding the parts that can be derived by symmetry
-* @param output N/2 samples
-* @param input N/2 samples
-* @param tmp N/2 samples
-*/
-void ff_imdct_half(MDCTContext *s, FFTSample *output,
-const FFTSample *input, FFTSample *tmp)
-{
-int k, n8, n4, n;
-FFTComplex *z = (FFTComplex *)tmp;
-n = 1 << s->nbits;
-n4 = n >> 2;
-n8 = n >> 3;
-imdct_c(s, input, tmp);
-for(k = 0; k < n8; k++) {
-output[n4-1-2*k]   =  z[n8+k].im;
-output[n4-1-2*k-1] = -z[n8-k-1].re;
-output[n4 + 2*k]   = -z[n8+k].re;
-output[n4 + 2*k+1] =  z[n8-k-1].im;
 }
 }
 /**
 * Compute MDCT of size N = 2^nbits
 int i, j, n, n8, n4, n2, n3;
 FFTSample re, im, re1, im1;
 const uint16_t *revtab = s->fft.revtab;
 const FFTSample *tcos = s->tcos;
 const FFTSample *tsin = s->tsin;
-FFTComplex *x = (FFTComplex *)tmp;
+FFTComplex *x = (FFTComplex *)out;
 n = 1 << s->nbits;
 n2 = n >> 1;
 n4 = n >> 2;
 n8 = n >> 3;
 }
 ff_fft_calc(&s->fft, x);
 /* post rotation */
-for(i=0;i<n4;i++) {
+for(i=0;i<n8;i++) {
-re = x[i].re;
+FFTSample r0, i0, r1, i1;
-im = x[i].im;
+CMUL(i1, r0, x[n8-i-1].re, x[n8-i-1].im, -tsin[n8-i-1], -tcos[n8-i-1]);
-CMUL(re1, im1, re, im, -tsin[i], -tcos[i]);
+CMUL(i0, r1, x[n8+i  ].re, x[n8+i  ].im, -tsin[n8+i  ], -tcos[n8+i  ]);
-out[2*i] = im1;
+x[n8-i-1].re = r0;
-out[n2-1-2*i] = re1;
+x[n8-i-1].im = i0;
+x[n8+i  ].re = r1;
+x[n8+i  ].im = i1;
 }
 }
 void ff_mdct_end(MDCTContext *s)
 {

Mercurial > libavcodec.hg

comparison mdct.c @ 7544:ee1cb5ab9f99 libavcodec