comparison mdct.c @ 7544:ee1cb5ab9f99 libavcodec

Optimize imdct_half: remove the tmp buffer; skip the FFT reinterleave pass, leaving data in a format more convenient for SIMD; merge post-rotate with post-reorder.
author lorenm
date Tue, 12 Aug 2008 00:33:34 +0000
parents fc843d00867c
children 2dca9201c400
comparison
equal deleted inserted replaced
7543:f04ff5a6fb55 7544:ee1cb5ab9f99
98 double _bim = (bim);\ 98 double _bim = (bim);\
99 (pre) = _are * _bre - _aim * _bim;\ 99 (pre) = _are * _bre - _aim * _bim;\
100 (pim) = _are * _bim + _aim * _bre;\ 100 (pim) = _are * _bim + _aim * _bre;\
101 } 101 }
102 102
103 static void imdct_c(MDCTContext *s, const FFTSample *input, FFTSample *tmp) 103 /**
104 { 104 * Compute the middle half of the inverse MDCT of size N = 2^nbits,
105 int k, n4, n2, n, j; 105 * thus excluding the parts that can be derived by symmetry
106 * @param output N/2 samples
107 * @param input N/2 samples
108 */
109 void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSample *input)
110 {
111 int k, n8, n4, n2, n, j;
106 const uint16_t *revtab = s->fft.revtab; 112 const uint16_t *revtab = s->fft.revtab;
107 const FFTSample *tcos = s->tcos; 113 const FFTSample *tcos = s->tcos;
108 const FFTSample *tsin = s->tsin; 114 const FFTSample *tsin = s->tsin;
109 const FFTSample *in1, *in2; 115 const FFTSample *in1, *in2;
110 FFTComplex *z = (FFTComplex *)tmp; 116 FFTComplex *z = (FFTComplex *)output;
111 117
112 n = 1 << s->nbits; 118 n = 1 << s->nbits;
113 n2 = n >> 1; 119 n2 = n >> 1;
114 n4 = n >> 2; 120 n4 = n >> 2;
121 n8 = n >> 3;
115 122
116 /* pre rotation */ 123 /* pre rotation */
117 in1 = input; 124 in1 = input;
118 in2 = input + n2 - 1; 125 in2 = input + n2 - 1;
119 for(k = 0; k < n4; k++) { 126 for(k = 0; k < n4; k++) {
123 in2 -= 2; 130 in2 -= 2;
124 } 131 }
125 ff_fft_calc(&s->fft, z); 132 ff_fft_calc(&s->fft, z);
126 133
127 /* post rotation + reordering */ 134 /* post rotation + reordering */
128 /* XXX: optimize */ 135 output += n4;
129 for(k = 0; k < n4; k++) { 136 for(k = 0; k < n8; k++) {
130 CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]); 137 FFTSample r0, i0, r1, i1;
138 CMUL(r0, i1, z[n8-k-1].im, z[n8-k-1].re, tsin[n8-k-1], tcos[n8-k-1]);
139 CMUL(r1, i0, z[n8+k ].im, z[n8+k ].re, tsin[n8+k ], tcos[n8+k ]);
140 z[n8-k-1].re = r0;
141 z[n8-k-1].im = i0;
142 z[n8+k ].re = r1;
143 z[n8+k ].im = i1;
131 } 144 }
132 } 145 }
133 146
134 /** 147 /**
135 * Compute inverse MDCT of size N = 2^nbits 148 * Compute inverse MDCT of size N = 2^nbits
138 * @param tmp N/2 samples 151 * @param tmp N/2 samples
139 */ 152 */
140 void ff_imdct_calc(MDCTContext *s, FFTSample *output, 153 void ff_imdct_calc(MDCTContext *s, FFTSample *output,
141 const FFTSample *input, FFTSample *tmp) 154 const FFTSample *input, FFTSample *tmp)
142 { 155 {
143 int k, n8, n2, n; 156 int k;
144 FFTComplex *z = (FFTComplex *)tmp; 157 int n = 1 << s->nbits;
145 n = 1 << s->nbits; 158 int n2 = n >> 1;
146 n2 = n >> 1; 159 int n4 = n >> 2;
147 n8 = n >> 3; 160
148 161 ff_imdct_half(s, output+n4, input);
149 imdct_c(s, input, tmp); 162
150 163 for(k = 0; k < n4; k++) {
151 for(k = 0; k < n8; k++) { 164 output[k] = -output[n2-k-1];
152 output[2*k] = -z[n8 + k].im; 165 output[n-k-1] = output[n2+k];
153 output[n2-1-2*k] = z[n8 + k].im;
154
155 output[2*k+1] = z[n8-1-k].re;
156 output[n2-1-2*k-1] = -z[n8-1-k].re;
157
158 output[n2 + 2*k]=-z[k+n8].re;
159 output[n-1- 2*k]=-z[k+n8].re;
160
161 output[n2 + 2*k+1]=z[n8-k-1].im;
162 output[n-2 - 2 * k] = z[n8-k-1].im;
163 }
164 }
165
166 /**
167 * Compute the middle half of the inverse MDCT of size N = 2^nbits,
168 * thus excluding the parts that can be derived by symmetry
169 * @param output N/2 samples
170 * @param input N/2 samples
171 * @param tmp N/2 samples
172 */
173 void ff_imdct_half(MDCTContext *s, FFTSample *output,
174 const FFTSample *input, FFTSample *tmp)
175 {
176 int k, n8, n4, n;
177 FFTComplex *z = (FFTComplex *)tmp;
178 n = 1 << s->nbits;
179 n4 = n >> 2;
180 n8 = n >> 3;
181
182 imdct_c(s, input, tmp);
183
184 for(k = 0; k < n8; k++) {
185 output[n4-1-2*k] = z[n8+k].im;
186 output[n4-1-2*k-1] = -z[n8-k-1].re;
187 output[n4 + 2*k] = -z[n8+k].re;
188 output[n4 + 2*k+1] = z[n8-k-1].im;
189 } 166 }
190 } 167 }
191 168
192 /** 169 /**
193 * Compute MDCT of size N = 2^nbits 170 * Compute MDCT of size N = 2^nbits
201 int i, j, n, n8, n4, n2, n3; 178 int i, j, n, n8, n4, n2, n3;
202 FFTSample re, im, re1, im1; 179 FFTSample re, im, re1, im1;
203 const uint16_t *revtab = s->fft.revtab; 180 const uint16_t *revtab = s->fft.revtab;
204 const FFTSample *tcos = s->tcos; 181 const FFTSample *tcos = s->tcos;
205 const FFTSample *tsin = s->tsin; 182 const FFTSample *tsin = s->tsin;
206 FFTComplex *x = (FFTComplex *)tmp; 183 FFTComplex *x = (FFTComplex *)out;
207 184
208 n = 1 << s->nbits; 185 n = 1 << s->nbits;
209 n2 = n >> 1; 186 n2 = n >> 1;
210 n4 = n >> 2; 187 n4 = n >> 2;
211 n8 = n >> 3; 188 n8 = n >> 3;
225 } 202 }
226 203
227 ff_fft_calc(&s->fft, x); 204 ff_fft_calc(&s->fft, x);
228 205
229 /* post rotation */ 206 /* post rotation */
230 for(i=0;i<n4;i++) { 207 for(i=0;i<n8;i++) {
231 re = x[i].re; 208 FFTSample r0, i0, r1, i1;
232 im = x[i].im; 209 CMUL(i1, r0, x[n8-i-1].re, x[n8-i-1].im, -tsin[n8-i-1], -tcos[n8-i-1]);
233 CMUL(re1, im1, re, im, -tsin[i], -tcos[i]); 210 CMUL(i0, r1, x[n8+i ].re, x[n8+i ].im, -tsin[n8+i ], -tcos[n8+i ]);
234 out[2*i] = im1; 211 x[n8-i-1].re = r0;
235 out[n2-1-2*i] = re1; 212 x[n8-i-1].im = i0;
213 x[n8+i ].re = r1;
214 x[n8+i ].im = i1;
236 } 215 }
237 } 216 }
238 217
239 void ff_mdct_end(MDCTContext *s) 218 void ff_mdct_end(MDCTContext *s)
240 { 219 {