Mercurial > libavcodec.hg
comparison mdct.c @ 7544:ee1cb5ab9f99 libavcodec
Optimize imdct_half:
- Remove the tmp buffer.
- Skip the FFT reinterleave pass, leaving the data in a format more convenient for SIMD.
- Merge the post-rotation pass with the post-reorder pass.
author | lorenm |
---|---|
date | Tue, 12 Aug 2008 00:33:34 +0000 |
parents | fc843d00867c |
children | 2dca9201c400 |
Comparison legend (row highlighting):
equal
deleted
inserted
replaced
7543:f04ff5a6fb55 | 7544:ee1cb5ab9f99 |
---|---|
98 double _bim = (bim);\ | 98 double _bim = (bim);\ |
99 (pre) = _are * _bre - _aim * _bim;\ | 99 (pre) = _are * _bre - _aim * _bim;\ |
100 (pim) = _are * _bim + _aim * _bre;\ | 100 (pim) = _are * _bim + _aim * _bre;\ |
101 } | 101 } |
102 | 102 |
103 static void imdct_c(MDCTContext *s, const FFTSample *input, FFTSample *tmp) | 103 /** |
104 { | 104 * Compute the middle half of the inverse MDCT of size N = 2^nbits, |
105 int k, n4, n2, n, j; | 105 * thus excluding the parts that can be derived by symmetry |
106 * @param output N/2 samples | |
107 * @param input N/2 samples | |
108 */ | |
109 void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSample *input) | |
110 { | |
111 int k, n8, n4, n2, n, j; | |
106 const uint16_t *revtab = s->fft.revtab; | 112 const uint16_t *revtab = s->fft.revtab; |
107 const FFTSample *tcos = s->tcos; | 113 const FFTSample *tcos = s->tcos; |
108 const FFTSample *tsin = s->tsin; | 114 const FFTSample *tsin = s->tsin; |
109 const FFTSample *in1, *in2; | 115 const FFTSample *in1, *in2; |
110 FFTComplex *z = (FFTComplex *)tmp; | 116 FFTComplex *z = (FFTComplex *)output; |
111 | 117 |
112 n = 1 << s->nbits; | 118 n = 1 << s->nbits; |
113 n2 = n >> 1; | 119 n2 = n >> 1; |
114 n4 = n >> 2; | 120 n4 = n >> 2; |
121 n8 = n >> 3; | |
115 | 122 |
116 /* pre rotation */ | 123 /* pre rotation */ |
117 in1 = input; | 124 in1 = input; |
118 in2 = input + n2 - 1; | 125 in2 = input + n2 - 1; |
119 for(k = 0; k < n4; k++) { | 126 for(k = 0; k < n4; k++) { |
123 in2 -= 2; | 130 in2 -= 2; |
124 } | 131 } |
125 ff_fft_calc(&s->fft, z); | 132 ff_fft_calc(&s->fft, z); |
126 | 133 |
127 /* post rotation + reordering */ | 134 /* post rotation + reordering */ |
128 /* XXX: optimize */ | 135 output += n4; |
129 for(k = 0; k < n4; k++) { | 136 for(k = 0; k < n8; k++) { |
130 CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]); | 137 FFTSample r0, i0, r1, i1; |
138 CMUL(r0, i1, z[n8-k-1].im, z[n8-k-1].re, tsin[n8-k-1], tcos[n8-k-1]); | |
139 CMUL(r1, i0, z[n8+k ].im, z[n8+k ].re, tsin[n8+k ], tcos[n8+k ]); | |
140 z[n8-k-1].re = r0; | |
141 z[n8-k-1].im = i0; | |
142 z[n8+k ].re = r1; | |
143 z[n8+k ].im = i1; | |
131 } | 144 } |
132 } | 145 } |
133 | 146 |
134 /** | 147 /** |
135 * Compute inverse MDCT of size N = 2^nbits | 148 * Compute inverse MDCT of size N = 2^nbits |
138 * @param tmp N/2 samples | 151 * @param tmp N/2 samples |
139 */ | 152 */ |
140 void ff_imdct_calc(MDCTContext *s, FFTSample *output, | 153 void ff_imdct_calc(MDCTContext *s, FFTSample *output, |
141 const FFTSample *input, FFTSample *tmp) | 154 const FFTSample *input, FFTSample *tmp) |
142 { | 155 { |
143 int k, n8, n2, n; | 156 int k; |
144 FFTComplex *z = (FFTComplex *)tmp; | 157 int n = 1 << s->nbits; |
145 n = 1 << s->nbits; | 158 int n2 = n >> 1; |
146 n2 = n >> 1; | 159 int n4 = n >> 2; |
147 n8 = n >> 3; | 160 |
148 | 161 ff_imdct_half(s, output+n4, input); |
149 imdct_c(s, input, tmp); | 162 |
150 | 163 for(k = 0; k < n4; k++) { |
151 for(k = 0; k < n8; k++) { | 164 output[k] = -output[n2-k-1]; |
152 output[2*k] = -z[n8 + k].im; | 165 output[n-k-1] = output[n2+k]; |
153 output[n2-1-2*k] = z[n8 + k].im; | |
154 | |
155 output[2*k+1] = z[n8-1-k].re; | |
156 output[n2-1-2*k-1] = -z[n8-1-k].re; | |
157 | |
158 output[n2 + 2*k]=-z[k+n8].re; | |
159 output[n-1- 2*k]=-z[k+n8].re; | |
160 | |
161 output[n2 + 2*k+1]=z[n8-k-1].im; | |
162 output[n-2 - 2 * k] = z[n8-k-1].im; | |
163 } | |
164 } | |
165 | |
166 /** | |
167 * Compute the middle half of the inverse MDCT of size N = 2^nbits, | |
168 * thus excluding the parts that can be derived by symmetry | |
169 * @param output N/2 samples | |
170 * @param input N/2 samples | |
171 * @param tmp N/2 samples | |
172 */ | |
173 void ff_imdct_half(MDCTContext *s, FFTSample *output, | |
174 const FFTSample *input, FFTSample *tmp) | |
175 { | |
176 int k, n8, n4, n; | |
177 FFTComplex *z = (FFTComplex *)tmp; | |
178 n = 1 << s->nbits; | |
179 n4 = n >> 2; | |
180 n8 = n >> 3; | |
181 | |
182 imdct_c(s, input, tmp); | |
183 | |
184 for(k = 0; k < n8; k++) { | |
185 output[n4-1-2*k] = z[n8+k].im; | |
186 output[n4-1-2*k-1] = -z[n8-k-1].re; | |
187 output[n4 + 2*k] = -z[n8+k].re; | |
188 output[n4 + 2*k+1] = z[n8-k-1].im; | |
189 } | 166 } |
190 } | 167 } |
191 | 168 |
192 /** | 169 /** |
193 * Compute MDCT of size N = 2^nbits | 170 * Compute MDCT of size N = 2^nbits |
201 int i, j, n, n8, n4, n2, n3; | 178 int i, j, n, n8, n4, n2, n3; |
202 FFTSample re, im, re1, im1; | 179 FFTSample re, im, re1, im1; |
203 const uint16_t *revtab = s->fft.revtab; | 180 const uint16_t *revtab = s->fft.revtab; |
204 const FFTSample *tcos = s->tcos; | 181 const FFTSample *tcos = s->tcos; |
205 const FFTSample *tsin = s->tsin; | 182 const FFTSample *tsin = s->tsin; |
206 FFTComplex *x = (FFTComplex *)tmp; | 183 FFTComplex *x = (FFTComplex *)out; |
207 | 184 |
208 n = 1 << s->nbits; | 185 n = 1 << s->nbits; |
209 n2 = n >> 1; | 186 n2 = n >> 1; |
210 n4 = n >> 2; | 187 n4 = n >> 2; |
211 n8 = n >> 3; | 188 n8 = n >> 3; |
225 } | 202 } |
226 | 203 |
227 ff_fft_calc(&s->fft, x); | 204 ff_fft_calc(&s->fft, x); |
228 | 205 |
229 /* post rotation */ | 206 /* post rotation */ |
230 for(i=0;i<n4;i++) { | 207 for(i=0;i<n8;i++) { |
231 re = x[i].re; | 208 FFTSample r0, i0, r1, i1; |
232 im = x[i].im; | 209 CMUL(i1, r0, x[n8-i-1].re, x[n8-i-1].im, -tsin[n8-i-1], -tcos[n8-i-1]); |
233 CMUL(re1, im1, re, im, -tsin[i], -tcos[i]); | 210 CMUL(i0, r1, x[n8+i ].re, x[n8+i ].im, -tsin[n8+i ], -tcos[n8+i ]); |
234 out[2*i] = im1; | 211 x[n8-i-1].re = r0; |
235 out[n2-1-2*i] = re1; | 212 x[n8-i-1].im = i0; |
213 x[n8+i ].re = r1; | |
214 x[n8+i ].im = i1; | |
236 } | 215 } |
237 } | 216 } |
238 | 217 |
239 void ff_mdct_end(MDCTContext *s) | 218 void ff_mdct_end(MDCTContext *s) |
240 { | 219 { |