Mercurial > libavcodec.hg
annotate fft.c @ 9830:bd0879f752e6 libavcodec
Express the H.264 parser dependency on the golomb code in configure instead of
in the Makefile as it is done for all other parts that depend on golomb.
author | diego |
---|---|
date | Tue, 09 Jun 2009 20:29:52 +0000 |
parents | 4b1736ba9f2f |
children | c5e8a5a044c3 |
rev | line source |
---|---|
781 | 1 /* |
2 * FFT/IFFT transforms | |
7542 | 3 * Copyright (c) 2008 Loren Merritt |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8590
diff
changeset
|
4 * Copyright (c) 2002 Fabrice Bellard |
7542 | 5 * Partly based on libdjbfft by D. J. Bernstein |
781 | 6 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
7 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
8 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
9 * FFmpeg is free software; you can redistribute it and/or |
781 | 10 * modify it under the terms of the GNU Lesser General Public |
11 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
12 * version 2.1 of the License, or (at your option) any later version. |
781 | 13 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
14 * FFmpeg is distributed in the hope that it will be useful, |
781 | 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 * Lesser General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
20 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
781 | 22 */ |
1106 | 23 |
24 /** | |
8718
e9d9d946f213
Use full internal pathname in doxygen @file directives.
diego
parents:
8694
diff
changeset
|
25 * @file libavcodec/fft.c |
1106 | 26 * FFT/IFFT transforms. |
27 */ | |
28 | |
781 | 29 #include "dsputil.h" |
30 | |
7542 | 31 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ |
32 DECLARE_ALIGNED_16(FFTSample, ff_cos_16[8]); | |
33 DECLARE_ALIGNED_16(FFTSample, ff_cos_32[16]); | |
34 DECLARE_ALIGNED_16(FFTSample, ff_cos_64[32]); | |
35 DECLARE_ALIGNED_16(FFTSample, ff_cos_128[64]); | |
36 DECLARE_ALIGNED_16(FFTSample, ff_cos_256[128]); | |
37 DECLARE_ALIGNED_16(FFTSample, ff_cos_512[256]); | |
38 DECLARE_ALIGNED_16(FFTSample, ff_cos_1024[512]); | |
39 DECLARE_ALIGNED_16(FFTSample, ff_cos_2048[1024]); | |
40 DECLARE_ALIGNED_16(FFTSample, ff_cos_4096[2048]); | |
41 DECLARE_ALIGNED_16(FFTSample, ff_cos_8192[4096]); | |
42 DECLARE_ALIGNED_16(FFTSample, ff_cos_16384[8192]); | |
43 DECLARE_ALIGNED_16(FFTSample, ff_cos_32768[16384]); | |
44 DECLARE_ALIGNED_16(FFTSample, ff_cos_65536[32768]); | |
8694
68fd157bab48
Add the rdft family of transforms (fft/ifft of an all real sequence) to dsputil.
alexc
parents:
8687
diff
changeset
|
45 FFTSample *ff_cos_tabs[] = { |
7542 | 46 ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024, |
47 ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536, | |
48 }; | |
49 | |
50 static int split_radix_permutation(int i, int n, int inverse) | |
51 { | |
52 int m; | |
53 if(n <= 2) return i&1; | |
54 m = n >> 1; | |
55 if(!(i&m)) return split_radix_permutation(i, m, inverse)*2; | |
56 m >>= 1; | |
57 if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1; | |
58 else return split_radix_permutation(i, m, inverse)*4 - 1; | |
59 } | |
60 | |
8637 | 61 av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) |
781 | 62 { |
63 int i, j, m, n; | |
64 float alpha, c1, s1, s2; | |
7542 | 65 int split_radix = 1; |
6503 | 66 int av_unused has_vectors; |
2967 | 67 |
7542 | 68 if (nbits < 2 || nbits > 16) |
69 goto fail; | |
781 | 70 s->nbits = nbits; |
71 n = 1 << nbits; | |
72 | |
7542 | 73 s->tmp_buf = NULL; |
8974 | 74 s->exptab = av_malloc((n / 2) * sizeof(FFTComplex)); |
781 | 75 if (!s->exptab) |
76 goto fail; | |
77 s->revtab = av_malloc(n * sizeof(uint16_t)); | |
78 if (!s->revtab) | |
79 goto fail; | |
80 s->inverse = inverse; | |
81 | |
82 s2 = inverse ? 1.0 : -1.0; | |
2967 | 83 |
7542 | 84 s->fft_permute = ff_fft_permute_c; |
8974 | 85 s->fft_calc = ff_fft_calc_c; |
86 s->imdct_calc = ff_imdct_calc_c; | |
87 s->imdct_half = ff_imdct_half_c; | |
88 s->exptab1 = NULL; | |
781 | 89 |
8590 | 90 #if HAVE_MMX && HAVE_YASM |
6503 | 91 has_vectors = mm_support(); |
8981
dc19e4d7d0eb
Only enable SSE/3DNOW optimizations when they have been enabled at compilation.
diego
parents:
8974
diff
changeset
|
92 if (has_vectors & FF_MM_SSE && HAVE_SSE) { |
7542 | 93 /* SSE for P3/P4/K8 */ |
8974 | 94 s->imdct_calc = ff_imdct_calc_sse; |
95 s->imdct_half = ff_imdct_half_sse; | |
7542 | 96 s->fft_permute = ff_fft_permute_sse; |
8974 | 97 s->fft_calc = ff_fft_calc_sse; |
8981
dc19e4d7d0eb
Only enable SSE/3DNOW optimizations when they have been enabled at compilation.
diego
parents:
8974
diff
changeset
|
98 } else if (has_vectors & FF_MM_3DNOWEXT && HAVE_AMD3DNOWEXT) { |
7542 | 99 /* 3DNowEx for K7 */ |
6503 | 100 s->imdct_calc = ff_imdct_calc_3dn2; |
7263 | 101 s->imdct_half = ff_imdct_half_3dn2; |
8974 | 102 s->fft_calc = ff_fft_calc_3dn2; |
8981
dc19e4d7d0eb
Only enable SSE/3DNOW optimizations when they have been enabled at compilation.
diego
parents:
8974
diff
changeset
|
103 } else if (has_vectors & FF_MM_3DNOW && HAVE_AMD3DNOW) { |
6503 | 104 /* 3DNow! for K6-2/3 */ |
7544 | 105 s->imdct_calc = ff_imdct_calc_3dn; |
106 s->imdct_half = ff_imdct_half_3dn; | |
8974 | 107 s->fft_calc = ff_fft_calc_3dn; |
6503 | 108 } |
9177
4b1736ba9f2f
Remove long unused ALTIVEC_USE_REFERENCE_C_CODE ifdef; all other references
conrad
parents:
8981
diff
changeset
|
109 #elif HAVE_ALTIVEC |
6503 | 110 has_vectors = mm_support(); |
8104
0d108ec85620
Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents:
7547
diff
changeset
|
111 if (has_vectors & FF_MM_ALTIVEC) { |
6503 | 112 s->fft_calc = ff_fft_calc_altivec; |
7542 | 113 split_radix = 0; |
6503 | 114 } |
115 #endif | |
781 | 116 |
7542 | 117 if (split_radix) { |
118 for(j=4; j<=nbits; j++) { | |
119 int m = 1<<j; | |
120 double freq = 2*M_PI/m; | |
121 FFTSample *tab = ff_cos_tabs[j-4]; | |
122 for(i=0; i<=m/4; i++) | |
123 tab[i] = cos(i*freq); | |
124 for(i=1; i<m/4; i++) | |
125 tab[m/2-i] = tab[i]; | |
126 } | |
127 for(i=0; i<n; i++) | |
128 s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i; | |
129 s->tmp_buf = av_malloc(n * sizeof(FFTComplex)); | |
130 } else { | |
6504 | 131 int np, nblocks, np2, l; |
132 FFTComplex *q; | |
2967 | 133 |
7542 | 134 for(i=0; i<(n/2); i++) { |
135 alpha = 2 * M_PI * (float)i / (float)n; | |
136 c1 = cos(alpha); | |
137 s1 = sin(alpha) * s2; | |
138 s->exptab[i].re = c1; | |
139 s->exptab[i].im = s1; | |
140 } | |
141 | |
6504 | 142 np = 1 << nbits; |
143 nblocks = np >> 3; | |
144 np2 = np >> 1; | |
145 s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex)); | |
146 if (!s->exptab1) | |
147 goto fail; | |
148 q = s->exptab1; | |
149 do { | |
150 for(l = 0; l < np2; l += 2 * nblocks) { | |
151 *q++ = s->exptab[l]; | |
152 *q++ = s->exptab[l + nblocks]; | |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
971
diff
changeset
|
153 |
6504 | 154 q->re = -s->exptab[l].im; |
155 q->im = s->exptab[l].re; | |
156 q++; | |
157 q->re = -s->exptab[l + nblocks].im; | |
158 q->im = s->exptab[l + nblocks].re; | |
159 q++; | |
160 } | |
161 nblocks = nblocks >> 1; | |
162 } while (nblocks != 0); | |
163 av_freep(&s->exptab); | |
781 | 164 |
7543 | 165 /* compute bit reverse table */ |
166 for(i=0;i<n;i++) { | |
167 m=0; | |
168 for(j=0;j<nbits;j++) { | |
169 m |= ((i >> j) & 1) << (nbits-j-1); | |
170 } | |
171 s->revtab[i]=m; | |
781 | 172 } |
7542 | 173 } |
174 | |
781 | 175 return 0; |
176 fail: | |
177 av_freep(&s->revtab); | |
178 av_freep(&s->exptab); | |
179 av_freep(&s->exptab1); | |
7542 | 180 av_freep(&s->tmp_buf); |
781 | 181 return -1; |
182 } | |
183 | |
7542 | 184 void ff_fft_permute_c(FFTContext *s, FFTComplex *z) |
781 | 185 { |
186 int j, k, np; | |
187 FFTComplex tmp; | |
188 const uint16_t *revtab = s->revtab; | |
7542 | 189 np = 1 << s->nbits; |
190 | |
191 if (s->tmp_buf) { | |
192 /* TODO: handle split-radix permute in a more optimal way, probably in-place */ | |
193 for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j]; | |
194 memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); | |
195 return; | |
196 } | |
2967 | 197 |
781 | 198 /* reverse */ |
199 for(j=0;j<np;j++) { | |
200 k = revtab[j]; | |
201 if (k < j) { | |
202 tmp = z[k]; | |
203 z[k] = z[j]; | |
204 z[j] = tmp; | |
205 } | |
206 } | |
207 } | |
208 | |
8687 | 209 av_cold void ff_fft_end(FFTContext *s) |
781 | 210 { |
211 av_freep(&s->revtab); | |
212 av_freep(&s->exptab); | |
213 av_freep(&s->exptab1); | |
7542 | 214 av_freep(&s->tmp_buf); |
215 } | |
216 | |
217 #define sqrthalf (float)M_SQRT1_2 | |
218 | |
219 #define BF(x,y,a,b) {\ | |
220 x = a - b;\ | |
221 y = a + b;\ | |
222 } | |
223 | |
224 #define BUTTERFLIES(a0,a1,a2,a3) {\ | |
225 BF(t3, t5, t5, t1);\ | |
226 BF(a2.re, a0.re, a0.re, t5);\ | |
227 BF(a3.im, a1.im, a1.im, t3);\ | |
228 BF(t4, t6, t2, t6);\ | |
229 BF(a3.re, a1.re, a1.re, t4);\ | |
230 BF(a2.im, a0.im, a0.im, t6);\ | |
231 } | |
232 | |
233 // force loading all the inputs before storing any. | |
234 // this is slightly slower for small data, but avoids store->load aliasing | |
235 // for addresses separated by large powers of 2. | |
236 #define BUTTERFLIES_BIG(a0,a1,a2,a3) {\ | |
237 FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\ | |
238 BF(t3, t5, t5, t1);\ | |
239 BF(a2.re, a0.re, r0, t5);\ | |
240 BF(a3.im, a1.im, i1, t3);\ | |
241 BF(t4, t6, t2, t6);\ | |
242 BF(a3.re, a1.re, r1, t4);\ | |
243 BF(a2.im, a0.im, i0, t6);\ | |
244 } | |
245 | |
246 #define TRANSFORM(a0,a1,a2,a3,wre,wim) {\ | |
247 t1 = a2.re * wre + a2.im * wim;\ | |
248 t2 = a2.im * wre - a2.re * wim;\ | |
249 t5 = a3.re * wre - a3.im * wim;\ | |
250 t6 = a3.im * wre + a3.re * wim;\ | |
251 BUTTERFLIES(a0,a1,a2,a3)\ | |
252 } | |
253 | |
254 #define TRANSFORM_ZERO(a0,a1,a2,a3) {\ | |
255 t1 = a2.re;\ | |
256 t2 = a2.im;\ | |
257 t5 = a3.re;\ | |
258 t6 = a3.im;\ | |
259 BUTTERFLIES(a0,a1,a2,a3)\ | |
260 } | |
261 | |
262 /* z[0...8n-1], w[1...2n-1] */ | |
263 #define PASS(name)\ | |
264 static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\ | |
265 {\ | |
266 FFTSample t1, t2, t3, t4, t5, t6;\ | |
267 int o1 = 2*n;\ | |
268 int o2 = 4*n;\ | |
269 int o3 = 6*n;\ | |
270 const FFTSample *wim = wre+o1;\ | |
271 n--;\ | |
272 \ | |
273 TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\ | |
274 TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ | |
275 do {\ | |
276 z += 2;\ | |
277 wre += 2;\ | |
278 wim -= 2;\ | |
279 TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\ | |
280 TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ | |
281 } while(--n);\ | |
781 | 282 } |
283 | |
7542 | 284 PASS(pass) |
285 #undef BUTTERFLIES | |
286 #define BUTTERFLIES BUTTERFLIES_BIG | |
287 PASS(pass_big) | |
288 | |
289 #define DECL_FFT(n,n2,n4)\ | |
290 static void fft##n(FFTComplex *z)\ | |
291 {\ | |
292 fft##n2(z);\ | |
293 fft##n4(z+n4*2);\ | |
294 fft##n4(z+n4*3);\ | |
295 pass(z,ff_cos_##n,n4/2);\ | |
296 } | |
297 | |
298 static void fft4(FFTComplex *z) | |
299 { | |
300 FFTSample t1, t2, t3, t4, t5, t6, t7, t8; | |
301 | |
302 BF(t3, t1, z[0].re, z[1].re); | |
303 BF(t8, t6, z[3].re, z[2].re); | |
304 BF(z[2].re, z[0].re, t1, t6); | |
305 BF(t4, t2, z[0].im, z[1].im); | |
306 BF(t7, t5, z[2].im, z[3].im); | |
307 BF(z[3].im, z[1].im, t4, t8); | |
308 BF(z[3].re, z[1].re, t3, t7); | |
309 BF(z[2].im, z[0].im, t2, t5); | |
310 } | |
311 | |
312 static void fft8(FFTComplex *z) | |
313 { | |
314 FFTSample t1, t2, t3, t4, t5, t6, t7, t8; | |
315 | |
316 fft4(z); | |
317 | |
318 BF(t1, z[5].re, z[4].re, -z[5].re); | |
319 BF(t2, z[5].im, z[4].im, -z[5].im); | |
320 BF(t3, z[7].re, z[6].re, -z[7].re); | |
321 BF(t4, z[7].im, z[6].im, -z[7].im); | |
322 BF(t8, t1, t3, t1); | |
323 BF(t7, t2, t2, t4); | |
324 BF(z[4].re, z[0].re, z[0].re, t1); | |
325 BF(z[4].im, z[0].im, z[0].im, t2); | |
326 BF(z[6].re, z[2].re, z[2].re, t7); | |
327 BF(z[6].im, z[2].im, z[2].im, t8); | |
328 | |
329 TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf); | |
330 } | |
331 | |
8590 | 332 #if !CONFIG_SMALL |
7542 | 333 static void fft16(FFTComplex *z) |
334 { | |
335 FFTSample t1, t2, t3, t4, t5, t6; | |
336 | |
337 fft8(z); | |
338 fft4(z+8); | |
339 fft4(z+12); | |
340 | |
341 TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); | |
342 TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf); | |
343 TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]); | |
344 TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]); | |
345 } | |
346 #else | |
347 DECL_FFT(16,8,4) | |
348 #endif | |
349 DECL_FFT(32,16,8) | |
350 DECL_FFT(64,32,16) | |
351 DECL_FFT(128,64,32) | |
352 DECL_FFT(256,128,64) | |
353 DECL_FFT(512,256,128) | |
8590 | 354 #if !CONFIG_SMALL |
7542 | 355 #define pass pass_big |
356 #endif | |
357 DECL_FFT(1024,512,256) | |
358 DECL_FFT(2048,1024,512) | |
359 DECL_FFT(4096,2048,1024) | |
360 DECL_FFT(8192,4096,2048) | |
361 DECL_FFT(16384,8192,4096) | |
362 DECL_FFT(32768,16384,8192) | |
363 DECL_FFT(65536,32768,16384) | |
364 | |
365 static void (*fft_dispatch[])(FFTComplex*) = { | |
366 fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, | |
367 fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, | |
368 }; | |
369 | |
370 void ff_fft_calc_c(FFTContext *s, FFTComplex *z) | |
371 { | |
372 fft_dispatch[s->nbits-2](z); | |
373 } | |
374 |