Mercurial > libavcodec.hg
annotate fft.c @ 8130:c45366b01126 libavcodec
ARM: fix j_rev_dct_ARM
This is a bugfix for ARMv4 assembly implementation of 'j_rev_dct'
function.
The problem was in the incorrect partially empty row detection. Even
if the first two coefficients in the row were nonzero, it handled this
just like the case with only the first nonzero coefficient.
Now this function produces exactly the same output as the stripped
down reference C version of 'j_rev_dct' (with the nested checks like
'if (d6) { if (d2) { ...' always evaluated as true, avoiding shortcut
branches).
author | mru |
---|---|
date | Wed, 12 Nov 2008 20:23:36 +0000 |
parents | 0d108ec85620 |
children | 7a463923ecd1 |
rev | line source |
---|---|
781 | 1 /* |
2 * FFT/IFFT transforms | |
7542 | 3 * Copyright (c) 2008 Loren Merritt |
781 | 4 * Copyright (c) 2002 Fabrice Bellard. |
7542 | 5 * Partly based on libdjbfft by D. J. Bernstein |
781 | 6 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
7 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
8 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
9 * FFmpeg is free software; you can redistribute it and/or |
781 | 10 * modify it under the terms of the GNU Lesser General Public |
11 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
12 * version 2.1 of the License, or (at your option) any later version. |
781 | 13 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
14 * FFmpeg is distributed in the hope that it will be useful, |
781 | 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 * Lesser General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
20 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
781 | 22 */ |
1106 | 23 |
24 /** | |
25 * @file fft.c | |
26 * FFT/IFFT transforms. | |
27 */ | |
28 | |
781 | 29 #include "dsputil.h" |
30 | |
7542 | 31 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ |
32 DECLARE_ALIGNED_16(FFTSample, ff_cos_16[8]); | |
33 DECLARE_ALIGNED_16(FFTSample, ff_cos_32[16]); | |
34 DECLARE_ALIGNED_16(FFTSample, ff_cos_64[32]); | |
35 DECLARE_ALIGNED_16(FFTSample, ff_cos_128[64]); | |
36 DECLARE_ALIGNED_16(FFTSample, ff_cos_256[128]); | |
37 DECLARE_ALIGNED_16(FFTSample, ff_cos_512[256]); | |
38 DECLARE_ALIGNED_16(FFTSample, ff_cos_1024[512]); | |
39 DECLARE_ALIGNED_16(FFTSample, ff_cos_2048[1024]); | |
40 DECLARE_ALIGNED_16(FFTSample, ff_cos_4096[2048]); | |
41 DECLARE_ALIGNED_16(FFTSample, ff_cos_8192[4096]); | |
42 DECLARE_ALIGNED_16(FFTSample, ff_cos_16384[8192]); | |
43 DECLARE_ALIGNED_16(FFTSample, ff_cos_32768[16384]); | |
44 DECLARE_ALIGNED_16(FFTSample, ff_cos_65536[32768]); | |
45 static FFTSample *ff_cos_tabs[] = { | |
46 ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024, | |
47 ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536, | |
48 }; | |
49 | |
50 static int split_radix_permutation(int i, int n, int inverse) | |
51 { | |
52 int m; | |
53 if(n <= 2) return i&1; | |
54 m = n >> 1; | |
55 if(!(i&m)) return split_radix_permutation(i, m, inverse)*2; | |
56 m >>= 1; | |
57 if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1; | |
58 else return split_radix_permutation(i, m, inverse)*4 - 1; | |
59 } | |
60 | |
781 | 61 /** |
62 * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is | |
2967 | 63 * done |
781 | 64 */ |
1879
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1106
diff
changeset
|
65 int ff_fft_init(FFTContext *s, int nbits, int inverse) |
781 | 66 { |
67 int i, j, m, n; | |
68 float alpha, c1, s1, s2; | |
7542 | 69 int split_radix = 1; |
6503 | 70 int av_unused has_vectors; |
2967 | 71 |
7542 | 72 if (nbits < 2 || nbits > 16) |
73 goto fail; | |
781 | 74 s->nbits = nbits; |
75 n = 1 << nbits; | |
76 | |
7542 | 77 s->tmp_buf = NULL; |
781 | 78 s->exptab = av_malloc((n / 2) * sizeof(FFTComplex)); |
79 if (!s->exptab) | |
80 goto fail; | |
81 s->revtab = av_malloc(n * sizeof(uint16_t)); | |
82 if (!s->revtab) | |
83 goto fail; | |
84 s->inverse = inverse; | |
85 | |
86 s2 = inverse ? 1.0 : -1.0; | |
2967 | 87 |
7542 | 88 s->fft_permute = ff_fft_permute_c; |
1879
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1106
diff
changeset
|
89 s->fft_calc = ff_fft_calc_c; |
7547 | 90 s->imdct_calc = ff_imdct_calc_c; |
91 s->imdct_half = ff_imdct_half_c; | |
781 | 92 s->exptab1 = NULL; |
93 | |
7542 | 94 #if defined HAVE_MMX && defined HAVE_YASM |
6503 | 95 has_vectors = mm_support(); |
8104
0d108ec85620
Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents:
7547
diff
changeset
|
96 if (has_vectors & FF_MM_SSE) { |
7542 | 97 /* SSE for P3/P4/K8 */ |
98 s->imdct_calc = ff_imdct_calc_sse; | |
99 s->imdct_half = ff_imdct_half_sse; | |
100 s->fft_permute = ff_fft_permute_sse; | |
101 s->fft_calc = ff_fft_calc_sse; | |
8104
0d108ec85620
Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents:
7547
diff
changeset
|
102 } else if (has_vectors & FF_MM_3DNOWEXT) { |
7542 | 103 /* 3DNowEx for K7 */ |
6503 | 104 s->imdct_calc = ff_imdct_calc_3dn2; |
7263 | 105 s->imdct_half = ff_imdct_half_3dn2; |
6503 | 106 s->fft_calc = ff_fft_calc_3dn2; |
8104
0d108ec85620
Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents:
7547
diff
changeset
|
107 } else if (has_vectors & FF_MM_3DNOW) { |
6503 | 108 /* 3DNow! for K6-2/3 */ |
7544 | 109 s->imdct_calc = ff_imdct_calc_3dn; |
110 s->imdct_half = ff_imdct_half_3dn; | |
6503 | 111 s->fft_calc = ff_fft_calc_3dn; |
112 } | |
113 #elif defined HAVE_ALTIVEC && !defined ALTIVEC_USE_REFERENCE_C_CODE | |
114 has_vectors = mm_support(); | |
8104
0d108ec85620
Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents:
7547
diff
changeset
|
115 if (has_vectors & FF_MM_ALTIVEC) { |
6503 | 116 s->fft_calc = ff_fft_calc_altivec; |
7542 | 117 split_radix = 0; |
6503 | 118 } |
119 #endif | |
781 | 120 |
7542 | 121 if (split_radix) { |
122 for(j=4; j<=nbits; j++) { | |
123 int m = 1<<j; | |
124 double freq = 2*M_PI/m; | |
125 FFTSample *tab = ff_cos_tabs[j-4]; | |
126 for(i=0; i<=m/4; i++) | |
127 tab[i] = cos(i*freq); | |
128 for(i=1; i<m/4; i++) | |
129 tab[m/2-i] = tab[i]; | |
130 } | |
131 for(i=0; i<n; i++) | |
132 s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i; | |
133 s->tmp_buf = av_malloc(n * sizeof(FFTComplex)); | |
134 } else { | |
6504 | 135 int np, nblocks, np2, l; |
136 FFTComplex *q; | |
2967 | 137 |
7542 | 138 for(i=0; i<(n/2); i++) { |
139 alpha = 2 * M_PI * (float)i / (float)n; | |
140 c1 = cos(alpha); | |
141 s1 = sin(alpha) * s2; | |
142 s->exptab[i].re = c1; | |
143 s->exptab[i].im = s1; | |
144 } | |
145 | |
6504 | 146 np = 1 << nbits; |
147 nblocks = np >> 3; | |
148 np2 = np >> 1; | |
149 s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex)); | |
150 if (!s->exptab1) | |
151 goto fail; | |
152 q = s->exptab1; | |
153 do { | |
154 for(l = 0; l < np2; l += 2 * nblocks) { | |
155 *q++ = s->exptab[l]; | |
156 *q++ = s->exptab[l + nblocks]; | |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
971
diff
changeset
|
157 |
6504 | 158 q->re = -s->exptab[l].im; |
159 q->im = s->exptab[l].re; | |
160 q++; | |
161 q->re = -s->exptab[l + nblocks].im; | |
162 q->im = s->exptab[l + nblocks].re; | |
163 q++; | |
164 } | |
165 nblocks = nblocks >> 1; | |
166 } while (nblocks != 0); | |
167 av_freep(&s->exptab); | |
781 | 168 |
7543 | 169 /* compute bit reverse table */ |
170 for(i=0;i<n;i++) { | |
171 m=0; | |
172 for(j=0;j<nbits;j++) { | |
173 m |= ((i >> j) & 1) << (nbits-j-1); | |
174 } | |
175 s->revtab[i]=m; | |
781 | 176 } |
7542 | 177 } |
178 | |
781 | 179 return 0; |
180 fail: | |
181 av_freep(&s->revtab); | |
182 av_freep(&s->exptab); | |
183 av_freep(&s->exptab1); | |
7542 | 184 av_freep(&s->tmp_buf); |
781 | 185 return -1; |
186 } | |
187 | |
188 /** | |
1879
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1106
diff
changeset
|
189 * Do the permutation needed BEFORE calling ff_fft_calc() |
781 | 190 */ |
7542 | 191 void ff_fft_permute_c(FFTContext *s, FFTComplex *z) |
781 | 192 { |
193 int j, k, np; | |
194 FFTComplex tmp; | |
195 const uint16_t *revtab = s->revtab; | |
7542 | 196 np = 1 << s->nbits; |
197 | |
198 if (s->tmp_buf) { | |
199 /* TODO: handle split-radix permute in a more optimal way, probably in-place */ | |
200 for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j]; | |
201 memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); | |
202 return; | |
203 } | |
2967 | 204 |
781 | 205 /* reverse */ |
206 for(j=0;j<np;j++) { | |
207 k = revtab[j]; | |
208 if (k < j) { | |
209 tmp = z[k]; | |
210 z[k] = z[j]; | |
211 z[j] = tmp; | |
212 } | |
213 } | |
214 } | |
215 | |
1879
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1106
diff
changeset
|
216 void ff_fft_end(FFTContext *s) |
781 | 217 { |
218 av_freep(&s->revtab); | |
219 av_freep(&s->exptab); | |
220 av_freep(&s->exptab1); | |
7542 | 221 av_freep(&s->tmp_buf); |
222 } | |
223 | |
224 #define sqrthalf (float)M_SQRT1_2 | |
225 | |
226 #define BF(x,y,a,b) {\ | |
227 x = a - b;\ | |
228 y = a + b;\ | |
229 } | |
230 | |
231 #define BUTTERFLIES(a0,a1,a2,a3) {\ | |
232 BF(t3, t5, t5, t1);\ | |
233 BF(a2.re, a0.re, a0.re, t5);\ | |
234 BF(a3.im, a1.im, a1.im, t3);\ | |
235 BF(t4, t6, t2, t6);\ | |
236 BF(a3.re, a1.re, a1.re, t4);\ | |
237 BF(a2.im, a0.im, a0.im, t6);\ | |
238 } | |
239 | |
240 // force loading all the inputs before storing any. | |
241 // this is slightly slower for small data, but avoids store->load aliasing | |
242 // for addresses separated by large powers of 2. | |
243 #define BUTTERFLIES_BIG(a0,a1,a2,a3) {\ | |
244 FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\ | |
245 BF(t3, t5, t5, t1);\ | |
246 BF(a2.re, a0.re, r0, t5);\ | |
247 BF(a3.im, a1.im, i1, t3);\ | |
248 BF(t4, t6, t2, t6);\ | |
249 BF(a3.re, a1.re, r1, t4);\ | |
250 BF(a2.im, a0.im, i0, t6);\ | |
251 } | |
252 | |
253 #define TRANSFORM(a0,a1,a2,a3,wre,wim) {\ | |
254 t1 = a2.re * wre + a2.im * wim;\ | |
255 t2 = a2.im * wre - a2.re * wim;\ | |
256 t5 = a3.re * wre - a3.im * wim;\ | |
257 t6 = a3.im * wre + a3.re * wim;\ | |
258 BUTTERFLIES(a0,a1,a2,a3)\ | |
259 } | |
260 | |
261 #define TRANSFORM_ZERO(a0,a1,a2,a3) {\ | |
262 t1 = a2.re;\ | |
263 t2 = a2.im;\ | |
264 t5 = a3.re;\ | |
265 t6 = a3.im;\ | |
266 BUTTERFLIES(a0,a1,a2,a3)\ | |
267 } | |
268 | |
269 /* z[0...8n-1], w[1...2n-1] */ | |
270 #define PASS(name)\ | |
271 static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\ | |
272 {\ | |
273 FFTSample t1, t2, t3, t4, t5, t6;\ | |
274 int o1 = 2*n;\ | |
275 int o2 = 4*n;\ | |
276 int o3 = 6*n;\ | |
277 const FFTSample *wim = wre+o1;\ | |
278 n--;\ | |
279 \ | |
280 TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\ | |
281 TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ | |
282 do {\ | |
283 z += 2;\ | |
284 wre += 2;\ | |
285 wim -= 2;\ | |
286 TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\ | |
287 TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ | |
288 } while(--n);\ | |
781 | 289 } |
290 | |
7542 | 291 PASS(pass) |
292 #undef BUTTERFLIES | |
293 #define BUTTERFLIES BUTTERFLIES_BIG | |
294 PASS(pass_big) | |
295 | |
296 #define DECL_FFT(n,n2,n4)\ | |
297 static void fft##n(FFTComplex *z)\ | |
298 {\ | |
299 fft##n2(z);\ | |
300 fft##n4(z+n4*2);\ | |
301 fft##n4(z+n4*3);\ | |
302 pass(z,ff_cos_##n,n4/2);\ | |
303 } | |
304 | |
305 static void fft4(FFTComplex *z) | |
306 { | |
307 FFTSample t1, t2, t3, t4, t5, t6, t7, t8; | |
308 | |
309 BF(t3, t1, z[0].re, z[1].re); | |
310 BF(t8, t6, z[3].re, z[2].re); | |
311 BF(z[2].re, z[0].re, t1, t6); | |
312 BF(t4, t2, z[0].im, z[1].im); | |
313 BF(t7, t5, z[2].im, z[3].im); | |
314 BF(z[3].im, z[1].im, t4, t8); | |
315 BF(z[3].re, z[1].re, t3, t7); | |
316 BF(z[2].im, z[0].im, t2, t5); | |
317 } | |
318 | |
319 static void fft8(FFTComplex *z) | |
320 { | |
321 FFTSample t1, t2, t3, t4, t5, t6, t7, t8; | |
322 | |
323 fft4(z); | |
324 | |
325 BF(t1, z[5].re, z[4].re, -z[5].re); | |
326 BF(t2, z[5].im, z[4].im, -z[5].im); | |
327 BF(t3, z[7].re, z[6].re, -z[7].re); | |
328 BF(t4, z[7].im, z[6].im, -z[7].im); | |
329 BF(t8, t1, t3, t1); | |
330 BF(t7, t2, t2, t4); | |
331 BF(z[4].re, z[0].re, z[0].re, t1); | |
332 BF(z[4].im, z[0].im, z[0].im, t2); | |
333 BF(z[6].re, z[2].re, z[2].re, t7); | |
334 BF(z[6].im, z[2].im, z[2].im, t8); | |
335 | |
336 TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf); | |
337 } | |
338 | |
339 #ifndef CONFIG_SMALL | |
340 static void fft16(FFTComplex *z) | |
341 { | |
342 FFTSample t1, t2, t3, t4, t5, t6; | |
343 | |
344 fft8(z); | |
345 fft4(z+8); | |
346 fft4(z+12); | |
347 | |
348 TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); | |
349 TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf); | |
350 TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]); | |
351 TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]); | |
352 } | |
353 #else | |
354 DECL_FFT(16,8,4) | |
355 #endif | |
356 DECL_FFT(32,16,8) | |
357 DECL_FFT(64,32,16) | |
358 DECL_FFT(128,64,32) | |
359 DECL_FFT(256,128,64) | |
360 DECL_FFT(512,256,128) | |
361 #ifndef CONFIG_SMALL | |
362 #define pass pass_big | |
363 #endif | |
364 DECL_FFT(1024,512,256) | |
365 DECL_FFT(2048,1024,512) | |
366 DECL_FFT(4096,2048,1024) | |
367 DECL_FFT(8192,4096,2048) | |
368 DECL_FFT(16384,8192,4096) | |
369 DECL_FFT(32768,16384,8192) | |
370 DECL_FFT(65536,32768,16384) | |
371 | |
372 static void (*fft_dispatch[])(FFTComplex*) = { | |
373 fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, | |
374 fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, | |
375 }; | |
376 | |
377 /** | |
378 * Do a complex FFT with the parameters defined in ff_fft_init(). The | |
379 * input data must be permuted before with s->revtab table. No | |
380 * 1.0/sqrt(n) normalization is done. | |
381 */ | |
382 void ff_fft_calc_c(FFTContext *s, FFTComplex *z) | |
383 { | |
384 fft_dispatch[s->nbits-2](z); | |
385 } | |
386 |