Mercurial > libavcodec.hg
annotate fft.c @ 11032:01bd040f8607 libavcodec
Unroll main loop so the edge==0 case is seperate.
This allows many things to be simplified away.
h264 decoder is overall 1% faster with a mbaff sample and
0.1% slower with the cathedral sample, probably because the slow loop
filter code must be loaded into the code cache for each first MB of each
row but isnt used for the following MBs.
author | michael |
---|---|
date | Thu, 28 Jan 2010 01:24:25 +0000 |
parents | 74b0c1a0851e |
children | e45c852b6820 |
rev | line source |
---|---|
781 | 1 /* |
2 * FFT/IFFT transforms | |
7542 | 3 * Copyright (c) 2008 Loren Merritt |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8590
diff
changeset
|
4 * Copyright (c) 2002 Fabrice Bellard |
7542 | 5 * Partly based on libdjbfft by D. J. Bernstein |
781 | 6 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
7 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
8 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
9 * FFmpeg is free software; you can redistribute it and/or |
781 | 10 * modify it under the terms of the GNU Lesser General Public |
11 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
12 * version 2.1 of the License, or (at your option) any later version. |
781 | 13 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
14 * FFmpeg is distributed in the hope that it will be useful, |
781 | 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 * Lesser General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
20 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
781 | 22 */ |
1106 | 23 |
24 /** | |
8718
e9d9d946f213
Use full internal pathname in doxygen @file directives.
diego
parents:
8694
diff
changeset
|
25 * @file libavcodec/fft.c |
1106 | 26 * FFT/IFFT transforms. |
27 */ | |
28 | |
781 | 29 #include "dsputil.h" |
30 | |
10407
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
31 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
32 #if !CONFIG_HARDCODED_TABLES |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
33 COSTABLE(16); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
34 COSTABLE(32); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
35 COSTABLE(64); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
36 COSTABLE(128); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
37 COSTABLE(256); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
38 COSTABLE(512); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
39 COSTABLE(1024); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
40 COSTABLE(2048); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
41 COSTABLE(4096); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
42 COSTABLE(8192); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
43 COSTABLE(16384); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
44 COSTABLE(32768); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
45 COSTABLE(65536); |
10400
866dffa620d1
Use hardcoded instead of runtime-calculated ff_cos_* tables if
reimar
parents:
10391
diff
changeset
|
46 #endif |
10407
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
47 COSTABLE_CONST FFTSample * const ff_cos_tabs[] = { |
10492
63910f7ba293
Pad ff_cos_tabs and ff_sin_tabs so that index n points to the table for n bits.
reimar
parents:
10407
diff
changeset
|
48 NULL, NULL, NULL, NULL, |
7542 | 49 ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024, |
50 ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536, | |
51 }; | |
52 | |
53 static int split_radix_permutation(int i, int n, int inverse) | |
54 { | |
55 int m; | |
56 if(n <= 2) return i&1; | |
57 m = n >> 1; | |
58 if(!(i&m)) return split_radix_permutation(i, m, inverse)*2; | |
59 m >>= 1; | |
60 if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1; | |
61 else return split_radix_permutation(i, m, inverse)*4 - 1; | |
62 } | |
63 | |
10496
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
64 av_cold void ff_init_ff_cos_tabs(int index) |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
65 { |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
66 #if !CONFIG_HARDCODED_TABLES |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
67 int i; |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
68 int m = 1<<index; |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
69 double freq = 2*M_PI/m; |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
70 FFTSample *tab = ff_cos_tabs[index]; |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
71 for(i=0; i<=m/4; i++) |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
72 tab[i] = cos(i*freq); |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
73 for(i=1; i<m/4; i++) |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
74 tab[m/2-i] = tab[i]; |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
75 #endif |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
76 } |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
77 |
8637 | 78 av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) |
781 | 79 { |
80 int i, j, m, n; | |
81 float alpha, c1, s1, s2; | |
6503 | 82 int av_unused has_vectors; |
2967 | 83 |
7542 | 84 if (nbits < 2 || nbits > 16) |
85 goto fail; | |
781 | 86 s->nbits = nbits; |
87 n = 1 << nbits; | |
88 | |
7542 | 89 s->tmp_buf = NULL; |
8974 | 90 s->exptab = av_malloc((n / 2) * sizeof(FFTComplex)); |
781 | 91 if (!s->exptab) |
92 goto fail; | |
93 s->revtab = av_malloc(n * sizeof(uint16_t)); | |
94 if (!s->revtab) | |
95 goto fail; | |
96 s->inverse = inverse; | |
97 | |
98 s2 = inverse ? 1.0 : -1.0; | |
2967 | 99 |
7542 | 100 s->fft_permute = ff_fft_permute_c; |
8974 | 101 s->fft_calc = ff_fft_calc_c; |
102 s->imdct_calc = ff_imdct_calc_c; | |
103 s->imdct_half = ff_imdct_half_c; | |
10161 | 104 s->mdct_calc = ff_mdct_calc_c; |
8974 | 105 s->exptab1 = NULL; |
10175
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
10172
diff
changeset
|
106 s->split_radix = 1; |
781 | 107 |
10175
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
10172
diff
changeset
|
108 if (ARCH_ARM) ff_fft_init_arm(s); |
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
10172
diff
changeset
|
109 if (HAVE_ALTIVEC) ff_fft_init_altivec(s); |
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
10172
diff
changeset
|
110 if (HAVE_MMX) ff_fft_init_mmx(s); |
781 | 111 |
10175
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
10172
diff
changeset
|
112 if (s->split_radix) { |
7542 | 113 for(j=4; j<=nbits; j++) { |
10496
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
114 ff_init_ff_cos_tabs(j); |
7542 | 115 } |
116 for(i=0; i<n; i++) | |
10172 | 117 s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i; |
7542 | 118 s->tmp_buf = av_malloc(n * sizeof(FFTComplex)); |
119 } else { | |
6504 | 120 int np, nblocks, np2, l; |
121 FFTComplex *q; | |
2967 | 122 |
7542 | 123 for(i=0; i<(n/2); i++) { |
124 alpha = 2 * M_PI * (float)i / (float)n; | |
125 c1 = cos(alpha); | |
126 s1 = sin(alpha) * s2; | |
127 s->exptab[i].re = c1; | |
128 s->exptab[i].im = s1; | |
129 } | |
130 | |
6504 | 131 np = 1 << nbits; |
132 nblocks = np >> 3; | |
133 np2 = np >> 1; | |
134 s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex)); | |
135 if (!s->exptab1) | |
136 goto fail; | |
137 q = s->exptab1; | |
138 do { | |
139 for(l = 0; l < np2; l += 2 * nblocks) { | |
140 *q++ = s->exptab[l]; | |
141 *q++ = s->exptab[l + nblocks]; | |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
971
diff
changeset
|
142 |
6504 | 143 q->re = -s->exptab[l].im; |
144 q->im = s->exptab[l].re; | |
145 q++; | |
146 q->re = -s->exptab[l + nblocks].im; | |
147 q->im = s->exptab[l + nblocks].re; | |
148 q++; | |
149 } | |
150 nblocks = nblocks >> 1; | |
151 } while (nblocks != 0); | |
152 av_freep(&s->exptab); | |
781 | 153 |
7543 | 154 /* compute bit reverse table */ |
155 for(i=0;i<n;i++) { | |
156 m=0; | |
157 for(j=0;j<nbits;j++) { | |
158 m |= ((i >> j) & 1) << (nbits-j-1); | |
159 } | |
160 s->revtab[i]=m; | |
781 | 161 } |
7542 | 162 } |
163 | |
781 | 164 return 0; |
165 fail: | |
166 av_freep(&s->revtab); | |
167 av_freep(&s->exptab); | |
168 av_freep(&s->exptab1); | |
7542 | 169 av_freep(&s->tmp_buf); |
781 | 170 return -1; |
171 } | |
172 | |
7542 | 173 void ff_fft_permute_c(FFTContext *s, FFTComplex *z) |
781 | 174 { |
175 int j, k, np; | |
176 FFTComplex tmp; | |
177 const uint16_t *revtab = s->revtab; | |
7542 | 178 np = 1 << s->nbits; |
179 | |
180 if (s->tmp_buf) { | |
181 /* TODO: handle split-radix permute in a more optimal way, probably in-place */ | |
182 for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j]; | |
183 memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); | |
184 return; | |
185 } | |
2967 | 186 |
781 | 187 /* reverse */ |
188 for(j=0;j<np;j++) { | |
189 k = revtab[j]; | |
190 if (k < j) { | |
191 tmp = z[k]; | |
192 z[k] = z[j]; | |
193 z[j] = tmp; | |
194 } | |
195 } | |
196 } | |
197 | |
8687 | 198 av_cold void ff_fft_end(FFTContext *s) |
781 | 199 { |
200 av_freep(&s->revtab); | |
201 av_freep(&s->exptab); | |
202 av_freep(&s->exptab1); | |
7542 | 203 av_freep(&s->tmp_buf); |
204 } | |
205 | |
206 #define sqrthalf (float)M_SQRT1_2 | |
207 | |
208 #define BF(x,y,a,b) {\ | |
209 x = a - b;\ | |
210 y = a + b;\ | |
211 } | |
212 | |
213 #define BUTTERFLIES(a0,a1,a2,a3) {\ | |
214 BF(t3, t5, t5, t1);\ | |
215 BF(a2.re, a0.re, a0.re, t5);\ | |
216 BF(a3.im, a1.im, a1.im, t3);\ | |
217 BF(t4, t6, t2, t6);\ | |
218 BF(a3.re, a1.re, a1.re, t4);\ | |
219 BF(a2.im, a0.im, a0.im, t6);\ | |
220 } | |
221 | |
222 // force loading all the inputs before storing any. | |
223 // this is slightly slower for small data, but avoids store->load aliasing | |
224 // for addresses separated by large powers of 2. | |
225 #define BUTTERFLIES_BIG(a0,a1,a2,a3) {\ | |
226 FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\ | |
227 BF(t3, t5, t5, t1);\ | |
228 BF(a2.re, a0.re, r0, t5);\ | |
229 BF(a3.im, a1.im, i1, t3);\ | |
230 BF(t4, t6, t2, t6);\ | |
231 BF(a3.re, a1.re, r1, t4);\ | |
232 BF(a2.im, a0.im, i0, t6);\ | |
233 } | |
234 | |
235 #define TRANSFORM(a0,a1,a2,a3,wre,wim) {\ | |
236 t1 = a2.re * wre + a2.im * wim;\ | |
237 t2 = a2.im * wre - a2.re * wim;\ | |
238 t5 = a3.re * wre - a3.im * wim;\ | |
239 t6 = a3.im * wre + a3.re * wim;\ | |
240 BUTTERFLIES(a0,a1,a2,a3)\ | |
241 } | |
242 | |
243 #define TRANSFORM_ZERO(a0,a1,a2,a3) {\ | |
244 t1 = a2.re;\ | |
245 t2 = a2.im;\ | |
246 t5 = a3.re;\ | |
247 t6 = a3.im;\ | |
248 BUTTERFLIES(a0,a1,a2,a3)\ | |
249 } | |
250 | |
251 /* z[0...8n-1], w[1...2n-1] */ | |
252 #define PASS(name)\ | |
253 static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\ | |
254 {\ | |
255 FFTSample t1, t2, t3, t4, t5, t6;\ | |
256 int o1 = 2*n;\ | |
257 int o2 = 4*n;\ | |
258 int o3 = 6*n;\ | |
259 const FFTSample *wim = wre+o1;\ | |
260 n--;\ | |
261 \ | |
262 TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\ | |
263 TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ | |
264 do {\ | |
265 z += 2;\ | |
266 wre += 2;\ | |
267 wim -= 2;\ | |
268 TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\ | |
269 TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ | |
270 } while(--n);\ | |
781 | 271 } |
272 | |
7542 | 273 PASS(pass) |
274 #undef BUTTERFLIES | |
275 #define BUTTERFLIES BUTTERFLIES_BIG | |
276 PASS(pass_big) | |
277 | |
278 #define DECL_FFT(n,n2,n4)\ | |
279 static void fft##n(FFTComplex *z)\ | |
280 {\ | |
281 fft##n2(z);\ | |
282 fft##n4(z+n4*2);\ | |
283 fft##n4(z+n4*3);\ | |
284 pass(z,ff_cos_##n,n4/2);\ | |
285 } | |
286 | |
287 static void fft4(FFTComplex *z) | |
288 { | |
289 FFTSample t1, t2, t3, t4, t5, t6, t7, t8; | |
290 | |
291 BF(t3, t1, z[0].re, z[1].re); | |
292 BF(t8, t6, z[3].re, z[2].re); | |
293 BF(z[2].re, z[0].re, t1, t6); | |
294 BF(t4, t2, z[0].im, z[1].im); | |
295 BF(t7, t5, z[2].im, z[3].im); | |
296 BF(z[3].im, z[1].im, t4, t8); | |
297 BF(z[3].re, z[1].re, t3, t7); | |
298 BF(z[2].im, z[0].im, t2, t5); | |
299 } | |
300 | |
301 static void fft8(FFTComplex *z) | |
302 { | |
303 FFTSample t1, t2, t3, t4, t5, t6, t7, t8; | |
304 | |
305 fft4(z); | |
306 | |
307 BF(t1, z[5].re, z[4].re, -z[5].re); | |
308 BF(t2, z[5].im, z[4].im, -z[5].im); | |
309 BF(t3, z[7].re, z[6].re, -z[7].re); | |
310 BF(t4, z[7].im, z[6].im, -z[7].im); | |
311 BF(t8, t1, t3, t1); | |
312 BF(t7, t2, t2, t4); | |
313 BF(z[4].re, z[0].re, z[0].re, t1); | |
314 BF(z[4].im, z[0].im, z[0].im, t2); | |
315 BF(z[6].re, z[2].re, z[2].re, t7); | |
316 BF(z[6].im, z[2].im, z[2].im, t8); | |
317 | |
318 TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf); | |
319 } | |
320 | |
8590 | 321 #if !CONFIG_SMALL |
7542 | 322 static void fft16(FFTComplex *z) |
323 { | |
324 FFTSample t1, t2, t3, t4, t5, t6; | |
325 | |
326 fft8(z); | |
327 fft4(z+8); | |
328 fft4(z+12); | |
329 | |
330 TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); | |
331 TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf); | |
332 TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]); | |
333 TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]); | |
334 } | |
335 #else | |
336 DECL_FFT(16,8,4) | |
337 #endif | |
338 DECL_FFT(32,16,8) | |
339 DECL_FFT(64,32,16) | |
340 DECL_FFT(128,64,32) | |
341 DECL_FFT(256,128,64) | |
342 DECL_FFT(512,256,128) | |
8590 | 343 #if !CONFIG_SMALL |
7542 | 344 #define pass pass_big |
345 #endif | |
346 DECL_FFT(1024,512,256) | |
347 DECL_FFT(2048,1024,512) | |
348 DECL_FFT(4096,2048,1024) | |
349 DECL_FFT(8192,4096,2048) | |
350 DECL_FFT(16384,8192,4096) | |
351 DECL_FFT(32768,16384,8192) | |
352 DECL_FFT(65536,32768,16384) | |
353 | |
10391 | 354 static void (* const fft_dispatch[])(FFTComplex*) = { |
7542 | 355 fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, |
356 fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, | |
357 }; | |
358 | |
359 void ff_fft_calc_c(FFTContext *s, FFTComplex *z) | |
360 { | |
361 fft_dispatch[s->nbits-2](z); | |
362 } | |
363 |