Mercurial > libavcodec.hg
annotate fft.c @ 12266:48d6738904a9 libavcodec
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
splits it into small optimization-specific macros which are selected for each
DSP function. The advantage of this approach is that the sse4 functions now
use the ssse3 codepath also without needing an explicit sse4 codepath.
author | rbultje |
---|---|
date | Sat, 24 Jul 2010 19:33:05 +0000 |
parents | a2e5b142776b |
children |
rev | line source |
---|---|
781 | 1 /* |
2 * FFT/IFFT transforms | |
7542 | 3 * Copyright (c) 2008 Loren Merritt |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8590
diff
changeset
|
4 * Copyright (c) 2002 Fabrice Bellard |
7542 | 5 * Partly based on libdjbfft by D. J. Bernstein |
781 | 6 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
7 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
8 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
9 * FFmpeg is free software; you can redistribute it and/or |
781 | 10 * modify it under the terms of the GNU Lesser General Public |
11 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
12 * version 2.1 of the License, or (at your option) any later version. |
781 | 13 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
14 * FFmpeg is distributed in the hope that it will be useful, |
781 | 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 * Lesser General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3746
diff
changeset
|
20 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
781 | 22 */ |
1106 | 23 |
24 /** | |
11644
7dd2a45249a9
Remove explicit filename from Doxygen @file commands.
diego
parents:
11444
diff
changeset
|
25 * @file |
1106 | 26 * FFT/IFFT transforms. |
27 */ | |
28 | |
11444
6f1697664bf2
Replace many includes of libavutil/common.h with what is actually needed
mru
parents:
11370
diff
changeset
|
29 #include <stdlib.h> |
6f1697664bf2
Replace many includes of libavutil/common.h with what is actually needed
mru
parents:
11370
diff
changeset
|
30 #include <string.h> |
11370 | 31 #include "libavutil/mathematics.h" |
32 #include "fft.h" | |
781 | 33 |
10407
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
34 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
35 #if !CONFIG_HARDCODED_TABLES |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
36 COSTABLE(16); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
37 COSTABLE(32); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
38 COSTABLE(64); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
39 COSTABLE(128); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
40 COSTABLE(256); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
41 COSTABLE(512); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
42 COSTABLE(1024); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
43 COSTABLE(2048); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
44 COSTABLE(4096); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
45 COSTABLE(8192); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
46 COSTABLE(16384); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
47 COSTABLE(32768); |
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
48 COSTABLE(65536); |
10400
866dffa620d1
Use hardcoded instead of runtime-calculated ff_cos_* tables if
reimar
parents:
10391
diff
changeset
|
49 #endif |
10407
57acce8b1380
Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
reimar
parents:
10400
diff
changeset
|
50 COSTABLE_CONST FFTSample * const ff_cos_tabs[] = { |
10492
63910f7ba293
Pad ff_cos_tabs and ff_sin_tabs so that index n points to the table for n bits.
reimar
parents:
10407
diff
changeset
|
51 NULL, NULL, NULL, NULL, |
7542 | 52 ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024, |
53 ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536, | |
54 }; | |
55 | |
56 static int split_radix_permutation(int i, int n, int inverse) | |
57 { | |
58 int m; | |
59 if(n <= 2) return i&1; | |
60 m = n >> 1; | |
61 if(!(i&m)) return split_radix_permutation(i, m, inverse)*2; | |
62 m >>= 1; | |
63 if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1; | |
64 else return split_radix_permutation(i, m, inverse)*4 - 1; | |
65 } | |
66 | |
10496
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
67 av_cold void ff_init_ff_cos_tabs(int index) |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
68 { |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
69 #if !CONFIG_HARDCODED_TABLES |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
70 int i; |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
71 int m = 1<<index; |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
72 double freq = 2*M_PI/m; |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
73 FFTSample *tab = ff_cos_tabs[index]; |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
74 for(i=0; i<=m/4; i++) |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
75 tab[i] = cos(i*freq); |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
76 for(i=1; i<m/4; i++) |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
77 tab[m/2-i] = tab[i]; |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
78 #endif |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
79 } |
74b0c1a0851e
Add ff_init_ff_cos_tabs function and use it in rdft.c to ensure that the
reimar
parents:
10492
diff
changeset
|
80 |
8637 | 81 av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) |
781 | 82 { |
12047 | 83 int i, j, n; |
2967 | 84 |
7542 | 85 if (nbits < 2 || nbits > 16) |
86 goto fail; | |
781 | 87 s->nbits = nbits; |
88 n = 1 << nbits; | |
89 | |
90 s->revtab = av_malloc(n * sizeof(uint16_t)); | |
91 if (!s->revtab) | |
92 goto fail; | |
12047 | 93 s->tmp_buf = av_malloc(n * sizeof(FFTComplex)); |
94 if (!s->tmp_buf) | |
95 goto fail; | |
781 | 96 s->inverse = inverse; |
97 | |
7542 | 98 s->fft_permute = ff_fft_permute_c; |
8974 | 99 s->fft_calc = ff_fft_calc_c; |
11122 | 100 #if CONFIG_MDCT |
8974 | 101 s->imdct_calc = ff_imdct_calc_c; |
102 s->imdct_half = ff_imdct_half_c; | |
10161 | 103 s->mdct_calc = ff_mdct_calc_c; |
11122 | 104 #endif |
781 | 105 |
10175
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
10172
diff
changeset
|
106 if (ARCH_ARM) ff_fft_init_arm(s); |
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
10172
diff
changeset
|
107 if (HAVE_ALTIVEC) ff_fft_init_altivec(s); |
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
10172
diff
changeset
|
108 if (HAVE_MMX) ff_fft_init_mmx(s); |
781 | 109 |
12048 | 110 for(j=4; j<=nbits; j++) { |
111 ff_init_ff_cos_tabs(j); | |
112 } | |
113 for(i=0; i<n; i++) | |
114 s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i; | |
7542 | 115 |
781 | 116 return 0; |
117 fail: | |
118 av_freep(&s->revtab); | |
7542 | 119 av_freep(&s->tmp_buf); |
781 | 120 return -1; |
121 } | |
122 | |
7542 | 123 void ff_fft_permute_c(FFTContext *s, FFTComplex *z) |
781 | 124 { |
12047 | 125 int j, np; |
781 | 126 const uint16_t *revtab = s->revtab; |
7542 | 127 np = 1 << s->nbits; |
12048 | 128 /* TODO: handle split-radix permute in a more optimal way, probably in-place */ |
129 for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j]; | |
130 memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); | |
781 | 131 } |
132 | |
8687 | 133 av_cold void ff_fft_end(FFTContext *s) |
781 | 134 { |
135 av_freep(&s->revtab); | |
7542 | 136 av_freep(&s->tmp_buf); |
137 } | |
138 | |
139 #define sqrthalf (float)M_SQRT1_2 | |
140 | |
141 #define BF(x,y,a,b) {\ | |
142 x = a - b;\ | |
143 y = a + b;\ | |
144 } | |
145 | |
146 #define BUTTERFLIES(a0,a1,a2,a3) {\ | |
147 BF(t3, t5, t5, t1);\ | |
148 BF(a2.re, a0.re, a0.re, t5);\ | |
149 BF(a3.im, a1.im, a1.im, t3);\ | |
150 BF(t4, t6, t2, t6);\ | |
151 BF(a3.re, a1.re, a1.re, t4);\ | |
152 BF(a2.im, a0.im, a0.im, t6);\ | |
153 } | |
154 | |
155 // force loading all the inputs before storing any. | |
156 // this is slightly slower for small data, but avoids store->load aliasing | |
157 // for addresses separated by large powers of 2. | |
158 #define BUTTERFLIES_BIG(a0,a1,a2,a3) {\ | |
159 FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\ | |
160 BF(t3, t5, t5, t1);\ | |
161 BF(a2.re, a0.re, r0, t5);\ | |
162 BF(a3.im, a1.im, i1, t3);\ | |
163 BF(t4, t6, t2, t6);\ | |
164 BF(a3.re, a1.re, r1, t4);\ | |
165 BF(a2.im, a0.im, i0, t6);\ | |
166 } | |
167 | |
168 #define TRANSFORM(a0,a1,a2,a3,wre,wim) {\ | |
169 t1 = a2.re * wre + a2.im * wim;\ | |
170 t2 = a2.im * wre - a2.re * wim;\ | |
171 t5 = a3.re * wre - a3.im * wim;\ | |
172 t6 = a3.im * wre + a3.re * wim;\ | |
173 BUTTERFLIES(a0,a1,a2,a3)\ | |
174 } | |
175 | |
176 #define TRANSFORM_ZERO(a0,a1,a2,a3) {\ | |
177 t1 = a2.re;\ | |
178 t2 = a2.im;\ | |
179 t5 = a3.re;\ | |
180 t6 = a3.im;\ | |
181 BUTTERFLIES(a0,a1,a2,a3)\ | |
182 } | |
183 | |
184 /* z[0...8n-1], w[1...2n-1] */ | |
185 #define PASS(name)\ | |
186 static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\ | |
187 {\ | |
188 FFTSample t1, t2, t3, t4, t5, t6;\ | |
189 int o1 = 2*n;\ | |
190 int o2 = 4*n;\ | |
191 int o3 = 6*n;\ | |
192 const FFTSample *wim = wre+o1;\ | |
193 n--;\ | |
194 \ | |
195 TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\ | |
196 TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ | |
197 do {\ | |
198 z += 2;\ | |
199 wre += 2;\ | |
200 wim -= 2;\ | |
201 TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\ | |
202 TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ | |
203 } while(--n);\ | |
781 | 204 } |
205 | |
7542 | 206 PASS(pass) |
207 #undef BUTTERFLIES | |
208 #define BUTTERFLIES BUTTERFLIES_BIG | |
209 PASS(pass_big) | |
210 | |
211 #define DECL_FFT(n,n2,n4)\ | |
212 static void fft##n(FFTComplex *z)\ | |
213 {\ | |
214 fft##n2(z);\ | |
215 fft##n4(z+n4*2);\ | |
216 fft##n4(z+n4*3);\ | |
217 pass(z,ff_cos_##n,n4/2);\ | |
218 } | |
219 | |
220 static void fft4(FFTComplex *z) | |
221 { | |
222 FFTSample t1, t2, t3, t4, t5, t6, t7, t8; | |
223 | |
224 BF(t3, t1, z[0].re, z[1].re); | |
225 BF(t8, t6, z[3].re, z[2].re); | |
226 BF(z[2].re, z[0].re, t1, t6); | |
227 BF(t4, t2, z[0].im, z[1].im); | |
228 BF(t7, t5, z[2].im, z[3].im); | |
229 BF(z[3].im, z[1].im, t4, t8); | |
230 BF(z[3].re, z[1].re, t3, t7); | |
231 BF(z[2].im, z[0].im, t2, t5); | |
232 } | |
233 | |
234 static void fft8(FFTComplex *z) | |
235 { | |
236 FFTSample t1, t2, t3, t4, t5, t6, t7, t8; | |
237 | |
238 fft4(z); | |
239 | |
240 BF(t1, z[5].re, z[4].re, -z[5].re); | |
241 BF(t2, z[5].im, z[4].im, -z[5].im); | |
242 BF(t3, z[7].re, z[6].re, -z[7].re); | |
243 BF(t4, z[7].im, z[6].im, -z[7].im); | |
244 BF(t8, t1, t3, t1); | |
245 BF(t7, t2, t2, t4); | |
246 BF(z[4].re, z[0].re, z[0].re, t1); | |
247 BF(z[4].im, z[0].im, z[0].im, t2); | |
248 BF(z[6].re, z[2].re, z[2].re, t7); | |
249 BF(z[6].im, z[2].im, z[2].im, t8); | |
250 | |
251 TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf); | |
252 } | |
253 | |
8590 | 254 #if !CONFIG_SMALL |
7542 | 255 static void fft16(FFTComplex *z) |
256 { | |
257 FFTSample t1, t2, t3, t4, t5, t6; | |
258 | |
259 fft8(z); | |
260 fft4(z+8); | |
261 fft4(z+12); | |
262 | |
263 TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); | |
264 TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf); | |
265 TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]); | |
266 TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]); | |
267 } | |
268 #else | |
269 DECL_FFT(16,8,4) | |
270 #endif | |
271 DECL_FFT(32,16,8) | |
272 DECL_FFT(64,32,16) | |
273 DECL_FFT(128,64,32) | |
274 DECL_FFT(256,128,64) | |
275 DECL_FFT(512,256,128) | |
8590 | 276 #if !CONFIG_SMALL |
7542 | 277 #define pass pass_big |
278 #endif | |
279 DECL_FFT(1024,512,256) | |
280 DECL_FFT(2048,1024,512) | |
281 DECL_FFT(4096,2048,1024) | |
282 DECL_FFT(8192,4096,2048) | |
283 DECL_FFT(16384,8192,4096) | |
284 DECL_FFT(32768,16384,8192) | |
285 DECL_FFT(65536,32768,16384) | |
286 | |
10391 | 287 static void (* const fft_dispatch[])(FFTComplex*) = { |
7542 | 288 fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, |
289 fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, | |
290 }; | |
291 | |
292 void ff_fft_calc_c(FFTContext *s, FFTComplex *z) | |
293 { | |
294 fft_dispatch[s->nbits-2](z); | |
295 } | |
296 |