Mercurial > audlegacy
comparison Plugins/Input/wma/libffwma/fft.c @ 1398:1ddaf20ab50e trunk
[svn] AltiVec support for WMA, by Luca "lu_zero" Barbato from Gentoo.
author | chainsaw |
---|---|
date | Thu, 13 Jul 2006 16:01:57 -0700 |
parents | 62a33367a6cb |
children | f12d7e208b43 |
comparison
equal
deleted
inserted
replaced
1397:86242883ddc7 | 1398:1ddaf20ab50e |
---|---|
22 * FFT/IFFT transforms. | 22 * FFT/IFFT transforms. |
23 */ | 23 */ |
24 | 24 |
25 #include "dsputil.h" | 25 #include "dsputil.h" |
26 | 26 |
27 #ifdef HAVE_ALTIVEC | |
28 | |
29 #ifdef HAVE_ALTIVEC_H | |
30 #include <altivec.h> | |
31 #endif | |
32 | |
33 #ifdef CONFIG_DARWIN | |
34 #include <sys/sysctl.h> | |
35 #else /* CONFIG_DARWIN */ | |
36 #include <signal.h> | |
37 #include <setjmp.h> | |
38 | |
39 static sigjmp_buf jmpbuf; | |
40 static volatile sig_atomic_t canjump = 0; | |
41 | |
42 static void sigill_handler (int sig) | |
43 { | |
44 if (!canjump) { | |
45 signal (sig, SIG_DFL); | |
46 raise (sig); | |
47 } | |
48 | |
49 canjump = 0; | |
50 siglongjmp (jmpbuf, 1); | |
51 } | |
52 #endif /* CONFIG_DARWIN */ | |
53 | |
54 | |
55 #define WORD_0 0x00,0x01,0x02,0x03 | |
56 #define WORD_1 0x04,0x05,0x06,0x07 | |
57 #define WORD_2 0x08,0x09,0x0a,0x0b | |
58 #define WORD_3 0x0c,0x0d,0x0e,0x0f | |
59 #define WORD_s0 0x10,0x11,0x12,0x13 | |
60 #define WORD_s1 0x14,0x15,0x16,0x17 | |
61 #define WORD_s2 0x18,0x19,0x1a,0x1b | |
62 #define WORD_s3 0x1c,0x1d,0x1e,0x1f | |
63 | |
64 #ifdef CONFIG_DARWIN | |
65 #define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) | |
66 #else | |
67 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} | |
68 #endif | |
69 | |
70 // vcprmle is used to keep the same index as in the SSE version. | |
71 // it's the same as vcprm, with the index inversed | |
72 // ('le' is Little Endian) | |
73 #define vcprmle(a,b,c,d) vcprm(d,c,b,a) | |
74 | |
75 // used to build inverse/identity vectors (vcii) | |
76 // n is _n_egative, p is _p_ositive | |
77 #define FLOAT_n -1. | |
78 #define FLOAT_p 1. | |
79 | |
80 | |
81 #ifdef CONFIG_DARWIN | |
82 #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) | |
83 #else | |
84 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} | |
85 #endif | |
86 | |
87 int has_altivec(void) | |
88 { | |
89 #ifdef CONFIG_DARWIN | |
90 int sels[2] = {CTL_HW, HW_VECTORUNIT}; | |
91 int has_vu = 0; | |
92 size_t len = sizeof(has_vu); | |
93 int err; | |
94 | |
95 err = sysctl(sels, 2, &has_vu, &len, NULL, 0); | |
96 | |
97 if (err == 0) return (has_vu != 0); | |
98 #else /* CONFIG_DARWIN */ | |
99 /* no Darwin, do it the brute-force way */ | |
100 /* this is borrowed from the libmpeg2 library */ | |
101 { | |
102 signal (SIGILL, sigill_handler); | |
103 if (sigsetjmp (jmpbuf, 1)) { | |
104 signal (SIGILL, SIG_DFL); | |
105 } else { | |
106 canjump = 1; | |
107 | |
108 asm volatile ("mtspr 256, %0\n\t" | |
109 "vand %%v0, %%v0, %%v0" | |
110 : | |
111 : "r" (-1)); | |
112 | |
113 signal (SIGILL, SIG_DFL); | |
114 return 1; | |
115 } | |
116 } | |
117 #endif /* CONFIG_DARWIN */ | |
118 return 0; | |
119 } | |
120 | |
121 | |
122 void fft_calc_altivec(FFTContext *s, FFTComplex *z) | |
123 { | |
124 #ifdef CONFIG_DARWIN | |
125 register const vector float vczero = (const vector float)(0.); | |
126 #else | |
127 register const vector float vczero = (const vector float){0.,0.,0.,0.}; | |
128 #endif | |
129 | |
130 int ln = s->nbits; | |
131 int j, np, np2; | |
132 int nblocks, nloops; | |
133 register FFTComplex *p, *q; | |
134 FFTComplex *cptr, *cptr1; | |
135 int k; | |
136 | |
137 np = 1 << ln; | |
138 | |
139 { | |
140 vector float *r, a, b, a1, c1, c2; | |
141 | |
142 r = (vector float *)&z[0]; | |
143 | |
144 c1 = vcii(p,p,n,n); | |
145 | |
146 if (s->inverse) | |
147 { | |
148 c2 = vcii(p,p,n,p); | |
149 } | |
150 else | |
151 { | |
152 c2 = vcii(p,p,p,n); | |
153 } | |
154 | |
155 j = (np >> 2); | |
156 do { | |
157 a = vec_ld(0, r); | |
158 a1 = vec_ld(sizeof(vector float), r); | |
159 | |
160 b = vec_perm(a,a,vcprmle(1,0,3,2)); | |
161 a = vec_madd(a,c1,b); | |
162 /* do the pass 0 butterfly */ | |
163 | |
164 b = vec_perm(a1,a1,vcprmle(1,0,3,2)); | |
165 b = vec_madd(a1,c1,b); | |
166 /* do the pass 0 butterfly */ | |
167 | |
168 /* multiply third by -i */ | |
169 b = vec_perm(b,b,vcprmle(2,3,1,0)); | |
170 | |
171 /* do the pass 1 butterfly */ | |
172 vec_st(vec_madd(b,c2,a), 0, r); | |
173 vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r); | |
174 | |
175 r += 2; | |
176 } while (--j != 0); | |
177 } | |
178 /* pass 2 .. ln-1 */ | |
179 | |
180 nblocks = np >> 3; | |
181 nloops = 1 << 2; | |
182 np2 = np >> 1; | |
183 | |
184 cptr1 = s->exptab1; | |
185 do { | |
186 p = z; | |
187 q = z + nloops; | |
188 j = nblocks; | |
189 do { | |
190 cptr = cptr1; | |
191 k = nloops >> 1; | |
192 do { | |
193 vector float a,b,c,t1; | |
194 | |
195 a = vec_ld(0, (float*)p); | |
196 b = vec_ld(0, (float*)q); | |
197 | |
198 /* complex mul */ | |
199 c = vec_ld(0, (float*)cptr); | |
200 /* cre*re cim*re */ | |
201 t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero); | |
202 c = vec_ld(sizeof(vector float), (float*)cptr); | |
203 /* -cim*im cre*im */ | |
204 b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1); | |
205 | |
206 /* butterfly */ | |
207 vec_st(vec_add(a,b), 0, (float*)p); | |
208 vec_st(vec_sub(a,b), 0, (float*)q); | |
209 | |
210 p += 2; | |
211 q += 2; | |
212 cptr += 4; | |
213 } while (--k); | |
214 | |
215 p += nloops; | |
216 q += nloops; | |
217 } while (--j); | |
218 cptr1 += nloops * 2; | |
219 nblocks = nblocks >> 1; | |
220 nloops = nloops << 1; | |
221 } while (nblocks != 0); | |
222 } | |
223 | |
224 #endif | |
225 | |
27 /** | 226 /** |
28 * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is | 227 * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is |
29 * done | 228 * done |
30 */ | 229 */ |
31 int fft_inits(FFTContext *s, int nbits, int inverse) | 230 int fft_inits(FFTContext *s, int nbits, int inverse) |
34 float alpha, c1, s1, s2; | 233 float alpha, c1, s1, s2; |
35 | 234 |
36 s->nbits = nbits; | 235 s->nbits = nbits; |
37 n = 1 << nbits; | 236 n = 1 << nbits; |
38 | 237 |
39 s->exptab = malloc((n / 2) * sizeof(FFTComplex)); | 238 s->exptab = av_malloc((n / 2) * sizeof(FFTComplex)); |
40 if (!s->exptab) | 239 if (!s->exptab) |
41 goto fail; | 240 goto fail; |
42 s->revtab = malloc(n * sizeof(uint16_t)); | 241 s->revtab = av_malloc(n * sizeof(uint16_t)); |
43 if (!s->revtab) | 242 if (!s->revtab) |
44 goto fail; | 243 goto fail; |
45 s->inverse = inverse; | 244 s->inverse = inverse; |
46 | 245 |
47 s2 = inverse ? 1.0 : -1.0; | 246 s2 = inverse ? 1.0 : -1.0; |
54 s->exptab[i].im = s1; | 253 s->exptab[i].im = s1; |
55 } | 254 } |
56 s->fft_calc = fft_calc_c; | 255 s->fft_calc = fft_calc_c; |
57 s->exptab1 = NULL; | 256 s->exptab1 = NULL; |
58 /* compute constant table for HAVE_SSE version */ | 257 /* compute constant table for HAVE_SSE version */ |
59 #if (defined(HAVE_MMX) && defined(HAVE_BUILTIN_VECTOR)) || defined(BLAH_NO_ALTIVEC) | 258 #if (defined(HAVE_ALTIVEC)) |
60 { | 259 { |
61 int has_vectors = 0; | 260 int has_vectors = 0; |
62 | 261 |
63 #if defined(HAVE_MMX) | 262 #if defined(HAVE_ALTIVEC) |
64 has_vectors = mm_support() & MM_SSE; | 263 has_vectors = has_altivec(); |
65 #endif | |
66 #if defined(BLAH_NO_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE) | |
67 has_vectors = mm_support() & MM_ALTIVEC; | |
68 #endif | 264 #endif |
69 if (has_vectors) { | 265 if (has_vectors) { |
70 int np, nblocks, np2, l; | 266 int np, nblocks, np2, l; |
71 FFTComplex *q; | 267 FFTComplex *q; |
72 | 268 |
73 np = 1 << nbits; | 269 np = 1 << nbits; |
74 nblocks = np >> 3; | 270 nblocks = np >> 3; |
75 np2 = np >> 1; | 271 np2 = np >> 1; |
76 s->exptab1 = malloc(np * 2 * sizeof(FFTComplex)); | 272 s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex)); |
77 if (!s->exptab1) | 273 if (!s->exptab1) |
78 goto fail; | 274 goto fail; |
79 q = s->exptab1; | 275 q = s->exptab1; |
80 do { | 276 do { |
81 for(l = 0; l < np2; l += 2 * nblocks) { | 277 for(l = 0; l < np2; l += 2 * nblocks) { |
90 q++; | 286 q++; |
91 } | 287 } |
92 nblocks = nblocks >> 1; | 288 nblocks = nblocks >> 1; |
93 } while (nblocks != 0); | 289 } while (nblocks != 0); |
94 av_freep(&s->exptab); | 290 av_freep(&s->exptab); |
95 #if defined(HAVE_MMX) | |
96 s->fft_calc = fft_calc_sse; | |
97 #else | |
98 s->fft_calc = fft_calc_altivec; | 291 s->fft_calc = fft_calc_altivec; |
99 #endif | |
100 } | 292 } |
101 } | 293 } |
102 #endif | 294 #endif |
103 | 295 |
104 /* compute bit reverse table */ | 296 /* compute bit reverse table */ |