comparison Plugins/Input/wma/libffwma/fft.c @ 1398:1ddaf20ab50e trunk

[svn] AltiVec support for WMA, by Luca "lu_zero" Barbato from Gentoo.
author chainsaw
date Thu, 13 Jul 2006 16:01:57 -0700
parents 62a33367a6cb
children f12d7e208b43
comparison
equal deleted inserted replaced
1397:86242883ddc7 1398:1ddaf20ab50e
22 * FFT/IFFT transforms. 22 * FFT/IFFT transforms.
23 */ 23 */
24 24
25 #include "dsputil.h" 25 #include "dsputil.h"
26 26
27 #ifdef HAVE_ALTIVEC
28
29 #ifdef HAVE_ALTIVEC_H
30 #include <altivec.h>
31 #endif
32
33 #ifdef CONFIG_DARWIN
34 #include <sys/sysctl.h>
35 #else /* CONFIG_DARWIN */
36 #include <signal.h>
37 #include <setjmp.h>
38
39 static sigjmp_buf jmpbuf;
40 static volatile sig_atomic_t canjump = 0;
41
42 static void sigill_handler (int sig)
43 {
44 if (!canjump) {
45 signal (sig, SIG_DFL);
46 raise (sig);
47 }
48
49 canjump = 0;
50 siglongjmp (jmpbuf, 1);
51 }
52 #endif /* CONFIG_DARWIN */
53
54
55 #define WORD_0 0x00,0x01,0x02,0x03
56 #define WORD_1 0x04,0x05,0x06,0x07
57 #define WORD_2 0x08,0x09,0x0a,0x0b
58 #define WORD_3 0x0c,0x0d,0x0e,0x0f
59 #define WORD_s0 0x10,0x11,0x12,0x13
60 #define WORD_s1 0x14,0x15,0x16,0x17
61 #define WORD_s2 0x18,0x19,0x1a,0x1b
62 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
63
64 #ifdef CONFIG_DARWIN
65 #define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
66 #else
67 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
68 #endif
69
70 // vcprmle is used to keep the same index as in the SSE version.
71 // it's the same as vcprm, with the index inversed
72 // ('le' is Little Endian)
73 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
74
75 // used to build inverse/identity vectors (vcii)
76 // n is _n_egative, p is _p_ositive
77 #define FLOAT_n -1.
78 #define FLOAT_p 1.
79
80
81 #ifdef CONFIG_DARWIN
82 #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
83 #else
84 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
85 #endif
86
87 int has_altivec(void)
88 {
89 #ifdef CONFIG_DARWIN
90 int sels[2] = {CTL_HW, HW_VECTORUNIT};
91 int has_vu = 0;
92 size_t len = sizeof(has_vu);
93 int err;
94
95 err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
96
97 if (err == 0) return (has_vu != 0);
98 #else /* CONFIG_DARWIN */
99 /* no Darwin, do it the brute-force way */
100 /* this is borrowed from the libmpeg2 library */
101 {
102 signal (SIGILL, sigill_handler);
103 if (sigsetjmp (jmpbuf, 1)) {
104 signal (SIGILL, SIG_DFL);
105 } else {
106 canjump = 1;
107
108 asm volatile ("mtspr 256, %0\n\t"
109 "vand %%v0, %%v0, %%v0"
110 :
111 : "r" (-1));
112
113 signal (SIGILL, SIG_DFL);
114 return 1;
115 }
116 }
117 #endif /* CONFIG_DARWIN */
118 return 0;
119 }
120
121
122 void fft_calc_altivec(FFTContext *s, FFTComplex *z)
123 {
124 #ifdef CONFIG_DARWIN
125 register const vector float vczero = (const vector float)(0.);
126 #else
127 register const vector float vczero = (const vector float){0.,0.,0.,0.};
128 #endif
129
130 int ln = s->nbits;
131 int j, np, np2;
132 int nblocks, nloops;
133 register FFTComplex *p, *q;
134 FFTComplex *cptr, *cptr1;
135 int k;
136
137 np = 1 << ln;
138
139 {
140 vector float *r, a, b, a1, c1, c2;
141
142 r = (vector float *)&z[0];
143
144 c1 = vcii(p,p,n,n);
145
146 if (s->inverse)
147 {
148 c2 = vcii(p,p,n,p);
149 }
150 else
151 {
152 c2 = vcii(p,p,p,n);
153 }
154
155 j = (np >> 2);
156 do {
157 a = vec_ld(0, r);
158 a1 = vec_ld(sizeof(vector float), r);
159
160 b = vec_perm(a,a,vcprmle(1,0,3,2));
161 a = vec_madd(a,c1,b);
162 /* do the pass 0 butterfly */
163
164 b = vec_perm(a1,a1,vcprmle(1,0,3,2));
165 b = vec_madd(a1,c1,b);
166 /* do the pass 0 butterfly */
167
168 /* multiply third by -i */
169 b = vec_perm(b,b,vcprmle(2,3,1,0));
170
171 /* do the pass 1 butterfly */
172 vec_st(vec_madd(b,c2,a), 0, r);
173 vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);
174
175 r += 2;
176 } while (--j != 0);
177 }
178 /* pass 2 .. ln-1 */
179
180 nblocks = np >> 3;
181 nloops = 1 << 2;
182 np2 = np >> 1;
183
184 cptr1 = s->exptab1;
185 do {
186 p = z;
187 q = z + nloops;
188 j = nblocks;
189 do {
190 cptr = cptr1;
191 k = nloops >> 1;
192 do {
193 vector float a,b,c,t1;
194
195 a = vec_ld(0, (float*)p);
196 b = vec_ld(0, (float*)q);
197
198 /* complex mul */
199 c = vec_ld(0, (float*)cptr);
200 /* cre*re cim*re */
201 t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
202 c = vec_ld(sizeof(vector float), (float*)cptr);
203 /* -cim*im cre*im */
204 b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);
205
206 /* butterfly */
207 vec_st(vec_add(a,b), 0, (float*)p);
208 vec_st(vec_sub(a,b), 0, (float*)q);
209
210 p += 2;
211 q += 2;
212 cptr += 4;
213 } while (--k);
214
215 p += nloops;
216 q += nloops;
217 } while (--j);
218 cptr1 += nloops * 2;
219 nblocks = nblocks >> 1;
220 nloops = nloops << 1;
221 } while (nblocks != 0);
222 }
223
224 #endif
225
27 /** 226 /**
28 * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is 227 * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is
29 * done 228 * done
30 */ 229 */
31 int fft_inits(FFTContext *s, int nbits, int inverse) 230 int fft_inits(FFTContext *s, int nbits, int inverse)
34 float alpha, c1, s1, s2; 233 float alpha, c1, s1, s2;
35 234
36 s->nbits = nbits; 235 s->nbits = nbits;
37 n = 1 << nbits; 236 n = 1 << nbits;
38 237
39 s->exptab = malloc((n / 2) * sizeof(FFTComplex)); 238 s->exptab = av_malloc((n / 2) * sizeof(FFTComplex));
40 if (!s->exptab) 239 if (!s->exptab)
41 goto fail; 240 goto fail;
42 s->revtab = malloc(n * sizeof(uint16_t)); 241 s->revtab = av_malloc(n * sizeof(uint16_t));
43 if (!s->revtab) 242 if (!s->revtab)
44 goto fail; 243 goto fail;
45 s->inverse = inverse; 244 s->inverse = inverse;
46 245
47 s2 = inverse ? 1.0 : -1.0; 246 s2 = inverse ? 1.0 : -1.0;
54 s->exptab[i].im = s1; 253 s->exptab[i].im = s1;
55 } 254 }
56 s->fft_calc = fft_calc_c; 255 s->fft_calc = fft_calc_c;
57 s->exptab1 = NULL; 256 s->exptab1 = NULL;
58 /* compute constant table for HAVE_SSE version */ 257 /* compute constant table for HAVE_SSE version */
59 #if (defined(HAVE_MMX) && defined(HAVE_BUILTIN_VECTOR)) || defined(BLAH_NO_ALTIVEC) 258 #if (defined(HAVE_ALTIVEC))
60 { 259 {
61 int has_vectors = 0; 260 int has_vectors = 0;
62 261
63 #if defined(HAVE_MMX) 262 #if defined(HAVE_ALTIVEC)
64 has_vectors = mm_support() & MM_SSE; 263 has_vectors = has_altivec();
65 #endif
66 #if defined(BLAH_NO_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE)
67 has_vectors = mm_support() & MM_ALTIVEC;
68 #endif 264 #endif
69 if (has_vectors) { 265 if (has_vectors) {
70 int np, nblocks, np2, l; 266 int np, nblocks, np2, l;
71 FFTComplex *q; 267 FFTComplex *q;
72 268
73 np = 1 << nbits; 269 np = 1 << nbits;
74 nblocks = np >> 3; 270 nblocks = np >> 3;
75 np2 = np >> 1; 271 np2 = np >> 1;
76 s->exptab1 = malloc(np * 2 * sizeof(FFTComplex)); 272 s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex));
77 if (!s->exptab1) 273 if (!s->exptab1)
78 goto fail; 274 goto fail;
79 q = s->exptab1; 275 q = s->exptab1;
80 do { 276 do {
81 for(l = 0; l < np2; l += 2 * nblocks) { 277 for(l = 0; l < np2; l += 2 * nblocks) {
90 q++; 286 q++;
91 } 287 }
92 nblocks = nblocks >> 1; 288 nblocks = nblocks >> 1;
93 } while (nblocks != 0); 289 } while (nblocks != 0);
94 av_freep(&s->exptab); 290 av_freep(&s->exptab);
95 #if defined(HAVE_MMX)
96 s->fft_calc = fft_calc_sse;
97 #else
98 s->fft_calc = fft_calc_altivec; 291 s->fft_calc = fft_calc_altivec;
99 #endif
100 } 292 }
101 } 293 }
102 #endif 294 #endif
103 295
104 /* compute bit reverse table */ 296 /* compute bit reverse table */