Mercurial > mplayer.hg
annotate liba52/imdct_3dnow.h @ 14216:0d4589ab4d28
Adds support for LADSPA (Linux Audio Developer's Simple Plugin API) plugins.
Compilation is optional and can be controlled by configure. You need to
have the LADSPA SDK installed in order to have it autodetected by configure.
Manual page is updated.
author | ivo |
---|---|
date | Thu, 23 Dec 2004 02:09:49 +0000 |
parents | 772d6d27fd66 |
children | 4bad7f00556e |
rev | line source |
---|---|
4497 | 1 /* |
2 3DNOW and 3DNOWEX optimized IMDCT | |
3 Licence: GPL v2 | |
4 Copyrights: Nick Kurshev | |
5 */ | |
6 | |
/*
 * Map the generic FFT_*_3DNOW names used below onto the concrete function
 * names for the variant being compiled.  Any previous mapping is dropped
 * first, which lets this header be processed more than once (presumably once
 * per variant -- confirm against the including file).
 */
#undef FFT_4_3DNOW
#undef FFT_8_3DNOW
#undef FFT_ASMB_3DNOW
#undef FFT_ASMB16_3DNOW
#undef FFT_128P_3DNOW

#ifdef HAVE_3DNOWEX
/* HAVE_3DNOWEX build: names carry the _3dnowex suffix. */
#define FFT_4_3DNOW		fft_4_3dnowex
#define FFT_8_3DNOW		fft_8_3dnowex
#define FFT_ASMB_3DNOW		fft_asmb_3dnowex
#define FFT_ASMB16_3DNOW	fft_asmb16_3dnowex
#define FFT_128P_3DNOW		fft_128p_3dnowex
#else
/* Build without HAVE_3DNOWEX: names carry the _3dnow suffix. */
#define FFT_4_3DNOW		fft_4_3dnow
#define FFT_8_3DNOW		fft_8_3dnow
#define FFT_ASMB_3DNOW		fft_asmb_3dnow
#define FFT_ASMB16_3DNOW	fft_asmb16_3dnow
#define FFT_128P_3DNOW		fft_128p_3dnow
#endif /* HAVE_3DNOWEX */
26 | |
27 static void FFT_4_3DNOW(complex_t *x) | |
28 { | |
29 /* delta_p = 1 here */ | |
30 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} | |
31 */ | |
32 __asm__ __volatile__( | |
33 "movq 24(%1), %%mm3\n\t" | |
34 "movq 8(%1), %%mm1\n\t" | |
35 "pxor %2, %%mm3\n\t" /* mm3.re | -mm3.im */ | |
36 "pxor %3, %%mm1\n\t" /* -mm1.re | mm1.im */ | |
37 "pfadd %%mm1, %%mm3\n\t" /* vi.im = x[3].re - x[1].re; */ | |
38 "movq %%mm3, %%mm4\n\t" /* vi.re =-x[3].im + x[1].im; mm4 = vi */ | |
39 #ifdef HAVE_3DNOWEX | |
40 "pswapd %%mm4, %%mm4\n\t" | |
41 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
42 "punpckldq %%mm4, %%mm5\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
43 "punpckhdq %%mm5, %%mm4\n\t" |
4497 | 44 #endif |
45 "movq (%1), %%mm5\n\t" /* yb.re = x[0].re - x[2].re; */ | |
46 "movq (%1), %%mm6\n\t" /* yt.re = x[0].re + x[2].re; */ | |
47 "movq 24(%1), %%mm7\n\t" /* u.re = x[3].re + x[1].re; */ | |
48 "pfsub 16(%1), %%mm5\n\t" /* yb.im = x[0].im - x[2].im; mm5 = yb */ | |
49 "pfadd 16(%1), %%mm6\n\t" /* yt.im = x[0].im + x[2].im; mm6 = yt */ | |
50 "pfadd 8(%1), %%mm7\n\t" /* u.im = x[3].im + x[1].im; mm7 = u */ | |
51 | |
52 "movq %%mm6, %%mm0\n\t" /* x[0].re = yt.re + u.re; */ | |
53 "movq %%mm5, %%mm1\n\t" /* x[1].re = yb.re + vi.re; */ | |
54 "pfadd %%mm7, %%mm0\n\t" /*x[0].im = yt.im + u.im; */ | |
55 "pfadd %%mm4, %%mm1\n\t" /* x[1].im = yb.im + vi.im; */ | |
56 "movq %%mm0, (%0)\n\t" | |
57 "movq %%mm1, 8(%0)\n\t" | |
58 | |
59 "pfsub %%mm7, %%mm6\n\t" /* x[2].re = yt.re - u.re; */ | |
60 "pfsub %%mm4, %%mm5\n\t" /* x[3].re = yb.re - vi.re; */ | |
61 "movq %%mm6, 16(%0)\n\t" /* x[2].im = yt.im - u.im; */ | |
62 "movq %%mm5, 24(%0)" /* x[3].im = yb.im - vi.im; */ | |
63 :"=r"(x) | |
64 :"0"(x), | |
65 "m"(x_plus_minus_3dnow), | |
66 "m"(x_minus_plus_3dnow) | |
67 :"memory"); | |
68 } | |
69 | |
70 static void FFT_8_3DNOW(complex_t *x) | |
71 { | |
72 /* delta_p = diag{1, sqrt(i)} here */ | |
73 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} | |
74 */ | |
75 complex_t wT1, wB1, wB2; | |
76 | |
77 __asm__ __volatile__( | |
78 "movq 8(%2), %%mm0\n\t" | |
79 "movq 24(%2), %%mm1\n\t" | |
80 "movq %%mm0, %0\n\t" /* wT1 = x[1]; */ | |
81 "movq %%mm1, %1\n\t" /* wB1 = x[3]; */ | |
82 :"=m"(wT1), "=m"(wB1) | |
83 :"r"(x) | |
84 :"memory"); | |
85 | |
86 __asm__ __volatile__( | |
87 "movq 16(%0), %%mm2\n\t" | |
88 "movq 32(%0), %%mm3\n\t" | |
89 "movq %%mm2, 8(%0)\n\t" /* x[1] = x[2]; */ | |
90 "movq 48(%0), %%mm4\n\t" | |
91 "movq %%mm3, 16(%0)\n\t" /* x[2] = x[4]; */ | |
92 "movq %%mm4, 24(%0)\n\t" /* x[3] = x[6]; */ | |
93 :"=r"(x) | |
94 :"0"(x) | |
95 :"memory"); | |
96 | |
97 fft_4_3dnow(&x[0]); | |
98 | |
99 /* x[0] x[4] x[2] x[6] */ | |
100 | |
101 __asm__ __volatile__( | |
102 "movq 40(%1), %%mm0\n\t" | |
103 "movq %%mm0, %%mm3\n\t" | |
104 "movq 56(%1), %%mm1\n\t" | |
105 "pfadd %%mm1, %%mm0\n\t" | |
106 "pfsub %%mm1, %%mm3\n\t" | |
107 "movq (%2), %%mm2\n\t" | |
108 "pfadd %%mm2, %%mm0\n\t" | |
109 "pfadd %%mm2, %%mm3\n\t" | |
110 "movq (%3), %%mm1\n\t" | |
111 "pfadd %%mm1, %%mm0\n\t" | |
112 "pfsub %%mm1, %%mm3\n\t" | |
113 "movq (%1), %%mm1\n\t" | |
114 "movq 16(%1), %%mm4\n\t" | |
115 "movq %%mm1, %%mm2\n\t" | |
116 #ifdef HAVE_3DNOWEX | |
117 "pswapd %%mm3, %%mm3\n\t" | |
118 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
119 "punpckldq %%mm3, %%mm6\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
120 "punpckhdq %%mm6, %%mm3\n\t" |
4497 | 121 #endif |
122 "pfadd %%mm0, %%mm1\n\t" | |
123 "movq %%mm4, %%mm5\n\t" | |
124 "pfsub %%mm0, %%mm2\n\t" | |
125 "pfadd %%mm3, %%mm4\n\t" | |
126 "movq %%mm1, (%0)\n\t" | |
127 "pfsub %%mm3, %%mm5\n\t" | |
128 "movq %%mm2, 32(%0)\n\t" | |
129 "movd %%mm4, 16(%0)\n\t" | |
130 "movd %%mm5, 48(%0)\n\t" | |
131 "psrlq $32, %%mm4\n\t" | |
132 "psrlq $32, %%mm5\n\t" | |
133 "movd %%mm4, 52(%0)\n\t" | |
134 "movd %%mm5, 20(%0)" | |
135 :"=r"(x) | |
136 :"0"(x), "r"(&wT1), "r"(&wB1) | |
137 :"memory"); | |
138 | |
139 /* x[1] x[5] */ | |
140 __asm__ __volatile__ ( | |
141 "movq %6, %%mm6\n\t" | |
142 "movq %5, %%mm7\n\t" | |
143 "movq %1, %%mm0\n\t" | |
144 "movq %2, %%mm1\n\t" | |
145 "movq 56(%3), %%mm3\n\t" | |
146 "pfsub 40(%3), %%mm0\n\t" | |
147 #ifdef HAVE_3DNOWEX | |
148 "pswapd %%mm1, %%mm1\n\t" | |
149 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
150 "punpckldq %%mm1, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
151 "punpckhdq %%mm2, %%mm1\n\t" |
4497 | 152 #endif |
153 "pxor %%mm7, %%mm1\n\t" | |
154 "pfadd %%mm1, %%mm0\n\t" | |
155 #ifdef HAVE_3DNOWEX | |
156 "pswapd %%mm3, %%mm3\n\t" | |
157 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
158 "punpckldq %%mm3, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
159 "punpckhdq %%mm2, %%mm3\n\t" |
4497 | 160 #endif |
161 "pxor %%mm6, %%mm3\n\t" | |
162 "pfadd %%mm3, %%mm0\n\t" | |
163 "movq %%mm0, %%mm1\n\t" | |
164 "pxor %%mm6, %%mm1\n\t" | |
165 "pfacc %%mm1, %%mm0\n\t" | |
166 "pfmul %4, %%mm0\n\t" | |
167 | |
168 "movq 40(%3), %%mm5\n\t" | |
169 #ifdef HAVE_3DNOWEX | |
170 "pswapd %%mm5, %%mm5\n\t" | |
171 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
172 "punpckldq %%mm5, %%mm1\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
173 "punpckhdq %%mm1, %%mm5\n\t" |
4497 | 174 #endif |
175 "movq %%mm5, %0\n\t" | |
176 | |
177 "movq 8(%3), %%mm1\n\t" | |
178 "movq %%mm1, %%mm2\n\t" | |
179 "pfsub %%mm0, %%mm1\n\t" | |
180 "pfadd %%mm0, %%mm2\n\t" | |
181 "movq %%mm1, 40(%3)\n\t" | |
182 "movq %%mm2, 8(%3)\n\t" | |
183 :"=m"(wB2) | |
184 :"m"(wT1), "m"(wB1), "r"(x), "m"(HSQRT2_3DNOW), | |
185 "m"(x_plus_minus_3dnow), "m"(x_minus_plus_3dnow) | |
186 :"memory"); | |
187 | |
188 | |
189 /* x[3] x[7] */ | |
190 __asm__ __volatile__( | |
191 "movq %1, %%mm0\n\t" | |
192 #ifdef HAVE_3DNOWEX | |
193 "pswapd %3, %%mm1\n\t" | |
194 #else | |
195 "movq %3, %%mm1\n\t" | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
196 "punpckldq %%mm1, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
197 "punpckhdq %%mm2, %%mm1\n\t" |
4497 | 198 #endif |
199 "pxor %%mm6, %%mm1\n\t" | |
200 "pfadd %%mm1, %%mm0\n\t" | |
201 "movq %2, %%mm2\n\t" | |
202 "movq 56(%4), %%mm3\n\t" | |
203 "pxor %%mm7, %%mm3\n\t" | |
204 "pfadd %%mm3, %%mm2\n\t" | |
205 #ifdef HAVE_3DNOWEX | |
206 "pswapd %%mm2, %%mm2\n\t" | |
207 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
208 "punpckldq %%mm2, %%mm5\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
209 "punpckhdq %%mm5, %%mm2\n\t" |
4497 | 210 #endif |
211 "movq 24(%4), %%mm3\n\t" | |
212 "pfsub %%mm2, %%mm0\n\t" | |
213 "movq %%mm3, %%mm4\n\t" | |
214 "movq %%mm0, %%mm1\n\t" | |
215 "pxor %%mm6, %%mm0\n\t" | |
216 "pfacc %%mm1, %%mm0\n\t" | |
217 "pfmul %5, %%mm0\n\t" | |
218 "movq %%mm0, %%mm1\n\t" | |
219 "pxor %%mm6, %%mm1\n\t" | |
220 "pxor %%mm7, %%mm0\n\t" | |
221 "pfadd %%mm1, %%mm3\n\t" | |
222 "pfadd %%mm0, %%mm4\n\t" | |
223 "movq %%mm4, 24(%0)\n\t" | |
224 "movq %%mm3, 56(%0)\n\t" | |
225 :"=r"(x) | |
226 :"m"(wT1), "m"(wB2), "m"(wB1), "0"(x), "m"(HSQRT2_3DNOW) | |
227 :"memory"); | |
228 } | |
229 | |
230 static void FFT_ASMB_3DNOW(int k, complex_t *x, complex_t *wTB, | |
231 const complex_t *d, const complex_t *d_3) | |
232 { | |
233 register complex_t *x2k, *x3k, *x4k, *wB; | |
234 | |
235 TRANS_FILL_MM6_MM7_3DNOW(); | |
236 x2k = x + 2 * k; | |
237 x3k = x2k + 2 * k; | |
238 x4k = x3k + 2 * k; | |
239 wB = wTB + 2 * k; | |
240 | |
241 TRANSZERO_3DNOW(x[0],x2k[0],x3k[0],x4k[0]); | |
242 TRANS_3DNOW(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]); | |
243 | |
244 --k; | |
245 for(;;) { | |
246 TRANS_3DNOW(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]); | |
247 TRANS_3DNOW(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]); | |
248 if (!--k) break; | |
249 x += 2; | |
250 x2k += 2; | |
251 x3k += 2; | |
252 x4k += 2; | |
253 d += 2; | |
254 d_3 += 2; | |
255 wTB += 2; | |
256 wB += 2; | |
257 } | |
258 | |
259 } | |
260 | |
261 void FFT_ASMB16_3DNOW(complex_t *x, complex_t *wTB) | |
262 { | |
263 int k = 2; | |
264 | |
265 TRANS_FILL_MM6_MM7_3DNOW(); | |
266 /* transform x[0], x[8], x[4], x[12] */ | |
267 TRANSZERO_3DNOW(x[0],x[4],x[8],x[12]); | |
268 | |
269 /* transform x[1], x[9], x[5], x[13] */ | |
270 TRANS_3DNOW(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]); | |
271 | |
272 /* transform x[2], x[10], x[6], x[14] */ | |
273 TRANSHALF_16_3DNOW(x[2],x[6],x[10],x[14]); | |
274 | |
275 /* transform x[3], x[11], x[7], x[15] */ | |
276 TRANS_3DNOW(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]); | |
277 | |
278 } | |
279 | |
280 static void FFT_128P_3DNOW(complex_t *a) | |
281 { | |
282 FFT_8_3DNOW(&a[0]); FFT_4_3DNOW(&a[8]); FFT_4_3DNOW(&a[12]); | |
283 FFT_ASMB16_3DNOW(&a[0], &a[8]); | |
284 | |
285 FFT_8_3DNOW(&a[16]), FFT_8_3DNOW(&a[24]); | |
286 FFT_ASMB_3DNOW(4, &a[0], &a[16],&delta32[0], &delta32_3[0]); | |
287 | |
288 FFT_8_3DNOW(&a[32]); FFT_4_3DNOW(&a[40]); FFT_4_3DNOW(&a[44]); | |
289 FFT_ASMB16_3DNOW(&a[32], &a[40]); | |
290 | |
291 FFT_8_3DNOW(&a[48]); FFT_4_3DNOW(&a[56]); FFT_4_3DNOW(&a[60]); | |
292 FFT_ASMB16_3DNOW(&a[48], &a[56]); | |
293 | |
294 FFT_ASMB_3DNOW(8, &a[0], &a[32],&delta64[0], &delta64_3[0]); | |
295 | |
296 FFT_8_3DNOW(&a[64]); FFT_4_3DNOW(&a[72]); FFT_4_3DNOW(&a[76]); | |
297 /* FFT_16(&a[64]); */ | |
298 FFT_ASMB16_3DNOW(&a[64], &a[72]); | |
299 | |
300 FFT_8_3DNOW(&a[80]); FFT_8_3DNOW(&a[88]); | |
301 | |
302 /* FFT_32(&a[64]); */ | |
303 FFT_ASMB_3DNOW(4, &a[64], &a[80],&delta32[0], &delta32_3[0]); | |
304 | |
305 FFT_8_3DNOW(&a[96]); FFT_4_3DNOW(&a[104]), FFT_4_3DNOW(&a[108]); | |
306 /* FFT_16(&a[96]); */ | |
307 FFT_ASMB16_3DNOW(&a[96], &a[104]); | |
308 | |
309 FFT_8_3DNOW(&a[112]), FFT_8_3DNOW(&a[120]); | |
310 /* FFT_32(&a[96]); */ | |
311 FFT_ASMB_3DNOW(4, &a[96], &a[112], &delta32[0], &delta32_3[0]); | |
312 | |
313 /* FFT_128(&a[0]); */ | |
314 FFT_ASMB_3DNOW(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); | |
315 } | |
316 | |
317 static void | |
318 #ifdef HAVE_3DNOWEX | |
319 imdct_do_512_3dnowex | |
320 #else | |
321 imdct_do_512_3dnow | |
322 #endif | |
323 (sample_t data[],sample_t delay[], sample_t bias) | |
324 { | |
8254
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
8230
diff
changeset
|
325 int i; |
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
8230
diff
changeset
|
326 /* int k; |
4497 | 327 int p,q; |
328 int m; | |
329 int two_m; | |
330 int two_m_plus_one; | |
331 | |
332 sample_t tmp_a_i; | |
333 sample_t tmp_a_r; | |
334 sample_t tmp_b_i; | |
8254
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
8230
diff
changeset
|
335 sample_t tmp_b_r;*/ |
4497 | 336 |
337 sample_t *data_ptr; | |
338 sample_t *delay_ptr; | |
339 sample_t *window_ptr; | |
340 | |
341 /* 512 IMDCT with source and dest data in 'data' */ | |
342 | |
343 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ | |
344 #if 1 | |
345 __asm__ __volatile__ ( | |
346 "movq %0, %%mm7\n\t" | |
347 ::"m"(x_plus_minus_3dnow) | |
348 :"memory"); | |
349 for( i=0; i < 128; i++) { | |
350 int j = pm128[i]; | |
351 __asm__ __volatile__ ( | |
352 "movd %1, %%mm0\n\t" | |
353 "movd %3, %%mm1\n\t" | |
354 "punpckldq %2, %%mm0\n\t" /* mm0 = data[256-2*j-1] | data[2*j]*/ | |
355 "punpckldq %4, %%mm1\n\t" /* mm1 = xcos[j] | xsin[j] */ | |
356 "movq %%mm0, %%mm2\n\t" | |
357 "pfmul %%mm1, %%mm0\n\t" | |
358 #ifdef HAVE_3DNOWEX | |
359 "pswapd %%mm1, %%mm1\n\t" | |
360 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
361 "punpckldq %%mm1, %%mm5\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
362 "punpckhdq %%mm5, %%mm1\n\t" |
4497 | 363 #endif |
364 "pfmul %%mm1, %%mm2\n\t" | |
365 #ifdef HAVE_3DNOWEX | |
366 "pfpnacc %%mm2, %%mm0\n\t" | |
367 #else | |
368 "pxor %%mm7, %%mm0\n\t" | |
369 "pfacc %%mm2, %%mm0\n\t" | |
370 #endif | |
371 "pxor %%mm7, %%mm0\n\t" | |
372 "movq %%mm0, %0" | |
373 :"=m"(buf[i]) | |
374 :"m"(data[256-2*j-1]), "m"(data[2*j]), "m"(xcos1[j]), "m"(xsin1[j]) | |
375 :"memory" | |
376 ); | |
377 /* buf[i].re = (data[256-2*j-1] * xcos1[j] - data[2*j] * xsin1[j]); | |
378 buf[i].im = (data[256-2*j-1] * xsin1[j] + data[2*j] * xcos1[j])*(-1.0);*/ | |
379 } | |
380 #else | |
381 __asm__ __volatile__ ("femms":::"memory"); | |
382 for( i=0; i < 128; i++) { | |
383 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ | |
384 int j= pm128[i]; | |
385 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); | |
386 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); | |
387 } | |
388 #endif | |
389 | |
390 /* FFT Merge */ | |
391 /* unoptimized variant | |
392 for (m=1; m < 7; m++) { | |
393 if(m) | |
394 two_m = (1 << m); | |
395 else | |
396 two_m = 1; | |
397 | |
398 two_m_plus_one = (1 << (m+1)); | |
399 | |
400 for(i = 0; i < 128; i += two_m_plus_one) { | |
401 for(k = 0; k < two_m; k++) { | |
402 p = k + i; | |
403 q = p + two_m; | |
404 tmp_a_r = buf[p].real; | |
405 tmp_a_i = buf[p].imag; | |
406 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
407 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
408 buf[p].real = tmp_a_r + tmp_b_r; | |
409 buf[p].imag = tmp_a_i + tmp_b_i; | |
410 buf[q].real = tmp_a_r - tmp_b_r; | |
411 buf[q].imag = tmp_a_i - tmp_b_i; | |
412 } | |
413 } | |
414 } | |
415 */ | |
416 | |
417 FFT_128P_3DNOW (&buf[0]); | |
418 // asm volatile ("femms \n\t":::"memory"); | |
419 | |
420 /* Post IFFT complex multiply plus IFFT complex conjugate*/ | |
421 #if 1 | |
422 __asm__ __volatile__ ( | |
423 "movq %0, %%mm7\n\t" | |
424 "movq %1, %%mm6\n\t" | |
425 ::"m"(x_plus_minus_3dnow), | |
426 "m"(x_minus_plus_3dnow) | |
427 :"eax","memory"); | |
428 for (i=0; i < 128; i++) { | |
429 __asm__ __volatile__ ( | |
430 "movq %1, %%mm0\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ | |
431 "movq %%mm0, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ | |
432 #ifndef HAVE_3DNOWEX | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
433 "punpckldq %%mm1, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
434 "punpckhdq %%mm2, %%mm1\n\t" |
4497 | 435 #else |
436 "pswapd %%mm1, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ | |
437 #endif | |
438 "movd %3, %%mm3\n\t" /* ac3_xsin[i] */ | |
439 "punpckldq %2, %%mm3\n\t" /* ac3_xsin[i] | ac3_xcos[i] */ | |
440 "pfmul %%mm3, %%mm0\n\t" | |
441 "pfmul %%mm3, %%mm1\n\t" | |
442 #ifndef HAVE_3DNOWEX | |
443 "pxor %%mm7, %%mm0\n\t" | |
444 "pfacc %%mm1, %%mm0\n\t" | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
445 "punpckldq %%mm0, %%mm1\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
446 "punpckhdq %%mm1, %%mm0\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
447 "movq %%mm0, %0\n\t" |
4497 | 448 #else |
449 "pfpnacc %%mm1, %%mm0\n\t" /* mm0 = mm0[0] - mm0[1] | mm1[0] + mm1[1] */ | |
450 "pswapd %%mm0, %%mm0\n\t" | |
451 "movq %%mm0, %0" | |
452 #endif | |
453 :"=m"(buf[i]) | |
454 :"m"(buf[i]),"m"(xcos1[i]),"m"(xsin1[i]) | |
455 :"memory"); | |
456 /* ac3_buf[i].re =(tmp_a_r * ac3_xcos1[i]) + (tmp_a_i * ac3_xsin1[i]); | |
457 ac3_buf[i].im =(tmp_a_r * ac3_xsin1[i]) - (tmp_a_i * ac3_xcos1[i]);*/ | |
458 } | |
459 #else | |
460 __asm__ __volatile__ ("femms":::"memory"); | |
461 for( i=0; i < 128; i++) { | |
462 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ | |
463 tmp_a_r = buf[i].real; | |
464 tmp_a_i = -1.0 * buf[i].imag; | |
465 buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]); | |
466 buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]); | |
467 } | |
468 #endif | |
469 | |
470 data_ptr = data; | |
471 delay_ptr = delay; | |
472 window_ptr = imdct_window; | |
473 | |
474 /* Window and convert to real valued signal */ | |
475 #if 1 | |
476 asm volatile ( | |
477 "movd (%0), %%mm3 \n\t" | |
478 "punpckldq %%mm3, %%mm3 \n\t" | |
479 :: "r" (&bias) | |
480 ); | |
481 for (i=0; i< 64; i++) { | |
482 /* merge two loops in one to enable working of 2 decoders */ | |
483 __asm__ __volatile__ ( | |
484 "movd 516(%1), %%mm0\n\t" | |
485 "movd (%1), %%mm1\n\t" /**data_ptr++=-buf[64+i].im**window_ptr+++*delay_ptr++;*/ | |
486 "punpckldq (%2), %%mm0\n\t"/*data_ptr[128]=-buf[i].re*window_ptr[128]+delay_ptr[128];*/ | |
487 "punpckldq 516(%2), %%mm1\n\t" | |
488 "pfmul (%3), %%mm0\n\t"/**data_ptr++=buf[64-i-1].re**window_ptr+++*delay_ptr++;*/ | |
489 "pfmul 512(%3), %%mm1\n\t" | |
490 "pxor %%mm6, %%mm0\n\t"/*data_ptr[128]=buf[128-i-1].im*window_ptr[128]+delay_ptr[128];*/ | |
491 "pxor %%mm6, %%mm1\n\t" | |
492 "pfadd (%4), %%mm0\n\t" | |
493 "pfadd 512(%4), %%mm1\n\t" | |
494 "pfadd %%mm3, %%mm0\n\t" | |
495 "pfadd %%mm3, %%mm1\n\t" | |
496 "movq %%mm0, (%0)\n\t" | |
497 "movq %%mm1, 512(%0)" | |
498 :"=r"(data_ptr) | |
499 :"r"(&buf[i].real), "r"(&buf[64-i-1].real), "r"(window_ptr), "r"(delay_ptr), "0"(data_ptr) | |
500 :"memory"); | |
501 data_ptr += 2; | |
502 window_ptr += 2; | |
503 delay_ptr += 2; | |
504 } | |
505 window_ptr += 128; | |
506 #else | |
507 __asm__ __volatile__ ("femms":::"memory"); | |
508 for(i=0; i< 64; i++) { | |
509 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
510 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
511 } | |
512 | |
513 for(i=0; i< 64; i++) { | |
514 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
515 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
516 } | |
517 #endif | |
518 | |
519 /* The trailing edge of the window goes into the delay line */ | |
520 delay_ptr = delay; | |
521 #if 1 | |
522 for(i=0; i< 64; i++) { | |
523 /* merge two loops in one to enable working of 2 decoders */ | |
524 window_ptr -=2; | |
525 __asm__ __volatile__( | |
526 "movd 508(%1), %%mm0\n\t" | |
527 "movd (%1), %%mm1\n\t" | |
528 "punpckldq (%2), %%mm0\n\t" | |
529 "punpckldq 508(%2), %%mm1\n\t" | |
530 #ifdef HAVE_3DNOWEX | |
531 "pswapd (%3), %%mm3\n\t" | |
532 "pswapd -512(%3), %%mm4\n\t" | |
533 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
534 "movq (%3), %%mm3\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
535 "punpckldq %%mm3, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
536 "punpckhdq %%mm2, %%mm3\n\t" |
4497 | 537 "movq -512(%3), %%mm4\n\t" |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
538 "punpckldq %%mm4, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
539 "punpckhdq %%mm2, %%mm4\n\t" |
4497 | 540 #endif |
541 "pfmul %%mm3, %%mm0\n\t" | |
542 "pfmul %%mm4, %%mm1\n\t" | |
543 "pxor %%mm6, %%mm0\n\t" | |
544 "pxor %%mm7, %%mm1\n\t" | |
545 "movq %%mm0, (%0)\n\t" | |
546 "movq %%mm1, 512(%0)" | |
547 :"=r"(delay_ptr) | |
548 :"r"(&buf[i].imag), "r"(&buf[64-i-1].imag), "r"(window_ptr), "0"(delay_ptr) | |
549 :"memory"); | |
550 delay_ptr += 2; | |
551 } | |
552 __asm__ __volatile__ ("femms":::"memory"); | |
553 #else | |
554 __asm__ __volatile__ ("femms":::"memory"); | |
555 for(i=0; i< 64; i++) { | |
556 *delay_ptr++ = -buf[64+i].real * *--window_ptr; | |
557 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; | |
558 } | |
559 | |
560 for(i=0; i<64; i++) { | |
561 *delay_ptr++ = buf[i].imag * *--window_ptr; | |
562 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; | |
563 } | |
564 #endif | |
565 } |