Mercurial > mplayer.hg
annotate liba52/imdct_3dnow.h @ 25509:1cfa39a320cc
Fix update_subtitles() checking subtitle type for the wrong track.
update_subtitles() uses 'type' field from d_dvdsub even when some other track
is active. For this reason, external vobsub is not displayed when there is at
least one text track from demuxer (type is always 't' or 'a' in this case).
The solution is to check vobsub_id and dvdsub_id instead.
author | eugeni |
---|---|
date | Fri, 28 Dec 2007 20:57:38 +0000 |
parents | 8855a2568281 |
children | 08d18fe9da52 |
rev | line source |
---|---|
4497 | 1 /* |
25323
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
2 * 3DNOW and 3DNOWEX optimized IMDCT |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
3 * Copyright (C) 2002 Nick Kurshev |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
4 * |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
5 * This file is part of a52dec, a free ATSC A-52 stream decoder. |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
6 * See http://liba52.sourceforge.net/ for updates. |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
7 * |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
8 * a52dec is free software; you can redistribute it and/or modify |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
9 * it under the terms of the GNU General Public License as published by |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
10 * the Free Software Foundation; either version 2 of the License, or |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
11 * (at your option) any later version. |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
12 * |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
13 * a52dec is distributed in the hope that it will be useful, |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
16 * GNU General Public License for more details. |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
17 * |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
18 * You should have received a copy of the GNU General Public License |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
19 * along with this program; if not, write to the Free Software |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
8855a2568281
Relicense as GPL v2 or later like the rest of liba52.
diego
parents:
18720
diff
changeset
|
21 */ |
4497 | 22 |
/*
 * The functions in this file are written against the FFT_*_3DNOW names.
 * HAVE_3DNOWEX selects whether those names expand to the *_3dnowex or the
 * *_3dnow identifiers.  Any earlier definitions are dropped first
 * (presumably so this header can be included once per variant -- confirm
 * against the including file).
 */
#undef FFT_4_3DNOW
#undef FFT_8_3DNOW
#undef FFT_ASMB_3DNOW
#undef FFT_ASMB16_3DNOW
#undef FFT_128P_3DNOW

#ifdef HAVE_3DNOWEX
#define FFT_4_3DNOW fft_4_3dnowex
#define FFT_8_3DNOW fft_8_3dnowex
#define FFT_ASMB_3DNOW fft_asmb_3dnowex
#define FFT_ASMB16_3DNOW fft_asmb16_3dnowex
#define FFT_128P_3DNOW fft_128p_3dnowex
#else
#define FFT_4_3DNOW fft_4_3dnow
#define FFT_8_3DNOW fft_8_3dnow
#define FFT_ASMB_3DNOW fft_asmb_3dnow
#define FFT_ASMB16_3DNOW fft_asmb16_3dnow
#define FFT_128P_3DNOW fft_128p_3dnow
#endif
42 | |
43 static void FFT_4_3DNOW(complex_t *x) | |
44 { | |
45 /* delta_p = 1 here */ | |
46 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} | |
47 */ | |
48 __asm__ __volatile__( | |
49 "movq 24(%1), %%mm3\n\t" | |
50 "movq 8(%1), %%mm1\n\t" | |
51 "pxor %2, %%mm3\n\t" /* mm3.re | -mm3.im */ | |
52 "pxor %3, %%mm1\n\t" /* -mm1.re | mm1.im */ | |
53 "pfadd %%mm1, %%mm3\n\t" /* vi.im = x[3].re - x[1].re; */ | |
54 "movq %%mm3, %%mm4\n\t" /* vi.re =-x[3].im + x[1].im; mm4 = vi */ | |
55 #ifdef HAVE_3DNOWEX | |
56 "pswapd %%mm4, %%mm4\n\t" | |
57 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
58 "punpckldq %%mm4, %%mm5\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
59 "punpckhdq %%mm5, %%mm4\n\t" |
4497 | 60 #endif |
61 "movq (%1), %%mm5\n\t" /* yb.re = x[0].re - x[2].re; */ | |
62 "movq (%1), %%mm6\n\t" /* yt.re = x[0].re + x[2].re; */ | |
63 "movq 24(%1), %%mm7\n\t" /* u.re = x[3].re + x[1].re; */ | |
64 "pfsub 16(%1), %%mm5\n\t" /* yb.im = x[0].im - x[2].im; mm5 = yb */ | |
65 "pfadd 16(%1), %%mm6\n\t" /* yt.im = x[0].im + x[2].im; mm6 = yt */ | |
66 "pfadd 8(%1), %%mm7\n\t" /* u.im = x[3].im + x[1].im; mm7 = u */ | |
67 | |
68 "movq %%mm6, %%mm0\n\t" /* x[0].re = yt.re + u.re; */ | |
69 "movq %%mm5, %%mm1\n\t" /* x[1].re = yb.re + vi.re; */ | |
70 "pfadd %%mm7, %%mm0\n\t" /*x[0].im = yt.im + u.im; */ | |
71 "pfadd %%mm4, %%mm1\n\t" /* x[1].im = yb.im + vi.im; */ | |
72 "movq %%mm0, (%0)\n\t" | |
73 "movq %%mm1, 8(%0)\n\t" | |
74 | |
75 "pfsub %%mm7, %%mm6\n\t" /* x[2].re = yt.re - u.re; */ | |
76 "pfsub %%mm4, %%mm5\n\t" /* x[3].re = yb.re - vi.re; */ | |
77 "movq %%mm6, 16(%0)\n\t" /* x[2].im = yt.im - u.im; */ | |
78 "movq %%mm5, 24(%0)" /* x[3].im = yb.im - vi.im; */ | |
79 :"=r"(x) | |
80 :"0"(x), | |
81 "m"(x_plus_minus_3dnow), | |
82 "m"(x_minus_plus_3dnow) | |
83 :"memory"); | |
84 } | |
85 | |
86 static void FFT_8_3DNOW(complex_t *x) | |
87 { | |
88 /* delta_p = diag{1, sqrt(i)} here */ | |
89 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} | |
90 */ | |
91 complex_t wT1, wB1, wB2; | |
92 | |
93 __asm__ __volatile__( | |
94 "movq 8(%2), %%mm0\n\t" | |
95 "movq 24(%2), %%mm1\n\t" | |
96 "movq %%mm0, %0\n\t" /* wT1 = x[1]; */ | |
97 "movq %%mm1, %1\n\t" /* wB1 = x[3]; */ | |
98 :"=m"(wT1), "=m"(wB1) | |
99 :"r"(x) | |
100 :"memory"); | |
101 | |
102 __asm__ __volatile__( | |
103 "movq 16(%0), %%mm2\n\t" | |
104 "movq 32(%0), %%mm3\n\t" | |
105 "movq %%mm2, 8(%0)\n\t" /* x[1] = x[2]; */ | |
106 "movq 48(%0), %%mm4\n\t" | |
107 "movq %%mm3, 16(%0)\n\t" /* x[2] = x[4]; */ | |
108 "movq %%mm4, 24(%0)\n\t" /* x[3] = x[6]; */ | |
109 :"=r"(x) | |
110 :"0"(x) | |
111 :"memory"); | |
112 | |
113 fft_4_3dnow(&x[0]); | |
114 | |
115 /* x[0] x[4] x[2] x[6] */ | |
116 | |
117 __asm__ __volatile__( | |
118 "movq 40(%1), %%mm0\n\t" | |
119 "movq %%mm0, %%mm3\n\t" | |
120 "movq 56(%1), %%mm1\n\t" | |
121 "pfadd %%mm1, %%mm0\n\t" | |
122 "pfsub %%mm1, %%mm3\n\t" | |
123 "movq (%2), %%mm2\n\t" | |
124 "pfadd %%mm2, %%mm0\n\t" | |
125 "pfadd %%mm2, %%mm3\n\t" | |
126 "movq (%3), %%mm1\n\t" | |
127 "pfadd %%mm1, %%mm0\n\t" | |
128 "pfsub %%mm1, %%mm3\n\t" | |
129 "movq (%1), %%mm1\n\t" | |
130 "movq 16(%1), %%mm4\n\t" | |
131 "movq %%mm1, %%mm2\n\t" | |
132 #ifdef HAVE_3DNOWEX | |
133 "pswapd %%mm3, %%mm3\n\t" | |
134 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
135 "punpckldq %%mm3, %%mm6\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
136 "punpckhdq %%mm6, %%mm3\n\t" |
4497 | 137 #endif |
138 "pfadd %%mm0, %%mm1\n\t" | |
139 "movq %%mm4, %%mm5\n\t" | |
140 "pfsub %%mm0, %%mm2\n\t" | |
141 "pfadd %%mm3, %%mm4\n\t" | |
142 "movq %%mm1, (%0)\n\t" | |
143 "pfsub %%mm3, %%mm5\n\t" | |
144 "movq %%mm2, 32(%0)\n\t" | |
145 "movd %%mm4, 16(%0)\n\t" | |
146 "movd %%mm5, 48(%0)\n\t" | |
147 "psrlq $32, %%mm4\n\t" | |
148 "psrlq $32, %%mm5\n\t" | |
149 "movd %%mm4, 52(%0)\n\t" | |
150 "movd %%mm5, 20(%0)" | |
151 :"=r"(x) | |
152 :"0"(x), "r"(&wT1), "r"(&wB1) | |
153 :"memory"); | |
154 | |
155 /* x[1] x[5] */ | |
156 __asm__ __volatile__ ( | |
157 "movq %6, %%mm6\n\t" | |
158 "movq %5, %%mm7\n\t" | |
159 "movq %1, %%mm0\n\t" | |
160 "movq %2, %%mm1\n\t" | |
161 "movq 56(%3), %%mm3\n\t" | |
162 "pfsub 40(%3), %%mm0\n\t" | |
163 #ifdef HAVE_3DNOWEX | |
164 "pswapd %%mm1, %%mm1\n\t" | |
165 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
166 "punpckldq %%mm1, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
167 "punpckhdq %%mm2, %%mm1\n\t" |
4497 | 168 #endif |
169 "pxor %%mm7, %%mm1\n\t" | |
170 "pfadd %%mm1, %%mm0\n\t" | |
171 #ifdef HAVE_3DNOWEX | |
172 "pswapd %%mm3, %%mm3\n\t" | |
173 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
174 "punpckldq %%mm3, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
175 "punpckhdq %%mm2, %%mm3\n\t" |
4497 | 176 #endif |
177 "pxor %%mm6, %%mm3\n\t" | |
178 "pfadd %%mm3, %%mm0\n\t" | |
179 "movq %%mm0, %%mm1\n\t" | |
180 "pxor %%mm6, %%mm1\n\t" | |
181 "pfacc %%mm1, %%mm0\n\t" | |
182 "pfmul %4, %%mm0\n\t" | |
183 | |
184 "movq 40(%3), %%mm5\n\t" | |
185 #ifdef HAVE_3DNOWEX | |
186 "pswapd %%mm5, %%mm5\n\t" | |
187 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
188 "punpckldq %%mm5, %%mm1\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
189 "punpckhdq %%mm1, %%mm5\n\t" |
4497 | 190 #endif |
191 "movq %%mm5, %0\n\t" | |
192 | |
193 "movq 8(%3), %%mm1\n\t" | |
194 "movq %%mm1, %%mm2\n\t" | |
195 "pfsub %%mm0, %%mm1\n\t" | |
196 "pfadd %%mm0, %%mm2\n\t" | |
197 "movq %%mm1, 40(%3)\n\t" | |
198 "movq %%mm2, 8(%3)\n\t" | |
199 :"=m"(wB2) | |
200 :"m"(wT1), "m"(wB1), "r"(x), "m"(HSQRT2_3DNOW), | |
201 "m"(x_plus_minus_3dnow), "m"(x_minus_plus_3dnow) | |
202 :"memory"); | |
203 | |
204 | |
205 /* x[3] x[7] */ | |
206 __asm__ __volatile__( | |
207 "movq %1, %%mm0\n\t" | |
208 #ifdef HAVE_3DNOWEX | |
209 "pswapd %3, %%mm1\n\t" | |
210 #else | |
211 "movq %3, %%mm1\n\t" | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
212 "punpckldq %%mm1, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
213 "punpckhdq %%mm2, %%mm1\n\t" |
4497 | 214 #endif |
215 "pxor %%mm6, %%mm1\n\t" | |
216 "pfadd %%mm1, %%mm0\n\t" | |
217 "movq %2, %%mm2\n\t" | |
218 "movq 56(%4), %%mm3\n\t" | |
219 "pxor %%mm7, %%mm3\n\t" | |
220 "pfadd %%mm3, %%mm2\n\t" | |
221 #ifdef HAVE_3DNOWEX | |
222 "pswapd %%mm2, %%mm2\n\t" | |
223 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
224 "punpckldq %%mm2, %%mm5\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
225 "punpckhdq %%mm5, %%mm2\n\t" |
4497 | 226 #endif |
227 "movq 24(%4), %%mm3\n\t" | |
228 "pfsub %%mm2, %%mm0\n\t" | |
229 "movq %%mm3, %%mm4\n\t" | |
230 "movq %%mm0, %%mm1\n\t" | |
231 "pxor %%mm6, %%mm0\n\t" | |
232 "pfacc %%mm1, %%mm0\n\t" | |
233 "pfmul %5, %%mm0\n\t" | |
234 "movq %%mm0, %%mm1\n\t" | |
235 "pxor %%mm6, %%mm1\n\t" | |
236 "pxor %%mm7, %%mm0\n\t" | |
237 "pfadd %%mm1, %%mm3\n\t" | |
238 "pfadd %%mm0, %%mm4\n\t" | |
239 "movq %%mm4, 24(%0)\n\t" | |
240 "movq %%mm3, 56(%0)\n\t" | |
241 :"=r"(x) | |
242 :"m"(wT1), "m"(wB2), "m"(wB1), "0"(x), "m"(HSQRT2_3DNOW) | |
243 :"memory"); | |
244 } | |
245 | |
246 static void FFT_ASMB_3DNOW(int k, complex_t *x, complex_t *wTB, | |
247 const complex_t *d, const complex_t *d_3) | |
248 { | |
249 register complex_t *x2k, *x3k, *x4k, *wB; | |
250 | |
251 TRANS_FILL_MM6_MM7_3DNOW(); | |
252 x2k = x + 2 * k; | |
253 x3k = x2k + 2 * k; | |
254 x4k = x3k + 2 * k; | |
255 wB = wTB + 2 * k; | |
256 | |
257 TRANSZERO_3DNOW(x[0],x2k[0],x3k[0],x4k[0]); | |
258 TRANS_3DNOW(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]); | |
259 | |
260 --k; | |
261 for(;;) { | |
262 TRANS_3DNOW(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]); | |
263 TRANS_3DNOW(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]); | |
264 if (!--k) break; | |
265 x += 2; | |
266 x2k += 2; | |
267 x3k += 2; | |
268 x4k += 2; | |
269 d += 2; | |
270 d_3 += 2; | |
271 wTB += 2; | |
272 wB += 2; | |
273 } | |
274 | |
275 } | |
276 | |
277 void FFT_ASMB16_3DNOW(complex_t *x, complex_t *wTB) | |
278 { | |
279 int k = 2; | |
280 | |
281 TRANS_FILL_MM6_MM7_3DNOW(); | |
282 /* transform x[0], x[8], x[4], x[12] */ | |
283 TRANSZERO_3DNOW(x[0],x[4],x[8],x[12]); | |
284 | |
285 /* transform x[1], x[9], x[5], x[13] */ | |
286 TRANS_3DNOW(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]); | |
287 | |
288 /* transform x[2], x[10], x[6], x[14] */ | |
289 TRANSHALF_16_3DNOW(x[2],x[6],x[10],x[14]); | |
290 | |
291 /* transform x[3], x[11], x[7], x[15] */ | |
292 TRANS_3DNOW(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]); | |
293 | |
294 } | |
295 | |
296 static void FFT_128P_3DNOW(complex_t *a) | |
297 { | |
298 FFT_8_3DNOW(&a[0]); FFT_4_3DNOW(&a[8]); FFT_4_3DNOW(&a[12]); | |
299 FFT_ASMB16_3DNOW(&a[0], &a[8]); | |
300 | |
301 FFT_8_3DNOW(&a[16]), FFT_8_3DNOW(&a[24]); | |
302 FFT_ASMB_3DNOW(4, &a[0], &a[16],&delta32[0], &delta32_3[0]); | |
303 | |
304 FFT_8_3DNOW(&a[32]); FFT_4_3DNOW(&a[40]); FFT_4_3DNOW(&a[44]); | |
305 FFT_ASMB16_3DNOW(&a[32], &a[40]); | |
306 | |
307 FFT_8_3DNOW(&a[48]); FFT_4_3DNOW(&a[56]); FFT_4_3DNOW(&a[60]); | |
308 FFT_ASMB16_3DNOW(&a[48], &a[56]); | |
309 | |
310 FFT_ASMB_3DNOW(8, &a[0], &a[32],&delta64[0], &delta64_3[0]); | |
311 | |
312 FFT_8_3DNOW(&a[64]); FFT_4_3DNOW(&a[72]); FFT_4_3DNOW(&a[76]); | |
313 /* FFT_16(&a[64]); */ | |
314 FFT_ASMB16_3DNOW(&a[64], &a[72]); | |
315 | |
316 FFT_8_3DNOW(&a[80]); FFT_8_3DNOW(&a[88]); | |
317 | |
318 /* FFT_32(&a[64]); */ | |
319 FFT_ASMB_3DNOW(4, &a[64], &a[80],&delta32[0], &delta32_3[0]); | |
320 | |
321 FFT_8_3DNOW(&a[96]); FFT_4_3DNOW(&a[104]), FFT_4_3DNOW(&a[108]); | |
322 /* FFT_16(&a[96]); */ | |
323 FFT_ASMB16_3DNOW(&a[96], &a[104]); | |
324 | |
325 FFT_8_3DNOW(&a[112]), FFT_8_3DNOW(&a[120]); | |
326 /* FFT_32(&a[96]); */ | |
327 FFT_ASMB_3DNOW(4, &a[96], &a[112], &delta32[0], &delta32_3[0]); | |
328 | |
329 /* FFT_128(&a[0]); */ | |
330 FFT_ASMB_3DNOW(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); | |
331 } | |
332 | |
333 static void | |
334 #ifdef HAVE_3DNOWEX | |
335 imdct_do_512_3dnowex | |
336 #else | |
337 imdct_do_512_3dnow | |
338 #endif | |
339 (sample_t data[],sample_t delay[], sample_t bias) | |
340 { | |
8254
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
8230
diff
changeset
|
341 int i; |
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
8230
diff
changeset
|
342 /* int k; |
4497 | 343 int p,q; |
344 int m; | |
345 int two_m; | |
346 int two_m_plus_one; | |
347 | |
348 sample_t tmp_a_i; | |
349 sample_t tmp_a_r; | |
350 sample_t tmp_b_i; | |
8254
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
8230
diff
changeset
|
351 sample_t tmp_b_r;*/ |
4497 | 352 |
353 sample_t *data_ptr; | |
354 sample_t *delay_ptr; | |
355 sample_t *window_ptr; | |
356 | |
357 /* 512 IMDCT with source and dest data in 'data' */ | |
358 | |
359 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ | |
360 #if 1 | |
361 __asm__ __volatile__ ( | |
362 "movq %0, %%mm7\n\t" | |
363 ::"m"(x_plus_minus_3dnow) | |
364 :"memory"); | |
365 for( i=0; i < 128; i++) { | |
366 int j = pm128[i]; | |
367 __asm__ __volatile__ ( | |
368 "movd %1, %%mm0\n\t" | |
369 "movd %3, %%mm1\n\t" | |
370 "punpckldq %2, %%mm0\n\t" /* mm0 = data[256-2*j-1] | data[2*j]*/ | |
371 "punpckldq %4, %%mm1\n\t" /* mm1 = xcos[j] | xsin[j] */ | |
372 "movq %%mm0, %%mm2\n\t" | |
373 "pfmul %%mm1, %%mm0\n\t" | |
374 #ifdef HAVE_3DNOWEX | |
375 "pswapd %%mm1, %%mm1\n\t" | |
376 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
377 "punpckldq %%mm1, %%mm5\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
378 "punpckhdq %%mm5, %%mm1\n\t" |
4497 | 379 #endif |
380 "pfmul %%mm1, %%mm2\n\t" | |
381 #ifdef HAVE_3DNOWEX | |
382 "pfpnacc %%mm2, %%mm0\n\t" | |
383 #else | |
384 "pxor %%mm7, %%mm0\n\t" | |
385 "pfacc %%mm2, %%mm0\n\t" | |
386 #endif | |
387 "pxor %%mm7, %%mm0\n\t" | |
388 "movq %%mm0, %0" | |
389 :"=m"(buf[i]) | |
390 :"m"(data[256-2*j-1]), "m"(data[2*j]), "m"(xcos1[j]), "m"(xsin1[j]) | |
391 :"memory" | |
392 ); | |
393 /* buf[i].re = (data[256-2*j-1] * xcos1[j] - data[2*j] * xsin1[j]); | |
394 buf[i].im = (data[256-2*j-1] * xsin1[j] + data[2*j] * xcos1[j])*(-1.0);*/ | |
395 } | |
396 #else | |
397 __asm__ __volatile__ ("femms":::"memory"); | |
398 for( i=0; i < 128; i++) { | |
399 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ | |
400 int j= pm128[i]; | |
401 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); | |
402 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); | |
403 } | |
404 #endif | |
405 | |
406 /* FFT Merge */ | |
407 /* unoptimized variant | |
408 for (m=1; m < 7; m++) { | |
409 if(m) | |
410 two_m = (1 << m); | |
411 else | |
412 two_m = 1; | |
413 | |
414 two_m_plus_one = (1 << (m+1)); | |
415 | |
416 for(i = 0; i < 128; i += two_m_plus_one) { | |
417 for(k = 0; k < two_m; k++) { | |
418 p = k + i; | |
419 q = p + two_m; | |
420 tmp_a_r = buf[p].real; | |
421 tmp_a_i = buf[p].imag; | |
422 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
423 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
424 buf[p].real = tmp_a_r + tmp_b_r; | |
425 buf[p].imag = tmp_a_i + tmp_b_i; | |
426 buf[q].real = tmp_a_r - tmp_b_r; | |
427 buf[q].imag = tmp_a_i - tmp_b_i; | |
428 } | |
429 } | |
430 } | |
431 */ | |
432 | |
433 FFT_128P_3DNOW (&buf[0]); | |
434 // asm volatile ("femms \n\t":::"memory"); | |
435 | |
436 /* Post IFFT complex multiply plus IFFT complex conjugate*/ | |
437 #if 1 | |
438 __asm__ __volatile__ ( | |
439 "movq %0, %%mm7\n\t" | |
440 "movq %1, %%mm6\n\t" | |
441 ::"m"(x_plus_minus_3dnow), | |
442 "m"(x_minus_plus_3dnow) | |
443 :"eax","memory"); | |
444 for (i=0; i < 128; i++) { | |
445 __asm__ __volatile__ ( | |
446 "movq %1, %%mm0\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ | |
447 "movq %%mm0, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ | |
448 #ifndef HAVE_3DNOWEX | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
449 "punpckldq %%mm1, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
450 "punpckhdq %%mm2, %%mm1\n\t" |
4497 | 451 #else |
452 "pswapd %%mm1, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ | |
453 #endif | |
454 "movd %3, %%mm3\n\t" /* ac3_xsin[i] */ | |
455 "punpckldq %2, %%mm3\n\t" /* ac3_xsin[i] | ac3_xcos[i] */ | |
456 "pfmul %%mm3, %%mm0\n\t" | |
457 "pfmul %%mm3, %%mm1\n\t" | |
458 #ifndef HAVE_3DNOWEX | |
459 "pxor %%mm7, %%mm0\n\t" | |
460 "pfacc %%mm1, %%mm0\n\t" | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
461 "punpckldq %%mm0, %%mm1\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
462 "punpckhdq %%mm1, %%mm0\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
463 "movq %%mm0, %0\n\t" |
4497 | 464 #else |
465 "pfpnacc %%mm1, %%mm0\n\t" /* mm0 = mm0[0] - mm0[1] | mm1[0] + mm1[1] */ | |
466 "pswapd %%mm0, %%mm0\n\t" | |
467 "movq %%mm0, %0" | |
468 #endif | |
469 :"=m"(buf[i]) | |
470 :"m"(buf[i]),"m"(xcos1[i]),"m"(xsin1[i]) | |
471 :"memory"); | |
472 /* ac3_buf[i].re =(tmp_a_r * ac3_xcos1[i]) + (tmp_a_i * ac3_xsin1[i]); | |
473 ac3_buf[i].im =(tmp_a_r * ac3_xsin1[i]) - (tmp_a_i * ac3_xcos1[i]);*/ | |
474 } | |
475 #else | |
476 __asm__ __volatile__ ("femms":::"memory"); | |
477 for( i=0; i < 128; i++) { | |
478 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ | |
479 tmp_a_r = buf[i].real; | |
480 tmp_a_i = -1.0 * buf[i].imag; | |
481 buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]); | |
482 buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]); | |
483 } | |
484 #endif | |
485 | |
486 data_ptr = data; | |
487 delay_ptr = delay; | |
18720
4bad7f00556e
sync with liba52 0.7.4, patch by Emanuele Giaquinta >emanuele.giaquinta ! gmail * com<
rathann
parents:
8254
diff
changeset
|
488 window_ptr = a52_imdct_window; |
4497 | 489 |
490 /* Window and convert to real valued signal */ | |
491 #if 1 | |
492 asm volatile ( | |
493 "movd (%0), %%mm3 \n\t" | |
494 "punpckldq %%mm3, %%mm3 \n\t" | |
495 :: "r" (&bias) | |
496 ); | |
497 for (i=0; i< 64; i++) { | |
498 /* merge two loops in one to enable working of 2 decoders */ | |
499 __asm__ __volatile__ ( | |
500 "movd 516(%1), %%mm0\n\t" | |
501 "movd (%1), %%mm1\n\t" /**data_ptr++=-buf[64+i].im**window_ptr+++*delay_ptr++;*/ | |
502 "punpckldq (%2), %%mm0\n\t"/*data_ptr[128]=-buf[i].re*window_ptr[128]+delay_ptr[128];*/ | |
503 "punpckldq 516(%2), %%mm1\n\t" | |
504 "pfmul (%3), %%mm0\n\t"/**data_ptr++=buf[64-i-1].re**window_ptr+++*delay_ptr++;*/ | |
505 "pfmul 512(%3), %%mm1\n\t" | |
506 "pxor %%mm6, %%mm0\n\t"/*data_ptr[128]=buf[128-i-1].im*window_ptr[128]+delay_ptr[128];*/ | |
507 "pxor %%mm6, %%mm1\n\t" | |
508 "pfadd (%4), %%mm0\n\t" | |
509 "pfadd 512(%4), %%mm1\n\t" | |
510 "pfadd %%mm3, %%mm0\n\t" | |
511 "pfadd %%mm3, %%mm1\n\t" | |
512 "movq %%mm0, (%0)\n\t" | |
513 "movq %%mm1, 512(%0)" | |
514 :"=r"(data_ptr) | |
515 :"r"(&buf[i].real), "r"(&buf[64-i-1].real), "r"(window_ptr), "r"(delay_ptr), "0"(data_ptr) | |
516 :"memory"); | |
517 data_ptr += 2; | |
518 window_ptr += 2; | |
519 delay_ptr += 2; | |
520 } | |
521 window_ptr += 128; | |
522 #else | |
523 __asm__ __volatile__ ("femms":::"memory"); | |
524 for(i=0; i< 64; i++) { | |
525 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
526 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
527 } | |
528 | |
529 for(i=0; i< 64; i++) { | |
530 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
531 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
532 } | |
533 #endif | |
534 | |
535 /* The trailing edge of the window goes into the delay line */ | |
536 delay_ptr = delay; | |
537 #if 1 | |
538 for(i=0; i< 64; i++) { | |
539 /* merge two loops in one to enable working of 2 decoders */ | |
540 window_ptr -=2; | |
541 __asm__ __volatile__( | |
542 "movd 508(%1), %%mm0\n\t" | |
543 "movd (%1), %%mm1\n\t" | |
544 "punpckldq (%2), %%mm0\n\t" | |
545 "punpckldq 508(%2), %%mm1\n\t" | |
546 #ifdef HAVE_3DNOWEX | |
547 "pswapd (%3), %%mm3\n\t" | |
548 "pswapd -512(%3), %%mm4\n\t" | |
549 #else | |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
550 "movq (%3), %%mm3\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
551 "punpckldq %%mm3, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
552 "punpckhdq %%mm2, %%mm3\n\t" |
4497 | 553 "movq -512(%3), %%mm4\n\t" |
8230
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
554 "punpckldq %%mm4, %%mm2\n\t" |
330086b89d8f
minor optimization & gcc-CVS fix/workaround patch by (Glen Nakamura <glen at imodulo dot com>)
michael
parents:
4497
diff
changeset
|
555 "punpckhdq %%mm2, %%mm4\n\t" |
4497 | 556 #endif |
557 "pfmul %%mm3, %%mm0\n\t" | |
558 "pfmul %%mm4, %%mm1\n\t" | |
559 "pxor %%mm6, %%mm0\n\t" | |
560 "pxor %%mm7, %%mm1\n\t" | |
561 "movq %%mm0, (%0)\n\t" | |
562 "movq %%mm1, 512(%0)" | |
563 :"=r"(delay_ptr) | |
564 :"r"(&buf[i].imag), "r"(&buf[64-i-1].imag), "r"(window_ptr), "0"(delay_ptr) | |
565 :"memory"); | |
566 delay_ptr += 2; | |
567 } | |
568 __asm__ __volatile__ ("femms":::"memory"); | |
569 #else | |
570 __asm__ __volatile__ ("femms":::"memory"); | |
571 for(i=0; i< 64; i++) { | |
572 *delay_ptr++ = -buf[64+i].real * *--window_ptr; | |
573 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; | |
574 } | |
575 | |
576 for(i=0; i<64; i++) { | |
577 *delay_ptr++ = buf[i].imag * *--window_ptr; | |
578 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; | |
579 } | |
580 #endif | |
581 } |