Mercurial > libavcodec.hg
annotate liba52/resample_mmx.c @ 4504:6287a2ff4d08 libavcodec
merge asm fragments in H264_CHROMA_MC2_TMPL()
10% faster avg_h264_chroma_mc2_mmx2()
5% faster put_h264_chroma_mc2_mmx2()
author | michael |
---|---|
date | Fri, 09 Feb 2007 12:24:22 +0000 |
parents | 1d69d79f7cc3 |
children |
rev | line source |
---|---|
3673
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
1 /* |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
2 * resample_mmx.c |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
3 * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at> |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
4 * |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
5 * This file is part of a52dec, a free ATSC A-52 stream decoder. |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
6 * See http://liba52.sourceforge.net/ for updates. |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
7 * |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
8 * a52dec is free software; you can redistribute it and/or modify |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
9 * it under the terms of the GNU General Public License as published by |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
10 * the Free Software Foundation; either version 2 of the License, or |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
11 * (at your option) any later version. |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
12 * |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
13 * a52dec is distributed in the hope that it will be useful, |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
16 * GNU General Public License for more details. |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
17 * |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
18 * You should have received a copy of the GNU General Public License |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
19 * along with this program; if not, write to the Free Software |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
1d69d79f7cc3
Exchange informal GPL notice by official license header.
diego
parents:
2967
diff
changeset
|
21 */ |
1193 | 22 |
2967 | 23 /* optimization TODO / NOTES |
24 movntq is slightly faster (0.5% with the current test.c benchmark) | |
1193 | 25 (but that's just test.c, so it needs to be tested in reality)
2967 | 26 and it would mean (C / MMX2 / MMX / 3DNOW) versions |
1193 | 27 */ |
28 | |
/*
 * 64-bit constants that the inline asm in this file loads into MMX registers
 * (they are referenced by symbol name through MANGLE()).
 *
 * magicF2W : 0x43c00000 in each 32-bit lane, i.e. the IEEE-754 bit pattern of
 *            384.0f. It is subtracted (psubd) from the raw sample bits before
 *            the result is packed to signed 16 bit.
 * wm1010   : AND mask keeping the 16-bit words 1 and 3, clearing words 0 and 2.
 * wm0101   : AND mask keeping the 16-bit words 0 and 2, clearing words 1 and 3.
 * wm1100   : AND mask keeping the upper 32 bits, clearing the lower 32 bits.
 */
2352 | 29 static uint64_t __attribute__((aligned(8))) attribute_used magicF2W= 0x43c0000043c00000LL; |
30 static uint64_t __attribute__((aligned(8))) attribute_used wm1010= 0xFFFF0000FFFF0000LL; | |
31 static uint64_t __attribute__((aligned(8))) attribute_used wm0101= 0x0000FFFF0000FFFFLL; | |
32 static uint64_t __attribute__((aligned(8))) attribute_used wm1100= 0xFFFFFFFF00000000LL; | |
1193 | 33 |
/*
 * Converts one plane of 256 samples into 256 five-slot int16_t frames.
 *
 * _f  : input samples. They are read as raw 32-bit words through the int32_t
 *       alias f (f[0..255]); no floating-point instruction is used.
 *       NOTE(review): the conversion is "raw bits minus magicF2W, then
 *       saturate to 16 bit", so the caller presumably delivers samples with a
 *       matching bias already applied -- confirm against the caller.
 * s16 : output buffer; 5*256 int16_t values (2560 bytes) are written.
 *
 * Every output frame is four zero slots followed by the converted sample.
 * Returns 5*256.
 *
 * NOTE(review): %esi/%edi are hard-coded as 32-bit index registers and the
 * MMX registers overwritten here are not listed as clobbers -- looks like
 * this is meant for 32-bit x86 builds only; confirm.
 */
34 static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){ | |
35 int32_t * f = (int32_t *) _f; | |
36 asm volatile( | |
    // esi runs from -512 up to 0 in steps of 8; it is scaled by 2 for the
    // input address, so each pass consumes 16 bytes (4 samples).
37 "movl $-512, %%esi \n\t" | |
38 "movq "MANGLE(magicF2W)", %%mm7 \n\t" | |
39 "movq "MANGLE(wm1100)", %%mm3 \n\t" | |
40 "movq "MANGLE(wm0101)", %%mm4 \n\t" | |
41 "movq "MANGLE(wm1010)", %%mm5 \n\t" | |
42 "pxor %%mm6, %%mm6 \n\t" | |
43 "1: \n\t" | |
44 "movq (%1, %%esi, 2), %%mm0 \n\t" | |
45 "movq 8(%1, %%esi, 2), %%mm1 \n\t" | |
    // edi = 5*esi: output byte offset (40 bytes are stored per pass)
46 "leal (%%esi, %%esi, 4), %%edi \n\t" | |
47 "psubd %%mm7, %%mm0 \n\t" | |
48 "psubd %%mm7, %%mm1 \n\t" | |
    // mm0 = four saturated 16-bit samples A B C D (listed in memory order)
49 "packssdw %%mm1, %%mm0 \n\t" | |
50 "movq %%mm0, %%mm1 \n\t" | |
    // mm0 = A 0 C 0, mm1 = 0 B 0 D
51 "pand %%mm4, %%mm0 \n\t" | |
52 "pand %%mm5, %%mm1 \n\t" | |
    // The lane notes on the stores below list 16-bit words in memory order.
53 "movq %%mm6, (%0, %%edi) \n\t" // 0 0 0 0 | |
54 "movd %%mm0, 8(%0, %%edi) \n\t" // A 0 | |
55 "pand %%mm3, %%mm0 \n\t" | |
56 "movd %%mm6, 12(%0, %%edi) \n\t" // 0 0 | |
57 "movd %%mm1, 16(%0, %%edi) \n\t" // 0 B | |
58 "pand %%mm3, %%mm1 \n\t" | |
59 "movd %%mm6, 20(%0, %%edi) \n\t" // 0 0 | |
60 "movq %%mm0, 24(%0, %%edi) \n\t" // 0 0 C 0 | |
61 "movq %%mm1, 32(%0, %%edi) \n\t" // 0 0 0 D | |
62 "addl $8, %%esi \n\t" | |
63 " jnz 1b \n\t" | |
64 "emms \n\t" | |
    // %0 = one past the end of the output, %1 = one past the end of the input
    // plane; the negative indices above walk both from their start.
65 :: "r" (s16+1280), "r" (f+256) | |
66 :"%esi", "%edi", "memory" | |
67 ); | |
68 return 5*256; | |
69 } | |
70 | |
/*
 * Interleaves two planes of 256 samples into 256 two-slot int16_t frames.
 *
 * _f  : input; plane 0 is f[0..255], plane 1 is f[256..511] (1024 bytes
 *       further on). Samples are read as raw 32-bit words through the int32_t
 *       alias f and converted by subtracting magicF2W and saturating to
 *       16 bit. NOTE(review): this presumably relies on the caller having
 *       applied a matching bias to the samples -- confirm.
 * s16 : output buffer; 2*256 int16_t values are written as
 *       { plane0[i], plane1[i] } pairs.
 *
 * Returns 2*256.
 *
 * NOTE(review): %esi is hard-coded as a 32-bit index register and the MMX
 * registers are not listed as clobbers -- looks 32-bit x86 only; confirm.
 */
71 static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){ | |
72 int32_t * f = (int32_t *) _f; | |
73 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it | |
74 #ifdef HAVE_SSE | |
75 asm volatile( | |
76 "movl $-1024, %%esi \n\t" | |
77 "1: \n\t" | |
78 "cvtps2pi (%1, %%esi), %%mm0 \n\t" | |
79 "cvtps2pi 1024(%1, %%esi), %%mm2\n\t" | |
80 "movq %%mm0, %%mm1 \n\t" | |
81 "punpcklwd %%mm2, %%mm0 \n\t" | |
82 "punpckhwd %%mm2, %%mm1 \n\t" | |
83 "movq %%mm0, (%0, %%esi) \n\t" | |
84 "movq %%mm1, 8(%0, %%esi) \n\t" | |
85 "addl $16, %%esi \n\t" | |
86 " jnz 1b \n\t" | |
87 "emms \n\t" | |
88 :: "r" (s16+512), "r" (f+256) | |
89 :"%esi", "memory" | |
90 );*/ | |
91 asm volatile( | |
    // esi is a byte offset running from -1024 up to 0 in steps of 16
    // (4 samples per plane per pass); it indexes input and output alike.
92 "movl $-1024, %%esi \n\t" | |
93 "movq "MANGLE(magicF2W)", %%mm7 \n\t" | |
94 "1: \n\t" | |
95 "movq (%1, %%esi), %%mm0 \n\t" | |
96 "movq 8(%1, %%esi), %%mm1 \n\t" | |
97 "movq 1024(%1, %%esi), %%mm2 \n\t" | |
98 "movq 1032(%1, %%esi), %%mm3 \n\t" | |
99 "psubd %%mm7, %%mm0 \n\t" | |
100 "psubd %%mm7, %%mm1 \n\t" | |
101 "psubd %%mm7, %%mm2 \n\t" | |
102 "psubd %%mm7, %%mm3 \n\t" | |
    // mm0 = four 16-bit samples of plane 0, mm2 = four of plane 1
103 "packssdw %%mm1, %%mm0 \n\t" | |
104 "packssdw %%mm3, %%mm2 \n\t" | |
105 "movq %%mm0, %%mm1 \n\t" | |
    // word-interleave the two planes: mm0 = frames 0-1, mm1 = frames 2-3
106 "punpcklwd %%mm2, %%mm0 \n\t" | |
107 "punpckhwd %%mm2, %%mm1 \n\t" | |
108 "movq %%mm0, (%0, %%esi) \n\t" | |
109 "movq %%mm1, 8(%0, %%esi) \n\t" | |
110 "addl $16, %%esi \n\t" | |
111 " jnz 1b \n\t" | |
112 "emms \n\t" | |
    // %0 = one past the end of the output, %1 = one past the end of plane 0
113 :: "r" (s16+512), "r" (f+256) | |
114 :"%esi", "memory" | |
115 ); | |
116 return 2*256; | |
117 } | |
118 | |
/*
 * Converts three planes of 256 samples into 256 five-slot int16_t frames.
 *
 * _f  : input; planes 0, 1 and 2 start at f[0], f[256] and f[512] (byte
 *       offsets 0, 1024 and 2048). Samples are read as raw 32-bit words
 *       through the int32_t alias f and converted by subtracting magicF2W
 *       and saturating to 16 bit. NOTE(review): presumably the caller has
 *       applied a matching bias to the samples -- confirm.
 * s16 : output buffer; 5*256 int16_t values are written. Each frame is
 *       { plane0[i], plane2[i], 0, 0, plane1[i] }.
 *
 * Returns 5*256.
 *
 * NOTE(review): %esi/%edi are hard-coded as 32-bit index registers and the
 * MMX registers are not listed as clobbers -- looks 32-bit x86 only; confirm.
 */
119 static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){ | |
120 int32_t * f = (int32_t *) _f; | |
121 asm volatile( | |
    // esi: input byte offset, -1024 up to 0 in steps of 16 (4 samples/pass)
122 "movl $-1024, %%esi \n\t" | |
123 "movq "MANGLE(magicF2W)", %%mm7 \n\t" | |
124 "pxor %%mm6, %%mm6 \n\t" | |
    // mm5 = magic value in the low lane, 0 in the high lane; used where the
    // high lane of the operand is a zero slot that must stay zero.
125 "movq %%mm7, %%mm5 \n\t" | |
126 "punpckldq %%mm6, %%mm5 \n\t" | |
127 "1: \n\t" | |
    // Gather the samples of frames i, i+1 and the start of i+2 in output order.
128 "movd (%1, %%esi), %%mm0 \n\t" | |
129 "punpckldq 2048(%1, %%esi), %%mm0\n\t" | |
130 "movd 1024(%1, %%esi), %%mm1 \n\t" | |
131 "punpckldq 4(%1, %%esi), %%mm1 \n\t" | |
132 "movd 2052(%1, %%esi), %%mm2 \n\t" | |
    // mm3 low lane is preloaded with the magic value so that the psubd below
    // turns it into a zero slot.
133 "movq %%mm7, %%mm3 \n\t" | |
134 "punpckldq 1028(%1, %%esi), %%mm3\n\t" | |
135 "movd 8(%1, %%esi), %%mm4 \n\t" | |
136 "punpckldq 2056(%1, %%esi), %%mm4\n\t" | |
    // edi = esi*5/2: output byte offset (10 bytes out per 4 bytes in)
137 "leal (%%esi, %%esi, 4), %%edi \n\t" | |
138 "sarl $1, %%edi \n\t" | |
139 "psubd %%mm7, %%mm0 \n\t" | |
140 "psubd %%mm7, %%mm1 \n\t" | |
141 "psubd %%mm5, %%mm2 \n\t" | |
142 "psubd %%mm7, %%mm3 \n\t" | |
143 "psubd %%mm7, %%mm4 \n\t" | |
144 "packssdw %%mm6, %%mm0 \n\t" | |
145 "packssdw %%mm2, %%mm1 \n\t" | |
146 "packssdw %%mm4, %%mm3 \n\t" | |
147 "movq %%mm0, (%0, %%edi) \n\t" | |
148 "movq %%mm1, 8(%0, %%edi) \n\t" | |
149 "movq %%mm3, 16(%0, %%edi) \n\t" | |
2967 | 150 |
    // Second half of the pass: remainder of frame i+2 and all of frame i+3.
1193 | 151 "movd 1032(%1, %%esi), %%mm1 \n\t" |
152 "punpckldq 12(%1, %%esi), %%mm1\n\t" | |
153 "movd 2060(%1, %%esi), %%mm2 \n\t" | |
154 "movq %%mm7, %%mm3 \n\t" | |
155 "punpckldq 1036(%1, %%esi), %%mm3\n\t" | |
156 "pxor %%mm0, %%mm0 \n\t" | |
157 "psubd %%mm7, %%mm1 \n\t" | |
158 "psubd %%mm5, %%mm2 \n\t" | |
159 "psubd %%mm7, %%mm3 \n\t" | |
160 "packssdw %%mm1, %%mm0 \n\t" | |
161 "packssdw %%mm3, %%mm2 \n\t" | |
162 "movq %%mm0, 24(%0, %%edi) \n\t" | |
163 "movq %%mm2, 32(%0, %%edi) \n\t" | |
2967 | 164 |
1193 | 165 "addl $16, %%esi \n\t" |
166 " jnz 1b \n\t" | |
167 "emms \n\t" | |
    // %0 = one past the end of the output, %1 = one past the end of plane 0
168 :: "r" (s16+1280), "r" (f+256) | |
169 :"%esi", "%edi", "memory" | |
170 ); | |
171 return 5*256; | |
172 } | |
173 | |
/*
 * Interleaves four planes of 256 samples into 256 four-slot int16_t frames.
 *
 * _f  : input; planes 0..3 start at byte offsets 0, 1024, 2048 and 3072.
 *       Samples are read as raw 32-bit words through the int32_t alias f and
 *       converted by subtracting magicF2W and saturating to 16 bit.
 *       NOTE(review): presumably the caller has applied a matching bias to
 *       the samples -- confirm.
 * s16 : output buffer; 4*256 int16_t values are written. Each frame is
 *       { plane0[i], plane1[i], plane2[i], plane3[i] }.
 *
 * Returns 4*256.
 *
 * NOTE(review): %esi is hard-coded as a 32-bit index register and the MMX
 * registers are not listed as clobbers -- looks 32-bit x86 only; confirm.
 */
174 static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){ | |
175 int32_t * f = (int32_t *) _f; | |
176 asm volatile( | |
    // esi: input byte offset, -1024 up to 0 in steps of 16 (4 samples per
    // plane per pass); scaled by 2 for the output (32 bytes stored per pass).
177 "movl $-1024, %%esi \n\t" | |
178 "movq "MANGLE(magicF2W)", %%mm7 \n\t" | |
179 "1: \n\t" | |
180 "movq (%1, %%esi), %%mm0 \n\t" | |
181 "movq 8(%1, %%esi), %%mm1 \n\t" | |
182 "movq 1024(%1, %%esi), %%mm2 \n\t" | |
183 "movq 1032(%1, %%esi), %%mm3 \n\t" | |
184 "psubd %%mm7, %%mm0 \n\t" | |
185 "psubd %%mm7, %%mm1 \n\t" | |
186 "psubd %%mm7, %%mm2 \n\t" | |
187 "psubd %%mm7, %%mm3 \n\t" | |
    // mm0 = four 16-bit samples of plane 0, mm2 = four of plane 1
188 "packssdw %%mm1, %%mm0 \n\t" | |
189 "packssdw %%mm3, %%mm2 \n\t" | |
190 "movq 2048(%1, %%esi), %%mm3 \n\t" | |
191 "movq 2056(%1, %%esi), %%mm4 \n\t" | |
192 "movq 3072(%1, %%esi), %%mm5 \n\t" | |
193 "movq 3080(%1, %%esi), %%mm6 \n\t" | |
194 "psubd %%mm7, %%mm3 \n\t" | |
195 "psubd %%mm7, %%mm4 \n\t" | |
196 "psubd %%mm7, %%mm5 \n\t" | |
197 "psubd %%mm7, %%mm6 \n\t" | |
    // mm3 = four 16-bit samples of plane 2, mm5 = four of plane 3
198 "packssdw %%mm4, %%mm3 \n\t" | |
199 "packssdw %%mm6, %%mm5 \n\t" | |
200 "movq %%mm0, %%mm1 \n\t" | |
201 "movq %%mm3, %%mm4 \n\t" | |
    // word-interleave planes 0/1 and planes 2/3 into pairs ...
202 "punpcklwd %%mm2, %%mm0 \n\t" | |
203 "punpckhwd %%mm2, %%mm1 \n\t" | |
204 "punpcklwd %%mm5, %%mm3 \n\t" | |
205 "punpckhwd %%mm5, %%mm4 \n\t" | |
206 "movq %%mm0, %%mm2 \n\t" | |
207 "movq %%mm1, %%mm5 \n\t" | |
    // ... then dword-interleave the pairs into complete 4-slot frames:
    // mm0 = frame 0, mm2 = frame 1, mm1 = frame 2, mm5 = frame 3
208 "punpckldq %%mm3, %%mm0 \n\t" | |
209 "punpckhdq %%mm3, %%mm2 \n\t" | |
210 "punpckldq %%mm4, %%mm1 \n\t" | |
211 "punpckhdq %%mm4, %%mm5 \n\t" | |
212 "movq %%mm0, (%0, %%esi,2) \n\t" | |
213 "movq %%mm2, 8(%0, %%esi,2) \n\t" | |
214 "movq %%mm1, 16(%0, %%esi,2) \n\t" | |
215 "movq %%mm5, 24(%0, %%esi,2) \n\t" | |
216 "addl $16, %%esi \n\t" | |
217 " jnz 1b \n\t" | |
218 "emms \n\t" | |
    // %0 = one past the end of the output, %1 = one past the end of plane 0
219 :: "r" (s16+1024), "r" (f+256) | |
220 :"%esi", "memory" | |
221 ); | |
222 return 4*256; | |
223 } | |
224 | |
/*
 * Interleaves five planes of 256 samples into 256 five-slot int16_t frames.
 *
 * _f  : input; planes 0..4 start at byte offsets 0, 1024, 2048, 3072 and
 *       4096. Samples are read as raw 32-bit words through the int32_t alias
 *       f and converted by subtracting magicF2W and saturating to 16 bit.
 *       NOTE(review): presumably the caller has applied a matching bias to
 *       the samples -- confirm.
 * s16 : output buffer; 5*256 int16_t values are written. Each frame is
 *       { plane0[i], plane2[i], plane3[i], plane4[i], plane1[i] }.
 *
 * Returns 5*256.
 *
 * NOTE(review): %esi/%edi are hard-coded as 32-bit index registers and the
 * MMX registers are not listed as clobbers -- looks 32-bit x86 only; confirm.
 */
225 static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){ | |
226 int32_t * f = (int32_t *) _f; | |
227 asm volatile( | |
    // esi: input byte offset, -1024 up to 0 in steps of 16 (4 samples/pass)
228 "movl $-1024, %%esi \n\t" | |
229 "movq "MANGLE(magicF2W)", %%mm7 \n\t" | |
230 "1: \n\t" | |
    // Gather 12 samples in output order: frames i, i+1 and the first two
    // slots of frame i+2.
231 "movd (%1, %%esi), %%mm0 \n\t" | |
232 "punpckldq 2048(%1, %%esi), %%mm0\n\t" | |
233 "movd 3072(%1, %%esi), %%mm1 \n\t" | |
234 "punpckldq 4096(%1, %%esi), %%mm1\n\t" | |
235 "movd 1024(%1, %%esi), %%mm2 \n\t" | |
236 "punpckldq 4(%1, %%esi), %%mm2 \n\t" | |
237 "movd 2052(%1, %%esi), %%mm3 \n\t" | |
238 "punpckldq 3076(%1, %%esi), %%mm3\n\t" | |
239 "movd 4100(%1, %%esi), %%mm4 \n\t" | |
240 "punpckldq 1028(%1, %%esi), %%mm4\n\t" | |
241 "movd 8(%1, %%esi), %%mm5 \n\t" | |
242 "punpckldq 2056(%1, %%esi), %%mm5\n\t" | |
    // edi = esi*5/2: output byte offset (10 bytes out per 4 bytes in)
243 "leal (%%esi, %%esi, 4), %%edi \n\t" | |
244 "sarl $1, %%edi \n\t" | |
245 "psubd %%mm7, %%mm0 \n\t" | |
246 "psubd %%mm7, %%mm1 \n\t" | |
247 "psubd %%mm7, %%mm2 \n\t" | |
248 "psubd %%mm7, %%mm3 \n\t" | |
249 "psubd %%mm7, %%mm4 \n\t" | |
250 "psubd %%mm7, %%mm5 \n\t" | |
251 "packssdw %%mm1, %%mm0 \n\t" | |
252 "packssdw %%mm3, %%mm2 \n\t" | |
253 "packssdw %%mm5, %%mm4 \n\t" | |
254 "movq %%mm0, (%0, %%edi) \n\t" | |
255 "movq %%mm2, 8(%0, %%edi) \n\t" | |
256 "movq %%mm4, 16(%0, %%edi) \n\t" | |
2967 | 257 |
    // Second half of the pass: remainder of frame i+2 and all of frame i+3.
1193 | 258 "movd 3080(%1, %%esi), %%mm0 \n\t" |
259 "punpckldq 4104(%1, %%esi), %%mm0\n\t" | |
260 "movd 1032(%1, %%esi), %%mm1 \n\t" | |
261 "punpckldq 12(%1, %%esi), %%mm1\n\t" | |
262 "movd 2060(%1, %%esi), %%mm2 \n\t" | |
263 "punpckldq 3084(%1, %%esi), %%mm2\n\t" | |
264 "movd 4108(%1, %%esi), %%mm3 \n\t" | |
265 "punpckldq 1036(%1, %%esi), %%mm3\n\t" | |
266 "psubd %%mm7, %%mm0 \n\t" | |
267 "psubd %%mm7, %%mm1 \n\t" | |
268 "psubd %%mm7, %%mm2 \n\t" | |
269 "psubd %%mm7, %%mm3 \n\t" | |
270 "packssdw %%mm1, %%mm0 \n\t" | |
271 "packssdw %%mm3, %%mm2 \n\t" | |
272 "movq %%mm0, 24(%0, %%edi) \n\t" | |
273 "movq %%mm2, 32(%0, %%edi) \n\t" | |
2967 | 274 |
1193 | 275 "addl $16, %%esi \n\t" |
276 " jnz 1b \n\t" | |
277 "emms \n\t" | |
    // %0 = one past the end of the output, %1 = one past the end of plane 0
278 :: "r" (s16+1280), "r" (f+256) | |
279 :"%esi", "%edi", "memory" | |
280 ); | |
281 return 5*256; | |
282 } | |
283 | |
/*
 * Converts two planes of 256 samples into 256 six-slot int16_t frames.
 *
 * _f  : input; plane 0 starts at byte offset 0 and plane 1 at byte offset
 *       1024. Samples are read as raw 32-bit words through the int32_t alias
 *       f and converted by subtracting magicF2W and saturating to 16 bit.
 *       NOTE(review): presumably the caller has applied a matching bias to
 *       the samples -- confirm.
 * s16 : output buffer; 6*256 int16_t values are written. Each frame is
 *       { 0, 0, 0, 0, plane1[i], plane0[i] }.
 *
 * Returns 6*256.
 *
 * NOTE(review): %esi/%edi are hard-coded as 32-bit index registers and the
 * MMX registers are not listed as clobbers -- looks 32-bit x86 only; confirm.
 */
284 static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){ | |
285 int32_t * f = (int32_t *) _f; | |
286 asm volatile( | |
    // esi: input byte offset, -1024 up to 0 in steps of 16 (4 samples/pass)
287 "movl $-1024, %%esi \n\t" | |
288 "movq "MANGLE(magicF2W)", %%mm7 \n\t" | |
289 "pxor %%mm6, %%mm6 \n\t" | |
290 "1: \n\t" | |
291 "movq 1024(%1, %%esi), %%mm0 \n\t" | |
292 "movq 1032(%1, %%esi), %%mm1 \n\t" | |
293 "movq (%1, %%esi), %%mm2 \n\t" | |
294 "movq 8(%1, %%esi), %%mm3 \n\t" | |
295 "psubd %%mm7, %%mm0 \n\t" | |
296 "psubd %%mm7, %%mm1 \n\t" | |
297 "psubd %%mm7, %%mm2 \n\t" | |
298 "psubd %%mm7, %%mm3 \n\t" | |
    // mm0 = four 16-bit samples of plane 1, mm2 = four of plane 0
299 "packssdw %%mm1, %%mm0 \n\t" | |
300 "packssdw %%mm3, %%mm2 \n\t" | |
301 "movq %%mm0, %%mm1 \n\t" | |
    // mm0 = { plane1, plane0 } pairs for frames 0-1, mm1 = pairs for frames 2-3
302 "punpcklwd %%mm2, %%mm0 \n\t" | |
303 "punpckhwd %%mm2, %%mm1 \n\t" | |
    // edi = 3*esi: output byte offset (12 bytes out per 4 bytes in)
304 "leal (%%esi, %%esi, 2), %%edi \n\t" | |
    // Each frame is stored as 8 zero bytes followed by one 4-byte pair.
305 "movq %%mm6, (%0, %%edi) \n\t" | |
306 "movd %%mm0, 8(%0, %%edi) \n\t" | |
    // bring the second pair down into the low half for the next movd
307 "punpckhdq %%mm0, %%mm0 \n\t" | |
308 "movq %%mm6, 12(%0, %%edi) \n\t" | |
309 "movd %%mm0, 20(%0, %%edi) \n\t" | |
310 "movq %%mm6, 24(%0, %%edi) \n\t" | |
311 "movd %%mm1, 32(%0, %%edi) \n\t" | |
312 "punpckhdq %%mm1, %%mm1 \n\t" | |
313 "movq %%mm6, 36(%0, %%edi) \n\t" | |
314 "movd %%mm1, 44(%0, %%edi) \n\t" | |
315 "addl $16, %%esi \n\t" | |
316 " jnz 1b \n\t" | |
317 "emms \n\t" | |
    // %0 = one past the end of the output, %1 = one past the end of plane 0
318 :: "r" (s16+1536), "r" (f+256) | |
319 :"%esi", "%edi", "memory" | |
320 ); | |
321 return 6*256; | |
322 } | |
323 | |
/*
 * Converts three planes of 256 samples into 256 six-slot int16_t frames.
 *
 * _f  : input; planes 0, 1 and 2 start at byte offsets 0, 1024 and 2048.
 *       Samples are read as raw 32-bit words through the int32_t alias f and
 *       converted by subtracting magicF2W and saturating to 16 bit.
 *       NOTE(review): presumably the caller has applied a matching bias to
 *       the samples -- confirm.
 * s16 : output buffer; 6*256 int16_t values are written. Each frame is
 *       { plane1[i], plane2[i], 0, 0, 0, plane0[i] }.
 *
 * Returns 6*256.
 *
 * The lane notes after the instructions list 16-bit words from the most
 * significant to the least significant; a lower-case letter is the first
 * sample of the pass, upper case the second (a/A = plane 1, b/B = plane 2,
 * f/F = plane 0).
 *
 * NOTE(review): %esi/%edi are hard-coded as 32-bit index registers and the
 * MMX registers are not listed as clobbers -- looks 32-bit x86 only; confirm.
 */
324 static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){ | |
325 int32_t * f = (int32_t *) _f; | |
326 asm volatile( | |
    // esi: input byte offset, -1024 up to 0 in steps of 8 (2 samples/pass)
327 "movl $-1024, %%esi \n\t" | |
328 "movq "MANGLE(magicF2W)", %%mm7 \n\t" | |
329 "pxor %%mm6, %%mm6 \n\t" | |
330 "1: \n\t" | |
331 "movq 1024(%1, %%esi), %%mm0 \n\t" | |
332 "movq 2048(%1, %%esi), %%mm1 \n\t" | |
2967 | 333 "movq (%1, %%esi), %%mm5 \n\t" |
1193 | 334 "psubd %%mm7, %%mm0 \n\t" |
335 "psubd %%mm7, %%mm1 \n\t" | |
336 "psubd %%mm7, %%mm5 \n\t" | |
    // edi = 3*esi: output byte offset (12 bytes out per 4 bytes in)
337 "leal (%%esi, %%esi, 2), %%edi \n\t" | |
2967 | 338 |
1193 | 339 "pxor %%mm4, %%mm4 \n\t" |
340 "packssdw %%mm5, %%mm0 \n\t" // FfAa | |
341 "packssdw %%mm4, %%mm1 \n\t" // 00Bb | |
342 "punpckhwd %%mm0, %%mm4 \n\t" // F0f0 | |
343 "punpcklwd %%mm1, %%mm0 \n\t" // BAba | |
344 "movq %%mm0, %%mm1 \n\t" // BAba | |
    // mm3 has not been written in this asm block before this point; its stale
    // low half (XX) is discarded by the punpckhdq two instructions below.
345 "punpckldq %%mm4, %%mm3 \n\t" // f0XX | |
346 "punpckldq %%mm6, %%mm0 \n\t" // 00ba | |
347 "punpckhdq %%mm1, %%mm3 \n\t" // BAf0 | |
2967 | 348 |
1193 | 349 "movq %%mm0, (%0, %%edi) \n\t" // 00ba |
350 "punpckhdq %%mm4, %%mm0 \n\t" // F000 | |
351 "movq %%mm3, 8(%0, %%edi) \n\t" // BAf0 | |
352 "movq %%mm0, 16(%0, %%edi) \n\t" // F000 | |
353 "addl $8, %%esi \n\t" | |
354 " jnz 1b \n\t" | |
355 "emms \n\t" | |
    // %0 = one past the end of the output, %1 = one past the end of plane 0
356 :: "r" (s16+1536), "r" (f+256) | |
357 :"%esi", "%edi", "memory" | |
358 ); | |
359 return 6*256; | |
360 } | |
361 | |
/*
 * Converts four planes of 256 samples into 256 six-slot int16_t frames.
 *
 * _f  : input; planes 0..3 start at byte offsets 0, 1024, 2048 and 3072.
 *       Samples are read as raw 32-bit words through the int32_t alias f and
 *       converted by subtracting magicF2W and saturating to 16 bit.
 *       NOTE(review): presumably the caller has applied a matching bias to
 *       the samples -- confirm.
 * s16 : output buffer; 6*256 int16_t values are written. Each frame is
 *       { plane1[i], plane3[i], 0, 0, plane2[i], plane0[i] }.
 *
 * Returns 6*256.
 *
 * The lane notes after the instructions list 16-bit words from the most
 * significant to the least significant; a lower-case letter is the first
 * sample of the pass, upper case the second (a/A = plane 1, b/B = plane 3,
 * e/E = plane 2, f/F = plane 0).
 *
 * NOTE(review): %esi/%edi are hard-coded as 32-bit index registers and the
 * MMX registers are not listed as clobbers -- looks 32-bit x86 only; confirm.
 */
362 static int a52_resample_3F_LFE_to_6_MMX(float * _f, int16_t * s16){ | |
363 int32_t * f = (int32_t *) _f; | |
364 asm volatile( | |
    // esi: input byte offset, -1024 up to 0 in steps of 8 (2 samples/pass)
365 "movl $-1024, %%esi \n\t" | |
366 "movq "MANGLE(magicF2W)", %%mm7 \n\t" | |
367 "pxor %%mm6, %%mm6 \n\t" | |
368 "1: \n\t" | |
369 "movq 1024(%1, %%esi), %%mm0 \n\t" | |
370 "movq 3072(%1, %%esi), %%mm1 \n\t" | |
371 "movq 2048(%1, %%esi), %%mm4 \n\t" | |
2967 | 372 "movq (%1, %%esi), %%mm5 \n\t" |
1193 | 373 "psubd %%mm7, %%mm0 \n\t" |
374 "psubd %%mm7, %%mm1 \n\t" | |
375 "psubd %%mm7, %%mm4 \n\t" | |
376 "psubd %%mm7, %%mm5 \n\t" | |
    // edi = 3*esi: output byte offset (12 bytes out per 4 bytes in)
377 "leal (%%esi, %%esi, 2), %%edi \n\t" | |
2967 | 378 |
1193 | 379 "packssdw %%mm4, %%mm0 \n\t" // EeAa |
380 "packssdw %%mm5, %%mm1 \n\t" // FfBb | |
381 "movq %%mm0, %%mm2 \n\t" // EeAa | |
382 "punpcklwd %%mm1, %%mm0 \n\t" // BAba | |
383 "punpckhwd %%mm1, %%mm2 \n\t" // FEfe | |
384 "movq %%mm0, %%mm1 \n\t" // BAba | |
385 "punpckldq %%mm6, %%mm0 \n\t" // 00ba | |
386 "punpckhdq %%mm1, %%mm1 \n\t" // BABA | |
2967 | 387 |
    // The three stores below write 00ba, BAfe and FE00 (two complete frames).
1193 | 388 "movq %%mm0, (%0, %%edi) \n\t" |
389 "punpckhdq %%mm2, %%mm0 \n\t" // FE00 | |
390 "punpckldq %%mm1, %%mm2 \n\t" // BAfe | |
391 "movq %%mm2, 8(%0, %%edi) \n\t" | |
392 "movq %%mm0, 16(%0, %%edi) \n\t" | |
393 "addl $8, %%esi \n\t" | |
394 " jnz 1b \n\t" | |
395 "emms \n\t" | |
    // %0 = one past the end of the output, %1 = one past the end of plane 0
396 :: "r" (s16+1536), "r" (f+256) | |
397 :"%esi", "%edi", "memory" | |
398 ); | |
399 return 6*256; | |
400 } | |
401 | |
/*
 * Converts five planes of 256 samples into 256 six-slot int16_t frames.
 *
 * _f  : input; planes 0..4 start at byte offsets 0, 1024, 2048, 3072 and
 *       4096. Samples are read as raw 32-bit words through the int32_t alias
 *       f and converted by subtracting magicF2W and saturating to 16 bit.
 *       NOTE(review): presumably the caller has applied a matching bias to
 *       the samples -- confirm.
 * s16 : output buffer; 6*256 int16_t values are written. Each frame is
 *       { plane1[i], plane2[i], plane3[i], plane4[i], 0, plane0[i] }.
 *
 * Returns 6*256.
 *
 * The lane notes after the instructions list 16-bit words from the most
 * significant to the least significant; a lower-case letter is the first
 * sample of the pass, upper case the second (a/A = plane 1, b/B = plane 2,
 * c/C = plane 3, d/D = plane 4, f/F = plane 0).
 *
 * NOTE(review): %esi/%edi are hard-coded as 32-bit index registers and the
 * MMX registers are not listed as clobbers -- looks 32-bit x86 only; confirm.
 */
402 static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){ | |
403 int32_t * f = (int32_t *) _f; | |
404 asm volatile( | |
    // esi: input byte offset, -1024 up to 0 in steps of 8 (2 samples/pass)
405 "movl $-1024, %%esi \n\t" | |
406 "movq "MANGLE(magicF2W)", %%mm7 \n\t" | |
407 // "pxor %%mm6, %%mm6 \n\t" | |
408 "1: \n\t" | |
409 "movq 1024(%1, %%esi), %%mm0 \n\t" | |
410 "movq 2048(%1, %%esi), %%mm1 \n\t" | |
411 "movq 3072(%1, %%esi), %%mm2 \n\t" | |
412 "movq 4096(%1, %%esi), %%mm3 \n\t" | |
2967 | 413 "movq (%1, %%esi), %%mm5 \n\t" |
1193 | 414 "psubd %%mm7, %%mm0 \n\t" |
415 "psubd %%mm7, %%mm1 \n\t" | |
416 "psubd %%mm7, %%mm2 \n\t" | |
417 "psubd %%mm7, %%mm3 \n\t" | |
418 "psubd %%mm7, %%mm5 \n\t" | |
    // edi = 3*esi: output byte offset (12 bytes out per 4 bytes in)
419 "leal (%%esi, %%esi, 2), %%edi \n\t" | |
2967 | 420 |
1193 | 421 "packssdw %%mm2, %%mm0 \n\t" // CcAa |
422 "packssdw %%mm3, %%mm1 \n\t" // DdBb | |
423 "packssdw %%mm5, %%mm5 \n\t" // FfFf | |
424 "movq %%mm0, %%mm2 \n\t" // CcAa | |
425 "punpcklwd %%mm1, %%mm0 \n\t" // BAba | |
426 "punpckhwd %%mm1, %%mm2 \n\t" // DCdc | |
427 "pxor %%mm4, %%mm4 \n\t" // 0000 | |
428 "punpcklwd %%mm5, %%mm4 \n\t" // F0f0 | |
429 "movq %%mm0, %%mm1 \n\t" // BAba | |
430 "movq %%mm4, %%mm3 \n\t" // F0f0 | |
431 "punpckldq %%mm2, %%mm0 \n\t" // dcba | |
432 "punpckhdq %%mm1, %%mm1 \n\t" // BABA | |
433 "punpckldq %%mm1, %%mm4 \n\t" // BAf0 | |
434 "punpckhdq %%mm3, %%mm2 \n\t" // F0DC | |
2967 | 435 |
    // The three stores below write dcba, BAf0 and F0DC (two complete frames).
1193 | 436 "movq %%mm0, (%0, %%edi) \n\t" |
437 "movq %%mm4, 8(%0, %%edi) \n\t" | |
438 "movq %%mm2, 16(%0, %%edi) \n\t" | |
439 "addl $8, %%esi \n\t" | |
440 " jnz 1b \n\t" | |
441 "emms \n\t" | |
    // %0 = one past the end of the output, %1 = one past the end of plane 0
442 :: "r" (s16+1536), "r" (f+256) | |
443 :"%esi", "%edi", "memory" | |
444 ); | |
445 return 6*256; | |
446 } | |
447 | |
/*
 * Interleaves six planes of 256 samples into 256 six-slot int16_t frames.
 *
 * _f  : input; planes 0..5 start at byte offsets 0, 1024, 2048, 3072, 4096
 *       and 5120. Samples are read as raw 32-bit words through the int32_t
 *       alias f and converted by subtracting magicF2W and saturating to
 *       16 bit. NOTE(review): presumably the caller has applied a matching
 *       bias to the samples -- confirm.
 * s16 : output buffer; 6*256 int16_t values are written. Each frame is
 *       { plane1[i], plane3[i], plane4[i], plane5[i], plane2[i], plane0[i] }.
 *
 * Returns 6*256.
 *
 * The lane notes after the instructions list 16-bit words from the most
 * significant to the least significant; a lower-case letter is the first
 * sample of the pass, upper case the second (a/A = plane 1, b/B = plane 3,
 * c/C = plane 4, d/D = plane 5, e/E = plane 2, f/F = plane 0).
 *
 * NOTE(review): %esi/%edi are hard-coded as 32-bit index registers and the
 * MMX registers are not listed as clobbers -- looks 32-bit x86 only; confirm.
 */
448 static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){ | |
449 int32_t * f = (int32_t *) _f; | |
450 asm volatile( | |
    // esi: input byte offset, -1024 up to 0 in steps of 8 (2 samples/pass)
451 "movl $-1024, %%esi \n\t" | |
452 "movq "MANGLE(magicF2W)", %%mm7 \n\t" | |
453 // "pxor %%mm6, %%mm6 \n\t" | |
454 "1: \n\t" | |
455 "movq 1024(%1, %%esi), %%mm0 \n\t" | |
456 "movq 3072(%1, %%esi), %%mm1 \n\t" | |
457 "movq 4096(%1, %%esi), %%mm2 \n\t" | |
458 "movq 5120(%1, %%esi), %%mm3 \n\t" | |
459 "movq 2048(%1, %%esi), %%mm4 \n\t" | |
2967 | 460 "movq (%1, %%esi), %%mm5 \n\t" |
1193 | 461 "psubd %%mm7, %%mm0 \n\t" |
462 "psubd %%mm7, %%mm1 \n\t" | |
463 "psubd %%mm7, %%mm2 \n\t" | |
464 "psubd %%mm7, %%mm3 \n\t" | |
465 "psubd %%mm7, %%mm4 \n\t" | |
466 "psubd %%mm7, %%mm5 \n\t" | |
    // edi = 3*esi: output byte offset (12 bytes out per 4 bytes in)
467 "leal (%%esi, %%esi, 2), %%edi \n\t" | |
2967 | 468 |
1193 | 469 "packssdw %%mm2, %%mm0 \n\t" // CcAa |
470 "packssdw %%mm3, %%mm1 \n\t" // DdBb | |
471 "packssdw %%mm4, %%mm4 \n\t" // EeEe | |
472 "packssdw %%mm5, %%mm5 \n\t" // FfFf | |
473 "movq %%mm0, %%mm2 \n\t" // CcAa | |
474 "punpcklwd %%mm1, %%mm0 \n\t" // BAba | |
475 "punpckhwd %%mm1, %%mm2 \n\t" // DCdc | |
476 "punpcklwd %%mm5, %%mm4 \n\t" // FEfe | |
477 "movq %%mm0, %%mm1 \n\t" // BAba | |
478 "movq %%mm4, %%mm3 \n\t" // FEfe | |
479 "punpckldq %%mm2, %%mm0 \n\t" // dcba | |
480 "punpckhdq %%mm1, %%mm1 \n\t" // BABA | |
481 "punpckldq %%mm1, %%mm4 \n\t" // BAfe | |
482 "punpckhdq %%mm3, %%mm2 \n\t" // FEDC | |
2967 | 483 |
    // The three stores below write dcba, BAfe and FEDC (two complete frames).
1193 | 484 "movq %%mm0, (%0, %%edi) \n\t" |
485 "movq %%mm4, 8(%0, %%edi) \n\t" | |
486 "movq %%mm2, 16(%0, %%edi) \n\t" | |
487 "addl $8, %%esi \n\t" | |
488 " jnz 1b \n\t" | |
489 "emms \n\t" | |
    // %0 = one past the end of the output, %1 = one past the end of plane 0
490 :: "r" (s16+1536), "r" (f+256) | |
491 :"%esi", "%edi", "memory" | |
492 ); | |
493 return 6*256; | |
494 } | |
495 | |
496 | |
/*
 * Picks the MMX conversion routine for a channel configuration.
 *
 * flags : matched against the A52_* layout constants, either alone or OR-ed
 *         with A52_LFE. A52_CHANNEL, A52_STEREO and A52_DOLBY share one
 *         routine (with and without A52_LFE).
 * ch    : requested number of output slots per frame; every layout is paired
 *         with exactly one accepted value (2, 4, 5 or 6).
 *
 * Returns the address of the matching routine as a void*, or NULL when the
 * flags value is not listed or ch does not match the value accepted for it.
 * NOTE(review): callers presumably cast the result back to
 * int (*)(float *, int16_t *) -- confirm at the call site.
 */
497 static void* a52_resample_MMX(int flags, int ch){ | |
498 switch (flags) { | |
499 case A52_MONO: | |
500 if(ch==5) return a52_resample_MONO_to_5_MMX; | |
501 break; | |
502 case A52_CHANNEL: | |
503 case A52_STEREO: | |
504 case A52_DOLBY: | |
505 if(ch==2) return a52_resample_STEREO_to_2_MMX; | |
506 break; | |
507 case A52_3F: | |
508 if(ch==5) return a52_resample_3F_to_5_MMX; | |
509 break; | |
510 case A52_2F2R: | |
511 if(ch==4) return a52_resample_2F_2R_to_4_MMX; | |
512 break; | |
513 case A52_3F2R: | |
514 if(ch==5) return a52_resample_3F_2R_to_5_MMX; | |
515 break; | |
516 case A52_MONO | A52_LFE: | |
517 if(ch==6) return a52_resample_MONO_LFE_to_6_MMX; | |
518 break; | |
519 case A52_CHANNEL | A52_LFE: | |
520 case A52_STEREO | A52_LFE: | |
521 case A52_DOLBY | A52_LFE: | |
522 if(ch==6) return a52_resample_STEREO_LFE_to_6_MMX; | |
523 break; | |
524 case A52_3F | A52_LFE: | |
525 if(ch==6) return a52_resample_3F_LFE_to_6_MMX; | |
526 break; | |
527 case A52_2F2R | A52_LFE: | |
528 if(ch==6) return a52_resample_2F_2R_LFE_to_6_MMX; | |
529 break; | |
530 case A52_3F2R | A52_LFE: | |
531 if(ch==6) return a52_resample_3F_2R_LFE_to_6_MMX; | |
532 break; | |
533 } | |
    /* no MMX routine for this flags/ch combination */
534 return NULL; | |
535 } | |
536 | |
537 |