1193
|
1
|
|
2 // MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL)
|
|
3
|
2967
|
4 /* optimization TODO / NOTES
|
|
5 movntq is slightly faster (0.5% with the current test.c benchmark)
|
1193
|
6 (but that's just test.c, so that needs to be tested in reality)
|
2967
|
7 and it would mean (C / MMX2 / MMX / 3DNOW) versions
|
1193
|
8 */
|
|
9
|
2352
|
10 static uint64_t __attribute__((aligned(8))) attribute_used magicF2W= 0x43c0000043c00000LL;
|
|
11 static uint64_t __attribute__((aligned(8))) attribute_used wm1010= 0xFFFF0000FFFF0000LL;
|
|
12 static uint64_t __attribute__((aligned(8))) attribute_used wm0101= 0x0000FFFF0000FFFFLL;
|
|
13 static uint64_t __attribute__((aligned(8))) attribute_used wm1100= 0xFFFFFFFF00000000LL;
|
1193
|
14
|
|
15 static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){
|
|
16 int32_t * f = (int32_t *) _f;
|
|
17 asm volatile(
|
|
18 "movl $-512, %%esi \n\t"
|
|
19 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
|
|
20 "movq "MANGLE(wm1100)", %%mm3 \n\t"
|
|
21 "movq "MANGLE(wm0101)", %%mm4 \n\t"
|
|
22 "movq "MANGLE(wm1010)", %%mm5 \n\t"
|
|
23 "pxor %%mm6, %%mm6 \n\t"
|
|
24 "1: \n\t"
|
|
25 "movq (%1, %%esi, 2), %%mm0 \n\t"
|
|
26 "movq 8(%1, %%esi, 2), %%mm1 \n\t"
|
|
27 "leal (%%esi, %%esi, 4), %%edi \n\t"
|
|
28 "psubd %%mm7, %%mm0 \n\t"
|
|
29 "psubd %%mm7, %%mm1 \n\t"
|
|
30 "packssdw %%mm1, %%mm0 \n\t"
|
|
31 "movq %%mm0, %%mm1 \n\t"
|
|
32 "pand %%mm4, %%mm0 \n\t"
|
|
33 "pand %%mm5, %%mm1 \n\t"
|
|
34 "movq %%mm6, (%0, %%edi) \n\t" // 0 0 0 0
|
|
35 "movd %%mm0, 8(%0, %%edi) \n\t" // A 0
|
|
36 "pand %%mm3, %%mm0 \n\t"
|
|
37 "movd %%mm6, 12(%0, %%edi) \n\t" // 0 0
|
|
38 "movd %%mm1, 16(%0, %%edi) \n\t" // 0 B
|
|
39 "pand %%mm3, %%mm1 \n\t"
|
|
40 "movd %%mm6, 20(%0, %%edi) \n\t" // 0 0
|
|
41 "movq %%mm0, 24(%0, %%edi) \n\t" // 0 0 C 0
|
|
42 "movq %%mm1, 32(%0, %%edi) \n\t" // 0 0 0 B
|
|
43 "addl $8, %%esi \n\t"
|
|
44 " jnz 1b \n\t"
|
|
45 "emms \n\t"
|
|
46 :: "r" (s16+1280), "r" (f+256)
|
|
47 :"%esi", "%edi", "memory"
|
|
48 );
|
|
49 return 5*256;
|
|
50 }
|
|
51
|
|
52 static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){
|
|
53 int32_t * f = (int32_t *) _f;
|
|
54 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it
|
|
55 #ifdef HAVE_SSE
|
|
56 asm volatile(
|
|
57 "movl $-1024, %%esi \n\t"
|
|
58 "1: \n\t"
|
|
59 "cvtps2pi (%1, %%esi), %%mm0 \n\t"
|
|
60 "cvtps2pi 1024(%1, %%esi), %%mm2\n\t"
|
|
61 "movq %%mm0, %%mm1 \n\t"
|
|
62 "punpcklwd %%mm2, %%mm0 \n\t"
|
|
63 "punpckhwd %%mm2, %%mm1 \n\t"
|
|
64 "movq %%mm0, (%0, %%esi) \n\t"
|
|
65 "movq %%mm1, 8(%0, %%esi) \n\t"
|
|
66 "addl $16, %%esi \n\t"
|
|
67 " jnz 1b \n\t"
|
|
68 "emms \n\t"
|
|
69 :: "r" (s16+512), "r" (f+256)
|
|
70 :"%esi", "memory"
|
|
71 );*/
|
|
72 asm volatile(
|
|
73 "movl $-1024, %%esi \n\t"
|
|
74 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
|
|
75 "1: \n\t"
|
|
76 "movq (%1, %%esi), %%mm0 \n\t"
|
|
77 "movq 8(%1, %%esi), %%mm1 \n\t"
|
|
78 "movq 1024(%1, %%esi), %%mm2 \n\t"
|
|
79 "movq 1032(%1, %%esi), %%mm3 \n\t"
|
|
80 "psubd %%mm7, %%mm0 \n\t"
|
|
81 "psubd %%mm7, %%mm1 \n\t"
|
|
82 "psubd %%mm7, %%mm2 \n\t"
|
|
83 "psubd %%mm7, %%mm3 \n\t"
|
|
84 "packssdw %%mm1, %%mm0 \n\t"
|
|
85 "packssdw %%mm3, %%mm2 \n\t"
|
|
86 "movq %%mm0, %%mm1 \n\t"
|
|
87 "punpcklwd %%mm2, %%mm0 \n\t"
|
|
88 "punpckhwd %%mm2, %%mm1 \n\t"
|
|
89 "movq %%mm0, (%0, %%esi) \n\t"
|
|
90 "movq %%mm1, 8(%0, %%esi) \n\t"
|
|
91 "addl $16, %%esi \n\t"
|
|
92 " jnz 1b \n\t"
|
|
93 "emms \n\t"
|
|
94 :: "r" (s16+512), "r" (f+256)
|
|
95 :"%esi", "memory"
|
|
96 );
|
|
97 return 2*256;
|
|
98 }
|
|
99
|
|
100 static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){
|
|
101 int32_t * f = (int32_t *) _f;
|
|
102 asm volatile(
|
|
103 "movl $-1024, %%esi \n\t"
|
|
104 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
|
|
105 "pxor %%mm6, %%mm6 \n\t"
|
|
106 "movq %%mm7, %%mm5 \n\t"
|
|
107 "punpckldq %%mm6, %%mm5 \n\t"
|
|
108 "1: \n\t"
|
|
109 "movd (%1, %%esi), %%mm0 \n\t"
|
|
110 "punpckldq 2048(%1, %%esi), %%mm0\n\t"
|
|
111 "movd 1024(%1, %%esi), %%mm1 \n\t"
|
|
112 "punpckldq 4(%1, %%esi), %%mm1 \n\t"
|
|
113 "movd 2052(%1, %%esi), %%mm2 \n\t"
|
|
114 "movq %%mm7, %%mm3 \n\t"
|
|
115 "punpckldq 1028(%1, %%esi), %%mm3\n\t"
|
|
116 "movd 8(%1, %%esi), %%mm4 \n\t"
|
|
117 "punpckldq 2056(%1, %%esi), %%mm4\n\t"
|
|
118 "leal (%%esi, %%esi, 4), %%edi \n\t"
|
|
119 "sarl $1, %%edi \n\t"
|
|
120 "psubd %%mm7, %%mm0 \n\t"
|
|
121 "psubd %%mm7, %%mm1 \n\t"
|
|
122 "psubd %%mm5, %%mm2 \n\t"
|
|
123 "psubd %%mm7, %%mm3 \n\t"
|
|
124 "psubd %%mm7, %%mm4 \n\t"
|
|
125 "packssdw %%mm6, %%mm0 \n\t"
|
|
126 "packssdw %%mm2, %%mm1 \n\t"
|
|
127 "packssdw %%mm4, %%mm3 \n\t"
|
|
128 "movq %%mm0, (%0, %%edi) \n\t"
|
|
129 "movq %%mm1, 8(%0, %%edi) \n\t"
|
|
130 "movq %%mm3, 16(%0, %%edi) \n\t"
|
2967
|
131
|
1193
|
132 "movd 1032(%1, %%esi), %%mm1 \n\t"
|
|
133 "punpckldq 12(%1, %%esi), %%mm1\n\t"
|
|
134 "movd 2060(%1, %%esi), %%mm2 \n\t"
|
|
135 "movq %%mm7, %%mm3 \n\t"
|
|
136 "punpckldq 1036(%1, %%esi), %%mm3\n\t"
|
|
137 "pxor %%mm0, %%mm0 \n\t"
|
|
138 "psubd %%mm7, %%mm1 \n\t"
|
|
139 "psubd %%mm5, %%mm2 \n\t"
|
|
140 "psubd %%mm7, %%mm3 \n\t"
|
|
141 "packssdw %%mm1, %%mm0 \n\t"
|
|
142 "packssdw %%mm3, %%mm2 \n\t"
|
|
143 "movq %%mm0, 24(%0, %%edi) \n\t"
|
|
144 "movq %%mm2, 32(%0, %%edi) \n\t"
|
2967
|
145
|
1193
|
146 "addl $16, %%esi \n\t"
|
|
147 " jnz 1b \n\t"
|
|
148 "emms \n\t"
|
|
149 :: "r" (s16+1280), "r" (f+256)
|
|
150 :"%esi", "%edi", "memory"
|
|
151 );
|
|
152 return 5*256;
|
|
153 }
|
|
154
|
|
155 static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){
|
|
156 int32_t * f = (int32_t *) _f;
|
|
157 asm volatile(
|
|
158 "movl $-1024, %%esi \n\t"
|
|
159 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
|
|
160 "1: \n\t"
|
|
161 "movq (%1, %%esi), %%mm0 \n\t"
|
|
162 "movq 8(%1, %%esi), %%mm1 \n\t"
|
|
163 "movq 1024(%1, %%esi), %%mm2 \n\t"
|
|
164 "movq 1032(%1, %%esi), %%mm3 \n\t"
|
|
165 "psubd %%mm7, %%mm0 \n\t"
|
|
166 "psubd %%mm7, %%mm1 \n\t"
|
|
167 "psubd %%mm7, %%mm2 \n\t"
|
|
168 "psubd %%mm7, %%mm3 \n\t"
|
|
169 "packssdw %%mm1, %%mm0 \n\t"
|
|
170 "packssdw %%mm3, %%mm2 \n\t"
|
|
171 "movq 2048(%1, %%esi), %%mm3 \n\t"
|
|
172 "movq 2056(%1, %%esi), %%mm4 \n\t"
|
|
173 "movq 3072(%1, %%esi), %%mm5 \n\t"
|
|
174 "movq 3080(%1, %%esi), %%mm6 \n\t"
|
|
175 "psubd %%mm7, %%mm3 \n\t"
|
|
176 "psubd %%mm7, %%mm4 \n\t"
|
|
177 "psubd %%mm7, %%mm5 \n\t"
|
|
178 "psubd %%mm7, %%mm6 \n\t"
|
|
179 "packssdw %%mm4, %%mm3 \n\t"
|
|
180 "packssdw %%mm6, %%mm5 \n\t"
|
|
181 "movq %%mm0, %%mm1 \n\t"
|
|
182 "movq %%mm3, %%mm4 \n\t"
|
|
183 "punpcklwd %%mm2, %%mm0 \n\t"
|
|
184 "punpckhwd %%mm2, %%mm1 \n\t"
|
|
185 "punpcklwd %%mm5, %%mm3 \n\t"
|
|
186 "punpckhwd %%mm5, %%mm4 \n\t"
|
|
187 "movq %%mm0, %%mm2 \n\t"
|
|
188 "movq %%mm1, %%mm5 \n\t"
|
|
189 "punpckldq %%mm3, %%mm0 \n\t"
|
|
190 "punpckhdq %%mm3, %%mm2 \n\t"
|
|
191 "punpckldq %%mm4, %%mm1 \n\t"
|
|
192 "punpckhdq %%mm4, %%mm5 \n\t"
|
|
193 "movq %%mm0, (%0, %%esi,2) \n\t"
|
|
194 "movq %%mm2, 8(%0, %%esi,2) \n\t"
|
|
195 "movq %%mm1, 16(%0, %%esi,2) \n\t"
|
|
196 "movq %%mm5, 24(%0, %%esi,2) \n\t"
|
|
197 "addl $16, %%esi \n\t"
|
|
198 " jnz 1b \n\t"
|
|
199 "emms \n\t"
|
|
200 :: "r" (s16+1024), "r" (f+256)
|
|
201 :"%esi", "memory"
|
|
202 );
|
|
203 return 4*256;
|
|
204 }
|
|
205
|
|
206 static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){
|
|
207 int32_t * f = (int32_t *) _f;
|
|
208 asm volatile(
|
|
209 "movl $-1024, %%esi \n\t"
|
|
210 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
|
|
211 "1: \n\t"
|
|
212 "movd (%1, %%esi), %%mm0 \n\t"
|
|
213 "punpckldq 2048(%1, %%esi), %%mm0\n\t"
|
|
214 "movd 3072(%1, %%esi), %%mm1 \n\t"
|
|
215 "punpckldq 4096(%1, %%esi), %%mm1\n\t"
|
|
216 "movd 1024(%1, %%esi), %%mm2 \n\t"
|
|
217 "punpckldq 4(%1, %%esi), %%mm2 \n\t"
|
|
218 "movd 2052(%1, %%esi), %%mm3 \n\t"
|
|
219 "punpckldq 3076(%1, %%esi), %%mm3\n\t"
|
|
220 "movd 4100(%1, %%esi), %%mm4 \n\t"
|
|
221 "punpckldq 1028(%1, %%esi), %%mm4\n\t"
|
|
222 "movd 8(%1, %%esi), %%mm5 \n\t"
|
|
223 "punpckldq 2056(%1, %%esi), %%mm5\n\t"
|
|
224 "leal (%%esi, %%esi, 4), %%edi \n\t"
|
|
225 "sarl $1, %%edi \n\t"
|
|
226 "psubd %%mm7, %%mm0 \n\t"
|
|
227 "psubd %%mm7, %%mm1 \n\t"
|
|
228 "psubd %%mm7, %%mm2 \n\t"
|
|
229 "psubd %%mm7, %%mm3 \n\t"
|
|
230 "psubd %%mm7, %%mm4 \n\t"
|
|
231 "psubd %%mm7, %%mm5 \n\t"
|
|
232 "packssdw %%mm1, %%mm0 \n\t"
|
|
233 "packssdw %%mm3, %%mm2 \n\t"
|
|
234 "packssdw %%mm5, %%mm4 \n\t"
|
|
235 "movq %%mm0, (%0, %%edi) \n\t"
|
|
236 "movq %%mm2, 8(%0, %%edi) \n\t"
|
|
237 "movq %%mm4, 16(%0, %%edi) \n\t"
|
2967
|
238
|
1193
|
239 "movd 3080(%1, %%esi), %%mm0 \n\t"
|
|
240 "punpckldq 4104(%1, %%esi), %%mm0\n\t"
|
|
241 "movd 1032(%1, %%esi), %%mm1 \n\t"
|
|
242 "punpckldq 12(%1, %%esi), %%mm1\n\t"
|
|
243 "movd 2060(%1, %%esi), %%mm2 \n\t"
|
|
244 "punpckldq 3084(%1, %%esi), %%mm2\n\t"
|
|
245 "movd 4108(%1, %%esi), %%mm3 \n\t"
|
|
246 "punpckldq 1036(%1, %%esi), %%mm3\n\t"
|
|
247 "psubd %%mm7, %%mm0 \n\t"
|
|
248 "psubd %%mm7, %%mm1 \n\t"
|
|
249 "psubd %%mm7, %%mm2 \n\t"
|
|
250 "psubd %%mm7, %%mm3 \n\t"
|
|
251 "packssdw %%mm1, %%mm0 \n\t"
|
|
252 "packssdw %%mm3, %%mm2 \n\t"
|
|
253 "movq %%mm0, 24(%0, %%edi) \n\t"
|
|
254 "movq %%mm2, 32(%0, %%edi) \n\t"
|
2967
|
255
|
1193
|
256 "addl $16, %%esi \n\t"
|
|
257 " jnz 1b \n\t"
|
|
258 "emms \n\t"
|
|
259 :: "r" (s16+1280), "r" (f+256)
|
|
260 :"%esi", "%edi", "memory"
|
|
261 );
|
|
262 return 5*256;
|
|
263 }
|
|
264
|
|
265 static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){
|
|
266 int32_t * f = (int32_t *) _f;
|
|
267 asm volatile(
|
|
268 "movl $-1024, %%esi \n\t"
|
|
269 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
|
|
270 "pxor %%mm6, %%mm6 \n\t"
|
|
271 "1: \n\t"
|
|
272 "movq 1024(%1, %%esi), %%mm0 \n\t"
|
|
273 "movq 1032(%1, %%esi), %%mm1 \n\t"
|
|
274 "movq (%1, %%esi), %%mm2 \n\t"
|
|
275 "movq 8(%1, %%esi), %%mm3 \n\t"
|
|
276 "psubd %%mm7, %%mm0 \n\t"
|
|
277 "psubd %%mm7, %%mm1 \n\t"
|
|
278 "psubd %%mm7, %%mm2 \n\t"
|
|
279 "psubd %%mm7, %%mm3 \n\t"
|
|
280 "packssdw %%mm1, %%mm0 \n\t"
|
|
281 "packssdw %%mm3, %%mm2 \n\t"
|
|
282 "movq %%mm0, %%mm1 \n\t"
|
|
283 "punpcklwd %%mm2, %%mm0 \n\t"
|
|
284 "punpckhwd %%mm2, %%mm1 \n\t"
|
|
285 "leal (%%esi, %%esi, 2), %%edi \n\t"
|
|
286 "movq %%mm6, (%0, %%edi) \n\t"
|
|
287 "movd %%mm0, 8(%0, %%edi) \n\t"
|
|
288 "punpckhdq %%mm0, %%mm0 \n\t"
|
|
289 "movq %%mm6, 12(%0, %%edi) \n\t"
|
|
290 "movd %%mm0, 20(%0, %%edi) \n\t"
|
|
291 "movq %%mm6, 24(%0, %%edi) \n\t"
|
|
292 "movd %%mm1, 32(%0, %%edi) \n\t"
|
|
293 "punpckhdq %%mm1, %%mm1 \n\t"
|
|
294 "movq %%mm6, 36(%0, %%edi) \n\t"
|
|
295 "movd %%mm1, 44(%0, %%edi) \n\t"
|
|
296 "addl $16, %%esi \n\t"
|
|
297 " jnz 1b \n\t"
|
|
298 "emms \n\t"
|
|
299 :: "r" (s16+1536), "r" (f+256)
|
|
300 :"%esi", "%edi", "memory"
|
|
301 );
|
|
302 return 6*256;
|
|
303 }
|
|
304
|
|
305 static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){
|
|
306 int32_t * f = (int32_t *) _f;
|
|
307 asm volatile(
|
|
308 "movl $-1024, %%esi \n\t"
|
|
309 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
|
|
310 "pxor %%mm6, %%mm6 \n\t"
|
|
311 "1: \n\t"
|
|
312 "movq 1024(%1, %%esi), %%mm0 \n\t"
|
|
313 "movq 2048(%1, %%esi), %%mm1 \n\t"
|
2967
|
314 "movq (%1, %%esi), %%mm5 \n\t"
|
1193
|
315 "psubd %%mm7, %%mm0 \n\t"
|
|
316 "psubd %%mm7, %%mm1 \n\t"
|
|
317 "psubd %%mm7, %%mm5 \n\t"
|
|
318 "leal (%%esi, %%esi, 2), %%edi \n\t"
|
2967
|
319
|
1193
|
320 "pxor %%mm4, %%mm4 \n\t"
|
|
321 "packssdw %%mm5, %%mm0 \n\t" // FfAa
|
|
322 "packssdw %%mm4, %%mm1 \n\t" // 00Bb
|
|
323 "punpckhwd %%mm0, %%mm4 \n\t" // F0f0
|
|
324 "punpcklwd %%mm1, %%mm0 \n\t" // BAba
|
|
325 "movq %%mm0, %%mm1 \n\t" // BAba
|
|
326 "punpckldq %%mm4, %%mm3 \n\t" // f0XX
|
|
327 "punpckldq %%mm6, %%mm0 \n\t" // 00ba
|
|
328 "punpckhdq %%mm1, %%mm3 \n\t" // BAf0
|
2967
|
329
|
1193
|
330 "movq %%mm0, (%0, %%edi) \n\t" // 00ba
|
|
331 "punpckhdq %%mm4, %%mm0 \n\t" // F000
|
|
332 "movq %%mm3, 8(%0, %%edi) \n\t" // BAf0
|
|
333 "movq %%mm0, 16(%0, %%edi) \n\t" // F000
|
|
334 "addl $8, %%esi \n\t"
|
|
335 " jnz 1b \n\t"
|
|
336 "emms \n\t"
|
|
337 :: "r" (s16+1536), "r" (f+256)
|
|
338 :"%esi", "%edi", "memory"
|
|
339 );
|
|
340 return 6*256;
|
|
341 }
|
|
342
|
|
343 static int a52_resample_3F_LFE_to_6_MMX(float * _f, int16_t * s16){
|
|
344 int32_t * f = (int32_t *) _f;
|
|
345 asm volatile(
|
|
346 "movl $-1024, %%esi \n\t"
|
|
347 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
|
|
348 "pxor %%mm6, %%mm6 \n\t"
|
|
349 "1: \n\t"
|
|
350 "movq 1024(%1, %%esi), %%mm0 \n\t"
|
|
351 "movq 3072(%1, %%esi), %%mm1 \n\t"
|
|
352 "movq 2048(%1, %%esi), %%mm4 \n\t"
|
2967
|
353 "movq (%1, %%esi), %%mm5 \n\t"
|
1193
|
354 "psubd %%mm7, %%mm0 \n\t"
|
|
355 "psubd %%mm7, %%mm1 \n\t"
|
|
356 "psubd %%mm7, %%mm4 \n\t"
|
|
357 "psubd %%mm7, %%mm5 \n\t"
|
|
358 "leal (%%esi, %%esi, 2), %%edi \n\t"
|
2967
|
359
|
1193
|
360 "packssdw %%mm4, %%mm0 \n\t" // EeAa
|
|
361 "packssdw %%mm5, %%mm1 \n\t" // FfBb
|
|
362 "movq %%mm0, %%mm2 \n\t" // EeAa
|
|
363 "punpcklwd %%mm1, %%mm0 \n\t" // BAba
|
|
364 "punpckhwd %%mm1, %%mm2 \n\t" // FEfe
|
|
365 "movq %%mm0, %%mm1 \n\t" // BAba
|
|
366 "punpckldq %%mm6, %%mm0 \n\t" // 00ba
|
|
367 "punpckhdq %%mm1, %%mm1 \n\t" // BABA
|
2967
|
368
|
1193
|
369 "movq %%mm0, (%0, %%edi) \n\t"
|
|
370 "punpckhdq %%mm2, %%mm0 \n\t" // FE00
|
|
371 "punpckldq %%mm1, %%mm2 \n\t" // BAfe
|
|
372 "movq %%mm2, 8(%0, %%edi) \n\t"
|
|
373 "movq %%mm0, 16(%0, %%edi) \n\t"
|
|
374 "addl $8, %%esi \n\t"
|
|
375 " jnz 1b \n\t"
|
|
376 "emms \n\t"
|
|
377 :: "r" (s16+1536), "r" (f+256)
|
|
378 :"%esi", "%edi", "memory"
|
|
379 );
|
|
380 return 6*256;
|
|
381 }
|
|
382
|
|
383 static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
|
|
384 int32_t * f = (int32_t *) _f;
|
|
385 asm volatile(
|
|
386 "movl $-1024, %%esi \n\t"
|
|
387 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
|
|
388 // "pxor %%mm6, %%mm6 \n\t"
|
|
389 "1: \n\t"
|
|
390 "movq 1024(%1, %%esi), %%mm0 \n\t"
|
|
391 "movq 2048(%1, %%esi), %%mm1 \n\t"
|
|
392 "movq 3072(%1, %%esi), %%mm2 \n\t"
|
|
393 "movq 4096(%1, %%esi), %%mm3 \n\t"
|
2967
|
394 "movq (%1, %%esi), %%mm5 \n\t"
|
1193
|
395 "psubd %%mm7, %%mm0 \n\t"
|
|
396 "psubd %%mm7, %%mm1 \n\t"
|
|
397 "psubd %%mm7, %%mm2 \n\t"
|
|
398 "psubd %%mm7, %%mm3 \n\t"
|
|
399 "psubd %%mm7, %%mm5 \n\t"
|
|
400 "leal (%%esi, %%esi, 2), %%edi \n\t"
|
2967
|
401
|
1193
|
402 "packssdw %%mm2, %%mm0 \n\t" // CcAa
|
|
403 "packssdw %%mm3, %%mm1 \n\t" // DdBb
|
|
404 "packssdw %%mm5, %%mm5 \n\t" // FfFf
|
|
405 "movq %%mm0, %%mm2 \n\t" // CcAa
|
|
406 "punpcklwd %%mm1, %%mm0 \n\t" // BAba
|
|
407 "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
|
|
408 "pxor %%mm4, %%mm4 \n\t" // 0000
|
|
409 "punpcklwd %%mm5, %%mm4 \n\t" // F0f0
|
|
410 "movq %%mm0, %%mm1 \n\t" // BAba
|
|
411 "movq %%mm4, %%mm3 \n\t" // F0f0
|
|
412 "punpckldq %%mm2, %%mm0 \n\t" // dcba
|
|
413 "punpckhdq %%mm1, %%mm1 \n\t" // BABA
|
|
414 "punpckldq %%mm1, %%mm4 \n\t" // BAf0
|
|
415 "punpckhdq %%mm3, %%mm2 \n\t" // F0DC
|
2967
|
416
|
1193
|
417 "movq %%mm0, (%0, %%edi) \n\t"
|
|
418 "movq %%mm4, 8(%0, %%edi) \n\t"
|
|
419 "movq %%mm2, 16(%0, %%edi) \n\t"
|
|
420 "addl $8, %%esi \n\t"
|
|
421 " jnz 1b \n\t"
|
|
422 "emms \n\t"
|
|
423 :: "r" (s16+1536), "r" (f+256)
|
|
424 :"%esi", "%edi", "memory"
|
|
425 );
|
|
426 return 6*256;
|
|
427 }
|
|
428
|
|
429 static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
|
|
430 int32_t * f = (int32_t *) _f;
|
|
431 asm volatile(
|
|
432 "movl $-1024, %%esi \n\t"
|
|
433 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
|
|
434 // "pxor %%mm6, %%mm6 \n\t"
|
|
435 "1: \n\t"
|
|
436 "movq 1024(%1, %%esi), %%mm0 \n\t"
|
|
437 "movq 3072(%1, %%esi), %%mm1 \n\t"
|
|
438 "movq 4096(%1, %%esi), %%mm2 \n\t"
|
|
439 "movq 5120(%1, %%esi), %%mm3 \n\t"
|
|
440 "movq 2048(%1, %%esi), %%mm4 \n\t"
|
2967
|
441 "movq (%1, %%esi), %%mm5 \n\t"
|
1193
|
442 "psubd %%mm7, %%mm0 \n\t"
|
|
443 "psubd %%mm7, %%mm1 \n\t"
|
|
444 "psubd %%mm7, %%mm2 \n\t"
|
|
445 "psubd %%mm7, %%mm3 \n\t"
|
|
446 "psubd %%mm7, %%mm4 \n\t"
|
|
447 "psubd %%mm7, %%mm5 \n\t"
|
|
448 "leal (%%esi, %%esi, 2), %%edi \n\t"
|
2967
|
449
|
1193
|
450 "packssdw %%mm2, %%mm0 \n\t" // CcAa
|
|
451 "packssdw %%mm3, %%mm1 \n\t" // DdBb
|
|
452 "packssdw %%mm4, %%mm4 \n\t" // EeEe
|
|
453 "packssdw %%mm5, %%mm5 \n\t" // FfFf
|
|
454 "movq %%mm0, %%mm2 \n\t" // CcAa
|
|
455 "punpcklwd %%mm1, %%mm0 \n\t" // BAba
|
|
456 "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
|
|
457 "punpcklwd %%mm5, %%mm4 \n\t" // FEfe
|
|
458 "movq %%mm0, %%mm1 \n\t" // BAba
|
|
459 "movq %%mm4, %%mm3 \n\t" // FEfe
|
|
460 "punpckldq %%mm2, %%mm0 \n\t" // dcba
|
|
461 "punpckhdq %%mm1, %%mm1 \n\t" // BABA
|
|
462 "punpckldq %%mm1, %%mm4 \n\t" // BAfe
|
|
463 "punpckhdq %%mm3, %%mm2 \n\t" // FEDC
|
2967
|
464
|
1193
|
465 "movq %%mm0, (%0, %%edi) \n\t"
|
|
466 "movq %%mm4, 8(%0, %%edi) \n\t"
|
|
467 "movq %%mm2, 16(%0, %%edi) \n\t"
|
|
468 "addl $8, %%esi \n\t"
|
|
469 " jnz 1b \n\t"
|
|
470 "emms \n\t"
|
|
471 :: "r" (s16+1536), "r" (f+256)
|
|
472 :"%esi", "%edi", "memory"
|
|
473 );
|
|
474 return 6*256;
|
|
475 }
|
|
476
|
|
477
|
|
478 static void* a52_resample_MMX(int flags, int ch){
|
|
479 switch (flags) {
|
|
480 case A52_MONO:
|
|
481 if(ch==5) return a52_resample_MONO_to_5_MMX;
|
|
482 break;
|
|
483 case A52_CHANNEL:
|
|
484 case A52_STEREO:
|
|
485 case A52_DOLBY:
|
|
486 if(ch==2) return a52_resample_STEREO_to_2_MMX;
|
|
487 break;
|
|
488 case A52_3F:
|
|
489 if(ch==5) return a52_resample_3F_to_5_MMX;
|
|
490 break;
|
|
491 case A52_2F2R:
|
|
492 if(ch==4) return a52_resample_2F_2R_to_4_MMX;
|
|
493 break;
|
|
494 case A52_3F2R:
|
|
495 if(ch==5) return a52_resample_3F_2R_to_5_MMX;
|
|
496 break;
|
|
497 case A52_MONO | A52_LFE:
|
|
498 if(ch==6) return a52_resample_MONO_LFE_to_6_MMX;
|
|
499 break;
|
|
500 case A52_CHANNEL | A52_LFE:
|
|
501 case A52_STEREO | A52_LFE:
|
|
502 case A52_DOLBY | A52_LFE:
|
|
503 if(ch==6) return a52_resample_STEREO_LFE_to_6_MMX;
|
|
504 break;
|
|
505 case A52_3F | A52_LFE:
|
|
506 if(ch==6) return a52_resample_3F_LFE_to_6_MMX;
|
|
507 break;
|
|
508 case A52_2F2R | A52_LFE:
|
|
509 if(ch==6) return a52_resample_2F_2R_LFE_to_6_MMX;
|
|
510 break;
|
|
511 case A52_3F2R | A52_LFE:
|
|
512 if(ch==6) return a52_resample_3F_2R_LFE_to_6_MMX;
|
|
513 break;
|
|
514 }
|
|
515 return NULL;
|
|
516 }
|
|
517
|
|
518
|