Mercurial > mplayer.hg
comparison liba52/resample_mmx.c @ 3626:e22ff7ebdc05
runtime cpu detection for the resample stuff
author | michael |
---|---|
date | Wed, 19 Dec 2001 20:20:06 +0000 |
parents | 79759c05911e |
children | b11b15df02ed |
comparison
equal
deleted
inserted
replaced
3625:84ff13d4540c | 3626:e22ff7ebdc05 |
---|---|
14 (but that's just test.c so that needs to be tested in reality) | 14 (but that's just test.c so that needs to be tested in reality) |
15 and it would mean (C / MMX2 / MMX / 3DNOW) versions | 15 and it would mean (C / MMX2 / MMX / 3DNOW) versions |
16 */ | 16 */ |
17 | 17 |
18 #include <inttypes.h> | 18 #include <inttypes.h> |
19 #include <stdio.h> | |
19 #include "a52.h" | 20 #include "a52.h" |
20 #include "../config.h" | 21 #include "../config.h" |
21 | 22 #include "../cpudetect.h" |
22 #ifdef HAVE_MMX | 23 |
24 int (* a52_resample) (float * _f, int16_t * s16)=NULL; | |
25 | |
26 #ifdef ARCH_X86 | |
23 static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL; | 27 static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL; |
24 static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL; | 28 static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL; |
25 static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL; | 29 static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL; |
26 static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL; | 30 static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL; |
27 #endif | 31 #endif |
37 } | 41 } |
38 | 42 |
39 static int chans=2; | 43 static int chans=2; |
40 static int flags=0; | 44 static int flags=0; |
41 | 45 |
42 void a52_resample_init(int _flags,int _chans){ | 46 int a52_resample_C(float * _f, int16_t * s16) |
43 chans=_chans; | |
44 flags=_flags; | |
45 } | |
46 | |
47 int a52_resample(float * _f, int16_t * s16) | |
48 { | 47 { |
49 int i; | 48 int i; |
50 int32_t * f = (int32_t *) _f; | 49 int32_t * f = (int32_t *) _f; |
51 | 50 |
52 switch (flags) { | 51 switch (flags) { |
53 case A52_MONO: | 52 case A52_MONO: |
54 #ifdef HAVE_MMX | 53 for (i = 0; i < 256; i++) { |
54 s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0; | |
55 s16[5*i+4] = convert (f[i]); | |
56 } | |
57 break; | |
58 case A52_CHANNEL: | |
59 case A52_STEREO: | |
60 case A52_DOLBY: | |
61 for (i = 0; i < 256; i++) { | |
62 s16[2*i] = convert (f[i]); | |
63 s16[2*i+1] = convert (f[i+256]); | |
64 } | |
65 break; | |
66 case A52_3F: | |
67 for (i = 0; i < 256; i++) { | |
68 s16[5*i] = convert (f[i]); | |
69 s16[5*i+1] = convert (f[i+512]); | |
70 s16[5*i+2] = s16[5*i+3] = 0; | |
71 s16[5*i+4] = convert (f[i+256]); | |
72 } | |
73 break; | |
74 case A52_2F2R: | |
75 for (i = 0; i < 256; i++) { | |
76 s16[4*i] = convert (f[i]); | |
77 s16[4*i+1] = convert (f[i+256]); | |
78 s16[4*i+2] = convert (f[i+512]); | |
79 s16[4*i+3] = convert (f[i+768]); | |
80 } | |
81 break; | |
82 case A52_3F2R: | |
83 for (i = 0; i < 256; i++) { | |
84 s16[5*i] = convert (f[i]); | |
85 s16[5*i+1] = convert (f[i+512]); | |
86 s16[5*i+2] = convert (f[i+768]); | |
87 s16[5*i+3] = convert (f[i+1024]); | |
88 s16[5*i+4] = convert (f[i+256]); | |
89 } | |
90 break; | |
91 case A52_MONO | A52_LFE: | |
92 for (i = 0; i < 256; i++) { | |
93 s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0; | |
94 s16[6*i+4] = convert (f[i+256]); | |
95 s16[6*i+5] = convert (f[i]); | |
96 } | |
97 break; | |
98 case A52_CHANNEL | A52_LFE: | |
99 case A52_STEREO | A52_LFE: | |
100 case A52_DOLBY | A52_LFE: | |
101 for (i = 0; i < 256; i++) { | |
102 s16[6*i] = convert (f[i+256]); | |
103 s16[6*i+1] = convert (f[i+512]); | |
104 s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0; | |
105 s16[6*i+5] = convert (f[i]); | |
106 } | |
107 break; | |
108 case A52_3F | A52_LFE: | |
109 for (i = 0; i < 256; i++) { | |
110 s16[6*i] = convert (f[i+256]); | |
111 s16[6*i+1] = convert (f[i+768]); | |
112 s16[6*i+2] = s16[6*i+3] = 0; | |
113 s16[6*i+4] = convert (f[i+512]); | |
114 s16[6*i+5] = convert (f[i]); | |
115 } | |
116 break; | |
117 case A52_2F2R | A52_LFE: | |
118 for (i = 0; i < 256; i++) { | |
119 s16[6*i] = convert (f[i+256]); | |
120 s16[6*i+1] = convert (f[i+512]); | |
121 s16[6*i+2] = convert (f[i+768]); | |
122 s16[6*i+3] = convert (f[i+1024]); | |
123 s16[6*i+4] = 0; | |
124 s16[6*i+5] = convert (f[i]); | |
125 } | |
126 break; | |
127 case A52_3F2R | A52_LFE: | |
128 for (i = 0; i < 256; i++) { | |
129 s16[6*i] = convert (f[i+256]); | |
130 s16[6*i+1] = convert (f[i+768]); | |
131 s16[6*i+2] = convert (f[i+1024]); | |
132 s16[6*i+3] = convert (f[i+1280]); | |
133 s16[6*i+4] = convert (f[i+512]); | |
134 s16[6*i+5] = convert (f[i]); | |
135 } | |
136 break; | |
137 } | |
138 return chans*256; | |
139 } | |
140 | |
141 #ifdef ARCH_X86 | |
142 int a52_resample_MMX(float * _f, int16_t * s16) | |
143 { | |
144 int i; | |
145 int32_t * f = (int32_t *) _f; | |
146 | |
147 switch (flags) { | |
148 case A52_MONO: | |
55 asm volatile( | 149 asm volatile( |
56 "movl $-512, %%esi \n\t" | 150 "movl $-512, %%esi \n\t" |
57 "movq magicF2W, %%mm7 \n\t" | 151 "movq magicF2W, %%mm7 \n\t" |
58 "movq wm1100, %%mm3 \n\t" | 152 "movq wm1100, %%mm3 \n\t" |
59 "movq wm0101, %%mm4 \n\t" | 153 "movq wm0101, %%mm4 \n\t" |
82 " jnz 1b \n\t" | 176 " jnz 1b \n\t" |
83 "emms \n\t" | 177 "emms \n\t" |
84 :: "r" (s16+1280), "r" (f+256) | 178 :: "r" (s16+1280), "r" (f+256) |
85 :"%esi", "%edi", "memory" | 179 :"%esi", "%edi", "memory" |
86 ); | 180 ); |
87 #else | |
88 for (i = 0; i < 256; i++) { | |
89 s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0; | |
90 s16[5*i+4] = convert (f[i]); | |
91 } | |
92 #endif | |
93 break; | 181 break; |
94 case A52_CHANNEL: | 182 case A52_CHANNEL: |
95 case A52_STEREO: | 183 case A52_STEREO: |
96 case A52_DOLBY: | 184 case A52_DOLBY: |
97 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it | 185 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it |
110 " jnz 1b \n\t" | 198 " jnz 1b \n\t" |
111 "emms \n\t" | 199 "emms \n\t" |
112 :: "r" (s16+512), "r" (f+256) | 200 :: "r" (s16+512), "r" (f+256) |
113 :"%esi", "memory" | 201 :"%esi", "memory" |
114 );*/ | 202 );*/ |
115 #ifdef HAVE_MMX | |
116 asm volatile( | 203 asm volatile( |
117 "movl $-1024, %%esi \n\t" | 204 "movl $-1024, %%esi \n\t" |
118 "movq magicF2W, %%mm7 \n\t" | 205 "movq magicF2W, %%mm7 \n\t" |
119 "1: \n\t" | 206 "1: \n\t" |
120 "movq (%1, %%esi), %%mm0 \n\t" | 207 "movq (%1, %%esi), %%mm0 \n\t" |
136 " jnz 1b \n\t" | 223 " jnz 1b \n\t" |
137 "emms \n\t" | 224 "emms \n\t" |
138 :: "r" (s16+512), "r" (f+256) | 225 :: "r" (s16+512), "r" (f+256) |
139 :"%esi", "memory" | 226 :"%esi", "memory" |
140 ); | 227 ); |
141 #else | 228 break; |
142 for (i = 0; i < 256; i++) { | 229 case A52_3F: //FIXME Optimize |
143 s16[2*i] = convert (f[i]); | |
144 s16[2*i+1] = convert (f[i+256]); | |
145 } | |
146 #endif | |
147 break; | |
148 case A52_3F: | |
149 for (i = 0; i < 256; i++) { | 230 for (i = 0; i < 256; i++) { |
150 s16[5*i] = convert (f[i]); | 231 s16[5*i] = convert (f[i]); |
151 s16[5*i+1] = convert (f[i+512]); | 232 s16[5*i+1] = convert (f[i+512]); |
152 s16[5*i+2] = s16[5*i+3] = 0; | 233 s16[5*i+2] = s16[5*i+3] = 0; |
153 s16[5*i+4] = convert (f[i+256]); | 234 s16[5*i+4] = convert (f[i+256]); |
154 } | 235 } |
155 break; | 236 break; |
156 case A52_2F2R: | 237 case A52_2F2R: |
157 #ifdef HAVE_MMX | |
158 asm volatile( | 238 asm volatile( |
159 "movl $-1024, %%esi \n\t" | 239 "movl $-1024, %%esi \n\t" |
160 "movq magicF2W, %%mm7 \n\t" | 240 "movq magicF2W, %%mm7 \n\t" |
161 "1: \n\t" | 241 "1: \n\t" |
162 "movq (%1, %%esi), %%mm0 \n\t" | 242 "movq (%1, %%esi), %%mm0 \n\t" |
199 " jnz 1b \n\t" | 279 " jnz 1b \n\t" |
200 "emms \n\t" | 280 "emms \n\t" |
201 :: "r" (s16+1024), "r" (f+256) | 281 :: "r" (s16+1024), "r" (f+256) |
202 :"%esi", "memory" | 282 :"%esi", "memory" |
203 ); | 283 ); |
204 #else | 284 break; |
205 for (i = 0; i < 256; i++) { | 285 case A52_3F2R: //FIXME optimitze |
206 s16[4*i] = convert (f[i]); | |
207 s16[4*i+1] = convert (f[i+256]); | |
208 s16[4*i+2] = convert (f[i+512]); | |
209 s16[4*i+3] = convert (f[i+768]); | |
210 } | |
211 #endif | |
212 break; | |
213 case A52_3F2R: | |
214 for (i = 0; i < 256; i++) { | 286 for (i = 0; i < 256; i++) { |
215 s16[5*i] = convert (f[i]); | 287 s16[5*i] = convert (f[i]); |
216 s16[5*i+1] = convert (f[i+512]); | 288 s16[5*i+1] = convert (f[i+512]); |
217 s16[5*i+2] = convert (f[i+768]); | 289 s16[5*i+2] = convert (f[i+768]); |
218 s16[5*i+3] = convert (f[i+1024]); | 290 s16[5*i+3] = convert (f[i+1024]); |
219 s16[5*i+4] = convert (f[i+256]); | 291 s16[5*i+4] = convert (f[i+256]); |
220 } | 292 } |
221 break; | 293 break; |
222 case A52_MONO | A52_LFE: | 294 case A52_MONO | A52_LFE: |
223 #ifdef HAVE_MMX | |
224 asm volatile( | 295 asm volatile( |
225 "movl $-1024, %%esi \n\t" | 296 "movl $-1024, %%esi \n\t" |
226 "movq magicF2W, %%mm7 \n\t" | 297 "movq magicF2W, %%mm7 \n\t" |
227 "pxor %%mm6, %%mm6 \n\t" | 298 "pxor %%mm6, %%mm6 \n\t" |
228 "1: \n\t" | 299 "1: \n\t" |
254 " jnz 1b \n\t" | 325 " jnz 1b \n\t" |
255 "emms \n\t" | 326 "emms \n\t" |
256 :: "r" (s16+1536), "r" (f+256) | 327 :: "r" (s16+1536), "r" (f+256) |
257 :"%esi", "%edi", "memory" | 328 :"%esi", "%edi", "memory" |
258 ); | 329 ); |
259 #else | |
260 for (i = 0; i < 256; i++) { | |
261 s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0; | |
262 s16[6*i+4] = convert (f[i+256]); | |
263 s16[6*i+5] = convert (f[i]); | |
264 } | |
265 #endif | |
266 break; | 330 break; |
267 case A52_CHANNEL | A52_LFE: | 331 case A52_CHANNEL | A52_LFE: |
268 case A52_STEREO | A52_LFE: | 332 case A52_STEREO | A52_LFE: |
269 case A52_DOLBY | A52_LFE: | 333 case A52_DOLBY | A52_LFE: |
270 #ifdef HAVE_MMX | |
271 asm volatile( | 334 asm volatile( |
272 "movl $-1024, %%esi \n\t" | 335 "movl $-1024, %%esi \n\t" |
273 "movq magicF2W, %%mm7 \n\t" | 336 "movq magicF2W, %%mm7 \n\t" |
274 "pxor %%mm6, %%mm6 \n\t" | 337 "pxor %%mm6, %%mm6 \n\t" |
275 "1: \n\t" | 338 "1: \n\t" |
299 " jnz 1b \n\t" | 362 " jnz 1b \n\t" |
300 "emms \n\t" | 363 "emms \n\t" |
301 :: "r" (s16+1536), "r" (f+256) | 364 :: "r" (s16+1536), "r" (f+256) |
302 :"%esi", "%edi", "memory" | 365 :"%esi", "%edi", "memory" |
303 ); | 366 ); |
304 #else | |
305 for (i = 0; i < 256; i++) { | |
306 s16[6*i] = convert (f[i+256]); | |
307 s16[6*i+1] = convert (f[i+512]); | |
308 s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0; | |
309 s16[6*i+5] = convert (f[i]); | |
310 } | |
311 #endif | |
312 break; | 367 break; |
313 case A52_3F | A52_LFE: | 368 case A52_3F | A52_LFE: |
314 #ifdef HAVE_MMX | |
315 asm volatile( | 369 asm volatile( |
316 "movl $-1024, %%esi \n\t" | 370 "movl $-1024, %%esi \n\t" |
317 "movq magicF2W, %%mm7 \n\t" | 371 "movq magicF2W, %%mm7 \n\t" |
318 "pxor %%mm6, %%mm6 \n\t" | 372 "pxor %%mm6, %%mm6 \n\t" |
319 "1: \n\t" | 373 "1: \n\t" |
345 " jnz 1b \n\t" | 399 " jnz 1b \n\t" |
346 "emms \n\t" | 400 "emms \n\t" |
347 :: "r" (s16+1536), "r" (f+256) | 401 :: "r" (s16+1536), "r" (f+256) |
348 :"%esi", "%edi", "memory" | 402 :"%esi", "%edi", "memory" |
349 ); | 403 ); |
350 #else | |
205 for (i = 0; i < 256; i++) { | 285 case A52_3F2R: //FIXME optimize |
352 s16[6*i] = convert (f[i+256]); | |
353 s16[6*i+1] = convert (f[i+768]); | |
354 s16[6*i+2] = s16[6*i+3] = 0; | |
355 s16[6*i+4] = convert (f[i+512]); | |
356 s16[6*i+5] = convert (f[i]); | |
357 } | |
358 #endif | |
359 break; | 404 break; |
360 case A52_2F2R | A52_LFE: | 405 case A52_2F2R | A52_LFE: |
361 #ifdef HAVE_MMX | |
362 asm volatile( | 406 asm volatile( |
363 "movl $-1024, %%esi \n\t" | 407 "movl $-1024, %%esi \n\t" |
364 "movq magicF2W, %%mm7 \n\t" | 408 "movq magicF2W, %%mm7 \n\t" |
365 // "pxor %%mm6, %%mm6 \n\t" | 409 // "pxor %%mm6, %%mm6 \n\t" |
366 "1: \n\t" | 410 "1: \n\t" |
398 " jnz 1b \n\t" | 442 " jnz 1b \n\t" |
399 "emms \n\t" | 443 "emms \n\t" |
400 :: "r" (s16+1536), "r" (f+256) | 444 :: "r" (s16+1536), "r" (f+256) |
401 :"%esi", "%edi", "memory" | 445 :"%esi", "%edi", "memory" |
402 ); | 446 ); |
403 #else | |
404 for (i = 0; i < 256; i++) { | |
405 s16[6*i] = convert (f[i+256]); | |
406 s16[6*i+1] = convert (f[i+512]); | |
407 s16[6*i+2] = convert (f[i+768]); | |
408 s16[6*i+3] = convert (f[i+1024]); | |
409 s16[6*i+4] = 0; | |
410 s16[6*i+5] = convert (f[i]); | |
411 } | |
412 #endif | |
413 break; | 447 break; |
414 case A52_3F2R | A52_LFE: | 448 case A52_3F2R | A52_LFE: |
415 #ifdef HAVE_MMX | |
416 asm volatile( | 449 asm volatile( |
417 "movl $-1024, %%esi \n\t" | 450 "movl $-1024, %%esi \n\t" |
418 "movq magicF2W, %%mm7 \n\t" | 451 "movq magicF2W, %%mm7 \n\t" |
419 // "pxor %%mm6, %%mm6 \n\t" | 452 // "pxor %%mm6, %%mm6 \n\t" |
420 "1: \n\t" | 453 "1: \n\t" |
454 " jnz 1b \n\t" | 487 " jnz 1b \n\t" |
455 "emms \n\t" | 488 "emms \n\t" |
456 :: "r" (s16+1536), "r" (f+256) | 489 :: "r" (s16+1536), "r" (f+256) |
457 :"%esi", "%edi", "memory" | 490 :"%esi", "%edi", "memory" |
458 ); | 491 ); |
459 #else | |
460 for (i = 0; i < 256; i++) { | |
461 s16[6*i] = convert (f[i+256]); | |
462 s16[6*i+1] = convert (f[i+768]); | |
463 s16[6*i+2] = convert (f[i+1024]); | |
464 s16[6*i+3] = convert (f[i+1280]); | |
465 s16[6*i+4] = convert (f[i+512]); | |
466 s16[6*i+5] = convert (f[i]); | |
467 } | |
468 #endif | |
469 break; | 492 break; |
470 } | 493 } |
471 return chans*256; | 494 return chans*256; |
472 } | 495 } |
473 | 496 #endif //arch_x86 |
497 | |
498 void a52_resample_init(int _flags,int _chans){ | |
499 chans=_chans; | |
500 flags=_flags; | |
501 | |
502 if(a52_resample==NULL) // only once please ;) | |
503 { | |
504 if(gCpuCaps.hasMMX) fprintf(stderr, "Using MMX optimized resampler\n"); | |
505 else fprintf(stderr, "No accelerated resampler found\n"); | |
506 } | |
507 | |
508 #ifdef ARCH_X86 | |
509 if(gCpuCaps.hasMMX) a52_resample= a52_resample_MMX; | |
510 #else | |
511 if(0); | |
512 #endif | |
513 else a52_resample= a52_resample_C; | |
514 } | |
515 |