comparison liba52/resample_mmx.c @ 3626:e22ff7ebdc05

runtime cpu detection for the resample stuff
author michael
date Wed, 19 Dec 2001 20:20:06 +0000
parents 79759c05911e
children b11b15df02ed
comparison
equal deleted inserted replaced
3625:84ff13d4540c 3626:e22ff7ebdc05
14 (but that's just test.c so that needs to be tested in reality) 14 (but that's just test.c so that needs to be tested in reality)
15 and it would mean (C / MMX2 / MMX / 3DNOW) versions 15 and it would mean (C / MMX2 / MMX / 3DNOW) versions
16 */ 16 */
17 17
18 #include <inttypes.h> 18 #include <inttypes.h>
19 #include <stdio.h>
19 #include "a52.h" 20 #include "a52.h"
20 #include "../config.h" 21 #include "../config.h"
21 22 #include "../cpudetect.h"
22 #ifdef HAVE_MMX 23
24 int (* a52_resample) (float * _f, int16_t * s16)=NULL;
25
26 #ifdef ARCH_X86
23 static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL; 27 static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL;
24 static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL; 28 static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL;
25 static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL; 29 static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL;
26 static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL; 30 static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL;
27 #endif 31 #endif
37 } 41 }
38 42
39 static int chans=2; 43 static int chans=2;
40 static int flags=0; 44 static int flags=0;
41 45
42 void a52_resample_init(int _flags,int _chans){ 46 int a52_resample_C(float * _f, int16_t * s16)
43 chans=_chans;
44 flags=_flags;
45 }
46
47 int a52_resample(float * _f, int16_t * s16)
48 { 47 {
49 int i; 48 int i;
50 int32_t * f = (int32_t *) _f; 49 int32_t * f = (int32_t *) _f;
51 50
52 switch (flags) { 51 switch (flags) {
53 case A52_MONO: 52 case A52_MONO:
54 #ifdef HAVE_MMX 53 for (i = 0; i < 256; i++) {
54 s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0;
55 s16[5*i+4] = convert (f[i]);
56 }
57 break;
58 case A52_CHANNEL:
59 case A52_STEREO:
60 case A52_DOLBY:
61 for (i = 0; i < 256; i++) {
62 s16[2*i] = convert (f[i]);
63 s16[2*i+1] = convert (f[i+256]);
64 }
65 break;
66 case A52_3F:
67 for (i = 0; i < 256; i++) {
68 s16[5*i] = convert (f[i]);
69 s16[5*i+1] = convert (f[i+512]);
70 s16[5*i+2] = s16[5*i+3] = 0;
71 s16[5*i+4] = convert (f[i+256]);
72 }
73 break;
74 case A52_2F2R:
75 for (i = 0; i < 256; i++) {
76 s16[4*i] = convert (f[i]);
77 s16[4*i+1] = convert (f[i+256]);
78 s16[4*i+2] = convert (f[i+512]);
79 s16[4*i+3] = convert (f[i+768]);
80 }
81 break;
82 case A52_3F2R:
83 for (i = 0; i < 256; i++) {
84 s16[5*i] = convert (f[i]);
85 s16[5*i+1] = convert (f[i+512]);
86 s16[5*i+2] = convert (f[i+768]);
87 s16[5*i+3] = convert (f[i+1024]);
88 s16[5*i+4] = convert (f[i+256]);
89 }
90 break;
91 case A52_MONO | A52_LFE:
92 for (i = 0; i < 256; i++) {
93 s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0;
94 s16[6*i+4] = convert (f[i+256]);
95 s16[6*i+5] = convert (f[i]);
96 }
97 break;
98 case A52_CHANNEL | A52_LFE:
99 case A52_STEREO | A52_LFE:
100 case A52_DOLBY | A52_LFE:
101 for (i = 0; i < 256; i++) {
102 s16[6*i] = convert (f[i+256]);
103 s16[6*i+1] = convert (f[i+512]);
104 s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0;
105 s16[6*i+5] = convert (f[i]);
106 }
107 break;
108 case A52_3F | A52_LFE:
109 for (i = 0; i < 256; i++) {
110 s16[6*i] = convert (f[i+256]);
111 s16[6*i+1] = convert (f[i+768]);
112 s16[6*i+2] = s16[6*i+3] = 0;
113 s16[6*i+4] = convert (f[i+512]);
114 s16[6*i+5] = convert (f[i]);
115 }
116 break;
117 case A52_2F2R | A52_LFE:
118 for (i = 0; i < 256; i++) {
119 s16[6*i] = convert (f[i+256]);
120 s16[6*i+1] = convert (f[i+512]);
121 s16[6*i+2] = convert (f[i+768]);
122 s16[6*i+3] = convert (f[i+1024]);
123 s16[6*i+4] = 0;
124 s16[6*i+5] = convert (f[i]);
125 }
126 break;
127 case A52_3F2R | A52_LFE:
128 for (i = 0; i < 256; i++) {
129 s16[6*i] = convert (f[i+256]);
130 s16[6*i+1] = convert (f[i+768]);
131 s16[6*i+2] = convert (f[i+1024]);
132 s16[6*i+3] = convert (f[i+1280]);
133 s16[6*i+4] = convert (f[i+512]);
134 s16[6*i+5] = convert (f[i]);
135 }
136 break;
137 }
138 return chans*256;
139 }
140
141 #ifdef ARCH_X86
142 int a52_resample_MMX(float * _f, int16_t * s16)
143 {
144 int i;
145 int32_t * f = (int32_t *) _f;
146
147 switch (flags) {
148 case A52_MONO:
55 asm volatile( 149 asm volatile(
56 "movl $-512, %%esi \n\t" 150 "movl $-512, %%esi \n\t"
57 "movq magicF2W, %%mm7 \n\t" 151 "movq magicF2W, %%mm7 \n\t"
58 "movq wm1100, %%mm3 \n\t" 152 "movq wm1100, %%mm3 \n\t"
59 "movq wm0101, %%mm4 \n\t" 153 "movq wm0101, %%mm4 \n\t"
82 " jnz 1b \n\t" 176 " jnz 1b \n\t"
83 "emms \n\t" 177 "emms \n\t"
84 :: "r" (s16+1280), "r" (f+256) 178 :: "r" (s16+1280), "r" (f+256)
85 :"%esi", "%edi", "memory" 179 :"%esi", "%edi", "memory"
86 ); 180 );
87 #else
88 for (i = 0; i < 256; i++) {
89 s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0;
90 s16[5*i+4] = convert (f[i]);
91 }
92 #endif
93 break; 181 break;
94 case A52_CHANNEL: 182 case A52_CHANNEL:
95 case A52_STEREO: 183 case A52_STEREO:
96 case A52_DOLBY: 184 case A52_DOLBY:
97 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it 185 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it
110 " jnz 1b \n\t" 198 " jnz 1b \n\t"
111 "emms \n\t" 199 "emms \n\t"
112 :: "r" (s16+512), "r" (f+256) 200 :: "r" (s16+512), "r" (f+256)
113 :"%esi", "memory" 201 :"%esi", "memory"
114 );*/ 202 );*/
115 #ifdef HAVE_MMX
116 asm volatile( 203 asm volatile(
117 "movl $-1024, %%esi \n\t" 204 "movl $-1024, %%esi \n\t"
118 "movq magicF2W, %%mm7 \n\t" 205 "movq magicF2W, %%mm7 \n\t"
119 "1: \n\t" 206 "1: \n\t"
120 "movq (%1, %%esi), %%mm0 \n\t" 207 "movq (%1, %%esi), %%mm0 \n\t"
136 " jnz 1b \n\t" 223 " jnz 1b \n\t"
137 "emms \n\t" 224 "emms \n\t"
138 :: "r" (s16+512), "r" (f+256) 225 :: "r" (s16+512), "r" (f+256)
139 :"%esi", "memory" 226 :"%esi", "memory"
140 ); 227 );
141 #else 228 break;
142 for (i = 0; i < 256; i++) { 229 case A52_3F: //FIXME Optimize
143 s16[2*i] = convert (f[i]);
144 s16[2*i+1] = convert (f[i+256]);
145 }
146 #endif
147 break;
148 case A52_3F:
149 for (i = 0; i < 256; i++) { 230 for (i = 0; i < 256; i++) {
150 s16[5*i] = convert (f[i]); 231 s16[5*i] = convert (f[i]);
151 s16[5*i+1] = convert (f[i+512]); 232 s16[5*i+1] = convert (f[i+512]);
152 s16[5*i+2] = s16[5*i+3] = 0; 233 s16[5*i+2] = s16[5*i+3] = 0;
153 s16[5*i+4] = convert (f[i+256]); 234 s16[5*i+4] = convert (f[i+256]);
154 } 235 }
155 break; 236 break;
156 case A52_2F2R: 237 case A52_2F2R:
157 #ifdef HAVE_MMX
158 asm volatile( 238 asm volatile(
159 "movl $-1024, %%esi \n\t" 239 "movl $-1024, %%esi \n\t"
160 "movq magicF2W, %%mm7 \n\t" 240 "movq magicF2W, %%mm7 \n\t"
161 "1: \n\t" 241 "1: \n\t"
162 "movq (%1, %%esi), %%mm0 \n\t" 242 "movq (%1, %%esi), %%mm0 \n\t"
199 " jnz 1b \n\t" 279 " jnz 1b \n\t"
200 "emms \n\t" 280 "emms \n\t"
201 :: "r" (s16+1024), "r" (f+256) 281 :: "r" (s16+1024), "r" (f+256)
202 :"%esi", "memory" 282 :"%esi", "memory"
203 ); 283 );
204 #else 284 break;
205 for (i = 0; i < 256; i++) { 285 case A52_3F2R: //FIXME optimize
206 s16[4*i] = convert (f[i]);
207 s16[4*i+1] = convert (f[i+256]);
208 s16[4*i+2] = convert (f[i+512]);
209 s16[4*i+3] = convert (f[i+768]);
210 }
211 #endif
212 break;
213 case A52_3F2R:
214 for (i = 0; i < 256; i++) { 286 for (i = 0; i < 256; i++) {
215 s16[5*i] = convert (f[i]); 287 s16[5*i] = convert (f[i]);
216 s16[5*i+1] = convert (f[i+512]); 288 s16[5*i+1] = convert (f[i+512]);
217 s16[5*i+2] = convert (f[i+768]); 289 s16[5*i+2] = convert (f[i+768]);
218 s16[5*i+3] = convert (f[i+1024]); 290 s16[5*i+3] = convert (f[i+1024]);
219 s16[5*i+4] = convert (f[i+256]); 291 s16[5*i+4] = convert (f[i+256]);
220 } 292 }
221 break; 293 break;
222 case A52_MONO | A52_LFE: 294 case A52_MONO | A52_LFE:
223 #ifdef HAVE_MMX
224 asm volatile( 295 asm volatile(
225 "movl $-1024, %%esi \n\t" 296 "movl $-1024, %%esi \n\t"
226 "movq magicF2W, %%mm7 \n\t" 297 "movq magicF2W, %%mm7 \n\t"
227 "pxor %%mm6, %%mm6 \n\t" 298 "pxor %%mm6, %%mm6 \n\t"
228 "1: \n\t" 299 "1: \n\t"
254 " jnz 1b \n\t" 325 " jnz 1b \n\t"
255 "emms \n\t" 326 "emms \n\t"
256 :: "r" (s16+1536), "r" (f+256) 327 :: "r" (s16+1536), "r" (f+256)
257 :"%esi", "%edi", "memory" 328 :"%esi", "%edi", "memory"
258 ); 329 );
259 #else
260 for (i = 0; i < 256; i++) {
261 s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0;
262 s16[6*i+4] = convert (f[i+256]);
263 s16[6*i+5] = convert (f[i]);
264 }
265 #endif
266 break; 330 break;
267 case A52_CHANNEL | A52_LFE: 331 case A52_CHANNEL | A52_LFE:
268 case A52_STEREO | A52_LFE: 332 case A52_STEREO | A52_LFE:
269 case A52_DOLBY | A52_LFE: 333 case A52_DOLBY | A52_LFE:
270 #ifdef HAVE_MMX
271 asm volatile( 334 asm volatile(
272 "movl $-1024, %%esi \n\t" 335 "movl $-1024, %%esi \n\t"
273 "movq magicF2W, %%mm7 \n\t" 336 "movq magicF2W, %%mm7 \n\t"
274 "pxor %%mm6, %%mm6 \n\t" 337 "pxor %%mm6, %%mm6 \n\t"
275 "1: \n\t" 338 "1: \n\t"
299 " jnz 1b \n\t" 362 " jnz 1b \n\t"
300 "emms \n\t" 363 "emms \n\t"
301 :: "r" (s16+1536), "r" (f+256) 364 :: "r" (s16+1536), "r" (f+256)
302 :"%esi", "%edi", "memory" 365 :"%esi", "%edi", "memory"
303 ); 366 );
304 #else
305 for (i = 0; i < 256; i++) {
306 s16[6*i] = convert (f[i+256]);
307 s16[6*i+1] = convert (f[i+512]);
308 s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0;
309 s16[6*i+5] = convert (f[i]);
310 }
311 #endif
312 break; 367 break;
313 case A52_3F | A52_LFE: 368 case A52_3F | A52_LFE:
314 #ifdef HAVE_MMX
315 asm volatile( 369 asm volatile(
316 "movl $-1024, %%esi \n\t" 370 "movl $-1024, %%esi \n\t"
317 "movq magicF2W, %%mm7 \n\t" 371 "movq magicF2W, %%mm7 \n\t"
318 "pxor %%mm6, %%mm6 \n\t" 372 "pxor %%mm6, %%mm6 \n\t"
319 "1: \n\t" 373 "1: \n\t"
345 " jnz 1b \n\t" 399 " jnz 1b \n\t"
346 "emms \n\t" 400 "emms \n\t"
347 :: "r" (s16+1536), "r" (f+256) 401 :: "r" (s16+1536), "r" (f+256)
348 :"%esi", "%edi", "memory" 402 :"%esi", "%edi", "memory"
349 ); 403 );
350 #else
351 for (i = 0; i < 256; i++) {
352 s16[6*i] = convert (f[i+256]);
353 s16[6*i+1] = convert (f[i+768]);
354 s16[6*i+2] = s16[6*i+3] = 0;
355 s16[6*i+4] = convert (f[i+512]);
356 s16[6*i+5] = convert (f[i]);
357 }
358 #endif
359 break; 404 break;
360 case A52_2F2R | A52_LFE: 405 case A52_2F2R | A52_LFE:
361 #ifdef HAVE_MMX
362 asm volatile( 406 asm volatile(
363 "movl $-1024, %%esi \n\t" 407 "movl $-1024, %%esi \n\t"
364 "movq magicF2W, %%mm7 \n\t" 408 "movq magicF2W, %%mm7 \n\t"
365 // "pxor %%mm6, %%mm6 \n\t" 409 // "pxor %%mm6, %%mm6 \n\t"
366 "1: \n\t" 410 "1: \n\t"
398 " jnz 1b \n\t" 442 " jnz 1b \n\t"
399 "emms \n\t" 443 "emms \n\t"
400 :: "r" (s16+1536), "r" (f+256) 444 :: "r" (s16+1536), "r" (f+256)
401 :"%esi", "%edi", "memory" 445 :"%esi", "%edi", "memory"
402 ); 446 );
403 #else
404 for (i = 0; i < 256; i++) {
405 s16[6*i] = convert (f[i+256]);
406 s16[6*i+1] = convert (f[i+512]);
407 s16[6*i+2] = convert (f[i+768]);
408 s16[6*i+3] = convert (f[i+1024]);
409 s16[6*i+4] = 0;
410 s16[6*i+5] = convert (f[i]);
411 }
412 #endif
413 break; 447 break;
414 case A52_3F2R | A52_LFE: 448 case A52_3F2R | A52_LFE:
415 #ifdef HAVE_MMX
416 asm volatile( 449 asm volatile(
417 "movl $-1024, %%esi \n\t" 450 "movl $-1024, %%esi \n\t"
418 "movq magicF2W, %%mm7 \n\t" 451 "movq magicF2W, %%mm7 \n\t"
419 // "pxor %%mm6, %%mm6 \n\t" 452 // "pxor %%mm6, %%mm6 \n\t"
420 "1: \n\t" 453 "1: \n\t"
454 " jnz 1b \n\t" 487 " jnz 1b \n\t"
455 "emms \n\t" 488 "emms \n\t"
456 :: "r" (s16+1536), "r" (f+256) 489 :: "r" (s16+1536), "r" (f+256)
457 :"%esi", "%edi", "memory" 490 :"%esi", "%edi", "memory"
458 ); 491 );
459 #else
460 for (i = 0; i < 256; i++) {
461 s16[6*i] = convert (f[i+256]);
462 s16[6*i+1] = convert (f[i+768]);
463 s16[6*i+2] = convert (f[i+1024]);
464 s16[6*i+3] = convert (f[i+1280]);
465 s16[6*i+4] = convert (f[i+512]);
466 s16[6*i+5] = convert (f[i]);
467 }
468 #endif
469 break; 492 break;
470 } 493 }
471 return chans*256; 494 return chans*256;
472 } 495 }
473 496 #endif //arch_x86
497
498 void a52_resample_init(int _flags,int _chans){
499 chans=_chans;
500 flags=_flags;
501
502 if(a52_resample==NULL) // only once please ;)
503 {
504 if(gCpuCaps.hasMMX) fprintf(stderr, "Using MMX optimized resampler\n");
505 else fprintf(stderr, "No accelerated resampler found\n");
506 }
507
508 #ifdef ARCH_X86
509 if(gCpuCaps.hasMMX) a52_resample= a52_resample_MMX;
510 #else
511 if(0);
512 #endif
513 else a52_resample= a52_resample_C;
514 }
515