Mercurial > mplayer.hg
comparison liba52/resample_mmx.c @ 3909:ef32c8bdee81
c, mmx versions separated. a52 style runtime stuff
author | arpi |
---|---|
date | Sun, 30 Dec 2001 21:44:10 +0000 |
parents | 0cc94b1eec0f |
children | 2dbd637ffe05 |
comparison
equal
deleted
inserted
replaced
3908:0cc94b1eec0f | 3909:ef32c8bdee81 |
---|---|
1 // this code comes from a52dec/libao/audio_out_oss.c | |
2 | |
3 // FIXME FIXME FIXME | |
4 | |
5 // a52_resample_init should find the requested converter (from type flags -> | |
6 // given number of channels) and set up some function pointers... | |
7 | |
8 // a52_resample() should do the conversion. | |
9 | 1 |
10 // MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL) | 2 // MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL) |
11 | 3 |
12 /* optimization TODO / NOTES | 4 /* optimization TODO / NOTES |
13 movntq is slightly faster (0.5% with the current test.c benchmark) | 5 movntq is slightly faster (0.5% with the current test.c benchmark) |
14 (but that's just test.c, so that needs to be tested in reality) | 6 (but that's just test.c, so that needs to be tested in reality) |
15 and it would mean (C / MMX2 / MMX / 3DNOW) versions | 7 and it would mean (C / MMX2 / MMX / 3DNOW) versions |
16 */ | 8 */ |
17 | 9 |
18 #include <inttypes.h> | |
19 #include <stdio.h> | |
20 #include "a52.h" | |
21 #include "mm_accel.h" | |
22 #include "../config.h" | |
23 | |
24 int (* a52_resample) (float * _f, int16_t * s16)=NULL; | |
25 | |
26 #ifdef ARCH_X86 | |
27 static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL; | 10 static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL; |
28 static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL; | 11 static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL; |
29 static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL; | 12 static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL; |
30 static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL; | 13 static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL; |
31 #endif | 14 |
32 | 15 static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){ |
33 static inline int16_t convert (int32_t i) | 16 int i; |
34 { | 17 int32_t * f = (int32_t *) _f; |
35 if (i > 0x43c07fff) | |
36 return 32767; | |
37 else if (i < 0x43bf8000) | |
38 return -32768; | |
39 else | |
40 return i - 0x43c00000; | |
41 } | |
42 | |
43 static int chans=2; | |
44 static int flags=0; | |
45 | |
46 int a52_resample_C(float * _f, int16_t * s16) | |
47 { | |
48 int i; | |
49 int32_t * f = (int32_t *) _f; | |
50 | |
51 switch (flags) { | |
52 case A52_MONO: | |
53 for (i = 0; i < 256; i++) { | |
54 s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0; | |
55 s16[5*i+4] = convert (f[i]); | |
56 } | |
57 break; | |
58 case A52_CHANNEL: | |
59 case A52_STEREO: | |
60 case A52_DOLBY: | |
61 for (i = 0; i < 256; i++) { | |
62 s16[2*i] = convert (f[i]); | |
63 s16[2*i+1] = convert (f[i+256]); | |
64 } | |
65 break; | |
66 case A52_3F: | |
67 for (i = 0; i < 256; i++) { | |
68 s16[5*i] = convert (f[i]); | |
69 s16[5*i+1] = convert (f[i+512]); | |
70 s16[5*i+2] = s16[5*i+3] = 0; | |
71 s16[5*i+4] = convert (f[i+256]); | |
72 } | |
73 break; | |
74 case A52_2F2R: | |
75 for (i = 0; i < 256; i++) { | |
76 s16[4*i] = convert (f[i]); | |
77 s16[4*i+1] = convert (f[i+256]); | |
78 s16[4*i+2] = convert (f[i+512]); | |
79 s16[4*i+3] = convert (f[i+768]); | |
80 } | |
81 break; | |
82 case A52_3F2R: | |
83 for (i = 0; i < 256; i++) { | |
84 s16[5*i] = convert (f[i]); | |
85 s16[5*i+1] = convert (f[i+512]); | |
86 s16[5*i+2] = convert (f[i+768]); | |
87 s16[5*i+3] = convert (f[i+1024]); | |
88 s16[5*i+4] = convert (f[i+256]); | |
89 } | |
90 break; | |
91 case A52_MONO | A52_LFE: | |
92 for (i = 0; i < 256; i++) { | |
93 s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0; | |
94 s16[6*i+4] = convert (f[i+256]); | |
95 s16[6*i+5] = convert (f[i]); | |
96 } | |
97 break; | |
98 case A52_CHANNEL | A52_LFE: | |
99 case A52_STEREO | A52_LFE: | |
100 case A52_DOLBY | A52_LFE: | |
101 for (i = 0; i < 256; i++) { | |
102 s16[6*i] = convert (f[i+256]); | |
103 s16[6*i+1] = convert (f[i+512]); | |
104 s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0; | |
105 s16[6*i+5] = convert (f[i]); | |
106 } | |
107 break; | |
108 case A52_3F | A52_LFE: | |
109 for (i = 0; i < 256; i++) { | |
110 s16[6*i] = convert (f[i+256]); | |
111 s16[6*i+1] = convert (f[i+768]); | |
112 s16[6*i+2] = s16[6*i+3] = 0; | |
113 s16[6*i+4] = convert (f[i+512]); | |
114 s16[6*i+5] = convert (f[i]); | |
115 } | |
116 break; | |
117 case A52_2F2R | A52_LFE: | |
118 for (i = 0; i < 256; i++) { | |
119 s16[6*i] = convert (f[i+256]); | |
120 s16[6*i+1] = convert (f[i+512]); | |
121 s16[6*i+2] = convert (f[i+768]); | |
122 s16[6*i+3] = convert (f[i+1024]); | |
123 s16[6*i+4] = 0; | |
124 s16[6*i+5] = convert (f[i]); | |
125 } | |
126 break; | |
127 case A52_3F2R | A52_LFE: | |
128 for (i = 0; i < 256; i++) { | |
129 s16[6*i] = convert (f[i+256]); | |
130 s16[6*i+1] = convert (f[i+768]); | |
131 s16[6*i+2] = convert (f[i+1024]); | |
132 s16[6*i+3] = convert (f[i+1280]); | |
133 s16[6*i+4] = convert (f[i+512]); | |
134 s16[6*i+5] = convert (f[i]); | |
135 } | |
136 break; | |
137 } | |
138 return chans*256; | |
139 } | |
140 | |
141 #ifdef ARCH_X86 | |
142 int a52_resample_MMX(float * _f, int16_t * s16) | |
143 { | |
144 int i; | |
145 int32_t * f = (int32_t *) _f; | |
146 | |
147 switch (flags) { | |
148 case A52_MONO: | |
149 asm volatile( | 18 asm volatile( |
150 "movl $-512, %%esi \n\t" | 19 "movl $-512, %%esi \n\t" |
151 "movq magicF2W, %%mm7 \n\t" | 20 "movq magicF2W, %%mm7 \n\t" |
152 "movq wm1100, %%mm3 \n\t" | 21 "movq wm1100, %%mm3 \n\t" |
153 "movq wm0101, %%mm4 \n\t" | 22 "movq wm0101, %%mm4 \n\t" |
176 " jnz 1b \n\t" | 45 " jnz 1b \n\t" |
177 "emms \n\t" | 46 "emms \n\t" |
178 :: "r" (s16+1280), "r" (f+256) | 47 :: "r" (s16+1280), "r" (f+256) |
179 :"%esi", "%edi", "memory" | 48 :"%esi", "%edi", "memory" |
180 ); | 49 ); |
181 break; | 50 return 5*256; |
182 case A52_CHANNEL: | 51 } |
183 case A52_STEREO: | 52 |
184 case A52_DOLBY: | 53 static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){ |
54 int i; | |
55 int32_t * f = (int32_t *) _f; | |
185 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it | 56 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it |
186 #ifdef HAVE_SSE | 57 #ifdef HAVE_SSE |
187 asm volatile( | 58 asm volatile( |
188 "movl $-1024, %%esi \n\t" | 59 "movl $-1024, %%esi \n\t" |
189 "1: \n\t" | 60 "1: \n\t" |
223 " jnz 1b \n\t" | 94 " jnz 1b \n\t" |
224 "emms \n\t" | 95 "emms \n\t" |
225 :: "r" (s16+512), "r" (f+256) | 96 :: "r" (s16+512), "r" (f+256) |
226 :"%esi", "memory" | 97 :"%esi", "memory" |
227 ); | 98 ); |
228 break; | 99 return 2*256; |
229 case A52_3F: | 100 } |
101 | |
102 static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){ | |
103 int i; | |
104 int32_t * f = (int32_t *) _f; | |
230 asm volatile( | 105 asm volatile( |
231 "movl $-1024, %%esi \n\t" | 106 "movl $-1024, %%esi \n\t" |
232 "movq magicF2W, %%mm7 \n\t" | 107 "movq magicF2W, %%mm7 \n\t" |
233 "pxor %%mm6, %%mm6 \n\t" | 108 "pxor %%mm6, %%mm6 \n\t" |
234 "movq %%mm7, %%mm5 \n\t" | 109 "movq %%mm7, %%mm5 \n\t" |
275 " jnz 1b \n\t" | 150 " jnz 1b \n\t" |
276 "emms \n\t" | 151 "emms \n\t" |
277 :: "r" (s16+1280), "r" (f+256) | 152 :: "r" (s16+1280), "r" (f+256) |
278 :"%esi", "%edi", "memory" | 153 :"%esi", "%edi", "memory" |
279 ); | 154 ); |
280 break; | 155 return 5*256; |
281 case A52_2F2R: | 156 } |
157 | |
158 static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){ | |
159 int i; | |
160 int32_t * f = (int32_t *) _f; | |
282 asm volatile( | 161 asm volatile( |
283 "movl $-1024, %%esi \n\t" | 162 "movl $-1024, %%esi \n\t" |
284 "movq magicF2W, %%mm7 \n\t" | 163 "movq magicF2W, %%mm7 \n\t" |
285 "1: \n\t" | 164 "1: \n\t" |
286 "movq (%1, %%esi), %%mm0 \n\t" | 165 "movq (%1, %%esi), %%mm0 \n\t" |
323 " jnz 1b \n\t" | 202 " jnz 1b \n\t" |
324 "emms \n\t" | 203 "emms \n\t" |
325 :: "r" (s16+1024), "r" (f+256) | 204 :: "r" (s16+1024), "r" (f+256) |
326 :"%esi", "memory" | 205 :"%esi", "memory" |
327 ); | 206 ); |
328 break; | 207 return 4*256; |
329 case A52_3F2R: | 208 } |
209 | |
210 static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){ | |
211 int i; | |
212 int32_t * f = (int32_t *) _f; | |
330 asm volatile( | 213 asm volatile( |
331 "movl $-1024, %%esi \n\t" | 214 "movl $-1024, %%esi \n\t" |
332 "movq magicF2W, %%mm7 \n\t" | 215 "movq magicF2W, %%mm7 \n\t" |
333 "1: \n\t" | 216 "1: \n\t" |
334 "movd (%1, %%esi), %%mm0 \n\t" | 217 "movd (%1, %%esi), %%mm0 \n\t" |
379 " jnz 1b \n\t" | 262 " jnz 1b \n\t" |
380 "emms \n\t" | 263 "emms \n\t" |
381 :: "r" (s16+1280), "r" (f+256) | 264 :: "r" (s16+1280), "r" (f+256) |
382 :"%esi", "%edi", "memory" | 265 :"%esi", "%edi", "memory" |
383 ); | 266 ); |
384 break; | 267 return 5*256; |
385 case A52_MONO | A52_LFE: | 268 } |
269 | |
270 static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){ | |
271 int i; | |
272 int32_t * f = (int32_t *) _f; | |
386 asm volatile( | 273 asm volatile( |
387 "movl $-1024, %%esi \n\t" | 274 "movl $-1024, %%esi \n\t" |
388 "movq magicF2W, %%mm7 \n\t" | 275 "movq magicF2W, %%mm7 \n\t" |
389 "pxor %%mm6, %%mm6 \n\t" | 276 "pxor %%mm6, %%mm6 \n\t" |
390 "1: \n\t" | 277 "1: \n\t" |
416 " jnz 1b \n\t" | 303 " jnz 1b \n\t" |
417 "emms \n\t" | 304 "emms \n\t" |
418 :: "r" (s16+1536), "r" (f+256) | 305 :: "r" (s16+1536), "r" (f+256) |
419 :"%esi", "%edi", "memory" | 306 :"%esi", "%edi", "memory" |
420 ); | 307 ); |
421 break; | 308 return 6*256; |
422 case A52_CHANNEL | A52_LFE: | 309 } |
423 case A52_STEREO | A52_LFE: | 310 |
424 case A52_DOLBY | A52_LFE: | 311 static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){ |
312 int i; | |
313 int32_t * f = (int32_t *) _f; | |
425 asm volatile( | 314 asm volatile( |
426 "movl $-1024, %%esi \n\t" | 315 "movl $-1024, %%esi \n\t" |
427 "movq magicF2W, %%mm7 \n\t" | 316 "movq magicF2W, %%mm7 \n\t" |
428 "pxor %%mm6, %%mm6 \n\t" | 317 "pxor %%mm6, %%mm6 \n\t" |
429 "1: \n\t" | 318 "1: \n\t" |
453 " jnz 1b \n\t" | 342 " jnz 1b \n\t" |
454 "emms \n\t" | 343 "emms \n\t" |
455 :: "r" (s16+1536), "r" (f+256) | 344 :: "r" (s16+1536), "r" (f+256) |
456 :"%esi", "%edi", "memory" | 345 :"%esi", "%edi", "memory" |
457 ); | 346 ); |
458 break; | 347 return 6*256; |
459 case A52_3F | A52_LFE: | 348 } |
349 | |
350 static int a52_resample_3F_LFE_to_6_MMX(float * _f, int16_t * s16){ | |
351 int i; | |
352 int32_t * f = (int32_t *) _f; | |
460 asm volatile( | 353 asm volatile( |
461 "movl $-1024, %%esi \n\t" | 354 "movl $-1024, %%esi \n\t" |
462 "movq magicF2W, %%mm7 \n\t" | 355 "movq magicF2W, %%mm7 \n\t" |
463 "pxor %%mm6, %%mm6 \n\t" | 356 "pxor %%mm6, %%mm6 \n\t" |
464 "1: \n\t" | 357 "1: \n\t" |
490 " jnz 1b \n\t" | 383 " jnz 1b \n\t" |
491 "emms \n\t" | 384 "emms \n\t" |
492 :: "r" (s16+1536), "r" (f+256) | 385 :: "r" (s16+1536), "r" (f+256) |
493 :"%esi", "%edi", "memory" | 386 :"%esi", "%edi", "memory" |
494 ); | 387 ); |
495 break; | 388 return 6*256; |
496 case A52_2F2R | A52_LFE: | 389 } |
390 | |
391 static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){ | |
392 int i; | |
393 int32_t * f = (int32_t *) _f; | |
497 asm volatile( | 394 asm volatile( |
498 "movl $-1024, %%esi \n\t" | 395 "movl $-1024, %%esi \n\t" |
499 "movq magicF2W, %%mm7 \n\t" | 396 "movq magicF2W, %%mm7 \n\t" |
500 // "pxor %%mm6, %%mm6 \n\t" | 397 // "pxor %%mm6, %%mm6 \n\t" |
501 "1: \n\t" | 398 "1: \n\t" |
533 " jnz 1b \n\t" | 430 " jnz 1b \n\t" |
534 "emms \n\t" | 431 "emms \n\t" |
535 :: "r" (s16+1536), "r" (f+256) | 432 :: "r" (s16+1536), "r" (f+256) |
536 :"%esi", "%edi", "memory" | 433 :"%esi", "%edi", "memory" |
537 ); | 434 ); |
538 break; | 435 return 6*256; |
539 case A52_3F2R | A52_LFE: | 436 } |
437 | |
438 static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){ | |
439 int i; | |
440 int32_t * f = (int32_t *) _f; | |
540 asm volatile( | 441 asm volatile( |
541 "movl $-1024, %%esi \n\t" | 442 "movl $-1024, %%esi \n\t" |
542 "movq magicF2W, %%mm7 \n\t" | 443 "movq magicF2W, %%mm7 \n\t" |
543 // "pxor %%mm6, %%mm6 \n\t" | 444 // "pxor %%mm6, %%mm6 \n\t" |
544 "1: \n\t" | 445 "1: \n\t" |
578 " jnz 1b \n\t" | 479 " jnz 1b \n\t" |
579 "emms \n\t" | 480 "emms \n\t" |
580 :: "r" (s16+1536), "r" (f+256) | 481 :: "r" (s16+1536), "r" (f+256) |
581 :"%esi", "%edi", "memory" | 482 :"%esi", "%edi", "memory" |
582 ); | 483 ); |
484 return 6*256; | |
485 } | |
486 | |
487 | |
488 static void* a52_resample_MMX(int flags, int ch){ | |
489 switch (flags) { | |
490 case A52_MONO: | |
491 if(ch==5) return a52_resample_MONO_to_5_MMX; | |
492 break; | |
493 case A52_CHANNEL: | |
494 case A52_STEREO: | |
495 case A52_DOLBY: | |
496 if(ch==2) return a52_resample_STEREO_to_2_MMX; | |
497 break; | |
498 case A52_3F: | |
499 if(ch==5) return a52_resample_3F_to_5_MMX; | |
500 break; | |
501 case A52_2F2R: | |
502 if(ch==4) return a52_resample_2F_2R_to_4_MMX; | |
503 break; | |
504 case A52_3F2R: | |
505 if(ch==5) return a52_resample_3F_2R_to_5_MMX; | |
506 break; | |
507 case A52_MONO | A52_LFE: | |
508 if(ch==6) return a52_resample_MONO_LFE_to_6_MMX; | |
509 break; | |
510 case A52_CHANNEL | A52_LFE: | |
511 case A52_STEREO | A52_LFE: | |
512 case A52_DOLBY | A52_LFE: | |
513 if(ch==6) return a52_resample_STEREO_LFE_to_6_MMX; | |
514 break; | |
515 case A52_3F | A52_LFE: | |
516 if(ch==6) return a52_resample_3F_LFE_to_6_MMX; | |
517 break; | |
518 case A52_2F2R | A52_LFE: | |
519 if(ch==6) return a52_resample_2F_2R_LFE_to_6_MMX; | |
520 break; | |
521 case A52_3F2R | A52_LFE: | |
522 if(ch==6) return a52_resample_3F_2R_LFE_to_6_MMX; | |
583 break; | 523 break; |
584 } | 524 } |
585 return chans*256; | 525 return NULL; |
586 } | 526 } |
587 #endif //arch_x86 | 527 |
588 | 528 |
589 void a52_resample_init(uint32_t mm_accel,int _flags,int _chans){ | |
590 chans=_chans; | |
591 flags=_flags; | |
592 | |
593 if(a52_resample==NULL) // only once please ;) | |
594 { | |
595 if(mm_accel & MM_ACCEL_X86_MMX) fprintf(stderr, "Using MMX optimized resampler\n"); | |
596 else fprintf(stderr, "No accelerated resampler found\n"); | |
597 } | |
598 | |
599 #ifdef ARCH_X86 | |
600 if(mm_accel & MM_ACCEL_X86_MMX) a52_resample= a52_resample_MMX; | |
601 #else | |
602 if(0); | |
603 #endif | |
604 else a52_resample= a52_resample_C; | |
605 } | |
606 |