comparison liba52/resample_mmx.c @ 3909:ef32c8bdee81

c, mmx versions separated. a52 style runtime stuff
author arpi
date Sun, 30 Dec 2001 21:44:10 +0000
parents 0cc94b1eec0f
children 2dbd637ffe05
comparison
equal deleted inserted replaced
3908:0cc94b1eec0f 3909:ef32c8bdee81
1 // this code comes from a52dec/libao/audio_out_oss.c
2
3 // FIXME FIXME FIXME
4
5 // a52_resample_init should find the requested converter (from type flags ->
6 // given number of channels) and set up some function pointers...
7
8 // a52_resample() should do the conversion.
9 1
10 // MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL) 2 // MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL)
11 3
12 /* optimization TODO / NOTES 4 /* optimization TODO / NOTES
13 movntq is slightly faster (0.5% with the current test.c benchmark) 5 movntq is slightly faster (0.5% with the current test.c benchmark)
14 (but that's just test.c, so it needs to be tested in reality) 6 (but that's just test.c, so it needs to be tested in reality)
15 and it would mean (C / MMX2 / MMX / 3DNOW) versions 7 and it would mean (C / MMX2 / MMX / 3DNOW) versions
16 */ 8 */
17 9
18 #include <inttypes.h>
19 #include <stdio.h>
20 #include "a52.h"
21 #include "mm_accel.h"
22 #include "../config.h"
23
/* Global hook through which callers run the active float -> int16_t converter.
 * It starts out NULL; a52_resample_init() points it at a52_resample_MMX or
 * a52_resample_C. Both of those report chans*256 as their return value. */
24 int (* a52_resample) (float * _f, int16_t * s16)=NULL;
25
/* 8-byte-aligned 64-bit constants referenced by name from the inline asm.
 *  magicF2W - 0x43c00000 replicated in both 32-bit halves; this is the same
 *             constant convert() subtracts in the C path. The asm loads it
 *             into %mm7.
 *  wm1010   - 16-bit words 3 and 1 all ones, words 2 and 0 zero.
 *  wm0101   - 16-bit words 2 and 0 all ones, words 3 and 1 zero.
 *  wm1100   - upper 32 bits all ones, lower 32 bits zero.
 * Presumably the wm* values are word-select masks; only the loads of wm1100
 * (%mm3) and wm0101 (%mm4) are visible here - TODO confirm against the full
 * asm bodies. */
26 #ifdef ARCH_X86
27 static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL; 10 static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL;
28 static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL; 11 static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL;
29 static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL; 12 static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL;
30 static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL; 13 static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL;
31 #endif 14
32 15 static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){
/* Turn one sample, given as the raw 32-bit pattern of a float, into a signed
 * 16-bit value with saturation.
 *
 * The callers read the float buffer through an int32_t pointer, so `i` is a
 * bit pattern, not a numeric sample. 0x43c00000 is the IEEE-754
 * single-precision encoding of 384.0f. Patterns in [0x43bf8000, 0x43c07fff]
 * map linearly onto -32768..32767 by subtracting that constant; anything
 * above clamps to 32767 and anything below clamps to -32768.
 * Assumes the decoder output is biased by 384 so that valid samples land in
 * this window - TODO confirm against the caller that sets the bias.
 *
 * NOTE(review): the trailing "16 int i;" and "17 int32_t * f ..." text on the
 * first two lines is the right-hand (new revision) column of the comparison
 * view, not part of convert(). */
33 static inline int16_t convert (int32_t i) 16 int i;
34 { 17 int32_t * f = (int32_t *) _f;
35 if (i > 0x43c07fff)
36 return 32767;
37 else if (i < 0x43bf8000)
38 return -32768;
39 else
40 return i - 0x43c00000;
41 }
42
/* Converter configuration shared by the resamplers; a52_resample_init()
 * overwrites both on every call. `chans` only feeds the return value
 * (chans*256); `flags` selects the switch case that runs. */
43 static int chans=2;
44 static int flags=0;
45
/*
 * Portable C converter (left-hand / old revision of this comparison).
 *
 * Input:  _f  - planar samples, 256 per plane; plane k starts at f[256*k].
 *               The buffer is reinterpreted as int32_t so that convert()
 *               receives raw float bit patterns.
 * Output: s16 - interleaved 16-bit frames. The frame width (2, 4, 5 or 6) is
 *               fixed by the switch case, not by `chans`.
 * Return: chans*256, whatever the switch wrote. If `flags` matches no case,
 *         nothing is written and the same value is still returned.
 *
 * The per-case comments name source planes by index only. Which speaker each
 * plane or output slot represents is not visible in this file - presumably
 * it follows liba52's channel order with the LFE plane first when A52_LFE is
 * set; TODO confirm.
 *
 * NOTE(review): reading a float buffer through an int32_t pointer is a
 * strict-aliasing violation in ISO C; it relies on the compiler tolerating
 * the pun.
 * NOTE(review): 5-wide and 6-wide cases write 1280 / 1536 samples; the
 * returned count only matches if the caller passed the corresponding channel
 * count to a52_resample_init() - verify against the caller.
 */
46 int a52_resample_C(float * _f, int16_t * s16)
47 {
48 int i;
49 int32_t * f = (int32_t *) _f;
50
51 switch (flags) {
/* 1 plane -> 5-wide frame: slots 0-3 silent, slot 4 = plane 0 */
52 case A52_MONO:
53 for (i = 0; i < 256; i++) {
54 s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0;
55 s16[5*i+4] = convert (f[i]);
56 }
57 break;
/* 2 planes -> 2-wide frame: slot 0 = plane 0, slot 1 = plane 1 */
58 case A52_CHANNEL:
59 case A52_STEREO:
60 case A52_DOLBY:
61 for (i = 0; i < 256; i++) {
62 s16[2*i] = convert (f[i]);
63 s16[2*i+1] = convert (f[i+256]);
64 }
65 break;
/* 3 planes -> 5-wide frame: slots 0,1 = planes 0,2; slots 2-3 silent; slot 4 = plane 1 */
66 case A52_3F:
67 for (i = 0; i < 256; i++) {
68 s16[5*i] = convert (f[i]);
69 s16[5*i+1] = convert (f[i+512]);
70 s16[5*i+2] = s16[5*i+3] = 0;
71 s16[5*i+4] = convert (f[i+256]);
72 }
73 break;
/* 4 planes -> 4-wide frame, planes copied in order */
74 case A52_2F2R:
75 for (i = 0; i < 256; i++) {
76 s16[4*i] = convert (f[i]);
77 s16[4*i+1] = convert (f[i+256]);
78 s16[4*i+2] = convert (f[i+512]);
79 s16[4*i+3] = convert (f[i+768]);
80 }
81 break;
/* 5 planes -> 5-wide frame: slots 0-3 = planes 0,2,3,4; slot 4 = plane 1 */
82 case A52_3F2R:
83 for (i = 0; i < 256; i++) {
84 s16[5*i] = convert (f[i]);
85 s16[5*i+1] = convert (f[i+512]);
86 s16[5*i+2] = convert (f[i+768]);
87 s16[5*i+3] = convert (f[i+1024]);
88 s16[5*i+4] = convert (f[i+256]);
89 }
90 break;
/* All A52_LFE variants below use a 6-wide frame with plane 0 in slot 5. */
/* 2 planes: slots 0-3 silent, slot 4 = plane 1 */
91 case A52_MONO | A52_LFE:
92 for (i = 0; i < 256; i++) {
93 s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0;
94 s16[6*i+4] = convert (f[i+256]);
95 s16[6*i+5] = convert (f[i]);
96 }
97 break;
/* 3 planes: slots 0,1 = planes 1,2; slots 2-4 silent */
98 case A52_CHANNEL | A52_LFE:
99 case A52_STEREO | A52_LFE:
100 case A52_DOLBY | A52_LFE:
101 for (i = 0; i < 256; i++) {
102 s16[6*i] = convert (f[i+256]);
103 s16[6*i+1] = convert (f[i+512]);
104 s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0;
105 s16[6*i+5] = convert (f[i]);
106 }
107 break;
/* 4 planes: slots 0,1 = planes 1,3; slots 2-3 silent; slot 4 = plane 2 */
108 case A52_3F | A52_LFE:
109 for (i = 0; i < 256; i++) {
110 s16[6*i] = convert (f[i+256]);
111 s16[6*i+1] = convert (f[i+768]);
112 s16[6*i+2] = s16[6*i+3] = 0;
113 s16[6*i+4] = convert (f[i+512]);
114 s16[6*i+5] = convert (f[i]);
115 }
116 break;
/* 5 planes: slots 0-3 = planes 1-4; slot 4 silent */
117 case A52_2F2R | A52_LFE:
118 for (i = 0; i < 256; i++) {
119 s16[6*i] = convert (f[i+256]);
120 s16[6*i+1] = convert (f[i+512]);
121 s16[6*i+2] = convert (f[i+768]);
122 s16[6*i+3] = convert (f[i+1024]);
123 s16[6*i+4] = 0;
124 s16[6*i+5] = convert (f[i]);
125 }
126 break;
/* 6 planes: slots 0-3 = planes 1,3,4,5; slot 4 = plane 2 */
127 case A52_3F2R | A52_LFE:
128 for (i = 0; i < 256; i++) {
129 s16[6*i] = convert (f[i+256]);
130 s16[6*i+1] = convert (f[i+768]);
131 s16[6*i+2] = convert (f[i+1024]);
132 s16[6*i+3] = convert (f[i+1280]);
133 s16[6*i+4] = convert (f[i+512]);
134 s16[6*i+5] = convert (f[i]);
135 }
136 break;
137 }
138 return chans*256;
139 }
140
141 #ifdef ARCH_X86
142 int a52_resample_MMX(float * _f, int16_t * s16)
143 {
144 int i;
145 int32_t * f = (int32_t *) _f;
146
147 switch (flags) {
148 case A52_MONO:
149 asm volatile( 18 asm volatile(
150 "movl $-512, %%esi \n\t" 19 "movl $-512, %%esi \n\t"
151 "movq magicF2W, %%mm7 \n\t" 20 "movq magicF2W, %%mm7 \n\t"
152 "movq wm1100, %%mm3 \n\t" 21 "movq wm1100, %%mm3 \n\t"
153 "movq wm0101, %%mm4 \n\t" 22 "movq wm0101, %%mm4 \n\t"
176 " jnz 1b \n\t" 45 " jnz 1b \n\t"
177 "emms \n\t" 46 "emms \n\t"
178 :: "r" (s16+1280), "r" (f+256) 47 :: "r" (s16+1280), "r" (f+256)
179 :"%esi", "%edi", "memory" 48 :"%esi", "%edi", "memory"
180 ); 49 );
181 break; 50 return 5*256;
182 case A52_CHANNEL: 51 }
183 case A52_STEREO: 52
184 case A52_DOLBY: 53 static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){
54 int i;
55 int32_t * f = (int32_t *) _f;
185 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it 56 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it
186 #ifdef HAVE_SSE 57 #ifdef HAVE_SSE
187 asm volatile( 58 asm volatile(
188 "movl $-1024, %%esi \n\t" 59 "movl $-1024, %%esi \n\t"
189 "1: \n\t" 60 "1: \n\t"
223 " jnz 1b \n\t" 94 " jnz 1b \n\t"
224 "emms \n\t" 95 "emms \n\t"
225 :: "r" (s16+512), "r" (f+256) 96 :: "r" (s16+512), "r" (f+256)
226 :"%esi", "memory" 97 :"%esi", "memory"
227 ); 98 );
228 break; 99 return 2*256;
229 case A52_3F: 100 }
101
102 static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){
103 int i;
104 int32_t * f = (int32_t *) _f;
230 asm volatile( 105 asm volatile(
231 "movl $-1024, %%esi \n\t" 106 "movl $-1024, %%esi \n\t"
232 "movq magicF2W, %%mm7 \n\t" 107 "movq magicF2W, %%mm7 \n\t"
233 "pxor %%mm6, %%mm6 \n\t" 108 "pxor %%mm6, %%mm6 \n\t"
234 "movq %%mm7, %%mm5 \n\t" 109 "movq %%mm7, %%mm5 \n\t"
275 " jnz 1b \n\t" 150 " jnz 1b \n\t"
276 "emms \n\t" 151 "emms \n\t"
277 :: "r" (s16+1280), "r" (f+256) 152 :: "r" (s16+1280), "r" (f+256)
278 :"%esi", "%edi", "memory" 153 :"%esi", "%edi", "memory"
279 ); 154 );
280 break; 155 return 5*256;
281 case A52_2F2R: 156 }
157
158 static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){
159 int i;
160 int32_t * f = (int32_t *) _f;
282 asm volatile( 161 asm volatile(
283 "movl $-1024, %%esi \n\t" 162 "movl $-1024, %%esi \n\t"
284 "movq magicF2W, %%mm7 \n\t" 163 "movq magicF2W, %%mm7 \n\t"
285 "1: \n\t" 164 "1: \n\t"
286 "movq (%1, %%esi), %%mm0 \n\t" 165 "movq (%1, %%esi), %%mm0 \n\t"
323 " jnz 1b \n\t" 202 " jnz 1b \n\t"
324 "emms \n\t" 203 "emms \n\t"
325 :: "r" (s16+1024), "r" (f+256) 204 :: "r" (s16+1024), "r" (f+256)
326 :"%esi", "memory" 205 :"%esi", "memory"
327 ); 206 );
328 break; 207 return 4*256;
329 case A52_3F2R: 208 }
209
210 static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){
211 int i;
212 int32_t * f = (int32_t *) _f;
330 asm volatile( 213 asm volatile(
331 "movl $-1024, %%esi \n\t" 214 "movl $-1024, %%esi \n\t"
332 "movq magicF2W, %%mm7 \n\t" 215 "movq magicF2W, %%mm7 \n\t"
333 "1: \n\t" 216 "1: \n\t"
334 "movd (%1, %%esi), %%mm0 \n\t" 217 "movd (%1, %%esi), %%mm0 \n\t"
379 " jnz 1b \n\t" 262 " jnz 1b \n\t"
380 "emms \n\t" 263 "emms \n\t"
381 :: "r" (s16+1280), "r" (f+256) 264 :: "r" (s16+1280), "r" (f+256)
382 :"%esi", "%edi", "memory" 265 :"%esi", "%edi", "memory"
383 ); 266 );
384 break; 267 return 5*256;
385 case A52_MONO | A52_LFE: 268 }
269
270 static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){
271 int i;
272 int32_t * f = (int32_t *) _f;
386 asm volatile( 273 asm volatile(
387 "movl $-1024, %%esi \n\t" 274 "movl $-1024, %%esi \n\t"
388 "movq magicF2W, %%mm7 \n\t" 275 "movq magicF2W, %%mm7 \n\t"
389 "pxor %%mm6, %%mm6 \n\t" 276 "pxor %%mm6, %%mm6 \n\t"
390 "1: \n\t" 277 "1: \n\t"
416 " jnz 1b \n\t" 303 " jnz 1b \n\t"
417 "emms \n\t" 304 "emms \n\t"
418 :: "r" (s16+1536), "r" (f+256) 305 :: "r" (s16+1536), "r" (f+256)
419 :"%esi", "%edi", "memory" 306 :"%esi", "%edi", "memory"
420 ); 307 );
421 break; 308 return 6*256;
422 case A52_CHANNEL | A52_LFE: 309 }
423 case A52_STEREO | A52_LFE: 310
424 case A52_DOLBY | A52_LFE: 311 static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){
312 int i;
313 int32_t * f = (int32_t *) _f;
425 asm volatile( 314 asm volatile(
426 "movl $-1024, %%esi \n\t" 315 "movl $-1024, %%esi \n\t"
427 "movq magicF2W, %%mm7 \n\t" 316 "movq magicF2W, %%mm7 \n\t"
428 "pxor %%mm6, %%mm6 \n\t" 317 "pxor %%mm6, %%mm6 \n\t"
429 "1: \n\t" 318 "1: \n\t"
453 " jnz 1b \n\t" 342 " jnz 1b \n\t"
454 "emms \n\t" 343 "emms \n\t"
455 :: "r" (s16+1536), "r" (f+256) 344 :: "r" (s16+1536), "r" (f+256)
456 :"%esi", "%edi", "memory" 345 :"%esi", "%edi", "memory"
457 ); 346 );
458 break; 347 return 6*256;
459 case A52_3F | A52_LFE: 348 }
349
350 static int a52_resample_3F_LFE_to_6_MMX(float * _f, int16_t * s16){
351 int i;
352 int32_t * f = (int32_t *) _f;
460 asm volatile( 353 asm volatile(
461 "movl $-1024, %%esi \n\t" 354 "movl $-1024, %%esi \n\t"
462 "movq magicF2W, %%mm7 \n\t" 355 "movq magicF2W, %%mm7 \n\t"
463 "pxor %%mm6, %%mm6 \n\t" 356 "pxor %%mm6, %%mm6 \n\t"
464 "1: \n\t" 357 "1: \n\t"
490 " jnz 1b \n\t" 383 " jnz 1b \n\t"
491 "emms \n\t" 384 "emms \n\t"
492 :: "r" (s16+1536), "r" (f+256) 385 :: "r" (s16+1536), "r" (f+256)
493 :"%esi", "%edi", "memory" 386 :"%esi", "%edi", "memory"
494 ); 387 );
495 break; 388 return 6*256;
496 case A52_2F2R | A52_LFE: 389 }
390
391 static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
392 int i;
393 int32_t * f = (int32_t *) _f;
497 asm volatile( 394 asm volatile(
498 "movl $-1024, %%esi \n\t" 395 "movl $-1024, %%esi \n\t"
499 "movq magicF2W, %%mm7 \n\t" 396 "movq magicF2W, %%mm7 \n\t"
500 // "pxor %%mm6, %%mm6 \n\t" 397 // "pxor %%mm6, %%mm6 \n\t"
501 "1: \n\t" 398 "1: \n\t"
533 " jnz 1b \n\t" 430 " jnz 1b \n\t"
534 "emms \n\t" 431 "emms \n\t"
535 :: "r" (s16+1536), "r" (f+256) 432 :: "r" (s16+1536), "r" (f+256)
536 :"%esi", "%edi", "memory" 433 :"%esi", "%edi", "memory"
537 ); 434 );
538 break; 435 return 6*256;
539 case A52_3F2R | A52_LFE: 436 }
437
438 static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
439 int i;
440 int32_t * f = (int32_t *) _f;
540 asm volatile( 441 asm volatile(
541 "movl $-1024, %%esi \n\t" 442 "movl $-1024, %%esi \n\t"
542 "movq magicF2W, %%mm7 \n\t" 443 "movq magicF2W, %%mm7 \n\t"
543 // "pxor %%mm6, %%mm6 \n\t" 444 // "pxor %%mm6, %%mm6 \n\t"
544 "1: \n\t" 445 "1: \n\t"
578 " jnz 1b \n\t" 479 " jnz 1b \n\t"
579 "emms \n\t" 480 "emms \n\t"
580 :: "r" (s16+1536), "r" (f+256) 481 :: "r" (s16+1536), "r" (f+256)
581 :"%esi", "%edi", "memory" 482 :"%esi", "%edi", "memory"
582 ); 483 );
484 return 6*256;
485 }
486
487
/*
 * New-revision dispatcher (right-hand column): choose the specialised MMX
 * converter for a channel layout.
 *
 * flags - layout selector, compared against the A52_* case labels below.
 * ch    - requested number of output channels; each layout accepts exactly
 *         one value (2, 4, 5 or 6).
 * Returns the matching a52_resample_*_MMX function as a void*, or NULL when
 * `flags` hits no case or `ch` is not the width that layout supports. The
 * caller has to convert the result back to a function pointer; every target
 * has the signature int (*)(float *, int16_t *).
 *
 * NOTE(review): the last four lines also carry the left-hand (old revision)
 * column - the tail of the old switch-based a52_resample_MMX, which returned
 * chans*256 - so "break;" and "}" appear twice on those lines.
 */
488 static void* a52_resample_MMX(int flags, int ch){
489 switch (flags) {
490 case A52_MONO:
491 if(ch==5) return a52_resample_MONO_to_5_MMX;
492 break;
493 case A52_CHANNEL:
494 case A52_STEREO:
495 case A52_DOLBY:
496 if(ch==2) return a52_resample_STEREO_to_2_MMX;
497 break;
498 case A52_3F:
499 if(ch==5) return a52_resample_3F_to_5_MMX;
500 break;
501 case A52_2F2R:
502 if(ch==4) return a52_resample_2F_2R_to_4_MMX;
503 break;
504 case A52_3F2R:
505 if(ch==5) return a52_resample_3F_2R_to_5_MMX;
506 break;
507 case A52_MONO | A52_LFE:
508 if(ch==6) return a52_resample_MONO_LFE_to_6_MMX;
509 break;
510 case A52_CHANNEL | A52_LFE:
511 case A52_STEREO | A52_LFE:
512 case A52_DOLBY | A52_LFE:
513 if(ch==6) return a52_resample_STEREO_LFE_to_6_MMX;
514 break;
515 case A52_3F | A52_LFE:
516 if(ch==6) return a52_resample_3F_LFE_to_6_MMX;
517 break;
518 case A52_2F2R | A52_LFE:
519 if(ch==6) return a52_resample_2F_2R_LFE_to_6_MMX;
520 break;
521 case A52_3F2R | A52_LFE:
522 if(ch==6) return a52_resample_3F_2R_LFE_to_6_MMX;
583 break; 523 break;
584 } 524 }
585 return chans*256; 525 return NULL;
586 } 526 }
587 #endif //arch_x86 527
588 528
/*
 * Old-revision initialiser (left-hand column): record the requested layout
 * and install a converter in the global a52_resample pointer.
 *
 * mm_accel - CPU capability bit mask; only MM_ACCEL_X86_MMX is tested.
 * _flags   - stored in the file-scope `flags` (selects the switch case used
 *            by the converters).
 * _chans   - stored in the file-scope `chans` (used for the chans*256 return
 *            value of the converters).
 *
 * The stderr message is printed only while a52_resample is still NULL, i.e.
 * before a converter has been installed. `flags` and `chans` are updated on
 * every call.
 *
 * NOTE(review): the message depends on mm_accel alone, so a non-x86 build
 * given the MMX bit would announce the MMX resampler while installing the C
 * one.
 */
589 void a52_resample_init(uint32_t mm_accel,int _flags,int _chans){
590 chans=_chans;
591 flags=_flags;
592
593 if(a52_resample==NULL) // only once please ;)
594 {
595 if(mm_accel & MM_ACCEL_X86_MMX) fprintf(stderr, "Using MMX optimized resampler\n");
596 else fprintf(stderr, "No accelerated resampler found\n");
597 }
598
/* On x86 the MMX converter is chosen when the capability bit is set. On other
 * builds the dummy `if(0);` exists only so that the shared `else` below has
 * an `if` to attach to and always selects the C converter. */
599 #ifdef ARCH_X86
600 if(mm_accel & MM_ACCEL_X86_MMX) a52_resample= a52_resample_MMX;
601 #else
602 if(0);
603 #endif
604 else a52_resample= a52_resample_C;
605 }
606