comparison liba52/imdct_3dnow.h @ 27757:b5a46071062a

Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'. We were using an inconsistent mix of the three variants and 'volatile' should be the most correct and portable variant.
author diego
date Thu, 16 Oct 2008 20:17:56 +0000
parents 08d18fe9da52
children 2f5073b600f4
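
For context: GCC accepts all three spellings as the qualifier on an asm statement, but only 'volatile' is a standard C keyword. The double-underscore alternate keywords buy nothing here, since unlike plain 'asm' (which needs the '__asm__' form under strict ISO modes such as -std=c99), 'volatile' is never disabled. A minimal sketch of the resulting style; the rdtsc helper below is hypothetical, not part of this changeset:

    /* Hypothetical example of the spelling this changeset standardizes on.
     * '__asm__' stays in its alternate-keyword form because plain 'asm' is
     * unavailable in strict ISO modes; 'volatile' needs no such form. */
    static inline unsigned read_tsc_low(void)
    {
        unsigned low;
        /* volatile: keep the optimizer from reordering or deleting the asm */
        __asm__ volatile("rdtsc" : "=a"(low) : : "edx");
        return low;
    }
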
--- a/liba52/imdct_3dnow.h	27756:1266470a5651
+++ b/liba52/imdct_3dnow.h	27757:b5a46071062a
@@ -43,11 +43,11 @@
 static void FFT_4_3DNOW(complex_t *x)
 {
     /* delta_p = 1 here */
     /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
      */
-    __asm__ __volatile__(
+    __asm__ volatile(
         "movq 24(%1), %%mm3\n\t"
         "movq 8(%1), %%mm1\n\t"
         "pxor %2, %%mm3\n\t" /* mm3.re | -mm3.im */
         "pxor %3, %%mm1\n\t" /* -mm1.re | mm1.im */
         "pfadd %%mm1, %%mm3\n\t" /* vi.im = x[3].re - x[1].re; */
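
For readers following the asm: a scalar sketch of the transform the hunk above computes, transcribed directly from the formula in the comment (w = e^{-2*pi/4} = -i). The struct and function names are stand-ins, not liba52 API. Multiplying by w reduces to a swap-and-negate, which the asm performs with pxor against sign masks; vi below matches the "vi.im = x[3].re - x[1].re" comment.

    typedef struct { float re, im; } cplx; /* stand-in for liba52's complex_t */

    /* Scalar reference for the 4-point DFT above: X[k] = sum x[i] * (-i)^(i*k) */
    static void fft_4_scalar(cplx *x)
    {
        cplx u0 = { x[0].re + x[2].re, x[0].im + x[2].im }; /* x0 + x2 */
        cplx u1 = { x[0].re - x[2].re, x[0].im - x[2].im }; /* x0 - x2 */
        cplx u2 = { x[1].re + x[3].re, x[1].im + x[3].im }; /* x1 + x3 */
        cplx vi = { x[1].im - x[3].im, x[3].re - x[1].re }; /* -i * (x1 - x3) */

        x[0].re = u0.re + u2.re; x[0].im = u0.im + u2.im;
        x[1].re = u1.re + vi.re; x[1].im = u1.im + vi.im;
        x[2].re = u0.re - u2.re; x[2].im = u0.im - u2.im;
        x[3].re = u1.re - vi.re; x[3].im = u1.im - vi.im;
    }
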
@@ -88,20 +88,20 @@
     /* delta_p = diag{1, sqrt(i)} here */
     /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
      */
     complex_t wT1, wB1, wB2;
 
-    __asm__ __volatile__(
+    __asm__ volatile(
         "movq 8(%2), %%mm0\n\t"
         "movq 24(%2), %%mm1\n\t"
         "movq %%mm0, %0\n\t" /* wT1 = x[1]; */
         "movq %%mm1, %1\n\t" /* wB1 = x[3]; */
         :"=m"(wT1), "=m"(wB1)
         :"r"(x)
         :"memory");
 
-    __asm__ __volatile__(
+    __asm__ volatile(
         "movq 16(%0), %%mm2\n\t"
         "movq 32(%0), %%mm3\n\t"
         "movq %%mm2, 8(%0)\n\t" /* x[1] = x[2]; */
         "movq 48(%0), %%mm4\n\t"
         "movq %%mm3, 16(%0)\n\t" /* x[2] = x[4]; */
@@ -112,11 +112,11 @@
 
     fft_4_3dnow(&x[0]);
 
     /* x[0] x[4] x[2] x[6] */
 
-    __asm__ __volatile__(
+    __asm__ volatile(
         "movq 40(%1), %%mm0\n\t"
         "movq %%mm0, %%mm3\n\t"
         "movq 56(%1), %%mm1\n\t"
         "pfadd %%mm1, %%mm0\n\t"
         "pfsub %%mm1, %%mm3\n\t"
@@ -151,11 +151,11 @@
         :"=r"(x)
         :"0"(x), "r"(&wT1), "r"(&wB1)
         :"memory");
 
     /* x[1] x[5] */
-    __asm__ __volatile__ (
+    __asm__ volatile (
         "movq %6, %%mm6\n\t"
         "movq %5, %%mm7\n\t"
         "movq %1, %%mm0\n\t"
         "movq %2, %%mm1\n\t"
         "movq 56(%3), %%mm3\n\t"
@@ -201,11 +201,11 @@
         "m"(x_plus_minus_3dnow), "m"(x_minus_plus_3dnow)
         :"memory");
 
 
     /* x[3] x[7] */
-    __asm__ __volatile__(
+    __asm__ volatile(
         "movq %1, %%mm0\n\t"
 #ifdef HAVE_3DNOWEX
         "pswapd %3, %%mm1\n\t"
 #else
         "movq %3, %%mm1\n\t"
@@ -356,17 +356,17 @@
 
     /* 512 IMDCT with source and dest data in 'data' */
 
     /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
 #if 1
-    __asm__ __volatile__ (
+    __asm__ volatile (
         "movq %0, %%mm7\n\t"
         ::"m"(x_plus_minus_3dnow)
         :"memory");
     for( i=0; i < 128; i++) {
         int j = pm128[i];
-        __asm__ __volatile__ (
+        __asm__ volatile (
             "movd %1, %%mm0\n\t"
             "movd %3, %%mm1\n\t"
             "punpckldq %2, %%mm0\n\t" /* mm0 = data[256-2*j-1] | data[2*j]*/
             "punpckldq %4, %%mm1\n\t" /* mm1 = xcos[j] | xsin[j] */
             "movq %%mm0, %%mm2\n\t"
@@ -392,11 +392,11 @@
         );
     /* buf[i].re = (data[256-2*j-1] * xcos1[j] - data[2*j] * xsin1[j]);
        buf[i].im = (data[256-2*j-1] * xsin1[j] + data[2*j] * xcos1[j])*(-1.0);*/
     }
 #else
-    __asm__ __volatile__ ("femms":::"memory");
+    __asm__ volatile ("femms":::"memory");
     for( i=0; i < 128; i++) {
         /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
         int j= pm128[i];
         buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
         buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
@@ -433,18 +433,18 @@
     FFT_128P_3DNOW (&buf[0]);
 // __asm__ volatile ("femms \n\t":::"memory");
 
     /* Post IFFT complex multiply plus IFFT complex conjugate*/
 #if 1
-    __asm__ __volatile__ (
+    __asm__ volatile (
         "movq %0, %%mm7\n\t"
         "movq %1, %%mm6\n\t"
         ::"m"(x_plus_minus_3dnow),
         "m"(x_minus_plus_3dnow)
         :"eax","memory");
     for (i=0; i < 128; i++) {
-        __asm__ __volatile__ (
+        __asm__ volatile (
             "movq %1, %%mm0\n\t" /* ac3_buf[i].re | ac3_buf[i].im */
             "movq %%mm0, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */
 #ifndef HAVE_3DNOWEX
             "punpckldq %%mm1, %%mm2\n\t"
             "punpckhdq %%mm2, %%mm1\n\t"
@@ -471,11 +471,11 @@
             :"memory");
     /* ac3_buf[i].re =(tmp_a_r * ac3_xcos1[i]) + (tmp_a_i * ac3_xsin1[i]);
        ac3_buf[i].im =(tmp_a_r * ac3_xsin1[i]) - (tmp_a_i * ac3_xcos1[i]);*/
     }
 #else
-    __asm__ __volatile__ ("femms":::"memory");
+    __asm__ volatile ("femms":::"memory");
     for( i=0; i < 128; i++) {
         /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
         tmp_a_r = buf[i].real;
         tmp_a_i = -1.0 * buf[i].imag;
         buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
@@ -494,11 +494,11 @@
         "punpckldq %%mm3, %%mm3 \n\t"
         :: "r" (&bias)
         );
     for (i=0; i< 64; i++) {
         /* merge two loops in one to enable working of 2 decoders */
-        __asm__ __volatile__ (
+        __asm__ volatile (
             "movd 516(%1), %%mm0\n\t"
             "movd (%1), %%mm1\n\t" /**data_ptr++=-buf[64+i].im**window_ptr+++*delay_ptr++;*/
             "punpckldq (%2), %%mm0\n\t" /*data_ptr[128]=-buf[i].re*window_ptr[128]+delay_ptr[128];*/
             "punpckldq 516(%2), %%mm1\n\t"
             "pfmul (%3), %%mm0\n\t" /**data_ptr++=buf[64-i-1].re**window_ptr+++*delay_ptr++;*/
@@ -518,11 +518,11 @@
         window_ptr += 2;
         delay_ptr += 2;
     }
     window_ptr += 128;
 #else
-    __asm__ __volatile__ ("femms":::"memory");
+    __asm__ volatile ("femms":::"memory");
     for(i=0; i< 64; i++) {
         *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
         *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
     }
 
@@ -536,11 +536,11 @@
     delay_ptr = delay;
 #if 1
     for(i=0; i< 64; i++) {
         /* merge two loops in one to enable working of 2 decoders */
         window_ptr -=2;
-        __asm__ __volatile__(
+        __asm__ volatile(
             "movd 508(%1), %%mm0\n\t"
             "movd (%1), %%mm1\n\t"
             "punpckldq (%2), %%mm0\n\t"
             "punpckldq 508(%2), %%mm1\n\t"
 #ifdef HAVE_3DNOWEX
@@ -563,13 +563,13 @@
             :"=r"(delay_ptr)
             :"r"(&buf[i].imag), "r"(&buf[64-i-1].imag), "r"(window_ptr), "0"(delay_ptr)
             :"memory");
         delay_ptr += 2;
     }
-    __asm__ __volatile__ ("femms":::"memory");
+    __asm__ volatile ("femms":::"memory");
 #else
-    __asm__ __volatile__ ("femms":::"memory");
+    __asm__ volatile ("femms":::"memory");
     for(i=0; i< 64; i++) {
         *delay_ptr++ = -buf[64+i].real * *--window_ptr;
         *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
     }
 