comparison liba52/imdct.c @ 3584:7c4046c04be3

Remove the unnecessary SSE sin/cos lookup tables (sseSinCos1a, sseSinCos1b); the SSE loop now uses sseSinCos1c and sseSinCos1d instead
author michael
date Tue, 18 Dec 2001 17:29:27 +0000
parents 8ddf654c4871
children 3f1c2c06d0d8
comparison
Legend: equal, deleted, inserted, replaced
3583:66e418645b67 3584:7c4046c04be3
77 77
78 #ifdef ARCH_X86 78 #ifdef ARCH_X86
79 // NOTE: SSE needs 16byte alignment or it will segfault 79 // NOTE: SSE needs 16byte alignment or it will segfault
80 // 80 //
81 static complex_t __attribute__((aligned(16))) buf[128]; 81 static complex_t __attribute__((aligned(16))) buf[128];
82 static float __attribute__((aligned(16))) sseSinCos1a[256];
83 static float __attribute__((aligned(16))) sseSinCos1b[256];
84 static float __attribute__((aligned(16))) sseSinCos1c[256]; 82 static float __attribute__((aligned(16))) sseSinCos1c[256];
85 static float __attribute__((aligned(16))) sseSinCos1d[256]; 83 static float __attribute__((aligned(16))) sseSinCos1d[256];
86 static float __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; 84 static float __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
87 //static float __attribute__((aligned(16))) sseW0[4]; 85 //static float __attribute__((aligned(16))) sseW0[4];
88 static float __attribute__((aligned(16))) sseW1[8]; 86 static float __attribute__((aligned(16))) sseW1[8];
386 "leal bit_reverse_512, %%eax \n\t" 384 "leal bit_reverse_512, %%eax \n\t"
387 "movl $1008, %%edi \n\t" 385 "movl $1008, %%edi \n\t"
388 "pushl %%ebp \n\t" //use ebp without telling gcc 386 "pushl %%ebp \n\t" //use ebp without telling gcc
389 ".balign 16 \n\t" 387 ".balign 16 \n\t"
390 "1: \n\t" 388 "1: \n\t"
391 "movaps (%0, %%esi), %%xmm0 \n\t" 389 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI
392 "movaps (%0, %%edi), %%xmm1 \n\t" 390 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI
393 "shufps $0xA0, %%xmm0, %%xmm0 \n\t" 391 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi
394 "shufps $0x5F, %%xmm1, %%xmm1 \n\t" 392 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi
395 "mulps sseSinCos1a(%%esi), %%xmm0 \n\t" 393 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
396 "mulps sseSinCos1b(%%esi), %%xmm1 \n\t" 394 "movaps sseSinCos1c(%%esi), %%xmm2 \n\t"
397 "addps %%xmm1, %%xmm0 \n\t" 395 "mulps %%xmm0, %%xmm2 \n\t"
396 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
397 "mulps sseSinCos1d(%%esi), %%xmm0 \n\t"
398 "subps %%xmm0, %%xmm2 \n\t"
398 "movzbl (%%eax), %%edx \n\t" 399 "movzbl (%%eax), %%edx \n\t"
399 "movzbl 1(%%eax), %%ebp \n\t" 400 "movzbl 1(%%eax), %%ebp \n\t"
400 "movlps %%xmm0, (%1, %%edx,8) \n\t" 401 "movlps %%xmm2, (%1, %%edx,8) \n\t"
401 "movhps %%xmm0, (%1, %%ebp,8) \n\t" 402 "movhps %%xmm2, (%1, %%ebp,8) \n\t"
402 "addl $16, %%esi \n\t" 403 "addl $16, %%esi \n\t"
403 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap 404 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap
404 "subl $16, %%edi \n\t" 405 "subl $16, %%edi \n\t"
405 " jnc 1b \n\t" 406 " jnc 1b \n\t"
406 "popl %%ebp \n\t"//no we didnt touch ebp *g* 407 "popl %%ebp \n\t"//no we didnt touch ebp *g*
829 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); 830 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
830 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); 831 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
831 } 832 }
832 #ifdef ARCH_X86 833 #ifdef ARCH_X86
833 for (i = 0; i < 128; i++) { 834 for (i = 0; i < 128; i++) {
834 sseSinCos1a[2*i+0]= -xsin1[i];
835 sseSinCos1a[2*i+1]= -xcos1[i];
836 sseSinCos1b[2*i+0]= xcos1[i];
837 sseSinCos1b[2*i+1]= -xsin1[i];
838
839 sseSinCos1c[2*i+0]= xcos1[i]; 835 sseSinCos1c[2*i+0]= xcos1[i];
840 sseSinCos1c[2*i+1]= -xcos1[i]; 836 sseSinCos1c[2*i+1]= -xcos1[i];
841 sseSinCos1d[2*i+0]= xsin1[i]; 837 sseSinCos1d[2*i+0]= xsin1[i];
842 sseSinCos1d[2*i+1]= xsin1[i]; 838 sseSinCos1d[2*i+1]= xsin1[i];
843 } 839 }