mplayer.hg: liba52/imdct.c comparison

comparison liba52/imdct.c @ 16173:d6219ce521e9

liba52 asm optimizations ported to amd64

author	aurel
date	Fri, 05 Aug 2005 13:33:50 +0000
parents	130dd060f723
children	72764c0dad8a

comparison

Legend: equal, deleted, inserted, replaced

-:ac70488d1f3e
+:d6219ce521e9
 	0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
 	0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
 	0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
 	0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 // NOTE: SSE needs 16byte alignment or it will segfault
 //
 static complex_t __attribute__((aligned(16))) buf[128];
 static float __attribute__((aligned(16))) sseSinCos1c[256];
 static float __attribute__((aligned(16))) sseSinCos1d[256];
 {
 int i;
 int k;
 int p,q;
 int m;
-int two_m;
+long two_m;
-int two_m_plus_one;
+long two_m_plus_one;
 sample_t tmp_b_i;
 sample_t tmp_b_r;
 sample_t tmp_a_i;
 sample_t tmp_a_r;
 #endif
 // Stuff below this line is borrowed from libac3
 #include "srfftp.h"
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #ifndef HAVE_3DNOW
 #define HAVE_3DNOW 1
 #endif
 #include "srfftp_3dnow.h"
 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 {
 /*	int i,k;
 int p,q;*/
 int m;
-int two_m;
+long two_m;
-int two_m_plus_one;
+long two_m_plus_one;
-int two_m_plus_one_shl3;
+long two_m_plus_one_shl3;
 complex_t *buf_offset;
 /*  sample_t tmp_a_i;
 sample_t tmp_a_r;
 sample_t tmp_b_i;
 /* see the c version (dct_do_512()), its allmost identical, just in C */
 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
 /* Bit reversed shuffling */
 	asm volatile(
-		"xorl %%esi, %%esi			\n\t"
+		"xor %%"REG_S", %%"REG_S"		\n\t"
-		"leal "MANGLE(bit_reverse_512)", %%eax	\n\t"
+		"lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
-		"movl $1008, %%edi			\n\t"
+		"mov $1008, %%"REG_D"			\n\t"
-		"pushl %%ebp				\n\t" //use ebp without telling gcc
+		"push %%"REG_BP"			\n\t" //use ebp without telling gcc
 		".balign 16				\n\t"
 		"1:					\n\t"
-		"movlps (%0, %%esi), %%xmm0		\n\t" // XXXI
+		"movlps (%0, %%"REG_S"), %%xmm0	\n\t" // XXXI
-		"movhps 8(%0, %%edi), %%xmm0		\n\t" // RXXI
+		"movhps 8(%0, %%"REG_D"), %%xmm0	\n\t" // RXXI
-		"movlps 8(%0, %%esi), %%xmm1		\n\t" // XXXi
+		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // XXXi
-		"movhps (%0, %%edi), %%xmm1		\n\t" // rXXi
+		"movhps (%0, %%"REG_D"), %%xmm1	\n\t" // rXXi
 		"shufps $0x33, %%xmm1, %%xmm0		\n\t" // irIR
-		"movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t"
+		"movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
 		"mulps %%xmm0, %%xmm2			\n\t"
 		"shufps $0xB1, %%xmm0, %%xmm0		\n\t" // riRI
-		"mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
+		"mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
 		"subps %%xmm0, %%xmm2			\n\t"
-		"movzbl (%%eax), %%edx			\n\t"
+		"movzb (%%"REG_a"), %%"REG_d"		\n\t"
-		"movzbl 1(%%eax), %%ebp			\n\t"
+		"movzb 1(%%"REG_a"), %%"REG_BP"		\n\t"
-		"movlps %%xmm2, (%1, %%edx,8)		\n\t"
+		"movlps %%xmm2, (%1, %%"REG_d", 8)	\n\t"
-		"movhps %%xmm2, (%1, %%ebp,8)		\n\t"
+		"movhps %%xmm2, (%1, %%"REG_BP", 8)	\n\t"
-		"addl $16, %%esi			\n\t"
+		"add $16, %%"REG_S"			\n\t"
-		"addl $2, %%eax				\n\t" // avoid complex addressing for P4 crap
+		"add $2, %%"REG_a"			\n\t" // avoid complex addressing for P4 crap
-		"subl $16, %%edi			\n\t"
+		"sub $16, %%"REG_D"			\n\t"
-		" jnc 1b				\n\t"
+		"jnc 1b				 	\n\t"
-		"popl %%ebp				\n\t"//no we didnt touch ebp *g*
+		"pop %%"REG_BP"				\n\t"//no we didnt touch ebp *g*
-		:: "b" (data), "c" (buf)
+		:: "r" (data), "r" (buf)
-		: "%esi", "%edi", "%eax", "%edx"
+		: "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
 	);
 /* FFT Merge */
 /* unoptimized variant
 /* 1. iteration */
 	// Note w[0][0]={1,0}
 	asm volatile(
 		"xorps %%xmm1, %%xmm1	\n\t"
 		"xorps %%xmm2, %%xmm2	\n\t"
-		"movl %0, %%esi		\n\t"
+		"mov %0, %%"REG_S"	\n\t"
 		".balign 16				\n\t"
 		"1:			\n\t"
-		"movlps (%%esi), %%xmm0	\n\t" //buf[p]
+		"movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
-		"movlps 8(%%esi), %%xmm1\n\t" //buf[q]
+		"movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
-		"movhps (%%esi), %%xmm0	\n\t" //buf[p]
+		"movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
-		"movhps 8(%%esi), %%xmm2\n\t" //buf[q]
+		"movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
 		"addps %%xmm1, %%xmm0	\n\t"
 		"subps %%xmm2, %%xmm0	\n\t"
-		"movaps %%xmm0, (%%esi)	\n\t"
+		"movaps %%xmm0, (%%"REG_S")\n\t"
-		"addl $16, %%esi	\n\t"
+		"add $16, %%"REG_S"	\n\t"
-		"cmpl %1, %%esi		\n\t"
+		"cmp %1, %%"REG_S"	\n\t"
 		" jb 1b			\n\t"
 		:: "g" (buf), "r" (buf + 128)
-		: "%esi"
+		: "%"REG_S
 	);
 /* 2. iteration */
 	// Note w[1]={{1,0}, {0,-1}}
 	asm volatile(
 		"movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
-		"movl %0, %%esi			\n\t"
+		"mov %0, %%"REG_S"		\n\t"
 		".balign 16				\n\t"
 		"1:				\n\t"
-		"movaps 16(%%esi), %%xmm2	\n\t" //r2,i2,r3,i3
+		"movaps 16(%%"REG_S"), %%xmm2	\n\t" //r2,i2,r3,i3
 		"shufps $0xB4, %%xmm2, %%xmm2	\n\t" //r2,i2,i3,r3
 		"mulps %%xmm7, %%xmm2		\n\t" //r2,i2,i3,-r3
-		"movaps (%%esi), %%xmm0		\n\t" //r0,i0,r1,i1
+		"movaps (%%"REG_S"), %%xmm0	\n\t" //r0,i0,r1,i1
-		"movaps (%%esi), %%xmm1		\n\t" //r0,i0,r1,i1
+		"movaps (%%"REG_S"), %%xmm1	\n\t" //r0,i0,r1,i1
 		"addps %%xmm2, %%xmm0		\n\t"
 		"subps %%xmm2, %%xmm1		\n\t"
-		"movaps %%xmm0, (%%esi)		\n\t"
+		"movaps %%xmm0, (%%"REG_S")	\n\t"
-		"movaps %%xmm1, 16(%%esi)	\n\t"
+		"movaps %%xmm1, 16(%%"REG_S")	\n\t"
-		"addl $32, %%esi	\n\t"
+		"add $32, %%"REG_S"	\n\t"
-		"cmpl %1, %%esi		\n\t"
+		"cmp %1, %%"REG_S"	\n\t"
 		" jb 1b			\n\t"
 		:: "g" (buf), "r" (buf + 128)
-		: "%esi"
+		: "%"REG_S
 	);
 /* 3. iteration */
 /*
 Note sseW2+0={1,1,sqrt(2),sqrt(2))
 	asm volatile(
 		"movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
 		"movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
 		"xorps %%xmm5, %%xmm5		\n\t"
 		"xorps %%xmm2, %%xmm2		\n\t"
-		"movl %0, %%esi			\n\t"
+		"mov %0, %%"REG_S"		\n\t"
 		".balign 16			\n\t"
 		"1:				\n\t"
-		"movaps 32(%%esi), %%xmm2	\n\t" //r4,i4,r5,i5
+		"movaps 32(%%"REG_S"), %%xmm2	\n\t" //r4,i4,r5,i5
-		"movaps 48(%%esi), %%xmm3	\n\t" //r6,i6,r7,i7
+		"movaps 48(%%"REG_S"), %%xmm3	\n\t" //r6,i6,r7,i7
 		"movaps "MANGLE(sseW2)", %%xmm4	\n\t" //r4,i4,r5,i5
 		"movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
 		"mulps %%xmm2, %%xmm4		\n\t"
 		"mulps %%xmm3, %%xmm5		\n\t"
 		"shufps $0xB1, %%xmm2, %%xmm2	\n\t" //i4,r4,i5,r5
 		"shufps $0xB1, %%xmm3, %%xmm3	\n\t" //i6,r6,i7,r7
 		"mulps %%xmm6, %%xmm3		\n\t"
 		"mulps %%xmm7, %%xmm2		\n\t"
-		"movaps (%%esi), %%xmm0		\n\t" //r0,i0,r1,i1
+		"movaps (%%"REG_S"), %%xmm0	\n\t" //r0,i0,r1,i1
-		"movaps 16(%%esi), %%xmm1	\n\t" //r2,i2,r3,i3
+		"movaps 16(%%"REG_S"), %%xmm1	\n\t" //r2,i2,r3,i3
 		"addps %%xmm4, %%xmm2		\n\t"
 		"addps %%xmm5, %%xmm3		\n\t"
 		"movaps %%xmm2, %%xmm4		\n\t"
 		"movaps %%xmm3, %%xmm5		\n\t"
 		"addps %%xmm0, %%xmm2		\n\t"
 		"addps %%xmm1, %%xmm3		\n\t"
 		"subps %%xmm4, %%xmm0		\n\t"
 		"subps %%xmm5, %%xmm1		\n\t"
-		"movaps %%xmm2, (%%esi)		\n\t"
+		"movaps %%xmm2, (%%"REG_S")	\n\t"
-		"movaps %%xmm3, 16(%%esi)	\n\t"
+		"movaps %%xmm3, 16(%%"REG_S")	\n\t"
-		"movaps %%xmm0, 32(%%esi)	\n\t"
+		"movaps %%xmm0, 32(%%"REG_S")	\n\t"
-		"movaps %%xmm1, 48(%%esi)	\n\t"
+		"movaps %%xmm1, 48(%%"REG_S")	\n\t"
-		"addl $64, %%esi	\n\t"
+		"add $64, %%"REG_S"	\n\t"
-		"cmpl %1, %%esi		\n\t"
+		"cmp %1, %%"REG_S"	\n\t"
 		" jb 1b			\n\t"
 		:: "g" (buf), "r" (buf + 128)
-		: "%esi"
+		: "%"REG_S
 	);
 /* 4-7. iterations */
 for (m=3; m < 7; m++) {
 	two_m = (1 << m);
 	two_m_plus_one = two_m<<1;
 	two_m_plus_one_shl3 = (two_m_plus_one<<3);
 	buf_offset = buf+128;
 	asm volatile(
-		"movl %0, %%esi				\n\t"
+		"mov %0, %%"REG_S"			\n\t"
 		".balign 16				\n\t"
 		"1:					\n\t"
-		"xorl %%edi, %%edi			\n\t" // k
+		"xor %%"REG_D", %%"REG_D"		\n\t" // k
-		"leal (%%esi, %3), %%edx		\n\t"
+		"lea (%%"REG_S", %3), %%"REG_d"		\n\t"
 		"2:					\n\t"
-		"movaps (%%edx, %%edi), %%xmm1		\n\t"
+		"movaps (%%"REG_d", %%"REG_D"), %%xmm1	\n\t"
-		"movaps (%4, %%edi, 2), %%xmm2		\n\t"
+		"movaps (%4, %%"REG_D", 2), %%xmm2	\n\t"
 		"mulps %%xmm1, %%xmm2			\n\t"
 		"shufps $0xB1, %%xmm1, %%xmm1		\n\t"
-		"mulps 16(%4, %%edi, 2), %%xmm1		\n\t"
+		"mulps 16(%4, %%"REG_D", 2), %%xmm1	\n\t"
-		"movaps (%%esi, %%edi), %%xmm0		\n\t"
+		"movaps (%%"REG_S", %%"REG_D"), %%xmm0	\n\t"
 		"addps %%xmm2, %%xmm1			\n\t"
 		"movaps %%xmm1, %%xmm2			\n\t"
 		"addps %%xmm0, %%xmm1			\n\t"
 		"subps %%xmm2, %%xmm0			\n\t"
-		"movaps %%xmm1, (%%esi, %%edi)		\n\t"
+		"movaps %%xmm1, (%%"REG_S", %%"REG_D")	\n\t"
-		"movaps %%xmm0, (%%edx, %%edi)		\n\t"
+		"movaps %%xmm0, (%%"REG_d", %%"REG_D")	\n\t"
-		"addl $16, %%edi			\n\t"
+		"add $16, %%"REG_D"			\n\t"
-		"cmpl %3, %%edi				\n\t" //FIXME (opt) count against 0
+		"cmp %3, %%"REG_D"			\n\t" //FIXME (opt) count against 0
-		" jb 2b					\n\t"
+		"jb 2b					\n\t"
-		"addl %2, %%esi				\n\t"
+		"add %2, %%"REG_S"			\n\t"
-		"cmpl %1, %%esi				\n\t"
+		"cmp %1, %%"REG_S"			\n\t"
 		" jb 1b					\n\t"
 		:: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
 		   "r" (sseW[m])
-		: "%esi", "%edi", "%edx"
+		: "%"REG_S, "%"REG_D, "%"REG_d
 	);
 }
 /* Post IFFT complex multiply  plus IFFT complex conjugate*/
 	asm volatile(
-		"movl $-1024, %%esi			\n\t"
+		"mov $-1024, %%"REG_S"			\n\t"
 		".balign 16				\n\t"
 		"1:					\n\t"
-		"movaps (%0, %%esi), %%xmm0		\n\t"
+		"movaps (%0, %%"REG_S"), %%xmm0		\n\t"
-		"movaps (%0, %%esi), %%xmm1		\n\t"
+		"movaps (%0, %%"REG_S"), %%xmm1		\n\t"
 		"shufps $0xB1, %%xmm0, %%xmm0		\n\t"
-		"mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t"
+		"mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
-		"mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
+		"mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
 		"addps %%xmm1, %%xmm0			\n\t"
-		"movaps %%xmm0, (%0, %%esi)		\n\t"
+		"movaps %%xmm0, (%0, %%"REG_S")		\n\t"
-		"addl $16, %%esi			\n\t"
+		"add $16, %%"REG_S"			\n\t"
 		" jnz 1b				\n\t"
 		:: "r" (buf+128)
-		: "%esi"
+		: "%"REG_S
 	);
 data_ptr = data;
 delay_ptr = delay;
 window_ptr = imdct_window;
 /* Window and convert to real valued signal */
 	asm volatile(
-		"xorl %%edi, %%edi			\n\t"  // 0
+		"xor %%"REG_D", %%"REG_D"		\n\t"  // 0
-		"xorl %%esi, %%esi			\n\t"  // 0
+		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
 		"movss %3, %%xmm2			\n\t"  // bias
 		"shufps $0x00, %%xmm2, %%xmm2		\n\t"  // bias, bias, ...
 		".balign 16				\n\t"
 		"1:					\n\t"
-		"movlps (%0, %%esi), %%xmm0		\n\t" // ? ? A ?
+		"movlps (%0, %%"REG_S"), %%xmm0		\n\t" // ? ? A ?
-		"movlps 8(%0, %%esi), %%xmm1		\n\t" // ? ? C ?
+		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? C ?
-		"movhps -16(%0, %%edi), %%xmm1		\n\t" // ? D C ?
+		"movhps -16(%0, %%"REG_D"), %%xmm1	\n\t" // ? D C ?
-		"movhps -8(%0, %%edi), %%xmm0		\n\t" // ? B A ?
+		"movhps -8(%0, %%"REG_D"), %%xmm0	\n\t" // ? B A ?
 		"shufps $0x99, %%xmm1, %%xmm0		\n\t" // D C B A
-		"mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
+		"mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
-		"addps (%2, %%esi), %%xmm0		\n\t"
+		"addps (%2, %%"REG_S"), %%xmm0		\n\t"
 		"addps %%xmm2, %%xmm0			\n\t"
-		"movaps %%xmm0, (%1, %%esi)		\n\t"
+		"movaps %%xmm0, (%1, %%"REG_S")		\n\t"
-		"addl $16, %%esi			\n\t"
+		"add  $16, %%"REG_S"			\n\t"
-		"subl $16, %%edi			\n\t"
+		"sub  $16, %%"REG_D"			\n\t"
-		"cmpl $512, %%esi			\n\t"
+		"cmp  $512, %%"REG_S"			\n\t"
 		" jb 1b					\n\t"
 		:: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
-		: "%esi", "%edi"
+		: "%"REG_S, "%"REG_D
 	);
 	data_ptr+=128;
 	delay_ptr+=128;
 //	window_ptr+=128;
 	asm volatile(
-		"movl $1024, %%edi			\n\t"  // 512
+		"mov $1024, %%"REG_D"			\n\t"  // 512
-		"xorl %%esi, %%esi			\n\t"  // 0
+		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
 		"movss %3, %%xmm2			\n\t"  // bias
 		"shufps $0x00, %%xmm2, %%xmm2		\n\t"  // bias, bias, ...
 		".balign 16				\n\t"
 		"1:					\n\t"
-		"movlps (%0, %%esi), %%xmm0		\n\t" // ? ? ? A
+		"movlps (%0, %%"REG_S"), %%xmm0		\n\t" // ? ? ? A
-		"movlps 8(%0, %%esi), %%xmm1		\n\t" // ? ? ? C
+		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? ? C
-		"movhps -16(%0, %%edi), %%xmm1		\n\t" // D ? ? C
+		"movhps -16(%0, %%"REG_D"), %%xmm1	\n\t" // D ? ? C
-		"movhps -8(%0, %%edi), %%xmm0		\n\t" // B ? ? A
+		"movhps -8(%0, %%"REG_D"), %%xmm0	\n\t" // B ? ? A
 		"shufps $0xCC, %%xmm1, %%xmm0		\n\t" // D C B A
-		"mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
+		"mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
-		"addps (%2, %%esi), %%xmm0		\n\t"
+		"addps (%2, %%"REG_S"), %%xmm0		\n\t"
 		"addps %%xmm2, %%xmm0			\n\t"
-		"movaps %%xmm0, (%1, %%esi)		\n\t"
+		"movaps %%xmm0, (%1, %%"REG_S")		\n\t"
-		"addl $16, %%esi			\n\t"
+		"add $16, %%"REG_S"			\n\t"
-		"subl $16, %%edi			\n\t"
+		"sub $16, %%"REG_D"			\n\t"
-		"cmpl $512, %%esi			\n\t"
+		"cmp $512, %%"REG_S"			\n\t"
 		" jb 1b					\n\t"
 		:: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
-		: "%esi", "%edi"
+		: "%"REG_S, "%"REG_D
 	);
 	data_ptr+=128;
 //	window_ptr+=128;
 /* The trailing edge of the window goes into the delay line */
 delay_ptr = delay;
 	asm volatile(
-		"xorl %%edi, %%edi			\n\t"  // 0
+		"xor %%"REG_D", %%"REG_D"		\n\t"  // 0
-		"xorl %%esi, %%esi			\n\t"  // 0
+		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
 		".balign 16				\n\t"
 		"1:					\n\t"
-		"movlps (%0, %%esi), %%xmm0		\n\t" // ? ? ? A
+		"movlps (%0, %%"REG_S"), %%xmm0		\n\t" // ? ? ? A
-		"movlps 8(%0, %%esi), %%xmm1		\n\t" // ? ? ? C
+		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? ? C
-		"movhps -16(%0, %%edi), %%xmm1		\n\t" // D ? ? C
+		"movhps -16(%0, %%"REG_D"), %%xmm1	\n\t" // D ? ? C
-		"movhps -8(%0, %%edi), %%xmm0		\n\t" // B ? ? A
+		"movhps -8(%0, %%"REG_D"), %%xmm0	\n\t" // B ? ? A
 		"shufps $0xCC, %%xmm1, %%xmm0		\n\t" // D C B A
-		"mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
+		"mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
-		"movaps %%xmm0, (%1, %%esi)		\n\t"
+		"movaps %%xmm0, (%1, %%"REG_S")		\n\t"
-		"addl $16, %%esi			\n\t"
+		"add $16, %%"REG_S"			\n\t"
-		"subl $16, %%edi			\n\t"
+		"sub $16, %%"REG_D"			\n\t"
-		"cmpl $512, %%esi			\n\t"
+		"cmp $512, %%"REG_S"			\n\t"
 		" jb 1b					\n\t"
 		:: "r" (buf+64), "r" (delay_ptr)
-		: "%esi", "%edi"
+		: "%"REG_S, "%"REG_D
 	);
 	delay_ptr+=128;
 //	window_ptr-=128;
 	asm volatile(
-		"movl $1024, %%edi			\n\t"  // 1024
+		"mov $1024, %%"REG_D"			\n\t"  // 1024
-		"xorl %%esi, %%esi			\n\t"  // 0
+		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
 		".balign 16				\n\t"
 		"1:					\n\t"
-		"movlps (%0, %%esi), %%xmm0		\n\t" // ? ? A ?
+		"movlps (%0, %%"REG_S"), %%xmm0	\n\t" // ? ? A ?
-		"movlps 8(%0, %%esi), %%xmm1		\n\t" // ? ? C ?
+		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? C ?
-		"movhps -16(%0, %%edi), %%xmm1		\n\t" // ? D C ?
+		"movhps -16(%0, %%"REG_D"), %%xmm1	\n\t" // ? D C ?
-		"movhps -8(%0, %%edi), %%xmm0		\n\t" // ? B A ?
+		"movhps -8(%0, %%"REG_D"), %%xmm0	\n\t" // ? B A ?
 		"shufps $0x99, %%xmm1, %%xmm0		\n\t" // D C B A
-		"mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
+		"mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
-		"movaps %%xmm0, (%1, %%esi)		\n\t"
+		"movaps %%xmm0, (%1, %%"REG_S")		\n\t"
-		"addl $16, %%esi			\n\t"
+		"add $16, %%"REG_S"			\n\t"
-		"subl $16, %%edi			\n\t"
+		"sub $16, %%"REG_D"			\n\t"
-		"cmpl $512, %%esi			\n\t"
+		"cmp $512, %%"REG_S"			\n\t"
 		" jb 1b					\n\t"
 		:: "r" (buf), "r" (delay_ptr)
-		: "%esi", "%edi"
+		: "%"REG_S, "%"REG_D
 	);
 }
-#endif //arch_x86
+#endif // ARCH_X86 || ARCH_X86_64
 void
 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
 {
 int i,k;
 	/* Twiddle factors to turn IFFT into IMDCT */
 	for (i = 0; i < 128; i++) {
 	    xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
 	    xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
 	}
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	for (i = 0; i < 128; i++) {
 	    sseSinCos1c[2*i+0]= xcos1[i];
 	    sseSinCos1c[2*i+1]= -xcos1[i];
 	    sseSinCos1d[2*i+0]= xsin1[i];
 	    sseSinCos1d[2*i+1]= xsin1[i];
 	    for (k = 0; k < j; k++) {
 		w[i][k].real = cos (-M_PI * k / j);
 		w[i][k].imag = sin (-M_PI * k / j);
 	    }
 	}
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	for (i = 1; i < 7; i++) {
 	    j = 1 << i;
 	    for (k = 0; k < j; k+=2) {
 	    	sseW[i][4*k + 0] = w[i][k+0].real;
 		sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
 		sseWindow[256 + 2*i+1]=  imdct_window[254 - 2*i+0];
 		sseWindow[384 + 2*i+0]=  imdct_window[126 - 2*i+1];
 		sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
 	}
-#endif // arch_x86
+#endif // ARCH_X86 || ARCH_X86_64
 	imdct_512 = imdct_do_512;
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	if(mm_accel & MM_ACCEL_X86_SSE)
 	{
 	  fprintf (stderr, "Using SSE optimized IMDCT transform\n");
 	  imdct_512 = imdct_do_512_sse;
 	}
 	{
 	  fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
 	  imdct_512 = imdct_do_512_3dnow;
 	}
 	else
-#endif // arch_x86
+#endif // ARCH_X86 || ARCH_X86_64
 #ifdef HAVE_ALTIVEC
 if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
 	{
 	  fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
 imdct_512 = imdct_do_512_altivec;

Mercurial > mplayer.hg

comparison liba52/imdct.c @ 16173:d6219ce521e9