changeset 3534:3483390a902b

sse opt
author michael
date Sun, 16 Dec 2001 21:39:10 +0000
parents 6e6a74d2d1ea
children 975672407ef6
files liba52/imdct.c
diffstat 1 files changed, 91 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/liba52/imdct.c	Sun Dec 16 21:23:36 2001 +0000
+++ b/liba52/imdct.c	Sun Dec 16 21:39:10 2001 +0000
@@ -78,6 +78,15 @@
 static float __attribute__((aligned(16))) sseSinCos1a[256];
 static float __attribute__((aligned(16))) sseSinCos1b[256];
 static float __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
+//static float __attribute__((aligned(16))) sseW0[4]; // no table for stage 0: sseW[0] is NULL below
+static float __attribute__((aligned(16))) sseW1[8];	// SSE copy of w[1][]: per pair (k,k+1) 8 floats {re,re,re',re',-im,im,-im',im'}
+static float __attribute__((aligned(16))) sseW2[16];	// same layout for w[2][] (4 floats per twiddle)
+static float __attribute__((aligned(16))) sseW3[32];	// same layout for w[3][]
+static float __attribute__((aligned(16))) sseW4[64];	// same layout for w[4][]
+static float __attribute__((aligned(16))) sseW5[128];	// same layout for w[5][]
+static float __attribute__((aligned(16))) sseW6[256];	// same layout for w[6][]
+static float __attribute__((aligned(16))) *sseW[7]=	// table pointers indexed by stage number i (1..6 are filled by the init code)
+	{NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};	// slot 0 is intentionally NULL
 #else
 static complex_t buf[128];
 #endif
@@ -300,25 +309,68 @@
 		:: "g" (buf), "r" (buf + 128)
 		: "%esi"
 	);
-	
-	m=2;
-	two_m = 4;
-
-	for(k = 0; k < two_m; k++) {
+/* C code for the next asm loop 
+	for(k = 0; k < 4; k++) {
 	    for(i = 0; i < 128; i += 8) {
 		p = k + i;
-		q = p + two_m;
+		q = p + 4;
 		tmp_a_r = buf[p].real;
 		tmp_a_i = buf[p].imag;
-		tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
-		tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
+		tmp_b_r = buf[q].real * w[2][k].real - buf[q].imag * w[2][k].imag;
+		tmp_b_i = buf[q].imag * w[2][k].real + buf[q].real * w[2][k].imag;
 		buf[p].real = tmp_a_r + tmp_b_r;
 		buf[p].imag =  tmp_a_i + tmp_b_i;
 		buf[q].real = tmp_a_r - tmp_b_r;
 		buf[q].imag =  tmp_a_i - tmp_b_i;
 	    }
 	}
+*/
+/*
+ Note sseW2+0 ={1, 1, sqrt(1/2), sqrt(1/2)}   (offsets are in bytes, as in the asm below)
+ Note sseW2+16={0, 0, sqrt(1/2),-sqrt(1/2)}
+ Note sseW2+32={0, 0,-sqrt(1/2),-sqrt(1/2)}
+ Note sseW2+48={1,-1, sqrt(1/2),-sqrt(1/2)}
+*/
+	asm volatile( // SSE form of the k<4 butterfly pass in the C reference comment above; 8 complex values (64 bytes) per iteration
+		"movaps sseW2, %%xmm6		\n\t" // xmm6 = sseW2 floats 0..3 (real parts for k=0,1), kept for the whole loop
+		"movaps 16+sseW2, %%xmm7	\n\t" // xmm7 = sseW2 floats 4..7 (-/+ imag parts for k=0,1), kept for the whole loop
+		"xorps %%xmm5, %%xmm5		\n\t" // clear xmm5 once: the movhps below only ever fills its high half
+		"xorps %%xmm2, %%xmm2		\n\t" // clear xmm2 once, for the same reason
+		"movl %0, %%esi			\n\t" // esi = buf
+		".balign 16			\n\t"
+		"1:				\n\t"
+		"movhps 40(%%esi), %%xmm2	\n\t" //?,?,r5,i5 (high half only; low half holds leftovers from the previous iteration)
+		"movaps 48(%%esi), %%xmm3	\n\t" //r6,i6,r7,i7
+		"movaps 32(%%esi), %%xmm4	\n\t" //r4,i4,r5,i5
+		"movhps 56(%%esi), %%xmm5	\n\t" //?,?,r7,i7 (high half only; low half holds leftovers from the previous iteration)
+		"shufps $0xB1, %%xmm2, %%xmm2	\n\t" //?,?,i5,r5 (swap real/imag within each pair)
+		"shufps $0xB1, %%xmm3, %%xmm3	\n\t" //i6,r6,i7,r7
+		"mulps %%xmm6, %%xmm4		\n\t" // xmm4 = (r4,i4,r5,i5) * sseW2 floats 0..3
+		"mulps 32+sseW2, %%xmm5		\n\t" // xmm5 *= sseW2 floats 8..11; the low two are 0.0, zeroing the leftover lanes (if finite)
+		"mulps %%xmm7, %%xmm2		\n\t" // xmm2 *= sseW2 floats 4..7; the low two are 0.0, zeroing the leftover lanes (if finite)
+		"mulps 48+sseW2, %%xmm3		\n\t" // xmm3 *= sseW2 floats 12..15
+		"movaps (%%esi), %%xmm0		\n\t" //r0,i0,r1,i1
+		"movaps 16(%%esi), %%xmm1	\n\t" //r2,i2,r3,i3
+		"addps %%xmm4, %%xmm2		\n\t" // xmm2 = tmp_b (real,imag) for elements 4,5
+		"addps %%xmm5, %%xmm3		\n\t" // xmm3 = tmp_b (real,imag) for elements 6,7
+		"movaps %%xmm2, %%xmm4		\n\t" // keep a copy of tmp_b(4,5) for the subtraction
+		"movaps %%xmm3, %%xmm5		\n\t" // keep a copy of tmp_b(6,7) for the subtraction
+		"addps %%xmm0, %%xmm2		\n\t" // xmm2 = tmp_a + tmp_b -> new elements 0,1
+		"addps %%xmm1, %%xmm3		\n\t" // xmm3 = tmp_a + tmp_b -> new elements 2,3
+		"subps %%xmm4, %%xmm0		\n\t" // xmm0 = tmp_a - tmp_b -> new elements 4,5
+		"subps %%xmm5, %%xmm1		\n\t" // xmm1 = tmp_a - tmp_b -> new elements 6,7
+		"movaps %%xmm2, (%%esi)		\n\t" // store elements 0,1
+		"movaps %%xmm3, 16(%%esi)	\n\t" // store elements 2,3
+		"movaps %%xmm0, 32(%%esi)	\n\t" // store elements 4,5
+		"movaps %%xmm1, 48(%%esi)	\n\t" // store elements 6,7
+		"addl $64, %%esi	\n\t" // advance to the next group of 8 complex values
+		"cmpl %1, %%esi		\n\t" // loop until esi reaches buf + 128
+		" jb 1b			\n\t"
+		:: "g" (buf), "r" (buf + 128) // %0 = start of buf, %1 = end pointer
+		: "%esi" // NOTE(review): the loop stores to buf and uses xmm0-xmm7, but neither "memory" nor the xmm registers are declared as clobbers - confirm
+	);
 
+    
     for (m=3; m < 7; m++) {
 	two_m = (1 << m);
 
@@ -572,6 +624,37 @@
 		w[i][k].imag = sin (-M_PI * k / j);
 	    }
 	}
+#ifdef HAVE_SSE
+	for (i = 1; i < 7; i++) {	// fill the SSE table sseW[i] for each stage i = 1..6 from w[i][]
+	    j = 1 << i;	// j = number of w[i][] entries converted for this stage
+	    for (k = 0; k < j; k+=2) {	// two twiddles (k and k+1) per step -> 8 floats starting at sseW[i][4*k]
+	    
+	    	sseW[i][4*k + 0] = w[i][k+0].real;	// first 4 floats: re(k), re(k), re(k+1), re(k+1)
+	    	sseW[i][4*k + 1] = w[i][k+0].real;
+	    	sseW[i][4*k + 2] = w[i][k+1].real;
+	    	sseW[i][4*k + 3] = w[i][k+1].real;
+
+	    	sseW[i][4*k + 4] = -w[i][k+0].imag;	// next 4 floats: -im(k), +im(k), -im(k+1), +im(k+1)
+	    	sseW[i][4*k + 5] = w[i][k+0].imag;
+	    	sseW[i][4*k + 6] = -w[i][k+1].imag;
+	    	sseW[i][4*k + 7] = w[i][k+1].imag;	    
+	    	
+	//the SSE code multiplies lanes holding stale (never loaded) data by these entries, so they must be exactly 0.0
+		if(k==0)	// imaginary part of w[i][0]
+		{
+//			sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
+			sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;	// NOTE(review): 0.0 * Inf/NaN is still NaN - confirm the stale lanes are always finite
+		}
+		
+		if(2*k == j)	// real part of w[i][j/2]; only reached for i >= 2 because k is always even
+		{
+			sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;	// force an exact zero instead of the computed value
+//			sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
+		}
+	    }
+	}
+#endif
+	
 	imdct_512 = imdct_do_512;
 	imdct_256 = imdct_do_256;
     }