changeset 920:deeaad5bf1d7

libac3 is now fully 3DNow! optimized
author nickols_k
date Thu, 31 May 2001 17:58:56 +0000
parents da072473937a
children 6b02113cd41a
files liba52/srfftp.h liba52/srfftp_3dnow.h
diffstat 2 files changed, 67 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/liba52/srfftp.h	Thu May 31 16:35:36 2001 +0000
+++ b/liba52/srfftp.h	Thu May 31 17:58:56 2001 +0000
@@ -32,19 +32,19 @@
 #ifndef SRFFTP_H__
 #define SRFFTP_H__
 
-static complex_t delta16[4] = 
+static complex_t delta16[4] __attribute__((aligned(16))) = /* delta16[k] = {cos(2*pi*k/16), -sin(2*pi*k/16)} for k = 0..3; storage is 16-byte aligned */
  { {1.00000000000000,  0.00000000000000},
    {0.92387953251129, -0.38268343236509},
    {0.70710678118655, -0.70710678118655},
    {0.38268343236509, -0.92387953251129}};
 
-static complex_t delta16_3[4] = 
+static complex_t delta16_3[4] __attribute__((aligned(16))) = /* delta16_3[k] = {cos(2*pi*3*k/16), -sin(2*pi*3*k/16)} for k = 0..3; storage is 16-byte aligned */
  { {1.00000000000000,  0.00000000000000},
    {0.38268343236509, -0.92387953251129},
    {-0.70710678118655, -0.70710678118655},
    {-0.92387953251129, 0.38268343236509}};
 
-static complex_t delta32[8] = 
+static complex_t delta32[8] __attribute__((aligned(16))) = 
  { {1.00000000000000,  0.00000000000000},
    {0.98078528040323, -0.19509032201613},
    {0.92387953251129, -0.38268343236509},
@@ -54,7 +54,7 @@
    {0.38268343236509, -0.92387953251129},
    {0.19509032201613, -0.98078528040323}};
 
-static complex_t delta32_3[8] = 
+static complex_t delta32_3[8] __attribute__((aligned(16))) = 
  { {1.00000000000000,  0.00000000000000},
    {0.83146961230255, -0.55557023301960},
    {0.38268343236509, -0.92387953251129},
@@ -64,7 +64,7 @@
    {-0.92387953251129, 0.38268343236509},
    {-0.55557023301960, 0.83146961230255}};
 
-static complex_t delta64[16] = 
+static complex_t delta64[16] __attribute__((aligned(16))) = 
  { {1.00000000000000,  0.00000000000000},
    {0.99518472667220, -0.09801714032956},
    {0.98078528040323, -0.19509032201613},
@@ -82,7 +82,7 @@
    {0.19509032201613, -0.98078528040323},
    {0.09801714032956, -0.99518472667220}};
 
-static complex_t delta64_3[16] = 
+static complex_t delta64_3[16] __attribute__((aligned(16))) = 
  { {1.00000000000000,  0.00000000000000},
    {0.95694033573221, -0.29028467725446},
    {0.83146961230255, -0.55557023301960},
@@ -100,7 +100,7 @@
    {-0.55557023301960, 0.83146961230255},
    {-0.29028467725446, 0.95694033573221}};
 
-static complex_t delta128[32] = 
+static complex_t delta128[32] __attribute__((aligned(16))) = 
  { {1.00000000000000,  0.00000000000000},
    {0.99879545620517, -0.04906767432742},
    {0.99518472667220, -0.09801714032956},
@@ -134,7 +134,7 @@
    {0.09801714032956, -0.99518472667220},
    {0.04906767432742, -0.99879545620517}};
 
-static complex_t delta128_3[32] = 
+static complex_t delta128_3[32] __attribute__((aligned(16))) = 
  { {1.00000000000000,  0.00000000000000},
    {0.98917650996478, -0.14673047445536},
    {0.95694033573221, -0.29028467725446},
--- a/liba52/srfftp_3dnow.h	Thu May 31 16:35:36 2001 +0000
+++ b/liba52/srfftp_3dnow.h	Thu May 31 17:58:56 2001 +0000
@@ -68,11 +68,18 @@
 	"psrlq $32, "##mm_base"\n\t"\
 	"punpckldq "##mm_hlp","##mm_base"\n\t"
 #endif
+#ifdef HAVE_3DNOWEX /* PFNACC_MM(mm_base,mm_hlp): asm text that leaves (low dword - high dword) of mm_base in its low dword */
+#define PFNACC_MM(mm_base,mm_hlp)	"pfnacc	"##mm_base","##mm_base"\n\t" /* single instruction; mm_hlp is not used in this variant */
+#else /* NOTE(review): pasting a string literal with ## does not form a valid preprocessing token; adjacent string literals already concatenate without it */
+#define PFNACC_MM(mm_base,mm_hlp)\
+	"movq	"##mm_base","##mm_hlp"\n\t"/* mm_hlp = mm_base */\
+	"psrlq	$32,"##mm_hlp"\n\t"/* shift the high dword of mm_hlp down into its low dword */\
+	"punpckldq "##mm_hlp","##mm_hlp"\n\t"/* mm_hlp = high | high (of the original mm_base) */\
+	"pfsub	"##mm_hlp","##mm_base"\n\t"/* mm_base = low-high | high-high; NOTE(review): only the low dword matches pfnacc r,r - confirm callers ignore the high dword */
+#endif
 
 #define TRANSZERO_3DNOW(A0,A4,A8,A12) \
 { \
-    __asm__ __volatile__("femms":::"memory");\
-    TRANS_FILL_MM6_MM7_3DNOW()\
     __asm__ __volatile__(\
 	"movq	%4, %%mm0\n\t" /* mm0 = wTB[0]*/\
 	"movq	%5, %%mm1\n\t" /* mm1 = wTB[k*2]*/ \
@@ -98,13 +105,10 @@
 	:"=m"(A0), "=m"(A8), "=m"(A4), "=m"(A12)\
 	:"m"(wTB[0]), "m"(wTB[k*2]), "0"(A0), "2"(A4)\
 	:"memory");\
-    __asm__ __volatile__("femms":::"memory");\
 }
 
 #define TRANSHALF_16_3DNOW(A2,A6,A10,A14)\
 {\
-    __asm__ __volatile__("femms":::"memory");\
-    TRANS_FILL_MM6_MM7_3DNOW()\
     __asm__ __volatile__(\
 	"movq	%4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\
 	"movq	%%mm0, %%mm1\n\t"\
@@ -142,7 +146,56 @@
 	:"=m"(A2), "=m"(A10), "=m"(A6), "=m"(A14)\
 	:"m"(wTB[2]), "m"(wTB[6]), "0"(A2), "2"(A6), "m"(HSQRT2_3DNOW)\
 	:"memory");\
-    __asm__ __volatile__("femms":::"memory");\
+}
+
+#define TRANS_3DNOW(A1,A5,A9,A13,WT,WB,D,D3)\
+{ /* Two asm statements: the first combines WT,D and WB,D3 into mm4/mm5, the second adds/subtracts those into A1/A9 and A5/A13. Register comments read "low dword | high dword". */ \
+    __asm__ __volatile__(/* NOTE(review): mm6/mm7 are read but never written in this macro - presumably preloaded by the caller (TRANS_FILL_MM6_MM7_3DNOW?); confirm */\
+	"movq	%1,	%%mm4\n\t"/* mm4 = D */\
+	"movq	%%mm4,	%%mm5\n\t"/* mm5 = D */\
+	"punpckldq %%mm4, %%mm4\n\t"/*mm4 = D.re | D.re */\
+	"punpckhdq %%mm5, %%mm5\n\t"/*mm5 = D.im | D.im */\
+	"movq	%0,	%%mm0\n\t"/* mm0 = WT */\
+	"pfmul	%%mm0,	%%mm4\n\t"/* mm4 = WT.re*D.re | WT.im*D.re */\
+	"pfmul	%%mm0,	%%mm5\n\t"/* mm5 = WT.re*D.im | WT.im*D.im */\
+	PSWAP_MM("%%mm5","%%mm3")/* presumably swaps the two dwords of mm5, mm3 as scratch - PSWAP_MM is defined above this macro; confirm */\
+	"pfmul	%%mm7,	%%mm5\n\t"/* mm5 *= mm7 (lane-wise) */\
+	"pfadd	%%mm5,	%%mm4\n\t"/* mm4 = u*/\
+	"movq	%3,	%%mm1\n\t"/* mm1 = D3 */\
+	"movq	%2,	%%mm0\n\t"/* mm0 = WB */\
+	PSWAP_MM("%%mm1","%%mm3")/* presumably mm1 = D3.im | D3.re - confirm against PSWAP_MM */\
+	"movq	%%mm0,	%%mm2\n\t"/* mm2 = WB */\
+	"pfmul	%%mm1,	%%mm0\n\t"/* mm0 = a*/\
+	"pfmul	%3,	%%mm2\n\t"/* mm2 = v (WB.re*D3.re | WB.im*D3.im) */\
+	PFNACC_MM("%%mm2","%%mm3")/* low dword of mm2 = low - high */\
+	"pfacc	%%mm0,	%%mm0\n\t"/* both dwords of mm0 = low + high */\
+	"punpckldq %%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\
+	"movq	%%mm2,	%%mm3\n\t"/* mm3 = mm2 */\
+	"pfmul	%%mm7,	%%mm3\n\t"/* mm3 *= mm7 (lane-wise) */\
+	"movq	%%mm4,	%%mm5\n\t"/* mm5 = u */\
+	"pfmul	%%mm6,	%%mm5\n\t"/* mm5 *= mm6 (lane-wise) */\
+	"pfadd	%%mm3,	%%mm5\n\t"/* mm5 += mm3 */\
+	PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\
+	"pfadd	%%mm2,	%%mm4\n\t"/* mm4 += mm2; mm4 and mm5 are consumed by the next asm statement */\
+	:/* no outputs. NOTE(review): the results stay in mm4/mm5, which are not declared as outputs or clobbers */\
+	:"m"(WT), "m"(D), "m"(WB), "m"(D3)/* %0..%3 */\
+	:"memory");\
+    __asm__ __volatile__(/* relies on mm4/mm5 still holding the values left by the asm statement above */\
+	"movq	%4, %%mm0\n\t"/* a1 = A1*/\
+	"movq	%%mm0, %%mm1\n\t"\
+	"pfadd	%%mm4, %%mm0\n\t"/*A1 = a1 + u*/\
+	"pfsub	%%mm4, %%mm1\n\t"/*A9 = a1 - u*/\
+	"movq	%%mm0, %0\n\t"/* store A1 */\
+	"movq	%%mm1, %1\n\t"/* store A9 */\
+	"movq	%5, %%mm2\n\t"/* a1 = A5*/\
+	"movq	%%mm2, %%mm3\n\t"\
+	"pfsub	%%mm5, %%mm2\n\t"/*A5 = a1 - v*/\
+	"pfadd	%%mm5, %%mm3\n\t"/*A13 = a1 + v*/\
+	"movq	%%mm2, %2\n\t"/* store A5 */\
+	"movq	%%mm3, %3"/* store A13 */\
+	:"=m"(A1), "=m"(A9), "=m"(A5), "=m"(A13)/* %0..%3 */\
+	:"0"(A1), "2"(A5), "m"(u), "m"(v)/* %4 = A1, %5 = A5; NOTE(review): %6 (u) and %7 (v) are never referenced by the template, yet u and v must exist at the expansion site */\
+	:"memory");\
 }
 
 #endif