changeset 1173:3c53cbf53e7e

Better 3dnow! optimization
author nickols_k
date Wed, 20 Jun 2001 07:54:19 +0000
parents 290353337b44
children c594d213d429
files liba52/srfftp_3dnow.h mp3lib/dct64_k7.s
diffstat 2 files changed, 37 insertions(+), 48 deletions(-)
--- a/liba52/srfftp_3dnow.h	Tue Jun 19 23:20:59 2001 +0000
+++ b/liba52/srfftp_3dnow.h	Wed Jun 20 07:54:19 2001 +0000
@@ -33,32 +33,18 @@
 #ifndef SRFFTP_3DNOW_H__
 #define SRFFTP_3DNOW_H__
 
-static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
+typedef struct
+{
+  unsigned long val[2];
+}i_cmplx_t;
 
-#ifdef HAVE_3DNOWEX
 #define TRANS_FILL_MM6_MM7_3DNOW()\
     __asm__ __volatile__(\
-	"movl	$-1, %%eax\n\t"\
-	"movd	%%eax, %%mm7\n\t"\
-	"negl	%%eax\n\t"\
-	"movd	%%eax, %%mm6\n\t"\
-	"punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\
-	"pi2fd	%%mm7, %%mm7\n\t"\
-	"pswapd	%%mm7, %%mm6\n\t"/* 1.0 | -1.0 */\
-	:::"eax","memory");
-#else
-#define TRANS_FILL_MM6_MM7_3DNOW()\
-    __asm__ __volatile__(\
-	"movl	$-1, %%eax\n\t"\
-	"movd	%%eax, %%mm7\n\t"\
-	"negl	%%eax\n\t"\
-	"movd	%%eax, %%mm6\n\t"\
-	"punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\
-	"punpckldq %%mm7, %%mm6\n\t" /* 1.0 | -1.0 */\
-	"pi2fd	%%mm7, %%mm7\n\t"\
-	"pi2fd	%%mm6, %%mm6\n\t"\
-	:::"eax","memory");
-#endif
+	"movq	%1, %%mm7\n\t"\
+	"movq	%0, %%mm6\n\t"\
+	::"m"(x_plus_minus_3dnow),\
+	"m"(x_minus_plus_3dnow)\
+	:"memory");
 
 #ifdef HAVE_3DNOWEX
 #define PSWAP_MM(mm_base,mm_hlp) "pswapd	"mm_base","mm_base"\n\t"
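
The rewritten TRANS_FILL_MM6_MM7_3DNOW() above loads two precomputed sign masks instead of synthesizing +/-1.0 with movl/punpckldq/pi2fd at runtime; the pfmul-to-pxor swaps in the following hunks rely on those masks. A minimal C sketch of the idea, assuming x_plus_minus_3dnow and x_minus_plus_3dnow are laid out like the plus_minus_3dnow constant added to dct64_k7.s below (their definitions fall outside this hunk):

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint32_t val[2]; } i_cmplx_t;  /* as typedef'd above */

    /* assumed values, mirroring plus_minus_3dnow in dct64_k7.s:
       low dword = mask for the first packed float, high dword = second */
    static const i_cmplx_t x_plus_minus_3dnow = {{ 0x00000000UL, 0x80000000UL }};
    static const i_cmplx_t x_minus_plus_3dnow = {{ 0x80000000UL, 0x00000000UL }};

    /* XOR of the IEEE-754 sign bit: what one pxor does per packed element */
    static float sign_xor(float x, uint32_t mask)
    {
        union { float f; uint32_t u; } v = { x };
        v.u ^= mask;
        return v.f;
    }

    int main(void)
    {
        /* pxor with mm6 = x_plus_minus_3dnow: (re, im) -> (re, -im) */
        printf("%g %g\n", sign_xor(0.5f, x_plus_minus_3dnow.val[0]),
                          sign_xor(2.0f, x_plus_minus_3dnow.val[1]));
        /* pxor with mm7 = x_minus_plus_3dnow: (re, im) -> (-re, im) */
        printf("%g %g\n", sign_xor(0.5f, x_minus_plus_3dnow.val[0]),
                          sign_xor(2.0f, x_minus_plus_3dnow.val[1]));
        return 0;
    }
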
@@ -85,8 +71,8 @@
 	"movq	%5, %%mm1\n\t" /* mm1 = wTB[k*2]*/ \
 	"movq	%%mm0, %%mm5\n\t"/*u.re = wTB[0].re + wTB[k*2].re;*/\
 	"pfadd	%%mm1, %%mm5\n\t"/*u.im = wTB[0].im + wTB[k*2].im; mm5 = u*/\
-	"pfmul  %%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\
-	"pfmul	%%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\
+	"pxor	%%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\
+	"pxor	%%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\
 	"pfadd	%%mm1, %%mm0\n\t"/*v.im = wTB[0].re - wTB[k*2].re;*/\
 	"movq	%%mm0, %%mm4\n\t"/*v.re =-wTB[0].im + wTB[k*2].im;*/\
 	PSWAP_MM("%%mm4","%%mm2")/* mm4 = v*/\
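
Read off its comments, the fragment above is a complex butterfly with one sign-flipped half-difference; the same mask trick recurs in the TRANSHALF_16 and TRANS_3DNOW macros in the following hunks. A C model of what it computes for a = wTB[0] and b = wTB[k*2] (a sketch of the arithmetic, not of the macro itself):

    #include <stdio.h>

    typedef struct { float re, im; } complex_t;

    /* u = a + b; v comes from the pxor (sign-flip) / pfadd / pswapd sequence:
       mm0 = (a.re, -a.im), mm1 = (-b.re, b.im), add, then swap the halves */
    static void trans_butterfly(complex_t a, complex_t b,
                                complex_t *u, complex_t *v)
    {
        u->re = a.re + b.re;
        u->im = a.im + b.im;
        v->re = b.im - a.im;   /* -wTB[0].im + wTB[k*2].im */
        v->im = a.re - b.re;   /*  wTB[0].re - wTB[k*2].re */
    }

    int main(void)
    {
        complex_t a = { 1.0f, 2.0f }, b = { 3.0f, 4.0f }, u, v;
        trans_butterfly(a, b, &u, &v);
        printf("u=(%g,%g) v=(%g,%g)\n", u.re, u.im, v.re, v.im);
        return 0;
    }
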
@@ -112,18 +98,18 @@
     __asm__ __volatile__(\
 	"movq	%4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\
 	"movq	%%mm0, %%mm1\n\t"\
-	"pfmul	%%mm7, %%mm1\n\t"\
+	"pxor	%%mm7, %%mm1\n\t"\
 	"pfacc	%%mm1, %%mm0\n\t"/*u.im = wTB[2].im - wTB[2].re; mm0 = u*/\
 	"movq	%5, %%mm1\n\t"  /*a.re = wTB[6].im - wTB[6].re; */\
 	"movq	%%mm1, %%mm2\n\t"\
-	"pfmul	%%mm7, %%mm1\n\t"\
+	"pxor	%%mm7, %%mm1\n\t"\
 	"pfacc	%%mm2, %%mm1\n\t"/*a.im = wTB[6].im + wTB[6].re;  mm1 = a*/\
 	"movq	%%mm1, %%mm2\n\t"\
-	"pfmul	%%mm7, %%mm2\n\t"/*v.im = u.re - a.re;*/\
+	"pxor	%%mm7, %%mm2\n\t"/*v.im = u.re - a.re;*/\
 	"movq	%%mm0, %%mm3\n\t"/*v.re = u.im + a.im;*/\
 	"pfadd	%%mm2, %%mm3\n\t"\
 	PSWAP_MM("%%mm3","%%mm2")/*mm3 = v*/\
-	"pfmul	%%mm6, %%mm1\n\t"/*u.re = u.re + a.re;*/\
+	"pxor	%%mm6, %%mm1\n\t"/*u.re = u.re + a.re;*/\
 	"pfadd	%%mm1, %%mm0\n\t"/*u.im = u.im - a.im; mm0 = u*/\
 	"movq	%8, %%mm2\n\t"\
 	"pfmul	%%mm2, %%mm3\n\t" /* v *= HSQRT2_3DNOW; */\
@@ -133,9 +119,9 @@
 	"movq	%%mm1, %%mm2\n\t"\
 	"movq	%%mm3, %%mm4\n\t"\
 	"pfadd	%%mm0, %%mm1\n\t" /*A2 = a1 + u;*/\
-	"pfmul	%%mm6, %%mm4\n\t"/*A6.re  = a1.re + v.re;*/\
+	"pxor	%%mm6, %%mm4\n\t"/*A6.re  = a1.re + v.re;*/\
 	"pfsub	%%mm0, %%mm2\n\t" /*A2 = a1 - u;*/\
-	"pfmul	%%mm7, %%mm3\n\t"/*A14.re = a1.re - v.re;*/\
+	"pxor	%%mm7, %%mm3\n\t"/*A14.re = a1.re - v.re;*/\
 	"movq	%%mm1, %0\n\t"\
 	"movq	%%mm2, %1\n\t"\
 	"movq	%%mm5, %%mm2\n\t"\
@@ -159,7 +145,7 @@
 	"pfmul	%%mm0,	%%mm4\n\t"/* mm4 =u.re | u.im */\
 	"pfmul	%%mm0,	%%mm5\n\t"/* mm5 = a.re | a.im */\
 	PSWAP_MM("%%mm5","%%mm3")\
-	"pfmul	%%mm7,	%%mm5\n\t"\
+	"pxor	%%mm7,	%%mm5\n\t"\
 	"pfadd	%%mm5,	%%mm4\n\t"/* mm4 = u*/\
 	"movq	%3,	%%mm1\n\t"\
 	"movq	%2,	%%mm0\n\t"\
@@ -171,9 +157,9 @@
 	"pfacc	%%mm0,	%%mm0\n\t"\
 	"movq	%%mm4,	%%mm5\n\t"\
 	"punpckldq %%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\
-	"pfmul	%%mm6,	%%mm5\n\t"\
+	"pxor	%%mm6,	%%mm5\n\t"\
 	"movq	%%mm2,	%%mm3\n\t"\
-	"pfmul	%%mm7,	%%mm3\n\t"\
+	"pxor	%%mm7,	%%mm3\n\t"\
 	"pfadd	%%mm3,	%%mm5\n\t"\
 	PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\
 	"pfadd	%%mm2,	%%mm4\n\t"\
--- a/mp3lib/dct64_k7.s	Tue Jun 19 23:20:59 2001 +0000
+++ b/mp3lib/dct64_k7.s	Wed Jun 20 07:54:19 2001 +0000
@@ -9,6 +9,9 @@
 ///    (using memory reference as operand of instructions)
 ///  - Phase 6 is rewritten with mixing of cpu and mmx opcodes
 ///  - changed the function name to support automatic 3DNowEx! detection
+///  - negation of a 3DNow! reg was replaced with PXOR 0x80000000, MMi
+///    instead of PFMUL, as suggested by the Athlon manual (two
+///    back-to-back PFMULs cannot be paired, but PXOR can be).
 ///
 /// note: because K7 processors are aggressive out-of-order three-way
 ///       superscalar ones, instruction order is not significant for them.
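
The claim in the last bullet is easy to verify on the host: for finite IEEE-754 singles, multiplying by -1.0 and XORing the sign bit produce bit-identical results, so a PFMUL by a +/-1.0 constant can become a PXOR with 0x80000000 in the corresponding lane. A self-contained C check (host IEEE-754 arithmetic assumed; the pairing benefit itself is the Athlon manual's argument and is not visible from C):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t bits(float x)
    {
        union { float f; uint32_t u; } v = { x };
        return v.u;
    }

    int main(void)
    {
        const float xs[] = { 0.0f, -0.0f, 1.0f, -2.5f, 3.4e38f };
        unsigned i;
        for (i = 0; i < sizeof xs / sizeof xs[0]; i++) {
            union { float f; uint32_t u; } v = { xs[i] };
            v.u ^= 0x80000000u;                       /* pxor with the sign mask */
            assert(bits(v.f) == bits(-1.0f * xs[i])); /* same bits as mul by -1 */
        }
        return 0;
    }
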
@@ -21,6 +24,11 @@
 /// this program. Use it at your own risk.
 ///
 
+.data
+        .align 8
+plus_minus_3dnow: .long 0x00000000, 0x80000000
+
+.text
         .globl dct64_3dnowex
         .type    dct64_3dnowex,@function
 
@@ -412,13 +420,8 @@
         movq   %mm5, 120(%esi)
 
         // 5
-        movl $-1,%eax
-        movd %eax,%mm1
+	movq plus_minus_3dnow, %mm0 /* mm0 = 0x00000000|0x80000000: pxor with it acts as mul by 1.0|-1.0 */
         movl $1,%eax
-        movd %eax,%mm0
-        / L | H
-        punpckldq %mm1,%mm0
-        pi2fd %mm0,%mm0       /* mm0 = 1.0 | -1.0 */
         movd %eax,%mm1
         pi2fd %mm1,%mm1
         movl pnts+16,%eax
@@ -433,7 +436,7 @@
         movq 8(%esi),%mm4     /* mm4 = tmp2[2] | tmp2[3]*/
 	pfpnacc %mm4, %mm4
 	pswapd  %mm4, %mm4    /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/
-        pfmul %mm0,%mm4       /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/
+        pxor  %mm0,%mm4       /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/
         pfmul %mm1,%mm4       /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/
         movq %mm4,%mm5
         psrlq $32,%mm5        /* mm5 = (tmp2[3]-tmp2[2])*cos0 */
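
The same five-instruction step repeats below for every pair; transcribed from the comments into C (tmp2 and cos0 as named there, a sketch of the arithmetic only):

    #include <stdio.h>

    /* one phase-5 pair: pfpnacc+pswapd give (x+y, x-y), pxor with
       plus_minus_3dnow flips the difference, pfmul scales it by cos0 */
    static void phase5_step(float x, float y, float cos0, float out[2])
    {
        out[0] = x + y;           /* tmp2[2] + tmp2[3] in the low element */
        out[1] = (y - x) * cos0;  /* (tmp2[3] - tmp2[2]) * cos0 in the high one */
    }

    int main(void)
    {
        float out[2];
        phase5_step(1.0f, 2.0f, 0.5f, out);  /* example values only */
        printf("%g %g\n", out[0], out[1]);
        return 0;
    }
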
@@ -449,7 +452,7 @@
 	pfpnacc %mm4, %mm4
 	pswapd  %mm4, %mm4
 
-        pfmul %mm0,%mm4
+        pxor  %mm0,%mm4
         pfmul %mm1,%mm4
         movq %mm4,%mm5
         psrlq $32,%mm5
@@ -470,7 +473,7 @@
         movq 40(%esi),%mm4
 	pfpnacc %mm4, %mm4
 	pswapd  %mm4, %mm4
-        pfmul %mm0,%mm4
+        pxor  %mm0,%mm4
         pfmul %mm1,%mm4
         movq %mm4,%mm5
         psrlq $32,%mm5
@@ -484,7 +487,7 @@
         movq 56(%esi),%mm4
 	pfpnacc %mm4, %mm4
 	pswapd  %mm4, %mm4
-        pfmul %mm0,%mm4
+        pxor  %mm0,%mm4
         pfmul %mm1,%mm4
         movq %mm4,%mm5
         psrlq $32,%mm5
@@ -504,7 +507,7 @@
         movq 72(%esi),%mm4
 	pfpnacc %mm4, %mm4
 	pswapd  %mm4, %mm4
-        pfmul %mm0,%mm4
+        pxor  %mm0,%mm4
         pfmul %mm1,%mm4
         movq %mm4,%mm5
         psrlq $32,%mm5
@@ -518,7 +521,7 @@
         movq 88(%esi),%mm4
 	pfpnacc %mm4, %mm4
 	pswapd  %mm4, %mm4
-        pfmul %mm0,%mm4
+        pxor  %mm0,%mm4
         pfmul %mm1,%mm4
         movq %mm4,%mm5
         psrlq $32,%mm5
@@ -538,7 +541,7 @@
         movq 104(%esi),%mm4
 	pfpnacc %mm4, %mm4
 	pswapd  %mm4, %mm4
-        pfmul %mm0,%mm4
+        pxor  %mm0,%mm4
         pfmul %mm1,%mm4
         movq %mm4,%mm5
         psrlq $32,%mm5
@@ -552,7 +555,7 @@
         movq 120(%esi),%mm4
 	pfpnacc %mm4, %mm4
 	pswapd  %mm4, %mm4
-        pfmul %mm0,%mm4
+        pxor  %mm0,%mm4
         pfmul %mm1,%mm4
         movq %mm4,%mm5
         psrlq $32,%mm5