changeset 864:f0a3b5bf2e7a

This version is slightly better than the previous one, though it is hard to decide.
author atmosfear
date Thu, 24 May 2001 20:14:28 +0000
parents 290801346d62
children 83919c1b9924
files mp3lib/decode_sse.s
diffstat 1 files changed, 57 insertions(+), 60 deletions(-) [+]
line wrap: on
line diff
--- a/mp3lib/decode_sse.s	Thu May 24 20:13:28 2001 +0000
+++ b/mp3lib/decode_sse.s	Thu May 24 20:14:28 2001 +0000
@@ -90,8 +90,7 @@
 .L74:
         pushl %eax
         call  dct64
-        addl  $12,%esp
-	emms
+	addl  $12, %esp
         movl  16(%esp),%edx
         leal  0(,%edx,4),%edx
         movl  $decwin+64,%eax
@@ -100,24 +99,26 @@
         movl  $16,%ebp
 
 .L55:
-	movups	(%ecx), %xmm4
-	mulps	(%ebx), %xmm4
-	movups	16(%ecx), %xmm0
-	mulps	16(%ebx), %xmm0
-	addps	%xmm0, %xmm4
-	movups	32(%ecx), %xmm1
-	mulps	32(%ebx), %xmm1
-	addps	%xmm1, %xmm4
-	movups	48(%ecx), %xmm0
-	mulps	48(%ebx), %xmm0
-	addps	%xmm0, %xmm4
-	movhlps	%xmm4, %xmm1 /* fake of pfacc */
-	addps	%xmm1, %xmm4
-	shufps	$0x55, %xmm4, %xmm1 /* fake of pfnacc. 1|1|1|1 */
-	subps	%xmm1, %xmm4
-	cvtss2si %xmm4, %eax
+	movups (%ecx), %xmm0
+	mulps  (%ebx), %xmm0
+	movups 16(%ecx), %xmm1
+	mulps  16(%ebx), %xmm1
+	addps  %xmm1, %xmm0
+	movups 32(%ecx), %xmm1
+	mulps  32(%ebx), %xmm1
+	addps  %xmm1, %xmm0
+	movups 48(%ecx), %xmm1
+	mulps  48(%ebx), %xmm1
+	addps  %xmm1, %xmm0
+	movhlps %xmm0, %xmm1
+	addps   %xmm1, %xmm0
+	movaps  %xmm0, %xmm1
+	shufps  $0x55, %xmm1, %xmm1 /* fake of pfnacc. 1|1|1|1 */
+	subss	%xmm1, %xmm0
+	cvttss2si %xmm0, %eax
 
-        movw	%ax,(%esi)
+/        sar   $16,%eax
+        movw  %ax,(%esi)
 
         addl  $64,%ebx
         subl  $-128,%ecx
@@ -127,23 +128,24 @@
 
 / --- end of  loop 1 ---
 
-	movups	(%ecx), %xmm4
-	mulps	(%ebx), %xmm4
-	movups	16(%ecx), %xmm0
-	mulps	16(%ebx), %xmm0
-	addps	%xmm0, %xmm4
-	movups	32(%ecx), %xmm1
-	mulps	32(%ebx), %xmm1
-	addps	%xmm1, %xmm4
-	movups	48(%ecx), %xmm0
-	mulps	48(%ebx), %xmm0
-	addps	%xmm0, %xmm4
-	movhlps	%xmm4, %xmm1 /* 3|2|3|2 */
-	addps	%xmm1, %xmm4
-	cvtss2si %xmm4, %eax
+	movups (%ecx), %xmm0
+	mulps  (%ebx), %xmm0
+	movups 16(%ecx), %xmm1
+	mulps  16(%ebx), %xmm1
+	addps  %xmm1, %xmm0
+	movups 32(%ecx), %xmm1
+	mulps  32(%ebx), %xmm1
+	addps  %xmm1, %xmm0
+	movups 48(%ecx), %xmm1
+	mulps  48(%ebx), %xmm1
+	addps  %xmm1, %xmm0
+	movhlps %xmm0, %xmm1	
+	addss	%xmm1, %xmm0
+	cvttss2si %xmm0, %eax
 
+/        sar   $16,%eax
 
-        movw	%ax,(%esi)
+        movw  %ax,(%esi)
 
         addl  $-64,%ebx
         addl  $4,%esi
@@ -151,32 +153,29 @@
         movl  $15,%ebp
 
 .L68:
-	xorps  %xmm0, %xmm0
-
-        movups (%ecx),%xmm2
-        mulps  (%ebx),%xmm2
-        subps %xmm2,%xmm0
-
-        movups 16(%ecx),%xmm2
-        mulps  16(%ebx),%xmm2
-        subps  %xmm2,%xmm0
+	xorps %xmm0, %xmm0
+	movups (%ecx), %xmm1
+	mulps  (%ebx), %xmm1
+	subps  %xmm1, %xmm0
+	movups 16(%ecx), %xmm1
+	mulps  16(%ebx), %xmm1
+	subps  %xmm1, %xmm0
+	movups 32(%ecx), %xmm1
+	mulps  32(%ebx), %xmm1
+	subps  %xmm1, %xmm0
+	movups 48(%ecx), %xmm1
+	mulps  48(%ebx), %xmm1
+	subps  %xmm1, %xmm0
+	movhlps %xmm0, %xmm1
+	subps	%xmm1, %xmm0
+	movaps	%xmm0, %xmm1
+	shufps $0x55, %xmm1, %xmm1 /* fake of pfacc 1|1|1|1 */
+	addss  %xmm1, %xmm0
+	cvttss2si %xmm0, %eax
 
-        movups 32(%ecx),%xmm2
-        mulps  32(%ebx),%xmm2
-        subps  %xmm2,%xmm0
-
-        movups 48(%ecx),%xmm2
-        mulps  48(%ebx),%xmm2
-        subps  %xmm2,%xmm0
+/        sar   $16,%eax
 
-	movhlps	%xmm0, %xmm1 /* 3|2|3|2 */
-	addps	%xmm1, %xmm0
-	shufps	$0x55, %xmm0, %xmm1 /* fake of pfacc 1|1|1|1 */
-	addps	%xmm1, %xmm0
-	cvtss2si %xmm0, %eax
-
-
-        movw	%ax,(%esi)
+        movw  %ax,(%esi)
 
         addl  $-64,%ebx
         subl  $-128,%ecx
@@ -186,8 +185,6 @@
 
 / --- end of loop 2
 
-        emms
-
         movl  %edi,%eax
         popl  %ebx
         popl  %esi