changeset 23464:1b1fdac4a68c

Align output pointer so that we can use movaps instead of movups in dct64_sse; 1.5% faster decode.
author zuxy
date Wed, 06 Jun 2007 05:13:13 +0000
parents b4214e05bb3f
children a1a699833dcf
files mp3lib/dct64_sse.c mp3lib/layer1.c mp3lib/layer2.c mp3lib/layer3.c
diffstat 4 files changed, 9 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/mp3lib/dct64_sse.c	Tue Jun 05 23:30:37 2007 +0000
+++ b/mp3lib/dct64_sse.c	Wed Jun 06 05:13:13 2007 +0000
@@ -5,17 +5,7 @@
  * and mp3lib/dct64_MMX.c
  */
 
-/* NOTE: The following code is suboptimal! It can be improved (at least) by
-
-   1. Replace all movups by movaps. (Can Parameter c be always aligned on 
-      a 16-byte boundary?)
-
-   2. Rewritten using intrinsics. (GCC generally optimizes intrinsics
-      better. However, when __m128 locals are involved, GCC may
-      produce bad code that uses movaps to access a stack not aligned
-      on a 16-byte boundary, which leads to run-time crashes.)
-
-*/
+#include <libavutil/mem.h>
 
 typedef float real;
 
@@ -32,8 +22,8 @@
 
 void dct64_sse(short *out0,short *out1,real *c)
 {
-    static real __attribute__ ((aligned(16))) b1[0x20];
-    static real __attribute__ ((aligned(16))) b2[0x20];
+    static DECLARE_ALIGNED(16, real, b1[0x20]);
+    static DECLARE_ALIGNED(16, real, b2[0x20]);
     static real const one = 1.f;
 
     {
@@ -45,9 +35,9 @@
             asm(
                 "movaps    %2, %%xmm3\n\t"
                 "shufps    $27, %%xmm3, %%xmm3\n\t"
-                "movups    %3, %%xmm1\n\t"
+                "movaps    %3, %%xmm1\n\t"
                 "movaps    %%xmm1, %%xmm4\n\t"
-                "movups    %4, %%xmm2\n\t"
+                "movaps    %4, %%xmm2\n\t"
                 "shufps    $27, %%xmm4, %%xmm4\n\t"
                 "movaps    %%xmm2, %%xmm0\n\t"
                 "shufps    $27, %%xmm0, %%xmm0\n\t"
--- a/mp3lib/layer1.c	Tue Jun 05 23:30:37 2007 +0000
+++ b/mp3lib/layer1.c	Wed Jun 06 05:13:13 2007 +0000
@@ -131,7 +131,7 @@
   int i,stereo = fr->stereo;
   unsigned int balloc[2*SBLIMIT];
   unsigned int scale_index[2][SBLIMIT];
-  real fraction[2][SBLIMIT];
+  DECLARE_ALIGNED(16, real, fraction[2][SBLIMIT]);
 //  int single = fr->single;
 
 //  printf("do_layer1(0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X )\n",
--- a/mp3lib/layer2.c	Tue Jun 05 23:30:37 2007 +0000
+++ b/mp3lib/layer2.c	Wed Jun 06 05:13:13 2007 +0000
@@ -285,7 +285,7 @@
   int clip=0;
   int i,j;
   int stereo = fr->stereo;
-  real fraction[2][4][SBLIMIT]; /* pick_table clears unused subbands */
+  DECLARE_ALIGNED(16, real, fraction[2][4][SBLIMIT]); /* pick_table clears unused subbands */
   unsigned int bit_alloc[64];
   int scale[192];
   int single = fr->single;
--- a/mp3lib/layer3.c	Tue Jun 05 23:30:37 2007 +0000
+++ b/mp3lib/layer3.c	Wed Jun 06 05:13:13 2007 +0000
@@ -1260,8 +1260,8 @@
 
   granules = (fr->lsf) ? 1 : 2;
   for (gr=0;gr<granules;gr++){
-    static real hybridIn[2][SBLIMIT][SSLIMIT];
-    static real hybridOut[2][SSLIMIT][SBLIMIT];
+    static DECLARE_ALIGNED(16, real, hybridIn[2][SBLIMIT][SSLIMIT]);
+    static DECLARE_ALIGNED(16, real, hybridOut[2][SSLIMIT][SBLIMIT]);
 
     { struct gr_info_s *gr_info = &(sideinfo.ch[0].gr[gr]);
       int part2bits;