diff libfaad2/filtbank.c @ 12527:4a370c80fe5c

update to the 2.0 release of faad, patch by adland
author diego
date Wed, 02 Jun 2004 22:59:04 +0000
parents 3185f64f6350
children d81145997036
line wrap: on
line diff
--- a/libfaad2/filtbank.c	Wed Jun 02 22:52:00 2004 +0000
+++ b/libfaad2/filtbank.c	Wed Jun 02 22:59:04 2004 +0000
@@ -1,6 +1,6 @@
 /*
 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
-** Copyright (C) 2003 M. Bakker, Ahead Software AG, http://www.nero.com
+** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
 **  
 ** This program is free software; you can redistribute it and/or modify
 ** it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: filtbank.c,v 1.1 2003/08/30 22:30:21 arpi Exp $
+** $Id: filtbank.c,v 1.2 2003/10/03 22:22:27 alex Exp $
 **/
 
 #include "common.h"
@@ -51,7 +51,7 @@
     uint16_t frame_len_ld = frame_len/2;
 #endif
 
-    fb_info *fb = (fb_info*)malloc(sizeof(fb_info));
+    fb_info *fb = (fb_info*)faad_malloc(sizeof(fb_info));
     memset(fb, 0, sizeof(fb_info));
 
     /* normal */
@@ -62,8 +62,10 @@
     fb->mdct1024 = faad_mdct_init(2*frame_len_ld);
 #endif
 
+#ifdef ALLOW_SMALL_FRAMELENGTH
     if (frame_len == 1024)
     {
+#endif
         fb->long_window[0]  = sine_long_1024;
         fb->short_window[0] = sine_short_128;
         fb->long_window[1]  = kbd_long_1024;
@@ -72,6 +74,7 @@
         fb->ld_window[0] = sine_mid_512;
         fb->ld_window[1] = ld_mid_512;
 #endif
+#ifdef ALLOW_SMALL_FRAMELENGTH
     } else /* (frame_len == 960) */ {
         fb->long_window[0]  = sine_long_960;
         fb->short_window[0] = sine_short_120;
@@ -82,6 +85,16 @@
         fb->ld_window[1] = ld_mid_480;
 #endif
     }
+#endif
+
+#ifdef USE_SSE
+    if (cpu_has_sse())
+    {
+        fb->if_func = ifilter_bank_sse;
+    } else {
+        fb->if_func = ifilter_bank;
+    }
+#endif
 
     return fb;
 }
@@ -90,19 +103,24 @@
 {
     if (fb != NULL)
     {
+#ifdef PROFILE
+        printf("FB:                 %I64d cycles\n", fb->cycles);
+#endif
+
         faad_mdct_end(fb->mdct256);
         faad_mdct_end(fb->mdct2048);
 #ifdef LD_DEC
         faad_mdct_end(fb->mdct1024);
 #endif
 
-        free(fb);
+        faad_free(fb);
     }
 }
 
-static INLINE void imdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
+static INLINE void imdct_long(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
 {
-    mdct_info *mdct;
+#ifdef LD_DEC
+    mdct_info *mdct = NULL;
 
     switch (len)
     {
@@ -110,25 +128,47 @@
     case 1920:
         mdct = fb->mdct2048;
         break;
-    case 256:
-    case 240:
-        mdct = fb->mdct256;
-        break;
-#ifdef LD_DEC
     case 1024:
     case 960:
         mdct = fb->mdct1024;
         break;
-#endif
     }
 
     faad_imdct(mdct, in_data, out_data);
+#else
+    faad_imdct(fb->mdct2048, in_data, out_data);
+#endif
 }
 
+#ifdef USE_SSE
+static INLINE void imdct_long_sse(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len) /* long-block inverse MDCT wrapper: forwards in_data/out_data to faad_imdct_sse() */
+{
+#ifdef LD_DEC
+    mdct_info *mdct = NULL; /* stays NULL if len matches none of the cases below */
+
+    switch (len) /* select the transform context by transform length */
+    {
+    case 2048:
+    case 1920:
+        mdct = fb->mdct2048;
+        break;
+    case 1024:
+    case 960:
+        mdct = fb->mdct1024;
+        break;
+    }
+
+    faad_imdct_sse(mdct, in_data, out_data); /* NOTE(review): mdct is passed unchecked; confirm callers only use the four lengths above */
+#else /* !LD_DEC */
+    faad_imdct_sse(fb->mdct2048, in_data, out_data); /* len is ignored here: fb->mdct2048 is always used */
+#endif /* LD_DEC */
+}
+#endif /* USE_SSE */
+
 #ifdef LTP_DEC
 static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
 {
-    mdct_info *mdct;
+    mdct_info *mdct = NULL;
 
     switch (len)
     {
@@ -154,15 +194,16 @@
 
 void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                   uint8_t window_shape_prev, real_t *freq_in,
-                  real_t *time_out, uint8_t object_type, uint16_t frame_len)
+                  real_t *time_out, real_t *overlap,
+                  uint8_t object_type, uint16_t frame_len)
 {
     int16_t i;
-    real_t *transf_buf;
+    ALIGN real_t transf_buf[2*1024] = {0};
 
-    real_t *window_long;
-    real_t *window_long_prev;
-    real_t *window_short;
-    real_t *window_short_prev;
+    const real_t *window_long = NULL;
+    const real_t *window_long_prev = NULL;
+    const real_t *window_short = NULL;
+    const real_t *window_short_prev = NULL;
 
     uint16_t nlong = frame_len;
     uint16_t nshort = frame_len/8;
@@ -170,7 +211,137 @@
 
     uint16_t nflat_ls = (nlong-nshort)/2;
 
-    transf_buf = (real_t*)malloc(2*nlong*sizeof(real_t));
+#ifdef PROFILE
+    int64_t count = faad_get_ts();
+#endif
+
+#ifdef LD_DEC
+    if (object_type == LD)
+    {
+        window_long       = fb->ld_window[window_shape];
+        window_long_prev  = fb->ld_window[window_shape_prev];
+    } else {
+#endif
+        window_long       = fb->long_window[window_shape];
+        window_long_prev  = fb->long_window[window_shape_prev];
+        window_short      = fb->short_window[window_shape];
+        window_short_prev = fb->short_window[window_shape_prev];
+#ifdef LD_DEC
+    }
+#endif
+
+
+    switch (window_sequence)
+    {
+    case ONLY_LONG_SEQUENCE:
+        imdct_long(fb, freq_in, transf_buf, 2*nlong);
+        for (i = 0; i < nlong; i+=4)
+        {
+            time_out[i]   = overlap[i]   + MUL_F(transf_buf[i],window_long_prev[i]);
+            time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]);
+            time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]);
+            time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]);
+        }
+        for (i = 0; i < nlong; i+=4)
+        {
+            overlap[i]   = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
+            overlap[i+1] = MUL_F(transf_buf[nlong+i+1],window_long[nlong-2-i]);
+            overlap[i+2] = MUL_F(transf_buf[nlong+i+2],window_long[nlong-3-i]);
+            overlap[i+3] = MUL_F(transf_buf[nlong+i+3],window_long[nlong-4-i]);
+        }
+        break;
+
+    case LONG_START_SEQUENCE:
+        imdct_long(fb, freq_in, transf_buf, 2*nlong);
+        for (i = 0; i < nlong; i+=4)
+        {
+            time_out[i]   = overlap[i]   + MUL_F(transf_buf[i],window_long_prev[i]);
+            time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]);
+            time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]);
+            time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]);
+        }
+        for (i = 0; i < nflat_ls; i++)
+            overlap[i] = transf_buf[nlong+i];
+        for (i = 0; i < nshort; i++)
+            overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]);
+        for (i = 0; i < nflat_ls; i++)
+            overlap[nflat_ls+nshort+i] = 0;
+        break;
+
+    case EIGHT_SHORT_SEQUENCE:
+        faad_imdct(fb->mdct256, freq_in+0*nshort, transf_buf+2*nshort*0);
+        faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1);
+        faad_imdct(fb->mdct256, freq_in+2*nshort, transf_buf+2*nshort*2);
+        faad_imdct(fb->mdct256, freq_in+3*nshort, transf_buf+2*nshort*3);
+        faad_imdct(fb->mdct256, freq_in+4*nshort, transf_buf+2*nshort*4);
+        faad_imdct(fb->mdct256, freq_in+5*nshort, transf_buf+2*nshort*5);
+        faad_imdct(fb->mdct256, freq_in+6*nshort, transf_buf+2*nshort*6);
+        faad_imdct(fb->mdct256, freq_in+7*nshort, transf_buf+2*nshort*7);
+        for (i = 0; i < nflat_ls; i++)
+            time_out[i] = overlap[i];
+        for(i = 0; i < nshort; i++)
+        {
+            time_out[nflat_ls+         i] = overlap[nflat_ls+         i] + MUL_F(transf_buf[nshort*0+i],window_short_prev[i]);
+            time_out[nflat_ls+1*nshort+i] = overlap[nflat_ls+nshort*1+i] + MUL_F(transf_buf[nshort*1+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*2+i],window_short[i]);
+            time_out[nflat_ls+2*nshort+i] = overlap[nflat_ls+nshort*2+i] + MUL_F(transf_buf[nshort*3+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*4+i],window_short[i]);
+            time_out[nflat_ls+3*nshort+i] = overlap[nflat_ls+nshort*3+i] + MUL_F(transf_buf[nshort*5+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*6+i],window_short[i]);
+            if (i < trans)
+                time_out[nflat_ls+4*nshort+i] = overlap[nflat_ls+nshort*4+i] + MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]);
+        }
+        for(i = 0; i < nshort; i++)
+        {
+            if (i >= trans)
+                overlap[nflat_ls+4*nshort+i-nlong] = MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]);
+            overlap[nflat_ls+5*nshort+i-nlong] = MUL_F(transf_buf[nshort*9+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*10+i],window_short[i]);
+            overlap[nflat_ls+6*nshort+i-nlong] = MUL_F(transf_buf[nshort*11+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*12+i],window_short[i]);
+            overlap[nflat_ls+7*nshort+i-nlong] = MUL_F(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*14+i],window_short[i]);
+            overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[nshort*15+i],window_short[nshort-1-i]);
+        }
+        for (i = 0; i < nflat_ls; i++)
+            overlap[nflat_ls+nshort+i] = 0;
+        break;
+
+    case LONG_STOP_SEQUENCE:
+        imdct_long(fb, freq_in, transf_buf, 2*nlong);
+        for (i = 0; i < nflat_ls; i++)
+            time_out[i] = overlap[i];
+        for (i = 0; i < nshort; i++)
+            time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]);
+        for (i = 0; i < nflat_ls; i++)
+            time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i];
+        for (i = 0; i < nlong; i++)
+            overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
+		break;
+    }
+
+#ifdef PROFILE
+    count = faad_get_ts() - count;
+    fb->cycles += count;
+#endif
+}
+
+#ifdef USE_SSE
+void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
+                      uint8_t window_shape_prev, real_t *freq_in,
+                      real_t *time_out, uint8_t object_type, uint16_t frame_len)
+{
+    int16_t i;
+    ALIGN real_t transf_buf[2*1024] = {0};
+
+    const real_t *window_long = NULL;
+    const real_t *window_long_prev = NULL;
+    const real_t *window_short = NULL;
+    const real_t *window_short_prev = NULL;
+
+    uint16_t nlong = frame_len;
+    uint16_t nshort = frame_len/8;
+    uint16_t trans = nshort/2;
+
+    uint16_t nflat_ls = (nlong-nshort)/2;
+
+#ifdef PROFILE
+    int64_t count = faad_get_ts();
+#endif
 
 #ifdef LD_DEC
     if (object_type == LD)
@@ -190,85 +361,293 @@
     switch (window_sequence)
     {
     case ONLY_LONG_SEQUENCE:
-        imdct(fb, freq_in, transf_buf, 2*nlong);
+        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
         for (i = 0; i < nlong; i+=4)
         {
-            time_out[i] = time_out[nlong+i] + MUL_R_C(transf_buf[i],window_long_prev[i]);
-            time_out[i+1] = time_out[nlong+i+1] + MUL_R_C(transf_buf[i+1],window_long_prev[i+1]);
-            time_out[i+2] = time_out[nlong+i+2] + MUL_R_C(transf_buf[i+2],window_long_prev[i+2]);
-            time_out[i+3] = time_out[nlong+i+3] + MUL_R_C(transf_buf[i+3],window_long_prev[i+3]);
-        }
-        for (i = 0; i < nlong; i+=4)
-        {
-            time_out[nlong+i] = MUL_R_C(transf_buf[nlong+i],window_long[nlong-1-i]);
-            time_out[nlong+i+1] = MUL_R_C(transf_buf[nlong+i+1],window_long[nlong-2-i]);
-            time_out[nlong+i+2] = MUL_R_C(transf_buf[nlong+i+2],window_long[nlong-3-i]);
-            time_out[nlong+i+3] = MUL_R_C(transf_buf[nlong+i+3],window_long[nlong-4-i]);
+            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+
+            m1 = _mm_load_ps(&transf_buf[i]);
+            m2 = _mm_load_ps(&window_long_prev[i]);
+            m6 = _mm_load_ps(&window_long[nlong-4-i]);
+            m3 = _mm_load_ps(&time_out[nlong+i]);
+            m5 = _mm_load_ps(&transf_buf[nlong+i]);
+
+            m4 = _mm_mul_ps(m1, m2);
+            m7 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_add_ps(m4, m3);
+            m8 = _mm_mul_ps(m5, m7);
+
+            _mm_store_ps(&time_out[i], m4);
+            _mm_store_ps(&time_out[nlong+i], m8);
         }
         break;
 
     case LONG_START_SEQUENCE:
-        imdct(fb, freq_in, transf_buf, 2*nlong);
+        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
         for (i = 0; i < nlong; i+=4)
         {
-            time_out[i] = time_out[nlong+i] + MUL_R_C(transf_buf[i],window_long_prev[i]);
-            time_out[i+1] = time_out[nlong+i+1] + MUL_R_C(transf_buf[i+1],window_long_prev[i+1]);
-            time_out[i+2] = time_out[nlong+i+2] + MUL_R_C(transf_buf[i+2],window_long_prev[i+2]);
-            time_out[i+3] = time_out[nlong+i+3] + MUL_R_C(transf_buf[i+3],window_long_prev[i+3]);
+            __m128 m1 = _mm_load_ps(&transf_buf[i]);
+            __m128 m2 = _mm_load_ps(&window_long_prev[i]);
+            __m128 m3 = _mm_load_ps(&time_out[nlong+i]);
+
+            __m128 m4 = _mm_mul_ps(m1, m2);
+            m4 = _mm_add_ps(m4, m3);
+
+            _mm_store_ps(&time_out[i], m4);
+        }
+        for (i = 0; i < nflat_ls; i+=4)
+        {
+            __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]);
+            _mm_store_ps(&time_out[nlong+i], m1);
         }
-        for (i = 0; i < nflat_ls; i++)
-            time_out[nlong+i] = transf_buf[nlong+i];
-        for (i = 0; i < nshort; i++)
-            time_out[nlong+nflat_ls+i] = MUL_R_C(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]);
-        for (i = 0; i < nflat_ls; i++)
-            time_out[nlong+nflat_ls+nshort+i] = 0;
+        for (i = 0; i < nshort; i+=4)
+        {
+            __m128 m1 = _mm_load_ps(&transf_buf[nlong+nflat_ls+i]);
+            __m128 m2 = _mm_load_ps(&window_short[nshort-4-i]);
+            __m128 m3, m4;
+
+            m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_mul_ps(m1, m3);
+
+            _mm_store_ps(&time_out[nlong+nflat_ls+i], m4);
+        }
+        for (i = 0; i < nflat_ls; i+=4)
+        {
+            __m128 m1 = _mm_setzero_ps();
+            _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1);
+        }
         break;
 
     case EIGHT_SHORT_SEQUENCE:
-        imdct(fb, freq_in+0*nshort, transf_buf+2*nshort*0, 2*nshort);
-        imdct(fb, freq_in+1*nshort, transf_buf+2*nshort*1, 2*nshort);
-        imdct(fb, freq_in+2*nshort, transf_buf+2*nshort*2, 2*nshort);
-        imdct(fb, freq_in+3*nshort, transf_buf+2*nshort*3, 2*nshort);
-        imdct(fb, freq_in+4*nshort, transf_buf+2*nshort*4, 2*nshort);
-        imdct(fb, freq_in+5*nshort, transf_buf+2*nshort*5, 2*nshort);
-        imdct(fb, freq_in+6*nshort, transf_buf+2*nshort*6, 2*nshort);
-        imdct(fb, freq_in+7*nshort, transf_buf+2*nshort*7, 2*nshort);
-        for (i = 0; i < nflat_ls; i++)
-            time_out[i] = time_out[nlong+i];
-        for(i = nshort-1; i >= 0; i--)
+        faad_imdct_sse(fb->mdct256, &freq_in[0*nshort], &transf_buf[2*nshort*0]);
+        faad_imdct_sse(fb->mdct256, &freq_in[1*nshort], &transf_buf[2*nshort*1]);
+        faad_imdct_sse(fb->mdct256, &freq_in[2*nshort], &transf_buf[2*nshort*2]);
+        faad_imdct_sse(fb->mdct256, &freq_in[3*nshort], &transf_buf[2*nshort*3]);
+        faad_imdct_sse(fb->mdct256, &freq_in[4*nshort], &transf_buf[2*nshort*4]);
+        faad_imdct_sse(fb->mdct256, &freq_in[5*nshort], &transf_buf[2*nshort*5]);
+        faad_imdct_sse(fb->mdct256, &freq_in[6*nshort], &transf_buf[2*nshort*6]);
+        faad_imdct_sse(fb->mdct256, &freq_in[7*nshort], &transf_buf[2*nshort*7]);
+        for (i = 0; i < nflat_ls; i+=4)
+        {
+            __m128 m1 = _mm_load_ps(&time_out[nlong+i]);
+            _mm_store_ps(&time_out[i], m1);
+        }
+        for (i = 0; i < nshort; i+=4)
+        {
+            __m128 m1 = _mm_load_ps(&transf_buf[nshort*0+i]);
+            __m128 m2 = _mm_load_ps(&window_short_prev[i]);
+            __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]);
+
+            __m128 m4 = _mm_mul_ps(m1, m2);
+            m4 = _mm_add_ps(m4, m3);
+
+            _mm_store_ps(&time_out[nflat_ls+i], m4);
+        }
+        for (i = 0; i < nshort; i+=4)
+        {
+            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+            m1 = _mm_load_ps(&transf_buf[nshort*1+i]);
+            m2 = _mm_load_ps(&window_short[nshort-4-i]);
+            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*1+i]);
+            m6 = _mm_load_ps(&transf_buf[nshort*2+i]);
+            m7 = _mm_load_ps(&window_short[i]);
+
+            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_mul_ps(m1, m5);
+            m8 = _mm_mul_ps(m6, m7);
+            m4 = _mm_add_ps(m4, m3);
+            m4 = _mm_add_ps(m4, m8);
+
+            _mm_store_ps(&time_out[nflat_ls+1*nshort+i], m4);
+        }
+        for (i = 0; i < nshort; i+=4)
+        {
+            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+            m1 = _mm_load_ps(&transf_buf[nshort*3+i]);
+            m2 = _mm_load_ps(&window_short[nshort-4-i]);
+            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*2+i]);
+            m6 = _mm_load_ps(&transf_buf[nshort*4+i]);
+            m7 = _mm_load_ps(&window_short[i]);
+
+            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_mul_ps(m1, m5);
+            m8 = _mm_mul_ps(m6, m7);
+            m4 = _mm_add_ps(m4, m3);
+            m4 = _mm_add_ps(m4, m8);
+
+            _mm_store_ps(&time_out[nflat_ls+2*nshort+i], m4);
+        }
+        for (i = 0; i < nshort; i+=4)
+        {
+            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+            m1 = _mm_load_ps(&transf_buf[nshort*5+i]);
+            m2 = _mm_load_ps(&window_short[nshort-4-i]);
+            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*3+i]);
+            m6 = _mm_load_ps(&transf_buf[nshort*6+i]);
+            m7 = _mm_load_ps(&window_short[i]);
+
+            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_mul_ps(m1, m5);
+            m8 = _mm_mul_ps(m6, m7);
+            m4 = _mm_add_ps(m4, m3);
+            m4 = _mm_add_ps(m4, m8);
+
+            _mm_store_ps(&time_out[nflat_ls+3*nshort+i], m4);
+        }
+        for(i = 0; i < trans; i+=4)
         {
-            time_out[nflat_ls+         i] = time_out[nlong+nflat_ls+         i] + MUL_R_C(transf_buf[nshort*0+i],window_short_prev[i]);
-            time_out[nflat_ls+1*nshort+i] = time_out[nlong+nflat_ls+nshort*1+i] + MUL_R_C(transf_buf[nshort*1+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*2+i],window_short[i]);
-            time_out[nflat_ls+2*nshort+i] = time_out[nlong+nflat_ls+nshort*2+i] + MUL_R_C(transf_buf[nshort*3+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*4+i],window_short[i]);
-            time_out[nflat_ls+3*nshort+i] = time_out[nlong+nflat_ls+nshort*3+i] + MUL_R_C(transf_buf[nshort*5+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*6+i],window_short[i]);
-            if (i < trans)
-                time_out[nflat_ls+4*nshort+i] = time_out[nlong+nflat_ls+nshort*4+i] + MUL_R_C(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*8+i],window_short[i]);
-            else
-                time_out[nflat_ls+4*nshort+i] = MUL_R_C(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*8+i],window_short[i]);
-            time_out[nflat_ls+5*nshort+i] = MUL_R_C(transf_buf[nshort*9+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*10+i],window_short[i]);
-            time_out[nflat_ls+6*nshort+i] = MUL_R_C(transf_buf[nshort*11+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*12+i],window_short[i]);
-            time_out[nflat_ls+7*nshort+i] = MUL_R_C(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*14+i],window_short[i]);
-            time_out[nflat_ls+8*nshort+i] = MUL_R_C(transf_buf[nshort*15+i],window_short[nshort-1-i]);
+            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+            m1 = _mm_load_ps(&transf_buf[nshort*7+i]);
+            m2 = _mm_load_ps(&window_short[nshort-4-i]);
+            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*4+i]);
+            m6 = _mm_load_ps(&transf_buf[nshort*8+i]);
+            m7 = _mm_load_ps(&window_short[i]);
+
+            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_mul_ps(m1, m5);
+            m8 = _mm_mul_ps(m6, m7);
+            m4 = _mm_add_ps(m4, m3);
+            m4 = _mm_add_ps(m4, m8);
+
+            _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m4);
+        }
+        for (i = trans; i < nshort; i+=4)
+        {
+            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+            m1 = _mm_load_ps(&transf_buf[nshort*7+i]);
+            m2 = _mm_load_ps(&window_short[nshort-4-i]);
+            m6 = _mm_load_ps(&transf_buf[nshort*8+i]);
+            m7 = _mm_load_ps(&window_short[i]);
+
+            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_mul_ps(m1, m5);
+            m8 = _mm_mul_ps(m6, m7);
+            m3 = _mm_add_ps(m4, m8);
+
+            _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m3);
+        }
+        for (i = 0; i < nshort; i+=4)
+        {
+            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+            m1 = _mm_load_ps(&transf_buf[nshort*9+i]);
+            m2 = _mm_load_ps(&window_short[nshort-4-i]);
+            m6 = _mm_load_ps(&transf_buf[nshort*10+i]);
+            m7 = _mm_load_ps(&window_short[i]);
+
+            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_mul_ps(m1, m5);
+            m8 = _mm_mul_ps(m6, m7);
+            m3 = _mm_add_ps(m4, m8);
+
+            _mm_store_ps(&time_out[nflat_ls+5*nshort+i], m3);
         }
-        for (i = 0; i < nflat_ls; i++)
-            time_out[nlong+nflat_ls+nshort+i] = 0;
+        for (i = 0; i < nshort; i+=4)
+        {
+            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+            m1 = _mm_load_ps(&transf_buf[nshort*11+i]);
+            m2 = _mm_load_ps(&window_short[nshort-4-i]);
+            m6 = _mm_load_ps(&transf_buf[nshort*12+i]);
+            m7 = _mm_load_ps(&window_short[i]);
+
+            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_mul_ps(m1, m5);
+            m8 = _mm_mul_ps(m6, m7);
+            m3 = _mm_add_ps(m4, m8);
+
+            _mm_store_ps(&time_out[nflat_ls+6*nshort+i], m3);
+        }
+        for (i = 0; i < nshort; i+=4)
+        {
+            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+            m1 = _mm_load_ps(&transf_buf[nshort*13+i]);
+            m2 = _mm_load_ps(&window_short[nshort-4-i]);
+            m6 = _mm_load_ps(&transf_buf[nshort*14+i]);
+            m7 = _mm_load_ps(&window_short[i]);
+
+            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_mul_ps(m1, m5);
+            m8 = _mm_mul_ps(m6, m7);
+            m3 = _mm_add_ps(m4, m8);
+
+            _mm_store_ps(&time_out[nflat_ls+7*nshort+i], m3);
+        }
+        for (i = 0; i < nshort; i+=4)
+        {
+            __m128 m1, m2, m3, m5;
+            m1 = _mm_load_ps(&transf_buf[nshort*15+i]);
+            m2 = _mm_load_ps(&window_short[nshort-4-i]);
+
+            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m3 = _mm_mul_ps(m1, m5);
+
+            _mm_store_ps(&time_out[nflat_ls+8*nshort+i], m3);
+        }
+        for (i = 0; i < nflat_ls; i+=4)
+        {
+            __m128 m1 = _mm_setzero_ps();
+            _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1);
+        }
         break;
 
     case LONG_STOP_SEQUENCE:
-        imdct(fb, freq_in, transf_buf, 2*nlong);
-        for (i = 0; i < nflat_ls; i++)
-            time_out[i] = time_out[nlong+i];
-        for (i = 0; i < nshort; i++)
-            time_out[nflat_ls+i] = time_out[nlong+nflat_ls+i] + MUL_R_C(transf_buf[nflat_ls+i],window_short_prev[i]);
-        for (i = 0; i < nflat_ls; i++)
-            time_out[nflat_ls+nshort+i] = time_out[nlong+nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i];
-        for (i = 0; i < nlong; i++)
-            time_out[nlong+i] = MUL_R_C(transf_buf[nlong+i],window_long[nlong-1-i]);
+        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
+        for (i = 0; i < nflat_ls; i+=4)
+        {
+            __m128 m1 = _mm_load_ps(&time_out[nlong+i]);
+            _mm_store_ps(&time_out[i], m1);
+        }
+        for (i = 0; i < nshort; i+=4)
+        {
+            __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+i]);
+            __m128 m2 = _mm_load_ps(&window_short_prev[i]);
+            __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]);
+
+            __m128 m4 = _mm_mul_ps(m1, m2);
+            m4 = _mm_add_ps(m4, m3);
+
+            _mm_store_ps(&time_out[nflat_ls+i], m4);
+        }
+        for (i = 0; i < nflat_ls; i+=4)
+        {
+            __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+nshort+i]);
+            __m128 m2 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort+i]);
+
+            __m128 m3 = _mm_add_ps(m1, m2);
+
+            _mm_store_ps(&time_out[nflat_ls+nshort+i], m3);
+        }
+        for (i = 0; i < nlong; i+=4)
+        {
+            __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]);
+            __m128 m2 = _mm_load_ps(&window_long[nlong-4-i]);
+            __m128 m3, m4;
+
+            m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+            m4 = _mm_mul_ps(m1, m3);
+
+            _mm_store_ps(&time_out[nlong+i], m4);
+        }
 		break;
     }
 
-    free(transf_buf);
+#ifdef PROFILE
+    count = faad_get_ts() - count;
+    fb->cycles += count;
+#endif
 }
+#endif
 
 #ifdef LTP_DEC
 /* only works for LTP -> no overlapping, no short blocks */
@@ -277,12 +656,12 @@
                      uint8_t object_type, uint16_t frame_len)
 {
     int16_t i;
-    real_t *windowed_buf;
+    ALIGN real_t windowed_buf[2*1024] = {0};
 
-    real_t *window_long;
-    real_t *window_long_prev;
-    real_t *window_short;
-    real_t *window_short_prev;
+    const real_t *window_long = NULL;
+    const real_t *window_long_prev = NULL;
+    const real_t *window_short = NULL;
+    const real_t *window_short_prev = NULL;
 
     uint16_t nlong = frame_len;
     uint16_t nshort = frame_len/8;
@@ -290,8 +669,6 @@
 
     assert(window_sequence != EIGHT_SHORT_SEQUENCE);
 
-    windowed_buf = (real_t*)malloc(nlong*2*sizeof(real_t));
-
 #ifdef LD_DEC
     if (object_type == LD)
     {
@@ -312,19 +689,19 @@
     case ONLY_LONG_SEQUENCE:
         for (i = nlong-1; i >= 0; i--)
         {
-            windowed_buf[i] = MUL_R_C(in_data[i], window_long_prev[i]);
-            windowed_buf[i+nlong] = MUL_R_C(in_data[i+nlong], window_long[nlong-1-i]);
+            windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
+            windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
         }
         mdct(fb, windowed_buf, out_mdct, 2*nlong);
         break;
 
     case LONG_START_SEQUENCE:
         for (i = 0; i < nlong; i++)
-            windowed_buf[i] = MUL_R_C(in_data[i], window_long_prev[i]);
+            windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
         for (i = 0; i < nflat_ls; i++)
             windowed_buf[i+nlong] = in_data[i+nlong];
         for (i = 0; i < nshort; i++)
-            windowed_buf[i+nlong+nflat_ls] = MUL_R_C(in_data[i+nlong+nflat_ls], window_short[nshort-1-i]);
+            windowed_buf[i+nlong+nflat_ls] = MUL_F(in_data[i+nlong+nflat_ls], window_short[nshort-1-i]);
         for (i = 0; i < nflat_ls; i++)
             windowed_buf[i+nlong+nflat_ls+nshort] = 0;
         mdct(fb, windowed_buf, out_mdct, 2*nlong);
@@ -334,15 +711,13 @@
         for (i = 0; i < nflat_ls; i++)
             windowed_buf[i] = 0;
         for (i = 0; i < nshort; i++)
-            windowed_buf[i+nflat_ls] = MUL_R_C(in_data[i+nflat_ls], window_short_prev[i]);
+            windowed_buf[i+nflat_ls] = MUL_F(in_data[i+nflat_ls], window_short_prev[i]);
         for (i = 0; i < nflat_ls; i++)
             windowed_buf[i+nflat_ls+nshort] = in_data[i+nflat_ls+nshort];
         for (i = 0; i < nlong; i++)
-            windowed_buf[i+nlong] = MUL_R_C(in_data[i+nlong], window_long[nlong-1-i]);
+            windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
         mdct(fb, windowed_buf, out_mdct, 2*nlong);
         break;
     }
-
-    free(windowed_buf);
 }
 #endif