diff libfaad2/filtbank.c @ 18141:59b6fa5b4201

Update to faad2 cvs 20040915 + MPlayer fixes. Patch by me and Emanuele Giaquinta.
author rtognimp
date Tue, 18 Apr 2006 19:39:34 +0000
parents 2ae5ab4331ca
children e83eef58b30a
line wrap: on
line diff
--- a/libfaad2/filtbank.c	Tue Apr 18 19:33:46 2006 +0000
+++ b/libfaad2/filtbank.c	Tue Apr 18 19:39:34 2006 +0000
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: filtbank.c,v 1.38 2004/06/30 12:45:56 menno Exp $
+** $Id: filtbank.c,v 1.41 2004/09/08 09:43:11 gcp Exp $
 **/
 
 #include "common.h"
@@ -87,15 +87,6 @@
     }
 #endif
 
-#ifdef USE_SSE
-    if (cpu_has_sse())
-    {
-        fb->if_func = ifilter_bank_sse;
-    } else {
-        fb->if_func = ifilter_bank;
-    }
-#endif
-
     return fb;
 }
 
@@ -140,30 +131,6 @@
 #endif
 }
 
-#ifdef USE_SSE
-static INLINE void imdct_long_sse(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
-{
-#ifdef LD_DEC
-    mdct_info *mdct = NULL;
-
-    switch (len)
-    {
-    case 2048:
-    case 1920:
-        mdct = fb->mdct2048;
-        break;
-    case 1024:
-    case 960:
-        mdct = fb->mdct1024;
-        break;
-    }
-
-    faad_imdct_sse(mdct, in_data, out_data);
-#else
-    faad_imdct_sse(fb->mdct2048, in_data, out_data);
-#endif
-}
-#endif
 
 #ifdef LTP_DEC
 static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
@@ -350,8 +317,8 @@
 #if 0
     for (i = 0; i < 1024; i++)
     {
-        //printf("%d\n", time_out[i]);
-        printf("0x%.8X\n", time_out[i]);
+        printf("%d\n", time_out[i]);
+        //printf("0x%.8X\n", time_out[i]);
     }
 #endif
 
@@ -362,334 +329,6 @@
 #endif
 }
 
-#ifdef USE_SSE
-void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
-                      uint8_t window_shape_prev, real_t *freq_in,
-                      real_t *time_out, uint8_t object_type, uint16_t frame_len)
-{
-    int16_t i;
-    ALIGN real_t transf_buf[2*1024] = {0};
-
-    const real_t *window_long = NULL;
-    const real_t *window_long_prev = NULL;
-    const real_t *window_short = NULL;
-    const real_t *window_short_prev = NULL;
-
-    uint16_t nlong = frame_len;
-    uint16_t nshort = frame_len/8;
-    uint16_t trans = nshort/2;
-
-    uint16_t nflat_ls = (nlong-nshort)/2;
-
-#ifdef PROFILE
-    int64_t count = faad_get_ts();
-#endif
-
-#ifdef LD_DEC
-    if (object_type == LD)
-    {
-        window_long       = fb->ld_window[window_shape];
-        window_long_prev  = fb->ld_window[window_shape_prev];
-    } else {
-#endif
-        window_long       = fb->long_window[window_shape];
-        window_long_prev  = fb->long_window[window_shape_prev];
-        window_short      = fb->short_window[window_shape];
-        window_short_prev = fb->short_window[window_shape_prev];
-#ifdef LD_DEC
-    }
-#endif
-
-    switch (window_sequence)
-    {
-    case ONLY_LONG_SEQUENCE:
-        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
-        for (i = 0; i < nlong; i+=4)
-        {
-            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
-
-            m1 = _mm_load_ps(&transf_buf[i]);
-            m2 = _mm_load_ps(&window_long_prev[i]);
-            m6 = _mm_load_ps(&window_long[nlong-4-i]);
-            m3 = _mm_load_ps(&time_out[nlong+i]);
-            m5 = _mm_load_ps(&transf_buf[nlong+i]);
-
-            m4 = _mm_mul_ps(m1, m2);
-            m7 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_add_ps(m4, m3);
-            m8 = _mm_mul_ps(m5, m7);
-
-            _mm_store_ps(&time_out[i], m4);
-            _mm_store_ps(&time_out[nlong+i], m8);
-        }
-        break;
-
-    case LONG_START_SEQUENCE:
-        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
-        for (i = 0; i < nlong; i+=4)
-        {
-            __m128 m1 = _mm_load_ps(&transf_buf[i]);
-            __m128 m2 = _mm_load_ps(&window_long_prev[i]);
-            __m128 m3 = _mm_load_ps(&time_out[nlong+i]);
-
-            __m128 m4 = _mm_mul_ps(m1, m2);
-            m4 = _mm_add_ps(m4, m3);
-
-            _mm_store_ps(&time_out[i], m4);
-        }
-        for (i = 0; i < nflat_ls; i+=4)
-        {
-            __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]);
-            _mm_store_ps(&time_out[nlong+i], m1);
-        }
-        for (i = 0; i < nshort; i+=4)
-        {
-            __m128 m1 = _mm_load_ps(&transf_buf[nlong+nflat_ls+i]);
-            __m128 m2 = _mm_load_ps(&window_short[nshort-4-i]);
-            __m128 m3, m4;
-
-            m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_mul_ps(m1, m3);
-
-            _mm_store_ps(&time_out[nlong+nflat_ls+i], m4);
-        }
-        for (i = 0; i < nflat_ls; i+=4)
-        {
-            __m128 m1 = _mm_setzero_ps();
-            _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1);
-        }
-        break;
-
-    case EIGHT_SHORT_SEQUENCE:
-        faad_imdct_sse(fb->mdct256, &freq_in[0*nshort], &transf_buf[2*nshort*0]);
-        faad_imdct_sse(fb->mdct256, &freq_in[1*nshort], &transf_buf[2*nshort*1]);
-        faad_imdct_sse(fb->mdct256, &freq_in[2*nshort], &transf_buf[2*nshort*2]);
-        faad_imdct_sse(fb->mdct256, &freq_in[3*nshort], &transf_buf[2*nshort*3]);
-        faad_imdct_sse(fb->mdct256, &freq_in[4*nshort], &transf_buf[2*nshort*4]);
-        faad_imdct_sse(fb->mdct256, &freq_in[5*nshort], &transf_buf[2*nshort*5]);
-        faad_imdct_sse(fb->mdct256, &freq_in[6*nshort], &transf_buf[2*nshort*6]);
-        faad_imdct_sse(fb->mdct256, &freq_in[7*nshort], &transf_buf[2*nshort*7]);
-        for (i = 0; i < nflat_ls; i+=4)
-        {
-            __m128 m1 = _mm_load_ps(&time_out[nlong+i]);
-            _mm_store_ps(&time_out[i], m1);
-        }
-        for (i = 0; i < nshort; i+=4)
-        {
-            __m128 m1 = _mm_load_ps(&transf_buf[nshort*0+i]);
-            __m128 m2 = _mm_load_ps(&window_short_prev[i]);
-            __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]);
-
-            __m128 m4 = _mm_mul_ps(m1, m2);
-            m4 = _mm_add_ps(m4, m3);
-
-            _mm_store_ps(&time_out[nflat_ls+i], m4);
-        }
-        for (i = 0; i < nshort; i+=4)
-        {
-            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
-            m1 = _mm_load_ps(&transf_buf[nshort*1+i]);
-            m2 = _mm_load_ps(&window_short[nshort-4-i]);
-            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*1+i]);
-            m6 = _mm_load_ps(&transf_buf[nshort*2+i]);
-            m7 = _mm_load_ps(&window_short[i]);
-
-            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_mul_ps(m1, m5);
-            m8 = _mm_mul_ps(m6, m7);
-            m4 = _mm_add_ps(m4, m3);
-            m4 = _mm_add_ps(m4, m8);
-
-            _mm_store_ps(&time_out[nflat_ls+1*nshort+i], m4);
-        }
-        for (i = 0; i < nshort; i+=4)
-        {
-            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
-            m1 = _mm_load_ps(&transf_buf[nshort*3+i]);
-            m2 = _mm_load_ps(&window_short[nshort-4-i]);
-            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*2+i]);
-            m6 = _mm_load_ps(&transf_buf[nshort*4+i]);
-            m7 = _mm_load_ps(&window_short[i]);
-
-            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_mul_ps(m1, m5);
-            m8 = _mm_mul_ps(m6, m7);
-            m4 = _mm_add_ps(m4, m3);
-            m4 = _mm_add_ps(m4, m8);
-
-            _mm_store_ps(&time_out[nflat_ls+2*nshort+i], m4);
-        }
-        for (i = 0; i < nshort; i+=4)
-        {
-            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
-            m1 = _mm_load_ps(&transf_buf[nshort*5+i]);
-            m2 = _mm_load_ps(&window_short[nshort-4-i]);
-            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*3+i]);
-            m6 = _mm_load_ps(&transf_buf[nshort*6+i]);
-            m7 = _mm_load_ps(&window_short[i]);
-
-            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_mul_ps(m1, m5);
-            m8 = _mm_mul_ps(m6, m7);
-            m4 = _mm_add_ps(m4, m3);
-            m4 = _mm_add_ps(m4, m8);
-
-            _mm_store_ps(&time_out[nflat_ls+3*nshort+i], m4);
-        }
-        for(i = 0; i < trans; i+=4)
-        {
-            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
-            m1 = _mm_load_ps(&transf_buf[nshort*7+i]);
-            m2 = _mm_load_ps(&window_short[nshort-4-i]);
-            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*4+i]);
-            m6 = _mm_load_ps(&transf_buf[nshort*8+i]);
-            m7 = _mm_load_ps(&window_short[i]);
-
-            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_mul_ps(m1, m5);
-            m8 = _mm_mul_ps(m6, m7);
-            m4 = _mm_add_ps(m4, m3);
-            m4 = _mm_add_ps(m4, m8);
-
-            _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m4);
-        }
-        for (i = trans; i < nshort; i+=4)
-        {
-            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
-            m1 = _mm_load_ps(&transf_buf[nshort*7+i]);
-            m2 = _mm_load_ps(&window_short[nshort-4-i]);
-            m6 = _mm_load_ps(&transf_buf[nshort*8+i]);
-            m7 = _mm_load_ps(&window_short[i]);
-
-            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_mul_ps(m1, m5);
-            m8 = _mm_mul_ps(m6, m7);
-            m3 = _mm_add_ps(m4, m8);
-
-            _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m3);
-        }
-        for (i = 0; i < nshort; i+=4)
-        {
-            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
-            m1 = _mm_load_ps(&transf_buf[nshort*9+i]);
-            m2 = _mm_load_ps(&window_short[nshort-4-i]);
-            m6 = _mm_load_ps(&transf_buf[nshort*10+i]);
-            m7 = _mm_load_ps(&window_short[i]);
-
-            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_mul_ps(m1, m5);
-            m8 = _mm_mul_ps(m6, m7);
-            m3 = _mm_add_ps(m4, m8);
-
-            _mm_store_ps(&time_out[nflat_ls+5*nshort+i], m3);
-        }
-        for (i = 0; i < nshort; i+=4)
-        {
-            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
-            m1 = _mm_load_ps(&transf_buf[nshort*11+i]);
-            m2 = _mm_load_ps(&window_short[nshort-4-i]);
-            m6 = _mm_load_ps(&transf_buf[nshort*12+i]);
-            m7 = _mm_load_ps(&window_short[i]);
-
-            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_mul_ps(m1, m5);
-            m8 = _mm_mul_ps(m6, m7);
-            m3 = _mm_add_ps(m4, m8);
-
-            _mm_store_ps(&time_out[nflat_ls+6*nshort+i], m3);
-        }
-        for (i = 0; i < nshort; i+=4)
-        {
-            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
-            m1 = _mm_load_ps(&transf_buf[nshort*13+i]);
-            m2 = _mm_load_ps(&window_short[nshort-4-i]);
-            m6 = _mm_load_ps(&transf_buf[nshort*14+i]);
-            m7 = _mm_load_ps(&window_short[i]);
-
-            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_mul_ps(m1, m5);
-            m8 = _mm_mul_ps(m6, m7);
-            m3 = _mm_add_ps(m4, m8);
-
-            _mm_store_ps(&time_out[nflat_ls+7*nshort+i], m3);
-        }
-        for (i = 0; i < nshort; i+=4)
-        {
-            __m128 m1, m2, m3, m5;
-            m1 = _mm_load_ps(&transf_buf[nshort*15+i]);
-            m2 = _mm_load_ps(&window_short[nshort-4-i]);
-
-            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m3 = _mm_mul_ps(m1, m5);
-
-            _mm_store_ps(&time_out[nflat_ls+8*nshort+i], m3);
-        }
-        for (i = 0; i < nflat_ls; i+=4)
-        {
-            __m128 m1 = _mm_setzero_ps();
-            _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1);
-        }
-        break;
-
-    case LONG_STOP_SEQUENCE:
-        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
-        for (i = 0; i < nflat_ls; i+=4)
-        {
-            __m128 m1 = _mm_load_ps(&time_out[nlong+i]);
-            _mm_store_ps(&time_out[i], m1);
-        }
-        for (i = 0; i < nshort; i+=4)
-        {
-            __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+i]);
-            __m128 m2 = _mm_load_ps(&window_short_prev[i]);
-            __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]);
-
-            __m128 m4 = _mm_mul_ps(m1, m2);
-            m4 = _mm_add_ps(m4, m3);
-
-            _mm_store_ps(&time_out[nflat_ls+i], m4);
-        }
-        for (i = 0; i < nflat_ls; i+=4)
-        {
-            __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+nshort+i]);
-            __m128 m2 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort+i]);
-
-            __m128 m3 = _mm_add_ps(m1, m2);
-
-            _mm_store_ps(&time_out[nflat_ls+nshort+i], m3);
-        }
-        for (i = 0; i < nlong; i+=4)
-        {
-            __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]);
-            __m128 m2 = _mm_load_ps(&window_long[nlong-4-i]);
-            __m128 m3, m4;
-
-            m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
-
-            m4 = _mm_mul_ps(m1, m3);
-
-            _mm_store_ps(&time_out[nlong+i], m4);
-        }
-		break;
-    }
-
-#ifdef PROFILE
-    count = faad_get_ts() - count;
-    fb->cycles += count;
-#endif
-}
-#endif
 
 #ifdef LTP_DEC
 /* only works for LTP -> no overlapping, no short blocks */