# HG changeset patch
# User lorenm
# Date 1218670428 0
# Node ID 8390efaa0c03d88d75f0ffef1ebe9ea26adb82f8
# Parent  ef456ee01ea248cdaec3871a4a4c9ac9d529ddc2
Add an SSE (SIMD) implementation of the AC-3 downmix
AC-3 decoding is 13% faster when downmixing

diff -r ef456ee01ea2 -r 8390efaa0c03 ac3dec.c
--- a/ac3dec.c	Wed Aug 13 23:30:53 2008 +0000
+++ b/ac3dec.c	Wed Aug 13 23:33:48 2008 +0000
@@ -632,27 +632,25 @@
 /**
  * Downmix the output to mono or stereo.
  */
-static av_noinline void ac3_downmix(AC3DecodeContext *s,
-                                    float samples[AC3_MAX_CHANNELS][256])
+void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
 {
     int i, j;
     float v0, v1;
-
-    if(s->output_mode == AC3_CHMODE_STEREO) {
-        for(i=0; i<256; i++) {
+    if(out_ch == 2) {
+        for(i=0; i<len; i++) {
             v0 = v1 = 0.0f;
-            for(j=0; j<s->fbw_channels; j++) {
-                v0 += samples[j][i] * s->downmix_coeffs[j][0];
-                v1 += samples[j][i] * s->downmix_coeffs[j][1];
+            for(j=0; j<in_ch; j++) {
+                v0 += samples[j][i] * matrix[j][0];
+                v1 += samples[j][i] * matrix[j][1];
             }
             samples[0][i] = v0;
             samples[1][i] = v1;
         }
-    } else if(s->output_mode == AC3_CHMODE_MONO) {
-        for(i=0; i<256; i++) {
+    } else if(out_ch == 1) {
+        for(i=0; i<len; i++) {
             v0 = 0.0f;
-            for(j=0; j<s->fbw_channels; j++)
-                v0 += samples[j][i] * s->downmix_coeffs[j][0];
+            for(j=0; j<in_ch; j++)
+                v0 += samples[j][i] * matrix[j][0];
             samples[0][i] = v0;
         }
     }
@@ -1018,17 +1016,16 @@
         do_imdct(s, s->channels);
 
         if(downmix_output) {
-            ac3_downmix(s, s->output);
+            s->dsp.ac3_downmix(s->output, s->downmix_coeffs, s->out_channels, s->fbw_channels, 256);
         }
     } else {
         if(downmix_output) {
-            ac3_downmix(s, s->transform_coeffs+1);
+            s->dsp.ac3_downmix(s->transform_coeffs+1, s->downmix_coeffs, s->out_channels, s->fbw_channels, 256);
         }
 
         if(downmix_output && !s->downmixed) {
             s->downmixed = 1;
-            // FIXME delay[] is half the size of the other downmixes
-            ac3_downmix(s, s->delay);
+            s->dsp.ac3_downmix(s->delay, s->downmix_coeffs, s->out_channels, s->fbw_channels, 128);
         }
 
         do_imdct(s, s->out_channels);
diff -r ef456ee01ea2 -r 8390efaa0c03 dsputil.c
--- a/dsputil.c	Wed Aug 13 23:30:53 2008 +0000
+++ b/dsputil.c	Wed Aug 13 23:33:48 2008 +0000
@@ -41,6 +41,9 @@
 /* vorbis.c */
 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
 
+/* ac3dec.c */
+void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
+
 /* flacenc.c */
 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
 
@@ -4476,6 +4479,9 @@
 #ifdef CONFIG_VORBIS_DECODER
     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
 #endif
+#ifdef CONFIG_AC3_DECODER
+    c->ac3_downmix = ff_ac3_downmix_c;
+#endif
 #ifdef CONFIG_FLAC_ENCODER
     c->flac_compute_autocorr = ff_flac_compute_autocorr;
 #endif
diff -r ef456ee01ea2 -r 8390efaa0c03 dsputil.h
--- a/dsputil.h	Wed Aug 13 23:30:53 2008 +0000
+++ b/dsputil.h	Wed Aug 13 23:33:48 2008 +0000
@@ -360,6 +360,7 @@
 
     /* assume len is a multiple of 4, and arrays are 16-byte aligned */
     void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
+    void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
     /* no alignment needed */
     void (*flac_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
     /* assume len is a multiple of 8, and arrays are 16-byte aligned */
diff -r ef456ee01ea2 -r 8390efaa0c03 i386/dsputil_mmx.c
--- a/i386/dsputil_mmx.c	Wed Aug 13 23:30:53 2008 +0000
+++ b/i386/dsputil_mmx.c	Wed Aug 13 23:33:48 2008 +0000
@@ -1842,6 +1842,105 @@
     }
 }
 
+#define IF1(x) x /* expands to its argument: pass as a MIX5/MIX_MISC parameter to keep the asm lines wrapped in that parameter */
+#define IF0(x) /* expands to nothing: pass as a MIX5/MIX_MISC parameter to drop the asm lines wrapped in that parameter */
+/* MIX5(mono,stereo): SSE loop that mixes 5 rows of samples[] in place into row 0 (mono) or rows 0 and 1 (stereo); pass IF1/IF0 to pick the variant. Uses i, samples, len and matrix from the expanding scope. */
+#define MIX5(mono,stereo)\
+    asm volatile(\
+        "movss          0(%2), %%xmm5 \n" /* xmm5 = matrix[0][0] */\
+        "movss          8(%2), %%xmm6 \n" /* xmm6 = matrix[1][0] */\
+        "movss         24(%2), %%xmm7 \n" /* xmm7 = matrix[3][0] */\
+        "shufps    $0, %%xmm5, %%xmm5 \n" /* replicate each coefficient into all 4 lanes */\
+        "shufps    $0, %%xmm6, %%xmm6 \n"\
+        "shufps    $0, %%xmm7, %%xmm7 \n"\
+        "1: \n"\
+        "movaps       (%0,%1), %%xmm0 \n" /* 4 floats of row 0; successive rows are 0x400 bytes (256 floats) apart */\
+        "movaps  0x400(%0,%1), %%xmm1 \n" /* row 1 */\
+        "movaps  0x800(%0,%1), %%xmm2 \n" /* row 2 */\
+        "movaps  0xc00(%0,%1), %%xmm3 \n" /* row 3 */\
+        "movaps 0x1000(%0,%1), %%xmm4 \n" /* row 4 */\
+        "mulps         %%xmm5, %%xmm0 \n" /* row 0 scaled by matrix[0][0] */\
+        "mulps         %%xmm6, %%xmm1 \n" /* row 1 scaled by matrix[1][0] */\
+        "mulps         %%xmm5, %%xmm2 \n" /* row 2 reuses the row 0 coefficient */\
+        "mulps         %%xmm7, %%xmm3 \n" /* row 3 scaled by matrix[3][0] */\
+        "mulps         %%xmm7, %%xmm4 \n" /* row 4 reuses the row 3 coefficient */\
+ stereo("addps         %%xmm1, %%xmm0 \n") /* stereo only: row 1 also goes into output 0 */\
+        "addps         %%xmm1, %%xmm2 \n" /* xmm2 = row 2 term + row 1 term */\
+        "addps         %%xmm3, %%xmm0 \n" /* xmm0 += row 3 term */\
+        "addps         %%xmm4, %%xmm2 \n" /* xmm2 += row 4 term */\
+   mono("addps         %%xmm2, %%xmm0 \n") /* mono only: fold both partial sums into xmm0 */\
+        "movaps  %%xmm0,      (%0,%1) \n" /* output 0 overwrites row 0 */\
+ stereo("movaps  %%xmm2, 0x400(%0,%1) \n") /* stereo only: output 1 overwrites row 1 */\
+        "add $16, %0 \n" /* advance by 4 floats */\
+        "jl 1b \n" /* loop while the offset is still negative */\
+        :"+&r"(i) /* %0: byte offset relative to the end of row 0; the caller starts it at -len*sizeof(float) */\
+        :"r"(samples[0]+len), "r"(matrix) /* %1: end of row 0, %2: coefficient matrix */\
+        :"memory" /* NOTE(review): xmm0-xmm7 are written but not declared as clobbers */\
+    );
+/* MIX_MISC(stereo): generic SSE downmix loop for any in_ch; pass IF1 for two outputs, IF0 for one. Uses i, j, k, samples, len, matrix_simd and in_ch from the expanding scope, and expects xmm6/xmm7 to already hold the row 0 coefficients. */
+#define MIX_MISC(stereo)\
+    asm volatile(\
+        "1: \n"\
+        "movaps  (%3,%0), %%xmm0 \n" /* 4 floats of row 0 */\
+ stereo("movaps   %%xmm0, %%xmm1 \n") /* second copy for output 1 */\
+        "mulps    %%xmm6, %%xmm0 \n" /* NOTE(review): xmm6/xmm7 are not loaded in this asm statement; they come from the preceding matrix_simd fill loop */\
+ stereo("mulps    %%xmm7, %%xmm1 \n")\
+        "lea 1024(%3,%0), %1 \n" /* %1 = same sample position in row 1 (rows are 1024 bytes apart) */\
+        "mov %5, %2 \n" /* %2 = negative byte offset of matrix_simd row 1 from the end of matrix_simd */\
+        "2: \n" /* NOTE(review): this body always runs at least once, so in_ch >= 2 appears to be assumed */\
+        "movaps   (%1),   %%xmm2 \n" /* 4 floats of the current input row */\
+ stereo("movaps   %%xmm2, %%xmm3 \n")\
+        "mulps   (%4,%2), %%xmm2 \n" /* scale by the replicated output 0 coefficient of this row */\
+ stereo("mulps 16(%4,%2), %%xmm3 \n") /* scale by the replicated output 1 coefficient of this row */\
+        "addps    %%xmm2, %%xmm0 \n" /* accumulate output 0 */\
+ stereo("addps    %%xmm3, %%xmm1 \n") /* accumulate output 1 */\
+        "add $1024, %1 \n" /* next input row */\
+        "add $32, %2 \n" /* next matrix_simd row (2 outputs x 4 floats) */\
+        "jl 2b \n" /* loop while the coefficient offset is still negative */\
+        "movaps   %%xmm0,     (%3,%0) \n" /* output 0 overwrites row 0 */\
+ stereo("movaps   %%xmm1, 1024(%3,%0) \n") /* output 1 overwrites row 1 */\
+        "add $16, %0 \n" /* advance by 4 floats */\
+        "jl 1b \n" /* loop while the sample offset is still negative */\
+        :"+&r"(i), "=&r"(j), "=&r"(k) /* %0: sample byte offset, %1: scratch row pointer, %2: scratch coefficient offset */\
+        :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1)) /* %3: end of row 0, %4: one past the last matrix_simd row, %5: minus the byte size of rows 1..in_ch-1 */\
+        :"memory" /* NOTE(review): the xmm registers used above are not declared as clobbers */\
+    );
+/* SSE downmix with the same signature as ff_ac3_downmix_c: mixes in_ch rows of samples[] in place using matrix[in_ch][2], writing row 0 (and row 1 when out_ch == 2). 16 bytes are handled per iteration, so len is presumably a multiple of 4 and the rows 16-byte aligned. */
+static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
+{
+    int (*matrix_cmp)[2] = (int(*)[2])matrix; /* integer view of the coefficients, used only to compare their bit patterns; NOTE(review): float-to-int pointer type punning */
+    intptr_t i,j,k;
+
+    i = -len*sizeof(float); /* negative byte offset from the end of a row; the asm loops count it up to 0 */
+    if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) { /* stereo fast path: the four listed cross terms are all-zero bits, [1][0]==[1][1] and [0][0]==[2][1]; NOTE(review): MIX5 also applies matrix[3][0] to row 4, but matrix[4][1] is not compared here */
+        MIX5(IF0,IF1);
+    } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) { /* mono fast path: rows 0/2 and rows 3/4 share a coefficient */
+        MIX5(IF1,IF0);
+    } else {
+        DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]); /* per input row: both coefficients, each replicated into 4 lanes (filled below); presumably 16-byte aligned, judging by the macro name */
+        j = 2*in_ch*sizeof(float); /* byte size of matrix[in_ch][2]; the loop below walks the rows from last to first */
+        asm volatile(
+            "1: \n"
+            "sub $8, %0 \n" /* step back one matrix row (2 floats) */
+            "movss     (%2,%0), %%xmm6 \n" /* xmm6 = output 0 coefficient of this row */
+            "movss    4(%2,%0), %%xmm7 \n" /* xmm7 = output 1 coefficient of this row */
+            "shufps $0, %%xmm6, %%xmm6 \n" /* replicate into all 4 lanes */
+            "shufps $0, %%xmm7, %%xmm7 \n"
+            "movaps %%xmm6,   (%1,%0,4) \n" /* matrix_simd rows are 4x larger than matrix rows, hence the offset scaled by 4 */
+            "movaps %%xmm7, 16(%1,%0,4) \n"
+            "jg 1b \n" /* loop while the offset left by the sub above is > 0; the last pass handles row 0 */
+            :"+&r"(j)
+            :"r"(matrix_simd), "r"(matrix)
+            :"memory"
+        );
+        if(out_ch == 2) { /* NOTE(review): MIX_MISC relies on xmm6/xmm7 still holding the row 0 coefficients left by the loop above */
+            MIX_MISC(IF1);
+        } else { /* any out_ch other than 2 takes the single-output variant */
+            MIX_MISC(IF0);
+        }
+    }
+}
+
 static void vector_fmul_3dnow(float *dst, const float *src, int len){
     x86_reg i = (len-4)*4;
     asm volatile(
@@ -2682,6 +2781,7 @@
         }
         if(mm_flags & MM_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
+            c->ac3_downmix = ac3_downmix_sse;
             c->vector_fmul = vector_fmul_sse;
             c->vector_fmul_reverse = vector_fmul_reverse_sse;
             c->vector_fmul_add_add = vector_fmul_add_add_sse;