diff x86/dsputil_mmx.c @ 10300:4d1b9ca628fc libavcodec

Drop unused args from vector_fmul_add_add, simplify code, and rename. The src3 and step arguments to vector_fmul_add_add() are always zero and one, respectively. This removes these arguments from the function, simplifies the code accordingly, and renames the function to better match the new operation.
author mru
date Sun, 27 Sep 2009 16:51:54 +0000
parents 3b61bc6ce377
children 02798c603744
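
For reference, the operation implemented by both SIMD versions below is the element-wise multiply-add dst[i] = src0[i]*src1[i] + src2[i]. A minimal scalar sketch of that semantics (illustrative only; the function name here is assumed, matching what ff_vector_fmul_add_add_c reduces to with src3 = 0 and step = 1):

static void vector_fmul_add_c(float *dst, const float *src0,
                              const float *src1, const float *src2,
                              int len)
{
    int i;
    /* dst[i] = src0[i] * src1[i] + src2[i], the step == 1,
     * src3 == 0 case that the old vector_fmul_add_add was
     * reduced to before renaming. */
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}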
--- a/x86/dsputil_mmx.c	Sun Sep 27 09:43:11 2009 +0000
+++ b/x86/dsputil_mmx.c	Sun Sep 27 16:51:54 2009 +0000
@@ -2125,34 +2125,9 @@
     );
 }
 
-static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
-                                      const float *src2, int src3, int len, int step){
+static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
+                                  const float *src2, int len){
     x86_reg i = (len-4)*4;
-    if(step == 2 && src3 == 0){
-        dst += (len-4)*2;
-        __asm__ volatile(
-            "1: \n\t"
-            "movq   (%2,%0),  %%mm0 \n\t"
-            "movq  8(%2,%0),  %%mm1 \n\t"
-            "pfmul  (%3,%0),  %%mm0 \n\t"
-            "pfmul 8(%3,%0),  %%mm1 \n\t"
-            "pfadd  (%4,%0),  %%mm0 \n\t"
-            "pfadd 8(%4,%0),  %%mm1 \n\t"
-            "movd     %%mm0,   (%1) \n\t"
-            "movd     %%mm1, 16(%1) \n\t"
-            "psrlq      $32,  %%mm0 \n\t"
-            "psrlq      $32,  %%mm1 \n\t"
-            "movd     %%mm0,  8(%1) \n\t"
-            "movd     %%mm1, 24(%1) \n\t"
-            "sub  $32, %1 \n\t"
-            "sub  $16, %0 \n\t"
-            "jge  1b \n\t"
-            :"+r"(i), "+r"(dst)
-            :"r"(src0), "r"(src1), "r"(src2)
-            :"memory"
-        );
-    }
-    else if(step == 1 && src3 == 0){
         __asm__ volatile(
             "1: \n\t"
             "movq    (%2,%0), %%mm0 \n\t"
@@ -2169,47 +2144,11 @@
             :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
             :"memory"
         );
-    }
-    else
-        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
     __asm__ volatile("femms");
 }
-static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
-                                    const float *src2, int src3, int len, int step){
+static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
+                                const float *src2, int len){
     x86_reg i = (len-8)*4;
-    if(step == 2 && src3 == 0){
-        dst += (len-8)*2;
-        __asm__ volatile(
-            "1: \n\t"
-            "movaps   (%2,%0), %%xmm0 \n\t"
-            "movaps 16(%2,%0), %%xmm1 \n\t"
-            "mulps    (%3,%0), %%xmm0 \n\t"
-            "mulps  16(%3,%0), %%xmm1 \n\t"
-            "addps    (%4,%0), %%xmm0 \n\t"
-            "addps  16(%4,%0), %%xmm1 \n\t"
-            "movss     %%xmm0,   (%1) \n\t"
-            "movss     %%xmm1, 32(%1) \n\t"
-            "movhlps   %%xmm0, %%xmm2 \n\t"
-            "movhlps   %%xmm1, %%xmm3 \n\t"
-            "movss     %%xmm2, 16(%1) \n\t"
-            "movss     %%xmm3, 48(%1) \n\t"
-            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
-            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
-            "movss     %%xmm0,  8(%1) \n\t"
-            "movss     %%xmm1, 40(%1) \n\t"
-            "movhlps   %%xmm0, %%xmm2 \n\t"
-            "movhlps   %%xmm1, %%xmm3 \n\t"
-            "movss     %%xmm2, 24(%1) \n\t"
-            "movss     %%xmm3, 56(%1) \n\t"
-            "sub  $64, %1 \n\t"
-            "sub  $32, %0 \n\t"
-            "jge  1b \n\t"
-            :"+r"(i), "+r"(dst)
-            :"r"(src0), "r"(src1), "r"(src2)
-            :"memory"
-        );
-    }
-    else if(step == 1 && src3 == 0){
         __asm__ volatile(
             "1: \n\t"
             "movaps   (%2,%0), %%xmm0 \n\t"
@@ -2226,9 +2165,6 @@
             :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
             :"memory"
         );
-    }
-    else
-        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
 }
 
 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
@@ -3077,7 +3013,7 @@
             c->ac3_downmix = ac3_downmix_sse;
             c->vector_fmul = vector_fmul_sse;
             c->vector_fmul_reverse = vector_fmul_reverse_sse;
-            c->vector_fmul_add_add = vector_fmul_add_add_sse;
+            c->vector_fmul_add = vector_fmul_add_sse;
             c->vector_fmul_window = vector_fmul_window_sse;
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
             c->vector_clipf = vector_clipf_sse;
@@ -3085,7 +3021,7 @@
             c->float_to_int16_interleave = float_to_int16_interleave_sse;
         }
         if(mm_flags & FF_MM_3DNOW)
-            c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+            c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
         if(mm_flags & FF_MM_SSE2){
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = float_to_int16_sse2;
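
A hypothetical caller sketch (not part of this commit; the buffer names are invented, and a DSPContext "dsp" already filled in by dsputil_init() is assumed). The constraints follow from the asm above: the SSE version loads with movaps, so all four pointers must be 16-byte aligned, and it consumes 8 floats per iteration, so len should be a multiple of 8 (a multiple of 4 suffices for the 3DNow! version):

/* Hypothetical usage -- assumes dsp was set up by dsputil_init(). */
DECLARE_ALIGNED(16, float, dst[256]);
DECLARE_ALIGNED(16, float, a[256]);
DECLARE_ALIGNED(16, float, b[256]);
DECLARE_ALIGNED(16, float, c[256]);
/* ... fill a, b and c ... */
dsp.vector_fmul_add(dst, a, b, c, 256); /* dst[i] = a[i]*b[i] + c[i] */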