diff dsputil.c @ 10644:5da7180afadf libavcodec

refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4.
author lorenm
date Sat, 05 Dec 2009 15:09:10 +0000
parents 546b7ebeaf07
children 36611425fedb
line wrap: on
line diff
--- a/dsputil.c	Sat Dec 05 09:41:23 2009 +0000
+++ b/dsputil.c	Sat Dec 05 15:09:10 2009 +0000
@@ -4298,18 +4298,6 @@
     }
 }
 
-static void add_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-       *v1++ += *v2++;
-}
-
-static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ -= *v2++;
-}
-
 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
 {
     int res = 0;
@@ -4320,6 +4308,16 @@
     return res;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+{
+    int res = 0;
+    while (order--) {
+        res   += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -4848,9 +4846,8 @@
     c->vector_clipf = vector_clipf_c;
     c->float_to_int16 = ff_float_to_int16_c;
     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
-    c->add_int16 = add_int16_c;
-    c->sub_int16 = sub_int16_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;