comparison: libavcodec/dsputil.c @ 10644:5da7180afadf

refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2.
(Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.)
9-123% faster ape decoding on G4.
author lorenm
date Sat, 05 Dec 2009 15:09:10 +0000
parents 546b7ebeaf07
children 36611425fedb
--- a/libavcodec/dsputil.c	10643:7f6911429cdc
+++ b/libavcodec/dsputil.c	10644:5da7180afadf
@@ -4296,29 +4296,27 @@
         for(i=0, j=c; i<len; i++, j+=channels)
             dst[j] = float_to_int16_one(src[c]+i);
     }
 }
 
-static void add_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ += *v2++;
-}
-
-static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ -= *v2++;
-}
-
 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
 {
     int res = 0;
 
     while (order--)
         res += (*v1++ * *v2++) >> shift;
 
     return res;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+{
+    int res = 0;
+    while (order--) {
+        res += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
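
For reference, the new fused routine composes the two operations it replaces: with mul = 1 (or -1) it returns the same dot product as a shift-0 scalarproduct_int16 over v1/v2 and leaves v1 in the same state as a following add_int16 (or sub_int16) with v3, because the dot product reads each v1 element before it is updated. The standalone sketch below checks that equivalence; the helper names (scalarproduct_and_madd_ref, two_pass_ref) are hypothetical test scaffolding, not code from the tree.

/* Standalone sketch, not part of dsputil.c: verifies the fused routine against
 * the removed two-pass pattern (scalarproduct_int16 with shift 0, then
 * add_int16/sub_int16 selected by mul). All helper names are hypothetical. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int32_t scalarproduct_and_madd_ref(int16_t *v1, const int16_t *v2,
                                          const int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res += *v1 * *v2++;     /* dot product reads v1 before it is updated */
        *v1++ += mul * *v3++;   /* filter adaptation in the same pass */
    }
    return res;
}

static int32_t two_pass_ref(int16_t *v1, const int16_t *v2,
                            const int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < order; i++)   /* old scalarproduct_int16, shift 0 */
        res += v1[i] * v2[i];
    for (i = 0; i < order; i++)   /* old add_int16 (mul=1) / sub_int16 (mul=-1) */
        v1[i] += mul * v3[i];
    return res;
}

int main(void)
{
    int16_t v2[8] = { 3, -1, 4, 1, -5, 9, -2, 6 };
    int16_t v3[8] = { 1,  0, 1, 0,  1, 0,  1, 0 };
    int16_t a[8]  = { 7, -8, 2, 0,  5, -3, 1, 4 }, b[8];
    memcpy(b, a, sizeof(a));

    int32_t r1 = scalarproduct_and_madd_ref(a, v2, v3, 8, -1);
    int32_t r2 = two_pass_ref(b, v2, v3, 8, -1);
    printf("fused=%d two-pass=%d v1-matches=%d\n",
           (int)r1, (int)r2, !memcmp(a, b, sizeof(a)));
    return 0;
}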
@@ -4846,13 +4844,12 @@
     c->vector_fmul_window = ff_vector_fmul_window_c;
     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
     c->vector_clipf = vector_clipf_c;
     c->float_to_int16 = ff_float_to_int16_c;
     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
-    c->add_int16 = add_int16_c;
-    c->sub_int16 = sub_int16_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
 
     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
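
On the caller side, decoders reach the new primitive through the DSPContext hook installed above, so arch-specific versions (the SSSE3 and Altivec ones this commit targets) can replace the C fallback transparently. A hedged in-tree sketch of the call pattern follows; the function and variable names (predictor_step, coeffs, delay, adapt) are illustrative, not the actual apedec.c code.

/* Illustrative fragment assuming it lives inside libavcodec so that dsputil.h
 * is available; every name other than DSPContext and
 * scalarproduct_and_madd_int16 is hypothetical. */
#include "dsputil.h"

static int32_t predictor_step(DSPContext *dsp, int16_t *coeffs, int16_t *delay,
                              int16_t *adapt, int order, int sign)
{
    /* One pass replaces the former scalarproduct_int16 + add_int16/sub_int16
     * pair: dot(coeffs, delay) with the old coefficients, then
     * coeffs += sign * adapt. */
    return dsp->scalarproduct_and_madd_int16(coeffs, delay, adapt, order, sign);
}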