Mercurial > libavcodec.hg
diff dsputil.c @ 10644:5da7180afadf libavcodec
refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.)
9-123% faster ape decoding on G4.
author | lorenm |
---|---|
date | Sat, 05 Dec 2009 15:09:10 +0000 |
parents | 546b7ebeaf07 |
children | 36611425fedb |
line wrap: on
line diff
```diff
--- a/dsputil.c Sat Dec 05 09:41:23 2009 +0000
+++ b/dsputil.c Sat Dec 05 15:09:10 2009 +0000
@@ -4298,18 +4298,6 @@
     }
 }
 
-static void add_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ += *v2++;
-}
-
-static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ -= *v2++;
-}
-
 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
 {
     int res = 0;
@@ -4320,6 +4308,16 @@
     return res;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+{
+    int res = 0;
+    while (order--) {
+        res += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -4848,9 +4846,8 @@
     c->vector_clipf = vector_clipf_c;
     c->float_to_int16 = ff_float_to_int16_c;
     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
-    c->add_int16 = add_int16_c;
-    c->sub_int16 = sub_int16_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
```