Mercurial > libavcodec.hg
comparison dsputil.c @ 10644:5da7180afadf libavcodec
refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.)
9-123% faster ape decoding on G4.
author | lorenm |
---|---|
date | Sat, 05 Dec 2009 15:09:10 +0000 |
parents | 546b7ebeaf07 |
children | 36611425fedb |
Comparison legend: equal | deleted | inserted | replaced
10643:7f6911429cdc | 10644:5da7180afadf |
---|---|
4296 for(i=0, j=c; i<len; i++, j+=channels) | 4296 for(i=0, j=c; i<len; i++, j+=channels) |
4297 dst[j] = float_to_int16_one(src[c]+i); | 4297 dst[j] = float_to_int16_one(src[c]+i); |
4298 } | 4298 } |
4299 } | 4299 } |
4300 | 4300 |
4301 static void add_int16_c(int16_t * v1, int16_t * v2, int order) | |
4302 { | |
4303 while (order--) | |
4304 *v1++ += *v2++; | |
4305 } | |
4306 | |
4307 static void sub_int16_c(int16_t * v1, int16_t * v2, int order) | |
4308 { | |
4309 while (order--) | |
4310 *v1++ -= *v2++; | |
4311 } | |
4312 | |
4313 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift) | 4301 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift) |
4314 { | 4302 { |
4315 int res = 0; | 4303 int res = 0; |
4316 | 4304 |
4317 while (order--) | 4305 while (order--) |
4318 res += (*v1++ * *v2++) >> shift; | 4306 res += (*v1++ * *v2++) >> shift; |
4319 | 4307 |
4308 return res; | |
4309 } | |
4310 | |
4311 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) | |
4312 { | |
4313 int res = 0; | |
4314 while (order--) { | |
4315 res += *v1 * *v2++; | |
4316 *v1++ += mul * *v3++; | |
4317 } | |
4320 return res; | 4318 return res; |
4321 } | 4319 } |
4322 | 4320 |
4323 #define W0 2048 | 4321 #define W0 2048 |
4324 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ | 4322 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ |
4846 c->vector_fmul_window = ff_vector_fmul_window_c; | 4844 c->vector_fmul_window = ff_vector_fmul_window_c; |
4847 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; | 4845 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; |
4848 c->vector_clipf = vector_clipf_c; | 4846 c->vector_clipf = vector_clipf_c; |
4849 c->float_to_int16 = ff_float_to_int16_c; | 4847 c->float_to_int16 = ff_float_to_int16_c; |
4850 c->float_to_int16_interleave = ff_float_to_int16_interleave_c; | 4848 c->float_to_int16_interleave = ff_float_to_int16_interleave_c; |
4851 c->add_int16 = add_int16_c; | |
4852 c->sub_int16 = sub_int16_c; | |
4853 c->scalarproduct_int16 = scalarproduct_int16_c; | 4849 c->scalarproduct_int16 = scalarproduct_int16_c; |
4850 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; | |
4854 c->scalarproduct_float = scalarproduct_float_c; | 4851 c->scalarproduct_float = scalarproduct_float_c; |
4855 c->butterflies_float = butterflies_float_c; | 4852 c->butterflies_float = butterflies_float_c; |
4856 c->vector_fmul_scalar = vector_fmul_scalar_c; | 4853 c->vector_fmul_scalar = vector_fmul_scalar_c; |
4857 | 4854 |
4858 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c; | 4855 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c; |