comparison: libavcodec/dsputil.c @ 10644:5da7180afadf

refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2.
(Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.)
9-123% faster ape decoding on G4.
author lorenm
date Sat, 05 Dec 2009 15:09:10 +0000
parents 546b7ebeaf07
children 36611425fedb
--- a/libavcodec/dsputil.c	10643:7f6911429cdc
+++ b/libavcodec/dsputil.c	10644:5da7180afadf
@@ -4296,29 +4296,27 @@
         for(i=0, j=c; i<len; i++, j+=channels)
             dst[j] = float_to_int16_one(src[c]+i);
     }
 }
 
-static void add_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ += *v2++;
-}
-
-static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ -= *v2++;
-}
-
 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
 {
     int res = 0;
 
     while (order--)
         res += (*v1++ * *v2++) >> shift;
 
     return res;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+{
+    int res = 0;
+    while (order--) {
+        res += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
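
For reference, the new fused routine composes the two operations it replaces: with mul = 1 (or -1) it returns the same dot product as a shift-0 scalarproduct_int16 over v1/v2 and leaves v1 in the same state as a following add_int16 (or sub_int16) with v3, because the dot product reads each v1 element before it is updated. The standalone sketch below checks that equivalence; the helper names (scalarproduct_and_madd_ref, two_pass_ref) are hypothetical test scaffolding, not code from the tree.

/* Standalone sketch, not part of dsputil.c: verifies the fused routine against
 * the removed two-pass pattern (scalarproduct_int16 with shift 0, then
 * add_int16/sub_int16 selected by mul). All helper names are hypothetical. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int32_t scalarproduct_and_madd_ref(int16_t *v1, const int16_t *v2,
                                          const int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res += *v1 * *v2++;     /* dot product reads v1 before it is updated */
        *v1++ += mul * *v3++;   /* filter adaptation in the same pass */
    }
    return res;
}

static int32_t two_pass_ref(int16_t *v1, const int16_t *v2,
                            const int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < order; i++)   /* old scalarproduct_int16, shift 0 */
        res += v1[i] * v2[i];
    for (i = 0; i < order; i++)   /* old add_int16 (mul=1) / sub_int16 (mul=-1) */
        v1[i] += mul * v3[i];
    return res;
}

int main(void)
{
    int16_t v2[8] = { 3, -1, 4, 1, -5, 9, -2, 6 };
    int16_t v3[8] = { 1,  0, 1, 0,  1, 0,  1, 0 };
    int16_t a[8]  = { 7, -8, 2, 0,  5, -3, 1, 4 }, b[8];
    memcpy(b, a, sizeof(a));

    int32_t r1 = scalarproduct_and_madd_ref(a, v2, v3, 8, -1);
    int32_t r2 = two_pass_ref(b, v2, v3, 8, -1);
    printf("fused=%d two-pass=%d v1-matches=%d\n",
           (int)r1, (int)r2, !memcmp(a, b, sizeof(a)));
    return 0;
}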
@@ -4846,13 +4844,12 @@
     c->vector_fmul_window = ff_vector_fmul_window_c;
     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
     c->vector_clipf = vector_clipf_c;
     c->float_to_int16 = ff_float_to_int16_c;
     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
-    c->add_int16 = add_int16_c;
-    c->sub_int16 = sub_int16_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
 
     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
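
On the caller side, decoders reach the new primitive through the DSPContext hook installed above, so arch-specific versions (the SSSE3 and Altivec ones this commit targets) can replace the C fallback transparently. A hedged in-tree sketch of the call pattern follows; the function and variable names (predictor_step, coeffs, delay, adapt) are illustrative, not the actual apedec.c code.

/* Illustrative fragment assuming it lives inside libavcodec so that dsputil.h
 * is available; every name other than DSPContext and
 * scalarproduct_and_madd_int16 is hypothetical. */
#include "dsputil.h"

static int32_t predictor_step(DSPContext *dsp, int16_t *coeffs, int16_t *delay,
                              int16_t *adapt, int order, int sign)
{
    /* One pass replaces the former scalarproduct_int16 + add_int16/sub_int16
     * pair: dot(coeffs, delay) with the old coefficients, then
     * coeffs += sign * adapt. */
    return dsp->scalarproduct_and_madd_int16(coeffs, delay, adapt, order, sign);
}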