comparison dsputil.h @ 5737:efa3c1f9259a libavcodec

sse2 version of compute_autocorr(). 4x faster than C (somehow, even though doubles only allow 2x simd). overall flac encoding: 15-50% faster on core2, 4-11% on k8, 3-13% on p4.
author lorenm
date Sat, 29 Sep 2007 22:31:18 +0000
parents d7970c9e3049
children 09f99af1db40
comparison
equal deleted inserted replaced
5736:810067f2c33d 5737:efa3c1f9259a
326 326
327 void (*h261_loop_filter)(uint8_t *src, int stride); 327 void (*h261_loop_filter)(uint8_t *src, int stride);
328 328
329 /* assume len is a multiple of 4, and arrays are 16-byte aligned */ 329 /* assume len is a multiple of 4, and arrays are 16-byte aligned */
330 void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize); 330 void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
331 /* no alignment needed */
332 void (*flac_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
331 /* assume len is a multiple of 8, and arrays are 16-byte aligned */ 333 /* assume len is a multiple of 8, and arrays are 16-byte aligned */
332 void (*vector_fmul)(float *dst, const float *src, int len); 334 void (*vector_fmul)(float *dst, const float *src, int len);
333 void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len); 335 void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
334 /* assume len is a multiple of 8, and src arrays are 16-byte aligned */ 336 /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
335 void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step); 337 void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);