libavcodec.hg: dsputil.h comparison

comparison dsputil.h @ 10644:5da7180afadf libavcodec

Refactor and optimize scalarproduct: 29-105% faster apply_filter, 6-90% faster ape decoding on core2. (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4.

author	lorenm
date	Sat, 05 Dec 2009 15:09:10 +0000
parents	74b0c1a0851e
children	3d011a01a6a0

comparison

equal deleted inserted replaced

-:7f6911429cdc
+:5da7180afadf
 /* intrax8 functions */
 void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize);
 void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
 int * range, int * sum,  int edges);
-/* ape functions */
-/**
-* Add contents of the second vector to the first one.
-* @param len length of vectors, should be multiple of 16
-*/
-void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
-/**
-* Add contents of the second vector to the first one.
-* @param len length of vectors, should be multiple of 16
-*/
-void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
 /**
 * Calculate scalar product of two vectors.
 * @param len length of vectors, should be multiple of 16
 * @param shift number of bits to discard from product
 */
 int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
+/* ape functions */
+/**
+* Calculate scalar product of v1 and v2,
+* and v1[i] += v3[i] * mul
+* @param len length of vectors, should be multiple of 16
+*/
+int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul);
 /* rv30 functions */
 qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
 qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];

Mercurial > libavcodec.hg

comparison dsputil.h @ 10644:5da7180afadf libavcodec