comparison dsputil.h @ 10644:5da7180afadf libavcodec

refactor and optimize scalarproduct: 29-105% faster apply_filter, 6-90% faster ape decoding on core2. (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4.
author lorenm
date Sat, 05 Dec 2009 15:09:10 +0000
parents 74b0c1a0851e
children 3d011a01a6a0
comparison
equal deleted inserted replaced
10643:7f6911429cdc 10644:5da7180afadf
558 /* intrax8 functions */ 558 /* intrax8 functions */
559 void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize); 559 void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize);
560 void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize, 560 void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
561 int * range, int * sum, int edges); 561 int * range, int * sum, int edges);
562 562
563 /* ape functions */
564 /**
565 * Add contents of the second vector to the first one.
566 * @param len length of vectors, should be multiple of 16
567 */
568 void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
569 /**
570 * Subtract contents of the second vector from the first one.
571 * @param len length of vectors, should be multiple of 16
572 */
573 void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
574 /** 563 /**
575 * Calculate scalar product of two vectors. 564 * Calculate scalar product of two vectors.
576 * @param len length of vectors, should be multiple of 16 565 * @param len length of vectors, should be multiple of 16
577 * @param shift number of bits to discard from product 566 * @param shift number of bits to discard from product
578 */ 567 */
579 int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift); 568 int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
569 /* ape functions */
570 /**
571 * Calculate scalar product of v1 and v2,
572 * and v1[i] += v3[i] * mul
573 * @param len length of vectors, should be multiple of 16
574 */
575 int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul);
580 576
581 /* rv30 functions */ 577 /* rv30 functions */
582 qpel_mc_func put_rv30_tpel_pixels_tab[4][16]; 578 qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
583 qpel_mc_func avg_rv30_tpel_pixels_tab[4][16]; 579 qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
584 580