Mercurial > libavcodec.hg
comparison dsputil.h @ 10644:5da7180afadf libavcodec
refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.)
9-123% faster ape decoding on G4.
author | lorenm |
---|---|
date | Sat, 05 Dec 2009 15:09:10 +0000 |
parents | 74b0c1a0851e |
children | 3d011a01a6a0 |
comparison
equal
deleted
inserted
replaced
10643:7f6911429cdc | 10644:5da7180afadf |
---|---|
558 /* intrax8 functions */ | 558 /* intrax8 functions */ |
559 void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize); | 559 void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize); |
560 void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize, | 560 void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize, |
561 int * range, int * sum, int edges); | 561 int * range, int * sum, int edges); |
562 | 562 |
563 /* ape functions */ | |
564 /** | |
565 * Add contents of the second vector to the first one. | |
566 * @param len length of vectors, should be multiple of 16 | |
567 */ | |
568 void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len); | |
569 /** | |
570 * Subtract contents of the second vector from the first one. | |
571 * @param len length of vectors, should be multiple of 16 | |
572 */ | |
573 void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len); | |
574 /** | 563 /** |
575 * Calculate scalar product of two vectors. | 564 * Calculate scalar product of two vectors. |
576 * @param len length of vectors, should be multiple of 16 | 565 * @param len length of vectors, should be multiple of 16 |
577 * @param shift number of bits to discard from product | 566 * @param shift number of bits to discard from product |
578 */ | 567 */ |
579 int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift); | 568 int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift); |
569 /* ape functions */ | |
570 /** | |
571 * Calculate scalar product of v1 and v2, | |
572 * and v1[i] += v3[i] * mul | |
573 * @param len length of vectors, should be multiple of 16 | |
574 */ | |
575 int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul); | |
580 | 576 |
581 /* rv30 functions */ | 577 /* rv30 functions */ |
582 qpel_mc_func put_rv30_tpel_pixels_tab[4][16]; | 578 qpel_mc_func put_rv30_tpel_pixels_tab[4][16]; |
583 qpel_mc_func avg_rv30_tpel_pixels_tab[4][16]; | 579 qpel_mc_func avg_rv30_tpel_pixels_tab[4][16]; |
584 | 580 |