Mercurial > libavcodec.hg
comparison apedec.c @ 10644:5da7180afadf libavcodec
refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2
(x86 CPUs other than Core 2 will probably gain much less, since the speedup is mostly due to SSSE3 cache-split avoidance and I haven't written the full gamut of other cache-split modes.)
9-123% faster ape decoding on G4.
author | lorenm |
---|---|
date | Sat, 05 Dec 2009 15:09:10 +0000 |
parents | 4e0b0c0cb915 |
children | 5518cf6b6f2e |
Comparison legend: equal, deleted, inserted, replaced
10643:7f6911429cdc | 10644:5da7180afadf |
---|---|
646 { | 646 { |
647 do_init_filter(&f[0], buf, order); | 647 do_init_filter(&f[0], buf, order); |
648 do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order); | 648 do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order); |
649 } | 649 } |
650 | 650 |
651 static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits) | 651 static void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits) |
652 { | 652 { |
653 int res; | 653 int res; |
654 int absres; | 654 int absres; |
655 | 655 |
656 while (count--) { | 656 while (count--) { |
657 /* round fixedpoint scalar product */ | 657 /* round fixedpoint scalar product */ |
658 res = (ctx->dsp.scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits; | 658 res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data)); |
659 | 659 res = (res + (1 << (fracbits - 1))) >> fracbits; |
660 if (*data < 0) | |
661 ctx->dsp.add_int16(f->coeffs, f->adaptcoeffs - order, order); | |
662 else if (*data > 0) | |
663 ctx->dsp.sub_int16(f->coeffs, f->adaptcoeffs - order, order); | |
664 | |
665 res += *data; | 660 res += *data; |
666 | |
667 *data++ = res; | 661 *data++ = res; |
668 | 662 |
669 /* Update the output history */ | 663 /* Update the output history */ |
670 *f->delay++ = av_clip_int16(res); | 664 *f->delay++ = av_clip_int16(res); |
671 | 665 |