comparison apedec.c @ 10644:5da7180afadf libavcodec

Refactor and optimize scalarproduct: 29-105% faster apply_filter, 6-90% faster APE decoding on Core 2. (Any x86 other than Core 2 probably gets much less, since this is mostly due to SSSE3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster APE decoding on G4.
author lorenm
date Sat, 05 Dec 2009 15:09:10 +0000
parents 4e0b0c0cb915
children 5518cf6b6f2e
comparison
equal deleted inserted replaced
10643:7f6911429cdc 10644:5da7180afadf
646 { 646 {
647 do_init_filter(&f[0], buf, order); 647 do_init_filter(&f[0], buf, order);
648 do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order); 648 do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order);
649 } 649 }
650 650
651 static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits) 651 static void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
652 { 652 {
653 int res; 653 int res;
654 int absres; 654 int absres;
655 655
656 while (count--) { 656 while (count--) {
657 /* round fixedpoint scalar product */ 657 /* round fixedpoint scalar product */
658 res = (ctx->dsp.scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits; 658 res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data));
659 659 res = (res + (1 << (fracbits - 1))) >> fracbits;
660 if (*data < 0)
661 ctx->dsp.add_int16(f->coeffs, f->adaptcoeffs - order, order);
662 else if (*data > 0)
663 ctx->dsp.sub_int16(f->coeffs, f->adaptcoeffs - order, order);
664
665 res += *data; 660 res += *data;
666
667 *data++ = res; 661 *data++ = res;
668 662
669 /* Update the output history */ 663 /* Update the output history */
670 *f->delay++ = av_clip_int16(res); 664 *f->delay++ = av_clip_int16(res);
671 665