Mercurial > libavcodec.hg
diff vp8dsp.c @ 12238:1a7903913e9b libavcodec
VP8: 30% faster idct_mb
Take shortcuts based on statistically common situations.
Add 4-at-a-time idct_dc function (mmx and sse2) since rows of 4 DC-only DCT
blocks are common.
TODO: tie this more directly into the MB mode, since the DC-level transform is
only used for non-splitmv blocks?
author | darkshikari |
---|---|
date | Fri, 23 Jul 2010 02:58:27 +0000 |
parents | e08d65897115 |
children | c7f6ddcc5c01 |
line wrap: on
line diff
```diff
--- a/vp8dsp.c Fri Jul 23 01:59:56 2010 +0000
+++ b/vp8dsp.c Fri Jul 23 02:58:27 2010 +0000
@@ -109,6 +109,25 @@
     }
 }
 
+static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride)
+{
+    int i, j;
+    for (j = 0; j < 4; j++) {
+        uint8_t *pix = dst+j*4;
+        int dc = (block[j][0] + 4) >> 3;
+        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
+        block[j][0] = 0;
+        if (!dc)
+            continue;
+        for (i = 0; i < 4; i++) {
+            pix[0] = cm[pix[0]];
+            pix[1] = cm[pix[1]];
+            pix[2] = cm[pix[2]];
+            pix[3] = cm[pix[3]];
+            pix += stride;
+        }
+    }
+}
 
 // because I like only having two parameters to pass functions...
 #define LOAD_PIXELS\
@@ -460,9 +479,10 @@
 
 av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
 {
-    dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
-    dsp->vp8_idct_add    = vp8_idct_add_c;
-    dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+    dsp->vp8_luma_dc_wht  = vp8_luma_dc_wht_c;
+    dsp->vp8_idct_add     = vp8_idct_add_c;
+    dsp->vp8_idct_dc_add  = vp8_idct_dc_add_c;
+    dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c;
 
     dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
     dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;
```