# HG changeset patch # User darkshikari # Date 1280782623 0 # Node ID b4c63ffd959b2222f7532da63f49e258c1f381f3 # Parent ad24cca213aeabbe708e7b2b4ed7bfdd22a2306d VP8: much faster DC transform handling A lot of the time the DC block is empty: don't do the WHT in this case. A lot of the rest of the time, there's only one coefficient: make a special DC-only transform for that case. When the block is empty, don't incorrectly mark luma DCT blocks as having DC coefficients. diff -r ad24cca213ae -r b4c63ffd959b vp8.c --- a/vp8.c Mon Aug 02 20:35:50 2010 +0000 +++ b/vp8.c Mon Aug 02 20:57:03 2010 +0000 @@ -868,6 +868,7 @@ int i, x, y, luma_start = 0, luma_ctx = 3; int nnz_pred, nnz, nnz_total = 0; int segment = s->segment; + int block_dc = 0; if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) { nnz_pred = t_nnz[8] + l_nnz[8]; @@ -876,8 +877,14 @@ nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred, s->qmat[segment].luma_dc_qmul); l_nnz[8] = t_nnz[8] = !!nnz; - nnz_total += nnz; - s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc); + if (nnz) { + nnz_total += nnz; + block_dc = 1; + if (nnz == 1) + s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc); + else + s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc); + } luma_start = 1; luma_ctx = 0; } @@ -888,8 +895,8 @@ nnz_pred = l_nnz[y] + t_nnz[x]; nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start, nnz_pred, s->qmat[segment].luma_qmul); - // nnz+luma_start may be one more than the actual last index, but we don't care - s->non_zero_count_cache[y][x] = nnz + luma_start; + // nnz+block_dc may be one more than the actual last index, but we don't care + s->non_zero_count_cache[y][x] = nnz + block_dc; t_nnz[x] = l_nnz[y] = !!nnz; nnz_total += nnz; } diff -r ad24cca213ae -r b4c63ffd959b vp8dsp.c --- a/vp8dsp.c Mon Aug 02 20:35:50 2010 +0000 +++ b/vp8dsp.c Mon Aug 02 20:57:03 2010 +0000 @@ -51,13 +51,25 @@ dc[i*4+2] = 0; dc[i*4+3] = 0; - *block[i][0] = (t0 + t1) >> 3; - *block[i][1] = (t3 + t2) >> 3; - *block[i][2] = (t0 - t1) >> 3; - *block[i][3] = (t3 - t2) >> 3; + block[i][0][0] = (t0 + t1) >> 3; + block[i][1][0] = (t3 + t2) >> 3; + block[i][2][0] = (t0 - t1) >> 3; + block[i][3][0] = (t3 - t2) >> 3; } } +static void vp8_luma_dc_wht_dc_c(DCTELEM block[4][4][16], DCTELEM dc[16]) +{ + int i, val = (dc[0] + 3) >> 3; + dc[0] = 0; + + for (i = 0; i < 4; i++) { + block[i][0][0] = val; + block[i][1][0] = val; + block[i][2][0] = val; + block[i][3][0] = val; + } +} #define MUL_20091(a) ((((a)*20091) >> 16) + (a)) #define MUL_35468(a) (((a)*35468) >> 16) @@ -480,6 +492,7 @@ av_cold void ff_vp8dsp_init(VP8DSPContext *dsp) { dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c; + dsp->vp8_luma_dc_wht_dc = vp8_luma_dc_wht_dc_c; dsp->vp8_idct_add = vp8_idct_add_c; dsp->vp8_idct_dc_add = vp8_idct_dc_add_c; dsp->vp8_idct_dc_add4y = vp8_idct_dc_add4y_c; diff -r ad24cca213ae -r b4c63ffd959b vp8dsp.h --- a/vp8dsp.h Mon Aug 02 20:35:50 2010 +0000 +++ b/vp8dsp.h Mon Aug 02 20:57:03 2010 +0000 @@ -31,6 +31,7 @@ typedef struct VP8DSPContext { void (*vp8_luma_dc_wht)(DCTELEM block[4][4][16], DCTELEM dc[16]); + void (*vp8_luma_dc_wht_dc)(DCTELEM block[4][4][16], DCTELEM dc[16]); void (*vp8_idct_add)(uint8_t *dst, DCTELEM block[16], int stride); void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride); void (*vp8_idct_dc_add4y)(uint8_t *dst, DCTELEM block[4][16], int stride);