# HG changeset patch # User darkshikari # Date 1280780289 0 # Node ID 2d15f62f4f8a9ea6fa3ecb7e6e93828daa0e61e3 # Parent 57fc7f2d7b289f26de8b0bfc4787a81a5361c9c3 VP8: move zeroing of luma DC block into the WHT Lets us do the zeroing in asm instead of C. Also makes it consistent with the way the regular iDCT code does it. diff -r 57fc7f2d7b28 -r 2d15f62f4f8a vp8.c --- a/vp8.c Mon Aug 02 09:44:53 2010 +0000 +++ b/vp8.c Mon Aug 02 20:18:09 2010 +0000 @@ -117,6 +117,7 @@ */ DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4]; DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16]; + DECLARE_ALIGNED(16, DCTELEM, block_dc)[16]; uint8_t intra4x4_pred_mode_mb[16]; int chroma_pred_mode; ///< 8x8c pred mode of the current macroblock @@ -864,22 +865,19 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9]) { - LOCAL_ALIGNED_16(DCTELEM, dc,[16]); int i, x, y, luma_start = 0, luma_ctx = 3; int nnz_pred, nnz, nnz_total = 0; int segment = s->segment; if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) { - AV_ZERO128(dc); - AV_ZERO128(dc+8); nnz_pred = t_nnz[8] + l_nnz[8]; // decode DC values and do hadamard - nnz = decode_block_coeffs(c, dc, s->prob->token[1], 0, nnz_pred, + nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred, s->qmat[segment].luma_dc_qmul); l_nnz[8] = t_nnz[8] = !!nnz; nnz_total += nnz; - s->vp8dsp.vp8_luma_dc_wht(s->block, dc); + s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc); luma_start = 1; luma_ctx = 0; } diff -r 57fc7f2d7b28 -r 2d15f62f4f8a vp8dsp.c --- a/vp8dsp.c Mon Aug 02 09:44:53 2010 +0000 +++ b/vp8dsp.c Mon Aug 02 20:18:09 2010 +0000 @@ -46,6 +46,10 @@ t1 = dc[i*4+1] + dc[i*4+2]; t2 = dc[i*4+1] - dc[i*4+2]; t3 = dc[i*4+0] - dc[i*4+3] + 3; // rounding + dc[i*4+0] = 0; + dc[i*4+1] = 0; + dc[i*4+2] = 0; + dc[i*4+3] = 0; *block[i][0] = (t0 + t1) >> 3; *block[i][1] = (t3 + t2) >> 3; diff -r 57fc7f2d7b28 -r 2d15f62f4f8a x86/vp8dsp-init.c --- a/x86/vp8dsp-init.c Mon Aug 02 09:44:53 2010 +0000 +++ b/x86/vp8dsp-init.c Mon Aug 02 20:18:09 2010 +0000 @@ -224,6 +224,7 @@ extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride); extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride); extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); +extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]); extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); @@ -335,6 +336,7 @@ if (mm_flags & FF_MM_SSE) { c->vp8_idct_add = ff_vp8_idct_add_sse; + c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; c->put_vp8_epel_pixels_tab[0][0][0] = c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; } diff -r 57fc7f2d7b28 -r 2d15f62f4f8a x86/vp8dsp.asm --- a/x86/vp8dsp.asm Mon Aug 02 09:44:53 2010 +0000 +++ b/x86/vp8dsp.asm Mon Aug 02 20:18:09 2010 +0000 @@ -1186,12 +1186,23 @@ SWAP %1, %4, %3 %endmacro -INIT_MMX -cglobal vp8_luma_dc_wht_mmx, 2,3 +%macro VP8_DC_WHT 1 +cglobal vp8_luma_dc_wht_%1, 2,3 movq m0, [r1] movq m1, [r1+8] movq m2, [r1+16] movq m3, [r1+24] +%ifidn %1, sse + xorps xmm0, xmm0 + movaps [r1+ 0], xmm0 + movaps [r1+16], xmm0 +%else + pxor m4, m4 + movq [r1+ 0], m4 + movq [r1+ 8], m4 + movq [r1+16], m4 + movq [r1+24], m4 +%endif HADAMARD4_1D 0, 1, 2, 3 TRANSPOSE4x4W 0, 1, 2, 3, 4 paddw m0, [pw_3] @@ -1203,6 +1214,11 @@ SCATTER_WHT 0, 1, 0 SCATTER_WHT 2, 3, 2 RET +%endmacro + +INIT_MMX +VP8_DC_WHT mmx +VP8_DC_WHT sse ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter_simple_(uint8_t *dst, int stride, int flim);