# HG changeset patch # User darkshikari # Date 1279839788 0 # Node ID d3f3897ddb5c6e04a635b9228c26771b392614cb # Parent 00a4a6b56c7f7ec578bee2bdb95a0d01524a1627 Smarter VP8 prefetching Prefetch all refs (including altref), but only if they've been used so far this frame. ~2.5% faster overall. TODO: Do something even smarter, like using how often each ref has been used so far, so that a couple blocks of a rarely-used ref don't force us to prefetch it. diff -r 00a4a6b56c7f -r d3f3897ddb5c vp8.c --- a/vp8.c Thu Jul 22 22:15:43 2010 +0000 +++ b/vp8.c Thu Jul 22 23:03:08 2010 +0000 @@ -123,6 +123,7 @@ int mbskip_enabled; int sign_bias[4]; ///< one state [0, 1] per ref frame type + int ref_count[3]; /** * Base parameters for segmentation, i.e. per-macroblock parameters. @@ -733,6 +734,7 @@ VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN; else mb->ref_frame = VP56_FRAME_PREVIOUS; + s->ref_count[mb->ref_frame-1]++; // motion vectors, 16.3 find_near_mvs(s, mb, mb_x, mb_y, near, &best, cnt); @@ -1081,15 +1083,19 @@ /* Fetch pixels for estimated mv 4 macroblocks ahead. * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */ -static inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int x_off, int y_off, int ref) +static inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int ref) { - int mx = mb->mv.x + x_off + 8; - int my = mb->mv.y + y_off; - uint8_t **src= s->framep[ref]->data; - int off= mx + (my + (mb_x&3)*4)*s->linesize + 64; - s->dsp.prefetch(src[0]+off, s->linesize, 4); - off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64; - s->dsp.prefetch(src[1]+off, src[2]-src[1], 2); + /* Don't prefetch refs that haven't been used yet this frame. */ + if (s->ref_count[ref-1]) { + int x_off = mb_x << 4, y_off = mb_y << 4; + int mx = mb->mv.x + x_off + 8; + int my = mb->mv.y + y_off; + uint8_t **src= s->framep[ref]->data; + int off= mx + (my + (mb_x&3)*4)*s->linesize + 64; + s->dsp.prefetch(src[0]+off, s->linesize, 4); + off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64; + s->dsp.prefetch(src[1]+off, src[2]-src[1], 2); + } } /** @@ -1103,8 +1109,6 @@ AVFrame *ref = s->framep[mb->ref_frame]; VP56mv *bmv = mb->bmv; - prefetch_motion(s, mb, mb_x, mb_y, x_off, y_off, VP56_FRAME_PREVIOUS); - if (mb->mode < VP8_MVMODE_SPLIT) { vp8_mc_part(s, dst, ref, x_off, y_off, 0, 0, 16, 16, width, height, &mb->mv); @@ -1179,8 +1183,6 @@ 8, 8, 8, 8, width, height, &bmv[3]); break; } - - prefetch_motion(s, mb, mb_x, mb_y, x_off, y_off, VP56_FRAME_GOLDEN); } static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst, @@ -1458,6 +1460,7 @@ // top edge of 127 for intra prediction memset(s->top_border, 127, (s->mb_width+1)*sizeof(*s->top_border)); + memset(s->ref_count, 0, sizeof(s->ref_count)); for (mb_y = 0; mb_y < s->mb_height; mb_y++) { VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)]; @@ -1490,6 +1493,8 @@ decode_mb_mode(s, mb, mb_x, mb_y, intra4x4_mb, segment_mb); + prefetch_motion(s, mb, mb_x, mb_y, VP56_FRAME_PREVIOUS); + if (!mb->skip) decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz); else { @@ -1502,6 +1507,8 @@ else inter_predict(s, dst, mb, mb_x, mb_y); + prefetch_motion(s, mb, mb_x, mb_y, VP56_FRAME_GOLDEN); + if (!mb->skip) { idct_mb(s, dst[0], dst[1], dst[2], mb); } else { @@ -1518,6 +1525,8 @@ if (s->deblock_filter) filter_level_for_mb(s, mb, &s->filter_strength[mb_x]); + prefetch_motion(s, mb, mb_x, mb_y, VP56_FRAME_GOLDEN2); + dst[0] += 16; dst[1] += 8; dst[2] += 8;