comparison vp8.c @ 12237:f0c4dc49c8f1 libavcodec

VP8: smarter prefetching. Don't prefetch reference frames that were used less than 1/32nd of the time so far in the frame. This improves speed by up to ~2% on videos that, in many frames, make near-zero (but not entirely zero) use of the golden and/or altref frames. This is a very common property of videos encoded by libvpx.
author darkshikari
date Fri, 23 Jul 2010 01:59:56 +0000
parents e08d65897115
children 1a7903913e9b
comparison
equal deleted inserted replaced
12236:cabcd751b1e5 12237:f0c4dc49c8f1
1082 s->put_pixels_tab[1 + (block_w == 4)]); 1082 s->put_pixels_tab[1 + (block_w == 4)]);
1083 } 1083 }
1084 1084
1085 /* Fetch pixels for estimated mv 4 macroblocks ahead. 1085 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1086 * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */ 1086 * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1087 static inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int ref) 1087 static inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1088 { 1088 {
1089 /* Don't prefetch refs that haven't been used yet this frame. */ 1089 /* Don't prefetch refs that haven't been used very often this frame. */
1090 if (s->ref_count[ref-1]) { 1090 if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1091 int x_off = mb_x << 4, y_off = mb_y << 4; 1091 int x_off = mb_x << 4, y_off = mb_y << 4;
1092 int mx = mb->mv.x + x_off + 8; 1092 int mx = mb->mv.x + x_off + 8;
1093 int my = mb->mv.y + y_off; 1093 int my = mb->mv.y + y_off;
1094 uint8_t **src= s->framep[ref]->data; 1094 uint8_t **src= s->framep[ref]->data;
1095 int off= mx + (my + (mb_x&3)*4)*s->linesize + 64; 1095 int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1469 for (mb_y = 0; mb_y < s->mb_height; mb_y++) { 1469 for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1470 VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)]; 1470 VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1471 VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2; 1471 VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1472 uint8_t *intra4x4 = s->intra4x4_pred_mode + 4*mb_y*s->b4_stride; 1472 uint8_t *intra4x4 = s->intra4x4_pred_mode + 4*mb_y*s->b4_stride;
1473 uint8_t *segment_map = s->segmentation_map + mb_y*s->mb_stride; 1473 uint8_t *segment_map = s->segmentation_map + mb_y*s->mb_stride;
1474 int mb_xy = mb_y * s->mb_stride;
1474 uint8_t *dst[3] = { 1475 uint8_t *dst[3] = {
1475 curframe->data[0] + 16*mb_y*s->linesize, 1476 curframe->data[0] + 16*mb_y*s->linesize,
1476 curframe->data[1] + 8*mb_y*s->uvlinesize, 1477 curframe->data[1] + 8*mb_y*s->uvlinesize,
1477 curframe->data[2] + 8*mb_y*s->uvlinesize 1478 curframe->data[2] + 8*mb_y*s->uvlinesize
1478 }; 1479 };
1485 for (y = 0; y < 16>>!!i; y++) 1486 for (y = 0; y < 16>>!!i; y++)
1486 dst[i][y*curframe->linesize[i]-1] = 129; 1487 dst[i][y*curframe->linesize[i]-1] = 129;
1487 if (mb_y) 1488 if (mb_y)
1488 memset(s->top_border, 129, sizeof(*s->top_border)); 1489 memset(s->top_border, 129, sizeof(*s->top_border));
1489 1490
1490 for (mb_x = 0; mb_x < s->mb_width; mb_x++) { 1491 for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1491 uint8_t *intra4x4_mb = s->keyframe ? intra4x4 + 4*mb_x : s->intra4x4_pred_mode_mb; 1492 uint8_t *intra4x4_mb = s->keyframe ? intra4x4 + 4*mb_x : s->intra4x4_pred_mode_mb;
1492 uint8_t *segment_mb = segment_map+mb_x; 1493 uint8_t *segment_mb = segment_map+mb_x;
1493 1494
1494 /* Prefetch the current frame, 4 MBs ahead */ 1495 /* Prefetch the current frame, 4 MBs ahead */
1495 s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4); 1496 s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1496 s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2); 1497 s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1497 1498
1498 decode_mb_mode(s, mb, mb_x, mb_y, intra4x4_mb, segment_mb); 1499 decode_mb_mode(s, mb, mb_x, mb_y, intra4x4_mb, segment_mb);
1499 1500
1500 prefetch_motion(s, mb, mb_x, mb_y, VP56_FRAME_PREVIOUS); 1501 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1501 1502
1502 if (!mb->skip) 1503 if (!mb->skip)
1503 decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz); 1504 decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
1504 1505
1505 if (mb->mode <= MODE_I4x4) 1506 if (mb->mode <= MODE_I4x4)
1506 intra_predict(s, dst, mb, intra4x4_mb, mb_x, mb_y); 1507 intra_predict(s, dst, mb, intra4x4_mb, mb_x, mb_y);
1507 else 1508 else
1508 inter_predict(s, dst, mb, mb_x, mb_y); 1509 inter_predict(s, dst, mb, mb_x, mb_y);
1509 1510
1510 prefetch_motion(s, mb, mb_x, mb_y, VP56_FRAME_GOLDEN); 1511 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1511 1512
1512 if (!mb->skip) { 1513 if (!mb->skip) {
1513 idct_mb(s, dst[0], dst[1], dst[2], mb); 1514 idct_mb(s, dst[0], dst[1], dst[2], mb);
1514 } else { 1515 } else {
1515 AV_ZERO64(s->left_nnz); 1516 AV_ZERO64(s->left_nnz);
1523 } 1524 }
1524 1525
1525 if (s->deblock_filter) 1526 if (s->deblock_filter)
1526 filter_level_for_mb(s, mb, &s->filter_strength[mb_x]); 1527 filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1527 1528
1528 prefetch_motion(s, mb, mb_x, mb_y, VP56_FRAME_GOLDEN2); 1529 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1529 1530
1530 dst[0] += 16; 1531 dst[0] += 16;
1531 dst[1] += 8; 1532 dst[1] += 8;
1532 dst[2] += 8; 1533 dst[2] += 8;
1533 mb++;
1534 } 1534 }
1535 if (s->deblock_filter) { 1535 if (s->deblock_filter) {
1536 if (s->filter.simple) 1536 if (s->filter.simple)
1537 filter_mb_row_simple(s, mb_y); 1537 filter_mb_row_simple(s, mb_y);
1538 else 1538 else