Mercurial > libavcodec.hg
changeset 11283:853e93a50fe5 libavcodec
Cut the size of mvd_table by yet another factor of 2.
The read/write code itself was 1 cycle faster; overall it is
likely more due to cache effects.
author | michael |
---|---|
date | Thu, 25 Feb 2010 04:11:33 +0000 |
parents | cf41a3e8e14e |
children | aaca4b58880f |
files | h264.c h264.h |
diffstat | 2 files changed, 17 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/h264.c Thu Feb 25 02:42:25 2010 +0000 +++ b/h264.c Thu Feb 25 04:11:33 2010 +0000 @@ -757,8 +757,8 @@ FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail) FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail) - FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint8_t), fail); - FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint8_t), fail); + FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 16*big_mb_num * sizeof(uint8_t), fail); + FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 16*big_mb_num * sizeof(uint8_t), fail); FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail); FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail) @@ -775,7 +775,7 @@ const int b8_xy= 2*x + 2*y*h->b8_stride; h->mb2b_xy [mb_xy]= b_xy; - h->mb2br_xy[mb_xy]= FMO ? b_xy : (b_xy % (8*h->b_stride)); + h->mb2br_xy[mb_xy]= 8*(FMO ? mb_xy : (mb_xy % (2*s->mb_stride))); h->mb2b8_xy[mb_xy]= b8_xy; } }
--- a/h264.h Thu Feb 25 02:42:25 2010 +0000 +++ b/h264.h Thu Feb 25 04:11:33 2010 +0000 @@ -1070,23 +1070,23 @@ if( CABAC ) { /* XXX beurk, Load mvd */ if(USES_LIST(top_type, list)){ - const int b_xy= h->mb2br_xy[top_xy] + 3*h->b_stride; + const int b_xy= h->mb2br_xy[top_xy]; AV_COPY64(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]); }else{ AV_ZERO64(h->mvd_cache[list][scan8[0] + 0 - 1*8]); } if(USES_LIST(left_type[0], list)){ - const int b_xy= h->mb2br_xy[left_xy[0]] + 3; - AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]); - AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]); + const int b_xy= h->mb2br_xy[left_xy[0]] + 6; + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy - left_block[0]]); + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy - left_block[1]]); }else{ AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 0*8]); AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 1*8]); } if(USES_LIST(left_type[1], list)){ - const int b_xy= h->mb2br_xy[left_xy[1]] + 3; - AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]); - AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]); + const int b_xy= h->mb2br_xy[left_xy[1]] + 6; + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy - left_block[2]]); + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy - left_block[3]]); }else{ AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 2*8]); AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 3*8]); @@ -1424,13 +1424,15 @@ AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); } if( CABAC ) { - uint8_t (*mvd_dst)[2] = &h->mvd_table[list][FMO ? b_xy : h->mb2br_xy[h->mb_xy]]; + uint8_t (*mvd_dst)[2] = &h->mvd_table[list][FMO ? 
8*h->mb_xy : h->mb2br_xy[h->mb_xy]]; uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]]; if(IS_SKIP(mb_type)) - fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 2); - else - for(y=0; y<4; y++){ - AV_COPY64(mvd_dst + y*b_stride, mvd_src + 8*y); + AV_ZERO128(mvd_dst); + else{ + AV_COPY64(mvd_dst, mvd_src + 8*3); + for(y=0; y<3; y++){ + AV_COPY16(mvd_dst + 3 + 3 - y, mvd_src + 3 + 8*y); + } } }