# HG changeset patch # User michael # Date 1267044186 0 # Node ID c12d6c6c027e3f684e1d56177395666b2f28e244 # Parent dd948aea1c207c06567a1b0519f6d5c042dbe768 Change mvd_cache & mvd_table to 8bit, this is overall a bit faster for high resolution videos. about 20cycles faster per MB for cathederal. diff -r dd948aea1c20 -r c12d6c6c027e h264.c --- a/h264.c Wed Feb 24 20:37:58 2010 +0000 +++ b/h264.c Wed Feb 24 20:43:06 2010 +0000 @@ -756,8 +756,8 @@ FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail) FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail) - FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail); - FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail); + FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint8_t), fail); + FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint8_t), fail); FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail); FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail) diff -r dd948aea1c20 -r c12d6c6c027e h264.h --- a/h264.h Wed Feb 24 20:37:58 2010 +0000 +++ b/h264.h Wed Feb 24 20:43:06 2010 +0000 @@ -486,8 +486,8 @@ /* chroma_pred_mode for i4x4 or i16x16, else 0 */ uint8_t *chroma_pred_mode_table; int last_qscale_diff; - int16_t (*mvd_table[2])[2]; - DECLARE_ALIGNED_16(int16_t, mvd_cache)[2][5*8][2]; + uint8_t (*mvd_table[2])[2]; + DECLARE_ALIGNED_16(uint8_t, mvd_cache)[2][5*8][2]; uint8_t *direct_table; uint8_t direct_cache[5*8]; @@ -732,6 +732,14 @@ #endif } +static av_always_inline uint16_t pack8to16(int a, int b){ +#if HAVE_BIGENDIAN + return (b&0xFF) + (a<<8); +#else + return (a&0xFF) + (b<<8); +#endif +} + /** * gets the chroma qp. */ @@ -1060,32 +1068,31 @@ /* XXX beurk, Load mvd */ if(USES_LIST(top_type, list)){ const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; - AV_COPY128(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]); + AV_COPY64(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]); }else{ - AV_ZERO128(h->mvd_cache[list][scan8[0] + 0 - 1*8]); + AV_ZERO64(h->mvd_cache[list][scan8[0] + 0 - 1*8]); } if(USES_LIST(left_type[0], list)){ const int b_xy= h->mb2b_xy[left_xy[0]] + 3; - AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]); - AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]); + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]); + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]); }else{ - AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 0*8]); - AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 1*8]); + AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 0*8]); + AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 1*8]); } if(USES_LIST(left_type[1], list)){ const int b_xy= h->mb2b_xy[left_xy[1]] + 3; - AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]); - AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]); + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]); + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]); }else{ - AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 2*8]); - AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 3*8]); + AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 2*8]); + AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 3*8]); } - AV_ZERO32(h->mvd_cache [list][scan8[5 ]+1]); - AV_ZERO32(h->mvd_cache [list][scan8[7 ]+1]); - AV_ZERO32(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else) - AV_ZERO32(h->mvd_cache [list][scan8[4 ]]); - AV_ZERO32(h->mvd_cache [list][scan8[12]]); - + AV_ZERO16(h->mvd_cache [list][scan8[5 ]+1]); + AV_ZERO16(h->mvd_cache [list][scan8[7 ]+1]); + AV_ZERO16(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else) + AV_ZERO16(h->mvd_cache [list][scan8[4 ]]); + AV_ZERO16(h->mvd_cache [list][scan8[12]]); if(h->slice_type_nos == FF_B_TYPE){ fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1); @@ -1414,13 +1421,13 @@ AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); } if( CABAC ) { - int16_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy]; - int16_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]]; + uint8_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy]; + uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]]; if(IS_SKIP(mb_type)) - fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 4); + fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 2); else for(y=0; y<4; y++){ - AV_COPY128(mvd_dst + y*b_stride, mvd_src + 8*y); + AV_COPY64(mvd_dst + y*b_stride, mvd_src + 8*y); } } diff -r dd948aea1c20 -r c12d6c6c027e h264_cabac.c --- a/h264_cabac.c Wed Feb 24 20:37:58 2010 +0000 +++ b/h264_cabac.c Wed Feb 24 20:43:06 2010 +0000 @@ -938,8 +938,9 @@ while( k-- ) { mvd += get_cabac_bypass( &h->cabac )<cabac, -mvd ); } @@ -1429,7 +1430,7 @@ for(i=0; i<4; i++){ h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]; if(IS_DIRECT(h->sub_mb_type[i])){ - fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4); + fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2); continue; } @@ -1441,9 +1442,8 @@ int mx, my; const int index= 4*i + block_width*j; int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ]; - int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ]; + uint8_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ]; pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my); - DECODE_CABAC_MB_MVD( h, list, index) tprintf(s->avctx, "final mv:%d %d\n", mx, my); @@ -1478,14 +1478,14 @@ } }else{ fill_rectangle(h->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4); - fill_rectangle(h->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 4); + fill_rectangle(h->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2); } } } } else if( IS_DIRECT(mb_type) ) { ff_h264_pred_direct_motion(h, &mb_type); - fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4); - fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4); + fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2); + fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2); dct8x8_allowed &= h->sps.direct_8x8_inference_flag; } else { int list, i; @@ -1512,7 +1512,7 @@ DECODE_CABAC_MB_MVD( h, list, 0) tprintf(s->avctx, "final mv:%d %d\n", mx, my); - fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mpx,mpy), 4); + fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2); fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4); }else fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4); @@ -1544,10 +1544,10 @@ DECODE_CABAC_MB_MVD( h, list, 8*i) tprintf(s->avctx, "final mv:%d %d\n", mx, my); - fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mpx,mpy), 4); + fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2); fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4); }else{ - fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); + fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2); fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); } } @@ -1579,10 +1579,10 @@ DECODE_CABAC_MB_MVD( h, list, 4*i) tprintf(s->avctx, "final mv:%d %d\n", mx, my); - fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mpx,mpy), 4); + fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2); fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4); }else{ - fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); + fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2); fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); } }