# HG changeset patch # User michael # Date 1266066145 0 # Node ID 9004c61e3aa09a94da6374009fa454dfd6c05447 # Parent d988e0a6f391d0a9c8c2b861a198e0682db410f8 Restructure spatial direct MV generation so that the zero predictior shortcut is closer at the top. 50-130 cpu cycles faster depending on which kind of direct MBs are meassured. diff -r d988e0a6f391 -r 9004c61e3aa0 h264_direct.c --- a/h264_direct.c Sat Feb 13 03:46:51 2010 +0000 +++ b/h264_direct.c Sat Feb 13 13:02:25 2010 +0000 @@ -151,11 +151,72 @@ const int is_b8x8 = IS_8X8(*mb_type); unsigned int sub_mb_type; int i8, i4; + int ref[2]; + int mv[2]; + int list; assert(h->ref_list[1][0].reference&3); #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM) + *mb_type |= MB_TYPE_L0L1; + sub_mb_type |= MB_TYPE_L0L1; + + /* ref = min(neighbors) */ + for(list=0; list<2; list++){ + int left_ref = h->ref_cache[list][scan8[0] - 1]; + int top_ref = h->ref_cache[list][scan8[0] - 8]; + int refc = h->ref_cache[list][scan8[0] - 8 + 4]; + const int16_t *C= h->mv_cache[list][ scan8[0] - 8 + 4]; + if(refc == PART_NOT_AVAILABLE){ + refc = h->ref_cache[list][scan8[0] - 8 - 1]; + C = h-> mv_cache[list][scan8[0] - 8 - 1]; + } + ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); + if(ref[list] >= 0){ + //this is just pred_motion() but with the cases removed that cannot happen for direct blocks + const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ]; + const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ]; + + int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]); + if(match_count > 1){ //most common + mv[list]= (mid_pred(A[0], B[0], C[0])&0xFFFF) + +(mid_pred(A[1], B[1], C[1])<<16); + }else { + assert(match_count==1); + if(left_ref==ref[list]){ + mv[list]= *(uint32_t*)A; + }else if(top_ref==ref[list]){ + mv[list]= *(uint32_t*)B; + }else{ + mv[list]= *(uint32_t*)C; + } + } + }else{ + int mask= ~(MB_TYPE_L0 << (2*list)); + mv[list] = 0; + ref[list] = -1; + if(!is_b8x8) + *mb_type &= mask; + sub_mb_type &= mask; + } + } + if(ref[0] < 0 && ref[1] < 0){ + ref[0] = ref[1] = 0; + if(!is_b8x8) + *mb_type |= MB_TYPE_L0L1; + sub_mb_type |= MB_TYPE_L0L1; + } + + if(!is_b8x8 && (mv[0]|mv[1]) == 0){ + fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); + fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); + fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); + fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4); + *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; + return; + } + if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL if(!IS_INTERLACED(*mb_type)){ // AFR/FR -> AFL/FL mb_xy= s->mb_x + ((s->mb_y&~1) + h->col_parity)*s->mb_stride; @@ -172,31 +233,31 @@ b8_stride *= 3; b4_stride *= 6; - sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ + sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ if( (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA) && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA) && !is_b8x8){ - *mb_type |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */ + *mb_type |= MB_TYPE_16x8 |MB_TYPE_DIRECT2; /* B_16x8 */ }else{ - *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1; + *mb_type |= MB_TYPE_8x8; } }else{ // AFR/FR -> AFR/FR single_col: mb_type_col[0] = mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy]; - sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ + sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ - *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */ + *mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */ }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ - *mb_type |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); + *mb_type |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); }else{ if(!h->sps.direct_8x8_inference_flag){ /* FIXME save sub mb types from previous frames (or derive from MVs) * so we know exactly what block size to use */ - sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */ + sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */ } - *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1; + *mb_type |= MB_TYPE_8x8; } } } @@ -214,64 +275,8 @@ } } - { - int ref[2]; - int mv[2]; - int list; - /* ref = min(neighbors) */ - for(list=0; list<2; list++){ - int left_ref = h->ref_cache[list][scan8[0] - 1]; - int top_ref = h->ref_cache[list][scan8[0] - 8]; - int refc = h->ref_cache[list][scan8[0] - 8 + 4]; - const int16_t *C= h->mv_cache[list][ scan8[0] - 8 + 4]; - if(refc == PART_NOT_AVAILABLE){ - refc = h->ref_cache[list][scan8[0] - 8 - 1]; - C = h-> mv_cache[list][scan8[0] - 8 - 1]; - } - ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); - if(ref[list] >= 0){ - //this is just pred_motion() but with the cases removed that cannot happen for direct blocks - const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ]; - const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ]; - - int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]); - if(match_count > 1){ //most common - mv[list]= (mid_pred(A[0], B[0], C[0])&0xFFFF) - +(mid_pred(A[1], B[1], C[1])<<16); - }else { - assert(match_count==1); - if(left_ref==ref[list]){ - mv[list]= *(uint32_t*)A; - }else if(top_ref==ref[list]){ - mv[list]= *(uint32_t*)B; - }else{ - mv[list]= *(uint32_t*)C; - } - } - }else{ - int mask= ~(MB_TYPE_L0 << (2*list)); - mv[list] = 0; - ref[list] = -1; - if(!is_b8x8) - *mb_type &= mask; - sub_mb_type &= mask; - } - } - if(ref[0] < 0 && ref[1] < 0){ - ref[0] = ref[1] = 0; - if(!is_b8x8) - *mb_type |= MB_TYPE_L0L1; - sub_mb_type |= MB_TYPE_L0L1; - } - - if(!is_b8x8 && (mv[0]|mv[1]) == 0){ - fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); - fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); - fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); - fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4); - *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; - }else if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){ + if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){ int n=0; for(i8=0; i8<4; i8++){ int x8 = i8&1;