comparison h264_direct.c @ 11140:9004c61e3aa0 libavcodec

Restructure spatial direct MV generation so that the zero-predictor shortcut is closer to the top. 50-130 CPU cycles faster depending on which kind of direct MBs are measured. A condensed sketch of the reordered flow follows the changeset metadata below.
author michael
date Sat, 13 Feb 2010 13:02:25 +0000
parents d988e0a6f391
children e81668fc717a
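
For orientation before the diff: the sketch below condenses what the reordered code does into a toy, single-list C function. It is not the libavcodec implementation; NeighborCtx, mid_pred3() and spatial_direct_pred() are illustrative stand-ins, and the real function works on both reference lists (L0/L1), handles the is_b8x8 case, and fills the ref/mv caches with fill_rectangle() before returning early. The point it illustrates is the one the description states: compute ref = min(neighbors) and the median-MV predictor first, and when the predicted MV is zero, return before any of the co-located (temporal) setup that the rest of the diff deals with.

/*
 * Minimal, self-contained sketch (NOT the libavcodec code) of the idea in this
 * changeset: derive the spatial direct ref/mv predictor first and take the
 * all-zero-MV shortcut before any co-located (temporal) setup is done.
 * NeighborCtx, mid_pred3() and spatial_direct_pred() are made-up stand-ins.
 */
#include <stdio.h>
#include <stdint.h>

typedef struct {
    int     ref[3];     /* left, top, topright (or topleft fallback) refs */
    int16_t mv[3][2];   /* matching motion vectors                        */
} NeighborCtx;          /* hypothetical stand-in for h->ref_cache/mv_cache */

static int mid_pred3(int a, int b, int c)
{   /* median of three, same role as libavcodec's mid_pred() */
    if (a > b) { int t = a; a = b; b = t; }
    if (b > c) { b = c; }
    return a > b ? a : b;
}

/* Returns 1 when the zero-MV shortcut applies, i.e. the caller may fill the
 * whole 16x16 block with the predicted ref and mv=0 and skip the temporal
 * (co-located) work entirely. */
static int spatial_direct_pred(const NeighborCtx *n, int *ref_out, int32_t *mv_out)
{
    int left = n->ref[0], top = n->ref[1], c = n->ref[2];

    /* ref = min(neighbors); the unsigned casts push "not available" (negative)
     * refs above every valid one, mirroring the FFMIN3 trick in the patch. */
    int ref = left;
    if ((unsigned)top < (unsigned)ref) ref = top;
    if ((unsigned)c   < (unsigned)ref) ref = c;

    if (ref < 0) {          /* no usable neighbor at all: the patch falls   */
        *ref_out = 0;       /* back to ref 0, mv 0, which also hits the     */
        *mv_out  = 0;       /* zero-MV shortcut                             */
        return 1;
    }

    int match = (left == ref) + (top == ref) + (c == ref);
    int mx, my;
    if (match > 1) {        /* most common case: median of the three MVs */
        mx = mid_pred3(n->mv[0][0], n->mv[1][0], n->mv[2][0]);
        my = mid_pred3(n->mv[0][1], n->mv[1][1], n->mv[2][1]);
    } else {                /* exactly one neighbor uses the minimum ref */
        int i = (left == ref) ? 0 : (top == ref) ? 1 : 2;
        mx = n->mv[i][0];
        my = n->mv[i][1];
    }
    *ref_out = ref;
    *mv_out  = (mx & 0xFFFF) + (my << 16);  /* packed mv, as in the patch */
    return *mv_out == 0;    /* shortcut when both components are zero */
}

int main(void)
{
    NeighborCtx n = { .ref = {0, 0, 0}, .mv = {{0,0},{0,0},{0,0}} };
    int ref; int32_t mv;
    if (spatial_direct_pred(&n, &ref, &mv))
        printf("zero-MV shortcut taken: ref=%d mv=0x%08x\n", ref, (unsigned)mv);
    return 0;
}

Taking that early return before l1mv0/l1ref0 are even set up is presumably where the quoted 50-130 cycle saving comes from: all-zero spatial direct macroblocks never need to touch the co-located data.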
--- h264_direct.c @ 11139:d988e0a6f391 (parent)
+++ h264_direct.c @ 11140:9004c61e3aa0
@@ -149,14 +149,75 @@
     const int16_t (*l1mv0)[2], (*l1mv1)[2];
     const int8_t *l1ref0, *l1ref1;
     const int is_b8x8 = IS_8X8(*mb_type);
     unsigned int sub_mb_type;
     int i8, i4;
+    int ref[2];
+    int mv[2];
+    int list;
 
     assert(h->ref_list[1][0].reference&3);
 
 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
+
+    *mb_type |= MB_TYPE_L0L1;
+    sub_mb_type |= MB_TYPE_L0L1;
+
+    /* ref = min(neighbors) */
+    for(list=0; list<2; list++){
+        int left_ref = h->ref_cache[list][scan8[0] - 1];
+        int top_ref = h->ref_cache[list][scan8[0] - 8];
+        int refc = h->ref_cache[list][scan8[0] - 8 + 4];
+        const int16_t *C= h->mv_cache[list][ scan8[0] - 8 + 4];
+        if(refc == PART_NOT_AVAILABLE){
+            refc = h->ref_cache[list][scan8[0] - 8 - 1];
+            C = h-> mv_cache[list][scan8[0] - 8 - 1];
+        }
+        ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc);
+        if(ref[list] >= 0){
+            //this is just pred_motion() but with the cases removed that cannot happen for direct blocks
+            const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
+            const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
+
+            int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]);
+            if(match_count > 1){ //most common
+                mv[list]= (mid_pred(A[0], B[0], C[0])&0xFFFF)
+                         +(mid_pred(A[1], B[1], C[1])<<16);
+            }else {
+                assert(match_count==1);
+                if(left_ref==ref[list]){
+                    mv[list]= *(uint32_t*)A;
+                }else if(top_ref==ref[list]){
+                    mv[list]= *(uint32_t*)B;
+                }else{
+                    mv[list]= *(uint32_t*)C;
+                }
+            }
+        }else{
+            int mask= ~(MB_TYPE_L0 << (2*list));
+            mv[list] = 0;
+            ref[list] = -1;
+            if(!is_b8x8)
+                *mb_type &= mask;
+            sub_mb_type &= mask;
+        }
+    }
+    if(ref[0] < 0 && ref[1] < 0){
+        ref[0] = ref[1] = 0;
+        if(!is_b8x8)
+            *mb_type |= MB_TYPE_L0L1;
+        sub_mb_type |= MB_TYPE_L0L1;
+    }
+
+    if(!is_b8x8 && (mv[0]|mv[1]) == 0){
+        fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
+        fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
+        fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
+        fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+        *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
+        return;
+    }
 
     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
         if(!IS_INTERLACED(*mb_type)){ // AFR/FR -> AFL/FL
             mb_xy= s->mb_x + ((s->mb_y&~1) + h->col_parity)*s->mb_stride;
             b8_stride = 0;
@@ -170,35 +231,35 @@
             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
             b8_stride *= 3;
             b4_stride *= 6;
 
-            sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
+            sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
             if( (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
                 && !is_b8x8){
-                *mb_type |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
+                *mb_type |= MB_TYPE_16x8 |MB_TYPE_DIRECT2; /* B_16x8 */
             }else{
-                *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
+                *mb_type |= MB_TYPE_8x8;
             }
         }else{ // AFR/FR -> AFR/FR
 single_col:
             mb_type_col[0] =
             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
 
-            sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
+            sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
             if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
-                *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
+                *mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */
             }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
-                *mb_type |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
+                *mb_type |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
             }else{
                 if(!h->sps.direct_8x8_inference_flag){
                     /* FIXME save sub mb types from previous frames (or derive from MVs)
                      * so we know exactly what block size to use */
-                    sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
+                    sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */
                 }
-                *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
+                *mb_type |= MB_TYPE_8x8;
             }
         }
     }
 
     l1mv0 = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
@@ -212 +273 @@
             l1mv0 += 2*b4_stride;
             l1mv1 += 2*b4_stride;
         }
     }
 
-    {
-        int ref[2];
-        int mv[2];
-        int list;
-
-        /* ref = min(neighbors) */
-        for(list=0; list<2; list++){
-            int left_ref = h->ref_cache[list][scan8[0] - 1];
-            int top_ref = h->ref_cache[list][scan8[0] - 8];
-            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
-            const int16_t *C= h->mv_cache[list][ scan8[0] - 8 + 4];
-            if(refc == PART_NOT_AVAILABLE){
-                refc = h->ref_cache[list][scan8[0] - 8 - 1];
-                C = h-> mv_cache[list][scan8[0] - 8 - 1];
-            }
-            ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc);
-            if(ref[list] >= 0){
-                //this is just pred_motion() but with the cases removed that cannot happen for direct blocks
-                const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
-                const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
-
-                int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]);
-                if(match_count > 1){ //most common
-                    mv[list]= (mid_pred(A[0], B[0], C[0])&0xFFFF)
-                             +(mid_pred(A[1], B[1], C[1])<<16);
-                }else {
-                    assert(match_count==1);
-                    if(left_ref==ref[list]){
-                        mv[list]= *(uint32_t*)A;
-                    }else if(top_ref==ref[list]){
-                        mv[list]= *(uint32_t*)B;
-                    }else{
-                        mv[list]= *(uint32_t*)C;
-                    }
-                }
-            }else{
-                int mask= ~(MB_TYPE_L0 << (2*list));
-                mv[list] = 0;
-                ref[list] = -1;
-                if(!is_b8x8)
-                    *mb_type &= mask;
-                sub_mb_type &= mask;
-            }
-        }
-        if(ref[0] < 0 && ref[1] < 0){
-            ref[0] = ref[1] = 0;
-            if(!is_b8x8)
-                *mb_type |= MB_TYPE_L0L1;
-            sub_mb_type |= MB_TYPE_L0L1;
-        }
-
-        if(!is_b8x8 && (mv[0]|mv[1]) == 0){
-            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
-            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
-            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
-            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
-            *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
-        }else if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
+
+    if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
             int n=0;
             for(i8=0; i8<4; i8++){
                 int x8 = i8&1;
                 int y8 = i8>>1;
                 int xy8 = x8+y8*b8_stride;