diff h264_direct.c @ 11292:411ab09ada91 libavcodec

Get rid of mb2b8_xy and b8_stride, and change the arrays organized based on b8_stride to ones based on mb_stride in h264. About 20 CPU cycles faster overall per MB.
author michael
date Thu, 25 Feb 2010 23:44:42 +0000
parents 10c06a9bd3d9
children 7dd2a45249a9
line wrap: on
line diff
--- a/h264_direct.c	Thu Feb 25 18:41:48 2010 +0000
+++ b/h264_direct.c	Thu Feb 25 23:44:42 2010 +0000
@@ -142,7 +142,7 @@
 
 static void pred_spatial_direct_motion(H264Context * const h, int *mb_type){
     MpegEncContext * const s = &h->s;
-    int b8_stride = h->b8_stride;
+    int b8_stride = 2;
     int b4_stride = h->b_stride;
     int mb_xy = h->mb_xy;
     int mb_type_col[2];
@@ -228,7 +228,7 @@
             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
-            b8_stride *= 3;
+            b8_stride = 2+4*s->mb_stride;
             b4_stride *= 6;
 
             sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
@@ -262,12 +262,12 @@
 
     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
-    l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
-    l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
+    l1ref0 = &h->ref_list[1][0].ref_index [0][4*mb_xy];
+    l1ref1 = &h->ref_list[1][0].ref_index [1][4*mb_xy];
     if(!b8_stride){
         if(s->mb_y&1){
-            l1ref0 += h->b8_stride;
-            l1ref1 += h->b8_stride;
+            l1ref0 += 2;
+            l1ref1 += 2;
             l1mv0  +=  2*b4_stride;
             l1mv1  +=  2*b4_stride;
         }
@@ -342,11 +342,12 @@
                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
 
+                assert(b8_stride==2);
                 /* col_zero_flag */
-                if(!IS_INTRA(mb_type_col[0]) && !h->ref_list[1][0].long_ref && (   l1ref0[x8 + y8*b8_stride] == 0
-                                              || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
+                if(!IS_INTRA(mb_type_col[0]) && !h->ref_list[1][0].long_ref && (   l1ref0[i8] == 0
+                                              || (l1ref0[i8] < 0 && l1ref1[i8] == 0
                                                   && h->x264_build>33U))){
-                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
+                    const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? l1mv0 : l1mv1;
                     if(IS_SUB_8X8(sub_mb_type)){
                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
@@ -381,7 +382,7 @@
 
 static void pred_temp_direct_motion(H264Context * const h, int *mb_type){
     MpegEncContext * const s = &h->s;
-    int b8_stride = h->b8_stride;
+    int b8_stride = 2;
     int b4_stride = h->b_stride;
     int mb_xy = h->mb_xy;
     int mb_type_col[2];
@@ -406,7 +407,7 @@
             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
-            b8_stride *= 3;
+            b8_stride = 2+4*s->mb_stride;
             b4_stride *= 6;
 
             sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
@@ -441,12 +442,12 @@
 
     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
-    l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
-    l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
+    l1ref0 = &h->ref_list[1][0].ref_index [0][4*mb_xy];
+    l1ref1 = &h->ref_list[1][0].ref_index [1][4*mb_xy];
     if(!b8_stride){
         if(s->mb_y&1){
-            l1ref0 += h->b8_stride;
-            l1ref1 += h->b8_stride;
+            l1ref0 += 2;
+            l1ref1 += 2;
             l1mv0  +=  2*b4_stride;
             l1mv1  +=  2*b4_stride;
         }
@@ -549,11 +550,12 @@
                     continue;
                 }
 
-                ref0 = l1ref0[x8 + y8*b8_stride];
+                assert(b8_stride == 2);
+                ref0 = l1ref0[i8];
                 if(ref0 >= 0)
                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
                 else{
-                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
+                    ref0 = map_col_to_list0[1][l1ref1[i8] + ref_offset];
                     l1mv= l1mv1;
                 }
                 scale = dist_scale_factor[ref0];