diff h264.h @ 10906:1b5fba731e24 libavcodec

Rearchitecturing the stiched up goose part 1 Run loop filter per row instead of per MB, this also should make it much easier to switch to per frame filtering and also doing so in a seperate thread in the future if some volunteer wants to try. Overall decoding speedup of 1.7% (single thread on pentium dual / cathedral sample) This change also allows some optimizations to be tried that would not have been possible before.
author michael
date Sun, 17 Jan 2010 20:35:55 +0000
parents f112b4d030fa
children 2d82b73b12ef
line wrap: on
line diff
--- a/h264.h	Sun Jan 17 08:24:45 2010 +0000
+++ b/h264.h	Sun Jan 17 20:35:55 2010 +0000
@@ -300,7 +300,7 @@
      * is 64 if not available.
      */
     DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
-    uint8_t (*non_zero_count)[16];
+    uint8_t (*non_zero_count)[32];
 
     /**
      * Motion vector cache.
@@ -423,6 +423,7 @@
      */
     unsigned int ref_count[2];   ///< counts frames or fields, depending on current mb mode
     unsigned int list_count;
+    uint8_t *list_counts;            ///< Array of list_count per MB specifying the slice type
     Picture *short_ref[32];
     Picture *long_ref[32];
     Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture
@@ -736,8 +737,8 @@
     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 
     //FIXME deblocking could skip the intra and nnz parts.
-    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
-        return;
+//     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
+//         return;
 
     /* Wow, what a mess, why didn't they simplify the interlacing & intra
      * stuff, I can't imagine that these complex rules are worth it. */
@@ -793,20 +794,33 @@
         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 
-        if(MB_MBAFF && !IS_INTRA(mb_type)){
+        if(!IS_INTRA(mb_type)){
             int list;
             for(list=0; list<h->list_count; list++){
-                //These values where changed for ease of performing MC, we need to change them back
-                //FIXME maybe we can make MC and loop filter use the same values or prevent
-                //the MC code from changing ref_cache and rather use a temporary array.
-                if(USES_LIST(mb_type,list)){
-                    int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
+                int8_t *ref;
+                int y, b_xy;
+                if(!USES_LIST(mb_type, list)){
+                    fill_rectangle(  h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
-                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
-                    ref += h->b8_stride;
+                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] =
                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
-                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = ((LIST_NOT_USED)&0xFF)*0x01010101;
+                    continue;
                 }
+
+                ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
+                *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
+                *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+                ref += h->b8_stride;
+                *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
+                *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+
+                b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
+                for(y=0; y<4; y++){
+                    *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride];
+                    *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride];
+                }
+
             }
         }
     }else{
@@ -1196,6 +1210,23 @@
     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
+
+    //FIXME sort better how things are stored in non_zero_count
+
+
+    h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1];
+    h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2];
+    h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3];
+    h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1];
+    h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2];
+    h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3];
+    h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1];
+    h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2];
+    h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3];
+
+    h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1];
+    h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4];
+
 }
 
 static inline void write_back_motion(H264Context *h, int mb_type){
@@ -1271,7 +1302,7 @@
     const int mb_xy= h->mb_xy;
     int mb_type=0;
 
-    memset(h->non_zero_count[mb_xy], 0, 16);
+    memset(h->non_zero_count[mb_xy], 0, 32);
     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
 
     if(MB_FIELD)