libavcodec.hg: h264.h comparison

comparison h264.h @ 10906:1b5fba731e24 libavcodec

Rearchitecting the stitched-up goose, part 1: run the loop filter per row instead of per MB. This should also make it much easier to switch to per-frame filtering, and to do so in a separate thread in the future if some volunteer wants to try. Overall decoding speedup of 1.7% (single thread on Pentium dual / cathedral sample). This change also allows some optimizations to be tried that would not have been possible before.

author	michael
date	Sun, 17 Jan 2010 20:35:55 +0000
parents	f112b4d030fa
children	2d82b73b12ef

comparison

equal deleted inserted replaced

-:06d20a468d1e
+:1b5fba731e24
 /**
 * non zero coeff count cache.
 * is 64 if not available.
 */
 DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
-uint8_t (*non_zero_count)[16];
+uint8_t (*non_zero_count)[32];
 /**
 * Motion vector cache.
 */
 DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
 /**
 * num_ref_idx_l0/1_active_minus1 + 1
 */
 unsigned int ref_count[2];   ///< counts frames or fields, depending on current mb mode
 unsigned int list_count;
+uint8_t *list_counts;            ///< Array of list_count per MB specifying the slice type
 Picture *short_ref[32];
 Picture *long_ref[32];
 Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture
 Picture ref_list[2][48];         /**< 0..15: frame refs, 16..47: mbaff field refs.
 Reordered version of default_ref_list
 };
 top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 //FIXME deblocking could skip the intra and nnz parts.
-if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
+//     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
-return;
+//         return;
 /* Wow, what a mess, why didn't they simplify the interlacing & intra
 * stuff, I can't imagine that these complex rules are worth it. */
 topleft_xy = top_xy - 1;
 topright_type = 0;
 top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
-if(MB_MBAFF && !IS_INTRA(mb_type)){
+if(!IS_INTRA(mb_type)){
 int list;
 for(list=0; list<h->list_count; list++){
-//These values where changed for ease of performing MC, we need to change them back
+int8_t *ref;
-//FIXME maybe we can make MC and loop filter use the same values or prevent
+int y, b_xy;
-//the MC code from changing ref_cache and rather use a temporary array.
+if(!USES_LIST(mb_type, list)){
-if(USES_LIST(mb_type,list)){
+fill_rectangle(  h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
-int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
-*(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+*(uint32_t*)&h->ref_cache[list][scan8[ 2]] =
-ref += h->b8_stride;
 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
-*(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+*(uint32_t*)&h->ref_cache[list][scan8[10]] = ((LIST_NOT_USED)&0xFF)*0x01010101;
+continue;
 }
+ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
+*(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
+*(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+ref += h->b8_stride;
+*(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
+*(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
+for(y=0; y<4; y++){
+*(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride];
+*(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride];
+}
 }
 }
 }else{
 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
+//FIXME sort better how things are stored in non_zero_count
+h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1];
+h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2];
+h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3];
+h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1];
+h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2];
+h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3];
+h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1];
+h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2];
+h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3];
+h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1];
+h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4];
 }
 static inline void write_back_motion(H264Context *h, int mb_type){
 MpegEncContext * const s = &h->s;
 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
 static void decode_mb_skip(H264Context *h){
 MpegEncContext * const s = &h->s;
 const int mb_xy= h->mb_xy;
 int mb_type=0;
-memset(h->non_zero_count[mb_xy], 0, 16);
+memset(h->non_zero_count[mb_xy], 0, 32);
 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
 if(MB_FIELD)
 mb_type|= MB_TYPE_INTERLACED;

Mercurial > libavcodec.hg

comparison h264.h @ 10906:1b5fba731e24 libavcodec