comparison h264.h @ 10906:1b5fba731e24 libavcodec

Rearchitecting the stitched-up goose, part 1: Run the loop filter per row instead of per MB. This should also make it much easier to switch to per-frame filtering, and to doing so in a separate thread in the future, if some volunteer wants to try. Overall decoding speedup of 1.7% (single thread on Pentium Dual / cathedral sample). This change also allows some optimizations to be tried that would not have been possible before.
author michael
date Sun, 17 Jan 2010 20:35:55 +0000
parents f112b4d030fa
children 2d82b73b12ef
comparison
equal deleted inserted replaced
10905:06d20a468d1e 10906:1b5fba731e24
298 /** 298 /**
299 * non zero coeff count cache. 299 * non zero coeff count cache.
300 * is 64 if not available. 300 * is 64 if not available.
301 */ 301 */
302 DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); 302 DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
303 uint8_t (*non_zero_count)[16]; 303 uint8_t (*non_zero_count)[32];
304 304
305 /** 305 /**
306 * Motion vector cache. 306 * Motion vector cache.
307 */ 307 */
308 DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]); 308 DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
421 /** 421 /**
422 * num_ref_idx_l0/1_active_minus1 + 1 422 * num_ref_idx_l0/1_active_minus1 + 1
423 */ 423 */
424 unsigned int ref_count[2]; ///< counts frames or fields, depending on current mb mode 424 unsigned int ref_count[2]; ///< counts frames or fields, depending on current mb mode
425 unsigned int list_count; 425 unsigned int list_count;
426 uint8_t *list_counts; ///< Array of list_count per MB specifying the slice type
426 Picture *short_ref[32]; 427 Picture *short_ref[32];
427 Picture *long_ref[32]; 428 Picture *long_ref[32];
428 Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture 429 Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture
429 Picture ref_list[2][48]; /**< 0..15: frame refs, 16..47: mbaff field refs. 430 Picture ref_list[2][48]; /**< 0..15: frame refs, 16..47: mbaff field refs.
430 Reordered version of default_ref_list 431 Reordered version of default_ref_list
734 }; 735 };
735 736
736 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); 737 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
737 738
738 //FIXME deblocking could skip the intra and nnz parts. 739 //FIXME deblocking could skip the intra and nnz parts.
739 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF) 740 // if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
740 return; 741 // return;
741 742
742 /* Wow, what a mess, why didn't they simplify the interlacing & intra 743 /* Wow, what a mess, why didn't they simplify the interlacing & intra
743 * stuff, I can't imagine that these complex rules are worth it. */ 744 * stuff, I can't imagine that these complex rules are worth it. */
744 745
745 topleft_xy = top_xy - 1; 746 topleft_xy = top_xy - 1;
791 topright_type = 0; 792 topright_type = 0;
792 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0; 793 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
793 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; 794 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
794 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0; 795 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
795 796
796 if(MB_MBAFF && !IS_INTRA(mb_type)){ 797 if(!IS_INTRA(mb_type)){
797 int list; 798 int list;
798 for(list=0; list<h->list_count; list++){ 799 for(list=0; list<h->list_count; list++){
799 //These values where changed for ease of performing MC, we need to change them back 800 int8_t *ref;
800 //FIXME maybe we can make MC and loop filter use the same values or prevent 801 int y, b_xy;
801 //the MC code from changing ref_cache and rather use a temporary array. 802 if(!USES_LIST(mb_type, list)){
802 if(USES_LIST(mb_type,list)){ 803 fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
803 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
804 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] = 804 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
805 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; 805 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] =
806 ref += h->b8_stride;
807 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = 806 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
808 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; 807 *(uint32_t*)&h->ref_cache[list][scan8[10]] = ((LIST_NOT_USED)&0xFF)*0x01010101;
808 continue;
809 } 809 }
810
811 ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
812 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
813 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
814 ref += h->b8_stride;
815 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
816 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
817
818 b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
819 for(y=0; y<4; y++){
820 *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride];
821 *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride];
822 }
823
810 } 824 }
811 } 825 }
812 }else{ 826 }else{
813 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0; 827 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
814 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0; 828 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
1194 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1]; 1208 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
1195 1209
1196 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5]; 1210 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
1197 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5]; 1211 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
1198 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4]; 1212 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
1213
1214 //FIXME sort better how things are stored in non_zero_count
1215
1216
1217 h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1];
1218 h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2];
1219 h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3];
1220 h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1];
1221 h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2];
1222 h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3];
1223 h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1];
1224 h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2];
1225 h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3];
1226
1227 h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1];
1228 h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4];
1229
1199 } 1230 }
1200 1231
1201 static inline void write_back_motion(H264Context *h, int mb_type){ 1232 static inline void write_back_motion(H264Context *h, int mb_type){
1202 MpegEncContext * const s = &h->s; 1233 MpegEncContext * const s = &h->s;
1203 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; 1234 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1269 static void decode_mb_skip(H264Context *h){ 1300 static void decode_mb_skip(H264Context *h){
1270 MpegEncContext * const s = &h->s; 1301 MpegEncContext * const s = &h->s;
1271 const int mb_xy= h->mb_xy; 1302 const int mb_xy= h->mb_xy;
1272 int mb_type=0; 1303 int mb_type=0;
1273 1304
1274 memset(h->non_zero_count[mb_xy], 0, 16); 1305 memset(h->non_zero_count[mb_xy], 0, 32);
1275 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui 1306 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
1276 1307
1277 if(MB_FIELD) 1308 if(MB_FIELD)
1278 mb_type|= MB_TYPE_INTERLACED; 1309 mb_type|= MB_TYPE_INTERLACED;
1279 1310