Mercurial > libavcodec.hg
comparison h264.h @ 10906:1b5fba731e24 libavcodec
Rearchitecting the stitched-up goose, part 1
Run loop filter per row instead of per MB; this should also make it
much easier to switch to per-frame filtering, and to do so in a
separate thread in the future if some volunteer wants to try.
Overall decoding speedup of 1.7% (single thread on pentium dual / cathedral sample)
This change also allows some optimizations to be tried that would not have
been possible before.
author | michael |
---|---|
date | Sun, 17 Jan 2010 20:35:55 +0000 |
parents | f112b4d030fa |
children | 2d82b73b12ef |
comparison
equal
deleted
inserted
replaced
10905:06d20a468d1e | 10906:1b5fba731e24 |
---|---|
298 /** | 298 /** |
299 * non zero coeff count cache. | 299 * non zero coeff count cache. |
300 * is 64 if not available. | 300 * is 64 if not available. |
301 */ | 301 */ |
302 DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); | 302 DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); |
303 uint8_t (*non_zero_count)[16]; | 303 uint8_t (*non_zero_count)[32]; |
304 | 304 |
305 /** | 305 /** |
306 * Motion vector cache. | 306 * Motion vector cache. |
307 */ | 307 */ |
308 DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]); | 308 DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]); |
421 /** | 421 /** |
422 * num_ref_idx_l0/1_active_minus1 + 1 | 422 * num_ref_idx_l0/1_active_minus1 + 1 |
423 */ | 423 */ |
424 unsigned int ref_count[2]; ///< counts frames or fields, depending on current mb mode | 424 unsigned int ref_count[2]; ///< counts frames or fields, depending on current mb mode |
425 unsigned int list_count; | 425 unsigned int list_count; |
426 uint8_t *list_counts; ///< Array of list_count per MB specifying the slice type | |
426 Picture *short_ref[32]; | 427 Picture *short_ref[32]; |
427 Picture *long_ref[32]; | 428 Picture *long_ref[32]; |
428 Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture | 429 Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture |
429 Picture ref_list[2][48]; /**< 0..15: frame refs, 16..47: mbaff field refs. | 430 Picture ref_list[2][48]; /**< 0..15: frame refs, 16..47: mbaff field refs. |
430 Reordered version of default_ref_list | 431 Reordered version of default_ref_list |
734 }; | 735 }; |
735 | 736 |
736 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); | 737 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); |
737 | 738 |
738 //FIXME deblocking could skip the intra and nnz parts. | 739 //FIXME deblocking could skip the intra and nnz parts. |
739 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF) | 740 // if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF) |
740 return; | 741 // return; |
741 | 742 |
742 /* Wow, what a mess, why didn't they simplify the interlacing & intra | 743 /* Wow, what a mess, why didn't they simplify the interlacing & intra |
743 * stuff, I can't imagine that these complex rules are worth it. */ | 744 * stuff, I can't imagine that these complex rules are worth it. */ |
744 | 745 |
745 topleft_xy = top_xy - 1; | 746 topleft_xy = top_xy - 1; |
791 topright_type = 0; | 792 topright_type = 0; |
792 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0; | 793 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0; |
793 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; | 794 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; |
794 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0; | 795 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0; |
795 | 796 |
796 if(MB_MBAFF && !IS_INTRA(mb_type)){ | 797 if(!IS_INTRA(mb_type)){ |
797 int list; | 798 int list; |
798 for(list=0; list<h->list_count; list++){ | 799 for(list=0; list<h->list_count; list++){ |
799 //These values where changed for ease of performing MC, we need to change them back | 800 int8_t *ref; |
800 //FIXME maybe we can make MC and loop filter use the same values or prevent | 801 int y, b_xy; |
801 //the MC code from changing ref_cache and rather use a temporary array. | 802 if(!USES_LIST(mb_type, list)){ |
802 if(USES_LIST(mb_type,list)){ | 803 fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); |
803 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]]; | |
804 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] = | 804 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] = |
805 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; | 805 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = |
806 ref += h->b8_stride; | |
807 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = | 806 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = |
808 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; | 807 *(uint32_t*)&h->ref_cache[list][scan8[10]] = ((LIST_NOT_USED)&0xFF)*0x01010101; |
808 continue; | |
809 } | 809 } |
810 | |
811 ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]]; | |
812 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] = | |
813 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; | |
814 ref += h->b8_stride; | |
815 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = | |
816 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; | |
817 | |
818 b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; | |
819 for(y=0; y<4; y++){ | |
820 *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]; | |
821 *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]; | |
822 } | |
823 | |
810 } | 824 } |
811 } | 825 } |
812 }else{ | 826 }else{ |
813 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0; | 827 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0; |
814 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0; | 828 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0; |
1194 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1]; | 1208 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1]; |
1195 | 1209 |
1196 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5]; | 1210 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5]; |
1197 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5]; | 1211 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5]; |
1198 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4]; | 1212 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4]; |
1213 | |
1214 //FIXME sort better how things are stored in non_zero_count | |
1215 | |
1216 | |
1217 h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1]; | |
1218 h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2]; | |
1219 h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3]; | |
1220 h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1]; | |
1221 h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2]; | |
1222 h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3]; | |
1223 h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1]; | |
1224 h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2]; | |
1225 h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3]; | |
1226 | |
1227 h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1]; | |
1228 h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4]; | |
1229 | |
1199 } | 1230 } |
1200 | 1231 |
1201 static inline void write_back_motion(H264Context *h, int mb_type){ | 1232 static inline void write_back_motion(H264Context *h, int mb_type){ |
1202 MpegEncContext * const s = &h->s; | 1233 MpegEncContext * const s = &h->s; |
1203 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; | 1234 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; |
1269 static void decode_mb_skip(H264Context *h){ | 1300 static void decode_mb_skip(H264Context *h){ |
1270 MpegEncContext * const s = &h->s; | 1301 MpegEncContext * const s = &h->s; |
1271 const int mb_xy= h->mb_xy; | 1302 const int mb_xy= h->mb_xy; |
1272 int mb_type=0; | 1303 int mb_type=0; |
1273 | 1304 |
1274 memset(h->non_zero_count[mb_xy], 0, 16); | 1305 memset(h->non_zero_count[mb_xy], 0, 32); |
1275 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui | 1306 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui |
1276 | 1307 |
1277 if(MB_FIELD) | 1308 if(MB_FIELD) |
1278 mb_type|= MB_TYPE_INTERLACED; | 1309 mb_type|= MB_TYPE_INTERLACED; |
1279 | 1310 |