comparison h264.c @ 10906:1b5fba731e24 libavcodec

Rearchitecting the stitched-up goose, part 1: Run the loop filter per row instead of per MB. This should also make it much easier to switch to per-frame filtering, and to do so in a separate thread in the future if some volunteer wants to try. Overall decoding speedup of 1.7% (single thread on Pentium Dual / cathedral sample). This change also allows some optimizations to be tried that would not have been possible before.
author michael
date Sun, 17 Jan 2010 20:35:55 +0000
parents d23f1c47fc09
children 2d82b73b12ef
comparison
equal deleted inserted replaced
10905:06d20a468d1e 10906:1b5fba731e24
664 av_freep(&h->mvd_table[1]); 664 av_freep(&h->mvd_table[1]);
665 av_freep(&h->direct_table); 665 av_freep(&h->direct_table);
666 av_freep(&h->non_zero_count); 666 av_freep(&h->non_zero_count);
667 av_freep(&h->slice_table_base); 667 av_freep(&h->slice_table_base);
668 h->slice_table= NULL; 668 h->slice_table= NULL;
669 av_freep(&h->list_counts);
669 670
670 av_freep(&h->mb2b_xy); 671 av_freep(&h->mb2b_xy);
671 av_freep(&h->mb2b8_xy); 672 av_freep(&h->mb2b8_xy);
672 673
673 for(i = 0; i < MAX_THREADS; i++) { 674 for(i = 0; i < MAX_THREADS; i++) {
754 const int big_mb_num= s->mb_stride * (s->mb_height+1); 755 const int big_mb_num= s->mb_stride * (s->mb_height+1);
755 int x,y; 756 int x,y;
756 757
757 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t), fail) 758 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t), fail)
758 759
759 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t), fail) 760 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count , big_mb_num * 32 * sizeof(uint8_t), fail)
760 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail) 761 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
761 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail) 762 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
762 763
763 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail) 764 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail)
764 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail); 765 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail);
765 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail); 766 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail);
766 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail); 767 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail);
768 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail)
767 769
768 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base)); 770 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base));
769 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1; 771 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
770 772
771 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b_xy , big_mb_num * sizeof(uint32_t), fail); 773 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b_xy , big_mb_num * sizeof(uint32_t), fail);
943 return 0; 945 return 0;
944 } 946 }
945 947
946 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){ 948 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
947 MpegEncContext * const s = &h->s; 949 MpegEncContext * const s = &h->s;
948 int i;
949 int step = 1;
950 int offset = 1;
951 int uvoffset= 1;
952 int top_idx = 1; 950 int top_idx = 1;
953 int skiplast= 0;
954 951
955 src_y -= linesize; 952 src_y -= linesize;
956 src_cb -= uvlinesize; 953 src_cb -= uvlinesize;
957 src_cr -= uvlinesize; 954 src_cr -= uvlinesize;
958 955
959 if(!simple && FRAME_MBAFF){ 956 if(!simple && FRAME_MBAFF){
960 if(s->mb_y&1){ 957 if(s->mb_y&1){
961 offset = MB_MBAFF ? 1 : 17;
962 uvoffset= MB_MBAFF ? 1 : 9;
963 if(!MB_MBAFF){ 958 if(!MB_MBAFF){
964 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize); 959 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize);
965 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize); 960 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
966 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ 961 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
967 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize); 962 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
968 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize); 963 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
969 } 964 }
970 } 965 }
971 }else{ 966 }else if(MB_MBAFF){
972 if(!MB_MBAFF){ 967 top_idx = 0;
973 h->left_border[0]= h->top_borders[0][s->mb_x][15]; 968 }else
974 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ 969 return;
975 h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7 ];
976 h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
977 }
978 skiplast= 1;
979 }
980 offset =
981 uvoffset=
982 top_idx = MB_MBAFF ? 0 : 1;
983 }
984 step= MB_MBAFF ? 2 : 1;
985 } 970 }
986 971
987 // There are two lines saved, the line above the top macroblock of a pair, 972 // There are two lines saved, the line above the top macroblock of a pair,
988 // and the line above the bottom macroblock 973 // and the line above the bottom macroblock
989 h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
990 for(i=1; i<17 - skiplast; i++){
991 h->left_border[offset+i*step]= src_y[15+i* linesize];
992 }
993 974
994 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize); 975 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
995 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize); 976 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
996 977
997 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ 978 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
998 h->left_border[uvoffset+34 ]= h->top_borders[top_idx][s->mb_x][16+7];
999 h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
1000 for(i=1; i<9 - skiplast; i++){
1001 h->left_border[uvoffset+34 +i*step]= src_cb[7+i*uvlinesize];
1002 h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
1003 }
1004 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize); 979 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
1005 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize); 980 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
1006 } 981 }
1007 } 982 }
1008 983
1011 int temp8, i; 986 int temp8, i;
1012 uint64_t temp64; 987 uint64_t temp64;
1013 int deblock_left; 988 int deblock_left;
1014 int deblock_top; 989 int deblock_top;
1015 int mb_xy; 990 int mb_xy;
1016 int step = 1;
1017 int offset = 1;
1018 int uvoffset= 1;
1019 int top_idx = 1; 991 int top_idx = 1;
1020 992
1021 if(!simple && FRAME_MBAFF){ 993 if(!simple && FRAME_MBAFF){
1022 if(s->mb_y&1){ 994 if(s->mb_y&1){
1023 offset = MB_MBAFF ? 1 : 17; 995 if(!MB_MBAFF)
1024 uvoffset= MB_MBAFF ? 1 : 9; 996 return;
1025 }else{ 997 }else{
1026 offset =
1027 uvoffset=
1028 top_idx = MB_MBAFF ? 0 : 1; 998 top_idx = MB_MBAFF ? 0 : 1;
1029 } 999 }
1030 step= MB_MBAFF ? 2 : 1;
1031 } 1000 }
1032 1001
1033 if(h->deblocking_filter == 2) { 1002 if(h->deblocking_filter == 2) {
1034 mb_xy = h->mb_xy; 1003 mb_xy = h->mb_xy;
1035 deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1]; 1004 deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
1047 t= a;\ 1016 t= a;\
1048 if(xchg)\ 1017 if(xchg)\
1049 a= b;\ 1018 a= b;\
1050 b= t; 1019 b= t;
1051 1020
1052 if(deblock_left){
1053 for(i = !deblock_top; i<16; i++){
1054 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, xchg);
1055 }
1056 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, 1);
1057 }
1058
1059 if(deblock_top){ 1021 if(deblock_top){
1022 if(deblock_left){
1023 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+8), *(uint64_t*)(src_y -7), temp64, 1);
1024 }
1060 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg); 1025 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
1061 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1); 1026 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
1062 if(s->mb_x+1 < s->mb_width){ 1027 if(s->mb_x+1 < s->mb_width){
1063 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1); 1028 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
1064 } 1029 }
1065 } 1030 }
1066 1031
1067 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ 1032 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1068 if(deblock_left){
1069 for(i = !deblock_top; i<8; i++){
1070 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, xchg);
1071 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
1072 }
1073 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, 1);
1074 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
1075 }
1076 if(deblock_top){ 1033 if(deblock_top){
1034 if(deblock_left){
1035 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+16), *(uint64_t*)(src_cb -7), temp64, 1);
1036 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+24), *(uint64_t*)(src_cr -7), temp64, 1);
1037 }
1077 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1); 1038 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
1078 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1); 1039 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
1079 } 1040 }
1080 } 1041 }
1081 } 1042 }
1100 dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8; 1061 dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
1101 dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8; 1062 dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
1102 1063
1103 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4); 1064 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1104 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2); 1065 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
1066
1067 h->list_counts[mb_xy]= h->list_count;
1105 1068
1106 if (!simple && MB_FIELD) { 1069 if (!simple && MB_FIELD) {
1107 linesize = h->mb_linesize = s->linesize * 2; 1070 linesize = h->mb_linesize = s->linesize * 2;
1108 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2; 1071 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
1109 block_offset = &h->block_offset[24]; 1072 block_offset = &h->block_offset[24];
1320 } 1283 }
1321 } 1284 }
1322 if(h->cbp || IS_INTRA(mb_type)) 1285 if(h->cbp || IS_INTRA(mb_type))
1323 s->dsp.clear_blocks(h->mb); 1286 s->dsp.clear_blocks(h->mb);
1324 1287
1325 if(h->deblocking_filter) { 1288 if(h->deblocking_filter && 0) {
1326 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple); 1289 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
1327 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb 1290 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
1328 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]); 1291 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
1329 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]); 1292 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
1330 if (!simple && FRAME_MBAFF) { 1293 if (!simple && FRAME_MBAFF) {
2172 case FF_SI_TYPE: return 4; 2135 case FF_SI_TYPE: return 4;
2173 default: return -1; 2136 default: return -1;
2174 } 2137 }
2175 } 2138 }
2176 2139
2140 static void loop_filter(H264Context *h){
2141 MpegEncContext * const s = &h->s;
2142 uint8_t *dest_y, *dest_cb, *dest_cr;
2143 int linesize, uvlinesize, mb_x, mb_y;
2144 const int end_mb_y= s->mb_y + FRAME_MBAFF;
2145 const int old_slice_type= h->slice_type;
2146
2147 if(h->deblocking_filter) {
2148 for(mb_x= 0; mb_x<s->mb_width; mb_x++){
2149 for(mb_y=end_mb_y - FRAME_MBAFF; mb_y<= end_mb_y; mb_y++){
2150 int list, mb_xy, mb_type, is_complex;
2151 mb_xy = h->mb_xy = mb_x + mb_y*s->mb_stride;
2152 h->slice_num= h->slice_table[mb_xy];
2153 mb_type= s->current_picture.mb_type[mb_xy];
2154 h->list_count= h->list_counts[mb_xy];
2155 if(h->list_count==2){
2156 h->slice_type= h->slice_type_nos= FF_B_TYPE;
2157 }else if(h->list_count==1){
2158 h->slice_type= h->slice_type_nos= FF_P_TYPE;
2159 }else
2160 h->slice_type= h->slice_type_nos= FF_I_TYPE;
2161
2162 if(FRAME_MBAFF)
2163 h->mb_mbaff = h->mb_field_decoding_flag = !!IS_INTERLACED(mb_type);
2164
2165 is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0; //FIXME qscale might be wrong
2166
2167 s->mb_x= mb_x;
2168 s->mb_y= mb_y;
2169 dest_y = s->current_picture.data[0] + (mb_x + mb_y * s->linesize ) * 16;
2170 dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2171 dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2172 //FIXME simplify above
2173
2174 if (MB_FIELD) {
2175 linesize = h->mb_linesize = s->linesize * 2;
2176 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2177 if(mb_y&1){ //FIXME move out of this function?
2178 dest_y -= s->linesize*15;
2179 dest_cb-= s->uvlinesize*7;
2180 dest_cr-= s->uvlinesize*7;
2181 }
2182 } else {
2183 linesize = h->mb_linesize = s->linesize;
2184 uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2185 }
2186 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, !is_complex);
2187 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2188 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2189 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2190
2191 if (is_complex && FRAME_MBAFF) {
2192 ff_h264_filter_mb (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2193 } else {
2194 ff_h264_filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2195 }
2196 }
2197 }
2198 }
2199 h->slice_type= old_slice_type;
2200 s->mb_x= 0;
2201 s->mb_y= end_mb_y - FRAME_MBAFF;
2202 }
2203
2177 static int decode_slice(struct AVCodecContext *avctx, void *arg){ 2204 static int decode_slice(struct AVCodecContext *avctx, void *arg){
2178 H264Context *h = *(void**)arg; 2205 H264Context *h = *(void**)arg;
2179 MpegEncContext * const s = &h->s; 2206 MpegEncContext * const s = &h->s;
2180 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F; 2207 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
2181 2208
2220 return -1; 2247 return -1;
2221 } 2248 }
2222 2249
2223 if( ++s->mb_x >= s->mb_width ) { 2250 if( ++s->mb_x >= s->mb_width ) {
2224 s->mb_x = 0; 2251 s->mb_x = 0;
2252 loop_filter(h);
2225 ff_draw_horiz_band(s, 16*s->mb_y, 16); 2253 ff_draw_horiz_band(s, 16*s->mb_y, 16);
2226 ++s->mb_y; 2254 ++s->mb_y;
2227 if(FIELD_OR_MBAFF_PICTURE) { 2255 if(FIELD_OR_MBAFF_PICTURE) {
2228 ++s->mb_y; 2256 ++s->mb_y;
2229 } 2257 }
2257 return -1; 2285 return -1;
2258 } 2286 }
2259 2287
2260 if(++s->mb_x >= s->mb_width){ 2288 if(++s->mb_x >= s->mb_width){
2261 s->mb_x=0; 2289 s->mb_x=0;
2290 loop_filter(h);
2262 ff_draw_horiz_band(s, 16*s->mb_y, 16); 2291 ff_draw_horiz_band(s, 16*s->mb_y, 16);
2263 ++s->mb_y; 2292 ++s->mb_y;
2264 if(FIELD_OR_MBAFF_PICTURE) { 2293 if(FIELD_OR_MBAFF_PICTURE) {
2265 ++s->mb_y; 2294 ++s->mb_y;
2266 } 2295 }