# HG changeset patch # User gpoirier # Date 1189009095 0 # Node ID d2598034f2a9e8afffd00b0134ad16afb4e3196e # Parent 1e93e637fa21345b2a9e970cd3e801675ed1c3c9 Add slice-based parallel H.264 decoding Patch by Andreas ªÓman % andreas A olebyn P nu % NB: depends on having a thread library activated at config time, and on having a source encoded with multiple slices Original threads: date: May 18, 2007 11:00 PM subject: [FFmpeg-devel] Parallelized h264 proof-of-concept date: Jun 15, 2007 10:10 PM subject: [FFmpeg-devel] [PATCH] h264 parallelized, (was: Parallelized h264 proof-of-concept) date: Jun 25, 2007 7:02 PM subject: Re: [FFmpeg-devel] [PATCH] h264 parallelized diff -r 1e93e637fa21 -r d2598034f2a9 h264.c --- a/h264.c Wed Sep 05 11:05:33 2007 +0000 +++ b/h264.c Wed Sep 05 16:18:15 2007 +0000 @@ -2005,6 +2005,7 @@ static void free_tables(H264Context *h){ int i; + H264Context *hx; av_freep(&h->intra4x4_pred_mode); av_freep(&h->chroma_pred_mode_table); av_freep(&h->cbp_table); @@ -2013,20 +2014,25 @@ av_freep(&h->direct_table); av_freep(&h->non_zero_count); av_freep(&h->slice_table_base); - av_freep(&h->top_borders[1]); - av_freep(&h->top_borders[0]); h->slice_table= NULL; av_freep(&h->mb2b_xy); av_freep(&h->mb2b8_xy); - av_freep(&h->s.obmc_scratchpad); - for(i = 0; i < MAX_SPS_COUNT; i++) av_freep(h->sps_buffers + i); for(i = 0; i < MAX_PPS_COUNT; i++) av_freep(h->pps_buffers + i); + + for(i = 0; i < h->s.avctx->thread_count; i++) { + hx = h->thread_context[i]; + if(!hx) continue; + av_freep(&hx->top_borders[1]); + av_freep(&hx->top_borders[0]); + av_freep(&hx->s.obmc_scratchpad); + av_freep(&hx->s.allocated_edge_emu_buffer); + } } static void init_dequant8_coeff_table(H264Context *h){ @@ -2107,8 +2113,6 @@ CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t)) CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(uint8_t)) - CHECKED_ALLOCZ(h->top_borders[0] , s->mb_width * (16+8+8) * sizeof(uint8_t)) - CHECKED_ALLOCZ(h->top_borders[1] , s->mb_width * (16+8+8) * sizeof(uint8_t)) CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t)) if( h->pps.cabac ) { @@ -2145,6 +2149,47 @@ return -1; } +/** + * Mimic alloc_tables(), but for every context thread. + */ +static void clone_tables(H264Context *dst, H264Context *src){ + dst->intra4x4_pred_mode = src->intra4x4_pred_mode; + dst->non_zero_count = src->non_zero_count; + dst->slice_table = src->slice_table; + dst->cbp_table = src->cbp_table; + dst->mb2b_xy = src->mb2b_xy; + dst->mb2b8_xy = src->mb2b8_xy; + dst->chroma_pred_mode_table = src->chroma_pred_mode_table; + dst->mvd_table[0] = src->mvd_table[0]; + dst->mvd_table[1] = src->mvd_table[1]; + dst->direct_table = src->direct_table; + + if(!dst->dequant4_coeff[0]) + init_dequant_tables(dst); + dst->s.obmc_scratchpad = NULL; + ff_h264_pred_init(&dst->hpc, src->s.codec_id); + dst->dequant_coeff_pps= -1; +} + +/** + * Init context + * Allocate buffers which are not shared amongst multiple threads. + */ +static int context_init(H264Context *h){ + MpegEncContext * const s = &h->s; + + CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t)) + CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t)) + + // edge emu needs blocksize + filter length - 1 (=17x17 for halfpel / 21x21 for h264) + CHECKED_ALLOCZ(s->allocated_edge_emu_buffer, + (s->width+64)*2*21*2); //(width + edge + align)*interlaced*MBsize*tolerance + s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*21; + return 0; +fail: + return -1; // free_tables will clean up for us +} + static void common_init(H264Context *h){ MpegEncContext * const s = &h->s; @@ -2190,6 +2235,7 @@ h->is_avc = 0; } + h->thread_context[0] = h; return 0; } @@ -2216,11 +2262,12 @@ /* can't be in alloc_tables because linesize isn't known there. * FIXME: redo bipred weight to not require extra buffer? */ - if(!s->obmc_scratchpad) - s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize); + for(i = 0; i < s->avctx->thread_count; i++) + if(!h->thread_context[i]->s.obmc_scratchpad) + h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize); /* some macroblocks will be accessed before they're available */ - if(FRAME_MBAFF) + if(FRAME_MBAFF || s->avctx->thread_count > 1) memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t)); // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1; @@ -3453,17 +3500,46 @@ h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc; } } + +/** + * Replicates H264 "master" context to thread contexts. + */ +static void clone_slice(H264Context *dst, H264Context *src) +{ + memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset)); + dst->s.current_picture_ptr = src->s.current_picture_ptr; + dst->s.current_picture = src->s.current_picture; + dst->s.linesize = src->s.linesize; + dst->s.uvlinesize = src->s.uvlinesize; + + dst->prev_poc_msb = src->prev_poc_msb; + dst->prev_poc_lsb = src->prev_poc_lsb; + dst->prev_frame_num_offset = src->prev_frame_num_offset; + dst->prev_frame_num = src->prev_frame_num; + dst->short_ref_count = src->short_ref_count; + + memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref)); + memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref)); + memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list)); + memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list)); +} + /** * decodes a slice header. * this will allso call MPV_common_init() and frame_start() as needed + * + * @param h h264context + * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding) + * + * @return 0 if okay, <0 if an error occured, 1 if decoding must not be multithreaded */ -static int decode_slice_header(H264Context *h){ +static int decode_slice_header(H264Context *h, H264Context *h0){ MpegEncContext * const s = &h->s; unsigned int first_mb_in_slice; unsigned int pps_id; int num_ref_idx_active_override_flag; static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE}; - unsigned int slice_type, tmp; + unsigned int slice_type, tmp, i; int default_ref_list_done = 0; s->current_picture.reference= h->nal_ref_idc != 0; @@ -3472,7 +3548,7 @@ first_mb_in_slice= get_ue_golomb(&s->gb); if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){ - h->slice_num = 0; + h0->current_slice = 0; s->current_picture_ptr= NULL; } @@ -3489,7 +3565,7 @@ slice_type= slice_type_map[ slice_type ]; if (slice_type == I_TYPE - || (h->slice_num != 0 && slice_type == h->slice_type) ) { + || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) { default_ref_list_done = 1; } h->slice_type= slice_type; @@ -3501,17 +3577,17 @@ av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n"); return -1; } - if(!h->pps_buffers[pps_id]) { + if(!h0->pps_buffers[pps_id]) { av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n"); return -1; } - h->pps= *h->pps_buffers[pps_id]; - - if(!h->sps_buffers[h->pps.sps_id]) { + h->pps= *h0->pps_buffers[pps_id]; + + if(!h0->sps_buffers[h->pps.sps_id]) { av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n"); return -1; } - h->sps = *h->sps_buffers[h->pps.sps_id]; + h->sps = *h0->sps_buffers[h->pps.sps_id]; if(h->dequant_coeff_pps != pps_id){ h->dequant_coeff_pps = pps_id; @@ -3532,16 +3608,35 @@ if (s->context_initialized && ( s->width != s->avctx->width || s->height != s->avctx->height)) { + if(h != h0) + return -1; // width / height changed during parallelized decoding free_tables(h); MPV_common_end(s); } if (!s->context_initialized) { + if(h != h0) + return -1; // we cant (re-)initialize context during parallel decoding if (MPV_common_init(s) < 0) return -1; init_scan_tables(h); alloc_tables(h); + for(i = 1; i < s->avctx->thread_count; i++) { + H264Context *c; + c = h->thread_context[i] = av_malloc(sizeof(H264Context)); + memcpy(c, h, sizeof(MpegEncContext)); + memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext)); + c->sps = h->sps; + c->pps = h->pps; + init_scan_tables(c); + clone_tables(c, h); + } + + for(i = 0; i < s->avctx->thread_count; i++) + if(context_init(h->thread_context[i]) < 0) + return -1; + s->avctx->width = s->width; s->avctx->height = s->height; s->avctx->sample_aspect_ratio= h->sps.sar; @@ -3557,10 +3652,12 @@ } } - if(h->slice_num == 0){ + if(h0->current_slice == 0){ if(frame_start(h) < 0) return -1; } + if(h != h0) + clone_slice(h, h0); s->current_picture_ptr->frame_num= //FIXME frame_num cleanup h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num); @@ -3667,7 +3764,7 @@ h->use_weight = 0; if(s->current_picture.reference) - decode_ref_pic_marking(h, &s->gb); + decode_ref_pic_marking(h0, &s->gb); if(FRAME_MBAFF) fill_mbaff_ref_list(h); @@ -3716,6 +3813,17 @@ h->slice_beta_offset = get_se_golomb(&s->gb) << 1; } } + + if(h->deblocking_filter == 1 && h0->max_contexts > 1) { + h0->max_contexts = 1; + if(!h0->single_decode_warning) { + av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n"); + h0->single_decode_warning = 1; + } + if(h != h0) + return 1; // deblocking switched inside frame + } + if( s->avctx->skip_loop_filter >= AVDISCARD_ALL ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE) ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type == B_TYPE) @@ -3727,7 +3835,8 @@ slice_group_change_cycle= get_bits(&s->gb, ?); #endif - h->slice_num++; + h0->last_slice_type = slice_type; + h->slice_num = ++h0->current_slice; h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16; h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width; @@ -6295,7 +6404,7 @@ } } -static int decode_slice(H264Context *h){ +static int decode_slice(struct AVCodecContext *avctx, H264Context *h){ MpegEncContext * const s = &h->s; const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F; @@ -6940,10 +7049,48 @@ return 0; } +/** + * Call decode_slice() for each context. + * + * @param h h264 master context + * @param context_count number of contexts to execute + */ +static void execute_decode_slices(H264Context *h, int context_count){ + MpegEncContext * const s = &h->s; + AVCodecContext * const avctx= s->avctx; + H264Context *hx; + int i; + + if(context_count == 1) { + decode_slice(avctx, h); + } else { + for(i = 1; i < context_count; i++) { + hx = h->thread_context[i]; + hx->s.error_resilience = avctx->error_resilience; + hx->s.error_count = 0; + } + + avctx->execute(avctx, (void *)decode_slice, + (void **)h->thread_context, NULL, context_count); + + /* pull back stuff from slices to master context */ + hx = h->thread_context[context_count - 1]; + s->mb_x = hx->s.mb_x; + s->mb_y = hx->s.mb_y; + for(i = 1; i < context_count; i++) + h->s.error_count += h->thread_context[i]->s.error_count; + } +} + + static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){ MpegEncContext * const s = &h->s; AVCodecContext * const avctx= s->avctx; int buf_index=0; + H264Context *hx; ///< thread context + int context_count = 0; + + h->max_contexts = avctx->thread_count; #if 0 int i; for(i=0; i<50; i++){ @@ -6951,7 +7098,7 @@ } #endif if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){ - h->slice_num = 0; + h->current_slice = 0; s->current_picture_ptr= NULL; } @@ -6961,6 +7108,7 @@ int bit_length; uint8_t *ptr; int i, nalsize = 0; + int err; if(h->is_avc) { if(buf_index >= buf_size) break; @@ -6989,7 +7137,9 @@ buf_index+=3; } - ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index); + hx = h->thread_context[context_count]; + + ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index); if (ptr==NULL || dst_length < 0){ return -1; } @@ -6998,7 +7148,7 @@ bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1)); if(s->avctx->debug&FF_DEBUG_STARTCODE){ - av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length); + av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length); } if (h->is_avc && (nalsize != consumed)) @@ -7010,53 +7160,56 @@ ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc == 0)) continue; - switch(h->nal_unit_type){ + again: + err = 0; + switch(hx->nal_unit_type){ case NAL_IDR_SLICE: + if (h->nal_unit_type != NAL_IDR_SLICE) { + av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices"); + return -1; + } idr(h); //FIXME ensure we don't loose some frames if there is reordering case NAL_SLICE: - init_get_bits(&s->gb, ptr, bit_length); - h->intra_gb_ptr= - h->inter_gb_ptr= &s->gb; - s->data_partitioning = 0; - - if(decode_slice_header(h) < 0){ - av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n"); - break; - } - s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE); - if(h->redundant_pic_count==0 && s->hurry_up < 5 - && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc) - && (avctx->skip_frame < AVDISCARD_BIDIR || h->slice_type!=B_TYPE) - && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE) + init_get_bits(&hx->s.gb, ptr, bit_length); + hx->intra_gb_ptr= + hx->inter_gb_ptr= &hx->s.gb; + hx->s.data_partitioning = 0; + + if((err = decode_slice_header(hx, h))) + break; + + s->current_picture_ptr->key_frame= (hx->nal_unit_type == NAL_IDR_SLICE); + if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5 + && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc) + && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type!=B_TYPE) + && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE) && avctx->skip_frame < AVDISCARD_ALL) - decode_slice(h); + context_count++; break; case NAL_DPA: - init_get_bits(&s->gb, ptr, bit_length); - h->intra_gb_ptr= - h->inter_gb_ptr= NULL; - s->data_partitioning = 1; - - if(decode_slice_header(h) < 0){ - av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n"); - } + init_get_bits(&hx->s.gb, ptr, bit_length); + hx->intra_gb_ptr= + hx->inter_gb_ptr= NULL; + hx->s.data_partitioning = 1; + + err = decode_slice_header(hx, h); break; case NAL_DPB: - init_get_bits(&h->intra_gb, ptr, bit_length); - h->intra_gb_ptr= &h->intra_gb; + init_get_bits(&hx->intra_gb, ptr, bit_length); + hx->intra_gb_ptr= &hx->intra_gb; break; case NAL_DPC: - init_get_bits(&h->inter_gb, ptr, bit_length); - h->inter_gb_ptr= &h->inter_gb; - - if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning + init_get_bits(&hx->inter_gb, ptr, bit_length); + hx->inter_gb_ptr= &hx->inter_gb; + + if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning && s->context_initialized && s->hurry_up < 5 - && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc) - && (avctx->skip_frame < AVDISCARD_BIDIR || h->slice_type!=B_TYPE) - && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE) + && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc) + && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type!=B_TYPE) + && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE) && avctx->skip_frame < AVDISCARD_ALL) - decode_slice(h); + context_count++; break; case NAL_SEI: init_get_bits(&s->gb, ptr, bit_length); @@ -7088,8 +7241,27 @@ default: av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length); } - } - + + if(context_count == h->max_contexts) { + execute_decode_slices(h, context_count); + context_count = 0; + } + + if (err < 0) + av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n"); + else if(err == 1) { + /* Slice could not be decoded in parallel mode, copy down + * NAL unit stuff to context 0 and restart. Note that + * rbsp_buffer is not transfered, but since we no longer + * run in parallel mode this should not be an issue. */ + h->nal_unit_type = hx->nal_unit_type; + h->nal_ref_idc = hx->nal_ref_idc; + hx = h; + goto again; + } + } + if(context_count) + execute_decode_slices(h, context_count); return buf_index; } diff -r 1e93e637fa21 -r d2598034f2a9 h264.h --- a/h264.h Wed Sep 05 11:05:33 2007 +0000 +++ b/h264.h Wed Sep 05 16:18:15 2007 +0000 @@ -380,6 +380,35 @@ const uint8_t *field_scan8x8_cavlc_q0; int x264_build; + + /** + * @defgroup multithreading Members for slice based multithreading + * @{ + */ + struct H264Context *thread_context[MAX_THREADS]; + + /** + * current slice number, used to initalize slice_num of each thread/context + */ + int current_slice; + + /** + * Max number of threads / contexts. + * This is equal to AVCodecContext.thread_count unless + * multithreaded decoding is impossible, in which case it is + * reduced to 1. + */ + int max_contexts; + + /** + * 1 if the single thread fallback warning has already been + * displayed, 0 otherwise. + */ + int single_decode_warning; + + int last_slice_type; + /** @} */ + }H264Context; #endif /* H264_H */ diff -r 1e93e637fa21 -r d2598034f2a9 mpegvideo.c --- a/mpegvideo.c Wed Sep 05 11:05:33 2007 +0000 +++ b/mpegvideo.c Wed Sep 05 16:18:15 2007 +0000 @@ -418,7 +418,7 @@ */ int MPV_common_init(MpegEncContext *s) { - int y_size, c_size, yc_size, i, mb_array_size, mv_table_size, x, y; + int y_size, c_size, yc_size, i, mb_array_size, mv_table_size, x, y, threads; s->mb_height = (s->height + 15) / 16; @@ -587,12 +587,16 @@ s->context_initialized = 1; s->thread_context[0]= s; - for(i=1; iavctx->thread_count; i++){ + /* h264 does thread context setup itself, but it needs context[0] + * to be fully initialized for the error resilience code */ + threads = s->codec_id == CODEC_ID_H264 ? 1 : s->avctx->thread_count; + + for(i=1; ithread_context[i]= av_malloc(sizeof(MpegEncContext)); memcpy(s->thread_context[i], s, sizeof(MpegEncContext)); } - for(i=0; iavctx->thread_count; i++){ + for(i=0; ithread_context[i], s) < 0) goto fail; s->thread_context[i]->start_mb_y= (s->mb_height*(i ) + s->avctx->thread_count/2) / s->avctx->thread_count;