# HG changeset patch # User reimar # Date 1255358637 0 # Node ID 19a4f1ecd8fe8bdf37d3a49399e7c6ad0b0aef90 # Parent 98501365c3aa79a9f76abb42039792f863857860 Move dnxhdenc to execute2 multithreading API. This allows for some simplifications like removing some outer loops and gives much better performance for thread_count > number of idle CPUs. diff -r 98501365c3aa -r 19a4f1ecd8fe dnxhdenc.c --- a/dnxhdenc.c Mon Oct 12 11:35:35 2009 +0000 +++ b/dnxhdenc.c Mon Oct 12 14:43:57 2009 +0000 @@ -204,6 +204,7 @@ return -1; FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_size, ctx->m.mb_height*sizeof(uint32_t), fail); + FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_offs, ctx->m.mb_height*sizeof(uint32_t), fail); FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits, ctx->m.mb_num *sizeof(uint16_t), fail); FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_qscale, ctx->m.mb_num *sizeof(uint8_t) , fail); @@ -211,7 +212,7 @@ ctx->frame.pict_type = FF_I_TYPE; ctx->m.avctx->coded_frame = &ctx->frame; - if (avctx->thread_count > MAX_THREADS || (avctx->thread_count > ctx->m.mb_height)) { + if (avctx->thread_count > MAX_THREADS) { av_log(avctx, AV_LOG_ERROR, "too many threads\n"); return -1; } @@ -222,11 +223,6 @@ memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext)); } - for (i = 0; i < avctx->thread_count; i++) { - ctx->thread[i]->m.start_mb_y = (ctx->m.mb_height*(i ) + avctx->thread_count/2) / avctx->thread_count; - ctx->thread[i]->m.end_mb_y = (ctx->m.mb_height*(i+1) + avctx->thread_count/2) / avctx->thread_count; - } - return 0; fail: //for FF_ALLOCZ_OR_GOTO return -1; @@ -397,13 +393,13 @@ } } -static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg) +static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) { - DNXHDEncContext *ctx = *(void**)arg; - int mb_y, mb_x; - int qscale = ctx->thread[0]->qscale; + DNXHDEncContext *ctx = avctx->priv_data; + int mb_y = jobnr, mb_x; + int qscale = ctx->qscale; + ctx = ctx->thread[threadnr]; - for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) { ctx->m.last_dc[0] = ctx->m.last_dc[1] = ctx->m.last_dc[2] = 1024; @@ -443,16 +439,16 @@ ctx->mb_rc[qscale][mb].ssd = ssd; ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0]; } - } return 0; } -static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg) +static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) { - DNXHDEncContext *ctx = *(void**)arg; - int mb_y, mb_x; + DNXHDEncContext *ctx = avctx->priv_data; + int mb_y = jobnr, mb_x; + ctx = ctx->thread[threadnr]; + init_put_bits(&ctx->m.pb, (uint8_t *)arg + 640 + ctx->slice_offs[jobnr], ctx->slice_size[jobnr]); - for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) { ctx->m.last_dc[0] = ctx->m.last_dc[1] = ctx->m.last_dc[2] = 1024; @@ -477,18 +473,17 @@ } if (put_bits_count(&ctx->m.pb)&31) put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0); - } flush_put_bits(&ctx->m.pb); return 0; } -static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx, uint8_t *buf) +static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx) { int mb_y, mb_x; - int i, offset = 0; - for (i = 0; i < ctx->m.avctx->thread_count; i++) { - int thread_size = 0; - for (mb_y = ctx->thread[i]->m.start_mb_y; mb_y < ctx->thread[i]->m.end_mb_y; mb_y++) { + int offset = 0; + for (mb_y = 0; mb_y < ctx->m.mb_height; mb_y++) { + int thread_size; + ctx->slice_offs[mb_y] = offset; ctx->slice_size[mb_y] = 0; for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { unsigned mb = mb_y * ctx->m.mb_width + mb_x; @@ -496,18 +491,16 @@ } ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31; ctx->slice_size[mb_y] >>= 3; - thread_size += ctx->slice_size[mb_y]; - } - init_put_bits(&ctx->thread[i]->m.pb, buf + 640 + offset, thread_size); + thread_size = ctx->slice_size[mb_y]; offset += thread_size; } } -static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg) +static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) { - DNXHDEncContext *ctx = *(void**)arg; - int mb_y, mb_x; - for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) { + DNXHDEncContext *ctx = avctx->priv_data; + int mb_y = jobnr, mb_x; + ctx = ctx->thread[threadnr]; for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { unsigned mb = mb_y * ctx->m.mb_width + mb_x; uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4); @@ -516,7 +509,6 @@ ctx->mb_cmp[mb].value = varc; ctx->mb_cmp[mb].mb = mb; } - } return 0; } @@ -528,7 +520,7 @@ for (q = 1; q < avctx->qmax; q++) { ctx->qscale = q; - avctx->execute(avctx, dnxhd_calc_bits_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*)); + avctx->execute2(avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height); } up_step = down_step = 2<lambda; @@ -608,7 +600,7 @@ bits = 0; ctx->qscale = qscale; // XXX avoid recalculating bits - ctx->m.avctx->execute(ctx->m.avctx, dnxhd_calc_bits_thread, &ctx->thread[0], NULL, ctx->m.avctx->thread_count, sizeof(void*)); + ctx->m.avctx->execute2(ctx->m.avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height); for (y = 0; y < ctx->m.mb_height; y++) { for (x = 0; x < ctx->m.mb_width; x++) bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits; @@ -732,7 +724,7 @@ } if (!ret) { if (RC_VARIANCE) - avctx->execute(avctx, dnxhd_mb_var_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*)); + avctx->execute2(avctx, dnxhd_mb_var_thread, NULL, NULL, ctx->m.mb_height); radix_sort(ctx->mb_cmp, ctx->m.mb_num); for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) { int mb = ctx->mb_cmp[x].mb; @@ -795,7 +787,7 @@ return -1; } - dnxhd_setup_threads_slices(ctx, buf); + dnxhd_setup_threads_slices(ctx); offset = 0; for (i = 0; i < ctx->m.mb_height; i++) { @@ -804,7 +796,7 @@ assert(!(ctx->slice_size[i] & 3)); } - avctx->execute(avctx, dnxhd_encode_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*)); + avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height); assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size); memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640); @@ -840,6 +832,7 @@ av_freep(&ctx->mb_rc); av_freep(&ctx->mb_cmp); av_freep(&ctx->slice_size); + av_freep(&ctx->slice_offs); av_freep(&ctx->qmatrix_c); av_freep(&ctx->qmatrix_l); diff -r 98501365c3aa -r 19a4f1ecd8fe dnxhdenc.h --- a/dnxhdenc.h Mon Oct 12 11:35:35 2009 +0000 +++ b/dnxhdenc.h Mon Oct 12 14:43:57 2009 +0000 @@ -46,6 +46,7 @@ const CIDEntry *cid_table; uint8_t *msip; ///< Macroblock Scan Indexes Payload uint32_t *slice_size; + uint32_t *slice_offs; struct DNXHDEncContext *thread[MAX_THREADS];