Mercurial > libavcodec.hg
comparison dnxhdenc.c @ 10387:19a4f1ecd8fe libavcodec
Move dnxhdenc to execute2 multithreading API.
This allows for some simplifications like removing some outer loops
and gives much better performance for thread_count > number of idle CPUs.
author | reimar |
---|---|
date | Mon, 12 Oct 2009 14:43:57 +0000 |
parents | 59ec306245a4 |
children | 08e50bcdcbf1 |
Comparison legend: equal | deleted | inserted | replaced
10386:98501365c3aa | 10387:19a4f1ecd8fe |
---|---|
202 return -1; | 202 return -1; |
203 if (dnxhd_init_rc(ctx) < 0) | 203 if (dnxhd_init_rc(ctx) < 0) |
204 return -1; | 204 return -1; |
205 | 205 |
206 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_size, ctx->m.mb_height*sizeof(uint32_t), fail); | 206 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_size, ctx->m.mb_height*sizeof(uint32_t), fail); |
207 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_offs, ctx->m.mb_height*sizeof(uint32_t), fail); | |
207 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits, ctx->m.mb_num *sizeof(uint16_t), fail); | 208 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits, ctx->m.mb_num *sizeof(uint16_t), fail); |
208 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_qscale, ctx->m.mb_num *sizeof(uint8_t) , fail); | 209 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_qscale, ctx->m.mb_num *sizeof(uint8_t) , fail); |
209 | 210 |
210 ctx->frame.key_frame = 1; | 211 ctx->frame.key_frame = 1; |
211 ctx->frame.pict_type = FF_I_TYPE; | 212 ctx->frame.pict_type = FF_I_TYPE; |
212 ctx->m.avctx->coded_frame = &ctx->frame; | 213 ctx->m.avctx->coded_frame = &ctx->frame; |
213 | 214 |
214 if (avctx->thread_count > MAX_THREADS || (avctx->thread_count > ctx->m.mb_height)) { | 215 if (avctx->thread_count > MAX_THREADS) { |
215 av_log(avctx, AV_LOG_ERROR, "too many threads\n"); | 216 av_log(avctx, AV_LOG_ERROR, "too many threads\n"); |
216 return -1; | 217 return -1; |
217 } | 218 } |
218 | 219 |
219 ctx->thread[0] = ctx; | 220 ctx->thread[0] = ctx; |
220 for (i = 1; i < avctx->thread_count; i++) { | 221 for (i = 1; i < avctx->thread_count; i++) { |
221 ctx->thread[i] = av_malloc(sizeof(DNXHDEncContext)); | 222 ctx->thread[i] = av_malloc(sizeof(DNXHDEncContext)); |
222 memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext)); | 223 memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext)); |
223 } | |
224 | |
225 for (i = 0; i < avctx->thread_count; i++) { | |
226 ctx->thread[i]->m.start_mb_y = (ctx->m.mb_height*(i ) + avctx->thread_count/2) / avctx->thread_count; | |
227 ctx->thread[i]->m.end_mb_y = (ctx->m.mb_height*(i+1) + avctx->thread_count/2) / avctx->thread_count; | |
228 } | 224 } |
229 | 225 |
230 return 0; | 226 return 0; |
231 fail: //for FF_ALLOCZ_OR_GOTO | 227 fail: //for FF_ALLOCZ_OR_GOTO |
232 return -1; | 228 return -1; |
395 ctx->m.q_intra_matrix = ctx->qmatrix_l; | 391 ctx->m.q_intra_matrix = ctx->qmatrix_l; |
396 return 0; | 392 return 0; |
397 } | 393 } |
398 } | 394 } |
399 | 395 |
400 static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg) | 396 static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) |
401 { | 397 { |
402 DNXHDEncContext *ctx = *(void**)arg; | 398 DNXHDEncContext *ctx = avctx->priv_data; |
403 int mb_y, mb_x; | 399 int mb_y = jobnr, mb_x; |
404 int qscale = ctx->thread[0]->qscale; | 400 int qscale = ctx->qscale; |
405 | 401 ctx = ctx->thread[threadnr]; |
406 for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) { | 402 |
407 ctx->m.last_dc[0] = | 403 ctx->m.last_dc[0] = |
408 ctx->m.last_dc[1] = | 404 ctx->m.last_dc[1] = |
409 ctx->m.last_dc[2] = 1024; | 405 ctx->m.last_dc[2] = 1024; |
410 | 406 |
411 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { | 407 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { |
441 } | 437 } |
442 } | 438 } |
443 ctx->mb_rc[qscale][mb].ssd = ssd; | 439 ctx->mb_rc[qscale][mb].ssd = ssd; |
444 ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0]; | 440 ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0]; |
445 } | 441 } |
446 } | 442 return 0; |
447 return 0; | 443 } |
448 } | 444 |
449 | 445 static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) |
450 static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg) | 446 { |
451 { | 447 DNXHDEncContext *ctx = avctx->priv_data; |
452 DNXHDEncContext *ctx = *(void**)arg; | 448 int mb_y = jobnr, mb_x; |
453 int mb_y, mb_x; | 449 ctx = ctx->thread[threadnr]; |
454 | 450 init_put_bits(&ctx->m.pb, (uint8_t *)arg + 640 + ctx->slice_offs[jobnr], ctx->slice_size[jobnr]); |
455 for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) { | 451 |
456 ctx->m.last_dc[0] = | 452 ctx->m.last_dc[0] = |
457 ctx->m.last_dc[1] = | 453 ctx->m.last_dc[1] = |
458 ctx->m.last_dc[2] = 1024; | 454 ctx->m.last_dc[2] = 1024; |
459 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { | 455 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { |
460 unsigned mb = mb_y * ctx->m.mb_width + mb_x; | 456 unsigned mb = mb_y * ctx->m.mb_width + mb_x; |
475 //STOP_TIMER("encode_block"); | 471 //STOP_TIMER("encode_block"); |
476 } | 472 } |
477 } | 473 } |
478 if (put_bits_count(&ctx->m.pb)&31) | 474 if (put_bits_count(&ctx->m.pb)&31) |
479 put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0); | 475 put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0); |
480 } | |
481 flush_put_bits(&ctx->m.pb); | 476 flush_put_bits(&ctx->m.pb); |
482 return 0; | 477 return 0; |
483 } | 478 } |
484 | 479 |
485 static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx, uint8_t *buf) | 480 static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx) |
486 { | 481 { |
487 int mb_y, mb_x; | 482 int mb_y, mb_x; |
488 int i, offset = 0; | 483 int offset = 0; |
489 for (i = 0; i < ctx->m.avctx->thread_count; i++) { | 484 for (mb_y = 0; mb_y < ctx->m.mb_height; mb_y++) { |
490 int thread_size = 0; | 485 int thread_size; |
491 for (mb_y = ctx->thread[i]->m.start_mb_y; mb_y < ctx->thread[i]->m.end_mb_y; mb_y++) { | 486 ctx->slice_offs[mb_y] = offset; |
492 ctx->slice_size[mb_y] = 0; | 487 ctx->slice_size[mb_y] = 0; |
493 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { | 488 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { |
494 unsigned mb = mb_y * ctx->m.mb_width + mb_x; | 489 unsigned mb = mb_y * ctx->m.mb_width + mb_x; |
495 ctx->slice_size[mb_y] += ctx->mb_bits[mb]; | 490 ctx->slice_size[mb_y] += ctx->mb_bits[mb]; |
496 } | 491 } |
497 ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31; | 492 ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31; |
498 ctx->slice_size[mb_y] >>= 3; | 493 ctx->slice_size[mb_y] >>= 3; |
499 thread_size += ctx->slice_size[mb_y]; | 494 thread_size = ctx->slice_size[mb_y]; |
500 } | |
501 init_put_bits(&ctx->thread[i]->m.pb, buf + 640 + offset, thread_size); | |
502 offset += thread_size; | 495 offset += thread_size; |
503 } | 496 } |
504 } | 497 } |
505 | 498 |
506 static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg) | 499 static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) |
507 { | 500 { |
508 DNXHDEncContext *ctx = *(void**)arg; | 501 DNXHDEncContext *ctx = avctx->priv_data; |
509 int mb_y, mb_x; | 502 int mb_y = jobnr, mb_x; |
510 for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) { | 503 ctx = ctx->thread[threadnr]; |
511 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { | 504 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { |
512 unsigned mb = mb_y * ctx->m.mb_width + mb_x; | 505 unsigned mb = mb_y * ctx->m.mb_width + mb_x; |
513 uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4); | 506 uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4); |
514 int sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize); | 507 int sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize); |
515 int varc = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)(sum*sum))>>8)+128)>>8; | 508 int varc = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)(sum*sum))>>8)+128)>>8; |
516 ctx->mb_cmp[mb].value = varc; | 509 ctx->mb_cmp[mb].value = varc; |
517 ctx->mb_cmp[mb].mb = mb; | 510 ctx->mb_cmp[mb].mb = mb; |
518 } | 511 } |
519 } | |
520 return 0; | 512 return 0; |
521 } | 513 } |
522 | 514 |
523 static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx) | 515 static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx) |
524 { | 516 { |
526 int last_lower = INT_MAX, last_higher = 0; | 518 int last_lower = INT_MAX, last_higher = 0; |
527 int x, y, q; | 519 int x, y, q; |
528 | 520 |
529 for (q = 1; q < avctx->qmax; q++) { | 521 for (q = 1; q < avctx->qmax; q++) { |
530 ctx->qscale = q; | 522 ctx->qscale = q; |
531 avctx->execute(avctx, dnxhd_calc_bits_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*)); | 523 avctx->execute2(avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height); |
532 } | 524 } |
533 up_step = down_step = 2<<LAMBDA_FRAC_BITS; | 525 up_step = down_step = 2<<LAMBDA_FRAC_BITS; |
534 lambda = ctx->lambda; | 526 lambda = ctx->lambda; |
535 | 527 |
536 for (;;) { | 528 for (;;) { |
606 qscale = ctx->qscale; | 598 qscale = ctx->qscale; |
607 for (;;) { | 599 for (;;) { |
608 bits = 0; | 600 bits = 0; |
609 ctx->qscale = qscale; | 601 ctx->qscale = qscale; |
610 // XXX avoid recalculating bits | 602 // XXX avoid recalculating bits |
611 ctx->m.avctx->execute(ctx->m.avctx, dnxhd_calc_bits_thread, &ctx->thread[0], NULL, ctx->m.avctx->thread_count, sizeof(void*)); | 603 ctx->m.avctx->execute2(ctx->m.avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height); |
612 for (y = 0; y < ctx->m.mb_height; y++) { | 604 for (y = 0; y < ctx->m.mb_height; y++) { |
613 for (x = 0; x < ctx->m.mb_width; x++) | 605 for (x = 0; x < ctx->m.mb_width; x++) |
614 bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits; | 606 bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits; |
615 bits = (bits+31)&~31; // padding | 607 bits = (bits+31)&~31; // padding |
616 if (bits > ctx->frame_bits) | 608 if (bits > ctx->frame_bits) |
730 } | 722 } |
731 max_bits += 31; //worst padding | 723 max_bits += 31; //worst padding |
732 } | 724 } |
733 if (!ret) { | 725 if (!ret) { |
734 if (RC_VARIANCE) | 726 if (RC_VARIANCE) |
735 avctx->execute(avctx, dnxhd_mb_var_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*)); | 727 avctx->execute2(avctx, dnxhd_mb_var_thread, NULL, NULL, ctx->m.mb_height); |
736 radix_sort(ctx->mb_cmp, ctx->m.mb_num); | 728 radix_sort(ctx->mb_cmp, ctx->m.mb_num); |
737 for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) { | 729 for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) { |
738 int mb = ctx->mb_cmp[x].mb; | 730 int mb = ctx->mb_cmp[x].mb; |
739 max_bits -= ctx->mb_rc[ctx->qscale][mb].bits - ctx->mb_rc[ctx->qscale+1][mb].bits; | 731 max_bits -= ctx->mb_rc[ctx->qscale][mb].bits - ctx->mb_rc[ctx->qscale+1][mb].bits; |
740 ctx->mb_qscale[mb] = ctx->qscale+1; | 732 ctx->mb_qscale[mb] = ctx->qscale+1; |
793 if (ret < 0) { | 785 if (ret < 0) { |
794 av_log(avctx, AV_LOG_ERROR, "picture could not fit ratecontrol constraints\n"); | 786 av_log(avctx, AV_LOG_ERROR, "picture could not fit ratecontrol constraints\n"); |
795 return -1; | 787 return -1; |
796 } | 788 } |
797 | 789 |
798 dnxhd_setup_threads_slices(ctx, buf); | 790 dnxhd_setup_threads_slices(ctx); |
799 | 791 |
800 offset = 0; | 792 offset = 0; |
801 for (i = 0; i < ctx->m.mb_height; i++) { | 793 for (i = 0; i < ctx->m.mb_height; i++) { |
802 AV_WB32(ctx->msip + i * 4, offset); | 794 AV_WB32(ctx->msip + i * 4, offset); |
803 offset += ctx->slice_size[i]; | 795 offset += ctx->slice_size[i]; |
804 assert(!(ctx->slice_size[i] & 3)); | 796 assert(!(ctx->slice_size[i] & 3)); |
805 } | 797 } |
806 | 798 |
807 avctx->execute(avctx, dnxhd_encode_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*)); | 799 avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height); |
808 | 800 |
809 assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size); | 801 assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size); |
810 memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640); | 802 memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640); |
811 | 803 |
812 AV_WB32(buf + ctx->cid_table->coding_unit_size - 4, 0x600DC0DE); // EOF | 804 AV_WB32(buf + ctx->cid_table->coding_unit_size - 4, 0x600DC0DE); // EOF |
838 av_freep(&ctx->mb_bits); | 830 av_freep(&ctx->mb_bits); |
839 av_freep(&ctx->mb_qscale); | 831 av_freep(&ctx->mb_qscale); |
840 av_freep(&ctx->mb_rc); | 832 av_freep(&ctx->mb_rc); |
841 av_freep(&ctx->mb_cmp); | 833 av_freep(&ctx->mb_cmp); |
842 av_freep(&ctx->slice_size); | 834 av_freep(&ctx->slice_size); |
835 av_freep(&ctx->slice_offs); | |
843 | 836 |
844 av_freep(&ctx->qmatrix_c); | 837 av_freep(&ctx->qmatrix_c); |
845 av_freep(&ctx->qmatrix_l); | 838 av_freep(&ctx->qmatrix_l); |
846 av_freep(&ctx->qmatrix_c16); | 839 av_freep(&ctx->qmatrix_c16); |
847 av_freep(&ctx->qmatrix_l16); | 840 av_freep(&ctx->qmatrix_l16); |