comparison dnxhdenc.c @ 10387:19a4f1ecd8fe libavcodec

Move dnxhdenc to execute2 multithreading API. This allows for some simplifications like removing some outer loops and gives much better performance for thread_count > number of idle CPUs.
author reimar
date Mon, 12 Oct 2009 14:43:57 +0000
parents 59ec306245a4
children 08e50bcdcbf1
comparison
equal deleted inserted replaced
10386:98501365c3aa 10387:19a4f1ecd8fe
202 return -1; 202 return -1;
203 if (dnxhd_init_rc(ctx) < 0) 203 if (dnxhd_init_rc(ctx) < 0)
204 return -1; 204 return -1;
205 205
206 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_size, ctx->m.mb_height*sizeof(uint32_t), fail); 206 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_size, ctx->m.mb_height*sizeof(uint32_t), fail);
207 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_offs, ctx->m.mb_height*sizeof(uint32_t), fail);
207 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits, ctx->m.mb_num *sizeof(uint16_t), fail); 208 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits, ctx->m.mb_num *sizeof(uint16_t), fail);
208 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_qscale, ctx->m.mb_num *sizeof(uint8_t) , fail); 209 FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_qscale, ctx->m.mb_num *sizeof(uint8_t) , fail);
209 210
210 ctx->frame.key_frame = 1; 211 ctx->frame.key_frame = 1;
211 ctx->frame.pict_type = FF_I_TYPE; 212 ctx->frame.pict_type = FF_I_TYPE;
212 ctx->m.avctx->coded_frame = &ctx->frame; 213 ctx->m.avctx->coded_frame = &ctx->frame;
213 214
214 if (avctx->thread_count > MAX_THREADS || (avctx->thread_count > ctx->m.mb_height)) { 215 if (avctx->thread_count > MAX_THREADS) {
215 av_log(avctx, AV_LOG_ERROR, "too many threads\n"); 216 av_log(avctx, AV_LOG_ERROR, "too many threads\n");
216 return -1; 217 return -1;
217 } 218 }
218 219
219 ctx->thread[0] = ctx; 220 ctx->thread[0] = ctx;
220 for (i = 1; i < avctx->thread_count; i++) { 221 for (i = 1; i < avctx->thread_count; i++) {
221 ctx->thread[i] = av_malloc(sizeof(DNXHDEncContext)); 222 ctx->thread[i] = av_malloc(sizeof(DNXHDEncContext));
222 memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext)); 223 memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext));
223 }
224
225 for (i = 0; i < avctx->thread_count; i++) {
226 ctx->thread[i]->m.start_mb_y = (ctx->m.mb_height*(i ) + avctx->thread_count/2) / avctx->thread_count;
227 ctx->thread[i]->m.end_mb_y = (ctx->m.mb_height*(i+1) + avctx->thread_count/2) / avctx->thread_count;
228 } 224 }
229 225
230 return 0; 226 return 0;
231 fail: //for FF_ALLOCZ_OR_GOTO 227 fail: //for FF_ALLOCZ_OR_GOTO
232 return -1; 228 return -1;
395 ctx->m.q_intra_matrix = ctx->qmatrix_l; 391 ctx->m.q_intra_matrix = ctx->qmatrix_l;
396 return 0; 392 return 0;
397 } 393 }
398 } 394 }
399 395
400 static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg) 396 static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
401 { 397 {
402 DNXHDEncContext *ctx = *(void**)arg; 398 DNXHDEncContext *ctx = avctx->priv_data;
403 int mb_y, mb_x; 399 int mb_y = jobnr, mb_x;
404 int qscale = ctx->thread[0]->qscale; 400 int qscale = ctx->qscale;
405 401 ctx = ctx->thread[threadnr];
406 for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) { 402
407 ctx->m.last_dc[0] = 403 ctx->m.last_dc[0] =
408 ctx->m.last_dc[1] = 404 ctx->m.last_dc[1] =
409 ctx->m.last_dc[2] = 1024; 405 ctx->m.last_dc[2] = 1024;
410 406
411 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { 407 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
441 } 437 }
442 } 438 }
443 ctx->mb_rc[qscale][mb].ssd = ssd; 439 ctx->mb_rc[qscale][mb].ssd = ssd;
444 ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0]; 440 ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0];
445 } 441 }
446 } 442 return 0;
447 return 0; 443 }
448 } 444
449 445 static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
450 static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg) 446 {
451 { 447 DNXHDEncContext *ctx = avctx->priv_data;
452 DNXHDEncContext *ctx = *(void**)arg; 448 int mb_y = jobnr, mb_x;
453 int mb_y, mb_x; 449 ctx = ctx->thread[threadnr];
454 450 init_put_bits(&ctx->m.pb, (uint8_t *)arg + 640 + ctx->slice_offs[jobnr], ctx->slice_size[jobnr]);
455 for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) { 451
456 ctx->m.last_dc[0] = 452 ctx->m.last_dc[0] =
457 ctx->m.last_dc[1] = 453 ctx->m.last_dc[1] =
458 ctx->m.last_dc[2] = 1024; 454 ctx->m.last_dc[2] = 1024;
459 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { 455 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
460 unsigned mb = mb_y * ctx->m.mb_width + mb_x; 456 unsigned mb = mb_y * ctx->m.mb_width + mb_x;
475 //STOP_TIMER("encode_block"); 471 //STOP_TIMER("encode_block");
476 } 472 }
477 } 473 }
478 if (put_bits_count(&ctx->m.pb)&31) 474 if (put_bits_count(&ctx->m.pb)&31)
479 put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0); 475 put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0);
480 }
481 flush_put_bits(&ctx->m.pb); 476 flush_put_bits(&ctx->m.pb);
482 return 0; 477 return 0;
483 } 478 }
484 479
485 static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx, uint8_t *buf) 480 static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx)
486 { 481 {
487 int mb_y, mb_x; 482 int mb_y, mb_x;
488 int i, offset = 0; 483 int offset = 0;
489 for (i = 0; i < ctx->m.avctx->thread_count; i++) { 484 for (mb_y = 0; mb_y < ctx->m.mb_height; mb_y++) {
490 int thread_size = 0; 485 int thread_size;
491 for (mb_y = ctx->thread[i]->m.start_mb_y; mb_y < ctx->thread[i]->m.end_mb_y; mb_y++) { 486 ctx->slice_offs[mb_y] = offset;
492 ctx->slice_size[mb_y] = 0; 487 ctx->slice_size[mb_y] = 0;
493 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { 488 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
494 unsigned mb = mb_y * ctx->m.mb_width + mb_x; 489 unsigned mb = mb_y * ctx->m.mb_width + mb_x;
495 ctx->slice_size[mb_y] += ctx->mb_bits[mb]; 490 ctx->slice_size[mb_y] += ctx->mb_bits[mb];
496 } 491 }
497 ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31; 492 ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31;
498 ctx->slice_size[mb_y] >>= 3; 493 ctx->slice_size[mb_y] >>= 3;
499 thread_size += ctx->slice_size[mb_y]; 494 thread_size = ctx->slice_size[mb_y];
500 }
501 init_put_bits(&ctx->thread[i]->m.pb, buf + 640 + offset, thread_size);
502 offset += thread_size; 495 offset += thread_size;
503 } 496 }
504 } 497 }
505 498
506 static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg) 499 static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
507 { 500 {
508 DNXHDEncContext *ctx = *(void**)arg; 501 DNXHDEncContext *ctx = avctx->priv_data;
509 int mb_y, mb_x; 502 int mb_y = jobnr, mb_x;
510 for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) { 503 ctx = ctx->thread[threadnr];
511 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) { 504 for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
512 unsigned mb = mb_y * ctx->m.mb_width + mb_x; 505 unsigned mb = mb_y * ctx->m.mb_width + mb_x;
513 uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4); 506 uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4);
514 int sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize); 507 int sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
515 int varc = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)(sum*sum))>>8)+128)>>8; 508 int varc = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)(sum*sum))>>8)+128)>>8;
516 ctx->mb_cmp[mb].value = varc; 509 ctx->mb_cmp[mb].value = varc;
517 ctx->mb_cmp[mb].mb = mb; 510 ctx->mb_cmp[mb].mb = mb;
518 } 511 }
519 }
520 return 0; 512 return 0;
521 } 513 }
522 514
523 static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx) 515 static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx)
524 { 516 {
526 int last_lower = INT_MAX, last_higher = 0; 518 int last_lower = INT_MAX, last_higher = 0;
527 int x, y, q; 519 int x, y, q;
528 520
529 for (q = 1; q < avctx->qmax; q++) { 521 for (q = 1; q < avctx->qmax; q++) {
530 ctx->qscale = q; 522 ctx->qscale = q;
531 avctx->execute(avctx, dnxhd_calc_bits_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*)); 523 avctx->execute2(avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height);
532 } 524 }
533 up_step = down_step = 2<<LAMBDA_FRAC_BITS; 525 up_step = down_step = 2<<LAMBDA_FRAC_BITS;
534 lambda = ctx->lambda; 526 lambda = ctx->lambda;
535 527
536 for (;;) { 528 for (;;) {
606 qscale = ctx->qscale; 598 qscale = ctx->qscale;
607 for (;;) { 599 for (;;) {
608 bits = 0; 600 bits = 0;
609 ctx->qscale = qscale; 601 ctx->qscale = qscale;
610 // XXX avoid recalculating bits 602 // XXX avoid recalculating bits
611 ctx->m.avctx->execute(ctx->m.avctx, dnxhd_calc_bits_thread, &ctx->thread[0], NULL, ctx->m.avctx->thread_count, sizeof(void*)); 603 ctx->m.avctx->execute2(ctx->m.avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height);
612 for (y = 0; y < ctx->m.mb_height; y++) { 604 for (y = 0; y < ctx->m.mb_height; y++) {
613 for (x = 0; x < ctx->m.mb_width; x++) 605 for (x = 0; x < ctx->m.mb_width; x++)
614 bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits; 606 bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits;
615 bits = (bits+31)&~31; // padding 607 bits = (bits+31)&~31; // padding
616 if (bits > ctx->frame_bits) 608 if (bits > ctx->frame_bits)
730 } 722 }
731 max_bits += 31; //worst padding 723 max_bits += 31; //worst padding
732 } 724 }
733 if (!ret) { 725 if (!ret) {
734 if (RC_VARIANCE) 726 if (RC_VARIANCE)
735 avctx->execute(avctx, dnxhd_mb_var_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*)); 727 avctx->execute2(avctx, dnxhd_mb_var_thread, NULL, NULL, ctx->m.mb_height);
736 radix_sort(ctx->mb_cmp, ctx->m.mb_num); 728 radix_sort(ctx->mb_cmp, ctx->m.mb_num);
737 for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) { 729 for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) {
738 int mb = ctx->mb_cmp[x].mb; 730 int mb = ctx->mb_cmp[x].mb;
739 max_bits -= ctx->mb_rc[ctx->qscale][mb].bits - ctx->mb_rc[ctx->qscale+1][mb].bits; 731 max_bits -= ctx->mb_rc[ctx->qscale][mb].bits - ctx->mb_rc[ctx->qscale+1][mb].bits;
740 ctx->mb_qscale[mb] = ctx->qscale+1; 732 ctx->mb_qscale[mb] = ctx->qscale+1;
793 if (ret < 0) { 785 if (ret < 0) {
794 av_log(avctx, AV_LOG_ERROR, "picture could not fit ratecontrol constraints\n"); 786 av_log(avctx, AV_LOG_ERROR, "picture could not fit ratecontrol constraints\n");
795 return -1; 787 return -1;
796 } 788 }
797 789
798 dnxhd_setup_threads_slices(ctx, buf); 790 dnxhd_setup_threads_slices(ctx);
799 791
800 offset = 0; 792 offset = 0;
801 for (i = 0; i < ctx->m.mb_height; i++) { 793 for (i = 0; i < ctx->m.mb_height; i++) {
802 AV_WB32(ctx->msip + i * 4, offset); 794 AV_WB32(ctx->msip + i * 4, offset);
803 offset += ctx->slice_size[i]; 795 offset += ctx->slice_size[i];
804 assert(!(ctx->slice_size[i] & 3)); 796 assert(!(ctx->slice_size[i] & 3));
805 } 797 }
806 798
807 avctx->execute(avctx, dnxhd_encode_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*)); 799 avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height);
808 800
809 assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size); 801 assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
810 memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640); 802 memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640);
811 803
812 AV_WB32(buf + ctx->cid_table->coding_unit_size - 4, 0x600DC0DE); // EOF 804 AV_WB32(buf + ctx->cid_table->coding_unit_size - 4, 0x600DC0DE); // EOF
838 av_freep(&ctx->mb_bits); 830 av_freep(&ctx->mb_bits);
839 av_freep(&ctx->mb_qscale); 831 av_freep(&ctx->mb_qscale);
840 av_freep(&ctx->mb_rc); 832 av_freep(&ctx->mb_rc);
841 av_freep(&ctx->mb_cmp); 833 av_freep(&ctx->mb_cmp);
842 av_freep(&ctx->slice_size); 834 av_freep(&ctx->slice_size);
835 av_freep(&ctx->slice_offs);
843 836
844 av_freep(&ctx->qmatrix_c); 837 av_freep(&ctx->qmatrix_c);
845 av_freep(&ctx->qmatrix_l); 838 av_freep(&ctx->qmatrix_l);
846 av_freep(&ctx->qmatrix_c16); 839 av_freep(&ctx->qmatrix_c16);
847 av_freep(&ctx->qmatrix_l16); 840 av_freep(&ctx->qmatrix_l16);