Mercurial > libavcodec.hg
comparison h264_loopfilter.c @ 11032:01bd040f8607 libavcodec
Unroll main loop so the edge==0 case is separate.
This allows many things to be simplified away.
The h264 decoder is overall 1% faster with an MBAFF sample and
0.1% slower with the cathedral sample, probably because the slow loop
filter code must be loaded into the code cache for the first MB of each
row but isn't used for the following MBs.
author | michael |
---|---|
date | Thu, 28 Jan 2010 01:24:25 +0000 |
parents | f5678fb91140 |
children | b5577677b97d |
comparison
equal
deleted
inserted
replaced
11031:b37a53d101c3 | 11032:01bd040f8607 |
---|---|
474 | 474 |
475 start = 1; | 475 start = 1; |
476 } | 476 } |
477 | 477 |
478 /* Calculate bS */ | 478 /* Calculate bS */ |
479 for( edge = start; edge < edges; edge++ ) { | 479 if(start==0) { |
480 /* mbn_xy: neighbor macroblock */ | |
481 const int mbn_xy = edge > 0 ? mb_xy : mbm_xy; | |
482 const int mbn_type = s->current_picture.mb_type[mbn_xy]; | |
483 DECLARE_ALIGNED_8(int16_t, bS)[4]; | 480 DECLARE_ALIGNED_8(int16_t, bS)[4]; |
484 int qp; | 481 int qp; |
485 | 482 |
486 if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) | 483 if( IS_INTRA(mb_type|mbm_type)) { |
487 continue; | |
488 | |
489 if( IS_INTRA(mb_type|mbn_type)) { | |
490 *(uint64_t*)bS= 0x0003000300030003ULL; | 484 *(uint64_t*)bS= 0x0003000300030003ULL; |
491 if (edge == 0) { | 485 if ( (!IS_INTERLACED(mb_type|mbm_type)) |
492 if ( (!IS_INTERLACED(mb_type|mbm_type)) | 486 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0)) |
493 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0)) | 487 ) |
494 ) | 488 *(uint64_t*)bS= 0x0004000400040004ULL; |
495 *(uint64_t*)bS= 0x0004000400040004ULL; | |
496 } | |
497 } else { | 489 } else { |
498 int i, l; | 490 int i, l; |
499 int mv_done; | 491 int mv_done; |
500 | 492 |
501 if( edge & mask_edge ) { | 493 if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbm_type)) { //FIXME not posible left |
502 *(uint64_t*)bS= 0; | |
503 mv_done = 1; | |
504 } | |
505 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) { | |
506 *(uint64_t*)bS= 0x0001000100010001ULL; | 494 *(uint64_t*)bS= 0x0001000100010001ULL; |
507 mv_done = 1; | 495 mv_done = 1; |
508 } | 496 } |
509 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { | 497 else if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { |
510 int b_idx= 8 + 4 + edge * (dir ? 8:1); | 498 int b_idx= 8 + 4; |
511 int bn_idx= b_idx - (dir ? 8:1); | 499 int bn_idx= b_idx - (dir ? 8:1); |
512 int v = 0; | 500 int v = 0; |
513 | 501 |
514 for( l = 0; !v && l < h->list_count; l++ ) { | 502 for( l = 0; !v && l < h->list_count; l++ ) { |
515 v |= h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] | | 503 v |= h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] | |
532 } | 520 } |
533 else | 521 else |
534 mv_done = 0; | 522 mv_done = 0; |
535 | 523 |
536 for( i = 0; i < 4; i++ ) { | 524 for( i = 0; i < 4; i++ ) { |
537 int x = dir == 0 ? edge : i; | 525 int x = dir == 0 ? 0 : i; |
538 int y = dir == 0 ? i : edge; | 526 int y = dir == 0 ? i : 0; |
539 int b_idx= 8 + 4 + x + 8*y; | 527 int b_idx= 8 + 4 + x + 8*y; |
540 int bn_idx= b_idx - (dir ? 8:1); | 528 int bn_idx= b_idx - (dir ? 8:1); |
541 | 529 |
542 if( h->non_zero_count_cache[b_idx] | | 530 if( h->non_zero_count_cache[b_idx] | |
543 h->non_zero_count_cache[bn_idx] ) { | 531 h->non_zero_count_cache[bn_idx] ) { |
567 } | 555 } |
568 } | 556 } |
569 } | 557 } |
570 } | 558 } |
571 } | 559 } |
572 | |
573 if(bS[0]+bS[1]+bS[2]+bS[3] == 0) | |
574 continue; | |
575 } | 560 } |
576 | 561 |
577 /* Filter edge */ | 562 /* Filter edge */ |
578 // Do not use s->qscale as luma quantizer because it has not the same | 563 // Do not use s->qscale as luma quantizer because it has not the same |
579 // value in IPCM macroblocks. | 564 // value in IPCM macroblocks. |
580 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1; | 565 if(bS[0]+bS[1]+bS[2]+bS[3]){ |
566 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbm_xy] + 1 ) >> 1; | |
567 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]); | |
568 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize); | |
569 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); } | |
570 if( dir == 0 ) { | |
571 filter_mb_edgev( &img_y[0], linesize, bS, qp, h ); | |
572 { | |
573 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1; | |
574 filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp, h); | |
575 if(h->pps.chroma_qp_diff) | |
576 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1; | |
577 filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp, h); | |
578 } | |
579 } else { | |
580 filter_mb_edgeh( &img_y[0], linesize, bS, qp, h ); | |
581 { | |
582 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1; | |
583 filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp, h); | |
584 if(h->pps.chroma_qp_diff) | |
585 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1; | |
586 filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp, h); | |
587 } | |
588 } | |
589 } | |
590 } | |
591 /* Calculate bS */ | |
592 for( edge = 1; edge < edges; edge++ ) { | |
593 DECLARE_ALIGNED_8(int16_t, bS)[4]; | |
594 int qp; | |
595 | |
596 if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) | |
597 continue; | |
598 | |
599 if( IS_INTRA(mb_type)) { | |
600 *(uint64_t*)bS= 0x0003000300030003ULL; | |
601 } else { | |
602 int i, l; | |
603 int mv_done; | |
604 | |
605 if( edge & mask_edge ) { | |
606 *(uint64_t*)bS= 0; | |
607 mv_done = 1; | |
608 } | |
609 else if( mask_par0 ) { | |
610 int b_idx= 8 + 4 + edge * (dir ? 8:1); | |
611 int bn_idx= b_idx - (dir ? 8:1); | |
612 int v = 0; | |
613 | |
614 for( l = 0; !v && l < h->list_count; l++ ) { | |
615 v |= h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] | | |
616 h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] + 3 >= 7U | | |
617 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit; | |
618 } | |
619 | |
620 if(h->list_count==2 && v){ | |
621 v=0; | |
622 for( l = 0; !v && l < 2; l++ ) { | |
623 int ln= 1-l; | |
624 v |= h->ref_cache[l][b_idx] != h->ref_cache[ln][bn_idx] | | |
625 h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] + 3 >= 7U | | |
626 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit; | |
627 } | |
628 } | |
629 | |
630 bS[0] = bS[1] = bS[2] = bS[3] = v; | |
631 mv_done = 1; | |
632 } | |
633 else | |
634 mv_done = 0; | |
635 | |
636 for( i = 0; i < 4; i++ ) { | |
637 int x = dir == 0 ? edge : i; | |
638 int y = dir == 0 ? i : edge; | |
639 int b_idx= 8 + 4 + x + 8*y; | |
640 int bn_idx= b_idx - (dir ? 8:1); | |
641 | |
642 if( h->non_zero_count_cache[b_idx] | | |
643 h->non_zero_count_cache[bn_idx] ) { | |
644 bS[i] = 2; | |
645 } | |
646 else if(!mv_done) | |
647 { | |
648 bS[i] = 0; | |
649 for( l = 0; l < h->list_count; l++ ) { | |
650 if( h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] | | |
651 h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] + 3 >= 7U | | |
652 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) { | |
653 bS[i] = 1; | |
654 break; | |
655 } | |
656 } | |
657 | |
658 if(h->list_count == 2 && bS[i]){ | |
659 bS[i] = 0; | |
660 for( l = 0; l < 2; l++ ) { | |
661 int ln= 1-l; | |
662 if( h->ref_cache[l][b_idx] != h->ref_cache[ln][bn_idx] | | |
663 h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] + 3 >= 7U | | |
664 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) { | |
665 bS[i] = 1; | |
666 break; | |
667 } | |
668 } | |
669 } | |
670 } | |
671 } | |
672 | |
673 if(bS[0]+bS[1]+bS[2]+bS[3] == 0) | |
674 continue; | |
675 } | |
676 | |
677 /* Filter edge */ | |
678 // Do not use s->qscale as luma quantizer because it has not the same | |
679 // value in IPCM macroblocks. | |
680 qp = s->current_picture.qscale_table[mb_xy]; | |
581 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]); | 681 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]); |
582 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize); | 682 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize); |
583 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); } | 683 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); } |
584 if( dir == 0 ) { | 684 if( dir == 0 ) { |
585 filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, h ); | 685 filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, h ); |
586 if( (edge&1) == 0 ) { | 686 if( (edge&1) == 0 ) { |
587 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; | 687 filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, h->chroma_qp[0], h); |
588 filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, qp, h); | 688 filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, h->chroma_qp[1], h); |
589 if(h->pps.chroma_qp_diff) | |
590 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; | |
591 filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, qp, h); | |
592 } | 689 } |
593 } else { | 690 } else { |
594 filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h ); | 691 filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h ); |
595 if( (edge&1) == 0 ) { | 692 if( (edge&1) == 0 ) { |
596 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; | 693 filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h); |
597 filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, qp, h); | 694 filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h); |
598 if(h->pps.chroma_qp_diff) | |
599 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; | |
600 filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, qp, h); | |
601 } | 695 } |
602 } | 696 } |
603 } | 697 } |
604 } | 698 } |
605 | 699 |