comparison h264_loopfilter.c @ 11032:01bd040f8607 libavcodec

Unroll the main loop so the edge==0 case is separate. This allows many things to be simplified away. The h264 decoder is overall 1% faster with an mbaff sample and 0.1% slower with the cathedral sample, probably because the slow loop filter code must be loaded into the code cache for each first MB of each row but isn't used for the following MBs.
author michael
date Thu, 28 Jan 2010 01:24:25 +0000
parents f5678fb91140
children b5577677b97d
comparison
equal deleted inserted replaced
11031:b37a53d101c3 11032:01bd040f8607
474 474
475 start = 1; 475 start = 1;
476 } 476 }
477 477
478 /* Calculate bS */ 478 /* Calculate bS */
479 for( edge = start; edge < edges; edge++ ) { 479 if(start==0) {
480 /* mbn_xy: neighbor macroblock */
481 const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
482 const int mbn_type = s->current_picture.mb_type[mbn_xy];
483 DECLARE_ALIGNED_8(int16_t, bS)[4]; 480 DECLARE_ALIGNED_8(int16_t, bS)[4];
484 int qp; 481 int qp;
485 482
486 if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) 483 if( IS_INTRA(mb_type|mbm_type)) {
487 continue;
488
489 if( IS_INTRA(mb_type|mbn_type)) {
490 *(uint64_t*)bS= 0x0003000300030003ULL; 484 *(uint64_t*)bS= 0x0003000300030003ULL;
491 if (edge == 0) { 485 if ( (!IS_INTERLACED(mb_type|mbm_type))
492 if ( (!IS_INTERLACED(mb_type|mbm_type)) 486 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
493 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0)) 487 )
494 ) 488 *(uint64_t*)bS= 0x0004000400040004ULL;
495 *(uint64_t*)bS= 0x0004000400040004ULL;
496 }
497 } else { 489 } else {
498 int i, l; 490 int i, l;
499 int mv_done; 491 int mv_done;
500 492
501 if( edge & mask_edge ) { 493 if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbm_type)) { //FIXME not posible left
502 *(uint64_t*)bS= 0;
503 mv_done = 1;
504 }
505 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
506 *(uint64_t*)bS= 0x0001000100010001ULL; 494 *(uint64_t*)bS= 0x0001000100010001ULL;
507 mv_done = 1; 495 mv_done = 1;
508 } 496 }
509 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { 497 else if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
510 int b_idx= 8 + 4 + edge * (dir ? 8:1); 498 int b_idx= 8 + 4;
511 int bn_idx= b_idx - (dir ? 8:1); 499 int bn_idx= b_idx - (dir ? 8:1);
512 int v = 0; 500 int v = 0;
513 501
514 for( l = 0; !v && l < h->list_count; l++ ) { 502 for( l = 0; !v && l < h->list_count; l++ ) {
515 v |= h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] | 503 v |= h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] |
532 } 520 }
533 else 521 else
534 mv_done = 0; 522 mv_done = 0;
535 523
536 for( i = 0; i < 4; i++ ) { 524 for( i = 0; i < 4; i++ ) {
537 int x = dir == 0 ? edge : i; 525 int x = dir == 0 ? 0 : i;
538 int y = dir == 0 ? i : edge; 526 int y = dir == 0 ? i : 0;
539 int b_idx= 8 + 4 + x + 8*y; 527 int b_idx= 8 + 4 + x + 8*y;
540 int bn_idx= b_idx - (dir ? 8:1); 528 int bn_idx= b_idx - (dir ? 8:1);
541 529
542 if( h->non_zero_count_cache[b_idx] | 530 if( h->non_zero_count_cache[b_idx] |
543 h->non_zero_count_cache[bn_idx] ) { 531 h->non_zero_count_cache[bn_idx] ) {
567 } 555 }
568 } 556 }
569 } 557 }
570 } 558 }
571 } 559 }
572
573 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
574 continue;
575 } 560 }
576 561
577 /* Filter edge */ 562 /* Filter edge */
578 // Do not use s->qscale as luma quantizer because it has not the same 563 // Do not use s->qscale as luma quantizer because it has not the same
579 // value in IPCM macroblocks. 564 // value in IPCM macroblocks.
580 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1; 565 if(bS[0]+bS[1]+bS[2]+bS[3]){
566 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbm_xy] + 1 ) >> 1;
567 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]);
568 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
569 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
570 if( dir == 0 ) {
571 filter_mb_edgev( &img_y[0], linesize, bS, qp, h );
572 {
573 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
574 filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp, h);
575 if(h->pps.chroma_qp_diff)
576 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
577 filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp, h);
578 }
579 } else {
580 filter_mb_edgeh( &img_y[0], linesize, bS, qp, h );
581 {
582 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
583 filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp, h);
584 if(h->pps.chroma_qp_diff)
585 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
586 filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp, h);
587 }
588 }
589 }
590 }
591 /* Calculate bS */
592 for( edge = 1; edge < edges; edge++ ) {
593 DECLARE_ALIGNED_8(int16_t, bS)[4];
594 int qp;
595
596 if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
597 continue;
598
599 if( IS_INTRA(mb_type)) {
600 *(uint64_t*)bS= 0x0003000300030003ULL;
601 } else {
602 int i, l;
603 int mv_done;
604
605 if( edge & mask_edge ) {
606 *(uint64_t*)bS= 0;
607 mv_done = 1;
608 }
609 else if( mask_par0 ) {
610 int b_idx= 8 + 4 + edge * (dir ? 8:1);
611 int bn_idx= b_idx - (dir ? 8:1);
612 int v = 0;
613
614 for( l = 0; !v && l < h->list_count; l++ ) {
615 v |= h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] |
616 h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] + 3 >= 7U |
617 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
618 }
619
620 if(h->list_count==2 && v){
621 v=0;
622 for( l = 0; !v && l < 2; l++ ) {
623 int ln= 1-l;
624 v |= h->ref_cache[l][b_idx] != h->ref_cache[ln][bn_idx] |
625 h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] + 3 >= 7U |
626 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
627 }
628 }
629
630 bS[0] = bS[1] = bS[2] = bS[3] = v;
631 mv_done = 1;
632 }
633 else
634 mv_done = 0;
635
636 for( i = 0; i < 4; i++ ) {
637 int x = dir == 0 ? edge : i;
638 int y = dir == 0 ? i : edge;
639 int b_idx= 8 + 4 + x + 8*y;
640 int bn_idx= b_idx - (dir ? 8:1);
641
642 if( h->non_zero_count_cache[b_idx] |
643 h->non_zero_count_cache[bn_idx] ) {
644 bS[i] = 2;
645 }
646 else if(!mv_done)
647 {
648 bS[i] = 0;
649 for( l = 0; l < h->list_count; l++ ) {
650 if( h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] |
651 h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] + 3 >= 7U |
652 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
653 bS[i] = 1;
654 break;
655 }
656 }
657
658 if(h->list_count == 2 && bS[i]){
659 bS[i] = 0;
660 for( l = 0; l < 2; l++ ) {
661 int ln= 1-l;
662 if( h->ref_cache[l][b_idx] != h->ref_cache[ln][bn_idx] |
663 h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] + 3 >= 7U |
664 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
665 bS[i] = 1;
666 break;
667 }
668 }
669 }
670 }
671 }
672
673 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
674 continue;
675 }
676
677 /* Filter edge */
678 // Do not use s->qscale as luma quantizer because it has not the same
679 // value in IPCM macroblocks.
680 qp = s->current_picture.qscale_table[mb_xy];
581 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]); 681 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]);
582 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize); 682 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
583 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); } 683 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
584 if( dir == 0 ) { 684 if( dir == 0 ) {
585 filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, h ); 685 filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, h );
586 if( (edge&1) == 0 ) { 686 if( (edge&1) == 0 ) {
587 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; 687 filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, h->chroma_qp[0], h);
588 filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, qp, h); 688 filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, h->chroma_qp[1], h);
589 if(h->pps.chroma_qp_diff)
590 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
591 filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, qp, h);
592 } 689 }
593 } else { 690 } else {
594 filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h ); 691 filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h );
595 if( (edge&1) == 0 ) { 692 if( (edge&1) == 0 ) {
596 int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; 693 filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
597 filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, qp, h); 694 filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
598 if(h->pps.chroma_qp_diff)
599 qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
600 filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, qp, h);
601 } 695 }
602 } 696 }
603 } 697 }
604 } 698 }
605 699