comparison h264_loopfilter.c @ 10906:1b5fba731e24 libavcodec

Rearchitecting the stitched-up goose, part 1: run the loop filter per row instead of per MB. This should also make it much easier to switch to per-frame filtering, and to doing so in a separate thread in the future if some volunteer wants to try. Overall decoding speedup of 1.7% (single thread on Pentium Dual / cathedral sample). This change also allows some optimizations to be tried that would not have been possible before.
author michael
date Sun, 17 Jan 2010 20:35:55 +0000
parents b30aef0c693f
children f4cf3960b8c6
comparison
equal deleted inserted replaced
10905:06d20a468d1e 10906:1b5fba731e24
618 618
619 /* Filter edge */ 619 /* Filter edge */
620 // Do not use s->qscale as luma quantizer because it has not the same 620 // Do not use s->qscale as luma quantizer because it has not the same
621 // value in IPCM macroblocks. 621 // value in IPCM macroblocks.
622 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1; 622 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
623 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]); 623 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]);
624 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize); 624 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
625 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); } 625 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
626 if( dir == 0 ) { 626 if( dir == 0 ) {
627 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp ); 627 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
628 if( (edge&1) == 0 ) { 628 if( (edge&1) == 0 ) {
648 const int mb_xy= mb_x + mb_y*s->mb_stride; 648 const int mb_xy= mb_x + mb_y*s->mb_stride;
649 const int mb_type = s->current_picture.mb_type[mb_xy]; 649 const int mb_type = s->current_picture.mb_type[mb_xy];
650 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4; 650 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
651 int first_vertical_edge_done = 0; 651 int first_vertical_edge_done = 0;
652 av_unused int dir; 652 av_unused int dir;
653 int list;
653 654
654 //for sufficiently low qp, filtering wouldn't do anything 655 //for sufficiently low qp, filtering wouldn't do anything
655 //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp 656 //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
656 if(!FRAME_MBAFF){ 657 if(!FRAME_MBAFF){
657 int qp_thresh = h->qp_thresh; 658 int qp_thresh = h->qp_thresh;
661 && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){ 662 && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
662 return; 663 return;
663 } 664 }
664 } 665 }
665 666
667 h->non_zero_count_cache[7+8*1]=h->non_zero_count[mb_xy][0];
668 h->non_zero_count_cache[7+8*2]=h->non_zero_count[mb_xy][1];
669 h->non_zero_count_cache[7+8*3]=h->non_zero_count[mb_xy][2];
670 h->non_zero_count_cache[7+8*4]=h->non_zero_count[mb_xy][3];
671 h->non_zero_count_cache[4+8*4]=h->non_zero_count[mb_xy][4];
672 h->non_zero_count_cache[5+8*4]=h->non_zero_count[mb_xy][5];
673 h->non_zero_count_cache[6+8*4]=h->non_zero_count[mb_xy][6];
674
675 h->non_zero_count_cache[1+8*2]=h->non_zero_count[mb_xy][9];
676 h->non_zero_count_cache[2+8*2]=h->non_zero_count[mb_xy][8];
677 h->non_zero_count_cache[2+8*1]=h->non_zero_count[mb_xy][7];
678
679 h->non_zero_count_cache[1+8*5]=h->non_zero_count[mb_xy][12];
680 h->non_zero_count_cache[2+8*5]=h->non_zero_count[mb_xy][11];
681 h->non_zero_count_cache[2+8*4]=h->non_zero_count[mb_xy][10];
682
683 h->non_zero_count_cache[6+8*1]=h->non_zero_count[mb_xy][13];
684 h->non_zero_count_cache[6+8*2]=h->non_zero_count[mb_xy][14];
685 h->non_zero_count_cache[6+8*3]=h->non_zero_count[mb_xy][15];
686 h->non_zero_count_cache[5+8*1]=h->non_zero_count[mb_xy][16];
687 h->non_zero_count_cache[5+8*2]=h->non_zero_count[mb_xy][17];
688 h->non_zero_count_cache[5+8*3]=h->non_zero_count[mb_xy][18];
689 h->non_zero_count_cache[4+8*1]=h->non_zero_count[mb_xy][19];
690 h->non_zero_count_cache[4+8*2]=h->non_zero_count[mb_xy][20];
691 h->non_zero_count_cache[4+8*3]=h->non_zero_count[mb_xy][21];
692
693 h->non_zero_count_cache[1+8*1]=h->non_zero_count[mb_xy][22];
694 h->non_zero_count_cache[1+8*4]=h->non_zero_count[mb_xy][23];
695
666 // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs 696 // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
667 if(!h->pps.cabac && h->pps.transform_8x8_mode){ 697 if(!h->pps.cabac && h->pps.transform_8x8_mode){
668 int top_type, left_type[2]; 698 int top_type, left_type[2];
669 top_type = s->current_picture.mb_type[h->top_mb_xy] ; 699 top_type = s->current_picture.mb_type[h->top_mb_xy] ;
670 left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]]; 700 left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
685 h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF 715 h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
686 } 716 }
687 717
688 if(IS_8x8DCT(mb_type)){ 718 if(IS_8x8DCT(mb_type)){
689 h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]= 719 h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
690 h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp & 1; 720 h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp_table[mb_xy] & 1;
691 721
692 h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]= 722 h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
693 h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2; 723 h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
694 724
695 h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]= 725 h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
696 h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4; 726 h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
697 727
698 h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]= 728 h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
699 h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8; 729 h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
700 } 730 }
701 } 731 }
702 732
703 if (FRAME_MBAFF 733 if (FRAME_MBAFF
704 // left mb is in picture 734 // left mb is in picture