Mercurial > libavcodec.hg
comparison h264.c @ 4587:644687c58928 libavcodec
H.264 optimization: add a fast path for the common case in hl_decode_mb(); patch by Alexander Strange (astrange ithinksw com)
Benchmark: hl_decode_mb() drops from 144095 to 142319 dezicycles on a Duron
Trailing whitespace removed by the committer (michael)
author | michael |
---|---|
date | Sat, 24 Feb 2007 00:58:28 +0000 |
parents | c140a3360e18 |
children | 9fc8acda3223 |
comparison
equal
deleted
inserted
replaced
4586:365f2fed8461 | 4587:644687c58928 |
---|---|
3495 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1); | 3495 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1); |
3496 } | 3496 } |
3497 } | 3497 } |
3498 } | 3498 } |
3499 | 3499 |
3500 static void hl_decode_mb(H264Context *h){ | 3500 static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){ |
3501 MpegEncContext * const s = &h->s; | 3501 MpegEncContext * const s = &h->s; |
3502 const int mb_x= s->mb_x; | 3502 const int mb_x= s->mb_x; |
3503 const int mb_y= s->mb_y; | 3503 const int mb_y= s->mb_y; |
3504 const int mb_xy= mb_x + mb_y*s->mb_stride; | 3504 const int mb_xy= mb_x + mb_y*s->mb_stride; |
3505 const int mb_type= s->current_picture.mb_type[mb_xy]; | 3505 const int mb_type= s->current_picture.mb_type[mb_xy]; |
3506 uint8_t *dest_y, *dest_cb, *dest_cr; | 3506 uint8_t *dest_y, *dest_cb, *dest_cr; |
3507 int linesize, uvlinesize /*dct_offset*/; | 3507 int linesize, uvlinesize /*dct_offset*/; |
3508 int i; | 3508 int i; |
3509 int *block_offset = &h->block_offset[0]; | 3509 int *block_offset = &h->block_offset[0]; |
3510 const unsigned int bottom = mb_y & 1; | 3510 const unsigned int bottom = mb_y & 1; |
3511 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass); | 3511 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264); |
3512 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); | 3512 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); |
3513 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); | 3513 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); |
3514 | |
3515 if(!s->decode) | |
3516 return; | |
3517 | 3514 |
3518 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16; | 3515 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16; |
3519 dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8; | 3516 dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8; |
3520 dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8; | 3517 dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8; |
3521 | 3518 |
3522 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4); | 3519 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4); |
3523 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2); | 3520 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2); |
3524 | 3521 |
3525 if (MB_FIELD) { | 3522 if (!simple && MB_FIELD) { |
3526 linesize = h->mb_linesize = s->linesize * 2; | 3523 linesize = h->mb_linesize = s->linesize * 2; |
3527 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2; | 3524 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2; |
3528 block_offset = &h->block_offset[24]; | 3525 block_offset = &h->block_offset[24]; |
3529 if(mb_y&1){ //FIXME move out of this func? | 3526 if(mb_y&1){ //FIXME move out of this func? |
3530 dest_y -= s->linesize*15; | 3527 dest_y -= s->linesize*15; |
3564 }else{ | 3561 }else{ |
3565 idct_dc_add = s->dsp.h264_idct_dc_add; | 3562 idct_dc_add = s->dsp.h264_idct_dc_add; |
3566 idct_add = s->dsp.h264_idct_add; | 3563 idct_add = s->dsp.h264_idct_add; |
3567 } | 3564 } |
3568 | 3565 |
3569 if(FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type) | 3566 if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type) |
3570 && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){ | 3567 && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){ |
3571 int mbt_y = mb_y&~1; | 3568 int mbt_y = mb_y&~1; |
3572 uint8_t *top_y = s->current_picture.data[0] + (mbt_y * 16* s->linesize ) + mb_x * 16; | 3569 uint8_t *top_y = s->current_picture.data[0] + (mbt_y * 16* s->linesize ) + mb_x * 16; |
3573 uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8; | 3570 uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8; |
3574 uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8; | 3571 uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8; |
3575 xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1); | 3572 xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1); |
3576 } | 3573 } |
3577 | 3574 |
3578 if (IS_INTRA_PCM(mb_type)) { | 3575 if (!simple && IS_INTRA_PCM(mb_type)) { |
3579 unsigned int x, y; | 3576 unsigned int x, y; |
3580 | 3577 |
3581 // The pixels are stored in h->mb array in the same order as levels, | 3578 // The pixels are stored in h->mb array in the same order as levels, |
3582 // copy them in output in the correct order. | 3579 // copy them in output in the correct order. |
3583 for(i=0; i<16; i++) { | 3580 for(i=0; i<16; i++) { |
3601 } | 3598 } |
3602 } | 3599 } |
3603 } | 3600 } |
3604 } else { | 3601 } else { |
3605 if(IS_INTRA(mb_type)){ | 3602 if(IS_INTRA(mb_type)){ |
3606 if(h->deblocking_filter && !FRAME_MBAFF) | 3603 if(h->deblocking_filter && (simple || !FRAME_MBAFF)) |
3607 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1); | 3604 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1); |
3608 | 3605 |
3609 if(!(s->flags&CODEC_FLAG_GRAY)){ | 3606 if(simple || !(s->flags&CODEC_FLAG_GRAY)){ |
3610 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize); | 3607 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize); |
3611 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize); | 3608 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize); |
3612 } | 3609 } |
3613 | 3610 |
3614 if(IS_INTRA4x4(mb_type)){ | 3611 if(IS_INTRA4x4(mb_type)){ |
3615 if(!s->encoding){ | 3612 if(simple || !s->encoding){ |
3616 if(IS_8x8DCT(mb_type)){ | 3613 if(IS_8x8DCT(mb_type)){ |
3617 for(i=0; i<16; i+=4){ | 3614 for(i=0; i<16; i+=4){ |
3618 uint8_t * const ptr= dest_y + block_offset[i]; | 3615 uint8_t * const ptr= dest_y + block_offset[i]; |
3619 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; | 3616 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; |
3620 const int nnz = h->non_zero_count_cache[ scan8[i] ]; | 3617 const int nnz = h->non_zero_count_cache[ scan8[i] ]; |
3646 topright= NULL; | 3643 topright= NULL; |
3647 | 3644 |
3648 h->pred4x4[ dir ](ptr, topright, linesize); | 3645 h->pred4x4[ dir ](ptr, topright, linesize); |
3649 nnz = h->non_zero_count_cache[ scan8[i] ]; | 3646 nnz = h->non_zero_count_cache[ scan8[i] ]; |
3650 if(nnz){ | 3647 if(nnz){ |
3651 if(s->codec_id == CODEC_ID_H264){ | 3648 if(is_h264){ |
3652 if(nnz == 1 && h->mb[i*16]) | 3649 if(nnz == 1 && h->mb[i*16]) |
3653 idct_dc_add(ptr, h->mb + i*16, linesize); | 3650 idct_dc_add(ptr, h->mb + i*16, linesize); |
3654 else | 3651 else |
3655 idct_add(ptr, h->mb + i*16, linesize); | 3652 idct_add(ptr, h->mb + i*16, linesize); |
3656 }else | 3653 }else |
3658 } | 3655 } |
3659 } | 3656 } |
3660 } | 3657 } |
3661 }else{ | 3658 }else{ |
3662 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize); | 3659 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize); |
3663 if(s->codec_id == CODEC_ID_H264){ | 3660 if(is_h264){ |
3664 if(!transform_bypass) | 3661 if(!transform_bypass) |
3665 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]); | 3662 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]); |
3666 }else | 3663 }else |
3667 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale); | 3664 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale); |
3668 } | 3665 } |
3669 if(h->deblocking_filter && !FRAME_MBAFF) | 3666 if(h->deblocking_filter && (simple || !FRAME_MBAFF)) |
3670 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); | 3667 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); |
3671 }else if(s->codec_id == CODEC_ID_H264){ | 3668 }else if(is_h264){ |
3672 hl_motion(h, dest_y, dest_cb, dest_cr, | 3669 hl_motion(h, dest_y, dest_cb, dest_cr, |
3673 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, | 3670 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, |
3674 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, | 3671 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, |
3675 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab); | 3672 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab); |
3676 } | 3673 } |
3677 | 3674 |
3678 | 3675 |
3679 if(!IS_INTRA4x4(mb_type)){ | 3676 if(!IS_INTRA4x4(mb_type)){ |
3680 if(s->codec_id == CODEC_ID_H264){ | 3677 if(is_h264){ |
3681 if(IS_INTRA16x16(mb_type)){ | 3678 if(IS_INTRA16x16(mb_type)){ |
3682 for(i=0; i<16; i++){ | 3679 for(i=0; i<16; i++){ |
3683 if(h->non_zero_count_cache[ scan8[i] ]) | 3680 if(h->non_zero_count_cache[ scan8[i] ]) |
3684 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize); | 3681 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize); |
3685 else if(h->mb[i*16]) | 3682 else if(h->mb[i*16]) |
3705 } | 3702 } |
3706 } | 3703 } |
3707 } | 3704 } |
3708 } | 3705 } |
3709 | 3706 |
3710 if(!(s->flags&CODEC_FLAG_GRAY)){ | 3707 if(simple || !(s->flags&CODEC_FLAG_GRAY)){ |
3711 uint8_t *dest[2] = {dest_cb, dest_cr}; | 3708 uint8_t *dest[2] = {dest_cb, dest_cr}; |
3712 if(transform_bypass){ | 3709 if(transform_bypass){ |
3713 idct_add = idct_dc_add = s->dsp.add_pixels4; | 3710 idct_add = idct_dc_add = s->dsp.add_pixels4; |
3714 }else{ | 3711 }else{ |
3715 idct_add = s->dsp.h264_idct_add; | 3712 idct_add = s->dsp.h264_idct_add; |
3716 idct_dc_add = s->dsp.h264_idct_dc_add; | 3713 idct_dc_add = s->dsp.h264_idct_dc_add; |
3717 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]); | 3714 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]); |
3718 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]); | 3715 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]); |
3719 } | 3716 } |
3720 if(s->codec_id == CODEC_ID_H264){ | 3717 if(is_h264){ |
3721 for(i=16; i<16+8; i++){ | 3718 for(i=16; i<16+8; i++){ |
3722 if(h->non_zero_count_cache[ scan8[i] ]) | 3719 if(h->non_zero_count_cache[ scan8[i] ]) |
3723 idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); | 3720 idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); |
3724 else if(h->mb[i*16]) | 3721 else if(h->mb[i*16]) |
3725 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); | 3722 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); |
3733 } | 3730 } |
3734 } | 3731 } |
3735 } | 3732 } |
3736 } | 3733 } |
3737 if(h->deblocking_filter) { | 3734 if(h->deblocking_filter) { |
3738 if (FRAME_MBAFF) { | 3735 if (!simple && FRAME_MBAFF) { |
3739 //FIXME try deblocking one mb at a time? | 3736 //FIXME try deblocking one mb at a time? |
3740 // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border | 3737 // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border |
3741 const int mb_y = s->mb_y - 1; | 3738 const int mb_y = s->mb_y - 1; |
3742 uint8_t *pair_dest_y, *pair_dest_cb, *pair_dest_cr; | 3739 uint8_t *pair_dest_y, *pair_dest_cb, *pair_dest_cr; |
3743 const int mb_xy= mb_x + mb_y*s->mb_stride; | 3740 const int mb_xy= mb_x + mb_y*s->mb_stride; |
3770 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize); | 3767 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize); |
3771 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb | 3768 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb |
3772 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize); | 3769 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize); |
3773 } | 3770 } |
3774 } | 3771 } |
3772 } | |
3773 | |
3774 /** | |
3775 * Process a macroblock; this case avoids checks for expensive uncommon cases. | |
3776 */ | |
3777 static void hl_decode_mb_simple(H264Context *h){ | |
3778 hl_decode_mb_internal(h, 1); | |
3779 } | |
3780 | |
3781 /** | |
3782 * Process a macroblock; this handles edge cases, such as interlacing. | |
3783 */ | |
3784 static void av_noinline hl_decode_mb_complex(H264Context *h){ | |
3785 hl_decode_mb_internal(h, 0); | |
3786 } | |
3787 | |
3788 static void hl_decode_mb(H264Context *h){ | |
3789 MpegEncContext * const s = &h->s; | |
3790 const int mb_x= s->mb_x; | |
3791 const int mb_y= s->mb_y; | |
3792 const int mb_xy= mb_x + mb_y*s->mb_stride; | |
3793 const int mb_type= s->current_picture.mb_type[mb_xy]; | |
3794 int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (s->flags&CODEC_FLAG_GRAY) || s->encoding; | |
3795 | |
3796 if(!s->decode) | |
3797 return; | |
3798 | |
3799 if (is_complex) | |
3800 hl_decode_mb_complex(h); | |
3801 else hl_decode_mb_simple(h); | |
3775 } | 3802 } |
3776 | 3803 |
3777 /** | 3804 /** |
3778 * fills the default_ref_list. | 3805 * fills the default_ref_list. |
3779 */ | 3806 */ |