Mercurial > libavcodec.hg
comparison h264.c @ 3105:2d35fb3cb940 libavcodec
h264: special case dc-only idct. ~1% faster overall
author | lorenm |
---|---|
date | Fri, 10 Feb 2006 06:55:25 +0000 |
parents | 32903d6b5ef1 |
children | 16f9d33c027d |
comparison legend: equal, deleted, inserted, replaced
3104:78d6bfc238f3 | 3105:2d35fb3cb940 |
---|---|
3312 int i; | 3312 int i; |
3313 int *block_offset = &h->block_offset[0]; | 3313 int *block_offset = &h->block_offset[0]; |
3314 const unsigned int bottom = mb_y & 1; | 3314 const unsigned int bottom = mb_y & 1; |
3315 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass); | 3315 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass); |
3316 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); | 3316 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); |
| | 3317 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); |
3317 | 3318 |
3318 if(!s->decode) | 3319 if(!s->decode) |
3319 return; | 3320 return; |
3320 | 3321 |
3321 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16; | 3322 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16; |
3335 linesize = s->linesize; | 3336 linesize = s->linesize; |
3336 uvlinesize = s->uvlinesize; | 3337 uvlinesize = s->uvlinesize; |
3337 // dct_offset = s->linesize * 16; | 3338 // dct_offset = s->linesize * 16; |
3338 } | 3339 } |
3339 | 3340 |
3340 idct_add = transform_bypass | 3341 if(transform_bypass){ |
3341 ? IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4 | 3342 idct_dc_add = |
3342 : IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add; | 3343 idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4; |
| | 3344 }else if(IS_8x8DCT(mb_type)){ |
| | 3345 idct_dc_add = s->dsp.h264_idct8_dc_add; |
| | 3346 idct_add = s->dsp.h264_idct8_add; |
| | 3347 }else{ |
| | 3348 idct_dc_add = s->dsp.h264_idct_dc_add; |
| | 3349 idct_add = s->dsp.h264_idct_add; |
| | 3350 } |
3343 | 3351 |
3344 if (IS_INTRA_PCM(mb_type)) { | 3352 if (IS_INTRA_PCM(mb_type)) { |
3345 unsigned int x, y; | 3353 unsigned int x, y; |
3346 | 3354 |
3347 // The pixels are stored in h->mb array in the same order as levels, | 3355 // The pixels are stored in h->mb array in the same order as levels, |
3387 if(!s->encoding){ | 3395 if(!s->encoding){ |
3388 if(IS_8x8DCT(mb_type)){ | 3396 if(IS_8x8DCT(mb_type)){ |
3389 for(i=0; i<16; i+=4){ | 3397 for(i=0; i<16; i+=4){ |
3390 uint8_t * const ptr= dest_y + block_offset[i]; | 3398 uint8_t * const ptr= dest_y + block_offset[i]; |
3391 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; | 3399 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; |
| | 3400 const int nnz = h->non_zero_count_cache[ scan8[i] ]; |
3392 h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000, | 3401 h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000, |
3393 (h->topright_samples_available<<(i+1))&0x8000, linesize); | 3402 (h->topright_samples_available<<(i+1))&0x8000, linesize); |
3394 if(h->non_zero_count_cache[ scan8[i] ]) | 3403 if(nnz){ |
3395 idct_add(ptr, h->mb + i*16, linesize); | 3404 if(nnz == 1 && h->mb[i*16]) |
| | 3405 idct_dc_add(ptr, h->mb + i*16, linesize); |
| | 3406 else |
| | 3407 idct_add(ptr, h->mb + i*16, linesize); |
| | 3408 } |
3396 } | 3409 } |
3397 }else | 3410 }else |
3398 for(i=0; i<16; i++){ | 3411 for(i=0; i<16; i++){ |
3399 uint8_t * const ptr= dest_y + block_offset[i]; | 3412 uint8_t * const ptr= dest_y + block_offset[i]; |
3400 uint8_t *topright; | 3413 uint8_t *topright; |
3401 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; | 3414 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; |
3402 int tr; | 3415 int nnz, tr; |
3403 | 3416 |
3404 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ | 3417 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ |
3405 const int topright_avail= (h->topright_samples_available<<i)&0x8000; | 3418 const int topright_avail= (h->topright_samples_available<<i)&0x8000; |
3406 assert(mb_y || linesize <= block_offset[i]); | 3419 assert(mb_y || linesize <= block_offset[i]); |
3407 if(!topright_avail){ | 3420 if(!topright_avail){ |
3411 topright= ptr + 4 - linesize; | 3424 topright= ptr + 4 - linesize; |
3412 }else | 3425 }else |
3413 topright= NULL; | 3426 topright= NULL; |
3414 | 3427 |
3415 h->pred4x4[ dir ](ptr, topright, linesize); | 3428 h->pred4x4[ dir ](ptr, topright, linesize); |
3416 if(h->non_zero_count_cache[ scan8[i] ]){ | 3429 nnz = h->non_zero_count_cache[ scan8[i] ]; |
3417 if(s->codec_id == CODEC_ID_H264) | 3430 if(nnz){ |
3418 idct_add(ptr, h->mb + i*16, linesize); | 3431 if(s->codec_id == CODEC_ID_H264){ |
3419 else | 3432 if(nnz == 1 && h->mb[i*16]) |
| | 3433 idct_dc_add(ptr, h->mb + i*16, linesize); |
| | 3434 else |
| | 3435 idct_add(ptr, h->mb + i*16, linesize); |
| | 3436 }else |
3420 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0); | 3437 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0); |
3421 } | 3438 } |
3422 } | 3439 } |
3423 } | 3440 } |
3424 }else{ | 3441 }else{ |
3451 } | 3468 } |
3452 | 3469 |
3453 | 3470 |
3454 if(!IS_INTRA4x4(mb_type)){ | 3471 if(!IS_INTRA4x4(mb_type)){ |
3455 if(s->codec_id == CODEC_ID_H264){ | 3472 if(s->codec_id == CODEC_ID_H264){ |
3456 const int di = IS_8x8DCT(mb_type) ? 4 : 1; | 3473 if(IS_INTRA16x16(mb_type)){ |
3457 for(i=0; i<16; i+=di){ | 3474 for(i=0; i<16; i++){ |
3458 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below | 3475 if(h->non_zero_count_cache[ scan8[i] ]) |
3459 uint8_t * const ptr= dest_y + block_offset[i]; | 3476 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize); |
3460 idct_add(ptr, h->mb + i*16, linesize); | 3477 else if(h->mb[i*16]) |
| | 3478 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize); |
| | 3479 } |
| | 3480 }else{ |
| | 3481 const int di = IS_8x8DCT(mb_type) ? 4 : 1; |
| | 3482 for(i=0; i<16; i+=di){ |
| | 3483 int nnz = h->non_zero_count_cache[ scan8[i] ]; |
| | 3484 if(nnz){ |
| | 3485 if(nnz==1 && h->mb[i*16]) |
| | 3486 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize); |
| | 3487 else |
| | 3488 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize); |
| | 3489 } |
3461 } | 3490 } |
3462 } | 3491 } |
3463 }else{ | 3492 }else{ |
3464 for(i=0; i<16; i++){ | 3493 for(i=0; i<16; i++){ |
3465 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below | 3494 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below |
3469 } | 3498 } |
3470 } | 3499 } |
3471 } | 3500 } |
3472 | 3501 |
3473 if(!(s->flags&CODEC_FLAG_GRAY)){ | 3502 if(!(s->flags&CODEC_FLAG_GRAY)){ |
3474 idct_add = transform_bypass ? s->dsp.add_pixels4 : s->dsp.h264_idct_add; | 3503 uint8_t *dest[2] = {dest_cb, dest_cr}; |
3475 if(!transform_bypass){ | 3504 if(transform_bypass){ |
| | 3505 idct_add = idct_dc_add = s->dsp.add_pixels4; |
| | 3506 }else{ |
| | 3507 idct_add = s->dsp.h264_idct_add; |
| | 3508 idct_dc_add = s->dsp.h264_idct_dc_add; |
3476 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]); | 3509 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]); |
3477 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]); | 3510 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]); |
3478 } | 3511 } |
3479 if(s->codec_id == CODEC_ID_H264){ | 3512 if(s->codec_id == CODEC_ID_H264){ |
3480 for(i=16; i<16+4; i++){ | 3513 for(i=16; i<16+8; i++){ |
| | 3514 if(h->non_zero_count_cache[ scan8[i] ]) |
| | 3515 idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); |
| | 3516 else if(h->mb[i*16]) |
| | 3517 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); |
| | 3518 } |
| | 3519 }else{ |
| | 3520 for(i=16; i<16+8; i++){ |
3481 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ | 3521 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ |
3482 uint8_t * const ptr= dest_cb + block_offset[i]; | 3522 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i]; |
3483 idct_add(ptr, h->mb + i*16, uvlinesize); | |
3484 } | |
3485 } | |
3486 for(i=20; i<20+4; i++){ | |
3487 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ | |
3488 uint8_t * const ptr= dest_cr + block_offset[i]; | |
3489 idct_add(ptr, h->mb + i*16, uvlinesize); | |
3490 } | |
3491 } | |
3492 }else{ | |
3493 for(i=16; i<16+4; i++){ | |
3494 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ | |
3495 uint8_t * const ptr= dest_cb + block_offset[i]; | |
3496 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); | |
3497 } | |
3498 } | |
3499 for(i=20; i<20+4; i++){ | |
3500 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ | |
3501 uint8_t * const ptr= dest_cr + block_offset[i]; | |
3502 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); | 3523 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); |
3503 } | 3524 } |
3504 } | 3525 } |
3505 } | 3526 } |
3506 } | 3527 } |
5129 if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4, | 5150 if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4, |
5130 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 ) | 5151 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 ) |
5131 return -1; | 5152 return -1; |
5132 } | 5153 } |
5133 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ]; | 5154 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ]; |
5134 nnz[0] |= nnz[1] | nnz[8] | nnz[9]; | 5155 nnz[0] += nnz[1] + nnz[8] + nnz[9]; |
5135 }else{ | 5156 }else{ |
5136 for(i4x4=0; i4x4<4; i4x4++){ | 5157 for(i4x4=0; i4x4<4; i4x4++){ |
5137 const int index= i4x4 + 4*i8x8; | 5158 const int index= i4x4 + 4*i8x8; |
5138 | 5159 |
5139 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){ | 5160 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){ |
5688 h->cbp_table[mb_xy] |= 0x40 << n; | 5709 h->cbp_table[mb_xy] |= 0x40 << n; |
5689 else if( cat == 4 ) | 5710 else if( cat == 4 ) |
5690 h->non_zero_count_cache[scan8[16+n]] = coeff_count; | 5711 h->non_zero_count_cache[scan8[16+n]] = coeff_count; |
5691 else { | 5712 else { |
5692 assert( cat == 5 ); | 5713 assert( cat == 5 ); |
5693 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1); | 5714 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); |
5694 } | 5715 } |
5695 | 5716 |
5696 for( i = coeff_count - 1; i >= 0; i-- ) { | 5717 for( i = coeff_count - 1; i >= 0; i-- ) { |
5697 uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base; | 5718 uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base; |
5698 int j= scantable[index[i]]; | 5719 int j= scantable[index[i]]; |