comparison h264.c @ 3105:2d35fb3cb940 libavcodec

h264: special-case the DC-only IDCT. ~1% faster overall
author lorenm
date Fri, 10 Feb 2006 06:55:25 +0000
parents 32903d6b5ef1
children 16f9d33c027d
comparison
equal deleted inserted replaced
3104:78d6bfc238f3 3105:2d35fb3cb940
3312 int i; 3312 int i;
3313 int *block_offset = &h->block_offset[0]; 3313 int *block_offset = &h->block_offset[0];
3314 const unsigned int bottom = mb_y & 1; 3314 const unsigned int bottom = mb_y & 1;
3315 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass); 3315 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3316 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); 3316 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3317 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3317 3318
3318 if(!s->decode) 3319 if(!s->decode)
3319 return; 3320 return;
3320 3321
3321 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16; 3322 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
3335 linesize = s->linesize; 3336 linesize = s->linesize;
3336 uvlinesize = s->uvlinesize; 3337 uvlinesize = s->uvlinesize;
3337 // dct_offset = s->linesize * 16; 3338 // dct_offset = s->linesize * 16;
3338 } 3339 }
3339 3340
3340 idct_add = transform_bypass 3341 if(transform_bypass){
3341 ? IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4 3342 idct_dc_add =
3342 : IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add; 3343 idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3344 }else if(IS_8x8DCT(mb_type)){
3345 idct_dc_add = s->dsp.h264_idct8_dc_add;
3346 idct_add = s->dsp.h264_idct8_add;
3347 }else{
3348 idct_dc_add = s->dsp.h264_idct_dc_add;
3349 idct_add = s->dsp.h264_idct_add;
3350 }
3343 3351
3344 if (IS_INTRA_PCM(mb_type)) { 3352 if (IS_INTRA_PCM(mb_type)) {
3345 unsigned int x, y; 3353 unsigned int x, y;
3346 3354
3347 // The pixels are stored in h->mb array in the same order as levels, 3355 // The pixels are stored in h->mb array in the same order as levels,
3387 if(!s->encoding){ 3395 if(!s->encoding){
3388 if(IS_8x8DCT(mb_type)){ 3396 if(IS_8x8DCT(mb_type)){
3389 for(i=0; i<16; i+=4){ 3397 for(i=0; i<16; i+=4){
3390 uint8_t * const ptr= dest_y + block_offset[i]; 3398 uint8_t * const ptr= dest_y + block_offset[i];
3391 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; 3399 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3400 const int nnz = h->non_zero_count_cache[ scan8[i] ];
3392 h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000, 3401 h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3393 (h->topright_samples_available<<(i+1))&0x8000, linesize); 3402 (h->topright_samples_available<<(i+1))&0x8000, linesize);
3394 if(h->non_zero_count_cache[ scan8[i] ]) 3403 if(nnz){
3395 idct_add(ptr, h->mb + i*16, linesize); 3404 if(nnz == 1 && h->mb[i*16])
3405 idct_dc_add(ptr, h->mb + i*16, linesize);
3406 else
3407 idct_add(ptr, h->mb + i*16, linesize);
3408 }
3396 } 3409 }
3397 }else 3410 }else
3398 for(i=0; i<16; i++){ 3411 for(i=0; i<16; i++){
3399 uint8_t * const ptr= dest_y + block_offset[i]; 3412 uint8_t * const ptr= dest_y + block_offset[i];
3400 uint8_t *topright; 3413 uint8_t *topright;
3401 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; 3414 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3402 int tr; 3415 int nnz, tr;
3403 3416
3404 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ 3417 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3405 const int topright_avail= (h->topright_samples_available<<i)&0x8000; 3418 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3406 assert(mb_y || linesize <= block_offset[i]); 3419 assert(mb_y || linesize <= block_offset[i]);
3407 if(!topright_avail){ 3420 if(!topright_avail){
3411 topright= ptr + 4 - linesize; 3424 topright= ptr + 4 - linesize;
3412 }else 3425 }else
3413 topright= NULL; 3426 topright= NULL;
3414 3427
3415 h->pred4x4[ dir ](ptr, topright, linesize); 3428 h->pred4x4[ dir ](ptr, topright, linesize);
3416 if(h->non_zero_count_cache[ scan8[i] ]){ 3429 nnz = h->non_zero_count_cache[ scan8[i] ];
3417 if(s->codec_id == CODEC_ID_H264) 3430 if(nnz){
3418 idct_add(ptr, h->mb + i*16, linesize); 3431 if(s->codec_id == CODEC_ID_H264){
3419 else 3432 if(nnz == 1 && h->mb[i*16])
3433 idct_dc_add(ptr, h->mb + i*16, linesize);
3434 else
3435 idct_add(ptr, h->mb + i*16, linesize);
3436 }else
3420 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0); 3437 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3421 } 3438 }
3422 } 3439 }
3423 } 3440 }
3424 }else{ 3441 }else{
3451 } 3468 }
3452 3469
3453 3470
3454 if(!IS_INTRA4x4(mb_type)){ 3471 if(!IS_INTRA4x4(mb_type)){
3455 if(s->codec_id == CODEC_ID_H264){ 3472 if(s->codec_id == CODEC_ID_H264){
3456 const int di = IS_8x8DCT(mb_type) ? 4 : 1; 3473 if(IS_INTRA16x16(mb_type)){
3457 for(i=0; i<16; i+=di){ 3474 for(i=0; i<16; i++){
3458 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below 3475 if(h->non_zero_count_cache[ scan8[i] ])
3459 uint8_t * const ptr= dest_y + block_offset[i]; 3476 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3460 idct_add(ptr, h->mb + i*16, linesize); 3477 else if(h->mb[i*16])
3478 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3479 }
3480 }else{
3481 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3482 for(i=0; i<16; i+=di){
3483 int nnz = h->non_zero_count_cache[ scan8[i] ];
3484 if(nnz){
3485 if(nnz==1 && h->mb[i*16])
3486 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3487 else
3488 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3489 }
3461 } 3490 }
3462 } 3491 }
3463 }else{ 3492 }else{
3464 for(i=0; i<16; i++){ 3493 for(i=0; i<16; i++){
3465 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below 3494 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3469 } 3498 }
3470 } 3499 }
3471 } 3500 }
3472 3501
3473 if(!(s->flags&CODEC_FLAG_GRAY)){ 3502 if(!(s->flags&CODEC_FLAG_GRAY)){
3474 idct_add = transform_bypass ? s->dsp.add_pixels4 : s->dsp.h264_idct_add; 3503 uint8_t *dest[2] = {dest_cb, dest_cr};
3475 if(!transform_bypass){ 3504 if(transform_bypass){
3505 idct_add = idct_dc_add = s->dsp.add_pixels4;
3506 }else{
3507 idct_add = s->dsp.h264_idct_add;
3508 idct_dc_add = s->dsp.h264_idct_dc_add;
3476 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]); 3509 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3477 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]); 3510 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3478 } 3511 }
3479 if(s->codec_id == CODEC_ID_H264){ 3512 if(s->codec_id == CODEC_ID_H264){
3480 for(i=16; i<16+4; i++){ 3513 for(i=16; i<16+8; i++){
3514 if(h->non_zero_count_cache[ scan8[i] ])
3515 idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3516 else if(h->mb[i*16])
3517 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3518 }
3519 }else{
3520 for(i=16; i<16+8; i++){
3481 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ 3521 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3482 uint8_t * const ptr= dest_cb + block_offset[i]; 3522 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3483 idct_add(ptr, h->mb + i*16, uvlinesize);
3484 }
3485 }
3486 for(i=20; i<20+4; i++){
3487 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3488 uint8_t * const ptr= dest_cr + block_offset[i];
3489 idct_add(ptr, h->mb + i*16, uvlinesize);
3490 }
3491 }
3492 }else{
3493 for(i=16; i<16+4; i++){
3494 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3495 uint8_t * const ptr= dest_cb + block_offset[i];
3496 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3497 }
3498 }
3499 for(i=20; i<20+4; i++){
3500 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3501 uint8_t * const ptr= dest_cr + block_offset[i];
3502 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); 3523 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3503 } 3524 }
3504 } 3525 }
3505 } 3526 }
3506 } 3527 }
5129 if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4, 5150 if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4,
5130 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 ) 5151 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5131 return -1; 5152 return -1;
5132 } 5153 }
5133 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ]; 5154 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5134 nnz[0] |= nnz[1] | nnz[8] | nnz[9]; 5155 nnz[0] += nnz[1] + nnz[8] + nnz[9];
5135 }else{ 5156 }else{
5136 for(i4x4=0; i4x4<4; i4x4++){ 5157 for(i4x4=0; i4x4<4; i4x4++){
5137 const int index= i4x4 + 4*i8x8; 5158 const int index= i4x4 + 4*i8x8;
5138 5159
5139 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){ 5160 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5688 h->cbp_table[mb_xy] |= 0x40 << n; 5709 h->cbp_table[mb_xy] |= 0x40 << n;
5689 else if( cat == 4 ) 5710 else if( cat == 4 )
5690 h->non_zero_count_cache[scan8[16+n]] = coeff_count; 5711 h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5691 else { 5712 else {
5692 assert( cat == 5 ); 5713 assert( cat == 5 );
5693 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1); 5714 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5694 } 5715 }
5695 5716
5696 for( i = coeff_count - 1; i >= 0; i-- ) { 5717 for( i = coeff_count - 1; i >= 0; i-- ) {
5697 uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base; 5718 uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5698 int j= scantable[index[i]]; 5719 int j= scantable[index[i]];