# HG changeset patch # User lorenm # Date 1139554525 0 # Node ID 2d35fb3cb94018bf98ed3c36a165ebe205ff0e12 # Parent 78d6bfc238f37ec10b02fc6fd83df95acff92482 h264: special case dc-only idct. ~1% faster overall diff -r 78d6bfc238f3 -r 2d35fb3cb940 dsputil.c --- a/dsputil.c Fri Feb 10 01:19:54 2006 +0000 +++ b/dsputil.c Fri Feb 10 06:55:25 2006 +0000 @@ -3851,6 +3851,8 @@ c->h264_idct_add= ff_h264_idct_add_c; c->h264_idct8_add= ff_h264_idct8_add_c; + c->h264_idct_dc_add= ff_h264_idct_dc_add_c; + c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c; c->get_pixels = get_pixels_c; c->diff_pixels = diff_pixels_c; diff -r 78d6bfc238f3 -r 2d35fb3cb940 dsputil.h --- a/dsputil.h Fri Feb 10 01:19:54 2006 +0000 +++ b/dsputil.h Fri Feb 10 06:55:25 2006 +0000 @@ -52,6 +52,8 @@ void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); @@ -330,6 +332,8 @@ void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride); void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride); + void (*h264_idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); + void (*h264_idct8_dc_add)(uint8_t *dst, DCTELEM *block, int stride); } DSPContext; void dsputil_static_init(void); diff -r 78d6bfc238f3 -r 2d35fb3cb940 h264.c --- a/h264.c Fri Feb 10 01:19:54 2006 +0000 +++ b/h264.c Fri Feb 10 06:55:25 2006 +0000 @@ -3314,6 +3314,7 @@ const unsigned int bottom = mb_y & 1; const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass); void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); + void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); if(!s->decode) return; @@ -3337,9 +3338,16 @@ // dct_offset = s->linesize * 16; } - idct_add = transform_bypass - ? IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4 - : IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add; + if(transform_bypass){ + idct_dc_add = + idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4; + }else if(IS_8x8DCT(mb_type)){ + idct_dc_add = s->dsp.h264_idct8_dc_add; + idct_add = s->dsp.h264_idct8_add; + }else{ + idct_dc_add = s->dsp.h264_idct_dc_add; + idct_add = s->dsp.h264_idct_add; + } if (IS_INTRA_PCM(mb_type)) { unsigned int x, y; @@ -3389,17 +3397,22 @@ for(i=0; i<16; i+=4){ uint8_t * const ptr= dest_y + block_offset[i]; const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; + const int nnz = h->non_zero_count_cache[ scan8[i] ]; h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<topright_samples_available<<(i+1))&0x8000, linesize); - if(h->non_zero_count_cache[ scan8[i] ]) - idct_add(ptr, h->mb + i*16, linesize); + if(nnz){ + if(nnz == 1 && h->mb[i*16]) + idct_dc_add(ptr, h->mb + i*16, linesize); + else + idct_add(ptr, h->mb + i*16, linesize); + } } }else for(i=0; i<16; i++){ uint8_t * const ptr= dest_y + block_offset[i]; uint8_t *topright; const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; - int tr; + int nnz, tr; if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ const int topright_avail= (h->topright_samples_available<pred4x4[ dir ](ptr, topright, linesize); - if(h->non_zero_count_cache[ scan8[i] ]){ - if(s->codec_id == CODEC_ID_H264) - idct_add(ptr, h->mb + i*16, linesize); - else + nnz = h->non_zero_count_cache[ scan8[i] ]; + if(nnz){ + if(s->codec_id == CODEC_ID_H264){ + if(nnz == 1 && h->mb[i*16]) + idct_dc_add(ptr, h->mb + i*16, linesize); + else + idct_add(ptr, h->mb + i*16, linesize); + }else svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0); } } @@ -3453,11 +3470,23 @@ if(!IS_INTRA4x4(mb_type)){ if(s->codec_id == CODEC_ID_H264){ - const int di = IS_8x8DCT(mb_type) ? 4 : 1; - for(i=0; i<16; i+=di){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below - uint8_t * const ptr= dest_y + block_offset[i]; - idct_add(ptr, h->mb + i*16, linesize); + if(IS_INTRA16x16(mb_type)){ + for(i=0; i<16; i++){ + if(h->non_zero_count_cache[ scan8[i] ]) + idct_add(dest_y + block_offset[i], h->mb + i*16, linesize); + else if(h->mb[i*16]) + idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize); + } + }else{ + const int di = IS_8x8DCT(mb_type) ? 4 : 1; + for(i=0; i<16; i+=di){ + int nnz = h->non_zero_count_cache[ scan8[i] ]; + if(nnz){ + if(nnz==1 && h->mb[i*16]) + idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize); + else + idct_add(dest_y + block_offset[i], h->mb + i*16, linesize); + } } } }else{ @@ -3471,34 +3500,26 @@ } if(!(s->flags&CODEC_FLAG_GRAY)){ - idct_add = transform_bypass ? s->dsp.add_pixels4 : s->dsp.h264_idct_add; - if(!transform_bypass){ + uint8_t *dest[2] = {dest_cb, dest_cr}; + if(transform_bypass){ + idct_add = idct_dc_add = s->dsp.add_pixels4; + }else{ + idct_add = s->dsp.h264_idct_add; + idct_dc_add = s->dsp.h264_idct_dc_add; chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]); chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]); } if(s->codec_id == CODEC_ID_H264){ - for(i=16; i<16+4; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cb + block_offset[i]; - idct_add(ptr, h->mb + i*16, uvlinesize); - } - } - for(i=20; i<20+4; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cr + block_offset[i]; - idct_add(ptr, h->mb + i*16, uvlinesize); - } + for(i=16; i<16+8; i++){ + if(h->non_zero_count_cache[ scan8[i] ]) + idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); + else if(h->mb[i*16]) + idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); } }else{ - for(i=16; i<16+4; i++){ + for(i=16; i<16+8; i++){ if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cb + block_offset[i]; - svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); - } - } - for(i=20; i<20+4; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cr + block_offset[i]; + uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i]; svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); } } @@ -5131,7 +5152,7 @@ return -1; } nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ]; - nnz[0] |= nnz[1] | nnz[8] | nnz[9]; + nnz[0] += nnz[1] + nnz[8] + nnz[9]; }else{ for(i4x4=0; i4x4<4; i4x4++){ const int index= i4x4 + 4*i8x8; @@ -5690,7 +5711,7 @@ h->non_zero_count_cache[scan8[16+n]] = coeff_count; else { assert( cat == 5 ); - fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1); + fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); } for( i = coeff_count - 1; i >= 0; i-- ) { diff -r 78d6bfc238f3 -r 2d35fb3cb940 h264idct.c --- a/h264idct.c Fri Feb 10 01:19:54 2006 +0000 +++ b/h264idct.c Fri Feb 10 06:55:25 2006 +0000 @@ -139,3 +139,28 @@ dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ]; } } + +// assumes all AC coefs are 0 +void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ + int i, j; + uint8_t *cm = cropTbl + MAX_NEG_CROP; + int dc = (block[0] + 32) >> 6; + for( j = 0; j < 4; j++ ) + { + for( i = 0; i < 4; i++ ) + dst[i] = cm[ dst[i] + dc ]; + dst += stride; + } +} + +void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ + int i, j; + uint8_t *cm = cropTbl + MAX_NEG_CROP; + int dc = (block[0] + 32) >> 6; + for( j = 0; j < 8; j++ ) + { + for( i = 0; i < 8; i++ ) + dst[i] = cm[ dst[i] + dc ]; + dst += stride; + } +} diff -r 78d6bfc238f3 -r 2d35fb3cb940 i386/dsputil_mmx.c --- a/i386/dsputil_mmx.c Fri Feb 10 01:19:54 2006 +0000 +++ b/i386/dsputil_mmx.c Fri Feb 10 06:55:25 2006 +0000 @@ -2754,6 +2754,8 @@ #endif //CONFIG_ENCODERS c->h264_idct_add= ff_h264_idct_add_mmx2; + c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; + c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; diff -r 78d6bfc238f3 -r 2d35fb3cb940 i386/h264dsp_mmx.c --- a/i386/h264dsp_mmx.c Fri Feb 10 01:19:54 2006 +0000 +++ b/i386/h264dsp_mmx.c Fri Feb 10 06:55:25 2006 +0000 @@ -104,6 +104,87 @@ ); } +void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + asm volatile( + "movd %0, %%mm0 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "pmaxsw %%mm7, %%mm0 \n\t" + "pmaxsw %%mm7, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + asm volatile( + "movd %0, %%mm2 \n\t" + "movd %1, %%mm3 \n\t" + "movd %2, %%mm4 \n\t" + "movd %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movd %%mm2, %0 \n\t" + "movd %%mm3, %1 \n\t" + "movd %%mm4, %2 \n\t" + "movd %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dst+0*stride)), + "+m"(*(uint32_t*)(dst+1*stride)), + "+m"(*(uint32_t*)(dst+2*stride)), + "+m"(*(uint32_t*)(dst+3*stride)) + ); +} + +void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + int y; + asm volatile( + "movd %0, %%mm0 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "pmaxsw %%mm7, %%mm0 \n\t" + "pmaxsw %%mm7, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + for(y=2; y--; dst += 4*stride){ + asm volatile( + "movq %0, %%mm2 \n\t" + "movq %1, %%mm3 \n\t" + "movq %2, %%mm4 \n\t" + "movq %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movq %%mm2, %0 \n\t" + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %3 \n\t" + :"+m"(*(uint64_t*)(dst+0*stride)), + "+m"(*(uint64_t*)(dst+1*stride)), + "+m"(*(uint64_t*)(dst+2*stride)), + "+m"(*(uint64_t*)(dst+3*stride)) + ); + } +} + /***********************************/ /* deblocking */