# HG changeset patch # User lorenm # Date 1156757581 0 # Node ID 47821be55b6c34c5191934773428e6b137483938 # Parent 9eb3d84b6eef395cabd0a158fd0587b800fe0226 mmx implementation of deblocking strength decision. 2-3% faster h264. diff -r 9eb3d84b6eef -r 47821be55b6c dsputil.c --- a/dsputil.c Sun Aug 27 21:45:26 2006 +0000 +++ b/dsputil.c Mon Aug 28 09:33:01 2006 +0000 @@ -4111,6 +4111,7 @@ c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; + c->h264_loop_filter_strength= NULL; c->h263_h_loop_filter= h263_h_loop_filter_c; c->h263_v_loop_filter= h263_v_loop_filter_c; diff -r 9eb3d84b6eef -r 47821be55b6c dsputil.h --- a/dsputil.h Sun Aug 27 21:45:26 2006 +0000 +++ b/dsputil.h Mon Aug 28 09:33:01 2006 +0000 @@ -305,6 +305,9 @@ void (*h264_h_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta); void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta); + // h264_loop_filter_strength: simd only. the C version is inlined in h264.c + void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], + int bidir, int edges, int step, int mask_mv0, int mask_mv1); void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale); void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale); diff -r 9eb3d84b6eef -r 47821be55b6c h264.c --- a/h264.c Sun Aug 27 21:45:26 2006 +0000 +++ b/h264.c Mon Aug 28 09:33:01 2006 +0000 @@ -409,6 +409,7 @@ static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp); static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize); +static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize); static always_inline uint32_t pack16to32(int a, int b){ #ifdef WORDS_BIGENDIAN @@ -3879,7 +3880,7 @@ tprintf("call filter_mb\n"); backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize); fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb - filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize); + filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize); } } } @@ -6694,7 +6695,7 @@ } -static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { +static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { int i, d; const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); const int alpha = alpha_table[index_a]; @@ -6755,7 +6756,7 @@ } } } -static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { +static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { int i; const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); const int alpha = alpha_table[index_a]; @@ -6771,7 +6772,7 @@ } } -static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) { +static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) { int i; for( i = 0; i < 16; i++, pix += stride) { int index_a; @@ -6869,7 +6870,7 @@ } } } -static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) { +static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) { int i; for( i = 0; i < 8; i++, pix += stride) { int index_a; @@ -6922,7 +6923,7 @@ } } -static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { +static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { int i, d; const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); const int alpha = alpha_table[index_a]; @@ -6982,7 +6983,7 @@ } } -static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { +static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { int i; const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); const int alpha = alpha_table[index_a]; @@ -6998,6 +6999,108 @@ } } +static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) { + MpegEncContext * const s = &h->s; + int mb_xy, mb_type; + int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh; + + if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength) { + filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize); + return; + } + assert(!FRAME_MBAFF); + + mb_xy = mb_x + mb_y*s->mb_stride; + mb_type = s->current_picture.mb_type[mb_xy]; + qp = s->current_picture.qscale_table[mb_xy]; + qp0 = s->current_picture.qscale_table[mb_xy-1]; + qp1 = s->current_picture.qscale_table[h->top_mb_xy]; + qpc = get_chroma_qp( h->pps.chroma_qp_index_offset, qp ); + qpc0 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp0 ); + qpc1 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp1 ); + qp0 = (qp + qp0 + 1) >> 1; + qp1 = (qp + qp1 + 1) >> 1; + qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset); + if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh) + return; + qpc0 = (qpc + qpc0 + 1) >> 1; + qpc1 = (qpc + qpc1 + 1) >> 1; + + if( IS_INTRA(mb_type) ) { + int16_t bS4[4] = {4,4,4,4}; + int16_t bS3[4] = {3,3,3,3}; + if( IS_8x8DCT(mb_type) ) { + filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 ); + filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp ); + filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 ); + filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp ); + } else { + filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 ); + filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp ); + filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp ); + filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp ); + filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 ); + filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp ); + filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp ); + filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp ); + } + filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 ); + filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc ); + filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 ); + filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc ); + filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 ); + filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc ); + filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 ); + filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc ); + return; + } else { + DECLARE_ALIGNED_8(int16_t, bS[2][4][4]); + uint64_t (*bSv)[4] = (uint64_t(*)[4])bS; + int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP)) + == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4; + int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 : + (mb_type & MB_TYPE_16x8) ? 1 : 0; + int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) + && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16)) + ? 3 : 0; + int step = IS_8x8DCT(mb_type) ? 2 : 1; + s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache, + (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 ); + if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) ) + bSv[0][0] = 0x0004000400040004ULL; + if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) ) + bSv[1][0] = 0x0004000400040004ULL; + +#define FILTER(hv,dir,edge)\ + if(bSv[dir][edge]) {\ + filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\ + if(!(edge&1)) {\ + filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\ + filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\ + }\ + } + if( edges == 1 ) { + FILTER(v,0,0); + FILTER(h,1,0); + } else if( IS_8x8DCT(mb_type) ) { + FILTER(v,0,0); + FILTER(v,0,2); + FILTER(h,1,0); + FILTER(h,1,2); + } else { + FILTER(v,0,0); + FILTER(v,0,1); + FILTER(v,0,2); + FILTER(v,0,3); + FILTER(h,1,0); + FILTER(h,1,1); + FILTER(h,1,2); + FILTER(h,1,3); + } +#undef FILTER + } +} + static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) { MpegEncContext * const s = &h->s; const int mb_xy= mb_x + mb_y*s->mb_stride; @@ -7035,7 +7138,7 @@ */ const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride; const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride }; - int bS[8]; + int16_t bS[8]; int qp[2]; int chroma_qp[2]; int mb_qp, mbn0_qp, mbn1_qp; @@ -7114,7 +7217,7 @@ int mbn_xy = mb_xy - 2 * s->mb_stride; int qp, chroma_qp; int i, j; - int bS[4]; + int16_t bS[4]; for(j=0; j<2; j++, mbn_xy += s->mb_stride){ if( IS_INTRA(mb_type) || @@ -7150,7 +7253,7 @@ /* mbn_xy: neighbor macroblock */ const int mbn_xy = edge > 0 ? mb_xy : mbm_xy; const int mbn_type = s->current_picture.mb_type[mbn_xy]; - int bS[4]; + int16_t bS[4]; int qp; if( (edge&1) && IS_8x8DCT(mb_type) ) diff -r 9eb3d84b6eef -r 47821be55b6c i386/dsputil_mmx.c --- a/i386/dsputil_mmx.c Sun Aug 27 21:45:26 2006 +0000 +++ b/i386/dsputil_mmx.c Mon Aug 28 09:33:01 2006 +0000 @@ -53,6 +53,9 @@ static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL; static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; +static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL; +static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL; +static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL; static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL; static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; @@ -3282,6 +3285,7 @@ c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; + c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; diff -r 9eb3d84b6eef -r 47821be55b6c i386/h264dsp_mmx.c --- a/i386/h264dsp_mmx.c Sun Aug 27 21:45:26 2006 +0000 +++ b/i386/h264dsp_mmx.c Mon Aug 28 09:33:01 2006 +0000 @@ -561,6 +561,101 @@ transpose4x4(pix-2+4*stride, trans+4, stride, 8); } +static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], + int bidir, int edges, int step, int mask_mv0, int mask_mv1 ) { + int dir; + asm volatile( + "pxor %%mm7, %%mm7 \n\t" + "movq %0, %%mm6 \n\t" + "movq %1, %%mm5 \n\t" + "movq %2, %%mm4 \n\t" + ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7) + ); + // could do a special case for dir==0 && edges==1, but it only reduces the + // average filter time by 1.2% + for( dir=1; dir>=0; dir-- ) { + const int d_idx = dir ? -8 : -1; + const int mask_mv = dir ? mask_mv1 : mask_mv0; + const uint64_t mask_dir = dir ? 0 : 0xffffffffffffffffULL; + int b_idx, edge, l; + for( b_idx=12, edge=0; edge= 0; l-- ) { + asm volatile( + "movd %0, %%mm1 \n\t" + "punpckldq %1, %%mm1 \n\t" + "movq %%mm1, %%mm2 \n\t" + "psrlw $7, %%mm2 \n\t" + "pand %%mm6, %%mm2 \n\t" + "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1 + "punpckldq %%mm1, %%mm2 \n\t" + "pcmpeqb %%mm2, %%mm1 \n\t" + "paddb %%mm6, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] + "por %%mm1, %%mm0 \n\t" + + "movq %2, %%mm1 \n\t" + "movq %3, %%mm2 \n\t" + "psubw %4, %%mm1 \n\t" + "psubw %5, %%mm2 \n\t" + "packsswb %%mm2, %%mm1 \n\t" + "paddb %%mm5, %%mm1 \n\t" + "pminub %%mm4, %%mm1 \n\t" + "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit + "por %%mm1, %%mm0 \n\t" + ::"m"(ref[l][b_idx]), + "m"(ref[l][b_idx+d_idx]), + "m"(mv[l][b_idx][0]), + "m"(mv[l][b_idx+2][0]), + "m"(mv[l][b_idx+d_idx][0]), + "m"(mv[l][b_idx+d_idx+2][0]) + ); + } + } + asm volatile( + "movd %0, %%mm1 \n\t" + "por %1, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn] + ::"m"(nnz[b_idx]), + "m"(nnz[b_idx+d_idx]) + ); + asm volatile( + "pcmpeqw %%mm7, %%mm0 \n\t" + "pcmpeqw %%mm7, %%mm0 \n\t" + "psrlw $15, %%mm0 \n\t" // nonzero -> 1 + "psrlw $14, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "por %%mm1, %%mm2 \n\t" + "psrlw $1, %%mm1 \n\t" + "pandn %%mm2, %%mm1 \n\t" + "movq %%mm1, %0 \n\t" + :"=m"(*bS[dir][edge]) + ::"memory" + ); + } + edges = 4; + step = 1; + } + asm volatile( + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm1 \n\t" + "movq 16(%0), %%mm2 \n\t" + "movq 24(%0), %%mm3 \n\t" + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) + "movq %%mm0, (%0) \n\t" + "movq %%mm3, 8(%0) \n\t" + "movq %%mm4, 16(%0) \n\t" + "movq %%mm2, 24(%0) \n\t" + ::"r"(bS[0]) + :"memory" + ); +} /***********************************/ /* motion compensation */