# HG changeset patch # User lorenm # Date 1137044872 0 # Node ID e8599ab02b38437b535b288a0734cb28cdfbe8d4 # Parent 63d7bab7b9ea1c80de3f6a1ee8bb17d797da9d59 faster iterative_me: avoid duplicate mc of neighboring blocks. diff -r 63d7bab7b9ea -r e8599ab02b38 snow.c --- a/snow.c Thu Jan 12 00:00:00 2006 +0000 +++ b/snow.c Thu Jan 12 05:47:52 2006 +0000 @@ -2471,7 +2471,18 @@ int x, y; const int color = block->color[plane_index]; const int color4= color*0x01010101; - if(b_w==16){ + if(b_w==32){ + for(y=0; y < b_h; y++){ + *(uint32_t*)&dst[0 + y*stride]= color4; + *(uint32_t*)&dst[4 + y*stride]= color4; + *(uint32_t*)&dst[8 + y*stride]= color4; + *(uint32_t*)&dst[12+ y*stride]= color4; + *(uint32_t*)&dst[16+ y*stride]= color4; + *(uint32_t*)&dst[20+ y*stride]= color4; + *(uint32_t*)&dst[24+ y*stride]= color4; + *(uint32_t*)&dst[28+ y*stride]= color4; + } + }else if(b_w==16){ for(y=0; y < b_h; y++){ *(uint32_t*)&dst[0 + y*stride]= color4; *(uint32_t*)&dst[4 + y*stride]= color4; @@ -2512,10 +2523,16 @@ assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h); assert(!(b_w&(b_w-1))); assert(b_w>1 && b_h>1); - assert(tab_index>=0 && tab_index<4); + assert(tab_index>=0 && tab_index<4 || b_w==32); if((dx&3) || (dy&3)) mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy); - else if(b_w==b_h) + else if(b_w==32){ + int y; + for(y=0; ydsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride); + s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride); + } + }else if(b_w==b_h) s->dsp.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride); else if(b_w==2*b_h){ s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst ,src + 2 + 2*stride,stride); @@ -2688,7 +2705,7 @@ } //FIXME name clenup (b_w, block_w, b_width stuff) -static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){ +static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){ const int b_width = s->b_width << s->block_max_depth; const int b_height= s->b_height << s->block_max_depth; const int b_stride= b_width; @@ -2720,6 +2737,8 @@ if(src_x<0){ //FIXME merge with prev & always round internal width upto *16 obmc -= src_x; b_w += src_x; + if(!offset_dst) + dst -= src_x; src_x=0; }else if(src_x + b_w > w){ b_w = w - src_x; @@ -2727,6 +2746,8 @@ if(src_y<0){ obmc -= src_y*obmc_stride; b_h += src_y; + if(!offset_dst) + dst -= src_y*dst_stride; src_y=0; }else if(src_y + b_h> h){ b_h = h - src_y; @@ -2735,7 +2756,8 @@ if(b_w<=0 || b_h<=0) return; assert(src_stride > 2*MB_SIZE + 5); - dst += src_x + src_y*dst_stride; + if(offset_dst) + dst += src_x + src_y*dst_stride; dst8+= src_x + src_y*src_stride; // src += src_x + src_y*src_stride; @@ -2954,7 +2976,7 @@ w, h, w, ref_stride, obmc_stride, mb_x - 1, mb_y - 1, - add, plane_index); + add, 1, plane_index); STOP_TIMER("add_yblock") } @@ -2978,9 +3000,8 @@ const int obmc_stride= plane_index ? block_size : 2*block_size; const int ref_stride= s->current_picture.linesize[plane_index]; uint8_t *ref= s-> last_picture.data[plane_index]; - uint8_t *dst= s->current_picture.data[plane_index]; uint8_t *src= s-> input_picture.data[plane_index]; - const static DWTELEM zero_dst[4096]; //FIXME + DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4; const int b_stride = s->b_width << s->block_max_depth; const int w= p->width; const int h= p->height; @@ -2992,6 +3013,7 @@ b->type|= BLOCK_INTRA; b->color[plane_index]= 0; + memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM)); for(i=0; i<4; i++){ int mb_x2= mb_x + (i &1) - 1; @@ -2999,20 +3021,23 @@ int x= block_w*mb_x2 + block_w/2; int y= block_w*mb_y2 + block_w/2; - add_yblock(s, zero_dst, dst, ref, obmc, - x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, plane_index); + add_yblock(s, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, ref, obmc, + x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index); for(y2= FFMAX(y, 0); y2h) obmc_v += obmc[index - block_w*obmc_stride]; if(x+block_w>w) obmc_v += obmc[index - block_w]; //FIXME precalc this or simplify it somehow else - ab += (src[x2 + y2*ref_stride] - dst[x2 + y2*ref_stride]) * obmc_v; + d = -dst[index] + (1<<(FRAC_BITS-1)); + dst[index] = d; + ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v; aa += obmc_v * obmc_v; //FIXME precalclate this } } @@ -3022,8 +3047,7 @@ return clip(((ab<<6) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping } -static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){ - int i, y2; +static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){ Plane *p= &s->plane[plane_index]; const int block_size = MB_SIZE >> s->block_max_depth; const int block_w = plane_index ? block_size/2 : block_size; @@ -3032,41 +3056,50 @@ const int ref_stride= s->current_picture.linesize[plane_index]; uint8_t *ref= s-> last_picture.data[plane_index]; uint8_t *dst= s->current_picture.data[plane_index]; - uint8_t *src= s-> input_picture.data[plane_index]; - const static DWTELEM zero_dst[4096]; //FIXME + uint8_t *src= s-> input_picture.data[plane_index]; + DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4; + uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment + uint8_t tmp[ref_stride*(2*MB_SIZE+5)]; const int b_stride = s->b_width << s->block_max_depth; const int b_height = s->b_height<< s->block_max_depth; const int w= p->width; const int h= p->height; - int distortion= 0; + int distortion; int rate= 0; const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp); - - for(i=0; i<4; i++){ - int mb_x2= mb_x + (i &1) - 1; - int mb_y2= mb_y + (i>>1) - 1; - int x= block_w*mb_x2 + block_w/2; - int y= block_w*mb_y2 + block_w/2; - - add_yblock(s, zero_dst, dst, ref, obmc, - x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, plane_index); - - //FIXME find a cleaner/simpler way to skip the outside stuff - for(y2= y; y2<0; y2++) - memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w); - for(y2= h; y2block[mb_x + mb_y*b_stride], plane_index, w, h); + + for(y=y0; y> FRAC_BITS; + if(v&(~255)) v= ~(v>>31); + dst1[x] = v; } - if(x+block_w > w){ - for(y2= y; y2>1))*ref_stride; + distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16); } - - assert(block_w== 8 || block_w==16); - distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w); + }else{ + assert(block_w==8); + distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2); } if(plane_index==0){ @@ -3112,7 +3145,7 @@ return distortion + rate*penalty_factor; } -static always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, int *best_rd){ +static always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){ const int b_stride= s->b_width << s->block_max_depth; BlockNode *block= &s->block[mb_x + mb_y * b_stride]; BlockNode backup= *block; @@ -3138,7 +3171,7 @@ block->type &= ~BLOCK_INTRA; } - rd= get_block_rd(s, mb_x, mb_y, 0); + rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged); //FIXME chroma if(rd < *best_rd){ @@ -3151,9 +3184,9 @@ } /* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */ -static always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int intra, int *best_rd){ +static always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int intra, const uint8_t *obmc_edged, int *best_rd){ int p[2] = {p0, p1}; - return check_block(s, mb_x, mb_y, p, intra, best_rd); + return check_block(s, mb_x, mb_y, p, intra, obmc_edged, best_rd); } static void iterative_me(SnowContext *s){ @@ -3181,6 +3214,8 @@ BlockNode *trb= mb_xblock[index-b_stride+1] : &null_block; BlockNode *blb= mb_x && mb_yblock[index+b_stride-1] : &null_block; BlockNode *brb= mb_xblock[index+b_stride+1] : &null_block; + const int b_w= (MB_SIZE >> s->block_max_depth); + uint8_t obmc_edged[b_w*2][b_w*2]; if(pass && (block->type & BLOCK_OPT)) continue; @@ -3192,13 +3227,68 @@ memset(s->me_cache, 0, sizeof(s->me_cache)); s->me_cache_generation += 1<<22; + //FIXME precalc + { + int x, y; + memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4); + if(mb_x==0) + for(y=0; y input_picture.data[0]; + uint8_t *dst= s->current_picture.data[0]; + const int stride= s->current_picture.linesize[0]; + const int block_w= MB_SIZE >> s->block_max_depth; + const int sx= block_w*mb_x - block_w/2; + const int sy= block_w*mb_y - block_w/2; + const int w= s->plane[0].width; + const int h= s->plane[0].height; + int y; + + for(y=sy; y<0; y++) + memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2); + for(y=h; y w){ + for(y=sy; ymx, block->my, 0, &best_rd); - check_block_inter(s, mb_x, mb_y, 0, 0, 0, &best_rd); - check_block_inter(s, mb_x, mb_y, tb->mx, tb->my, 0, &best_rd); - check_block_inter(s, mb_x, mb_y, lb->mx, lb->my, 0, &best_rd); - check_block_inter(s, mb_x, mb_y, rb->mx, rb->my, 0, &best_rd); - check_block_inter(s, mb_x, mb_y, bb->mx, bb->my, 0, &best_rd); + check_block_inter(s, mb_x, mb_y, block->mx, block->my, 0, *obmc_edged, &best_rd); + check_block_inter(s, mb_x, mb_y, 0, 0, 0, *obmc_edged, &best_rd); + check_block_inter(s, mb_x, mb_y, tb->mx, tb->my, 0, *obmc_edged, &best_rd); + check_block_inter(s, mb_x, mb_y, lb->mx, lb->my, 0, *obmc_edged, &best_rd); + check_block_inter(s, mb_x, mb_y, rb->mx, rb->my, 0, *obmc_edged, &best_rd); + check_block_inter(s, mb_x, mb_y, bb->mx, bb->my, 0, *obmc_edged, &best_rd); /* fullpel ME */ //FIXME avoid subpel interpol / round to nearest integer @@ -3206,10 +3296,10 @@ dia_change=0; for(i=0; iavctx->dia_size, 1); i++){ for(j=0; jmx+4*(i-j), block->my+(4*j), 0, &best_rd); - dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), 0, &best_rd); - dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), 0, &best_rd); - dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), 0, &best_rd); + dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), 0, *obmc_edged, &best_rd); + dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), 0, *obmc_edged, &best_rd); + dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), 0, *obmc_edged, &best_rd); + dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), 0, *obmc_edged, &best_rd); } } }while(dia_change); @@ -3218,14 +3308,11 @@ static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},}; dia_change=0; for(i=0; i<8; i++) - dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], 0, &best_rd); + dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], 0, *obmc_edged, &best_rd); }while(dia_change); //FIXME or try the standard 2 pass qpel or similar #if 1 - for(i=0; i<3; i++){ - color[i]= get_dc(s, mb_x, mb_y, i); - } - check_block(s, mb_x, mb_y, color, 1, &best_rd); + check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd); //FIXME RD style color selection #endif if(!same_block(block, &backup)){ @@ -3719,6 +3806,7 @@ s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t)); s->m.me.map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t)); s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t)); + s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t)); h263_encode_init(&s->m); //mv_penalty if(avctx->flags&CODEC_FLAG_PASS1){ @@ -3999,6 +4087,7 @@ av_freep(&s->m.me.scratchpad); av_freep(&s->m.me.map); av_freep(&s->m.me.score_map); + av_freep(&s->m.obmc_scratchpad); av_freep(&s->block);