comparison snow.c @ 3033:e8599ab02b38 libavcodec

faster iterative_me: avoid duplicate mc of neighboring blocks.
author lorenm
date Thu, 12 Jan 2006 05:47:52 +0000
parents c75fb0747e74
children d0f408fa01c7
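comparison
legend: equal | deleted | inserted | replaced
revisions compared: 3032:63d7bab7b9ea (old, shown first on each line) and 3033:e8599ab02b38 (new, shown second); each side is prefixed by its own line number, and lines that exist in only one revision appear once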
2469 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){ 2469 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2470 if(block->type & BLOCK_INTRA){ 2470 if(block->type & BLOCK_INTRA){
2471 int x, y; 2471 int x, y;
2472 const int color = block->color[plane_index]; 2472 const int color = block->color[plane_index];
2473 const int color4= color*0x01010101; 2473 const int color4= color*0x01010101;
2474 if(b_w==16){ 2474 if(b_w==32){
2475 for(y=0; y < b_h; y++){
2476 *(uint32_t*)&dst[0 + y*stride]= color4;
2477 *(uint32_t*)&dst[4 + y*stride]= color4;
2478 *(uint32_t*)&dst[8 + y*stride]= color4;
2479 *(uint32_t*)&dst[12+ y*stride]= color4;
2480 *(uint32_t*)&dst[16+ y*stride]= color4;
2481 *(uint32_t*)&dst[20+ y*stride]= color4;
2482 *(uint32_t*)&dst[24+ y*stride]= color4;
2483 *(uint32_t*)&dst[28+ y*stride]= color4;
2484 }
2485 }else if(b_w==16){
2475 for(y=0; y < b_h; y++){ 2486 for(y=0; y < b_h; y++){
2476 *(uint32_t*)&dst[0 + y*stride]= color4; 2487 *(uint32_t*)&dst[0 + y*stride]= color4;
2477 *(uint32_t*)&dst[4 + y*stride]= color4; 2488 *(uint32_t*)&dst[4 + y*stride]= color4;
2478 *(uint32_t*)&dst[8 + y*stride]= color4; 2489 *(uint32_t*)&dst[8 + y*stride]= color4;
2479 *(uint32_t*)&dst[12+ y*stride]= color4; 2490 *(uint32_t*)&dst[12+ y*stride]= color4;
2510 src= tmp + MB_SIZE; 2521 src= tmp + MB_SIZE;
2511 } 2522 }
2512 assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h); 2523 assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2513 assert(!(b_w&(b_w-1))); 2524 assert(!(b_w&(b_w-1)));
2514 assert(b_w>1 && b_h>1); 2525 assert(b_w>1 && b_h>1);
2515 assert(tab_index>=0 && tab_index<4); 2526 assert(tab_index>=0 && tab_index<4 || b_w==32);
2516 if((dx&3) || (dy&3)) 2527 if((dx&3) || (dy&3))
2517 mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy); 2528 mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2518 else if(b_w==b_h) 2529 else if(b_w==32){
2530 int y;
2531 for(y=0; y<b_h; y+=16){
2532 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2533 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2534 }
2535 }else if(b_w==b_h)
2519 s->dsp.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride); 2536 s->dsp.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2520 else if(b_w==2*b_h){ 2537 else if(b_w==2*b_h){
2521 s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst ,src + 2 + 2*stride,stride); 2538 s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst ,src + 2 + 2*stride,stride);
2522 s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride); 2539 s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2523 }else{ 2540 }else{
2686 } 2703 }
2687 #endif 2704 #endif
2688 } 2705 }
2689 2706
2690 //FIXME name clenup (b_w, block_w, b_width stuff) 2707 //FIXME name clenup (b_w, block_w, b_width stuff)
2691 static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){ 2708 static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2692 const int b_width = s->b_width << s->block_max_depth; 2709 const int b_width = s->b_width << s->block_max_depth;
2693 const int b_height= s->b_height << s->block_max_depth; 2710 const int b_height= s->b_height << s->block_max_depth;
2694 const int b_stride= b_width; 2711 const int b_stride= b_width;
2695 BlockNode *lt= &s->block[b_x + b_y*b_stride]; 2712 BlockNode *lt= &s->block[b_x + b_y*b_stride];
2696 BlockNode *rt= lt+1; 2713 BlockNode *rt= lt+1;
2718 } 2735 }
2719 2736
2720 if(src_x<0){ //FIXME merge with prev & always round internal width upto *16 2737 if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2721 obmc -= src_x; 2738 obmc -= src_x;
2722 b_w += src_x; 2739 b_w += src_x;
2740 if(!offset_dst)
2741 dst -= src_x;
2723 src_x=0; 2742 src_x=0;
2724 }else if(src_x + b_w > w){ 2743 }else if(src_x + b_w > w){
2725 b_w = w - src_x; 2744 b_w = w - src_x;
2726 } 2745 }
2727 if(src_y<0){ 2746 if(src_y<0){
2728 obmc -= src_y*obmc_stride; 2747 obmc -= src_y*obmc_stride;
2729 b_h += src_y; 2748 b_h += src_y;
2749 if(!offset_dst)
2750 dst -= src_y*dst_stride;
2730 src_y=0; 2751 src_y=0;
2731 }else if(src_y + b_h> h){ 2752 }else if(src_y + b_h> h){
2732 b_h = h - src_y; 2753 b_h = h - src_y;
2733 } 2754 }
2734 2755
2735 if(b_w<=0 || b_h<=0) return; 2756 if(b_w<=0 || b_h<=0) return;
2736 2757
2737 assert(src_stride > 2*MB_SIZE + 5); 2758 assert(src_stride > 2*MB_SIZE + 5);
2738 dst += src_x + src_y*dst_stride; 2759 if(offset_dst)
2760 dst += src_x + src_y*dst_stride;
2739 dst8+= src_x + src_y*src_stride; 2761 dst8+= src_x + src_y*src_stride;
2740 // src += src_x + src_y*src_stride; 2762 // src += src_x + src_y*src_stride;
2741 2763
2742 ptmp= tmp + 3*tmp_step; 2764 ptmp= tmp + 3*tmp_step;
2743 block[0]= ptmp; 2765 block[0]= ptmp;
2952 block_w*mb_y - block_w/2, 2974 block_w*mb_y - block_w/2,
2953 block_w, block_w, 2975 block_w, block_w,
2954 w, h, 2976 w, h,
2955 w, ref_stride, obmc_stride, 2977 w, ref_stride, obmc_stride,
2956 mb_x - 1, mb_y - 1, 2978 mb_x - 1, mb_y - 1,
2957 add, plane_index); 2979 add, 1, plane_index);
2958 2980
2959 STOP_TIMER("add_yblock") 2981 STOP_TIMER("add_yblock")
2960 } 2982 }
2961 2983
2962 STOP_TIMER("predict_slice") 2984 STOP_TIMER("predict_slice")
2976 const int block_w = plane_index ? block_size/2 : block_size; 2998 const int block_w = plane_index ? block_size/2 : block_size;
2977 const uint8_t *obmc = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth]; 2999 const uint8_t *obmc = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2978 const int obmc_stride= plane_index ? block_size : 2*block_size; 3000 const int obmc_stride= plane_index ? block_size : 2*block_size;
2979 const int ref_stride= s->current_picture.linesize[plane_index]; 3001 const int ref_stride= s->current_picture.linesize[plane_index];
2980 uint8_t *ref= s-> last_picture.data[plane_index]; 3002 uint8_t *ref= s-> last_picture.data[plane_index];
2981 uint8_t *dst= s->current_picture.data[plane_index];
2982 uint8_t *src= s-> input_picture.data[plane_index]; 3003 uint8_t *src= s-> input_picture.data[plane_index];
2983 const static DWTELEM zero_dst[4096]; //FIXME 3004 DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2984 const int b_stride = s->b_width << s->block_max_depth; 3005 const int b_stride = s->b_width << s->block_max_depth;
2985 const int w= p->width; 3006 const int w= p->width;
2986 const int h= p->height; 3007 const int h= p->height;
2987 int index= mb_x + mb_y*b_stride; 3008 int index= mb_x + mb_y*b_stride;
2988 BlockNode *b= &s->block[index]; 3009 BlockNode *b= &s->block[index];
2990 int ab=0; 3011 int ab=0;
2991 int aa=0; 3012 int aa=0;
2992 3013
2993 b->type|= BLOCK_INTRA; 3014 b->type|= BLOCK_INTRA;
2994 b->color[plane_index]= 0; 3015 b->color[plane_index]= 0;
3016 memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2995 3017
2996 for(i=0; i<4; i++){ 3018 for(i=0; i<4; i++){
2997 int mb_x2= mb_x + (i &1) - 1; 3019 int mb_x2= mb_x + (i &1) - 1;
2998 int mb_y2= mb_y + (i>>1) - 1; 3020 int mb_y2= mb_y + (i>>1) - 1;
2999 int x= block_w*mb_x2 + block_w/2; 3021 int x= block_w*mb_x2 + block_w/2;
3000 int y= block_w*mb_y2 + block_w/2; 3022 int y= block_w*mb_y2 + block_w/2;
3001 3023
3002 add_yblock(s, zero_dst, dst, ref, obmc, 3024 add_yblock(s, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, ref, obmc,
3003 x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, plane_index); 3025 x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
3004 3026
3005 for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){ 3027 for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
3006 for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){ 3028 for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
3007 int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride; 3029 int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
3008 int obmc_v= obmc[index]; 3030 int obmc_v= obmc[index];
3031 int d;
3009 if(y<0) obmc_v += obmc[index + block_w*obmc_stride]; 3032 if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
3010 if(x<0) obmc_v += obmc[index + block_w]; 3033 if(x<0) obmc_v += obmc[index + block_w];
3011 if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride]; 3034 if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
3012 if(x+block_w>w) obmc_v += obmc[index - block_w]; 3035 if(x+block_w>w) obmc_v += obmc[index - block_w];
3013 //FIXME precalc this or simplify it somehow else 3036 //FIXME precalc this or simplify it somehow else
3014 3037
3015 ab += (src[x2 + y2*ref_stride] - dst[x2 + y2*ref_stride]) * obmc_v; 3038 d = -dst[index] + (1<<(FRAC_BITS-1));
3039 dst[index] = d;
3040 ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
3016 aa += obmc_v * obmc_v; //FIXME precalclate this 3041 aa += obmc_v * obmc_v; //FIXME precalclate this
3017 } 3042 }
3018 } 3043 }
3019 } 3044 }
3020 *b= backup; 3045 *b= backup;
3021 3046
3022 return clip(((ab<<6) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping 3047 return clip(((ab<<6) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
3023 } 3048 }
3024 3049
3025 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){ 3050 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
3026 int i, y2;
3027 Plane *p= &s->plane[plane_index]; 3051 Plane *p= &s->plane[plane_index];
3028 const int block_size = MB_SIZE >> s->block_max_depth; 3052 const int block_size = MB_SIZE >> s->block_max_depth;
3029 const int block_w = plane_index ? block_size/2 : block_size; 3053 const int block_w = plane_index ? block_size/2 : block_size;
3030 const uint8_t *obmc = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth]; 3054 const uint8_t *obmc = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3031 const int obmc_stride= plane_index ? block_size : 2*block_size; 3055 const int obmc_stride= plane_index ? block_size : 2*block_size;
3032 const int ref_stride= s->current_picture.linesize[plane_index]; 3056 const int ref_stride= s->current_picture.linesize[plane_index];
3033 uint8_t *ref= s-> last_picture.data[plane_index]; 3057 uint8_t *ref= s-> last_picture.data[plane_index];
3034 uint8_t *dst= s->current_picture.data[plane_index]; 3058 uint8_t *dst= s->current_picture.data[plane_index];
3035 uint8_t *src= s-> input_picture.data[plane_index]; 3059 uint8_t *src= s-> input_picture.data[plane_index];
3036 const static DWTELEM zero_dst[4096]; //FIXME 3060 DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
3061 uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
3062 uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
3037 const int b_stride = s->b_width << s->block_max_depth; 3063 const int b_stride = s->b_width << s->block_max_depth;
3038 const int b_height = s->b_height<< s->block_max_depth; 3064 const int b_height = s->b_height<< s->block_max_depth;
3039 const int w= p->width; 3065 const int w= p->width;
3040 const int h= p->height; 3066 const int h= p->height;
3041 int distortion= 0; 3067 int distortion;
3042 int rate= 0; 3068 int rate= 0;
3043 const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp); 3069 const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3044 3070 int sx= block_w*mb_x - block_w/2;
3045 for(i=0; i<4; i++){ 3071 int sy= block_w*mb_y - block_w/2;
3046 int mb_x2= mb_x + (i &1) - 1; 3072 const int x0= FFMAX(0,-sx);
3047 int mb_y2= mb_y + (i>>1) - 1; 3073 const int y0= FFMAX(0,-sy);
3048 int x= block_w*mb_x2 + block_w/2; 3074 const int x1= FFMIN(block_w*2, w-sx);
3049 int y= block_w*mb_y2 + block_w/2; 3075 const int y1= FFMIN(block_w*2, h-sy);
3050 3076 int i,x,y;
3051 add_yblock(s, zero_dst, dst, ref, obmc, 3077
3052 x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, plane_index); 3078 pred_block(s, cur, ref, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
3053 3079
3054 //FIXME find a cleaner/simpler way to skip the outside stuff 3080 for(y=y0; y<y1; y++){
3055 for(y2= y; y2<0; y2++) 3081 const uint8_t *obmc1= obmc_edged + y*obmc_stride;
3056 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w); 3082 const DWTELEM *pred1 = pred + y*obmc_stride;
3057 for(y2= h; y2<y+block_w; y2++) 3083 uint8_t *cur1 = cur + y*ref_stride;
3058 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w); 3084 uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
3059 if(x<0){ 3085 for(x=x0; x<x1; x++){
3060 for(y2= y; y2<y+block_w; y2++) 3086 int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
3061 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x); 3087 v = (v + pred1[x]) >> FRAC_BITS;
3062 } 3088 if(v&(~255)) v= ~(v>>31);
3063 if(x+block_w > w){ 3089 dst1[x] = v;
3064 for(y2= y; y2<y+block_w; y2++) 3090 }
3065 memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w); 3091 }
3066 } 3092
3067 3093 //FIXME sad/ssd can be broken up, but wavelet cmp should be one 32x32 block
3068 assert(block_w== 8 || block_w==16); 3094 if(block_w==16){
3069 distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w); 3095 distortion = 0;
3096 for(i=0; i<4; i++){
3097 int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3098 distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3099 }
3100 }else{
3101 assert(block_w==8);
3102 distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3070 } 3103 }
3071 3104
3072 if(plane_index==0){ 3105 if(plane_index==0){
3073 for(i=0; i<4; i++){ 3106 for(i=0; i<4; i++){
3074 /* ..RRr 3107 /* ..RRr
3110 } 3143 }
3111 3144
3112 return distortion + rate*penalty_factor; 3145 return distortion + rate*penalty_factor;
3113 } 3146 }
3114 3147
3115 static always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, int *best_rd){ 3148 static always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3116 const int b_stride= s->b_width << s->block_max_depth; 3149 const int b_stride= s->b_width << s->block_max_depth;
3117 BlockNode *block= &s->block[mb_x + mb_y * b_stride]; 3150 BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3118 BlockNode backup= *block; 3151 BlockNode backup= *block;
3119 int rd, index, value; 3152 int rd, index, value;
3120 3153
3136 block->mx= p[0]; 3169 block->mx= p[0];
3137 block->my= p[1]; 3170 block->my= p[1];
3138 block->type &= ~BLOCK_INTRA; 3171 block->type &= ~BLOCK_INTRA;
3139 } 3172 }
3140 3173
3141 rd= get_block_rd(s, mb_x, mb_y, 0); 3174 rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3142 3175
3143 //FIXME chroma 3176 //FIXME chroma
3144 if(rd < *best_rd){ 3177 if(rd < *best_rd){
3145 *best_rd= rd; 3178 *best_rd= rd;
3146 return 1; 3179 return 1;
3149 return 0; 3182 return 0;
3150 } 3183 }
3151 } 3184 }
3152 3185
3153 /* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */ 3186 /* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3154 static always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int intra, int *best_rd){ 3187 static always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int intra, const uint8_t *obmc_edged, int *best_rd){
3155 int p[2] = {p0, p1}; 3188 int p[2] = {p0, p1};
3156 return check_block(s, mb_x, mb_y, p, intra, best_rd); 3189 return check_block(s, mb_x, mb_y, p, intra, obmc_edged, best_rd);
3157 } 3190 }
3158 3191
3159 static void iterative_me(SnowContext *s){ 3192 static void iterative_me(SnowContext *s){
3160 int pass, mb_x, mb_y; 3193 int pass, mb_x, mb_y;
3161 const int b_width = s->b_width << s->block_max_depth; 3194 const int b_width = s->b_width << s->block_max_depth;
3179 BlockNode *bb = mb_y<b_height ? &s->block[index+b_stride ] : &null_block; 3212 BlockNode *bb = mb_y<b_height ? &s->block[index+b_stride ] : &null_block;
3180 BlockNode *tlb= mb_x && mb_y ? &s->block[index-b_stride-1] : &null_block; 3213 BlockNode *tlb= mb_x && mb_y ? &s->block[index-b_stride-1] : &null_block;
3181 BlockNode *trb= mb_x<b_width && mb_y ? &s->block[index-b_stride+1] : &null_block; 3214 BlockNode *trb= mb_x<b_width && mb_y ? &s->block[index-b_stride+1] : &null_block;
3182 BlockNode *blb= mb_x && mb_y<b_height ? &s->block[index+b_stride-1] : &null_block; 3215 BlockNode *blb= mb_x && mb_y<b_height ? &s->block[index+b_stride-1] : &null_block;
3183 BlockNode *brb= mb_x<b_width && mb_y<b_height ? &s->block[index+b_stride+1] : &null_block; 3216 BlockNode *brb= mb_x<b_width && mb_y<b_height ? &s->block[index+b_stride+1] : &null_block;
3217 const int b_w= (MB_SIZE >> s->block_max_depth);
3218 uint8_t obmc_edged[b_w*2][b_w*2];
3184 3219
3185 if(pass && (block->type & BLOCK_OPT)) 3220 if(pass && (block->type & BLOCK_OPT))
3186 continue; 3221 continue;
3187 block->type |= BLOCK_OPT; 3222 block->type |= BLOCK_OPT;
3188 3223
3190 3225
3191 if(!s->me_cache_generation) 3226 if(!s->me_cache_generation)
3192 memset(s->me_cache, 0, sizeof(s->me_cache)); 3227 memset(s->me_cache, 0, sizeof(s->me_cache));
3193 s->me_cache_generation += 1<<22; 3228 s->me_cache_generation += 1<<22;
3194 3229
3230 //FIXME precalc
3231 {
3232 int x, y;
3233 memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3234 if(mb_x==0)
3235 for(y=0; y<b_w*2; y++)
3236 memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3237 if(mb_x==b_stride-1)
3238 for(y=0; y<b_w*2; y++)
3239 memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3240 if(mb_y==0){
3241 for(x=0; x<b_w*2; x++)
3242 obmc_edged[0][x] += obmc_edged[b_w-1][x];
3243 for(y=1; y<b_w; y++)
3244 memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3245 }
3246 if(mb_y==b_height-1){
3247 for(x=0; x<b_w*2; x++)
3248 obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3249 for(y=b_w; y<b_w*2-1; y++)
3250 memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3251 }
3252 }
3253
3254 //skip stuff outside the picture
3255 if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3256 {
3257 uint8_t *src= s-> input_picture.data[0];
3258 uint8_t *dst= s->current_picture.data[0];
3259 const int stride= s->current_picture.linesize[0];
3260 const int block_w= MB_SIZE >> s->block_max_depth;
3261 const int sx= block_w*mb_x - block_w/2;
3262 const int sy= block_w*mb_y - block_w/2;
3263 const int w= s->plane[0].width;
3264 const int h= s->plane[0].height;
3265 int y;
3266
3267 for(y=sy; y<0; y++)
3268 memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3269 for(y=h; y<sy+block_w*2; y++)
3270 memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3271 if(sx<0){
3272 for(y=sy; y<sy+block_w*2; y++)
3273 memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3274 }
3275 if(sx+block_w*2 > w){
3276 for(y=sy; y<sy+block_w*2; y++)
3277 memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3278 }
3279 }
3280
3281 // intra(black) = neighbors' contribution to the current block
3282 for(i=0; i<3; i++)
3283 color[i]= get_dc(s, mb_x, mb_y, i);
3284
3195 // get previous score (cant be cached due to OBMC) 3285 // get previous score (cant be cached due to OBMC)
3196 check_block_inter(s, mb_x, mb_y, block->mx, block->my, 0, &best_rd); 3286 check_block_inter(s, mb_x, mb_y, block->mx, block->my, 0, *obmc_edged, &best_rd);
3197 check_block_inter(s, mb_x, mb_y, 0, 0, 0, &best_rd); 3287 check_block_inter(s, mb_x, mb_y, 0, 0, 0, *obmc_edged, &best_rd);
3198 check_block_inter(s, mb_x, mb_y, tb->mx, tb->my, 0, &best_rd); 3288 check_block_inter(s, mb_x, mb_y, tb->mx, tb->my, 0, *obmc_edged, &best_rd);
3199 check_block_inter(s, mb_x, mb_y, lb->mx, lb->my, 0, &best_rd); 3289 check_block_inter(s, mb_x, mb_y, lb->mx, lb->my, 0, *obmc_edged, &best_rd);
3200 check_block_inter(s, mb_x, mb_y, rb->mx, rb->my, 0, &best_rd); 3290 check_block_inter(s, mb_x, mb_y, rb->mx, rb->my, 0, *obmc_edged, &best_rd);
3201 check_block_inter(s, mb_x, mb_y, bb->mx, bb->my, 0, &best_rd); 3291 check_block_inter(s, mb_x, mb_y, bb->mx, bb->my, 0, *obmc_edged, &best_rd);
3202 3292
3203 /* fullpel ME */ 3293 /* fullpel ME */
3204 //FIXME avoid subpel interpol / round to nearest integer 3294 //FIXME avoid subpel interpol / round to nearest integer
3205 do{ 3295 do{
3206 dia_change=0; 3296 dia_change=0;
3207 for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){ 3297 for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3208 for(j=0; j<i; j++){ 3298 for(j=0; j<i; j++){
3209 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), 0, &best_rd); 3299 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), 0, *obmc_edged, &best_rd);
3210 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), 0, &best_rd); 3300 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), 0, *obmc_edged, &best_rd);
3211 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), 0, &best_rd); 3301 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), 0, *obmc_edged, &best_rd);
3212 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), 0, &best_rd); 3302 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), 0, *obmc_edged, &best_rd);
3213 } 3303 }
3214 } 3304 }
3215 }while(dia_change); 3305 }while(dia_change);
3216 /* subpel ME */ 3306 /* subpel ME */
3217 do{ 3307 do{
3218 static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},}; 3308 static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3219 dia_change=0; 3309 dia_change=0;
3220 for(i=0; i<8; i++) 3310 for(i=0; i<8; i++)
3221 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], 0, &best_rd); 3311 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], 0, *obmc_edged, &best_rd);
3222 }while(dia_change); 3312 }while(dia_change);
3223 //FIXME or try the standard 2 pass qpel or similar 3313 //FIXME or try the standard 2 pass qpel or similar
3224 #if 1 3314 #if 1
3225 for(i=0; i<3; i++){ 3315 check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3226 color[i]= get_dc(s, mb_x, mb_y, i);
3227 }
3228 check_block(s, mb_x, mb_y, color, 1, &best_rd);
3229 //FIXME RD style color selection 3316 //FIXME RD style color selection
3230 #endif 3317 #endif
3231 if(!same_block(block, &backup)){ 3318 if(!same_block(block, &backup)){
3232 if(tb != &null_block) tb ->type &= ~BLOCK_OPT; 3319 if(tb != &null_block) tb ->type &= ~BLOCK_OPT;
3233 if(lb != &null_block) lb ->type &= ~BLOCK_OPT; 3320 if(lb != &null_block) lb ->type &= ~BLOCK_OPT;
3717 s->m.bit_rate= avctx->bit_rate; 3804 s->m.bit_rate= avctx->bit_rate;
3718 3805
3719 s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t)); 3806 s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3720 s->m.me.map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t)); 3807 s->m.me.map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3721 s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t)); 3808 s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3809 s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3722 h263_encode_init(&s->m); //mv_penalty 3810 h263_encode_init(&s->m); //mv_penalty
3723 3811
3724 if(avctx->flags&CODEC_FLAG_PASS1){ 3812 if(avctx->flags&CODEC_FLAG_PASS1){
3725 if(!avctx->stats_out) 3813 if(!avctx->stats_out)
3726 avctx->stats_out = av_mallocz(256); 3814 avctx->stats_out = av_mallocz(256);
3997 av_freep(&s->spatial_dwt_buffer); 4085 av_freep(&s->spatial_dwt_buffer);
3998 4086
3999 av_freep(&s->m.me.scratchpad); 4087 av_freep(&s->m.me.scratchpad);
4000 av_freep(&s->m.me.map); 4088 av_freep(&s->m.me.map);
4001 av_freep(&s->m.me.score_map); 4089 av_freep(&s->m.me.score_map);
4090 av_freep(&s->m.obmc_scratchpad);
4002 4091
4003 av_freep(&s->block); 4092 av_freep(&s->block);
4004 4093
4005 for(plane_index=0; plane_index<3; plane_index++){ 4094 for(plane_index=0; plane_index<3; plane_index++){
4006 for(level=s->spatial_decomposition_count-1; level>=0; level--){ 4095 for(level=s->spatial_decomposition_count-1; level>=0; level--){