changeset 1013:5d4c95f323d0 libavcodec

fine-tuning thresholds/factors, nicer MB decision, a few minor improvements & fixes
author michaelni
date Sun, 19 Jan 2003 17:55:13 +0000
parents 7a5038ec769b
children 48349e11c9b2
files dsputil.c h263.c i386/mpegvideo_mmx_template.c mem.c motion_est.c motion_est_template.c mpegvideo.c mpegvideo.h msmpeg4data.h
diffstat 9 files changed, 376 insertions(+), 116 deletions(-) [+]
line wrap: on
line diff
--- a/dsputil.c	Sun Jan 19 12:06:36 2003 +0000
+++ b/dsputil.c	Sun Jan 19 17:55:13 2003 +0000
@@ -1807,7 +1807,7 @@
     
     memcpy(bak, temp, 64*sizeof(DCTELEM));
     
-    s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
+    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
     s->dct_unquantize(s, temp, 0, s->qscale);
     simple_idct(temp); //FIXME 
     
@@ -1826,19 +1826,7 @@
     const int esc_length= s->ac_esc_length;
     uint8_t * length;
     uint8_t * last_length;
-
-    s->mb_intra=0;
     
-    if (s->mb_intra) {
-        start_i = 1;
-        length     = s->intra_ac_vlc_length;
-        last_length= s->intra_ac_vlc_last_length;
-    } else {
-        start_i = 0;
-        length     = s->inter_ac_vlc_length;
-        last_length= s->inter_ac_vlc_last_length;
-    }
-
     for(i=0; i<8; i++){
         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
@@ -1846,10 +1834,22 @@
 
     s->dsp.diff_pixels(temp, src1, src2, stride);
 
-    last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
+    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
+
+    bits=0;
     
-    bits=0;
-    if(last>=0){
+    if (s->mb_intra) {
+        start_i = 1; 
+        length     = s->intra_ac_vlc_length;
+        last_length= s->intra_ac_vlc_last_length;
+        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
+    } else {
+        start_i = 0;
+        length     = s->inter_ac_vlc_length;
+        last_length= s->inter_ac_vlc_last_length;
+    }
+    
+    if(last>=start_i){
         run=0;
         for(i=start_i; i<last; i++){
             int j= scantable[i];
@@ -1876,6 +1876,9 @@
         }else
             bits+= esc_length;
     
+    }
+
+    if(last>=0){
         s->dct_unquantize(s, temp, 0, s->qscale);
     }
     
@@ -1883,7 +1886,7 @@
     
     distoration= s->dsp.sse[1](NULL, bak, src1, stride);
 
-    return distoration + ((bits*s->qscale*s->qscale*105 + 64)>>7);
+    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
 }
 
 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
@@ -1894,25 +1897,25 @@
     const int esc_length= s->ac_esc_length;
     uint8_t * length;
     uint8_t * last_length;
+    
+    s->dsp.diff_pixels(temp, src1, src2, stride);
 
-    s->mb_intra=0;
+    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
+
+    bits=0;
     
     if (s->mb_intra) {
-        start_i = 1;
+        start_i = 1; 
         length     = s->intra_ac_vlc_length;
         last_length= s->intra_ac_vlc_last_length;
+        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
     } else {
         start_i = 0;
         length     = s->inter_ac_vlc_length;
         last_length= s->inter_ac_vlc_last_length;
     }
-
-    s->dsp.diff_pixels(temp, src1, src2, stride);
-
-    last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
     
-    bits=0;
-    if(last>=0){
+    if(last>=start_i){
         run=0;
         for(i=start_i; i<last; i++){
             int j= scantable[i];
@@ -1929,10 +1932,11 @@
                 run++;
         }
         i= scantable[last];
+                
+        level= temp[i] + 64;
         
-        assert(level);
+        assert(level - 64);
         
-        level= temp[i] + 64;
         if((level&(~127)) == 0){
             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
         }else
--- a/h263.c	Sun Jan 19 12:06:36 2003 +0000
+++ b/h263.c	Sun Jan 19 17:55:13 2003 +0000
@@ -74,8 +74,10 @@
 
 extern UINT32 inverse[256];
 
-static UINT16 uni_DCtab_lum  [512][2];
-static UINT16 uni_DCtab_chrom[512][2];
+static UINT8 uni_DCtab_lum_len[512];
+static UINT8 uni_DCtab_chrom_len[512];
+static UINT16 uni_DCtab_lum_bits[512];
+static UINT16 uni_DCtab_chrom_bits[512];
 
 #ifdef CONFIG_ENCODERS
 static UINT16 (*mv_penalty)[MAX_MV*2+1]= NULL;
@@ -1309,8 +1311,8 @@
                 uni_len++;
             }
         }
-        uni_DCtab_lum[level+256][0]= uni_code;
-        uni_DCtab_lum[level+256][1]= uni_len;
+        uni_DCtab_lum_bits[level+256]= uni_code;
+        uni_DCtab_lum_len [level+256]= uni_len;
 
         /* chrominance */
         uni_code= DCtab_chrom[size][0];
@@ -1324,8 +1326,8 @@
                 uni_len++;
             }
         }
-        uni_DCtab_chrom[level+256][0]= uni_code;
-        uni_DCtab_chrom[level+256][1]= uni_len;
+        uni_DCtab_chrom_bits[level+256]= uni_code;
+        uni_DCtab_chrom_len [level+256]= uni_len;
 
     }
 }
@@ -1446,6 +1448,8 @@
         s->intra_ac_vlc_last_length= uni_mpeg4_intra_rl_len + 128*64;
         s->inter_ac_vlc_length     = uni_mpeg4_inter_rl_len;
         s->inter_ac_vlc_last_length= uni_mpeg4_inter_rl_len + 128*64;
+        s->luma_dc_vlc_length= uni_DCtab_lum_len;
+        s->chroma_dc_vlc_length= uni_DCtab_chrom_len;
         s->ac_esc_length= 7+2+1+6+1+12+1;
         break;
     case CODEC_ID_H263P:
@@ -1957,10 +1961,10 @@
     level+=256;
     if (n < 4) {
 	/* luminance */
-	put_bits(s, uni_DCtab_lum[level][1], uni_DCtab_lum[level][0]);
+	put_bits(s, uni_DCtab_lum_len[level], uni_DCtab_lum_bits[level]);
     } else {
 	/* chrominance */
-	put_bits(s, uni_DCtab_chrom[level][1], uni_DCtab_chrom[level][0]);
+	put_bits(s, uni_DCtab_chrom_len[level], uni_DCtab_chrom_bits[level]);
     }
 #else
     int size, v;
--- a/i386/mpegvideo_mmx_template.c	Sun Jan 19 12:06:36 2003 +0000
+++ b/i386/mpegvideo_mmx_template.c	Sun Jan 19 17:55:13 2003 +0000
@@ -53,8 +53,7 @@
         if (!s->h263_aic) {
 #if 1
         asm volatile (
-        	"xorl %%edx, %%edx	\n\t"
-        	"mul %%ecx		\n\t"
+        	"imul %%ecx		\n\t"
         	: "=d" (level), "=a"(dummy)
         	: "a" ((block[0]>>2) + q), "c" (inverse[q<<1])
         );
--- a/mem.c	Sun Jan 19 12:06:36 2003 +0000
+++ b/mem.c	Sun Jan 19 17:55:13 2003 +0000
@@ -29,6 +29,9 @@
 void *av_malloc(unsigned int size)
 {
     void *ptr;
+    
+//    if(size==0) return NULL;
+    
 #if defined (HAVE_MEMALIGN)
     ptr = memalign(16,size);
     /* Why 64? 
--- a/motion_est.c	Sun Jan 19 12:06:36 2003 +0000
+++ b/motion_est.c	Sun Jan 19 17:55:13 2003 +0000
@@ -305,20 +305,21 @@
 };
 
 static inline int get_penalty_factor(MpegEncContext *s, int type){
-
-    switch(type){
+    switch(type&0xFF){
     default:
     case FF_CMP_SAD:
-        return s->qscale;
+        return s->qscale*2;
     case FF_CMP_DCT:
+        return s->qscale*3;
     case FF_CMP_SATD:
+        return s->qscale*6;
     case FF_CMP_SSE:
-    case FF_CMP_PSNR:
-        return s->qscale*8;
+        return s->qscale*s->qscale*2;
     case FF_CMP_BIT:
         return 1;
     case FF_CMP_RD:
-        return (s->qscale*s->qscale*105 + 64)>>7;
+    case FF_CMP_PSNR:
+        return (s->qscale*s->qscale*185 + 64)>>7;
     }
 }
 
@@ -336,7 +337,9 @@
     }else{
         if(s->avctx->me_sub_cmp&FF_CMP_CHROMA)
             s->me.sub_motion_search= simple_chroma_hpel_motion_search;
-        else if(s->avctx->me_sub_cmp == FF_CMP_SAD && s->avctx->me_cmp == FF_CMP_SAD)
+        else if(   s->avctx->me_sub_cmp == FF_CMP_SAD 
+                && s->avctx->    me_cmp == FF_CMP_SAD 
+                && s->avctx->    mb_cmp == FF_CMP_SAD)
             s->me.sub_motion_search= sad_hpel_motion_search;
         else
             s->me.sub_motion_search= simple_hpel_motion_search;
@@ -355,6 +358,18 @@
     }else{
         s->me.pre_motion_search= simple_epzs_motion_search;
     }
+    
+    if(s->flags&CODEC_FLAG_QPEL){
+        if(s->avctx->mb_cmp&FF_CMP_CHROMA)
+            s->me.get_mb_score= simple_chroma_qpel_get_mb_score;
+        else
+            s->me.get_mb_score= simple_qpel_get_mb_score;
+    }else{
+        if(s->avctx->mb_cmp&FF_CMP_CHROMA)
+            s->me.get_mb_score= simple_chroma_hpel_get_mb_score;
+        else
+            s->me.get_mb_score= simple_hpel_get_mb_score;
+    }
 }
       
 static int pix_dev(UINT8 * pix, int line_size, int mean)
@@ -788,12 +803,11 @@
     }
 }
 
-static inline int mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, int ymax, int mx, int my, int shift)
+static inline int h263_mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, int ymax, int mx, int my, int shift)
 {
     int block;
     int P[10][2];
-    uint8_t *ref_picture= s->last_picture.data[0];
-    int dmin_sum=0;
+    int dmin_sum=0, mx4_sum=0, my4_sum=0;
     uint16_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
 
     for(block=0; block<4; block++){
@@ -838,13 +852,15 @@
             P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
             P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
 
-            if(s->out_format == FMT_H263){
+//            if(s->out_format == FMT_H263){
                 pred_x4 = P_MEDIAN[0];
                 pred_y4 = P_MEDIAN[1];
+#if 0
             }else { /* mpeg1 at least */
                 pred_x4= P_LEFT[0];
                 pred_y4= P_LEFT[1];
             }
+#endif
         }
         P_MV1[0]= mx;
         P_MV1[1]= my;
@@ -855,11 +871,79 @@
         dmin4= s->me.sub_motion_search(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, 
 					  pred_x4, pred_y4, &s->last_picture, block, 1, mv_penalty);
  
+        if(s->dsp.me_sub_cmp != s->dsp.mb_cmp){
+            int dxy;
+            const int offset= ((block&1) + (block>>1)*s->linesize)*8;
+            uint8_t *dest_y = s->me.scratchpad + offset;
+
+            if(s->quarter_sample){
+                uint8_t *ref= s->last_picture.data[0] + (s->mb_x*16 + (mx4>>2)) + (s->mb_y*16 + (my4>>2))*s->linesize + offset;
+                dxy = ((my4 & 3) << 2) | (mx4 & 3);
+
+                if(s->no_rounding)
+                    s->dsp.put_no_rnd_qpel_pixels_tab[0][dxy](dest_y   , ref    , s->linesize);
+                else
+                    s->dsp.put_qpel_pixels_tab       [0][dxy](dest_y   , ref    , s->linesize);
+            }else{
+                uint8_t *ref= s->last_picture.data[0] + (s->mb_x*16 + (mx4>>1)) + (s->mb_y*16 + (my4>>1))*s->linesize + offset;
+                dxy = ((my4 & 1) << 1) | (mx4 & 1);
+
+                if(s->no_rounding)
+                    s->dsp.put_no_rnd_pixels_tab[0][dxy](dest_y    , ref    , s->linesize, 16);
+                else
+                    s->dsp.put_pixels_tab       [0][dxy](dest_y    , ref    , s->linesize, 16);
+            }
+            dmin_sum+= (mv_penalty[mx4-pred_x4] + mv_penalty[my4-pred_y4])*s->me.mb_penalty_factor;
+        }else
+            dmin_sum+= dmin4;
+
+        if(s->quarter_sample){
+            mx4_sum+= mx4/2;
+            my4_sum+= my4/2;
+        }else{
+            mx4_sum+= mx4;
+            my4_sum+= my4;
+        }
+            
         s->motion_val[ s->block_index[block] ][0]= mx4;
         s->motion_val[ s->block_index[block] ][1]= my4;
-        dmin_sum+= dmin4;
+    }
+    
+    if(s->dsp.me_sub_cmp != s->dsp.mb_cmp){
+        dmin_sum += s->dsp.mb_cmp[0](s, s->new_picture.data[0] + s->mb_x*16 + s->mb_y*16*s->linesize, s->me.scratchpad, s->linesize);
     }
-    return dmin_sum;
+    
+    if(s->avctx->mb_cmp&FF_CMP_CHROMA){
+        int dxy;
+        int mx, my;
+        int offset;
+
+        mx= ff_h263_round_chroma(mx4_sum);
+        my= ff_h263_round_chroma(my4_sum);
+        dxy = ((my & 1) << 1) | (mx & 1);
+        
+        offset= (s->mb_x*8 + (mx>>1)) + (s->mb_y*8 + (my>>1))*s->uvlinesize;
+       
+        if(s->no_rounding){
+            s->dsp.put_no_rnd_pixels_tab[1][dxy](s->me.scratchpad    , s->last_picture.data[1] + offset, s->uvlinesize, 8);
+            s->dsp.put_no_rnd_pixels_tab[1][dxy](s->me.scratchpad+8  , s->last_picture.data[2] + offset, s->uvlinesize, 8);
+        }else{
+            s->dsp.put_pixels_tab       [1][dxy](s->me.scratchpad    , s->last_picture.data[1] + offset, s->uvlinesize, 8);
+            s->dsp.put_pixels_tab       [1][dxy](s->me.scratchpad+8  , s->last_picture.data[2] + offset, s->uvlinesize, 8);
+        }
+
+        dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.data[1] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, s->me.scratchpad  , s->uvlinesize);
+        dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.data[2] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, s->me.scratchpad+8, s->uvlinesize);
+    }
+
+    switch(s->avctx->mb_cmp&0xFF){
+    /*case FF_CMP_SSE:
+        return dmin_sum+ 32*s->qscale*s->qscale;*/
+    case FF_CMP_RD:
+        return dmin_sum;
+    default:
+        return dmin_sum+ 11*s->me.mb_penalty_factor;
+    }
 }
 
 void ff_estimate_p_frame_motion(MpegEncContext * s,
@@ -881,6 +965,7 @@
 
     s->me.penalty_factor    = get_penalty_factor(s, s->avctx->me_cmp);
     s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp);
+    s->me.mb_penalty_factor = get_penalty_factor(s, s->avctx->mb_cmp);
 
     get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, s->f_code);
     rel_xmin= xmin - mb_x*16;
@@ -971,6 +1056,7 @@
     pic->mb_var   [s->mb_width * mb_y + mb_x] = varc;
     pic->mc_mb_var[s->mb_width * mb_y + mb_x] = vard;
     pic->mb_mean  [s->mb_width * mb_y + mb_x] = (sum+128)>>8;
+//    pic->mb_cmp_score[s->mb_width * mb_y + mb_x] = dmin; 
     pic->mb_var_sum    += varc;
     pic->mc_mb_var_sum += vard;
 //printf("E%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
@@ -997,44 +1083,36 @@
         }
         if((s->flags&CODEC_FLAG_4MV)
            && !s->me.skip && varc>50 && vard>10){
-            mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift);
+            h263_mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift);
             mb_type|=MB_TYPE_INTER4V;
 
             set_p_mv_tables(s, mx, my, 0);
         }else
             set_p_mv_tables(s, mx, my, 1);
     }else{
+        mb_type= MB_TYPE_INTER;
+
+        dmin= s->me.sub_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
+                                    pred_x, pred_y, &s->last_picture, 0, 0, mv_penalty);
+        
+        if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
+            dmin= s->me.get_mb_score(s, mx, my, pred_x, pred_y, &s->last_picture, mv_penalty);
+
+        if((s->flags&CODEC_FLAG_4MV)
+           && !s->me.skip && varc>50 && vard>10){
+            int dmin4= h263_mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift);
+            if(dmin4 < dmin){
+                mb_type= MB_TYPE_INTER4V;
+                dmin=dmin4;
+            }
+        }
+        pic->mb_cmp_score[s->mb_width * mb_y + mb_x] = dmin; 
+        set_p_mv_tables(s, mx, my, mb_type!=MB_TYPE_INTER4V);
+        
         if (vard <= 64 || vard < varc) {
-//        if (sadP <= 32 || sadP < sadI + 500) {
             s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
-            mb_type|= MB_TYPE_INTER;
-            if (s->me_method != ME_ZERO) {
-                dmin= s->me.sub_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
-                                            pred_x, pred_y, &s->last_picture, 0, 0, mv_penalty);
-                if((s->flags&CODEC_FLAG_4MV)
-                   && !s->me.skip && varc>50 && vard>10){
-                    int dmin4= mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift);
-                    if(dmin4 + 128 <dmin)
-                        mb_type= MB_TYPE_INTER4V;
-                }
-                set_p_mv_tables(s, mx, my, mb_type!=MB_TYPE_INTER4V);
-
-            } else {
-                mx <<=shift;
-                my <<=shift;
-            }
-#if 0
-            if (vard < 10) {
-                skip++;
-                fprintf(stderr,"\nEarly skip: %d vard: %2d varc: %5d dmin: %d", 
-                                skip, vard, varc, dmin);
-            }
-#endif
         }else{
             s->scene_change_score+= s->qscale;
-            mb_type|= MB_TYPE_INTRA;
-            mx = 0;
-            my = 0;
         }
     }
 
@@ -1117,6 +1195,7 @@
         
     s->me.penalty_factor    = get_penalty_factor(s, s->avctx->me_cmp);
     s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp);
+    s->me.mb_penalty_factor = get_penalty_factor(s, s->avctx->mb_cmp);
 
     get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, f_code);
     rel_xmin= xmin - mb_x*16;
@@ -1186,6 +1265,10 @@
     
     dmin= s->me.sub_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
 				   pred_x, pred_y, picture, 0, 0, mv_penalty);
+                                   
+    if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
+        dmin= s->me.get_mb_score(s, mx, my, pred_x, pred_y, picture, mv_penalty);
+
 //printf("%d %d %d %d//", s->mb_x, s->mb_y, mx, my);
 //    s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
     mv_table[mot_xy][0]= mx;
@@ -1249,10 +1332,14 @@
         s->dsp.avg_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
     }
 
-    fbmin = (mv_penalty[motion_fx-pred_fx] + mv_penalty[motion_fy-pred_fy])*s->me.sub_penalty_factor
-           +(mv_penalty[motion_bx-pred_bx] + mv_penalty[motion_by-pred_by])*s->me.sub_penalty_factor;
-           + s->dsp.me_sub_cmp[0](s, s->new_picture.data[0] + mb_x*16 + mb_y*16*s->linesize, dest_y, s->linesize);
-
+    fbmin = (mv_penalty[motion_fx-pred_fx] + mv_penalty[motion_fy-pred_fy])*s->me.mb_penalty_factor
+           +(mv_penalty[motion_bx-pred_bx] + mv_penalty[motion_by-pred_by])*s->me.mb_penalty_factor
+           + s->dsp.mb_cmp[0](s, s->new_picture.data[0] + mb_x*16 + mb_y*16*s->linesize, dest_y, s->linesize);
+           
+    if(s->avctx->mb_cmp&FF_CMP_CHROMA){
+    }
+    //FIXME CHROMA !!!
+           
     return fbmin;
 }
 
@@ -1356,17 +1443,24 @@
         P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
         P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
     }
-    
+ 
+    //FIXME direct_search  ptr in context!!! (needed for chroma anyway or this will get messy)   
     if(s->flags&CODEC_FLAG_QPEL){
         dmin = simple_direct_qpel_epzs_motion_search(s, 0, &mx, &my, P, 0, 0, xmin, ymin, xmax, ymax, 
                                                      &s->last_picture, mv_table, 1<<14, mv_penalty);
         dmin = simple_direct_qpel_qpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax,
                                                 0, 0, &s->last_picture, 0, 0, mv_penalty);
+        
+        if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
+            dmin= simple_direct_qpel_qpel_get_mb_score(s, mx, my, 0, 0, &s->last_picture, mv_penalty);
     }else{
         dmin = simple_direct_hpel_epzs_motion_search(s, 0, &mx, &my, P, 0, 0, xmin, ymin, xmax, ymax, 
                                                      &s->last_picture, mv_table, 1<<15, mv_penalty);
         dmin = simple_direct_hpel_hpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax,
                                                 0, 0, &s->last_picture, 0, 0, mv_penalty);
+
+        if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
+            dmin= simple_direct_hpel_hpel_get_mb_score(s, mx, my, 0, 0, &s->last_picture, mv_penalty);
     }
 
     s->b_direct_mv_table[mot_xy][0]= mx;
@@ -1377,18 +1471,18 @@
 void ff_estimate_b_frame_motion(MpegEncContext * s,
                              int mb_x, int mb_y)
 {
-    const int penalty_factor= s->me.penalty_factor;
+    const int penalty_factor= s->me.mb_penalty_factor;
     int fmin, bmin, dmin, fbmin;
     int type=0;
     
     dmin= direct_search(s, mb_x, mb_y);
 
-    fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, &s->last_picture, s->f_code);
-    bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, &s->next_picture, s->b_code) - penalty_factor;
+    fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, &s->last_picture, s->f_code) + 3*penalty_factor;
+    bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, &s->next_picture, s->b_code) + 2*penalty_factor;
 //printf(" %d %d ", s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]);
 
-    fbmin= bidir_refine(s, mb_x, mb_y);
-
+    fbmin= bidir_refine(s, mb_x, mb_y) + penalty_factor;
+//printf("%d %d %d %d\n", dmin, fmin, bmin, fbmin);
     {
         int score= dmin;
         type=MB_TYPE_DIRECT;
@@ -1405,9 +1499,10 @@
             score=fbmin;
             type= MB_TYPE_BIDIR;
         }
+        
         score= ((unsigned)(score*score + 128*256))>>16;
         s->current_picture.mc_mb_var_sum += score;
-        s->current_picture.mc_mb_var[mb_y*s->mb_width + mb_x] = score; //FIXME use SSD
+        s->current_picture.mc_mb_var[mb_y*s->mb_width + mb_x] = score; //FIXME use SSE
     }
 
     if(s->flags&CODEC_FLAG_HQ){
--- a/motion_est_template.c	Sun Jan 19 12:06:36 2003 +0000
+++ b/motion_est_template.c	Sun Jan 19 17:55:13 2003 +0000
@@ -39,7 +39,7 @@
     qpel_mc_func (*qpel_put)[16];\
     qpel_mc_func (*qpel_avg)[16]= &s->dsp.avg_qpel_pixels_tab[size];\
     const __attribute__((unused)) int unu= time_pp + time_pb + (int)src_u + (int)src_v + (int)ref_u + (int)ref_v\
-                                           + (int)ref2_y + (int)hpel_avg + (int)qpel_avg;\
+                                           + (int)ref2_y + (int)hpel_avg + (int)qpel_avg + (int)score_map;\
     if(s->no_rounding /*FIXME b_type*/){\
         hpel_put= &s->dsp.put_no_rnd_pixels_tab[size];\
         chroma_hpel_put= &s->dsp.put_no_rnd_pixels_tab[size+1];\
@@ -144,6 +144,7 @@
     const int my = *my_ptr;   
     const int penalty_factor= s->me.sub_penalty_factor;
     me_cmp_func cmp_sub, chroma_cmp_sub;
+    int bx=2*mx, by=2*my;
 
     LOAD_COMMON(xx, yy);
     
@@ -166,7 +167,6 @@
         
     if (mx > xmin && mx < xmax && 
         my > ymin && my < ymax) {
-        int bx=2*mx, by=2*my;
         int d= dmin;
         const int index= (my<<ME_MAP_SHIFT) + mx;
         const int t= score_map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] 
@@ -178,7 +178,7 @@
         const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)]
                      + (mv_penalty[bx   - pred_x] + mv_penalty[by+2 - pred_y])*s->me.penalty_factor;
     
-#if 0
+#if 1
         int key;
         int map_generation= s->me.map_generation;
         uint32_t *map= s->me.map;
@@ -231,20 +231,50 @@
             CHECK_HALF_MV(0, 1, mx  , my)
         }
         assert(bx >= xmin*2 && bx <= xmax*2 && by >= ymin*2 && by <= ymax*2);
-
-        *mx_ptr = bx;
-        *my_ptr = by;
-    }else{
-        *mx_ptr =2*mx;
-        *my_ptr =2*my;
     }
 
+    *mx_ptr = bx;
+    *my_ptr = by;
+    
     return dmin;
 }
 #endif
 
+static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pred_x, int pred_y, Picture *ref_picture, 
+                                  uint16_t * const mv_penalty)
+{ // Scores vector (mx,my) for the current macroblock with the s->dsp.mb_cmp comparators (via CMP_HPEL) and adds the weighted MV penalty relative to (pred_x,pred_y).
+//    const int check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp;
+    const int size= 0; // index into s->dsp.mb_cmp[]; size+1 selects the chroma comparator below
+    const int xx = 16 * s->mb_x; // macroblock index scaled by 16, handed to LOAD_COMMON
+    const int yy = 16 * s->mb_y;
+    const int penalty_factor= s->me.mb_penalty_factor; // weight applied to the mv_penalty[] terms
+    const int xmin= -256*256, ymin= -256*256, xmax= 256*256, ymax= 256*256; //very wide limits: this function assumes the caller already checked the vector range
+    const __attribute__((unused)) int unu2= xmin + xmax +ymin + ymax; //references the limits only to silence unused-variable warnings
+    me_cmp_func cmp_sub, chroma_cmp_sub;
+    int d;
+
+    LOAD_COMMON(xx, yy);
+    // NOTE(review): LOAD_COMMON and CMP_HPEL are macros defined outside this hunk; presumably they use the locals above by name - confirm before renaming any of them
+ //FIXME factorize
+
+    cmp_sub= s->dsp.mb_cmp[size];
+    chroma_cmp_sub= s->dsp.mb_cmp[size+1];
+    // preconditions: the MB was not marked as skipped and mb_cmp differs from me_sub_cmp
+    assert(!s->me.skip);
+    assert(s->avctx->me_sub_cmp != s->avctx->mb_cmp);
+
+    CMP_HPEL(d, mx&1, my&1, mx>>1, my>>1, size); // passes the low bit and the remaining bits of each component separately (presumably half-pel fraction and full-pel part - confirm against the macro)
+    //FIXME check cbp before adding penalty for (0,0) vector
+    if(mx || my || size>0) // size is 0 here, so the MV penalty is omitted only for the (0,0) vector
+        d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor;
+        
+    return d;
+}
+
 #endif /* CMP_HPEL */
 
+
+
 #ifdef CMP_QPEL
 
 #define CHECK_QUARTER_MV(dx, dy, x, y)\
@@ -477,6 +507,37 @@
     return dmin;
 }
 
+static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pred_x, int pred_y, Picture *ref_picture, 
+                                  uint16_t * const mv_penalty)
+{ // Scores vector (mx,my) for the current macroblock with the s->dsp.mb_cmp comparators (via CMP_QPEL) and adds the weighted MV penalty relative to (pred_x,pred_y).
+    const int size= 0; // index into s->dsp.mb_cmp[]; size+1 selects the chroma comparator below
+    const int xx = 16 * s->mb_x; // macroblock index scaled by 16, handed to LOAD_COMMON
+    const int yy = 16 * s->mb_y;
+    const int penalty_factor= s->me.mb_penalty_factor; // weight applied to the mv_penalty[] terms
+    const int xmin= -256*256, ymin= -256*256, xmax= 256*256, ymax= 256*256; //very wide limits: this function assumes the caller already checked the vector range
+    const __attribute__((unused)) int unu2= xmin + xmax +ymin + ymax; //references the limits only to silence unused-variable warnings
+    me_cmp_func cmp_sub, chroma_cmp_sub;
+    int d;
+
+    LOAD_COMMON(xx, yy);
+    // NOTE(review): LOAD_COMMON and CMP_QPEL are macros defined outside this hunk; presumably they use the locals above by name - confirm before renaming any of them
+ //FIXME factorize
+
+    cmp_sub= s->dsp.mb_cmp[size];
+    chroma_cmp_sub= s->dsp.mb_cmp[size+1];
+    // preconditions: the MB was not marked as skipped and mb_cmp differs from me_sub_cmp
+    assert(!s->me.skip);
+    assert(s->avctx->me_sub_cmp != s->avctx->mb_cmp);
+
+    CMP_QPEL(d, mx&3, my&3, mx>>2, my>>2, size); // passes the low two bits and the remaining bits of each component separately (presumably quarter-pel fraction and full-pel part - confirm against the macro)
+    //FIXME check cbp before adding penalty for (0,0) vector
+    if(mx || my || size>0) // size is 0 here, so the MV penalty is omitted only for the (0,0) vector
+        d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor;
+        
+    return d;
+}
+
+
 #endif /* CMP_QPEL */
 
 #define CHECK_MV(x,y)\
--- a/mpegvideo.c	Sun Jan 19 12:06:36 2003 +0000
+++ b/mpegvideo.c	Sun Jan 19 17:55:13 2003 +0000
@@ -80,6 +80,7 @@
 };
 
 static const uint8_t h263_chroma_roundtab[16] = {
+//  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
     0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
 };
 
@@ -313,6 +314,7 @@
             CHECKED_ALLOCZ(pic->mb_var   , s->mb_num * sizeof(INT16))
             CHECKED_ALLOCZ(pic->mc_mb_var, s->mb_num * sizeof(INT16))
             CHECKED_ALLOCZ(pic->mb_mean  , s->mb_num * sizeof(INT8))
+            CHECKED_ALLOCZ(pic->mb_cmp_score, s->mb_num * sizeof(int32_t))
         }
 
         CHECKED_ALLOCZ(pic->mbskip_table , s->mb_num * sizeof(UINT8)+1) //the +1 is for the slice end check
@@ -338,6 +340,7 @@
     av_freep(&pic->mb_var);
     av_freep(&pic->mc_mb_var);
     av_freep(&pic->mb_mean);
+    av_freep(&pic->mb_cmp_score);
     av_freep(&pic->mbskip_table);
     av_freep(&pic->qscale_table);
     
@@ -1663,6 +1666,14 @@
     pix_op[1][dxy](dest_cr + (dest_offset >> 1), ptr,  uvlinesize, h >> 1);
 }
 
+inline int ff_h263_round_chroma(int x){ // rounds x through h263_chroma_roundtab; odd-symmetric, i.e. the result for -x is the negated result for x
+    if (x >= 0)
+        return  (h263_chroma_roundtab[x & 0xf] + ((x >> 3) & ~1)); // table entry for the low 4 bits plus twice (x>>4)
+    else {
+        x = -x; // round the magnitude, the sign is restored on return
+        return -(h263_chroma_roundtab[x & 0xf] + ((x >> 3) & ~1));
+    }
+}
 
 static inline void MPV_motion(MpegEncContext *s, 
                               UINT8 *dest_y, UINT8 *dest_cb, UINT8 *dest_cr,
@@ -1773,20 +1784,8 @@
         if(s->flags&CODEC_FLAG_GRAY) break;
         /* In case of 8X8, we construct a single chroma motion vector
            with a special rounding */
-        for(i=0;i<4;i++) {
-        }
-        if (mx >= 0)
-            mx = (h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1));
-        else {
-            mx = -mx;
-            mx = -(h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1));
-        }
-        if (my >= 0)
-            my = (h263_chroma_roundtab[my & 0xf] + ((my >> 3) & ~1));
-        else {
-            my = -my;
-            my = -(h263_chroma_roundtab[my & 0xf] + ((my >> 3) & ~1));
-        }
+        mx= ff_h263_round_chroma(mx);
+        my= ff_h263_round_chroma(my);
         dxy = ((my & 1) << 1) | (mx & 1);
         mx >>= 1;
         my >>= 1;
@@ -2796,6 +2795,7 @@
             s->no_rounding ^= 1;          
     }
     /* Estimate motion for every MB */
+    s->mb_intra=0; //for the rate distoration & bit compare functions
     if(s->pict_type != I_TYPE){
         if(s->pict_type != B_TYPE){
             if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){
@@ -2986,7 +2986,7 @@
         s->block_index[4]= s->block_wrap[4]*(mb_y + 1)                    + s->block_wrap[0]*(s->mb_height*2 + 2);
         s->block_index[5]= s->block_wrap[4]*(mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2);
         for(mb_x=0; mb_x < s->mb_width; mb_x++) {
-            const int mb_type= s->mb_type[mb_y * s->mb_width + mb_x];
+            int mb_type= s->mb_type[mb_y * s->mb_width + mb_x];
             const int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1;
 //            int d;
             int dmin=10000000;
@@ -3152,8 +3152,93 @@
                 s->last_bits= get_bit_count(&s->pb);
             } else {
                 int motion_x, motion_y;
+                int intra_score;
+                int inter_score= s->current_picture.mb_cmp_score[mb_x + mb_y*s->mb_width];
+                
+              if(!(s->flags&CODEC_FLAG_HQ) && s->pict_type==P_TYPE){
+                /* get luma score */
+                if((s->avctx->mb_cmp&0xFF)==FF_CMP_SSE){
+                    intra_score= (s->current_picture.mb_var[mb_x + mb_y*s->mb_width]<<8) - 500; //FIXME dont scale it down so we dont have to fix it
+                }else{
+                    uint8_t *dest_y;
+
+                    int mean= s->current_picture.mb_mean[mb_x + mb_y*s->mb_width]; //FIXME
+                    mean*= 0x01010101;
+                    
+                    dest_y  = s->new_picture.data[0] + (mb_y * 16 * s->linesize    ) + mb_x * 16;
+                
+                    for(i=0; i<16; i++){
+                        *(uint32_t*)(&s->me.scratchpad[i*s->linesize+ 0]) = mean;
+                        *(uint32_t*)(&s->me.scratchpad[i*s->linesize+ 4]) = mean;
+                        *(uint32_t*)(&s->me.scratchpad[i*s->linesize+ 8]) = mean;
+                        *(uint32_t*)(&s->me.scratchpad[i*s->linesize+12]) = mean;
+                    }
+
+                    s->mb_intra=1;
+                    intra_score= s->dsp.mb_cmp[0](s, s->me.scratchpad, dest_y, s->linesize);
+                                        
+/*                    printf("intra:%7d inter:%7d var:%7d mc_var.%7d\n", intra_score>>8, inter_score>>8, 
+                        s->current_picture.mb_var[mb_x + mb_y*s->mb_width],
+                        s->current_picture.mc_mb_var[mb_x + mb_y*s->mb_width]);*/
+                }
+                
+                /* get chroma score */
+                if(s->avctx->mb_cmp&FF_CMP_CHROMA){
+                    int i;
+                    
+                    s->mb_intra=1;
+                    for(i=1; i<3; i++){
+                        uint8_t *dest_c;
+                        int mean;
+                        
+                        if(s->out_format == FMT_H263){
+                            mean= (s->dc_val[i][mb_x + (mb_y+1)*(s->mb_width+2)] + 4)>>3; //FIXME not exact but simple ;)
+                        }else{
+                            mean= (s->last_dc[i] + 4)>>3;
+                        }
+                        dest_c = s->new_picture.data[i] + (mb_y * 8  * (s->uvlinesize)) + mb_x * 8;
+                        
+                        mean*= 0x01010101;
+                        for(i=0; i<8; i++){
+                            *(uint32_t*)(&s->me.scratchpad[i*s->uvlinesize+ 0]) = mean;
+                            *(uint32_t*)(&s->me.scratchpad[i*s->uvlinesize+ 4]) = mean;
+                        }
+                        
+                        intra_score+= s->dsp.mb_cmp[1](s, s->me.scratchpad, dest_c, s->uvlinesize);
+                    }                
+                }
+
+                /* bias */
+                switch(s->avctx->mb_cmp&0xFF){
+                default:
+                case FF_CMP_SAD:
+                    intra_score+= 32*s->qscale;
+                    break;
+                case FF_CMP_SSE:
+                    intra_score+= 24*s->qscale*s->qscale;
+                    break;
+                case FF_CMP_SATD:
+                    intra_score+= 96*s->qscale;
+                    break;
+                case FF_CMP_DCT:
+                    intra_score+= 48*s->qscale;
+                    break;
+                case FF_CMP_BIT:
+                    intra_score+= 16;
+                    break;
+                case FF_CMP_PSNR:
+                case FF_CMP_RD:
+                    intra_score+= (s->qscale*s->qscale*109*8 + 64)>>7;
+                    break;
+                }
+
+                if(intra_score < inter_score)
+                    mb_type= MB_TYPE_INTRA;
+              }  
+                
                 s->mv_type=MV_TYPE_16X16;
                 // only one MB-Type possible
+                
                 switch(mb_type){
                 case MB_TYPE_INTRA:
                     s->mv_dir = MV_DIR_FORWARD;
@@ -3383,7 +3468,7 @@
         return last_non_zero;
     }
 
-    lambda= (qscale*qscale*64*82 + 50)/100; //FIXME finetune
+    lambda= (qscale*qscale*64*105 + 64)>>7; //FIXME finetune
         
     score_tab[0]= 0;
     for(i=0; i<=last_non_zero - start_i; i++){
--- a/mpegvideo.h	Sun Jan 19 12:06:36 2003 +0000
+++ b/mpegvideo.h	Sun Jan 19 17:55:13 2003 +0000
@@ -98,7 +98,6 @@
     int last_non_b_pict_type;
 }RateControlContext;
 
-
 typedef struct ScanTable{
     const UINT8 *scantable;
     UINT8 permutated[64];
@@ -117,6 +116,7 @@
     uint16_t *mb_var;           /* Table for MB variances */
     uint16_t *mc_mb_var;        /* Table for motion compensated MB variances */
     uint8_t *mb_mean;           /* Table for MB luminance */
+    int32_t *mb_cmp_score;	/* Table for MB cmp scores, for mb decission */
     int b_frame_score;          /* */
 } Picture;
 
@@ -142,6 +142,7 @@
     int pre_penalty_factor;
     int penalty_factor;
     int sub_penalty_factor;
+    int mb_penalty_factor;
     int pre_pass;                      /* = 1 for the pre pass */
     int dia_size;
     UINT16 (*mv_penalty)[MAX_MV*2+1];  /* amount of bits needed to encode a MV */
@@ -160,6 +161,8 @@
                              int P[10][2], int pred_x, int pred_y,
                              int xmin, int ymin, int xmax, int ymax, Picture *ref_picture, int16_t (*last_mv)[2], 
                              int ref_mv_scale, uint16_t * const mv_penalty);
+    int (*get_mb_score)(struct MpegEncContext * s, int mx, int my, int pred_x, int pred_y, Picture *ref_picture, 
+                                  uint16_t * const mv_penalty);
 }MotionEstContext;
 
 typedef struct MpegEncContext {
@@ -321,6 +324,8 @@
     uint8_t *intra_ac_vlc_last_length;
     uint8_t *inter_ac_vlc_length;
     uint8_t *inter_ac_vlc_last_length;
+    uint8_t *luma_dc_vlc_length;
+    uint8_t *chroma_dc_vlc_length;
 #define UNI_AC_ENC_INDEX(run,level) ((run)*128 + (level))
 
     /* precomputed matrix (combine qscale and DCT renorm) */
@@ -719,6 +724,7 @@
 int ff_h263_resync(MpegEncContext *s);
 int ff_h263_get_gob_height(MpegEncContext *s);
 void ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my);
+inline int ff_h263_round_chroma(int x);
 
 
 /* rv10.c */
--- a/msmpeg4data.h	Sun Jan 19 12:06:36 2003 +0000
+++ b/msmpeg4data.h	Sun Jan 19 17:55:13 2003 +0000
@@ -1868,7 +1868,10 @@
 };
 
 static const uint8_t table_inter_intra[4][2]={
-    {0,1},{2,2},{6,3},{7,3}
+    {0,1} /*Luma-Left Chroma-Left*/,
+    {2,2} /*Luma-Top  Chroma-Left*/,
+    {6,3} /*luma-Left Chroma-Top */,
+    {7,3} /*luma-Top  Chroma-Top */
 };
 
 #define WMV2_INTER_CBP_TABLE_COUNT 4