changeset 455:7f2d2be2aee9 libavcodec

dont double check vectors more correct EPZS implementation much faster halfpel ME simplifications / reducing duplicate code 4MV in LQ mode
author michaelni
date Sun, 02 Jun 2002 12:18:18 +0000
parents eda22d628b2d
children c006ee838856
files motion_est.c
diffstat 1 files changed, 613 insertions(+), 369 deletions(-) [+]
line wrap: on
line diff
--- a/motion_est.c	Sun Jun 02 12:16:28 2002 +0000
+++ b/motion_est.c	Sun Jun 02 12:18:18 2002 +0000
@@ -1,6 +1,7 @@
 /*
  * Motion estimation 
  * Copyright (c) 2000,2001 Fabrice Bellard.
+ * Copyright (c) 2002 Michael Niedermayer
  * 
  *
  * This library is free software; you can redistribute it and/or
@@ -25,14 +26,20 @@
 #include "dsputil.h"
 #include "mpegvideo.h"
 
-//#define ABS(a) ((a)>0 ? (a) : -(a))
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define SQ(a) ((a)*(a))
 #define INTER_BIAS	257
 
-static int halfpel_motion_search(MpegEncContext * s,
-				  int *mx_ptr, int *my_ptr, int dmin,
-				  int xmin, int ymin, int xmax, int ymax,
-                                  int pred_x, int pred_y, uint8_t *ref_picture);
+#define P_LAST P[0]
+#define P_LEFT P[1]
+#define P_TOP P[2]
+#define P_TOPRIGHT P[3]
+#define P_MEDIAN P[4]
+#define P_LAST_LEFT P[5]
+#define P_LAST_RIGHT P[6]
+#define P_LAST_TOP P[7]
+#define P_LAST_BOTTOM P[8]
+#define P_MV1 P[9]
+
 
 static int pix_sum(UINT8 * pix, int line_size)
 {
@@ -359,62 +366,61 @@
 
 #define CHECK_MV(x,y)\
 {\
-    d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-    if(d<dmin){\
-        best[0]=x;\
-        best[1]=y;\
-        dmin=d;\
+    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    if(map[index]!=key){\
+        d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
+        COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\
+        map[index]= key;\
+        score_map[index]= d;\
     }\
 }
 
 #define CHECK_MV_DIR(x,y,new_dir)\
 {\
-    d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-    if(d<dmin){\
-        best[0]=x;\
-        best[1]=y;\
-        dmin=d;\
-        next_dir= new_dir;\
+    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    if(map[index]!=key){\
+        d = pix_abs(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
+        if(d<dmin){\
+            best[0]=x;\
+            best[1]=y;\
+            dmin=d;\
+            next_dir= new_dir;\
+        }\
+        map[index]= key;\
+        score_map[index]= d;\
     }\
 }
 
 #define CHECK_MV4(x,y)\
 {\
-    d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-    if(d<dmin){\
-        best[0]=x;\
-        best[1]=y;\
-        dmin=d;\
+    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    if(map[index]!=key){\
+        d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
+        COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\
+        map[index]= key;\
+        score_map[index]= d;\
     }\
 }
 
-#define CHECK_MV4_DIR(x,y,new_dir)\
-{\
-    d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-    if(d<dmin){\
-        best[0]=x;\
-        best[1]=y;\
-        dmin=d;\
-        next_dir= new_dir;\
-    }\
-}
-
-
 #define check(x,y,S,v)\
-if( (x)<(xmin<<(S)) ) printf("%d %d %d %d xmin" #v, (x), (y), s->mb_x, s->mb_y);\
-if( (x)>(xmax<<(S)) ) printf("%d %d %d %d xmax" #v, (x), (y), s->mb_x, s->mb_y);\
-if( (y)<(ymin<<(S)) ) printf("%d %d %d %d ymin" #v, (x), (y), s->mb_x, s->mb_y);\
-if( (y)>(ymax<<(S)) ) printf("%d %d %d %d ymax" #v, (x), (y), s->mb_x, s->mb_y);\
+if( (x)<(xmin<<(S)) ) printf("%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\
+if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\
+if( (y)<(ymin<<(S)) ) printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
+if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
 
 
 static inline int small_diamond_search(MpegEncContext * s, int *best, int dmin,
                                        UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
                                        int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
-                                       int xmin, int ymin, int xmax, int ymax, int shift)
+                                       int xmin, int ymin, int xmax, int ymax, int shift,
+                                       uint32_t *map, uint16_t *score_map, int map_generation,
+                                       op_pixels_abs_func pix_abs)
 {
     int next_dir=-1;
 
@@ -462,36 +468,19 @@
     */
 }
 
-static inline int small_diamond_search4MV(MpegEncContext * s, int *best, int dmin,
-                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
-                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
-                                       int xmin, int ymin, int xmax, int ymax, int shift)
-{
-    int next_dir=-1;
-
-    for(;;){
-        int d;
-        const int dir= next_dir;
-        const int x= best[0];
-        const int y= best[1];
-        next_dir=-1;
-
-//printf("%d", dir);
-        if(dir!=2 && x>xmin) CHECK_MV4_DIR(x-1, y  , 0)
-        if(dir!=3 && y>ymin) CHECK_MV4_DIR(x  , y-1, 1)
-        if(dir!=0 && x<xmax) CHECK_MV4_DIR(x+1, y  , 2)
-        if(dir!=1 && y<ymax) CHECK_MV4_DIR(x  , y+1, 3)
-
-        if(next_dir==-1){
-            return dmin;
-        }
-    }
-}
-
+#if 1
+#define SNAKE_1 3
+#define SNAKE_2 2
+#else
+#define SNAKE_1 7
+#define SNAKE_2 3
+#endif
 static inline int snake_search(MpegEncContext * s, int *best, int dmin,
                                        UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
                                        int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
-                                       int xmin, int ymin, int xmax, int ymax, int shift)
+                                       int xmin, int ymin, int xmax, int ymax, int shift,
+                                       uint32_t *map, uint16_t *score_map,int map_generation,
+                                       op_pixels_abs_func pix_abs)
 {
     int dir=0;
     int c=1;
@@ -517,8 +506,15 @@
         x+=x_dir[dir];
         y+=y_dir[dir];
         if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){
-            d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);
-            d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;
+            const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;
+            const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);
+            if(map[index]!=key){
+                d = pix_abs(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);
+                d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;
+                map[index]=key;
+                score_map[index]=d;
+            }else
+                d= dmin+1;
         }else{
             d = dmin + 10000; //FIXME smarter boundary handling
         }
@@ -537,20 +533,89 @@
         }else{
 //bad++;
             if(fails){
-                if(fails>=3) return dmin;
+                if(fails>=SNAKE_1+1) return dmin;
             }else{
-                c= -c;
+                if(dir&1) dir-= c*3;
+                else      c= -c;
+//                c= -c;
             }
-            dir+=c*2;
+            dir+=c*SNAKE_2;
             fails++;
         }
         dir&=7;
     }
 }
 
+static inline int cross_search(MpegEncContext * s, int *best, int dmin,
+                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
+                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
+                                       int xmin, int ymin, int xmax, int ymax, int shift,
+                                       uint32_t *map, uint16_t *score_map,int map_generation,
+                                       op_pixels_abs_func pix_abs)
+{
+    static int x_dir[4]= {-1, 0, 1, 0};
+    static int y_dir[4]= { 0,-1, 0, 1};
+    int improvement[2]={100000, 100000};
+    int dirs[2]={2, 3};
+    int dir;
+    int last_dir= -1;
+    
+    for(;;){
+        dir= dirs[ improvement[0] > improvement[1] ? 0 : 1 ];
+        if(improvement[dir&1]==-1) return dmin;
+        
+        {
+            const int x= best[0] + x_dir[dir];
+            const int y= best[1] + y_dir[dir];
+            const int key= (y<<ME_MAP_MV_BITS) + x + map_generation;
+            const int index= ((y<<ME_MAP_SHIFT) + x)&(ME_MAP_SIZE-1);
+            int d;
+            if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){
+                if(map[index]!=key){
+                    d = pix_abs(new_pic, old_pic + x + y*pic_stride, pic_stride);
+                    d += (mv_penalty[(x<<shift)-pred_x] + mv_penalty[(y<<shift)-pred_y])*quant;
+                    map[index]=key;
+                    score_map[index]=d;
+                    if(d<dmin){
+                        improvement[dir&1]= dmin-d;
+                        improvement[(dir&1)^1]++;
+                        dmin=d;
+                        best[0]= x;
+                        best[1]= y;
+                        last_dir=dir;
+                        continue;
+                    }
+                }else{
+                    d= score_map[index];
+                }
+            }else{
+                d= dmin + 1000; //FIXME is this a good idea?
+            }
+            /* evaluated point was cached or checked and worse */
+
+            if(last_dir==dir){
+                improvement[dir&1]= -1;
+            }else{
+                improvement[dir&1]= d-dmin;
+                last_dir= dirs[dir&1]= dir^2;
+            }
+        }
+    }
+}
+
+static inline int update_map_generation(MpegEncContext * s)
+{
+    s->me_map_generation+= 1<<(ME_MAP_MV_BITS*2);
+    if(s->me_map_generation==0){
+        s->me_map_generation= 1<<(ME_MAP_MV_BITS*2);
+        memset(s->me_map, 0, sizeof(uint32_t)*ME_MAP_SIZE);
+    }
+    return s->me_map_generation;
+}
+
 static int epzs_motion_search(MpegEncContext * s,
                              int *mx_ptr, int *my_ptr,
-                             int P[5][2], int pred_x, int pred_y,
+                             int P[10][2], int pred_x, int pred_y,
                              int xmin, int ymin, int xmax, int ymax, uint8_t * ref_picture)
 {
     int best[2]={0, 0};
@@ -561,42 +626,74 @@
     UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
     int quant= s->qscale; // qscale of the prev frame
     const int shift= 1+s->quarter_sample;
+    uint32_t *map= s->me_map;
+    uint16_t *score_map= s->me_score_map;
+    int map_generation;
 
     new_pic = s->new_picture[0] + pic_xy;
     old_pic = ref_picture + pic_xy;
-   
+    
+    map_generation= update_map_generation(s);
+
     dmin = pix_abs16x16(new_pic, old_pic, pic_stride);
-    if(dmin<Z_THRESHOLD){
-        *mx_ptr= 0;
-        *my_ptr= 0;
-//printf("Z");
-        return dmin;
-    }
+    map[0]= map_generation;
+    score_map[0]= dmin;
 
     /* first line */
-    if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
-        CHECK_MV(P[1][0]>>shift, P[1][1]>>shift)
+    if ((s->mb_y == 0 || s->first_slice_line)) {
+        CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
+        CHECK_MV(P_LAST[0]>>shift, P_LAST[1]>>shift)
     }else{
-        CHECK_MV(P[4][0]>>shift, P[4][1]>>shift)
-        if(dmin<Z_THRESHOLD){
-            *mx_ptr= P[4][0]>>shift;
-            *my_ptr= P[4][1]>>shift;
-//printf("M\n");
+        if(dmin<256 && ( P_LEFT[0]    |P_LEFT[1]
+                        |P_TOP[0]     |P_TOP[1]
+                        |P_TOPRIGHT[0]|P_TOPRIGHT[1])==0){
+            *mx_ptr= 0;
+            *my_ptr= 0;
+            s->skip_me=1;
             return dmin;
         }
-        CHECK_MV(P[1][0]>>shift, P[1][1]>>shift)
-        CHECK_MV(P[2][0]>>shift, P[2][1]>>shift)
-        CHECK_MV(P[3][0]>>shift, P[3][1]>>shift)
+        CHECK_MV(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift)
+        if(dmin>256*2){
+            CHECK_MV(P_LAST[0]    >>shift, P_LAST[1]    >>shift)
+            CHECK_MV(P_LEFT[0]    >>shift, P_LEFT[1]    >>shift)
+            CHECK_MV(P_TOP[0]     >>shift, P_TOP[1]     >>shift)
+            CHECK_MV(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift)
+        }
+    }
+    if(dmin>256*4){
+        CHECK_MV(P_LAST_RIGHT[0] >>shift, P_LAST_RIGHT[1] >>shift)
+        CHECK_MV(P_LAST_BOTTOM[0]>>shift, P_LAST_BOTTOM[1]>>shift)
     }
-    CHECK_MV(P[0][0]>>shift, P[0][1]>>shift)
+#if 0 //doest only slow things down
+    if(dmin>512*3){
+        int step;
+        dmin= score_map[0];
+        best[0]= best[1]=0;
+        for(step=128; step>0; step>>=1){
+            const int step2= step;
+            int y;
+            for(y=-step2+best[1]; y<=step2+best[1]; y+=step){
+                int x;
+                if(y<ymin || y>ymax) continue;
 
+                for(x=-step2+best[0]; x<=step2+best[0]; x+=step){
+                    if(x<xmin || x>xmax) continue;
+                    if(x==best[0] && y==best[1]) continue;
+                    CHECK_MV(x,y)
+                }
+            }
+        }
+    }
+#endif
 //check(best[0],best[1],0, b0)
     if(s->me_method==ME_EPZS)
         dmin= small_diamond_search(s, best, dmin, new_pic, old_pic, pic_stride, 
-                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift);
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
+                                   shift, map, score_map, map_generation, pix_abs16x16);
     else
-        dmin=         snake_search(s, best, dmin, new_pic, old_pic, pic_stride, 
-                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift);
+        dmin=         cross_search(s, best, dmin, new_pic, old_pic, pic_stride, 
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
+                                   shift, map, score_map, map_generation, pix_abs16x16);
 //check(best[0],best[1],0, b1)
     *mx_ptr= best[0];
     *my_ptr= best[1];    
@@ -607,7 +704,7 @@
 
 static int epzs_motion_search4(MpegEncContext * s, int block,
                              int *mx_ptr, int *my_ptr,
-                             int P[6][2], int pred_x, int pred_y,
+                             int P[10][2], int pred_x, int pred_y,
                              int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture)
 {
     int best[2]={0, 0};
@@ -618,34 +715,47 @@
     UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
     int quant= s->qscale; // qscale of the prev frame
     const int shift= 1+s->quarter_sample;
+    uint32_t *map= s->me_map;
+    uint16_t *score_map= s->me_score_map;
+    int map_generation;
 
     new_pic = s->new_picture[0] + pic_xy;
     old_pic = ref_picture + pic_xy;
-   
-    dmin = pix_abs8x8(new_pic, old_pic, pic_stride);
+
+    map_generation= update_map_generation(s);
 
+    dmin = 1000000;
+//printf("%d %d %d %d //",xmin, ymin, xmax, ymax); 
     /* first line */
-    if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) {
-        CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift)
+    if ((s->mb_y == 0 || s->first_slice_line) && block<2) {
+        CHECK_MV4(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
+        CHECK_MV4(P_LAST[0]>>shift, P_LAST[1]>>shift)
+        CHECK_MV4(P_MV1[0]>>shift, P_MV1[1]>>shift)
     }else{
-        CHECK_MV4(P[4][0]>>shift, P[4][1]>>shift)
-        if(dmin<Z_THRESHOLD){
-            *mx_ptr= P[4][0]>>shift;
-            *my_ptr= P[4][1]>>shift;
-//printf("M\n");
-            return dmin;
+        CHECK_MV4(P_MV1[0]>>shift, P_MV1[1]>>shift)
+        //FIXME try some early stop
+        if(dmin>64*2){
+            CHECK_MV4(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift)
+            CHECK_MV4(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
+            CHECK_MV4(P_TOP[0]>>shift, P_TOP[1]>>shift)
+            CHECK_MV4(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift)
+            CHECK_MV4(P_LAST[0]>>shift, P_LAST[1]>>shift)
         }
-        CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift)
-        CHECK_MV4(P[2][0]>>shift, P[2][1]>>shift)
-        CHECK_MV4(P[3][0]>>shift, P[3][1]>>shift)
+    }
+    if(dmin>64*4){
+        CHECK_MV4(P_LAST_RIGHT[0]>>shift, P_LAST_RIGHT[1]>>shift)
+        CHECK_MV4(P_LAST_BOTTOM[0]>>shift, P_LAST_BOTTOM[1]>>shift)
     }
-    CHECK_MV4(P[0][0]>>shift, P[0][1]>>shift)
-    CHECK_MV4(P[5][0]>>shift, P[5][1]>>shift)
 
-//check(best[0],best[1],0, b0)
-    dmin= small_diamond_search4MV(s, best, dmin, new_pic, old_pic, pic_stride, 
-                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift);
-//check(best[0],best[1],0, b1)
+    if(s->me_method==ME_EPZS)
+        dmin= small_diamond_search(s, best, dmin, new_pic, old_pic, pic_stride, 
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
+                                   shift, map, score_map, map_generation, pix_abs8x8);
+    else
+        dmin=         cross_search(s, best, dmin, new_pic, old_pic, pic_stride, 
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
+                                   shift, map, score_map, map_generation, pix_abs8x8);
+
     *mx_ptr= best[0];
     *my_ptr= best[1];    
 
@@ -654,56 +764,50 @@
 }
 
 #define CHECK_HALF_MV(suffix, x, y) \
-    d= pix_abs16x16_ ## suffix(pix, ptr+((x)>>1), s->linesize);\
+{\
+    d= pix_abs_ ## suffix(pix, ptr+((x)>>1), s->linesize);\
     d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\
-    if(d<dminh){\
-        dminh= d;\
-        mx= mx1 + x;\
-        my= my1 + y;\
-    }
+    COPY3_IF_LT(dminh, d, dx, x, dy, y)\
+}
 
-#define CHECK_HALF_MV4(suffix, x, y) \
-    d= pix_abs8x8_ ## suffix(pix, ptr+((x)>>1), s->linesize);\
-    d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\
-    if(d<dminh){\
-        dminh= d;\
-        mx= mx1 + x;\
-        my= my1 + y;\
-    }
     
 /* The idea would be to make half pel ME after Inter/Intra decision to 
    save time. */
 static inline int halfpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
 				  int xmin, int ymin, int xmax, int ymax,
-                                  int pred_x, int pred_y, uint8_t *ref_picture)
+                                  int pred_x, int pred_y, uint8_t *ref_picture,
+                                  op_pixels_abs_func pix_abs_x2, 
+                                  op_pixels_abs_func pix_abs_y2, op_pixels_abs_func pix_abs_xy2, int n)
 {
     UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
     const int quant= s->qscale;
-    int pen_x, pen_y;
-    int mx, my, mx1, my1, d, xx, yy, dminh;
+    int mx, my, xx, yy, dminh;
     UINT8 *pix, *ptr;
 
+    if(s->skip_me){
+        *mx_ptr = 0;
+        *my_ptr = 0;
+        return dmin;
+    }else
+
+    xx = 16 * s->mb_x + 8*(n&1);
+    yy = 16 * s->mb_y + 8*(n>>1);
+    pix =  s->new_picture[0] + (yy * s->linesize) + xx;
+
     mx = *mx_ptr;
     my = *my_ptr;
-    ptr = ref_picture + (my * s->linesize) + mx;
-
-    xx = 16 * s->mb_x;
-    yy = 16 * s->mb_y;
-    pix =  s->new_picture[0] + (yy * s->linesize) + xx;
+    ptr = ref_picture + ((yy + my) * s->linesize) + (xx + mx);
     
     dminh = dmin;
 
     if (mx > xmin && mx < xmax && 
         my > ymin && my < ymax) {
+        int dx=0, dy=0;
+        int d, pen_x, pen_y; 
 
-        mx= mx1= 2*(mx - xx);
-        my= my1= 2*(my - yy);
-        if(dmin < Z_THRESHOLD && mx==0 && my==0){
-            *mx_ptr = 0;
-            *my_ptr = 0;
-            return dmin;
-        }
+        mx<<=1;
+        my<<=1;
         
         pen_x= pred_x + mx;
         pen_y= pred_y + my;
@@ -720,9 +824,11 @@
         CHECK_HALF_MV(y2 ,  0, +1)
         CHECK_HALF_MV(xy2, +1, +1)
 
+        mx+=dx;
+        my+=dy;
     }else{
-        mx= 2*(mx - xx);
-        my= 2*(my - yy);
+        mx<<=1;
+        my<<=1;
     }
 
     *mx_ptr = mx;
@@ -730,64 +836,115 @@
     return dminh;
 }
 
-static inline void halfpel_motion_search4(MpegEncContext * s,
+static inline int fast_halfpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
 				  int xmin, int ymin, int xmax, int ymax,
-                                  int pred_x, int pred_y, int block_x, int block_y,
-                                  uint8_t *ref_picture)
+                                  int pred_x, int pred_y, uint8_t *ref_picture,
+                                  op_pixels_abs_func pix_abs_x2, 
+                                  op_pixels_abs_func pix_abs_y2, op_pixels_abs_func pix_abs_xy2, int n)
 {
     UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    uint16_t *score_map= s->me_score_map;
     const int quant= s->qscale;
-    int pen_x, pen_y;
-    int mx, my, mx1, my1, d, xx, yy, dminh;
+    int mx, my, xx, yy, dminh;
     UINT8 *pix, *ptr;
 
-    xx = 8 * block_x;
-    yy = 8 * block_y;
+    if(s->skip_me){
+//    printf("S");
+        *mx_ptr = 0;
+        *my_ptr = 0;
+        return dmin;
+    }
+//    printf("N");
+        
+    xx = 16 * s->mb_x + 8*(n&1);
+    yy = 16 * s->mb_y + 8*(n>>1);
     pix =  s->new_picture[0] + (yy * s->linesize) + xx;
-    
+
     mx = *mx_ptr;
     my = *my_ptr;
-    ptr = ref_picture + ((yy+my) * s->linesize) + xx + mx;
-
+    ptr = ref_picture + ((yy + my) * s->linesize) + (xx + mx);
+    
     dminh = dmin;
 
     if (mx > xmin && mx < xmax && 
         my > ymin && my < ymax) {
+        int dx=0, dy=0;
+        int d, pen_x, pen_y; 
+        const int index= (my<<ME_MAP_SHIFT) + mx;
+        const int t= score_map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)];
+        const int l= score_map[(index- 1               )&(ME_MAP_SIZE-1)];
+        const int r= score_map[(index+ 1               )&(ME_MAP_SIZE-1)];
+        const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)];
+        mx<<=1;
+        my<<=1;
 
-        mx= mx1= 2*mx;
-        my= my1= 2*my;
-        if(dmin < Z_THRESHOLD && mx==0 && my==0){
-            *mx_ptr = 0;
-            *my_ptr = 0;
-            return;
-        }
         
         pen_x= pred_x + mx;
         pen_y= pred_y + my;
 
         ptr-= s->linesize;
-        CHECK_HALF_MV4(xy2, -1, -1)
-        CHECK_HALF_MV4(y2 ,  0, -1)
-        CHECK_HALF_MV4(xy2, +1, -1)
-        
-        ptr+= s->linesize;
-        CHECK_HALF_MV4(x2 , -1,  0)
-        CHECK_HALF_MV4(x2 , +1,  0)
-        CHECK_HALF_MV4(xy2, -1, +1)
-        CHECK_HALF_MV4(y2 ,  0, +1)
-        CHECK_HALF_MV4(xy2, +1, +1)
+        if(t<=b){
+            CHECK_HALF_MV(y2 ,  0, -1)
+            if(l<=r){
+                CHECK_HALF_MV(xy2, -1, -1)
+                if(t+r<=b+l){
+                    CHECK_HALF_MV(xy2, +1, -1)
+                    ptr+= s->linesize;
+                }else{
+                    ptr+= s->linesize;
+                    CHECK_HALF_MV(xy2, -1, +1)
+                }
+                CHECK_HALF_MV(x2 , -1,  0)
+            }else{
+                CHECK_HALF_MV(xy2, +1, -1)
+                if(t+l<=b+r){
+                    CHECK_HALF_MV(xy2, -1, -1)
+                    ptr+= s->linesize;
+                }else{
+                    ptr+= s->linesize;
+                    CHECK_HALF_MV(xy2, +1, +1)
+                }
+                CHECK_HALF_MV(x2 , +1,  0)
+            }
+        }else{
+            if(l<=r){
+                if(t+l<=b+r){
+                    CHECK_HALF_MV(xy2, -1, -1)
+                    ptr+= s->linesize;
+                }else{
+                    ptr+= s->linesize;
+                    CHECK_HALF_MV(xy2, +1, +1)
+                }
+                CHECK_HALF_MV(x2 , -1,  0)
+                CHECK_HALF_MV(xy2, -1, +1)
+            }else{
+                if(t+r<=b+l){
+                    CHECK_HALF_MV(xy2, +1, -1)
+                    ptr+= s->linesize;
+                }else{
+                    ptr+= s->linesize;
+                    CHECK_HALF_MV(xy2, -1, +1)
+                }
+                CHECK_HALF_MV(x2 , +1,  0)
+                CHECK_HALF_MV(xy2, +1, +1)
+            }
+            CHECK_HALF_MV(y2 ,  0, +1)
+        }
+        mx+=dx;
+        my+=dy;
 
     }else{
-        mx*=2;
-        my*=2;
+        mx<<=1;
+        my<<=1;
     }
 
     *mx_ptr = mx;
     *my_ptr = my;
+    return dminh;
 }
 
-static inline void set_p_mv_tables(MpegEncContext * s, int mx, int my)
+static inline void set_p_mv_tables(MpegEncContext * s, int mx, int my, int mv4)
 {
     const int xy= s->mb_x + 1 + (s->mb_y + 1)*(s->mb_width + 2);
     
@@ -795,7 +952,7 @@
     s->p_mv_table[xy][1] = my;
 
     /* has allready been set to the 4 MV if 4MV is done */
-    if(!(s->flags&CODEC_FLAG_4MV)){
+    if(mv4){
         int mot_xy= s->block_index[0];
 
         s->motion_val[mot_xy  ][0]= mx;
@@ -840,6 +997,87 @@
     }
 }
 
+static inline int mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, int ymax, int mx, int my, int shift)
+{
+    int block;
+    int P[10][2];
+    uint8_t *ref_picture= s->last_picture[0];
+    int dmin_sum=0;
+
+    for(block=0; block<4; block++){
+        int mx4, my4;
+        int pred_x4, pred_y4;
+        int dmin4;
+        static const int off[4]= {2, 1, 1, -1};
+        const int mot_stride = s->block_wrap[0];
+        const int mot_xy = s->block_index[block];
+//        const int block_x= (block&1);
+//        const int block_y= (block>>1);
+#if 1 // this saves us a bit of cliping work and shouldnt affect compression in a negative way
+        const int rel_xmin4= xmin;
+        const int rel_xmax4= xmax;
+        const int rel_ymin4= ymin;
+        const int rel_ymax4= ymax;
+#else
+        const int rel_xmin4= xmin - block_x*8;
+        const int rel_xmax4= xmax - block_x*8 + 8;
+        const int rel_ymin4= ymin - block_y*8;
+        const int rel_ymax4= ymax - block_y*8 + 8;
+#endif
+        P_LAST[0] = s->motion_val[mot_xy    ][0];
+        P_LAST[1] = s->motion_val[mot_xy    ][1];
+        P_LEFT[0] = s->motion_val[mot_xy - 1][0];
+        P_LEFT[1] = s->motion_val[mot_xy - 1][1];
+        P_LAST_RIGHT[0] = s->motion_val[mot_xy + 1][0];
+        P_LAST_RIGHT[1] = s->motion_val[mot_xy + 1][1];
+        P_LAST_BOTTOM[0]= s->motion_val[mot_xy + 1*mot_stride][0];
+        P_LAST_BOTTOM[1]= s->motion_val[mot_xy + 1*mot_stride][1];
+
+        if(P_LEFT[0]       > (rel_xmax4<<shift)) P_LEFT[0]       = (rel_xmax4<<shift);
+        if(P_LAST_RIGHT[0] < (rel_xmin4<<shift)) P_LAST_RIGHT[0] = (rel_xmin4<<shift);
+        if(P_LAST_BOTTOM[1]< (rel_ymin4<<shift)) P_LAST_BOTTOM[1]= (rel_ymin4<<shift);
+
+        /* special case for first line */
+        if ((s->mb_y == 0 || s->first_slice_line) && block<2) {
+            pred_x4= P_LEFT[0];
+            pred_y4= P_LEFT[1];
+        } else {
+            P_TOP[0]      = s->motion_val[mot_xy - mot_stride             ][0];
+            P_TOP[1]      = s->motion_val[mot_xy - mot_stride             ][1];
+            P_TOPRIGHT[0] = s->motion_val[mot_xy - mot_stride + off[block]][0];
+            P_TOPRIGHT[1] = s->motion_val[mot_xy - mot_stride + off[block]][1];
+            if(P_TOP[1]      > (rel_ymax4<<shift)) P_TOP[1]     = (rel_ymax4<<shift);
+            if(P_TOPRIGHT[0] < (rel_xmin4<<shift)) P_TOPRIGHT[0]= (rel_xmin4<<shift);
+            if(P_TOPRIGHT[0] > (rel_xmax4<<shift)) P_TOPRIGHT[0]= (rel_xmax4<<shift);
+            if(P_TOPRIGHT[1] > (rel_ymax4<<shift)) P_TOPRIGHT[1]= (rel_ymax4<<shift);
+    
+            P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+            P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
+
+            if(s->out_format == FMT_H263){
+                pred_x4 = P_MEDIAN[0];
+                pred_y4 = P_MEDIAN[1];
+            }else { /* mpeg1 at least */
+                pred_x4= P_LEFT[0];
+                pred_y4= P_LEFT[1];
+            }
+        }
+        P_MV1[0]= mx;
+        P_MV1[1]= my;
+
+        dmin4 = epzs_motion_search4(s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, ref_picture);
+
+        dmin4= fast_halfpel_motion_search(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, 
+                                   pred_x4, pred_y4, ref_picture, pix_abs8x8_x2, 
+                                   pix_abs8x8_y2, pix_abs8x8_xy2, block);
+ 
+        s->motion_val[ s->block_index[block] ][0]= mx4;
+        s->motion_val[ s->block_index[block] ][1]= my4;
+        dmin_sum+= dmin4;
+    }
+    return dmin_sum;
+}
+
 void ff_estimate_p_frame_motion(MpegEncContext * s,
                                 int mb_x, int mb_y)
 {
@@ -848,27 +1086,40 @@
     int xmin, ymin, xmax, ymax;
     int rel_xmin, rel_ymin, rel_xmax, rel_ymax;
     int pred_x=0, pred_y=0;
-    int P[6][2];
+    int P[10][2];
     const int shift= 1+s->quarter_sample;
     int mb_type=0;
     uint8_t *ref_picture= s->last_picture[0];
 
     get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, s->f_code);
+    rel_xmin= xmin - mb_x*16;
+    rel_xmax= xmax - mb_x*16;
+    rel_ymin= ymin - mb_y*16;
+    rel_ymax= ymax - mb_y*16;
+    s->skip_me=0;
 
     switch(s->me_method) {
     case ME_ZERO:
     default:
 	no_motion_search(s, &mx, &my);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         dmin = 0;
         break;
     case ME_FULL:
 	dmin = full_motion_search(s, &mx, &my, range, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         break;
     case ME_LOG:
 	dmin = log_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         break;
     case ME_PHODS:
 	dmin = phods_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         break;
     case ME_X1:
     case ME_EPZS:
@@ -876,133 +1127,68 @@
             const int mot_stride = s->block_wrap[0];
             const int mot_xy = s->block_index[0];
 
-            rel_xmin= xmin - mb_x*16;
-            rel_xmax= xmax - mb_x*16;
-            rel_ymin= ymin - mb_y*16;
-            rel_ymax= ymax - mb_y*16;
+            P_LAST[0]       = s->motion_val[mot_xy    ][0];
+            P_LAST[1]       = s->motion_val[mot_xy    ][1];
+            P_LEFT[0]       = s->motion_val[mot_xy - 1][0];
+            P_LEFT[1]       = s->motion_val[mot_xy - 1][1];
+            P_LAST_RIGHT[0] = s->motion_val[mot_xy + 2][0];
+            P_LAST_RIGHT[1] = s->motion_val[mot_xy + 2][1];
+            P_LAST_BOTTOM[0]= s->motion_val[mot_xy + 2*mot_stride][0];
+            P_LAST_BOTTOM[1]= s->motion_val[mot_xy + 2*mot_stride][1];
 
-            P[0][0] = s->motion_val[mot_xy    ][0];
-            P[0][1] = s->motion_val[mot_xy    ][1];
-            P[1][0] = s->motion_val[mot_xy - 1][0];
-            P[1][1] = s->motion_val[mot_xy - 1][1];
-            if(P[1][0] > (rel_xmax<<shift)) P[1][0]= (rel_xmax<<shift);
+            if(P_LEFT[0]       > (rel_xmax<<shift)) P_LEFT[0]       = (rel_xmax<<shift);
+            if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift);
+            if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift);
 
             /* special case for first line */
-            if ((mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
-                P[4][0] = P[1][0];
-                P[4][1] = P[1][1];
+            if ((mb_y == 0 || s->first_slice_line)) {
+                pred_x= P_LEFT[0];
+                pred_y= P_LEFT[1];
             } else {
-                P[2][0] = s->motion_val[mot_xy - mot_stride             ][0];
-                P[2][1] = s->motion_val[mot_xy - mot_stride             ][1];
-                P[3][0] = s->motion_val[mot_xy - mot_stride + 2         ][0];
-                P[3][1] = s->motion_val[mot_xy - mot_stride + 2         ][1];
-                if(P[2][1] > (rel_ymax<<shift)) P[2][1]= (rel_ymax<<shift);
-                if(P[3][0] < (rel_xmin<<shift)) P[3][0]= (rel_xmin<<shift);
-                if(P[3][1] > (rel_ymax<<shift)) P[3][1]= (rel_ymax<<shift);
+                P_TOP[0]      = s->motion_val[mot_xy - mot_stride    ][0];
+                P_TOP[1]      = s->motion_val[mot_xy - mot_stride    ][1];
+                P_TOPRIGHT[0] = s->motion_val[mot_xy - mot_stride + 2][0];
+                P_TOPRIGHT[1] = s->motion_val[mot_xy - mot_stride + 2][1];
+                if(P_TOP[1]      > (rel_ymax<<shift)) P_TOP[1]     = (rel_ymax<<shift);
+                if(P_TOPRIGHT[0] < (rel_xmin<<shift)) P_TOPRIGHT[0]= (rel_xmin<<shift);
+                if(P_TOPRIGHT[1] > (rel_ymax<<shift)) P_TOPRIGHT[1]= (rel_ymax<<shift);
         
-                P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]);
-                P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]);
-            }
-            if(s->out_format == FMT_H263){
-                pred_x = P[4][0];
-                pred_y = P[4][1];
-            }else { /* mpeg1 at least */
-                pred_x= P[1][0];
-                pred_y= P[1][1];
+                P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+                P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
+
+                if(s->out_format == FMT_H263){
+                    pred_x = P_MEDIAN[0];
+                    pred_y = P_MEDIAN[1];
+                }else { /* mpeg1 at least */
+                    pred_x= P_LEFT[0];
+                    pred_y= P_LEFT[1];
+                }
             }
         }
         dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, ref_picture);
  
-        mx+= mb_x*16;
-        my+= mb_y*16;
         break;
     }
-    
-    if(s->flags&CODEC_FLAG_4MV){
-        int block;
-
-        mb_type|= MB_TYPE_INTER4V;
-
-        for(block=0; block<4; block++){
-            int mx4, my4;
-            int pred_x4, pred_y4;
-            int dmin4;
-            static const int off[4]= {2, 1, 1, -1};
-            const int mot_stride = s->block_wrap[0];
-            const int mot_xy = s->block_index[block];
-            const int block_x= mb_x*2 + (block&1);
-            const int block_y= mb_y*2 + (block>>1);
-
-            const int rel_xmin4= xmin - block_x*8;
-            const int rel_xmax4= xmax - block_x*8 + 8;
-            const int rel_ymin4= ymin - block_y*8;
-            const int rel_ymax4= ymax - block_y*8 + 8;
-
-            P[0][0] = s->motion_val[mot_xy    ][0];
-            P[0][1] = s->motion_val[mot_xy    ][1];
-            P[1][0] = s->motion_val[mot_xy - 1][0];
-            P[1][1] = s->motion_val[mot_xy - 1][1];
-            if(P[1][0] > (rel_xmax4<<shift)) P[1][0]= (rel_xmax4<<shift);
-
-            /* special case for first line */
-            if ((mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) {
-                P[4][0] = P[1][0];
-                P[4][1] = P[1][1];
-            } else {
-                P[2][0] = s->motion_val[mot_xy - mot_stride             ][0];
-                P[2][1] = s->motion_val[mot_xy - mot_stride             ][1];
-                P[3][0] = s->motion_val[mot_xy - mot_stride + off[block]][0];
-                P[3][1] = s->motion_val[mot_xy - mot_stride + off[block]][1];
-                if(P[2][1] > (rel_ymax4<<shift)) P[2][1]= (rel_ymax4<<shift);
-                if(P[3][0] < (rel_xmin4<<shift)) P[3][0]= (rel_xmin4<<shift);
-                if(P[3][0] > (rel_xmax4<<shift)) P[3][0]= (rel_xmax4<<shift);
-                if(P[3][1] > (rel_ymax4<<shift)) P[3][1]= (rel_ymax4<<shift);
-        
-                P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]);
-                P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]);
-            }
-            if(s->out_format == FMT_H263){
-                pred_x4 = P[4][0];
-                pred_y4 = P[4][1];
-            }else { /* mpeg1 at least */
-                pred_x4= P[1][0];
-                pred_y4= P[1][1];
-            }
-            P[5][0]= mx - mb_x*16;
-            P[5][1]= my - mb_y*16;
-
-            dmin4 = epzs_motion_search4(s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, ref_picture);
-
-            halfpel_motion_search4(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, 
-                                   pred_x4, pred_y4, block_x, block_y, ref_picture);
-     
-            s->motion_val[ s->block_index[block] ][0]= mx4;
-            s->motion_val[ s->block_index[block] ][1]= my4;
-        }
-    }
 
     /* intra / predictive decision */
     xx = mb_x * 16;
     yy = mb_y * 16;
 
     pix = s->new_picture[0] + (yy * s->linesize) + xx;
-    /* At this point (mx,my) are full-pell and the absolute displacement */
-    ppix = ref_picture + (my * s->linesize) + mx;
+    /* At this point (mx,my) are full-pell and the relative displacement */
+    ppix = ref_picture + ((yy+my) * s->linesize) + (xx+mx);
     
     sum = pix_sum(pix, s->linesize);
-#if 0
-    varc = pix_dev(pix, s->linesize, (sum+128)>>8) + INTER_BIAS;
-    vard = pix_abs16x16(pix, ppix, s->linesize);
-#else
+    
     sum= (sum+8)>>4;
-    varc = ((pix_norm1(pix, s->linesize) - sum*sum + 128 + 500)>>8);
+    varc = (pix_norm1(pix, s->linesize) - sum*sum + 500 + 128)>>8;
     vard = (pix_norm(pix, ppix, s->linesize)+128)>>8;
-#endif
-
-    s->mb_var[s->mb_width * mb_y + mb_x] = varc;
-    s->avg_mb_var+= varc;
-    s->mc_mb_var += vard;
-
+//printf("%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
+    s->mb_var   [s->mb_width * mb_y + mb_x] = varc;
+    s->mc_mb_var[s->mb_width * mb_y + mb_x] = vard;
+    s->mb_var_sum    += varc;
+    s->mc_mb_var_sum += vard;
+//printf("E%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
     
 #if 0
     printf("varc=%4d avg_var=%4d (sum=%4d) vard=%4d mx=%2d my=%2d\n",
@@ -1013,19 +1199,49 @@
             mb_type|= MB_TYPE_INTRA;
         if (varc*2 + 200 > vard){
             mb_type|= MB_TYPE_INTER;
-            halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y, ref_picture);
+            if(s->me_method >= ME_EPZS)
+                fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                           pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, 
+                                           pix_abs16x16_xy2, 0);
+            else
+                halfpel_motion_search(     s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                           pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, 
+                                           pix_abs16x16_xy2, 0);                                           
         }else{
-            mx = mx*2 - mb_x*32;
-            my = my*2 - mb_y*32;
+            mx <<=1;
+            my <<=1;
         }
+        if((s->flags&CODEC_FLAG_4MV)
+           && !s->skip_me && varc>50 && vard>10){
+            mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift);
+            mb_type|=MB_TYPE_INTER4V;
+
+            set_p_mv_tables(s, mx, my, 0);
+        }else
+            set_p_mv_tables(s, mx, my, 1);
     }else{
         if (vard <= 64 || vard < varc) {
             mb_type|= MB_TYPE_INTER;
             if (s->me_method != ME_ZERO) {
-                halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y, ref_picture);
+                if(s->me_method >= ME_EPZS)
+                    dmin= fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                           pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, 
+                                           pix_abs16x16_xy2, 0);
+                else
+                    dmin= halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                           pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, 
+                                           pix_abs16x16_xy2, 0);
+                if((s->flags&CODEC_FLAG_4MV)
+                   && !s->skip_me && varc>50 && vard>10){
+                    int dmin4= mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift);
+                    if(dmin4 + 128 <dmin)
+                        mb_type= MB_TYPE_INTER4V;
+                }
+                set_p_mv_tables(s, mx, my, mb_type!=MB_TYPE_INTER4V);
+
             } else {
-                mx -= 16 * mb_x;
-                my -= 16 * mb_y;
+                mx <<=1;
+                my <<=1;
             }
 #if 0
             if (vard < 10) {
@@ -1036,13 +1252,12 @@
 #endif
         }else{
             mb_type|= MB_TYPE_INTRA;
-            mx = 0;//mx*2 - 32 * mb_x;
-            my = 0;//my*2 - 32 * mb_y;
+            mx = 0;
+            my = 0;
         }
     }
 
     s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
-    set_p_mv_tables(s, mx, my);
 }
 
 int ff_estimate_motion_b(MpegEncContext * s,
@@ -1052,79 +1267,83 @@
     int xmin, ymin, xmax, ymax;
     int rel_xmin, rel_ymin, rel_xmax, rel_ymax;
     int pred_x=0, pred_y=0;
-    int P[6][2];
+    int P[10][2];
     const int shift= 1+s->quarter_sample;
     const int mot_stride = s->mb_width + 2;
     const int mot_xy = (mb_y + 1)*mot_stride + mb_x + 1;
     
     get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, f_code);
+    rel_xmin= xmin - mb_x*16;
+    rel_xmax= xmax - mb_x*16;
+    rel_ymin= ymin - mb_y*16;
+    rel_ymax= ymax - mb_y*16;
 
     switch(s->me_method) {
     case ME_ZERO:
     default:
 	no_motion_search(s, &mx, &my);
         dmin = 0;
+        mx-= mb_x*16;
+        my-= mb_y*16;
         break;
     case ME_FULL:
 	dmin = full_motion_search(s, &mx, &my, range, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         break;
     case ME_LOG:
 	dmin = log_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         break;
     case ME_PHODS:
 	dmin = phods_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         break;
     case ME_X1:
     case ME_EPZS:
        {
 
-            rel_xmin= xmin - mb_x*16;
-            rel_xmax= xmax - mb_x*16;
-            rel_ymin= ymin - mb_y*16;
-            rel_ymax= ymax - mb_y*16;
+            P_LAST[0]        = mv_table[mot_xy    ][0];
+            P_LAST[1]        = mv_table[mot_xy    ][1];
+            P_LEFT[0]        = mv_table[mot_xy - 1][0];
+            P_LEFT[1]        = mv_table[mot_xy - 1][1];
+            P_LAST_RIGHT[0]  = mv_table[mot_xy + 1][0];
+            P_LAST_RIGHT[1]  = mv_table[mot_xy + 1][1];
+            P_LAST_BOTTOM[0] = mv_table[mot_xy + mot_stride][0];
+            P_LAST_BOTTOM[1] = mv_table[mot_xy + mot_stride][1];
 
-            P[0][0] = mv_table[mot_xy    ][0];
-            P[0][1] = mv_table[mot_xy    ][1];
-            P[1][0] = mv_table[mot_xy - 1][0];
-            P[1][1] = mv_table[mot_xy - 1][1];
-            if(P[1][0] > (rel_xmax<<shift)) P[1][0]= (rel_xmax<<shift);
+            if(P_LEFT[0]       > (rel_xmax<<shift)) P_LEFT[0]       = (rel_xmax<<shift);
+            if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift);
+            if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift);
 
             /* special case for first line */
-            if ((mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
-                P[4][0] = P[1][0];
-                P[4][1] = P[1][1];
+            if ((mb_y == 0 || s->first_slice_line)) {
             } else {
-                P[2][0] = mv_table[mot_xy - mot_stride             ][0];
-                P[2][1] = mv_table[mot_xy - mot_stride             ][1];
-                P[3][0] = mv_table[mot_xy - mot_stride + 1         ][0];
-                P[3][1] = mv_table[mot_xy - mot_stride + 1         ][1];
-                if(P[2][1] > (rel_ymax<<shift)) P[2][1]= (rel_ymax<<shift);
-                if(P[3][0] < (rel_xmin<<shift)) P[3][0]= (rel_xmin<<shift);
-                if(P[3][1] > (rel_ymax<<shift)) P[3][1]= (rel_ymax<<shift);
+                P_TOP[0] = mv_table[mot_xy - mot_stride             ][0];
+                P_TOP[1] = mv_table[mot_xy - mot_stride             ][1];
+                P_TOPRIGHT[0] = mv_table[mot_xy - mot_stride + 1         ][0];
+                P_TOPRIGHT[1] = mv_table[mot_xy - mot_stride + 1         ][1];
+                if(P_TOP[1] > (rel_ymax<<shift)) P_TOP[1]= (rel_ymax<<shift);
+                if(P_TOPRIGHT[0] < (rel_xmin<<shift)) P_TOPRIGHT[0]= (rel_xmin<<shift);
+                if(P_TOPRIGHT[1] > (rel_ymax<<shift)) P_TOPRIGHT[1]= (rel_ymax<<shift);
         
-                P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]);
-                P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]);
+                P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+                P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
             }
-            pred_x= P[1][0];
-            pred_y= P[1][1];
+            pred_x= P_LEFT[0];
+            pred_y= P_LEFT[1];
         }
         dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, ref_picture);
  
-        mx+= mb_x*16;
-        my+= mb_y*16;
         break;
     }
     
-    /* intra / predictive decision */
-//    xx = mb_x * 16;
-//    yy = mb_y * 16;
-
-//    pix = s->new_picture[0] + (yy * s->linesize) + xx;
-    /* At this point (mx,my) are full-pell and the absolute displacement */
-//    ppix = ref_picture + (my * s->linesize) + mx;
-    
-    dmin= halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y, ref_picture);
-
+    dmin= fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, 
+                                pix_abs16x16_xy2, 0);
+//printf("%d %d %d %d//", s->mb_x, s->mb_y, mx, my);
 //    s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
     mv_table[mot_xy][0]= mx;
     mv_table[mot_xy][1]= my;
@@ -1140,6 +1359,7 @@
                    int pred_bx, int pred_by)
 {
     //FIXME optimize?
+    //FIXME direct mode penalty
     UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
     uint8_t *dest_y = s->me_scratchpad;
     uint8_t *ptr;
@@ -1201,7 +1421,7 @@
 static inline int direct_search(MpegEncContext * s,
                                 int mb_x, int mb_y)
 {
-    int P[6][2];
+    int P[10][2];
     const int mot_stride = s->mb_width + 2;
     const int mot_xy = (mb_y + 1)*mot_stride + mb_x + 1;
     int dmin, dmin2;
@@ -1266,23 +1486,29 @@
         }
     }
 
-    P[0][0] = mv_table[mot_xy    ][0];
-    P[0][1] = mv_table[mot_xy    ][1];
-    P[1][0] = mv_table[mot_xy - 1][0];
-    P[1][1] = mv_table[mot_xy - 1][1];
-
+    P_LAST[0]        = mv_table[mot_xy    ][0];
+    P_LAST[1]        = mv_table[mot_xy    ][1];
+    P_LEFT[0]        = mv_table[mot_xy - 1][0];
+    P_LEFT[1]        = mv_table[mot_xy - 1][1];
+    P_LAST_RIGHT[0]  = mv_table[mot_xy + 1][0];
+    P_LAST_RIGHT[1]  = mv_table[mot_xy + 1][1];
+    P_LAST_BOTTOM[0] = mv_table[mot_xy + mot_stride][0];
+    P_LAST_BOTTOM[1] = mv_table[mot_xy + mot_stride][1];
+/*
+    if(P_LEFT[0]       > (rel_xmax<<shift)) P_LEFT[0]       = (rel_xmax<<shift);
+    if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift);
+    if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift);
+*/
     /* special case for first line */
-    if ((mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
-        P[4][0] = P[1][0];
-        P[4][1] = P[1][1];
+    if ((mb_y == 0 || s->first_slice_line)) {
     } else {
-        P[2][0] = mv_table[mot_xy - mot_stride             ][0];
-        P[2][1] = mv_table[mot_xy - mot_stride             ][1];
-        P[3][0] = mv_table[mot_xy - mot_stride + 1         ][0];
-        P[3][1] = mv_table[mot_xy - mot_stride + 1         ][1];
+        P_TOP[0] = mv_table[mot_xy - mot_stride             ][0];
+        P_TOP[1] = mv_table[mot_xy - mot_stride             ][1];
+        P_TOPRIGHT[0] = mv_table[mot_xy - mot_stride + 1         ][0];
+        P_TOPRIGHT[1] = mv_table[mot_xy - mot_stride + 1         ][1];
     
-        P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]);
-        P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]);
+        P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+        P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
     }
     dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, -16, -16, 15, 15, ref_picture);
     if(mx==0 && my==0) dmin=99999999; // not representable, due to rounding stuff
@@ -1332,12 +1558,9 @@
 void ff_estimate_b_frame_motion(MpegEncContext * s,
                              int mb_x, int mb_y)
 {
-    const int mot_stride = s->mb_width + 2;
-    const int xy = (mb_y + 1)*mot_stride + mb_x + 1;
     const int quant= s->qscale;
     int fmin, bmin, dmin, fbmin;
     int type=0;
-    int motion_fx, motion_fy, motion_bx, motion_by;
     
     dmin= direct_search(s, mb_x, mb_y);
 
@@ -1365,24 +1588,36 @@
             score=fbmin;
             type= MB_TYPE_BIDIR;
         }
-        s->mc_mb_var += score;
+        s->mc_mb_var_sum += score;
+        s->mc_mb_var[mb_y*s->mb_width + mb_x] = score;
     }
+/*
+{
+static int count=0;
+static int sum=0;
+if(type==MB_TYPE_DIRECT){
+  int diff= ABS(s->b_forw_mv_table)
+}
+}*/
 
     s->mb_type[mb_y*s->mb_width + mb_x]= type;
+/*    if(mb_y==0 && mb_x==0) printf("\n");
+    if(mb_x==0) printf("\n");
+    printf("%d", av_log2(type));
+*/
 }
 
 /* find best f_code for ME which do unlimited searches */
 int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type)
 {
-    int f_code;
-
     if(s->me_method>=ME_EPZS){
-        int mv_num[8];
+        int score[8];
         int i, y;
-        int loose=0;
         UINT8 * fcode_tab= s->fcode_tab;
+        int best_fcode=-1;
+        int best_score=-10000000;
 
-        for(i=0; i<8; i++) mv_num[i]=0;
+        for(i=0; i<8; i++) score[i]= s->mb_num*(8-i); //FIXME *2 and all other too so its the same but nicer
 
         for(y=0; y<s->mb_height; y++){
             int x;
@@ -1390,25 +1625,30 @@
             i= y*s->mb_width;
             for(x=0; x<s->mb_width; x++){
                 if(s->mb_type[i] & type){
-                    mv_num[ fcode_tab[mv_table[xy][0] + MAX_MV] ]++;
-                    mv_num[ fcode_tab[mv_table[xy][1] + MAX_MV] ]++;
-//printf("%d %d %d\n", s->mv_table[0][i], fcode_tab[s->mv_table[0][i] + MAX_MV], i);
+                    int fcode= MAX(fcode_tab[mv_table[xy][0] + MAX_MV],
+                                   fcode_tab[mv_table[xy][1] + MAX_MV]);
+                    int j;
+                    
+                    for(j=0; j<fcode && j<8; j++){
+                        if(s->pict_type==B_TYPE || s->mc_mb_var[i] < s->mb_var[i])
+                            score[j]-= 170;
+                    }
                 }
                 i++;
                 xy++;
             }
         }
-
-        for(i=MAX_FCODE; i>1; i--){
-            int threshold;
-            loose+= mv_num[i];
+        
+        for(i=1; i<8; i++){
+            if(score[i] > best_score){
+                best_score= score[i];
+                best_fcode= i;
+            }
+//            printf("%d %d\n", i, score[i]);
+        }
 
-            if(s->pict_type==B_TYPE) threshold= 0;
-            else                     threshold= s->mb_num/20; //FIXME 
-            if(loose > threshold) break;
-        }
 //    printf("fcode: %d type: %d\n", i, s->pict_type);
-        return i;
+        return best_fcode;
 /*        for(i=0; i<=MAX_FCODE; i++){
             printf("%d ", mv_num[i]);
         }
@@ -1423,7 +1663,8 @@
     const int f_code= s->f_code;
     int y;
     UINT8 * fcode_tab= s->fcode_tab;
-
+//int clip=0;
+//int noclip=0;
     /* clip / convert to intra 16x16 type MVs */
     for(y=0; y<s->mb_height; y++){
         int x;
@@ -1439,13 +1680,16 @@
                     s->mb_type[i] |= MB_TYPE_INTRA;
                     s->p_mv_table[xy][0] = 0;
                     s->p_mv_table[xy][1] = 0;
+//clip++;
                 }
+//else
+//  noclip++;
             }
             xy++;
             i++;
         }
     }
-
+//printf("%d no:%d %d//\n", clip, noclip, f_code);
     if(s->flags&CODEC_FLAG_4MV){
         const int wrap= 2+ s->mb_width*2;