changeset 1799:95612d423fde libavcodec

multithreaded/SMP motion estimation; multithreaded/SMP encoding for MPEG1/MPEG2/MPEG4/H263. All pthread-specific code is in pthread.c. To try it, run configure --enable-pthreads and ffmpeg ... -threads <num>. The internal thread API is a simple AVCodecContext.execute() callback, which executes a given function pointer with different arguments and returns after finishing all of them; that way no mutexes or other thread mess is needed outside pthread.c.
author michael
date Fri, 13 Feb 2004 17:54:10 +0000
parents a3da4b429984
children e039d79185c2
files Makefile avcodec.h common.h h263.c mjpeg.c motion_est.c motion_est_template.c mpeg12.c mpegvideo.c mpegvideo.h pthread.c utils.c
diffstat 12 files changed, 781 insertions(+), 349 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Thu Feb 12 16:21:21 2004 +0000
+++ b/Makefile	Fri Feb 13 17:54:10 2004 +0000
@@ -33,6 +33,10 @@
 endif
 endif
 
+ifeq ($(HAVE_PTHREADS),yes)
+OBJS+= pthread.o
+endif
+
 ifeq ($(AMR_WB),yes)
 OBJS+= amr.o amrwb_float/dec_acelp.o amrwb_float/dec_dtx.o amrwb_float/dec_gain.o \
 		amrwb_float/dec_if.o amrwb_float/dec_lpc.o amrwb_float/dec_main.o \
--- a/avcodec.h	Thu Feb 12 16:21:21 2004 +0000
+++ b/avcodec.h	Fri Feb 13 17:54:10 2004 +0000
@@ -17,7 +17,7 @@
 
 #define FFMPEG_VERSION_INT     0x000408
 #define FFMPEG_VERSION         "0.4.8"
-#define LIBAVCODEC_BUILD       4701
+#define LIBAVCODEC_BUILD       4702
 
 #define LIBAVCODEC_VERSION_INT FFMPEG_VERSION_INT
 #define LIBAVCODEC_VERSION     FFMPEG_VERSION
@@ -1508,6 +1508,32 @@
      * - decoding: unused
      */
     int quantizer_noise_shaping;
+
+    /**
+     * Thread count.
+     * is used to decide how many independent tasks should be passed to execute()
+     * - encoding: set by user
+     * - decoding: set by user
+     */
+    int thread_count;
+    
+    /**
+     * The codec may call this to execute several independent things. It will return only after
+     * finishing all tasks. The user may replace this with some multithreaded implementation; the
+     * default implementation will execute the parts serially.
+     * @param count the number of things to execute; this will be identical to thread_count if possible
+     * - encoding: set by lavc, user can override
+     * - decoding: set by lavc, user can override
+     */
+    int (*execute)(struct AVCodecContext *c, int (*func)(struct AVCodecContext *c2, void *arg), void **arg2, int *ret, int count);
+    
+    /**
+     * Thread opaque.
+     * can be used by execute() to store some per AVCodecContext stuff.
+     * - encoding: set by execute()
+     * - decoding: set by execute()
+     */
+    void *thread_opaque;
 } AVCodecContext;
 
 
@@ -1846,6 +1872,11 @@
 void avcodec_default_release_buffer(AVCodecContext *s, AVFrame *pic);
 void avcodec_default_free_buffers(AVCodecContext *s);
 
+int avcodec_pthread_init(AVCodecContext *s, int thread_count);
+void avcodec_pthread_free(AVCodecContext *s);
+int avcodec_pthread_execute(AVCodecContext *s, int (*func)(AVCodecContext *c2, void *arg2),void **arg, int *ret, int count);
+//FIXME func typedef
+
 /**
  * opens / inits the AVCodecContext.
  * not thread save!
--- a/common.h	Thu Feb 12 16:21:21 2004 +0000
+++ b/common.h	Fri Feb 13 17:54:10 2004 +0000
@@ -475,6 +475,28 @@
 #endif
 }
 
+/**
+ * Advances the write position of the PutBitContext by n bytes without writing any data.
+ * PutBitContext must be flushed & aligned to a byte boundary before calling this.
+ */
+static inline void skip_put_bytes(PutBitContext *s, int n){
+        assert((put_bits_count(s)&7)==0); // the bit count must be a multiple of 8
+#ifdef ALT_BITSTREAM_WRITER
+        FIXME may need some cleaning of the buffer // NOTE(review): bare text, not a comment; looks like it would not compile when ALT_BITSTREAM_WRITER is defined
+	s->index += n<<3; // n bytes are added as n*8, so index appears to be kept in bits
+#else
+        assert(s->bit_left==32); // presumably 32 means no bits are pending in the bit buffer -- TODO confirm against put_bits()
+	s->buf_ptr += n;
+#endif    
+}
+
+/**
+ * Changes the end of the buffer: buf_end becomes buf + size, i.e. size is measured from the buffer start.
+ */
+static inline void set_put_bits_buffer_size(PutBitContext *s, int size){
+    s->buf_end= s->buf + size; // only the end pointer changes; buf itself is left untouched
+}
+
 /* Bitstream reader API docs:
 name
     abritary name which is used as prefix for the internal variables
--- a/h263.c	Thu Feb 12 16:21:21 2004 +0000
+++ b/h263.c	Fri Feb 13 17:54:10 2004 +0000
@@ -1961,7 +1961,9 @@
         s->luma_dc_vlc_length= uni_DCtab_lum_len;
         s->chroma_dc_vlc_length= uni_DCtab_chrom_len;
         s->ac_esc_length= 7+2+1+6+1+12+1;
-        
+        s->y_dc_scale_table= ff_mpeg4_y_dc_scale_table;
+        s->c_dc_scale_table= ff_mpeg4_c_dc_scale_table;
+
         if(s->flags & CODEC_FLAG_GLOBAL_HEADER){
 
             s->avctx->extradata= av_malloc(1024);
@@ -2290,12 +2292,11 @@
     put_bits(&s->pb, 1, s->progressive_sequence ? 0 : 1);
     put_bits(&s->pb, 1, 1);		/* obmc disable */
     if (vo_ver_id == 1) {
-        put_bits(&s->pb, 1, s->vol_sprite_usage=0);		/* sprite enable */
+        put_bits(&s->pb, 1, s->vol_sprite_usage);		/* sprite enable */
     }else{
-        put_bits(&s->pb, 2, s->vol_sprite_usage=0);		/* sprite enable */
+        put_bits(&s->pb, 2, s->vol_sprite_usage);		/* sprite enable */
     }
     
-    s->quant_precision=5;
     put_bits(&s->pb, 1, 0);		/* not 8 bit == false */
     put_bits(&s->pb, 1, s->mpeg_quant);	/* quant type= (0=h263 style)*/
 
@@ -2384,9 +2385,6 @@
     if (s->pict_type == B_TYPE)
 	put_bits(&s->pb, 3, s->b_code);	/* fcode_back */
     //    printf("****frame %d\n", picture_number);
-
-     s->y_dc_scale_table= ff_mpeg4_y_dc_scale_table; //FIXME add short header support 
-     s->c_dc_scale_table= ff_mpeg4_c_dc_scale_table;
 }
 
 #endif //CONFIG_ENCODERS
@@ -2965,8 +2963,16 @@
 
 void ff_mpeg4_init_partitions(MpegEncContext *s)
 {
-    init_put_bits(&s->tex_pb, s->tex_pb_buffer, PB_BUFFER_SIZE);
-    init_put_bits(&s->pb2   , s->pb2_buffer   , PB_BUFFER_SIZE);
+    uint8_t *start= pbBufPtr(&s->pb);
+    uint8_t *end= s->pb.buf_end;
+    int size= end - start;
+    int pb_size = size/3;
+    int pb2_size= size/3;
+    int tex_size= size - pb_size - pb2_size;
+    
+    set_put_bits_buffer_size(&s->pb, pb_size);
+    init_put_bits(&s->tex_pb, start + pb_size           , tex_size);
+    init_put_bits(&s->pb2   , start + pb_size + tex_size, pb2_size);
 }
 
 void ff_mpeg4_merge_partitions(MpegEncContext *s)
@@ -2989,8 +2995,9 @@
     flush_put_bits(&s->pb2);
     flush_put_bits(&s->tex_pb);
 
-    ff_copy_bits(&s->pb, s->pb2_buffer   , pb2_len);
-    ff_copy_bits(&s->pb, s->tex_pb_buffer, tex_pb_len);
+    set_put_bits_buffer_size(&s->pb, s->pb2.buf_end - s->pb.buf);
+    ff_copy_bits(&s->pb, s->pb2.buf   , pb2_len);
+    ff_copy_bits(&s->pb, s->tex_pb.buf, tex_pb_len);
     s->last_bits= put_bits_count(&s->pb);
 }
 
--- a/mjpeg.c	Thu Feb 12 16:21:21 2004 +0000
+++ b/mjpeg.c	Fri Feb 13 17:54:10 2004 +0000
@@ -531,11 +531,16 @@
     }
 }
 
+void ff_mjpeg_stuffing(PutBitContext * pbc) // pads the bitstream with 1-bits up to the next byte boundary
+{
+    int length;
+    length= (-put_bits_count(pbc))&7; // number of bits missing to reach a multiple of 8 (0..7)
+    if(length) put_bits(pbc, length, (1<<length)-1); // (1<<length)-1 has its low `length` bits set; nothing is written when already aligned
+}
+
 void mjpeg_picture_trailer(MpegEncContext *s)
 {
-    int pad= (-put_bits_count(&s->pb))&7;
-    
-    put_bits(&s->pb, pad,0xFF>>(8-pad));
+    ff_mjpeg_stuffing(&s->pb);
     flush_put_bits(&s->pb);
 
     assert((s->header_bits&7)==0);
--- a/motion_est.c	Thu Feb 12 16:21:21 2004 +0000
+++ b/motion_est.c	Fri Feb 13 17:54:10 2004 +0000
@@ -805,7 +805,7 @@
         if(P_LEFT[0]       > (s->me.xmax<<shift)) P_LEFT[0]       = (s->me.xmax<<shift);
 
         /* special case for first line */
-        if (s->mb_y == 0 && block<2) {
+        if (s->first_slice_line && block<2) {
             pred_x4= P_LEFT[0];
             pred_y4= P_LEFT[1];
         } else {
@@ -845,13 +845,12 @@
             int dxy;
             const int offset= ((block&1) + (block>>1)*stride)*8;
             uint8_t *dest_y = s->me.scratchpad + offset;
-
             if(s->quarter_sample){
                 uint8_t *ref= ref_data[0] + (mx4>>2) + (my4>>2)*stride;
                 dxy = ((my4 & 3) << 2) | (mx4 & 3);
 
                 if(s->no_rounding)
-                    s->dsp.put_no_rnd_qpel_pixels_tab[1][dxy](dest_y   , ref    , s->linesize);
+                    s->dsp.put_no_rnd_qpel_pixels_tab[1][dxy](dest_y   , ref    , stride);
                 else
                     s->dsp.put_qpel_pixels_tab       [1][dxy](dest_y   , ref    , stride);
             }else{
@@ -966,7 +965,7 @@
             pred_x= P_LEFT[0];
             pred_y= P_LEFT[1];
             
-            if(s->mb_y){
+            if(!s->first_slice_line){
                 P_TOP[0]      = mv_table[xy - mot_stride][0];
                 P_TOP[1]      = mv_table[xy - mot_stride][1];
                 P_TOPRIGHT[0] = mv_table[xy - mot_stride + 1][0];
@@ -1115,7 +1114,7 @@
 
             if(P_LEFT[0]       > (s->me.xmax<<shift)) P_LEFT[0]       = (s->me.xmax<<shift);
 
-            if(mb_y) {
+            if(!s->first_slice_line) {
                 P_TOP[0]      = s->current_picture.motion_val[0][mot_xy - mot_stride    ][0];
                 P_TOP[1]      = s->current_picture.motion_val[0][mot_xy - mot_stride    ][1];
                 P_TOPRIGHT[0] = s->current_picture.motion_val[0][mot_xy - mot_stride + 2][0];
@@ -1164,8 +1163,8 @@
     pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = vard;
     pic->mb_mean  [s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
 //    pic->mb_cmp_score[s->mb_stride * mb_y + mb_x] = dmin; 
-    pic->mb_var_sum    += varc;
-    pic->mc_mb_var_sum += vard;
+    s->mb_var_sum_temp    += varc;
+    s->mc_mb_var_sum_temp += vard;
 //printf("E%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
     
 #if 0
@@ -1326,7 +1325,7 @@
     if(P_LEFT[0]       < (s->me.xmin<<shift)) P_LEFT[0]       = (s->me.xmin<<shift);
 
     /* special case for first line */
-    if (mb_y == s->mb_height-1) {
+    if (s->first_slice_line) {
         pred_x= P_LEFT[0];
         pred_y= P_LEFT[1];
         P_TOP[0]= P_TOPRIGHT[0]= P_MEDIAN[0]=
@@ -1409,7 +1408,7 @@
             if(P_LEFT[0]       > (s->me.xmax<<shift)) P_LEFT[0]       = (s->me.xmax<<shift);
 
             /* special case for first line */
-            if (mb_y) {
+            if (!s->first_slice_line) {
                 P_TOP[0] = mv_table[mot_xy - mot_stride             ][0];
                 P_TOP[1] = mv_table[mot_xy - mot_stride             ][1];
                 P_TOPRIGHT[0] = mv_table[mot_xy - mot_stride + 1         ][0];
@@ -1610,7 +1609,7 @@
     P_LEFT[1]        = clip(mv_table[mot_xy - 1][1], ymin<<shift, ymax<<shift);
 
     /* special case for first line */
-    if (mb_y) {
+    if (!s->first_slice_line) { //FIXME maybe allow this over thread boundary as its cliped
         P_TOP[0]      = clip(mv_table[mot_xy - mot_stride             ][0], xmin<<shift, xmax<<shift);
         P_TOP[1]      = clip(mv_table[mot_xy - mot_stride             ][1], ymin<<shift, ymax<<shift);
         P_TOPRIGHT[0] = clip(mv_table[mot_xy - mot_stride + 1         ][0], xmin<<shift, xmax<<shift);
@@ -1727,7 +1726,7 @@
         }
         
         score= ((unsigned)(score*score + 128*256))>>16;
-        s->current_picture.mc_mb_var_sum += score;
+        s->mc_mb_var_sum_temp += score;
         s->current_picture.mc_mb_var[mb_y*s->mb_stride + mb_x] = score; //FIXME use SSE
     }
 
--- a/motion_est_template.c	Thu Feb 12 16:21:21 2004 +0000
+++ b/motion_est_template.c	Fri Feb 13 17:54:10 2004 +0000
@@ -557,9 +557,11 @@
 
 #define CHECK_CLIPED_MV(ax,ay)\
 {\
-    const int x= FFMAX(xmin, FFMIN(ax, xmax));\
-    const int y= FFMAX(ymin, FFMIN(ay, ymax));\
-    CHECK_MV(x, y)\
+    const int x= ax;\
+    const int y= ay;\
+    const int x2= FFMAX(xmin, FFMIN(x, xmax));\
+    const int y2= FFMAX(ymin, FFMIN(y, ymax));\
+    CHECK_MV(x2, y2)\
 }
 
 #define CHECK_MV_DIR(x,y,new_dir)\
@@ -912,7 +914,7 @@
     score_map[0]= dmin;
 
     /* first line */
-    if (s->mb_y == 0) {
+    if (s->first_slice_line) {
         CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
         CHECK_CLIPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16, 
                         (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
@@ -938,13 +940,15 @@
         if(s->me.pre_pass){
             CHECK_CLIPED_MV((last_mv[ref_mv_xy-1][0]*ref_mv_scale + (1<<15))>>16, 
                             (last_mv[ref_mv_xy-1][1]*ref_mv_scale + (1<<15))>>16)
-            CHECK_CLIPED_MV((last_mv[ref_mv_xy-ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
-                            (last_mv[ref_mv_xy-ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
+            if(!s->first_slice_line)
+                CHECK_CLIPED_MV((last_mv[ref_mv_xy-ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
+                                (last_mv[ref_mv_xy-ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
         }else{
             CHECK_CLIPED_MV((last_mv[ref_mv_xy+1][0]*ref_mv_scale + (1<<15))>>16, 
                             (last_mv[ref_mv_xy+1][1]*ref_mv_scale + (1<<15))>>16)
-            CHECK_CLIPED_MV((last_mv[ref_mv_xy+ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
-                            (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
+            if(s->end_mb_y == s->mb_height || s->mb_y+1<s->end_mb_y)  //FIXME replace at least with last_slice_line
+                CHECK_CLIPED_MV((last_mv[ref_mv_xy+ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
+                                (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
         }
     }
 
@@ -1024,7 +1028,7 @@
     dmin = 1000000;
 //printf("%d %d %d %d //",xmin, ymin, xmax, ymax); 
     /* first line */
-    if (s->mb_y == 0/* && block<2*/) {
+    if (s->first_slice_line) {
 	CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
         CHECK_CLIPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16, 
                         (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
@@ -1044,8 +1048,9 @@
     if(dmin>64*4){
         CHECK_CLIPED_MV((last_mv[ref_mv_xy+1][0]*ref_mv_scale + (1<<15))>>16, 
                         (last_mv[ref_mv_xy+1][1]*ref_mv_scale + (1<<15))>>16)
-        CHECK_CLIPED_MV((last_mv[ref_mv_xy+ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
-                        (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
+        if(s->end_mb_y == s->mb_height || s->mb_y+1<s->end_mb_y)  //FIXME replace at least with last_slice_line
+            CHECK_CLIPED_MV((last_mv[ref_mv_xy+ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
+                            (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
     }
 
     if(s->me.dia_size==-1)
@@ -1102,7 +1107,7 @@
     dmin = 1000000;
 //printf("%d %d %d %d //",xmin, ymin, xmax, ymax); 
     /* first line */
-    if (s->mb_y == 0) {
+    if (s->first_slice_line) {
 	CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
         CHECK_CLIPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16, 
                         (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
@@ -1122,8 +1127,9 @@
     if(dmin>64*4){
         CHECK_CLIPED_MV((last_mv[ref_mv_xy+1][0]*ref_mv_scale + (1<<15))>>16, 
                         (last_mv[ref_mv_xy+1][1]*ref_mv_scale + (1<<15))>>16)
-        CHECK_CLIPED_MV((last_mv[ref_mv_xy+ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
-                        (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
+        if(s->end_mb_y == s->mb_height || s->mb_y+1<s->end_mb_y)  //FIXME replace at least with last_slice_line
+            CHECK_CLIPED_MV((last_mv[ref_mv_xy+ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
+                            (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
     }
 
     if(s->me.dia_size==-1)
--- a/mpeg12.c	Thu Feb 12 16:21:21 2004 +0000
+++ b/mpeg12.c	Fri Feb 13 17:54:10 2004 +0000
@@ -419,7 +419,9 @@
             put_bits(&s->pb, 8, 255);
         }
         put_bits(&s->pb, 2, s->intra_dc_precision);
-        put_bits(&s->pb, 2, s->picture_structure= PICT_FRAME);
+        
+        assert(s->picture_structure == PICT_FRAME);
+        put_bits(&s->pb, 2, s->picture_structure);
         if (s->progressive_sequence) {
             put_bits(&s->pb, 1, 0); /* no repeat */
         } else {
--- a/mpegvideo.c	Thu Feb 12 16:21:21 2004 +0000
+++ b/mpegvideo.c	Fri Feb 13 17:54:10 2004 +0000
@@ -401,6 +401,98 @@
     }
 }
 
+static int init_duplicate_context(MpegEncContext *s, MpegEncContext *base){ // allocates the buffers that s owns privately; returns 0, or -1 via fail:. base is not referenced in this body
+    int i;
+
+    CHECKED_ALLOCZ(s->allocated_edge_emu_buffer, (s->width+64)*2*17*2); //(width + edge + align)*interlaced*MBsize*tolerance
+    s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*17; // offset is half the allocated size, so this points at the middle of the allocation
+
+     //FIXME should be linesize instead of s->width*2 but that isn't known before get_buffer()
+    CHECKED_ALLOCZ(s->me.scratchpad,  s->width*2*16*2*sizeof(uint8_t)) 
+    s->rd_scratchpad=   s->me.scratchpad; // rd/b/obmc scratchpads are not separate allocations, they alias me.scratchpad
+    s->b_scratchpad=    s->me.scratchpad;
+    s->obmc_scratchpad= s->me.scratchpad + 16;
+    if (s->encoding) { // the motion estimation maps and dct_error_sum are allocated for encoding only
+        CHECKED_ALLOCZ(s->me.map      , ME_MAP_SIZE*sizeof(uint32_t))
+        CHECKED_ALLOCZ(s->me.score_map, ME_MAP_SIZE*sizeof(uint32_t))
+        if(s->avctx->noise_reduction){
+            CHECKED_ALLOCZ(s->dct_error_sum, 2 * 64 * sizeof(int))
+        }
+    }   
+    CHECKED_ALLOCZ(s->blocks, 64*6*2 * sizeof(DCTELEM))
+    s->block= s->blocks[0]; // block refers to the first element of blocks
+
+    for(i=0;i<12;i++){
+        s->pblocks[i] = (short *)(&s->block[i]); // pblocks[i] points at block[i]
+    }
+    return 0;
+fail: // presumably the target of CHECKED_ALLOCZ on allocation failure (macro is defined elsewhere) -- TODO confirm
+    return -1; //free() through MPV_common_end()
+}
+
+static void free_duplicate_context(MpegEncContext *s){ // frees the per-context buffers set up by init_duplicate_context()
+    if(s==NULL) return; // a NULL context is accepted and ignored
+
+    av_freep(&s->allocated_edge_emu_buffer); s->edge_emu_buffer= NULL; // edge_emu_buffer pointed into the freed allocation
+    av_freep(&s->me.scratchpad);
+    s->rd_scratchpad=   
+    s->b_scratchpad=    
+    s->obmc_scratchpad= NULL; // these aliased me.scratchpad, so they are only cleared, not freed
+    
+    av_freep(&s->dct_error_sum);
+    av_freep(&s->me.map);
+    av_freep(&s->me.score_map);
+    av_freep(&s->blocks);
+    s->block= NULL; // block pointed into blocks
+}
+
+static void backup_duplicate_context(MpegEncContext *bak, MpegEncContext *src){ // copies only the fields listed below from src to bak; other fields of bak are not written
+#define COPY(a) bak->a= src->a
+    COPY(allocated_edge_emu_buffer); // buffer pointers: only the pointer values are copied, not the buffers
+    COPY(edge_emu_buffer);
+    COPY(me.scratchpad);
+    COPY(rd_scratchpad);
+    COPY(b_scratchpad);
+    COPY(obmc_scratchpad);
+    COPY(me.map);
+    COPY(me.score_map);
+    COPY(blocks);
+    COPY(block);
+    COPY(start_mb_y); // macroblock row range assigned to this context
+    COPY(end_mb_y);
+    COPY(me.map_generation);
+    COPY(pb); // the PutBitContext is copied by value
+    COPY(dct_error_sum);
+#undef COPY
+}
+
+static void update_duplicate_context(MpegEncContext *dst, MpegEncContext *src){ // makes dst a copy of src except for the fields handled by backup_duplicate_context()
+    MpegEncContext bak; // only the fields written by backup_duplicate_context() are ever set or read in bak
+    //FIXME copy only needed parts
+//START_TIMER
+    backup_duplicate_context(&bak, dst); // save the fields of dst that must survive
+    memcpy(dst, src, sizeof(MpegEncContext)); // overwrite the whole of dst with src
+    backup_duplicate_context(dst, &bak); // put the saved fields back into dst
+//STOP_TIMER("update_duplicate_context") //about 10k cycles / 0.01 sec for 1000frames on 1ghz with 2 threads
+}
+
+static void update_duplicate_context_after_me(MpegEncContext *dst, MpegEncContext *src){ // copies the fields listed below from src to dst; presumably used after motion estimation (caller not visible here)
+#define COPY(a) dst->a= src->a
+    COPY(pict_type);
+    COPY(current_picture); // assigned by value
+    COPY(f_code);
+    COPY(b_code);
+    COPY(qscale);
+    COPY(lambda);
+    COPY(lambda2);
+    COPY(picture_in_gop_number);
+    COPY(gop_picture_number);
+    COPY(frame_pred_frame_dct); //FIXME don't set in encode_header
+    COPY(progressive_frame); //FIXME don't set in encode_header
+    COPY(partitioned_frame); //FIXME don't set in encode_header
+#undef COPY
+}
+
 /* init common structure for both encoder and decoder */
 int MPV_common_init(MpegEncContext *s)
 {
@@ -456,9 +548,6 @@
                                + (toupper((s->avctx->stream_codec_tag>>16)&0xFF)<<16) 
                                + (toupper((s->avctx->stream_codec_tag>>24)&0xFF)<<24);
 
-    CHECKED_ALLOCZ(s->allocated_edge_emu_buffer, (s->width+64)*2*17*2); //(width + edge + align)*interlaced*MBsize*tolerance
-    s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*17;
-
     s->avctx->coded_frame= (AVFrame*)&s->current_picture;
 
     CHECKED_ALLOCZ(s->mb_index2xy, (s->mb_num+1)*sizeof(int)) //error ressilience code looks cleaner with this
@@ -484,17 +573,6 @@
         s->b_bidir_back_mv_table= s->b_bidir_back_mv_table_base + s->mb_stride + 1;
         s->b_direct_mv_table    = s->b_direct_mv_table_base     + s->mb_stride + 1;
 
-        //FIXME should be linesize instead of s->width*2 but that isnt known before get_buffer()
-        CHECKED_ALLOCZ(s->me.scratchpad,  s->width*2*16*3*sizeof(uint8_t)) 
-        
-        CHECKED_ALLOCZ(s->me.map      , ME_MAP_SIZE*sizeof(uint32_t))
-        CHECKED_ALLOCZ(s->me.score_map, ME_MAP_SIZE*sizeof(uint32_t))
-
-        if(s->codec_id==CODEC_ID_MPEG4){
-            CHECKED_ALLOCZ(s->tex_pb_buffer, PB_BUFFER_SIZE);
-            CHECKED_ALLOCZ(   s->pb2_buffer, PB_BUFFER_SIZE);
-        }
-        
         if(s->msmpeg4_version){
             CHECKED_ALLOCZ(s->ac_stats, 2*2*(MAX_LEVEL+1)*(MAX_RUN+1)*2*sizeof(int));
         }
@@ -513,12 +591,9 @@
         CHECKED_ALLOCZ(s->reordered_input_picture, MAX_PICTURE_COUNT * sizeof(Picture*))
         
         if(s->avctx->noise_reduction){
-            CHECKED_ALLOCZ(s->dct_error_sum, 2 * 64 * sizeof(int))
             CHECKED_ALLOCZ(s->dct_offset, 2 * 64 * sizeof(uint16_t))
         }
     }
-    CHECKED_ALLOCZ(s->blocks, 64*6*2 * sizeof(DCTELEM))
-        
     CHECKED_ALLOCZ(s->picture, MAX_PICTURE_COUNT * sizeof(Picture))
 
     CHECKED_ALLOCZ(s->error_status_table, mb_array_size*sizeof(uint8_t))
@@ -578,12 +653,6 @@
     //Note the +1 is for a quicker mpeg4 slice_end detection
     CHECKED_ALLOCZ(s->prev_pict_types, PREV_PICT_TYPES_BUFFER_SIZE);
     
-    s->block= s->blocks[0];
-
-    for(i=0;i<12;i++){
-        s->pblocks[i] = (short *)(&s->block[i]);
-    }
-
     s->parse_context.state= -1;
     if((s->avctx->debug&(FF_DEBUG_VIS_QP|FF_DEBUG_VIS_MB_TYPE)) || (s->avctx->debug_mv)){
        s->visualization_buffer[0] = av_malloc((s->mb_width*16 + 2*EDGE_WIDTH) * s->mb_height*16 + 2*EDGE_WIDTH);
@@ -592,20 +661,38 @@
     }
 
     s->context_initialized = 1;
+
+    s->thread_context[0]= s;
+    for(i=1; i<s->avctx->thread_count; i++){
+        s->thread_context[i]= av_malloc(sizeof(MpegEncContext));
+        memcpy(s->thread_context[i], s, sizeof(MpegEncContext));
+    }
+
+    for(i=0; i<s->avctx->thread_count; i++){
+        if(init_duplicate_context(s->thread_context[i], s) < 0)
+           goto fail;
+        s->thread_context[i]->start_mb_y= (s->mb_height*(i  ) + s->avctx->thread_count/2) / s->avctx->thread_count;
+        s->thread_context[i]->end_mb_y  = (s->mb_height*(i+1) + s->avctx->thread_count/2) / s->avctx->thread_count;
+    }
+
     return 0;
  fail:
     MPV_common_end(s);
     return -1;
 }
 
-
-//extern int sads;
-
 /* init common structure for both encoder and decoder */
 void MPV_common_end(MpegEncContext *s)
 {
     int i, j, k;
 
+    for(i=0; i<s->avctx->thread_count; i++){
+        free_duplicate_context(s->thread_context[i]);
+    }
+    for(i=1; i<s->avctx->thread_count; i++){
+        av_freep(&s->thread_context[i]);
+    }
+
     av_freep(&s->parse_context.buffer);
     s->parse_context.buffer_size=0;
 
@@ -641,16 +728,10 @@
     av_freep(&s->mbintra_table);
     av_freep(&s->cbp_table);
     av_freep(&s->pred_dir_table);
-    av_freep(&s->me.scratchpad);
-    av_freep(&s->me.map);
-    av_freep(&s->me.score_map);
     
     av_freep(&s->mbskip_table);
     av_freep(&s->prev_pict_types);
     av_freep(&s->bitstream_buffer);
-    av_freep(&s->tex_pb_buffer);
-    av_freep(&s->pb2_buffer);
-    av_freep(&s->allocated_edge_emu_buffer); s->edge_emu_buffer= NULL;
     av_freep(&s->avctx->stats_out);
     av_freep(&s->ac_stats);
     av_freep(&s->error_status_table);
@@ -660,7 +741,6 @@
     av_freep(&s->q_inter_matrix);
     av_freep(&s->q_intra_matrix16);
     av_freep(&s->q_inter_matrix16);
-    av_freep(&s->blocks);
     av_freep(&s->input_picture);
     av_freep(&s->reordered_input_picture);
     av_freep(&s->dct_error_sum);
@@ -797,6 +877,16 @@
         av_log(avctx, AV_LOG_ERROR, "closed gop with scene change detection arent supported yet\n");
         return -1;
     }
+    
+    if(s->avctx->thread_count > 1 && s->codec_id != CODEC_ID_MPEG4 
+       && s->codec_id != CODEC_ID_MPEG1VIDEO && s->codec_id != CODEC_ID_MPEG2VIDEO 
+       && (s->codec_id != CODEC_ID_H263P || !(s->flags & CODEC_FLAG_H263P_SLICE_STRUCT))){
+        av_log(avctx, AV_LOG_ERROR, "multi threaded encoding not supported by codec\n");
+        return -1;
+    }
+    
+    if(s->avctx->thread_count > 1)
+        s->rtp_mode= 1;
 
     i= ff_gcd(avctx->frame_rate, avctx->frame_rate_base);
     if(i > 1){
@@ -990,6 +1080,7 @@
         s->chroma_qscale_table= ff_h263_chroma_qscale_table;
     s->progressive_frame= 
     s->progressive_sequence= !(avctx->flags & (CODEC_FLAG_INTERLACED_DCT|CODEC_FLAG_INTERLACED_ME));
+    s->quant_precision=5;
     
     ff_set_cmp(&s->dsp, s->dsp.ildct_cmp, s->avctx->ildct_cmp);
     
@@ -1903,7 +1994,14 @@
         return -1;
     }
     
-    init_put_bits(&s->pb, buf, buf_size);
+    for(i=0; i<avctx->thread_count; i++){
+        int y= s->thread_context[i]->start_mb_y;
+        int h= s->mb_height;
+        uint8_t *start= buf + buf_size* y   /h;
+        uint8_t *end  = buf + buf_size*(y+1)/h;
+
+        init_put_bits(&s->thread_context[i]->pb, start, end - start);
+    }
 
     s->picture_in_gop_number++;
 
@@ -2400,7 +2498,7 @@
         if(i && mv[i][0]==mv[MID][0] && mv[i][1]==mv[MID][1]){
             ptr[i]= ptr[MID];
         }else{
-            ptr[i]= s->edge_emu_buffer + 16 + 8*(i&1) + s->linesize*8*(i>>1);
+            ptr[i]= s->obmc_scratchpad + 8*(i&1) + s->linesize*8*(i>>1);
             hpel_motion(s, ptr[i], src,
                         src_x, src_y,
                         s->width, s->height, s->linesize,
@@ -3026,9 +3124,9 @@
             dest_cb= s->dest[1];
             dest_cr= s->dest[2];
         }else{
-            dest_y = s->edge_emu_buffer+32; //FIXME cleanup scratchpad pointers
-            dest_cb= s->edge_emu_buffer+48;
-            dest_cr= s->edge_emu_buffer+56;
+            dest_y = s->b_scratchpad;
+            dest_cb= s->b_scratchpad+16*s->linesize;
+            dest_cr= s->b_scratchpad+16*s->linesize+8;
         }
         if (!s->mb_intra) {
             /* motion handling */
@@ -3634,14 +3732,21 @@
 #ifdef CONFIG_ENCODERS
 void ff_copy_bits(PutBitContext *pb, uint8_t *src, int length)
 {
-    int bytes= length>>4;
+    int words= length>>4;
     int bits= length&15;
     int i;
 
     if(length==0) return;
-
-    for(i=0; i<bytes; i++) put_bits(pb, 16, be2me_16(((uint16_t*)src)[i]));
-    put_bits(pb, bits, be2me_16(((uint16_t*)src)[i])>>(16-bits));
+    
+//    if(put_bits_count(pb)&7){ //FIXME
+        for(i=0; i<words; i++) put_bits(pb, 16, be2me_16(((uint16_t*)src)[i]));
+/*    }else{
+        flush_put_bits(pb);
+        memcpy(pbBufPtr(pb), src, 2*words);
+        skip_put_bytes(pb, 2*words);
+    }*/
+        
+    put_bits(pb, bits, be2me_16(((uint16_t*)src)[words])>>(16-bits));
 }
 
 static inline void copy_context_before_encode(MpegEncContext *d, MpegEncContext *s, int type){
@@ -3725,11 +3830,10 @@
     
     if(*next_block){
         memcpy(dest_backup, s->dest, sizeof(s->dest));
-        s->dest[0] = s->me.scratchpad;
-        s->dest[1] = s->me.scratchpad + 16;
-        s->dest[2] = s->me.scratchpad + 16 + 8;
-        assert(2*s->uvlinesize == s->linesize); //should be no prob for encoding
-        assert(s->linesize >= 64); //FIXME
+        s->dest[0] = s->rd_scratchpad;
+        s->dest[1] = s->rd_scratchpad + 16*s->linesize;
+        s->dest[2] = s->rd_scratchpad + 16*s->linesize + 8;
+        assert(s->linesize >= 32); //FIXME
     }
 
     encode_mb(s, motion_x, motion_y);
@@ -3797,16 +3901,75 @@
                +sse(s, s->new_picture.data[2] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[2], w>>1, h>>1, s->uvlinesize);
 }
 
-static void encode_picture(MpegEncContext *s, int picture_number)
-{
+static int pre_estimate_motion_thread(AVCodecContext *c, void *arg){ // runs ff_pre_estimate_p_frame_motion() over rows [start_mb_y, end_mb_y) of the context in arg; c is unused; always returns 0
+    MpegEncContext *s= arg; // arg is the MpegEncContext to work on
+
+    // pre_pass is set for the duration of the loop and cleared again afterwards
+    s->me.pre_pass=1;
+    s->me.dia_size= s->avctx->pre_dia_size;
+    s->first_slice_line=1; // stays 1 only while the first visited row is processed
+    for(s->mb_y= s->end_mb_y-1; s->mb_y >= s->start_mb_y; s->mb_y--) { // rows are visited bottom-up, so the first visited row is end_mb_y-1
+        for(s->mb_x=s->mb_width-1; s->mb_x >=0 ;s->mb_x--) { // columns are visited right-to-left
+            ff_pre_estimate_p_frame_motion(s, s->mb_x, s->mb_y);
+        }
+        s->first_slice_line=0;
+    }
+    
+    s->me.pre_pass=0;
+    
+    return 0;
+}
+
+static int estimate_motion_thread(AVCodecContext *c, void *arg){ // runs B- or P-frame motion estimation over rows [start_mb_y, end_mb_y) of the context in arg; c is unused; always returns 0
+    MpegEncContext *s= arg; // arg is the MpegEncContext to work on
+
+    s->me.dia_size= s->avctx->dia_size;
+    s->first_slice_line=1; // stays 1 only while row start_mb_y is processed
+    for(s->mb_y= s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) { // rows are visited top-down
+        s->mb_x=0; //for block init below
+        ff_init_block_index(s);
+        for(s->mb_x=0; s->mb_x < s->mb_width; s->mb_x++) {
+            s->block_index[0]+=2; // the four block indices move on by 2 for every macroblock
+            s->block_index[1]+=2;
+            s->block_index[2]+=2;
+            s->block_index[3]+=2;
+            
+            /* compute motion vector & mb_type and store in context */
+            if(s->pict_type==B_TYPE)
+                ff_estimate_b_frame_motion(s, s->mb_x, s->mb_y);
+            else
+                ff_estimate_p_frame_motion(s, s->mb_x, s->mb_y);
+        }
+        s->first_slice_line=0;
+    }
+    return 0;
+}
+
+static void write_slice_end(MpegEncContext *s){ // finishes the current slice in s->pb: codec-specific stuffing, then align and flush
+    if(s->codec_id==CODEC_ID_MPEG4){
+        if(s->partitioned_frame){
+            ff_mpeg4_merge_partitions(s); // for partitioned frames the partitions are merged before stuffing
+        }
+    
+        ff_mpeg4_stuffing(&s->pb);
+    }else if(s->out_format == FMT_MJPEG){
+        ff_mjpeg_stuffing(&s->pb);
+    }
+
+    align_put_bits(&s->pb); // done for every codec, including those without a stuffing step above
+    flush_put_bits(&s->pb);
+}
+
+static int encode_thread(AVCodecContext *c, void *arg){
+    MpegEncContext *s= arg;
     int mb_x, mb_y, pdif = 0;
     int i, j;
-    int bits;
     MpegEncContext best_s, backup_s;
     uint8_t bit_buf[2][3000];
     uint8_t bit_buf2[2][3000];
     uint8_t bit_buf_tex[2][3000];
     PutBitContext pb[2], pb2[2], tex_pb[2];
+//printf("%d->%d\n", s->resync_mb_y, s->end_mb_y);
 
     for(i=0; i<2; i++){
         init_put_bits(&pb    [i], bit_buf    [i], 3000);
@@ -3814,236 +3977,7 @@
         init_put_bits(&tex_pb[i], bit_buf_tex[i], 3000);
     }
 
-    s->picture_number = picture_number;
-    
-    /* Reset the average MB variance */
-    s->current_picture.mb_var_sum = 0;
-    s->current_picture.mc_mb_var_sum = 0;
-
-#ifdef CONFIG_RISKY
-    /* we need to initialize some time vars before we can encode b-frames */
-    // RAL: Condition added for MPEG1VIDEO
-    if (s->codec_id == CODEC_ID_MPEG1VIDEO || s->codec_id == CODEC_ID_MPEG2VIDEO || (s->h263_pred && !s->h263_msmpeg4))
-        ff_set_mpeg4_time(s, s->picture_number); 
-#endif
-        
-    s->scene_change_score=0;
-    
-    s->lambda= s->current_picture_ptr->quality; //FIXME qscale / ... stuff for ME ratedistoration
-    
-    if(s->pict_type==I_TYPE){
-        if(s->msmpeg4_version >= 3) s->no_rounding=1;
-        else                        s->no_rounding=0;
-    }else if(s->pict_type!=B_TYPE){
-        if(s->flipflop_rounding || s->codec_id == CODEC_ID_H263P || s->codec_id == CODEC_ID_MPEG4)
-            s->no_rounding ^= 1;          
-    }
-    
-    /* Estimate motion for every MB */
-    s->mb_intra=0; //for the rate distoration & bit compare functions
-    if(s->pict_type != I_TYPE){
-        if(s->pict_type != B_TYPE){
-            if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){
-                s->me.pre_pass=1;
-                s->me.dia_size= s->avctx->pre_dia_size;
-
-                for(mb_y=s->mb_height-1; mb_y >=0 ; mb_y--) {
-                    s->mb_y = mb_y;
-                    for(mb_x=s->mb_width-1; mb_x >=0 ; mb_x--) {
-                        s->mb_x = mb_x;
-                        ff_pre_estimate_p_frame_motion(s, mb_x, mb_y);
-                    }
-                }
-                s->me.pre_pass=0;
-            }
-        }
-
-        s->me.dia_size= s->avctx->dia_size;
-        for(mb_y=0; mb_y < s->mb_height; mb_y++) {
-            s->mb_y = mb_y;
-            s->block_index[0]= s->block_wrap[0]*(mb_y*2 + 1) - 1;
-            s->block_index[1]= s->block_wrap[0]*(mb_y*2 + 1);
-            s->block_index[2]= s->block_wrap[0]*(mb_y*2 + 2) - 1;
-            s->block_index[3]= s->block_wrap[0]*(mb_y*2 + 2);
-            for(mb_x=0; mb_x < s->mb_width; mb_x++) {
-                s->mb_x = mb_x;
-                s->block_index[0]+=2;
-                s->block_index[1]+=2;
-                s->block_index[2]+=2;
-                s->block_index[3]+=2;
-                
-                /* compute motion vector & mb_type and store in context */
-                if(s->pict_type==B_TYPE)
-                    ff_estimate_b_frame_motion(s, mb_x, mb_y);
-                else
-                    ff_estimate_p_frame_motion(s, mb_x, mb_y);
-            }
-        }
-    }else /* if(s->pict_type == I_TYPE) */{
-        /* I-Frame */
-        for(i=0; i<s->mb_stride*s->mb_height; i++)
-            s->mb_type[i]= CANDIDATE_MB_TYPE_INTRA;
-        
-        if(!s->fixed_qscale){
-            /* finding spatial complexity for I-frame rate control */
-            for(mb_y=0; mb_y < s->mb_height; mb_y++) {
-                for(mb_x=0; mb_x < s->mb_width; mb_x++) {
-                    int xx = mb_x * 16;
-                    int yy = mb_y * 16;
-                    uint8_t *pix = s->new_picture.data[0] + (yy * s->linesize) + xx;
-                    int varc;
-		    int sum = s->dsp.pix_sum(pix, s->linesize);
-    
-		    varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8;
-
-                    s->current_picture.mb_var [s->mb_stride * mb_y + mb_x] = varc;
-                    s->current_picture.mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
-                    s->current_picture.mb_var_sum    += varc;
-                }
-            }
-        }
-    }
-    emms_c();
-
-    if(s->scene_change_score > s->avctx->scenechange_threshold && s->pict_type == P_TYPE){
-        s->pict_type= I_TYPE;
-        for(i=0; i<s->mb_stride*s->mb_height; i++)
-            s->mb_type[i]= CANDIDATE_MB_TYPE_INTRA;
-//printf("Scene change detected, encoding as I Frame %d %d\n", s->current_picture.mb_var_sum, s->current_picture.mc_mb_var_sum);
-    }
-
-    if(!s->umvplus){
-        if(s->pict_type==P_TYPE || s->pict_type==S_TYPE) {
-            s->f_code= ff_get_best_fcode(s, s->p_mv_table, CANDIDATE_MB_TYPE_INTER);
-
-            if(s->flags & CODEC_FLAG_INTERLACED_ME){
-                int a,b;
-                a= ff_get_best_fcode(s, s->p_field_mv_table[0][0], CANDIDATE_MB_TYPE_INTER_I); //FIXME field_select
-                b= ff_get_best_fcode(s, s->p_field_mv_table[1][1], CANDIDATE_MB_TYPE_INTER_I);
-                s->f_code= FFMAX(s->f_code, FFMAX(a,b));
-            }
-                    
-            ff_fix_long_p_mvs(s);
-            ff_fix_long_mvs(s, NULL, 0, s->p_mv_table, s->f_code, CANDIDATE_MB_TYPE_INTER, 0);
-            if(s->flags & CODEC_FLAG_INTERLACED_ME){
-                for(i=0; i<2; i++){
-                    for(j=0; j<2; j++)
-                        ff_fix_long_mvs(s, s->p_field_select_table[i], j, 
-                                        s->p_field_mv_table[i][j], s->f_code, CANDIDATE_MB_TYPE_INTER_I, 0);
-                }
-            }
-        }
-
-        if(s->pict_type==B_TYPE){
-            int a, b;
-
-            a = ff_get_best_fcode(s, s->b_forw_mv_table, CANDIDATE_MB_TYPE_FORWARD);
-            b = ff_get_best_fcode(s, s->b_bidir_forw_mv_table, CANDIDATE_MB_TYPE_BIDIR);
-            s->f_code = FFMAX(a, b);
-
-            a = ff_get_best_fcode(s, s->b_back_mv_table, CANDIDATE_MB_TYPE_BACKWARD);
-            b = ff_get_best_fcode(s, s->b_bidir_back_mv_table, CANDIDATE_MB_TYPE_BIDIR);
-            s->b_code = FFMAX(a, b);
-
-            ff_fix_long_mvs(s, NULL, 0, s->b_forw_mv_table, s->f_code, CANDIDATE_MB_TYPE_FORWARD, 1);
-            ff_fix_long_mvs(s, NULL, 0, s->b_back_mv_table, s->b_code, CANDIDATE_MB_TYPE_BACKWARD, 1);
-            ff_fix_long_mvs(s, NULL, 0, s->b_bidir_forw_mv_table, s->f_code, CANDIDATE_MB_TYPE_BIDIR, 1);
-            ff_fix_long_mvs(s, NULL, 0, s->b_bidir_back_mv_table, s->b_code, CANDIDATE_MB_TYPE_BIDIR, 1);
-            if(s->flags & CODEC_FLAG_INTERLACED_ME){
-                int dir;
-                for(dir=0; dir<2; dir++){
-                    for(i=0; i<2; i++){
-                        for(j=0; j<2; j++){
-                            int type= dir ? (CANDIDATE_MB_TYPE_BACKWARD_I|CANDIDATE_MB_TYPE_BIDIR_I) 
-                                          : (CANDIDATE_MB_TYPE_FORWARD_I |CANDIDATE_MB_TYPE_BIDIR_I);
-                            ff_fix_long_mvs(s, s->b_field_select_table[dir][i], j, 
-                                            s->b_field_mv_table[dir][i][j], dir ? s->b_code : s->f_code, type, 1);
-                        }
-                    }
-                }
-            }
-        }
-    }
-    
-    if (!s->fixed_qscale) 
-        s->current_picture.quality = ff_rate_estimate_qscale(s);
-
-    if(s->adaptive_quant){
-#ifdef CONFIG_RISKY
-        switch(s->codec_id){
-        case CODEC_ID_MPEG4:
-            ff_clean_mpeg4_qscales(s);
-            break;
-        case CODEC_ID_H263:
-        case CODEC_ID_H263P:
-        case CODEC_ID_FLV1:
-            ff_clean_h263_qscales(s);
-            break;
-        }
-#endif
-
-        s->lambda= s->lambda_table[0];
-        //FIXME broken
-    }else
-        s->lambda= s->current_picture.quality;
-//printf("%d %d\n", s->avctx->global_quality, s->current_picture.quality);
-    update_qscale(s);
-    
-    if(s->qscale < 3 && s->max_qcoeff<=128 && s->pict_type==I_TYPE && !(s->flags & CODEC_FLAG_QSCALE)) 
-        s->qscale= 3; //reduce cliping problems
-        
-    if (s->out_format == FMT_MJPEG) {
-        /* for mjpeg, we do include qscale in the matrix */
-        s->intra_matrix[0] = ff_mpeg1_default_intra_matrix[0];
-        for(i=1;i<64;i++){
-            int j= s->dsp.idct_permutation[i];
-
-            s->intra_matrix[j] = CLAMP_TO_8BIT((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
-        }
-        convert_matrix(&s->dsp, s->q_intra_matrix, s->q_intra_matrix16, 
-                       s->intra_matrix, s->intra_quant_bias, 8, 8);
-        s->qscale= 8;
-    }
-    
-    //FIXME var duplication
-    s->current_picture.key_frame= s->pict_type == I_TYPE;
-    s->current_picture.pict_type= s->pict_type;
-
-    if(s->current_picture.key_frame)
-        s->picture_in_gop_number=0;
-
     s->last_bits= put_bits_count(&s->pb);
-    switch(s->out_format) {
-    case FMT_MJPEG:
-        mjpeg_picture_header(s);
-        break;
-#ifdef CONFIG_RISKY
-    case FMT_H263:
-        if (s->codec_id == CODEC_ID_WMV2) 
-            ff_wmv2_encode_picture_header(s, picture_number);
-        else if (s->h263_msmpeg4) 
-            msmpeg4_encode_picture_header(s, picture_number);
-        else if (s->h263_pred)
-            mpeg4_encode_picture_header(s, picture_number);
-        else if (s->codec_id == CODEC_ID_RV10) 
-            rv10_encode_picture_header(s, picture_number);
-        else if (s->codec_id == CODEC_ID_FLV1)
-            ff_flv_encode_picture_header(s, picture_number);
-        else
-            h263_encode_picture_header(s, picture_number);
-        break;
-#endif
-    case FMT_MPEG1:
-        mpeg1_encode_picture_header(s, picture_number);
-        break;
-    case FMT_H264:
-        break;
-    default:
-        assert(0);
-    }
-    bits= put_bits_count(&s->pb);
-    s->header_bits= bits - s->last_bits;
-    s->last_bits= bits;
     s->mv_bits=0;
     s->misc_bits=0;
     s->i_tex_bits=0;
@@ -4080,10 +4014,11 @@
 #endif
 
     s->resync_mb_x=0;
-    s->resync_mb_y=0;
+    s->resync_mb_y=0; 
     s->first_slice_line = 1;
     s->ptr_lastgob = s->pb.buf;
-    for(mb_y=0; mb_y < s->mb_height; mb_y++) {
+    for(mb_y= s->start_mb_y; mb_y < s->end_mb_y; mb_y++) {
+//    printf("row %d at %X\n", s->mb_y, (int)s);
         s->mb_x=0;
         s->mb_y= mb_y;
 
@@ -4105,10 +4040,12 @@
             if(s->rtp_mode){
                 int current_packet_size, is_gob_start;
                 
-                current_packet_size= pbBufPtr(&s->pb) - s->ptr_lastgob;
+                current_packet_size= pbBufPtr(&s->pb) - s->ptr_lastgob; //FIXME wrong
                 
                 is_gob_start= s->avctx->rtp_payload_size && current_packet_size >= s->avctx->rtp_payload_size && mb_y + mb_x>0; 
                 
+                if(s->start_mb_y == mb_y && mb_y > 0 && mb_x==0) is_gob_start=1;
+                
                 switch(s->codec_id){
                 case CODEC_ID_H263:
                 case CODEC_ID_H263P:
@@ -4121,19 +4058,16 @@
                     if(s->mb_skip_run) is_gob_start=0;
                     break;
                 }
-                
+
                 if(is_gob_start){
-                    if(s->codec_id==CODEC_ID_MPEG4 && s->partitioned_frame){
-                        ff_mpeg4_merge_partitions(s);
-                        ff_mpeg4_init_partitions(s);
+                    if(s->start_mb_y != mb_y || mb_x!=0){
+                        write_slice_end(s);
+
+                        if(s->codec_id==CODEC_ID_MPEG4 && s->partitioned_frame){
+                            ff_mpeg4_init_partitions(s);
+                        }
                     }
                 
-                    if(s->codec_id==CODEC_ID_MPEG4) 
-                        ff_mpeg4_stuffing(&s->pb);
-
-                    align_put_bits(&s->pb);
-                    flush_put_bits(&s->pb);
-
                     assert((put_bits_count(&s->pb)&7) == 0);
                     current_packet_size= pbBufPtr(&s->pb) - s->ptr_lastgob;
                     
@@ -4417,10 +4351,10 @@
                     ff_h263_update_motion_val(s);
 #endif
         
-                if(next_block==0){
-                    s->dsp.put_pixels_tab[0][0](s->dest[0], s->me.scratchpad     , s->linesize  ,16);
-                    s->dsp.put_pixels_tab[1][0](s->dest[1], s->me.scratchpad + 16, s->uvlinesize, 8);
-                    s->dsp.put_pixels_tab[1][0](s->dest[2], s->me.scratchpad + 24, s->uvlinesize, 8);
+                if(next_block==0){ //FIXME 16 vs linesize16
+                    s->dsp.put_pixels_tab[0][0](s->dest[0], s->rd_scratchpad                     , s->linesize  ,16);
+                    s->dsp.put_pixels_tab[1][0](s->dest[1], s->rd_scratchpad + 16*s->linesize    , s->uvlinesize, 8);
+                    s->dsp.put_pixels_tab[1][0](s->dest[2], s->rd_scratchpad + 16*s->linesize + 8, s->uvlinesize, 8);
                 }
 
                 if(s->avctx->mb_decision == FF_MB_DECISION_BITS)
@@ -4577,26 +4511,286 @@
 //printf("MB %d %d bits\n", s->mb_x+s->mb_y*s->mb_stride, put_bits_count(&s->pb));
         }
     }
-    emms_c();
 
 #ifdef CONFIG_RISKY
-    if(s->codec_id==CODEC_ID_MPEG4 && s->partitioned_frame)
-        ff_mpeg4_merge_partitions(s);
-
+    //not beautiful here, but we must write it before flushing, so it has to be here
     if (s->msmpeg4_version && s->msmpeg4_version<4 && s->pict_type == I_TYPE)
         msmpeg4_encode_ext_header(s);
-
-    if(s->codec_id==CODEC_ID_MPEG4) 
-        ff_mpeg4_stuffing(&s->pb);
 #endif
 
+    write_slice_end(s);
+
     /* Send the last GOB if RTP */    
     if (s->avctx->rtp_callback) {
-        flush_put_bits(&s->pb);
         pdif = pbBufPtr(&s->pb) - s->ptr_lastgob;
         /* Call the RTP callback to send the last GOB */
+        emms_c();
         s->avctx->rtp_callback(s->ptr_lastgob, pdif, 0);
     }
+
+    return 0;
+}
+
+#define MERGE(field) do{ dst->field += src->field; src->field=0; }while(0) //adds src->field to dst->field and clears it in src; do/while(0) makes the expansion a single statement
+static void merge_context_after_me(MpegEncContext *dst, MpegEncContext *src){ //folds the sums which src collected during motion estimation into dst and resets them in src
+    MERGE(scene_change_score);
+    MERGE(mc_mb_var_sum_temp);
+    MERGE(mb_var_sum_temp);
+}
+
+static void merge_context_after_encode(MpegEncContext *dst, MpegEncContext *src){ //adds the encoding statistics of src to dst (clearing them in src) and appends the bitstream of src to dst->pb
+    int i;
+
+    MERGE(dct_count[0]); //note, the other dct vars are not part of the context
+    MERGE(dct_count[1]);
+    MERGE(mv_bits);
+    MERGE(header_bits);
+    MERGE(i_tex_bits);
+    MERGE(p_tex_bits);
+    MERGE(i_count);
+    MERGE(f_count);
+    MERGE(b_count);
+    MERGE(skip_count);
+    MERGE(misc_bits);
+    MERGE(error_count);
+    MERGE(padding_bug_score);
+
+    if(dst->avctx->noise_reduction){ //the error sums are only merged when noise reduction is enabled
+        for(i=0; i<64; i++){
+            MERGE(dct_error_sum[0][i]);
+            MERGE(dct_error_sum[1][i]);
+        }
+    }
+    
+    assert(put_bits_count(&src->pb) % 8 ==0); //both bitstreams are expected to end on a byte boundary here
+    assert(put_bits_count(&dst->pb) % 8 ==0);
+    ff_copy_bits(&dst->pb, src->pb.buf, put_bits_count(&src->pb)); //append everything that was written to src->pb
+    flush_put_bits(&dst->pb);
+}
+
+static void encode_picture(MpegEncContext *s, int picture_number) //per picture setup, motion estimation, rate control and header writing, followed by the encoding of all macroblock rows; the per row work is run through avctx->execute()
+{
+    int mb_x, mb_y;
+    int i, j;
+    int bits;
+
+    s->picture_number = picture_number;
+    
+    /* Reset the average MB variance */
+    s->mb_var_sum_temp    =
+    s->mc_mb_var_sum_temp = 0;
+
+#ifdef CONFIG_RISKY
+    /* we need to initialize some time vars before we can encode b-frames */
+    // RAL: Condition added for MPEG1VIDEO
+    if (s->codec_id == CODEC_ID_MPEG1VIDEO || s->codec_id == CODEC_ID_MPEG2VIDEO || (s->h263_pred && !s->h263_msmpeg4))
+        ff_set_mpeg4_time(s, s->picture_number); 
+#endif
+        
+    s->scene_change_score=0;
+    
+    s->lambda= s->current_picture_ptr->quality; //FIXME qscale / ... stuff for ME rate distortion
+    
+    if(s->pict_type==I_TYPE){
+        if(s->msmpeg4_version >= 3) s->no_rounding=1;
+        else                        s->no_rounding=0;
+    }else if(s->pict_type!=B_TYPE){
+        if(s->flipflop_rounding || s->codec_id == CODEC_ID_H263P || s->codec_id == CODEC_ID_MPEG4)
+            s->no_rounding ^= 1;          
+    }
+    
+    s->mb_intra=0; //for the rate distortion & bit compare functions
+    for(i=1; i<s->avctx->thread_count; i++){ //contexts 1..thread_count-1 only; NOTE(review): thread_context[0] is presumably s itself, confirm where thread_context is set up
+        update_duplicate_context(s->thread_context[i], s);
+    }
+    
+    /* Estimate motion for every MB */
+    if(s->pict_type != I_TYPE){
+        if(s->pict_type != B_TYPE){
+            if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){
+                s->avctx->execute(s->avctx, pre_estimate_motion_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count); //optional pre pass, one job per thread context
+            }
+        }
+
+        s->avctx->execute(s->avctx, estimate_motion_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count);
+        for(i=1; i<s->avctx->thread_count; i++){ //collect the sums of the other contexts in s
+            merge_context_after_me(s, s->thread_context[i]);
+        }
+    }else /* if(s->pict_type == I_TYPE) */{
+        /* I-Frame */
+        for(i=0; i<s->mb_stride*s->mb_height; i++)
+            s->mb_type[i]= CANDIDATE_MB_TYPE_INTRA;
+        
+        if(!s->fixed_qscale){
+            /* finding spatial complexity for I-frame rate control */
+            for(mb_y=0; mb_y < s->mb_height; mb_y++) {
+                for(mb_x=0; mb_x < s->mb_width; mb_x++) {
+                    int xx = mb_x * 16;
+                    int yy = mb_y * 16;
+                    uint8_t *pix = s->new_picture.data[0] + (yy * s->linesize) + xx;
+                    int varc;
+		    int sum = s->dsp.pix_sum(pix, s->linesize);
+    
+		    varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8;
+
+                    s->current_picture.mb_var [s->mb_stride * mb_y + mb_x] = varc;
+                    s->current_picture.mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
+                    s->mb_var_sum_temp    += varc; //summed directly in s, this loop is not run through execute()
+                }
+            }
+        }
+    }
+    s->current_picture.mc_mb_var_sum= s->current_picture_ptr->mc_mb_var_sum= s->mc_mb_var_sum_temp; //store the sums in current_picture and in *current_picture_ptr
+    s->current_picture.   mb_var_sum= s->current_picture_ptr->   mb_var_sum= s->   mb_var_sum_temp;
+    emms_c();
+
+    if(s->scene_change_score > s->avctx->scenechange_threshold && s->pict_type == P_TYPE){
+        s->pict_type= I_TYPE;
+        for(i=0; i<s->mb_stride*s->mb_height; i++)
+            s->mb_type[i]= CANDIDATE_MB_TYPE_INTRA;
+//printf("Scene change detected, encoding as I Frame %d %d\n", s->current_picture.mb_var_sum, s->current_picture.mc_mb_var_sum);
+    }
+
+    if(!s->umvplus){
+        if(s->pict_type==P_TYPE || s->pict_type==S_TYPE) {
+            s->f_code= ff_get_best_fcode(s, s->p_mv_table, CANDIDATE_MB_TYPE_INTER);
+
+            if(s->flags & CODEC_FLAG_INTERLACED_ME){
+                int a,b;
+                a= ff_get_best_fcode(s, s->p_field_mv_table[0][0], CANDIDATE_MB_TYPE_INTER_I); //FIXME field_select
+                b= ff_get_best_fcode(s, s->p_field_mv_table[1][1], CANDIDATE_MB_TYPE_INTER_I);
+                s->f_code= FFMAX(s->f_code, FFMAX(a,b));
+            }
+                    
+            ff_fix_long_p_mvs(s);
+            ff_fix_long_mvs(s, NULL, 0, s->p_mv_table, s->f_code, CANDIDATE_MB_TYPE_INTER, 0);
+            if(s->flags & CODEC_FLAG_INTERLACED_ME){
+                for(i=0; i<2; i++){
+                    for(j=0; j<2; j++)
+                        ff_fix_long_mvs(s, s->p_field_select_table[i], j, 
+                                        s->p_field_mv_table[i][j], s->f_code, CANDIDATE_MB_TYPE_INTER_I, 0);
+                }
+            }
+        }
+
+        if(s->pict_type==B_TYPE){
+            int a, b;
+
+            a = ff_get_best_fcode(s, s->b_forw_mv_table, CANDIDATE_MB_TYPE_FORWARD);
+            b = ff_get_best_fcode(s, s->b_bidir_forw_mv_table, CANDIDATE_MB_TYPE_BIDIR);
+            s->f_code = FFMAX(a, b);
+
+            a = ff_get_best_fcode(s, s->b_back_mv_table, CANDIDATE_MB_TYPE_BACKWARD);
+            b = ff_get_best_fcode(s, s->b_bidir_back_mv_table, CANDIDATE_MB_TYPE_BIDIR);
+            s->b_code = FFMAX(a, b);
+
+            ff_fix_long_mvs(s, NULL, 0, s->b_forw_mv_table, s->f_code, CANDIDATE_MB_TYPE_FORWARD, 1);
+            ff_fix_long_mvs(s, NULL, 0, s->b_back_mv_table, s->b_code, CANDIDATE_MB_TYPE_BACKWARD, 1);
+            ff_fix_long_mvs(s, NULL, 0, s->b_bidir_forw_mv_table, s->f_code, CANDIDATE_MB_TYPE_BIDIR, 1);
+            ff_fix_long_mvs(s, NULL, 0, s->b_bidir_back_mv_table, s->b_code, CANDIDATE_MB_TYPE_BIDIR, 1);
+            if(s->flags & CODEC_FLAG_INTERLACED_ME){
+                int dir;
+                for(dir=0; dir<2; dir++){
+                    for(i=0; i<2; i++){
+                        for(j=0; j<2; j++){
+                            int type= dir ? (CANDIDATE_MB_TYPE_BACKWARD_I|CANDIDATE_MB_TYPE_BIDIR_I) 
+                                          : (CANDIDATE_MB_TYPE_FORWARD_I |CANDIDATE_MB_TYPE_BIDIR_I);
+                            ff_fix_long_mvs(s, s->b_field_select_table[dir][i], j, 
+                                            s->b_field_mv_table[dir][i][j], dir ? s->b_code : s->f_code, type, 1);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (!s->fixed_qscale) 
+        s->current_picture.quality = ff_rate_estimate_qscale(s); //FIXME pic_ptr
+
+    if(s->adaptive_quant){
+#ifdef CONFIG_RISKY
+        switch(s->codec_id){
+        case CODEC_ID_MPEG4:
+            ff_clean_mpeg4_qscales(s);
+            break;
+        case CODEC_ID_H263:
+        case CODEC_ID_H263P:
+        case CODEC_ID_FLV1:
+            ff_clean_h263_qscales(s);
+            break;
+        }
+#endif
+
+        s->lambda= s->lambda_table[0];
+        //FIXME broken
+    }else
+        s->lambda= s->current_picture.quality;
+//printf("%d %d\n", s->avctx->global_quality, s->current_picture.quality);
+    update_qscale(s);
+    
+    if(s->qscale < 3 && s->max_qcoeff<=128 && s->pict_type==I_TYPE && !(s->flags & CODEC_FLAG_QSCALE)) 
+        s->qscale= 3; //reduce clipping problems
+        
+    if (s->out_format == FMT_MJPEG) {
+        /* for mjpeg, we do include qscale in the matrix */
+        s->intra_matrix[0] = ff_mpeg1_default_intra_matrix[0];
+        for(i=1;i<64;i++){
+            int j= s->dsp.idct_permutation[i];
+
+            s->intra_matrix[j] = CLAMP_TO_8BIT((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
+        }
+        convert_matrix(&s->dsp, s->q_intra_matrix, s->q_intra_matrix16, 
+                       s->intra_matrix, s->intra_quant_bias, 8, 8);
+        s->qscale= 8;
+    }
+    
+    //FIXME var duplication
+    s->current_picture.key_frame= s->pict_type == I_TYPE; //FIXME pic_ptr
+    s->current_picture.pict_type= s->pict_type;
+
+    if(s->current_picture.key_frame)
+        s->picture_in_gop_number=0;
+
+    s->last_bits= put_bits_count(&s->pb);
+    switch(s->out_format) {
+    case FMT_MJPEG:
+        mjpeg_picture_header(s);
+        break;
+#ifdef CONFIG_RISKY
+    case FMT_H263:
+        if (s->codec_id == CODEC_ID_WMV2) 
+            ff_wmv2_encode_picture_header(s, picture_number);
+        else if (s->h263_msmpeg4) 
+            msmpeg4_encode_picture_header(s, picture_number);
+        else if (s->h263_pred)
+            mpeg4_encode_picture_header(s, picture_number);
+        else if (s->codec_id == CODEC_ID_RV10) 
+            rv10_encode_picture_header(s, picture_number);
+        else if (s->codec_id == CODEC_ID_FLV1)
+            ff_flv_encode_picture_header(s, picture_number);
+        else
+            h263_encode_picture_header(s, picture_number);
+        break;
+#endif
+    case FMT_MPEG1:
+        mpeg1_encode_picture_header(s, picture_number);
+        break;
+    case FMT_H264:
+        break;
+    default:
+        assert(0);
+    }
+    bits= put_bits_count(&s->pb);
+    s->header_bits= bits - s->last_bits; //bits used by the picture header written just above
+        
+    for(i=1; i<s->avctx->thread_count; i++){ //NOTE(review): presumably hands the picture level state decided above to the other contexts, confirm in update_duplicate_context_after_me()
+        update_duplicate_context_after_me(s->thread_context[i], s);
+    }
+    s->avctx->execute(s->avctx, encode_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count);
+    for(i=1; i<s->avctx->thread_count; i++){ //add the statistics and append the bitstreams of the other contexts, in context order
+        merge_context_after_encode(s, s->thread_context[i]);
+    }
+    emms_c();
 }
 
 #endif //CONFIG_ENCODERS
--- a/mpegvideo.h	Thu Feb 12 16:21:21 2004 +0000
+++ b/mpegvideo.h	Fri Feb 13 17:54:10 2004 +0000
@@ -47,6 +47,8 @@
 #define MAX_FCODE 7
 #define MAX_MV 2048
 
+#define MAX_THREADS 8
+
 #define MAX_PICTURE_COUNT 15
 
 #define ME_MAP_SIZE 64
@@ -285,6 +287,10 @@
     Picture **input_picture;   ///< next pictures on display order for encoding
     Picture **reordered_input_picture; ///< pointer to the next pictures in codedorder for encoding
     
+    int start_mb_y;            ///< start mb_y of this thread (so current thread should process start_mb_y <= row < end_mb_y)
+    int end_mb_y;              ///< end   mb_y of this thread (so current thread should process start_mb_y <= row < end_mb_y)
+    struct MpegEncContext *thread_context[MAX_THREADS];
+    
     /** 
      * copy of the previous picture structure.
      * note, linesize & data, might not match the previous picture (for field pictures)
@@ -332,7 +338,10 @@
     uint8_t *cbp_table;           ///< used to store cbp, ac_pred for partitioned decoding 
     uint8_t *pred_dir_table;      ///< used to store pred_dir for partitioned decoding 
     uint8_t *allocated_edge_emu_buffer;
-    uint8_t *edge_emu_buffer;     ///< points into the middle of allocated_edge_emu_buffer  
+    uint8_t *edge_emu_buffer;     ///< points into the middle of allocated_edge_emu_buffer
+    uint8_t *rd_scratchpad;       ///< scratchpad for rate distortion mb decision
+    uint8_t *obmc_scratchpad;
+    uint8_t *b_scratchpad;        ///< scratchpad used for writing into write only buffers
 
     int qscale;                 ///< QP 
     int chroma_qscale;          ///< chroma QP 
@@ -487,6 +496,10 @@
     int misc_bits; ///< cbp, mb_type
     int last_bits; ///< temp var used for calculating the above vars
     
+    /* temp variables for picture complexity calculation */
+    int mc_mb_var_sum_temp;
+    int mb_var_sum_temp;
+
     /* error concealment / resync */
     int error_count;
     uint8_t *error_status_table;       ///< table of the error status of each MB  
@@ -565,9 +578,6 @@
     int intra_dc_threshold;          ///< QP above whch the ac VLC should be used for intra dc 
     PutBitContext tex_pb;            ///< used for data partitioned VOPs 
     PutBitContext pb2;               ///< used for data partitioned VOPs 
-#define PB_BUFFER_SIZE 1024*256
-    uint8_t *tex_pb_buffer;          
-    uint8_t *pb2_buffer;
     int mpeg_quant;
     int t_frame;                       ///< time distance of first I -> B, used for interlaced b frames 
     int padding_bug_score;             ///< used to detect the VERY common padding bug in MPEG4 
@@ -908,6 +918,7 @@
                      DCTELEM block[6][64]);
 void mjpeg_picture_header(MpegEncContext *s);
 void mjpeg_picture_trailer(MpegEncContext *s);
+void ff_mjpeg_stuffing(PutBitContext * pbc);
 
 
 /* rate control */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pthread.c	Fri Feb 13 17:54:10 2004 +0000
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+#include <semaphore.h>
+#include <pthread.h>
+
+//#define DEBUG
+
+#include "avcodec.h"
+#include "common.h"
+
+
+/**
+ * per-worker state, one entry for each thread created by avcodec_pthread_init().
+ */
+typedef struct ThreadContext{
+    AVCodecContext *avctx;
+    pthread_t thread;
+    sem_t work_sem;                             ///< posted by the caller when func/arg are ready
+    sem_t done_sem;                             ///< posted by the worker after func returned
+    int (*func)(AVCodecContext *c, void *arg);  ///< job to run, NULL asks the worker to exit
+    void *arg;
+    int ret;                                    ///< return value of the last func call
+}ThreadContext;
+
+/**
+ * worker main loop: wait for work_sem, run the job, post done_sem.
+ * the loop ends when the worker is woken up with func == NULL.
+ */
+static void * thread_func(void *v){
+    ThreadContext *c= v;
+
+    for(;;){
+//printf("thread_func %X enter wait\n", (int)v); fflush(stdout);
+        sem_wait(&c->work_sem);
+//printf("thread_func %X after wait (func=%X)\n", (int)v, (int)c->func); fflush(stdout);
+        if(c->func)
+            c->ret= c->func(c->avctx, c->arg);
+        else
+            return NULL;
+//printf("thread_func %X signal complete\n", (int)v); fflush(stdout);
+        sem_post(&c->done_sem);
+    }
+    
+    return NULL;
+}
+
+/**
+ * stop and join the first count workers and destroy their semaphores.
+ * every one of these entries must have been completely set up (both semaphores
+ * initialized, thread created) and must be idle.
+ */
+static void stop_workers(ThreadContext *c, int count){
+    int i;
+
+    for(i=0; i<count; i++){
+        int val;
+        
+        sem_getvalue(&c[i].work_sem, &val); assert(val == 0);
+        sem_getvalue(&c[i].done_sem, &val); assert(val == 0);
+
+        c[i].func= NULL;
+        sem_post(&c[i].work_sem);
+        pthread_join(c[i].thread, NULL);
+        sem_destroy(&c[i].work_sem);
+        sem_destroy(&c[i].done_sem);
+    }
+}
+
+/**
+ * free what has been allocated by avcodec_pthread_init().
+ * must be called after decoding has finished, especially don't call while avcodec_pthread_execute() is running
+ * does nothing if there is no thread state (never initialized, or initialization failed).
+ */
+void avcodec_pthread_free(AVCodecContext *s){
+    ThreadContext *c= s->thread_opaque;
+
+    if(!c)
+        return;
+
+    stop_workers(c, s->thread_count);
+    av_freep(&s->thread_opaque);
+}
+
+/**
+ * run func(s, arg[i]) for i < count, each on its own worker thread, and return after all have finished.
+ * @param ret if non NULL, ret[i] receives the return value of the i-th call
+ * @param count number of jobs, must not exceed s->thread_count
+ * @return always 0
+ */
+int avcodec_pthread_execute(AVCodecContext *s, int (*func)(AVCodecContext *c2, void *arg2),void **arg, int *ret, int count){
+    ThreadContext *c= s->thread_opaque;
+    int i, val;
+    
+    assert(s == c->avctx);
+    assert(count <= s->thread_count);
+    
+    /* note, we can be certain that this is not called with the same AVCodecContext by different threads at the same time */
+
+    for(i=0; i<count; i++){
+        sem_getvalue(&c[i].work_sem, &val); assert(val == 0);
+        sem_getvalue(&c[i].done_sem, &val); assert(val == 0);
+        
+        c[i].arg= arg[i];
+        c[i].func= func;
+        c[i].ret= 12345;
+        sem_post(&c[i].work_sem);
+    }
+    for(i=0; i<count; i++){
+        sem_wait(&c[i].done_sem);
+
+        sem_getvalue(&c[i].work_sem, &val); assert(val == 0);
+        sem_getvalue(&c[i].done_sem, &val); assert(val == 0);
+        
+        c[i].func= NULL;
+        if(ret) ret[i]= c[i].ret;
+    }
+    return 0;
+}
+
+/**
+ * create thread_count worker threads and install avcodec_pthread_execute() as s->execute.
+ * @return 0 on success, -1 on failure; on failure no worker is left running, s->thread_opaque is NULL and s->execute is left untouched
+ */
+int avcodec_pthread_init(AVCodecContext *s, int thread_count){
+    int i;
+    ThreadContext *c;
+
+    s->thread_count= thread_count;
+
+    assert(!s->thread_opaque);
+    c= av_mallocz(sizeof(ThreadContext)*thread_count);
+    if(!c)
+        return -1;
+    s->thread_opaque= c;
+    
+    for(i=0; i<thread_count; i++){
+//printf("init semaphors %d\n", i); fflush(stdout);
+        c[i].avctx= s;
+        if(sem_init(&c[i].work_sem, 0, 0))
+            goto fail;
+        if(sem_init(&c[i].done_sem, 0, 0)){
+            sem_destroy(&c[i].work_sem);
+            goto fail;
+        }
+//printf("create thread %d\n", i); fflush(stdout);
+        if(pthread_create(&c[i].thread, NULL, thread_func, &c[i])){
+            sem_destroy(&c[i].work_sem);
+            sem_destroy(&c[i].done_sem);
+            goto fail;
+        }
+    }
+//printf("init done\n"); fflush(stdout);
+    
+    s->execute= avcodec_pthread_execute;
+
+    return 0;
+fail:
+    /* entries 0..i-1 are complete, entry i has already been cleaned up above,
+       entries after i were never touched, so only the first i workers are stopped */
+    stop_workers(c, i);
+    av_freep(&s->thread_opaque);
+    return -1;
+}
--- a/utils.c	Thu Feb 12 16:21:21 2004 +0000
+++ b/utils.c	Fri Feb 13 17:54:10 2004 +0000
@@ -324,6 +324,16 @@
     return 0;
 }
 
+int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2),void **arg, int *ret, int count){
+    int job; //serial fallback: the jobs run one after another in the calling thread
+
+    for(job=0; job<count; job++){
+        const int result= func(c, arg[job]);
+        if(ret) ret[job]= result; //ret may be NULL when the caller does not need the return values
+    }
+    return 0; //always 0, the per job results are only reported through ret
+}
+
 enum PixelFormat avcodec_default_get_format(struct AVCodecContext *s, enum PixelFormat * fmt){
     return fmt[0];
 }
@@ -352,6 +362,8 @@
     s->get_buffer= avcodec_default_get_buffer;
     s->release_buffer= avcodec_default_release_buffer;
     s->get_format= avcodec_default_get_format;
+    s->execute= avcodec_default_execute;
+    s->thread_count=1;
     s->me_subpel_quality=8;
     s->lmin= FF_QP2LAMBDA * s->qmin;
     s->lmax= FF_QP2LAMBDA * s->qmax;