changeset 10866:d26e9b4d2ca1 libavcodec

Split cavlc out of h264.c. Seems to speed the code up a little... The placement of many generic functions between h264.c and h264.h is still open Currently they are a little randomly placed between them.
author michael
date Wed, 13 Jan 2010 01:59:19 +0000
parents bcdc5343a577
children bf309c7ce615
files Makefile h264.c h264.h h264_cavlc.c h264data.h
diffstat 5 files changed, 1654 insertions(+), 1605 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Tue Jan 12 23:02:07 2010 +0000
+++ b/Makefile	Wed Jan 13 01:59:19 2010 +0000
@@ -132,7 +132,7 @@
                                           ratecontrol.o h263.o ituh263enc.o flvenc.o mpeg12data.o \
                                           mpegvideo.o error_resilience.o
 OBJS-$(CONFIG_H264_DECODER)            += h264.o h264idct.o h264pred.o h264_loopfilter.o h264_direct.o cabac.o \
-                                          h264_sei.o h264_ps.o h264_refs.o\
+                                          h264_sei.o h264_ps.o h264_refs.o h264_cavlc.o\
                                           mpegvideo.o error_resilience.o
 OBJS-$(CONFIG_H264_ENCODER)            += h264enc.o h264dspenc.o
 OBJS-$(CONFIG_H264_VAAPI_HWACCEL)      += vaapi_h264.o
@@ -283,7 +283,7 @@
                                           motion_est.o h263.o \
                                           mpegvideo.o error_resilience.o
 OBJS-$(CONFIG_SVQ3_DECODER)            += h264.o svq3.o h264idct.o h264pred.o h264_loopfilter.o h264_direct.o  \
-                                          h264_sei.o h264_ps.o h264_refs.o\
+                                          h264_sei.o h264_ps.o h264_refs.o h264_cavlc.o\
                                           cabac.o          \
                                           mpegvideo.o error_resilience.o \
                                           svq1dec.o svq1.o h263.o
--- a/h264.c	Tue Jan 12 23:02:07 2010 +0000
+++ b/h264.c	Wed Jan 13 01:59:19 2010 +0000
@@ -46,30 +46,6 @@
 //#undef NDEBUG
 #include <assert.h>
 
-static VLC coeff_token_vlc[4];
-static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
-static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
-
-static VLC chroma_dc_coeff_token_vlc;
-static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
-static const int chroma_dc_coeff_token_vlc_table_size = 256;
-
-static VLC total_zeros_vlc[15];
-static VLC_TYPE total_zeros_vlc_tables[15][512][2];
-static const int total_zeros_vlc_tables_size = 512;
-
-static VLC chroma_dc_total_zeros_vlc[3];
-static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
-static const int chroma_dc_total_zeros_vlc_tables_size = 8;
-
-static VLC run_vlc[6];
-static VLC_TYPE run_vlc_tables[6][8][2];
-static const int run_vlc_tables_size = 8;
-
-static VLC run7_vlc;
-static VLC_TYPE run7_vlc_table[96][2];
-static const int run7_vlc_table_size = 96;
-
 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 
@@ -81,455 +57,6 @@
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
 };
 
-static const uint8_t left_block_options[4][8]={
-    {0,1,2,3,7,10,8,11},
-    {2,2,3,3,8,11,8,11},
-    {0,0,1,1,7,10,7,10},
-    {0,2,0,2,7,10,7,10}
-};
-
-#define LEVEL_TAB_BITS 8
-static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
-
-static void fill_caches(H264Context *h, int mb_type, int for_deblock){
-    MpegEncContext * const s = &h->s;
-    const int mb_xy= h->mb_xy;
-    int topleft_xy, top_xy, topright_xy, left_xy[2];
-    int topleft_type, top_type, topright_type, left_type[2];
-    const uint8_t * left_block;
-    int topleft_partition= -1;
-    int i;
-
-    top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
-
-    //FIXME deblocking could skip the intra and nnz parts.
-    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
-        return;
-
-    /* Wow, what a mess, why didn't they simplify the interlacing & intra
-     * stuff, I can't imagine that these complex rules are worth it. */
-
-    topleft_xy = top_xy - 1;
-    topright_xy= top_xy + 1;
-    left_xy[1] = left_xy[0] = mb_xy-1;
-    left_block = left_block_options[0];
-    if(FRAME_MBAFF){
-        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
-        const int top_pair_xy      = pair_xy     - s->mb_stride;
-        const int topleft_pair_xy  = top_pair_xy - 1;
-        const int topright_pair_xy = top_pair_xy + 1;
-        const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
-        const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
-        const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
-        const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
-        const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
-        const int bottom = (s->mb_y & 1);
-        tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
-
-        if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
-            top_xy -= s->mb_stride;
-        }
-        if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
-            topleft_xy -= s->mb_stride;
-        } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
-            topleft_xy += s->mb_stride;
-            // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
-            topleft_partition = 0;
-        }
-        if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
-            topright_xy -= s->mb_stride;
-        }
-        if (left_mb_field_flag != curr_mb_field_flag) {
-            left_xy[1] = left_xy[0] = pair_xy - 1;
-            if (curr_mb_field_flag) {
-                left_xy[1] += s->mb_stride;
-                left_block = left_block_options[3];
-            } else {
-                left_block= left_block_options[2 - bottom];
-            }
-        }
-    }
-
-    h->top_mb_xy = top_xy;
-    h->left_mb_xy[0] = left_xy[0];
-    h->left_mb_xy[1] = left_xy[1];
-    if(for_deblock){
-        topleft_type = 0;
-        topright_type = 0;
-        top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
-        left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
-        left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
-
-        if(MB_MBAFF && !IS_INTRA(mb_type)){
-            int list;
-            for(list=0; list<h->list_count; list++){
-                //These values where changed for ease of performing MC, we need to change them back
-                //FIXME maybe we can make MC and loop filter use the same values or prevent
-                //the MC code from changing ref_cache and rather use a temporary array.
-                if(USES_LIST(mb_type,list)){
-                    int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
-                    *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
-                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
-                    ref += h->b8_stride;
-                    *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
-                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
-                }
-            }
-        }
-    }else{
-        topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
-        top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
-        topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
-        left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
-        left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
-
-    if(IS_INTRA(mb_type)){
-        int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
-        h->topleft_samples_available=
-        h->top_samples_available=
-        h->left_samples_available= 0xFFFF;
-        h->topright_samples_available= 0xEEEA;
-
-        if(!(top_type & type_mask)){
-            h->topleft_samples_available= 0xB3FF;
-            h->top_samples_available= 0x33FF;
-            h->topright_samples_available= 0x26EA;
-        }
-        if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
-            if(IS_INTERLACED(mb_type)){
-                if(!(left_type[0] & type_mask)){
-                    h->topleft_samples_available&= 0xDFFF;
-                    h->left_samples_available&= 0x5FFF;
-                }
-                if(!(left_type[1] & type_mask)){
-                    h->topleft_samples_available&= 0xFF5F;
-                    h->left_samples_available&= 0xFF5F;
-                }
-            }else{
-                int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
-                                ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
-                assert(left_xy[0] == left_xy[1]);
-                if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
-                    h->topleft_samples_available&= 0xDF5F;
-                    h->left_samples_available&= 0x5F5F;
-                }
-            }
-        }else{
-            if(!(left_type[0] & type_mask)){
-                h->topleft_samples_available&= 0xDF5F;
-                h->left_samples_available&= 0x5F5F;
-            }
-        }
-
-        if(!(topleft_type & type_mask))
-            h->topleft_samples_available&= 0x7FFF;
-
-        if(!(topright_type & type_mask))
-            h->topright_samples_available&= 0xFBFF;
-
-        if(IS_INTRA4x4(mb_type)){
-            if(IS_INTRA4x4(top_type)){
-                h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
-                h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
-                h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
-                h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
-            }else{
-                int pred;
-                if(!(top_type & type_mask))
-                    pred= -1;
-                else{
-                    pred= 2;
-                }
-                h->intra4x4_pred_mode_cache[4+8*0]=
-                h->intra4x4_pred_mode_cache[5+8*0]=
-                h->intra4x4_pred_mode_cache[6+8*0]=
-                h->intra4x4_pred_mode_cache[7+8*0]= pred;
-            }
-            for(i=0; i<2; i++){
-                if(IS_INTRA4x4(left_type[i])){
-                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
-                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
-                }else{
-                    int pred;
-                    if(!(left_type[i] & type_mask))
-                        pred= -1;
-                    else{
-                        pred= 2;
-                    }
-                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
-                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
-                }
-            }
-        }
-    }
-    }
-
-
-/*
-0 . T T. T T T T
-1 L . .L . . . .
-2 L . .L . . . .
-3 . T TL . . . .
-4 L . .L . . . .
-5 L . .. . . . .
-*/
-//FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
-    if(top_type){
-        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
-        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
-        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
-        h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
-
-        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
-        h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
-
-        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
-        h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
-
-    }else{
-        h->non_zero_count_cache[4+8*0]=
-        h->non_zero_count_cache[5+8*0]=
-        h->non_zero_count_cache[6+8*0]=
-        h->non_zero_count_cache[7+8*0]=
-
-        h->non_zero_count_cache[1+8*0]=
-        h->non_zero_count_cache[2+8*0]=
-
-        h->non_zero_count_cache[1+8*3]=
-        h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
-
-    }
-
-    for (i=0; i<2; i++) {
-        if(left_type[i]){
-            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
-            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
-            h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
-            h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
-        }else{
-            h->non_zero_count_cache[3+8*1 + 2*8*i]=
-            h->non_zero_count_cache[3+8*2 + 2*8*i]=
-            h->non_zero_count_cache[0+8*1 +   8*i]=
-            h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
-        }
-    }
-
-    if( h->pps.cabac ) {
-        // top_cbp
-        if(top_type) {
-            h->top_cbp = h->cbp_table[top_xy];
-        } else if(IS_INTRA(mb_type)) {
-            h->top_cbp = 0x1C0;
-        } else {
-            h->top_cbp = 0;
-        }
-        // left_cbp
-        if (left_type[0]) {
-            h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
-        } else if(IS_INTRA(mb_type)) {
-            h->left_cbp = 0x1C0;
-        } else {
-            h->left_cbp = 0;
-        }
-        if (left_type[0]) {
-            h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
-        }
-        if (left_type[1]) {
-            h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
-        }
-    }
-
-#if 1
-    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
-        int list;
-        for(list=0; list<h->list_count; list++){
-            if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
-                /*if(!h->mv_cache_clean[list]){
-                    memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
-                    memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
-                    h->mv_cache_clean[list]= 1;
-                }*/
-                continue;
-            }
-            h->mv_cache_clean[list]= 0;
-
-            if(USES_LIST(top_type, list)){
-                const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
-                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
-                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
-                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
-                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
-                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
-                h->ref_cache[list][scan8[0] + 0 - 1*8]=
-                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
-                h->ref_cache[list][scan8[0] + 2 - 1*8]=
-                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
-            }else{
-                *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
-                *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
-                *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
-                *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
-                *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
-            }
-
-            for(i=0; i<2; i++){
-                int cache_idx = scan8[0] - 1 + i*2*8;
-                if(USES_LIST(left_type[i], list)){
-                    const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
-                    const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
-                    *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
-                    *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
-                    h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
-                    h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
-                }else{
-                    *(uint32_t*)h->mv_cache [list][cache_idx  ]=
-                    *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
-                    h->ref_cache[list][cache_idx  ]=
-                    h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-                }
-            }
-
-            if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
-                continue;
-
-            if(USES_LIST(topleft_type, list)){
-                const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
-                const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
-                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
-                h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
-            }else{
-                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
-                h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-            }
-
-            if(USES_LIST(topright_type, list)){
-                const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
-                const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
-                *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
-                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
-            }else{
-                *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
-                h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-            }
-
-            if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
-                continue;
-
-            h->ref_cache[list][scan8[5 ]+1] =
-            h->ref_cache[list][scan8[7 ]+1] =
-            h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
-            h->ref_cache[list][scan8[4 ]] =
-            h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
-            *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
-            *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
-            *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
-            *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
-            *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
-
-            if( h->pps.cabac ) {
-                /* XXX beurk, Load mvd */
-                if(USES_LIST(top_type, list)){
-                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
-                }else{
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
-                }
-                if(USES_LIST(left_type[0], list)){
-                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
-                }else{
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
-                }
-                if(USES_LIST(left_type[1], list)){
-                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
-                }else{
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
-                }
-                *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
-                *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
-                *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
-                *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
-                *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
-
-                if(h->slice_type_nos == FF_B_TYPE){
-                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
-
-                    if(IS_DIRECT(top_type)){
-                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
-                    }else if(IS_8X8(top_type)){
-                        int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
-                        h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
-                        h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
-                    }else{
-                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
-                    }
-
-                    if(IS_DIRECT(left_type[0]))
-                        h->direct_cache[scan8[0] - 1 + 0*8]= 1;
-                    else if(IS_8X8(left_type[0]))
-                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
-                    else
-                        h->direct_cache[scan8[0] - 1 + 0*8]= 0;
-
-                    if(IS_DIRECT(left_type[1]))
-                        h->direct_cache[scan8[0] - 1 + 2*8]= 1;
-                    else if(IS_8X8(left_type[1]))
-                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
-                    else
-                        h->direct_cache[scan8[0] - 1 + 2*8]= 0;
-                }
-            }
-
-            if(FRAME_MBAFF){
-#define MAP_MVS\
-                    MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
-                    MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
-                    MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
-                    MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
-                    MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
-                    MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
-                    MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
-                    MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
-                    MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
-                    MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
-                if(MB_FIELD){
-#define MAP_F2F(idx, mb_type)\
-                    if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
-                        h->ref_cache[list][idx] <<= 1;\
-                        h->mv_cache[list][idx][1] /= 2;\
-                        h->mvd_cache[list][idx][1] /= 2;\
-                    }
-                    MAP_MVS
-#undef MAP_F2F
-                }else{
-#define MAP_F2F(idx, mb_type)\
-                    if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
-                        h->ref_cache[list][idx] >>= 1;\
-                        h->mv_cache[list][idx][1] <<= 1;\
-                        h->mvd_cache[list][idx][1] <<= 1;\
-                    }
-                    MAP_MVS
-#undef MAP_F2F
-                }
-            }
-        }
-    }
-#endif
-
-    h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
-}
-
 void ff_h264_write_back_intra_pred_mode(H264Context *h){
     const int mb_xy= h->mb_xy;
 
@@ -616,105 +143,6 @@
     return mode;
 }
 
-/**
- * gets the predicted intra4x4 prediction mode.
- */
-static inline int pred_intra_mode(H264Context *h, int n){
-    const int index8= scan8[n];
-    const int left= h->intra4x4_pred_mode_cache[index8 - 1];
-    const int top = h->intra4x4_pred_mode_cache[index8 - 8];
-    const int min= FFMIN(left, top);
-
-    tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
-
-    if(min<0) return DC_PRED;
-    else      return min;
-}
-
-static inline void write_back_non_zero_count(H264Context *h){
-    const int mb_xy= h->mb_xy;
-
-    h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
-    h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
-    h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
-    h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
-    h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
-    h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
-    h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
-
-    h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
-    h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
-    h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
-
-    h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
-    h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
-    h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
-}
-
-/**
- * gets the predicted number of non-zero coefficients.
- * @param n block index
- */
-static inline int pred_non_zero_count(H264Context *h, int n){
-    const int index8= scan8[n];
-    const int left= h->non_zero_count_cache[index8 - 1];
-    const int top = h->non_zero_count_cache[index8 - 8];
-    int i= left + top;
-
-    if(i<64) i= (i+1)>>1;
-
-    tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
-
-    return i&31;
-}
-
-static inline void write_back_motion(H264Context *h, int mb_type){
-    MpegEncContext * const s = &h->s;
-    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
-    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
-    int list;
-
-    if(!USES_LIST(mb_type, 0))
-        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
-
-    for(list=0; list<h->list_count; list++){
-        int y;
-        if(!USES_LIST(mb_type, list))
-            continue;
-
-        for(y=0; y<4; y++){
-            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
-            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
-        }
-        if( h->pps.cabac ) {
-            if(IS_SKIP(mb_type))
-                fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
-            else
-            for(y=0; y<4; y++){
-                *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
-                *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
-            }
-        }
-
-        {
-            int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
-            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
-            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
-            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
-            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
-        }
-    }
-
-    if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
-        if(IS_8X8(mb_type)){
-            uint8_t *direct_table = &h->direct_table[b8_xy];
-            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
-            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
-            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
-        }
-    }
-}
-
 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
     int i, si, di;
     uint8_t *dst;
@@ -1225,101 +653,6 @@
     prefetch_motion(h, 1);
 }
 
-static av_cold void init_cavlc_level_tab(void){
-    int suffix_length, mask;
-    unsigned int i;
-
-    for(suffix_length=0; suffix_length<7; suffix_length++){
-        for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
-            int prefix= LEVEL_TAB_BITS - av_log2(2*i);
-            int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
-
-            mask= -(level_code&1);
-            level_code= (((2+level_code)>>1) ^ mask) - mask;
-            if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
-                cavlc_level_tab[suffix_length][i][0]= level_code;
-                cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
-            }else if(prefix + 1 <= LEVEL_TAB_BITS){
-                cavlc_level_tab[suffix_length][i][0]= prefix+100;
-                cavlc_level_tab[suffix_length][i][1]= prefix + 1;
-            }else{
-                cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
-                cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
-            }
-        }
-    }
-}
-
-static av_cold void decode_init_vlc(void){
-    static int done = 0;
-
-    if (!done) {
-        int i;
-        int offset;
-        done = 1;
-
-        chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
-        chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
-        init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
-                 &chroma_dc_coeff_token_len [0], 1, 1,
-                 &chroma_dc_coeff_token_bits[0], 1, 1,
-                 INIT_VLC_USE_NEW_STATIC);
-
-        offset = 0;
-        for(i=0; i<4; i++){
-            coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
-            coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
-            init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
-                     &coeff_token_len [i][0], 1, 1,
-                     &coeff_token_bits[i][0], 1, 1,
-                     INIT_VLC_USE_NEW_STATIC);
-            offset += coeff_token_vlc_tables_size[i];
-        }
-        /*
-         * This is a one time safety check to make sure that
-         * the packed static coeff_token_vlc table sizes
-         * were initialized correctly.
-         */
-        assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
-
-        for(i=0; i<3; i++){
-            chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
-            chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
-            init_vlc(&chroma_dc_total_zeros_vlc[i],
-                     CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
-                     &chroma_dc_total_zeros_len [i][0], 1, 1,
-                     &chroma_dc_total_zeros_bits[i][0], 1, 1,
-                     INIT_VLC_USE_NEW_STATIC);
-        }
-        for(i=0; i<15; i++){
-            total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
-            total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
-            init_vlc(&total_zeros_vlc[i],
-                     TOTAL_ZEROS_VLC_BITS, 16,
-                     &total_zeros_len [i][0], 1, 1,
-                     &total_zeros_bits[i][0], 1, 1,
-                     INIT_VLC_USE_NEW_STATIC);
-        }
-
-        for(i=0; i<6; i++){
-            run_vlc[i].table = run_vlc_tables[i];
-            run_vlc[i].table_allocated = run_vlc_tables_size;
-            init_vlc(&run_vlc[i],
-                     RUN_VLC_BITS, 7,
-                     &run_len [i][0], 1, 1,
-                     &run_bits[i][0], 1, 1,
-                     INIT_VLC_USE_NEW_STATIC);
-        }
-        run7_vlc.table = run7_vlc_table,
-        run7_vlc.table_allocated = run7_vlc_table_size;
-        init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
-                 &run_len [6][0], 1, 1,
-                 &run_bits[6][0], 1, 1,
-                 INIT_VLC_USE_NEW_STATIC);
-
-        init_cavlc_level_tab();
-    }
-}
 
 static void free_tables(H264Context *h){
     int i;
@@ -1530,7 +863,7 @@
 
     avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
 
-    decode_init_vlc();
+    ff_h264_decode_init_vlc();
 
     if(avctx->extradata_size > 0 && avctx->extradata &&
        *(char *)avctx->extradata == 1){
@@ -2840,772 +2173,6 @@
     }
 }
 
-/**
- *
- */
-static inline int get_level_prefix(GetBitContext *gb){
-    unsigned int buf;
-    int log;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf=GET_CACHE(re, gb);
-
-    log= 32 - av_log2(buf);
-#ifdef TRACE
-    print_bin(buf>>(32-log), log);
-    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
-#endif
-
-    LAST_SKIP_BITS(re, gb, log);
-    CLOSE_READER(re, gb);
-
-    return log-1;
-}
-
-static inline int get_dct8x8_allowed(H264Context *h){
-    if(h->sps.direct_8x8_inference_flag)
-        return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
-    else
-        return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
-}
-
-/**
- * decodes a residual block.
- * @param n block index
- * @param scantable scantable
- * @param max_coeff number of coefficients in the block
- * @return <0 if an error occurred
- */
-static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
-    MpegEncContext * const s = &h->s;
-    static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
-    int level[16];
-    int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
-
-    //FIXME put trailing_onex into the context
-
-    if(n == CHROMA_DC_BLOCK_INDEX){
-        coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
-        total_coeff= coeff_token>>2;
-    }else{
-        if(n == LUMA_DC_BLOCK_INDEX){
-            total_coeff= pred_non_zero_count(h, 0);
-            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
-            total_coeff= coeff_token>>2;
-        }else{
-            total_coeff= pred_non_zero_count(h, n);
-            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
-            total_coeff= coeff_token>>2;
-            h->non_zero_count_cache[ scan8[n] ]= total_coeff;
-        }
-    }
-
-    //FIXME set last_non_zero?
-
-    if(total_coeff==0)
-        return 0;
-    if(total_coeff > (unsigned)max_coeff) {
-        av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
-        return -1;
-    }
-
-    trailing_ones= coeff_token&3;
-    tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
-    assert(total_coeff<=16);
-
-    i = show_bits(gb, 3);
-    skip_bits(gb, trailing_ones);
-    level[0] = 1-((i&4)>>1);
-    level[1] = 1-((i&2)   );
-    level[2] = 1-((i&1)<<1);
-
-    if(trailing_ones<total_coeff) {
-        int mask, prefix;
-        int suffix_length = total_coeff > 10 && trailing_ones < 3;
-        int bitsi= show_bits(gb, LEVEL_TAB_BITS);
-        int level_code= cavlc_level_tab[suffix_length][bitsi][0];
-
-        skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
-        if(level_code >= 100){
-            prefix= level_code - 100;
-            if(prefix == LEVEL_TAB_BITS)
-                prefix += get_level_prefix(gb);
-
-            //first coefficient has suffix_length equal to 0 or 1
-            if(prefix<14){ //FIXME try to build a large unified VLC table for all this
-                if(suffix_length)
-                    level_code= (prefix<<1) + get_bits1(gb); //part
-                else
-                    level_code= prefix; //part
-            }else if(prefix==14){
-                if(suffix_length)
-                    level_code= (prefix<<1) + get_bits1(gb); //part
-                else
-                    level_code= prefix + get_bits(gb, 4); //part
-            }else{
-                level_code= 30 + get_bits(gb, prefix-3); //part
-                if(prefix>=16)
-                    level_code += (1<<(prefix-3))-4096;
-            }
-
-            if(trailing_ones < 3) level_code += 2;
-
-            suffix_length = 2;
-            mask= -(level_code&1);
-            level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
-        }else{
-            if(trailing_ones < 3) level_code += (level_code>>31)|1;
-
-            suffix_length = 1;
-            if(level_code + 3U > 6U)
-                suffix_length++;
-            level[trailing_ones]= level_code;
-        }
-
-        //remaining coefficients have suffix_length > 0
-        for(i=trailing_ones+1;i<total_coeff;i++) {
-            static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
-            int bitsi= show_bits(gb, LEVEL_TAB_BITS);
-            level_code= cavlc_level_tab[suffix_length][bitsi][0];
-
-            skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
-            if(level_code >= 100){
-                prefix= level_code - 100;
-                if(prefix == LEVEL_TAB_BITS){
-                    prefix += get_level_prefix(gb);
-                }
-                if(prefix<15){
-                    level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
-                }else{
-                    level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
-                    if(prefix>=16)
-                        level_code += (1<<(prefix-3))-4096;
-                }
-                mask= -(level_code&1);
-                level_code= (((2+level_code)>>1) ^ mask) - mask;
-            }
-            level[i]= level_code;
-
-            if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
-                suffix_length++;
-        }
-    }
-
-    if(total_coeff == max_coeff)
-        zeros_left=0;
-    else{
-        if(n == CHROMA_DC_BLOCK_INDEX)
-            zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
-        else
-            zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
-    }
-
-    coeff_num = zeros_left + total_coeff - 1;
-    j = scantable[coeff_num];
-    if(n > 24){
-        block[j] = level[0];
-        for(i=1;i<total_coeff;i++) {
-            if(zeros_left <= 0)
-                run_before = 0;
-            else if(zeros_left < 7){
-                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
-            }else{
-                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
-            }
-            zeros_left -= run_before;
-            coeff_num -= 1 + run_before;
-            j= scantable[ coeff_num ];
-
-            block[j]= level[i];
-        }
-    }else{
-        block[j] = (level[0] * qmul[j] + 32)>>6;
-        for(i=1;i<total_coeff;i++) {
-            if(zeros_left <= 0)
-                run_before = 0;
-            else if(zeros_left < 7){
-                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
-            }else{
-                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
-            }
-            zeros_left -= run_before;
-            coeff_num -= 1 + run_before;
-            j= scantable[ coeff_num ];
-
-            block[j]= (level[i] * qmul[j] + 32)>>6;
-        }
-    }
-
-    if(zeros_left<0){
-        av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
-        return -1;
-    }
-
-    return 0;
-}
-
-static void predict_field_decoding_flag(H264Context *h){
-    MpegEncContext * const s = &h->s;
-    const int mb_xy= h->mb_xy;
-    int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
-                ? s->current_picture.mb_type[mb_xy-1]
-                : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
-                ? s->current_picture.mb_type[mb_xy-s->mb_stride]
-                : 0;
-    h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
-}
-
-/**
- * decodes a P_SKIP or B_SKIP macroblock
- */
-static void decode_mb_skip(H264Context *h){
-    MpegEncContext * const s = &h->s;
-    const int mb_xy= h->mb_xy;
-    int mb_type=0;
-
-    memset(h->non_zero_count[mb_xy], 0, 16);
-    memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
-
-    if(MB_FIELD)
-        mb_type|= MB_TYPE_INTERLACED;
-
-    if( h->slice_type_nos == FF_B_TYPE )
-    {
-        // just for fill_caches. pred_direct_motion will set the real mb_type
-        mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
-
-        fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
-        ff_h264_pred_direct_motion(h, &mb_type);
-        mb_type|= MB_TYPE_SKIP;
-    }
-    else
-    {
-        int mx, my;
-        mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
-
-        fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
-        pred_pskip_motion(h, &mx, &my);
-        fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
-        fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
-    }
-
-    write_back_motion(h, mb_type);
-    s->current_picture.mb_type[mb_xy]= mb_type;
-    s->current_picture.qscale_table[mb_xy]= s->qscale;
-    h->slice_table[ mb_xy ]= h->slice_num;
-    h->prev_mb_skipped= 1;
-}
-
-/**
- * decodes a macroblock
- * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
- */
-static int decode_mb_cavlc(H264Context *h){
-    MpegEncContext * const s = &h->s;
-    int mb_xy;
-    int partition_count;
-    unsigned int mb_type, cbp;
-    int dct8x8_allowed= h->pps.transform_8x8_mode;
-
-    mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
-
-    tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
-    cbp = 0; /* avoid warning. FIXME: find a solution without slowing
-                down the code */
-    if(h->slice_type_nos != FF_I_TYPE){
-        if(s->mb_skip_run==-1)
-            s->mb_skip_run= get_ue_golomb(&s->gb);
-
-        if (s->mb_skip_run--) {
-            if(FRAME_MBAFF && (s->mb_y&1) == 0){
-                if(s->mb_skip_run==0)
-                    h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
-                else
-                    predict_field_decoding_flag(h);
-            }
-            decode_mb_skip(h);
-            return 0;
-        }
-    }
-    if(FRAME_MBAFF){
-        if( (s->mb_y&1) == 0 )
-            h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
-    }
-
-    h->prev_mb_skipped= 0;
-
-    mb_type= get_ue_golomb(&s->gb);
-    if(h->slice_type_nos == FF_B_TYPE){
-        if(mb_type < 23){
-            partition_count= b_mb_type_info[mb_type].partition_count;
-            mb_type=         b_mb_type_info[mb_type].type;
-        }else{
-            mb_type -= 23;
-            goto decode_intra_mb;
-        }
-    }else if(h->slice_type_nos == FF_P_TYPE){
-        if(mb_type < 5){
-            partition_count= p_mb_type_info[mb_type].partition_count;
-            mb_type=         p_mb_type_info[mb_type].type;
-        }else{
-            mb_type -= 5;
-            goto decode_intra_mb;
-        }
-    }else{
-       assert(h->slice_type_nos == FF_I_TYPE);
-        if(h->slice_type == FF_SI_TYPE && mb_type)
-            mb_type--;
-decode_intra_mb:
-        if(mb_type > 25){
-            av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
-            return -1;
-        }
-        partition_count=0;
-        cbp= i_mb_type_info[mb_type].cbp;
-        h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
-        mb_type= i_mb_type_info[mb_type].type;
-    }
-
-    if(MB_FIELD)
-        mb_type |= MB_TYPE_INTERLACED;
-
-    h->slice_table[ mb_xy ]= h->slice_num;
-
-    if(IS_INTRA_PCM(mb_type)){
-        unsigned int x;
-
-        // We assume these blocks are very rare so we do not optimize it.
-        align_get_bits(&s->gb);
-
-        // The pixels are stored in the same order as levels in h->mb array.
-        for(x=0; x < (CHROMA ? 384 : 256); x++){
-            ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
-        }
-
-        // In deblocking, the quantizer is 0
-        s->current_picture.qscale_table[mb_xy]= 0;
-        // All coeffs are present
-        memset(h->non_zero_count[mb_xy], 16, 16);
-
-        s->current_picture.mb_type[mb_xy]= mb_type;
-        return 0;
-    }
-
-    if(MB_MBAFF){
-        h->ref_count[0] <<= 1;
-        h->ref_count[1] <<= 1;
-    }
-
-    fill_caches(h, mb_type, 0);
-
-    //mb_pred
-    if(IS_INTRA(mb_type)){
-        int pred_mode;
-//            init_top_left_availability(h);
-        if(IS_INTRA4x4(mb_type)){
-            int i;
-            int di = 1;
-            if(dct8x8_allowed && get_bits1(&s->gb)){
-                mb_type |= MB_TYPE_8x8DCT;
-                di = 4;
-            }
-
-//                fill_intra4x4_pred_table(h);
-            for(i=0; i<16; i+=di){
-                int mode= pred_intra_mode(h, i);
-
-                if(!get_bits1(&s->gb)){
-                    const int rem_mode= get_bits(&s->gb, 3);
-                    mode = rem_mode + (rem_mode >= mode);
-                }
-
-                if(di==4)
-                    fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
-                else
-                    h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
-            }
-            ff_h264_write_back_intra_pred_mode(h);
-            if( ff_h264_check_intra4x4_pred_mode(h) < 0)
-                return -1;
-        }else{
-            h->intra16x16_pred_mode= ff_h264_check_intra_pred_mode(h, h->intra16x16_pred_mode);
-            if(h->intra16x16_pred_mode < 0)
-                return -1;
-        }
-        if(CHROMA){
-            pred_mode= ff_h264_check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
-            if(pred_mode < 0)
-                return -1;
-            h->chroma_pred_mode= pred_mode;
-        }
-    }else if(partition_count==4){
-        int i, j, sub_partition_count[4], list, ref[2][4];
-
-        if(h->slice_type_nos == FF_B_TYPE){
-            for(i=0; i<4; i++){
-                h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
-                if(h->sub_mb_type[i] >=13){
-                    av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
-                    return -1;
-                }
-                sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
-                h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
-            }
-            if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
-               || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
-                ff_h264_pred_direct_motion(h, &mb_type);
-                h->ref_cache[0][scan8[4]] =
-                h->ref_cache[1][scan8[4]] =
-                h->ref_cache[0][scan8[12]] =
-                h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
-            }
-        }else{
-            assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
-            for(i=0; i<4; i++){
-                h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
-                if(h->sub_mb_type[i] >=4){
-                    av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
-                    return -1;
-                }
-                sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
-                h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
-            }
-        }
-
-        for(list=0; list<h->list_count; list++){
-            int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
-            for(i=0; i<4; i++){
-                if(IS_DIRECT(h->sub_mb_type[i])) continue;
-                if(IS_DIR(h->sub_mb_type[i], 0, list)){
-                    unsigned int tmp;
-                    if(ref_count == 1){
-                        tmp= 0;
-                    }else if(ref_count == 2){
-                        tmp= get_bits1(&s->gb)^1;
-                    }else{
-                        tmp= get_ue_golomb_31(&s->gb);
-                        if(tmp>=ref_count){
-                            av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
-                            return -1;
-                        }
-                    }
-                    ref[list][i]= tmp;
-                }else{
-                 //FIXME
-                    ref[list][i] = -1;
-                }
-            }
-        }
-
-        if(dct8x8_allowed)
-            dct8x8_allowed = get_dct8x8_allowed(h);
-
-        for(list=0; list<h->list_count; list++){
-            for(i=0; i<4; i++){
-                if(IS_DIRECT(h->sub_mb_type[i])) {
-                    h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
-                    continue;
-                }
-                h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
-                h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
-
-                if(IS_DIR(h->sub_mb_type[i], 0, list)){
-                    const int sub_mb_type= h->sub_mb_type[i];
-                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
-                    for(j=0; j<sub_partition_count[i]; j++){
-                        int mx, my;
-                        const int index= 4*i + block_width*j;
-                        int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
-                        pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
-                        mx += get_se_golomb(&s->gb);
-                        my += get_se_golomb(&s->gb);
-                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);
-
-                        if(IS_SUB_8X8(sub_mb_type)){
-                            mv_cache[ 1 ][0]=
-                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
-                            mv_cache[ 1 ][1]=
-                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
-                        }else if(IS_SUB_8X4(sub_mb_type)){
-                            mv_cache[ 1 ][0]= mx;
-                            mv_cache[ 1 ][1]= my;
-                        }else if(IS_SUB_4X8(sub_mb_type)){
-                            mv_cache[ 8 ][0]= mx;
-                            mv_cache[ 8 ][1]= my;
-                        }
-                        mv_cache[ 0 ][0]= mx;
-                        mv_cache[ 0 ][1]= my;
-                    }
-                }else{
-                    uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
-                    p[0] = p[1]=
-                    p[8] = p[9]= 0;
-                }
-            }
-        }
-    }else if(IS_DIRECT(mb_type)){
-        ff_h264_pred_direct_motion(h, &mb_type);
-        dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
-    }else{
-        int list, mx, my, i;
-         //FIXME we should set ref_idx_l? to 0 if we use that later ...
-        if(IS_16X16(mb_type)){
-            for(list=0; list<h->list_count; list++){
-                    unsigned int val;
-                    if(IS_DIR(mb_type, 0, list)){
-                        if(h->ref_count[list]==1){
-                            val= 0;
-                        }else if(h->ref_count[list]==2){
-                            val= get_bits1(&s->gb)^1;
-                        }else{
-                            val= get_ue_golomb_31(&s->gb);
-                            if(val >= h->ref_count[list]){
-                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
-                                return -1;
-                            }
-                        }
-                    }else
-                        val= LIST_NOT_USED&0xFF;
-                    fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
-            }
-            for(list=0; list<h->list_count; list++){
-                unsigned int val;
-                if(IS_DIR(mb_type, 0, list)){
-                    pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
-                    mx += get_se_golomb(&s->gb);
-                    my += get_se_golomb(&s->gb);
-                    tprintf(s->avctx, "final mv:%d %d\n", mx, my);
-
-                    val= pack16to32(mx,my);
-                }else
-                    val=0;
-                fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
-            }
-        }
-        else if(IS_16X8(mb_type)){
-            for(list=0; list<h->list_count; list++){
-                    for(i=0; i<2; i++){
-                        unsigned int val;
-                        if(IS_DIR(mb_type, i, list)){
-                            if(h->ref_count[list] == 1){
-                                val= 0;
-                            }else if(h->ref_count[list] == 2){
-                                val= get_bits1(&s->gb)^1;
-                            }else{
-                                val= get_ue_golomb_31(&s->gb);
-                                if(val >= h->ref_count[list]){
-                                    av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
-                                    return -1;
-                                }
-                            }
-                        }else
-                            val= LIST_NOT_USED&0xFF;
-                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
-                    }
-            }
-            for(list=0; list<h->list_count; list++){
-                for(i=0; i<2; i++){
-                    unsigned int val;
-                    if(IS_DIR(mb_type, i, list)){
-                        pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
-                        mx += get_se_golomb(&s->gb);
-                        my += get_se_golomb(&s->gb);
-                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);
-
-                        val= pack16to32(mx,my);
-                    }else
-                        val=0;
-                    fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
-                }
-            }
-        }else{
-            assert(IS_8X16(mb_type));
-            for(list=0; list<h->list_count; list++){
-                    for(i=0; i<2; i++){
-                        unsigned int val;
-                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
-                            if(h->ref_count[list]==1){
-                                val= 0;
-                            }else if(h->ref_count[list]==2){
-                                val= get_bits1(&s->gb)^1;
-                            }else{
-                                val= get_ue_golomb_31(&s->gb);
-                                if(val >= h->ref_count[list]){
-                                    av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
-                                    return -1;
-                                }
-                            }
-                        }else
-                            val= LIST_NOT_USED&0xFF;
-                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
-                    }
-            }
-            for(list=0; list<h->list_count; list++){
-                for(i=0; i<2; i++){
-                    unsigned int val;
-                    if(IS_DIR(mb_type, i, list)){
-                        pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
-                        mx += get_se_golomb(&s->gb);
-                        my += get_se_golomb(&s->gb);
-                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);
-
-                        val= pack16to32(mx,my);
-                    }else
-                        val=0;
-                    fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
-                }
-            }
-        }
-    }
-
-    if(IS_INTER(mb_type))
-        write_back_motion(h, mb_type);
-
-    if(!IS_INTRA16x16(mb_type)){
-        cbp= get_ue_golomb(&s->gb);
-        if(cbp > 47){
-            av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
-            return -1;
-        }
-
-        if(CHROMA){
-            if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
-            else                     cbp= golomb_to_inter_cbp   [cbp];
-        }else{
-            if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
-            else                     cbp= golomb_to_inter_cbp_gray[cbp];
-        }
-    }
-    h->cbp = cbp;
-
-    if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
-        if(get_bits1(&s->gb)){
-            mb_type |= MB_TYPE_8x8DCT;
-            h->cbp_table[mb_xy]= cbp;
-        }
-    }
-    s->current_picture.mb_type[mb_xy]= mb_type;
-
-    if(cbp || IS_INTRA16x16(mb_type)){
-        int i8x8, i4x4, chroma_idx;
-        int dquant;
-        GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
-        const uint8_t *scan, *scan8x8, *dc_scan;
-
-//        fill_non_zero_count_cache(h);
-
-        if(IS_INTERLACED(mb_type)){
-            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
-            scan= s->qscale ? h->field_scan : h->field_scan_q0;
-            dc_scan= luma_dc_field_scan;
-        }else{
-            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
-            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
-            dc_scan= luma_dc_zigzag_scan;
-        }
-
-        dquant= get_se_golomb(&s->gb);
-
-        if( dquant > 25 || dquant < -26 ){
-            av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
-            return -1;
-        }
-
-        s->qscale += dquant;
-        if(((unsigned)s->qscale) > 51){
-            if(s->qscale<0) s->qscale+= 52;
-            else            s->qscale-= 52;
-        }
-
-        h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
-        h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
-        if(IS_INTRA16x16(mb_type)){
-            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
-                return -1; //FIXME continue if partitioned and other return -1 too
-            }
-
-            assert((cbp&15) == 0 || (cbp&15) == 15);
-
-            if(cbp&15){
-                for(i8x8=0; i8x8<4; i8x8++){
-                    for(i4x4=0; i4x4<4; i4x4++){
-                        const int index= i4x4 + 4*i8x8;
-                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
-                            return -1;
-                        }
-                    }
-                }
-            }else{
-                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
-            }
-        }else{
-            for(i8x8=0; i8x8<4; i8x8++){
-                if(cbp & (1<<i8x8)){
-                    if(IS_8x8DCT(mb_type)){
-                        DCTELEM *buf = &h->mb[64*i8x8];
-                        uint8_t *nnz;
-                        for(i4x4=0; i4x4<4; i4x4++){
-                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
-                                                h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
-                                return -1;
-                        }
-                        nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
-                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
-                    }else{
-                        for(i4x4=0; i4x4<4; i4x4++){
-                            const int index= i4x4 + 4*i8x8;
-
-                            if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
-                                return -1;
-                            }
-                        }
-                    }
-                }else{
-                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
-                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
-                }
-            }
-        }
-
-        if(cbp&0x30){
-            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
-                if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
-                    return -1;
-                }
-        }
-
-        if(cbp&0x20){
-            for(chroma_idx=0; chroma_idx<2; chroma_idx++){
-                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
-                for(i4x4=0; i4x4<4; i4x4++){
-                    const int index= 16 + 4*chroma_idx + i4x4;
-                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
-                        return -1;
-                    }
-                }
-            }
-        }else{
-            uint8_t * const nnz= &h->non_zero_count_cache[0];
-            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
-            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
-        }
-    }else{
-        uint8_t * const nnz= &h->non_zero_count_cache[0];
-        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
-        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
-        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
-    }
-    s->current_picture.qscale_table[mb_xy]= s->qscale;
-    write_back_non_zero_count(h);
-
-    if(MB_MBAFF){
-        h->ref_count[0] >>= 1;
-        h->ref_count[1] >>= 1;
-    }
-
-    return 0;
-}
 
 static int decode_cabac_field_decoding_flag(H264Context *h) {
     MpegEncContext * const s = &h->s;
@@ -4766,13 +3333,13 @@
 
     } else {
         for(;;){
-            int ret = decode_mb_cavlc(h);
+            int ret = ff_h264_decode_mb_cavlc(h);
 
             if(ret>=0) ff_h264_hl_decode_mb(h);
 
             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
                 s->mb_y++;
-                ret = decode_mb_cavlc(h);
+                ret = ff_h264_decode_mb_cavlc(h);
 
                 if(ret>=0) ff_h264_hl_decode_mb(h);
                 s->mb_y--;
--- a/h264.h	Tue Jan 12 23:02:07 2010 +0000
+++ b/h264.h	Wed Jan 13 01:59:19 2010 +0000
@@ -32,6 +32,7 @@
 #include "cabac.h"
 #include "mpegvideo.h"
 #include "h264pred.h"
+#include "rectangle.h"
 
 #define interlaced_dct interlaced_dct_is_a_bad_name
 #define mb_intra mb_intra_is_not_initialized_see_mb_type
@@ -642,6 +643,13 @@
 int ff_h264_frame_start(H264Context *h);
 av_cold int ff_h264_decode_init(AVCodecContext *avctx);
 av_cold int ff_h264_decode_end(AVCodecContext *avctx);
+av_cold void ff_h264_decode_init_vlc(void);
+
+/**
+ * decodes a macroblock
+ * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
+ */
+int ff_h264_decode_mb_cavlc(H264Context *h);
 
 void ff_h264_direct_dist_scale_factor(H264Context * const h);
 void ff_h264_direct_ref_list_init(H264Context * const h);
@@ -694,4 +702,592 @@
     return h->pps.chroma_qp_table[t][qscale];
 }
 
+static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my);
+
+static void fill_caches(H264Context *h, int mb_type, int for_deblock){
+    MpegEncContext * const s = &h->s;
+    const int mb_xy= h->mb_xy;
+    int topleft_xy, top_xy, topright_xy, left_xy[2];
+    int topleft_type, top_type, topright_type, left_type[2];
+    const uint8_t * left_block;
+    int topleft_partition= -1;
+    int i;
+    static const uint8_t left_block_options[4][8]={
+        {0,1,2,3,7,10,8,11},
+        {2,2,3,3,8,11,8,11},
+        {0,0,1,1,7,10,7,10},
+        {0,2,0,2,7,10,7,10}
+    };
+
+    top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
+
+    //FIXME deblocking could skip the intra and nnz parts.
+    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
+        return;
+
+    /* Wow, what a mess, why didn't they simplify the interlacing & intra
+     * stuff, I can't imagine that these complex rules are worth it. */
+
+    topleft_xy = top_xy - 1;
+    topright_xy= top_xy + 1;
+    left_xy[1] = left_xy[0] = mb_xy-1;
+    left_block = left_block_options[0];
+    if(FRAME_MBAFF){
+        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
+        const int top_pair_xy      = pair_xy     - s->mb_stride;
+        const int topleft_pair_xy  = top_pair_xy - 1;
+        const int topright_pair_xy = top_pair_xy + 1;
+        const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
+        const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
+        const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
+        const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
+        const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
+        const int bottom = (s->mb_y & 1);
+        tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
+
+        if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
+            top_xy -= s->mb_stride;
+        }
+        if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
+            topleft_xy -= s->mb_stride;
+        } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
+            topleft_xy += s->mb_stride;
+            // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
+            topleft_partition = 0;
+        }
+        if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
+            topright_xy -= s->mb_stride;
+        }
+        if (left_mb_field_flag != curr_mb_field_flag) {
+            left_xy[1] = left_xy[0] = pair_xy - 1;
+            if (curr_mb_field_flag) {
+                left_xy[1] += s->mb_stride;
+                left_block = left_block_options[3];
+            } else {
+                left_block= left_block_options[2 - bottom];
+            }
+        }
+    }
+
+    h->top_mb_xy = top_xy;
+    h->left_mb_xy[0] = left_xy[0];
+    h->left_mb_xy[1] = left_xy[1];
+    if(for_deblock){
+        topleft_type = 0;
+        topright_type = 0;
+        top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
+        left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
+        left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
+
+        if(MB_MBAFF && !IS_INTRA(mb_type)){
+            int list;
+            for(list=0; list<h->list_count; list++){
+                //These values where changed for ease of performing MC, we need to change them back
+                //FIXME maybe we can make MC and loop filter use the same values or prevent
+                //the MC code from changing ref_cache and rather use a temporary array.
+                if(USES_LIST(mb_type,list)){
+                    int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
+                    *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
+                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+                    ref += h->b8_stride;
+                    *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
+                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+                }
+            }
+        }
+    }else{
+        topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
+        top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
+        topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
+        left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
+        left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
+
+    if(IS_INTRA(mb_type)){
+        int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
+        h->topleft_samples_available=
+        h->top_samples_available=
+        h->left_samples_available= 0xFFFF;
+        h->topright_samples_available= 0xEEEA;
+
+        if(!(top_type & type_mask)){
+            h->topleft_samples_available= 0xB3FF;
+            h->top_samples_available= 0x33FF;
+            h->topright_samples_available= 0x26EA;
+        }
+        if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
+            if(IS_INTERLACED(mb_type)){
+                if(!(left_type[0] & type_mask)){
+                    h->topleft_samples_available&= 0xDFFF;
+                    h->left_samples_available&= 0x5FFF;
+                }
+                if(!(left_type[1] & type_mask)){
+                    h->topleft_samples_available&= 0xFF5F;
+                    h->left_samples_available&= 0xFF5F;
+                }
+            }else{
+                int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
+                                ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
+                assert(left_xy[0] == left_xy[1]);
+                if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
+                    h->topleft_samples_available&= 0xDF5F;
+                    h->left_samples_available&= 0x5F5F;
+                }
+            }
+        }else{
+            if(!(left_type[0] & type_mask)){
+                h->topleft_samples_available&= 0xDF5F;
+                h->left_samples_available&= 0x5F5F;
+            }
+        }
+
+        if(!(topleft_type & type_mask))
+            h->topleft_samples_available&= 0x7FFF;
+
+        if(!(topright_type & type_mask))
+            h->topright_samples_available&= 0xFBFF;
+
+        if(IS_INTRA4x4(mb_type)){
+            if(IS_INTRA4x4(top_type)){
+                h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
+                h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
+                h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
+                h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
+            }else{
+                int pred;
+                if(!(top_type & type_mask))
+                    pred= -1;
+                else{
+                    pred= 2;
+                }
+                h->intra4x4_pred_mode_cache[4+8*0]=
+                h->intra4x4_pred_mode_cache[5+8*0]=
+                h->intra4x4_pred_mode_cache[6+8*0]=
+                h->intra4x4_pred_mode_cache[7+8*0]= pred;
+            }
+            for(i=0; i<2; i++){
+                if(IS_INTRA4x4(left_type[i])){
+                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
+                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
+                }else{
+                    int pred;
+                    if(!(left_type[i] & type_mask))
+                        pred= -1;
+                    else{
+                        pred= 2;
+                    }
+                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
+                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
+                }
+            }
+        }
+    }
+    }
+
+
+/*
+0 . T T. T T T T
+1 L . .L . . . .
+2 L . .L . . . .
+3 . T TL . . . .
+4 L . .L . . . .
+5 L . .. . . . .
+*/
+//FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
+    if(top_type){
+        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
+        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
+        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
+        h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
+
+        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
+        h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
+
+        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
+        h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
+
+    }else{
+        h->non_zero_count_cache[4+8*0]=
+        h->non_zero_count_cache[5+8*0]=
+        h->non_zero_count_cache[6+8*0]=
+        h->non_zero_count_cache[7+8*0]=
+
+        h->non_zero_count_cache[1+8*0]=
+        h->non_zero_count_cache[2+8*0]=
+
+        h->non_zero_count_cache[1+8*3]=
+        h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
+
+    }
+
+    for (i=0; i<2; i++) {
+        if(left_type[i]){
+            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
+            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
+            h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
+            h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
+        }else{
+            h->non_zero_count_cache[3+8*1 + 2*8*i]=
+            h->non_zero_count_cache[3+8*2 + 2*8*i]=
+            h->non_zero_count_cache[0+8*1 +   8*i]=
+            h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
+        }
+    }
+
+    if( h->pps.cabac ) {
+        // top_cbp
+        if(top_type) {
+            h->top_cbp = h->cbp_table[top_xy];
+        } else if(IS_INTRA(mb_type)) {
+            h->top_cbp = 0x1C0;
+        } else {
+            h->top_cbp = 0;
+        }
+        // left_cbp
+        if (left_type[0]) {
+            h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
+        } else if(IS_INTRA(mb_type)) {
+            h->left_cbp = 0x1C0;
+        } else {
+            h->left_cbp = 0;
+        }
+        if (left_type[0]) {
+            h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
+        }
+        if (left_type[1]) {
+            h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
+        }
+    }
+
+#if 1
+    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
+        int list;
+        for(list=0; list<h->list_count; list++){
+            if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
+                /*if(!h->mv_cache_clean[list]){
+                    memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
+                    memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
+                    h->mv_cache_clean[list]= 1;
+                }*/
+                continue;
+            }
+            h->mv_cache_clean[list]= 0;
+
+            if(USES_LIST(top_type, list)){
+                const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
+                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
+                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
+                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
+                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
+                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
+                h->ref_cache[list][scan8[0] + 0 - 1*8]=
+                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
+                h->ref_cache[list][scan8[0] + 2 - 1*8]=
+                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
+            }else{
+                *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
+                *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
+                *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
+                *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
+                *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
+            }
+
+            for(i=0; i<2; i++){
+                int cache_idx = scan8[0] - 1 + i*2*8;
+                if(USES_LIST(left_type[i], list)){
+                    const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
+                    const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
+                    *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
+                    *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
+                    h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
+                    h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
+                }else{
+                    *(uint32_t*)h->mv_cache [list][cache_idx  ]=
+                    *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
+                    h->ref_cache[list][cache_idx  ]=
+                    h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+                }
+            }
+
+            if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
+                continue;
+
+            if(USES_LIST(topleft_type, list)){
+                const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
+                const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
+                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
+                h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
+            }else{
+                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
+                h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+            }
+
+            if(USES_LIST(topright_type, list)){
+                const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
+                const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
+                *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
+                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
+            }else{
+                *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
+                h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+            }
+
+            if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
+                continue;
+
+            h->ref_cache[list][scan8[5 ]+1] =
+            h->ref_cache[list][scan8[7 ]+1] =
+            h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
+            h->ref_cache[list][scan8[4 ]] =
+            h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
+            *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
+            *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
+            *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
+            *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
+            *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
+
+            if( h->pps.cabac ) {
+                /* XXX beurk, Load mvd */
+                if(USES_LIST(top_type, list)){
+                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
+                }else{
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
+                }
+                if(USES_LIST(left_type[0], list)){
+                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
+                }else{
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
+                }
+                if(USES_LIST(left_type[1], list)){
+                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
+                }else{
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
+                }
+                *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
+                *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
+                *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
+                *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
+                *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
+
+                if(h->slice_type_nos == FF_B_TYPE){
+                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
+
+                    if(IS_DIRECT(top_type)){
+                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
+                    }else if(IS_8X8(top_type)){
+                        int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
+                        h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
+                        h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
+                    }else{
+                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
+                    }
+
+                    if(IS_DIRECT(left_type[0]))
+                        h->direct_cache[scan8[0] - 1 + 0*8]= 1;
+                    else if(IS_8X8(left_type[0]))
+                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
+                    else
+                        h->direct_cache[scan8[0] - 1 + 0*8]= 0;
+
+                    if(IS_DIRECT(left_type[1]))
+                        h->direct_cache[scan8[0] - 1 + 2*8]= 1;
+                    else if(IS_8X8(left_type[1]))
+                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
+                    else
+                        h->direct_cache[scan8[0] - 1 + 2*8]= 0;
+                }
+            }
+
+            if(FRAME_MBAFF){
+#define MAP_MVS\
+                    MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
+                    MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
+                    MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
+                    MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
+                    MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
+                    MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
+                    MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
+                    MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
+                    MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
+                    MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
+                if(MB_FIELD){
+#define MAP_F2F(idx, mb_type)\
+                    if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
+                        h->ref_cache[list][idx] <<= 1;\
+                        h->mv_cache[list][idx][1] /= 2;\
+                        h->mvd_cache[list][idx][1] /= 2;\
+                    }
+                    MAP_MVS
+#undef MAP_F2F
+                }else{
+#define MAP_F2F(idx, mb_type)\
+                    if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
+                        h->ref_cache[list][idx] >>= 1;\
+                        h->mv_cache[list][idx][1] <<= 1;\
+                        h->mvd_cache[list][idx][1] <<= 1;\
+                    }
+                    MAP_MVS
+#undef MAP_F2F
+                }
+            }
+        }
+    }
+#endif
+
+    h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
+}
+
+/**
+ * gets the predicted intra4x4 prediction mode.
+ */
+static inline int pred_intra_mode(H264Context *h, int n){
+    const int index8= scan8[n];
+    const int left= h->intra4x4_pred_mode_cache[index8 - 1];
+    const int top = h->intra4x4_pred_mode_cache[index8 - 8];
+    const int min= FFMIN(left, top);
+
+    tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
+
+    if(min<0) return DC_PRED;
+    else      return min;
+}
+
+static inline void write_back_non_zero_count(H264Context *h){
+    const int mb_xy= h->mb_xy;
+
+    h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
+    h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
+    h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
+    h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
+    h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
+    h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
+    h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
+
+    h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
+    h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
+    h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
+
+    h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
+    h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
+    h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
+}
+
+static inline void write_back_motion(H264Context *h, int mb_type){
+    MpegEncContext * const s = &h->s;
+    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
+    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
+    int list;
+
+    if(!USES_LIST(mb_type, 0))
+        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
+
+    for(list=0; list<h->list_count; list++){
+        int y;
+        if(!USES_LIST(mb_type, list))
+            continue;
+
+        for(y=0; y<4; y++){
+            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
+            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
+        }
+        if( h->pps.cabac ) {
+            if(IS_SKIP(mb_type))
+                fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
+            else
+            for(y=0; y<4; y++){
+                *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
+                *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
+            }
+        }
+
+        {
+            int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
+            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
+            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
+            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
+            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
+        }
+    }
+
+    if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
+        if(IS_8X8(mb_type)){
+            uint8_t *direct_table = &h->direct_table[b8_xy];
+            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
+            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
+            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
+        }
+    }
+}
+
+static inline int get_dct8x8_allowed(H264Context *h){
+    if(h->sps.direct_8x8_inference_flag)
+        return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
+    else
+        return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
+}
+
+static void predict_field_decoding_flag(H264Context *h){
+    MpegEncContext * const s = &h->s;
+    const int mb_xy= h->mb_xy;
+    int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
+                ? s->current_picture.mb_type[mb_xy-1]
+                : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
+                ? s->current_picture.mb_type[mb_xy-s->mb_stride]
+                : 0;
+    h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
+}
+
+/**
+ * decodes a P_SKIP or B_SKIP macroblock
+ */
+static void decode_mb_skip(H264Context *h){
+    MpegEncContext * const s = &h->s;
+    const int mb_xy= h->mb_xy;
+    int mb_type=0;
+
+    memset(h->non_zero_count[mb_xy], 0, 16);
+    memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
+
+    if(MB_FIELD)
+        mb_type|= MB_TYPE_INTERLACED;
+
+    if( h->slice_type_nos == FF_B_TYPE )
+    {
+        // just for fill_caches. pred_direct_motion will set the real mb_type
+        mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
+
+        fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
+        ff_h264_pred_direct_motion(h, &mb_type);
+        mb_type|= MB_TYPE_SKIP;
+    }
+    else
+    {
+        int mx, my;
+        mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
+
+        fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
+        pred_pskip_motion(h, &mx, &my);
+        fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
+        fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
+    }
+
+    write_back_motion(h, mb_type);
+    s->current_picture.mb_type[mb_xy]= mb_type;
+    s->current_picture.qscale_table[mb_xy]= s->qscale;
+    h->slice_table[ mb_xy ]= h->slice_num;
+    h->prev_mb_skipped= 1;
+}
+
 #endif /* AVCODEC_H264_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/h264_cavlc.c	Wed Jan 13 01:59:19 2010 +0000
@@ -0,0 +1,1053 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... cavlc bitstream decoding
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file libavcodec/h264_cavlc.c
+ * H.264 / AVC / MPEG4 part10 cavlc bitstream decoding.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "internal.h"
+#include "avcodec.h"
+#include "mpegvideo.h"
+#include "h264.h"
+#include "h264data.h" // FIXME FIXME FIXME
+#include "h264_mvpred.h"
+#include "golomb.h"
+
+#if ARCH_X86
+#include "x86/h264_i386.h"
+#endif
+
+//#undef NDEBUG
+#include <assert.h>
+
+static const uint8_t golomb_to_intra4x4_cbp[48]={
+ 47, 31, 15,  0, 23, 27, 29, 30,  7, 11, 13, 14, 39, 43, 45, 46,
+ 16,  3,  5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44,  1,  2,  4,
+  8, 17, 18, 20, 24,  6,  9, 22, 25, 32, 33, 34, 36, 40, 38, 41
+};
+
+static const uint8_t golomb_to_inter_cbp[48]={
+  0, 16,  1,  2,  4,  8, 32,  3,  5, 10, 12, 15, 47,  7, 11, 13,
+ 14,  6,  9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46,
+ 17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41
+};
+
+static const uint8_t golomb_to_inter_cbp_gray[16]={
+ 0, 1, 2, 4, 8, 3, 5,10,12,15, 7,11,13,14, 6, 9,
+};
+
+static const uint8_t golomb_to_intra4x4_cbp_gray[16]={
+15, 0, 7,11,13,14, 3, 5,10,12, 1, 2, 4, 8, 6, 9,
+};
+
+static const uint8_t chroma_dc_coeff_token_len[4*5]={
+ 2, 0, 0, 0,
+ 6, 1, 0, 0,
+ 6, 6, 3, 0,
+ 6, 7, 7, 6,
+ 6, 8, 8, 7,
+};
+
+static const uint8_t chroma_dc_coeff_token_bits[4*5]={
+ 1, 0, 0, 0,
+ 7, 1, 0, 0,
+ 4, 6, 1, 0,
+ 3, 3, 2, 5,
+ 2, 3, 2, 0,
+};
+
+static const uint8_t coeff_token_len[4][4*17]={
+{
+     1, 0, 0, 0,
+     6, 2, 0, 0,     8, 6, 3, 0,     9, 8, 7, 5,    10, 9, 8, 6,
+    11,10, 9, 7,    13,11,10, 8,    13,13,11, 9,    13,13,13,10,
+    14,14,13,11,    14,14,14,13,    15,15,14,14,    15,15,15,14,
+    16,15,15,15,    16,16,16,15,    16,16,16,16,    16,16,16,16,
+},
+{
+     2, 0, 0, 0,
+     6, 2, 0, 0,     6, 5, 3, 0,     7, 6, 6, 4,     8, 6, 6, 4,
+     8, 7, 7, 5,     9, 8, 8, 6,    11, 9, 9, 6,    11,11,11, 7,
+    12,11,11, 9,    12,12,12,11,    12,12,12,11,    13,13,13,12,
+    13,13,13,13,    13,14,13,13,    14,14,14,13,    14,14,14,14,
+},
+{
+     4, 0, 0, 0,
+     6, 4, 0, 0,     6, 5, 4, 0,     6, 5, 5, 4,     7, 5, 5, 4,
+     7, 5, 5, 4,     7, 6, 6, 4,     7, 6, 6, 4,     8, 7, 7, 5,
+     8, 8, 7, 6,     9, 8, 8, 7,     9, 9, 8, 8,     9, 9, 9, 8,
+    10, 9, 9, 9,    10,10,10,10,    10,10,10,10,    10,10,10,10,
+},
+{
+     6, 0, 0, 0,
+     6, 6, 0, 0,     6, 6, 6, 0,     6, 6, 6, 6,     6, 6, 6, 6,
+     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,
+     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,
+     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,
+}
+};
+
+static const uint8_t coeff_token_bits[4][4*17]={
+{
+     1, 0, 0, 0,
+     5, 1, 0, 0,     7, 4, 1, 0,     7, 6, 5, 3,     7, 6, 5, 3,
+     7, 6, 5, 4,    15, 6, 5, 4,    11,14, 5, 4,     8,10,13, 4,
+    15,14, 9, 4,    11,10,13,12,    15,14, 9,12,    11,10,13, 8,
+    15, 1, 9,12,    11,14,13, 8,     7,10, 9,12,     4, 6, 5, 8,
+},
+{
+     3, 0, 0, 0,
+    11, 2, 0, 0,     7, 7, 3, 0,     7,10, 9, 5,     7, 6, 5, 4,
+     4, 6, 5, 6,     7, 6, 5, 8,    15, 6, 5, 4,    11,14,13, 4,
+    15,10, 9, 4,    11,14,13,12,     8,10, 9, 8,    15,14,13,12,
+    11,10, 9,12,     7,11, 6, 8,     9, 8,10, 1,     7, 6, 5, 4,
+},
+{
+    15, 0, 0, 0,
+    15,14, 0, 0,    11,15,13, 0,     8,12,14,12,    15,10,11,11,
+    11, 8, 9,10,     9,14,13, 9,     8,10, 9, 8,    15,14,13,13,
+    11,14,10,12,    15,10,13,12,    11,14, 9,12,     8,10,13, 8,
+    13, 7, 9,12,     9,12,11,10,     5, 8, 7, 6,     1, 4, 3, 2,
+},
+{
+     3, 0, 0, 0,
+     0, 1, 0, 0,     4, 5, 6, 0,     8, 9,10,11,    12,13,14,15,
+    16,17,18,19,    20,21,22,23,    24,25,26,27,    28,29,30,31,
+    32,33,34,35,    36,37,38,39,    40,41,42,43,    44,45,46,47,
+    48,49,50,51,    52,53,54,55,    56,57,58,59,    60,61,62,63,
+}
+};
+
+static const uint8_t total_zeros_len[16][16]= {
+    {1,3,3,4,4,5,5,6,6,7,7,8,8,9,9,9},
+    {3,3,3,3,3,4,4,4,4,5,5,6,6,6,6},
+    {4,3,3,3,4,4,3,3,4,5,5,6,5,6},
+    {5,3,4,4,3,3,3,4,3,4,5,5,5},
+    {4,4,4,3,3,3,3,3,4,5,4,5},
+    {6,5,3,3,3,3,3,3,4,3,6},
+    {6,5,3,3,3,2,3,4,3,6},
+    {6,4,5,3,2,2,3,3,6},
+    {6,6,4,2,2,3,2,5},
+    {5,5,3,2,2,2,4},
+    {4,4,3,3,1,3},
+    {4,4,2,1,3},
+    {3,3,1,2},
+    {2,2,1},
+    {1,1},
+};
+
+static const uint8_t total_zeros_bits[16][16]= {
+    {1,3,2,3,2,3,2,3,2,3,2,3,2,3,2,1},
+    {7,6,5,4,3,5,4,3,2,3,2,3,2,1,0},
+    {5,7,6,5,4,3,4,3,2,3,2,1,1,0},
+    {3,7,5,4,6,5,4,3,3,2,2,1,0},
+    {5,4,3,7,6,5,4,3,2,1,1,0},
+    {1,1,7,6,5,4,3,2,1,1,0},
+    {1,1,5,4,3,3,2,1,1,0},
+    {1,1,1,3,3,2,2,1,0},
+    {1,0,1,3,2,1,1,1},
+    {1,0,1,3,2,1,1},
+    {0,1,1,2,1,3},
+    {0,1,1,1,1},
+    {0,1,1,1},
+    {0,1,1},
+    {0,1},
+};
+
+static const uint8_t chroma_dc_total_zeros_len[3][4]= {
+    { 1, 2, 3, 3,},
+    { 1, 2, 2, 0,},
+    { 1, 1, 0, 0,},
+};
+
+static const uint8_t chroma_dc_total_zeros_bits[3][4]= {
+    { 1, 1, 1, 0,},
+    { 1, 1, 0, 0,},
+    { 1, 0, 0, 0,},
+};
+
+static const uint8_t run_len[7][16]={
+    {1,1},
+    {1,2,2},
+    {2,2,2,2},
+    {2,2,2,3,3},
+    {2,2,3,3,3,3},
+    {2,3,3,3,3,3,3},
+    {3,3,3,3,3,3,3,4,5,6,7,8,9,10,11},
+};
+
+static const uint8_t run_bits[7][16]={
+    {1,0},
+    {1,1,0},
+    {3,2,1,0},
+    {3,2,1,1,0},
+    {3,2,3,2,1,0},
+    {3,0,1,3,2,5,4},
+    {7,6,5,4,3,2,1,1,1,1,1,1,1,1,1},
+};
+
+static VLC coeff_token_vlc[4];
+static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
+static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
+
+static VLC chroma_dc_coeff_token_vlc;
+static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
+static const int chroma_dc_coeff_token_vlc_table_size = 256;
+
+static VLC total_zeros_vlc[15];
+static VLC_TYPE total_zeros_vlc_tables[15][512][2];
+static const int total_zeros_vlc_tables_size = 512;
+
+static VLC chroma_dc_total_zeros_vlc[3];
+static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
+static const int chroma_dc_total_zeros_vlc_tables_size = 8;
+
+static VLC run_vlc[6];
+static VLC_TYPE run_vlc_tables[6][8][2];
+static const int run_vlc_tables_size = 8;
+
+static VLC run7_vlc;
+static VLC_TYPE run7_vlc_table[96][2];
+static const int run7_vlc_table_size = 96;
+
+#define LEVEL_TAB_BITS 8
+static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
+
+
+/**
+ * gets the predicted number of non-zero coefficients.
+ * @param n block index
+ */
+static inline int pred_non_zero_count(H264Context *h, int n){
+    const int index8= scan8[n];
+    const int left= h->non_zero_count_cache[index8 - 1];
+    const int top = h->non_zero_count_cache[index8 - 8];
+    int i= left + top;
+
+    if(i<64) i= (i+1)>>1;
+
+    tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
+
+    return i&31;
+}
+
+static av_cold void init_cavlc_level_tab(void){
+    int suffix_length, mask;
+    unsigned int i;
+
+    for(suffix_length=0; suffix_length<7; suffix_length++){
+        for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
+            int prefix= LEVEL_TAB_BITS - av_log2(2*i);
+            int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
+
+            mask= -(level_code&1);
+            level_code= (((2+level_code)>>1) ^ mask) - mask;
+            if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
+                cavlc_level_tab[suffix_length][i][0]= level_code;
+                cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
+            }else if(prefix + 1 <= LEVEL_TAB_BITS){
+                cavlc_level_tab[suffix_length][i][0]= prefix+100;
+                cavlc_level_tab[suffix_length][i][1]= prefix + 1;
+            }else{
+                cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
+                cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
+            }
+        }
+    }
+}
+
+av_cold void ff_h264_decode_init_vlc(void){
+    static int done = 0;
+
+    if (!done) {
+        int i;
+        int offset;
+        done = 1;
+
+        chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
+        chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
+        init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
+                 &chroma_dc_coeff_token_len [0], 1, 1,
+                 &chroma_dc_coeff_token_bits[0], 1, 1,
+                 INIT_VLC_USE_NEW_STATIC);
+
+        offset = 0;
+        for(i=0; i<4; i++){
+            coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
+            coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
+            init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
+                     &coeff_token_len [i][0], 1, 1,
+                     &coeff_token_bits[i][0], 1, 1,
+                     INIT_VLC_USE_NEW_STATIC);
+            offset += coeff_token_vlc_tables_size[i];
+        }
+        /*
+         * This is a one time safety check to make sure that
+         * the packed static coeff_token_vlc table sizes
+         * were initialized correctly.
+         */
+        assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
+
+        for(i=0; i<3; i++){
+            chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
+            chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
+            init_vlc(&chroma_dc_total_zeros_vlc[i],
+                     CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
+                     &chroma_dc_total_zeros_len [i][0], 1, 1,
+                     &chroma_dc_total_zeros_bits[i][0], 1, 1,
+                     INIT_VLC_USE_NEW_STATIC);
+        }
+        for(i=0; i<15; i++){
+            total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
+            total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
+            init_vlc(&total_zeros_vlc[i],
+                     TOTAL_ZEROS_VLC_BITS, 16,
+                     &total_zeros_len [i][0], 1, 1,
+                     &total_zeros_bits[i][0], 1, 1,
+                     INIT_VLC_USE_NEW_STATIC);
+        }
+
+        for(i=0; i<6; i++){
+            run_vlc[i].table = run_vlc_tables[i];
+            run_vlc[i].table_allocated = run_vlc_tables_size;
+            init_vlc(&run_vlc[i],
+                     RUN_VLC_BITS, 7,
+                     &run_len [i][0], 1, 1,
+                     &run_bits[i][0], 1, 1,
+                     INIT_VLC_USE_NEW_STATIC);
+        }
+        run7_vlc.table = run7_vlc_table,
+        run7_vlc.table_allocated = run7_vlc_table_size;
+        init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
+                 &run_len [6][0], 1, 1,
+                 &run_bits[6][0], 1, 1,
+                 INIT_VLC_USE_NEW_STATIC);
+
+        init_cavlc_level_tab();
+    }
+}
+
+/**
+ *
+ */
+static inline int get_level_prefix(GetBitContext *gb){
+    unsigned int buf;
+    int log;
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf=GET_CACHE(re, gb);
+
+    log= 32 - av_log2(buf);
+#ifdef TRACE
+    print_bin(buf>>(32-log), log);
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
+#endif
+
+    LAST_SKIP_BITS(re, gb, log);
+    CLOSE_READER(re, gb);
+
+    return log-1;
+}
+
+/**
+ * decodes a residual block.
+ * @param n block index
+ * @param scantable scantable
+ * @param max_coeff number of coefficients in the block
+ * @return <0 if an error occurred
+ */
+static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
+    MpegEncContext * const s = &h->s;
+    static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
+    int level[16];
+    int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
+
+    //FIXME put trailing_onex into the context
+
+    if(n == CHROMA_DC_BLOCK_INDEX){
+        coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
+        total_coeff= coeff_token>>2;
+    }else{
+        if(n == LUMA_DC_BLOCK_INDEX){
+            total_coeff= pred_non_zero_count(h, 0);
+            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
+            total_coeff= coeff_token>>2;
+        }else{
+            total_coeff= pred_non_zero_count(h, n);
+            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
+            total_coeff= coeff_token>>2;
+            h->non_zero_count_cache[ scan8[n] ]= total_coeff;
+        }
+    }
+
+    //FIXME set last_non_zero?
+
+    if(total_coeff==0)
+        return 0;
+    if(total_coeff > (unsigned)max_coeff) {
+        av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
+        return -1;
+    }
+
+    trailing_ones= coeff_token&3;
+    tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
+    assert(total_coeff<=16);
+
+    i = show_bits(gb, 3);
+    skip_bits(gb, trailing_ones);
+    level[0] = 1-((i&4)>>1);
+    level[1] = 1-((i&2)   );
+    level[2] = 1-((i&1)<<1);
+
+    if(trailing_ones<total_coeff) {
+        int mask, prefix;
+        int suffix_length = total_coeff > 10 && trailing_ones < 3;
+        int bitsi= show_bits(gb, LEVEL_TAB_BITS);
+        int level_code= cavlc_level_tab[suffix_length][bitsi][0];
+
+        skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
+        if(level_code >= 100){
+            prefix= level_code - 100;
+            if(prefix == LEVEL_TAB_BITS)
+                prefix += get_level_prefix(gb);
+
+            //first coefficient has suffix_length equal to 0 or 1
+            if(prefix<14){ //FIXME try to build a large unified VLC table for all this
+                if(suffix_length)
+                    level_code= (prefix<<1) + get_bits1(gb); //part
+                else
+                    level_code= prefix; //part
+            }else if(prefix==14){
+                if(suffix_length)
+                    level_code= (prefix<<1) + get_bits1(gb); //part
+                else
+                    level_code= prefix + get_bits(gb, 4); //part
+            }else{
+                level_code= 30 + get_bits(gb, prefix-3); //part
+                if(prefix>=16)
+                    level_code += (1<<(prefix-3))-4096;
+            }
+
+            if(trailing_ones < 3) level_code += 2;
+
+            suffix_length = 2;
+            mask= -(level_code&1);
+            level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
+        }else{
+            if(trailing_ones < 3) level_code += (level_code>>31)|1;
+
+            suffix_length = 1;
+            if(level_code + 3U > 6U)
+                suffix_length++;
+            level[trailing_ones]= level_code;
+        }
+
+        //remaining coefficients have suffix_length > 0
+        for(i=trailing_ones+1;i<total_coeff;i++) {
+            static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
+            int bitsi= show_bits(gb, LEVEL_TAB_BITS);
+            level_code= cavlc_level_tab[suffix_length][bitsi][0];
+
+            skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
+            if(level_code >= 100){
+                prefix= level_code - 100;
+                if(prefix == LEVEL_TAB_BITS){
+                    prefix += get_level_prefix(gb);
+                }
+                if(prefix<15){
+                    level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
+                }else{
+                    level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
+                    if(prefix>=16)
+                        level_code += (1<<(prefix-3))-4096;
+                }
+                mask= -(level_code&1);
+                level_code= (((2+level_code)>>1) ^ mask) - mask;
+            }
+            level[i]= level_code;
+
+            if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
+                suffix_length++;
+        }
+    }
+
+    if(total_coeff == max_coeff)
+        zeros_left=0;
+    else{
+        if(n == CHROMA_DC_BLOCK_INDEX)
+            zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
+        else
+            zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
+    }
+
+    coeff_num = zeros_left + total_coeff - 1;
+    j = scantable[coeff_num];
+    if(n > 24){
+        block[j] = level[0];
+        for(i=1;i<total_coeff;i++) {
+            if(zeros_left <= 0)
+                run_before = 0;
+            else if(zeros_left < 7){
+                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
+            }else{
+                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
+            }
+            zeros_left -= run_before;
+            coeff_num -= 1 + run_before;
+            j= scantable[ coeff_num ];
+
+            block[j]= level[i];
+        }
+    }else{
+        block[j] = (level[0] * qmul[j] + 32)>>6;
+        for(i=1;i<total_coeff;i++) {
+            if(zeros_left <= 0)
+                run_before = 0;
+            else if(zeros_left < 7){
+                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
+            }else{
+                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
+            }
+            zeros_left -= run_before;
+            coeff_num -= 1 + run_before;
+            j= scantable[ coeff_num ];
+
+            block[j]= (level[i] * qmul[j] + 32)>>6;
+        }
+    }
+
+    if(zeros_left<0){
+        av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
+        return -1;
+    }
+
+    return 0;
+}
+
+int ff_h264_decode_mb_cavlc(H264Context *h){
+    MpegEncContext * const s = &h->s;
+    int mb_xy;
+    int partition_count;
+    unsigned int mb_type, cbp;
+    int dct8x8_allowed= h->pps.transform_8x8_mode;
+
+    mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+
+    tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
+    cbp = 0; /* avoid warning. FIXME: find a solution without slowing
+                down the code */
+    if(h->slice_type_nos != FF_I_TYPE){
+        if(s->mb_skip_run==-1)
+            s->mb_skip_run= get_ue_golomb(&s->gb);
+
+        if (s->mb_skip_run--) {
+            if(FRAME_MBAFF && (s->mb_y&1) == 0){
+                if(s->mb_skip_run==0)
+                    h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
+                else
+                    predict_field_decoding_flag(h);
+            }
+            decode_mb_skip(h);
+            return 0;
+        }
+    }
+    if(FRAME_MBAFF){
+        if( (s->mb_y&1) == 0 )
+            h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
+    }
+
+    h->prev_mb_skipped= 0;
+
+    mb_type= get_ue_golomb(&s->gb);
+    if(h->slice_type_nos == FF_B_TYPE){
+        if(mb_type < 23){
+            partition_count= b_mb_type_info[mb_type].partition_count;
+            mb_type=         b_mb_type_info[mb_type].type;
+        }else{
+            mb_type -= 23;
+            goto decode_intra_mb;
+        }
+    }else if(h->slice_type_nos == FF_P_TYPE){
+        if(mb_type < 5){
+            partition_count= p_mb_type_info[mb_type].partition_count;
+            mb_type=         p_mb_type_info[mb_type].type;
+        }else{
+            mb_type -= 5;
+            goto decode_intra_mb;
+        }
+    }else{
+       assert(h->slice_type_nos == FF_I_TYPE);
+        if(h->slice_type == FF_SI_TYPE && mb_type)
+            mb_type--;
+decode_intra_mb:
+        if(mb_type > 25){
+            av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
+            return -1;
+        }
+        partition_count=0;
+        cbp= i_mb_type_info[mb_type].cbp;
+        h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
+        mb_type= i_mb_type_info[mb_type].type;
+    }
+
+    if(MB_FIELD)
+        mb_type |= MB_TYPE_INTERLACED;
+
+    h->slice_table[ mb_xy ]= h->slice_num;
+
+    if(IS_INTRA_PCM(mb_type)){
+        unsigned int x;
+
+        // We assume these blocks are very rare so we do not optimize it.
+        align_get_bits(&s->gb);
+
+        // The pixels are stored in the same order as levels in h->mb array.
+        for(x=0; x < (CHROMA ? 384 : 256); x++){
+            ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
+        }
+
+        // In deblocking, the quantizer is 0
+        s->current_picture.qscale_table[mb_xy]= 0;
+        // All coeffs are present
+        memset(h->non_zero_count[mb_xy], 16, 16);
+
+        s->current_picture.mb_type[mb_xy]= mb_type;
+        return 0;
+    }
+
+    if(MB_MBAFF){
+        h->ref_count[0] <<= 1;
+        h->ref_count[1] <<= 1;
+    }
+
+    fill_caches(h, mb_type, 0);
+
+    //mb_pred
+    if(IS_INTRA(mb_type)){
+        int pred_mode;
+//            init_top_left_availability(h);
+        if(IS_INTRA4x4(mb_type)){
+            int i;
+            int di = 1;
+            if(dct8x8_allowed && get_bits1(&s->gb)){
+                mb_type |= MB_TYPE_8x8DCT;
+                di = 4;
+            }
+
+//                fill_intra4x4_pred_table(h);
+            for(i=0; i<16; i+=di){
+                int mode= pred_intra_mode(h, i);
+
+                if(!get_bits1(&s->gb)){
+                    const int rem_mode= get_bits(&s->gb, 3);
+                    mode = rem_mode + (rem_mode >= mode);
+                }
+
+                if(di==4)
+                    fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
+                else
+                    h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
+            }
+            ff_h264_write_back_intra_pred_mode(h);
+            if( ff_h264_check_intra4x4_pred_mode(h) < 0)
+                return -1;
+        }else{
+            h->intra16x16_pred_mode= ff_h264_check_intra_pred_mode(h, h->intra16x16_pred_mode);
+            if(h->intra16x16_pred_mode < 0)
+                return -1;
+        }
+        if(CHROMA){
+            pred_mode= ff_h264_check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
+            if(pred_mode < 0)
+                return -1;
+            h->chroma_pred_mode= pred_mode;
+        }
+    }else if(partition_count==4){
+        int i, j, sub_partition_count[4], list, ref[2][4];
+
+        if(h->slice_type_nos == FF_B_TYPE){
+            for(i=0; i<4; i++){
+                h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
+                if(h->sub_mb_type[i] >=13){
+                    av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
+                    return -1;
+                }
+                sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
+                h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
+            }
+            if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
+               || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
+                ff_h264_pred_direct_motion(h, &mb_type);
+                h->ref_cache[0][scan8[4]] =
+                h->ref_cache[1][scan8[4]] =
+                h->ref_cache[0][scan8[12]] =
+                h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
+            }
+        }else{
+            assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
+            for(i=0; i<4; i++){
+                h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
+                if(h->sub_mb_type[i] >=4){
+                    av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
+                    return -1;
+                }
+                sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
+                h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
+            }
+        }
+
+        for(list=0; list<h->list_count; list++){
+            int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
+            for(i=0; i<4; i++){
+                if(IS_DIRECT(h->sub_mb_type[i])) continue;
+                if(IS_DIR(h->sub_mb_type[i], 0, list)){
+                    unsigned int tmp;
+                    if(ref_count == 1){
+                        tmp= 0;
+                    }else if(ref_count == 2){
+                        tmp= get_bits1(&s->gb)^1;
+                    }else{
+                        tmp= get_ue_golomb_31(&s->gb);
+                        if(tmp>=ref_count){
+                            av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
+                            return -1;
+                        }
+                    }
+                    ref[list][i]= tmp;
+                }else{
+                 //FIXME
+                    ref[list][i] = -1;
+                }
+            }
+        }
+
+        if(dct8x8_allowed)
+            dct8x8_allowed = get_dct8x8_allowed(h);
+
+        for(list=0; list<h->list_count; list++){
+            for(i=0; i<4; i++){
+                if(IS_DIRECT(h->sub_mb_type[i])) {
+                    h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
+                    continue;
+                }
+                h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
+                h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
+
+                if(IS_DIR(h->sub_mb_type[i], 0, list)){
+                    const int sub_mb_type= h->sub_mb_type[i];
+                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
+                    for(j=0; j<sub_partition_count[i]; j++){
+                        int mx, my;
+                        const int index= 4*i + block_width*j;
+                        int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
+                        pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
+                        mx += get_se_golomb(&s->gb);
+                        my += get_se_golomb(&s->gb);
+                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);
+
+                        if(IS_SUB_8X8(sub_mb_type)){
+                            mv_cache[ 1 ][0]=
+                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
+                            mv_cache[ 1 ][1]=
+                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
+                        }else if(IS_SUB_8X4(sub_mb_type)){
+                            mv_cache[ 1 ][0]= mx;
+                            mv_cache[ 1 ][1]= my;
+                        }else if(IS_SUB_4X8(sub_mb_type)){
+                            mv_cache[ 8 ][0]= mx;
+                            mv_cache[ 8 ][1]= my;
+                        }
+                        mv_cache[ 0 ][0]= mx;
+                        mv_cache[ 0 ][1]= my;
+                    }
+                }else{
+                    uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
+                    p[0] = p[1]=
+                    p[8] = p[9]= 0;
+                }
+            }
+        }
+    }else if(IS_DIRECT(mb_type)){
+        ff_h264_pred_direct_motion(h, &mb_type);
+        dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
+    }else{
+        int list, mx, my, i;
+         //FIXME we should set ref_idx_l? to 0 if we use that later ...
+        if(IS_16X16(mb_type)){
+            for(list=0; list<h->list_count; list++){
+                    unsigned int val;
+                    if(IS_DIR(mb_type, 0, list)){
+                        if(h->ref_count[list]==1){
+                            val= 0;
+                        }else if(h->ref_count[list]==2){
+                            val= get_bits1(&s->gb)^1;
+                        }else{
+                            val= get_ue_golomb_31(&s->gb);
+                            if(val >= h->ref_count[list]){
+                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
+                                return -1;
+                            }
+                        }
+                    }else
+                        val= LIST_NOT_USED&0xFF;
+                    fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
+            }
+            for(list=0; list<h->list_count; list++){
+                unsigned int val;
+                if(IS_DIR(mb_type, 0, list)){
+                    pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
+                    mx += get_se_golomb(&s->gb);
+                    my += get_se_golomb(&s->gb);
+                    tprintf(s->avctx, "final mv:%d %d\n", mx, my);
+
+                    val= pack16to32(mx,my);
+                }else
+                    val=0;
+                fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
+            }
+        }
+        else if(IS_16X8(mb_type)){
+            for(list=0; list<h->list_count; list++){
+                    for(i=0; i<2; i++){
+                        unsigned int val;
+                        if(IS_DIR(mb_type, i, list)){
+                            if(h->ref_count[list] == 1){
+                                val= 0;
+                            }else if(h->ref_count[list] == 2){
+                                val= get_bits1(&s->gb)^1;
+                            }else{
+                                val= get_ue_golomb_31(&s->gb);
+                                if(val >= h->ref_count[list]){
+                                    av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
+                                    return -1;
+                                }
+                            }
+                        }else
+                            val= LIST_NOT_USED&0xFF;
+                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
+                    }
+            }
+            for(list=0; list<h->list_count; list++){
+                for(i=0; i<2; i++){
+                    unsigned int val;
+                    if(IS_DIR(mb_type, i, list)){
+                        pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
+                        mx += get_se_golomb(&s->gb);
+                        my += get_se_golomb(&s->gb);
+                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);
+
+                        val= pack16to32(mx,my);
+                    }else
+                        val=0;
+                    fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
+                }
+            }
+        }else{
+            assert(IS_8X16(mb_type));
+            for(list=0; list<h->list_count; list++){
+                    for(i=0; i<2; i++){
+                        unsigned int val;
+                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
+                            if(h->ref_count[list]==1){
+                                val= 0;
+                            }else if(h->ref_count[list]==2){
+                                val= get_bits1(&s->gb)^1;
+                            }else{
+                                val= get_ue_golomb_31(&s->gb);
+                                if(val >= h->ref_count[list]){
+                                    av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
+                                    return -1;
+                                }
+                            }
+                        }else
+                            val= LIST_NOT_USED&0xFF;
+                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
+                    }
+            }
+            for(list=0; list<h->list_count; list++){
+                for(i=0; i<2; i++){
+                    unsigned int val;
+                    if(IS_DIR(mb_type, i, list)){
+                        pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
+                        mx += get_se_golomb(&s->gb);
+                        my += get_se_golomb(&s->gb);
+                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);
+
+                        val= pack16to32(mx,my);
+                    }else
+                        val=0;
+                    fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
+                }
+            }
+        }
+    }
+
+    if(IS_INTER(mb_type))
+        write_back_motion(h, mb_type);
+
+    if(!IS_INTRA16x16(mb_type)){
+        cbp= get_ue_golomb(&s->gb);
+        if(cbp > 47){
+            av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
+            return -1;
+        }
+
+        if(CHROMA){
+            if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
+            else                     cbp= golomb_to_inter_cbp   [cbp];
+        }else{
+            if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
+            else                     cbp= golomb_to_inter_cbp_gray[cbp];
+        }
+    }
+    h->cbp = cbp;
+
+    if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
+        if(get_bits1(&s->gb)){
+            mb_type |= MB_TYPE_8x8DCT;
+            h->cbp_table[mb_xy]= cbp;
+        }
+    }
+    s->current_picture.mb_type[mb_xy]= mb_type;
+
+    if(cbp || IS_INTRA16x16(mb_type)){
+        int i8x8, i4x4, chroma_idx;
+        int dquant;
+        GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
+        const uint8_t *scan, *scan8x8, *dc_scan;
+
+//        fill_non_zero_count_cache(h);
+
+        if(IS_INTERLACED(mb_type)){
+            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
+            scan= s->qscale ? h->field_scan : h->field_scan_q0;
+            dc_scan= luma_dc_field_scan;
+        }else{
+            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
+            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
+            dc_scan= luma_dc_zigzag_scan;
+        }
+
+        dquant= get_se_golomb(&s->gb);
+
+        if( dquant > 25 || dquant < -26 ){
+            av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
+            return -1;
+        }
+
+        s->qscale += dquant;
+        if(((unsigned)s->qscale) > 51){
+            if(s->qscale<0) s->qscale+= 52;
+            else            s->qscale-= 52;
+        }
+
+        h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
+        h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
+        if(IS_INTRA16x16(mb_type)){
+            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
+                return -1; //FIXME continue if partitioned and other return -1 too
+            }
+
+            assert((cbp&15) == 0 || (cbp&15) == 15);
+
+            if(cbp&15){
+                for(i8x8=0; i8x8<4; i8x8++){
+                    for(i4x4=0; i4x4<4; i4x4++){
+                        const int index= i4x4 + 4*i8x8;
+                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
+                            return -1;
+                        }
+                    }
+                }
+            }else{
+                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
+            }
+        }else{
+            for(i8x8=0; i8x8<4; i8x8++){
+                if(cbp & (1<<i8x8)){
+                    if(IS_8x8DCT(mb_type)){
+                        DCTELEM *buf = &h->mb[64*i8x8];
+                        uint8_t *nnz;
+                        for(i4x4=0; i4x4<4; i4x4++){
+                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
+                                                h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
+                                return -1;
+                        }
+                        nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
+                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
+                    }else{
+                        for(i4x4=0; i4x4<4; i4x4++){
+                            const int index= i4x4 + 4*i8x8;
+
+                            if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
+                                return -1;
+                            }
+                        }
+                    }
+                }else{
+                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
+                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+                }
+            }
+        }
+
+        if(cbp&0x30){
+            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
+                if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
+                    return -1;
+                }
+        }
+
+        if(cbp&0x20){
+            for(chroma_idx=0; chroma_idx<2; chroma_idx++){
+                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
+                for(i4x4=0; i4x4<4; i4x4++){
+                    const int index= 16 + 4*chroma_idx + i4x4;
+                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
+                        return -1;
+                    }
+                }
+            }
+        }else{
+            uint8_t * const nnz= &h->non_zero_count_cache[0];
+            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+        }
+    }else{
+        uint8_t * const nnz= &h->non_zero_count_cache[0];
+        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
+        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+    }
+    s->current_picture.qscale_table[mb_xy]= s->qscale;
+    write_back_non_zero_count(h);
+
+    if(MB_MBAFF){
+        h->ref_count[0] >>= 1;
+        h->ref_count[1] >>= 1;
+    }
+
+    return 0;
+}
+
--- a/h264data.h	Tue Jan 12 23:02:07 2010 +0000
+++ b/h264data.h	Wed Jan 13 01:59:19 2010 +0000
@@ -38,173 +38,6 @@
 static const uint8_t golomb_to_pict_type[5]=
 {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
 
-static const uint8_t golomb_to_intra4x4_cbp[48]={
- 47, 31, 15,  0, 23, 27, 29, 30,  7, 11, 13, 14, 39, 43, 45, 46,
- 16,  3,  5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44,  1,  2,  4,
-  8, 17, 18, 20, 24,  6,  9, 22, 25, 32, 33, 34, 36, 40, 38, 41
-};
-
-static const uint8_t golomb_to_inter_cbp[48]={
-  0, 16,  1,  2,  4,  8, 32,  3,  5, 10, 12, 15, 47,  7, 11, 13,
- 14,  6,  9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46,
- 17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41
-};
-
-static const uint8_t golomb_to_inter_cbp_gray[16]={
- 0, 1, 2, 4, 8, 3, 5,10,12,15, 7,11,13,14, 6, 9,
-};
-
-static const uint8_t golomb_to_intra4x4_cbp_gray[16]={
-15, 0, 7,11,13,14, 3, 5,10,12, 1, 2, 4, 8, 6, 9,
-};
-
-static const uint8_t chroma_dc_coeff_token_len[4*5]={
- 2, 0, 0, 0,
- 6, 1, 0, 0,
- 6, 6, 3, 0,
- 6, 7, 7, 6,
- 6, 8, 8, 7,
-};
-
-static const uint8_t chroma_dc_coeff_token_bits[4*5]={
- 1, 0, 0, 0,
- 7, 1, 0, 0,
- 4, 6, 1, 0,
- 3, 3, 2, 5,
- 2, 3, 2, 0,
-};
-
-static const uint8_t coeff_token_len[4][4*17]={
-{
-     1, 0, 0, 0,
-     6, 2, 0, 0,     8, 6, 3, 0,     9, 8, 7, 5,    10, 9, 8, 6,
-    11,10, 9, 7,    13,11,10, 8,    13,13,11, 9,    13,13,13,10,
-    14,14,13,11,    14,14,14,13,    15,15,14,14,    15,15,15,14,
-    16,15,15,15,    16,16,16,15,    16,16,16,16,    16,16,16,16,
-},
-{
-     2, 0, 0, 0,
-     6, 2, 0, 0,     6, 5, 3, 0,     7, 6, 6, 4,     8, 6, 6, 4,
-     8, 7, 7, 5,     9, 8, 8, 6,    11, 9, 9, 6,    11,11,11, 7,
-    12,11,11, 9,    12,12,12,11,    12,12,12,11,    13,13,13,12,
-    13,13,13,13,    13,14,13,13,    14,14,14,13,    14,14,14,14,
-},
-{
-     4, 0, 0, 0,
-     6, 4, 0, 0,     6, 5, 4, 0,     6, 5, 5, 4,     7, 5, 5, 4,
-     7, 5, 5, 4,     7, 6, 6, 4,     7, 6, 6, 4,     8, 7, 7, 5,
-     8, 8, 7, 6,     9, 8, 8, 7,     9, 9, 8, 8,     9, 9, 9, 8,
-    10, 9, 9, 9,    10,10,10,10,    10,10,10,10,    10,10,10,10,
-},
-{
-     6, 0, 0, 0,
-     6, 6, 0, 0,     6, 6, 6, 0,     6, 6, 6, 6,     6, 6, 6, 6,
-     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,
-     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,
-     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,     6, 6, 6, 6,
-}
-};
-
-static const uint8_t coeff_token_bits[4][4*17]={
-{
-     1, 0, 0, 0,
-     5, 1, 0, 0,     7, 4, 1, 0,     7, 6, 5, 3,     7, 6, 5, 3,
-     7, 6, 5, 4,    15, 6, 5, 4,    11,14, 5, 4,     8,10,13, 4,
-    15,14, 9, 4,    11,10,13,12,    15,14, 9,12,    11,10,13, 8,
-    15, 1, 9,12,    11,14,13, 8,     7,10, 9,12,     4, 6, 5, 8,
-},
-{
-     3, 0, 0, 0,
-    11, 2, 0, 0,     7, 7, 3, 0,     7,10, 9, 5,     7, 6, 5, 4,
-     4, 6, 5, 6,     7, 6, 5, 8,    15, 6, 5, 4,    11,14,13, 4,
-    15,10, 9, 4,    11,14,13,12,     8,10, 9, 8,    15,14,13,12,
-    11,10, 9,12,     7,11, 6, 8,     9, 8,10, 1,     7, 6, 5, 4,
-},
-{
-    15, 0, 0, 0,
-    15,14, 0, 0,    11,15,13, 0,     8,12,14,12,    15,10,11,11,
-    11, 8, 9,10,     9,14,13, 9,     8,10, 9, 8,    15,14,13,13,
-    11,14,10,12,    15,10,13,12,    11,14, 9,12,     8,10,13, 8,
-    13, 7, 9,12,     9,12,11,10,     5, 8, 7, 6,     1, 4, 3, 2,
-},
-{
-     3, 0, 0, 0,
-     0, 1, 0, 0,     4, 5, 6, 0,     8, 9,10,11,    12,13,14,15,
-    16,17,18,19,    20,21,22,23,    24,25,26,27,    28,29,30,31,
-    32,33,34,35,    36,37,38,39,    40,41,42,43,    44,45,46,47,
-    48,49,50,51,    52,53,54,55,    56,57,58,59,    60,61,62,63,
-}
-};
-
-static const uint8_t total_zeros_len[16][16]= {
-    {1,3,3,4,4,5,5,6,6,7,7,8,8,9,9,9},
-    {3,3,3,3,3,4,4,4,4,5,5,6,6,6,6},
-    {4,3,3,3,4,4,3,3,4,5,5,6,5,6},
-    {5,3,4,4,3,3,3,4,3,4,5,5,5},
-    {4,4,4,3,3,3,3,3,4,5,4,5},
-    {6,5,3,3,3,3,3,3,4,3,6},
-    {6,5,3,3,3,2,3,4,3,6},
-    {6,4,5,3,2,2,3,3,6},
-    {6,6,4,2,2,3,2,5},
-    {5,5,3,2,2,2,4},
-    {4,4,3,3,1,3},
-    {4,4,2,1,3},
-    {3,3,1,2},
-    {2,2,1},
-    {1,1},
-};
-
-static const uint8_t total_zeros_bits[16][16]= {
-    {1,3,2,3,2,3,2,3,2,3,2,3,2,3,2,1},
-    {7,6,5,4,3,5,4,3,2,3,2,3,2,1,0},
-    {5,7,6,5,4,3,4,3,2,3,2,1,1,0},
-    {3,7,5,4,6,5,4,3,3,2,2,1,0},
-    {5,4,3,7,6,5,4,3,2,1,1,0},
-    {1,1,7,6,5,4,3,2,1,1,0},
-    {1,1,5,4,3,3,2,1,1,0},
-    {1,1,1,3,3,2,2,1,0},
-    {1,0,1,3,2,1,1,1},
-    {1,0,1,3,2,1,1},
-    {0,1,1,2,1,3},
-    {0,1,1,1,1},
-    {0,1,1,1},
-    {0,1,1},
-    {0,1},
-};
-
-static const uint8_t chroma_dc_total_zeros_len[3][4]= {
-    { 1, 2, 3, 3,},
-    { 1, 2, 2, 0,},
-    { 1, 1, 0, 0,},
-};
-
-static const uint8_t chroma_dc_total_zeros_bits[3][4]= {
-    { 1, 1, 1, 0,},
-    { 1, 1, 0, 0,},
-    { 1, 0, 0, 0,},
-};
-
-static const uint8_t run_len[7][16]={
-    {1,1},
-    {1,2,2},
-    {2,2,2,2},
-    {2,2,2,3,3},
-    {2,2,3,3,3,3},
-    {2,3,3,3,3,3,3},
-    {3,3,3,3,3,3,3,4,5,6,7,8,9,10,11},
-};
-
-static const uint8_t run_bits[7][16]={
-    {1,0},
-    {1,1,0},
-    {3,2,1,0},
-    {3,2,1,1,0},
-    {3,2,3,2,1,0},
-    {3,0,1,3,2,5,4},
-    {7,6,5,4,3,2,1,1,1,1,1,1,1,1,1},
-};
-
-
 static const uint8_t zigzag_scan[16]={
  0+0*4, 1+0*4, 0+1*4, 0+2*4,
  1+1*4, 2+0*4, 3+0*4, 2+1*4,