# HG changeset patch
# User lorenm
# Date 1117746920 0
# Node ID 975074f04b951ee1366f807f78166624443e7c18
# Parent  a49f140179e960a0369448eabd0b7b49ec235dbb
decode H.264 with 8x8 transform.
deblocking is still incorrect with 8x8+cavlc

diff -r a49f140179e9 -r 975074f04b95 dsputil.c
--- a/dsputil.c	Thu Jun 02 20:45:35 2005 +0000
+++ b/dsputil.c	Thu Jun 02 21:15:20 2005 +0000
@@ -3685,6 +3685,7 @@
     }
 
     c->h264_idct_add= ff_h264_idct_add_c;
+    c->h264_idct8_add= ff_h264_idct8_add_c;
 
     c->get_pixels = get_pixels_c;
     c->diff_pixels = diff_pixels_c;
diff -r a49f140179e9 -r 975074f04b95 dsputil.h
--- a/dsputil.h	Thu Jun 02 20:45:35 2005 +0000
+++ b/dsputil.h	Thu Jun 02 21:15:20 2005 +0000
@@ -50,6 +50,7 @@
 void ff_fdct_mmx2(DCTELEM *block);
 void ff_fdct_sse2(DCTELEM *block);
 
+void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
 void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
@@ -325,6 +326,7 @@
 #define RECON_SHIFT 6
  
     void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
 } DSPContext;
 
 void dsputil_static_init(void);
diff -r a49f140179e9 -r 975074f04b95 h264.c
--- a/h264.c	Thu Jun 02 20:45:35 2005 +0000
+++ b/h264.c	Thu Jun 02 21:15:20 2005 +0000
@@ -109,6 +109,7 @@
     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
     int constrained_intra_pred; ///< constrained_intra_pred_flag
     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
+    int transform_8x8_mode;     ///< transform_8x8_mode_flag
 }PPS;
 
 /**
@@ -174,6 +175,7 @@
     int8_t intra4x4_pred_mode_cache[5*8];
     int8_t (*intra4x4_pred_mode)[8];
     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
+    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
     void (*pred8x8  [4+3])(uint8_t *src, int stride);
     void (*pred16x16[4+3])(uint8_t *src, int stride);
     unsigned int topleft_samples_available;
@@ -204,6 +206,11 @@
     int mv_cache_clean[2];
 
     /**
+     * number of neighbors (top and/or left) that used 8x8 dct
+     */
+    int neighbor_transform_size;
+
+    /**
      * block_offset[ 0..23] for frame macroblocks
      * block_offset[24..47] for field macroblocks
      */
@@ -229,6 +236,8 @@
      */
     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 
+    uint16_t (*dequant8_coeff)[64];
+
     int slice_num;
     uint8_t *slice_table_base;
     uint8_t *slice_table;      ///< slice_table_base + mb_stride + 1
@@ -318,7 +327,7 @@
      * Cabac
      */
     CABACContext cabac;
-    uint8_t      cabac_state[399];
+    uint8_t      cabac_state[460];
     int          cabac_init_idc;
 
     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
@@ -841,6 +850,8 @@
         }
     }
 #endif
+
+    h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 }
 
 static inline void write_back_intra_pred_mode(H264Context *h){
@@ -2185,11 +2196,7 @@
 static void pred8x8_128_dc_c(uint8_t *src, int stride){
     int i;
 
-    for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]= 
-        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
-    }
-    for(i=4; i<8; i++){
+    for(i=0; i<8; i++){
         ((uint32_t*)(src+i*stride))[0]= 
         ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
     }
@@ -2298,6 +2305,248 @@
   }
 }
 
+#define SRC(x,y) src[(x)+(y)*stride] /* sample at column x, row y, relative to src */
+#define PL(y) /* declares l<y>: left-neighbour sample of row y after a (1,2,1)/4 smoothing filter */ \
+    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_LEFT /* declares l0..l7; l0 substitutes SRC(-1,0) for the corner when !has_topleft */ \
+    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
+                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
+    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
+    const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2;
+
+#define PT(x) /* declares t<x>: top-neighbour sample of column x after the same (1,2,1)/4 filter */ \
+    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_TOP /* declares t0..t7; missing corner / top-right taps reuse SRC(0,-1) / SRC(7,-1) */ \
+    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
+                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
+    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
+    const int t7 = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
+                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2; \
+
+#define PTR(x) /* same filter as PT(x), but assigns to an already declared t<x> */ \
+    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_TOPRIGHT /* declares t8..t15; all set to the unfiltered SRC(7,-1) when !has_topright */ \
+    int t8, t9, t10, t11, t12, t13, t14, t15; \
+    if(has_topright) { \
+        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
+        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
+    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
+
+#define PREDICT_8x8_LOAD_TOPLEFT /* declares lt: corner sample SRC(-1,-1) filtered with its left and top neighbours */ \
+    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2;
+
+#define PREDICT_8x8_DC(v) /* stores the 32-bit pattern v twice per row for 8 rows; advances src by stride each row */ \
+    int y; \
+    for( y = 0; y < 8; y++ ) { \
+        ((uint32_t*)src)[0] = \
+        ((uint32_t*)src)[1] = v; \
+        src += stride; \
+    }
+
+static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // constant fill: every byte of the 8x8 block becomes 0x80; the has_* flags are not read
+    PREDICT_8x8_DC(0x80808080);
+}
+static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // fills the 8x8 block with the rounded mean of the 8 smoothed left neighbours l0..l7
+    PREDICT_8x8_LOAD_LEFT;
+    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; // mean replicated into 4 bytes
+    PREDICT_8x8_DC(dc);
+}
+static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // fills the 8x8 block with the rounded mean of the 8 smoothed top neighbours t0..t7
+    PREDICT_8x8_LOAD_TOP;
+    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; // mean replicated into 4 bytes
+    PREDICT_8x8_DC(dc);
+}
+static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // fills the 8x8 block with the rounded mean of all 16 smoothed neighbours (l0..l7 and t0..t7)
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOP;
+    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
+                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; // mean replicated into 4 bytes
+    PREDICT_8x8_DC(dc);
+}
+static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // row y of the 8x8 block is filled with the smoothed left neighbour l<y>
+    PREDICT_8x8_LOAD_LEFT;
+#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
+               ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
+    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
+#undef ROW
+}
+static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // row 0 receives the smoothed top neighbours t0..t7; rows 1..7 are copies of row 0
+    int y;
+    PREDICT_8x8_LOAD_TOP;
+    src[0] = t0;
+    src[1] = t1;
+    src[2] = t2;
+    src[3] = t3;
+    src[4] = t4;
+    src[5] = t5;
+    src[6] = t6;
+    src[7] = t7;
+    for( y = 1; y < 8; y++ )
+        *(uint64_t*)(src+y*stride) = *(uint64_t*)src; // one 8-byte store per row
+}
+static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // samples with equal x+y share one 3-tap filtered value of the smoothed top row t0..t15
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_TOPRIGHT;
+    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
+    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
+    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
+    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
+    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
+    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
+    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
+    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
+    SRC(7,7)= (t14 + 3*t15 + 2) >> 2; // last tap reuses t15: there is no t16
+}
+static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // samples with equal x-y share one 3-tap value taken from the left column, the corner lt, or the top row
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
+    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
+    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
+    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
+    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
+    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
+    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
+    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
+    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
+  
+}
+static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // samples with equal 2*x-y share a value: 2-tap averages or 3-tap filters of the top row, corner and left column
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
+    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
+    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
+    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
+    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
+    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
+    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
+    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
+    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
+    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
+    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
+    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
+    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
+    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
+    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(7,0)= (t6 + t7 + 1) >> 1;
+}
+static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // samples with equal 2*y-x share a value: 2-tap averages or 3-tap filters of the left column, corner and top row
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    SRC(0,7)= (l6 + l7 + 1) >> 1;
+    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
+    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
+    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
+    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
+    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
+    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
+    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
+    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
+    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
+    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
+    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
+    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
+    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
+    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
+    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
+}
+static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // samples with equal 2*x+y share a value: even rows use 2-tap averages, odd rows 3-tap filters of t0..t12
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_TOPRIGHT;
+    SRC(0,0)= (t0 + t1 + 1) >> 1;
+    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
+    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
+    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
+    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
+    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
+    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
+    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
+    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
+    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
+    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
+    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
+    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
+    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
+    SRC(7,6)= (t10 + t11 + 1) >> 1;
+    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
+}
+static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{ // samples with equal x+2*y share a value built from the smoothed left column l0..l7 only
+    PREDICT_8x8_LOAD_LEFT;
+    SRC(0,0)= (l0 + l1 + 1) >> 1;
+    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
+    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
+    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
+    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
+    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
+    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
+    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
+    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; // x+2*y == 13: last tap reuses l7
+    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
+    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
+    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
+    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; // every position with x+2*y > 13 is plain l7
+}
+#undef PREDICT_8x8_LOAD_LEFT
+#undef PREDICT_8x8_LOAD_TOP
+#undef PREDICT_8x8_LOAD_TOPLEFT
+#undef PREDICT_8x8_LOAD_TOPRIGHT
+#undef PREDICT_8x8_DC
+#undef PTR
+#undef PT
+#undef PL
+#undef SRC
+
 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int src_x_offset, int src_y_offset,
@@ -2609,6 +2858,19 @@
     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
 
+    h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
+    h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
+    h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
+    h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
+    h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
+    h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
+    h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
+    h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
+    h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
+    h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
+    h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
+    h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
+
     h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
     h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
     h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
@@ -2642,6 +2904,8 @@
     av_freep(&h->mb2b_xy);
     av_freep(&h->mb2b8_xy);
 
+    av_freep(&h->dequant8_coeff);
+
     av_freep(&h->s.obmc_scratchpad);
 }
 
@@ -2652,7 +2916,7 @@
 static int alloc_tables(H264Context *h){
     MpegEncContext * const s = &h->s;
     const int big_mb_num= s->mb_stride * (s->mb_height+1);
-    int x,y;
+    int x,y,q;
 
     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
 
@@ -2685,6 +2949,17 @@
         }
     }
 
+    CHECKED_ALLOCZ(h->dequant8_coeff, 52*64 * sizeof(uint16_t));
+    for(q=0; q<52; q++){
+        int shift = div6[q];
+        int idx = rem6[q];
+        if(shift >= 2) // qp<12 are shifted during dequant
+            shift -= 2;
+        for(x=0; x<64; x++)
+            h->dequant8_coeff[q][x] = dequant8_coeff_init[idx][
+                dequant8_coeff_init_scan[(x>>1)&12 | x&3] ] << shift;
+    }
+
     s->obmc_scratchpad = NULL;
 
     return 0;
@@ -2832,6 +3107,9 @@
     if(deblock_top){
         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
+        if(s->mb_x+1 < s->mb_width){ // last MB of a row has no right neighbour; testing mb_x alone was always true
+            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1); // top-right 8 samples
+        }
     }
 
     if(!(s->flags&CODEC_FLAG_GRAY)){
@@ -3011,6 +3289,16 @@
 
             if(IS_INTRA4x4(mb_type)){
                 if(!s->encoding){
+                    if(IS_8x8DCT(mb_type)){
+                        for(i=0; i<16; i+=4){
+                            uint8_t * const ptr= dest_y + block_offset[i];
+                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
+                            h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
+                                                   (h->topright_samples_available<<(i+1))&0x8000, linesize);
+                            if(h->non_zero_count_cache[ scan8[i] ])
+                                s->dsp.h264_idct8_add(ptr, h->mb + i*16, linesize);
+                        }
+                    }else
                     for(i=0; i<16; i++){
                         uint8_t * const ptr= dest_y + block_offset[i];
                         uint8_t *topright;
@@ -3023,10 +3311,7 @@
                             if(!topright_avail){
                                 tr= ptr[3 - linesize]*0x01010101;
                                 topright= (uint8_t*) &tr;
-                            }else if(i==5 && h->deblocking_filter){
-                                tr= *(uint32_t*)h->top_borders[h->mb_aff_frame ? IS_INTERLACED(mb_type) ? bottom : 1 : 0][mb_x+1];
-                                topright= (uint8_t*) &tr;
-                            }else
+                            }else 
                                 topright= ptr + 4 - linesize;
                         }else
                             topright= NULL;
@@ -3071,10 +3356,13 @@
 
         if(!IS_INTRA4x4(mb_type)){
             if(s->codec_id == CODEC_ID_H264){
-                for(i=0; i<16; i++){
+                const int di = IS_8x8DCT(mb_type) ? 4 : 1;
+                void (*idct)(uint8_t *dst, DCTELEM *block, int stride) =
+                    IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add;
+                for(i=0; i<16; i+=di){
                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                         uint8_t * const ptr= dest_y + block_offset[i];
-                        s->dsp.h264_idct_add(ptr, h->mb + i*16, linesize);
+                        idct(ptr, h->mb + i*16, linesize);
                     }
                 }
             }else{
@@ -4075,6 +4363,16 @@
     return log-1;
 }
 
+static inline int get_dct8x8_allowed(H264Context *h){ // returns 0 if any of the 4 sub_mb_type entries fails the checks, else 1
+    int i;
+    for(i=0; i<4; i++){
+        if(!IS_SUB_8X8(h->sub_mb_type[i]) // rejected unless IS_SUB_8X8 holds for this sub-macroblock type
+           || !h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])) // direct needs direct_8x8_inference_flag
+            return 0;
+    }
+    return 1;
+}
+
 /**
  * decodes a residual block.
  * @param n block index
@@ -4082,9 +4380,8 @@
  * @param max_coeff number of coefficients in the block
  * @return <0 if an error occured
  */
-static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, int qp, int max_coeff){
+static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint16_t *qmul, int max_coeff){
     MpegEncContext * const s = &h->s;
-    const uint16_t *qmul= dequant_coeff[qp];
     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
     int level[16], run[16];
     int suffix_length, zeros_left, coeff_num, coeff_token, total_coeff, i, trailing_ones;
@@ -4272,6 +4569,7 @@
     MpegEncContext * const s = &h->s;
     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
     int mb_type, partition_count, cbp;
+    int dct8x8_allowed= h->pps.transform_8x8_mode;
 
     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?    
 
@@ -4328,7 +4626,6 @@
     if(h->mb_field_decoding_flag)
         mb_type |= MB_TYPE_INTERLACED;
 
-    s->current_picture.mb_type[mb_xy]= mb_type;
     h->slice_table[ mb_xy ]= h->slice_num;
     
     if(IS_INTRA_PCM(mb_type)){
@@ -4366,6 +4663,7 @@
         // All coeffs are present
         memset(h->non_zero_count[mb_xy], 16, 16);
         
+        s->current_picture.mb_type[mb_xy]= mb_type;
         return 0;
     }
         
@@ -4376,9 +4674,14 @@
 //            init_top_left_availability(h);
             if(IS_INTRA4x4(mb_type)){
                 int i;
+                int di = 1;
+                if(dct8x8_allowed && get_bits1(&s->gb)){
+                    mb_type |= MB_TYPE_8x8DCT;
+                    di = 4;
+                }
 
 //                fill_intra4x4_pred_table(h);
-                for(i=0; i<16; i++){
+                for(i=0; i<16; i+=di){
                     const int mode_coded= !get_bits1(&s->gb);
                     const int predicted_mode=  pred_intra_mode(h, i);
                     int mode;
@@ -4393,7 +4696,10 @@
                         mode= predicted_mode;
                     }
                     
-                    h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
+                    if(di==4)
+                        fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
+                    else
+                        h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
                 }
                 write_back_intra_pred_mode(h);
                 if( check_intra4x4_pred_mode(h) < 0)
@@ -4454,6 +4760,9 @@
             }
         }
         
+        if(dct8x8_allowed)
+            dct8x8_allowed = get_dct8x8_allowed(h);
+        
         for(list=0; list<2; list++){
             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
             if(ref_count == 0) continue;
@@ -4501,7 +4810,7 @@
         }
     }else if(IS_DIRECT(mb_type)){
         pred_direct_motion(h, &mb_type);
-        s->current_picture.mb_type[mb_xy]= mb_type;
+        dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
     }else{
         int list, mx, my, i;
          //FIXME we should set ref_idx_l? to 0 if we use that later ...
@@ -4597,6 +4906,12 @@
             cbp= golomb_to_inter_cbp[cbp];
     }
 
+    if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
+        if(get_bits1(&s->gb))
+            mb_type |= MB_TYPE_8x8DCT;
+    }
+    s->current_picture.mb_type[mb_xy]= mb_type;
+
     if(cbp || IS_INTRA16x16(mb_type)){
         int i8x8, i4x4, chroma_idx;
         int chroma_qp, dquant;
@@ -4628,7 +4943,7 @@
         
         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
         if(IS_INTRA16x16(mb_type)){
-            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, s->qscale, 16) < 0){
+            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, dequant_coeff[s->qscale], 16) < 0){
                 return -1; //FIXME continue if partitioned and other return -1 too
             }
 
@@ -4638,7 +4953,7 @@
                 for(i8x8=0; i8x8<4; i8x8++){
                     for(i4x4=0; i4x4<4; i4x4++){
                         const int index= i4x4 + 4*i8x8;
-                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, s->qscale, 15) < 0 ){
+                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, dequant_coeff[s->qscale], 15) < 0 ){
                             return -1;
                         }
                     }
@@ -4649,11 +4964,27 @@
         }else{
             for(i8x8=0; i8x8<4; i8x8++){
                 if(cbp & (1<<i8x8)){
-                    for(i4x4=0; i4x4<4; i4x4++){
-                        const int index= i4x4 + 4*i8x8;
+                    if(IS_8x8DCT(mb_type)){
+                        DCTELEM *buf = &h->mb[64*i8x8];
+                        uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ]; // declared at block start, before any statement
+                        for(i4x4=0; i4x4<4; i4x4++){ // each pass uses its own 16-entry slice of the scan table, same buf
+                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4,
+                                                h->dequant8_coeff[s->qscale], 16) <0 )
+                                return -1;
+                        }
+                        if(s->qscale < 12){ // for qp<12 the table keeps the full scale, so apply the rounded >>2 here
+                            int i;
+                            for(i=0; i<64; i++)
+                                buf[i] = (buf[i] + 2) >> 2;
+                        }
+                        nnz[0] |= nnz[1] | nnz[8] | nnz[9]; // OR the four per-pass counts into the first cache entry
+                    }else{
+                        for(i4x4=0; i4x4<4; i4x4++){
+                            const int index= i4x4 + 4*i8x8;
                         
-                        if( decode_residual(h, gb, h->mb + 16*index, index, scan, s->qscale, 16) <0 ){
-                            return -1;
+                            if( decode_residual(h, gb, h->mb + 16*index, index, scan, dequant_coeff[s->qscale], 16) <0 ){
+                                return -1;
+                            }
                         }
                     }
                 }else{
@@ -4665,7 +4996,7 @@
         
         if(cbp&0x30){
             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
-                if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, chroma_qp, 4) < 0){
+                if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, dequant_coeff[chroma_qp], 4) < 0){
                     return -1;
                 }
         }
@@ -4674,7 +5005,7 @@
             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
                 for(i4x4=0; i4x4<4; i4x4++){
                     const int index= 16 + 4*chroma_idx + i4x4;
-                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, chroma_qp, 15) < 0){
+                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, dequant_coeff[chroma_qp], 15) < 0){
                         return -1;
                     }
                 }
@@ -5012,6 +5343,10 @@
     return type;
 }
 
+static inline int decode_cabac_mb_transform_size( H264Context *h ) { // decodes one CABAC bin and returns it
+    return get_cabac( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] ); // state 399 + count of top/left neighbours using 8x8 dct
+}
+
 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
     int refa = h->ref_cache[list][scan8[n] - 1];
     int refb = h->ref_cache[list][scan8[n] - 8];
@@ -5107,15 +5442,30 @@
     return ctx + 4 * cat;
 }
 
-static int inline decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int qp, int max_coeff) {
+static int inline decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint16_t *qmul, int max_coeff) {
     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
-    const uint16_t *qmul= dequant_coeff[qp];
     static const int significant_coeff_flag_field_offset[2] = { 105, 277 };
     static const int last_significant_coeff_flag_field_offset[2] = { 166, 338 };
-    static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
-    static const int coeff_abs_level_m1_offset[5] = {227+ 0, 227+10, 227+20, 227+30, 227+39 };
-
-    int index[16];
+    static const int significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 297 };
+    static const int last_significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 251 };
+    static const int coeff_abs_level_m1_offset[6] = { 227+0, 227+10, 227+20, 227+30, 227+39, 426 };
+    static const int identity[15] = {
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+    };
+    static const int significant_coeff_flag_offset_8x8[63] = {
+        0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
+        4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
+        7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
+       12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12
+    };
+    static const int last_coeff_flag_offset_8x8[63] = {
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+        5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
+    };
+
+    int index[64];
 
     int i, last;
     int coeff_count = 0;
@@ -5123,27 +5473,45 @@
     int abslevel1 = 1;
     int abslevelgt1 = 0;
 
+    const int* significant_coeff_ctx_offset;
+    const int* last_coeff_ctx_offset;
+    const int significant_coeff_ctx_base = significant_coeff_flag_offset[cat]
+        + significant_coeff_flag_field_offset[h->mb_field_decoding_flag];
+    const int last_coeff_ctx_base = last_significant_coeff_flag_offset[cat]
+        + last_significant_coeff_flag_field_offset[h->mb_field_decoding_flag];
+
     /* cat: 0-> DC 16x16  n = 0
      *      1-> AC 16x16  n = luma4x4idx
      *      2-> Luma4x4   n = luma4x4idx
      *      3-> DC Chroma n = iCbCr
      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
+     *      5-> Luma8x8   n = 4 * luma8x8idx
      */
 
     /* read coded block flag */
-    if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
-        if( cat == 1 || cat == 2 )
-            h->non_zero_count_cache[scan8[n]] = 0;
-        else if( cat == 4 )
-            h->non_zero_count_cache[scan8[16+n]] = 0;
-
-        return 0;
+    if( cat == 5 ) {
+        significant_coeff_ctx_offset = significant_coeff_flag_offset_8x8;
+        last_coeff_ctx_offset = last_coeff_flag_offset_8x8;
+    } else {
+        if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
+            if( cat == 1 || cat == 2 )
+                h->non_zero_count_cache[scan8[n]] = 0;
+            else if( cat == 4 )
+                h->non_zero_count_cache[scan8[16+n]] = 0;
+
+            return 0;
+        }
+
+        significant_coeff_ctx_offset = 
+        last_coeff_ctx_offset = identity;
     }
 
     for(last= 0; last < max_coeff - 1; last++) {
-        if( get_cabac( &h->cabac, &h->cabac_state[significant_coeff_flag_field_offset[h->mb_field_decoding_flag]+significant_coeff_flag_offset[cat]+last] )) {
+        int sig_ctx = significant_coeff_ctx_base + significant_coeff_ctx_offset[last];
+        if( get_cabac( &h->cabac, &h->cabac_state[sig_ctx] )) {
+            int last_ctx = last_coeff_ctx_base + last_coeff_ctx_offset[last];
             index[coeff_count++] = last;
-            if( get_cabac( &h->cabac, &h->cabac_state[last_significant_coeff_flag_field_offset[h->mb_field_decoding_flag]+significant_coeff_flag_offset[cat]+last] ) ) {
+            if( get_cabac( &h->cabac, &h->cabac_state[last_ctx] ) ) {
                 last= max_coeff;
                 break;
             }
@@ -5160,9 +5528,11 @@
         h->non_zero_count_cache[scan8[n]] = coeff_count;
     else if( cat == 3 )
         h->cbp_table[mb_xy] |= 0x40 << n;
+    else if( cat == 4 )
+        h->non_zero_count_cache[scan8[16+n]] = coeff_count;
     else {
-        assert( cat == 4 );
-        h->non_zero_count_cache[scan8[16+n]] = coeff_count;
+        assert( cat == 5 );
+        fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1);
     }
 
     for( i = coeff_count - 1; i >= 0; i-- ) {
@@ -5247,6 +5617,7 @@
     MpegEncContext * const s = &h->s;
     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
     int mb_type, partition_count, cbp = 0;
+    int dct8x8_allowed= h->pps.transform_8x8_mode;
 
     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
 
@@ -5305,7 +5676,6 @@
     if(h->mb_field_decoding_flag)
         mb_type |= MB_TYPE_INTERLACED;
 
-    s->current_picture.mb_type[mb_xy]= mb_type;
     h->slice_table[ mb_xy ]= h->slice_num;
 
     if(IS_INTRA_PCM(mb_type)) {
@@ -5351,19 +5721,29 @@
         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
         // All coeffs are present
         memset(h->non_zero_count[mb_xy], 16, 16);
+        s->current_picture.mb_type[mb_xy]= mb_type;
         return 0;
     }
 
     fill_caches(h, mb_type, 0);
 
     if( IS_INTRA( mb_type ) ) {
+        int i;
         if( IS_INTRA4x4( mb_type ) ) {
-            int i;
-            for( i = 0; i < 16; i++ ) {
-                int pred = pred_intra_mode( h, i );
-                h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
+            if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
+                mb_type |= MB_TYPE_8x8DCT;
+                for( i = 0; i < 16; i+=4 ) {
+                    int pred = pred_intra_mode( h, i );
+                    int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
+                    fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
+                }
+            } else {
+                for( i = 0; i < 16; i++ ) {
+                    int pred = pred_intra_mode( h, i );
+                    h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
 
                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
+                }
             }
             write_back_intra_pred_mode(h);
             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
@@ -5420,6 +5800,9 @@
             }
         }
 
+        if(dct8x8_allowed)
+            dct8x8_allowed = get_dct8x8_allowed(h);
+
         for(list=0; list<2; list++){
             for(i=0; i<4; i++){
                 if(IS_DIRECT(h->sub_mb_type[i])){
@@ -5484,9 +5867,9 @@
         }
     } else if( IS_DIRECT(mb_type) ) {
         pred_direct_motion(h, &mb_type);
-        s->current_picture.mb_type[mb_xy]= mb_type;
         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+        dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
     } else {
         int list, mx, my, i, mpx, mpy;
         if(IS_16X16(mb_type)){
@@ -5585,6 +5968,12 @@
 
     h->cbp_table[mb_xy] = cbp;
 
+    if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
+        if( decode_cabac_mb_transform_size( h ) )
+            mb_type |= MB_TYPE_8x8DCT;
+    }
+    s->current_picture.mb_type[mb_xy]= mb_type;
+
     if( cbp || IS_INTRA16x16( mb_type ) ) {
         const uint8_t *scan, *dc_scan;
         int dqp;
@@ -5608,12 +5997,12 @@
         if( IS_INTRA16x16( mb_type ) ) {
             int i;
             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
-            if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, s->qscale, 16) < 0)
+            if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, dequant_coeff[s->qscale], 16) < 0)
                 return -1;
             if( cbp&15 ) {
                 for( i = 0; i < 16; i++ ) {
                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
-                    if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, s->qscale, 15) < 0 )
+                    if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, dequant_coeff[s->qscale], 15) < 0 )
                         return -1;
                 }
             } else {
@@ -5623,10 +6012,20 @@
             int i8x8, i4x4;
             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
                 if( cbp & (1<<i8x8) ) {
+                    if( IS_8x8DCT(mb_type) ) {
+                        if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
+                                zigzag_scan8x8, h->dequant8_coeff[s->qscale], 64) < 0 )
+                            return -1;
+                        if(s->qscale < 12){
+                            int i;
+                            for(i=0; i<64; i++)
+                                h->mb[64*i8x8+i] = (h->mb[64*i8x8+i] + 2) >> 2;
+                        }
+                    } else
                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
                         const int index = 4*i8x8 + i4x4;
                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
-                        if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, s->qscale, 16) < 0 )
+                        if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, dequant_coeff[s->qscale], 16) < 0 )
                             return -1;
                     }
                 } else {
@@ -5640,7 +6039,7 @@
             int c;
             for( c = 0; c < 2; c++ ) {
                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
-                if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, h->chroma_qp, 4) < 0)
+                if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, dequant_coeff[h->chroma_qp], 4) < 0)
                     return -1;
             }
         }
@@ -5651,7 +6050,7 @@
                 for( i = 0; i < 4; i++ ) {
                     const int index = 16 + 4 * c + i;
                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
-                    if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->chroma_qp, 15) < 0)
+                    if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, dequant_coeff[h->chroma_qp], 15) < 0)
                         return -1;
                 }
             }
@@ -6018,6 +6417,7 @@
                 IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
                 bS[i] = 4;
             } else if( h->non_zero_count_cache[b_idx] != 0 ||
+                /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
                 h->non_zero_count_cache[bn_idx] != 0 ) {
                 bS[i] = 2;
             } else {
@@ -6073,6 +6473,9 @@
             int bS[4];
             int qp;
 
+            if( (edge&1) && IS_8x8DCT(s->current_picture.mb_type[mb_xy]) )
+                continue;
+
             if (h->mb_aff_frame && (dir == 1) && (edge == 0) && ((mb_y & 1) == 0)
                 && !IS_INTERLACED(s->current_picture.mb_type[mb_xy])
                 && IS_INTERLACED(s->current_picture.mb_type[mbn_xy])
@@ -6220,7 +6623,7 @@
                                s->gb.buffer + get_bits_count(&s->gb)/8,
                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
         /* calculate pre-state */
-        for( i= 0; i < 399; i++ ) {
+        for( i= 0; i < 460; i++ ) {
             int pre;
             if( h->slice_type == I_TYPE )
                 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
@@ -6488,6 +6891,18 @@
     sps->profile_idc= profile_idc;
     sps->level_idc= level_idc;
 
+    if(sps->profile_idc >= 100){ //high profile
+        if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
+            get_bits1(&s->gb);  //residual_color_transform_flag
+        get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
+        get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
+        get_bits1(&s->gb);      //qpprime_y_zero_transform_bypass_flag
+        if(get_bits1(&s->gb)){  //seq_scaling_matrix_present_flag
+            av_log(h->s.avctx, AV_LOG_ERROR, "custom scaling matrix not implemented\n");
+            return -1;
+        }
+    }
+
     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
     sps->poc_type= get_ue_golomb(&s->gb);
     
@@ -6562,7 +6977,7 @@
     return 0;
 }
 
-static inline int decode_picture_parameter_set(H264Context *h){
+static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
     MpegEncContext * const s = &h->s;
     int pps_id= get_ue_golomb(&s->gb);
     PPS *pps= &h->pps_buffer[pps_id];
@@ -6623,9 +7038,18 @@
     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
     pps->constrained_intra_pred= get_bits1(&s->gb);
     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
+
+    if(get_bits_count(&s->gb) < bit_length){
+        pps->transform_8x8_mode= get_bits1(&s->gb);
+        if(get_bits1(&s->gb)){  //pic_scaling_matrix_present_flag
+            av_log(h->s.avctx, AV_LOG_ERROR, "custom scaling matrix not implemented\n");
+            return -1;
+        }
+        get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
+    }
     
     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
-        av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s\n", 
+        av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n", 
                pps_id, pps->sps_id,
                pps->cabac ? "CABAC" : "CAVLC",
                pps->slice_group_count,
@@ -6634,7 +7058,8 @@
                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
                pps->deblocking_filter_parameters_present ? "LPAR" : "",
                pps->constrained_intra_pred ? "CONSTR" : "",
-               pps->redundant_pic_cnt_present ? "REDU" : ""
+               pps->redundant_pic_cnt_present ? "REDU" : "",
+               pps->transform_8x8_mode ? "8x8DCT" : ""
                );
     }
     
@@ -6800,7 +7225,7 @@
         case NAL_PPS:
             init_get_bits(&s->gb, ptr, bit_length);
             
-            decode_picture_parameter_set(h);
+            decode_picture_parameter_set(h, bit_length);
 
             break;
         case NAL_PICTURE_DELIMITER:
diff -r a49f140179e9 -r 975074f04b95 h264data.h
--- a/h264data.h	Thu Jun 02 20:45:35 2005 +0000
+++ b/h264data.h	Thu Jun 02 21:15:20 2005 +0000
@@ -306,8 +306,50 @@
  (0+1*2)*16, (1+1*2)*16,  //FIXME
 };
 
+static const uint8_t zigzag_scan8x8[64]={ // 64 offsets (0..63), each written lo + hi*8; passed as the scan table to decode_cabac_residual() for cat 5 (Luma8x8)
+ 0+0*8, 1+0*8, 0+1*8, 0+2*8,
+ 1+1*8, 2+0*8, 3+0*8, 2+1*8,
+ 1+2*8, 0+3*8, 0+4*8, 1+3*8,
+ 2+2*8, 3+1*8, 4+0*8, 5+0*8,
+ 4+1*8, 3+2*8, 2+3*8, 1+4*8,
+ 0+5*8, 0+6*8, 1+5*8, 2+4*8,
+ 3+3*8, 4+2*8, 5+1*8, 6+0*8,
+ 7+0*8, 6+1*8, 5+2*8, 4+3*8,
+ 3+4*8, 2+5*8, 1+6*8, 0+7*8,
+ 1+7*8, 2+6*8, 3+5*8, 4+4*8,
+ 5+3*8, 6+2*8, 7+1*8, 7+2*8,
+ 6+3*8, 5+4*8, 4+5*8, 3+6*8,
+ 2+7*8, 3+7*8, 4+6*8, 5+5*8,
+ 6+4*8, 7+3*8, 7+4*8, 6+5*8,
+ 5+6*8, 4+7*8, 5+7*8, 6+6*8,
+ 7+5*8, 7+6*8, 6+7*8, 7+7*8,
+};
+
+// zigzag_scan8x8_cavlc[i] = zigzag_scan8x8[(i/16) + 4*(i%16)], i.e. zigzag_scan8x8[j] = zigzag_scan8x8_cavlc[(j/4) + 16*(j%4)]
+static const uint8_t zigzag_scan8x8_cavlc[64]={
+ 0+0*8, 1+1*8, 1+2*8, 2+2*8,
+ 4+1*8, 0+5*8, 3+3*8, 7+0*8,
+ 3+4*8, 1+7*8, 5+3*8, 6+3*8,
+ 2+7*8, 6+4*8, 5+6*8, 7+5*8,
+ 1+0*8, 2+0*8, 0+3*8, 3+1*8,
+ 3+2*8, 0+6*8, 4+2*8, 6+1*8,
+ 2+5*8, 2+6*8, 6+2*8, 5+4*8,
+ 3+7*8, 7+3*8, 4+7*8, 7+6*8,
+ 0+1*8, 3+0*8, 0+4*8, 4+0*8,
+ 2+3*8, 1+5*8, 5+1*8, 5+2*8,
+ 1+6*8, 3+5*8, 7+1*8, 4+5*8,
+ 4+6*8, 7+4*8, 5+7*8, 6+7*8,
+ 0+2*8, 2+1*8, 1+3*8, 5+0*8,
+ 1+4*8, 2+4*8, 6+0*8, 4+3*8,
+ 0+7*8, 4+4*8, 7+2*8, 3+6*8,
+ 5+5*8, 6+5*8, 6+6*8, 7+7*8,
+};
+
 #define MB_TYPE_REF0       MB_TYPE_ACPRED //dirty but it fits in 16bit
+#define MB_TYPE_8x8DCT     0x01000000
 #define IS_REF0(a)       ((a)&MB_TYPE_REF0)
+#define IS_8x8DCT(a)     ((a)&MB_TYPE_8x8DCT)
+
 
 typedef struct IMbInfo{
     uint16_t type;
@@ -472,6 +514,18 @@
 //{4608,5888,4608,5888, 5888,7424,5888,7424, 4608,5888,4608,5888, 5888,7424,5888,7424, },
 };
 
+static const int dequant8_coeff_init_scan[16] = { // 16 entries, each in 0..5; presumably column selectors into dequant8_coeff_init -- the consumer is not visible here, TODO confirm
+  0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
+};
+static const int dequant8_coeff_init[6][6]={ // 6 rows of 6 base values; NOTE(review): rows look like one per (qp % 6) -- confirm against the code that fills h->dequant8_coeff
+  {20,18,32,19,25,24},
+  {22,19,35,21,28,26},
+  {26,23,42,24,33,31},
+  {28,25,45,26,35,33},
+  {32,28,51,30,40,38},
+  {36,32,58,34,46,43},
+};
+
 #define QUANT_SHIFT 22
 
 static const int quant_coeff[52][16]={
@@ -561,7 +615,7 @@
 
 /* Cabac pre state table */
 
-static const int cabac_context_init_I[399][2] =
+static const int cabac_context_init_I[460][2] =
 {
     /* 0 - 10 */
     { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
@@ -707,10 +761,30 @@
     { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
     { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
     { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
-    { 29, 39 },  { 19, 66 }
+    { 29, 39 },  { 19, 66 },
+
+    /* 399 -> 435 */
+    {  31,  21 }, {  31,  31 }, {  25,  50 },
+    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
+    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
+    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
+    { -23,  68 }, { -24,  50 }, { -11,  74 }, {  23, -13 },
+    {  26, -13 }, {  40, -15 }, {  49, -14 }, {  44,   3 },
+    {  45,   6 }, {  44,  34 }, {  33,  54 }, {  19,  82 },
+    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
+    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
+    {   0,  68 }, {  -9,  92 },
+
+    /* 436 -> 459 */
+    { -14, 106 }, { -13,  97 }, { -15,  90 }, { -12,  90 },
+    { -18,  88 }, { -10,  73 }, {  -9,  79 }, { -14,  86 },
+    { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
+    {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
+    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
 };
 
-static const int cabac_context_init_PB[3][399][2] =
+static const int cabac_context_init_PB[3][460][2] =
 {
     /* i_cabac_init_idc == 0 */
     {
@@ -847,7 +921,25 @@
         {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
         {  11,  86 },
 
+        /* 399 - 435 */
+        {  12,  40 }, {  11,  51 }, {  14,  59 },
+        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
+        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
+        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
+        { -16,  66 }, { -22,  65 }, { -20,  63 }, {   9,  -2 },
+        {  26,  -9 }, {  33,  -9 }, {  39,  -7 }, {  41,  -2 },
+        {  45,   3 }, {  49,   9 }, {  45,  27 }, {  36,  59 },
+        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
+        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
+        {  -8,  66 }, {  -8,  76 },
 
+        /* 436 - 459 */
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
+        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
+        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
     },
 
     /* i_cabac_init_idc == 1 */
@@ -985,6 +1077,25 @@
         {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
         {  11,  83 },
 
+        /* 399 - 435 */
+        {  24,  32 }, {  21,  49 }, {  21,  54 },
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,   8 },
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  17, -10 },
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
+        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
+        {  -2,  52 }, {  -9,  57 }, {  -6,  53 }, {  -4,  65 },
+        {  -4,  67 }, {  -7,  82 },
+
+        /* 436 - 459 */
+        {  -3,  81 }, {  -3,  76 }, {  -7,  72 }, {  -6,  78 },
+        { -12,  72 }, { -14,  68 }, {  -3,  70 }, {  -6,  76 },
+        {  -5,  66 }, {  -5,  62 }, {   0,  57 }, {  -4,  61 },
+        {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
     },
 
     /* i_cabac_init_idc == 2 */
@@ -1121,5 +1232,25 @@
         {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
         {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
         {  25,  61 },
+
+        /* 399 - 435 */
+        {  21,  33 }, {  19,  50 }, {  17,  61 },
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
+        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
+        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
+        {  -6,  68 }, { -10,  79 },
+
+        /* 436 - 459 */
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
     }
 };
diff -r a49f140179e9 -r 975074f04b95 h264idct.c
--- a/h264idct.c	Thu Jun 02 20:45:35 2005 +0000
+++ b/h264idct.c	Thu Jun 02 21:15:20 2005 +0000
@@ -68,3 +68,74 @@
 void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block){
     idct_internal(dst, block, stride, 8, 3, 0);
 }
+
+void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){ // 8x8 inverse transform of block[0..63]; the result is added into dst (rows 'stride' apart). block is overwritten.
+    int i;
+    DCTELEM (*src)[8] = (DCTELEM(*)[8])block; // view block as src[i][j] == block[8*i + j]
+    uint8_t *cm = cropTbl + MAX_NEG_CROP; // lookup table defined elsewhere; presumably clamps to the uint8_t range -- TODO confirm
+    // Rounding bias for the final ">> 6": block[0] feeds only a0/a2 (weight 1) in both passes, so each of the 64 outputs ends up with +32.
+    block[0] += 32;
+    // Pass 1: 1-D transform of each src[i][0..7], stored back in place.
+    for( i = 0; i < 8; i++ )
+    {
+        const int a0 =  src[i][0] + src[i][4]; // even part: inputs 0, 4, 2, 6
+        const int a2 =  src[i][0] - src[i][4];
+        const int a4 = (src[i][2]>>1) - src[i][6];
+        const int a6 = (src[i][6]>>1) + src[i][2];
+
+        const int b0 = a0 + a6;
+        const int b2 = a2 + a4;
+        const int b4 = a2 - a4;
+        const int b6 = a0 - a6;
+
+        const int a1 = -src[i][3] + src[i][5] - src[i][7] - (src[i][7]>>1); // odd part: inputs 1, 3, 5, 7
+        const int a3 =  src[i][1] + src[i][7] - src[i][3] - (src[i][3]>>1);
+        const int a5 = -src[i][1] + src[i][7] + src[i][5] + (src[i][5]>>1);
+        const int a7 =  src[i][3] + src[i][5] + src[i][1] + (src[i][1]>>1);
+
+        const int b1 = (a7>>2) + a1;
+        const int b3 =  a3 + (a5>>2);
+        const int b5 = (a3>>2) - a5;
+        const int b7 =  a7 - (a1>>2);
+        // Every input of this row was read into the locals above, so overwriting src[i][*] here is safe.
+        src[i][0] = b0 + b7;
+        src[i][7] = b0 - b7;
+        src[i][1] = b2 + b5;
+        src[i][6] = b2 - b5;
+        src[i][2] = b4 + b3;
+        src[i][5] = b4 - b3;
+        src[i][3] = b6 + b1;
+        src[i][4] = b6 - b1;
+    }
+    for( i = 0; i < 8; i++ ) // Pass 2: the same 1-D transform down each column src[0..7][i]
+    {
+        const int a0 =  src[0][i] + src[4][i]; // even part: inputs 0, 4, 2, 6
+        const int a2 =  src[0][i] - src[4][i];
+        const int a4 = (src[2][i]>>1) - src[6][i];
+        const int a6 = (src[6][i]>>1) + src[2][i];
+
+        const int b0 = a0 + a6;
+        const int b2 = a2 + a4;
+        const int b4 = a2 - a4;
+        const int b6 = a0 - a6;
+
+        const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1); // odd part: inputs 1, 3, 5, 7
+        const int a3 =  src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1);
+        const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1);
+        const int a7 =  src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1);
+
+        const int b1 = (a7>>2) + a1;
+        const int b3 =  a3 + (a5>>2);
+        const int b5 = (a3>>2) - a5;
+        const int b7 =  a7 - (a1>>2);
+        // Scale each result by >> 6, add it to the existing pixel, and map the sum through cm[].
+        dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ];
+        dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ];
+        dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ];
+        dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ];
+        dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ];
+        dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ];
+        dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ];
+        dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ];
+    }
+}