diff h264.h @ 11277:c12d6c6c027e libavcodec

Change mvd_cache & mvd_table to 8bit, this is overall a bit faster for high resolution videos. about 20cycles faster per MB for cathederal.
author michael
date Wed, 24 Feb 2010 20:43:06 +0000
parents e817a3c2ec2e
children cf41a3e8e14e
line wrap: on
line diff
--- a/h264.h	Wed Feb 24 20:37:58 2010 +0000
+++ b/h264.h	Wed Feb 24 20:43:06 2010 +0000
@@ -486,8 +486,8 @@
     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
     uint8_t     *chroma_pred_mode_table;
     int         last_qscale_diff;
-    int16_t     (*mvd_table[2])[2];
-    DECLARE_ALIGNED_16(int16_t, mvd_cache)[2][5*8][2];
+    uint8_t     (*mvd_table[2])[2];
+    DECLARE_ALIGNED_16(uint8_t, mvd_cache)[2][5*8][2];
     uint8_t     *direct_table;
     uint8_t     direct_cache[5*8];
 
@@ -732,6 +732,14 @@
 #endif
 }
 
+static av_always_inline uint16_t pack8to16(int a, int b){
+#if HAVE_BIGENDIAN
+   return (b&0xFF) + (a<<8);
+#else
+   return (a&0xFF) + (b<<8);
+#endif
+}
+
 /**
  * gets the chroma qp.
  */
@@ -1060,32 +1068,31 @@
                 /* XXX beurk, Load mvd */
                 if(USES_LIST(top_type, list)){
                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
-                    AV_COPY128(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
+                    AV_COPY64(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
                 }else{
-                    AV_ZERO128(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
+                    AV_ZERO64(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
                 }
                 if(USES_LIST(left_type[0], list)){
                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
-                    AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]);
-                    AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]);
+                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]);
+                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]);
                 }else{
-                    AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 0*8]);
-                    AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 1*8]);
+                    AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 0*8]);
+                    AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 1*8]);
                 }
                 if(USES_LIST(left_type[1], list)){
                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
-                    AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]);
-                    AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]);
+                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]);
+                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]);
                 }else{
-                    AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 2*8]);
-                    AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 3*8]);
+                    AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 2*8]);
+                    AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 3*8]);
                 }
-                AV_ZERO32(h->mvd_cache [list][scan8[5 ]+1]);
-                AV_ZERO32(h->mvd_cache [list][scan8[7 ]+1]);
-                AV_ZERO32(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else)
-                AV_ZERO32(h->mvd_cache [list][scan8[4 ]]);
-                AV_ZERO32(h->mvd_cache [list][scan8[12]]);
-
+                AV_ZERO16(h->mvd_cache [list][scan8[5 ]+1]);
+                AV_ZERO16(h->mvd_cache [list][scan8[7 ]+1]);
+                AV_ZERO16(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else)
+                AV_ZERO16(h->mvd_cache [list][scan8[4 ]]);
+                AV_ZERO16(h->mvd_cache [list][scan8[12]]);
                 if(h->slice_type_nos == FF_B_TYPE){
                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1);
 
@@ -1414,13 +1421,13 @@
             AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
         }
         if( CABAC ) {
-            int16_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy];
-            int16_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
+            uint8_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy];
+            uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
             if(IS_SKIP(mb_type))
-                fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 4);
+                fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 2);
             else
             for(y=0; y<4; y++){
-                AV_COPY128(mvd_dst + y*b_stride, mvd_src + 8*y);
+                AV_COPY64(mvd_dst + y*b_stride, mvd_src + 8*y);
             }
         }