diff h264.h @ 11283:853e93a50fe5 libavcodec

Cut the size of mvd_table by yet another factor of 2. The read/write code itself was 1 cycle faster; overall it's likely more due to cache effects.
author michael
date Thu, 25 Feb 2010 04:11:33 +0000
parents cf41a3e8e14e
children aaca4b58880f
line wrap: on
line diff
--- a/h264.h	Thu Feb 25 02:42:25 2010 +0000
+++ b/h264.h	Thu Feb 25 04:11:33 2010 +0000
@@ -1070,23 +1070,23 @@
             if( CABAC ) {
                 /* XXX beurk, Load mvd */
                 if(USES_LIST(top_type, list)){
-                    const int b_xy= h->mb2br_xy[top_xy] + 3*h->b_stride;
+                    const int b_xy= h->mb2br_xy[top_xy];
                     AV_COPY64(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
                 }else{
                     AV_ZERO64(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
                 }
                 if(USES_LIST(left_type[0], list)){
-                    const int b_xy= h->mb2br_xy[left_xy[0]] + 3;
-                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]);
-                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]);
+                    const int b_xy= h->mb2br_xy[left_xy[0]] + 6;
+                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy - left_block[0]]);
+                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy - left_block[1]]);
                 }else{
                     AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 0*8]);
                     AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 1*8]);
                 }
                 if(USES_LIST(left_type[1], list)){
-                    const int b_xy= h->mb2br_xy[left_xy[1]] + 3;
-                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]);
-                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]);
+                    const int b_xy= h->mb2br_xy[left_xy[1]] + 6;
+                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy - left_block[2]]);
+                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy - left_block[3]]);
                 }else{
                     AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 2*8]);
                     AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 3*8]);
@@ -1424,13 +1424,15 @@
             AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
         }
         if( CABAC ) {
-            uint8_t (*mvd_dst)[2] = &h->mvd_table[list][FMO ? b_xy : h->mb2br_xy[h->mb_xy]];
+            uint8_t (*mvd_dst)[2] = &h->mvd_table[list][FMO ? 8*h->mb_xy : h->mb2br_xy[h->mb_xy]];
             uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
             if(IS_SKIP(mb_type))
-                fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 2);
-            else
-            for(y=0; y<4; y++){
-                AV_COPY64(mvd_dst + y*b_stride, mvd_src + 8*y);
+                AV_ZERO128(mvd_dst);
+            else{
+            AV_COPY64(mvd_dst, mvd_src + 8*3);
+            for(y=0; y<3; y++){
+                AV_COPY16(mvd_dst + 3 + 3 - y, mvd_src + 3 + 8*y);
+            }
             }
         }