# HG changeset patch
# User michael
# Date 1189221260 0
# Node ID cd26ab6e3953b8f53cde125eed2427c5fc3b650b
# Parent  7c139ea9065e58b5386f099d93d60d9b2c694d82
Clean up mc_block().
Perform the interpolation steps in such an order that half-pel interpolation
could be done per picture.
This also makes mc_block() match H.264 for the 1/4-pel cases, so that the
use of the H.264 functions for some cases does not introduce a fantastic mess.

diff -r 7c139ea9065e -r cd26ab6e3953 snow.c
--- a/snow.c	Fri Sep 07 19:19:49 2007 +0000
+++ b/snow.c	Sat Sep 08 03:14:20 2007 +0000
@@ -2144,8 +2144,57 @@
 }
 
 static void mc_block(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
-    int x, y;
+    const static uint8_t weight[64]={
+    8,7,6,5,4,3,2,1,
+    7,7,0,0,0,0,0,1,
+    6,0,6,0,0,0,2,0,
+    5,0,0,5,0,3,0,0,
+    4,0,0,0,4,0,0,0,
+    3,0,0,5,0,3,0,0,
+    2,0,6,0,0,0,2,0,
+    1,7,0,0,0,0,0,1,
+    };
+
+    const static uint8_t brane[256]={
+    0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x11,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
+    0x04,0x05,0xcc,0xcc,0xcc,0xcc,0xcc,0x41,0x15,0x16,0xcc,0xcc,0xcc,0xcc,0xcc,0x52,
+    0x04,0xcc,0x05,0xcc,0xcc,0xcc,0x41,0xcc,0x15,0xcc,0x16,0xcc,0xcc,0xcc,0x52,0xcc,
+    0x04,0xcc,0xcc,0x05,0xcc,0x41,0xcc,0xcc,0x15,0xcc,0xcc,0x16,0xcc,0x52,0xcc,0xcc,
+    0x04,0xcc,0xcc,0xcc,0x41,0xcc,0xcc,0xcc,0x15,0xcc,0xcc,0xcc,0x16,0xcc,0xcc,0xcc,
+    0x04,0xcc,0xcc,0x41,0xcc,0x05,0xcc,0xcc,0x15,0xcc,0xcc,0x52,0xcc,0x16,0xcc,0xcc,
+    0x04,0xcc,0x41,0xcc,0xcc,0xcc,0x05,0xcc,0x15,0xcc,0x52,0xcc,0xcc,0xcc,0x16,0xcc,
+    0x04,0x41,0xcc,0xcc,0xcc,0xcc,0xcc,0x05,0x15,0x52,0xcc,0xcc,0xcc,0xcc,0xcc,0x16,
+    0x44,0x45,0x45,0x45,0x45,0x45,0x45,0x45,0x55,0x56,0x56,0x56,0x56,0x56,0x56,0x56,
+    0x48,0x49,0xcc,0xcc,0xcc,0xcc,0xcc,0x85,0x59,0x5A,0xcc,0xcc,0xcc,0xcc,0xcc,0x96,
+    0x48,0xcc,0x49,0xcc,0xcc,0xcc,0x85,0xcc,0x59,0xcc,0x5A,0xcc,0xcc,0xcc,0x96,0xcc,
+    0x48,0xcc,0xcc,0x49,0xcc,0x85,0xcc,0xcc,0x59,0xcc,0xcc,0x5A,0xcc,0x96,0xcc,0xcc,
+    0x48,0xcc,0xcc,0xcc,0x49,0xcc,0xcc,0xcc,0x59,0xcc,0xcc,0xcc,0x96,0xcc,0xcc,0xcc,
+    0x48,0xcc,0xcc,0x85,0xcc,0x49,0xcc,0xcc,0x59,0xcc,0xcc,0x96,0xcc,0x5A,0xcc,0xcc,
+    0x48,0xcc,0x85,0xcc,0xcc,0xcc,0x49,0xcc,0x59,0xcc,0x96,0xcc,0xcc,0xcc,0x5A,0xcc,
+    0x48,0x85,0xcc,0xcc,0xcc,0xcc,0xcc,0x49,0x59,0x96,0xcc,0xcc,0xcc,0xcc,0xcc,0x5A,
+    };
+
+    const static uint8_t needs[16]={
+    0,1,0,0,
+    2,4,2,0,
+    0,1,0,0,
+    15
+    };
+
+    int x, y, b, r, l;
+    int16_t tmpIt   [64*(32+HTAPS)];
+    uint8_t tmp2t[3][stride*(32+HTAPS)];
+    int16_t *tmpI= tmpIt;
+    uint8_t *tmp2= tmp2t[0];
+    uint8_t *hpel[11];
 START_TIMER
+    assert(dx<16 && dy<16);
+    r= brane[dx + 16*dy]&15;
+    l= brane[dx + 16*dy]>>4;
+
+    b= needs[l] | needs[r];
+
+    if(b&5){
     for(y=0; y < b_h+HTAPS-1; y++){
         for(x=0; x < b_w; x++){
             int a_2=src[x + HTAPS/2-5];
@@ -2170,36 +2219,33 @@
 
 //            if(b_w==16) am= 8*(a1+a2);
 
-            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
-            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
-
-            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
+            tmpI[x]= am;
+            am= (am+16)>>5;
             if(am&(~255)) am= ~(am>>31);
-
-            tmp[x] = am;
-
-/*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
-            else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
-            else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
-            else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
+            tmp2[x]= am;
         }
-        tmp += stride;
+        tmpI+= 64;
+        tmp2+= stride;
         src += stride;
     }
-    tmp -= (b_h+HTAPS-1)*stride;
-
+    src -= stride*y;
+    }
+    src += HTAPS/2 - 1;
+    tmp2= tmp2t[1];
+
+    if(b&2){
     for(y=0; y < b_h; y++){
-        for(x=0; x < b_w; x++){
-            int a_2=tmp[x + (HTAPS/2-5)*stride];
-            int a_1=tmp[x + (HTAPS/2-4)*stride];
-            int a0= tmp[x + (HTAPS/2-3)*stride];
-            int a1= tmp[x + (HTAPS/2-2)*stride];
-            int a2= tmp[x + (HTAPS/2-1)*stride];
-            int a3= tmp[x + (HTAPS/2+0)*stride];
-            int a4= tmp[x + (HTAPS/2+1)*stride];
-            int a5= tmp[x + (HTAPS/2+2)*stride];
-            int a6= tmp[x + (HTAPS/2+3)*stride];
-            int a7= tmp[x + (HTAPS/2+4)*stride];
+        for(x=0; x < b_w+1; x++){
+            int a_2=src[x + (HTAPS/2-5)*stride];
+            int a_1=src[x + (HTAPS/2-4)*stride];
+            int a0= src[x + (HTAPS/2-3)*stride];
+            int a1= src[x + (HTAPS/2-2)*stride];
+            int a2= src[x + (HTAPS/2-1)*stride];
+            int a3= src[x + (HTAPS/2+0)*stride];
+            int a4= src[x + (HTAPS/2+1)*stride];
+            int a5= src[x + (HTAPS/2+2)*stride];
+            int a6= src[x + (HTAPS/2+3)*stride];
+            int a7= src[x + (HTAPS/2+4)*stride];
 #if HTAPS==6
             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
 #else
@@ -2211,19 +2257,88 @@
 
 //            if(b_w==16) am= 8*(a1+a2);
 
-            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
-            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
-
+            am= (am + 16)>>5;
             if(am&(~255)) am= ~(am>>31);
-
-            dst[x] = am;
-/*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
-            else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
-            else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
-            else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
+            tmp2[x]= am;
+        }
+        src += stride;
+        tmp2+= stride;
+    }
+    src -= stride*y;
+    }
+    src += stride*(HTAPS/2 - 1);
+    tmp2= tmp2t[2];
+    tmpI= tmpIt;
+    if(b&4){
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                int a_2=tmpI[x + (HTAPS/2-5)*64];
+                int a_1=tmpI[x + (HTAPS/2-4)*64];
+                int a0= tmpI[x + (HTAPS/2-3)*64];
+                int a1= tmpI[x + (HTAPS/2-2)*64];
+                int a2= tmpI[x + (HTAPS/2-1)*64];
+                int a3= tmpI[x + (HTAPS/2+0)*64];
+                int a4= tmpI[x + (HTAPS/2+1)*64];
+                int a5= tmpI[x + (HTAPS/2+2)*64];
+                int a6= tmpI[x + (HTAPS/2+3)*64];
+                int a7= tmpI[x + (HTAPS/2+4)*64];
+#if HTAPS==6
+                int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+#else
+                int am= 21*(a2+a3) - 7*(a1+a4) + 3*(a0+a5) - (a_1+a6);
+#endif
+                am= (am + 512)>>10;
+                if(am&(~255)) am= ~(am>>31);
+                tmp2[x]= am;
+            }
+            tmpI+= 64;
+            tmp2+= stride;
         }
-        dst += stride;
-        tmp += stride;
+    }
+
+    hpel[ 0]= src;
+    hpel[ 1]= tmp2t[0] + stride*(HTAPS/2-1);
+    hpel[ 2]= src + 1;
+
+    hpel[ 4]= tmp2t[1];
+    hpel[ 5]= tmp2t[2];
+    hpel[ 6]= tmp2t[1] + 1;
+
+    hpel[ 8]= src + stride;
+    hpel[ 9]= hpel[1] + stride;
+    hpel[10]= hpel[8] + 1;
+
+    if(b==15){
+        uint8_t *src1= hpel[dx/8 + dy/8*4  ];
+        uint8_t *src2= hpel[dx/8 + dy/8*4+1];
+        uint8_t *src3= hpel[dx/8 + dy/8*4+4];
+        uint8_t *src4= hpel[dx/8 + dy/8*4+5];
+        dx&=7;
+        dy&=7;
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                dst[x]= ((8-dx)*(8-dy)*src1[x] + dx*(8-dy)*src2[x]+
+                         (8-dx)*   dy *src3[x] + dx*   dy *src4[x]+32)>>6;
+            }
+            src1+=stride;
+            src2+=stride;
+            src3+=stride;
+            src4+=stride;
+            dst +=stride;
+        }
+    }else{
+        uint8_t *src1= hpel[l];
+        uint8_t *src2= hpel[r];
+        int a= weight[((dx&7) + (8*(dy&7)))];
+        int b= 8-a;
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                dst[x]= (a*src1[x] + b*src2[x] + 4)>>3;
+            }
+            src1+=stride;
+            src2+=stride;
+            dst +=stride;
+        }
     }
 STOP_TIMER("mc_block")
 }