# HG changeset patch # User michael # Date 1189221260 0 # Node ID cd26ab6e3953b8f53cde125eed2427c5fc3b650b # Parent 7c139ea9065e58b5386f099d93d60d9b2c694d82 cleanup mc_block() perform interpolation steps in such an order that halfpel interpolation could be done per picture this also makes mc_block() match h.264 for the 1/4 pel cases so that the use of the h264 functions for some cases does not introduce a fantastic mess diff -r 7c139ea9065e -r cd26ab6e3953 snow.c --- a/snow.c Fri Sep 07 19:19:49 2007 +0000 +++ b/snow.c Sat Sep 08 03:14:20 2007 +0000 @@ -2144,8 +2144,57 @@ } static void mc_block(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){ - int x, y; + const static uint8_t weight[64]={ + 8,7,6,5,4,3,2,1, + 7,7,0,0,0,0,0,1, + 6,0,6,0,0,0,2,0, + 5,0,0,5,0,3,0,0, + 4,0,0,0,4,0,0,0, + 3,0,0,5,0,3,0,0, + 2,0,6,0,0,0,2,0, + 1,7,0,0,0,0,0,1, + }; + + const static uint8_t brane[256]={ + 0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x11,0x12,0x12,0x12,0x12,0x12,0x12,0x12, + 0x04,0x05,0xcc,0xcc,0xcc,0xcc,0xcc,0x41,0x15,0x16,0xcc,0xcc,0xcc,0xcc,0xcc,0x52, + 0x04,0xcc,0x05,0xcc,0xcc,0xcc,0x41,0xcc,0x15,0xcc,0x16,0xcc,0xcc,0xcc,0x52,0xcc, + 0x04,0xcc,0xcc,0x05,0xcc,0x41,0xcc,0xcc,0x15,0xcc,0xcc,0x16,0xcc,0x52,0xcc,0xcc, + 0x04,0xcc,0xcc,0xcc,0x41,0xcc,0xcc,0xcc,0x15,0xcc,0xcc,0xcc,0x16,0xcc,0xcc,0xcc, + 0x04,0xcc,0xcc,0x41,0xcc,0x05,0xcc,0xcc,0x15,0xcc,0xcc,0x52,0xcc,0x16,0xcc,0xcc, + 0x04,0xcc,0x41,0xcc,0xcc,0xcc,0x05,0xcc,0x15,0xcc,0x52,0xcc,0xcc,0xcc,0x16,0xcc, + 0x04,0x41,0xcc,0xcc,0xcc,0xcc,0xcc,0x05,0x15,0x52,0xcc,0xcc,0xcc,0xcc,0xcc,0x16, + 0x44,0x45,0x45,0x45,0x45,0x45,0x45,0x45,0x55,0x56,0x56,0x56,0x56,0x56,0x56,0x56, + 0x48,0x49,0xcc,0xcc,0xcc,0xcc,0xcc,0x85,0x59,0x5A,0xcc,0xcc,0xcc,0xcc,0xcc,0x96, + 0x48,0xcc,0x49,0xcc,0xcc,0xcc,0x85,0xcc,0x59,0xcc,0x5A,0xcc,0xcc,0xcc,0x96,0xcc, + 0x48,0xcc,0xcc,0x49,0xcc,0x85,0xcc,0xcc,0x59,0xcc,0xcc,0x5A,0xcc,0x96,0xcc,0xcc, + 0x48,0xcc,0xcc,0xcc,0x49,0xcc,0xcc,0xcc,0x59,0xcc,0xcc,0xcc,0x96,0xcc,0xcc,0xcc, + 0x48,0xcc,0xcc,0x85,0xcc,0x49,0xcc,0xcc,0x59,0xcc,0xcc,0x96,0xcc,0x5A,0xcc,0xcc, + 0x48,0xcc,0x85,0xcc,0xcc,0xcc,0x49,0xcc,0x59,0xcc,0x96,0xcc,0xcc,0xcc,0x5A,0xcc, + 0x48,0x85,0xcc,0xcc,0xcc,0xcc,0xcc,0x49,0x59,0x96,0xcc,0xcc,0xcc,0xcc,0xcc,0x5A, + }; + + const static uint8_t needs[16]={ + 0,1,0,0, + 2,4,2,0, + 0,1,0,0, + 15 + }; + + int x, y, b, r, l; + int16_t tmpIt [64*(32+HTAPS)]; + uint8_t tmp2t[3][stride*(32+HTAPS)]; + int16_t *tmpI= tmpIt; + uint8_t *tmp2= tmp2t[0]; + uint8_t *hpel[11]; START_TIMER + assert(dx<16 && dy<16); + r= brane[dx + 16*dy]&15; + l= brane[dx + 16*dy]>>4; + + b= needs[l] | needs[r]; + + if(b&5){ for(y=0; y < b_h+HTAPS-1; y++){ for(x=0; x < b_w; x++){ int a_2=src[x + HTAPS/2-5]; @@ -2170,36 +2219,33 @@ // if(b_w==16) am= 8*(a1+a2); - if(dx<8) am = (32*a2*( 8-dx) + am* dx + 128)>>8; - else am = ( am*(16-dx) + 32*a3*(dx-8) + 128)>>8; - - /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/ + tmpI[x]= am; + am= (am+16)>>5; if(am&(~255)) am= ~(am>>31); - - tmp[x] = am; - -/* if (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) + aL* dx + 32)>>6; - else if(dx< 8) tmp[x + y*stride]= ( aL*( 8-dx) + am*(dx- 4) + 32)>>6; - else if(dx<12) tmp[x + y*stride]= ( am*(12-dx) + aR*(dx- 8) + 32)>>6; - else tmp[x + y*stride]= ( aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/ + tmp2[x]= am; } - tmp += stride; + tmpI+= 64; + tmp2+= stride; src += stride; } - tmp -= (b_h+HTAPS-1)*stride; - + src -= stride*y; + } + src += HTAPS/2 - 1; + tmp2= tmp2t[1]; + + if(b&2){ for(y=0; y < b_h; y++){ - for(x=0; x < b_w; x++){ - int a_2=tmp[x + (HTAPS/2-5)*stride]; - int a_1=tmp[x + (HTAPS/2-4)*stride]; - int a0= tmp[x + (HTAPS/2-3)*stride]; - int a1= tmp[x + (HTAPS/2-2)*stride]; - int a2= tmp[x + (HTAPS/2-1)*stride]; - int a3= tmp[x + (HTAPS/2+0)*stride]; - int a4= tmp[x + (HTAPS/2+1)*stride]; - int a5= tmp[x + (HTAPS/2+2)*stride]; - int a6= tmp[x + (HTAPS/2+3)*stride]; - int a7= tmp[x + (HTAPS/2+4)*stride]; + for(x=0; x < b_w+1; x++){ + int a_2=src[x + (HTAPS/2-5)*stride]; + int a_1=src[x + (HTAPS/2-4)*stride]; + int a0= src[x + (HTAPS/2-3)*stride]; + int a1= src[x + (HTAPS/2-2)*stride]; + int a2= src[x + (HTAPS/2-1)*stride]; + int a3= src[x + (HTAPS/2+0)*stride]; + int a4= src[x + (HTAPS/2+1)*stride]; + int a5= src[x + (HTAPS/2+2)*stride]; + int a6= src[x + (HTAPS/2+3)*stride]; + int a7= src[x + (HTAPS/2+4)*stride]; #if HTAPS==6 int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5); #else @@ -2211,19 +2257,88 @@ // if(b_w==16) am= 8*(a1+a2); - if(dy<8) am = (32*a2*( 8-dy) + am* dy + 128)>>8; - else am = ( am*(16-dy) + 32*a3*(dy-8) + 128)>>8; - + am= (am + 16)>>5; if(am&(~255)) am= ~(am>>31); - - dst[x] = am; -/* if (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) + aL* dy + 32)>>6; - else if(dy< 8) tmp[x + y*stride]= ( aL*( 8-dy) + am*(dy- 4) + 32)>>6; - else if(dy<12) tmp[x + y*stride]= ( am*(12-dy) + aR*(dy- 8) + 32)>>6; - else tmp[x + y*stride]= ( aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/ + tmp2[x]= am; + } + src += stride; + tmp2+= stride; + } + src -= stride*y; + } + src += stride*(HTAPS/2 - 1); + tmp2= tmp2t[2]; + tmpI= tmpIt; + if(b&4){ + for(y=0; y < b_h; y++){ + for(x=0; x < b_w; x++){ + int a_2=tmpI[x + (HTAPS/2-5)*64]; + int a_1=tmpI[x + (HTAPS/2-4)*64]; + int a0= tmpI[x + (HTAPS/2-3)*64]; + int a1= tmpI[x + (HTAPS/2-2)*64]; + int a2= tmpI[x + (HTAPS/2-1)*64]; + int a3= tmpI[x + (HTAPS/2+0)*64]; + int a4= tmpI[x + (HTAPS/2+1)*64]; + int a5= tmpI[x + (HTAPS/2+2)*64]; + int a6= tmpI[x + (HTAPS/2+3)*64]; + int a7= tmpI[x + (HTAPS/2+4)*64]; +#if HTAPS==6 + int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5); +#else + int am= 21*(a2+a3) - 7*(a1+a4) + 3*(a0+a5) - (a_1+a6); +#endif + am= (am + 512)>>10; + if(am&(~255)) am= ~(am>>31); + tmp2[x]= am; + } + tmpI+= 64; + tmp2+= stride; } - dst += stride; - tmp += stride; + } + + hpel[ 0]= src; + hpel[ 1]= tmp2t[0] + stride*(HTAPS/2-1); + hpel[ 2]= src + 1; + + hpel[ 4]= tmp2t[1]; + hpel[ 5]= tmp2t[2]; + hpel[ 6]= tmp2t[1] + 1; + + hpel[ 8]= src + stride; + hpel[ 9]= hpel[1] + stride; + hpel[10]= hpel[8] + 1; + + if(b==15){ + uint8_t *src1= hpel[dx/8 + dy/8*4 ]; + uint8_t *src2= hpel[dx/8 + dy/8*4+1]; + uint8_t *src3= hpel[dx/8 + dy/8*4+4]; + uint8_t *src4= hpel[dx/8 + dy/8*4+5]; + dx&=7; + dy&=7; + for(y=0; y < b_h; y++){ + for(x=0; x < b_w; x++){ + dst[x]= ((8-dx)*(8-dy)*src1[x] + dx*(8-dy)*src2[x]+ + (8-dx)* dy *src3[x] + dx* dy *src4[x]+32)>>6; + } + src1+=stride; + src2+=stride; + src3+=stride; + src4+=stride; + dst +=stride; + } + }else{ + uint8_t *src1= hpel[l]; + uint8_t *src2= hpel[r]; + int a= weight[((dx&7) + (8*(dy&7)))]; + int b= 8-a; + for(y=0; y < b_h; y++){ + for(x=0; x < b_w; x++){ + dst[x]= (a*src1[x] + b*src2[x] + 4)>>3; + } + src1+=stride; + src2+=stride; + dst +=stride; + } } STOP_TIMER("mc_block") }