comparison snow.c @ 5648:cd26ab6e3953 libavcodec

Clean up mc_block(): perform the interpolation steps in such an order that halfpel interpolation could be done per picture. This also makes mc_block() match H.264 for the 1/4-pel cases, so that the use of the h264 functions for some cases does not introduce a fantastic mess.
author michael
date Sat, 08 Sep 2007 03:14:20 +0000
parents 473cada682a1
children 9fe214a99139
comparison
equal deleted inserted replaced
5647:7c139ea9065e 5648:cd26ab6e3953
2142 } 2142 }
2143 } 2143 }
2144 } 2144 }
2145 2145
2146 static void mc_block(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){ 2146 static void mc_block(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2147 int x, y; 2147 const static uint8_t weight[64]={
2148 8,7,6,5,4,3,2,1,
2149 7,7,0,0,0,0,0,1,
2150 6,0,6,0,0,0,2,0,
2151 5,0,0,5,0,3,0,0,
2152 4,0,0,0,4,0,0,0,
2153 3,0,0,5,0,3,0,0,
2154 2,0,6,0,0,0,2,0,
2155 1,7,0,0,0,0,0,1,
2156 };
2157
2158 const static uint8_t brane[256]={
2159 0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x11,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
2160 0x04,0x05,0xcc,0xcc,0xcc,0xcc,0xcc,0x41,0x15,0x16,0xcc,0xcc,0xcc,0xcc,0xcc,0x52,
2161 0x04,0xcc,0x05,0xcc,0xcc,0xcc,0x41,0xcc,0x15,0xcc,0x16,0xcc,0xcc,0xcc,0x52,0xcc,
2162 0x04,0xcc,0xcc,0x05,0xcc,0x41,0xcc,0xcc,0x15,0xcc,0xcc,0x16,0xcc,0x52,0xcc,0xcc,
2163 0x04,0xcc,0xcc,0xcc,0x41,0xcc,0xcc,0xcc,0x15,0xcc,0xcc,0xcc,0x16,0xcc,0xcc,0xcc,
2164 0x04,0xcc,0xcc,0x41,0xcc,0x05,0xcc,0xcc,0x15,0xcc,0xcc,0x52,0xcc,0x16,0xcc,0xcc,
2165 0x04,0xcc,0x41,0xcc,0xcc,0xcc,0x05,0xcc,0x15,0xcc,0x52,0xcc,0xcc,0xcc,0x16,0xcc,
2166 0x04,0x41,0xcc,0xcc,0xcc,0xcc,0xcc,0x05,0x15,0x52,0xcc,0xcc,0xcc,0xcc,0xcc,0x16,
2167 0x44,0x45,0x45,0x45,0x45,0x45,0x45,0x45,0x55,0x56,0x56,0x56,0x56,0x56,0x56,0x56,
2168 0x48,0x49,0xcc,0xcc,0xcc,0xcc,0xcc,0x85,0x59,0x5A,0xcc,0xcc,0xcc,0xcc,0xcc,0x96,
2169 0x48,0xcc,0x49,0xcc,0xcc,0xcc,0x85,0xcc,0x59,0xcc,0x5A,0xcc,0xcc,0xcc,0x96,0xcc,
2170 0x48,0xcc,0xcc,0x49,0xcc,0x85,0xcc,0xcc,0x59,0xcc,0xcc,0x5A,0xcc,0x96,0xcc,0xcc,
2171 0x48,0xcc,0xcc,0xcc,0x49,0xcc,0xcc,0xcc,0x59,0xcc,0xcc,0xcc,0x96,0xcc,0xcc,0xcc,
2172 0x48,0xcc,0xcc,0x85,0xcc,0x49,0xcc,0xcc,0x59,0xcc,0xcc,0x96,0xcc,0x5A,0xcc,0xcc,
2173 0x48,0xcc,0x85,0xcc,0xcc,0xcc,0x49,0xcc,0x59,0xcc,0x96,0xcc,0xcc,0xcc,0x5A,0xcc,
2174 0x48,0x85,0xcc,0xcc,0xcc,0xcc,0xcc,0x49,0x59,0x96,0xcc,0xcc,0xcc,0xcc,0xcc,0x5A,
2175 };
2176
2177 const static uint8_t needs[16]={
2178 0,1,0,0,
2179 2,4,2,0,
2180 0,1,0,0,
2181 15
2182 };
2183
2184 int x, y, b, r, l;
2185 int16_t tmpIt [64*(32+HTAPS)];
2186 uint8_t tmp2t[3][stride*(32+HTAPS)];
2187 int16_t *tmpI= tmpIt;
2188 uint8_t *tmp2= tmp2t[0];
2189 uint8_t *hpel[11];
2148 START_TIMER 2190 START_TIMER
2191 assert(dx<16 && dy<16);
2192 r= brane[dx + 16*dy]&15;
2193 l= brane[dx + 16*dy]>>4;
2194
2195 b= needs[l] | needs[r];
2196
2197 if(b&5){
2149 for(y=0; y < b_h+HTAPS-1; y++){ 2198 for(y=0; y < b_h+HTAPS-1; y++){
2150 for(x=0; x < b_w; x++){ 2199 for(x=0; x < b_w; x++){
2151 int a_2=src[x + HTAPS/2-5]; 2200 int a_2=src[x + HTAPS/2-5];
2152 int a_1=src[x + HTAPS/2-4]; 2201 int a_1=src[x + HTAPS/2-4];
2153 int a0= src[x + HTAPS/2-3]; 2202 int a0= src[x + HTAPS/2-3];
2168 // int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3; 2217 // int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2169 // int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3; 2218 // int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2170 2219
2171 // if(b_w==16) am= 8*(a1+a2); 2220 // if(b_w==16) am= 8*(a1+a2);
2172 2221
2173 if(dx<8) am = (32*a2*( 8-dx) + am* dx + 128)>>8; 2222 tmpI[x]= am;
2174 else am = ( am*(16-dx) + 32*a3*(dx-8) + 128)>>8; 2223 am= (am+16)>>5;
2175
2176 /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2177 if(am&(~255)) am= ~(am>>31); 2224 if(am&(~255)) am= ~(am>>31);
2178 2225 tmp2[x]= am;
2179 tmp[x] = am; 2226 }
2180 2227 tmpI+= 64;
2181 /* if (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) + aL* dx + 32)>>6; 2228 tmp2+= stride;
2182 else if(dx< 8) tmp[x + y*stride]= ( aL*( 8-dx) + am*(dx- 4) + 32)>>6;
2183 else if(dx<12) tmp[x + y*stride]= ( am*(12-dx) + aR*(dx- 8) + 32)>>6;
2184 else tmp[x + y*stride]= ( aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2185 }
2186 tmp += stride;
2187 src += stride; 2229 src += stride;
2188 } 2230 }
2189 tmp -= (b_h+HTAPS-1)*stride; 2231 src -= stride*y;
2190 2232 }
2233 src += HTAPS/2 - 1;
2234 tmp2= tmp2t[1];
2235
2236 if(b&2){
2191 for(y=0; y < b_h; y++){ 2237 for(y=0; y < b_h; y++){
2192 for(x=0; x < b_w; x++){ 2238 for(x=0; x < b_w+1; x++){
2193 int a_2=tmp[x + (HTAPS/2-5)*stride]; 2239 int a_2=src[x + (HTAPS/2-5)*stride];
2194 int a_1=tmp[x + (HTAPS/2-4)*stride]; 2240 int a_1=src[x + (HTAPS/2-4)*stride];
2195 int a0= tmp[x + (HTAPS/2-3)*stride]; 2241 int a0= src[x + (HTAPS/2-3)*stride];
2196 int a1= tmp[x + (HTAPS/2-2)*stride]; 2242 int a1= src[x + (HTAPS/2-2)*stride];
2197 int a2= tmp[x + (HTAPS/2-1)*stride]; 2243 int a2= src[x + (HTAPS/2-1)*stride];
2198 int a3= tmp[x + (HTAPS/2+0)*stride]; 2244 int a3= src[x + (HTAPS/2+0)*stride];
2199 int a4= tmp[x + (HTAPS/2+1)*stride]; 2245 int a4= src[x + (HTAPS/2+1)*stride];
2200 int a5= tmp[x + (HTAPS/2+2)*stride]; 2246 int a5= src[x + (HTAPS/2+2)*stride];
2201 int a6= tmp[x + (HTAPS/2+3)*stride]; 2247 int a6= src[x + (HTAPS/2+3)*stride];
2202 int a7= tmp[x + (HTAPS/2+4)*stride]; 2248 int a7= src[x + (HTAPS/2+4)*stride];
2203 #if HTAPS==6 2249 #if HTAPS==6
2204 int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5); 2250 int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2205 #else 2251 #else
2206 int am= 21*(a2+a3) - 7*(a1+a4) + 3*(a0+a5) - (a_1+a6); 2252 int am= 21*(a2+a3) - 7*(a1+a4) + 3*(a0+a5) - (a_1+a6);
2207 #endif 2253 #endif
2209 /* int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3; 2255 /* int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2210 int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/ 2256 int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2211 2257
2212 // if(b_w==16) am= 8*(a1+a2); 2258 // if(b_w==16) am= 8*(a1+a2);
2213 2259
2214 if(dy<8) am = (32*a2*( 8-dy) + am* dy + 128)>>8; 2260 am= (am + 16)>>5;
2215 else am = ( am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2216
2217 if(am&(~255)) am= ~(am>>31); 2261 if(am&(~255)) am= ~(am>>31);
2218 2262 tmp2[x]= am;
2219 dst[x] = am; 2263 }
2220 /* if (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) + aL* dy + 32)>>6; 2264 src += stride;
2221 else if(dy< 8) tmp[x + y*stride]= ( aL*( 8-dy) + am*(dy- 4) + 32)>>6; 2265 tmp2+= stride;
2222 else if(dy<12) tmp[x + y*stride]= ( am*(12-dy) + aR*(dy- 8) + 32)>>6; 2266 }
2223 else tmp[x + y*stride]= ( aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/ 2267 src -= stride*y;
2224 } 2268 }
2225 dst += stride; 2269 src += stride*(HTAPS/2 - 1);
2226 tmp += stride; 2270 tmp2= tmp2t[2];
2271 tmpI= tmpIt;
2272 if(b&4){
2273 for(y=0; y < b_h; y++){
2274 for(x=0; x < b_w; x++){
2275 int a_2=tmpI[x + (HTAPS/2-5)*64];
2276 int a_1=tmpI[x + (HTAPS/2-4)*64];
2277 int a0= tmpI[x + (HTAPS/2-3)*64];
2278 int a1= tmpI[x + (HTAPS/2-2)*64];
2279 int a2= tmpI[x + (HTAPS/2-1)*64];
2280 int a3= tmpI[x + (HTAPS/2+0)*64];
2281 int a4= tmpI[x + (HTAPS/2+1)*64];
2282 int a5= tmpI[x + (HTAPS/2+2)*64];
2283 int a6= tmpI[x + (HTAPS/2+3)*64];
2284 int a7= tmpI[x + (HTAPS/2+4)*64];
2285 #if HTAPS==6
2286 int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2287 #else
2288 int am= 21*(a2+a3) - 7*(a1+a4) + 3*(a0+a5) - (a_1+a6);
2289 #endif
2290 am= (am + 512)>>10;
2291 if(am&(~255)) am= ~(am>>31);
2292 tmp2[x]= am;
2293 }
2294 tmpI+= 64;
2295 tmp2+= stride;
2296 }
2297 }
2298
2299 hpel[ 0]= src;
2300 hpel[ 1]= tmp2t[0] + stride*(HTAPS/2-1);
2301 hpel[ 2]= src + 1;
2302
2303 hpel[ 4]= tmp2t[1];
2304 hpel[ 5]= tmp2t[2];
2305 hpel[ 6]= tmp2t[1] + 1;
2306
2307 hpel[ 8]= src + stride;
2308 hpel[ 9]= hpel[1] + stride;
2309 hpel[10]= hpel[8] + 1;
2310
2311 if(b==15){
2312 uint8_t *src1= hpel[dx/8 + dy/8*4 ];
2313 uint8_t *src2= hpel[dx/8 + dy/8*4+1];
2314 uint8_t *src3= hpel[dx/8 + dy/8*4+4];
2315 uint8_t *src4= hpel[dx/8 + dy/8*4+5];
2316 dx&=7;
2317 dy&=7;
2318 for(y=0; y < b_h; y++){
2319 for(x=0; x < b_w; x++){
2320 dst[x]= ((8-dx)*(8-dy)*src1[x] + dx*(8-dy)*src2[x]+
2321 (8-dx)* dy *src3[x] + dx* dy *src4[x]+32)>>6;
2322 }
2323 src1+=stride;
2324 src2+=stride;
2325 src3+=stride;
2326 src4+=stride;
2327 dst +=stride;
2328 }
2329 }else{
2330 uint8_t *src1= hpel[l];
2331 uint8_t *src2= hpel[r];
2332 int a= weight[((dx&7) + (8*(dy&7)))];
2333 int b= 8-a;
2334 for(y=0; y < b_h; y++){
2335 for(x=0; x < b_w; x++){
2336 dst[x]= (a*src1[x] + b*src2[x] + 4)>>3;
2337 }
2338 src1+=stride;
2339 src2+=stride;
2340 dst +=stride;
2341 }
2227 } 2342 }
2228 STOP_TIMER("mc_block") 2343 STOP_TIMER("mc_block")
2229 } 2344 }
2230 2345
2231 #define mca(dx,dy,b_w)\ 2346 #define mca(dx,dy,b_w)\