comparison postproc/postprocess_template.c @ 2595:6c1d6f508466

deinterlace bugfix
author michael
date Wed, 31 Oct 2001 18:29:03 +0000
parents 3b05a6b4d870
children d5636499cafd
comparison
equal deleted inserted replaced
2594:1486c690bece 2595:6c1d6f508466
2115 #endif 2115 #endif
2116 } 2116 }
2117 2117
2118 /** 2118 /**
2119 * Deinterlaces the given block 2119 * Deinterlaces the given block
2120 * will be called for every 8x8 block, and can read & write into an 8x16 block 2120 * will be called for every 8x8 block and can read & write from line 4-15
2121 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2122 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2121 */ 2123 */
2122 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) 2124 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
2123 { 2125 {
2124 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2126 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2127 src+= 4*stride;
2125 asm volatile( 2128 asm volatile(
2126 "leal (%0, %1), %%eax \n\t" 2129 "leal (%0, %1), %%eax \n\t"
2127 "leal (%%eax, %1, 4), %%ebx \n\t" 2130 "leal (%%eax, %1, 4), %%ebx \n\t"
2128 // 0 1 2 3 4 5 6 7 8 9 2131 // 0 1 2 3 4 5 6 7 8 9
2129 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 2132 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2145 : : "r" (src), "r" (stride) 2148 : : "r" (src), "r" (stride)
2146 : "%eax", "%ebx" 2149 : "%eax", "%ebx"
2147 ); 2150 );
2148 #else 2151 #else
2149 int x; 2152 int x;
2153 src+= 4*stride;
2150 for(x=0; x<8; x++) 2154 for(x=0; x<8; x++)
2151 { 2155 {
2152 src[stride] = (src[0] + src[stride*2])>>1; 2156 src[stride] = (src[0] + src[stride*2])>>1;
2153 src[stride*3] = (src[stride*2] + src[stride*4])>>1; 2157 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
2154 src[stride*5] = (src[stride*4] + src[stride*6])>>1; 2158 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
2158 #endif 2162 #endif
2159 } 2163 }
2160 2164
2161 /** 2165 /**
2162 * Deinterlaces the given block 2166 * Deinterlaces the given block
2163 * will be called for every 8x8 block, and can read & write into an 8x16 block 2167 * will be called for every 8x8 block and can read & write from line 4-15
2168 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2169 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2170 * this filter will read lines 3-15 and write 7-13
2164 * no clipping in C version 2171 * no clipping in C version
2165 */ 2172 */
2166 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride) 2173 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
2167 { 2174 {
2168 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2175 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2176 src+= stride*3;
2169 asm volatile( 2177 asm volatile(
2170 "leal (%0, %1), %%eax \n\t" 2178 "leal (%0, %1), %%eax \n\t"
2171 "leal (%%eax, %1, 4), %%ebx \n\t" 2179 "leal (%%eax, %1, 4), %%ebx \n\t"
2172 "leal (%%ebx, %1, 4), %%ecx \n\t" 2180 "leal (%%ebx, %1, 4), %%ecx \n\t"
2173 "addl %1, %%ecx \n\t" 2181 "addl %1, %%ecx \n\t"
2205 : : "r" (src), "r" (stride) 2213 : : "r" (src), "r" (stride)
2206 : "%eax", "%ebx", "ecx" 2214 : "%eax", "%ebx", "ecx"
2207 ); 2215 );
2208 #else 2216 #else
2209 int x; 2217 int x;
2218 src+= stride*3;
2210 for(x=0; x<8; x++) 2219 for(x=0; x<8; x++)
2211 { 2220 {
2212 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; 2221 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
2213 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; 2222 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
2214 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; 2223 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
2218 #endif 2227 #endif
2219 } 2228 }
2220 2229
2221 /** 2230 /**
2222 * Deinterlaces the given block 2231 * Deinterlaces the given block
2223 * will be called for every 8x8 block, and can read & write into an 8x16 block 2232 * will be called for every 8x8 block and can read & write from line 4-15
2233 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2234 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2224 * will shift the image up by 1 line (FIXME if this is a problem) 2235 * will shift the image up by 1 line (FIXME if this is a problem)
2236 * this filter will read lines 4-13 and write 4-11
2225 */ 2237 */
2226 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) 2238 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
2227 { 2239 {
2228 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2240 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2241 src+= 4*stride;
2229 asm volatile( 2242 asm volatile(
2230 "leal (%0, %1), %%eax \n\t" 2243 "leal (%0, %1), %%eax \n\t"
2231 "leal (%%eax, %1, 4), %%ebx \n\t" 2244 "leal (%%eax, %1, 4), %%ebx \n\t"
2232 // 0 1 2 3 4 5 6 7 8 9 2245 // 0 1 2 3 4 5 6 7 8 9
2233 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 2246 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2271 : : "r" (src), "r" (stride) 2284 : : "r" (src), "r" (stride)
2272 : "%eax", "%ebx" 2285 : "%eax", "%ebx"
2273 ); 2286 );
2274 #else 2287 #else
2275 int x; 2288 int x;
2289 src+= 4*stride;
2276 for(x=0; x<8; x++) 2290 for(x=0; x<8; x++)
2277 { 2291 {
2278 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; 2292 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2279 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; 2293 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2280 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; 2294 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2288 #endif 2302 #endif
2289 } 2303 }
2290 2304
2291 /** 2305 /**
2292 * Deinterlaces the given block 2306 * Deinterlaces the given block
2293 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block 2307 * will be called for every 8x8 block and can read & write from line 4-15,
2308 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2309 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2294 */ 2310 */
2295 static inline void deInterlaceMedian(uint8_t src[], int stride) 2311 static inline void deInterlaceMedian(uint8_t src[], int stride)
2296 { 2312 {
2297 #ifdef HAVE_MMX 2313 #ifdef HAVE_MMX
2314 src+= 4*stride;
2298 #ifdef HAVE_MMX2 2315 #ifdef HAVE_MMX2
2299 asm volatile( 2316 asm volatile(
2300 "leal (%0, %1), %%eax \n\t" 2317 "leal (%0, %1), %%eax \n\t"
2301 "leal (%%eax, %1, 4), %%ebx \n\t" 2318 "leal (%%eax, %1, 4), %%ebx \n\t"
2302 // 0 1 2 3 4 5 6 7 8 9 2319 // 0 1 2 3 4 5 6 7 8 9
2386 ); 2403 );
2387 #endif // MMX 2404 #endif // MMX
2388 #else 2405 #else
2389 //FIXME 2406 //FIXME
2390 int x; 2407 int x;
2408 src+= 4*stride;
2391 for(x=0; x<8; x++) 2409 for(x=0; x<8; x++)
2392 { 2410 {
2393 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; 2411 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2394 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; 2412 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2395 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; 2413 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2772 horizontal_size >>= 1; 2790 horizontal_size >>= 1;
2773 vertical_size >>= 1; 2791 vertical_size >>= 1;
2774 src_stride >>= 1; 2792 src_stride >>= 1;
2775 dst_stride >>= 1; 2793 dst_stride >>= 1;
2776 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00); 2794 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2795 // mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER |
2796 // MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER);
2777 2797
2778 if(1) 2798 if(1)
2779 { 2799 {
2780 postProcess(src[1], src_stride, dst[1], dst_stride, 2800 postProcess(src[1], src_stride, dst[1], dst_stride,
2781 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); 2801 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
3086 } 3106 }
3087 3107
3088 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF; 3108 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
3089 else QPCorrecture= 256; 3109 else QPCorrecture= 256;
3090 3110
3091 /* copy first row of 8x8 blocks */ 3111 /* line before the first one */
3092 for(x=0; x<width; x+=BLOCK_SIZE) 3112 y=-BLOCK_SIZE;
3093 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); 3113 {
3114 //1% speedup if these are here instead of the inner loop
3115 uint8_t *srcBlock= &(src[y*srcStride]);
3116 uint8_t *dstBlock= &(dst[y*dstStride]);
3117
3118 dstBlock= tempDst + dstStride;
3119
3120 // From this point on it is guaranteed that we can read and write 16 lines downward
3121 // finish 1 block before the next otherwise we might have a problem
3122 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3123 for(x=0; x<width; x+=BLOCK_SIZE)
3124 {
3125
3126 #ifdef HAVE_MMX2
3127 /*
3128 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3129 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3130 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3131 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3132 */
3133 /*
3134 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3135 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3136 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3137 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3138 */
3139
3140 asm(
3141 "movl %4, %%eax \n\t"
3142 "shrl $2, %%eax \n\t"
3143 "andl $6, %%eax \n\t"
3144 "addl $8, %%eax \n\t"
3145 "movl %%eax, %%ebx \n\t"
3146 "imul %1, %%eax \n\t"
3147 "imul %3, %%ebx \n\t"
3148 "prefetchnta 32(%%eax, %0) \n\t"
3149 "prefetcht0 32(%%ebx, %2) \n\t"
3150 "addl %1, %%eax \n\t"
3151 "addl %3, %%ebx \n\t"
3152 "prefetchnta 32(%%eax, %0) \n\t"
3153 "prefetcht0 32(%%ebx, %2) \n\t"
3154 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3155 "m" (x)
3156 : "%eax", "%ebx"
3157 );
3158
3159 #elif defined(HAVE_3DNOW)
3160 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3161 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3162 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3163 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3164 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3165 */
3166 #endif
3167
3168 blockCopy(dstBlock + dstStride*8, dstStride,
3169 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3170
3171 if(mode & LINEAR_IPOL_DEINT_FILTER)
3172 deInterlaceInterpolateLinear(dstBlock, dstStride);
3173 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3174 deInterlaceBlendLinear(dstBlock, dstStride);
3175 else if(mode & MEDIAN_DEINT_FILTER)
3176 deInterlaceMedian(dstBlock, dstStride);
3177 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3178 deInterlaceInterpolateCubic(dstBlock, dstStride);
3179 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3180 deInterlaceBlendCubic(dstBlock, dstStride);
3181 */
3182 dstBlock+=8;
3183 srcBlock+=8;
3184 }
3185 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride );
3186 }
3094 3187
3095 for(y=0; y<height; y+=BLOCK_SIZE) 3188 for(y=0; y<height; y+=BLOCK_SIZE)
3096 { 3189 {
3097 //1% speedup if these are here instead of the inner loop 3190 //1% speedup if these are here instead of the inner loop
3098 uint8_t *srcBlock= &(src[y*srcStride]); 3191 uint8_t *srcBlock= &(src[y*srcStride]);
3106 #endif 3199 #endif
3107 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards 3200 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3108 if not than use a temporary buffer */ 3201 if not than use a temporary buffer */
3109 if(y+15 >= height) 3202 if(y+15 >= height)
3110 { 3203 {
3111 /* copy from line 5 to 12 of src, these will be copied with 3204 /* copy from line 8 to 15 of src, these will be copied with
3112 blockcopy to dst later */ 3205 blockcopy to dst later */
3113 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5, 3206 memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8,
3114 srcStride*MAX(height-y-5, 0) ); 3207 srcStride*MAX(height-y-8, 0) );
3115 3208
3116 /* duplicate last line to fill the void upto line 12 */ 3209 /* duplicate last line to fill the void upto line 15 */
3117 if(y+12 >= height) 3210 if(y+15 >= height)
3118 { 3211 {
3119 int i; 3212 int i;
3120 for(i=height-y; i<=12; i++) 3213 for(i=height-y; i<=15; i++)
3121 memcpy(tempSrc + srcStride*i, 3214 memcpy(tempSrc + srcStride*i,
3122 src + srcStride*(height-1), srcStride); 3215 src + srcStride*(height-1), srcStride);
3123 } 3216 }
3124 3217
3125 3218 /* copy up to 9 lines of dst */
3126 /* copy up to 6 lines of dst */ 3219 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) );
3127 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 6) );
3128 dstBlock= tempDst + dstStride; 3220 dstBlock= tempDst + dstStride;
3129 srcBlock= tempSrc; 3221 srcBlock= tempSrc;
3130 } 3222 }
3131 3223
3132 // From this point on it is guaranteed that we can read and write 16 lines downward 3224 // From this point on it is guaranteed that we can read and write 16 lines downward
3188 3280
3189 asm( 3281 asm(
3190 "movl %4, %%eax \n\t" 3282 "movl %4, %%eax \n\t"
3191 "shrl $2, %%eax \n\t" 3283 "shrl $2, %%eax \n\t"
3192 "andl $6, %%eax \n\t" 3284 "andl $6, %%eax \n\t"
3193 "addl $5, %%eax \n\t" 3285 "addl $8, %%eax \n\t"
3194 "movl %%eax, %%ebx \n\t" 3286 "movl %%eax, %%ebx \n\t"
3195 "imul %1, %%eax \n\t" 3287 "imul %1, %%eax \n\t"
3196 "imul %3, %%ebx \n\t" 3288 "imul %3, %%ebx \n\t"
3197 "prefetchnta 32(%%eax, %0) \n\t" 3289 "prefetchnta 32(%%eax, %0) \n\t"
3198 "prefetcht0 32(%%ebx, %2) \n\t" 3290 "prefetcht0 32(%%ebx, %2) \n\t"
3231 dstBlock= tempDstBlock; 3323 dstBlock= tempDstBlock;
3232 srcBlock= tempSrcBlock; 3324 srcBlock= tempSrcBlock;
3233 } 3325 }
3234 #endif 3326 #endif
3235 3327
3236 blockCopy(dstBlock + dstStride*5, dstStride, 3328 blockCopy(dstBlock + dstStride*8, dstStride,
3237 srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX); 3329 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3238 3330
3239 if(mode & LINEAR_IPOL_DEINT_FILTER) 3331 if(mode & LINEAR_IPOL_DEINT_FILTER)
3240 deInterlaceInterpolateLinear(dstBlock, dstStride); 3332 deInterlaceInterpolateLinear(dstBlock, dstStride);
3241 else if(mode & LINEAR_BLEND_DEINT_FILTER) 3333 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3242 deInterlaceBlendLinear(dstBlock, dstStride); 3334 deInterlaceBlendLinear(dstBlock, dstStride);
3359 tempBlock1= tempBlock2; 3451 tempBlock1= tempBlock2;
3360 tempBlock2 = tmpXchg; 3452 tempBlock2 = tmpXchg;
3361 #endif 3453 #endif
3362 } 3454 }
3363 3455
3364 /* did we use a tmp buffer */ 3456 /* did we use a tmp buffer for the last lines*/
3365 if(y+15 >= height) 3457 if(y+15 >= height)
3366 { 3458 {
3367 uint8_t *dstBlock= &(dst[y*dstStride]); 3459 uint8_t *dstBlock= &(dst[y*dstStride]);
3368 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); 3460 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3369 } 3461 }