Mercurial > libavcodec.hg
comparison libpostproc/postprocess.c @ 142:da4c751fc151 libavcodec
deinterlace bugfix
author | michael |
---|---|
date | Wed, 31 Oct 2001 18:29:03 +0000 |
parents | 626bfabff1f5 |
children | 1cfc4d567c0a |
comparison
equal
deleted
inserted
replaced
141:626bfabff1f5 | 142:da4c751fc151 |
---|---|
2115 #endif | 2115 #endif |
2116 } | 2116 } |
2117 | 2117 |
2118 /** | 2118 /** |
2119 * Deinterlaces the given block | 2119 * Deinterlaces the given block |
2120 * will be called for every 8x8 block, and can read & write into an 8x16 block | 2120 * will be called for every 8x8 block and can read & write from line 4-15 |
2121 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
2122 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
2121 */ | 2123 */ |
2122 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) | 2124 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) |
2123 { | 2125 { |
2124 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 2126 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2127 src+= 4*stride; | |
2125 asm volatile( | 2128 asm volatile( |
2126 "leal (%0, %1), %%eax \n\t" | 2129 "leal (%0, %1), %%eax \n\t" |
2127 "leal (%%eax, %1, 4), %%ebx \n\t" | 2130 "leal (%%eax, %1, 4), %%ebx \n\t" |
2128 // 0 1 2 3 4 5 6 7 8 9 | 2131 // 0 1 2 3 4 5 6 7 8 9 |
2129 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 2132 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
2145 : : "r" (src), "r" (stride) | 2148 : : "r" (src), "r" (stride) |
2146 : "%eax", "%ebx" | 2149 : "%eax", "%ebx" |
2147 ); | 2150 ); |
2148 #else | 2151 #else |
2149 int x; | 2152 int x; |
2153 src+= 4*stride; | |
2150 for(x=0; x<8; x++) | 2154 for(x=0; x<8; x++) |
2151 { | 2155 { |
2152 src[stride] = (src[0] + src[stride*2])>>1; | 2156 src[stride] = (src[0] + src[stride*2])>>1; |
2153 src[stride*3] = (src[stride*2] + src[stride*4])>>1; | 2157 src[stride*3] = (src[stride*2] + src[stride*4])>>1; |
2154 src[stride*5] = (src[stride*4] + src[stride*6])>>1; | 2158 src[stride*5] = (src[stride*4] + src[stride*6])>>1; |
2158 #endif | 2162 #endif |
2159 } | 2163 } |
2160 | 2164 |
2161 /** | 2165 /** |
2162 * Deinterlaces the given block | 2166 * Deinterlaces the given block |
2163 * will be called for every 8x8 block, and can read & write into an 8x16 block | 2167 * will be called for every 8x8 block and can read & write from line 4-15 |
2168 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
2169 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
2170 * this filter will read lines 3-15 and write 7-13 | |
2164 * no clipping in C version | 2171 * no clipping in C version |
2165 */ | 2172 */ |
2166 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride) | 2173 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride) |
2167 { | 2174 { |
2168 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 2175 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2176 src+= stride*3; | |
2169 asm volatile( | 2177 asm volatile( |
2170 "leal (%0, %1), %%eax \n\t" | 2178 "leal (%0, %1), %%eax \n\t" |
2171 "leal (%%eax, %1, 4), %%ebx \n\t" | 2179 "leal (%%eax, %1, 4), %%ebx \n\t" |
2172 "leal (%%ebx, %1, 4), %%ecx \n\t" | 2180 "leal (%%ebx, %1, 4), %%ecx \n\t" |
2173 "addl %1, %%ecx \n\t" | 2181 "addl %1, %%ecx \n\t" |
2205 : : "r" (src), "r" (stride) | 2213 : : "r" (src), "r" (stride) |
2206 : "%eax", "%ebx", "ecx" | 2214 : "%eax", "%ebx", "ecx" |
2207 ); | 2215 ); |
2208 #else | 2216 #else |
2209 int x; | 2217 int x; |
2218 src+= stride*3; | |
2210 for(x=0; x<8; x++) | 2219 for(x=0; x<8; x++) |
2211 { | 2220 { |
2212 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; | 2221 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; |
2213 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; | 2222 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; |
2214 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; | 2223 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; |
2218 #endif | 2227 #endif |
2219 } | 2228 } |
2220 | 2229 |
2221 /** | 2230 /** |
2222 * Deinterlaces the given block | 2231 * Deinterlaces the given block |
2223 * will be called for every 8x8 block, and can read & write into an 8x16 block | 2232 * will be called for every 8x8 block and can read & write from line 4-15 |
2233 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
2234 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
2224 * will shift the image up by 1 line (FIXME if this is a problem) | 2235 * will shift the image up by 1 line (FIXME if this is a problem) |
2236 * this filter will read lines 4-13 and write 4-11 | |
2225 */ | 2237 */ |
2226 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) | 2238 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) |
2227 { | 2239 { |
2228 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 2240 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2241 src+= 4*stride; | |
2229 asm volatile( | 2242 asm volatile( |
2230 "leal (%0, %1), %%eax \n\t" | 2243 "leal (%0, %1), %%eax \n\t" |
2231 "leal (%%eax, %1, 4), %%ebx \n\t" | 2244 "leal (%%eax, %1, 4), %%ebx \n\t" |
2232 // 0 1 2 3 4 5 6 7 8 9 | 2245 // 0 1 2 3 4 5 6 7 8 9 |
2233 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 2246 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
2271 : : "r" (src), "r" (stride) | 2284 : : "r" (src), "r" (stride) |
2272 : "%eax", "%ebx" | 2285 : "%eax", "%ebx" |
2273 ); | 2286 ); |
2274 #else | 2287 #else |
2275 int x; | 2288 int x; |
2289 src+= 4*stride; | |
2276 for(x=0; x<8; x++) | 2290 for(x=0; x<8; x++) |
2277 { | 2291 { |
2278 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | 2292 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
2279 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | 2293 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
2280 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | 2294 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
2288 #endif | 2302 #endif |
2289 } | 2303 } |
2290 | 2304 |
2291 /** | 2305 /** |
2292 * Deinterlaces the given block | 2306 * Deinterlaces the given block |
2293 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | 2307 * will be called for every 8x8 block and can read & write from line 4-15, |
2308 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
2309 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
2294 */ | 2310 */ |
2295 static inline void deInterlaceMedian(uint8_t src[], int stride) | 2311 static inline void deInterlaceMedian(uint8_t src[], int stride) |
2296 { | 2312 { |
2297 #ifdef HAVE_MMX | 2313 #ifdef HAVE_MMX |
2314 src+= 4*stride; | |
2298 #ifdef HAVE_MMX2 | 2315 #ifdef HAVE_MMX2 |
2299 asm volatile( | 2316 asm volatile( |
2300 "leal (%0, %1), %%eax \n\t" | 2317 "leal (%0, %1), %%eax \n\t" |
2301 "leal (%%eax, %1, 4), %%ebx \n\t" | 2318 "leal (%%eax, %1, 4), %%ebx \n\t" |
2302 // 0 1 2 3 4 5 6 7 8 9 | 2319 // 0 1 2 3 4 5 6 7 8 9 |
2386 ); | 2403 ); |
2387 #endif // MMX | 2404 #endif // MMX |
2388 #else | 2405 #else |
2389 //FIXME | 2406 //FIXME |
2390 int x; | 2407 int x; |
2408 src+= 4*stride; | |
2391 for(x=0; x<8; x++) | 2409 for(x=0; x<8; x++) |
2392 { | 2410 { |
2393 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | 2411 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
2394 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | 2412 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
2395 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | 2413 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
2772 horizontal_size >>= 1; | 2790 horizontal_size >>= 1; |
2773 vertical_size >>= 1; | 2791 vertical_size >>= 1; |
2774 src_stride >>= 1; | 2792 src_stride >>= 1; |
2775 dst_stride >>= 1; | 2793 dst_stride >>= 1; |
2776 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00); | 2794 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00); |
2795 // mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER | | |
2796 // MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER); | |
2777 | 2797 |
2778 if(1) | 2798 if(1) |
2779 { | 2799 { |
2780 postProcess(src[1], src_stride, dst[1], dst_stride, | 2800 postProcess(src[1], src_stride, dst[1], dst_stride, |
2781 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); | 2801 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); |
3086 } | 3106 } |
3087 | 3107 |
3088 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF; | 3108 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF; |
3089 else QPCorrecture= 256; | 3109 else QPCorrecture= 256; |
3090 | 3110 |
3091 /* copy first row of 8x8 blocks */ | 3111 /* line before the first one */ |
3092 for(x=0; x<width; x+=BLOCK_SIZE) | 3112 y=-BLOCK_SIZE; |
3093 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); | 3113 { |
3114 //1% speedup if these are here instead of the inner loop | |
3115 uint8_t *srcBlock= &(src[y*srcStride]); | |
3116 uint8_t *dstBlock= &(dst[y*dstStride]); | |
3117 | |
3118 dstBlock= tempDst + dstStride; | |
3119 | |
3120 // From this point on it is guaranteed that we can read and write 16 lines downward | |
3121 // finish 1 block before the next otherwise we might have a problem | |
3122 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
3123 for(x=0; x<width; x+=BLOCK_SIZE) | |
3124 { | |
3125 | |
3126 #ifdef HAVE_MMX2 | |
3127 /* | |
3128 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
3129 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
3130 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
3131 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
3132 */ | |
3133 /* | |
3134 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
3135 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
3136 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
3137 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
3138 */ | |
3139 | |
3140 asm( | |
3141 "movl %4, %%eax \n\t" | |
3142 "shrl $2, %%eax \n\t" | |
3143 "andl $6, %%eax \n\t" | |
3144 "addl $8, %%eax \n\t" | |
3145 "movl %%eax, %%ebx \n\t" | |
3146 "imul %1, %%eax \n\t" | |
3147 "imul %3, %%ebx \n\t" | |
3148 "prefetchnta 32(%%eax, %0) \n\t" | |
3149 "prefetcht0 32(%%ebx, %2) \n\t" | |
3150 "addl %1, %%eax \n\t" | |
3151 "addl %3, %%ebx \n\t" | |
3152 "prefetchnta 32(%%eax, %0) \n\t" | |
3153 "prefetcht0 32(%%ebx, %2) \n\t" | |
3154 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | |
3155 "m" (x) | |
3156 : "%eax", "%ebx" | |
3157 ); | |
3158 | |
3159 #elif defined(HAVE_3DNOW) | |
3160 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
3161 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
3162 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
3163 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
3164 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
3165 */ | |
3166 #endif | |
3167 | |
3168 blockCopy(dstBlock + dstStride*8, dstStride, | |
3169 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); | |
3170 | |
3171 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
3172 deInterlaceInterpolateLinear(dstBlock, dstStride); | |
3173 else if(mode & LINEAR_BLEND_DEINT_FILTER) | |
3174 deInterlaceBlendLinear(dstBlock, dstStride); | |
3175 else if(mode & MEDIAN_DEINT_FILTER) | |
3176 deInterlaceMedian(dstBlock, dstStride); | |
3177 else if(mode & CUBIC_IPOL_DEINT_FILTER) | |
3178 deInterlaceInterpolateCubic(dstBlock, dstStride); | |
3179 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | |
3180 deInterlaceBlendCubic(dstBlock, dstStride); | |
3181 */ | |
3182 dstBlock+=8; | |
3183 srcBlock+=8; | |
3184 } | |
3185 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride ); | |
3186 } | |
3094 | 3187 |
3095 for(y=0; y<height; y+=BLOCK_SIZE) | 3188 for(y=0; y<height; y+=BLOCK_SIZE) |
3096 { | 3189 { |
3097 //1% speedup if these are here instead of the inner loop | 3190 //1% speedup if these are here instead of the inner loop |
3098 uint8_t *srcBlock= &(src[y*srcStride]); | 3191 uint8_t *srcBlock= &(src[y*srcStride]); |
3106 #endif | 3199 #endif |
3107 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards | 3200 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3108 if not then use a temporary buffer */ | 3201 if not then use a temporary buffer */ |
3109 if(y+15 >= height) | 3202 if(y+15 >= height) |
3110 { | 3203 { |
3111 /* copy from line 5 to 12 of src, these will be copied with | 3204 /* copy from line 8 to 15 of src, these will be copied with |
3112 blockcopy to dst later */ | 3205 blockcopy to dst later */ |
3113 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5, | 3206 memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8, |
3114 srcStride*MAX(height-y-5, 0) ); | 3207 srcStride*MAX(height-y-8, 0) ); |
3115 | 3208 |
3116 /* duplicate last line to fill the void up to line 12 */ | 3209 /* duplicate last line to fill the void up to line 15 */ |
3117 if(y+12 >= height) | 3210 if(y+15 >= height) |
3118 { | 3211 { |
3119 int i; | 3212 int i; |
3120 for(i=height-y; i<=12; i++) | 3213 for(i=height-y; i<=15; i++) |
3121 memcpy(tempSrc + srcStride*i, | 3214 memcpy(tempSrc + srcStride*i, |
3122 src + srcStride*(height-1), srcStride); | 3215 src + srcStride*(height-1), srcStride); |
3123 } | 3216 } |
3124 | 3217 |
3125 | 3218 /* copy up to 9 lines of dst */ |
3126 /* copy up to 6 lines of dst */ | 3219 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) ); |
3127 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 6) ); | |
3128 dstBlock= tempDst + dstStride; | 3220 dstBlock= tempDst + dstStride; |
3129 srcBlock= tempSrc; | 3221 srcBlock= tempSrc; |
3130 } | 3222 } |
3131 | 3223 |
3132 // From this point on it is guaranteed that we can read and write 16 lines downward | 3224 // From this point on it is guaranteed that we can read and write 16 lines downward |
3188 | 3280 |
3189 asm( | 3281 asm( |
3190 "movl %4, %%eax \n\t" | 3282 "movl %4, %%eax \n\t" |
3191 "shrl $2, %%eax \n\t" | 3283 "shrl $2, %%eax \n\t" |
3192 "andl $6, %%eax \n\t" | 3284 "andl $6, %%eax \n\t" |
3193 "addl $5, %%eax \n\t" | 3285 "addl $8, %%eax \n\t" |
3194 "movl %%eax, %%ebx \n\t" | 3286 "movl %%eax, %%ebx \n\t" |
3195 "imul %1, %%eax \n\t" | 3287 "imul %1, %%eax \n\t" |
3196 "imul %3, %%ebx \n\t" | 3288 "imul %3, %%ebx \n\t" |
3197 "prefetchnta 32(%%eax, %0) \n\t" | 3289 "prefetchnta 32(%%eax, %0) \n\t" |
3198 "prefetcht0 32(%%ebx, %2) \n\t" | 3290 "prefetcht0 32(%%ebx, %2) \n\t" |
3231 dstBlock= tempDstBlock; | 3323 dstBlock= tempDstBlock; |
3232 srcBlock= tempSrcBlock; | 3324 srcBlock= tempSrcBlock; |
3233 } | 3325 } |
3234 #endif | 3326 #endif |
3235 | 3327 |
3236 blockCopy(dstBlock + dstStride*5, dstStride, | 3328 blockCopy(dstBlock + dstStride*8, dstStride, |
3237 srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX); | 3329 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); |
3238 | 3330 |
3239 if(mode & LINEAR_IPOL_DEINT_FILTER) | 3331 if(mode & LINEAR_IPOL_DEINT_FILTER) |
3240 deInterlaceInterpolateLinear(dstBlock, dstStride); | 3332 deInterlaceInterpolateLinear(dstBlock, dstStride); |
3241 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 3333 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
3242 deInterlaceBlendLinear(dstBlock, dstStride); | 3334 deInterlaceBlendLinear(dstBlock, dstStride); |
3359 tempBlock1= tempBlock2; | 3451 tempBlock1= tempBlock2; |
3360 tempBlock2 = tmpXchg; | 3452 tempBlock2 = tmpXchg; |
3361 #endif | 3453 #endif |
3362 } | 3454 } |
3363 | 3455 |
3364 /* did we use a tmp buffer */ | 3456 /* did we use a tmp buffer for the last lines*/ |
3365 if(y+15 >= height) | 3457 if(y+15 >= height) |
3366 { | 3458 { |
3367 uint8_t *dstBlock= &(dst[y*dstStride]); | 3459 uint8_t *dstBlock= &(dst[y*dstStride]); |
3368 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); | 3460 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); |
3369 } | 3461 } |