# HG changeset patch # User michael # Date 1002839745 0 # Node ID e57b1d38d71fc4d92f4d3c5c6636abc8405729fe # Parent 29ac11dc53d31b8ce4fa4586350d6fd74d7a13b2 bugfixes: last 3 lines not brightness/contrast corrected brightness statistics messed up with initial black pic changed initial values of the brightness statistics C++ -> C conversion QP range question solved (very likely 1<=QP<=32 according to arpi) new experimental vertical deblocking filter RK filter has 3dNow support now (untested) diff -r 29ac11dc53d3 -r e57b1d38d71f libpostproc/postprocess.c --- a/libpostproc/postprocess.c Wed Oct 10 22:21:19 2001 +0000 +++ b/libpostproc/postprocess.c Thu Oct 11 22:35:45 2001 +0000 @@ -27,6 +27,9 @@ doHorizLowPass E a a* doHorizDefFilter E ac ac deRing +RKAlgo1 E a a* +X1 a E E* + * i dont have a 3dnow CPU -> its untested E = Exact implementation @@ -41,11 +44,13 @@ reduce the time wasted on the mem transfer implement dering implement everything in C at least (done at the moment but ...) -figure range of QP out (assuming <256 for now) unroll stuff if instructions depend too much on the prior one we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? move YScale thing to the end instead of fixing QP write a faster and higher quality deblocking filter :) +do something about the speed of the horizontal filters +make the mainloop more flexible (variable number of blocks at once + (the if/else stuff per block is slowing things down) ... 
Notes: @@ -54,6 +59,14 @@ /* Changelog: +0.1.3 + bugfixes: last 3 lines not brightness/contrast corrected + brightness statistics messed up with initial black pic + changed initial values of the brightness statistics + C++ -> C conversation + QP range question solved (very likely 1<=QP<=32 according to arpi) + new experimental vertical deblocking filter + RK filter has 3dNow support now (untested) 0.1.2 fixed a bug in the horizontal default filter 3dnow version of the Horizontal & Vertical Lowpass filters @@ -66,6 +79,7 @@ #include #include +#include #include "../config.h" //#undef HAVE_MMX2 //#define HAVE_3DNOW @@ -160,9 +174,10 @@ /** * Check if the middle 8x8 Block in the given 8x10 block is flat */ -static inline bool isVertDC(uint8_t src[], int stride){ +static inline int isVertDC(uint8_t src[], int stride){ // return true; int numEq= 0; + int y; src+= stride; // src points to begin of the 8x8 Block #ifdef HAVE_MMX asm volatile( @@ -242,7 +257,7 @@ // uint8_t *temp= src; #else - for(int y=0; y vFlatnessThreshold; +// for(int i=0; i vFlatnessThreshold) ? 1 : 0; } -static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP) +static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) { #ifdef HAVE_MMX int isOk; @@ -295,13 +311,14 @@ : "=r" (isOk) : "r" (src), "r" (stride) ); - return isOk; + return isOk ? 
1 : 0; #else - int isOk2= true; - for(int x=0; x 2*QP) isOk2=false; + if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; } /* if(isOk && !isOk2 || !isOk && isOk2) { @@ -484,8 +501,8 @@ const int l7= stride + l6; const int l8= stride + l7; const int l9= stride + l8; - - for(int x=0; x hFlatnessThreshold; } -static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP) +static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) { #ifdef MMX_FIXME FIXME @@ -1071,9 +1199,9 @@ ); return isOk; #else - if(abs(src[0] - src[7]) > 2*QP) return false; + if(abs(src[0] - src[7]) > 2*QP) return 0; - return true; + return 1; #endif } @@ -1173,7 +1301,8 @@ #else uint8_t *src= tempBlock; - for(int y=0; y>= 1; vertical_size >>= 1; @@ -1512,9 +1642,9 @@ if(1) { postProcess(src[1], src_stride, dst[1], dst_stride, - horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); + horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); postProcess(src[2], src_stride, dst[2], dst_stride, - horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); + horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); } else { @@ -1543,11 +1673,19 @@ /** * Copies a block from src to dst and fixes the blacklevel + * numLines must be a multiple of 4 + * levelFix == 0 -> dont touch the brighness & contrast */ -static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) +static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, + int numLines, int levelFix) { + int i; + if(levelFix) + { #ifdef HAVE_MMX asm volatile( + "movl %4, %%eax \n\t" + "movl %%eax, temp0\n\t" "pushl %0 \n\t" "pushl %1 \n\t" "leal (%2,%2), %%eax \n\t" @@ -1555,14 +1693,6 @@ "movq packedYOffset, %%mm2 \n\t" "movq packedYScale, %%mm3 \n\t" -#define SIMPLE_CPY \ - "movq (%0), %%mm0 \n\t"\ - "movq (%0,%2), %%mm1 \n\t"\ - "psubusb %%mm2, %%mm0 \n\t"\ - "psubusb %%mm2, %%mm1 \n\t"\ - "movq %%mm0, (%1) 
\n\t"\ - "movq %%mm1, (%1, %3) \n\t"\ - #define SCALED_CPY \ "movq (%0), %%mm0 \n\t"\ "movq (%0,%2), %%mm1 \n\t"\ @@ -1585,33 +1715,75 @@ "packuswb %%mm5, %%mm4 \n\t"\ "movq %%mm4, (%1, %3) \n\t"\ - -#define CPY SCALED_CPY -//#define CPY SIMPLE_CPY -// "prefetchnta 8(%0)\n\t" -CPY + "1: \n\t" +SCALED_CPY "addl %%eax, %0 \n\t" "addl %%ebx, %1 \n\t" -CPY +SCALED_CPY "addl %%eax, %0 \n\t" "addl %%ebx, %1 \n\t" -CPY - "addl %%eax, %0 \n\t" - "addl %%ebx, %1 \n\t" -CPY + "decl temp0 \n\t" + "jnz 1b \n\t" + "popl %1 \n\t" "popl %0 \n\t" : : "r" (src), "r" (dst), "r" (srcStride), - "r" (dstStride) + "r" (dstStride), + "m" (numLines>>2) : "%eax", "%ebx" ); #else - for(int i=0; i>2) + : "%eax", "%ebx" + ); +#else + for(i=0; i0; black--) { if(clipped < maxClipped) break; @@ -1665,9 +1854,9 @@ packedYOffset|= packedYOffset<<16; packedYOffset|= packedYOffset<<8; - double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); + scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); - packedYScale= uint16_t(scale*256.0 + 0.5); + packedYScale= (uint16_t)(scale*256.0 + 0.5); packedYScale|= packedYScale<<32; packedYScale|= packedYScale<<16; } @@ -1677,10 +1866,10 @@ packedYOffset= 0; } - for(int x=0; x>3)*QPStride + (x>>3)]: (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8; @@ -1707,7 +1897,6 @@ #endif - const int stride= dstStride; if(y + 12 < height) { #ifdef MORE_TIMEING @@ -1730,7 +1919,7 @@ if(!isColor) yHistogram[ srcBlock[0] ]++; blockCopy(vertBlock + dstStride*2, dstStride, - vertSrcBlock + srcStride*2, srcStride); + vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX); #ifdef MORE_TIMEING @@ -1742,7 +1931,7 @@ { if(mode & RK_FILTER) vertRKFilter(vertBlock, stride, QP); - else if(0) + else if(mode & X1_FILTER) vertX1Filter(vertBlock, stride, QP); else { @@ -1762,12 +1951,9 @@ #endif } else - { - for(int i=2; i= 0 && x void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, - QP_STORE_T 
QPs[], int QPStride, bool isColor, int mode); + QP_STORE_T QPs[], int QPStride, int isColor, int mode); #endif #ifdef __cplusplus diff -r 29ac11dc53d3 -r e57b1d38d71f libpostproc/postprocess_template.c --- a/libpostproc/postprocess_template.c Wed Oct 10 22:21:19 2001 +0000 +++ b/libpostproc/postprocess_template.c Thu Oct 11 22:35:45 2001 +0000 @@ -27,6 +27,9 @@ doHorizLowPass E a a* doHorizDefFilter E ac ac deRing +RKAlgo1 E a a* +X1 a E E* + * i dont have a 3dnow CPU -> its untested E = Exact implementation @@ -41,11 +44,13 @@ reduce the time wasted on the mem transfer implement dering implement everything in C at least (done at the moment but ...) -figure range of QP out (assuming <256 for now) unroll stuff if instructions depend too much on the prior one we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? move YScale thing to the end instead of fixing QP write a faster and higher quality deblocking filter :) +do something about the speed of the horizontal filters +make the mainloop more flexible (variable number of blocks at once + (the if/else stuff per block is slowing things down) ... 
Notes: @@ -54,6 +59,14 @@ /* Changelog: +0.1.3 + bugfixes: last 3 lines not brightness/contrast corrected + brightness statistics messed up with initial black pic + changed initial values of the brightness statistics + C++ -> C conversation + QP range question solved (very likely 1<=QP<=32 according to arpi) + new experimental vertical deblocking filter + RK filter has 3dNow support now (untested) 0.1.2 fixed a bug in the horizontal default filter 3dnow version of the Horizontal & Vertical Lowpass filters @@ -66,6 +79,7 @@ #include #include +#include #include "../config.h" //#undef HAVE_MMX2 //#define HAVE_3DNOW @@ -160,9 +174,10 @@ /** * Check if the middle 8x8 Block in the given 8x10 block is flat */ -static inline bool isVertDC(uint8_t src[], int stride){ +static inline int isVertDC(uint8_t src[], int stride){ // return true; int numEq= 0; + int y; src+= stride; // src points to begin of the 8x8 Block #ifdef HAVE_MMX asm volatile( @@ -242,7 +257,7 @@ // uint8_t *temp= src; #else - for(int y=0; y vFlatnessThreshold; +// for(int i=0; i vFlatnessThreshold) ? 1 : 0; } -static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP) +static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) { #ifdef HAVE_MMX int isOk; @@ -295,13 +311,14 @@ : "=r" (isOk) : "r" (src), "r" (stride) ); - return isOk; + return isOk ? 
1 : 0; #else - int isOk2= true; - for(int x=0; x 2*QP) isOk2=false; + if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; } /* if(isOk && !isOk2 || !isOk && isOk2) { @@ -484,8 +501,8 @@ const int l7= stride + l6; const int l8= stride + l7; const int l9= stride + l8; - - for(int x=0; x hFlatnessThreshold; } -static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP) +static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) { #ifdef MMX_FIXME FIXME @@ -1071,9 +1199,9 @@ ); return isOk; #else - if(abs(src[0] - src[7]) > 2*QP) return false; + if(abs(src[0] - src[7]) > 2*QP) return 0; - return true; + return 1; #endif } @@ -1173,7 +1301,8 @@ #else uint8_t *src= tempBlock; - for(int y=0; y>= 1; vertical_size >>= 1; @@ -1512,9 +1642,9 @@ if(1) { postProcess(src[1], src_stride, dst[1], dst_stride, - horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); + horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); postProcess(src[2], src_stride, dst[2], dst_stride, - horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); + horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); } else { @@ -1543,11 +1673,19 @@ /** * Copies a block from src to dst and fixes the blacklevel + * numLines must be a multiple of 4 + * levelFix == 0 -> dont touch the brighness & contrast */ -static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) +static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, + int numLines, int levelFix) { + int i; + if(levelFix) + { #ifdef HAVE_MMX asm volatile( + "movl %4, %%eax \n\t" + "movl %%eax, temp0\n\t" "pushl %0 \n\t" "pushl %1 \n\t" "leal (%2,%2), %%eax \n\t" @@ -1555,14 +1693,6 @@ "movq packedYOffset, %%mm2 \n\t" "movq packedYScale, %%mm3 \n\t" -#define SIMPLE_CPY \ - "movq (%0), %%mm0 \n\t"\ - "movq (%0,%2), %%mm1 \n\t"\ - "psubusb %%mm2, %%mm0 \n\t"\ - "psubusb %%mm2, %%mm1 \n\t"\ - "movq %%mm0, (%1) 
\n\t"\ - "movq %%mm1, (%1, %3) \n\t"\ - #define SCALED_CPY \ "movq (%0), %%mm0 \n\t"\ "movq (%0,%2), %%mm1 \n\t"\ @@ -1585,33 +1715,75 @@ "packuswb %%mm5, %%mm4 \n\t"\ "movq %%mm4, (%1, %3) \n\t"\ - -#define CPY SCALED_CPY -//#define CPY SIMPLE_CPY -// "prefetchnta 8(%0)\n\t" -CPY + "1: \n\t" +SCALED_CPY "addl %%eax, %0 \n\t" "addl %%ebx, %1 \n\t" -CPY +SCALED_CPY "addl %%eax, %0 \n\t" "addl %%ebx, %1 \n\t" -CPY - "addl %%eax, %0 \n\t" - "addl %%ebx, %1 \n\t" -CPY + "decl temp0 \n\t" + "jnz 1b \n\t" + "popl %1 \n\t" "popl %0 \n\t" : : "r" (src), "r" (dst), "r" (srcStride), - "r" (dstStride) + "r" (dstStride), + "m" (numLines>>2) : "%eax", "%ebx" ); #else - for(int i=0; i>2) + : "%eax", "%ebx" + ); +#else + for(i=0; i0; black--) { if(clipped < maxClipped) break; @@ -1665,9 +1854,9 @@ packedYOffset|= packedYOffset<<16; packedYOffset|= packedYOffset<<8; - double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); + scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); - packedYScale= uint16_t(scale*256.0 + 0.5); + packedYScale= (uint16_t)(scale*256.0 + 0.5); packedYScale|= packedYScale<<32; packedYScale|= packedYScale<<16; } @@ -1677,10 +1866,10 @@ packedYOffset= 0; } - for(int x=0; x>3)*QPStride + (x>>3)]: (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8; @@ -1707,7 +1897,6 @@ #endif - const int stride= dstStride; if(y + 12 < height) { #ifdef MORE_TIMEING @@ -1730,7 +1919,7 @@ if(!isColor) yHistogram[ srcBlock[0] ]++; blockCopy(vertBlock + dstStride*2, dstStride, - vertSrcBlock + srcStride*2, srcStride); + vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX); #ifdef MORE_TIMEING @@ -1742,7 +1931,7 @@ { if(mode & RK_FILTER) vertRKFilter(vertBlock, stride, QP); - else if(0) + else if(mode & X1_FILTER) vertX1Filter(vertBlock, stride, QP); else { @@ -1762,12 +1951,9 @@ #endif } else - { - for(int i=2; i= 0 && x