# HG changeset patch
# User michael
# Date 1002839745 0
# Node ID 21a8f158d19fbfe6e7541bc7bcb1f7de70459131
# Parent  c138a19bae77dfce750e6c8ed72a52c2ba694e35
bugfixes: last 3 lines not brightness/contrast corrected
	brightness statistics messed up with initial black pic
changed initial values of the brightness statistics
C++ -> C conversation
QP range question solved (very likely 1<=QP<=32 according to arpi)
new experimental vertical deblocking filter
RK filter has 3dNow support now (untested)

diff -r c138a19bae77 -r 21a8f158d19f postproc/postprocess.c
--- a/postproc/postprocess.c	Thu Oct 11 16:17:47 2001 +0000
+++ b/postproc/postprocess.c	Thu Oct 11 22:35:45 2001 +0000
@@ -27,6 +27,9 @@
 doHorizLowPass		E		a	a*
 doHorizDefFilter	E	ac	ac
 deRing
+RKAlgo1			E		a	a*
+X1			a		E	E*
+
 
 * i dont have a 3dnow CPU -> its untested
 E = Exact implementation
@@ -41,11 +44,13 @@
 reduce the time wasted on the mem transfer
 implement dering
 implement everything in C at least (done at the moment but ...)
-figure range of QP out (assuming <256 for now)
 unroll stuff if instructions depend too much on the prior one
 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
 move YScale thing to the end instead of fixing QP
 write a faster and higher quality deblocking filter :)
+do something about the speed of the horizontal filters
+make the mainloop more flexible (variable number of blocks at once
+	(the if/else stuff per block is slowing things down)
 ...
 
 Notes:
@@ -54,6 +59,14 @@
 
 /*
 Changelog:
+0.1.3
+	bugfixes: last 3 lines not brightness/contrast corrected
+		brightness statistics messed up with initial black pic
+	changed initial values of the brightness statistics
+	C++ -> C conversation
+	QP range question solved (very likely 1<=QP<=32 according to arpi)
+	new experimental vertical deblocking filter
+	RK filter has 3dNow support now (untested)
 0.1.2
 	fixed a bug in the horizontal default filter
 	3dnow version of the Horizontal & Vertical Lowpass filters
@@ -66,6 +79,7 @@
 
 #include <inttypes.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include "../config.h"
 //#undef HAVE_MMX2
 //#define HAVE_3DNOW
@@ -160,9 +174,10 @@
 /**
  * Check if the middle 8x8 Block in the given 8x10 block is flat
  */
-static inline bool isVertDC(uint8_t src[], int stride){
+static inline int isVertDC(uint8_t src[], int stride){
 //	return true;
 	int numEq= 0;
+	int y;
 	src+= stride; // src points to begin of the 8x8 Block
 #ifdef HAVE_MMX
 	asm volatile(
@@ -242,7 +257,7 @@
 //	uint8_t *temp= src;
 
 #else
-	for(int y=0; y<BLOCK_SIZE-1; y++)
+	for(y=0; y<BLOCK_SIZE-1; y++)
 	{
 		if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
 		if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
@@ -268,10 +283,11 @@
 		}
 	}
 */
-	return numEq > vFlatnessThreshold;
+//	for(int i=0; i<numEq/8; i++) src[i]=255;
+	return (numEq > vFlatnessThreshold) ? 1 : 0;
 }
 
-static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP)
+static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
 {
 #ifdef HAVE_MMX
 	int isOk;
@@ -295,13 +311,14 @@
 		: "=r" (isOk)
 		: "r" (src), "r" (stride)
 		);
-	return isOk;
+	return isOk ? 1 : 0;
 #else
 
-	int isOk2= true;
-	for(int x=0; x<BLOCK_SIZE; x++)
+	int isOk2= 1;
+	int x;
+	for(x=0; x<BLOCK_SIZE; x++)
 	{
-		if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=false;
+		if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
 	}
 /*	if(isOk && !isOk2 || !isOk && isOk2)
 	{
@@ -484,8 +501,8 @@
 	const int l7= stride + l6;
 	const int l8= stride + l7;
 	const int l9= stride + l8;
-
-	for(int x=0; x<BLOCK_SIZE; x++)
+	int x;
+	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
 		const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
@@ -529,7 +546,7 @@
  */
 static inline void vertRKFilter(uint8_t *src, int stride, int QP)
 {
-#ifdef HAVE_MMX2
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 // FIXME rounding
 	asm volatile(
 		"pxor %%mm7, %%mm7				\n\t" // 0
@@ -549,7 +566,7 @@
 		"movq %%mm2, %%mm4				\n\t" // line 4
 		"pcmpeqb %%mm5, %%mm5				\n\t" // -1
 		"pxor %%mm2, %%mm5				\n\t" // -line 4 - 1
-		"pavgb %%mm3, %%mm5				\n\t"
+		PAVGB(%%mm3, %%mm5)
 		"paddb %%mm6, %%mm5				\n\t" // (l5-l4)/2
 		"psubusb %%mm3, %%mm4				\n\t"
 		"psubusb %%mm2, %%mm3				\n\t"
@@ -600,16 +617,18 @@
 	const int l7= stride + l6;
 	const int l8= stride + l7;
 	const int l9= stride + l8;
-	for(int x=0; x<BLOCK_SIZE; x++)
+	int x;
+	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		if(ABS(src[l4]-src[l5]) < QP + QP/4)
 		{
-			int x = src[l5] - src[l4];
+			int v = (src[l5] - src[l4]);
 
-			src[l3] +=x/8;
-			src[l4] +=x/2;
-			src[l5] -=x/2;
-			src[l6] -=x/8;
+			src[l3] +=v/8;
+			src[l4] +=v/2;
+			src[l5] -=v/2;
+			src[l6] -=v/8;
+
 		}
 		src++;
 	}
@@ -619,18 +638,126 @@
 
 /**
  * Experimental Filter 1
+ * will nor damage linear gradients
+ * can only smooth blocks at the expected locations (it cant smooth them if they did move)
+ * MMX2 version does correct clipping C version doesnt
  */
 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
 {
-#ifdef HAVE_MMX2X
-// FIXME
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 	asm volatile(
+		"pxor %%mm7, %%mm7				\n\t" // 0
+//		"movq b80, %%mm6				\n\t" // MIN_SIGNED_BYTE
+		"leal (%0, %1), %%eax				\n\t"
+		"leal (%%eax, %1, 4), %%ebx			\n\t"
+//	0	1	2	3	4	5	6	7	8	9
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
+		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
+		"movq (%0, %1, 4), %%mm1			\n\t" // line 4
+		"movq %%mm1, %%mm2				\n\t" // line 4
+		"psubusb %%mm0, %%mm1				\n\t"
+		"psubusb %%mm2, %%mm0				\n\t"
+		"por %%mm1, %%mm0				\n\t" // |l2 - l3|
+		"movq (%%ebx), %%mm3				\n\t" // line 5
+		"movq (%%ebx, %1), %%mm4				\n\t" // line 6
+		"movq %%mm3, %%mm5				\n\t" // line 5
+		"psubusb %%mm4, %%mm3				\n\t"
+		"psubusb %%mm5, %%mm4				\n\t"
+		"por %%mm4, %%mm3				\n\t" // |l5 - l6|
+		PAVGB(%%mm3, %%mm0)				      // (|l2 - l3| + |l5 - l6|)/2
+		"movq %%mm2, %%mm1				\n\t" // line 4
+		"psubusb %%mm5, %%mm2				\n\t"
+		"movq %%mm2, %%mm4				\n\t"
+		"pcmpeqb %%mm7, %%mm2				\n\t" // (l4 - l5) <= 0 ? -1 : 0
+		"psubusb %%mm1, %%mm5				\n\t"
+		"por %%mm5, %%mm4				\n\t" // |l4 - l5|
+		"psubusb %%mm0, %%mm4		\n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
+		"movq %%mm4, %%mm3				\n\t" // d
+		"psubusb pQPb, %%mm4				\n\t"
+		"pcmpeqb %%mm7, %%mm4				\n\t" // d <= QP ? -1 : 0
+		"pand %%mm4, %%mm3				\n\t" // d <= QP ? d : 0
+
+		PAVGB(%%mm7, %%mm3)				      // d/2
+
+		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
+		"psubusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%0, %1, 4)			\n\t" // line 4
+
+		"movq (%%ebx), %%mm0				\n\t" // line 5
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
+		"paddusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%%ebx)				\n\t" // line 5
+
+		PAVGB(%%mm7, %%mm3)				      // d/4
+
+		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
+		"psubusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%%eax, %1, 2)			\n\t" // line 3
+
+		"movq (%%ebx, %1), %%mm0			\n\t" // line 6
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
+		"paddusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%%ebx, %1)			\n\t" // line 6
+
+		PAVGB(%%mm7, %%mm3)				      // d/8
+
+		"movq (%%eax, %1), %%mm0			\n\t" // line 2
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
+		"psubusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%%eax, %1)			\n\t" // line 2
+
+		"movq (%%ebx, %1, 2), %%mm0			\n\t" // line 7
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
+		"paddusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%%ebx, %1, 2)			\n\t" // line 7
 
 		:
 		: "r" (src), "r" (stride)
 		: "%eax", "%ebx"
 	);
 #else
+
+ 	const int l1= stride;
+	const int l2= stride + l1;
+	const int l3= stride + l2;
+	const int l4= stride + l3;
+	const int l5= stride + l4;
+	const int l6= stride + l5;
+	const int l7= stride + l6;
+	const int l8= stride + l7;
+	const int l9= stride + l8;
+	int x;
+	for(x=0; x<BLOCK_SIZE; x++)
+	{
+		int a= src[l3] - src[l4];
+		int b= src[l4] - src[l5];
+		int c= src[l6] - src[l7];
+
+		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
+
+		if(d < QP)
+		{
+			int v = d * SIGN(-b);
+
+			src[l2] +=v/8;
+			src[l3] +=v/4;
+			src[l4] +=v/2;
+			src[l5] -=v/2;
+			src[l6] -=v/4;
+			src[l7] -=v/8;
+
+		}
+		src++;
+	}
+	/*
  	const int l1= stride;
 	const int l2= stride + l1;
 	const int l3= stride + l2;
@@ -658,7 +785,7 @@
 		}
 		src++;
 	}
-
+*/
 #endif
 }
 
@@ -908,8 +1035,8 @@
 	const int l7= stride + l6;
 	const int l8= stride + l7;
 //	const int l9= stride + l8;
-
-	for(int x=0; x<BLOCK_SIZE; x++)
+	int x;
+	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
 		if(ABS(middleEnergy) < 8*QP)
@@ -947,7 +1074,7 @@
 /**
  * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
  */
-static inline bool isHorizDCAndCopy2Temp(uint8_t src[], int stride)
+static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
 {
 //	src++;
 	int numEq= 0;
@@ -1007,7 +1134,8 @@
 //	printf("%d\n", numEq);
 	numEq= (256 - (numEq & 0xFF)) &0xFF;
 #else
-	for(int y=0; y<BLOCK_SIZE; y++)
+	int y;
+	for(y=0; y<BLOCK_SIZE; y++)
 	{
 		if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
 		if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
@@ -1044,7 +1172,7 @@
 	return numEq > hFlatnessThreshold;
 }
 
-static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP)
+static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
 {
 #ifdef MMX_FIXME
 FIXME
@@ -1071,9 +1199,9 @@
 		);
 	return isOk;
 #else
-	if(abs(src[0] - src[7]) > 2*QP) return false;
+	if(abs(src[0] - src[7]) > 2*QP) return 0;
 
-	return true;
+	return 1;
 #endif
 }
 
@@ -1173,7 +1301,8 @@
 #else
 	uint8_t *src= tempBlock;
 
-	for(int y=0; y<BLOCK_SIZE; y++)
+	int y;
+	for(y=0; y<BLOCK_SIZE; y++)
 	{
 		dst[0] = src[0];
 		dst[1] = src[1];
@@ -1375,7 +1504,8 @@
 
 #else
 	uint8_t *temp= tempBlock;
-	for(int y=0; y<BLOCK_SIZE; y++)
+	int y;
+	for(y=0; y<BLOCK_SIZE; y++)
 	{
 		const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
 		const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
@@ -1502,7 +1632,7 @@
 	return;
 */
 	postProcess(src[0], src_stride, dst[0], dst_stride,
-		horizontal_size, vertical_size, QP_store, QP_stride, false, mode);
+		horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
 
 	horizontal_size >>= 1;
 	vertical_size   >>= 1;
@@ -1512,9 +1642,9 @@
 	if(1)
 	{
 		postProcess(src[1], src_stride, dst[1], dst_stride,
-			horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4);
+			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
 		postProcess(src[2], src_stride, dst[2], dst_stride,
-			horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4);
+			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
 	}
 	else
 	{
@@ -1543,11 +1673,19 @@
 
 /**
  * Copies a block from src to dst and fixes the blacklevel
+ * numLines must be a multiple of 4
+ * levelFix == 0 -> dont touch the brighness & contrast
  */
-static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride)
+static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
+	int numLines, int levelFix)
 {
+	int i;
+	if(levelFix)
+	{
 #ifdef HAVE_MMX
 					asm volatile(
+						"movl %4, %%eax \n\t"
+						"movl %%eax, temp0\n\t"
 						"pushl %0 \n\t"
 						"pushl %1 \n\t"
 						"leal (%2,%2), %%eax	\n\t"
@@ -1555,14 +1693,6 @@
 						"movq packedYOffset, %%mm2	\n\t"
 						"movq packedYScale, %%mm3	\n\t"
 
-#define SIMPLE_CPY					\
-						"movq (%0), %%mm0	\n\t"\
-						"movq (%0,%2), %%mm1	\n\t"\
-						"psubusb %%mm2, %%mm0	\n\t"\
-						"psubusb %%mm2, %%mm1	\n\t"\
-						"movq %%mm0, (%1)	\n\t"\
-						"movq %%mm1, (%1, %3)	\n\t"\
-
 #define SCALED_CPY					\
 						"movq (%0), %%mm0	\n\t"\
 						"movq (%0,%2), %%mm1	\n\t"\
@@ -1585,33 +1715,75 @@
 						"packuswb %%mm5, %%mm4	\n\t"\
 						"movq %%mm4, (%1, %3)	\n\t"\
 
-
-#define CPY SCALED_CPY
-//#define CPY SIMPLE_CPY
-//				"prefetchnta 8(%0)\n\t"
-CPY
+						"1:			\n\t"
+SCALED_CPY
 						"addl %%eax, %0		\n\t"
 						"addl %%ebx, %1		\n\t"
-CPY
+SCALED_CPY
 						"addl %%eax, %0		\n\t"
 						"addl %%ebx, %1		\n\t"
-CPY
-						"addl %%eax, %0		\n\t"
-						"addl %%ebx, %1		\n\t"
-CPY
+						"decl temp0		\n\t"
+						"jnz 1b			\n\t"
+
 						"popl %1 \n\t"
 						"popl %0 \n\t"
 						: : "r" (src),
 						"r" (dst),
 						"r" (srcStride),
-						"r" (dstStride)
+						"r" (dstStride),
+						"m" (numLines>>2)
 						: "%eax", "%ebx"
 					);
 #else
-				for(int i=0; i<BLOCK_SIZE; i++) // last 10x8 Block is copied allready so +2
+				for(i=0; i<numLines; i++)
 					memcpy(	&(dst[dstStride*i]),
 						&(src[srcStride*i]), BLOCK_SIZE);
 #endif
+	}
+	else
+	{
+#ifdef HAVE_MMX
+					asm volatile(
+						"movl %4, %%eax \n\t"
+						"movl %%eax, temp0\n\t"
+						"pushl %0 \n\t"
+						"pushl %1 \n\t"
+						"leal (%2,%2), %%eax	\n\t"
+						"leal (%3,%3), %%ebx	\n\t"
+						"movq packedYOffset, %%mm2	\n\t"
+						"movq packedYScale, %%mm3	\n\t"
+
+#define SIMPLE_CPY					\
+						"movq (%0), %%mm0	\n\t"\
+						"movq (%0,%2), %%mm1	\n\t"\
+						"movq %%mm0, (%1)	\n\t"\
+						"movq %%mm1, (%1, %3)	\n\t"\
+
+						"1:			\n\t"
+SIMPLE_CPY
+						"addl %%eax, %0		\n\t"
+						"addl %%ebx, %1		\n\t"
+SIMPLE_CPY
+						"addl %%eax, %0		\n\t"
+						"addl %%ebx, %1		\n\t"
+						"decl temp0		\n\t"
+						"jnz 1b			\n\t"
+
+						"popl %1 \n\t"
+						"popl %0 \n\t"
+						: : "r" (src),
+						"r" (dst),
+						"r" (srcStride),
+						"r" (dstStride),
+						"m" (numLines>>2)
+						: "%eax", "%ebx"
+					);
+#else
+				for(i=0; i<numLines; i++)
+					memcpy(	&(dst[dstStride*i]),
+						&(src[srcStride*i]), BLOCK_SIZE);
+#endif
+	}
 }
 
 
@@ -1619,33 +1791,50 @@
  * Filters array of bytes (Y or U or V values)
  */
 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
-	QP_STORE_T QPs[], int QPStride, bool isColor, int mode)
+	QP_STORE_T QPs[], int QPStride, int isColor, int mode)
 {
+	int x,y;
+	/* we need 64bit here otherwise we´ll going to have a problem
+	   after watching a black picture for 5 hours*/
+	static uint64_t *yHistogram= NULL;
+	int black=0, white=255; // blackest black and whitest white in the picture
 
 #ifdef TIMEING
 	long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
 	sumTime= rdtsc();
 #endif
 
-	/* we need 64bit here otherwise we´ll going to have a problem
-	   after watching a black picture for 5 hours*/
-	static uint64_t *yHistogram= NULL;
 	if(!yHistogram)
 	{
-		yHistogram= new uint64_t[256];
-		for(int i=0; i<256; i++) yHistogram[i]= width*height/64/256;
+		int i;
+		yHistogram= (uint64_t*)malloc(8*256);
+		for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
 	}
 
-	int black=0, white=255; // blackest black and whitest white in the picture
 	if(!isColor)
 	{
 		uint64_t sum= 0;
-		for(int i=0; i<256; i++)
-			sum+= yHistogram[i];
+		int i;
+		static int framenum= -1;
+		uint64_t maxClipped;
+		uint64_t clipped;
+		double scale;
+
+		framenum++;
+		if(framenum == 1) yHistogram[0]= width*height/64*15/256;
 
-		uint64_t maxClipped= (uint64_t)(sum * maxClippedThreshold);
+		for(i=0; i<256; i++)
+		{
+			sum+= yHistogram[i];
+//			printf("%d ", yHistogram[i]);
+		}
+//		printf("\n\n");
 
-		uint64_t clipped= sum;
+		/* we allways get a completly black picture first */
+
+		maxClipped= (uint64_t)(sum * maxClippedThreshold);
+
+		clipped= sum;
 		for(black=255; black>0; black--)
 		{
 			if(clipped < maxClipped) break;
@@ -1665,9 +1854,9 @@
 		packedYOffset|= packedYOffset<<16;
 		packedYOffset|= packedYOffset<<8;
 
-		double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
+		scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
 
-		packedYScale= uint16_t(scale*256.0 + 0.5);
+		packedYScale= (uint16_t)(scale*256.0 + 0.5);
 		packedYScale|= packedYScale<<32;
 		packedYScale|= packedYScale<<16;
 	}
@@ -1677,10 +1866,10 @@
 		packedYOffset= 0;
 	}
 
-	for(int x=0; x<width; x+=BLOCK_SIZE)
-		blockCopy(dst + x, dstStride, src + x, srcStride);
+	for(x=0; x<width; x+=BLOCK_SIZE)
+		blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
 
-	for(int y=0; y<height; y+=BLOCK_SIZE)
+	for(y=0; y<height; y+=BLOCK_SIZE)
 	{
 		//1% speedup if these are here instead of the inner loop
 		uint8_t *srcBlock= &(src[y*srcStride]);
@@ -1690,8 +1879,9 @@
 
 		// finish 1 block before the next otherwise we´ll might have a problem
 		// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
-		for(int x=0; x<width; x+=BLOCK_SIZE)
+		for(x=0; x<width; x+=BLOCK_SIZE)
 		{
+			const int stride= dstStride;
 			int QP= isColor ?
 				QPs[(y>>3)*QPStride + (x>>3)]:
 				(QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
@@ -1707,7 +1897,6 @@
 #endif
 
 
-			const int stride= dstStride;
 			if(y + 12 < height)
 			{
 #ifdef MORE_TIMEING
@@ -1730,7 +1919,7 @@
 				if(!isColor) yHistogram[ srcBlock[0] ]++;
 
 				blockCopy(vertBlock + dstStride*2, dstStride,
-					vertSrcBlock + srcStride*2, srcStride);
+					vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
 
 
 #ifdef MORE_TIMEING
@@ -1742,7 +1931,7 @@
 				{
 					if(mode & RK_FILTER)
 						vertRKFilter(vertBlock, stride, QP);
-					else if(0)
+					else if(mode & X1_FILTER)
 						vertX1Filter(vertBlock, stride, QP);
 					else
 					{
@@ -1762,12 +1951,9 @@
 #endif
 			}
 			else
-			{
-				for(int i=2; i<BLOCK_SIZE/2+1; i++) // last 10x8 Block is copied allready so +2
-					memcpy(	&(vertBlock[dstStride*i]),
-						&(vertSrcBlock[srcStride*i]), BLOCK_SIZE);
+				blockCopy(vertBlock + dstStride*1, dstStride,
+					vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
 
-			}
 
 			if(x - 8 >= 0 && x<width)
 			{
@@ -1813,8 +1999,8 @@
 	sumTime= rdtsc() - sumTime;
 	if(!isColor)
 		printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
-			int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000),
-			int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000)
+			(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
+			(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
 			, black, white);
 #endif
 }
diff -r c138a19bae77 -r 21a8f158d19f postproc/postprocess.h
--- a/postproc/postprocess.h	Thu Oct 11 16:17:47 2001 +0000
+++ b/postproc/postprocess.h	Thu Oct 11 22:35:45 2001 +0000
@@ -26,6 +26,7 @@
 #define V_DEBLOCK	0x01
 #define H_DEBLOCK	0x02
 #define DERING		0x04
+#define LEVEL_FIX	0x08 /* Brightness & Contrast */
 
 #define LUM_V_DEBLOCK	V_DEBLOCK
 #define LUM_H_DEBLOCK	H_DEBLOCK
@@ -33,12 +34,19 @@
 #define CHROM_H_DEBLOCK	(H_DEBLOCK<<4)
 #define LUM_DERING	DERING
 #define CHROM_DERING	(DERING<<4)
+#define LUM_LEVEL_FIX	LEVEL_FIX
+//not supported currently
+#define CHROM_LEVEL_FIX	(LEVEL_FIX<<4)
 
 // Experimental stuff
 #define RK_FILTER		0x0100
 #define LUM_V_RK_FILTER		RK_FILTER
 #define CHROM_V_RK_FILTER	(RK_FILTER<<4)
 
+#define X1_FILTER		0x0200
+#define LUM_V_X1_FILTER		X1_FILTER
+#define CHROM_V_X1_FILTER	(X1_FILTER<<4)
+
 
 #define TIMEING
 #define MORE_TIMEING
@@ -60,7 +68,7 @@
 //#include <inttypes.h>
 
 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
-	QP_STORE_T QPs[], int QPStride, bool isColor, int mode);
+	QP_STORE_T QPs[], int QPStride, int isColor, int mode);
 #endif
 
 #ifdef __cplusplus
diff -r c138a19bae77 -r 21a8f158d19f postproc/postprocess_template.c
--- a/postproc/postprocess_template.c	Thu Oct 11 16:17:47 2001 +0000
+++ b/postproc/postprocess_template.c	Thu Oct 11 22:35:45 2001 +0000
@@ -27,6 +27,9 @@
 doHorizLowPass		E		a	a*
 doHorizDefFilter	E	ac	ac
 deRing
+RKAlgo1			E		a	a*
+X1			a		E	E*
+
 
 * i dont have a 3dnow CPU -> its untested
 E = Exact implementation
@@ -41,11 +44,13 @@
 reduce the time wasted on the mem transfer
 implement dering
 implement everything in C at least (done at the moment but ...)
-figure range of QP out (assuming <256 for now)
 unroll stuff if instructions depend too much on the prior one
 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
 move YScale thing to the end instead of fixing QP
 write a faster and higher quality deblocking filter :)
+do something about the speed of the horizontal filters
+make the mainloop more flexible (variable number of blocks at once
+	(the if/else stuff per block is slowing things down)
 ...
 
 Notes:
@@ -54,6 +59,14 @@
 
 /*
 Changelog:
+0.1.3
+	bugfixes: last 3 lines not brightness/contrast corrected
+		brightness statistics messed up with initial black pic
+	changed initial values of the brightness statistics
+	C++ -> C conversation
+	QP range question solved (very likely 1<=QP<=32 according to arpi)
+	new experimental vertical deblocking filter
+	RK filter has 3dNow support now (untested)
 0.1.2
 	fixed a bug in the horizontal default filter
 	3dnow version of the Horizontal & Vertical Lowpass filters
@@ -66,6 +79,7 @@
 
 #include <inttypes.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include "../config.h"
 //#undef HAVE_MMX2
 //#define HAVE_3DNOW
@@ -160,9 +174,10 @@
 /**
  * Check if the middle 8x8 Block in the given 8x10 block is flat
  */
-static inline bool isVertDC(uint8_t src[], int stride){
+static inline int isVertDC(uint8_t src[], int stride){
 //	return true;
 	int numEq= 0;
+	int y;
 	src+= stride; // src points to begin of the 8x8 Block
 #ifdef HAVE_MMX
 	asm volatile(
@@ -242,7 +257,7 @@
 //	uint8_t *temp= src;
 
 #else
-	for(int y=0; y<BLOCK_SIZE-1; y++)
+	for(y=0; y<BLOCK_SIZE-1; y++)
 	{
 		if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
 		if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
@@ -268,10 +283,11 @@
 		}
 	}
 */
-	return numEq > vFlatnessThreshold;
+//	for(int i=0; i<numEq/8; i++) src[i]=255;
+	return (numEq > vFlatnessThreshold) ? 1 : 0;
 }
 
-static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP)
+static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
 {
 #ifdef HAVE_MMX
 	int isOk;
@@ -295,13 +311,14 @@
 		: "=r" (isOk)
 		: "r" (src), "r" (stride)
 		);
-	return isOk;
+	return isOk ? 1 : 0;
 #else
 
-	int isOk2= true;
-	for(int x=0; x<BLOCK_SIZE; x++)
+	int isOk2= 1;
+	int x;
+	for(x=0; x<BLOCK_SIZE; x++)
 	{
-		if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=false;
+		if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
 	}
 /*	if(isOk && !isOk2 || !isOk && isOk2)
 	{
@@ -484,8 +501,8 @@
 	const int l7= stride + l6;
 	const int l8= stride + l7;
 	const int l9= stride + l8;
-
-	for(int x=0; x<BLOCK_SIZE; x++)
+	int x;
+	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
 		const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
@@ -529,7 +546,7 @@
  */
 static inline void vertRKFilter(uint8_t *src, int stride, int QP)
 {
-#ifdef HAVE_MMX2
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 // FIXME rounding
 	asm volatile(
 		"pxor %%mm7, %%mm7				\n\t" // 0
@@ -549,7 +566,7 @@
 		"movq %%mm2, %%mm4				\n\t" // line 4
 		"pcmpeqb %%mm5, %%mm5				\n\t" // -1
 		"pxor %%mm2, %%mm5				\n\t" // -line 4 - 1
-		"pavgb %%mm3, %%mm5				\n\t"
+		PAVGB(%%mm3, %%mm5)
 		"paddb %%mm6, %%mm5				\n\t" // (l5-l4)/2
 		"psubusb %%mm3, %%mm4				\n\t"
 		"psubusb %%mm2, %%mm3				\n\t"
@@ -600,16 +617,18 @@
 	const int l7= stride + l6;
 	const int l8= stride + l7;
 	const int l9= stride + l8;
-	for(int x=0; x<BLOCK_SIZE; x++)
+	int x;
+	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		if(ABS(src[l4]-src[l5]) < QP + QP/4)
 		{
-			int x = src[l5] - src[l4];
+			int v = (src[l5] - src[l4]);
 
-			src[l3] +=x/8;
-			src[l4] +=x/2;
-			src[l5] -=x/2;
-			src[l6] -=x/8;
+			src[l3] +=v/8;
+			src[l4] +=v/2;
+			src[l5] -=v/2;
+			src[l6] -=v/8;
+
 		}
 		src++;
 	}
@@ -619,18 +638,126 @@
 
 /**
  * Experimental Filter 1
+ * will nor damage linear gradients
+ * can only smooth blocks at the expected locations (it cant smooth them if they did move)
+ * MMX2 version does correct clipping C version doesnt
  */
 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
 {
-#ifdef HAVE_MMX2X
-// FIXME
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 	asm volatile(
+		"pxor %%mm7, %%mm7				\n\t" // 0
+//		"movq b80, %%mm6				\n\t" // MIN_SIGNED_BYTE
+		"leal (%0, %1), %%eax				\n\t"
+		"leal (%%eax, %1, 4), %%ebx			\n\t"
+//	0	1	2	3	4	5	6	7	8	9
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
+		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
+		"movq (%0, %1, 4), %%mm1			\n\t" // line 4
+		"movq %%mm1, %%mm2				\n\t" // line 4
+		"psubusb %%mm0, %%mm1				\n\t"
+		"psubusb %%mm2, %%mm0				\n\t"
+		"por %%mm1, %%mm0				\n\t" // |l2 - l3|
+		"movq (%%ebx), %%mm3				\n\t" // line 5
+		"movq (%%ebx, %1), %%mm4				\n\t" // line 6
+		"movq %%mm3, %%mm5				\n\t" // line 5
+		"psubusb %%mm4, %%mm3				\n\t"
+		"psubusb %%mm5, %%mm4				\n\t"
+		"por %%mm4, %%mm3				\n\t" // |l5 - l6|
+		PAVGB(%%mm3, %%mm0)				      // (|l2 - l3| + |l5 - l6|)/2
+		"movq %%mm2, %%mm1				\n\t" // line 4
+		"psubusb %%mm5, %%mm2				\n\t"
+		"movq %%mm2, %%mm4				\n\t"
+		"pcmpeqb %%mm7, %%mm2				\n\t" // (l4 - l5) <= 0 ? -1 : 0
+		"psubusb %%mm1, %%mm5				\n\t"
+		"por %%mm5, %%mm4				\n\t" // |l4 - l5|
+		"psubusb %%mm0, %%mm4		\n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
+		"movq %%mm4, %%mm3				\n\t" // d
+		"psubusb pQPb, %%mm4				\n\t"
+		"pcmpeqb %%mm7, %%mm4				\n\t" // d <= QP ? -1 : 0
+		"pand %%mm4, %%mm3				\n\t" // d <= QP ? d : 0
+
+		PAVGB(%%mm7, %%mm3)				      // d/2
+
+		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
+		"psubusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%0, %1, 4)			\n\t" // line 4
+
+		"movq (%%ebx), %%mm0				\n\t" // line 5
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
+		"paddusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%%ebx)				\n\t" // line 5
+
+		PAVGB(%%mm7, %%mm3)				      // d/4
+
+		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
+		"psubusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%%eax, %1, 2)			\n\t" // line 3
+
+		"movq (%%ebx, %1), %%mm0			\n\t" // line 6
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
+		"paddusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%%ebx, %1)			\n\t" // line 6
+
+		PAVGB(%%mm7, %%mm3)				      // d/8
+
+		"movq (%%eax, %1), %%mm0			\n\t" // line 2
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
+		"psubusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%%eax, %1)			\n\t" // line 2
+
+		"movq (%%ebx, %1, 2), %%mm0			\n\t" // line 7
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
+		"paddusb %%mm3, %%mm0				\n\t"
+		"pxor %%mm2, %%mm0				\n\t"
+		"movq %%mm0, (%%ebx, %1, 2)			\n\t" // line 7
 
 		:
 		: "r" (src), "r" (stride)
 		: "%eax", "%ebx"
 	);
 #else
+
+ 	const int l1= stride;
+	const int l2= stride + l1;
+	const int l3= stride + l2;
+	const int l4= stride + l3;
+	const int l5= stride + l4;
+	const int l6= stride + l5;
+	const int l7= stride + l6;
+	const int l8= stride + l7;
+	const int l9= stride + l8;
+	int x;
+	for(x=0; x<BLOCK_SIZE; x++)
+	{
+		int a= src[l3] - src[l4];
+		int b= src[l4] - src[l5];
+		int c= src[l6] - src[l7];
+
+		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
+
+		if(d < QP)
+		{
+			int v = d * SIGN(-b);
+
+			src[l2] +=v/8;
+			src[l3] +=v/4;
+			src[l4] +=v/2;
+			src[l5] -=v/2;
+			src[l6] -=v/4;
+			src[l7] -=v/8;
+
+		}
+		src++;
+	}
+	/*
  	const int l1= stride;
 	const int l2= stride + l1;
 	const int l3= stride + l2;
@@ -658,7 +785,7 @@
 		}
 		src++;
 	}
-
+*/
 #endif
 }
 
@@ -908,8 +1035,8 @@
 	const int l7= stride + l6;
 	const int l8= stride + l7;
 //	const int l9= stride + l8;
-
-	for(int x=0; x<BLOCK_SIZE; x++)
+	int x;
+	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
 		if(ABS(middleEnergy) < 8*QP)
@@ -947,7 +1074,7 @@
 /**
  * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
  */
-static inline bool isHorizDCAndCopy2Temp(uint8_t src[], int stride)
+static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
 {
 //	src++;
 	int numEq= 0;
@@ -1007,7 +1134,8 @@
 //	printf("%d\n", numEq);
 	numEq= (256 - (numEq & 0xFF)) &0xFF;
 #else
-	for(int y=0; y<BLOCK_SIZE; y++)
+	int y;
+	for(y=0; y<BLOCK_SIZE; y++)
 	{
 		if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
 		if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
@@ -1044,7 +1172,7 @@
 	return numEq > hFlatnessThreshold;
 }
 
-static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP)
+static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
 {
 #ifdef MMX_FIXME
 FIXME
@@ -1071,9 +1199,9 @@
 		);
 	return isOk;
 #else
-	if(abs(src[0] - src[7]) > 2*QP) return false;
+	if(abs(src[0] - src[7]) > 2*QP) return 0;
 
-	return true;
+	return 1;
 #endif
 }
 
@@ -1173,7 +1301,8 @@
 #else
 	uint8_t *src= tempBlock;
 
-	for(int y=0; y<BLOCK_SIZE; y++)
+	int y;
+	for(y=0; y<BLOCK_SIZE; y++)
 	{
 		dst[0] = src[0];
 		dst[1] = src[1];
@@ -1375,7 +1504,8 @@
 
 #else
 	uint8_t *temp= tempBlock;
-	for(int y=0; y<BLOCK_SIZE; y++)
+	int y;
+	for(y=0; y<BLOCK_SIZE; y++)
 	{
 		const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
 		const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
@@ -1502,7 +1632,7 @@
 	return;
 */
 	postProcess(src[0], src_stride, dst[0], dst_stride,
-		horizontal_size, vertical_size, QP_store, QP_stride, false, mode);
+		horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
 
 	horizontal_size >>= 1;
 	vertical_size   >>= 1;
@@ -1512,9 +1642,9 @@
 	if(1)
 	{
 		postProcess(src[1], src_stride, dst[1], dst_stride,
-			horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4);
+			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
 		postProcess(src[2], src_stride, dst[2], dst_stride,
-			horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4);
+			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
 	}
 	else
 	{
@@ -1543,11 +1673,19 @@
 
 /**
  * Copies a block from src to dst and fixes the blacklevel
+ * numLines must be a multiple of 4
+ * levelFix == 0 -> dont touch the brighness & contrast
  */
-static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride)
+static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
+	int numLines, int levelFix)
 {
+	int i;
+	if(levelFix)
+	{
 #ifdef HAVE_MMX
 					asm volatile(
+						"movl %4, %%eax \n\t"
+						"movl %%eax, temp0\n\t"
 						"pushl %0 \n\t"
 						"pushl %1 \n\t"
 						"leal (%2,%2), %%eax	\n\t"
@@ -1555,14 +1693,6 @@
 						"movq packedYOffset, %%mm2	\n\t"
 						"movq packedYScale, %%mm3	\n\t"
 
-#define SIMPLE_CPY					\
-						"movq (%0), %%mm0	\n\t"\
-						"movq (%0,%2), %%mm1	\n\t"\
-						"psubusb %%mm2, %%mm0	\n\t"\
-						"psubusb %%mm2, %%mm1	\n\t"\
-						"movq %%mm0, (%1)	\n\t"\
-						"movq %%mm1, (%1, %3)	\n\t"\
-
 #define SCALED_CPY					\
 						"movq (%0), %%mm0	\n\t"\
 						"movq (%0,%2), %%mm1	\n\t"\
@@ -1585,33 +1715,75 @@
 						"packuswb %%mm5, %%mm4	\n\t"\
 						"movq %%mm4, (%1, %3)	\n\t"\
 
-
-#define CPY SCALED_CPY
-//#define CPY SIMPLE_CPY
-//				"prefetchnta 8(%0)\n\t"
-CPY
+						"1:			\n\t"
+SCALED_CPY
 						"addl %%eax, %0		\n\t"
 						"addl %%ebx, %1		\n\t"
-CPY
+SCALED_CPY
 						"addl %%eax, %0		\n\t"
 						"addl %%ebx, %1		\n\t"
-CPY
-						"addl %%eax, %0		\n\t"
-						"addl %%ebx, %1		\n\t"
-CPY
+						"decl temp0		\n\t"
+						"jnz 1b			\n\t"
+
 						"popl %1 \n\t"
 						"popl %0 \n\t"
 						: : "r" (src),
 						"r" (dst),
 						"r" (srcStride),
-						"r" (dstStride)
+						"r" (dstStride),
+						"m" (numLines>>2)
 						: "%eax", "%ebx"
 					);
 #else
-				for(int i=0; i<BLOCK_SIZE; i++) // last 10x8 Block is copied allready so +2
+				for(i=0; i<numLines; i++)
 					memcpy(	&(dst[dstStride*i]),
 						&(src[srcStride*i]), BLOCK_SIZE);
 #endif
+	}
+	else
+	{
+#ifdef HAVE_MMX
+					asm volatile(
+						"movl %4, %%eax \n\t"
+						"movl %%eax, temp0\n\t"
+						"pushl %0 \n\t"
+						"pushl %1 \n\t"
+						"leal (%2,%2), %%eax	\n\t"
+						"leal (%3,%3), %%ebx	\n\t"
+						"movq packedYOffset, %%mm2	\n\t"
+						"movq packedYScale, %%mm3	\n\t"
+
+#define SIMPLE_CPY					\
+						"movq (%0), %%mm0	\n\t"\
+						"movq (%0,%2), %%mm1	\n\t"\
+						"movq %%mm0, (%1)	\n\t"\
+						"movq %%mm1, (%1, %3)	\n\t"\
+
+						"1:			\n\t"
+SIMPLE_CPY
+						"addl %%eax, %0		\n\t"
+						"addl %%ebx, %1		\n\t"
+SIMPLE_CPY
+						"addl %%eax, %0		\n\t"
+						"addl %%ebx, %1		\n\t"
+						"decl temp0		\n\t"
+						"jnz 1b			\n\t"
+
+						"popl %1 \n\t"
+						"popl %0 \n\t"
+						: : "r" (src),
+						"r" (dst),
+						"r" (srcStride),
+						"r" (dstStride),
+						"m" (numLines>>2)
+						: "%eax", "%ebx"
+					);
+#else
+				for(i=0; i<numLines; i++)
+					memcpy(	&(dst[dstStride*i]),
+						&(src[srcStride*i]), BLOCK_SIZE);
+#endif
+	}
 }
 
 
@@ -1619,33 +1791,50 @@
  * Filters array of bytes (Y or U or V values)
  */
 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
-	QP_STORE_T QPs[], int QPStride, bool isColor, int mode)
+	QP_STORE_T QPs[], int QPStride, int isColor, int mode)
 {
+	int x,y;
+	/* we need 64bit here otherwise we´ll going to have a problem
+	   after watching a black picture for 5 hours*/
+	static uint64_t *yHistogram= NULL;
+	int black=0, white=255; // blackest black and whitest white in the picture
 
 #ifdef TIMEING
 	long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
 	sumTime= rdtsc();
 #endif
 
-	/* we need 64bit here otherwise we´ll going to have a problem
-	   after watching a black picture for 5 hours*/
-	static uint64_t *yHistogram= NULL;
 	if(!yHistogram)
 	{
-		yHistogram= new uint64_t[256];
-		for(int i=0; i<256; i++) yHistogram[i]= width*height/64/256;
+		int i;
+		yHistogram= (uint64_t*)malloc(8*256);
+		for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
 	}
 
-	int black=0, white=255; // blackest black and whitest white in the picture
 	if(!isColor)
 	{
 		uint64_t sum= 0;
-		for(int i=0; i<256; i++)
-			sum+= yHistogram[i];
+		int i;
+		static int framenum= -1;
+		uint64_t maxClipped;
+		uint64_t clipped;
+		double scale;
+
+		framenum++;
+		if(framenum == 1) yHistogram[0]= width*height/64*15/256;
 
-		uint64_t maxClipped= (uint64_t)(sum * maxClippedThreshold);
+		for(i=0; i<256; i++)
+		{
+			sum+= yHistogram[i];
+//			printf("%d ", yHistogram[i]);
+		}
+//		printf("\n\n");
 
-		uint64_t clipped= sum;
+		/* we allways get a completly black picture first */
+
+		maxClipped= (uint64_t)(sum * maxClippedThreshold);
+
+		clipped= sum;
 		for(black=255; black>0; black--)
 		{
 			if(clipped < maxClipped) break;
@@ -1665,9 +1854,9 @@
 		packedYOffset|= packedYOffset<<16;
 		packedYOffset|= packedYOffset<<8;
 
-		double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
+		scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
 
-		packedYScale= uint16_t(scale*256.0 + 0.5);
+		packedYScale= (uint16_t)(scale*256.0 + 0.5);
 		packedYScale|= packedYScale<<32;
 		packedYScale|= packedYScale<<16;
 	}
@@ -1677,10 +1866,10 @@
 		packedYOffset= 0;
 	}
 
-	for(int x=0; x<width; x+=BLOCK_SIZE)
-		blockCopy(dst + x, dstStride, src + x, srcStride);
+	for(x=0; x<width; x+=BLOCK_SIZE)
+		blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
 
-	for(int y=0; y<height; y+=BLOCK_SIZE)
+	for(y=0; y<height; y+=BLOCK_SIZE)
 	{
 		//1% speedup if these are here instead of the inner loop
 		uint8_t *srcBlock= &(src[y*srcStride]);
@@ -1690,8 +1879,9 @@
 
 		// finish 1 block before the next otherwise we´ll might have a problem
 		// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
-		for(int x=0; x<width; x+=BLOCK_SIZE)
+		for(x=0; x<width; x+=BLOCK_SIZE)
 		{
+			const int stride= dstStride;
 			int QP= isColor ?
 				QPs[(y>>3)*QPStride + (x>>3)]:
 				(QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
@@ -1707,7 +1897,6 @@
 #endif
 
 
-			const int stride= dstStride;
 			if(y + 12 < height)
 			{
 #ifdef MORE_TIMEING
@@ -1730,7 +1919,7 @@
 				if(!isColor) yHistogram[ srcBlock[0] ]++;
 
 				blockCopy(vertBlock + dstStride*2, dstStride,
-					vertSrcBlock + srcStride*2, srcStride);
+					vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
 
 
 #ifdef MORE_TIMEING
@@ -1742,7 +1931,7 @@
 				{
 					if(mode & RK_FILTER)
 						vertRKFilter(vertBlock, stride, QP);
-					else if(0)
+					else if(mode & X1_FILTER)
 						vertX1Filter(vertBlock, stride, QP);
 					else
 					{
@@ -1762,12 +1951,9 @@
 #endif
 			}
 			else
-			{
-				for(int i=2; i<BLOCK_SIZE/2+1; i++) // last 10x8 Block is copied allready so +2
-					memcpy(	&(vertBlock[dstStride*i]),
-						&(vertSrcBlock[srcStride*i]), BLOCK_SIZE);
+				blockCopy(vertBlock + dstStride*1, dstStride,
+					vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
 
-			}
 
 			if(x - 8 >= 0 && x<width)
 			{
@@ -1813,8 +1999,8 @@
 	sumTime= rdtsc() - sumTime;
 	if(!isColor)
 		printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
-			int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000),
-			int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000)
+			(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
+			(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
 			, black, white);
 #endif
 }