# HG changeset patch
# User michael
# Date 1003351327 0
# Node ID 8e4c5a16c9fcfd505893f77c5eef470372b47deb
# Parent  e00e5d93457cf5a534747c278835522c13b15070
fixed the height%8!=0 bug
simplified a few things
removed last row variants of the deinterlace filters, they are not needed anymore
added cubic interpolating deinterlacer

diff -r e00e5d93457c -r 8e4c5a16c9fc libpostproc/postprocess.c
--- a/libpostproc/postprocess.c	Wed Oct 17 18:05:35 2001 +0000
+++ b/libpostproc/postprocess.c	Wed Oct 17 20:42:07 2001 +0000
@@ -30,14 +30,15 @@
 Vertical RKAlgo1	E		a	a
 Vertical X1		a		E	E
 Horizontal X1		a		E	E
-LinIpolDeinterlace	a		E	E*
-LinBlendDeinterlace	a		E	E*
+LinIpolDeinterlace	e		E	E*
+CubicIpolDeinterlace	a		e	e*
+LinBlendDeinterlace	e		E	E*
 MedianDeinterlace	 	Ec	Ec
 
 
 * i dont have a 3dnow CPU -> its untested
 E = Exact implementation
-e = allmost exact implementation
+e = allmost exact implementation (slightly different rounding,...)
 a = alternative / approximate impl
 c = checked against the other implementations (-vo md5)
 */
@@ -63,7 +64,6 @@
 
 Notes:
 
-
 */
 
 //Changelog: use the CVS log
@@ -178,12 +178,12 @@
 
 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
 /**
- * Check if the middle 8x8 Block in the given 8x10 block is flat
+ * Check if the middle 8x8 Block in the given 8x16 block is flat
  */
 static inline int isVertDC(uint8_t src[], int stride){
 	int numEq= 0;
 	int y;
-	src+= stride; // src points to begin of the 8x8 Block
+	src+= stride*4; // src points to begin of the 8x8 Block
 #ifdef HAVE_MMX
 	asm volatile(
 		"pushl %1\n\t"
@@ -295,6 +295,7 @@
 {
 #ifdef HAVE_MMX
 	int isOk;
+	src+= stride*3;
 	asm volatile(
 //		"int $3 \n\t"
 		"movq (%1, %2), %%mm0				\n\t"
@@ -320,6 +321,7 @@
 
 	int isOk2= 1;
 	int x;
+	src+= stride*3;
 	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
@@ -343,19 +345,16 @@
 }
 
 /**
- * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle)
+ * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
  */
 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
 {
-//	QP= 64;
-
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-//#ifdef HAVE_MMX2
+	src+= stride*3;
 	asm volatile(	//"movv %0 %1 %2\n\t"
 		"pushl %0 \n\t"
 		"movq pQPb, %%mm0				\n\t"  // QP,..., QP
-//		"movq bFF  , %%mm0				\n\t"  // QP,..., QP
 
 		"movq (%0), %%mm6				\n\t"
 		"movq (%0, %1), %%mm5				\n\t"
@@ -395,20 +394,7 @@
 		// 6 4 2 2 1 1
 		// 6 4 4 2
 		// 6 8 2
-/*
-		"movq %%mm6, %%mm2				\n\t" //1
-		"movq %%mm6, %%mm3				\n\t" //1
-		"paddusb b02, %%mm3 				\n\t"
-		"psrlw $2, %%mm3				\n\t" //1	/4
-		"pand b3F, %%mm3				\n\t"
-		"psubb %%mm3, %%mm2				\n\t"
-		"movq (%0, %1), %%mm0				\n\t" //  1
-		"movq %%mm0, %%mm1				\n\t" //  1
-		"paddusb b02, %%mm0				\n\t"
-		"psrlw $2, %%mm0				\n\t" //  1	/4
-		"pand b3F, %%mm0				\n\t"
-		"paddusb %%mm2, %%mm0				\n\t" //3 1	/4
-*/
+
 		"movq (%0, %1), %%mm0				\n\t" //  1
 		"movq %%mm0, %%mm1				\n\t" //  1
 		PAVGB(%%mm6, %%mm0)				      //1 1	/2
@@ -470,7 +456,6 @@
 		"movq (%%eax, %1, 2), %%mm6			\n\t" //      1
 		PAVGB(%%mm6, %%mm1)				      //  11  4  2	/8
 		PAVGB(%%mm0, %%mm1)				      //  11224222	/16
-//		"pxor %%mm1, %%mm1 \n\t"
 		"movq %%mm1, (%%eax, %1, 2)			\n\t" //      X
 		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
 		PAVGB((%%ebx), %%mm2)				      //   112 4	/8
@@ -478,7 +463,6 @@
 		PAVGB(%%mm0, %%mm6)				      //      1 1	/2
 		PAVGB(%%mm7, %%mm6)				      //      1 12	/4
 		PAVGB(%%mm2, %%mm6)				      //   1122424	/4
-//		"pxor %%mm6, %%mm6 \n\t"
 		"movq %%mm6, (%%ebx)				\n\t" //       X
 		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
 		PAVGB(%%mm7, %%mm5)				      //    11   2	/4
@@ -486,8 +470,6 @@
 
 		PAVGB(%%mm3, %%mm0)				      //      112	/4
 		PAVGB(%%mm0, %%mm5)				      //    112246	/16
-//		"pxor %%mm5, %%mm5 \n\t"
-//		"movq pQPb, %%mm5 \n\t"
 		"movq %%mm5, (%%eax, %1, 4)			\n\t" //        X
 		"popl %0\n\t"
 
@@ -506,6 +488,7 @@
 	const int l8= stride + l7;
 	const int l9= stride + l8;
 	int x;
+	src+= stride*3;
 	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
@@ -551,6 +534,7 @@
 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+	src+= stride*3;
 // FIXME rounding
 	asm volatile(
 		"pxor %%mm7, %%mm7				\n\t" // 0
@@ -622,6 +606,7 @@
 	const int l8= stride + l7;
 	const int l9= stride + l8;
 	int x;
+	src+= stride*3;
 	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		if(ABS(src[l4]-src[l5]) < QP + QP/4)
@@ -650,6 +635,8 @@
 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+	src+= stride*3;
+
 	asm volatile(
 		"pxor %%mm7, %%mm7				\n\t" // 0
 //		"movq b80, %%mm6				\n\t" // MIN_SIGNED_BYTE
@@ -744,6 +731,8 @@
 	const int l8= stride + l7;
 	const int l9= stride + l8;
 	int x;
+
+	src+= stride*3;
 	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		int a= src[l3] - src[l4];
@@ -1007,7 +996,7 @@
 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
 {
 #ifdef HAVE_MMX
-	src+= stride;
+	src+= stride*4;
 	//FIXME try pmul for *5 stuff
 //	src[0]=0;
 	asm volatile(
@@ -1154,7 +1143,6 @@
 		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
 // 100 opcodes
 		"movd %2, %%mm2					\n\t" // QP
-//"pcmpeqb %%mm2, %%mm2\n\t"
 		"punpcklwd %%mm2, %%mm2				\n\t"
 		"punpcklwd %%mm2, %%mm2				\n\t"
 		"psllw $3, %%mm2				\n\t" // 8QP
@@ -1232,7 +1220,6 @@
 		"movq %%mm0, (%%eax, %1, 2) 			\n\t"
 		"movq (%0, %1, 4), %%mm0			\n\t"
 		"psubb %%mm4, %%mm0				\n\t"
-//		"pxor %%mm0, %%mm0 \n\t"
 		"movq %%mm0, (%0, %1, 4) 			\n\t"
 
 		:
@@ -1250,6 +1237,7 @@
 	const int l8= stride + l7;
 //	const int l9= stride + l8;
 	int x;
+	src+= stride*3;
 	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
@@ -1881,7 +1869,7 @@
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
  */
 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
 {
@@ -1894,16 +1882,16 @@
 
 		"movq (%0), %%mm0				\n\t"
 		"movq (%%eax, %1), %%mm1			\n\t"
-		PAVGB(%%mm1, %%mm0)\
+		PAVGB(%%mm1, %%mm0)
 		"movq %%mm0, (%%eax)				\n\t"
 		"movq (%0, %1, 4), %%mm0			\n\t"
-		PAVGB(%%mm0, %%mm1)\
+		PAVGB(%%mm0, %%mm1)
 		"movq %%mm1, (%%eax, %1, 2)			\n\t"
 		"movq (%%ebx, %1), %%mm1			\n\t"
-		PAVGB(%%mm1, %%mm0)\
+		PAVGB(%%mm1, %%mm0)
 		"movq %%mm0, (%%ebx)				\n\t"
 		"movq (%0, %1, 8), %%mm0			\n\t"
-		PAVGB(%%mm0, %%mm1)\
+		PAVGB(%%mm0, %%mm1)
 		"movq %%mm1, (%%ebx, %1, 2)			\n\t"
 
 		: : "r" (src), "r" (stride)
@@ -1924,41 +1912,59 @@
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
+ * no cliping in C version
  */
-static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride)
+static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 	asm volatile(
 		"leal (%0, %1), %%eax				\n\t"
 		"leal (%%eax, %1, 4), %%ebx			\n\t"
-//	0	1	2	3	4	5	6	7	8	9
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
+		"leal (%%ebx, %1, 4), %%ecx			\n\t"
+		"addl %1, %%ecx					\n\t"
+		"pxor %%mm7, %%mm7				\n\t"
+//	0	1	2	3	4	5	6	7	8	9	10
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1 ecx
 
-		"movq (%0), %%mm0				\n\t"
-		"movq (%%eax, %1), %%mm1			\n\t"
-		PAVGB(%%mm1, %%mm0)\
-		"movq %%mm0, (%%eax)				\n\t"
-		"movq (%0, %1, 4), %%mm0			\n\t"
-		PAVGB(%%mm0, %%mm1)\
-		"movq %%mm1, (%%eax, %1, 2)			\n\t"
-		"movq (%%ebx, %1), %%mm1			\n\t"
-		PAVGB(%%mm1, %%mm0)\
-		"movq %%mm0, (%%ebx)				\n\t"
-		"movq %%mm1, (%%ebx, %1, 2)			\n\t"
+#define DEINT_CUBIC(a,b,c,d,e)\
+		"movq " #a ", %%mm0				\n\t"\
+		"movq " #b ", %%mm1				\n\t"\
+		"movq " #d ", %%mm2				\n\t"\
+		"movq " #e ", %%mm3				\n\t"\
+		PAVGB(%%mm2, %%mm1)					/* (b+d) /2 */\
+		PAVGB(%%mm3, %%mm0)					/* a(a+e) /2 */\
+		"movq %%mm0, %%mm2				\n\t"\
+		"punpcklbw %%mm7, %%mm0				\n\t"\
+		"punpckhbw %%mm7, %%mm2				\n\t"\
+		"movq %%mm1, %%mm3				\n\t"\
+		"punpcklbw %%mm7, %%mm1				\n\t"\
+		"punpckhbw %%mm7, %%mm3				\n\t"\
+		"psubw %%mm1, %%mm0				\n\t"	/* L(a+e - (b+d))/2 */\
+		"psubw %%mm3, %%mm2				\n\t"	/* H(a+e - (b+d))/2 */\
+		"psraw $3, %%mm0				\n\t"	/* L(a+e - (b+d))/16 */\
+		"psraw $3, %%mm2				\n\t"	/* H(a+e - (b+d))/16 */\
+		"psubw %%mm0, %%mm1				\n\t"	/* L(9b + 9d - a - e)/16 */\
+		"psubw %%mm2, %%mm3				\n\t"	/* H(9b + 9d - a - e)/16 */\
+		"packuswb %%mm3, %%mm1				\n\t"\
+		"movq %%mm1, " #c "				\n\t"
 
+DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
+DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
+DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
+DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
 
 		: : "r" (src), "r" (stride)
-		: "%eax", "%ebx"
+		: "%eax", "%ebx", "ecx"
 	);
 #else
 	int x;
 	for(x=0; x<8; x++)
 	{
-		src[stride]   = (src[0]        + src[stride*2])>>1;
-		src[stride*3] = (src[stride*2] + src[stride*4])>>1;
-		src[stride*5] = (src[stride*4] + src[stride*6])>>1;
-		src[stride*7] = src[stride*6];
+		src[stride*3] = (-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
+		src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
+		src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
+		src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
 		src++;
 	}
 #endif
@@ -1966,7 +1972,7 @@
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
  * will shift the image up by 1 line (FIXME if this is a problem)
  */
 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
@@ -2036,70 +2042,6 @@
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
- * will shift the image up by 1 line (FIXME if this is a problem)
- */
-static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride)
-{
-#if defined (HAVE_MMSX2) || defined (HAVE_3DNOW)
-	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%ebx			\n\t"
-//	0	1	2	3	4	5	6	7	8	9
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
-
-		"movq (%0), %%mm0				\n\t" // L0
-		"movq (%%eax, %1), %%mm1			\n\t" // L2
-		PAVGB(%%mm1, %%mm0)				      // L0+L2
-		"movq (%%eax), %%mm2				\n\t" // L1
-		PAVGB(%%mm2, %%mm0)
-		"movq %%mm0, (%0)				\n\t"
-		"movq (%%eax, %1, 2), %%mm0			\n\t" // L3
-		PAVGB(%%mm0, %%mm2)				      // L1+L3
-		PAVGB(%%mm1, %%mm2)				      // 2L2 + L1 + L3
-		"movq %%mm2, (%%eax)				\n\t"
-		"movq (%0, %1, 4), %%mm2			\n\t" // L4
-		PAVGB(%%mm2, %%mm1)				      // L2+L4
-		PAVGB(%%mm0, %%mm1)				      // 2L3 + L2 + L4
-		"movq %%mm1, (%%eax, %1)			\n\t"
-		"movq (%%ebx), %%mm1				\n\t" // L5
-		PAVGB(%%mm1, %%mm0)				      // L3+L5
-		PAVGB(%%mm2, %%mm0)				      // 2L4 + L3 + L5
-		"movq %%mm0, (%%eax, %1, 2)			\n\t"
-		"movq (%%ebx, %1), %%mm0			\n\t" // L6
-		PAVGB(%%mm0, %%mm2)				      // L4+L6
-		PAVGB(%%mm1, %%mm2)				      // 2L5 + L4 + L6
-		"movq %%mm2, (%0, %1, 4)			\n\t"
-		"movq (%%ebx, %1, 2), %%mm2			\n\t" // L7
-		PAVGB(%%mm2, %%mm1)				      // L5+L7
-		PAVGB(%%mm0, %%mm1)				      // 2L6 + L5 + L7
-		"movq %%mm1, (%%ebx)				\n\t"
-		PAVGB(%%mm2, %%mm0)				      // L7 + L8
-		"movq %%mm0, (%%ebx, %1)			\n\t"
-		"movq %%mm0, (%%ebx, %1, 2)			\n\t"
-
-		: : "r" (src), "r" (stride)
-		: "%eax", "%ebx"
-	);
-#else
-	int x;
-	for(x=0; x<8; x++)
-	{
-		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
-		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
-		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
-		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
-		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
-		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
-		src[stride*6] = (src[stride*6] +   src[stride*7])>>1;
-		src[stride*7] = src[stride*6];
-		src++;
-	}
-#endif
-}
-
-/**
- * Deinterlaces the given block
  * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
  */
 static inline void deInterlaceMedian(uint8_t src[], int stride)
@@ -2213,91 +2155,6 @@
 #endif
 }
 
-/**
- * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
- */
-static inline void deInterlaceMedianLastRow(uint8_t src[], int stride)
-{
-#ifdef HAVE_MMX
-#ifdef HAVE_MMX2
-	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%ebx			\n\t"
-//	0	1	2	3	4	5	6	7	8	9
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
-
-		"movq (%0), %%mm0				\n\t" //
-		"movq (%%eax, %1), %%mm2			\n\t" //
-		"movq (%%eax), %%mm1				\n\t" //
-		"movq %%mm0, %%mm3				\n\t"
-		"pmaxub %%mm1, %%mm0				\n\t" //
-		"pminub %%mm3, %%mm1				\n\t" //
-		"pmaxub %%mm2, %%mm1				\n\t" //
-		"pminub %%mm1, %%mm0				\n\t"
-		"movq %%mm0, (%%eax)				\n\t"
-
-		"movq (%0, %1, 4), %%mm0			\n\t" //
-		"movq (%%eax, %1, 2), %%mm1			\n\t" //
-		"movq %%mm2, %%mm3				\n\t"
-		"pmaxub %%mm1, %%mm2				\n\t" //
-		"pminub %%mm3, %%mm1				\n\t" //
-		"pmaxub %%mm0, %%mm1				\n\t" //
-		"pminub %%mm1, %%mm2				\n\t"
-		"movq %%mm2, (%%eax, %1, 2)			\n\t"
-
-		"movq (%%ebx), %%mm2				\n\t" //
-		"movq (%%ebx, %1), %%mm1			\n\t" //
-		"movq %%mm2, %%mm3				\n\t"
-		"pmaxub %%mm0, %%mm2				\n\t" //
-		"pminub %%mm3, %%mm0				\n\t" //
-		"pmaxub %%mm1, %%mm0				\n\t" //
-		"pminub %%mm0, %%mm2				\n\t"
-		"movq %%mm2, (%%ebx)				\n\t"
-
-		"movq %%mm1, (%%ebx, %1, 2)			\n\t"
-
-		: : "r" (src), "r" (stride)
-		: "%eax", "%ebx"
-	);
-#else //MMX & no MMX2
-asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%ebx			\n\t"
-//	0	1	2	3	4	5	6	7	8	9
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
-		"pxor %%mm7, %%mm7				\n\t"
-
-MEDIAN((%0), (%%eax), (%%eax, %1))
-MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
-MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
-
-		"movq (%%ebx, %1), %%mm0			\n\t"
-		"movq %%mm0, (%%ebx, %1, 2)			\n\t"
-
-		: : "r" (src), "r" (stride)
-		: "%eax", "%ebx"
-	);
-
-#endif //MMX
-#else
-	//FIXME
-	int x;
-	for(x=0; x<8; x++)
-	{
-		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
-		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
-		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
-		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
-		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
-		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
-		src[stride*6] = (src[stride*6] +   src[stride*7])>>1;
-		src[stride*7] = src[stride*6];
-		src++;
-	}
-#endif
-}
-
 #ifdef HAVE_ODIVX_POSTPROCESS
 #include "../opendivx/postprocess.h"
 int use_old_pp=0;
@@ -2537,11 +2394,21 @@
 	static uint64_t *yHistogram= NULL;
 	int black=0, white=255; // blackest black and whitest white in the picture
 
+	/* Temporary buffers for handling the last row(s) */
+	static uint8_t *tempDst= NULL;
+	static uint8_t *tempSrc= NULL;
+
 #ifdef TIMING
 	long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
 	sumTime= rdtsc();
 #endif
 
+	if(tempDst==NULL)
+	{
+		tempDst= (uint8_t*)memalign(8, 1024*24);
+		tempSrc= (uint8_t*)memalign(8, 1024*24);
+	}
+
 	if(!yHistogram)
 	{
 		int i;
@@ -2569,7 +2436,6 @@
 //		printf("\n\n");
 
 		/* we allways get a completly black picture first */
-
 		maxClipped= (uint64_t)(sum * maxClippedThreshold);
 
 		clipped= sum;
@@ -2604,16 +2470,40 @@
 		packedYOffset= 0;
 	}
 
+	/* copy first row of 8x8 blocks */
 	for(x=0; x<width; x+=BLOCK_SIZE)
 		blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
 
-	for(y=0; y<height-7; y+=BLOCK_SIZE)
+	for(y=0; y<height; y+=BLOCK_SIZE)
 	{
 		//1% speedup if these are here instead of the inner loop
 		uint8_t *srcBlock= &(src[y*srcStride]);
 		uint8_t *dstBlock= &(dst[y*dstStride]);
-		uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
-		uint8_t *vertBlock= &(dstBlock[dstStride*3]);
+
+		/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
+		   than use a temporary buffer */
+		if(y+15 >= height)
+		{
+			/* copy from line 5 to 12 of src, these will e copied with
+			   blockcopy to dst later */
+			memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
+				srcStride*MAX(height-y-5, 0) );
+
+			/* duplicate last line to fill the void upto line 12 */
+			if(y+12 >= height)
+			{
+				int i;
+				for(i=height-y; i<=12; i++)
+					memcpy(tempSrc + srcStride*i,
+						src + srcStride*(height-1), srcStride);
+			}
+
+
+			/* copy up to 5 lines of dst */
+			memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
+			dstBlock= tempDst;
+			srcBlock= tempSrc;
+		}
 
 		// finish 1 block before the next otherwise we�ll might have a problem
 		// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
@@ -2625,53 +2515,54 @@
 				QPs[(y>>4)*QPStride + (x>>4)];
 			if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
 #ifdef HAVE_MMX
-		asm volatile(
-			"movd %0, %%mm7					\n\t"
-			"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
-			"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
-			"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
-			"movq %%mm7, pQPb				\n\t"
-			: : "r" (QP)
-		);
+			asm volatile(
+				"movd %0, %%mm7					\n\t"
+				"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
+				"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
+				"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
+				"movq %%mm7, pQPb				\n\t"
+				: : "r" (QP)
+			);
 #endif
 
-
-			if(y + 12 < height)
-			{
 #ifdef MORE_TIMING
-				T0= rdtsc();
+			T0= rdtsc();
 #endif
 
 #ifdef HAVE_MMX2
-				prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
-				prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
-				prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
-				prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
+			prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
+			prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
+			prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
+			prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
 #elif defined(HAVE_3DNOW)
 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
-/*				prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
-				prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
-				prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
-				prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
+/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
+			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
+			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
+			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
 */
 #endif
-				if(!isColor) yHistogram[ srcBlock[0] ]++;
+
+			if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
 
-				blockCopy(vertBlock + dstStride*2, dstStride,
-					vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
+			blockCopy(dstBlock + dstStride*5, dstStride,
+				srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
 
-				if(mode & LINEAR_IPOL_DEINT_FILTER)
-					deInterlaceInterpolateLinear(dstBlock, dstStride);
-				else if(mode & LINEAR_BLEND_DEINT_FILTER)
-					deInterlaceBlendLinear(dstBlock, dstStride);
-				else if(mode & MEDIAN_DEINT_FILTER)
-					deInterlaceMedian(dstBlock, dstStride);
-/*				else if(mode & CUBIC_IPOL_DEINT_FILTER)
-					deInterlaceInterpolateCubic(dstBlock, dstStride);
-				else if(mode & CUBIC_BLEND_DEINT_FILTER)
-					deInterlaceBlendCubic(dstBlock, dstStride);
+			if(mode & LINEAR_IPOL_DEINT_FILTER)
+				deInterlaceInterpolateLinear(dstBlock, dstStride);
+			else if(mode & LINEAR_BLEND_DEINT_FILTER)
+				deInterlaceBlendLinear(dstBlock, dstStride);
+			else if(mode & MEDIAN_DEINT_FILTER)
+				deInterlaceMedian(dstBlock, dstStride);
+			else if(mode & CUBIC_IPOL_DEINT_FILTER)
+				deInterlaceInterpolateCubic(dstBlock, dstStride);
+/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
+				deInterlaceBlendCubic(dstBlock, dstStride);
 */
 
+			/* only deblock if we have 2 blocks */
+			if(y + 8 < height)
+			{
 #ifdef MORE_TIMING
 				T1= rdtsc();
 				memcpyTime+= T1-T0;
@@ -2680,18 +2571,18 @@
 				if(mode & V_DEBLOCK)
 				{
 					if(mode & V_RK1_FILTER)
-						vertRK1Filter(vertBlock, stride, QP);
+						vertRK1Filter(dstBlock, stride, QP);
 					else if(mode & V_X1_FILTER)
-						vertX1Filter(vertBlock, stride, QP);
+						vertX1Filter(dstBlock, stride, QP);
 					else
 					{
-						if( isVertDC(vertBlock, stride))
+						if( isVertDC(dstBlock, stride))
 						{
-							if(isVertMinMaxOk(vertBlock, stride, QP))
-								doVertLowPass(vertBlock, stride, QP);
+							if(isVertMinMaxOk(dstBlock, stride, QP))
+								doVertLowPass(dstBlock, stride, QP);
 						}
 						else
-							doVertDefFilter(vertBlock, stride, QP);
+							doVertDefFilter(dstBlock, stride, QP);
 					}
 				}
 #ifdef MORE_TIMING
@@ -2700,24 +2591,8 @@
 				T0=T1;
 #endif
 			}
-			else
-			{
-				blockCopy(vertBlock + dstStride*1, dstStride,
-					vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
 
-				if(mode & LINEAR_IPOL_DEINT_FILTER)
-					deInterlaceInterpolateLinearLastRow(dstBlock, dstStride);
-				else if(mode & LINEAR_BLEND_DEINT_FILTER)
-					deInterlaceBlendLinearLastRow(dstBlock, dstStride);
-				else if(mode & MEDIAN_DEINT_FILTER)
-					deInterlaceMedianLastRow(dstBlock, dstStride);
-/*				else if(mode & CUBIC_IPOL_DEINT_FILTER)
-					deInterlaceInterpolateCubicLastRow(dstBlock, dstStride);
-				else if(mode & CUBIC_BLEND_DEINT_FILTER)
-					deInterlaceBlendCubicLastRow(dstBlock, dstStride);
-*/
-			}
-
+			/* check if we have a previous block to deblock it with dstBlock */
 			if(x - 8 >= 0 && x<width)
 			{
 #ifdef MORE_TIMING
@@ -2749,11 +2624,15 @@
 				dering(dstBlock - stride*9 + width-9, stride, QP);
 			//FIXME dering filter will not be applied to last block (bottom right)
 
-
 			dstBlock+=8;
 			srcBlock+=8;
-			vertBlock+=8;
-			vertSrcBlock+=8;
+		}
+
+		/* did we use a tmp buffer */
+		if(y+15 > height)
+		{
+			uint8_t *dstBlock= &(dst[y*dstStride]);
+			memcpy(dstBlock, tempDst, dstStride*(height-y) );
 		}
 	}
 #ifdef HAVE_3DNOW
@@ -2772,5 +2651,3 @@
 			, black, white);
 #endif
 }
-
-
diff -r e00e5d93457c -r 8e4c5a16c9fc libpostproc/postprocess.h
--- a/libpostproc/postprocess.h	Wed Oct 17 18:05:35 2001 +0000
+++ b/libpostproc/postprocess.h	Wed Oct 17 20:42:07 2001 +0000
@@ -49,8 +49,8 @@
 //Deinterlacing Filters
 #define	LINEAR_IPOL_DEINT_FILTER	0x10000	// 65536
 #define	LINEAR_BLEND_DEINT_FILTER	0x20000	// 131072
-//#define	CUBIC_BLEND_DEINT_FILTER	0x8000	// (not implemented yet)
-#define	CUBIC_IPOL_DEINT_FILTER		0x40000	// 262144 (not implemented yet)
+#define	CUBIC_BLEND_DEINT_FILTER	0x8000	// (not implemented yet)
+#define	CUBIC_IPOL_DEINT_FILTER		0x40000	// 262144
 #define	MEDIAN_DEINT_FILTER		0x80000	// 524288
 
 
diff -r e00e5d93457c -r 8e4c5a16c9fc libpostproc/postprocess_template.c
--- a/libpostproc/postprocess_template.c	Wed Oct 17 18:05:35 2001 +0000
+++ b/libpostproc/postprocess_template.c	Wed Oct 17 20:42:07 2001 +0000
@@ -30,14 +30,15 @@
 Vertical RKAlgo1	E		a	a
 Vertical X1		a		E	E
 Horizontal X1		a		E	E
-LinIpolDeinterlace	a		E	E*
-LinBlendDeinterlace	a		E	E*
+LinIpolDeinterlace	e		E	E*
+CubicIpolDeinterlace	a		e	e*
+LinBlendDeinterlace	e		E	E*
 MedianDeinterlace	 	Ec	Ec
 
 
 * i dont have a 3dnow CPU -> its untested
 E = Exact implementation
-e = allmost exact implementation
+e = allmost exact implementation (slightly different rounding,...)
 a = alternative / approximate impl
 c = checked against the other implementations (-vo md5)
 */
@@ -63,7 +64,6 @@
 
 Notes:
 
-
 */
 
 //Changelog: use the CVS log
@@ -178,12 +178,12 @@
 
 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
 /**
- * Check if the middle 8x8 Block in the given 8x10 block is flat
+ * Check if the middle 8x8 Block in the given 8x16 block is flat
  */
 static inline int isVertDC(uint8_t src[], int stride){
 	int numEq= 0;
 	int y;
-	src+= stride; // src points to begin of the 8x8 Block
+	src+= stride*4; // src points to begin of the 8x8 Block
 #ifdef HAVE_MMX
 	asm volatile(
 		"pushl %1\n\t"
@@ -295,6 +295,7 @@
 {
 #ifdef HAVE_MMX
 	int isOk;
+	src+= stride*3;
 	asm volatile(
 //		"int $3 \n\t"
 		"movq (%1, %2), %%mm0				\n\t"
@@ -320,6 +321,7 @@
 
 	int isOk2= 1;
 	int x;
+	src+= stride*3;
 	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
@@ -343,19 +345,16 @@
 }
 
 /**
- * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle)
+ * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
  */
 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
 {
-//	QP= 64;
-
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-//#ifdef HAVE_MMX2
+	src+= stride*3;
 	asm volatile(	//"movv %0 %1 %2\n\t"
 		"pushl %0 \n\t"
 		"movq pQPb, %%mm0				\n\t"  // QP,..., QP
-//		"movq bFF  , %%mm0				\n\t"  // QP,..., QP
 
 		"movq (%0), %%mm6				\n\t"
 		"movq (%0, %1), %%mm5				\n\t"
@@ -395,20 +394,7 @@
 		// 6 4 2 2 1 1
 		// 6 4 4 2
 		// 6 8 2
-/*
-		"movq %%mm6, %%mm2				\n\t" //1
-		"movq %%mm6, %%mm3				\n\t" //1
-		"paddusb b02, %%mm3 				\n\t"
-		"psrlw $2, %%mm3				\n\t" //1	/4
-		"pand b3F, %%mm3				\n\t"
-		"psubb %%mm3, %%mm2				\n\t"
-		"movq (%0, %1), %%mm0				\n\t" //  1
-		"movq %%mm0, %%mm1				\n\t" //  1
-		"paddusb b02, %%mm0				\n\t"
-		"psrlw $2, %%mm0				\n\t" //  1	/4
-		"pand b3F, %%mm0				\n\t"
-		"paddusb %%mm2, %%mm0				\n\t" //3 1	/4
-*/
+
 		"movq (%0, %1), %%mm0				\n\t" //  1
 		"movq %%mm0, %%mm1				\n\t" //  1
 		PAVGB(%%mm6, %%mm0)				      //1 1	/2
@@ -470,7 +456,6 @@
 		"movq (%%eax, %1, 2), %%mm6			\n\t" //      1
 		PAVGB(%%mm6, %%mm1)				      //  11  4  2	/8
 		PAVGB(%%mm0, %%mm1)				      //  11224222	/16
-//		"pxor %%mm1, %%mm1 \n\t"
 		"movq %%mm1, (%%eax, %1, 2)			\n\t" //      X
 		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
 		PAVGB((%%ebx), %%mm2)				      //   112 4	/8
@@ -478,7 +463,6 @@
 		PAVGB(%%mm0, %%mm6)				      //      1 1	/2
 		PAVGB(%%mm7, %%mm6)				      //      1 12	/4
 		PAVGB(%%mm2, %%mm6)				      //   1122424	/4
-//		"pxor %%mm6, %%mm6 \n\t"
 		"movq %%mm6, (%%ebx)				\n\t" //       X
 		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
 		PAVGB(%%mm7, %%mm5)				      //    11   2	/4
@@ -486,8 +470,6 @@
 
 		PAVGB(%%mm3, %%mm0)				      //      112	/4
 		PAVGB(%%mm0, %%mm5)				      //    112246	/16
-//		"pxor %%mm5, %%mm5 \n\t"
-//		"movq pQPb, %%mm5 \n\t"
 		"movq %%mm5, (%%eax, %1, 4)			\n\t" //        X
 		"popl %0\n\t"
 
@@ -506,6 +488,7 @@
 	const int l8= stride + l7;
 	const int l9= stride + l8;
 	int x;
+	src+= stride*3;
 	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
@@ -551,6 +534,7 @@
 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+	src+= stride*3;
 // FIXME rounding
 	asm volatile(
 		"pxor %%mm7, %%mm7				\n\t" // 0
@@ -622,6 +606,7 @@
 	const int l8= stride + l7;
 	const int l9= stride + l8;
 	int x;
+	src+= stride*3;
 	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		if(ABS(src[l4]-src[l5]) < QP + QP/4)
@@ -650,6 +635,8 @@
 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+	src+= stride*3;
+
 	asm volatile(
 		"pxor %%mm7, %%mm7				\n\t" // 0
 //		"movq b80, %%mm6				\n\t" // MIN_SIGNED_BYTE
@@ -744,6 +731,8 @@
 	const int l8= stride + l7;
 	const int l9= stride + l8;
 	int x;
+
+	src+= stride*3;
 	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		int a= src[l3] - src[l4];
@@ -1007,7 +996,7 @@
 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
 {
 #ifdef HAVE_MMX
-	src+= stride;
+	src+= stride*4;
 	//FIXME try pmul for *5 stuff
 //	src[0]=0;
 	asm volatile(
@@ -1154,7 +1143,6 @@
 		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
 // 100 opcodes
 		"movd %2, %%mm2					\n\t" // QP
-//"pcmpeqb %%mm2, %%mm2\n\t"
 		"punpcklwd %%mm2, %%mm2				\n\t"
 		"punpcklwd %%mm2, %%mm2				\n\t"
 		"psllw $3, %%mm2				\n\t" // 8QP
@@ -1232,7 +1220,6 @@
 		"movq %%mm0, (%%eax, %1, 2) 			\n\t"
 		"movq (%0, %1, 4), %%mm0			\n\t"
 		"psubb %%mm4, %%mm0				\n\t"
-//		"pxor %%mm0, %%mm0 \n\t"
 		"movq %%mm0, (%0, %1, 4) 			\n\t"
 
 		:
@@ -1250,6 +1237,7 @@
 	const int l8= stride + l7;
 //	const int l9= stride + l8;
 	int x;
+	src+= stride*3;
 	for(x=0; x<BLOCK_SIZE; x++)
 	{
 		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
@@ -1881,7 +1869,7 @@
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
  */
 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
 {
@@ -1894,16 +1882,16 @@
 
 		"movq (%0), %%mm0				\n\t"
 		"movq (%%eax, %1), %%mm1			\n\t"
-		PAVGB(%%mm1, %%mm0)\
+		PAVGB(%%mm1, %%mm0)
 		"movq %%mm0, (%%eax)				\n\t"
 		"movq (%0, %1, 4), %%mm0			\n\t"
-		PAVGB(%%mm0, %%mm1)\
+		PAVGB(%%mm0, %%mm1)
 		"movq %%mm1, (%%eax, %1, 2)			\n\t"
 		"movq (%%ebx, %1), %%mm1			\n\t"
-		PAVGB(%%mm1, %%mm0)\
+		PAVGB(%%mm1, %%mm0)
 		"movq %%mm0, (%%ebx)				\n\t"
 		"movq (%0, %1, 8), %%mm0			\n\t"
-		PAVGB(%%mm0, %%mm1)\
+		PAVGB(%%mm0, %%mm1)
 		"movq %%mm1, (%%ebx, %1, 2)			\n\t"
 
 		: : "r" (src), "r" (stride)
@@ -1924,41 +1912,59 @@
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
+ * no cliping in C version
  */
-static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride)
+static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 	asm volatile(
 		"leal (%0, %1), %%eax				\n\t"
 		"leal (%%eax, %1, 4), %%ebx			\n\t"
-//	0	1	2	3	4	5	6	7	8	9
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
+		"leal (%%ebx, %1, 4), %%ecx			\n\t"
+		"addl %1, %%ecx					\n\t"
+		"pxor %%mm7, %%mm7				\n\t"
+//	0	1	2	3	4	5	6	7	8	9	10
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1 ecx
 
-		"movq (%0), %%mm0				\n\t"
-		"movq (%%eax, %1), %%mm1			\n\t"
-		PAVGB(%%mm1, %%mm0)\
-		"movq %%mm0, (%%eax)				\n\t"
-		"movq (%0, %1, 4), %%mm0			\n\t"
-		PAVGB(%%mm0, %%mm1)\
-		"movq %%mm1, (%%eax, %1, 2)			\n\t"
-		"movq (%%ebx, %1), %%mm1			\n\t"
-		PAVGB(%%mm1, %%mm0)\
-		"movq %%mm0, (%%ebx)				\n\t"
-		"movq %%mm1, (%%ebx, %1, 2)			\n\t"
+#define DEINT_CUBIC(a,b,c,d,e)\
+		"movq " #a ", %%mm0				\n\t"\
+		"movq " #b ", %%mm1				\n\t"\
+		"movq " #d ", %%mm2				\n\t"\
+		"movq " #e ", %%mm3				\n\t"\
+		PAVGB(%%mm2, %%mm1)					/* (b+d) /2 */\
+		PAVGB(%%mm3, %%mm0)					/* a(a+e) /2 */\
+		"movq %%mm0, %%mm2				\n\t"\
+		"punpcklbw %%mm7, %%mm0				\n\t"\
+		"punpckhbw %%mm7, %%mm2				\n\t"\
+		"movq %%mm1, %%mm3				\n\t"\
+		"punpcklbw %%mm7, %%mm1				\n\t"\
+		"punpckhbw %%mm7, %%mm3				\n\t"\
+		"psubw %%mm1, %%mm0				\n\t"	/* L(a+e - (b+d))/2 */\
+		"psubw %%mm3, %%mm2				\n\t"	/* H(a+e - (b+d))/2 */\
+		"psraw $3, %%mm0				\n\t"	/* L(a+e - (b+d))/16 */\
+		"psraw $3, %%mm2				\n\t"	/* H(a+e - (b+d))/16 */\
+		"psubw %%mm0, %%mm1				\n\t"	/* L(9b + 9d - a - e)/16 */\
+		"psubw %%mm2, %%mm3				\n\t"	/* H(9b + 9d - a - e)/16 */\
+		"packuswb %%mm3, %%mm1				\n\t"\
+		"movq %%mm1, " #c "				\n\t"
 
+DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
+DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
+DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
+DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
 
 		: : "r" (src), "r" (stride)
-		: "%eax", "%ebx"
+		: "%eax", "%ebx", "ecx"
 	);
 #else
 	int x;
 	for(x=0; x<8; x++)
 	{
-		src[stride]   = (src[0]        + src[stride*2])>>1;
-		src[stride*3] = (src[stride*2] + src[stride*4])>>1;
-		src[stride*5] = (src[stride*4] + src[stride*6])>>1;
-		src[stride*7] = src[stride*6];
+		src[stride*3] = (-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
+		src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
+		src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
+		src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
 		src++;
 	}
 #endif
@@ -1966,7 +1972,7 @@
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
  * will shift the image up by 1 line (FIXME if this is a problem)
  */
 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
@@ -2036,70 +2042,6 @@
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
- * will shift the image up by 1 line (FIXME if this is a problem)
- */
-static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride)
-{
-#if defined (HAVE_MMSX2) || defined (HAVE_3DNOW)
-	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%ebx			\n\t"
-//	0	1	2	3	4	5	6	7	8	9
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
-
-		"movq (%0), %%mm0				\n\t" // L0
-		"movq (%%eax, %1), %%mm1			\n\t" // L2
-		PAVGB(%%mm1, %%mm0)				      // L0+L2
-		"movq (%%eax), %%mm2				\n\t" // L1
-		PAVGB(%%mm2, %%mm0)
-		"movq %%mm0, (%0)				\n\t"
-		"movq (%%eax, %1, 2), %%mm0			\n\t" // L3
-		PAVGB(%%mm0, %%mm2)				      // L1+L3
-		PAVGB(%%mm1, %%mm2)				      // 2L2 + L1 + L3
-		"movq %%mm2, (%%eax)				\n\t"
-		"movq (%0, %1, 4), %%mm2			\n\t" // L4
-		PAVGB(%%mm2, %%mm1)				      // L2+L4
-		PAVGB(%%mm0, %%mm1)				      // 2L3 + L2 + L4
-		"movq %%mm1, (%%eax, %1)			\n\t"
-		"movq (%%ebx), %%mm1				\n\t" // L5
-		PAVGB(%%mm1, %%mm0)				      // L3+L5
-		PAVGB(%%mm2, %%mm0)				      // 2L4 + L3 + L5
-		"movq %%mm0, (%%eax, %1, 2)			\n\t"
-		"movq (%%ebx, %1), %%mm0			\n\t" // L6
-		PAVGB(%%mm0, %%mm2)				      // L4+L6
-		PAVGB(%%mm1, %%mm2)				      // 2L5 + L4 + L6
-		"movq %%mm2, (%0, %1, 4)			\n\t"
-		"movq (%%ebx, %1, 2), %%mm2			\n\t" // L7
-		PAVGB(%%mm2, %%mm1)				      // L5+L7
-		PAVGB(%%mm0, %%mm1)				      // 2L6 + L5 + L7
-		"movq %%mm1, (%%ebx)				\n\t"
-		PAVGB(%%mm2, %%mm0)				      // L7 + L8
-		"movq %%mm0, (%%ebx, %1)			\n\t"
-		"movq %%mm0, (%%ebx, %1, 2)			\n\t"
-
-		: : "r" (src), "r" (stride)
-		: "%eax", "%ebx"
-	);
-#else
-	int x;
-	for(x=0; x<8; x++)
-	{
-		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
-		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
-		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
-		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
-		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
-		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
-		src[stride*6] = (src[stride*6] +   src[stride*7])>>1;
-		src[stride*7] = src[stride*6];
-		src++;
-	}
-#endif
-}
-
-/**
- * Deinterlaces the given block
  * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
  */
 static inline void deInterlaceMedian(uint8_t src[], int stride)
@@ -2213,91 +2155,6 @@
 #endif
 }
 
-/**
- * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
- */
-static inline void deInterlaceMedianLastRow(uint8_t src[], int stride)
-{
-#ifdef HAVE_MMX
-#ifdef HAVE_MMX2
-	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%ebx			\n\t"
-//	0	1	2	3	4	5	6	7	8	9
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
-
-		"movq (%0), %%mm0				\n\t" //
-		"movq (%%eax, %1), %%mm2			\n\t" //
-		"movq (%%eax), %%mm1				\n\t" //
-		"movq %%mm0, %%mm3				\n\t"
-		"pmaxub %%mm1, %%mm0				\n\t" //
-		"pminub %%mm3, %%mm1				\n\t" //
-		"pmaxub %%mm2, %%mm1				\n\t" //
-		"pminub %%mm1, %%mm0				\n\t"
-		"movq %%mm0, (%%eax)				\n\t"
-
-		"movq (%0, %1, 4), %%mm0			\n\t" //
-		"movq (%%eax, %1, 2), %%mm1			\n\t" //
-		"movq %%mm2, %%mm3				\n\t"
-		"pmaxub %%mm1, %%mm2				\n\t" //
-		"pminub %%mm3, %%mm1				\n\t" //
-		"pmaxub %%mm0, %%mm1				\n\t" //
-		"pminub %%mm1, %%mm2				\n\t"
-		"movq %%mm2, (%%eax, %1, 2)			\n\t"
-
-		"movq (%%ebx), %%mm2				\n\t" //
-		"movq (%%ebx, %1), %%mm1			\n\t" //
-		"movq %%mm2, %%mm3				\n\t"
-		"pmaxub %%mm0, %%mm2				\n\t" //
-		"pminub %%mm3, %%mm0				\n\t" //
-		"pmaxub %%mm1, %%mm0				\n\t" //
-		"pminub %%mm0, %%mm2				\n\t"
-		"movq %%mm2, (%%ebx)				\n\t"
-
-		"movq %%mm1, (%%ebx, %1, 2)			\n\t"
-
-		: : "r" (src), "r" (stride)
-		: "%eax", "%ebx"
-	);
-#else //MMX & no MMX2
-asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%ebx			\n\t"
-//	0	1	2	3	4	5	6	7	8	9
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
-		"pxor %%mm7, %%mm7				\n\t"
-
-MEDIAN((%0), (%%eax), (%%eax, %1))
-MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
-MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
-
-		"movq (%%ebx, %1), %%mm0			\n\t"
-		"movq %%mm0, (%%ebx, %1, 2)			\n\t"
-
-		: : "r" (src), "r" (stride)
-		: "%eax", "%ebx"
-	);
-
-#endif //MMX
-#else
-	//FIXME
-	int x;
-	for(x=0; x<8; x++)
-	{
-		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
-		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
-		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
-		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
-		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
-		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
-		src[stride*6] = (src[stride*6] +   src[stride*7])>>1;
-		src[stride*7] = src[stride*6];
-		src++;
-	}
-#endif
-}
-
 #ifdef HAVE_ODIVX_POSTPROCESS
 #include "../opendivx/postprocess.h"
 int use_old_pp=0;
@@ -2537,11 +2394,21 @@
 	static uint64_t *yHistogram= NULL;
 	int black=0, white=255; // blackest black and whitest white in the picture
 
+	/* Temporary buffers for handling the last row(s) */
+	static uint8_t *tempDst= NULL;
+	static uint8_t *tempSrc= NULL;
+
 #ifdef TIMING
 	long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
 	sumTime= rdtsc();
 #endif
 
+	if(tempDst==NULL)
+	{
+		tempDst= (uint8_t*)memalign(8, 1024*24);
+		tempSrc= (uint8_t*)memalign(8, 1024*24);
+	}
+
 	if(!yHistogram)
 	{
 		int i;
@@ -2569,7 +2436,6 @@
 //		printf("\n\n");
 
 		/* we allways get a completly black picture first */
-
 		maxClipped= (uint64_t)(sum * maxClippedThreshold);
 
 		clipped= sum;
@@ -2604,16 +2470,40 @@
 		packedYOffset= 0;
 	}
 
+	/* copy first row of 8x8 blocks */
 	for(x=0; x<width; x+=BLOCK_SIZE)
 		blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
 
-	for(y=0; y<height-7; y+=BLOCK_SIZE)
+	for(y=0; y<height; y+=BLOCK_SIZE)
 	{
 		//1% speedup if these are here instead of the inner loop
 		uint8_t *srcBlock= &(src[y*srcStride]);
 		uint8_t *dstBlock= &(dst[y*dstStride]);
-		uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
-		uint8_t *vertBlock= &(dstBlock[dstStride*3]);
+
+		/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
+		   than use a temporary buffer */
+		if(y+15 >= height)
+		{
+			/* copy from line 5 to 12 of src, these will e copied with
+			   blockcopy to dst later */
+			memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
+				srcStride*MAX(height-y-5, 0) );
+
+			/* duplicate last line to fill the void upto line 12 */
+			if(y+12 >= height)
+			{
+				int i;
+				for(i=height-y; i<=12; i++)
+					memcpy(tempSrc + srcStride*i,
+						src + srcStride*(height-1), srcStride);
+			}
+
+
+			/* copy up to 5 lines of dst */
+			memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
+			dstBlock= tempDst;
+			srcBlock= tempSrc;
+		}
 
 		// finish 1 block before the next otherwise we�ll might have a problem
 		// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
@@ -2625,53 +2515,54 @@
 				QPs[(y>>4)*QPStride + (x>>4)];
 			if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
 #ifdef HAVE_MMX
-		asm volatile(
-			"movd %0, %%mm7					\n\t"
-			"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
-			"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
-			"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
-			"movq %%mm7, pQPb				\n\t"
-			: : "r" (QP)
-		);
+			asm volatile(
+				"movd %0, %%mm7					\n\t"
+				"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
+				"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
+				"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
+				"movq %%mm7, pQPb				\n\t"
+				: : "r" (QP)
+			);
 #endif
 
-
-			if(y + 12 < height)
-			{
 #ifdef MORE_TIMING
-				T0= rdtsc();
+			T0= rdtsc();
 #endif
 
 #ifdef HAVE_MMX2
-				prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
-				prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
-				prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
-				prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
+			prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
+			prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
+			prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
+			prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
 #elif defined(HAVE_3DNOW)
 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
-/*				prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
-				prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
-				prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
-				prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
+/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
+			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
+			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
+			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
 */
 #endif
-				if(!isColor) yHistogram[ srcBlock[0] ]++;
+
+			if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
 
-				blockCopy(vertBlock + dstStride*2, dstStride,
-					vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
+			blockCopy(dstBlock + dstStride*5, dstStride,
+				srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
 
-				if(mode & LINEAR_IPOL_DEINT_FILTER)
-					deInterlaceInterpolateLinear(dstBlock, dstStride);
-				else if(mode & LINEAR_BLEND_DEINT_FILTER)
-					deInterlaceBlendLinear(dstBlock, dstStride);
-				else if(mode & MEDIAN_DEINT_FILTER)
-					deInterlaceMedian(dstBlock, dstStride);
-/*				else if(mode & CUBIC_IPOL_DEINT_FILTER)
-					deInterlaceInterpolateCubic(dstBlock, dstStride);
-				else if(mode & CUBIC_BLEND_DEINT_FILTER)
-					deInterlaceBlendCubic(dstBlock, dstStride);
+			if(mode & LINEAR_IPOL_DEINT_FILTER)
+				deInterlaceInterpolateLinear(dstBlock, dstStride);
+			else if(mode & LINEAR_BLEND_DEINT_FILTER)
+				deInterlaceBlendLinear(dstBlock, dstStride);
+			else if(mode & MEDIAN_DEINT_FILTER)
+				deInterlaceMedian(dstBlock, dstStride);
+			else if(mode & CUBIC_IPOL_DEINT_FILTER)
+				deInterlaceInterpolateCubic(dstBlock, dstStride);
+/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
+				deInterlaceBlendCubic(dstBlock, dstStride);
 */
 
+			/* only deblock if we have 2 blocks */
+			if(y + 8 < height)
+			{
 #ifdef MORE_TIMING
 				T1= rdtsc();
 				memcpyTime+= T1-T0;
@@ -2680,18 +2571,18 @@
 				if(mode & V_DEBLOCK)
 				{
 					if(mode & V_RK1_FILTER)
-						vertRK1Filter(vertBlock, stride, QP);
+						vertRK1Filter(dstBlock, stride, QP);
 					else if(mode & V_X1_FILTER)
-						vertX1Filter(vertBlock, stride, QP);
+						vertX1Filter(dstBlock, stride, QP);
 					else
 					{
-						if( isVertDC(vertBlock, stride))
+						if( isVertDC(dstBlock, stride))
 						{
-							if(isVertMinMaxOk(vertBlock, stride, QP))
-								doVertLowPass(vertBlock, stride, QP);
+							if(isVertMinMaxOk(dstBlock, stride, QP))
+								doVertLowPass(dstBlock, stride, QP);
 						}
 						else
-							doVertDefFilter(vertBlock, stride, QP);
+							doVertDefFilter(dstBlock, stride, QP);
 					}
 				}
 #ifdef MORE_TIMING
@@ -2700,24 +2591,8 @@
 				T0=T1;
 #endif
 			}
-			else
-			{
-				blockCopy(vertBlock + dstStride*1, dstStride,
-					vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
 
-				if(mode & LINEAR_IPOL_DEINT_FILTER)
-					deInterlaceInterpolateLinearLastRow(dstBlock, dstStride);
-				else if(mode & LINEAR_BLEND_DEINT_FILTER)
-					deInterlaceBlendLinearLastRow(dstBlock, dstStride);
-				else if(mode & MEDIAN_DEINT_FILTER)
-					deInterlaceMedianLastRow(dstBlock, dstStride);
-/*				else if(mode & CUBIC_IPOL_DEINT_FILTER)
-					deInterlaceInterpolateCubicLastRow(dstBlock, dstStride);
-				else if(mode & CUBIC_BLEND_DEINT_FILTER)
-					deInterlaceBlendCubicLastRow(dstBlock, dstStride);
-*/
-			}
-
+			/* check if we have a previous block to deblock it with dstBlock */
 			if(x - 8 >= 0 && x<width)
 			{
 #ifdef MORE_TIMING
@@ -2749,11 +2624,15 @@
 				dering(dstBlock - stride*9 + width-9, stride, QP);
 			//FIXME dering filter will not be applied to last block (bottom right)
 
-
 			dstBlock+=8;
 			srcBlock+=8;
-			vertBlock+=8;
-			vertSrcBlock+=8;
+		}
+
+		/* did we use a tmp buffer */
+		if(y+15 > height)
+		{
+			uint8_t *dstBlock= &(dst[y*dstStride]);
+			memcpy(dstBlock, tempDst, dstStride*(height-y) );
 		}
 	}
 #ifdef HAVE_3DNOW
@@ -2772,5 +2651,3 @@
 			, black, white);
 #endif
 }
-
-