changeset 2221:9fd911c931cd

minor cleanups, median deinterlace in MMX, fixed typos
author michael
date Tue, 16 Oct 2001 02:31:14 +0000
parents 58555da47151
children ddf897c38fb1
files postproc/postprocess.c postproc/postprocess.h postproc/postprocess_template.c
diffstat 3 files changed, 166 insertions(+), 93 deletions(-) [+]
line wrap: on
line diff
--- a/postproc/postprocess.c	Mon Oct 15 20:09:42 2001 +0000
+++ b/postproc/postprocess.c	Tue Oct 16 02:31:14 2001 +0000
@@ -32,7 +32,7 @@
 Horizontal X1		a		E	E
 LinIpolDeinterlace	a		E	E*
 LinBlendDeinterlace	a		E	E*
-MedianDeinterlace	a		E
+MedianDeinterlace	 	Ec	Ec
 
 
 * i dont have a 3dnow CPU -> its untested
@@ -56,37 +56,17 @@
 make the mainloop more flexible (variable number of blocks at once
 	(the if/else stuff per block is slowing things down)
 compare the quality & speed of all filters
-implement a few simple deinterlacing filters
 split this huge file
 fix warnings (unused vars, ...)
+noise reduction filters
 ...
 
 Notes:
 
+
 */
 
-/*
-Changelog: use the CVS log
-rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
-added deinterlace filters (linear interpolate, linear blend, median)
-minor cleanups (removed some outcommented stuff)
-0.1.3
-	bugfixes: last 3 lines not brightness/contrast corrected
-		brightness statistics messed up with initial black pic
-	changed initial values of the brightness statistics
-	C++ -> C conversation
-	QP range question solved (very likely 1<=QP<=32 according to arpi)
-	new experimental vertical deblocking filter
-	RK filter has 3dNow support now (untested)
-0.1.2
-	fixed a bug in the horizontal default filter
-	3dnow version of the Horizontal & Vertical Lowpass filters
-	mmx version of the Horizontal Default filter
-	mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
-	added mode flags & quality2mode function
-0.1.1
-*/
-
+//Changelog: use the CVS log
 
 #include <inttypes.h>
 #include <stdio.h>
@@ -154,7 +134,7 @@
 //FIXME can never make a movie´s black brighter (anyone needs that?)
 int minAllowedY=0;
 
-#ifdef TIMEING
+#ifdef TIMING
 static inline long long rdtsc()
 {
 	long long l;
@@ -364,7 +344,7 @@
 
 /**
  * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle)
- * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
+ * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
  */
 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
 {
@@ -1583,8 +1563,8 @@
 
 /**
  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
- * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
- * useing the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
+ * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
+ * using the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
  */
 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
 {
@@ -2124,7 +2104,8 @@
  */
 static inline void deInterlaceMedian(uint8_t src[], int stride)
 {
-#if defined (HAVE_MMX2)
+#ifdef HAVE_MMX
+#ifdef HAVE_MMX2
 	asm volatile(
 		"leal (%0, %1), %%eax				\n\t"
 		"leal (%%eax, %1, 4), %%ebx			\n\t"
@@ -2172,6 +2153,48 @@
 		: : "r" (src), "r" (stride)
 		: "%eax", "%ebx"
 	);
+
+#else // MMX without MMX2
+	asm volatile(
+		"leal (%0, %1), %%eax				\n\t"
+		"leal (%%eax, %1, 4), %%ebx			\n\t"
+//	0	1	2	3	4	5	6	7	8	9
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
+		"pxor %%mm7, %%mm7				\n\t"
+
+#define MEDIAN(a,b,c)\
+		"movq " #a ", %%mm0				\n\t"\
+		"movq " #b ", %%mm2				\n\t"\
+		"movq " #c ", %%mm1				\n\t"\
+		"movq %%mm0, %%mm3				\n\t"\
+		"movq %%mm1, %%mm4				\n\t"\
+		"movq %%mm2, %%mm5				\n\t"\
+		"psubusb %%mm1, %%mm3				\n\t"\
+		"psubusb %%mm2, %%mm4				\n\t"\
+		"psubusb %%mm0, %%mm5				\n\t"\
+		"pcmpeqb %%mm7, %%mm3				\n\t"\
+		"pcmpeqb %%mm7, %%mm4				\n\t"\
+		"pcmpeqb %%mm7, %%mm5				\n\t"\
+		"movq %%mm3, %%mm6				\n\t"\
+		"pxor %%mm4, %%mm3				\n\t"\
+		"pxor %%mm5, %%mm4				\n\t"\
+		"pxor %%mm6, %%mm5				\n\t"\
+		"por %%mm3, %%mm1				\n\t"\
+		"por %%mm4, %%mm2				\n\t"\
+		"por %%mm5, %%mm0				\n\t"\
+		"pand %%mm2, %%mm0				\n\t"\
+		"pand %%mm1, %%mm0				\n\t"\
+		"movq %%mm0, " #b "				\n\t"
+
+MEDIAN((%0), (%%eax), (%%eax, %1))
+MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
+MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
+MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
+
+		: : "r" (src), "r" (stride)
+		: "%eax", "%ebx"
+	);
+#endif // HAVE_MMX2
 #else
 	//FIXME
 	int x;
@@ -2193,11 +2216,11 @@
 /**
  * Deinterlaces the given block
  * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
- * will shift the image up by 1 line (FIXME if this is a problem)
  */
 static inline void deInterlaceMedianLastRow(uint8_t src[], int stride)
 {
-#if defined (HAVE_MMX2)
+#ifdef HAVE_MMX
+#ifdef HAVE_MMX2
 	asm volatile(
 		"leal (%0, %1), %%eax				\n\t"
 		"leal (%%eax, %1, 4), %%ebx			\n\t"
@@ -2237,6 +2260,26 @@
 		: : "r" (src), "r" (stride)
 		: "%eax", "%ebx"
 	);
+#else //MMX & no MMX2
+asm volatile(
+		"leal (%0, %1), %%eax				\n\t"
+		"leal (%%eax, %1, 4), %%ebx			\n\t"
+//	0	1	2	3	4	5	6	7	8	9
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
+		"pxor %%mm7, %%mm7				\n\t"
+
+MEDIAN((%0), (%%eax), (%%eax, %1))
+MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
+MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
+
+		"movq (%%ebx, %1), %%mm0			\n\t"
+		"movq %%mm0, (%%ebx, %1, 2)			\n\t"
+
+		: : "r" (src), "r" (stride)
+		: "%eax", "%ebx"
+	);
+
+#endif // HAVE_MMX2
 #else
 	//FIXME
 	int x;
@@ -2255,7 +2298,6 @@
 #endif
 }
 
-
 #ifdef HAVE_ODIVX_POSTPROCESS
 #include "../opendivx/postprocess.h"
 int use_old_pp=0;
@@ -2266,8 +2308,6 @@
 
 /**
  * ...
- * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63)
- * -63 is best quality -1 is worst
  */
 void  postprocess(unsigned char * src[], int src_stride,
                  unsigned char * dst[], int dst_stride,
@@ -2285,9 +2325,6 @@
 	}
 #endif
 
-	// I'm calling this from dec_video.c:video_set_postprocess()
-	// if(mode<0) mode= getModeForQuality(-mode);
-
 /*
 	long long T= rdtsc();
 	for(int y=vertical_size-1; y>=0 ; y--)
@@ -2500,7 +2537,7 @@
 	static uint64_t *yHistogram= NULL;
 	int black=0, white=255; // blackest black and whitest white in the picture
 
-#ifdef TIMEING
+#ifdef TIMING
 	long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
 	sumTime= rdtsc();
 #endif
@@ -2601,7 +2638,7 @@
 
 			if(y + 12 < height)
 			{
-#ifdef MORE_TIMEING
+#ifdef MORE_TIMING
 				T0= rdtsc();
 #endif
 
@@ -2635,7 +2672,7 @@
 					deInterlaceBlendCubic(dstBlock, dstStride);
 */
 
-#ifdef MORE_TIMEING
+#ifdef MORE_TIMING
 				T1= rdtsc();
 				memcpyTime+= T1-T0;
 				T0=T1;
@@ -2657,7 +2694,7 @@
 							doVertDefFilter(vertBlock, stride, QP);
 					}
 				}
-#ifdef MORE_TIMEING
+#ifdef MORE_TIMING
 				T1= rdtsc();
 				vertTime+= T1-T0;
 				T0=T1;
@@ -2683,7 +2720,7 @@
 
 			if(x - 8 >= 0 && x<width)
 			{
-#ifdef MORE_TIMEING
+#ifdef MORE_TIMING
 				T0= rdtsc();
 #endif
 				if(mode & H_DEBLOCK)
@@ -2701,7 +2738,7 @@
 							doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
 					}
 				}
-#ifdef MORE_TIMEING
+#ifdef MORE_TIMING
 				T1= rdtsc();
 				horizTime+= T1-T0;
 				T0=T1;
@@ -2725,7 +2762,7 @@
 	asm volatile("emms");
 #endif
 
-#ifdef TIMEING
+#ifdef TIMING
 	// FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
 	sumTime= rdtsc() - sumTime;
 	if(!isColor)
--- a/postproc/postprocess.h	Mon Oct 15 20:09:42 2001 +0000
+++ b/postproc/postprocess.h	Tue Oct 16 02:31:14 2001 +0000
@@ -47,18 +47,17 @@
 #define H_X1_FILTER	0x2000			// 8192
 
 //Deinterlacing Filters
-#define DEINTERLACE_FILTER_MASK		0xF0000
 #define	LINEAR_IPOL_DEINT_FILTER	0x10000	// 65536
 #define	LINEAR_BLEND_DEINT_FILTER	0x20000	// 131072
-#define	CUBIC_BLEND_DEINT_FILTER	0x30000	// 196608 (not implemented yet)
+//#define	CUBIC_BLEND_DEINT_FILTER	0x8000	// (not implemented yet)
 #define	CUBIC_IPOL_DEINT_FILTER		0x40000	// 262144 (not implemented yet)
-#define	MEDIAN_DEINT_FILTER		0x80000	// 524288 
+#define	MEDIAN_DEINT_FILTER		0x80000	// 524288
 
 
 #define GET_PP_QUALITY_MAX 6
 
-//#define TIMEING
-//#define MORE_TIMEING
+//#define TIMING
+//#define MORE_TIMING
 
 #define QP_STORE_T int
 
--- a/postproc/postprocess_template.c	Mon Oct 15 20:09:42 2001 +0000
+++ b/postproc/postprocess_template.c	Tue Oct 16 02:31:14 2001 +0000
@@ -32,7 +32,7 @@
 Horizontal X1		a		E	E
 LinIpolDeinterlace	a		E	E*
 LinBlendDeinterlace	a		E	E*
-MedianDeinterlace	a		E
+MedianDeinterlace	 	Ec	Ec
 
 
 * i dont have a 3dnow CPU -> its untested
@@ -56,37 +56,17 @@
 make the mainloop more flexible (variable number of blocks at once
 	(the if/else stuff per block is slowing things down)
 compare the quality & speed of all filters
-implement a few simple deinterlacing filters
 split this huge file
 fix warnings (unused vars, ...)
+noise reduction filters
 ...
 
 Notes:
 
+
 */
 
-/*
-Changelog: use the CVS log
-rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
-added deinterlace filters (linear interpolate, linear blend, median)
-minor cleanups (removed some outcommented stuff)
-0.1.3
-	bugfixes: last 3 lines not brightness/contrast corrected
-		brightness statistics messed up with initial black pic
-	changed initial values of the brightness statistics
-	C++ -> C conversation
-	QP range question solved (very likely 1<=QP<=32 according to arpi)
-	new experimental vertical deblocking filter
-	RK filter has 3dNow support now (untested)
-0.1.2
-	fixed a bug in the horizontal default filter
-	3dnow version of the Horizontal & Vertical Lowpass filters
-	mmx version of the Horizontal Default filter
-	mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
-	added mode flags & quality2mode function
-0.1.1
-*/
-
+//Changelog: use the CVS log
 
 #include <inttypes.h>
 #include <stdio.h>
@@ -154,7 +134,7 @@
 //FIXME can never make a movie´s black brighter (anyone needs that?)
 int minAllowedY=0;
 
-#ifdef TIMEING
+#ifdef TIMING
 static inline long long rdtsc()
 {
 	long long l;
@@ -364,7 +344,7 @@
 
 /**
  * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle)
- * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
+ * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
  */
 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
 {
@@ -1583,8 +1563,8 @@
 
 /**
  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
- * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
- * useing the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
+ * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
+ * using the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
  */
 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
 {
@@ -2124,7 +2104,8 @@
  */
 static inline void deInterlaceMedian(uint8_t src[], int stride)
 {
-#if defined (HAVE_MMX2)
+#ifdef HAVE_MMX
+#ifdef HAVE_MMX2
 	asm volatile(
 		"leal (%0, %1), %%eax				\n\t"
 		"leal (%%eax, %1, 4), %%ebx			\n\t"
@@ -2172,6 +2153,48 @@
 		: : "r" (src), "r" (stride)
 		: "%eax", "%ebx"
 	);
+
+#else // MMX without MMX2
+	asm volatile(
+		"leal (%0, %1), %%eax				\n\t"
+		"leal (%%eax, %1, 4), %%ebx			\n\t"
+//	0	1	2	3	4	5	6	7	8	9
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
+		"pxor %%mm7, %%mm7				\n\t"
+
+#define MEDIAN(a,b,c)\
+		"movq " #a ", %%mm0				\n\t"\
+		"movq " #b ", %%mm2				\n\t"\
+		"movq " #c ", %%mm1				\n\t"\
+		"movq %%mm0, %%mm3				\n\t"\
+		"movq %%mm1, %%mm4				\n\t"\
+		"movq %%mm2, %%mm5				\n\t"\
+		"psubusb %%mm1, %%mm3				\n\t"\
+		"psubusb %%mm2, %%mm4				\n\t"\
+		"psubusb %%mm0, %%mm5				\n\t"\
+		"pcmpeqb %%mm7, %%mm3				\n\t"\
+		"pcmpeqb %%mm7, %%mm4				\n\t"\
+		"pcmpeqb %%mm7, %%mm5				\n\t"\
+		"movq %%mm3, %%mm6				\n\t"\
+		"pxor %%mm4, %%mm3				\n\t"\
+		"pxor %%mm5, %%mm4				\n\t"\
+		"pxor %%mm6, %%mm5				\n\t"\
+		"por %%mm3, %%mm1				\n\t"\
+		"por %%mm4, %%mm2				\n\t"\
+		"por %%mm5, %%mm0				\n\t"\
+		"pand %%mm2, %%mm0				\n\t"\
+		"pand %%mm1, %%mm0				\n\t"\
+		"movq %%mm0, " #b "				\n\t"
+
+MEDIAN((%0), (%%eax), (%%eax, %1))
+MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
+MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
+MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
+
+		: : "r" (src), "r" (stride)
+		: "%eax", "%ebx"
+	);
+#endif // HAVE_MMX2
 #else
 	//FIXME
 	int x;
@@ -2193,11 +2216,11 @@
 /**
  * Deinterlaces the given block
  * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
- * will shift the image up by 1 line (FIXME if this is a problem)
  */
 static inline void deInterlaceMedianLastRow(uint8_t src[], int stride)
 {
-#if defined (HAVE_MMX2)
+#ifdef HAVE_MMX
+#ifdef HAVE_MMX2
 	asm volatile(
 		"leal (%0, %1), %%eax				\n\t"
 		"leal (%%eax, %1, 4), %%ebx			\n\t"
@@ -2237,6 +2260,26 @@
 		: : "r" (src), "r" (stride)
 		: "%eax", "%ebx"
 	);
+#else //MMX & no MMX2
+asm volatile(
+		"leal (%0, %1), %%eax				\n\t"
+		"leal (%%eax, %1, 4), %%ebx			\n\t"
+//	0	1	2	3	4	5	6	7	8	9
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
+		"pxor %%mm7, %%mm7				\n\t"
+
+MEDIAN((%0), (%%eax), (%%eax, %1))
+MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
+MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
+
+		"movq (%%ebx, %1), %%mm0			\n\t"
+		"movq %%mm0, (%%ebx, %1, 2)			\n\t"
+
+		: : "r" (src), "r" (stride)
+		: "%eax", "%ebx"
+	);
+
+#endif // HAVE_MMX2
 #else
 	//FIXME
 	int x;
@@ -2255,7 +2298,6 @@
 #endif
 }
 
-
 #ifdef HAVE_ODIVX_POSTPROCESS
 #include "../opendivx/postprocess.h"
 int use_old_pp=0;
@@ -2266,8 +2308,6 @@
 
 /**
  * ...
- * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63)
- * -63 is best quality -1 is worst
  */
 void  postprocess(unsigned char * src[], int src_stride,
                  unsigned char * dst[], int dst_stride,
@@ -2285,9 +2325,6 @@
 	}
 #endif
 
-	// I'm calling this from dec_video.c:video_set_postprocess()
-	// if(mode<0) mode= getModeForQuality(-mode);
-
 /*
 	long long T= rdtsc();
 	for(int y=vertical_size-1; y>=0 ; y--)
@@ -2500,7 +2537,7 @@
 	static uint64_t *yHistogram= NULL;
 	int black=0, white=255; // blackest black and whitest white in the picture
 
-#ifdef TIMEING
+#ifdef TIMING
 	long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
 	sumTime= rdtsc();
 #endif
@@ -2601,7 +2638,7 @@
 
 			if(y + 12 < height)
 			{
-#ifdef MORE_TIMEING
+#ifdef MORE_TIMING
 				T0= rdtsc();
 #endif
 
@@ -2635,7 +2672,7 @@
 					deInterlaceBlendCubic(dstBlock, dstStride);
 */
 
-#ifdef MORE_TIMEING
+#ifdef MORE_TIMING
 				T1= rdtsc();
 				memcpyTime+= T1-T0;
 				T0=T1;
@@ -2657,7 +2694,7 @@
 							doVertDefFilter(vertBlock, stride, QP);
 					}
 				}
-#ifdef MORE_TIMEING
+#ifdef MORE_TIMING
 				T1= rdtsc();
 				vertTime+= T1-T0;
 				T0=T1;
@@ -2683,7 +2720,7 @@
 
 			if(x - 8 >= 0 && x<width)
 			{
-#ifdef MORE_TIMEING
+#ifdef MORE_TIMING
 				T0= rdtsc();
 #endif
 				if(mode & H_DEBLOCK)
@@ -2701,7 +2738,7 @@
 							doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
 					}
 				}
-#ifdef MORE_TIMEING
+#ifdef MORE_TIMING
 				T1= rdtsc();
 				horizTime+= T1-T0;
 				T0=T1;
@@ -2725,7 +2762,7 @@
 	asm volatile("emms");
 #endif
 
-#ifdef TIMEING
+#ifdef TIMING
 	// FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
 	sumTime= rdtsc() - sumTime;
 	if(!isColor)