diff libmpcodecs/vf_spp.c @ 32702:7af3e6f901fd

Convert some tabs to spaces to allow using MPlayer filter source code in FFmpeg.
author cehoyos
date Fri, 14 Jan 2011 22:10:21 +0000
parents 8fa2f43cb760
children 30f5e5cd3676
line wrap: on
line diff
--- a/libmpcodecs/vf_spp.c	Fri Jan 14 13:36:48 2011 +0000
+++ b/libmpcodecs/vf_spp.c	Fri Jan 14 22:10:21 2011 +0000
@@ -91,282 +91,282 @@
 };
 
 struct vf_priv_s {
-	int log2_count;
-	int qp;
-	int mode;
-	int mpeg2;
-	int temp_stride;
-	uint8_t *src;
-	int16_t *temp;
-	AVCodecContext *avctx;
-	DSPContext dsp;
+        int log2_count;
+        int qp;
+        int mode;
+        int mpeg2;
+        int temp_stride;
+        uint8_t *src;
+        int16_t *temp;
+        AVCodecContext *avctx;
+        DSPContext dsp;
         char *non_b_qp;
 };
 
 #define SHIFT 22
 
 static void hardthresh_c(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){
-	int i;
-	int bias= 0; //FIXME
-	unsigned int threshold1, threshold2;
+        int i;
+        int bias= 0; //FIXME
+        unsigned int threshold1, threshold2;
 
-	threshold1= qp*((1<<4) - bias) - 1;
-	threshold2= (threshold1<<1);
+        threshold1= qp*((1<<4) - bias) - 1;
+        threshold2= (threshold1<<1);
 
-	memset(dst, 0, 64*sizeof(DCTELEM));
-	dst[0]= (src[0] + 4)>>3;
+        memset(dst, 0, 64*sizeof(DCTELEM));
+        dst[0]= (src[0] + 4)>>3;
 
         for(i=1; i<64; i++){
-		int level= src[i];
-		if(((unsigned)(level+threshold1))>threshold2){
-			const int j= permutation[i];
-			dst[j]= (level + 4)>>3;
-		}
-	}
+                int level= src[i];
+                if(((unsigned)(level+threshold1))>threshold2){
+                        const int j= permutation[i];
+                        dst[j]= (level + 4)>>3;
+                }
+        }
 }
 
 static void softthresh_c(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){
-	int i;
-	int bias= 0; //FIXME
-	unsigned int threshold1, threshold2;
+        int i;
+        int bias= 0; //FIXME
+        unsigned int threshold1, threshold2;
 
-	threshold1= qp*((1<<4) - bias) - 1;
-	threshold2= (threshold1<<1);
+        threshold1= qp*((1<<4) - bias) - 1;
+        threshold2= (threshold1<<1);
 
-	memset(dst, 0, 64*sizeof(DCTELEM));
-	dst[0]= (src[0] + 4)>>3;
+        memset(dst, 0, 64*sizeof(DCTELEM));
+        dst[0]= (src[0] + 4)>>3;
 
         for(i=1; i<64; i++){
-		int level= src[i];
-		if(((unsigned)(level+threshold1))>threshold2){
-			const int j= permutation[i];
-			if(level>0)
-				dst[j]= (level - threshold1 + 4)>>3;
-			else
-				dst[j]= (level + threshold1 + 4)>>3;
-		}
-	}
+                int level= src[i];
+                if(((unsigned)(level+threshold1))>threshold2){
+                        const int j= permutation[i];
+                        if(level>0)
+                                dst[j]= (level - threshold1 + 4)>>3;
+                        else
+                                dst[j]= (level + threshold1 + 4)>>3;
+                }
+        }
 }
 
 #if HAVE_MMX
 static void hardthresh_mmx(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){
-	int bias= 0; //FIXME
-	unsigned int threshold1;
+        int bias= 0; //FIXME
+        unsigned int threshold1;
 
-	threshold1= qp*((1<<4) - bias) - 1;
+        threshold1= qp*((1<<4) - bias) - 1;
 
         __asm__ volatile(
 #define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
-		"movq " #src0 ", %%mm0	\n\t"\
-		"movq " #src1 ", %%mm1	\n\t"\
-		"movq " #src2 ", %%mm2	\n\t"\
-		"movq " #src3 ", %%mm3	\n\t"\
-		"psubw %%mm4, %%mm0	\n\t"\
-		"psubw %%mm4, %%mm1	\n\t"\
-		"psubw %%mm4, %%mm2	\n\t"\
-		"psubw %%mm4, %%mm3	\n\t"\
-		"paddusw %%mm5, %%mm0	\n\t"\
-		"paddusw %%mm5, %%mm1	\n\t"\
-		"paddusw %%mm5, %%mm2	\n\t"\
-		"paddusw %%mm5, %%mm3	\n\t"\
-		"paddw %%mm6, %%mm0	\n\t"\
-		"paddw %%mm6, %%mm1	\n\t"\
-		"paddw %%mm6, %%mm2	\n\t"\
-		"paddw %%mm6, %%mm3	\n\t"\
-		"psubusw %%mm6, %%mm0	\n\t"\
-		"psubusw %%mm6, %%mm1	\n\t"\
-		"psubusw %%mm6, %%mm2	\n\t"\
-		"psubusw %%mm6, %%mm3	\n\t"\
-		"psraw $3, %%mm0	\n\t"\
-		"psraw $3, %%mm1	\n\t"\
-		"psraw $3, %%mm2	\n\t"\
-		"psraw $3, %%mm3	\n\t"\
+                "movq " #src0 ", %%mm0        \n\t"\
+                "movq " #src1 ", %%mm1        \n\t"\
+                "movq " #src2 ", %%mm2        \n\t"\
+                "movq " #src3 ", %%mm3        \n\t"\
+                "psubw %%mm4, %%mm0        \n\t"\
+                "psubw %%mm4, %%mm1        \n\t"\
+                "psubw %%mm4, %%mm2        \n\t"\
+                "psubw %%mm4, %%mm3        \n\t"\
+                "paddusw %%mm5, %%mm0        \n\t"\
+                "paddusw %%mm5, %%mm1        \n\t"\
+                "paddusw %%mm5, %%mm2        \n\t"\
+                "paddusw %%mm5, %%mm3        \n\t"\
+                "paddw %%mm6, %%mm0        \n\t"\
+                "paddw %%mm6, %%mm1        \n\t"\
+                "paddw %%mm6, %%mm2        \n\t"\
+                "paddw %%mm6, %%mm3        \n\t"\
+                "psubusw %%mm6, %%mm0        \n\t"\
+                "psubusw %%mm6, %%mm1        \n\t"\
+                "psubusw %%mm6, %%mm2        \n\t"\
+                "psubusw %%mm6, %%mm3        \n\t"\
+                "psraw $3, %%mm0        \n\t"\
+                "psraw $3, %%mm1        \n\t"\
+                "psraw $3, %%mm2        \n\t"\
+                "psraw $3, %%mm3        \n\t"\
 \
-		"movq %%mm0, %%mm7	\n\t"\
-		"punpcklwd %%mm2, %%mm0	\n\t" /*A*/\
-		"punpckhwd %%mm2, %%mm7	\n\t" /*C*/\
-		"movq %%mm1, %%mm2	\n\t"\
-		"punpcklwd %%mm3, %%mm1	\n\t" /*B*/\
-		"punpckhwd %%mm3, %%mm2	\n\t" /*D*/\
-		"movq %%mm0, %%mm3	\n\t"\
-		"punpcklwd %%mm1, %%mm0	\n\t" /*A*/\
-		"punpckhwd %%mm7, %%mm3	\n\t" /*C*/\
-		"punpcklwd %%mm2, %%mm7	\n\t" /*B*/\
-		"punpckhwd %%mm2, %%mm1	\n\t" /*D*/\
+                "movq %%mm0, %%mm7        \n\t"\
+                "punpcklwd %%mm2, %%mm0        \n\t" /*A*/\
+                "punpckhwd %%mm2, %%mm7        \n\t" /*C*/\
+                "movq %%mm1, %%mm2        \n\t"\
+                "punpcklwd %%mm3, %%mm1        \n\t" /*B*/\
+                "punpckhwd %%mm3, %%mm2        \n\t" /*D*/\
+                "movq %%mm0, %%mm3        \n\t"\
+                "punpcklwd %%mm1, %%mm0        \n\t" /*A*/\
+                "punpckhwd %%mm7, %%mm3        \n\t" /*C*/\
+                "punpcklwd %%mm2, %%mm7        \n\t" /*B*/\
+                "punpckhwd %%mm2, %%mm1        \n\t" /*D*/\
 \
-		"movq %%mm0, " #dst0 "	\n\t"\
-		"movq %%mm7, " #dst1 "	\n\t"\
-		"movq %%mm3, " #dst2 "	\n\t"\
-		"movq %%mm1, " #dst3 "	\n\t"
+                "movq %%mm0, " #dst0 "        \n\t"\
+                "movq %%mm7, " #dst1 "        \n\t"\
+                "movq %%mm3, " #dst2 "        \n\t"\
+                "movq %%mm1, " #dst3 "        \n\t"
 
-		"movd %2, %%mm4		\n\t"
-		"movd %3, %%mm5		\n\t"
-		"movd %4, %%mm6		\n\t"
-		"packssdw %%mm4, %%mm4	\n\t"
-		"packssdw %%mm5, %%mm5	\n\t"
-		"packssdw %%mm6, %%mm6	\n\t"
-		"packssdw %%mm4, %%mm4	\n\t"
-		"packssdw %%mm5, %%mm5	\n\t"
-		"packssdw %%mm6, %%mm6	\n\t"
-		REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
-		REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
-		REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
-		REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
-		: : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate then needed?
-	);
-	dst[0]= (src[0] + 4)>>3;
+                "movd %2, %%mm4                \n\t"
+                "movd %3, %%mm5                \n\t"
+                "movd %4, %%mm6                \n\t"
+                "packssdw %%mm4, %%mm4        \n\t"
+                "packssdw %%mm5, %%mm5        \n\t"
+                "packssdw %%mm6, %%mm6        \n\t"
+                "packssdw %%mm4, %%mm4        \n\t"
+                "packssdw %%mm5, %%mm5        \n\t"
+                "packssdw %%mm6, %%mm6        \n\t"
+                REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
+                REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
+                REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
+                REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
+                : : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate then needed?
+        );
+        dst[0]= (src[0] + 4)>>3;
 }
 
 static void softthresh_mmx(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){
-	int bias= 0; //FIXME
-	unsigned int threshold1;
+        int bias= 0; //FIXME
+        unsigned int threshold1;
 
-	threshold1= qp*((1<<4) - bias) - 1;
+        threshold1= qp*((1<<4) - bias) - 1;
 
         __asm__ volatile(
 #undef REQUANT_CORE
 #define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
-		"movq " #src0 ", %%mm0	\n\t"\
-		"movq " #src1 ", %%mm1	\n\t"\
-		"pxor %%mm6, %%mm6	\n\t"\
-		"pxor %%mm7, %%mm7	\n\t"\
-		"pcmpgtw %%mm0, %%mm6	\n\t"\
-		"pcmpgtw %%mm1, %%mm7	\n\t"\
-		"pxor %%mm6, %%mm0	\n\t"\
-		"pxor %%mm7, %%mm1	\n\t"\
-		"psubusw %%mm4, %%mm0	\n\t"\
-		"psubusw %%mm4, %%mm1	\n\t"\
-		"pxor %%mm6, %%mm0	\n\t"\
-		"pxor %%mm7, %%mm1	\n\t"\
-		"movq " #src2 ", %%mm2	\n\t"\
-		"movq " #src3 ", %%mm3	\n\t"\
-		"pxor %%mm6, %%mm6	\n\t"\
-		"pxor %%mm7, %%mm7	\n\t"\
-		"pcmpgtw %%mm2, %%mm6	\n\t"\
-		"pcmpgtw %%mm3, %%mm7	\n\t"\
-		"pxor %%mm6, %%mm2	\n\t"\
-		"pxor %%mm7, %%mm3	\n\t"\
-		"psubusw %%mm4, %%mm2	\n\t"\
-		"psubusw %%mm4, %%mm3	\n\t"\
-		"pxor %%mm6, %%mm2	\n\t"\
-		"pxor %%mm7, %%mm3	\n\t"\
+                "movq " #src0 ", %%mm0        \n\t"\
+                "movq " #src1 ", %%mm1        \n\t"\
+                "pxor %%mm6, %%mm6        \n\t"\
+                "pxor %%mm7, %%mm7        \n\t"\
+                "pcmpgtw %%mm0, %%mm6        \n\t"\
+                "pcmpgtw %%mm1, %%mm7        \n\t"\
+                "pxor %%mm6, %%mm0        \n\t"\
+                "pxor %%mm7, %%mm1        \n\t"\
+                "psubusw %%mm4, %%mm0        \n\t"\
+                "psubusw %%mm4, %%mm1        \n\t"\
+                "pxor %%mm6, %%mm0        \n\t"\
+                "pxor %%mm7, %%mm1        \n\t"\
+                "movq " #src2 ", %%mm2        \n\t"\
+                "movq " #src3 ", %%mm3        \n\t"\
+                "pxor %%mm6, %%mm6        \n\t"\
+                "pxor %%mm7, %%mm7        \n\t"\
+                "pcmpgtw %%mm2, %%mm6        \n\t"\
+                "pcmpgtw %%mm3, %%mm7        \n\t"\
+                "pxor %%mm6, %%mm2        \n\t"\
+                "pxor %%mm7, %%mm3        \n\t"\
+                "psubusw %%mm4, %%mm2        \n\t"\
+                "psubusw %%mm4, %%mm3        \n\t"\
+                "pxor %%mm6, %%mm2        \n\t"\
+                "pxor %%mm7, %%mm3        \n\t"\
 \
-		"paddsw %%mm5, %%mm0	\n\t"\
-		"paddsw %%mm5, %%mm1	\n\t"\
-		"paddsw %%mm5, %%mm2	\n\t"\
-		"paddsw %%mm5, %%mm3	\n\t"\
-		"psraw $3, %%mm0	\n\t"\
-		"psraw $3, %%mm1	\n\t"\
-		"psraw $3, %%mm2	\n\t"\
-		"psraw $3, %%mm3	\n\t"\
+                "paddsw %%mm5, %%mm0        \n\t"\
+                "paddsw %%mm5, %%mm1        \n\t"\
+                "paddsw %%mm5, %%mm2        \n\t"\
+                "paddsw %%mm5, %%mm3        \n\t"\
+                "psraw $3, %%mm0        \n\t"\
+                "psraw $3, %%mm1        \n\t"\
+                "psraw $3, %%mm2        \n\t"\
+                "psraw $3, %%mm3        \n\t"\
 \
-		"movq %%mm0, %%mm7	\n\t"\
-		"punpcklwd %%mm2, %%mm0	\n\t" /*A*/\
-		"punpckhwd %%mm2, %%mm7	\n\t" /*C*/\
-		"movq %%mm1, %%mm2	\n\t"\
-		"punpcklwd %%mm3, %%mm1	\n\t" /*B*/\
-		"punpckhwd %%mm3, %%mm2	\n\t" /*D*/\
-		"movq %%mm0, %%mm3	\n\t"\
-		"punpcklwd %%mm1, %%mm0	\n\t" /*A*/\
-		"punpckhwd %%mm7, %%mm3	\n\t" /*C*/\
-		"punpcklwd %%mm2, %%mm7	\n\t" /*B*/\
-		"punpckhwd %%mm2, %%mm1	\n\t" /*D*/\
+                "movq %%mm0, %%mm7        \n\t"\
+                "punpcklwd %%mm2, %%mm0        \n\t" /*A*/\
+                "punpckhwd %%mm2, %%mm7        \n\t" /*C*/\
+                "movq %%mm1, %%mm2        \n\t"\
+                "punpcklwd %%mm3, %%mm1        \n\t" /*B*/\
+                "punpckhwd %%mm3, %%mm2        \n\t" /*D*/\
+                "movq %%mm0, %%mm3        \n\t"\
+                "punpcklwd %%mm1, %%mm0        \n\t" /*A*/\
+                "punpckhwd %%mm7, %%mm3        \n\t" /*C*/\
+                "punpcklwd %%mm2, %%mm7        \n\t" /*B*/\
+                "punpckhwd %%mm2, %%mm1        \n\t" /*D*/\
 \
-		"movq %%mm0, " #dst0 "	\n\t"\
-		"movq %%mm7, " #dst1 "	\n\t"\
-		"movq %%mm3, " #dst2 "	\n\t"\
-		"movq %%mm1, " #dst3 "	\n\t"
+                "movq %%mm0, " #dst0 "        \n\t"\
+                "movq %%mm7, " #dst1 "        \n\t"\
+                "movq %%mm3, " #dst2 "        \n\t"\
+                "movq %%mm1, " #dst3 "        \n\t"
 
-		"movd %2, %%mm4		\n\t"
-		"movd %3, %%mm5		\n\t"
-		"packssdw %%mm4, %%mm4	\n\t"
-		"packssdw %%mm5, %%mm5	\n\t"
-		"packssdw %%mm4, %%mm4	\n\t"
-		"packssdw %%mm5, %%mm5	\n\t"
-		REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
-		REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
-		REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
-		REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
-		: : "r" (src), "r" (dst), "g" (threshold1), "rm" (4) //FIXME maybe more accurate then needed?
-	);
+                "movd %2, %%mm4                \n\t"
+                "movd %3, %%mm5                \n\t"
+                "packssdw %%mm4, %%mm4        \n\t"
+                "packssdw %%mm5, %%mm5        \n\t"
+                "packssdw %%mm4, %%mm4        \n\t"
+                "packssdw %%mm5, %%mm5        \n\t"
+                REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
+                REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
+                REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
+                REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
+                : : "r" (src), "r" (dst), "g" (threshold1), "rm" (4) //FIXME maybe more accurate then needed?
+        );
 
-	dst[0]= (src[0] + 4)>>3;
+        dst[0]= (src[0] + 4)>>3;
 }
 #endif
 
 static inline void add_block(int16_t *dst, int stride, DCTELEM block[64]){
-	int y;
+        int y;
 
-	for(y=0; y<8; y++){
-		*(uint32_t*)&dst[0 + y*stride]+= *(uint32_t*)&block[0 + y*8];
-		*(uint32_t*)&dst[2 + y*stride]+= *(uint32_t*)&block[2 + y*8];
-		*(uint32_t*)&dst[4 + y*stride]+= *(uint32_t*)&block[4 + y*8];
-		*(uint32_t*)&dst[6 + y*stride]+= *(uint32_t*)&block[6 + y*8];
-	}
+        for(y=0; y<8; y++){
+                *(uint32_t*)&dst[0 + y*stride]+= *(uint32_t*)&block[0 + y*8];
+                *(uint32_t*)&dst[2 + y*stride]+= *(uint32_t*)&block[2 + y*8];
+                *(uint32_t*)&dst[4 + y*stride]+= *(uint32_t*)&block[4 + y*8];
+                *(uint32_t*)&dst[6 + y*stride]+= *(uint32_t*)&block[6 + y*8];
+        }
 }
 
 static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale){
-	int y, x;
+        int y, x;
 
 #define STORE(pos) \
-	temp= ((src[x + y*src_stride + pos]<<log2_scale) + d[pos])>>6;\
-	if(temp & 0x100) temp= ~(temp>>31);\
-	dst[x + y*dst_stride + pos]= temp;
+        temp= ((src[x + y*src_stride + pos]<<log2_scale) + d[pos])>>6;\
+        if(temp & 0x100) temp= ~(temp>>31);\
+        dst[x + y*dst_stride + pos]= temp;
 
-	for(y=0; y<height; y++){
-		const uint8_t *d= dither[y];
-		for(x=0; x<width; x+=8){
-			int temp;
-			STORE(0);
-			STORE(1);
-			STORE(2);
-			STORE(3);
-			STORE(4);
-			STORE(5);
-			STORE(6);
-			STORE(7);
-		}
-	}
+        for(y=0; y<height; y++){
+                const uint8_t *d= dither[y];
+                for(x=0; x<width; x+=8){
+                        int temp;
+                        STORE(0);
+                        STORE(1);
+                        STORE(2);
+                        STORE(3);
+                        STORE(4);
+                        STORE(5);
+                        STORE(6);
+                        STORE(7);
+                }
+        }
 }
 
 #if HAVE_MMX
 static void store_slice_mmx(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale){
-	int y;
+        int y;
 
-	for(y=0; y<height; y++){
-		uint8_t *dst1= dst;
-		int16_t *src1= src;
-		__asm__ volatile(
-			"movq (%3), %%mm3	\n\t"
-			"movq (%3), %%mm4	\n\t"
-			"movd %4, %%mm2		\n\t"
-			"pxor %%mm0, %%mm0	\n\t"
-			"punpcklbw %%mm0, %%mm3	\n\t"
-			"punpckhbw %%mm0, %%mm4	\n\t"
-			"psraw %%mm2, %%mm3	\n\t"
-			"psraw %%mm2, %%mm4	\n\t"
-			"movd %5, %%mm2		\n\t"
-			"1:			\n\t"
-			"movq (%0), %%mm0	\n\t"
-			"movq 8(%0), %%mm1	\n\t"
-			"paddw %%mm3, %%mm0	\n\t"
-			"paddw %%mm4, %%mm1	\n\t"
-			"psraw %%mm2, %%mm0	\n\t"
-			"psraw %%mm2, %%mm1	\n\t"
-			"packuswb %%mm1, %%mm0	\n\t"
-			"movq %%mm0, (%1) 	\n\t"
-			"add $16, %0		\n\t"
-			"add $8, %1		\n\t"
-			"cmp %2, %1		\n\t"
-			" jb 1b			\n\t"
-			: "+r" (src1), "+r"(dst1)
-			: "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(6-log2_scale)
-		);
-		src += src_stride;
-		dst += dst_stride;
-	}
-//	if(width != mmxw)
-//		store_slice_c(dst + mmxw, src + mmxw, dst_stride, src_stride, width - mmxw, log2_scale);
+        for(y=0; y<height; y++){
+                uint8_t *dst1= dst;
+                int16_t *src1= src;
+                __asm__ volatile(
+                        "movq (%3), %%mm3        \n\t"
+                        "movq (%3), %%mm4        \n\t"
+                        "movd %4, %%mm2                \n\t"
+                        "pxor %%mm0, %%mm0        \n\t"
+                        "punpcklbw %%mm0, %%mm3        \n\t"
+                        "punpckhbw %%mm0, %%mm4        \n\t"
+                        "psraw %%mm2, %%mm3        \n\t"
+                        "psraw %%mm2, %%mm4        \n\t"
+                        "movd %5, %%mm2                \n\t"
+                        "1:                        \n\t"
+                        "movq (%0), %%mm0        \n\t"
+                        "movq 8(%0), %%mm1        \n\t"
+                        "paddw %%mm3, %%mm0        \n\t"
+                        "paddw %%mm4, %%mm1        \n\t"
+                        "psraw %%mm2, %%mm0        \n\t"
+                        "psraw %%mm2, %%mm1        \n\t"
+                        "packuswb %%mm1, %%mm0        \n\t"
+                        "movq %%mm0, (%1)         \n\t"
+                        "add $16, %0                \n\t"
+                        "add $8, %1                \n\t"
+                        "cmp %2, %1                \n\t"
+                        " jb 1b                        \n\t"
+                        : "+r" (src1), "+r"(dst1)
+                        : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(6-log2_scale)
+                );
+                src += src_stride;
+                dst += dst_stride;
+        }
+//        if(width != mmxw)
+//                store_slice_c(dst + mmxw, src + mmxw, dst_stride, src_stride, width - mmxw, log2_scale);
 }
 #endif
 
@@ -375,77 +375,77 @@
 static void (*requantize)(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation)= hardthresh_c;
 
 static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int width, int height, uint8_t *qp_store, int qp_stride, int is_luma){
-	int x, y, i;
-	const int count= 1<<p->log2_count;
-	const int stride= is_luma ? p->temp_stride : ((width+16+15)&(~15));
-	uint64_t __attribute__((aligned(16))) block_align[32];
-	DCTELEM *block = (DCTELEM *)block_align;
-	DCTELEM *block2= (DCTELEM *)(block_align+16);
+        int x, y, i;
+        const int count= 1<<p->log2_count;
+        const int stride= is_luma ? p->temp_stride : ((width+16+15)&(~15));
+        uint64_t __attribute__((aligned(16))) block_align[32];
+        DCTELEM *block = (DCTELEM *)block_align;
+        DCTELEM *block2= (DCTELEM *)(block_align+16);
 
-	if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
-	for(y=0; y<height; y++){
-		int index= 8 + 8*stride + y*stride;
-		fast_memcpy(p->src + index, src + y*src_stride, width);
-		for(x=0; x<8; x++){
-			p->src[index         - x - 1]= p->src[index +         x    ];
-			p->src[index + width + x    ]= p->src[index + width - x - 1];
-		}
-	}
-	for(y=0; y<8; y++){
-		fast_memcpy(p->src + (      7-y)*stride, p->src + (      y+8)*stride, stride);
-		fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
-	}
-	//FIXME (try edge emu)
+        if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
+        for(y=0; y<height; y++){
+                int index= 8 + 8*stride + y*stride;
+                fast_memcpy(p->src + index, src + y*src_stride, width);
+                for(x=0; x<8; x++){
+                        p->src[index         - x - 1]= p->src[index +         x    ];
+                        p->src[index + width + x    ]= p->src[index + width - x - 1];
+                }
+        }
+        for(y=0; y<8; y++){
+                fast_memcpy(p->src + (      7-y)*stride, p->src + (      y+8)*stride, stride);
+                fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
+        }
+        //FIXME (try edge emu)
 
-	for(y=0; y<height+8; y+=8){
-		memset(p->temp + (8+y)*stride, 0, 8*stride*sizeof(int16_t));
-		for(x=0; x<width+8; x+=8){
-			const int qps= 3 + is_luma;
-			int qp;
+        for(y=0; y<height+8; y+=8){
+                memset(p->temp + (8+y)*stride, 0, 8*stride*sizeof(int16_t));
+                for(x=0; x<width+8; x+=8){
+                        const int qps= 3 + is_luma;
+                        int qp;
 
-			if(p->qp)
-				qp= p->qp;
-			else{
-				qp= qp_store[ (XMIN(x, width-1)>>qps) + (XMIN(y, height-1)>>qps) * qp_stride];
-				qp = FFMAX(1, norm_qscale(qp, p->mpeg2));
-			}
-			for(i=0; i<count; i++){
-				const int x1= x + offset[i+count-1][0];
-				const int y1= y + offset[i+count-1][1];
-				const int index= x1 + y1*stride;
-				p->dsp.get_pixels(block, p->src + index, stride);
-				p->dsp.fdct(block);
-				requantize(block2, block, qp, p->dsp.idct_permutation);
-				p->dsp.idct(block2);
-				add_block(p->temp + index, stride, block2);
-			}
-		}
-		if(y)
-			store_slice(dst + (y-8)*dst_stride, p->temp + 8 + y*stride, dst_stride, stride, width, XMIN(8, height+8-y), 6-p->log2_count);
-	}
+                        if(p->qp)
+                                qp= p->qp;
+                        else{
+                                qp= qp_store[ (XMIN(x, width-1)>>qps) + (XMIN(y, height-1)>>qps) * qp_stride];
+                                qp = FFMAX(1, norm_qscale(qp, p->mpeg2));
+                        }
+                        for(i=0; i<count; i++){
+                                const int x1= x + offset[i+count-1][0];
+                                const int y1= y + offset[i+count-1][1];
+                                const int index= x1 + y1*stride;
+                                p->dsp.get_pixels(block, p->src + index, stride);
+                                p->dsp.fdct(block);
+                                requantize(block2, block, qp, p->dsp.idct_permutation);
+                                p->dsp.idct(block2);
+                                add_block(p->temp + index, stride, block2);
+                        }
+                }
+                if(y)
+                        store_slice(dst + (y-8)*dst_stride, p->temp + 8 + y*stride, dst_stride, stride, width, XMIN(8, height+8-y), 6-p->log2_count);
+        }
 #if 0
-	for(y=0; y<height; y++){
-		for(x=0; x<width; x++){
-			if((((x>>6) ^ (y>>6)) & 1) == 0)
-				dst[x + y*dst_stride]= p->src[8 + 8*stride  + x + y*stride];
-			if((x&63) == 0 || (y&63)==0)
-				dst[x + y*dst_stride] += 128;
+        for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                        if((((x>>6) ^ (y>>6)) & 1) == 0)
+                                dst[x + y*dst_stride]= p->src[8 + 8*stride  + x + y*stride];
+                        if((x&63) == 0 || (y&63)==0)
+                                dst[x + y*dst_stride] += 128;
                 }
-	}
+        }
 #endif
-	//FIXME reorder for better caching
+        //FIXME reorder for better caching
 }
 
 static int config(struct vf_instance *vf,
         int width, int height, int d_width, int d_height,
-	unsigned int flags, unsigned int outfmt){
-	int h= (height+16+15)&(~15);
+        unsigned int flags, unsigned int outfmt){
+        int h= (height+16+15)&(~15);
 
-	vf->priv->temp_stride= (width+16+15)&(~15);
+        vf->priv->temp_stride= (width+16+15)&(~15);
         vf->priv->temp= malloc(vf->priv->temp_stride*h*sizeof(int16_t));
         vf->priv->src = malloc(vf->priv->temp_stride*h*sizeof(uint8_t));
 
-	return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
+        return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
 }
 
 static void get_image(struct vf_instance *vf, mp_image_t *mpi){
@@ -459,17 +459,17 @@
     if(mpi->flags&MP_IMGFLAG_PLANAR){
         mpi->planes[1]=vf->dmpi->planes[1];
         mpi->planes[2]=vf->dmpi->planes[2];
-	mpi->stride[1]=vf->dmpi->stride[1];
-	mpi->stride[2]=vf->dmpi->stride[2];
+        mpi->stride[1]=vf->dmpi->stride[1];
+        mpi->stride[2]=vf->dmpi->stride[2];
     }
     mpi->flags|=MP_IMGFLAG_DIRECT;
 }
 
 static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts){
-	mp_image_t *dmpi;
+        mp_image_t *dmpi;
 
-	if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
-		// no DR, so get a new image! hope we'll get DR buffer:
+        if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
+                // no DR, so get a new image! hope we'll get DR buffer:
                 dmpi=vf_get_image(vf->next,mpi->imgfmt,
                     MP_IMGTYPE_TEMP,
                     MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
@@ -491,63 +491,63 @@
                 vf->priv->non_b_qp= malloc(w*h);
             fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h);
         }
-	if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
+        if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
             char *qp_tab= vf->priv->non_b_qp;
             if((vf->priv->mode&4) || !qp_tab)
                 qp_tab= mpi->qscale;
 
-	    if(qp_tab || vf->priv->qp){
-		filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0], mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
-		filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
-		filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
-	    }else{
-		memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
-		memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
-		memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
-	    }
-	}
+            if(qp_tab || vf->priv->qp){
+                filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0], mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
+                filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
+                filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
+            }else{
+                memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
+                memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
+                memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
+            }
+        }
 
 #if HAVE_MMX
-	if(gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
+        if(gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
 #endif
 #if HAVE_MMX2
-	if(gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
+        if(gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
 #endif
 
-	return vf_next_put_image(vf,dmpi, pts);
+        return vf_next_put_image(vf,dmpi, pts);
 }
 
 static void uninit(struct vf_instance *vf){
-	if(!vf->priv) return;
+        if(!vf->priv) return;
 
-	free(vf->priv->temp);
-	vf->priv->temp= NULL;
-	free(vf->priv->src);
-	vf->priv->src= NULL;
+        free(vf->priv->temp);
+        vf->priv->temp= NULL;
+        free(vf->priv->src);
+        vf->priv->src= NULL;
         free(vf->priv->avctx);
         vf->priv->avctx= NULL;
         free(vf->priv->non_b_qp);
         vf->priv->non_b_qp= NULL;
 
-	free(vf->priv);
-	vf->priv=NULL;
+        free(vf->priv);
+        vf->priv=NULL;
 }
 
 //===========================================================================//
 static int query_format(struct vf_instance *vf, unsigned int fmt){
     switch(fmt){
-	case IMGFMT_YVU9:
-	case IMGFMT_IF09:
-	case IMGFMT_YV12:
-	case IMGFMT_I420:
-	case IMGFMT_IYUV:
-	case IMGFMT_CLPL:
-	case IMGFMT_Y800:
-	case IMGFMT_Y8:
-	case IMGFMT_444P:
-	case IMGFMT_422P:
-	case IMGFMT_411P:
-	    return vf_next_query_format(vf,fmt);
+        case IMGFMT_YVU9:
+        case IMGFMT_IF09:
+        case IMGFMT_YV12:
+        case IMGFMT_I420:
+        case IMGFMT_IYUV:
+        case IMGFMT_CLPL:
+        case IMGFMT_Y800:
+        case IMGFMT_Y8:
+        case IMGFMT_444P:
+        case IMGFMT_422P:
+        case IMGFMT_411P:
+            return vf_next_query_format(vf,fmt);
     }
     return 0;
 }
@@ -555,10 +555,10 @@
 static int control(struct vf_instance *vf, int request, void* data){
     switch(request){
     case VFCTRL_QUERY_MAX_PP_LEVEL:
-	return 6;
+        return 6;
     case VFCTRL_SET_PP_LEVEL:
-	vf->priv->log2_count= *((unsigned int*)data);
-	return CONTROL_TRUE;
+        vf->priv->log2_count= *((unsigned int*)data);
+        return CONTROL_TRUE;
     }
     return vf_next_control(vf,request,data);
 }
@@ -593,17 +593,17 @@
 
     switch(vf->priv->mode&3){
         default:
-	case 0: requantize= hardthresh_c; break;
-	case 1: requantize= softthresh_c; break;
+        case 0: requantize= hardthresh_c; break;
+        case 1: requantize= softthresh_c; break;
     }
 
 #if HAVE_MMX
     if(gCpuCaps.hasMMX){
-	store_slice= store_slice_mmx;
-	switch(vf->priv->mode&3){
-	    case 0: requantize= hardthresh_mmx; break;
-	    case 1: requantize= softthresh_mmx; break;
-	}
+        store_slice= store_slice_mmx;
+        switch(vf->priv->mode&3){
+            case 0: requantize= hardthresh_mmx; break;
+            case 1: requantize= softthresh_mmx; break;
+        }
     }
 #endif