diff i386/dsputil_mmx.c @ 2979:bfabfdf9ce55 libavcodec

COSMETICS: tabs --> spaces, some prettyprinting
author diego
date Thu, 22 Dec 2005 01:10:11 +0000
parents ef2149182f1c
children 0b546eab515d
--- a/i386/dsputil_mmx.c	Wed Dec 21 17:50:40 2005 +0000
+++ b/i386/dsputil_mmx.c	Thu Dec 22 01:10:11 2005 +0000
@@ -89,56 +89,56 @@
 // first argument is unmodified and second is trashed
 // regfe is supposed to contain 0xfefefefefefefefe
 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
-    "movq " #rega ", " #regr "	\n\t"\
-    "pand " #regb ", " #regr "	\n\t"\
-    "pxor " #rega ", " #regb "	\n\t"\
-    "pand " #regfe "," #regb "	\n\t"\
-    "psrlq $1, " #regb " 	\n\t"\
-    "paddb " #regb ", " #regr "	\n\t"
+    "movq " #rega ", " #regr "  \n\t"\
+    "pand " #regb ", " #regr "  \n\t"\
+    "pxor " #rega ", " #regb "  \n\t"\
+    "pand " #regfe "," #regb "  \n\t"\
+    "psrlq $1, " #regb "        \n\t"\
+    "paddb " #regb ", " #regr " \n\t"
 
 #define PAVGB_MMX(rega, regb, regr, regfe) \
-    "movq " #rega ", " #regr "	\n\t"\
-    "por  " #regb ", " #regr "	\n\t"\
-    "pxor " #rega ", " #regb "	\n\t"\
-    "pand " #regfe "," #regb "	\n\t"\
-    "psrlq $1, " #regb "	\n\t"\
-    "psubb " #regb ", " #regr "	\n\t"
+    "movq " #rega ", " #regr "  \n\t"\
+    "por  " #regb ", " #regr "  \n\t"\
+    "pxor " #rega ", " #regb "  \n\t"\
+    "pand " #regfe "," #regb "  \n\t"\
+    "psrlq $1, " #regb "        \n\t"\
+    "psubb " #regb ", " #regr " \n\t"
 
 // mm6 is supposed to contain 0xfefefefefefefefe
 #define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
-    "movq " #rega ", " #regr "	\n\t"\
-    "movq " #regc ", " #regp "	\n\t"\
-    "pand " #regb ", " #regr "	\n\t"\
-    "pand " #regd ", " #regp "	\n\t"\
-    "pxor " #rega ", " #regb "	\n\t"\
-    "pxor " #regc ", " #regd "	\n\t"\
-    "pand %%mm6, " #regb "	\n\t"\
-    "pand %%mm6, " #regd "	\n\t"\
-    "psrlq $1, " #regb " 	\n\t"\
-    "psrlq $1, " #regd " 	\n\t"\
-    "paddb " #regb ", " #regr "	\n\t"\
-    "paddb " #regd ", " #regp "	\n\t"
+    "movq " #rega ", " #regr "  \n\t"\
+    "movq " #regc ", " #regp "  \n\t"\
+    "pand " #regb ", " #regr "  \n\t"\
+    "pand " #regd ", " #regp "  \n\t"\
+    "pxor " #rega ", " #regb "  \n\t"\
+    "pxor " #regc ", " #regd "  \n\t"\
+    "pand %%mm6, " #regb "      \n\t"\
+    "pand %%mm6, " #regd "      \n\t"\
+    "psrlq $1, " #regb "        \n\t"\
+    "psrlq $1, " #regd "        \n\t"\
+    "paddb " #regb ", " #regr " \n\t"\
+    "paddb " #regd ", " #regp " \n\t"
 
 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
-    "movq " #rega ", " #regr "	\n\t"\
-    "movq " #regc ", " #regp "	\n\t"\
-    "por  " #regb ", " #regr "	\n\t"\
-    "por  " #regd ", " #regp "	\n\t"\
-    "pxor " #rega ", " #regb "	\n\t"\
-    "pxor " #regc ", " #regd "	\n\t"\
-    "pand %%mm6, " #regb "     	\n\t"\
-    "pand %%mm6, " #regd "     	\n\t"\
-    "psrlq $1, " #regd "	\n\t"\
-    "psrlq $1, " #regb "	\n\t"\
-    "psubb " #regb ", " #regr "	\n\t"\
-    "psubb " #regd ", " #regp "	\n\t"
+    "movq " #rega ", " #regr "  \n\t"\
+    "movq " #regc ", " #regp "  \n\t"\
+    "por  " #regb ", " #regr "  \n\t"\
+    "por  " #regd ", " #regp "  \n\t"\
+    "pxor " #rega ", " #regb "  \n\t"\
+    "pxor " #regc ", " #regd "  \n\t"\
+    "pand %%mm6, " #regb "      \n\t"\
+    "pand %%mm6, " #regd "      \n\t"\
+    "psrlq $1, " #regd "        \n\t"\
+    "psrlq $1, " #regb "        \n\t"\
+    "psubb " #regb ", " #regr " \n\t"\
+    "psubb " #regd ", " #regp " \n\t"
 
 /***********************************/
 /* MMX no rounding */
 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
 #define SET_RND  MOVQ_WONE
-#define PAVGBP(a, b, c, d, e, f)	PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
-#define PAVGB(a, b, c, e)		PAVGB_MMX_NO_RND(a, b, c, e)
+#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
 
 #include "dsputil_mmx_rnd.h"
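
For reference, the bit trick behind the PAVGB macros above: a+b equals 2*(a&b) + (a^b), so the truncating byte average is (a & b) + (((a ^ b) & 0xfe) >> 1), and the rounding-up form is (a | b) - (((a ^ b) & 0xfe) >> 1). The 0xfe mask (the 0xfefefefefefefefe constant kept in regfe/mm6) clears each byte's low bit before psrlq so no bit leaks across byte boundaries inside the packed quadword. A scalar C sketch, one byte at a time where the MMX macros do eight in parallel (illustrative only, not part of this change; <stdint.h> types assumed here and in the sketches below):

    #include <stdint.h>

    static uint8_t avg_no_rnd(uint8_t a, uint8_t b)  /* PAVGB_MMX_NO_RND */
    {
        return (a & b) + (((a ^ b) & 0xfe) >> 1);    /* rounds down */
    }

    static uint8_t avg_rnd(uint8_t a, uint8_t b)     /* PAVGB_MMX */
    {
        return (a | b) - (((a ^ b) & 0xfe) >> 1);    /* rounds up */
    }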
 
@@ -151,8 +151,8 @@
 
 #define DEF(x, y) x ## _ ## y ##_mmx
 #define SET_RND  MOVQ_WTWO
-#define PAVGBP(a, b, c, d, e, f)	PAVGBP_MMX(a, b, c, d, e, f)
-#define PAVGB(a, b, c, e)		PAVGB_MMX(a, b, c, e)
+#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
 
 #include "dsputil_mmx_rnd.h"
 
@@ -193,25 +193,25 @@
 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
 {
     asm volatile(
-        "mov $-128, %%"REG_a"	\n\t"
-        "pxor %%mm7, %%mm7	\n\t"
-        ".balign 16		\n\t"
-        "1:			\n\t"
-        "movq (%0), %%mm0	\n\t"
-        "movq (%0, %2), %%mm2	\n\t"
-        "movq %%mm0, %%mm1	\n\t"
-        "movq %%mm2, %%mm3	\n\t"
-        "punpcklbw %%mm7, %%mm0	\n\t"
-        "punpckhbw %%mm7, %%mm1	\n\t"
-        "punpcklbw %%mm7, %%mm2	\n\t"
-        "punpckhbw %%mm7, %%mm3	\n\t"
-        "movq %%mm0, (%1, %%"REG_a")\n\t"
-        "movq %%mm1, 8(%1, %%"REG_a")\n\t"
-        "movq %%mm2, 16(%1, %%"REG_a")\n\t"
-        "movq %%mm3, 24(%1, %%"REG_a")\n\t"
-        "add %3, %0		\n\t"
-        "add $32, %%"REG_a"	\n\t"
-        "js 1b			\n\t"
+        "mov $-128, %%"REG_a"           \n\t"
+        "pxor %%mm7, %%mm7              \n\t"
+        ".balign 16                     \n\t"
+        "1:                             \n\t"
+        "movq (%0), %%mm0               \n\t"
+        "movq (%0, %2), %%mm2           \n\t"
+        "movq %%mm0, %%mm1              \n\t"
+        "movq %%mm2, %%mm3              \n\t"
+        "punpcklbw %%mm7, %%mm0         \n\t"
+        "punpckhbw %%mm7, %%mm1         \n\t"
+        "punpcklbw %%mm7, %%mm2         \n\t"
+        "punpckhbw %%mm7, %%mm3         \n\t"
+        "movq %%mm0, (%1, %%"REG_a")    \n\t"
+        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
+        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
+        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
+        "add %3, %0                     \n\t"
+        "add $32, %%"REG_a"             \n\t"
+        "js 1b                          \n\t"
         : "+r" (pixels)
         : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
         : "%"REG_a
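
In scalar terms, get_pixels_mmx widens an 8x8 block of unsigned bytes into signed 16-bit DCTELEMs: punpcklbw/punpckhbw against the zeroed mm7 expand each byte to a word, and every iteration stores two rows (32 bytes of output). A sketch of the equivalent (the _c name is illustrative):

    /* scalar equivalent of the loop above (sketch) */
    static void get_pixels_c(DCTELEM *block, const uint8_t *pixels,
                             int line_size)
    {
        int i, j;
        for (i = 0; i < 8; i++) {
            for (j = 0; j < 8; j++)
                block[i * 8 + j] = pixels[j];
            pixels += line_size;
        }
    }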
@@ -221,26 +221,26 @@
 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
 {
     asm volatile(
-        "pxor %%mm7, %%mm7	\n\t"
-        "mov $-128, %%"REG_a"	\n\t"
-        ".balign 16		\n\t"
-        "1:			\n\t"
-        "movq (%0), %%mm0	\n\t"
-        "movq (%1), %%mm2	\n\t"
-        "movq %%mm0, %%mm1	\n\t"
-        "movq %%mm2, %%mm3	\n\t"
-        "punpcklbw %%mm7, %%mm0	\n\t"
-        "punpckhbw %%mm7, %%mm1	\n\t"
-        "punpcklbw %%mm7, %%mm2	\n\t"
-        "punpckhbw %%mm7, %%mm3	\n\t"
-        "psubw %%mm2, %%mm0	\n\t"
-        "psubw %%mm3, %%mm1	\n\t"
-        "movq %%mm0, (%2, %%"REG_a")\n\t"
-        "movq %%mm1, 8(%2, %%"REG_a")\n\t"
-        "add %3, %0		\n\t"
-        "add %3, %1		\n\t"
-        "add $16, %%"REG_a"	\n\t"
-        "jnz 1b			\n\t"
+        "pxor %%mm7, %%mm7              \n\t"
+        "mov $-128, %%"REG_a"           \n\t"
+        ".balign 16                     \n\t"
+        "1:                             \n\t"
+        "movq (%0), %%mm0               \n\t"
+        "movq (%1), %%mm2               \n\t"
+        "movq %%mm0, %%mm1              \n\t"
+        "movq %%mm2, %%mm3              \n\t"
+        "punpcklbw %%mm7, %%mm0         \n\t"
+        "punpckhbw %%mm7, %%mm1         \n\t"
+        "punpcklbw %%mm7, %%mm2         \n\t"
+        "punpckhbw %%mm7, %%mm3         \n\t"
+        "psubw %%mm2, %%mm0             \n\t"
+        "psubw %%mm3, %%mm1             \n\t"
+        "movq %%mm0, (%2, %%"REG_a")    \n\t"
+        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
+        "add %3, %0                     \n\t"
+        "add %3, %1                     \n\t"
+        "add $16, %%"REG_a"             \n\t"
+        "jnz 1b                         \n\t"
         : "+r" (s1), "+r" (s2)
         : "r" (block+64), "r" ((long)stride)
         : "%"REG_a
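
diff_pixels_mmx is the same widening pattern applied to a difference; a scalar sketch (name illustrative):

    /* scalar sketch: block[i][j] = s1[j] - s2[j], widened to 16 bits */
    static void diff_pixels_c(DCTELEM *block, const uint8_t *s1,
                              const uint8_t *s2, int stride)
    {
        int i, j;
        for (i = 0; i < 8; i++) {
            for (j = 0; j < 8; j++)
                block[i * 8 + j] = s1[j] - s2[j];
            s1 += stride;
            s2 += stride;
        }
    }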
@@ -257,25 +257,25 @@
     p = block;
     pix = pixels;
     /* unrolled loop */
-	__asm __volatile(
-		"movq	%3, %%mm0\n\t"
-		"movq	8%3, %%mm1\n\t"
-		"movq	16%3, %%mm2\n\t"
-		"movq	24%3, %%mm3\n\t"
-		"movq	32%3, %%mm4\n\t"
-		"movq	40%3, %%mm5\n\t"
-		"movq	48%3, %%mm6\n\t"
-		"movq	56%3, %%mm7\n\t"
-		"packuswb %%mm1, %%mm0\n\t"
-		"packuswb %%mm3, %%mm2\n\t"
-		"packuswb %%mm5, %%mm4\n\t"
-		"packuswb %%mm7, %%mm6\n\t"
-		"movq	%%mm0, (%0)\n\t"
-		"movq	%%mm2, (%0, %1)\n\t"
-		"movq	%%mm4, (%0, %1, 2)\n\t"
-		"movq	%%mm6, (%0, %2)\n\t"
-		::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
-		:"memory");
+        __asm __volatile(
+                "movq   %3, %%mm0               \n\t"
+                "movq   8%3, %%mm1              \n\t"
+                "movq   16%3, %%mm2             \n\t"
+                "movq   24%3, %%mm3             \n\t"
+                "movq   32%3, %%mm4             \n\t"
+                "movq   40%3, %%mm5             \n\t"
+                "movq   48%3, %%mm6             \n\t"
+                "movq   56%3, %%mm7             \n\t"
+                "packuswb %%mm1, %%mm0          \n\t"
+                "packuswb %%mm3, %%mm2          \n\t"
+                "packuswb %%mm5, %%mm4          \n\t"
+                "packuswb %%mm7, %%mm6          \n\t"
+                "movq   %%mm0, (%0)             \n\t"
+                "movq   %%mm2, (%0, %1)         \n\t"
+                "movq   %%mm4, (%0, %1, 2)      \n\t"
+                "movq   %%mm6, (%0, %2)         \n\t"
+                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
+                :"memory");
         pix += line_size*4;
         p += 32;
 
@@ -283,24 +283,24 @@
     // compiler would generate some very strange code
     // thus using "r"
     __asm __volatile(
-	    "movq	(%3), %%mm0\n\t"
-	    "movq	8(%3), %%mm1\n\t"
-	    "movq	16(%3), %%mm2\n\t"
-	    "movq	24(%3), %%mm3\n\t"
-	    "movq	32(%3), %%mm4\n\t"
-	    "movq	40(%3), %%mm5\n\t"
-	    "movq	48(%3), %%mm6\n\t"
-	    "movq	56(%3), %%mm7\n\t"
-	    "packuswb %%mm1, %%mm0\n\t"
-	    "packuswb %%mm3, %%mm2\n\t"
-	    "packuswb %%mm5, %%mm4\n\t"
-	    "packuswb %%mm7, %%mm6\n\t"
-	    "movq	%%mm0, (%0)\n\t"
-	    "movq	%%mm2, (%0, %1)\n\t"
-	    "movq	%%mm4, (%0, %1, 2)\n\t"
-	    "movq	%%mm6, (%0, %2)\n\t"
-	    ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
-	    :"memory");
+            "movq       (%3), %%mm0             \n\t"
+            "movq       8(%3), %%mm1            \n\t"
+            "movq       16(%3), %%mm2           \n\t"
+            "movq       24(%3), %%mm3           \n\t"
+            "movq       32(%3), %%mm4           \n\t"
+            "movq       40(%3), %%mm5           \n\t"
+            "movq       48(%3), %%mm6           \n\t"
+            "movq       56(%3), %%mm7           \n\t"
+            "packuswb %%mm1, %%mm0              \n\t"
+            "packuswb %%mm3, %%mm2              \n\t"
+            "packuswb %%mm5, %%mm4              \n\t"
+            "packuswb %%mm7, %%mm6              \n\t"
+            "movq       %%mm0, (%0)             \n\t"
+            "movq       %%mm2, (%0, %1)         \n\t"
+            "movq       %%mm4, (%0, %1, 2)      \n\t"
+            "movq       %%mm6, (%0, %2)         \n\t"
+            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
+            :"memory");
 }
 
 static const unsigned char __align8 vector128[8] =
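
The packuswb instructions in the two blocks above do the clamping: each signed 16-bit coefficient is saturated into [0,255] while being packed back to bytes. A scalar sketch of the routine (names illustrative):

    static uint8_t clamp_u8(int v)   /* what packuswb does per value */
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* scalar sketch: store an 8x8 block of DCTELEMs as clamped bytes */
    static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels,
                                     int line_size)
    {
        int i, j;
        for (i = 0; i < 8; i++) {
            for (j = 0; j < 8; j++)
                pixels[j] = clamp_u8(block[i * 8 + j]);
            pixels += line_size;
        }
    }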
@@ -333,30 +333,30 @@
     MOVQ_ZERO(mm7);
     i = 4;
     do {
-	__asm __volatile(
-		"movq	(%2), %%mm0\n\t"
-		"movq	8(%2), %%mm1\n\t"
-		"movq	16(%2), %%mm2\n\t"
-		"movq	24(%2), %%mm3\n\t"
-		"movq	%0, %%mm4\n\t"
-		"movq	%1, %%mm6\n\t"
-		"movq	%%mm4, %%mm5\n\t"
-		"punpcklbw %%mm7, %%mm4\n\t"
-		"punpckhbw %%mm7, %%mm5\n\t"
-		"paddsw	%%mm4, %%mm0\n\t"
-		"paddsw	%%mm5, %%mm1\n\t"
-		"movq	%%mm6, %%mm5\n\t"
-		"punpcklbw %%mm7, %%mm6\n\t"
-		"punpckhbw %%mm7, %%mm5\n\t"
-		"paddsw	%%mm6, %%mm2\n\t"
-		"paddsw	%%mm5, %%mm3\n\t"
-		"packuswb %%mm1, %%mm0\n\t"
-		"packuswb %%mm3, %%mm2\n\t"
-		"movq	%%mm0, %0\n\t"
-		"movq	%%mm2, %1\n\t"
-		:"+m"(*pix), "+m"(*(pix+line_size))
-		:"r"(p)
-		:"memory");
+        __asm __volatile(
+                "movq   (%2), %%mm0     \n\t"
+                "movq   8(%2), %%mm1    \n\t"
+                "movq   16(%2), %%mm2   \n\t"
+                "movq   24(%2), %%mm3   \n\t"
+                "movq   %0, %%mm4       \n\t"
+                "movq   %1, %%mm6       \n\t"
+                "movq   %%mm4, %%mm5    \n\t"
+                "punpcklbw %%mm7, %%mm4 \n\t"
+                "punpckhbw %%mm7, %%mm5 \n\t"
+                "paddsw %%mm4, %%mm0    \n\t"
+                "paddsw %%mm5, %%mm1    \n\t"
+                "movq   %%mm6, %%mm5    \n\t"
+                "punpcklbw %%mm7, %%mm6 \n\t"
+                "punpckhbw %%mm7, %%mm5 \n\t"
+                "paddsw %%mm6, %%mm2    \n\t"
+                "paddsw %%mm5, %%mm3    \n\t"
+                "packuswb %%mm1, %%mm0  \n\t"
+                "packuswb %%mm3, %%mm2  \n\t"
+                "movq   %%mm0, %0       \n\t"
+                "movq   %%mm2, %1       \n\t"
+                :"+m"(*pix), "+m"(*(pix+line_size))
+                :"r"(p)
+                :"memory");
         pix += line_size*2;
         p += 16;
     } while (--i);
@@ -365,101 +365,101 @@
 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	 "lea (%3, %3), %%"REG_a"	\n\t"
-	 ".balign 8			\n\t"
-	 "1:				\n\t"
-	 "movd (%1), %%mm0		\n\t"
-	 "movd (%1, %3), %%mm1		\n\t"
-	 "movd %%mm0, (%2)		\n\t"
-	 "movd %%mm1, (%2, %3)		\n\t"
-	 "add %%"REG_a", %1		\n\t"
-	 "add %%"REG_a", %2		\n\t"
-	 "movd (%1), %%mm0		\n\t"
-	 "movd (%1, %3), %%mm1		\n\t"
-	 "movd %%mm0, (%2)		\n\t"
-	 "movd %%mm1, (%2, %3)		\n\t"
-	 "add %%"REG_a", %1		\n\t"
-	 "add %%"REG_a", %2		\n\t"
-	 "subl $4, %0			\n\t"
-	 "jnz 1b			\n\t"
-	 : "+g"(h), "+r" (pixels),  "+r" (block)
-	 : "r"((long)line_size)
-	 : "%"REG_a, "memory"
-	);
+         "lea (%3, %3), %%"REG_a"       \n\t"
+         ".balign 8                     \n\t"
+         "1:                            \n\t"
+         "movd (%1), %%mm0              \n\t"
+         "movd (%1, %3), %%mm1          \n\t"
+         "movd %%mm0, (%2)              \n\t"
+         "movd %%mm1, (%2, %3)          \n\t"
+         "add %%"REG_a", %1             \n\t"
+         "add %%"REG_a", %2             \n\t"
+         "movd (%1), %%mm0              \n\t"
+         "movd (%1, %3), %%mm1          \n\t"
+         "movd %%mm0, (%2)              \n\t"
+         "movd %%mm1, (%2, %3)          \n\t"
+         "add %%"REG_a", %1             \n\t"
+         "add %%"REG_a", %2             \n\t"
+         "subl $4, %0                   \n\t"
+         "jnz 1b                        \n\t"
+         : "+g"(h), "+r" (pixels),  "+r" (block)
+         : "r"((long)line_size)
+         : "%"REG_a, "memory"
+        );
 }
 
 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	 "lea (%3, %3), %%"REG_a"	\n\t"
-	 ".balign 8			\n\t"
-	 "1:				\n\t"
-	 "movq (%1), %%mm0		\n\t"
-	 "movq (%1, %3), %%mm1		\n\t"
-     	 "movq %%mm0, (%2)		\n\t"
-	 "movq %%mm1, (%2, %3)		\n\t"
-	 "add %%"REG_a", %1		\n\t"
-	 "add %%"REG_a", %2		\n\t"
-	 "movq (%1), %%mm0		\n\t"
-	 "movq (%1, %3), %%mm1		\n\t"
-	 "movq %%mm0, (%2)		\n\t"
-	 "movq %%mm1, (%2, %3)		\n\t"
-	 "add %%"REG_a", %1		\n\t"
-	 "add %%"REG_a", %2		\n\t"
-	 "subl $4, %0			\n\t"
-	 "jnz 1b			\n\t"
-	 : "+g"(h), "+r" (pixels),  "+r" (block)
-	 : "r"((long)line_size)
-	 : "%"REG_a, "memory"
-	);
+         "lea (%3, %3), %%"REG_a"       \n\t"
+         ".balign 8                     \n\t"
+         "1:                            \n\t"
+         "movq (%1), %%mm0              \n\t"
+         "movq (%1, %3), %%mm1          \n\t"
+         "movq %%mm0, (%2)              \n\t"
+         "movq %%mm1, (%2, %3)          \n\t"
+         "add %%"REG_a", %1             \n\t"
+         "add %%"REG_a", %2             \n\t"
+         "movq (%1), %%mm0              \n\t"
+         "movq (%1, %3), %%mm1          \n\t"
+         "movq %%mm0, (%2)              \n\t"
+         "movq %%mm1, (%2, %3)          \n\t"
+         "add %%"REG_a", %1             \n\t"
+         "add %%"REG_a", %2             \n\t"
+         "subl $4, %0                   \n\t"
+         "jnz 1b                        \n\t"
+         : "+g"(h), "+r" (pixels),  "+r" (block)
+         : "r"((long)line_size)
+         : "%"REG_a, "memory"
+        );
 }
 
 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	 "lea (%3, %3), %%"REG_a"	\n\t"
-	 ".balign 8			\n\t"
-	 "1:				\n\t"
-	 "movq (%1), %%mm0		\n\t"
-	 "movq 8(%1), %%mm4		\n\t"
-	 "movq (%1, %3), %%mm1		\n\t"
-	 "movq 8(%1, %3), %%mm5		\n\t"
-     	 "movq %%mm0, (%2)		\n\t"
-     	 "movq %%mm4, 8(%2)		\n\t"
-	 "movq %%mm1, (%2, %3)		\n\t"
-	 "movq %%mm5, 8(%2, %3)		\n\t"
-	 "add %%"REG_a", %1		\n\t"
-	 "add %%"REG_a", %2       	\n\t"
-	 "movq (%1), %%mm0		\n\t"
-	 "movq 8(%1), %%mm4		\n\t"
-	 "movq (%1, %3), %%mm1		\n\t"
-	 "movq 8(%1, %3), %%mm5		\n\t"
-	 "movq %%mm0, (%2)		\n\t"
-	 "movq %%mm4, 8(%2)		\n\t"
-	 "movq %%mm1, (%2, %3)		\n\t"
-	 "movq %%mm5, 8(%2, %3)		\n\t"
-	 "add %%"REG_a", %1		\n\t"
-	 "add %%"REG_a", %2       	\n\t"
-	 "subl $4, %0			\n\t"
-	 "jnz 1b			\n\t"
-	 : "+g"(h), "+r" (pixels),  "+r" (block)
-	 : "r"((long)line_size)
-	 : "%"REG_a, "memory"
-	);
+         "lea (%3, %3), %%"REG_a"       \n\t"
+         ".balign 8                     \n\t"
+         "1:                            \n\t"
+         "movq (%1), %%mm0              \n\t"
+         "movq 8(%1), %%mm4             \n\t"
+         "movq (%1, %3), %%mm1          \n\t"
+         "movq 8(%1, %3), %%mm5         \n\t"
+         "movq %%mm0, (%2)              \n\t"
+         "movq %%mm4, 8(%2)             \n\t"
+         "movq %%mm1, (%2, %3)          \n\t"
+         "movq %%mm5, 8(%2, %3)         \n\t"
+         "add %%"REG_a", %1             \n\t"
+         "add %%"REG_a", %2             \n\t"
+         "movq (%1), %%mm0              \n\t"
+         "movq 8(%1), %%mm4             \n\t"
+         "movq (%1, %3), %%mm1          \n\t"
+         "movq 8(%1, %3), %%mm5         \n\t"
+         "movq %%mm0, (%2)              \n\t"
+         "movq %%mm4, 8(%2)             \n\t"
+         "movq %%mm1, (%2, %3)          \n\t"
+         "movq %%mm5, 8(%2, %3)         \n\t"
+         "add %%"REG_a", %1             \n\t"
+         "add %%"REG_a", %2             \n\t"
+         "subl $4, %0                   \n\t"
+         "jnz 1b                        \n\t"
+         : "+g"(h), "+r" (pixels),  "+r" (block)
+         : "r"((long)line_size)
+         : "%"REG_a, "memory"
+        );
 }
 
 static void clear_blocks_mmx(DCTELEM *blocks)
 {
     __asm __volatile(
-                "pxor %%mm7, %%mm7		\n\t"
-                "mov $-128*6, %%"REG_a"	\n\t"
-                "1:				\n\t"
-                "movq %%mm7, (%0, %%"REG_a")	\n\t"
-                "movq %%mm7, 8(%0, %%"REG_a")	\n\t"
-                "movq %%mm7, 16(%0, %%"REG_a")	\n\t"
-                "movq %%mm7, 24(%0, %%"REG_a")	\n\t"
-                "add $32, %%"REG_a"		\n\t"
-                " js 1b				\n\t"
+                "pxor %%mm7, %%mm7              \n\t"
+                "mov $-128*6, %%"REG_a"         \n\t"
+                "1:                             \n\t"
+                "movq %%mm7, (%0, %%"REG_a")    \n\t"
+                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
+                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
+                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
+                "add $32, %%"REG_a"             \n\t"
+                " js 1b                         \n\t"
                 : : "r" (((uint8_t *)blocks)+128*6)
                 : "%"REG_a
         );
@@ -472,31 +472,31 @@
     long index= -line_size*h;
 
     __asm __volatile(
-                "pxor %%mm7, %%mm7		\n\t"
-                "pxor %%mm6, %%mm6		\n\t"
-                "1:				\n\t"
-                "movq (%2, %1), %%mm0		\n\t"
-                "movq (%2, %1), %%mm1		\n\t"
-                "movq 8(%2, %1), %%mm2		\n\t"
-                "movq 8(%2, %1), %%mm3		\n\t"
-                "punpcklbw %%mm7, %%mm0		\n\t"
-                "punpckhbw %%mm7, %%mm1		\n\t"
-                "punpcklbw %%mm7, %%mm2		\n\t"
-                "punpckhbw %%mm7, %%mm3		\n\t"
-                "paddw %%mm0, %%mm1		\n\t"
-                "paddw %%mm2, %%mm3		\n\t"
-                "paddw %%mm1, %%mm3		\n\t"
-                "paddw %%mm3, %%mm6		\n\t"
-                "add %3, %1			\n\t"
-                " js 1b				\n\t"
-                "movq %%mm6, %%mm5		\n\t"
-                "psrlq $32, %%mm6		\n\t"
-                "paddw %%mm5, %%mm6		\n\t"
-                "movq %%mm6, %%mm5		\n\t"
-                "psrlq $16, %%mm6		\n\t"
-                "paddw %%mm5, %%mm6		\n\t"
-                "movd %%mm6, %0			\n\t"
-                "andl $0xFFFF, %0		\n\t"
+                "pxor %%mm7, %%mm7              \n\t"
+                "pxor %%mm6, %%mm6              \n\t"
+                "1:                             \n\t"
+                "movq (%2, %1), %%mm0           \n\t"
+                "movq (%2, %1), %%mm1           \n\t"
+                "movq 8(%2, %1), %%mm2          \n\t"
+                "movq 8(%2, %1), %%mm3          \n\t"
+                "punpcklbw %%mm7, %%mm0         \n\t"
+                "punpckhbw %%mm7, %%mm1         \n\t"
+                "punpcklbw %%mm7, %%mm2         \n\t"
+                "punpckhbw %%mm7, %%mm3         \n\t"
+                "paddw %%mm0, %%mm1             \n\t"
+                "paddw %%mm2, %%mm3             \n\t"
+                "paddw %%mm1, %%mm3             \n\t"
+                "paddw %%mm3, %%mm6             \n\t"
+                "add %3, %1                     \n\t"
+                " js 1b                         \n\t"
+                "movq %%mm6, %%mm5              \n\t"
+                "psrlq $32, %%mm6               \n\t"
+                "paddw %%mm5, %%mm6             \n\t"
+                "movq %%mm6, %%mm5              \n\t"
+                "psrlq $16, %%mm6               \n\t"
+                "paddw %%mm5, %%mm6             \n\t"
+                "movd %%mm6, %0                 \n\t"
+                "andl $0xFFFF, %0               \n\t"
                 : "=&r" (sum), "+r" (index)
                 : "r" (pix - index), "r" ((long)line_size)
         );
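
The loop above sums a 16-pixel-wide block: each row is widened to words and accumulated into mm6, the two psrlq/paddw steps fold the four 16-bit partial sums into one, and "andl $0xFFFF" extracts it (safe for the usual 16x16 use, since 256 pixels of at most 255 sum to 65280 < 65536). A scalar sketch (name illustrative):

    /* scalar sketch of the pixel-sum loop */
    static int pix_sum_c(const uint8_t *pix, int line_size, int h)
    {
        int sum = 0, i;
        while (h--) {
            for (i = 0; i < 16; i++)
                sum += pix[i];
            pix += line_size;
        }
        return sum & 0xFFFF;   /* the MMX version keeps 16-bit partials */
    }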
@@ -508,18 +508,18 @@
 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
     long i=0;
     asm volatile(
-        "1:				\n\t"
-        "movq  (%1, %0), %%mm0		\n\t"
-        "movq  (%2, %0), %%mm1		\n\t"
-        "paddb %%mm0, %%mm1		\n\t"
-        "movq %%mm1, (%2, %0)		\n\t"
-        "movq 8(%1, %0), %%mm0		\n\t"
-        "movq 8(%2, %0), %%mm1		\n\t"
-        "paddb %%mm0, %%mm1		\n\t"
-        "movq %%mm1, 8(%2, %0)		\n\t"
-        "add $16, %0			\n\t"
-        "cmp %3, %0			\n\t"
-        " jb 1b				\n\t"
+        "1:                             \n\t"
+        "movq  (%1, %0), %%mm0          \n\t"
+        "movq  (%2, %0), %%mm1          \n\t"
+        "paddb %%mm0, %%mm1             \n\t"
+        "movq %%mm1, (%2, %0)           \n\t"
+        "movq 8(%1, %0), %%mm0          \n\t"
+        "movq 8(%2, %0), %%mm1          \n\t"
+        "paddb %%mm0, %%mm1             \n\t"
+        "movq %%mm1, 8(%2, %0)          \n\t"
+        "add $16, %0                    \n\t"
+        "cmp %3, %0                     \n\t"
+        " jb 1b                         \n\t"
         : "+r" (i)
         : "r"(src), "r"(dst), "r"((long)w-15)
     );
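
add_bytes_mmx is a plain bytewise accumulate, 16 bytes per iteration; the (w-15) bound stops the vector loop early, presumably so a scalar tail (outside this hunk) can finish the last bytes. Scalar sketch:

    /* scalar sketch: dst[i] += src[i] */
    static void add_bytes_c(uint8_t *dst, const uint8_t *src, int w)
    {
        int i;
        for (i = 0; i < w; i++)
            dst[i] += src[i];
    }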
@@ -528,75 +528,75 @@
 }
 
 #define H263_LOOP_FILTER \
-        "pxor %%mm7, %%mm7		\n\t"\
-        "movq  %0, %%mm0		\n\t"\
-        "movq  %0, %%mm1		\n\t"\
-        "movq  %3, %%mm2		\n\t"\
-        "movq  %3, %%mm3		\n\t"\
-        "punpcklbw %%mm7, %%mm0		\n\t"\
-        "punpckhbw %%mm7, %%mm1		\n\t"\
-        "punpcklbw %%mm7, %%mm2		\n\t"\
-        "punpckhbw %%mm7, %%mm3		\n\t"\
-        "psubw %%mm2, %%mm0		\n\t"\
-        "psubw %%mm3, %%mm1		\n\t"\
-        "movq  %1, %%mm2		\n\t"\
-        "movq  %1, %%mm3		\n\t"\
-        "movq  %2, %%mm4		\n\t"\
-        "movq  %2, %%mm5		\n\t"\
-        "punpcklbw %%mm7, %%mm2		\n\t"\
-        "punpckhbw %%mm7, %%mm3		\n\t"\
-        "punpcklbw %%mm7, %%mm4		\n\t"\
-        "punpckhbw %%mm7, %%mm5		\n\t"\
-        "psubw %%mm2, %%mm4		\n\t"\
-        "psubw %%mm3, %%mm5		\n\t"\
-        "psllw $2, %%mm4		\n\t"\
-        "psllw $2, %%mm5		\n\t"\
-        "paddw %%mm0, %%mm4		\n\t"\
-        "paddw %%mm1, %%mm5		\n\t"\
-        "pxor %%mm6, %%mm6		\n\t"\
-        "pcmpgtw %%mm4, %%mm6		\n\t"\
-        "pcmpgtw %%mm5, %%mm7		\n\t"\
-        "pxor %%mm6, %%mm4		\n\t"\
-        "pxor %%mm7, %%mm5		\n\t"\
-        "psubw %%mm6, %%mm4		\n\t"\
-        "psubw %%mm7, %%mm5		\n\t"\
-        "psrlw $3, %%mm4		\n\t"\
-        "psrlw $3, %%mm5		\n\t"\
-        "packuswb %%mm5, %%mm4		\n\t"\
-        "packsswb %%mm7, %%mm6		\n\t"\
-        "pxor %%mm7, %%mm7		\n\t"\
-        "movd %4, %%mm2			\n\t"\
-        "punpcklbw %%mm2, %%mm2		\n\t"\
-        "punpcklbw %%mm2, %%mm2		\n\t"\
-        "punpcklbw %%mm2, %%mm2		\n\t"\
-        "psubusb %%mm4, %%mm2		\n\t"\
-        "movq %%mm2, %%mm3		\n\t"\
-        "psubusb %%mm4, %%mm3		\n\t"\
-        "psubb %%mm3, %%mm2		\n\t"\
-        "movq %1, %%mm3			\n\t"\
-        "movq %2, %%mm4			\n\t"\
-        "pxor %%mm6, %%mm3		\n\t"\
-        "pxor %%mm6, %%mm4		\n\t"\
-        "paddusb %%mm2, %%mm3		\n\t"\
-        "psubusb %%mm2, %%mm4		\n\t"\
-        "pxor %%mm6, %%mm3		\n\t"\
-        "pxor %%mm6, %%mm4		\n\t"\
-        "paddusb %%mm2, %%mm2		\n\t"\
-        "packsswb %%mm1, %%mm0		\n\t"\
-        "pcmpgtb %%mm0, %%mm7		\n\t"\
-        "pxor %%mm7, %%mm0		\n\t"\
-        "psubb %%mm7, %%mm0		\n\t"\
-        "movq %%mm0, %%mm1		\n\t"\
-        "psubusb %%mm2, %%mm0		\n\t"\
-        "psubb %%mm0, %%mm1		\n\t"\
-        "pand %5, %%mm1			\n\t"\
-        "psrlw $2, %%mm1		\n\t"\
-        "pxor %%mm7, %%mm1		\n\t"\
-        "psubb %%mm7, %%mm1		\n\t"\
-        "movq %0, %%mm5			\n\t"\
-        "movq %3, %%mm6			\n\t"\
-        "psubb %%mm1, %%mm5		\n\t"\
-        "paddb %%mm1, %%mm6		\n\t"
+        "pxor %%mm7, %%mm7              \n\t"\
+        "movq  %0, %%mm0                \n\t"\
+        "movq  %0, %%mm1                \n\t"\
+        "movq  %3, %%mm2                \n\t"\
+        "movq  %3, %%mm3                \n\t"\
+        "punpcklbw %%mm7, %%mm0         \n\t"\
+        "punpckhbw %%mm7, %%mm1         \n\t"\
+        "punpcklbw %%mm7, %%mm2         \n\t"\
+        "punpckhbw %%mm7, %%mm3         \n\t"\
+        "psubw %%mm2, %%mm0             \n\t"\
+        "psubw %%mm3, %%mm1             \n\t"\
+        "movq  %1, %%mm2                \n\t"\
+        "movq  %1, %%mm3                \n\t"\
+        "movq  %2, %%mm4                \n\t"\
+        "movq  %2, %%mm5                \n\t"\
+        "punpcklbw %%mm7, %%mm2         \n\t"\
+        "punpckhbw %%mm7, %%mm3         \n\t"\
+        "punpcklbw %%mm7, %%mm4         \n\t"\
+        "punpckhbw %%mm7, %%mm5         \n\t"\
+        "psubw %%mm2, %%mm4             \n\t"\
+        "psubw %%mm3, %%mm5             \n\t"\
+        "psllw $2, %%mm4                \n\t"\
+        "psllw $2, %%mm5                \n\t"\
+        "paddw %%mm0, %%mm4             \n\t"\
+        "paddw %%mm1, %%mm5             \n\t"\
+        "pxor %%mm6, %%mm6              \n\t"\
+        "pcmpgtw %%mm4, %%mm6           \n\t"\
+        "pcmpgtw %%mm5, %%mm7           \n\t"\
+        "pxor %%mm6, %%mm4              \n\t"\
+        "pxor %%mm7, %%mm5              \n\t"\
+        "psubw %%mm6, %%mm4             \n\t"\
+        "psubw %%mm7, %%mm5             \n\t"\
+        "psrlw $3, %%mm4                \n\t"\
+        "psrlw $3, %%mm5                \n\t"\
+        "packuswb %%mm5, %%mm4          \n\t"\
+        "packsswb %%mm7, %%mm6          \n\t"\
+        "pxor %%mm7, %%mm7              \n\t"\
+        "movd %4, %%mm2                 \n\t"\
+        "punpcklbw %%mm2, %%mm2         \n\t"\
+        "punpcklbw %%mm2, %%mm2         \n\t"\
+        "punpcklbw %%mm2, %%mm2         \n\t"\
+        "psubusb %%mm4, %%mm2           \n\t"\
+        "movq %%mm2, %%mm3              \n\t"\
+        "psubusb %%mm4, %%mm3           \n\t"\
+        "psubb %%mm3, %%mm2             \n\t"\
+        "movq %1, %%mm3                 \n\t"\
+        "movq %2, %%mm4                 \n\t"\
+        "pxor %%mm6, %%mm3              \n\t"\
+        "pxor %%mm6, %%mm4              \n\t"\
+        "paddusb %%mm2, %%mm3           \n\t"\
+        "psubusb %%mm2, %%mm4           \n\t"\
+        "pxor %%mm6, %%mm3              \n\t"\
+        "pxor %%mm6, %%mm4              \n\t"\
+        "paddusb %%mm2, %%mm2           \n\t"\
+        "packsswb %%mm1, %%mm0          \n\t"\
+        "pcmpgtb %%mm0, %%mm7           \n\t"\
+        "pxor %%mm7, %%mm0              \n\t"\
+        "psubb %%mm7, %%mm0             \n\t"\
+        "movq %%mm0, %%mm1              \n\t"\
+        "psubusb %%mm2, %%mm0           \n\t"\
+        "psubb %%mm0, %%mm1             \n\t"\
+        "pand %5, %%mm1                 \n\t"\
+        "psrlw $2, %%mm1                \n\t"\
+        "pxor %%mm7, %%mm1              \n\t"\
+        "psubb %%mm7, %%mm1             \n\t"\
+        "movq %0, %%mm5                 \n\t"\
+        "movq %3, %%mm6                 \n\t"\
+        "psubb %%mm1, %%mm5             \n\t"\
+        "paddb %%mm1, %%mm6             \n\t"
 
 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
     const int strength= ff_h263_loop_filter_strength[qscale];
@@ -605,10 +605,10 @@
 
         H263_LOOP_FILTER
 
-        "movq %%mm3, %1			\n\t"
-        "movq %%mm4, %2			\n\t"
-        "movq %%mm5, %0			\n\t"
-        "movq %%mm6, %3			\n\t"
+        "movq %%mm3, %1                 \n\t"
+        "movq %%mm4, %2                 \n\t"
+        "movq %%mm5, %0                 \n\t"
+        "movq %%mm6, %3                 \n\t"
         : "+m" (*(uint64_t*)(src - 2*stride)),
           "+m" (*(uint64_t*)(src - 1*stride)),
           "+m" (*(uint64_t*)(src + 0*stride)),
@@ -619,21 +619,21 @@
 
 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
     asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
-        "movd  %4, %%mm0		\n\t"
-        "movd  %5, %%mm1		\n\t"
-        "movd  %6, %%mm2		\n\t"
-        "movd  %7, %%mm3		\n\t"
-        "punpcklbw %%mm1, %%mm0		\n\t"
-        "punpcklbw %%mm3, %%mm2		\n\t"
-        "movq %%mm0, %%mm1		\n\t"
-        "punpcklwd %%mm2, %%mm0		\n\t"
-        "punpckhwd %%mm2, %%mm1		\n\t"
-        "movd  %%mm0, %0		\n\t"
-        "punpckhdq %%mm0, %%mm0		\n\t"
-        "movd  %%mm0, %1		\n\t"
-        "movd  %%mm1, %2		\n\t"
-        "punpckhdq %%mm1, %%mm1		\n\t"
-        "movd  %%mm1, %3		\n\t"
+        "movd  %4, %%mm0                \n\t"
+        "movd  %5, %%mm1                \n\t"
+        "movd  %6, %%mm2                \n\t"
+        "movd  %7, %%mm3                \n\t"
+        "punpcklbw %%mm1, %%mm0         \n\t"
+        "punpcklbw %%mm3, %%mm2         \n\t"
+        "movq %%mm0, %%mm1              \n\t"
+        "punpcklwd %%mm2, %%mm0         \n\t"
+        "punpckhwd %%mm2, %%mm1         \n\t"
+        "movd  %%mm0, %0                \n\t"
+        "punpckhdq %%mm0, %%mm0         \n\t"
+        "movd  %%mm0, %1                \n\t"
+        "movd  %%mm1, %2                \n\t"
+        "punpckhdq %%mm1, %%mm1         \n\t"
+        "movd  %%mm1, %3                \n\t"
 
         : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
           "=m" (*(uint32_t*)(dst + 1*dst_stride)),
@@ -666,30 +666,30 @@
     );
 
     asm volatile(
-        "movq %%mm5, %%mm1		\n\t"
-        "movq %%mm4, %%mm0		\n\t"
-        "punpcklbw %%mm3, %%mm5		\n\t"
-        "punpcklbw %%mm6, %%mm4		\n\t"
-        "punpckhbw %%mm3, %%mm1		\n\t"
-        "punpckhbw %%mm6, %%mm0		\n\t"
-        "movq %%mm5, %%mm3		\n\t"
-        "movq %%mm1, %%mm6		\n\t"
-        "punpcklwd %%mm4, %%mm5		\n\t"
-        "punpcklwd %%mm0, %%mm1		\n\t"
-        "punpckhwd %%mm4, %%mm3		\n\t"
-        "punpckhwd %%mm0, %%mm6		\n\t"
-        "movd %%mm5, (%0)		\n\t"
-        "punpckhdq %%mm5, %%mm5		\n\t"
-        "movd %%mm5, (%0,%2)		\n\t"
-        "movd %%mm3, (%0,%2,2)		\n\t"
-        "punpckhdq %%mm3, %%mm3		\n\t"
-        "movd %%mm3, (%0,%3)		\n\t"
-        "movd %%mm1, (%1)		\n\t"
-        "punpckhdq %%mm1, %%mm1		\n\t"
-        "movd %%mm1, (%1,%2)		\n\t"
-        "movd %%mm6, (%1,%2,2)		\n\t"
-        "punpckhdq %%mm6, %%mm6		\n\t"
-        "movd %%mm6, (%1,%3)		\n\t"
+        "movq %%mm5, %%mm1              \n\t"
+        "movq %%mm4, %%mm0              \n\t"
+        "punpcklbw %%mm3, %%mm5         \n\t"
+        "punpcklbw %%mm6, %%mm4         \n\t"
+        "punpckhbw %%mm3, %%mm1         \n\t"
+        "punpckhbw %%mm6, %%mm0         \n\t"
+        "movq %%mm5, %%mm3              \n\t"
+        "movq %%mm1, %%mm6              \n\t"
+        "punpcklwd %%mm4, %%mm5         \n\t"
+        "punpcklwd %%mm0, %%mm1         \n\t"
+        "punpckhwd %%mm4, %%mm3         \n\t"
+        "punpckhwd %%mm0, %%mm6         \n\t"
+        "movd %%mm5, (%0)               \n\t"
+        "punpckhdq %%mm5, %%mm5         \n\t"
+        "movd %%mm5, (%0,%2)            \n\t"
+        "movd %%mm3, (%0,%2,2)          \n\t"
+        "punpckhdq %%mm3, %%mm3         \n\t"
+        "movd %%mm3, (%0,%3)            \n\t"
+        "movd %%mm1, (%1)               \n\t"
+        "punpckhdq %%mm1, %%mm1         \n\t"
+        "movd %%mm1, (%1,%2)            \n\t"
+        "movd %%mm6, (%1,%2,2)          \n\t"
+        "punpckhdq %%mm6, %%mm6         \n\t"
+        "movd %%mm6, (%1,%3)            \n\t"
         :: "r" (src),
            "r" (src + 4*stride),
            "r" ((long)   stride ),
@@ -705,26 +705,26 @@
       "pxor %%mm0,%%mm0\n"
       "pxor %%mm7,%%mm7\n"
       "1:\n"
-      "movq (%0),%%mm2\n"	/* mm2 = pix[0-7] */
-      "movq 8(%0),%%mm3\n"	/* mm3 = pix[8-15] */
+      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
+      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */
 
-      "movq %%mm2,%%mm1\n"	/* mm1 = mm2 = pix[0-7] */
+      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */
 
-      "punpckhbw %%mm0,%%mm1\n"	/* mm1 = [pix4-7] */
-      "punpcklbw %%mm0,%%mm2\n"	/* mm2 = [pix0-3] */
+      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
+      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
 
-      "movq %%mm3,%%mm4\n"	/* mm4 = mm3 = pix[8-15] */
-      "punpckhbw %%mm0,%%mm3\n"	/* mm3 = [pix12-15] */
-      "punpcklbw %%mm0,%%mm4\n"	/* mm4 = [pix8-11] */
+      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
+      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
+      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
 
-      "pmaddwd %%mm1,%%mm1\n"	/* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
-      "pmaddwd %%mm2,%%mm2\n"	/* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
+      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
+      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
 
       "pmaddwd %%mm3,%%mm3\n"
       "pmaddwd %%mm4,%%mm4\n"
 
-      "paddd %%mm1,%%mm2\n"	/* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
-					  pix2^2+pix3^2+pix6^2+pix7^2) */
+      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
+                                          pix2^2+pix3^2+pix6^2+pix7^2) */
       "paddd %%mm3,%%mm4\n"
       "paddd %%mm2,%%mm7\n"
 
@@ -734,7 +734,7 @@
       "jnz 1b\n"
 
       "movq %%mm7,%%mm1\n"
-      "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
+      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
       "paddd %%mm7,%%mm1\n"
       "movd %%mm1,%1\n"
       : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
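
This routine accumulates a sum of squared pixel values: pmaddwd multiplies adjacent 16-bit words and adds the pairs, so mm7 ends up holding the total as two 32-bit halves that the final psrlq $32/paddd fold together. A scalar sketch, assuming the usual 16x16 block (the row count lives in %ecx; name illustrative):

    /* scalar sketch of the sum-of-squares loop */
    static int pix_norm1_c(const uint8_t *pix, int line_size)
    {
        int sum = 0, i, j;
        for (i = 0; i < 16; i++) {      /* row count kept in %ecx */
            for (j = 0; j < 16; j++)
                sum += pix[j] * pix[j];
            pix += line_size;
        }
        return sum;
    }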
@@ -746,13 +746,13 @@
   asm volatile (
       "movl %4,%%ecx\n"
       "shr $1,%%ecx\n"
-      "pxor %%mm0,%%mm0\n"	/* mm0 = 0 */
-      "pxor %%mm7,%%mm7\n"	/* mm7 holds the sum */
+      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
+      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
       "1:\n"
-      "movq (%0),%%mm1\n"	/* mm1 = pix1[0][0-7] */
-      "movq (%1),%%mm2\n"	/* mm2 = pix2[0][0-7] */
-      "movq (%0,%3),%%mm3\n"	/* mm3 = pix1[1][0-7] */
-      "movq (%1,%3),%%mm4\n"	/* mm4 = pix2[1][0-7] */
+      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
+      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
+      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
+      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */
 
       /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
@@ -773,16 +773,16 @@
 
       "punpckhbw %%mm0,%%mm2\n"
       "punpckhbw %%mm0,%%mm4\n"
-      "punpcklbw %%mm0,%%mm1\n"	/* mm1 now spread over (mm1,mm2) */
-      "punpcklbw %%mm0,%%mm3\n"	/* mm4 now spread over (mm3,mm4) */
+      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
+      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
 
       "pmaddwd %%mm2,%%mm2\n"
       "pmaddwd %%mm4,%%mm4\n"
       "pmaddwd %%mm1,%%mm1\n"
       "pmaddwd %%mm3,%%mm3\n"
 
-      "lea (%0,%3,2), %0\n"	/* pix1 += 2*line_size */
-      "lea (%1,%3,2), %1\n"	/* pix2 += 2*line_size */
+      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
+      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */
 
       "paddd %%mm2,%%mm1\n"
       "paddd %%mm4,%%mm3\n"
@@ -793,7 +793,7 @@
       "jnz 1b\n"
 
       "movq %%mm7,%%mm1\n"
-      "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
+      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
       "paddd %%mm7,%%mm1\n"
       "movd %%mm1,%2\n"
       : "+r" (pix1), "+r" (pix2), "=r"(tmp)
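
The "subtract with saturation in both directions, then combine" idiom noted in the comments computes |p1 - p2| per byte without a signed compare: one of the two saturated differences is zero, the other is the absolute difference. The loop as a whole is a sum of squared errors over an 8-pixel-wide block, two rows per iteration; a scalar sketch (name illustrative):

    /* scalar sketch of the SSE (sum of squared errors) loop */
    static int sse8_c(const uint8_t *pix1, const uint8_t *pix2,
                      int line_size, int h)
    {
        int sum = 0, i;
        while (h--) {
            for (i = 0; i < 8; i++) {
                int d = pix1[i] - pix2[i];
                sum += d * d;
            }
            pix1 += line_size;
            pix2 += line_size;
        }
        return sum;
    }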
@@ -806,13 +806,13 @@
     int tmp;
   asm volatile (
       "movl %4,%%ecx\n"
-      "pxor %%mm0,%%mm0\n"	/* mm0 = 0 */
-      "pxor %%mm7,%%mm7\n"	/* mm7 holds the sum */
+      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
+      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
       "1:\n"
-      "movq (%0),%%mm1\n"	/* mm1 = pix1[0-7] */
-      "movq (%1),%%mm2\n"	/* mm2 = pix2[0-7] */
-      "movq 8(%0),%%mm3\n"	/* mm3 = pix1[8-15] */
-      "movq 8(%1),%%mm4\n"	/* mm4 = pix2[8-15] */
+      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
+      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
+      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
+      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */
 
       /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
@@ -833,8 +833,8 @@
 
       "punpckhbw %%mm0,%%mm2\n"
       "punpckhbw %%mm0,%%mm4\n"
-      "punpcklbw %%mm0,%%mm1\n"	/* mm1 now spread over (mm1,mm2) */
-      "punpcklbw %%mm0,%%mm3\n"	/* mm4 now spread over (mm3,mm4) */
+      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
+      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
 
       "pmaddwd %%mm2,%%mm2\n"
       "pmaddwd %%mm4,%%mm4\n"
@@ -853,7 +853,7 @@
       "jnz 1b\n"
 
       "movq %%mm7,%%mm1\n"
-      "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
+      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
       "paddd %%mm7,%%mm1\n"
       "movd %%mm1,%2\n"
       : "+r" (pix1), "+r" (pix2), "=r"(tmp)
@@ -866,13 +866,13 @@
     int tmp;
   asm volatile (
       "shr $1,%2\n"
-      "pxor %%xmm0,%%xmm0\n"	/* mm0 = 0 */
-      "pxor %%xmm7,%%xmm7\n"	/* mm7 holds the sum */
+      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
+      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
       "1:\n"
-      "movdqu (%0),%%xmm1\n"	/* mm1 = pix1[0][0-15] */
-      "movdqu (%1),%%xmm2\n"	/* mm2 = pix2[0][0-15] */
-      "movdqu (%0,%4),%%xmm3\n"	/* mm3 = pix1[1][0-15] */
-      "movdqu (%1,%4),%%xmm4\n"	/* mm4 = pix2[1][0-15] */
+      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
+      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
+      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
+      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
 
       /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
@@ -893,16 +893,16 @@
 
       "punpckhbw %%xmm0,%%xmm2\n"
       "punpckhbw %%xmm0,%%xmm4\n"
-      "punpcklbw %%xmm0,%%xmm1\n"	/* mm1 now spread over (mm1,mm2) */
-      "punpcklbw %%xmm0,%%xmm3\n"	/* mm4 now spread over (mm3,mm4) */
+      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
+      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */
 
       "pmaddwd %%xmm2,%%xmm2\n"
       "pmaddwd %%xmm4,%%xmm4\n"
       "pmaddwd %%xmm1,%%xmm1\n"
       "pmaddwd %%xmm3,%%xmm3\n"
 
-      "lea (%0,%4,2), %0\n"	/* pix1 += 2*line_size */
-      "lea (%1,%4,2), %1\n"	/* pix2 += 2*line_size */
+      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
+      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */
 
       "paddd %%xmm2,%%xmm1\n"
       "paddd %%xmm4,%%xmm3\n"
@@ -913,10 +913,10 @@
       "jnz 1b\n"
 
       "movdqa %%xmm7,%%xmm1\n"
-      "psrldq $8, %%xmm7\n"	/* shift hi qword to lo */
+      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
       "paddd %%xmm1,%%xmm7\n"
       "movdqa %%xmm7,%%xmm1\n"
-      "psrldq $4, %%xmm7\n"	/* shift hi dword to lo */
+      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
       "paddd %%xmm1,%%xmm7\n"
       "movd %%xmm7,%3\n"
       : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
@@ -1427,18 +1427,18 @@
 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
     long i=0;
     asm volatile(
-        "1:				\n\t"
-        "movq  (%2, %0), %%mm0		\n\t"
-        "movq  (%1, %0), %%mm1		\n\t"
-        "psubb %%mm0, %%mm1		\n\t"
-        "movq %%mm1, (%3, %0)		\n\t"
-        "movq 8(%2, %0), %%mm0		\n\t"
-        "movq 8(%1, %0), %%mm1		\n\t"
-        "psubb %%mm0, %%mm1		\n\t"
-        "movq %%mm1, 8(%3, %0)		\n\t"
-        "add $16, %0			\n\t"
-        "cmp %4, %0			\n\t"
-        " jb 1b				\n\t"
+        "1:                             \n\t"
+        "movq  (%2, %0), %%mm0          \n\t"
+        "movq  (%1, %0), %%mm1          \n\t"
+        "psubb %%mm0, %%mm1             \n\t"
+        "movq %%mm1, (%3, %0)           \n\t"
+        "movq 8(%2, %0), %%mm0          \n\t"
+        "movq 8(%1, %0), %%mm1          \n\t"
+        "psubb %%mm0, %%mm1             \n\t"
+        "movq %%mm1, 8(%3, %0)          \n\t"
+        "add $16, %0                    \n\t"
+        "cmp %4, %0                     \n\t"
+        " jb 1b                         \n\t"
         : "+r" (i)
         : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
     );
@@ -1451,24 +1451,24 @@
     uint8_t l, lt;
 
     asm volatile(
-        "1:				\n\t"
-        "movq  -1(%1, %0), %%mm0	\n\t" // LT
-        "movq  (%1, %0), %%mm1		\n\t" // T
-        "movq  -1(%2, %0), %%mm2	\n\t" // L
-        "movq  (%2, %0), %%mm3		\n\t" // X
-        "movq %%mm2, %%mm4		\n\t" // L
-        "psubb %%mm0, %%mm2		\n\t"
-        "paddb %%mm1, %%mm2		\n\t" // L + T - LT
-        "movq %%mm4, %%mm5		\n\t" // L
-        "pmaxub %%mm1, %%mm4		\n\t" // max(T, L)
-        "pminub %%mm5, %%mm1		\n\t" // min(T, L)
-        "pminub %%mm2, %%mm4		\n\t"
-        "pmaxub %%mm1, %%mm4		\n\t"
-        "psubb %%mm4, %%mm3		\n\t" // dst - pred
-        "movq %%mm3, (%3, %0)		\n\t"
-        "add $8, %0			\n\t"
-        "cmp %4, %0			\n\t"
-        " jb 1b				\n\t"
+        "1:                             \n\t"
+        "movq  -1(%1, %0), %%mm0        \n\t" // LT
+        "movq  (%1, %0), %%mm1          \n\t" // T
+        "movq  -1(%2, %0), %%mm2        \n\t" // L
+        "movq  (%2, %0), %%mm3          \n\t" // X
+        "movq %%mm2, %%mm4              \n\t" // L
+        "psubb %%mm0, %%mm2             \n\t"
+        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
+        "movq %%mm4, %%mm5              \n\t" // L
+        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
+        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
+        "pminub %%mm2, %%mm4            \n\t"
+        "pmaxub %%mm1, %%mm4            \n\t"
+        "psubb %%mm4, %%mm3             \n\t" // dst - pred
+        "movq %%mm3, (%3, %0)           \n\t"
+        "add $8, %0                     \n\t"
+        "cmp %4, %0                     \n\t"
+        " jb 1b                         \n\t"
         : "+r" (i)
         : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
     );
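
The pmaxub/pminub sequence above implements a HuffYUV-style median predictor: the prediction is the median of L, T and the gradient L + T - LT, and the stored byte is the difference from it. A scalar sketch in int arithmetic (the asm works on wrapped bytes, eight lanes at a time; name illustrative):

    /* median of L, T, L+T-LT, as the pmaxub/pminub chain computes it */
    static uint8_t median_pred(uint8_t L, uint8_t T, uint8_t LT)
    {
        int g  = L + T - LT;            /* psubb/paddb: gradient     */
        int lo = L < T ? L : T;         /* pminub: min(T, L)         */
        int hi = L < T ? T : L;         /* pmaxub: max(T, L)         */
        if (g < lo) g = lo;             /* clamp gradient into range */
        if (g > hi) g = hi;
        return (uint8_t)g;              /* prediction; dst = src - pred */
    }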
@@ -1483,12 +1483,12 @@
 }
 
 #define LBUTTERFLY2(a1,b1,a2,b2)\
-    "paddw " #b1 ", " #a1 "		\n\t"\
-    "paddw " #b2 ", " #a2 "		\n\t"\
-    "paddw " #b1 ", " #b1 "		\n\t"\
-    "paddw " #b2 ", " #b2 "		\n\t"\
-    "psubw " #a1 ", " #b1 "		\n\t"\
-    "psubw " #a2 ", " #b2 "		\n\t"
+    "paddw " #b1 ", " #a1 "           \n\t"\
+    "paddw " #b2 ", " #a2 "           \n\t"\
+    "paddw " #b1 ", " #b1 "           \n\t"\
+    "paddw " #b2 ", " #b2 "           \n\t"\
+    "psubw " #a1 ", " #b1 "           \n\t"\
+    "psubw " #a2 ", " #b2 "           \n\t"
 
 #define HADAMARD48\
         LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
@@ -1499,33 +1499,33 @@
         LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\
 
 #define MMABS(a,z)\
-    "pxor " #z ", " #z "		\n\t"\
-    "pcmpgtw " #a ", " #z "		\n\t"\
-    "pxor " #z ", " #a "		\n\t"\
-    "psubw " #z ", " #a "		\n\t"
+    "pxor " #z ", " #z "              \n\t"\
+    "pcmpgtw " #a ", " #z "           \n\t"\
+    "pxor " #z ", " #a "              \n\t"\
+    "psubw " #z ", " #a "             \n\t"
 
 #define MMABS_SUM(a,z, sum)\
-    "pxor " #z ", " #z "		\n\t"\
-    "pcmpgtw " #a ", " #z "		\n\t"\
-    "pxor " #z ", " #a "		\n\t"\
-    "psubw " #z ", " #a "		\n\t"\
-    "paddusw " #a ", " #sum "		\n\t"
+    "pxor " #z ", " #z "              \n\t"\
+    "pcmpgtw " #a ", " #z "           \n\t"\
+    "pxor " #z ", " #a "              \n\t"\
+    "psubw " #z ", " #a "             \n\t"\
+    "paddusw " #a ", " #sum "         \n\t"
 
 #define MMABS_MMX2(a,z)\
-    "pxor " #z ", " #z "		\n\t"\
-    "psubw " #a ", " #z "		\n\t"\
-    "pmaxsw " #z ", " #a "		\n\t"
+    "pxor " #z ", " #z "              \n\t"\
+    "psubw " #a ", " #z "             \n\t"\
+    "pmaxsw " #z ", " #a "            \n\t"
 
 #define MMABS_SUM_MMX2(a,z, sum)\
-    "pxor " #z ", " #z "		\n\t"\
-    "psubw " #a ", " #z "		\n\t"\
-    "pmaxsw " #z ", " #a "		\n\t"\
-    "paddusw " #a ", " #sum "		\n\t"
+    "pxor " #z ", " #z "              \n\t"\
+    "psubw " #a ", " #z "             \n\t"\
+    "pmaxsw " #z ", " #a "            \n\t"\
+    "paddusw " #a ", " #sum "         \n\t"
 
 #define SBUTTERFLY(a,b,t,n)\
-    "movq " #a ", " #t "		\n\t" /* abcd */\
-    "punpckl" #n " " #b ", " #a "	\n\t" /* aebf */\
-    "punpckh" #n " " #b ", " #t "	\n\t" /* cgdh */\
+    "movq " #a ", " #t "              \n\t" /* abcd */\
+    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
+    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\
 
 #define TRANSPOSE4(a,b,c,d,t)\
     SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
@@ -1534,16 +1534,16 @@
     SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
 
 #define LOAD4(o, a, b, c, d)\
-        "movq "#o"(%1), " #a "		\n\t"\
-        "movq "#o"+16(%1), " #b "	\n\t"\
-        "movq "#o"+32(%1), " #c "	\n\t"\
-        "movq "#o"+48(%1), " #d "	\n\t"
+        "movq "#o"(%1), " #a "        \n\t"\
+        "movq "#o"+16(%1), " #b "     \n\t"\
+        "movq "#o"+32(%1), " #c "     \n\t"\
+        "movq "#o"+48(%1), " #d "     \n\t"
 
 #define STORE4(o, a, b, c, d)\
-        "movq "#a", "#o"(%1)		\n\t"\
-        "movq "#b", "#o"+16(%1)		\n\t"\
-        "movq "#c", "#o"+32(%1)		\n\t"\
-        "movq "#d", "#o"+48(%1)		\n\t"\
+        "movq "#a", "#o"(%1)          \n\t"\
+        "movq "#b", "#o"+16(%1)       \n\t"\
+        "movq "#c", "#o"+32(%1)       \n\t"\
+        "movq "#d", "#o"+48(%1)       \n\t"\
 
 static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
     uint64_t temp[16] __align8;
@@ -1559,12 +1559,12 @@
 
         HADAMARD48
 
-        "movq %%mm7, 112(%1)		\n\t"
+        "movq %%mm7, 112(%1)            \n\t"
 
         TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
         STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
 
-        "movq 112(%1), %%mm7 		\n\t"
+        "movq 112(%1), %%mm7            \n\t"
         TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
         STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
 
@@ -1573,23 +1573,23 @@
 
         HADAMARD48
 
-        "movq %%mm7, 120(%1)		\n\t"
+        "movq %%mm7, 120(%1)            \n\t"
 
         TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
         STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
 
-        "movq 120(%1), %%mm7 		\n\t"
+        "movq 120(%1), %%mm7            \n\t"
         TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
-        "movq %%mm7, %%mm5		\n\t"//FIXME remove
-        "movq %%mm6, %%mm7		\n\t"
-        "movq %%mm0, %%mm6		\n\t"
+        "movq %%mm7, %%mm5              \n\t"//FIXME remove
+        "movq %%mm6, %%mm7              \n\t"
+        "movq %%mm0, %%mm6              \n\t"
 //        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
 
         LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
 //        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
 
         HADAMARD48
-        "movq %%mm7, 64(%1)		\n\t"
+        "movq %%mm7, 64(%1)             \n\t"
         MMABS(%%mm0, %%mm7)
         MMABS_SUM(%%mm1, %%mm7, %%mm0)
         MMABS_SUM(%%mm2, %%mm7, %%mm0)
@@ -1597,15 +1597,15 @@
         MMABS_SUM(%%mm4, %%mm7, %%mm0)
         MMABS_SUM(%%mm5, %%mm7, %%mm0)
         MMABS_SUM(%%mm6, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1		\n\t"
+        "movq 64(%1), %%mm1             \n\t"
         MMABS_SUM(%%mm1, %%mm7, %%mm0)
-        "movq %%mm0, 64(%1)		\n\t"
+        "movq %%mm0, 64(%1)             \n\t"
 
         LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
         LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
 
         HADAMARD48
-        "movq %%mm7, (%1)		\n\t"
+        "movq %%mm7, (%1)               \n\t"
         MMABS(%%mm0, %%mm7)
         MMABS_SUM(%%mm1, %%mm7, %%mm0)
         MMABS_SUM(%%mm2, %%mm7, %%mm0)
@@ -1613,18 +1613,18 @@
         MMABS_SUM(%%mm4, %%mm7, %%mm0)
         MMABS_SUM(%%mm5, %%mm7, %%mm0)
         MMABS_SUM(%%mm6, %%mm7, %%mm0)
-        "movq (%1), %%mm1		\n\t"
+        "movq (%1), %%mm1               \n\t"
         MMABS_SUM(%%mm1, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1		\n\t"
+        "movq 64(%1), %%mm1             \n\t"
         MMABS_SUM(%%mm1, %%mm7, %%mm0)
 
-        "movq %%mm0, %%mm1		\n\t"
-        "psrlq $32, %%mm0		\n\t"
-        "paddusw %%mm1, %%mm0		\n\t"
-        "movq %%mm0, %%mm1		\n\t"
-        "psrlq $16, %%mm0		\n\t"
-        "paddusw %%mm1, %%mm0		\n\t"
-        "movd %%mm0, %0			\n\t"
+        "movq %%mm0, %%mm1              \n\t"
+        "psrlq $32, %%mm0               \n\t"
+        "paddusw %%mm1, %%mm0           \n\t"
+        "movq %%mm0, %%mm1              \n\t"
+        "psrlq $16, %%mm0               \n\t"
+        "paddusw %%mm1, %%mm0           \n\t"
+        "movd %%mm0, %0                 \n\t"
 
         : "=r" (sum)
         : "r"(temp)
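
Taken together, the hunks above compute a SATD: HADAMARD48 is three rounds of LBUTTERFLY2 add/subtract butterflies, TRANSPOSE4 turns the row pass into a column pass, and MMABS/MMABS_SUM accumulate absolute values (MMABS builds a per-word sign mask with pcmpgtw, then xor-and-subtract negates the negative lanes; the MMX2 variants get the same effect from pmaxsw against the negated value). A scalar sketch of the whole computation — illustrative only; the exact butterfly ordering follows the macros, not this sketch:

    static int16_t mmabs(int16_t a)
    {
        int16_t z = a < 0 ? -1 : 0;     /* pcmpgtw: sign mask */
        return (a ^ z) - z;             /* negate if negative */
    }

    /* 8x8 Hadamard-transformed absolute difference (sketch) */
    static int hadamard8_diff_sketch(const uint8_t *src1,
                                     const uint8_t *src2, int stride)
    {
        int t[8][8], i, j, k, sum = 0;
        for (i = 0; i < 8; i++)
            for (j = 0; j < 8; j++)
                t[i][j] = src1[i * stride + j] - src2[i * stride + j];
        for (k = 1; k < 8; k <<= 1)     /* butterflies along rows */
            for (i = 0; i < 8; i++)
                for (j = 0; j < 8; j++)
                    if (!(j & k)) {
                        int a = t[i][j], b = t[i][j + k];
                        t[i][j]     = a + b;
                        t[i][j + k] = a - b;
                    }
        for (k = 1; k < 8; k <<= 1)     /* then along columns */
            for (j = 0; j < 8; j++)
                for (i = 0; i < 8; i++)
                    if (!(i & k)) {
                        int a = t[i][j], b = t[i + k][j];
                        t[i][j]     = a + b;
                        t[i + k][j] = a - b;
                    }
        for (i = 0; i < 8; i++)
            for (j = 0; j < 8; j++)
                sum += t[i][j] < 0 ? -t[i][j] : t[i][j];
        return sum;
    }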
@@ -1646,12 +1646,12 @@
 
         HADAMARD48
 
-        "movq %%mm7, 112(%1)		\n\t"
+        "movq %%mm7, 112(%1)            \n\t"
 
         TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
         STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
 
-        "movq 112(%1), %%mm7 		\n\t"
+        "movq 112(%1), %%mm7            \n\t"
         TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
         STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
 
@@ -1660,23 +1660,23 @@
 
         HADAMARD48
 
-        "movq %%mm7, 120(%1)		\n\t"
+        "movq %%mm7, 120(%1)            \n\t"
 
         TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
         STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
 
-        "movq 120(%1), %%mm7 		\n\t"
+        "movq 120(%1), %%mm7            \n\t"
         TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
-        "movq %%mm7, %%mm5		\n\t"//FIXME remove
-        "movq %%mm6, %%mm7		\n\t"
-        "movq %%mm0, %%mm6		\n\t"
+        "movq %%mm7, %%mm5              \n\t"//FIXME remove
+        "movq %%mm6, %%mm7              \n\t"
+        "movq %%mm0, %%mm6              \n\t"
 //        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
 
         LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
 //        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
 
         HADAMARD48
-        "movq %%mm7, 64(%1)		\n\t"
+        "movq %%mm7, 64(%1)             \n\t"
         MMABS_MMX2(%%mm0, %%mm7)
         MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
         MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
@@ -1684,15 +1684,15 @@
         MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
         MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
         MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1		\n\t"
+        "movq 64(%1), %%mm1             \n\t"
         MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-        "movq %%mm0, 64(%1)		\n\t"
+        "movq %%mm0, 64(%1)             \n\t"
 
         LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
         LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
 
         HADAMARD48
-        "movq %%mm7, (%1)		\n\t"
+        "movq %%mm7, (%1)               \n\t"
         MMABS_MMX2(%%mm0, %%mm7)
         MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
         MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
@@ -1700,16 +1700,16 @@
         MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
         MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
         MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
-        "movq (%1), %%mm1		\n\t"
+        "movq (%1), %%mm1               \n\t"
         MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1		\n\t"
+        "movq 64(%1), %%mm1             \n\t"
         MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
 
         "pshufw $0x0E, %%mm0, %%mm1     \n\t"
-        "paddusw %%mm1, %%mm0		\n\t"
+        "paddusw %%mm1, %%mm0           \n\t"
         "pshufw $0x01, %%mm0, %%mm1     \n\t"
-        "paddusw %%mm1, %%mm0		\n\t"
-        "movd %%mm0, %0			\n\t"
+        "paddusw %%mm1, %%mm0           \n\t"
+        "movd %%mm0, %0                 \n\t"
 
         : "=r" (sum)
         : "r"(temp)
@@ -1726,24 +1726,24 @@
 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
 
 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
-        "paddw " #m4 ", " #m3 "		\n\t" /* x1 */\
-        "movq "MANGLE(ff_pw_20)", %%mm4		\n\t" /* 20 */\
-        "pmullw " #m3 ", %%mm4		\n\t" /* 20x1 */\
-        "movq "#in7", " #m3 "		\n\t" /* d */\
-        "movq "#in0", %%mm5		\n\t" /* D */\
-        "paddw " #m3 ", %%mm5		\n\t" /* x4 */\
-        "psubw %%mm5, %%mm4		\n\t" /* 20x1 - x4 */\
-        "movq "#in1", %%mm5		\n\t" /* C */\
-        "movq "#in2", %%mm6		\n\t" /* B */\
-        "paddw " #m6 ", %%mm5		\n\t" /* x3 */\
-        "paddw " #m5 ", %%mm6		\n\t" /* x2 */\
-        "paddw %%mm6, %%mm6		\n\t" /* 2x2 */\
-        "psubw %%mm6, %%mm5		\n\t" /* -2x2 + x3 */\
-        "pmullw "MANGLE(ff_pw_3)", %%mm5	\n\t" /* -6x2 + 3x3 */\
-        "paddw " #rnd ", %%mm4		\n\t" /* x2 */\
-        "paddw %%mm4, %%mm5		\n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
-        "psraw $5, %%mm5		\n\t"\
-        "packuswb %%mm5, %%mm5		\n\t"\
+        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
+        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
+        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
+        "movq "#in7", " #m3 "             \n\t" /* d */\
+        "movq "#in0", %%mm5               \n\t" /* D */\
+        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
+        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
+        "movq "#in1", %%mm5               \n\t" /* C */\
+        "movq "#in2", %%mm6               \n\t" /* B */\
+        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
+        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
+        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
+        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
+        "paddw " #rnd ", %%mm4            \n\t" /* x2 */\
+        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
+        "psraw $5, %%mm5                  \n\t"\
+        "packuswb %%mm5, %%mm5            \n\t"\
         OP(%%mm5, out, %%mm7, d)
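
Per its own comments, QPEL_V_LOW evaluates one output sample of an 8-tap filter with coefficients (-1, 3, -6, 20, 20, -6, 3, -1), as used for MPEG-4 quarter-pel interpolation: with the symmetric input pairs x1 (innermost) through x4 (outermost), the result is (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, saturated to a byte by packuswb. (The /* x2 */ comment on the "paddw #rnd" line appears to be a stale copy; that step adds the ROUNDER constant.) A scalar sketch, with rnd standing for whatever ROUNDER the instantiation passes:

    /* one quarter-pel output sample (sketch) */
    static uint8_t qpel_sample(int x1, int x2, int x3, int x4, int rnd)
    {
        int v = (20 * x1 - 6 * x2 + 3 * x3 - x4 + rnd) >> 5;
        return v < 0 ? 0 : v > 255 ? 255 : v;   /* packuswb saturation */
    }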
 
 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
@@ -1751,116 +1751,116 @@
     uint64_t temp;\
 \
     asm volatile(\
-        "pxor %%mm7, %%mm7		\n\t"\
-        "1:				\n\t"\
-        "movq  (%0), %%mm0		\n\t" /* ABCDEFGH */\
-        "movq %%mm0, %%mm1		\n\t" /* ABCDEFGH */\
-        "movq %%mm0, %%mm2		\n\t" /* ABCDEFGH */\
-        "punpcklbw %%mm7, %%mm0		\n\t" /* 0A0B0C0D */\
-        "punpckhbw %%mm7, %%mm1		\n\t" /* 0E0F0G0H */\
-        "pshufw $0x90, %%mm0, %%mm5	\n\t" /* 0A0A0B0C */\
-        "pshufw $0x41, %%mm0, %%mm6	\n\t" /* 0B0A0A0B */\
-        "movq %%mm2, %%mm3		\n\t" /* ABCDEFGH */\
-        "movq %%mm2, %%mm4		\n\t" /* ABCDEFGH */\
-        "psllq $8, %%mm2		\n\t" /* 0ABCDEFG */\
-        "psllq $16, %%mm3		\n\t" /* 00ABCDEF */\
-        "psllq $24, %%mm4		\n\t" /* 000ABCDE */\
-        "punpckhbw %%mm7, %%mm2		\n\t" /* 0D0E0F0G */\
-        "punpckhbw %%mm7, %%mm3		\n\t" /* 0C0D0E0F */\
-        "punpckhbw %%mm7, %%mm4		\n\t" /* 0B0C0D0E */\
-        "paddw %%mm3, %%mm5		\n\t" /* b */\
-        "paddw %%mm2, %%mm6		\n\t" /* c */\
-        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
-        "psubw %%mm5, %%mm6		\n\t" /* c - 2b */\
-        "pshufw $0x06, %%mm0, %%mm5	\n\t" /* 0C0B0A0A */\
-        "pmullw "MANGLE(ff_pw_3)", %%mm6		\n\t" /* 3c - 6b */\
-        "paddw %%mm4, %%mm0		\n\t" /* a */\
-        "paddw %%mm1, %%mm5		\n\t" /* d */\
-        "pmullw "MANGLE(ff_pw_20)", %%mm0		\n\t" /* 20a */\
-        "psubw %%mm5, %%mm0		\n\t" /* 20a - d */\
-        "paddw %6, %%mm6		\n\t"\
-        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
-        "psraw $5, %%mm0		\n\t"\
-        "movq %%mm0, %5			\n\t"\
+        "pxor %%mm7, %%mm7                \n\t"\
+        "1:                               \n\t"\
+        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
+        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
+        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
+        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
+        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
+        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
+        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
+        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
+        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
+        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
+        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
+        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
+        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
+        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
+        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
+        "paddw %%mm3, %%mm5               \n\t" /* b */\
+        "paddw %%mm2, %%mm6               \n\t" /* c */\
+        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
+        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
+        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
+        "paddw %%mm4, %%mm0               \n\t" /* a */\
+        "paddw %%mm1, %%mm5               \n\t" /* d */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
+        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
+        "paddw %6, %%mm6                  \n\t"\
+        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
+        "psraw $5, %%mm0                  \n\t"\
+        "movq %%mm0, %5                   \n\t"\
         /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
         \
-        "movq 5(%0), %%mm0		\n\t" /* FGHIJKLM */\
-        "movq %%mm0, %%mm5		\n\t" /* FGHIJKLM */\
-        "movq %%mm0, %%mm6		\n\t" /* FGHIJKLM */\
-        "psrlq $8, %%mm0		\n\t" /* GHIJKLM0 */\
-        "psrlq $16, %%mm5		\n\t" /* HIJKLM00 */\
-        "punpcklbw %%mm7, %%mm0		\n\t" /* 0G0H0I0J */\
-        "punpcklbw %%mm7, %%mm5		\n\t" /* 0H0I0J0K */\
-        "paddw %%mm0, %%mm2		\n\t" /* b */\
-        "paddw %%mm5, %%mm3		\n\t" /* c */\
-        "paddw %%mm2, %%mm2		\n\t" /* 2b */\
-        "psubw %%mm2, %%mm3		\n\t" /* c - 2b */\
-        "movq %%mm6, %%mm2		\n\t" /* FGHIJKLM */\
-        "psrlq $24, %%mm6		\n\t" /* IJKLM000 */\
-        "punpcklbw %%mm7, %%mm2		\n\t" /* 0F0G0H0I */\
-        "punpcklbw %%mm7, %%mm6		\n\t" /* 0I0J0K0L */\
-        "pmullw "MANGLE(ff_pw_3)", %%mm3		\n\t" /* 3c - 6b */\
-        "paddw %%mm2, %%mm1		\n\t" /* a */\
-        "paddw %%mm6, %%mm4		\n\t" /* d */\
-        "pmullw "MANGLE(ff_pw_20)", %%mm1		\n\t" /* 20a */\
-        "psubw %%mm4, %%mm3		\n\t" /* - 6b +3c - d */\
-        "paddw %6, %%mm1		\n\t"\
-        "paddw %%mm1, %%mm3		\n\t" /* 20a - 6b +3c - d */\
-        "psraw $5, %%mm3		\n\t"\
-        "movq %5, %%mm1			\n\t"\
-        "packuswb %%mm3, %%mm1		\n\t"\
+        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
+        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
+        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
+        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
+        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
+        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
+        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
+        "paddw %%mm0, %%mm2               \n\t" /* b */\
+        "paddw %%mm5, %%mm3               \n\t" /* c */\
+        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
+        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
+        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
+        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
+        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
+        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
+        "paddw %%mm2, %%mm1               \n\t" /* a */\
+        "paddw %%mm6, %%mm4               \n\t" /* d */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
+        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
+        "paddw %6, %%mm1                  \n\t"\
+        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
+        "psraw $5, %%mm3                  \n\t"\
+        "movq %5, %%mm1                   \n\t"\
+        "packuswb %%mm3, %%mm1            \n\t"\
         OP_MMX2(%%mm1, (%1),%%mm4, q)\
         /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
         \
-        "movq 9(%0), %%mm1		\n\t" /* JKLMNOPQ */\
-        "movq %%mm1, %%mm4		\n\t" /* JKLMNOPQ */\
-        "movq %%mm1, %%mm3		\n\t" /* JKLMNOPQ */\
-        "psrlq $8, %%mm1		\n\t" /* KLMNOPQ0 */\
-        "psrlq $16, %%mm4		\n\t" /* LMNOPQ00 */\
-        "punpcklbw %%mm7, %%mm1		\n\t" /* 0K0L0M0N */\
-        "punpcklbw %%mm7, %%mm4		\n\t" /* 0L0M0N0O */\
-        "paddw %%mm1, %%mm5		\n\t" /* b */\
-        "paddw %%mm4, %%mm0		\n\t" /* c */\
-        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
-        "psubw %%mm5, %%mm0		\n\t" /* c - 2b */\
-        "movq %%mm3, %%mm5		\n\t" /* JKLMNOPQ */\
-        "psrlq $24, %%mm3		\n\t" /* MNOPQ000 */\
-        "pmullw "MANGLE(ff_pw_3)", %%mm0		\n\t" /* 3c - 6b */\
-        "punpcklbw %%mm7, %%mm3		\n\t" /* 0M0N0O0P */\
-        "paddw %%mm3, %%mm2		\n\t" /* d */\
-        "psubw %%mm2, %%mm0		\n\t" /* -6b + 3c - d */\
-        "movq %%mm5, %%mm2		\n\t" /* JKLMNOPQ */\
-        "punpcklbw %%mm7, %%mm2		\n\t" /* 0J0K0L0M */\
-        "punpckhbw %%mm7, %%mm5		\n\t" /* 0N0O0P0Q */\
-        "paddw %%mm2, %%mm6		\n\t" /* a */\
-        "pmullw "MANGLE(ff_pw_20)", %%mm6		\n\t" /* 20a */\
-        "paddw %6, %%mm0		\n\t"\
-        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
-        "psraw $5, %%mm0		\n\t"\
+        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
+        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
+        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
+        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
+        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
+        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
+        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
+        "paddw %%mm1, %%mm5               \n\t" /* b */\
+        "paddw %%mm4, %%mm0               \n\t" /* c */\
+        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
+        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
+        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
+        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
+        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
+        "paddw %%mm3, %%mm2               \n\t" /* d */\
+        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
+        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
+        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
+        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
+        "paddw %%mm2, %%mm6               \n\t" /* a */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
+        "paddw %6, %%mm0                  \n\t"\
+        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
+        "psraw $5, %%mm0                  \n\t"\
         /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
         \
-        "paddw %%mm5, %%mm3		\n\t" /* a */\
-        "pshufw $0xF9, %%mm5, %%mm6	\n\t" /* 0O0P0Q0Q */\
-        "paddw %%mm4, %%mm6		\n\t" /* b */\
-        "pshufw $0xBE, %%mm5, %%mm4	\n\t" /* 0P0Q0Q0P */\
-        "pshufw $0x6F, %%mm5, %%mm5	\n\t" /* 0Q0Q0P0O */\
-        "paddw %%mm1, %%mm4		\n\t" /* c */\
-        "paddw %%mm2, %%mm5		\n\t" /* d */\
-        "paddw %%mm6, %%mm6		\n\t" /* 2b */\
-        "psubw %%mm6, %%mm4		\n\t" /* c - 2b */\
-        "pmullw "MANGLE(ff_pw_20)", %%mm3		\n\t" /* 20a */\
-        "pmullw "MANGLE(ff_pw_3)", %%mm4		\n\t" /* 3c - 6b */\
-        "psubw %%mm5, %%mm3		\n\t" /* -6b + 3c - d */\
-        "paddw %6, %%mm4		\n\t"\
-        "paddw %%mm3, %%mm4		\n\t" /* 20a - 6b + 3c - d */\
-        "psraw $5, %%mm4		\n\t"\
-        "packuswb %%mm4, %%mm0		\n\t"\
+        "paddw %%mm5, %%mm3               \n\t" /* a */\
+        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
+        "paddw %%mm4, %%mm6               \n\t" /* b */\
+        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
+        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
+        "paddw %%mm1, %%mm4               \n\t" /* c */\
+        "paddw %%mm2, %%mm5               \n\t" /* d */\
+        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
+        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
+        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
+        "paddw %6, %%mm4                  \n\t"\
+        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
+        "psraw $5, %%mm4                  \n\t"\
+        "packuswb %%mm4, %%mm0            \n\t"\
         OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
         \
-        "add %3, %0			\n\t"\
-        "add %4, %1			\n\t"\
-        "decl %2			\n\t"\
-        " jnz 1b				\n\t"\
+        "add %3, %0                       \n\t"\
+        "add %4, %1                       \n\t"\
+        "decl %2                          \n\t"\
+        " jnz 1b                          \n\t"\
         : "+a"(src), "+c"(dst), "+m"(h)\
         : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
         : "memory"\
@@ -1890,21 +1890,21 @@
         temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
         temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
         asm volatile(\
-            "movq (%0), %%mm0		\n\t"\
-            "movq 8(%0), %%mm1		\n\t"\
-            "paddw %2, %%mm0		\n\t"\
-            "paddw %2, %%mm1		\n\t"\
-            "psraw $5, %%mm0		\n\t"\
-            "psraw $5, %%mm1		\n\t"\
-            "packuswb %%mm1, %%mm0	\n\t"\
+            "movq (%0), %%mm0               \n\t"\
+            "movq 8(%0), %%mm1              \n\t"\
+            "paddw %2, %%mm0                \n\t"\
+            "paddw %2, %%mm1                \n\t"\
+            "psraw $5, %%mm0                \n\t"\
+            "psraw $5, %%mm1                \n\t"\
+            "packuswb %%mm1, %%mm0          \n\t"\
             OP_3DNOW(%%mm0, (%1), %%mm1, q)\
-            "movq 16(%0), %%mm0		\n\t"\
-            "movq 24(%0), %%mm1		\n\t"\
-            "paddw %2, %%mm0		\n\t"\
-            "paddw %2, %%mm1		\n\t"\
-            "psraw $5, %%mm0		\n\t"\
-            "psraw $5, %%mm1		\n\t"\
-            "packuswb %%mm1, %%mm0	\n\t"\
+            "movq 16(%0), %%mm0             \n\t"\
+            "movq 24(%0), %%mm1             \n\t"\
+            "paddw %2, %%mm0                \n\t"\
+            "paddw %2, %%mm1                \n\t"\
+            "psraw $5, %%mm0                \n\t"\
+            "psraw $5, %%mm1                \n\t"\
+            "packuswb %%mm1, %%mm0          \n\t"\
             OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
             :: "r"(temp), "r"(dst), "m"(ROUNDER)\
             : "memory"\
@@ -1918,62 +1918,62 @@
     uint64_t temp;\
 \
     asm volatile(\
-        "pxor %%mm7, %%mm7		\n\t"\
-        "1:				\n\t"\
-        "movq  (%0), %%mm0		\n\t" /* ABCDEFGH */\
-        "movq %%mm0, %%mm1		\n\t" /* ABCDEFGH */\
-        "movq %%mm0, %%mm2		\n\t" /* ABCDEFGH */\
-        "punpcklbw %%mm7, %%mm0		\n\t" /* 0A0B0C0D */\
-        "punpckhbw %%mm7, %%mm1		\n\t" /* 0E0F0G0H */\
-        "pshufw $0x90, %%mm0, %%mm5	\n\t" /* 0A0A0B0C */\
-        "pshufw $0x41, %%mm0, %%mm6	\n\t" /* 0B0A0A0B */\
-        "movq %%mm2, %%mm3		\n\t" /* ABCDEFGH */\
-        "movq %%mm2, %%mm4		\n\t" /* ABCDEFGH */\
-        "psllq $8, %%mm2		\n\t" /* 0ABCDEFG */\
-        "psllq $16, %%mm3		\n\t" /* 00ABCDEF */\
-        "psllq $24, %%mm4		\n\t" /* 000ABCDE */\
-        "punpckhbw %%mm7, %%mm2		\n\t" /* 0D0E0F0G */\
-        "punpckhbw %%mm7, %%mm3		\n\t" /* 0C0D0E0F */\
-        "punpckhbw %%mm7, %%mm4		\n\t" /* 0B0C0D0E */\
-        "paddw %%mm3, %%mm5		\n\t" /* b */\
-        "paddw %%mm2, %%mm6		\n\t" /* c */\
-        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
-        "psubw %%mm5, %%mm6		\n\t" /* c - 2b */\
-        "pshufw $0x06, %%mm0, %%mm5	\n\t" /* 0C0B0A0A */\
-        "pmullw "MANGLE(ff_pw_3)", %%mm6		\n\t" /* 3c - 6b */\
-        "paddw %%mm4, %%mm0		\n\t" /* a */\
-        "paddw %%mm1, %%mm5		\n\t" /* d */\
-        "pmullw "MANGLE(ff_pw_20)", %%mm0		\n\t" /* 20a */\
-        "psubw %%mm5, %%mm0		\n\t" /* 20a - d */\
-        "paddw %6, %%mm6		\n\t"\
-        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
-        "psraw $5, %%mm0		\n\t"\
+        "pxor %%mm7, %%mm7                \n\t"\
+        "1:                               \n\t"\
+        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
+        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
+        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
+        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
+        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
+        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
+        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
+        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
+        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
+        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
+        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
+        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
+        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
+        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
+        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
+        "paddw %%mm3, %%mm5               \n\t" /* b */\
+        "paddw %%mm2, %%mm6               \n\t" /* c */\
+        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
+        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
+        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
+        "paddw %%mm4, %%mm0               \n\t" /* a */\
+        "paddw %%mm1, %%mm5               \n\t" /* d */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
+        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
+        "paddw %6, %%mm6                  \n\t"\
+        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
+        "psraw $5, %%mm0                  \n\t"\
         /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
         \
-        "movd 5(%0), %%mm5		\n\t" /* FGHI */\
-        "punpcklbw %%mm7, %%mm5		\n\t" /* 0F0G0H0I */\
-        "pshufw $0xF9, %%mm5, %%mm6	\n\t" /* 0G0H0I0I */\
-        "paddw %%mm5, %%mm1		\n\t" /* a */\
-        "paddw %%mm6, %%mm2		\n\t" /* b */\
-        "pshufw $0xBE, %%mm5, %%mm6	\n\t" /* 0H0I0I0H */\
-        "pshufw $0x6F, %%mm5, %%mm5	\n\t" /* 0I0I0H0G */\
-        "paddw %%mm6, %%mm3		\n\t" /* c */\
-        "paddw %%mm5, %%mm4		\n\t" /* d */\
-        "paddw %%mm2, %%mm2		\n\t" /* 2b */\
-        "psubw %%mm2, %%mm3		\n\t" /* c - 2b */\
-        "pmullw "MANGLE(ff_pw_20)", %%mm1		\n\t" /* 20a */\
-        "pmullw "MANGLE(ff_pw_3)", %%mm3		\n\t" /* 3c - 6b */\
-        "psubw %%mm4, %%mm3		\n\t" /* -6b + 3c - d */\
-        "paddw %6, %%mm1		\n\t"\
-        "paddw %%mm1, %%mm3		\n\t" /* 20a - 6b + 3c - d */\
-        "psraw $5, %%mm3		\n\t"\
-        "packuswb %%mm3, %%mm0		\n\t"\
+        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
+        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
+        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
+        "paddw %%mm5, %%mm1               \n\t" /* a */\
+        "paddw %%mm6, %%mm2               \n\t" /* b */\
+        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
+        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
+        "paddw %%mm6, %%mm3               \n\t" /* c */\
+        "paddw %%mm5, %%mm4               \n\t" /* d */\
+        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
+        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
+        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
+        "paddw %6, %%mm1                  \n\t"\
+        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
+        "psraw $5, %%mm3                  \n\t"\
+        "packuswb %%mm3, %%mm0            \n\t"\
         OP_MMX2(%%mm0, (%1), %%mm4, q)\
         \
-        "add %3, %0			\n\t"\
-        "add %4, %1			\n\t"\
-        "decl %2			\n\t"\
-        " jnz 1b			\n\t"\
+        "add %3, %0                       \n\t"\
+        "add %4, %1                       \n\t"\
+        "decl %2                          \n\t"\
+        " jnz 1b                          \n\t"\
         : "+a"(src), "+c"(dst), "+m"(h)\
         : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
         : "memory"\
@@ -1995,13 +1995,13 @@
         temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
         temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
         asm volatile(\
-            "movq (%0), %%mm0		\n\t"\
-            "movq 8(%0), %%mm1		\n\t"\
-            "paddw %2, %%mm0		\n\t"\
-            "paddw %2, %%mm1		\n\t"\
-            "psraw $5, %%mm0		\n\t"\
-            "psraw $5, %%mm1		\n\t"\
-            "packuswb %%mm1, %%mm0	\n\t"\
+            "movq (%0), %%mm0           \n\t"\
+            "movq 8(%0), %%mm1          \n\t"\
+            "paddw %2, %%mm0            \n\t"\
+            "paddw %2, %%mm1            \n\t"\
+            "psraw $5, %%mm0            \n\t"\
+            "psraw $5, %%mm1            \n\t"\
+            "packuswb %%mm1, %%mm0      \n\t"\
             OP_3DNOW(%%mm0, (%1), %%mm1, q)\
             :: "r"(temp), "r"(dst), "m"(ROUNDER)\
             :"memory"\
@@ -2020,24 +2020,24 @@
 \
     /*FIXME unroll */\
     asm volatile(\
-        "pxor %%mm7, %%mm7		\n\t"\
-        "1:				\n\t"\
-        "movq (%0), %%mm0		\n\t"\
-        "movq (%0), %%mm1		\n\t"\
-        "movq 8(%0), %%mm2		\n\t"\
-        "movq 8(%0), %%mm3		\n\t"\
-        "punpcklbw %%mm7, %%mm0		\n\t"\
-        "punpckhbw %%mm7, %%mm1		\n\t"\
-        "punpcklbw %%mm7, %%mm2		\n\t"\
-        "punpckhbw %%mm7, %%mm3		\n\t"\
-        "movq %%mm0, (%1)		\n\t"\
-        "movq %%mm1, 17*8(%1)		\n\t"\
-        "movq %%mm2, 2*17*8(%1)		\n\t"\
-        "movq %%mm3, 3*17*8(%1)		\n\t"\
-        "add $8, %1			\n\t"\
-        "add %3, %0			\n\t"\
-        "decl %2			\n\t"\
-        " jnz 1b			\n\t"\
+        "pxor %%mm7, %%mm7              \n\t"\
+        "1:                             \n\t"\
+        "movq (%0), %%mm0               \n\t"\
+        "movq (%0), %%mm1               \n\t"\
+        "movq 8(%0), %%mm2              \n\t"\
+        "movq 8(%0), %%mm3              \n\t"\
+        "punpcklbw %%mm7, %%mm0         \n\t"\
+        "punpckhbw %%mm7, %%mm1         \n\t"\
+        "punpcklbw %%mm7, %%mm2         \n\t"\
+        "punpckhbw %%mm7, %%mm3         \n\t"\
+        "movq %%mm0, (%1)               \n\t"\
+        "movq %%mm1, 17*8(%1)           \n\t"\
+        "movq %%mm2, 2*17*8(%1)         \n\t"\
+        "movq %%mm3, 3*17*8(%1)         \n\t"\
+        "add $8, %1                     \n\t"\
+        "add %3, %0                     \n\t"\
+        "decl %2                        \n\t"\
+        " jnz 1b                        \n\t"\
         : "+r" (src), "+r" (temp_ptr), "+r"(count)\
         : "r" ((long)srcStride)\
         : "memory"\
@@ -2048,42 +2048,42 @@
     \
 /*FIXME reorder for speed */\
     asm volatile(\
-        /*"pxor %%mm7, %%mm7		\n\t"*/\
-        "1:				\n\t"\
-        "movq (%0), %%mm0		\n\t"\
-        "movq 8(%0), %%mm1		\n\t"\
-        "movq 16(%0), %%mm2		\n\t"\
-        "movq 24(%0), %%mm3		\n\t"\
+        /*"pxor %%mm7, %%mm7              \n\t"*/\
+        "1:                             \n\t"\
+        "movq (%0), %%mm0               \n\t"\
+        "movq 8(%0), %%mm1              \n\t"\
+        "movq 16(%0), %%mm2             \n\t"\
+        "movq 24(%0), %%mm3             \n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
-        "add %4, %1			\n\t"\
+        "add %4, %1                     \n\t"\
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
         \
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
-        "add %4, %1			\n\t"\
+        "add %4, %1                     \n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
-        "add %4, %1			\n\t"\
+        "add %4, %1                     \n\t"\
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
-        "add %4, %1			\n\t"\
+        "add %4, %1                     \n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
-        "add %4, %1			\n\t"\
+        "add %4, %1                     \n\t"\
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
-        "add %4, %1			\n\t"\
+        "add %4, %1                     \n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
         \
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
-        "add %4, %1			\n\t"  \
+        "add %4, %1                     \n\t"  \
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
         \
-        "add $136, %0			\n\t"\
-        "add %6, %1			\n\t"\
-        "decl %2			\n\t"\
-        " jnz 1b			\n\t"\
+        "add $136, %0                   \n\t"\
+        "add %6, %1                     \n\t"\
+        "decl %2                        \n\t"\
+        " jnz 1b                        \n\t"\
         \
         : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
         : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
@@ -2098,18 +2098,18 @@
 \
     /*FIXME unroll */\
     asm volatile(\
-        "pxor %%mm7, %%mm7		\n\t"\
-        "1:				\n\t"\
-        "movq (%0), %%mm0		\n\t"\
-        "movq (%0), %%mm1		\n\t"\
-        "punpcklbw %%mm7, %%mm0		\n\t"\
-        "punpckhbw %%mm7, %%mm1		\n\t"\
-        "movq %%mm0, (%1)		\n\t"\
-        "movq %%mm1, 9*8(%1)		\n\t"\
-        "add $8, %1			\n\t"\
-        "add %3, %0			\n\t"\
-        "decl %2			\n\t"\
-        " jnz 1b			\n\t"\
+        "pxor %%mm7, %%mm7              \n\t"\
+        "1:                             \n\t"\
+        "movq (%0), %%mm0               \n\t"\
+        "movq (%0), %%mm1               \n\t"\
+        "punpcklbw %%mm7, %%mm0         \n\t"\
+        "punpckhbw %%mm7, %%mm1         \n\t"\
+        "movq %%mm0, (%1)               \n\t"\
+        "movq %%mm1, 9*8(%1)            \n\t"\
+        "add $8, %1                     \n\t"\
+        "add %3, %0                     \n\t"\
+        "decl %2                        \n\t"\
+        " jnz 1b                        \n\t"\
         : "+r" (src), "+r" (temp_ptr), "+r"(count)\
         : "r" ((long)srcStride)\
         : "memory"\
@@ -2120,30 +2120,30 @@
     \
 /*FIXME reorder for speed */\
     asm volatile(\
-        /*"pxor %%mm7, %%mm7		\n\t"*/\
-        "1:				\n\t"\
-        "movq (%0), %%mm0		\n\t"\
-        "movq 8(%0), %%mm1		\n\t"\
-        "movq 16(%0), %%mm2		\n\t"\
-        "movq 24(%0), %%mm3		\n\t"\
+        /*"pxor %%mm7, %%mm7              \n\t"*/\
+        "1:                             \n\t"\
+        "movq (%0), %%mm0               \n\t"\
+        "movq 8(%0), %%mm1              \n\t"\
+        "movq 16(%0), %%mm2             \n\t"\
+        "movq 24(%0), %%mm3             \n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
-        "add %4, %1			\n\t"\
+        "add %4, %1                     \n\t"\
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
         \
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
-        "add %4, %1			\n\t"\
+        "add %4, %1                     \n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
         \
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
-        "add %4, %1			\n\t"\
+        "add %4, %1                     \n\t"\
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
                 \
-        "add $72, %0			\n\t"\
-        "add %6, %1			\n\t"\
-        "decl %2			\n\t"\
-        " jnz 1b			\n\t"\
+        "add $72, %0                    \n\t"\
+        "add %6, %1                     \n\t"\
+        "decl %2                        \n\t"\
+        " jnz 1b                        \n\t"\
          \
         : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
         : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
@@ -2374,15 +2374,15 @@
     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
 }
 
-#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "	\n\t"
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
 #define AVG_3DNOW_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp "	\n\t"\
-"pavgusb " #temp ", " #a "	\n\t"\
-"mov" #size " " #a ", " #b "	\n\t"
+"mov" #size " " #b ", " #temp "   \n\t"\
+"pavgusb " #temp ", " #a "        \n\t"\
+"mov" #size " " #a ", " #b "      \n\t"
 #define AVG_MMX2_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp "	\n\t"\
-"pavgb " #temp ", " #a "	\n\t"\
-"mov" #size " " #a ", " #b "	\n\t"
+"mov" #size " " #b ", " #temp "   \n\t"\
+"pavgb " #temp ", " #a "          \n\t"\
+"mov" #size " " #a ", " #b "      \n\t"
 
 QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
 QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
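
The OP argument is the only difference between the put_ and avg_ instantiations: PUT_OP is a plain store, while the AVG variants read the destination and average with rounding, pavgb on MMX2 and pavgusb on 3DNow. Both instructions compute the round-up average; per byte (names illustrative):

    #include <stdint.h>

    static uint8_t put_op(uint8_t dst, uint8_t src) { return src; }
    static uint8_t avg_op(uint8_t dst, uint8_t src)
    {
        return (uint8_t)((dst + src + 1) >> 1);   /* pavgb / pavgusb rounding */
    }
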
@@ -2410,40 +2410,40 @@
     scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
 
     asm volatile(
-        "pcmpeqw %%mm6, %%mm6		\n\t" // -1w
-        "psrlw $15, %%mm6		\n\t" //  1w
-        "pxor %%mm7, %%mm7		\n\t"
-        "movd  %4, %%mm5		\n\t"
-        "punpcklwd %%mm5, %%mm5		\n\t"
-        "punpcklwd %%mm5, %%mm5		\n\t"
-        "1:				\n\t"
-        "movq  (%1, %0), %%mm0		\n\t"
-        "movq  8(%1, %0), %%mm1		\n\t"
-        "pmulhw %%mm5, %%mm0		\n\t"
-        "pmulhw %%mm5, %%mm1		\n\t"
-        "paddw %%mm6, %%mm0		\n\t"
-        "paddw %%mm6, %%mm1		\n\t"
-        "psraw $1, %%mm0		\n\t"
-        "psraw $1, %%mm1		\n\t"
-        "paddw (%2, %0), %%mm0		\n\t"
-        "paddw 8(%2, %0), %%mm1		\n\t"
-        "psraw $6, %%mm0		\n\t"
-        "psraw $6, %%mm1		\n\t"
-        "pmullw (%3, %0), %%mm0		\n\t"
-        "pmullw 8(%3, %0), %%mm1	\n\t"
-        "pmaddwd %%mm0, %%mm0		\n\t"
-        "pmaddwd %%mm1, %%mm1		\n\t"
-        "paddd %%mm1, %%mm0		\n\t"
-        "psrld $4, %%mm0		\n\t"
-        "paddd %%mm0, %%mm7		\n\t"
-        "add $16, %0			\n\t"
-        "cmp $128, %0			\n\t" //FIXME optimize & bench
-        " jb 1b				\n\t"
-        "movq %%mm7, %%mm6		\n\t"
-        "psrlq $32, %%mm7		\n\t"
-        "paddd %%mm6, %%mm7		\n\t"
-        "psrld $2, %%mm7		\n\t"
-        "movd %%mm7, %0			\n\t"
+        "pcmpeqw %%mm6, %%mm6           \n\t" // -1w
+        "psrlw $15, %%mm6               \n\t" //  1w
+        "pxor %%mm7, %%mm7              \n\t"
+        "movd  %4, %%mm5                \n\t"
+        "punpcklwd %%mm5, %%mm5         \n\t"
+        "punpcklwd %%mm5, %%mm5         \n\t"
+        "1:                             \n\t"
+        "movq  (%1, %0), %%mm0          \n\t"
+        "movq  8(%1, %0), %%mm1         \n\t"
+        "pmulhw %%mm5, %%mm0            \n\t"
+        "pmulhw %%mm5, %%mm1            \n\t"
+        "paddw %%mm6, %%mm0             \n\t"
+        "paddw %%mm6, %%mm1             \n\t"
+        "psraw $1, %%mm0                \n\t"
+        "psraw $1, %%mm1                \n\t"
+        "paddw (%2, %0), %%mm0          \n\t"
+        "paddw 8(%2, %0), %%mm1         \n\t"
+        "psraw $6, %%mm0                \n\t"
+        "psraw $6, %%mm1                \n\t"
+        "pmullw (%3, %0), %%mm0         \n\t"
+        "pmullw 8(%3, %0), %%mm1        \n\t"
+        "pmaddwd %%mm0, %%mm0           \n\t"
+        "pmaddwd %%mm1, %%mm1           \n\t"
+        "paddd %%mm1, %%mm0             \n\t"
+        "psrld $4, %%mm0                \n\t"
+        "paddd %%mm0, %%mm7             \n\t"
+        "add $16, %0                    \n\t"
+        "cmp $128, %0                   \n\t" //FIXME optimize & bench
+        " jb 1b                         \n\t"
+        "movq %%mm7, %%mm6              \n\t"
+        "psrlq $32, %%mm7               \n\t"
+        "paddd %%mm6, %%mm7             \n\t"
+        "psrld $2, %%mm7                \n\t"
+        "movd %%mm7, %0                 \n\t"
 
         : "+r" (i)
         : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
@@ -2457,27 +2457,27 @@
     if(ABS(scale) < 256){
         scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
         asm volatile(
-                "pcmpeqw %%mm6, %%mm6		\n\t" // -1w
-                "psrlw $15, %%mm6		\n\t" //  1w
-                "movd  %3, %%mm5		\n\t"
-                "punpcklwd %%mm5, %%mm5		\n\t"
-                "punpcklwd %%mm5, %%mm5		\n\t"
-                "1:				\n\t"
-                "movq  (%1, %0), %%mm0		\n\t"
-                "movq  8(%1, %0), %%mm1		\n\t"
-                "pmulhw %%mm5, %%mm0		\n\t"
-                "pmulhw %%mm5, %%mm1		\n\t"
-                "paddw %%mm6, %%mm0		\n\t"
-                "paddw %%mm6, %%mm1		\n\t"
-                "psraw $1, %%mm0		\n\t"
-                "psraw $1, %%mm1		\n\t"
-                "paddw (%2, %0), %%mm0		\n\t"
-                "paddw 8(%2, %0), %%mm1		\n\t"
-                "movq %%mm0, (%2, %0)		\n\t"
-                "movq %%mm1, 8(%2, %0)		\n\t"
-                "add $16, %0			\n\t"
-                "cmp $128, %0			\n\t" //FIXME optimize & bench
-                " jb 1b				\n\t"
+                "pcmpeqw %%mm6, %%mm6   \n\t" // -1w
+                "psrlw $15, %%mm6       \n\t" //  1w
+                "movd  %3, %%mm5        \n\t"
+                "punpcklwd %%mm5, %%mm5 \n\t"
+                "punpcklwd %%mm5, %%mm5 \n\t"
+                "1:                     \n\t"
+                "movq  (%1, %0), %%mm0  \n\t"
+                "movq  8(%1, %0), %%mm1 \n\t"
+                "pmulhw %%mm5, %%mm0    \n\t"
+                "pmulhw %%mm5, %%mm1    \n\t"
+                "paddw %%mm6, %%mm0     \n\t"
+                "paddw %%mm6, %%mm1     \n\t"
+                "psraw $1, %%mm0        \n\t"
+                "psraw $1, %%mm1        \n\t"
+                "paddw (%2, %0), %%mm0  \n\t"
+                "paddw 8(%2, %0), %%mm1 \n\t"
+                "movq %%mm0, (%2, %0)   \n\t"
+                "movq %%mm1, 8(%2, %0)  \n\t"
+                "add $16, %0            \n\t"
+                "cmp $128, %0           \n\t" //FIXME optimize & bench
+                " jb 1b                 \n\t"
 
                 : "+r" (i)
                 : "r"(basis), "r"(rem), "g"(scale)
@@ -2569,10 +2569,10 @@
     mm_flags = mm_support();
 
     if (avctx->dsp_mask) {
-	if (avctx->dsp_mask & FF_MM_FORCE)
-	    mm_flags |= (avctx->dsp_mask & 0xffff);
-	else
-	    mm_flags &= ~(avctx->dsp_mask & 0xffff);
+        if (avctx->dsp_mask & FF_MM_FORCE)
+            mm_flags |= (avctx->dsp_mask & 0xffff);
+        else
+            mm_flags &= ~(avctx->dsp_mask & 0xffff);
     }
 
 #if 0
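
The dsp_mask block just above lets an application override CPU detection: with FF_MM_FORCE set, the low 16 bits of the mask are OR-ed into mm_flags; otherwise they are cleared from it. Hypothetical usage, with the MM_* flag names that appear elsewhere in this file:

    /* Clear the SSE2 and MMX2 bits so only plain-MMX paths are used: */
    avctx->dsp_mask = MM_SSE2 | MM_MMXEXT;

    /* Or force a feature bit on regardless of what mm_support() found: */
    avctx->dsp_mask = FF_MM_FORCE | MM_SSE2;
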
@@ -2598,7 +2598,7 @@
         if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
             if(mm_flags & MM_SSE2){
                 c->fdct = ff_fdct_sse2;
-	    }else if(mm_flags & MM_MMXEXT){
+            }else if(mm_flags & MM_MMXEXT){
                 c->fdct = ff_fdct_mmx2;
             }else{
                 c->fdct = ff_fdct_mmx;
@@ -2709,13 +2709,13 @@
         c->hadamard8_diff[0]= hadamard8_diff16_mmx;
         c->hadamard8_diff[1]= hadamard8_diff_mmx;
 
-	c->pix_norm1 = pix_norm1_mmx;
-	c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
-  	c->sse[1] = sse8_mmx;
+        c->pix_norm1 = pix_norm1_mmx;
+        c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
+        c->sse[1] = sse8_mmx;
         c->vsad[4]= vsad_intra16_mmx;
 
-	c->nsse[0] = nsse16_mmx;
-	c->nsse[1] = nsse8_mmx;
+        c->nsse[0] = nsse16_mmx;
+        c->nsse[1] = nsse8_mmx;
         if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
             c->vsad[0] = vsad16_mmx;
         }
@@ -2729,7 +2729,7 @@
 
         c->h263_v_loop_filter= h263_v_loop_filter_mmx;
         c->h263_h_loop_filter= h263_h_loop_filter_mmx;
-	c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
+        c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
         c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
 
         if (mm_flags & MM_MMXEXT) {
@@ -2829,7 +2829,7 @@
             dspfunc(avg_h264_qpel, 2, 4);
 #undef dspfunc
 
-	    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
+            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
             c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
             c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
@@ -2941,7 +2941,7 @@
             dspfunc(avg_h264_qpel, 1, 8);
             dspfunc(avg_h264_qpel, 2, 4);
 
-	    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
+            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
         }
     }