changeset 386:f49629bab18d libavcodec

hopefully faster mmx2&3dnow MC
author michaelni
date Fri, 17 May 2002 01:04:14 +0000
parents 7ac7a48fbe5e
children b8f3affeb8e1
files i386/dsputil_mmx.c i386/dsputil_mmx_avg.h
diffstat 2 files changed, 363 insertions(+), 324 deletions(-) [+]
line wrap: on
line diff
--- a/i386/dsputil_mmx.c	Thu May 16 23:29:09 2002 +0000
+++ b/i386/dsputil_mmx.c	Fri May 17 01:04:14 2002 +0000
@@ -21,6 +21,7 @@
 
 #include "../dsputil.h"
 #include "../simple_idct.h"
+#include "../mangle.h"
 
 int mm_flags; /* multimedia extension flags */
 
@@ -49,6 +50,7 @@
 void ff_mmxext_idct(DCTELEM *block);
 
 /* pixel operations */
+static const unsigned long long int mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101LL;
 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL;
 static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL;
 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
@@ -90,7 +92,7 @@
 /***********************************/
 /* MMX2 specific */
 
-#define DEF(x) x ## _sse
+#define DEF(x) x ## _mmx2
 
 /* Introduced only in MMX2 set */
 #define PAVGB "pavgb"
@@ -105,41 +107,38 @@
 
 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
 {
-    DCTELEM *p;
-    const UINT8 *pix;
-    int i;
-
-    /* read the pixels */
-    p = block;
-    pix = pixels;
-    MOVQ_ZERO(mm7);
-    for(i=0;i<4;i++) {
-	__asm __volatile(
-		"movq	%1, %%mm0\n\t"
-		"movq	%2, %%mm1\n\t"
-		"movq	%%mm0, %%mm2\n\t"
-		"movq	%%mm1, %%mm3\n\t"
-		"punpcklbw %%mm7, %%mm0\n\t"
-		"punpckhbw %%mm7, %%mm2\n\t"
-		"punpcklbw %%mm7, %%mm1\n\t"
-		"punpckhbw %%mm7, %%mm3\n\t"
-		"movq	%%mm0, %0\n\t"
-		"movq	%%mm2, 8%0\n\t"
-		"movq	%%mm1, 16%0\n\t"
-		"movq	%%mm3, 24%0\n\t"
-		:"=m"(*p)
-		:"m"(*pix), "m"(*(pix+line_size))
-		:"memory");
-        pix += line_size*2;
-        p += 16;
-    }
+    asm volatile(
+        "movl $-128, %%eax	\n\t"
+        "pxor %%mm7, %%mm7	\n\t"
+        ".balign 16		\n\t"
+        "1:			\n\t"
+        "movq (%0), %%mm0	\n\t"
+        "movq (%0, %2), %%mm2	\n\t"
+        "movq %%mm0, %%mm1	\n\t"
+        "movq %%mm2, %%mm3	\n\t"
+        "punpcklbw %%mm7, %%mm0	\n\t"
+        "punpckhbw %%mm7, %%mm1	\n\t"
+        "punpcklbw %%mm7, %%mm2	\n\t"
+        "punpckhbw %%mm7, %%mm3	\n\t"
+        "movq %%mm0, (%1, %%eax)\n\t"
+        "movq %%mm1, 8(%1, %%eax)\n\t"
+        "movq %%mm2, 16(%1, %%eax)\n\t"
+        "movq %%mm3, 24(%1, %%eax)\n\t"
+        "addl %3, %0		\n\t"
+        "addl $32, %%eax	\n\t"
+        "js 1b			\n\t"
+        : "+r" (pixels)
+        : "r" (block+64), "r" (line_size), "r" (line_size*2)
+        : "%eax"
+    );
 }
 
 static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
 {
     asm volatile(
+        "pxor %%mm7, %%mm7	\n\t"
+        "movl $-128, %%eax	\n\t"
         ".balign 16		\n\t"
-        "movl $-128, %%eax	\n\t"
         "1:			\n\t"
         "movq (%0), %%mm0	\n\t"
         "movq (%1), %%mm2	\n\t"
@@ -261,56 +260,62 @@
 
 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-    int hh;
-    UINT8 *p;
-    const UINT8 *pix;
-
-    p   = block;
-    pix = pixels; // 2s
-#if 0
-    do {
-      __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-	pix += line_size;
-	p += line_size;
-    } while (--h);
+#if 0 //FIXME: disabled, this 8-rows-per-iteration loop cannot handle the h==4 case
+    asm volatile(
+        "xorl %%eax, %%eax		\n\t"
+        "movl %3, %%esi			\n\t"
+        "1:				\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "subl $8, %%esi			\n\t"
+        " jnz 1b			\n\t"
+    :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
+    : "%eax", "%esi", "memory"
+    );
 #else
-    // this optimized code is not very usefull
-    // the above loop is definitely faster
-    // at least on Celeron 500MHz
-    hh = h & 3;
-    while (hh) {
-      __asm __volatile(
-	  "movq	%1, %%mm0\n\t"
-	  "movq	%%mm0, %0\n\t"
-	  :"=m"(*p)
-	  :"m"(*pix)
-	  :"memory");
-	pix += line_size;
-	p += line_size;
-	hh--;
-    }
-    hh=h>>2;
-    while (hh) {
-    __asm __volatile(
-	"movq	(%1), %%mm0		\n\t"
-	"movq	(%1, %2), %%mm1		\n\t"
-	"movq	(%1, %2, 2), %%mm2	\n\t"
-	"movq	(%1, %3), %%mm3		\n\t"
-	"movq	%%mm0, (%0)		\n\t"
-	"movq	%%mm1, (%0, %2)		\n\t"
-	"movq	%%mm2, (%0, %2, 2)	\n\t"
-	"movq	%%mm3, (%0, %3)		\n\t"
-	::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3)
-	:"memory");
-        pix += line_size*4;
-	p += line_size*4;
-        hh--;
-    }
+    asm volatile(
+        "xorl %%eax, %%eax		\n\t"
+        "movl %3, %%esi			\n\t"
+        "1:				\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq %%mm0, (%0, %%eax)	\n\t"
+        "addl %2, %%eax			\n\t"
+        "subl $4, %%esi			\n\t"
+        " jnz 1b			\n\t"
+    :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
+    : "%eax", "%esi", "memory"
+    );
 #endif
 }
 
@@ -1124,7 +1129,7 @@
         avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
         avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
         avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
-        
+
         sub_pixels_tab[0] = sub_pixels_mmx;
         sub_pixels_tab[1] = sub_pixels_x2_mmx;
         sub_pixels_tab[2] = sub_pixels_y2_mmx;
@@ -1140,20 +1145,24 @@
             pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
             pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
             pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
-            
-            put_pixels_tab[1] = put_pixels_x2_sse;
-            put_pixels_tab[2] = put_pixels_y2_sse;
+
+            put_pixels_tab[1] = put_pixels_x2_mmx2;
+            put_pixels_tab[2] = put_pixels_y2_mmx2;
+            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
+            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
             
-            avg_pixels_tab[0] = avg_pixels_sse;
-            avg_pixels_tab[1] = avg_pixels_x2_sse;
-            avg_pixels_tab[2] = avg_pixels_y2_sse;
-            avg_pixels_tab[3] = avg_pixels_xy2_sse;
+            avg_pixels_tab[0] = avg_pixels_mmx2;
+            avg_pixels_tab[1] = avg_pixels_x2_mmx2;
+            avg_pixels_tab[2] = avg_pixels_y2_mmx2;
+            avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
 
-            sub_pixels_tab[1] = sub_pixels_x2_sse;
-            sub_pixels_tab[2] = sub_pixels_y2_sse;
+            sub_pixels_tab[1] = sub_pixels_x2_mmx2;
+            sub_pixels_tab[2] = sub_pixels_y2_mmx2;
         } else if (mm_flags & MM_3DNOW) {
             put_pixels_tab[1] = put_pixels_x2_3dnow;
             put_pixels_tab[2] = put_pixels_y2_3dnow;
+            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
+            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
             
             avg_pixels_tab[0] = avg_pixels_3dnow;
             avg_pixels_tab[1] = avg_pixels_x2_3dnow;
--- a/i386/dsputil_mmx_avg.h	Thu May 16 23:29:09 2002 +0000
+++ b/i386/dsputil_mmx_avg.h	Fri May 17 01:04:14 2002 +0000
@@ -1,6 +1,7 @@
 /*
  * DSP utils : average functions are compiled twice for 3dnow/mmx2
  * Copyright (c) 2000, 2001 Gerard Lantau.
+ * Copyright (c) 2002 Michael Niedermayer
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -17,271 +18,300 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  */
 
 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  int dh, hh;
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>2;
-  dh=h&3;
-  while(hh--) {
+    __asm __volatile(
+        "xorl %%eax, %%eax		\n\t"
+        ".balign 16			\n\t"
+        "1:				\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq 1(%1, %%eax), %%mm1	\n\t"
+	"movq (%2, %%eax), %%mm2	\n\t"
+	"movq 1(%2, %%eax), %%mm3	\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	"movq %%mm0, (%3, %%eax)	\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+        "addl %5, %%eax			\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq 1(%1, %%eax), %%mm1	\n\t"
+	"movq (%2, %%eax), %%mm2	\n\t"
+	"movq 1(%2, %%eax), %%mm3	\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	"movq %%mm0, (%3, %%eax)	\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+        "addl %5, %%eax			\n\t"
+        "subl $4, %0			\n\t"
+        " jnz 1b			\n\t"
+	:"+g"(h)
+        :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
+         "r"(line_size<<1)
+	:"%eax", "memory");
+}
+ 
+static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
     __asm __volatile(
-	"movq	(%1), %%mm0\n\t"
-	"movq	1(%1), %%mm1\n\t"
-	"movq	(%1, %2), %%mm2\n\t"
-	"movq	1(%1, %2), %%mm3\n\t"
-	"movq	(%1, %2, 2), %%mm4\n\t"
-	"movq	1(%1, %2, 2), %%mm5\n\t"
-	"movq	(%1, %3), %%mm6\n\t"
-	"movq	1(%1, %3), %%mm7\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	PAVGB"  %%mm3, %%mm2\n\t"
-	PAVGB"  %%mm5, %%mm4\n\t"
-	PAVGB"  %%mm7, %%mm6\n\t"
-	"movq	%%mm0, (%0)\n\t"
-	"movq	%%mm2, (%0, %2)\n\t"
-	"movq	%%mm4, (%0, %2, 2)\n\t"
-	"movq	%%mm6, (%0, %3)\n\t"
-	::"r"(p), "r"(pix), "r" (line_size), "r" (line_size*3)
-	:"memory");
-     pix += line_size*4; p += line_size*4;
-  }
-  while(dh--) {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	1%1, %%mm1\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-     pix += line_size; p += line_size;
-  }
+        "xorl %%eax, %%eax		\n\t"
+        "movq "MANGLE(mm_bone)", %%mm7	\n\t"
+        ".balign 16			\n\t"
+        "1:				\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq 1(%1, %%eax), %%mm1	\n\t"
+	"movq (%2, %%eax), %%mm2	\n\t"
+	"movq 1(%2, %%eax), %%mm3	\n\t"
+        "psubusb %%mm7, %%mm0		\n\t"
+        "psubusb %%mm7, %%mm2		\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	"movq %%mm0, (%3, %%eax)	\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+        "addl %5, %%eax			\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq 1(%1, %%eax), %%mm1	\n\t"
+	"movq (%2, %%eax), %%mm2	\n\t"
+	"movq 1(%2, %%eax), %%mm3	\n\t"
+        "psubusb %%mm7, %%mm0		\n\t"
+        "psubusb %%mm7, %%mm2		\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	"movq %%mm0, (%3, %%eax)	\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+        "addl %5, %%eax			\n\t"
+        "subl $4, %0			\n\t"
+        " jnz 1b			\n\t"
+	:"+g"(h)
+        :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
+         "r"(line_size<<1)
+	:"%eax", "memory");
 }
 
 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  int dh, hh;
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-
-  hh=h>>1;
-  dh=h&1;
-  while(hh--) {
     __asm __volatile(
-	"movq	%2, %%mm0\n\t"
-	"movq	%3, %%mm1\n\t"
-	"movq	%4, %%mm2\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	PAVGB"  %%mm2, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, %1\n\t"
-	:"=m"(*p), "=m"(*(p+line_size))
-	:"m"(*pix), "m"(*(pix+line_size)),
-	 "m"(*(pix+line_size*2))
-	:"memory");
-     pix += line_size*2;
-     p += line_size*2;
-  }
-  if(dh) {
+        "xorl %%eax, %%eax		\n\t"
+	"movq (%1), %%mm0		\n\t"
+        ".balign 16			\n\t"
+        "1:				\n\t"
+	"movq (%2, %%eax), %%mm1	\n\t"
+	"movq (%3, %%eax), %%mm2	\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm2, %%mm1		\n\t"
+	"movq %%mm0, (%4, %%eax)	\n\t"
+	"movq %%mm1, (%5, %%eax)	\n\t"
+        "addl %6, %%eax			\n\t"
+	"movq (%2, %%eax), %%mm1	\n\t"
+	"movq (%3, %%eax), %%mm0	\n\t"
+	PAVGB" %%mm1, %%mm2		\n\t"
+	PAVGB" %%mm0, %%mm1		\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+	"movq %%mm1, (%5, %%eax)	\n\t"
+        "addl %6, %%eax			\n\t"
+        "subl $4, %0			\n\t"
+        " jnz 1b			\n\t"
+	:"+g"(h)
+	:"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), 
+         "r" (block+line_size), "g"(line_size<<1)
+	:"%eax",  "memory");
+}
+
+static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
     __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-  }
+        "movq "MANGLE(mm_bone)", %%mm7	\n\t"
+        "xorl %%eax, %%eax		\n\t"
+	"movq (%1), %%mm0		\n\t"
+        ".balign 16			\n\t"
+        "1:				\n\t"
+	"movq (%2, %%eax), %%mm1	\n\t"
+	"movq (%3, %%eax), %%mm2	\n\t"
+        "psubusb %%mm7, %%mm1		\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm2, %%mm1		\n\t"
+	"movq %%mm0, (%4, %%eax)	\n\t"
+	"movq %%mm1, (%5, %%eax)	\n\t"
+        "addl %6, %%eax			\n\t"
+	"movq (%2, %%eax), %%mm1	\n\t"
+	"movq (%3, %%eax), %%mm0	\n\t"
+        "psubusb %%mm7, %%mm1		\n\t"
+	PAVGB" %%mm1, %%mm2		\n\t"
+	PAVGB" %%mm0, %%mm1		\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+	"movq %%mm1, (%5, %%eax)	\n\t"
+        "addl %6, %%eax			\n\t"
+        "subl $4, %0			\n\t"
+        " jnz 1b			\n\t"
+	:"+g"(h)
+	:"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), 
+         "r" (block+line_size), "g"(line_size<<1)
+	:"%eax",  "memory");
 }
 
 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  int dh, hh;
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>2;
-  dh=h&3;
-  while(hh--) {
     __asm __volatile(
-	"movq	(%0), %%mm0\n\t"
-	"movq	(%1), %%mm1\n\t"
-	"movq	(%0, %2), %%mm2\n\t"
-	"movq	(%1, %2), %%mm3\n\t"
-	"movq	(%0, %2, 2), %%mm4\n\t"
-	"movq	(%1, %2, 2), %%mm5\n\t"
-	"movq	(%0, %3), %%mm6\n\t"
-	"movq	(%1, %3), %%mm7\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	PAVGB"  %%mm3, %%mm2\n\t"
-	PAVGB"  %%mm5, %%mm4\n\t"
-	PAVGB"  %%mm7, %%mm6\n\t"
-	"movq	%%mm0, (%0)\n\t"
-	"movq	%%mm2, (%0, %2)\n\t"
-	"movq	%%mm4, (%0, %2, 2)\n\t"
-	"movq	%%mm6, (%0, %3)\n\t"
-	::"r"(p), "r"(pix), "r" (line_size), "r" (line_size*3)
-	:"memory");
-     pix += line_size*4; p += line_size*4;
-  }
-  while(dh--) {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-     pix += line_size; p += line_size;
-  }
+        "xorl %%eax, %%eax		\n\t"
+        ".balign 16			\n\t"
+        "1:				\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq (%2, %%eax), %%mm2	\n\t"
+	"movq (%3, %%eax), %%mm3	\n\t"
+	"movq (%4, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm0		\n\t"
+	PAVGB" %%mm4, %%mm2		\n\t"
+	"movq %%mm0, (%3, %%eax)	\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+        "addl %5, %%eax			\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq (%2, %%eax), %%mm2	\n\t"
+	"movq (%3, %%eax), %%mm3	\n\t"
+	"movq (%4, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm0		\n\t"
+	PAVGB" %%mm4, %%mm2		\n\t"
+	"movq %%mm0, (%3, %%eax)	\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+        "addl %5, %%eax			\n\t"
+        "subl $4, %0			\n\t"
+        " jnz 1b			\n\t"
+	:"+g"(h)
+        :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
+         "r"(line_size<<1)
+	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels_x2)( UINT8  *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  int dh, hh;
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>1;
-  dh=h&1;
-  while(hh--) {
     __asm __volatile(
-	"movq	%2, %%mm2\n\t"
-	"movq	1%2, %%mm3\n\t"
-	"movq	%3, %%mm4\n\t"
-	"movq	1%3, %%mm5\n\t"
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	PAVGB"	%%mm3, %%mm2\n\t"
-	PAVGB"	%%mm2, %%mm0\n\t"
-	PAVGB"	%%mm5, %%mm4\n\t"
-	PAVGB"	%%mm4, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, %1\n\t"
-	:"+m"(*p), "+m"(*(p+line_size))
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size*2;
-   p +=   line_size*2;
-  }
-  if(dh) {
-    __asm __volatile(
-	"movq	%1, %%mm1\n\t"
-	"movq	1%1, %%mm2\n\t"
-	"movq	%0, %%mm0\n\t"
-	PAVGB"	%%mm2, %%mm1\n\t"
-	PAVGB"	%%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-  }
+        "xorl %%eax, %%eax		\n\t"
+        ".balign 16			\n\t"
+        "1:				\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq 1(%1, %%eax), %%mm1	\n\t"
+	"movq (%2, %%eax), %%mm2	\n\t"
+	"movq 1(%2, %%eax), %%mm3	\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	"movq (%3, %%eax), %%mm3	\n\t"
+	"movq (%4, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm0		\n\t"
+	PAVGB" %%mm4, %%mm2		\n\t"
+	"movq %%mm0, (%3, %%eax)	\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+        "addl %5, %%eax			\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq 1(%1, %%eax), %%mm1	\n\t"
+	"movq (%2, %%eax), %%mm2	\n\t"
+	"movq 1(%2, %%eax), %%mm3	\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	"movq (%3, %%eax), %%mm3	\n\t"
+	"movq (%4, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm0		\n\t"
+	PAVGB" %%mm4, %%mm2		\n\t"
+	"movq %%mm0, (%3, %%eax)	\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+        "addl %5, %%eax			\n\t"
+        "subl $4, %0			\n\t"
+        " jnz 1b			\n\t"
+	:"+g"(h)
+        :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
+         "r"(line_size<<1)
+	:"%eax", "memory");
 }
 
-static void  DEF(avg_pixels_y2)( UINT8  *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  int dh, hh;
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>1;
-  dh=h&1;
-  while(hh--) {
     __asm __volatile(
-	"movq	%2, %%mm2\n\t"
-	"movq	%3, %%mm3\n\t"
-	"movq	%3, %%mm4\n\t"
-	"movq	%4, %%mm5\n\t"
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	PAVGB"	%%mm3, %%mm2\n\t"
-	PAVGB"	%%mm2, %%mm0\n\t"
-	PAVGB"	%%mm5, %%mm4\n\t"
-	PAVGB"	%%mm4, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, %1\n\t"
-	:"+m"(*p), "+m"(*(p+line_size))
-	:"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2))
-	:"memory");
-   pix += line_size*2;
-   p +=   line_size*2;
-  }
-  if(dh) {
-    __asm __volatile(
-	"movq	%1, %%mm1\n\t"
-	"movq	%2, %%mm2\n\t"
-	"movq	%0, %%mm0\n\t"
-	PAVGB"	%%mm2, %%mm1\n\t"
-	PAVGB"	%%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-  }
+        "xorl %%eax, %%eax		\n\t"
+	"movq (%1), %%mm0		\n\t"
+        ".balign 16			\n\t"
+        "1:				\n\t"
+	"movq (%2, %%eax), %%mm1	\n\t"
+	"movq (%3, %%eax), %%mm2	\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm2, %%mm1		\n\t"
+	"movq (%4, %%eax), %%mm3	\n\t"
+	"movq (%5, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm0		\n\t"
+	PAVGB" %%mm4, %%mm1		\n\t"
+	"movq %%mm0, (%4, %%eax)	\n\t"
+	"movq %%mm1, (%5, %%eax)	\n\t"
+        "addl %6, %%eax			\n\t"
+	"movq (%2, %%eax), %%mm1	\n\t"
+	"movq (%3, %%eax), %%mm0	\n\t"
+	PAVGB" %%mm1, %%mm2		\n\t"
+	PAVGB" %%mm0, %%mm1		\n\t"
+	"movq (%4, %%eax), %%mm3	\n\t"
+	"movq (%5, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	PAVGB" %%mm4, %%mm1		\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+	"movq %%mm1, (%5, %%eax)	\n\t"
+        "addl %6, %%eax			\n\t"
+        "subl $4, %0			\n\t"
+        " jnz 1b			\n\t"
+	:"+g"(h)
+	:"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), 
+         "r" (block+line_size), "g"(line_size<<1)
+	:"%eax",  "memory");
 }
 
-static void DEF(avg_pixels_xy2)( UINT8  *block, const UINT8 *pixels, int line_size, int h)
+// Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter
+static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wtwo));
-  do {
     __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	PAVGB"	%0, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while(--h);
+        "movq "MANGLE(mm_bone)", %%mm7	\n\t"
+        "xorl %%eax, %%eax		\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq 1(%1), %%mm1		\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+        ".balign 16			\n\t"
+        "1:				\n\t"
+	"movq (%2, %%eax), %%mm1	\n\t"
+	"movq (%3, %%eax), %%mm2	\n\t"
+	"movq 1(%2, %%eax), %%mm3	\n\t"
+	"movq 1(%3, %%eax), %%mm4	\n\t"
+        "psubusb %%mm7, %%mm2		\n\t"
+	PAVGB" %%mm3, %%mm1		\n\t"
+	PAVGB" %%mm4, %%mm2		\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm2, %%mm1		\n\t"
+	"movq (%4, %%eax), %%mm3	\n\t"
+	"movq (%5, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm0		\n\t"
+	PAVGB" %%mm4, %%mm1		\n\t"
+	"movq %%mm0, (%4, %%eax)	\n\t"
+	"movq %%mm1, (%5, %%eax)	\n\t"
+        "addl %6, %%eax			\n\t"
+	"movq (%2, %%eax), %%mm1	\n\t"
+	"movq (%3, %%eax), %%mm0	\n\t"
+	"movq 1(%2, %%eax), %%mm3	\n\t"
+	"movq 1(%3, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm1		\n\t"
+	PAVGB" %%mm4, %%mm0		\n\t"
+	PAVGB" %%mm1, %%mm2		\n\t"
+	PAVGB" %%mm0, %%mm1		\n\t"
+	"movq (%4, %%eax), %%mm3	\n\t"
+	"movq (%5, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	PAVGB" %%mm4, %%mm1		\n\t"
+	"movq %%mm2, (%4, %%eax)	\n\t"
+	"movq %%mm1, (%5, %%eax)	\n\t"
+        "addl %6, %%eax			\n\t"
+        "subl $4, %0			\n\t"
+        " jnz 1b			\n\t"
+	:"+g"(h)
+	:"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), 
+         "r" (block+line_size), "g"(line_size<<1)
+	:"%eax",  "memory");
 }
 
+//Note: the sub* functions are not used
+
 static void DEF(sub_pixels_x2)( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
 {
   DCTELEM  *p;