diff i386/dsputil_mmx_avg.h @ 651:45e8f39fda50 libavcodec

put/avg_pixels16 fixing 2 small qpel bugs
author michaelni
date Wed, 11 Sep 2002 12:39:53 +0000
parents e8c8ca9106aa
children 13aec7e50c52
line wrap: on
line diff
--- a/i386/dsputil_mmx_avg.h	Wed Sep 11 07:26:43 2002 +0000
+++ b/i386/dsputil_mmx_avg.h	Wed Sep 11 12:39:53 2002 +0000
@@ -25,7 +25,7 @@
 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
    clobber bug - now it will work with 2.95.2 and also with -fPIC
  */
-static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -52,9 +52,49 @@
 	:"r" (line_size)
 	:"%eax", "memory");
 }
+
+static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
+    __asm __volatile(
+	"lea (%3, %3), %%eax		\n\t"
+	"1:				\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq 8(%1), %%mm2		\n\t"
+	"movq 8(%1, %3), %%mm3		\n\t"
+	PAVGB" 1(%1), %%mm0		\n\t"
+	PAVGB" 1(%1, %3), %%mm1		\n\t"
+	PAVGB" 9(%1), %%mm2		\n\t"
+	PAVGB" 9(%1, %3), %%mm3		\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"movq %%mm2, 8(%2)		\n\t"
+	"movq %%mm3, 8(%2, %3)		\n\t"
+	"addl %%eax, %1			\n\t"
+	"addl %%eax, %2			\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq 8(%1), %%mm2		\n\t"
+	"movq 8(%1, %3), %%mm3		\n\t"
+	PAVGB" 1(%1), %%mm0		\n\t"
+	PAVGB" 1(%1, %3), %%mm1		\n\t"
+	PAVGB" 9(%1), %%mm2		\n\t"
+	PAVGB" 9(%1, %3), %%mm3		\n\t"
+	"addl %%eax, %1			\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"movq %%mm2, 8(%2)		\n\t"
+	"movq %%mm3, 8(%2, %3)		\n\t"
+	"addl %%eax, %2			\n\t"
+	"subl $4, %0			\n\t"
+	"jnz 1b				\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r" (line_size)
+	:"%eax", "memory");
+}
  
 /* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
@@ -91,7 +131,7 @@
 	:"%eax", "memory");
 }
 
-static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -122,7 +162,7 @@
 }
 
 /* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
@@ -155,7 +195,7 @@
 	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -183,7 +223,7 @@
 	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -215,7 +255,7 @@
 	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -254,7 +294,7 @@
 }
 
 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter 
-static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
@@ -294,3 +334,34 @@
 	:"r" (line_size)
 	:"%eax",  "memory");
 }
+
+//FIXME the following could be optimized too ...
+static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
+    DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(put_pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(avg_pixels8)(block  , pixels  , line_size, h);
+    DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(avg_pixels8_x2)(block  , pixels  , line_size, h);
+    DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(avg_pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(avg_pixels8_xy2)(block  , pixels  , line_size, h);
+    DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
+