diff i386/dsputil_mmx.c @ 651:45e8f39fda50 libavcodec

put/avg_pixels16 fixing 2 small qpel bugs
author michaelni
date Wed, 11 Sep 2002 12:39:53 +0000
parents 47a8964ba5cd
children 894b61908734
line wrap: on
line diff
--- a/i386/dsputil_mmx.c	Wed Sep 11 07:26:43 2002 +0000
+++ b/i386/dsputil_mmx.c	Wed Sep 11 12:39:53 2002 +0000
@@ -343,7 +343,7 @@
     } while (--i);
 }
 
-static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	 "lea (%3, %3), %%eax		\n\t"
@@ -369,6 +369,40 @@
 	);
 }
 
+static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) /* copy h rows of 16 bytes from pixels to block; both use line_size as the row stride */
+{ /* NOTE(review): h must be a positive multiple of 4 - each pass subtracts 4 and the loop exits only when h reaches exactly 0 */
+    __asm __volatile( /* NOTE(review): mm0/mm1/mm4/mm5 are used but not listed as clobbers, and no emms is issued here - presumably left to callers; confirm */
+	 "lea (%3, %3), %%eax		\n\t" /* eax = 2 * line_size, the byte step for two rows */
+	 ".balign 8			\n\t" /* align the loop entry label */
+	 "1:				\n\t" /* loop head: one pass copies four rows */
+	 "movq (%1), %%mm0		\n\t" /* row 0, bytes 0..7 */
+	 "movq 8(%1), %%mm4		\n\t" /* row 0, bytes 8..15 */
+	 "movq (%1, %3), %%mm1		\n\t" /* row 1, bytes 0..7 */
+	 "movq 8(%1, %3), %%mm5		\n\t" /* row 1, bytes 8..15 */
+     	 "movq %%mm0, (%2)		\n\t" /* store row 0 to block */
+     	 "movq %%mm4, 8(%2)		\n\t"
+	 "movq %%mm1, (%2, %3)		\n\t" /* store row 1 to block */
+	 "movq %%mm5, 8(%2, %3)		\n\t"
+	 "addl %%eax, %1		\n\t" /* advance pixels by two rows (32-bit add on the pointer operand) */
+         "addl %%eax, %2       		\n\t" /* advance block by two rows */
+	 "movq (%1), %%mm0		\n\t" /* rows 2 and 3: same load/store pattern as above */
+	 "movq 8(%1), %%mm4		\n\t"
+	 "movq (%1, %3), %%mm1		\n\t"
+	 "movq 8(%1, %3), %%mm5		\n\t"
+	 "movq %%mm0, (%2)		\n\t"
+	 "movq %%mm4, 8(%2)		\n\t"
+	 "movq %%mm1, (%2, %3)		\n\t"
+	 "movq %%mm5, 8(%2, %3)		\n\t"
+	 "addl %%eax, %1		\n\t" /* advance pixels by two more rows */
+	 "addl %%eax, %2       		\n\t" /* advance block by two more rows */
+	 "subl $4, %0			\n\t" /* four rows consumed this pass */
+	 "jnz 1b			\n\t" /* repeat until h == 0 */
+	 : "+g"(h), "+r" (pixels),  "+r" (block) /* %0 = h, %1 = pixels, %2 = block; all read and modified by the asm */
+	 : "r"(line_size) /* %3 = line_size, read only */
+	 : "%eax", "memory" /* eax is scratch; memory is written through block */
+	);
+}
+
 static void clear_blocks_mmx(DCTELEM *blocks)
 {
     __asm __volatile(
@@ -424,25 +458,45 @@
         pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
         pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
 
-        put_pixels_tab[0] = put_pixels_mmx;
-        put_pixels_tab[1] = put_pixels_x2_mmx;
-        put_pixels_tab[2] = put_pixels_y2_mmx;
-        put_pixels_tab[3] = put_pixels_xy2_mmx;
+        put_pixels_tab[0][0] = put_pixels16_mmx;
+        put_pixels_tab[0][1] = put_pixels16_x2_mmx;
+        put_pixels_tab[0][2] = put_pixels16_y2_mmx;
+        put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
 
-        put_no_rnd_pixels_tab[0] = put_pixels_mmx;
-        put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
-        put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
-        put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
+        put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
+        put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
+        put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
+        put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
+
+        avg_pixels_tab[0][0] = avg_pixels16_mmx;
+        avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
+        avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
+        avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
 
-        avg_pixels_tab[0] = avg_pixels_mmx;
-        avg_pixels_tab[1] = avg_pixels_x2_mmx;
-        avg_pixels_tab[2] = avg_pixels_y2_mmx;
-        avg_pixels_tab[3] = avg_pixels_xy2_mmx;
+        avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
+        avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
+        avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
+        avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
+        
+        put_pixels_tab[1][0] = put_pixels8_mmx;
+        put_pixels_tab[1][1] = put_pixels8_x2_mmx;
+        put_pixels_tab[1][2] = put_pixels8_y2_mmx;
+        put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
 
-        avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
-        avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
-        avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
-        avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
+        put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
+        put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
+        put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
+        put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
+
+        avg_pixels_tab[1][0] = avg_pixels8_mmx;
+        avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
+        avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
+        avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
+
+        avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
+        avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
+        avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
+        avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
 
         if (mm_flags & MM_MMXEXT) {
             pix_abs16x16    = pix_abs16x16_mmx2;
@@ -455,25 +509,45 @@
             pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
             pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
 
-            put_pixels_tab[1] = put_pixels_x2_mmx2;
-            put_pixels_tab[2] = put_pixels_y2_mmx2;
-            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
-            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
+            put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
+            put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
+            put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
+            put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
+
+            avg_pixels_tab[0][0] = avg_pixels16_mmx2;
+            avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
+            avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
+            avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
 
-            avg_pixels_tab[0] = avg_pixels_mmx2;
-            avg_pixels_tab[1] = avg_pixels_x2_mmx2;
-            avg_pixels_tab[2] = avg_pixels_y2_mmx2;
-            avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
+            put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
+            put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
+            put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
+            put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
+
+            avg_pixels_tab[1][0] = avg_pixels8_mmx2;
+            avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
+            avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
+            avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
         } else if (mm_flags & MM_3DNOW) {
-            put_pixels_tab[1] = put_pixels_x2_3dnow;
-            put_pixels_tab[2] = put_pixels_y2_3dnow;
-            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
-            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
+            put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
+            put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
+            put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
+            put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
 
-            avg_pixels_tab[0] = avg_pixels_3dnow;
-            avg_pixels_tab[1] = avg_pixels_x2_3dnow;
-            avg_pixels_tab[2] = avg_pixels_y2_3dnow;
-            avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
+            avg_pixels_tab[0][0] = avg_pixels16_3dnow;
+            avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
+            avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+            avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
+            
+            put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
+            put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
+            put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
+            put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
+
+            avg_pixels_tab[1][0] = avg_pixels8_3dnow;
+            avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
+            avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
+            avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
         }
 
         /* idct */
@@ -552,21 +626,22 @@
 void dsputil_set_bit_exact_mmx(void)
 {
     if (mm_flags & MM_MMX) {
+    
+        /* MMX2 & 3DNOW */
+        put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
+        put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
+        avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
+        put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
+        put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
+        avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
+
         if (mm_flags & MM_MMXEXT) {
-            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
-            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
-            avg_pixels_tab[3] = avg_pixels_xy2_mmx;
-
             pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
             pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
             pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
             pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
             pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
             pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
-        } else if (mm_flags & MM_3DNOW) {
-            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
-            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
-            avg_pixels_tab[3] = avg_pixels_xy2_mmx;
         }
 #ifdef SIMPLE_IDCT
         if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx)