libavcodec.hg: i386/dsputil_mmx

comparison i386/dsputil_mmx_avg.h @ 651:45e8f39fda50 libavcodec

put/avg_pixels16 fixing 2 small qpel bugs

author	michaelni
date	Wed, 11 Sep 2002 12:39:53 +0000
parents	e8c8ca9106aa
children	13aec7e50c52

comparison

equal deleted inserted replaced

-:ef4a33aad86e
+:45e8f39fda50
 */
 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
 clobber bug - now it will work with 2.95.2 and also with -fPIC
 */
-static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) /* per row: 8 bytes at block = PAVGB(8 bytes at pixels, 8 bytes at pixels+1) */
 { /* asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size (any register) */
 __asm __volatile( /* handles 4 rows per loop pass; NOTE(review): loop only exits when h hits exactly 0, so h is presumably a positive multiple of 4 - confirm with callers */
 	"lea (%3, %3), %%eax		\n\t" /* eax = 2 * line_size (two-row stride) */
 	"1:				\n\t" /* loop head */
 	"movq (%1), %%mm0		\n\t" /* mm0 = 8 source bytes of row 0 */
 	"movq (%1, %3), %%mm1		\n\t" /* mm1 = 8 source bytes of row 1 */
 	PAVGB" 1(%1), %%mm0		\n\t" /* combine row 0 with the same row shifted by one byte; PAVGB is a macro defined outside this view (presumably a byte-wise average mnemonic) */
 	PAVGB" 1(%1, %3), %%mm1		\n\t" /* same for row 1 */
 	"movq %%mm0, (%2)		\n\t" /* store row 0 to block */
 	"movq %%mm1, (%2, %3)		\n\t" /* store row 1 to block */
 	"addl %%eax, %1			\n\t" /* pixels += 2 rows */
 	"addl %%eax, %2			\n\t" /* block += 2 rows */
 	"movq (%1), %%mm0		\n\t" /* rows 2 and 3: same sequence as above */
 	"movq (%1, %3), %%mm1		\n\t"
 	PAVGB" 1(%1), %%mm0		\n\t"
 	PAVGB" 1(%1, %3), %%mm1		\n\t"
 	"addl %%eax, %1			\n\t" /* pixels += 2 rows (done before the stores; the stores only use %2) */
 	"movq %%mm0, (%2)		\n\t"
 	"movq %%mm1, (%2, %3)		\n\t"
+	"addl %%eax, %2			\n\t" /* block += 2 rows */
+	"subl $4, %0			\n\t" /* h -= 4 (four rows were written in this pass) */
+	"jnz 1b				\n\t" /* repeat while h != 0 */
+	:"+g"(h), "+S"(pixels), "+D"(block) /* read-write operands: all three are modified by the asm */
+	:"r" (line_size) /* read-only input */
+	:"%eax", "memory"); /* eax is scratch; "memory" because the asm stores through block */
+}
+static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) /* per row: 16 bytes at block = PAVGB(16 bytes at pixels, 16 bytes at pixels+1), done as two 8-byte halves */
+{ /* asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size (any register) */
+__asm __volatile( /* handles 4 rows per loop pass; NOTE(review): loop only exits when h hits exactly 0, so h is presumably a positive multiple of 4 - confirm with callers */
+	"lea (%3, %3), %%eax		\n\t" /* eax = 2 * line_size (two-row stride) */
+	"1:				\n\t" /* loop head */
+	"movq (%1), %%mm0		\n\t" /* mm0 = row 0, bytes 0..7 */
+	"movq (%1, %3), %%mm1		\n\t" /* mm1 = row 1, bytes 0..7 */
+	"movq 8(%1), %%mm2		\n\t" /* mm2 = row 0, bytes 8..15 */
+	"movq 8(%1, %3), %%mm3		\n\t" /* mm3 = row 1, bytes 8..15 */
+	PAVGB" 1(%1), %%mm0		\n\t" /* combine each register with the source shifted by one byte; PAVGB is a macro defined outside this view (presumably a byte-wise average mnemonic) */
+	PAVGB" 1(%1, %3), %%mm1		\n\t"
+	PAVGB" 9(%1), %%mm2		\n\t" /* 9 = 8 (second half) + 1 (one-byte shift) */
+	PAVGB" 9(%1, %3), %%mm3		\n\t"
+	"movq %%mm0, (%2)		\n\t" /* store row 0, bytes 0..7 */
+	"movq %%mm1, (%2, %3)		\n\t" /* store row 1, bytes 0..7 */
+	"movq %%mm2, 8(%2)		\n\t" /* store row 0, bytes 8..15 */
+	"movq %%mm3, 8(%2, %3)		\n\t" /* store row 1, bytes 8..15 */
+	"addl %%eax, %1			\n\t" /* pixels += 2 rows */
+	"addl %%eax, %2			\n\t" /* block += 2 rows */
+	"movq (%1), %%mm0		\n\t" /* rows 2 and 3: same sequence as above */
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq 8(%1), %%mm2		\n\t"
+	"movq 8(%1, %3), %%mm3		\n\t"
+	PAVGB" 1(%1), %%mm0		\n\t"
+	PAVGB" 1(%1, %3), %%mm1		\n\t"
+	PAVGB" 9(%1), %%mm2		\n\t"
+	PAVGB" 9(%1, %3), %%mm3		\n\t"
+	"addl %%eax, %1			\n\t" /* pixels += 2 rows (done before the stores; the stores only use %2) */
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"movq %%mm2, 8(%2)		\n\t"
+	"movq %%mm3, 8(%2, %3)		\n\t"
 	"addl %%eax, %2			\n\t" /* block += 2 rows */
 	"subl $4, %0			\n\t" /* h -= 4 (four rows were written in this pass) */
 	"jnz 1b				\n\t" /* repeat while h != 0 */
 	:"+g"(h), "+S"(pixels), "+D"(block) /* read-write operands: all three are modified by the asm */
 	:"r" (line_size) /* read-only input */
 	:"%eax", "memory"); /* eax is scratch; "memory" because the asm stores through block */
 }
 /* GL: this function rounds incorrectly on overflow */
-static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
 MOVQ_BONE(mm6);
 __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
 	"1:				\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
 	:"r" (line_size)
 	:"%eax", "memory");
 }
-static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
 __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
 	"movq (%1), %%mm0		\n\t"
 	"subl %3, %2			\n\t"
 	:"r" (line_size)
 	:"%eax", "memory");
 }
 /* GL: this function rounds incorrectly on overflow */
-static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
 MOVQ_BONE(mm6);
 __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
 	"movq (%1), %%mm0		\n\t"
 	:"+g"(h), "+S"(pixels), "+D" (block)
 	:"r" (line_size)
 	:"%eax", "memory");
 }
-static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
 __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
 	"1:				\n\t"
 	"movq (%2), %%mm0		\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
 	:"r" (line_size)
 	:"%eax", "memory");
 }
-static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
 __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
 	"1:				\n\t"
 	"movq (%1), %%mm0		\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
 	:"r" (line_size)
 	:"%eax", "memory");
 }
-static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
 __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
 	"movq (%1), %%mm0		\n\t"
 	"subl %3, %2			\n\t"
 	:"r" (line_size)
 	:"%eax", "memory");
 }
 // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter.
-static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
 MOVQ_BONE(mm6);
 __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
 	"movq (%1), %%mm0		\n\t"
 	"jnz 1b				\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
 	:"r" (line_size)
 	:"%eax",  "memory");
 }
+//FIXME the following could be optimized too ...
+static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ /* 16-wide variant: delegates to put_no_rnd_pixels8_x2 twice with the same line_size and h */
+DEF(put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h); /* byte offset 0 of both buffers (presumably the left 8-byte half of each row) */
+DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); /* byte offset 8 of both buffers (presumably the right 8-byte half) */
+}
+static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ /* 16-wide variant: delegates to put_pixels8_y2 twice with the same line_size and h */
+DEF(put_pixels8_y2)(block  , pixels  , line_size, h); /* byte offset 0 of both buffers (presumably the left 8-byte half of each row) */
+DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); /* byte offset 8 of both buffers (presumably the right 8-byte half) */
+}
+static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ /* 16-wide variant: delegates to put_no_rnd_pixels8_y2 twice with the same line_size and h */
+DEF(put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h); /* byte offset 0 of both buffers (presumably the left 8-byte half of each row) */
+DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); /* byte offset 8 of both buffers (presumably the right 8-byte half) */
+}
+static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ /* 16-wide variant: delegates to avg_pixels8 twice with the same line_size and h */
+DEF(avg_pixels8)(block  , pixels  , line_size, h); /* byte offset 0 of both buffers (presumably the left 8-byte half of each row) */
+DEF(avg_pixels8)(block+8, pixels+8, line_size, h); /* byte offset 8 of both buffers (presumably the right 8-byte half) */
+}
+static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ /* 16-wide variant: delegates to avg_pixels8_x2 twice with the same line_size and h */
+DEF(avg_pixels8_x2)(block  , pixels  , line_size, h); /* byte offset 0 of both buffers (presumably the left 8-byte half of each row) */
+DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); /* byte offset 8 of both buffers (presumably the right 8-byte half) */
+}
+static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ /* 16-wide variant: delegates to avg_pixels8_y2 twice with the same line_size and h */
+DEF(avg_pixels8_y2)(block  , pixels  , line_size, h); /* byte offset 0 of both buffers (presumably the left 8-byte half of each row) */
+DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); /* byte offset 8 of both buffers (presumably the right 8-byte half) */
+}
+static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ /* 16-wide variant: delegates to avg_pixels8_xy2 twice with the same line_size and h */
+DEF(avg_pixels8_xy2)(block  , pixels  , line_size, h); /* byte offset 0 of both buffers (presumably the left 8-byte half of each row) */
+DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); /* byte offset 8 of both buffers (presumably the right 8-byte half) */
+}

Mercurial > libavcodec.hg

comparison i386/dsputil_mmx_avg.h @ 651:45e8f39fda50 libavcodec