diff i386/dsputil_mmx.c @ 4749:7011f597e473 libavcodec

mmx 16-bit ssd. 2.3x faster svq1 encoding.
author lorenm
date Fri, 30 Mar 2007 19:15:31 +0000
parents 30261f4ed12d
children 231daf8387b1
line wrap: on
line diff
--- a/i386/dsputil_mmx.c	Fri Mar 30 09:26:13 2007 +0000
+++ b/i386/dsputil_mmx.c	Fri Mar 30 19:15:31 2007 +0000
@@ -1730,6 +1730,38 @@
 
 WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
 WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
+
+/**
+ * Sum of squared differences between an int8_t array and an int16_t array,
+ * computed with MMX: returns the sum over k in [0, size) of
+ * (pix2[k] - pix1[k])^2.
+ *
+ * @param pix1 array of signed 8-bit values, at least size elements
+ * @param pix2 array of signed 16-bit values, at least size elements
+ * @param size number of elements; must be a positive multiple of 8.
+ *             The loop consumes 8 elements per pass, walking from the end of
+ *             the arrays towards the start, and always runs at least once, so
+ *             any other value makes it read before the start of both arrays.
+ * @return the accumulated sum. Each difference is formed with 16-bit
+ *         wraparound (psubw) and the total is accumulated in 32 bits (paddd)
+ *         without saturation.
+ *
+ * No emms is executed here, so the MMX state is still live on return.
+ * NOTE(review): presumably the caller is expected to issue emms before any
+ * x87 floating-point code -- confirm against the call sites.
+ */
+static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){
+    int sum;
+    long i=size;
+    /* __asm__ rather than asm: the plain keyword is a GNU extension that is
+     * not recognised when compiling in a strict ISO -std= mode. */
+    __asm__ volatile(
+        "pxor %%mm4, %%mm4 \n"          /* mm4 = two 32-bit partial sums, start at 0 */
+        "1: \n"
+        "sub $8, %0 \n"                 /* step back 8 elements; sets flags for jg below */
+        "movq (%2,%0), %%mm2 \n"        /* 8 int8 values from pix1 */
+        "movq (%3,%0,2), %%mm0 \n"      /* int16 values pix2[i..i+3] */
+        "movq 8(%3,%0,2), %%mm1 \n"     /* int16 values pix2[i+4..i+7] */
+        /* Sign-extend bytes to words: each unpack places a pix1 byte in the
+         * high half of a word, then the arithmetic shift moves it down.
+         * mm3 is deliberately not initialised: whatever it held ends up in
+         * the low halves, which psraw $8 shifts out. */
+        "punpckhbw %%mm2, %%mm3 \n"
+        "punpcklbw %%mm2, %%mm2 \n"
+        "psraw $8, %%mm3 \n"
+        "psraw $8, %%mm2 \n"
+        "psubw %%mm3, %%mm1 \n"         /* pix2 - pix1, upper four elements */
+        "psubw %%mm2, %%mm0 \n"         /* pix2 - pix1, lower four elements */
+        "pmaddwd %%mm1, %%mm1 \n"       /* square and add adjacent pairs -> 2 dwords */
+        "pmaddwd %%mm0, %%mm0 \n"
+        "paddd %%mm1, %%mm4 \n"
+        "paddd %%mm0, %%mm4 \n"
+        "jg 1b \n"                      /* MMX ops leave EFLAGS alone: tests the sub result */
+        "movq %%mm4, %%mm3 \n"          /* horizontal add of the two dword sums */
+        "psrlq $32, %%mm3 \n"
+        "paddd %%mm3, %%mm4 \n"
+        "movd %%mm4, %1 \n"             /* written only after the last input read, so no '&' needed */
+        :"+r"(i), "=r"(sum)
+        :"r"(pix1), "r"(pix2)
+        /* The asm dereferences pix1/pix2, which are passed as plain register
+         * operands; without this clobber the compiler may assume the buffers
+         * are not read here and delay or reorder stores to them. */
+        :"memory"
+    );
+    return sum;
+}
+
 #endif //CONFIG_ENCODERS
 
 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
@@ -3215,6 +3247,8 @@
         }
         c->add_8x8basis= add_8x8basis_mmx;
 
+        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
+
 #endif //CONFIG_ENCODERS
 
         c->h263_v_loop_filter= h263_v_loop_filter_mmx;