diff i386/dsputil_mmx.c @ 3536:545a15c19c91 libavcodec

sse & sse2 implementations of vorbis channel coupling. 9% faster vorbis (on a K8).
author lorenm
date Thu, 03 Aug 2006 03:18:47 +0000
parents 419409926166
children 3fbddeb13686
line wrap: on
line diff
--- a/i386/dsputil_mmx.c	Thu Aug 03 02:18:07 2006 +0000
+++ b/i386/dsputil_mmx.c	Thu Aug 03 03:18:47 2006 +0000
@@ -2711,6 +2711,59 @@
 }
 #endif
 
+static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
+{
+    int i;
+    asm volatile("pxor %%mm7, %%mm7":);
+    for(i=0; i<blocksize; i+=2) {
+        asm volatile(
+            "movq    %0,    %%mm0 \n\t"
+            "movq    %1,    %%mm1 \n\t"
+            "movq    %%mm0, %%mm2 \n\t"
+            "movq    %%mm1, %%mm3 \n\t"
+            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
+            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
+            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
+            "pxor    %%mm2, %%mm1 \n\t"
+            "movq    %%mm3, %%mm4 \n\t"
+            "pand    %%mm1, %%mm3 \n\t"
+            "pandn   %%mm1, %%mm4 \n\t"
+            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+            "movq    %%mm3, %1    \n\t"
+            "movq    %%mm0, %0    \n\t"
+            :"+m"(mag[i]), "+m"(ang[i])
+            ::"memory"
+        );
+    }
+    asm volatile("emms");
+}
+static void vorbis_inverse_coupling_sse2(float *mag, float *ang, int blocksize)
+{
+    int i;
+    for(i=0; i<blocksize; i+=4) {
+        asm volatile(
+            "movaps  %0,     %%xmm0 \n\t"
+            "movaps  %1,     %%xmm1 \n\t"
+            "pxor    %%xmm2, %%xmm2 \n\t"
+            "pxor    %%xmm3, %%xmm3 \n\t"
+            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
+            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
+            "pslld   $31,    %%xmm2 \n\t" // keep only the sign bit
+            "pxor    %%xmm2, %%xmm1 \n\t"
+            "movaps  %%xmm3, %%xmm4 \n\t"
+            "pand    %%xmm1, %%xmm3 \n\t"
+            "pandn   %%xmm1, %%xmm4 \n\t"
+            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+            "movaps  %%xmm3, %1     \n\t"
+            "movaps  %%xmm0, %0     \n\t"
+            :"+m"(mag[i]), "+m"(ang[i])
+            ::"memory"
+        );
+    }
+}
+
 #ifdef CONFIG_SNOW_ENCODER
 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
@@ -3137,6 +3190,11 @@
             c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
         }
 #endif
+
+        if(mm_flags & MM_SSE2)
+            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse2;
+        else if(mm_flags & MM_SSE)
+            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
     }
 
 #ifdef CONFIG_ENCODERS