# HG changeset patch
# User upsuper
# Date 1355332424 0
# Node ID db12239148e95e6e966ebf876aecb07b0fe14143
# Parent  715e7aa1ffdb9a803e1743ac69a52e77ac5b1220
Accelerate ass rendering by using SSE4 for yuv422.

The render_frame_yuv422_sse4 is ~4x faster than render_frame_yuv422.

diff -r 715e7aa1ffdb -r db12239148e9 libmpcodecs/vf_ass.c
--- a/libmpcodecs/vf_ass.c	Wed Dec 12 17:13:39 2012 +0000
+++ b/libmpcodecs/vf_ass.c	Wed Dec 12 17:13:44 2012 +0000
@@ -45,6 +45,9 @@
 #include "sub/ass_mp.h"
 #include "sub/eosd.h"
 
+#include "cpudetect.h"
+#include "libavutil/x86_cpu.h"
+
 #define _r(c)  ((c)>>24)
 #define _g(c)  (((c)>>16)&0xFF)
 #define _b(c)  (((c)>>8)&0xFF)
@@ -58,6 +61,15 @@
 /* map 0 - 0xFF -> 0 - 0x10101 */
 #define MAP_24BIT(v) RSHIFT(0x10203 * (v), 8)
 
+#if HAVE_SSE4
+
+#define CLEAN_XMM(n) \
+    __asm__ volatile ( "pxor %%xmm" #n ", %%xmm" #n " \n\t" : )
+DECLARE_ASM_CONST(16, uint32_t, SSE_32BIT_80H[4]) = { [0 ... 3] = 0x80 };
+DECLARE_ASM_CONST(16, uint32_t, SSE_32BIT_MAP[4]) = { [0 ... 3] = 0x102 };
+
+#endif // HAVE_SSE4
+
 static const struct vf_priv_s {
     int outh, outw;
 
@@ -139,6 +151,7 @@
             size_t p = i * outw + j;
             dst_u[p] = (dst_u[p] + dst_u[p + 1]) / 2;
             dst_v[p] = (dst_v[p] + dst_v[p + 1]) / 2;
+            dst_u[p + 1] = dst_v[p + 1] = 0;
         }
     }
 }
@@ -192,6 +205,116 @@
     }
 }
 
+#if HAVE_SSE4
+
+static void render_frame_yuv422_sse4(vf_instance_t *vf)
+{
+    uint8_t *alpha = vf->priv->alphas[0];
+    uint8_t *src_y = vf->priv->planes[0],
+            *src_u = vf->priv->planes[1],
+            *src_v = vf->priv->planes[2];
+    int outw = vf->priv->outw,
+        outh = vf->priv->outh;
+    struct dirty_rows_extent *dr = vf->priv->dirty_rows;
+    uint8_t *dst = vf->dmpi->planes[0];
+    int stride = vf->dmpi->stride[0];
+    int is_uyvy = vf->priv->outfmt == IMGFMT_UYVY;
+    int i;
+
+    CLEAN_XMM(7);
+
+    for (i = 0; i < outh; i++) {
+        size_t xmin = dr[i].xmin & ~7,
+               xmax = dr[i].xmax;
+        __asm__ volatile (
+                "jmp 4f \n\t"
+                "1: \n\t"
+
+                "cmpl   $-1,    0(%[alpha], %[j], 1) \n\t"
+                "jne    2f \n\t"
+                "cmpl   $-1,    4(%[alpha], %[j], 1) \n\t"
+                "jne    2f \n\t"
+                "jmp    3f \n\t"
+
+                "2: \n\t"
+                "movq       (%[alpha], %[j], 1),    %%xmm0 \n\t"
+                "punpcklbw  %%xmm7, %%xmm0 \n\t"
+                "movdqa     %%xmm0, %%xmm1 \n\t"
+                "punpcklwd  %%xmm7, %%xmm0 \n\t"
+                "punpckhwd  %%xmm7, %%xmm1 \n\t"
+                "pmulld     "MANGLE(SSE_32BIT_MAP)",    %%xmm0 \n\t"
+                "pmulld     "MANGLE(SSE_32BIT_MAP)",    %%xmm1 \n\t"
+                "paddd      "MANGLE(SSE_32BIT_80H)",    %%xmm0 \n\t"
+                "paddd      "MANGLE(SSE_32BIT_80H)",    %%xmm1 \n\t"
+                "psrld      $8, %%xmm0 \n\t"
+                "psrld      $8, %%xmm1 \n\t"
+                "movdqa     %%xmm0, %%xmm2 \n\t"
+                "movdqa     %%xmm1, %%xmm3 \n\t"
+                "packssdw   %%xmm1, %%xmm0 \n\t"
+                "phaddd     %%xmm3, %%xmm2 \n\t"
+                "psrld      $1, %%xmm2 \n\t"
+                "packusdw   %%xmm7, %%xmm2 \n\t"
+                "punpcklwd  %%xmm2, %%xmm2 \n\t"
+
+                "movdqu     (%[dst], %[j], 2),  %%xmm1 \n\t"
+                "movdqa     %%xmm1, %%xmm3 \n\t"
+                "test       %[f],   %[f] \n\t"
+                "jz         11f \n\t"
+                "psrlw      $8, %%xmm1 \n\t"
+                "psllw      $8, %%xmm3 \n\t"
+                "psrlw      $8, %%xmm3 \n\t"
+                "jmp        12f \n\t"
+                "11: \n\t"
+                "psllw      $8, %%xmm1 \n\t"
+                "psrlw      $8, %%xmm1 \n\t"
+                "psrlw      $8, %%xmm3 \n\t"
+                "12: \n\t"
+                "pmullw     %%xmm0, %%xmm1 \n\t"
+                "pmullw     %%xmm2, %%xmm3 \n\t"
+                "psrlw      $8, %%xmm1 \n\t"
+                "psrlw      $8, %%xmm3 \n\t"
+                "packuswb   %%xmm7, %%xmm1 \n\t"
+                "packuswb   %%xmm7, %%xmm3 \n\t"
+                "movq       (%[src_y], %[j], 1),    %%xmm4 \n\t"
+                "movq       (%[src_u], %[j], 1),    %%xmm5 \n\t"
+                "movq       (%[src_v], %[j], 1),    %%xmm6 \n\t"
+                "packuswb   %%xmm7, %%xmm5 \n\t"
+                "packuswb   %%xmm7, %%xmm6 \n\t"
+                "punpcklbw  %%xmm6, %%xmm5 \n\t"
+                "test       %[f],   %[f] \n\t"
+                "jz         21f \n\t"
+                "punpcklbw  %%xmm1, %%xmm3 \n\t"
+                "punpcklbw  %%xmm4, %%xmm5 \n\t"
+                "paddb      %%xmm5, %%xmm3 \n\t"
+                "movdqu     %%xmm3, (%[dst], %[j], 2) \n\t"
+                "jmp        22f \n\t"
+                "21: \n\t"
+                "punpcklbw  %%xmm3, %%xmm1 \n\t"
+                "punpcklbw  %%xmm5, %%xmm4 \n\t"
+                "paddb      %%xmm4, %%xmm1 \n\t"
+                "movdqu     %%xmm1, (%[dst], %[j], 2) \n\t"
+                "22: \n\t"
+
+                "3: \n\t"
+                "add    $8, %[j] \n\t"
+                "4: \n\t"
+                "cmp    %[xmax],    %[j] \n\t"
+                "jl     1b \n\t"
+
+                : : [dst]   "r" (dst + i * stride),
+                    [alpha] "r" (alpha + i * outw),
+                    [src_y] "r" (src_y + i * outw),
+                    [src_u] "r" (src_u + i * outw),
+                    [src_v] "r" (src_v + i * outw),
+                    [j]     "r" (xmin),
+                    [xmax]  "g" (xmax),
+                    [f]     "r" (is_uyvy)
+        );
+    }
+}
+
+#endif // HAVE_SSE4
+
 static void prepare_buffer_420p(vf_instance_t *vf)
 {
     int outw = vf->priv->outw,
@@ -334,6 +457,10 @@
         vf->priv->draw_image = draw_image_yuv;
         vf->priv->render_frame = render_frame_yuv422;
         vf->priv->prepare_buffer = prepare_buffer_422;
+#if HAVE_SSE4
+        if (gCpuCaps.hasSSE4 && outw % 8 == 0)
+            vf->priv->render_frame = render_frame_yuv422_sse4;
+#endif
         break;
     default:
         return 0;