changeset 35576:2508973357da

Accelerate ASS rendering by using SSE4 for yuv420p. render_frame_yuv420p_sse4 is ~3x faster than render_frame_yuv420p.
author upsuper
date Wed, 12 Dec 2012 17:18:01 +0000
parents db12239148e9
children b356942130df
files libmpcodecs/vf_ass.c
diffstat 1 files changed, 172 insertions(+), 0 deletions(-)
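
For reference, a rough scalar sketch of the per-sample blend that the new SSE4 loops below perform 16 pixels at a time. This sketch is not part of the commit: the helper name is made up, and the constants 257 and 128 are an interpretation of what SSE_32BIT_MAP and SSE_32BIT_80H appear to encode in MAP_16_ALPHA.

#include <stdint.h>

/* alpha[j] is how much of the original video to keep (0xFF = keep all),
 * src[j] is the pre-multiplied subtitle sample for this plane. */
static void blend_row_scalar(uint8_t *dst, const uint8_t *src,
                             const uint8_t *alpha, int xmin, int xmax)
{
    int j;
    for (j = xmin; j < xmax; j++) {
        unsigned a = (alpha[j] * 257 + 128) >> 8;   /* map 0..255 to 0..256 */
        dst[j] = ((dst[j] * a) >> 8) + src[j];
    }
}

With a = 256 (alpha 0xFF) and src 0 the destination sample is left unchanged, which is what the padding added in the first hunk below relies on.
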
--- a/libmpcodecs/vf_ass.c	Wed Dec 12 17:13:44 2012 +0000
+++ b/libmpcodecs/vf_ass.c	Wed Dec 12 17:18:01 2012 +0000
@@ -341,6 +341,28 @@
                         dst_v[q2] + dst_v[q2 + 1] + 2) / 4;
         }
     }
+
+#if HAVE_SSE4
+    // for render_frame_yuv420p_sse4
+    if (gCpuCaps.hasSSE4 && outw % 32 == 0) {
+        for (i = 0; i < outh; i += 2) {
+            int xmin = FFMIN(dirty_rows[i].xmin, dirty_rows[i + 1].xmin) & ~1,
+                xmax = FFMAX(dirty_rows[i].xmax, dirty_rows[i + 1].xmax);
+            if (xmin >= xmax)
+                continue;
+            for (j = xmin & ~31; j < xmin; j += 2) {
+                size_t p = i * outw / 4 + j / 2;
+                dst_a[p] = 0xFF;
+                dst_u[p] = dst_v[p] = 0;
+            }
+            for (j = xmax; j < FFALIGN(xmax, 32); j += 2) {
+                size_t p = i * outw / 4 + j / 2;
+                dst_a[p] = 0xFF;
+                dst_u[p] = dst_v[p] = 0;
+            }
+        }
+    }
+#endif // HAVE_SSE4
 }
 
 static void render_frame_yuv420p(vf_instance_t *vf)
@@ -392,6 +414,152 @@
     }
 }
 
+#if HAVE_SSE4
+
+static void render_frame_yuv420p_sse4(vf_instance_t *vf)
+{
+    struct dirty_rows_extent *dr = vf->priv->dirty_rows;
+    uint8_t *alpha;
+    uint8_t *src_y = vf->priv->planes[0],
+            *src_u = vf->priv->planes[1],
+            *src_v = vf->priv->planes[2];
+    uint8_t *dst_y = vf->dmpi->planes[0],
+            *dst_u = vf->dmpi->planes[1],
+            *dst_v = vf->dmpi->planes[2];
+    int stride;
+    int outw = vf->priv->outw,
+        outh = vf->priv->outh;
+    int i;
+
+    CLEAN_XMM(7);
+
+#define CHECK_16_ALPHA \
+            "cmpl   $-1,     0(%[alpha], %[j], 1) \n\t" \
+            "jne    2f \n\t"                            \
+            "cmpl   $-1,     4(%[alpha], %[j], 1) \n\t" \
+            "jne    2f \n\t"                            \
+            "cmpl   $-1,     8(%[alpha], %[j], 1) \n\t" \
+            "jne    2f \n\t"                            \
+            "cmpl   $-1,    12(%[alpha], %[j], 1) \n\t" \
+            "jne    2f \n\t"                            \
+            "jmp    3f \n\t"
+
+#define MAP_16_ALPHA \
+            "movq       0(%[alpha], %[j], 1),   %%xmm0 \n\t"        \
+            "movq       8(%[alpha], %[j], 1),   %%xmm2 \n\t"        \
+            "punpcklbw  %%xmm7, %%xmm0 \n\t"                        \
+            "punpcklbw  %%xmm7, %%xmm2 \n\t"                        \
+            "movdqa     %%xmm0, %%xmm1 \n\t"                        \
+            "movdqa     %%xmm2, %%xmm3 \n\t"                        \
+            "punpcklwd  %%xmm7, %%xmm0 \n\t"                        \
+            "punpckhwd  %%xmm7, %%xmm1 \n\t"                        \
+            "punpcklwd  %%xmm7, %%xmm2 \n\t"                        \
+            "punpckhwd  %%xmm7, %%xmm3 \n\t"                        \
+            "pmulld     "MANGLE(SSE_32BIT_MAP)",    %%xmm0 \n\t"    \
+            "pmulld     "MANGLE(SSE_32BIT_MAP)",    %%xmm1 \n\t"    \
+            "pmulld     "MANGLE(SSE_32BIT_MAP)",    %%xmm2 \n\t"    \
+            "pmulld     "MANGLE(SSE_32BIT_MAP)",    %%xmm3 \n\t"    \
+            "paddd      "MANGLE(SSE_32BIT_80H)",    %%xmm0 \n\t"    \
+            "paddd      "MANGLE(SSE_32BIT_80H)",    %%xmm1 \n\t"    \
+            "paddd      "MANGLE(SSE_32BIT_80H)",    %%xmm2 \n\t"    \
+            "paddd      "MANGLE(SSE_32BIT_80H)",    %%xmm3 \n\t"    \
+            "psrld      $8, %%xmm0 \n\t"                            \
+            "psrld      $8, %%xmm1 \n\t"                            \
+            "psrld      $8, %%xmm2 \n\t"                            \
+            "psrld      $8, %%xmm3 \n\t"                            \
+            "packssdw   %%xmm1, %%xmm0 \n\t"                        \
+            "packssdw   %%xmm3, %%xmm2 \n\t"
+
+#define MUL_ALPHA(dst, xmm1, xmm2) \
+            "movq       0(%["#dst"], %[j], 1),  %%"#xmm1" \n\t" \
+            "movq       8(%["#dst"], %[j], 1),  %%"#xmm2" \n\t" \
+            "punpcklbw  %%xmm7, %%"#xmm1" \n\t"                 \
+            "punpcklbw  %%xmm7, %%"#xmm2" \n\t"                 \
+            "pmullw     %%xmm0, %%"#xmm1" \n\t"                 \
+            "pmullw     %%xmm2, %%"#xmm2" \n\t"                 \
+            "psrlw      $8, %%"#xmm1" \n\t"                     \
+            "psrlw      $8, %%"#xmm2" \n\t"                     \
+            "packuswb   %%"#xmm2", %%"#xmm1" \n\t"
+
+    // y
+    alpha = vf->priv->alphas[0];
+    stride = vf->dmpi->stride[0];
+    for (i = 0; i < outh; i++) {
+        size_t xmin = dr[i].xmin & ~15,
+               xmax = dr[i].xmax;
+        __asm__ volatile (
+                "jmp    4f \n\t"
+
+                "1: \n\t"
+                CHECK_16_ALPHA
+
+                "2: \n\t"
+                MAP_16_ALPHA
+                MUL_ALPHA(dst_y, xmm1, xmm3)
+                "movdqa (%[src_y], %[j], 1),    %%xmm0 \n\t"
+                "paddb  %%xmm0, %%xmm1 \n\t"
+                "movdqu %%xmm1, (%[dst_y], %[j], 1) \n\t"
+
+                "3: \n\t"
+                "add    $16,    %[j] \n\t"
+                "4: \n\t"
+                "cmp    %[xmax],    %[j] \n\t"
+                "jl     1b \n\t"
+
+                : : [j] "r" (xmin),
+                    [xmax] "g" (xmax),
+                    [alpha] "r" (alpha + i * outw),
+                    [src_y] "r" (src_y + i * outw),
+                    [dst_y] "r" (dst_y + i * stride)
+        );
+    }
+
+    // u & v
+    alpha = vf->priv->alphas[1];
+    stride = vf->dmpi->stride[1];
+    for (i = 0; i < outh / 2; i++) {
+        size_t xmin = FFMIN(dr[i * 2].xmin, dr[i * 2 + 1].xmin) & ~31,
+               xmax = FFMAX(dr[i * 2].xmax, dr[i * 2 + 1].xmax);
+        __asm__ volatile (
+                "jmp    4f \n\t"
+
+                "1: \n\t"
+                CHECK_16_ALPHA
+
+                "2: \n\t"
+                MAP_16_ALPHA
+                MUL_ALPHA(dst_u, xmm1, xmm4)
+                MUL_ALPHA(dst_v, xmm3, xmm5)
+                "movdqa (%[src_u], %[j], 1),    %%xmm0 \n\t"
+                "movdqa (%[src_v], %[j], 1),    %%xmm2 \n\t"
+                "paddb  %%xmm0, %%xmm1 \n\t"
+                "paddb  %%xmm2, %%xmm3 \n\t"
+                "movdqu %%xmm1, (%[dst_u], %[j], 1) \n\t"
+                "movdqu %%xmm3, (%[dst_v], %[j], 1) \n\t"
+
+                "3: \n\t"
+                "add    $16,    %[j] \n\t"
+                "4: \n\t"
+                "cmp    %[xmax],    %[j] \n\t"
+                "jl     1b \n\t"
+
+                : : [j] "r" (xmin / 2),
+                    [xmax] "g" ((xmax + 1) / 2),
+                    [alpha] "r" (alpha + i * outw / 2),
+                    [src_u] "r" (src_u + i * outw / 2),
+                    [src_v] "r" (src_v + i * outw / 2),
+                    [dst_u] "r" (dst_u + i * stride),
+                    [dst_v] "r" (dst_v + i * stride)
+        );
+    }
+
+#undef CHECK_16_ALPHA
+#undef MAP_16_ALPHA
+#undef MUL_ALPHA
+}
+
+#endif // HAVE_SSE4
+
 static void clean_buffer(vf_instance_t *vf)
 {
     int outw = vf->priv->outw,
@@ -448,6 +616,10 @@
         vf->priv->draw_image = draw_image_yuv;
         vf->priv->render_frame = render_frame_yuv420p;
         vf->priv->prepare_buffer = prepare_buffer_420p;
+#if HAVE_SSE4
+        if (gCpuCaps.hasSSE4 && outw % 32 == 0)
+            vf->priv->render_frame = render_frame_yuv420p_sse4;
+#endif
         break;
     case IMGFMT_UYVY:
     case IMGFMT_YUY2:
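
A note on the fast path: CHECK_16_ALPHA compares the 16 alpha bytes of each chunk against -1 four bytes at a time and jumps over the blend when all of them are 0xFF, i.e. when the subtitle overlay leaves that chunk untouched. A scalar equivalent, for illustration only (the helper name is made up, not part of the commit):

#include <stdint.h>

/* Returns non-zero when a 16-byte alpha chunk is entirely 0xFF, meaning no
 * subtitle pixel covers it and the blend for that chunk can be skipped.
 * CHECK_16_ALPHA performs the same test with four "cmpl $-1" instructions. */
static int chunk_is_untouched(const uint8_t *alpha)
{
    int k;
    for (k = 0; k < 16; k++)
        if (alpha[k] != 0xFF)
            return 0;
    return 1;
}

This is also why the first hunk pads the regions between the 16-byte-aligned boundaries and the real dirty extent with alpha = 0xFF and zero chroma: those padding pixels either make the whole chunk skippable or blend to a no-op.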