changeset 36392:741131acf556

Add ARM NEON optimization of common channel interleave cases. While the code is certainly far from optimal, it is about 3 times faster.
author reimar
date Sat, 26 Oct 2013 08:30:29 +0000
parents 3b1fb70800f4
children 06cf8a9249f7
files libmpcodecs/ad_ffmpeg.c
diffstat 1 files changed, 39 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/libmpcodecs/ad_ffmpeg.c	Fri Oct 25 19:59:59 2013 +0000
+++ b/libmpcodecs/ad_ffmpeg.c	Sat Oct 26 08:30:29 2013 +0000
@@ -220,6 +220,45 @@
 {
     size_t s, c, o = 0;
 
+#if HAVE_NEON
+    if (nb_channels == 2 && bps == 4) {
+        const unsigned char *src0 = src[0];
+        const unsigned char *src1 = src[1];
+        size_t aligned = nb_samples & ~7;
+        const unsigned char *src0_end = src0 + aligned*bps;
+        while (src0 < src0_end) {
+           __asm__ (
+               "vld1.32 {q0}, [%0]!\n\t"
+               "vld1.32 {q1}, [%1]!\n\t"
+               "vld1.32 {q2}, [%0]!\n\t"
+               "vld1.32 {q3}, [%1]!\n\t"
+               "vst2.32 {q0,q1}, [%2]!\n\t"
+               "vst2.32 {q2,q3}, [%2]!\n\t"
+               : "+&r"(src0), "+&r"(src1), "+&r"(dst)
+               :: "q0", "q1", "q2", "q3", "memory");
+        }
+        o += aligned*bps;
+        nb_samples -= aligned;
+    } else if (nb_channels == 2 && bps == 2) {
+        const unsigned char *src0 = src[0];
+        const unsigned char *src1 = src[1];
+        size_t aligned = nb_samples & ~15;
+        const unsigned char *src0_end = src0 + aligned*bps;
+        while (src0 < src0_end) {
+           __asm__ (
+               "vld1.16 {q0}, [%0]!\n\t"
+               "vld1.16 {q1}, [%1]!\n\t"
+               "vld1.16 {q2}, [%0]!\n\t"
+               "vld1.16 {q3}, [%1]!\n\t"
+               "vst2.16 {q0,q1}, [%2]!\n\t"
+               "vst2.16 {q2,q3}, [%2]!\n\t"
+               : "+&r"(src0), "+&r"(src1), "+&r"(dst)
+               :: "q0", "q1", "q2", "q3", "memory");
+        }
+        o += aligned*bps;
+        nb_samples -= aligned;
+    }
+#endif
     for (s = 0; s < nb_samples; s++) {
         for (c = 0; c < nb_channels; c++) {
             memcpy(dst, src[c] + o, bps);