changeset 36393:06cf8a9249f7

ARM NEON optimization for float->int conversion. Not optimal, but since lrintf is incredibly slow on ARM, it is more than 10x faster than the old code. A fallback solution that (incorrectly) defines lrintf(x) as (int)(x) might make sense to avoid these kinds of issues in the pure C code; however, we would still need to know whether lrintf is slow or not. Though maybe the better solution would be for all architectures to provide a non-braindead implementation of lrintf.
author reimar
date Sat, 26 Oct 2013 09:17:28 +0000
parents 741131acf556
children 2f815fdd521c
files libaf/af_format.c
diffstat 1 files changed, 26 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/libaf/af_format.c	Sat Oct 26 08:30:29 2013 +0000
+++ b/libaf/af_format.c	Sat Oct 26 09:17:28 2013 +0000
@@ -494,8 +494,34 @@
       ((int8_t *)out)[i] = av_clip_int8(lrintf(128.0f * in[i]));
     break;
   case(2):
+#if HAVE_NEON
+    {
+    const float *in_end = in + len;
+    while (in < in_end - 7) {
+      __asm__(
+          "vld1.32 {q0,q1}, [%0]!\n\t"
+          "vcvt.s32.f32 q0, q0, #31\n\t"
+          "vqrshrn.s32  d0, q0, #15\n\t"
+          "vcvt.s32.f32 q1, q1, #31\n\t"
+          "vqrshrn.s32  d1, q1, #15\n\t"
+          "vst1.16 {q0}, [%1]!\n\t"
+      : "+r"(in), "+r"(out)
+      :: "q0", "q1", "memory");
+    }
+    while (in < in_end) {
+      __asm__(
+          "vld1.32 {d0[0]}, [%0]!\n\t"
+          "vcvt.s32.f32 d0, d0, #31\n\t"
+          "vqrshrn.s32  d0, q0, #15\n\t"
+          "vst1.16 {d0[0]}, [%1]!\n\t"
+      : "+r"(in), "+r"(out)
+      :: "d0", "memory");
+    }
+    }
+#else
     for(i=0;i<len;i++)
       ((int16_t*)out)[i] = av_clip_int16(lrintf(32768.0f * in[i]));
+#endif
     break;
   case(3):
     for(i=0;i<len;i++){