# HG changeset patch
# User reimar
# Date 1382779048 0
# Node ID 06cf8a9249f7ee6c365e1c34561d11055b45fe9e
# Parent  741131acf556a4bba86969829228f498880ca34c
ARM NEON optimization for float->int conversion.

Not optimal, but since lrintf is incredibly slow on
ARM it is > 10x faster than the old code.
A fallback solution that (incorrectly) defines lrintf(x)
as (int)(x) might make sense to avoid this kind of issue
for the pure C code; however, we would still need to know whether
lrintf is slow or not.
Though maybe the better solution would be for all architectures
to provide a non-braindead implementation of lrintf.

diff -r 741131acf556 -r 06cf8a9249f7 libaf/af_format.c
--- a/libaf/af_format.c	Sat Oct 26 08:30:29 2013 +0000
+++ b/libaf/af_format.c	Sat Oct 26 09:17:28 2013 +0000
@@ -494,8 +494,34 @@
       ((int8_t *)out)[i] = av_clip_int8(lrintf(128.0f * in[i]));
     break;
   case(2):
+#if HAVE_NEON
+    {
+    const float *in_end = in + len;
+    while (in < in_end - 7) {
+      __asm__(
+          "vld1.32 {q0,q1}, [%0]!\n\t"
+          "vcvt.s32.f32 q0, q0, #31\n\t"
+          "vqrshrn.s32  d0, q0, #15\n\t"
+          "vcvt.s32.f32 q1, q1, #31\n\t"
+          "vqrshrn.s32  d1, q1, #15\n\t"
+          "vst1.16 {q0}, [%1]!\n\t"
+      : "+r"(in), "+r"(out)
+      :: "q0", "q1", "memory");
+    }
+    while (in < in_end) {
+      __asm__(
+          "vld1.32 {d0[0]}, [%0]!\n\t"
+          "vcvt.s32.f32 d0, d0, #31\n\t"
+          "vqrshrn.s32  d0, q0, #15\n\t"
+          "vst1.16 {d0[0]}, [%1]!\n\t"
+      : "+r"(in), "+r"(out)
+      :: "d0", "memory");
+    }
+    }
+#else
     for(i=0;i<len;i++)
       ((int16_t*)out)[i] = av_clip_int16(lrintf(32768.0f * in[i]));
+#endif
     break;
   case(3):
     for(i=0;i<len;i++){