changeset 36403:07e9ebd91b98

af_volume: add NEON optimization for common float case. gcc is too stupid to use vmin/vmax, which leads to float code interleaved with status register reads, which has simply horrible performance.
author reimar
date Wed, 30 Oct 2013 18:45:48 +0000
parents 2c9356ac0d01
children 4df68a2d918b
files libaf/af_volume.c
diffstat 1 files changed, 21 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/libaf/af_volume.c	Tue Oct 29 19:24:16 2013 +0000
+++ b/libaf/af_volume.c	Wed Oct 30 18:45:48 2013 +0000
@@ -125,6 +125,27 @@
 static av_always_inline void float_inner_loop(float *data, int len, int offset, int step, float level, int softclip)
 {
   int i;
+#if HAVE_NEON
+  if (offset == 0 && step == 1 && !softclip && len >= 8)
+  {
+    __asm__(
+      "vmov.32 d2[0], %2\n\t"
+      "vdup.32 q8, %3\n\t"
+      "vneg.f32 q9, q8\n\t"
+"0:\n\t"
+      "vld1.32 {q0}, [%0]\n\t"
+      "vmul.f32 q0, q0, d2[0]\n\t"
+      "cmp %0, %1\n\t"
+      "vmin.f32 q0, q0, q8\n\t"
+      "vmax.f32 q0, q0, q9\n\t"
+      "vst1.32 {q0}, [%0]!\n\t"
+      "blo 0b\n\t"
+    : "+&r"(data)
+    : "r"(data + len - 7), "r"(level), "r"(0x3f800000)
+    : "cc", "q0", "d2", "q8", "q9", "memory");
+    len &= 3;
+  }
+#endif
   for (i = offset; i < len; i += step)
   {
     register float x = data[i];