Mercurial > mplayer.hg
changeset 36403:07e9ebd91b98
af_volume: add NEON optimization for common float case.
gcc is too stupid to use vmin/vmax on its own, so the compiled float
code ends up interleaved with status register reads, and that
combination has simply horrible performance.
author | reimar |
---|---|
date | Wed, 30 Oct 2013 18:45:48 +0000 |
parents | 2c9356ac0d01 |
children | 4df68a2d918b |
files | libaf/af_volume.c |
diffstat | 1 files changed, 21 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/libaf/af_volume.c Tue Oct 29 19:24:16 2013 +0000
+++ b/libaf/af_volume.c Wed Oct 30 18:45:48 2013 +0000
@@ -125,6 +125,27 @@
 static av_always_inline void float_inner_loop(float *data, int len, int offset, int step, float level, int softclip)
 {
   int i;
+#if HAVE_NEON
+  if (offset == 0 && step == 1 && !softclip && len >= 8)
+  {
+    __asm__(
+      "vmov.32 d2[0], %2\n\t"
+      "vdup.32 q8, %3\n\t"
+      "vneg.f32 q9, q8\n\t"
+"0:\n\t"
+      "vld1.32 {q0}, [%0]\n\t"
+      "vmul.f32 q0, q0, d2[0]\n\t"
+      "cmp %0, %1\n\t"
+      "vmin.f32 q0, q0, q8\n\t"
+      "vmax.f32 q0, q0, q9\n\t"
+      "vst1.32 {q0}, [%0]!\n\t"
+      "blo 0b\n\t"
+    : "+&r"(data)
+    : "r"(data + len - 7), "r"(level), "r"(0x3f800000)
+    : "cc", "q0", "d2", "q8", "q9", "memory");
+    len &= 3;
+  }
+#endif
   for (i = offset; i < len; i += step)
   {
     register float x = data[i];
```