Mercurial > mplayer.hg
changeset 36393:06cf8a9249f7
ARM NEON optimization for float->int conversion.
Not optimal, but since lrintf is incredibly slow on
ARM it is > 10x faster than the old code.
A fallback solution that (incorrectly) defines lrintf(x)
as (int)(x) might make sense to avoid this kind of issue
for the pure C code; however, we would still need to know whether
lrintf is slow or not.
Though maybe the better solution would be for all architectures to provide
a non-braindead implementation of lrintf.
author | reimar |
---|---|
date | Sat, 26 Oct 2013 09:17:28 +0000 |
parents | 741131acf556 |
children | 2f815fdd521c |
files | libaf/af_format.c |
diffstat | 1 files changed, 26 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/libaf/af_format.c Sat Oct 26 08:30:29 2013 +0000 +++ b/libaf/af_format.c Sat Oct 26 09:17:28 2013 +0000 @@ -494,8 +494,34 @@ ((int8_t *)out)[i] = av_clip_int8(lrintf(128.0f * in[i])); break; case(2): +#if HAVE_NEON + { + const float *in_end = in + len; + while (in < in_end - 7) { + __asm__( + "vld1.32 {q0,q1}, [%0]!\n\t" + "vcvt.s32.f32 q0, q0, #31\n\t" + "vqrshrn.s32 d0, q0, #15\n\t" + "vcvt.s32.f32 q1, q1, #31\n\t" + "vqrshrn.s32 d1, q1, #15\n\t" + "vst1.16 {q0}, [%1]!\n\t" + : "+r"(in), "+r"(out) + :: "q0", "q1", "memory"); + } + while (in < in_end) { + __asm__( + "vld1.32 {d0[0]}, [%0]!\n\t" + "vcvt.s32.f32 d0, d0, #31\n\t" + "vqrshrn.s32 d0, q0, #15\n\t" + "vst1.16 {d0[0]}, [%1]!\n\t" + : "+r"(in), "+r"(out) + :: "d0", "memory"); + } + } +#else for(i=0;i<len;i++) ((int16_t*)out)[i] = av_clip_int16(lrintf(32768.0f * in[i])); +#endif break; case(3): for(i=0;i<len;i++){