Mercurial > mplayer.hg
changeset 36392:741131acf556
Add ARM NEON optimization of common channel interleave cases.
While the code is certainly far from optimal, it is about
3 times faster than the generic per-sample memcpy loop.
author | reimar |
---|---|
date | Sat, 26 Oct 2013 08:30:29 +0000 |
parents | 3b1fb70800f4 |
children | 06cf8a9249f7 |
files | libmpcodecs/ad_ffmpeg.c |
diffstat | 1 files changed, 39 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/libmpcodecs/ad_ffmpeg.c Fri Oct 25 19:59:59 2013 +0000 +++ b/libmpcodecs/ad_ffmpeg.c Sat Oct 26 08:30:29 2013 +0000 @@ -220,6 +220,45 @@ { size_t s, c, o = 0; +#if HAVE_NEON + if (nb_channels == 2 && bps == 4) { + const unsigned char *src0 = src[0]; + const unsigned char *src1 = src[1]; + size_t aligned = nb_samples & ~7; + const unsigned char *src0_end = src0 + aligned*bps; + while (src0 < src0_end) { + __asm__ ( + "vld1.32 {q0}, [%0]!\n\t" + "vld1.32 {q1}, [%1]!\n\t" + "vld1.32 {q2}, [%0]!\n\t" + "vld1.32 {q3}, [%1]!\n\t" + "vst2.32 {q0,q1}, [%2]!\n\t" + "vst2.32 {q2,q3}, [%2]!\n\t" + : "+&r"(src0), "+&r"(src1), "+&r"(dst) + :: "q0", "q1", "q2", "q3", "memory"); + } + o += aligned*bps; + nb_samples -= aligned; + } else if (nb_channels == 2 && bps == 2) { + const unsigned char *src0 = src[0]; + const unsigned char *src1 = src[1]; + size_t aligned = nb_samples & ~15; + const unsigned char *src0_end = src0 + aligned*bps; + while (src0 < src0_end) { + __asm__ ( + "vld1.16 {q0}, [%0]!\n\t" + "vld1.16 {q1}, [%1]!\n\t" + "vld1.16 {q2}, [%0]!\n\t" + "vld1.16 {q3}, [%1]!\n\t" + "vst2.16 {q0,q1}, [%2]!\n\t" + "vst2.16 {q2,q3}, [%2]!\n\t" + : "+&r"(src0), "+&r"(src1), "+&r"(dst) + :: "q0", "q1", "q2", "q3", "memory"); + } + o += aligned*bps; + nb_samples -= aligned; + } +#endif for (s = 0; s < nb_samples; s++) { for (c = 0; c < nb_channels; c++) { memcpy(dst, src[c] + o, bps);