libavcodec.hg: x86/mlpdsp.c comparison

comparison x86/mlpdsp.c @ 9688:128531f67aa1 libavcodec

MLP DSP functions x86-optimized. 12.59% overall speedup on x86_32 and 9.98% overall speedup on x86_64, compared to gcc 4.3.3.

author	ramiro
date	Sat, 23 May 2009 00:23:30 +0000
parents
children	dc3c984a1c1a

comparison

equal deleted inserted replaced

-:d9f8496b3b91
+:128531f67aa1
+/*
+* MLP DSP functions x86-optimized
+* Copyright (c) 2009 Ramiro Polla
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mlp.h"
+#if HAVE_7REGS && HAVE_TEN_OPERANDS
+/*
+* Jump targets for the unrolled filter taps. Each symbol is a label emitted by
+* the inline asm in mlp_filter_channel_x86(); only its address is ever used.
+* They are declared as char rather than void: an expression of type void is
+* not an lvalue, so taking its address is not valid ISO C and only compiles
+* as a compiler extension.
+*/
+extern char ff_mlp_firorder_8;
+extern char ff_mlp_firorder_7;
+extern char ff_mlp_firorder_6;
+extern char ff_mlp_firorder_5;
+extern char ff_mlp_firorder_4;
+extern char ff_mlp_firorder_3;
+extern char ff_mlp_firorder_2;
+extern char ff_mlp_firorder_1;
+extern char ff_mlp_firorder_0;
+extern char ff_mlp_iirorder_4;
+extern char ff_mlp_iirorder_3;
+extern char ff_mlp_iirorder_2;
+extern char ff_mlp_iirorder_1;
+extern char ff_mlp_iirorder_0;
+/* FIR entry label for each filter order; the array index is the order (0..8). */
+static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
+&ff_mlp_firorder_2, &ff_mlp_firorder_3,
+&ff_mlp_firorder_4, &ff_mlp_firorder_5,
+&ff_mlp_firorder_6, &ff_mlp_firorder_7,
+&ff_mlp_firorder_8 };
+/* IIR entry label for each filter order; the array index is the order (0..4). */
+static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
+&ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
+&ff_mlp_iirorder_4 };
+/*
+* Building blocks for the inline asm in mlp_filter_channel_x86(). Every macro
+* expands to string literals that are concatenated into one asm template, so
+* the operand numbers used here (%0 = state, %1 = coeff, %7 = filter_shift)
+* refer to that function's operand list.
+*/
+#if ARCH_X86_64
+/*
+* One multiply-accumulate step: emits the jump label, sign-extends the 32-bit
+* values at offset+offs(%0) and offset+offc(%1) to 64 bits, multiplies them
+* and adds the product to the accumulator in rsi.
+*/
+#define MLPMUL(label, offset, offs, offc)   \
+MANGLE(label)":                   \n\t" \
+"movslq "offset"+"offs"(%0), %%rax\n\t" \
+"movslq "offset"+"offc"(%1), %%rdx\n\t" \
+"imul                 %%rdx, %%rax\n\t" \
+"add                  %%rax, %%rsi\n\t"
+/*
+* Same step, but the coefficient is taken from asm operand number firc
+* instead of being loaded from memory.
+*/
+#define FIRMULREG(label, offset, firc)\
+MANGLE(label)":             \n\t" \
+"movslq "#offset"(%0), %%rax\n\t" \
+"imul        %"#firc", %%rax\n\t" \
+"add            %%rax, %%rsi\n\t"
+/* Zero the 64-bit accumulator. */
+#define CLEAR_ACCUM                   \
+"xor            %%rsi, %%rsi\n\t"
+/* Logical right shift of the accumulator by cl (see COUNTER below). */
+#define SHIFT_ACCUM                   \
+"shr     %%cl,         %%rsi\n\t"
+/* Registers used by the loop tail: ACCUM keeps a copy of the shifted sum. */
+#define ACCUM    "%%rdx"
+#define RESULT   "%%rsi"
+#define RESULT32 "%%esi"
+/* Operand constraints: values live in registers, the shift count in ecx. */
+#define READVAL "r"
+#define RDWRVAL "+r"
+#define COUNTER "c"
+/* ecx is already an input operand here, so it is not added to the clobbers. */
+#define ECXUSED
+#else /* if ARCH_X86_32 */
+/*
+* One multiply-accumulate step: emits the jump label, loads offset+offs(%0)
+* into eax, multiplies it (signed, one-operand imull) by offset+offc(%1) and
+* adds the edx:eax product to the accumulator held in ecx:esi.
+*/
+#define MLPMUL(label, offset, offs, offc)  \
+MANGLE(label)":                  \n\t" \
+"mov   "offset"+"offs"(%0), %%eax\n\t" \
+"imull "offset"+"offc"(%1)       \n\t" \
+"add                %%eax , %%esi\n\t" \
+"adc                %%edx , %%ecx\n\t"
+/* No register-held coefficients on x86_32: firc is ignored. */
+#define FIRMULREG(label, offset, firc)  \
+MLPMUL(label, #offset, "0", "0")
+/* Zero both halves of the accumulator. */
+#define CLEAR_ACCUM                  \
+"xor           %%esi, %%esi\n\t" \
+"xor           %%ecx, %%ecx\n\t"
+/*
+* Copy the accumulator from ecx:esi to edx:eax, load the shift count from
+* operand 7 into ecx and shift the low half right, filling from edx.
+* The last line must not end in a line continuation, otherwise the
+* following #define would be swallowed into this macro.
+*/
+#define SHIFT_ACCUM                  \
+"mov           %%ecx, %%edx\n\t" \
+"mov           %%esi, %%eax\n\t" \
+"movzbl        %7   , %%ecx\n\t" \
+"shrd    %%cl, %%edx, %%eax\n\t"
+/* Registers used by the loop tail: ACCUM keeps a copy of the shifted sum. */
+#define ACCUM    "%%edx"
+#define RESULT   "%%eax"
+#define RESULT32 "%%eax"
+/* Operand constraints: values stay in memory to save registers. */
+#define READVAL "m"
+#define RDWRVAL "+m"
+#define COUNTER "m"
+/* ecx holds the accumulator high half and the shift count: clobber it. */
+#define ECXUSED , "ecx"
+#endif /* !ARCH_X86_64 */
+/* Byte amount added to the sample pointer (%2) after each sample. */
+#define BINC  AV_STRINGIFY(4* MAX_CHANNELS)
+/* Byte offset from the FIR state to the IIR state within the state buffer. */
+#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
+/* Byte offset from the FIR coefficients to the IIR coefficients. */
+#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)
+/* FIR tap: state and coefficient are read at the plain offset. */
+#define FIRMUL(label, offset) MLPMUL(label, #offset,   "0",   "0")
+/* IIR tap: state and coefficient are read at offset + IOFFS / IOFFC. */
+#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
+/**
+* x86 inline-asm implementation of the MLP channel filter.
+*
+* Each loop iteration clears the accumulator, jumps into an unrolled chain of
+* multiply-accumulate steps (the entry label selects how many FIR and IIR
+* taps run), shifts the accumulator right by filter_shift, adds the current
+* sample and applies mask. The masked result is written back to the sample
+* buffer and to the state slot just below the current one; the difference
+* between the masked result and the shifted accumulator is written IOFFS
+* bytes above that slot.
+*
+* The loop condition is tested at the end, so the body always runs at least
+* once.
+*
+* @param state         state buffer; moved down by one int32_t per sample
+* @param coeff         filter coefficients (FIR at offset 0, IIR at IOFFC)
+* @param firorder      index into firtable, selects the FIR entry label
+* @param iirorder      index into iirtable, selects the IIR entry label
+* @param filter_shift  right shift applied to the accumulated sum
+* @param mask          ANDed with the sum of accumulator and input sample
+* @param blocksize     number of samples to process
+* @param sample_buffer samples, read and overwritten; advanced by BINC bytes
+*                      per sample
+*/
+static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
+                                   int firorder, int iirorder,
+                                   unsigned int filter_shift, int32_t mask,
+                                   int blocksize, int32_t *sample_buffer)
+{
+    const void *firjump = firtable[firorder];
+    const void *iirjump = iirtable[iirorder];
+    /* READVAL is "m" on x86_32 and a memory operand has to be an lvalue, so
+     * the register-width mask gets a named variable instead of a cast. */
+    x86_reg mask_reg = mask;
+    /* Count up from -blocksize so that "incl; js" can terminate the loop. */
+    blocksize = -blocksize;
+    __asm__ volatile(
+        "1:                           \n\t"
+        CLEAR_ACCUM
+        "jmp  *%5                     \n\t"
+        FIRMUL   (ff_mlp_firorder_8, 0x1c   )
+        FIRMUL   (ff_mlp_firorder_7, 0x18   )
+        FIRMUL   (ff_mlp_firorder_6, 0x14   )
+        FIRMUL   (ff_mlp_firorder_5, 0x10   )
+        FIRMUL   (ff_mlp_firorder_4, 0x0c   )
+        FIRMULREG(ff_mlp_firorder_3, 0x08,10)
+        FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
+        FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
+        MANGLE   (ff_mlp_firorder_0)":\n\t"
+        "jmp  *%6                     \n\t"
+        IIRMUL   (ff_mlp_iirorder_4, 0x0c   )
+        IIRMUL   (ff_mlp_iirorder_3, 0x08   )
+        IIRMUL   (ff_mlp_iirorder_2, 0x04   )
+        IIRMUL   (ff_mlp_iirorder_1, 0x00   )
+        MANGLE   (ff_mlp_iirorder_0)":\n\t"
+        SHIFT_ACCUM
+        "mov  "RESULT"  ,"ACCUM"      \n\t"
+        "add  (%2)      ,"RESULT"     \n\t"
+        "and   %4       ,"RESULT"     \n\t"
+        "sub   $4       ,  %0         \n\t"
+        "mov  "RESULT32", (%0)        \n\t"
+        "mov  "RESULT32", (%2)        \n\t"
+        "add $"BINC"    ,  %2         \n\t"
+        "sub  "ACCUM"   ,"RESULT"     \n\t"
+        "mov  "RESULT32","IOFFS"(%0)  \n\t"
+        "incl              %3         \n\t"
+        "js 1b                        \n\t"
+        : /* 0*/"+r"(state),
+          /* 1*/"+r"(coeff),
+          /* 2*/"+r"(sample_buffer),
+          /* 3*/RDWRVAL(blocksize)
+        :
+          /* 4*/READVAL(mask_reg),
+          /* 5*/READVAL(firjump),
+          /* 6*/READVAL(iirjump),
+          /* 7*/COUNTER(filter_shift)
+#if ARCH_X86_64
+        , /* 8*/"r"((int64_t)coeff[0])
+        , /* 9*/"r"((int64_t)coeff[1])
+        , /*10*/"r"((int64_t)coeff[2])
+#endif /* ARCH_X86_64 */
+        : REG_a, REG_d, REG_S
+          ECXUSED
+          /* The loop stores through state and sample_buffer, which the
+           * operand list alone does not tell the compiler. */
+        , "memory"
+    );
+}
+#endif /* HAVE_7REGS && HAVE_TEN_OPERANDS */
+/**
+* Install the x86 channel filter into the DSP context.
+*
+* The function pointer is only replaced when the build provides both
+* HAVE_7REGS and HAVE_TEN_OPERANDS; otherwise c is left untouched.
+* avctx is not used.
+*/
+void ff_mlp_init_x86(DSPContext *c, AVCodecContext *avctx)
+{
+#if HAVE_7REGS && HAVE_TEN_OPERANDS
+    c->mlp_filter_channel = mlp_filter_channel_x86;
+#endif /* HAVE_7REGS && HAVE_TEN_OPERANDS */
+}

Mercurial > libavcodec.hg

comparison x86/mlpdsp.c @ 9688:128531f67aa1 libavcodec