# HG changeset patch
# User ramiro
# Date 1243038210 0
# Node ID 128531f67aa11b4613c62e4c38b89e85c2587fad
# Parent d9f8496b3b9138539b6f04a93d78fc016f352422
MLP DSP functions x86-optimized.
12.59% overall speedup in x86_32
9.98% overall speedup in x86_64
compared to gcc 4.3.3

diff -r d9f8496b3b91 -r 128531f67aa1 Makefile
--- a/Makefile	Fri May 22 22:01:50 2009 +0000
+++ b/Makefile	Sat May 23 00:23:30 2009 +0000
@@ -445,8 +445,10 @@
 MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o
 MMX-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flacdsp_mmx.o
 MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o
+MMX-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
 MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp_mmx.o
 MMX-OBJS-$(CONFIG_THEORA_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
 MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
 MMX-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
 MMX-OBJS-$(CONFIG_VP5_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
diff -r d9f8496b3b91 -r 128531f67aa1 mlpdsp.c
--- a/mlpdsp.c	Fri May 22 22:01:50 2009 +0000
+++ b/mlpdsp.c	Sat May 23 00:23:30 2009 +0000
@@ -55,7 +55,11 @@
     }
 }
 
+void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx);
+
 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx)
 {
     c->mlp_filter_channel = ff_mlp_filter_channel;
+    if (ARCH_X86)
+        ff_mlp_init_x86(c, avctx);
 }
diff -r d9f8496b3b91 -r 128531f67aa1 x86/mlpdsp.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/x86/mlpdsp.c	Sat May 23 00:23:30 2009 +0000
@@ -0,0 +1,217 @@
+/*
+ * MLP DSP functions x86-optimized
+ * Copyright (c) 2009 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mlp.h"
+
+#if HAVE_7REGS && HAVE_TEN_OPERANDS
+
+/*
+ * Jump targets: labels emitted by the inline asm in mlp_filter_channel_x86().
+ * Only their addresses are taken. They are declared char rather than void
+ * because an identifier cannot be declared as an object of type void.
+ */
+extern char ff_mlp_firorder_8;
+extern char ff_mlp_firorder_7;
+extern char ff_mlp_firorder_6;
+extern char ff_mlp_firorder_5;
+extern char ff_mlp_firorder_4;
+extern char ff_mlp_firorder_3;
+extern char ff_mlp_firorder_2;
+extern char ff_mlp_firorder_1;
+extern char ff_mlp_firorder_0;
+
+extern char ff_mlp_iirorder_4;
+extern char ff_mlp_iirorder_3;
+extern char ff_mlp_iirorder_2;
+extern char ff_mlp_iirorder_1;
+extern char ff_mlp_iirorder_0;
+
+/* Entry label per filter order: firtable[0..8], iirtable[0..4]. */
+static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
+                                   &ff_mlp_firorder_2, &ff_mlp_firorder_3,
+                                   &ff_mlp_firorder_4, &ff_mlp_firorder_5,
+                                   &ff_mlp_firorder_6, &ff_mlp_firorder_7,
+                                   &ff_mlp_firorder_8 };
+static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
+                                   &ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
+                                   &ff_mlp_iirorder_4 };
+
+#if ARCH_X86_64
+
+/* One tap: rsi += sign-extended state word * sign-extended coeff word. */
+#define MLPMUL(label, offset, offs, offc)   \
+    MANGLE(label)": \n\t"                   \
+    "movslq "offset"+"offs"(%0), %%rax\n\t" \
+    "movslq "offset"+"offc"(%1), %%rdx\n\t" \
+    "imul %%rdx, %%rax\n\t"                 \
+    "add %%rax, %%rsi\n\t"
+
+/* One FIR tap whose coefficient is asm operand %firc instead of memory. */
+#define FIRMULREG(label, offset, firc)\
+    MANGLE(label)": \n\t"             \
+    "movslq "#offset"(%0), %%rax\n\t" \
+    "imul %"#firc", %%rax\n\t"        \
+    "add %%rax, %%rsi\n\t"
+
+#define CLEAR_ACCUM \
+    "xor %%rsi, %%rsi\n\t"
+
+#define SHIFT_ACCUM \
+    "shr %%cl, %%rsi\n\t"
+
+#define ACCUM    "%%rdx"
+#define RESULT   "%%rsi"
+#define RESULT32 "%%esi"
+
+#define READVAL "r"
+#define RDWRVAL "+r"
+#define COUNTER "c"
+#define ECXUSED
+
+#else /* if ARCH_X86_32 */
+
+/* One tap: ecx:esi += state word * coeff word (imull leaves it in edx:eax). */
+#define MLPMUL(label, offset, offs, offc)  \
+    MANGLE(label)": \n\t"                  \
+    "mov "offset"+"offs"(%0), %%eax\n\t"   \
+    "imull "offset"+"offc"(%1) \n\t"       \
+    "add %%eax , %%esi\n\t"                \
+    "adc %%edx , %%ecx\n\t"
+
+#define FIRMULREG(label, offset, firc) \
+    MLPMUL(label, #offset, "0", "0")
+
+#define CLEAR_ACCUM        \
+    "xor %%esi, %%esi\n\t" \
+    "xor %%ecx, %%ecx\n\t"
+
+#define SHIFT_ACCUM         \
+    "mov %%ecx, %%edx\n\t"  \
+    "mov %%esi, %%eax\n\t"  \
+    "movzbl %7 , %%ecx\n\t" \
+    "shrd %%cl, %%edx, %%eax\n\t"
+
+#define ACCUM    "%%edx"
+#define RESULT   "%%eax"
+#define RESULT32 "%%eax"
+
+#define READVAL "m"
+#define RDWRVAL "+m"
+#define COUNTER "m"
+#define ECXUSED , "ecx"
+
+#endif /* !ARCH_X86_64 */
+
+/* Byte offsets/strides for the asm, built from the mlp.h constants. */
+#define BINC  AV_STRINGIFY(4* MAX_CHANNELS)
+#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
+#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)
+
+#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
+#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
+
+/**
+ * x86 inline-asm implementation of the MLP channel filter.
+ *
+ * For every sample the asm sums state[i] * coeff[i] over the FIR taps and
+ * state[MAX_FIR_ORDER + MAX_BLOCKSIZE + i] * coeff[MAX_FIR_ORDER + i] over the
+ * IIR taps, shifts the sum right by filter_shift, adds *sample_buffer and ANDs
+ * with mask. That value is stored to *sample_buffer and to the new first FIR
+ * state slot; the value minus the shifted sum is stored to the matching IIR
+ * state slot. Each iteration moves the local state pointer down one element
+ * and sample_buffer up by MAX_CHANNELS elements.
+ *
+ * firorder (0..8) and iirorder (0..4) index the jump tables unchecked. The
+ * loop is bottom-tested, so its body runs once even if blocksize is 0.
+ */
+static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
+                                   int firorder, int iirorder,
+                                   unsigned int filter_shift, int32_t mask,
+                                   int blocksize, int32_t *sample_buffer)
+{
+    const void *firjump = firtable[firorder];
+    const void *iirjump = iirtable[iirorder];
+
+    blocksize = -blocksize; /* count up to zero so "incl; js" ends the loop */
+
+    __asm__ volatile(
+        "1: \n\t"
+        CLEAR_ACCUM
+        "jmp *%5 \n\t"
+        FIRMUL   (ff_mlp_firorder_8, 0x1c   )
+        FIRMUL   (ff_mlp_firorder_7, 0x18   )
+        FIRMUL   (ff_mlp_firorder_6, 0x14   )
+        FIRMUL   (ff_mlp_firorder_5, 0x10   )
+        FIRMUL   (ff_mlp_firorder_4, 0x0c   )
+        FIRMULREG(ff_mlp_firorder_3, 0x08,10)
+        FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
+        FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
+        MANGLE   (ff_mlp_firorder_0)":\n\t"
+        "jmp *%6 \n\t"
+        IIRMUL   (ff_mlp_iirorder_4, 0x0c   )
+        IIRMUL   (ff_mlp_iirorder_3, 0x08   )
+        IIRMUL   (ff_mlp_iirorder_2, 0x04   )
+        IIRMUL   (ff_mlp_iirorder_1, 0x00   )
+        MANGLE   (ff_mlp_iirorder_0)":\n\t"
+        SHIFT_ACCUM
+        "mov "RESULT" ,"ACCUM" \n\t"
+        "add (%2) ,"RESULT" \n\t"
+        "and %4 ,"RESULT" \n\t"
+        "sub $4 , %0 \n\t"
+        "mov "RESULT32", (%0) \n\t"
+        "mov "RESULT32", (%2) \n\t"
+        "add $"BINC" , %2 \n\t"
+        "sub "ACCUM" ,"RESULT" \n\t"
+        "mov "RESULT32","IOFFS"(%0) \n\t"
+        "incl %3 \n\t"
+        "js 1b \n\t"
+        : /* 0*/"+r"(state),
+          /* 1*/"+r"(coeff),
+          /* 2*/"+r"(sample_buffer),
+          /* 3*/RDWRVAL(blocksize)
+        :
+          /* 4*/READVAL((x86_reg)mask),
+          /* 5*/READVAL(firjump),
+          /* 6*/READVAL(iirjump),
+          /* 7*/COUNTER(filter_shift)
+#if ARCH_X86_64
+        , /* 8*/"r"((int64_t)coeff[0])
+        , /* 9*/"r"((int64_t)coeff[1])
+        , /*10*/"r"((int64_t)coeff[2])
+#endif /* ARCH_X86_64 */
+        /* "memory": the asm reads and writes state[] and sample_buffer[]
+         * through pointers, which the operand list alone does not express. */
+        : REG_a, REG_d, REG_S, "memory"
+        ECXUSED
+        );
+}
+
+#endif /* HAVE_7REGS && HAVE_TEN_OPERANDS */
+
+/* Install the x86 filter if the inline asm above was compiled in. */
+void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx)
+{
+#if HAVE_7REGS && HAVE_TEN_OPERANDS
+    c->mlp_filter_channel = mlp_filter_channel_x86;
+#endif
+}