# HG changeset patch
# User cehoyos
# Date 1231032971 0
# Node ID cc64e134339712862e1de1d4554c50319028e7cc
# Parent f2c406b05158c7e666b44128dfa397b0bb8fc580
Use H264 MMX chroma functions to accelerate RV40 decoding.
Patch by Mathieu Velten (matmaul A gmail)

diff -r f2c406b05158 -r cc64e1343397 x86/dsputil_h264_template_mmx.c
--- a/x86/dsputil_h264_template_mmx.c	Sat Jan 03 19:21:48 2009 +0000
+++ b/x86/dsputil_h264_template_mmx.c	Sun Jan 04 01:36:11 2009 +0000
@@ -25,9 +25,8 @@
  * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
  * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
  */
-static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
+static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
 {
-    const uint64_t *rnd_reg;
     DECLARE_ALIGNED_8(uint64_t, AA);
     DECLARE_ALIGNED_8(uint64_t, DD);
     int i;
@@ -45,17 +44,15 @@
         /* 1 dimensional filter only */
         const int dxy = x ? 1 : stride;
 
-        rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3;
-
         __asm__ volatile(
             "movd %0, %%mm5\n\t"
             "movq %1, %%mm4\n\t"
-            "movq %2, %%mm6\n\t"         /* mm6 = rnd */
+            "movq %2, %%mm6\n\t"         /* mm6 = rnd >> 3 */
             "punpcklwd %%mm5, %%mm5\n\t"
             "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
             "pxor %%mm7, %%mm7\n\t"
             "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
-            :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg));
+            :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
 
         for(i=0; i<h; i++) {
[...]
-                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */
+                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */
                 "paddw %%mm6, %%mm0\n\t"
                 "paddw %%mm6, %%mm1\n\t"
                 "paddw %%mm2, %%mm0\n\t"
@@ -97,7 +94,6 @@
     }
 
     /* general case, bilinear */
-    rnd_reg = rnd ? &ff_pw_32.a : &ff_pw_28.a;
     __asm__ volatile("movd %2, %%mm4\n\t"
                      "movd %3, %%mm6\n\t"
                      "punpcklwd %%mm4, %%mm4\n\t"
@@ -172,7 +168,7 @@
             : : "m" (src[0]), "m" (src[1]), "m" (DD));
 
         __asm__ volatile(
-            /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */
+            /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */
             "paddw %1, %%mm2\n\t"
             "paddw %1, %%mm3\n\t"
             "psrlw $6, %%mm2\n\t"
@@ -185,7 +181,7 @@
     }
 }
 
-static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
 {
     __asm__ volatile(
         "pxor %%mm7, %%mm7 \n\t"
@@ -249,7 +245,7 @@
         "sub $2, %2 \n\t"
         "jnz 1b \n\t"
         : "+r"(dst), "+r"(src), "+r"(h)
-        : "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y)
+        : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
         );
 }
 
diff -r f2c406b05158 -r cc64e1343397 x86/dsputil_mmx.c
--- a/x86/dsputil_mmx.c	Sat Jan 03 19:21:48 2009 +0000
+++ b/x86/dsputil_mmx.c	Sun Jan 04 01:36:11 2009 +0000
@@ -1733,6 +1733,7 @@
 #undef PREFETCH
 
 #include "h264dsp_mmx.c"
+#include "rv40dsp_mmx.c"
 
 /* CAVS specific */
 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
@@ -2638,6 +2639,9 @@
         c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
         c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
 
+        c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_mmx;
+        c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_mmx;
+
         c->h264_idct_dc_add=
         c->h264_idct_add= ff_h264_idct_add_mmx;
         c->h264_idct8_dc_add=
@@ -2723,6 +2727,9 @@
             SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
             SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
 
+            c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_mmx2;
+            c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_mmx2;
+
             c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
             c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
@@ -2808,6 +2815,9 @@
             c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
 
+            c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_3dnow;
+            c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_3dnow;
+
             if (ENABLE_CAVS_DECODER)
                 ff_cavsdsp_init_3dnow(c, avctx);
         }
diff -r f2c406b05158 -r cc64e1343397 x86/h264dsp_mmx.c
--- a/x86/h264dsp_mmx.c	Sat Jan 03 19:21:48 2009 +0000
+++ b/x86/h264dsp_mmx.c	Sun Jan 04 01:36:11 2009 +0000
@@ -2084,22 +2084,30 @@
 H264_MC_816(H264_MC_HV, ssse3)
 #endif
 
+/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */
+DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg[4]) = {
+    0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
+};
 #define H264_CHROMA_OP(S,D)
 #define H264_CHROMA_OP4(S,D,T)
-#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
-#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
+#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx
+#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx
 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
 #include "dsputil_h264_template_mmx.c"
 static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
-    put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 1);
+    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
 }
 static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
-    put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 0);
+    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2);
+}
+static void put_h264_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
 }
 
 #undef H264_CHROMA_OP
@@ -2112,14 +2120,18 @@
 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
 #define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
                                "pavgb " #T ", " #D " \n\t"
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2
 #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
 #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
 #include "dsputil_h264_template_mmx.c"
 static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
-    avg_h264_chroma_mc8_mmx2(dst, src, stride, h, x, y, 1);
+    avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
+}
+static void avg_h264_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
 }
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
@@ -2131,13 +2143,17 @@
 #define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
 #define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
                                "pavgusb " #T ", " #D " \n\t"
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow
 #define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
 #include "dsputil_h264_template_mmx.c"
 static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
-    avg_h264_chroma_mc8_3dnow(dst, src, stride, h, x, y, 1);
+    avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
+}
+static void avg_h264_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
 }
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
diff -r f2c406b05158 -r cc64e1343397 x86/rv40dsp_mmx.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/x86/rv40dsp_mmx.c	Sun Jan 04 01:36:11 2009 +0000
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2008 Konstantin Shishkov, Mathieu Velten
+ *
+ * MMX-optimized DSP functions for RV40, based on H.264 optimizations by
+ * Michael Niedermayer and Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_mmx.h"
+
+/* bias interleaved with bias div 8, use p+1 to access bias div 8 */
+DECLARE_ALIGNED_8(static const uint64_t, rv40_bias_reg[4][8]) = {
+    { 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0010001000100010ULL, 0x0002000200020002ULL,
+      0x0020002000200020ULL, 0x0004000400040004ULL, 0x0010001000100010ULL, 0x0002000200020002ULL },
+    { 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL,
+      0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL },
+    { 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0020002000200020ULL, 0x0004000400040004ULL,
+      0x0010001000100010ULL, 0x0002000200020002ULL, 0x0020002000200020ULL, 0x0004000400040004ULL },
+    { 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL,
+      0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL }
+};
+
+static void put_rv40_chroma_mc8_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
+static void put_rv40_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
+static void avg_rv40_chroma_mc8_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
+static void avg_rv40_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
+static void avg_rv40_chroma_mc8_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
+static void avg_rv40_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
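
Note on the rounding/bias tables above (not part of the patch itself): the shared template now receives a pointer to an interleaved pair of packed 16-bit constants, reading p[0] as the full bias (added before the ">> 6" of the bilinear path and in the mc4 template) and p[1] as bias >> 3 (added before the ">> 3" of the one-dimensional path). For RV40 the pair additionally depends on the chroma subpel position (x, y), which is why the wrappers index rv40_bias_reg[y>>1][x&(~1)]. A minimal C sketch of that selection follows; rv40_select_bias is a hypothetical helper used only for illustration.

#include <stdint.h>

/* Illustrative only: mirrors how the RV40 wrappers above pick a
 * (bias, bias >> 3) pair for a given chroma subpel position (x, y),
 * each in 0..7 as used by the shared template. */
static const uint64_t *rv40_select_bias(const uint64_t bias_reg[4][8],
                                        int x, int y)
{
    /* y >> 1 selects one of 4 rows; x & ~1 selects one of 4 pairs in
     * that row.  The returned pointer is what the template receives
     * as rnd_reg: it reads p[0] (bias) and p[1] (bias >> 3). */
    return &bias_reg[y >> 1][x & ~1];
}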