Mercurial > libavcodec.hg
diff x86/dsputil_h264_template_mmx.c @ 8519:cc64e1343397 libavcodec
Use H264 MMX chroma functions to accelerate RV40 decoding.
Patch by Mathieu Velten (matmaul A gmail)
author | cehoyos |
---|---|
date | Sun, 04 Jan 2009 01:36:11 +0000 |
parents | 7768bdfd4f7b |
children | 98970e51365a |
line wrap: on
line diff
--- a/x86/dsputil_h264_template_mmx.c Sat Jan 03 19:21:48 2009 +0000
+++ b/x86/dsputil_h264_template_mmx.c Sun Jan 04 01:36:11 2009 +0000
@@ -25,9 +25,8 @@
  * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
  * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
  */
-static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
+static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
 {
-    const uint64_t *rnd_reg;
     DECLARE_ALIGNED_8(uint64_t, AA);
     DECLARE_ALIGNED_8(uint64_t, DD);
     int i;
@@ -45,17 +44,15 @@
         /* 1 dimensional filter only */
         const int dxy = x ? 1 : stride;
 
-        rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3;
-
         __asm__ volatile(
             "movd %0, %%mm5\n\t"
             "movq %1, %%mm4\n\t"
-            "movq %2, %%mm6\n\t" /* mm6 = rnd */
+            "movq %2, %%mm6\n\t" /* mm6 = rnd >> 3 */
             "punpcklwd %%mm5, %%mm5\n\t"
             "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
             "pxor %%mm7, %%mm7\n\t"
             "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */
-            :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg));
+            :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
 
         for(i=0; i<h; i++) {
             __asm__ volatile(
@@ -78,7 +75,7 @@
                 "pmullw %%mm5, %%mm2\n\t"
                 "pmullw %%mm5, %%mm3\n\t"
 
-                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */
+                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */
                 "paddw %%mm6, %%mm0\n\t"
                 "paddw %%mm6, %%mm1\n\t"
                 "paddw %%mm2, %%mm0\n\t"
@@ -97,7 +94,6 @@
     }
 
     /* general case, bilinear */
-    rnd_reg = rnd ? &ff_pw_32.a : &ff_pw_28.a;
     __asm__ volatile("movd %2, %%mm4\n\t"
                      "movd %3, %%mm6\n\t"
                      "punpcklwd %%mm4, %%mm4\n\t"
@@ -172,7 +168,7 @@
             : : "m" (src[0]), "m" (src[1]), "m" (DD));
 
         __asm__ volatile(
-            /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */
+            /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */
             "paddw %1, %%mm2\n\t"
             "paddw %1, %%mm3\n\t"
             "psrlw $6, %%mm2\n\t"
@@ -185,7 +181,7 @@
     }
 }
 
-static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
 {
     __asm__ volatile(
         "pxor %%mm7, %%mm7 \n\t"
@@ -249,7 +245,7 @@
         "sub $2, %2 \n\t"
         "jnz 1b \n\t"
         : "+r"(dst), "+r"(src), "+r"(h)
-        : "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y)
+        : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
     );
 }
 