diff i386/dsputil_mmx.c @ 6331:c57670e07668 libavcodec
ssse3 h264 motion compensation.
25% faster than mmx on core2, 35% if you discount fullpel, 4% overall decoding.
| author   | lorenm                          |
|----------|---------------------------------|
| date     | Tue, 05 Feb 2008 11:22:55 +0000 |
| parents  | 5969caa9190d                    |
| children | beb52d4a5efe                    |
```diff
--- a/i386/dsputil_mmx.c  Tue Feb 05 03:58:13 2008 +0000
+++ b/i386/dsputil_mmx.c  Tue Feb 05 11:22:55 2008 +0000
@@ -49,10 +49,10 @@
 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_5 ) = 0x0005000500050005ULL;
+DECLARE_ALIGNED_16(const xmm_t,    ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8 ) = 0x0008000800080008ULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
+DECLARE_ALIGNED_16(const xmm_t,    ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
 DECLARE_ALIGNED_16(const xmm_t,    ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
@@ -478,6 +478,54 @@
     );
 }
 
+static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    __asm __volatile(
+        "1:                          \n\t"
+        "movdqu (%1), %%xmm0         \n\t"
+        "movdqu (%1,%3), %%xmm1      \n\t"
+        "movdqu (%1,%3,2), %%xmm2    \n\t"
+        "movdqu (%1,%4), %%xmm3      \n\t"
+        "movdqa %%xmm0, (%2)         \n\t"
+        "movdqa %%xmm1, (%2,%3)      \n\t"
+        "movdqa %%xmm2, (%2,%3,2)    \n\t"
+        "movdqa %%xmm3, (%2,%4)      \n\t"
+        "subl $4, %0                 \n\t"
+        "lea (%1,%3,4), %1           \n\t"
+        "lea (%2,%3,4), %2           \n\t"
+        "jnz 1b                      \n\t"
+        : "+g"(h), "+r" (pixels), "+r" (block)
+        : "r"((long)line_size), "r"(3L*line_size)
+        : "memory"
+    );
+}
+
+static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    __asm __volatile(
+        "1:                          \n\t"
+        "movdqu (%1), %%xmm0         \n\t"
+        "movdqu (%1,%3), %%xmm1      \n\t"
+        "movdqu (%1,%3,2), %%xmm2    \n\t"
+        "movdqu (%1,%4), %%xmm3      \n\t"
+        "pavgb (%2), %%xmm0          \n\t"
+        "pavgb (%2,%3), %%xmm1       \n\t"
+        "pavgb (%2,%3,2), %%xmm2     \n\t"
+        "pavgb (%2,%4), %%xmm3       \n\t"
+        "movdqa %%xmm0, (%2)         \n\t"
+        "movdqa %%xmm1, (%2,%3)      \n\t"
+        "movdqa %%xmm2, (%2,%3,2)    \n\t"
+        "movdqa %%xmm3, (%2,%4)      \n\t"
+        "subl $4, %0                 \n\t"
+        "lea (%1,%3,4), %1           \n\t"
+        "lea (%2,%3,4), %2           \n\t"
+        "jnz 1b                      \n\t"
+        : "+g"(h), "+r" (pixels), "+r" (block)
+        : "r"((long)line_size), "r"(3L*line_size)
+        : "memory"
+    );
+}
+
 static void clear_blocks_mmx(DCTELEM *blocks)
 {
     __asm __volatile(
@@ -3475,6 +3523,23 @@
         c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
     }
 
+/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
+    if((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)){
+        // these functions are slower than mmx on AMD, but faster on Intel
+        c->put_pixels_tab[0][0] = put_pixels16_sse2;
+        c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
+    }
+*/
+
+#ifdef HAVE_SSSE3
+    if(mm_flags & MM_SSSE3){
+        SET_QPEL_FUNCS(put_h264_qpel, 0, 16, ssse3);
+        SET_QPEL_FUNCS(put_h264_qpel, 1,  8, ssse3);
+        SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, ssse3);
+        SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, ssse3);
+    }
+#endif
+
 #ifdef CONFIG_ENCODERS
     if(mm_flags & MM_SSE2){
         c->sum_abs_dctelem= sum_abs_dctelem_sse2;
```
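
For readers who don't speak x86 inline asm, here is a plain-C sketch of what the two new SSE2 functions compute, derived from the instructions used above: `movdqu` is an unaligned 16-byte load, `movdqa` an aligned 16-byte store, and `pavgb` a byte-wise average with upward rounding, `(a + b + 1) >> 1`. The `_ref` names are mine, not part of the patch.

```c
#include <stdint.h>

/* Reference for put_pixels16_sse2: copy a 16-byte-wide block of h rows,
 * stepping line_size bytes from one row to the next. */
static void put_pixels16_ref(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            block[j] = pixels[j];
        block  += line_size;
        pixels += line_size;
    }
}

/* Reference for avg_pixels16_sse2: average the source into the
 * destination with pavgb rounding, (a + b + 1) >> 1. */
static void avg_pixels16_ref(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            block[j] = (block[j] + pixels[j] + 1) >> 1;
        block  += line_size;
        pixels += line_size;
    }
}
```

The asm unrolls this four rows per iteration (`subl $4, %0` / `jnz 1b`), so it assumes `h` is a multiple of 4.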
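The SSE2 hookup stays commented out because `movdqa` faults on a destination that is not 16-byte aligned, and svq1 hands these functions unaligned chroma pointers (the FIXME in the last hunk). A hypothetical per-call guard, purely to illustrate the constraint (the wrapper and the fallback choice are mine; `put_pixels16_mmx` is the existing MMX version in this file):

```c
#include <stdint.h>

/* Both functions are static in i386/dsputil_mmx.c; declared here only to
 * keep the sketch self-contained. */
void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels16_mmx (uint8_t *block, const uint8_t *pixels, int line_size, int h);

/* Hypothetical wrapper: take the movdqa path only when the destination
 * pointer and the row stride keep every store 16-byte aligned. */
static void put_pixels16_checked(uint8_t *block, const uint8_t *pixels,
                                 int line_size, int h)
{
    if ((((uintptr_t)block | (uintptr_t)line_size) & 15) == 0)
        put_pixels16_sse2(block, pixels, line_size, h);
    else
        put_pixels16_mmx(block, pixels, line_size, h);
}
```

The patch instead simply leaves the fast path disabled; a per-call branch like this would cost a little on every invocation.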
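The SSSE3 block is the point of the commit: it fills the H.264 quarter-pel tables with SSSE3 kernels, index 0 for the 16x16 functions and index 1 for the 8x8 ones. Each `SET_QPEL_FUNCS` invocation installs one function per quarter-pel phase, mc00 through mc33. As a sketch of how such a table is consumed (the typedef follows dsputil's `qpel_mc_func`; the helper and the `(my&3)*4 + (mx&3)` indexing convention are assumptions for illustration):

```c
#include <stdint.h>

typedef void (*qpel_mc_func)(uint8_t *dst, uint8_t *src, int stride);

/* Illustrative helper (not in the patch): select the MC kernel for a
 * motion vector given in quarter-pel units, assuming the table layout
 * tab[(my&3)*4 + (mx&3)], i.e. entries mc00..mc33. */
static qpel_mc_func select_qpel(qpel_mc_func tab[16], int mx, int my)
{
    return tab[((my & 3) << 2) | (mx & 3)];
}
```

A full-pel vector lands on mc00, a plain copy; that is why the commit message quotes a separate speedup "if you discount fullpel", since there is no interpolation for SSSE3 to accelerate there.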