# HG changeset patch
# User lorenm
# Date 1255896610 0
# Node ID 12c8175d6db5f5031a16c69a7328e5af5177b22c
# Parent  289dd8daf4ee8360bf4b169b022926d17186965b
simd add_hfyu_left_prediction
2.2x faster than C on conroe, 3.6x on penryn.
4-6% faster huffyuv decoding if using left or plane mode and yuv

diff -r 289dd8daf4ee -r 12c8175d6db5 dsputil.h
--- a/dsputil.h	Sun Oct 18 19:51:18 2009 +0000
+++ b/dsputil.h	Sun Oct 18 20:10:10 2009 +0000
@@ -349,7 +349,7 @@
      */
     void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
     void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
-    int  (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int acc);
+    int  (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left);
     void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue);
     /* this might write to dst[w] */
     void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
diff -r 289dd8daf4ee -r 12c8175d6db5 x86/dsputil_mmx.c
--- a/x86/dsputil_mmx.c	Sun Oct 18 19:51:18 2009 +0000
+++ b/x86/dsputil_mmx.c	Sun Oct 18 20:10:10 2009 +0000
@@ -2385,6 +2385,8 @@
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
+int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
+int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
 void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
@@ -2951,6 +2953,11 @@
             c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
             c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
+#if HAVE_YASM
+            c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
+            if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe
+                c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
+#endif
         }
 #endif
 
diff -r 289dd8daf4ee -r 12c8175d6db5 x86/dsputil_yasm.asm
--- a/x86/dsputil_yasm.asm	Sun Oct 18 19:51:18 2009 +0000
+++ b/x86/dsputil_yasm.asm	Sun Oct 18 20:10:10 2009 +0000
@@ -21,6 +21,13 @@
 
 %include "x86inc.asm"
 
+SECTION_RODATA
+pb_f: times 16 db 15                ; pshufb mask: broadcast byte 15 to every lane
+pb_zzzzzzzz77777777: times 8 db -1  ; only 8 bytes here; a 16-byte load continues into pb_7 below
+pb_7: times 8 db 7                  ; pshufb mask: broadcast byte 7; must stay directly after the label above
+pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 ; -1 index (high bit set) makes pshufb write 0
+pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+
 section .text align=16
 
 %macro PSWAPD_SSE 2
@@ -150,3 +157,70 @@
     movzx r2d, byte [topq-1]
     mov [left_topq], r2d
     RET
+
+
+%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned (selects the dst store form); byte-wise running sum of src into dst
+    add     srcq, wq                ; point both buffers at their end ...
+    add     dstq, wq
+    neg     wq                      ; ... and index them with a negative offset counting up to 0
+%%.loop:
+    mova    m1, [srcq+wq]           ; aligned load of mmsize source bytes
+    mova    m2, m1
+    psllw   m1, 8                   ; move each even byte into the odd byte of its 16-bit word
+    paddb   m1, m2                  ; prefix sums within byte pairs
+    mova    m2, m1
+    pshufb  m1, m3                  ; copy bytes 1,5,9,13 onto the following two bytes, zero elsewhere
+    paddb   m1, m2                  ; prefix sums within 4-byte groups
+    pshufb  m0, m5                  ; broadcast the top byte of m0 (initial left / last output of previous iteration)
+    mova    m2, m1
+    pshufb  m1, m4                  ; copy bytes 3,11 onto the following four bytes, zero elsewhere
+    paddb   m1, m2                  ; prefix sums within 8-byte groups
+%if mmsize == 16
+    mova    m2, m1
+    pshufb  m1, m6                  ; copy byte 7 onto the upper eight bytes, zero elsewhere
+    paddb   m1, m2                  ; prefix sums across all 16 bytes
+%endif
+    paddb   m0, m1                  ; add the carried-in left value to every lane
+%if %1
+    mova    [dstq+wq], m0
+%else
+    movq    [dstq+wq], m0           ; store as two 8-byte halves instead of one aligned 16-byte store
+    movhps  [dstq+wq+8], m0
+%endif
+    add     wq, mmsize              ; whole vectors only: the last iteration may run up to mmsize-1 bytes past w
+    jl %%.loop
+    mov     eax, mmsize-1           ; wq is now the overshoot, so the last requested byte
+    sub     eax, wd                 ; sits at lane mmsize-1-w of m0
+    movd    m1, eax
+    pshufb  m0, m1                  ; move that lane to byte 0
+    movd    eax, m0                 ; NOTE(review): bytes 1-3 of eax are copies of m0 byte 0, not zero - confirm callers only use the low byte
+    RET
+%endmacro
+
+; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
+INIT_MMX
+cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
+.skip_prologue:                     ; entry point used by the sse4 version when src is not 16-byte aligned
+    mova    m5, [pb_7 GLOBAL]
+    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
+    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
+    movd    m0, leftm
+    psllq   m0, 56                  ; keep only the low byte of left, placed in the top byte (lane 7)
+    ADD_HFYU_LEFT_LOOP 1
+
+INIT_XMM
+cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
+    mova    m5, [pb_f GLOBAL]
+    mova    m6, [pb_zzzzzzzz77777777 GLOBAL]
+    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
+    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
+    movd    m0, leftm
+    pslldq  m0, 15                  ; keep only the low byte of left, placed in the top byte (lane 15)
+    test    srcq, 15
+    jnz ff_add_hfyu_left_prediction_ssse3 %+ .skip_prologue ; the loop's src load is an aligned one, so fall back to 8-byte vectors
+    test    dstq, 15
+    jnz .unaligned
+    ADD_HFYU_LEFT_LOOP 1
+.unaligned:
+    ADD_HFYU_LEFT_LOOP 0
+
diff -r 289dd8daf4ee -r 12c8175d6db5 x86/x86inc.asm
--- a/x86/x86inc.asm	Sun Oct 18 19:51:18 2009 +0000
+++ b/x86/x86inc.asm	Sun Oct 18 20:10:10 2009 +0000
@@ -221,6 +221,7 @@
             CAT_UNDEF arg_name %+ %%i, d
             CAT_UNDEF arg_name %+ %%i, w
             CAT_UNDEF arg_name %+ %%i, b
+            CAT_UNDEF arg_name %+ %%i, m
             CAT_UNDEF arg_name, %%i
             %assign %%i %%i+1
         %endrep
@@ -232,6 +233,7 @@
         %xdefine %1d r %+ %%i %+ d
         %xdefine %1w r %+ %%i %+ w
         %xdefine %1b r %+ %%i %+ b
+        %xdefine %1m r %+ %%i %+ m
         CAT_XDEFINE arg_name, %%i, %1
         %assign %%i %%i+1
         %rotate 1