# HG changeset patch # User darkshikari # Date 1279750263 0 # Node ID baf13deed97e25766a2cfc8b90c9d71dffb55138 # Parent 9eef00a4328022ab400cb96ff4d931611c8c7411 Various VP8 x86 deblocking speedups SSSE3 versions, improve SSE2 versions a bit. SSE2/SSSE3 mbedge h functions are currently broken, so explicitly disable them. diff -r 9eef00a43280 -r baf13deed97e x86/vp8dsp-init.c --- a/x86/vp8dsp-init.c Wed Jul 21 20:51:01 2010 +0000 +++ b/x86/vp8dsp-init.c Wed Jul 21 22:11:03 2010 +0000 @@ -223,64 +223,31 @@ extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); -extern void ff_vp8_v_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim); -extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); -extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); -extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim); -extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); -extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); - -extern void ff_vp8_v_loop_filter16y_inner_mmx (uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_v_loop_filter16y_inner_sse2 (uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_h_loop_filter16y_inner_mmx (uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_h_loop_filter16y_inner_sse2 (uint8_t *dst, int stride, - int e, int i, int hvt); +#define DECLARE_LOOP_FILTER(NAME)\ +extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\ +extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\ +extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\ + int e, int i, int hvt);\ +extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\ + int e, int i, int hvt);\ +extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\ + int s, int e, int i, int hvt);\ +extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\ + int s, int e, int i, int hvt);\ +extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\ + int e, int i, int hvt);\ +extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\ + int e, int i, int hvt);\ +extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\ + int s, int e, int i, int hvt);\ +extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\ + int s, int e, int i, int hvt); -extern void ff_vp8_v_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); -extern void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); -extern void ff_vp8_v_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); -extern void ff_vp8_h_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); -extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); -extern void ff_vp8_h_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); +DECLARE_LOOP_FILTER(mmx) +DECLARE_LOOP_FILTER(mmxext) +DECLARE_LOOP_FILTER(sse2) +DECLARE_LOOP_FILTER(ssse3) -extern void ff_vp8_v_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_v_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_v_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_h_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_h_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_h_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride, - int e, int i, int hvt); - -extern void ff_vp8_v_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); -extern void ff_vp8_v_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); -extern void ff_vp8_v_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); -extern void ff_vp8_h_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); -extern void ff_vp8_h_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); -extern void ff_vp8_h_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV, - int s, int e, int i, int hvt); #endif #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ @@ -384,8 +351,8 @@ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; - c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext; - c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; + //c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; + //c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; } if (mm_flags & FF_MM_SSSE3) { @@ -395,6 +362,19 @@ VP8_BILINEAR_MC_FUNC(0, 16, ssse3); VP8_BILINEAR_MC_FUNC(1, 8, ssse3); VP8_BILINEAR_MC_FUNC(2, 4, ssse3); + + c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3; + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3; + + c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3; + c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3; + c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3; + c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3; + + c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3; + //c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3; + c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3; + //c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; } if (mm_flags & FF_MM_SSE4) { diff -r 9eef00a43280 -r baf13deed97e x86/vp8dsp.asm --- a/x86/vp8dsp.asm Wed Jul 21 20:51:01 2010 +0000 +++ b/x86/vp8dsp.asm Wed Jul 21 22:11:03 2010 +0000 @@ -1229,18 +1229,22 @@ movd [%7+%9*2], m%4 %endmacro -%macro SPLATB_REG 3 +%macro SPLATB_REG 3-4 movd %1, %2 +%ifidn %3, ssse3 + pshufb %1, %4 +%else punpcklbw %1, %1 %if mmsize == 16 ; sse2 - punpcklwd %1, %1 - pshufd %1, %1, 0x0 + pshuflw %1, %1, 0x0 + punpcklqdq %1, %1 %elifidn %3, mmx punpcklwd %1, %1 punpckldq %1, %1 %else ; mmxext pshufw %1, %1, 0x0 %endif +%endif %endmacro %macro SIMPLE_LOOPFILTER 3 @@ -1252,7 +1256,10 @@ %if mmsize == 8 ; mmx/mmxext mov r3, 2 %endif - SPLATB_REG m7, r2, %1 ; splat "flim" into register +%ifidn %1, ssse3 + pxor m0, m0 +%endif + SPLATB_REG m7, r2, %1, m0 ; splat "flim" into register ; set up indexes to address 4 rows mov r2, r1 @@ -1398,6 +1405,8 @@ INIT_XMM SIMPLE_LOOPFILTER sse2, v, 3 SIMPLE_LOOPFILTER sse2, h, 6 +SIMPLE_LOOPFILTER ssse3, v, 3 +SIMPLE_LOOPFILTER ssse3, h, 6 ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter_inner_(uint8_t *dst, [uint8_t *v,] int stride, @@ -1433,11 +1442,15 @@ %define stack_reg hev_thr_reg %endif +%ifidn %1, ssse3 + pxor m7, m7 +%endif + %ifndef m8 ; mmx/mmxext or sse2 on x86-32 ; splat function arguments - SPLATB_REG m0, E_reg, %1 ; E - SPLATB_REG m1, I_reg, %1 ; I - SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh + SPLATB_REG m0, E_reg, %1, m7 ; E + SPLATB_REG m1, I_reg, %1, m7 ; I + SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh ; align stack mov stack_reg, rsp ; backup stack pointer @@ -1470,9 +1483,9 @@ %define q0backup m8 ; splat function arguments - SPLATB_REG flim_E, E_reg, %1 ; E - SPLATB_REG flim_I, I_reg, %1 ; I - SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh + SPLATB_REG flim_E, E_reg, %1, m7 ; E + SPLATB_REG flim_I, I_reg, %1, m7 ; I + SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh %endif %if mmsize == 8 && %4 == 16 ; mmx/mmxext @@ -1884,15 +1897,15 @@ %endmacro INIT_MMX -INNER_LOOPFILTER mmx, v, 6, 16, 8 -INNER_LOOPFILTER mmx, h, 6, 16, 8 -INNER_LOOPFILTER mmxext, v, 6, 16, 8 -INNER_LOOPFILTER mmxext, h, 6, 16, 8 +INNER_LOOPFILTER mmx, v, 6, 16, 0 +INNER_LOOPFILTER mmx, h, 6, 16, 0 +INNER_LOOPFILTER mmxext, v, 6, 16, 0 +INNER_LOOPFILTER mmxext, h, 6, 16, 0 -INNER_LOOPFILTER mmx, v, 6, 8, 8 -INNER_LOOPFILTER mmx, h, 6, 8, 8 -INNER_LOOPFILTER mmxext, v, 6, 8, 8 -INNER_LOOPFILTER mmxext, h, 6, 8, 8 +INNER_LOOPFILTER mmx, v, 6, 8, 0 +INNER_LOOPFILTER mmx, h, 6, 8, 0 +INNER_LOOPFILTER mmxext, v, 6, 8, 0 +INNER_LOOPFILTER mmxext, h, 6, 8, 0 INIT_XMM INNER_LOOPFILTER sse2, v, 5, 16, 13 @@ -1904,6 +1917,15 @@ INNER_LOOPFILTER sse2, v, 6, 8, 13 INNER_LOOPFILTER sse2, h, 6, 8, 13 +INNER_LOOPFILTER ssse3, v, 5, 16, 13 +%ifdef m8 +INNER_LOOPFILTER ssse3, h, 5, 16, 13 +%else +INNER_LOOPFILTER ssse3, h, 6, 16, 13 +%endif +INNER_LOOPFILTER ssse3, v, 6, 8, 13 +INNER_LOOPFILTER ssse3, h, 6, 8, 13 + ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter_mbedge_(uint8_t *dst, [uint8_t *v,] int stride, ; int flimE, int flimI, int hev_thr); @@ -1984,11 +2006,15 @@ %define stack_reg hev_thr_reg %endif +%ifidn %1, ssse3 + pxor m7, m7 +%endif + %ifndef m8 ; mmx/mmxext or sse2 on x86-32 ; splat function arguments - SPLATB_REG m0, E_reg, %1 ; E - SPLATB_REG m1, I_reg, %1 ; I - SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh + SPLATB_REG m0, E_reg, %1, m7 ; E + SPLATB_REG m1, I_reg, %1, m7 ; I + SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh ; align stack mov stack_reg, rsp ; backup stack pointer @@ -2028,9 +2054,9 @@ %define lim_sign m15 ; splat function arguments - SPLATB_REG flim_E, E_reg, %1 ; E - SPLATB_REG flim_I, I_reg, %1 ; I - SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh + SPLATB_REG flim_E, E_reg, %1, m7 ; E + SPLATB_REG flim_I, I_reg, %1, m7 ; I + SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh %endif %if mmsize == 8 && %4 == 16 ; mmx/mmxext @@ -2521,15 +2547,15 @@ %endmacro INIT_MMX -MBEDGE_LOOPFILTER mmx, v, 6, 16, 8 -MBEDGE_LOOPFILTER mmx, h, 6, 16, 8 -MBEDGE_LOOPFILTER mmxext, v, 6, 16, 8 -MBEDGE_LOOPFILTER mmxext, h, 6, 16, 8 +MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 +MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 +MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 +MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 -MBEDGE_LOOPFILTER mmx, v, 6, 8, 8 -MBEDGE_LOOPFILTER mmx, h, 6, 8, 8 -MBEDGE_LOOPFILTER mmxext, v, 6, 8, 8 -MBEDGE_LOOPFILTER mmxext, h, 6, 8, 8 +MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 +MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 +MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 +MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 INIT_XMM MBEDGE_LOOPFILTER sse2, v, 5, 16, 16 @@ -2540,3 +2566,12 @@ %endif MBEDGE_LOOPFILTER sse2, v, 6, 8, 16 MBEDGE_LOOPFILTER sse2, h, 6, 8, 16 + +MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16 +%ifdef m8 +MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16 +%else +MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16 +%endif +MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16 +MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16