# HG changeset patch
# User rbultje
# Date 1279828774 0
# Node ID d07e6037846d36e6360ab17dbaacac6d85d7cd59
# Parent 36940062c081d4c70e2b5ee4ed3522dd0cb79dcf
Use pextrw for SSE4 mbedge filter result writing, speedup of 5-10 cycles on
CPUs supporting it.

diff -r 36940062c081 -r d07e6037846d x86/vp8dsp-init.c
--- a/x86/vp8dsp-init.c	Thu Jul 22 12:35:32 2010 +0000
+++ b/x86/vp8dsp-init.c	Thu Jul 22 19:59:34 2010 +0000
@@ -247,6 +247,7 @@
 DECLARE_LOOP_FILTER(mmxext)
 DECLARE_LOOP_FILTER(sse2)
 DECLARE_LOOP_FILTER(ssse3)
+DECLARE_LOOP_FILTER(sse4)
 
 #endif
 
@@ -379,6 +380,9 @@
 
     if (mm_flags & FF_MM_SSE4) {
         c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
+
+        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
+        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
     }
 #endif
 }
diff -r 36940062c081 -r d07e6037846d x86/vp8dsp.asm
--- a/x86/vp8dsp.asm	Thu Jul 22 12:35:32 2010 +0000
+++ b/x86/vp8dsp.asm	Thu Jul 22 19:59:34 2010 +0000
@@ -1932,10 +1932,24 @@
 
 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
+; for pre-SSE4:
 ; 3 is a general-purpose register that we will clobber
+; for SSE4:
+; 3 is a pointer to the destination's 5th line (same column as 4)
 ; 4 is a pointer to the destination's 4th line
-; 5 is -stride and +stride
-%macro WRITE_8W 6
+; 5/6 is -stride and +stride
+; 7 is the optimization string; "sse4" selects the pextrw stores
+%macro WRITE_8W 7
+%ifidn %7, sse4
+    pextrw    [%4+%5*4], %1, 0
+    pextrw    [%3+%5*4], %1, 1
+    pextrw    [%4+%5*2], %1, 2
+    pextrw    [%4+%5  ], %1, 3
+    pextrw    [%4     ], %1, 4
+    pextrw    [%3     ], %1, 5
+    pextrw    [%3+%6  ], %1, 6
+    pextrw    [%3+%6*2], %1, 7
+%else
     movd             %3, %1
 %if mmsize == 8
     punpckhdq        %1, %1
@@ -1974,6 +1988,7 @@
 %if mmsize == 8
     add              %4, %5
 %endif
+%endif
 %endmacro
 
 %macro MBEDGE_LOOPFILTER 5
@@ -2509,14 +2524,20 @@
 %if mmsize == 8 ; mmx/mmxext (h)
     WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
     add         dst_reg, 4
-    WRITE_8W        m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
+    WRITE_8W        m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %1
 %else ; sse2 (h)
     lea        dst8_reg, [dst8_reg+mstride_reg+1]
     WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
     lea         dst_reg, [dst2_reg+mstride_reg+4]
     lea        dst8_reg, [dst8_reg+mstride_reg+4]
-    WRITE_8W        m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg
-    WRITE_8W        m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
+%ifidn %1, sse4
+    add        dst2_reg, 4                     ; dst2 = dst+stride: 5th line, same column as dst
+%endif
+    WRITE_8W        m5, m5, dst2_reg, dst_reg,  mstride_reg, stride_reg, %1
+%ifidn %1, sse4
+    lea        dst2_reg, [dst8_reg+ stride_reg] ; 5th line of the second group of 8 rows
+%endif
+    WRITE_8W        m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %1
 %endif
 %endif
 
@@ -2574,3 +2595,10 @@
 %endif
 MBEDGE_LOOPFILTER ssse3, v, 6, 8,  16
 MBEDGE_LOOPFILTER ssse3, h, 6, 8,  16
+
+%ifdef m8
+MBEDGE_LOOPFILTER sse4,  h, 5, 16, 16
+%else
+MBEDGE_LOOPFILTER sse4,  h, 6, 16, 16
+%endif
+MBEDGE_LOOPFILTER sse4,  h, 6, 8,  16