# HG changeset patch
# User rbultje
# Date 1278185190 0
# Node ID d780ae746855cfbf02daafe747036f8cb084b202
# Parent 8454bb880008aa596d9a30b9c73737b2a296ce70
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).

diff -r 8454bb880008 -r d780ae746855 x86/dsputil_mmx.c
--- a/x86/dsputil_mmx.c	Sat Jul 03 18:36:10 2010 +0000
+++ b/x86/dsputil_mmx.c	Sat Jul 03 19:26:30 2010 +0000
@@ -63,12 +63,16 @@
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3  ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4  ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};

 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
diff -r 8454bb880008 -r d780ae746855 x86/vp8dsp-init.c
--- a/x86/vp8dsp-init.c	Sat Jul 03 18:36:10 2010 +0000
+++ b/x86/vp8dsp-init.c	Sat Jul 03 19:26:30 2010 +0000
@@ -222,6 +222,13 @@
 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
+
+extern void ff_vp8_v_loop_filter_simple_mmx   (uint8_t *dst, int stride, int flim);
+extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
+extern void ff_vp8_v_loop_filter_simple_sse2  (uint8_t *dst, int stride, int flim);
+extern void ff_vp8_h_loop_filter_simple_mmx   (uint8_t *dst, int stride, int flim);
+extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
+extern void ff_vp8_h_loop_filter_simple_sse2  (uint8_t *dst, int stride, int flim);
 #endif

 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
@@ -260,6 +267,9 @@
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
         c->put_vp8_epel_pixels_tab[1][0][0] =
         c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
+
+        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
+        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
     }

     /* note that 4-tap width=16 functions are missing because w=16
@@ -272,6 +282,9 @@
         VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
         VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
         VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
+
+        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
+        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
     }

     if (mm_flags & FF_MM_SSE) {
@@ -284,6 +297,9 @@
         VP8_MC_FUNC(1, 8, sse2);
         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
         VP8_BILINEAR_MC_FUNC(1, 8, sse2);
+
+        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
+        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
     }

     if (mm_flags & FF_MM_SSSE3) {
diff -r 8454bb880008 -r d780ae746855 x86/vp8dsp.asm
--- a/x86/vp8dsp.asm	Sat Jul 03 18:36:10 2010 +0000
+++ b/x86/vp8dsp.asm	Sat Jul 03 19:26:30 2010 +0000
@@ -146,8 +146,13 @@
 pw_17734: times 4 dw 17734

 cextern pw_3
+cextern pb_3
 cextern pw_4
+cextern pb_4
 cextern pw_64
+cextern pb_80
+cextern pb_F8
+cextern pb_FE

 SECTION .text
@@ -1063,3 +1068,304 @@
     add        r0, 2*16*4
     SCATTER_WHT 3
     RET
+
+;-----------------------------------------------------------------------------
+; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
+;-----------------------------------------------------------------------------
+
+; macro called with 7 mm register indexes as arguments, and 4 regular registers
+;
+; first 4 mm registers will carry the transposed pixel data
+; the other three are scratch space (one would be sufficient, but this allows
+; for more spreading/pipelining and thus faster execution on out-of-order CPUs)
+;
+; first two regular registers are buf+4*stride and buf+5*stride
+; third is -stride, fourth is +stride
+%macro READ_8x4_INTERLEAVED 11
+    ; interleave 8 (A-H) rows of 4 pixels each
+    movd       m%1, [%8+%10*4]   ; A0-3
+    movd       m%5, [%9+%10*4]   ; B0-3
+    movd       m%2, [%8+%10*2]   ; C0-3
+    movd       m%6, [%8+%10]     ; D0-3
+    movd       m%3, [%8]         ; E0-3
+    movd       m%7, [%9]         ; F0-3
+    movd       m%4, [%9+%11]     ; G0-3
+    punpcklbw  m%1, m%5          ; A/B interleaved
+    movd       m%5, [%9+%11*2]   ; H0-3
+    punpcklbw  m%2, m%6          ; C/D interleaved
+    punpcklbw  m%3, m%7          ; E/F interleaved
+    punpcklbw  m%4, m%5          ; G/H interleaved
+%endmacro
+
+; macro called with 7 mm register indexes as arguments, and 5 regular registers
+; first 11 mean the same as READ_8x4_INTERLEAVED above
+; fifth regular register is scratch space to reach the bottom 8 rows; it
+; will be set to second regular register + 8*stride at the end
+%macro READ_16x4_INTERLEAVED 12
+    ; interleave 16 (A-P) rows of 4 pixels each
+    lea        %12, [r0+8*r2]
+
+    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
+    movd       m%1, [%8+%10*4]   ; A0-3
+    movd       m%3, [%12+%10*4]  ; I0-3
+    movd       m%2, [%8+%10*2]   ; C0-3
+    movd       m%4, [%12+%10*2]  ; K0-3
+    movd       m%6, [%8+%10]     ; D0-3
+    movd       m%5, [%12+%10]    ; L0-3
+    movd       m%7, [%12]        ; M0-3
+    add        %12, %11
+    punpcklbw  m%1, m%3          ; A/I
+    movd       m%3, [%8]         ; E0-3
+    punpcklbw  m%2, m%4          ; C/K
+    punpcklbw  m%6, m%5          ; D/L
+    punpcklbw  m%3, m%7          ; E/M
+    punpcklbw  m%2, m%6          ; C/D/K/L interleaved
+
+    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
+    movd       m%5, [%9+%10*4]   ; B0-3
+    movd       m%4, [%12+%10*4]  ; J0-3
+    movd       m%7, [%9]         ; F0-3
+    movd       m%6, [%12]        ; N0-3
+    punpcklbw  m%5, m%4          ; B/J
+    punpcklbw  m%7, m%6          ; F/N
+    punpcklbw  m%1, m%5          ; A/B/I/J interleaved
+    punpcklbw  m%3, m%7          ; E/F/M/N interleaved
+    movd       m%4, [%9+%11]     ; G0-3
+    movd       m%6, [%12+%11]    ; O0-3
+    movd       m%5, [%9+%11*2]   ; H0-3
+    movd       m%7, [%12+%11*2]  ; P0-3
+    punpcklbw  m%4, m%6          ; G/O
+    punpcklbw  m%5, m%7          ; H/P
+    punpcklbw  m%4, m%5          ; G/H/O/P interleaved
+%endmacro
+
+; write 4 mm registers of 2 dwords each
+; first four arguments are mm register indexes containing source data
+; last four are registers containing buf+4*stride, buf+5*stride,
+; -stride and +stride
+%macro WRITE_4x2D 8
+    ; write out (2 dwords per register)
+    movd       [%5+%7*4], m%1
+    movd       [%5+%7*2], m%2
+    movd       [%5], m%3
+    movd       [%6+%8], m%4
+    punpckhdq  m%1, m%1
+    punpckhdq  m%2, m%2
+    punpckhdq  m%3, m%3
+    punpckhdq  m%4, m%4
+    movd       [%6+%7*4], m%1
+    movd       [%5+%7], m%2
+    movd       [%6], m%3
+    movd       [%6+%8*2], m%4
+%endmacro
+
+; write 4 xmm registers of 4 dwords each
+; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
+; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
+; we add 1*stride to the third regular register in the process
+%macro WRITE_4x4D 9
+    ; write out (4 dwords per register), start with dwords zero
+    movd       [%5+%8*4], m%1
+    movd       [%5], m%2
+    movd       [%5+%9*4], m%3
+    movd       [%5+%9*8], m%4
+
+    ; store dwords 1
+    psrldq     m%1, 4
+    psrldq     m%2, 4
+    psrldq     m%3, 4
+    psrldq     m%4, 4
+    movd       [%6+%8*4], m%1
+    movd       [%6], m%2
+    movd       [%6+%9*4], m%3
+    movd       [%6+%9*8], m%4
+
+    ; write dwords 2
+    psrldq     m%1, 4
+    psrldq     m%2, 4
+    psrldq     m%3, 4
+    psrldq     m%4, 4
+    movd       [%5+%8*2], m%1
+    movd       [%6+%9], m%2
+    movd       [%7+%8*2], m%3
+    movd       [%7+%9*2], m%4
+    add        %7, %9
+
+    ; store dwords 3
+    psrldq     m%1, 4
+    psrldq     m%2, 4
+    psrldq     m%3, 4
+    psrldq     m%4, 4
+    movd       [%5+%8], m%1
+    movd       [%6+%9*2], m%2
+    movd       [%7+%8*2], m%3
+    movd       [%7+%9*2], m%4
+%endmacro
+
+%macro SIMPLE_LOOPFILTER 3
+cglobal vp8_%2_loop_filter_simple_%1, 3, %3
+%ifidn %2, h
+    mov        r5, rsp           ; backup stack pointer
+    and        rsp, ~(mmsize-1)  ; align stack
+%endif
+%if mmsize == 8 ; mmx/mmxext
+    mov        r3, 2
+%endif
+
+    ; splat register with "flim"
+    movd       m7, r2
+    punpcklbw  m7, m7
+%if mmsize == 16 ; sse2
+    punpcklwd  m7, m7
+    pshufd     m7, m7, 0x0
+%elifidn %1, mmx
+    punpcklwd  m7, m7
+    punpckldq  m7, m7
+%else ; mmxext
+    pshufw     m7, m7, 0x0
+%endif
+
+    ; set up indexes to address 4 rows
+    mov        r2, r1
+    neg        r1
+%ifidn %2, h
+    lea        r0, [r0+4*r2-2]
+    sub        rsp, mmsize*2     ; (aligned) storage space for saving p1/q1
+%endif
+
+%if mmsize == 8 ; mmx / mmxext
+.next8px
+%endif
+%ifidn %2, v
+    ; read 4 half/full rows of pixels
+    mova       m0, [r0+r1*2]     ; p1
+    mova       m1, [r0+r1]       ; p0
+    mova       m2, [r0]          ; q0
+    mova       m3, [r0+r2]       ; q1
+%else ; h
+    lea        r4, [r0+r2]
+
+%if mmsize == 8 ; mmx/mmxext
+    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
+%else ; sse2
+    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
+%endif
+    TRANSPOSE4x4W         0, 1, 2, 3, 4
+
+    mova       [rsp], m0         ; store p1
+    mova       [rsp+mmsize], m3  ; store q1
+%endif
+
+    ; simple_limit
+    mova       m5, m2            ; m5=backup of q0
+    mova       m6, m1            ; m6=backup of p0
+    psubusb    m1, m2            ; p0-q0
+    psubusb    m2, m6            ; q0-p0
+    por        m1, m2            ; FFABS(p0-q0)
+    paddusb    m1, m1            ; m1=FFABS(p0-q0)*2
+
+    mova       m4, m3
+    mova       m2, m0
+    psubusb    m3, m0            ; q1-p1
+    psubusb    m0, m4            ; p1-q1
+    por        m3, m0            ; FFABS(p1-q1)
+    mova       m0, [pb_80]
+    pxor       m2, m0
+    pxor       m4, m0
+    psubsb     m2, m4            ; m2=p1-q1 (signed) backup for below
+    pand       m3, [pb_FE]
+    psrlq      m3, 1             ; m3=FFABS(p1-q1)/2, this can be used signed
+    paddusb    m3, m1
+    psubusb    m3, m7
+    pxor       m1, m1
+    pcmpeqb    m3, m1            ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
+
+    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
+    mova       m4, m5
+    pxor       m5, m0
+    pxor       m0, m6
+    psubsb     m5, m0            ; q0-p0 (signed)
+    paddsb     m2, m5
+    paddsb     m2, m5
+    paddsb     m2, m5            ; a=(p1-q1) + 3*(q0-p0)
+    pand       m2, m3            ; apply filter mask (m3)
+
+    mova       m3, [pb_F8]
+    mova       m1, m2
+    paddsb     m2, [pb_4]        ; f1<<3=a+4
+    paddsb     m1, [pb_3]        ; f2<<3=a+3
+    pand       m2, m3
+    pand       m1, m3            ; cache f2<<3
+
+    pxor       m0, m0
+    pxor       m3, m3
+    pcmpgtb    m0, m2            ; which values are <0?
+    psubb      m3, m2            ; -f1<<3
+    psrlq      m2, 3             ; +f1
+    psrlq      m3, 3             ; -f1
+    pand       m3, m0
+    pandn      m0, m2
+    psubusb    m4, m0
+    paddusb    m4, m3            ; q0-f1
+
+    pxor       m0, m0
+    pxor       m3, m3
+    pcmpgtb    m0, m1            ; which values are <0?
+    psubb      m3, m1            ; -f2<<3
+    psrlq      m1, 3             ; +f2
+    psrlq      m3, 3             ; -f2
+    pand       m3, m0
+    pandn      m0, m1
+    paddusb    m6, m0
+    psubusb    m6, m3            ; p0+f2
+
+    ; store
+%ifidn %2, v
+    mova       [r0], m4
+    mova       [r0+r1], m6
+%else ; h
+    mova       m0, [rsp]         ; p1
+    SWAP        2, 4             ; p0
+    SWAP        1, 6             ; q0
+    mova       m3, [rsp+mmsize]  ; q1
+
+    TRANSPOSE4x4B 0, 1, 2, 3, 4
+%if mmsize == 16 ; sse2
+    add        r3, r1            ; change from r4+8*stride to r0+8*stride
+    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2
+%else ; mmx/mmxext
+    WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
+%endif
+%endif
+
+%if mmsize == 8 ; mmx/mmxext
+    ; next 8 pixels
+%ifidn %2, v
+    add        r0, 8             ; advance 8 cols = pixels
+%else ; h
+    lea        r0, [r0+r2*8]     ; advance 8 rows = lines
+%endif
+    dec        r3
+    jg .next8px
+%ifidn %2, v
+    REP_RET
+%else ; h
+    mov        rsp, r5           ; restore stack pointer
+    RET
+%endif
+%else ; sse2
+%ifidn %2, h
+    mov        rsp, r5           ; restore stack pointer
+%endif
+    RET
+%endif
+%endmacro
+
+INIT_MMX
+SIMPLE_LOOPFILTER mmx,    v, 4
+SIMPLE_LOOPFILTER mmx,    h, 6
+SIMPLE_LOOPFILTER mmxext, v, 4
+SIMPLE_LOOPFILTER mmxext, h, 6
+INIT_XMM
+SIMPLE_LOOPFILTER sse2,   v, 3
+SIMPLE_LOOPFILTER sse2,   h, 6
diff -r 8454bb880008 -r d780ae746855 x86/x86util.asm
--- a/x86/x86util.asm	Sat Jul 03 18:36:10 2010 +0000
+++ b/x86/x86util.asm	Sat Jul 03 19:26:30 2010 +0000
@@ -37,6 +37,14 @@
     SWAP %2, %4, %3
 %endmacro

+%macro TRANSPOSE4x4B 5
+    SBUTTERFLY bw, %1, %2, %5
+    SBUTTERFLY bw, %3, %4, %5
+    SBUTTERFLY wd, %1, %3, %5
+    SBUTTERFLY wd, %2, %4, %5
+    SWAP %2, %3
+%endmacro
+
 %macro TRANSPOSE4x4W 5
     SBUTTERFLY wd, %1, %2, %5
     SBUTTERFLY wd, %3, %4, %5
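For readers following the SIMPLE_LOOPFILTER math above: in scalar C terms the per-edge-position operation is roughly the sketch below. The helper names (vp8_simple_filter_ref, clamp_int8, clamp_uint8) are illustrative only and are not part of this changeset; libavcodec's bit-exact C reference lives in vp8dsp.c. The vertical filter applies this with step == stride at each of the 16 columns along the macroblock edge, the horizontal filter with step == 1 at each of the 16 rows; the SIMD code computes the same quantities with byte-wise saturating arithmetic, using pb_80 to move between the unsigned and signed ranges.

#include <stdint.h>
#include <stdlib.h>

/* saturate to the signed 8-bit range, mirroring paddsb/psubsb saturation */
static int clamp_int8(int v)
{
    return v < -128 ? -128 : v > 127 ? 127 : v;
}

static uint8_t clamp_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* filter the four pixels p1 p0 | q0 q1 around one edge position;
 * ptr points at q0, step is the distance between adjacent pixels
 * (stride for the vertical filter, 1 for the horizontal filter) */
static void vp8_simple_filter_ref(uint8_t *ptr, int step, int flim)
{
    int p1 = ptr[-2 * step];
    int p0 = ptr[-1 * step];
    int q0 = ptr[ 0 * step];
    int q1 = ptr[ 1 * step];
    int a, f1, f2;

    /* simple_limit: only filter if 2*|p0-q0| + |p1-q1|/2 <= flim
     * (the pcmpeqb mask in the assembly) */
    if (2 * abs(p0 - q0) + (abs(p1 - q1) >> 1) > flim)
        return;

    /* filter_common: a = clamp(p1-q1) + 3*(q0-p0),
     * f1 = clamp(a+4) >> 3, f2 = clamp(a+3) >> 3
     * (relies on arithmetic right shift of negative values) */
    a  = clamp_int8(clamp_int8(p1 - q1) + 3 * (q0 - p0));
    f1 = clamp_int8(a + 4) >> 3;
    f2 = clamp_int8(a + 3) >> 3;

    ptr[-1 * step] = clamp_uint8(p0 + f2);   /* p0 + f2 */
    ptr[ 0 * step] = clamp_uint8(q0 - f1);   /* q0 - f1 */
}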