# HG changeset patch
# User rbultje
# Date 1279663458 0
# Node ID 563339ea87aa35d7804aa0d408222743df61582e
# Parent a2c993c7ae902a50caa6f53c200c1e620a19f53f
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.

diff -r a2c993c7ae90 -r 563339ea87aa x86/vp8dsp-init.c
--- a/x86/vp8dsp-init.c  Tue Jul 20 21:54:46 2010 +0000
+++ b/x86/vp8dsp-init.c  Tue Jul 20 22:04:18 2010 +0000
@@ -242,6 +242,19 @@
                                                  int e, int i, int hvt);
 extern void ff_vp8_h_loop_filter16y_inner_sse2  (uint8_t *dst, int stride,
                                                  int e, int i, int hvt);
+
+extern void ff_vp8_v_loop_filter8uv_inner_mmx   (uint8_t *dstU, uint8_t *dstV,
+                                                 int s, int e, int i, int hvt);
+extern void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
+                                                 int s, int e, int i, int hvt);
+extern void ff_vp8_v_loop_filter8uv_inner_sse2  (uint8_t *dstU, uint8_t *dstV,
+                                                 int s, int e, int i, int hvt);
+extern void ff_vp8_h_loop_filter8uv_inner_mmx   (uint8_t *dstU, uint8_t *dstV,
+                                                 int s, int e, int i, int hvt);
+extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
+                                                 int s, int e, int i, int hvt);
+extern void ff_vp8_h_loop_filter8uv_inner_sse2  (uint8_t *dstU, uint8_t *dstV,
+                                                 int s, int e, int i, int hvt);
 #endif
 
 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
@@ -286,6 +299,8 @@
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
+        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
+        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
     }
 
     /* note that 4-tap width=16 functions are missing because w=16
@@ -304,6 +319,8 @@
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
+        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
+        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
     }
 
     if (mm_flags & FF_MM_SSE) {
@@ -321,10 +338,12 @@
         c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse2;
 
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
+        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
     }
 
     if (mm_flags & FF_MM_SSE2) {
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
+        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
     }
 
     if (mm_flags & FF_MM_SSSE3) {
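The vp8dsp-init.c half of the patch only extends the existing CPU-flag dispatch: each flag block overwrites the pointers set by the previous, weaker instruction set, so the widest available implementation wins. Below is a minimal, self-contained C sketch of that pattern; the flag constants and function names are hypothetical stand-ins, not FFmpeg's actual API.

    /* Sketch of cumulative CPU-flag dispatch; all names here are
     * hypothetical illustrations, not FFmpeg identifiers. */
    #include <stdint.h>
    #include <stdio.h>

    typedef void (*uv_inner_filter)(uint8_t *dstU, uint8_t *dstV, int stride,
                                    int flimE, int flimI, int hev_thr);

    static void filter8uv_c(uint8_t *u, uint8_t *v, int s, int e, int i, int h)
    { (void)u; (void)v; (void)s; (void)e; (void)i; (void)h; puts("C"); }
    static void filter8uv_mmx(uint8_t *u, uint8_t *v, int s, int e, int i, int h)
    { (void)u; (void)v; (void)s; (void)e; (void)i; (void)h; puts("MMX"); }
    static void filter8uv_sse2(uint8_t *u, uint8_t *v, int s, int e, int i, int h)
    { (void)u; (void)v; (void)s; (void)e; (void)i; (void)h; puts("SSE2"); }

    enum { FLAG_MMX = 1, FLAG_SSE2 = 4 };   /* hypothetical flag bits */

    static uv_inner_filter pick_uv_inner(int mm_flags)
    {
        uv_inner_filter f = filter8uv_c;    /* portable baseline */
        if (mm_flags & FLAG_MMX)            /* each later block overrides */
            f = filter8uv_mmx;              /* the weaker one, so the     */
        if (mm_flags & FLAG_SSE2)           /* widest ISA wins            */
            f = filter8uv_sse2;
        return f;
    }

    int main(void)
    {
        uint8_t u[8 * 8] = { 0 }, v[8 * 8] = { 0 };
        pick_uv_inner(FLAG_MMX | FLAG_SSE2)(u, v, 8, 20, 10, 4);
        return 0;
    }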
diff -r a2c993c7ae90 -r 563339ea87aa x86/vp8dsp.asm
--- a/x86/vp8dsp.asm  Tue Jul 20 21:54:46 2010 +0000
+++ b/x86/vp8dsp.asm  Tue Jul 20 22:04:18 2010 +0000
@@ -1164,12 +1164,16 @@
 ; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
 ; we add 1*stride to the third regular registry in the process
-%macro WRITE_4x4D 9
+; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
+; same memory region), or 8 if they cover two separate buffers (the third one
+; points to a different memory region than the first two), allowing for more
+; optimal code for the 16-width case
+%macro WRITE_4x4D 10
 ; write out (4 dwords per register), start with dwords zero
     movd    [%5+%8*4], m%1
     movd    [%5],      m%2
-    movd    [%5+%9*4], m%3
-    movd    [%5+%9*8], m%4
+    movd    [%7+%8*4], m%3
+    movd    [%7],      m%4
 
 ; store dwords 1
     psrldq  m%1, 4
@@ -1178,15 +1182,23 @@
     psrldq  m%4, 4
     movd    [%6+%8*4], m%1
     movd    [%6],      m%2
+%if %10 == 16
     movd    [%6+%9*4], m%3
-    movd    [%6+%9*8], m%4
+%endif
+    movd    [%7+%9],   m%4
 
 ; write dwords 2
     psrldq  m%1, 4
     psrldq  m%2, 4
+%if %10 == 8
+    movd    [%5+%8*2], m%1
+    movd    %5d,       m%3
+%endif
     psrldq  m%3, 4
     psrldq  m%4, 4
+%if %10 == 16
     movd    [%5+%8*2], m%1
+%endif
     movd    [%6+%9],   m%2
     movd    [%7+%8*2], m%3
     movd    [%7+%9*2], m%4
@@ -1197,7 +1209,12 @@
     psrldq  m%2, 4
     psrldq  m%3, 4
     psrldq  m%4, 4
+%if %10 == 8
+    mov     [%7+%8*4], %5d
+    movd    [%6+%8*2], m%1
+%else
     movd    [%5+%8],   m%1
+%endif
     movd    [%6+%9*2], m%2
     movd    [%7+%8*2], m%3
     movd    [%7+%9*2], m%4
@@ -1335,7 +1352,7 @@
     TRANSPOSE4x4B    0, 1, 2, 3, 4
 %if mmsize == 16 ; sse2
     add     r3, r1 ; change from r4*8*stride to r0+8*stride
-    WRITE_4x4D       0, 1, 2, 3, r0, r4, r3, r1, r2
+    WRITE_4x4D       0, 1, 2, 3, r0, r4, r3, r1, r2, 16
 %else ; mmx/mmxext
     WRITE_4x2D       0, 1, 2, 3, r0, r4, r1, r2
 %endif
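After the in-register transpose, each 32-bit lane of a register holds four horizontally adjacent pixels belonging to a different output row, so WRITE_4x4D scatters the lanes to stride-separated addresses with movd + psrldq. A rough C/SSE2 equivalent of that per-register scatter, for the single-buffer (width=16) case, is sketched below; write_4_dwords is a hypothetical name for illustration only.

    /* Rough C/SSE2 equivalent of WRITE_4x4D's per-register scatter
     * (single destination buffer, i.e. the width=16 case). */
    #include <emmintrin.h>
    #include <stdint.h>
    #include <string.h>

    static void write_4_dwords(uint8_t *dst, int stride, __m128i m)
    {
        for (int row = 0; row < 4; row++) {
            uint32_t lane = (uint32_t)_mm_cvtsi128_si32(m); /* movd      */
            memcpy(dst + row * stride, &lane, 4);           /* 4px store */
            m = _mm_srli_si128(m, 4);                       /* psrldq 4  */
        }
    }

The chroma (width=8) path in the macro above interleaves two such destination regions, which is why it needs the extra scratch GPR (%5d) when the pointer register gets clobbered mid-sequence.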
@@ -1374,13 +1391,20 @@
 SIMPLE_LOOPFILTER sse2,   h, 6
 
 ;-----------------------------------------------------------------------------
-; void vp8_h/v_loop_filter_inner_<opt>(uint8_t *dst, int stride,
+; void vp8_h/v_loop_filter_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                      int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------
 
-%macro INNER_LOOPFILTER 4
-cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %4
-%define dst_reg     r0
+%macro INNER_LOOPFILTER 5
+%if %4 == 8 ; chroma
+cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
+%define dst8_reg    r1
+%define mstride_reg r2
+%define E_reg       r3
+%define I_reg       r4
+%define hev_thr_reg r5
+%else ; luma
+cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %define mstride_reg r1
 %define E_reg       r2
 %define I_reg       r3
@@ -1392,6 +1416,8 @@
 %else ; x86-32, mmx/mmxext
 %define cnt_reg     r5
 %endif
+%endif
+%define dst_reg     r0
 %define stride_reg  E_reg
 %define dst2_reg    I_reg
 %ifndef m8
@@ -1418,6 +1444,8 @@
 %define flim_I   [rsp+mmsize]
 %define hev_thr  [rsp+mmsize*2]
 %define mask_res [rsp+mmsize*3]
+%define p0backup [rsp+mmsize*3]
+%define q0backup [rsp+mmsize*4]
 
     mova    flim_E, m0
     mova    flim_I, m1
@@ -1429,6 +1457,8 @@
 %define flim_I   m10
 %define hev_thr  m11
 %define mask_res m12
+%define p0backup m12
+%define q0backup m8
 
 ; splat function arguments
     SPLATB_REG flim_E,  E_reg, %1       ; E
@@ -1436,13 +1466,16 @@
     SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh
 %endif
 
-%if mmsize == 8 ; mmx/mmxext
+%if mmsize == 8 && %4 == 16 ; mmx/mmxext
     mov     cnt_reg, 2
 %endif
     mov     stride_reg, mstride_reg
     neg     mstride_reg
 %ifidn %2, h
     lea     dst_reg,  [dst_reg + stride_reg*4-4]
+%if %4 == 8
+    lea     dst8_reg, [dst8_reg+ stride_reg*4-4]
+%endif
 %endif
 
 %if mmsize == 8
@@ -1451,12 +1484,27 @@
 ; read
     lea     dst2_reg, [dst_reg + stride_reg]
 %ifidn %2, v
-    mova    m0, [dst_reg +mstride_reg*4] ; p3
-    mova    m1, [dst2_reg+mstride_reg*4] ; p2
-    mova    m2, [dst_reg +mstride_reg*2] ; p1
-    mova    m5, [dst2_reg]               ; q1
-    mova    m6, [dst2_reg+ stride_reg]   ; q2
-    mova    m7, [dst2_reg+ stride_reg*2] ; q3
+%if %4 == 8 && mmsize == 16
+%define movrow movh
+%else
+%define movrow mova
+%endif
+    movrow  m0, [dst_reg +mstride_reg*4] ; p3
+    movrow  m1, [dst2_reg+mstride_reg*4] ; p2
+    movrow  m2, [dst_reg +mstride_reg*2] ; p1
+    movrow  m5, [dst2_reg]               ; q1
+    movrow  m6, [dst2_reg+ stride_reg]   ; q2
+    movrow  m7, [dst2_reg+ stride_reg*2] ; q3
+%if mmsize == 16 && %4 == 8
+    movhps  m0, [dst8_reg+mstride_reg*4]
+    movhps  m2, [dst8_reg+mstride_reg*2]
+    add     dst8_reg, stride_reg
+    movhps  m1, [dst8_reg+mstride_reg*4]
+    movhps  m5, [dst8_reg]
+    movhps  m6, [dst8_reg+ stride_reg]
+    movhps  m7, [dst8_reg+ stride_reg*2]
+    add     dst8_reg, mstride_reg
+%endif
 %elif mmsize == 8 ; mmx/mmxext (h)
 ; read 8 rows of 8px each
     movu    m0, [dst_reg +mstride_reg*4]
@@ -1469,35 +1517,24 @@
 
 ; 8x8 transpose
     TRANSPOSE4x4B    0, 1, 2, 3, 7
-%ifdef m13
-    SWAP             1, 8
-%else
-    mova    [rsp+mmsize*4], m1
-%endif
+    mova    q0backup, m1
     movu    m7, [dst2_reg+ stride_reg*2]
     TRANSPOSE4x4B    4, 5, 6, 7, 1
     SBUTTERFLY       dq, 0, 4, 1 ; p3/p2
     SBUTTERFLY       dq, 2, 6, 1 ; q0/q1
     SBUTTERFLY       dq, 3, 7, 1 ; q2/q3
-%ifdef m13
-    SWAP             1, 8
-    SWAP             2, 8
-%else
-    mova    m1, [rsp+mmsize*4]
-    mova    [rsp+mmsize*4], m2 ; store q0
-%endif
+    mova    m1, q0backup
+    mova    q0backup, m2 ; store q0
     SBUTTERFLY       dq, 1, 5, 2 ; p1/p0
-%ifdef m14
-    SWAP             5, 12
-%else
-    mova    [rsp+mmsize*3], m5 ; store p0
-%endif
+    mova    p0backup, m5 ; store p0
     SWAP             1, 4
     SWAP             2, 4
     SWAP             6, 3
     SWAP             5, 3
 %else ; sse2 (h)
+%if %4 == 16
     lea     dst8_reg, [dst_reg + stride_reg*8]
+%endif
 
 ; read 16 rows of 8px each, interleave
     movh    m0, [dst_reg +mstride_reg*4]
@@ -1526,10 +1563,10 @@
 
 ; 8x16 transpose
     TRANSPOSE4x4B    0, 1, 2, 3, 7
-%ifdef m13
+%ifdef m8
     SWAP             1, 8
 %else
-    mova    [rsp+mmsize*4], m1
+    mova    q0backup, m1
 %endif
     movh    m7, [dst2_reg+ stride_reg*2]
     movh    m1, [dst8_reg+ stride_reg*2]
@@ -1538,18 +1575,18 @@
     SBUTTERFLY       dq, 0, 4, 1 ; p3/p2
     SBUTTERFLY       dq, 2, 6, 1 ; q0/q1
     SBUTTERFLY       dq, 3, 7, 1 ; q2/q3
-%ifdef m13
+%ifdef m8
     SWAP             1, 8
     SWAP             2, 8
 %else
-    mova    m1, [rsp+mmsize*4]
-    mova    [rsp+mmsize*4], m2 ; store q0
+    mova    m1, q0backup
+    mova    q0backup, m2 ; store q0
 %endif
     SBUTTERFLY       dq, 1, 5, 2 ; p1/p0
-%ifdef m14
+%ifdef m12
     SWAP             5, 12
 %else
-    mova    [rsp+mmsize*3], m5 ; store p0
+    mova    p0backup, m5 ; store p0
 %endif
     SWAP             1, 4
     SWAP             2, 4
@@ -1583,11 +1620,7 @@
     por     m6, m4 ; abs(q2-q1)
 
 %ifidn %1, mmx
-%ifdef m10
-    SWAP             4, 10
-%else
-    mova    m4, [rsp+mmsize]
-%endif
+    mova    m4, flim_I
     pxor    m3, m3
     psubusb m0, m4
     psubusb m1, m4
@@ -1609,11 +1642,14 @@
 ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP             7, 3 ; now m7 is zero
 %ifidn %2, v
-    mova    m3, [dst_reg +mstride_reg] ; p0
-%elifdef m14
+    movrow  m3, [dst_reg +mstride_reg] ; p0
+%if mmsize == 16 && %4 == 8
+    movhps  m3, [dst8_reg+mstride_reg]
+%endif
+%elifdef m12
     SWAP             3, 12
 %else
-    mova    m3, [rsp+mmsize*3]
+    mova    m3, p0backup
 %endif
 
     mova    m1, m2
@@ -1630,11 +1666,7 @@
     pcmpeqb m1, m7 ; abs(p1-p0) <= I
     pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
     pand    m0, m1
-%ifdef m12
-    SWAP             6, 12
-%else
-    mova    [rsp+mmsize*3], m6
-%endif
+    mova    mask_res, m6
 %else ; mmxext/sse2
     pmaxub  m0, m1 ; max_I
     SWAP             1, 4 ; max_hev_thresh
@@ -1642,11 +1674,14 @@
     SWAP             6, 4 ; now m6 is I
 %ifidn %2, v
-    mova    m4, [dst_reg] ; q0
-%elifdef m13
+    movrow  m4, [dst_reg] ; q0
+%if mmsize == 16 && %4 == 8
+    movhps  m4, [dst8_reg]
+%endif
+%elifdef m8
     SWAP             4, 8
 %else
-    mova    m4, [rsp+mmsize*4]
+    mova    m4, q0backup
 %endif
     mova    m1, m4
     SWAP             1, 4
@@ -1662,11 +1697,7 @@
     pxor    m6, m6
     pcmpeqb m1, m6 ; abs(q1-q0) <= I
     pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
-%ifdef m12
-    SWAP             6, 12
-%else
-    mova    m6, [rsp+mmsize*3]
-%endif
+    mova    m6, mask_res
     pand    m0, m1 ; abs([pq][321]-[pq][210]) <= I
     pand    m6, m7
 %else ; mmxext/sse2
@@ -1681,7 +1712,7 @@
 %ifdef m12
     SWAP             6, 12
 %else
-    mova    [rsp+mmsize*3], m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
+    mova    mask_res, m6       ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
 %endif
 
 ; simple_limit
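In the SSE2 chroma path above, movrow expands to movh, so each row load fills only the low 8 bytes of an XMM register with the U row, and the following movhps fills the high 8 bytes with the matching V row; the filter then processes both planes in a single 16-byte pass. A small C intrinsics sketch of that packing follows; the helper names are hypothetical.

    /* Sketch of the movh/movhps row packing used by the SSE2 chroma
     * (width=8) path: U in the low half, V in the high half of one
     * XMM register.  Helper names are illustrative only. */
    #include <emmintrin.h>
    #include <stdint.h>

    static __m128i load_uv_row(const uint8_t *u, const uint8_t *v)
    {
        __m128i row = _mm_loadl_epi64((const __m128i *)u);  /* movh   */
        row = _mm_castpd_si128(
                  _mm_loadh_pd(_mm_castsi128_pd(row),
                               (const double *)v));         /* movhps */
        return row;
    }

    static void store_uv_row(uint8_t *u, uint8_t *v, __m128i row)
    {
        _mm_storel_epi64((__m128i *)u, row);                /* movh   */
        _mm_storeh_pd((double *)v, _mm_castsi128_pd(row));  /* movhps */
    }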
@@ -1765,7 +1796,7 @@
 %ifdef m12
     SWAP             6, 12
 %else
-    mova    m6, [rsp+mmsize*3]
+    mova    m6, mask_res
 %endif
 %ifidn %1, mmx
     mova    m7, [pb_1]
@@ -1793,13 +1824,19 @@
 ; store
 %ifidn %2, v
-    mova    [dst_reg+mstride_reg*2], m2
-    mova    [dst_reg+mstride_reg ],  m3
-    mova    [dst_reg],               m4
-    mova    [dst_reg+ stride_reg ],  m5
+    movrow  [dst_reg +mstride_reg*2], m2
+    movrow  [dst_reg +mstride_reg ],  m3
+    movrow  [dst_reg],                m4
+    movrow  [dst_reg + stride_reg ],  m5
+%if mmsize == 16 && %4 == 8
+    movhps  [dst8_reg+mstride_reg*2], m2
+    movhps  [dst8_reg+mstride_reg ],  m3
+    movhps  [dst8_reg],               m4
+    movhps  [dst8_reg+ stride_reg ],  m5
+%endif
 %else ; h
-    add dst_reg,  2
-    add dst2_reg, 2
+    add     dst_reg,  2
+    add     dst2_reg, 2
 
 ; 4x8/16 transpose
     TRANSPOSE4x4B    2, 3, 4, 5, 6
@@ -1808,11 +1845,19 @@
     WRITE_4x2D       2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
     lea     dst8_reg, [dst8_reg+mstride_reg+2]
-    WRITE_4x4D       2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg
+    WRITE_4x4D       2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
 %endif
 %endif
 
 %if mmsize == 8
+%if %4 == 8 ; chroma
+%ifidn %2, h
+    sub     dst_reg, 2
+%endif
+    cmp     dst_reg, dst8_reg
+    mov     dst_reg, dst8_reg
+    jnz     .next8px
+%else
 %ifidn %2, h
     lea     dst_reg, [dst_reg + stride_reg*8-2]
 %else ; v
@@ -1821,6 +1866,7 @@
     dec     cnt_reg
     jg      .next8px
 %endif
+%endif
 
 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
     mov     rsp, stack_reg ; restore stack pointer
@@ -1829,14 +1875,22 @@
 %endmacro
 
 INIT_MMX
-INNER_LOOPFILTER mmx,    v, 6, 8
-INNER_LOOPFILTER mmx,    h, 6, 8
-INNER_LOOPFILTER mmxext, v, 6, 8
-INNER_LOOPFILTER mmxext, h, 6, 8
+INNER_LOOPFILTER mmx,    v, 6, 16, 8
+INNER_LOOPFILTER mmx,    h, 6, 16, 8
+INNER_LOOPFILTER mmxext, v, 6, 16, 8
+INNER_LOOPFILTER mmxext, h, 6, 16, 8
+
+INNER_LOOPFILTER mmx,    v, 6, 8, 8
+INNER_LOOPFILTER mmx,    h, 6, 8, 8
+INNER_LOOPFILTER mmxext, v, 6, 8, 8
+INNER_LOOPFILTER mmxext, h, 6, 8, 8
+
 INIT_XMM
-INNER_LOOPFILTER sse2,   v, 5, 13
+INNER_LOOPFILTER sse2,   v, 5, 16, 13
 %ifdef m8
-INNER_LOOPFILTER sse2,   h, 5, 13
+INNER_LOOPFILTER sse2,   h, 5, 16, 13
 %else
-INNER_LOOPFILTER sse2,   h, 6, 13
+INNER_LOOPFILTER sse2,   h, 6, 16, 13
 %endif
+INNER_LOOPFILTER sse2,   v, 6, 8, 13
+INNER_LOOPFILTER sse2,   h, 6, 8, 13
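Because MMX registers hold only 8 bytes, the chroma version cannot pack U and V into one register the way the SSE2 path does; instead, the cmp/mov/jnz sequence added before .next8px runs the entire filter once over the U buffer and then once over the V buffer. A C control-flow sketch of that loop, under hypothetical names:

    /* Control-flow sketch of the MMX chroma path's .next8px loop: one
     * full 8-pixel filter pass over U, then the same pass over V.
     * filter_8px_pass() is a hypothetical stand-in for the filter body. */
    #include <stdint.h>

    static void filter_8px_pass(uint8_t *dst, int stride)
    {
        (void)dst; (void)stride; /* filter body elided */
    }

    static void filter8uv_inner_mmx_style(uint8_t *dstU, uint8_t *dstV,
                                          int stride)
    {
        uint8_t *dst = dstU;
        for (;;) {
            filter_8px_pass(dst, stride);
            if (dst == dstV)    /* cmp dst_reg, dst8_reg: V pass done    */
                break;
            dst = dstV;         /* mov dst_reg, dst8_reg ; jnz .next8px  */
        }
    }

The luma MMX path keeps its explicit cnt_reg = 2 counter instead, since both 8-pixel halves live in the same buffer.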