# HG changeset patch # User rbultje # Date 1279576408 0 # Node ID 552c7c10bc731c0319296cedc6d16039f5140a1b # Parent e7847fcff0f4cfbb38edb37ec8350adb1aab867c Implement chroma (width=8) inner loopfilter MMX/MMX2/SSE2 functions. diff -r e7847fcff0f4 -r 552c7c10bc73 x86/vp8dsp-init.c --- a/x86/vp8dsp-init.c Mon Jul 19 21:45:36 2010 +0000 +++ b/x86/vp8dsp-init.c Mon Jul 19 21:53:28 2010 +0000 @@ -242,6 +242,19 @@ int e, int i, int hvt); extern void ff_vp8_h_loop_filter16y_inner_sse2 (uint8_t *dst, int stride, int e, int i, int hvt); + +extern void ff_vp8_v_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV, + int s, int e, int i, int hvt); +extern void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV, + int s, int e, int i, int hvt); +extern void ff_vp8_v_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV, + int s, int e, int i, int hvt); +extern void ff_vp8_h_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV, + int s, int e, int i, int hvt); +extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV, + int s, int e, int i, int hvt); +extern void ff_vp8_h_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV, + int s, int e, int i, int hvt); #endif #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ @@ -286,6 +299,8 @@ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx; c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx; + c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx; + c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx; } /* note that 4-tap width=16 functions are missing because w=16 @@ -304,6 +319,8 @@ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext; c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext; + c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext; + c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext; } if (mm_flags & FF_MM_SSE) { @@ -322,6 +339,8 @@ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; + c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; + c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; } if (mm_flags & FF_MM_SSSE3) { diff -r e7847fcff0f4 -r 552c7c10bc73 x86/vp8dsp.asm --- a/x86/vp8dsp.asm Mon Jul 19 21:45:36 2010 +0000 +++ b/x86/vp8dsp.asm Mon Jul 19 21:53:28 2010 +0000 @@ -1164,12 +1164,16 @@ ; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride ; we add 1*stride to the third regular registry in the process -%macro WRITE_4x4D 9 +; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the +; same memory region), or 8 if they cover two separate buffers (third one points to +; a different memory region than the first two), allowing for more optimal code for +; the 16-width case +%macro WRITE_4x4D 10 ; write out (4 dwords per register), start with dwords zero movd [%5+%8*4], m%1 movd [%5], m%2 - movd [%5+%9*4], m%3 - movd [%5+%9*8], m%4 + movd [%7+%8*4], m%3 + movd [%7], m%4 ; store dwords 1 psrldq m%1, 4 @@ -1178,15 +1182,23 @@ psrldq m%4, 4 movd [%6+%8*4], m%1 movd [%6], m%2 +%if %10 == 16 movd [%6+%9*4], m%3 - movd [%6+%9*8], m%4 +%endif + movd [%7+%9], m%4 ; write dwords 2 psrldq m%1, 4 psrldq m%2, 4 +%if %10 == 8 + movd [%5+%8*2], m%1 + movd %5, m%3 +%endif psrldq m%3, 4 psrldq m%4, 4 +%if %10 == 16 movd [%5+%8*2], m%1 +%endif movd [%6+%9], m%2 movd [%7+%8*2], m%3 movd [%7+%9*2], m%4 @@ -1197,7 +1209,12 @@ psrldq m%2, 4 psrldq m%3, 4 psrldq m%4, 4 +%if %10 == 8 + mov [%7+%8*4], %5 + movd [%6+%8*2], m%1 +%else movd [%5+%8], m%1 +%endif movd [%6+%9*2], m%2 movd [%7+%8*2], m%3 movd [%7+%9*2], m%4 @@ -1335,7 +1352,7 @@ TRANSPOSE4x4B 0, 1, 2, 3, 4 %if mmsize == 16 ; sse2 add r3, r1 ; change from r4*8*stride to r0+8*stride - WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2 + WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16 %else ; mmx/mmxext WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 %endif @@ -1374,13 +1391,20 @@ SIMPLE_LOOPFILTER sse2, h, 6 ;----------------------------------------------------------------------------- -; void vp8_h/v_loop_filter_inner_(uint8_t *dst, int stride, +; void vp8_h/v_loop_filter_inner_(uint8_t *dst, [uint8_t *v,] int stride, ; int flimE, int flimI, int hev_thr); ;----------------------------------------------------------------------------- -%macro INNER_LOOPFILTER 4 -cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %4 -%define dst_reg r0 +%macro INNER_LOOPFILTER 5 +%if %4 == 8 ; chroma +cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 +%define dst8_reg r1 +%define mstride_reg r2 +%define E_reg r3 +%define I_reg r4 +%define hev_thr_reg r5 +%else ; luma +cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 %define mstride_reg r1 %define E_reg r2 %define I_reg r3 @@ -1392,6 +1416,8 @@ %else ; x86-32, mmx/mmxext %define cnt_reg r5 %endif +%endif +%define dst_reg r0 %define stride_reg E_reg %define dst2_reg I_reg %ifndef m8 @@ -1436,13 +1462,16 @@ SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh %endif -%if mmsize == 8 ; mmx/mmxext +%if mmsize == 8 && %4 == 16 ; mmx/mmxext mov cnt_reg, 2 %endif mov stride_reg, mstride_reg neg mstride_reg %ifidn %2, h lea dst_reg, [dst_reg + stride_reg*4-4] +%if %4 == 8 + lea dst8_reg, [dst8_reg+ stride_reg*4-4] +%endif %endif %if mmsize == 8 @@ -1451,12 +1480,27 @@ ; read lea dst2_reg, [dst_reg + stride_reg] %ifidn %2, v - mova m0, [dst_reg +mstride_reg*4] ; p3 - mova m1, [dst2_reg+mstride_reg*4] ; p2 - mova m2, [dst_reg +mstride_reg*2] ; p1 - mova m5, [dst2_reg] ; q1 - mova m6, [dst2_reg+ stride_reg] ; q2 - mova m7, [dst2_reg+ stride_reg*2] ; q3 +%if %4 == 8 && mmsize == 16 +%define movrow movh +%else +%define movrow mova +%endif + movrow m0, [dst_reg +mstride_reg*4] ; p3 + movrow m1, [dst2_reg+mstride_reg*4] ; p2 + movrow m2, [dst_reg +mstride_reg*2] ; p1 + movrow m5, [dst2_reg] ; q1 + movrow m6, [dst2_reg+ stride_reg] ; q2 + movrow m7, [dst2_reg+ stride_reg*2] ; q3 +%if mmsize == 16 && %4 == 8 + movhps m0, [dst8_reg+mstride_reg*4] + movhps m2, [dst8_reg+mstride_reg*2] + add dst8_reg, stride_reg + movhps m1, [dst8_reg+mstride_reg*4] + movhps m5, [dst8_reg] + movhps m6, [dst8_reg+ stride_reg] + movhps m7, [dst8_reg+ stride_reg*2] + add dst8_reg, mstride_reg +%endif %elif mmsize == 8 ; mmx/mmxext (h) ; read 8 rows of 8px each movu m0, [dst_reg +mstride_reg*4] @@ -1497,7 +1541,9 @@ SWAP 6, 3 SWAP 5, 3 %else ; sse2 (h) +%if %4 == 16 lea dst8_reg, [dst_reg + stride_reg*8] +%endif ; read 16 rows of 8px each, interleave movh m0, [dst_reg +mstride_reg*4] @@ -1609,7 +1655,10 @@ ; normal_limit and high_edge_variance for p1-p0, q1-q0 SWAP 7, 3 ; now m7 is zero %ifidn %2, v - mova m3, [dst_reg +mstride_reg] ; p0 + movrow m3, [dst_reg +mstride_reg] ; p0 +%if mmsize == 16 && %4 == 8 + movhps m3, [dst8_reg+mstride_reg] +%endif %elifdef m14 SWAP 3, 12 %else @@ -1642,7 +1691,10 @@ SWAP 6, 4 ; now m6 is I %ifidn %2, v - mova m4, [dst_reg] ; q0 + movrow m4, [dst_reg] ; q0 +%if mmsize == 16 && %4 == 8 + movhps m4, [dst8_reg] +%endif %elifdef m13 SWAP 4, 8 %else @@ -1793,13 +1845,19 @@ ; store %ifidn %2, v - mova [dst_reg+mstride_reg*2], m2 - mova [dst_reg+mstride_reg ], m3 - mova [dst_reg], m4 - mova [dst_reg+ stride_reg ], m5 + movrow [dst_reg +mstride_reg*2], m2 + movrow [dst_reg +mstride_reg ], m3 + movrow [dst_reg], m4 + movrow [dst_reg + stride_reg ], m5 +%if mmsize == 16 && %4 == 8 + movhps [dst8_reg+mstride_reg*2], m2 + movhps [dst8_reg+mstride_reg ], m3 + movhps [dst8_reg], m4 + movhps [dst8_reg+ stride_reg ], m5 +%endif %else ; h - add dst_reg, 2 - add dst2_reg, 2 + add dst_reg, 2 + add dst2_reg, 2 ; 4x8/16 transpose TRANSPOSE4x4B 2, 3, 4, 5, 6 @@ -1808,11 +1866,19 @@ WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg %else ; sse2 (h) lea dst8_reg, [dst8_reg+mstride_reg+2] - WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg + WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 %endif %endif %if mmsize == 8 +%if %4 == 8 ; chroma +%ifidn %2, h + sub dst_reg, 2 +%endif + cmp dst_reg, dst8_reg + mov dst_reg, dst8_reg + jnz .next8px +%else %ifidn %2, h lea dst_reg, [dst_reg + stride_reg*8-2] %else ; v @@ -1821,6 +1887,7 @@ dec cnt_reg jg .next8px %endif +%endif %ifndef m8 ; sse2 on x86-32 or mmx/mmxext mov rsp, stack_reg ; restore stack pointer @@ -1829,14 +1896,23 @@ %endmacro INIT_MMX -INNER_LOOPFILTER mmx, v, 6, 8 -INNER_LOOPFILTER mmx, h, 6, 8 -INNER_LOOPFILTER mmxext, v, 6, 8 -INNER_LOOPFILTER mmxext, h, 6, 8 +INNER_LOOPFILTER mmx, v, 6, 16, 8 +INNER_LOOPFILTER mmx, h, 6, 16, 8 +INNER_LOOPFILTER mmxext, v, 6, 16, 8 +INNER_LOOPFILTER mmxext, h, 6, 16, 8 + +INNER_LOOPFILTER mmx, v, 6, 8, 8 +INNER_LOOPFILTER mmx, h, 6, 8, 8 +INNER_LOOPFILTER mmxext, v, 6, 8, 8 +INNER_LOOPFILTER mmxext, h, 6, 8, 8 + INIT_XMM -INNER_LOOPFILTER sse2, v, 5, 13 +INNER_LOOPFILTER sse2, v, 5, 16, 13 %ifdef m8 -INNER_LOOPFILTER sse2, h, 5, 13 +INNER_LOOPFILTER sse2, h, 5, 16, 13 %else -INNER_LOOPFILTER sse2, h, 6, 13 +INNER_LOOPFILTER sse2, h, 6, 16, 13 %endif + +INNER_LOOPFILTER sse2, v, 6, 8, 13 +INNER_LOOPFILTER sse2, h, 6, 8, 13