# HG changeset patch # User rbultje # Date 1279574284 0 # Node ID 80b142c2e9f74f1867dda0b6c0ba29b35735231a # Parent 0a63bed2a00e621b91d0aea4b80524f8e3ba9b31 Change function prototypes for width=8 inner and mbedge loopfilter functions so that they process both the U and V planes at the same time. This will have speed advantages when using SSE2 (or higher) optimizations, since we can do both the U and V rows together in a single xmm register. This also renames filter16 to filter16y and filter8 to filter8uv so that it's more obvious what each function is used for. diff -r 0a63bed2a00e -r 80b142c2e9f7 vp8.c --- a/vp8.c Mon Jul 19 20:53:58 2010 +0000 +++ b/vp8.c Mon Jul 19 21:18:04 2010 +0000 @@ -1245,31 +1245,45 @@ bedge_lim = 2* filter_level + inner_limit; if (mb_x) { - s->vp8dsp.vp8_h_loop_filter16(dst[0], s->linesize, mbedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_h_loop_filter8 (dst[1], s->uvlinesize, mbedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_h_loop_filter8 (dst[2], s->uvlinesize, mbedge_lim, inner_limit, hev_thresh); + s->vp8dsp.vp8_h_loop_filter16y(dst[0], s->linesize, + mbedge_lim, inner_limit, hev_thresh); + s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], s->uvlinesize, + mbedge_lim, inner_limit, hev_thresh); } if (!mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT) { - s->vp8dsp.vp8_h_loop_filter16_inner(dst[0]+ 4, s->linesize, bedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_h_loop_filter16_inner(dst[0]+ 8, s->linesize, bedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_h_loop_filter16_inner(dst[0]+12, s->linesize, bedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_h_loop_filter8_inner (dst[1]+ 4, s->uvlinesize, bedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_h_loop_filter8_inner (dst[2]+ 4, s->uvlinesize, bedge_lim, inner_limit, hev_thresh); + s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, s->linesize, bedge_lim, + inner_limit, hev_thresh); + s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, s->linesize, bedge_lim, + 
inner_limit, hev_thresh); + s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, s->linesize, bedge_lim, + inner_limit, hev_thresh); + s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4, + s->uvlinesize, bedge_lim, + inner_limit, hev_thresh); } if (mb_y) { - s->vp8dsp.vp8_v_loop_filter16(dst[0], s->linesize, mbedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_v_loop_filter8 (dst[1], s->uvlinesize, mbedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_v_loop_filter8 (dst[2], s->uvlinesize, mbedge_lim, inner_limit, hev_thresh); + s->vp8dsp.vp8_v_loop_filter16y(dst[0], s->linesize, + mbedge_lim, inner_limit, hev_thresh); + s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], s->uvlinesize, + mbedge_lim, inner_limit, hev_thresh); } if (!mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT) { - s->vp8dsp.vp8_v_loop_filter16_inner(dst[0]+ 4*s->linesize, s->linesize, bedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_v_loop_filter16_inner(dst[0]+ 8*s->linesize, s->linesize, bedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_v_loop_filter16_inner(dst[0]+12*s->linesize, s->linesize, bedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_v_loop_filter8_inner (dst[1]+ 4*s->uvlinesize, s->uvlinesize, bedge_lim, inner_limit, hev_thresh); - s->vp8dsp.vp8_v_loop_filter8_inner (dst[2]+ 4*s->uvlinesize, s->uvlinesize, bedge_lim, inner_limit, hev_thresh); + s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*s->linesize, + s->linesize, bedge_lim, + inner_limit, hev_thresh); + s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*s->linesize, + s->linesize, bedge_lim, + inner_limit, hev_thresh); + s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*s->linesize, + s->linesize, bedge_lim, + inner_limit, hev_thresh); + s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * s->uvlinesize, + dst[2] + 4 * s->uvlinesize, + s->uvlinesize, bedge_lim, + inner_limit, hev_thresh); } } diff -r 0a63bed2a00e -r 80b142c2e9f7 vp8dsp.c --- a/vp8dsp.c Mon Jul 19 20:53:58 2010 +0000 +++ b/vp8dsp.c Mon Jul 19 
21:18:04 2010 +0000 @@ -196,8 +196,8 @@ p[ 2*stride] = cm[q2 - a2]; } -#define LOOP_FILTER(dir, size, stridea, strideb) \ -static void vp8_ ## dir ## _loop_filter ## size ## _c(uint8_t *dst, int stride,\ +#define LOOP_FILTER(dir, size, stridea, strideb, maybe_inline) \ +static maybe_inline void vp8_ ## dir ## _loop_filter ## size ## _c(uint8_t *dst, int stride,\ int flim_E, int flim_I, int hev_thresh)\ {\ int i;\ @@ -211,7 +211,7 @@ }\ }\ \ -static void vp8_ ## dir ## _loop_filter ## size ## _inner_c(uint8_t *dst, int stride,\ +static maybe_inline void vp8_ ## dir ## _loop_filter ## size ## _inner_c(uint8_t *dst, int stride,\ int flim_E, int flim_I, int hev_thresh)\ {\ int i;\ @@ -226,10 +226,26 @@ }\ } -LOOP_FILTER(v, 16, 1, stride) -LOOP_FILTER(h, 16, stride, 1) -LOOP_FILTER(v, 8, 1, stride) -LOOP_FILTER(h, 8, stride, 1) +LOOP_FILTER(v, 16, 1, stride,) +LOOP_FILTER(h, 16, stride, 1,) + +#define UV_LOOP_FILTER(dir, stridea, strideb) \ +LOOP_FILTER(dir, 8, stridea, strideb, av_always_inline) \ +static void vp8_ ## dir ## _loop_filter8uv_c(uint8_t *dstU, uint8_t *dstV, int stride,\ + int fE, int fI, int hev_thresh)\ +{\ + vp8_ ## dir ## _loop_filter8_c(dstU, stride, fE, fI, hev_thresh);\ + vp8_ ## dir ## _loop_filter8_c(dstV, stride, fE, fI, hev_thresh);\ +}\ +static void vp8_ ## dir ## _loop_filter8uv_inner_c(uint8_t *dstU, uint8_t *dstV, int stride,\ + int fE, int fI, int hev_thresh)\ +{\ + vp8_ ## dir ## _loop_filter8_inner_c(dstU, stride, fE, fI, hev_thresh);\ + vp8_ ## dir ## _loop_filter8_inner_c(dstV, stride, fE, fI, hev_thresh);\ +} + +UV_LOOP_FILTER(v, 1, stride) +UV_LOOP_FILTER(h, stride, 1) static void vp8_v_loop_filter_simple_c(uint8_t *dst, int stride, int flim) { @@ -443,15 +459,15 @@ dsp->vp8_idct_add = vp8_idct_add_c; dsp->vp8_idct_dc_add = vp8_idct_dc_add_c; - dsp->vp8_v_loop_filter16 = vp8_v_loop_filter16_c; - dsp->vp8_h_loop_filter16 = vp8_h_loop_filter16_c; - dsp->vp8_v_loop_filter8 = vp8_v_loop_filter8_c; - dsp->vp8_h_loop_filter8 = 
vp8_h_loop_filter8_c; + dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c; + dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c; + dsp->vp8_v_loop_filter8uv = vp8_v_loop_filter8uv_c; + dsp->vp8_h_loop_filter8uv = vp8_h_loop_filter8uv_c; - dsp->vp8_v_loop_filter16_inner = vp8_v_loop_filter16_inner_c; - dsp->vp8_h_loop_filter16_inner = vp8_h_loop_filter16_inner_c; - dsp->vp8_v_loop_filter8_inner = vp8_v_loop_filter8_inner_c; - dsp->vp8_h_loop_filter8_inner = vp8_h_loop_filter8_inner_c; + dsp->vp8_v_loop_filter16y_inner = vp8_v_loop_filter16_inner_c; + dsp->vp8_h_loop_filter16y_inner = vp8_h_loop_filter16_inner_c; + dsp->vp8_v_loop_filter8uv_inner = vp8_v_loop_filter8uv_inner_c; + dsp->vp8_h_loop_filter8uv_inner = vp8_h_loop_filter8uv_inner_c; dsp->vp8_v_loop_filter_simple = vp8_v_loop_filter_simple_c; dsp->vp8_h_loop_filter_simple = vp8_h_loop_filter_simple_c; diff -r 0a63bed2a00e -r 80b142c2e9f7 vp8dsp.h --- a/vp8dsp.h Mon Jul 19 20:53:58 2010 +0000 +++ b/vp8dsp.h Mon Jul 19 21:18:04 2010 +0000 @@ -35,16 +35,24 @@ void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride); // loop filter applied to edges between macroblocks - void (*vp8_v_loop_filter16)(uint8_t *dst, int stride, int flim_E, int flim_I, int hev_thresh); - void (*vp8_h_loop_filter16)(uint8_t *dst, int stride, int flim_E, int flim_I, int hev_thresh); - void (*vp8_v_loop_filter8)(uint8_t *dst, int stride, int flim_E, int flim_I, int hev_thresh); - void (*vp8_h_loop_filter8)(uint8_t *dst, int stride, int flim_E, int flim_I, int hev_thresh); + void (*vp8_v_loop_filter16y)(uint8_t *dst, int stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_h_loop_filter16y)(uint8_t *dst, int stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_v_loop_filter8uv)(uint8_t *dstU, uint8_t *dstV, int stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_h_loop_filter8uv)(uint8_t *dstU, uint8_t *dstV, int stride, + int flim_E, int flim_I, int hev_thresh); // loop filter applied to inner 
macroblock edges - void (*vp8_v_loop_filter16_inner)(uint8_t *dst, int stride, int flim_E, int flim_I, int hev_thresh); - void (*vp8_h_loop_filter16_inner)(uint8_t *dst, int stride, int flim_E, int flim_I, int hev_thresh); - void (*vp8_v_loop_filter8_inner)(uint8_t *dst, int stride, int flim_E, int flim_I, int hev_thresh); - void (*vp8_h_loop_filter8_inner)(uint8_t *dst, int stride, int flim_E, int flim_I, int hev_thresh); + void (*vp8_v_loop_filter16y_inner)(uint8_t *dst, int stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_h_loop_filter16y_inner)(uint8_t *dst, int stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_v_loop_filter8uv_inner)(uint8_t *dstU, uint8_t *dstV, int stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_h_loop_filter8uv_inner)(uint8_t *dstU, uint8_t *dstV, int stride, + int flim_E, int flim_I, int hev_thresh); void (*vp8_v_loop_filter_simple)(uint8_t *dst, int stride, int flim); void (*vp8_h_loop_filter_simple)(uint8_t *dst, int stride, int flim); diff -r 0a63bed2a00e -r 80b142c2e9f7 x86/vp8dsp-init.c --- a/x86/vp8dsp-init.c Mon Jul 19 20:53:58 2010 +0000 +++ b/x86/vp8dsp-init.c Mon Jul 19 21:18:04 2010 +0000 @@ -230,18 +230,18 @@ extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); -extern void ff_vp8_v_loop_filter16_inner_mmx (uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_v_loop_filter16_inner_mmxext(uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_v_loop_filter16_inner_sse2 (uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_h_loop_filter16_inner_mmx (uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_h_loop_filter16_inner_mmxext(uint8_t *dst, int stride, - int e, int i, int hvt); -extern void ff_vp8_h_loop_filter16_inner_sse2 (uint8_t *dst, int stride, - int e, int i, int hvt); +extern void 
ff_vp8_v_loop_filter16y_inner_mmx (uint8_t *dst, int stride, + int e, int i, int hvt); +extern void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, + int e, int i, int hvt); +extern void ff_vp8_v_loop_filter16y_inner_sse2 (uint8_t *dst, int stride, + int e, int i, int hvt); +extern void ff_vp8_h_loop_filter16y_inner_mmx (uint8_t *dst, int stride, + int e, int i, int hvt); +extern void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, + int e, int i, int hvt); +extern void ff_vp8_h_loop_filter16y_inner_sse2 (uint8_t *dst, int stride, + int e, int i, int hvt); #endif #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ @@ -284,8 +284,8 @@ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx; - c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_mmx; - c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_mmx; + c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx; + c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx; } /* note that 4-tap width=16 functions are missing because w=16 @@ -302,8 +302,8 @@ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext; - c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_mmxext; - c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_mmxext; + c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext; + c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext; } if (mm_flags & FF_MM_SSE) { @@ -320,8 +320,8 @@ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; - c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_sse2; - c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_sse2; + c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; + c->vp8_h_loop_filter16y_inner = 
ff_vp8_h_loop_filter16y_inner_sse2; } if (mm_flags & FF_MM_SSSE3) { diff -r 0a63bed2a00e -r 80b142c2e9f7 x86/vp8dsp.asm --- a/x86/vp8dsp.asm Mon Jul 19 20:53:58 2010 +0000 +++ b/x86/vp8dsp.asm Mon Jul 19 21:18:04 2010 +0000 @@ -1379,7 +1379,7 @@ ;----------------------------------------------------------------------------- %macro INNER_LOOPFILTER 4 -cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4 +cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %4 %define dst_reg r0 %define mstride_reg r1 %define E_reg r2