Mercurial > libavcodec.hg
changeset 2902:3c79bc9f3aa9 libavcodec
h264 mmx weighted prediction. up to 3% overall speedup.
author | lorenm |
---|---|
date | Sun, 09 Oct 2005 23:38:52 +0000 |
parents | bf8d87efc31a |
children | 59af72b453e3 |
files | i386/dsputil_mmx.c i386/h264dsp_mmx.c |
diffstat | 2 files changed, 115 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/i386/dsputil_mmx.c Thu Oct 06 12:13:25 2005 +0000 +++ b/i386/dsputil_mmx.c Sun Oct 09 23:38:52 2005 +0000 @@ -2832,6 +2832,24 @@ c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; + c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; + c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; + c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; + c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; + c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; + c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; + c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; + c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; + + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; + c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; + c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; + c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; + #ifdef CONFIG_ENCODERS c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; #endif //CONFIG_ENCODERS
--- a/i386/h264dsp_mmx.c Thu Oct 06 12:13:25 2005 +0000 +++ b/i386/h264dsp_mmx.c Sun Oct 09 23:38:52 2005 +0000 @@ -909,3 +909,100 @@ #undef H264_CHROMA_OP #undef H264_CHROMA_MC8_TMPL +/***********************************/ +/* weighted prediction */ + +static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) +{ + int x, y; + offset <<= log2_denom; + offset += (1 << log2_denom) >> 1; + asm volatile( + "movd %0, %%mm4 \n\t" + "movd %1, %%mm5 \n\t" + "movd %2, %%mm6 \n\t" + "pshufw $0, %%mm4, %%mm4 \n\t" + "pshufw $0, %%mm5, %%mm5 \n\t" + "pxor %%mm7, %%mm7 \n\t" + :: "g"(weight), "g"(offset), "g"(log2_denom) + ); + for(y=0; y<h; y+=2){ + for(x=0; x<w; x+=4){ + asm volatile( + "movd %0, %%mm0 \n\t" + "movd %1, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pmullw %%mm4, %%mm0 \n\t" + "pmullw %%mm4, %%mm1 \n\t" + "paddw %%mm5, %%mm0 \n\t" + "paddw %%mm5, %%mm1 \n\t" + "psraw %%mm6, %%mm0 \n\t" + "psraw %%mm6, %%mm1 \n\t" + "packuswb %%mm7, %%mm0 \n\t" + "packuswb %%mm7, %%mm1 \n\t" + "movd %%mm0, %0 \n\t" + "movd %%mm1, %1 \n\t" + : "+m"(*(uint32_t*)(dst+x)), + "+m"(*(uint32_t*)(dst+x+stride)) + ); + } + dst += 2*stride; + } +} + +static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets, int w, int h) +{ + int x, y; + int offset = ((offsets + offsetd + 1) | 1) << log2_denom; + asm volatile( + "movd %0, %%mm3 \n\t" + "movd %1, %%mm4 \n\t" + "movd %2, %%mm5 \n\t" + "movd %3, %%mm6 \n\t" + "pshufw $0, %%mm3, %%mm3 \n\t" + "pshufw $0, %%mm4, %%mm4 \n\t" + "pshufw $0, %%mm5, %%mm5 \n\t" + "pxor %%mm7, %%mm7 \n\t" + :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1) + ); + for(y=0; y<h; y++){ + for(x=0; x<w; x+=4){ + asm volatile( + "movd %0, %%mm0 \n\t" + "movd %1, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pmullw %%mm3, %%mm0 \n\t" + "pmullw %%mm4, %%mm1 \n\t" + "paddw %%mm5, %%mm0 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "psraw %%mm6, %%mm0 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "movd %%mm0, %0 \n\t" + : "+m"(*(uint32_t*)(dst+x)) + : "m"(*(uint32_t*)(src+x)) + ); + } + src += stride; + dst += stride; + } +} + +#define H264_WEIGHT(W,H) \ +static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \ + ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offsetd, offsets, W, H); \ +} \ +static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ + ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \ +} + +H264_WEIGHT(16,16) +H264_WEIGHT(16, 8) +H264_WEIGHT( 8,16) +H264_WEIGHT( 8, 8) +H264_WEIGHT( 8, 4) +H264_WEIGHT( 4, 8) +H264_WEIGHT( 4, 4) +H264_WEIGHT( 4, 2) +