# HG changeset patch
# User lorenm
# Date 1143144996 0
# Node ID 06f98047ff260fc6223d9ace0360b8da5ecc13bf
# Parent 91f89a395b286983ad52c69eca7235715478ed92
prefetch pixels for future motion compensation. 2-5% faster h264.

diff -r 91f89a395b28 -r 06f98047ff26 dsputil.c
--- a/dsputil.c	Wed Mar 22 22:08:28 2006 +0000
+++ b/dsputil.c	Thu Mar 23 20:16:36 2006 +0000
@@ -3773,6 +3773,8 @@
     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
 }
 
+static void just_return(void *mem, int stride, int h) { (void)mem; (void)stride; (void)h; } /* no-op default for DSPContext.prefetch; parameters match the pointer type */
+
 /* init static data */
 void dsputil_static_init(void)
 {
@@ -4054,6 +4056,8 @@
     c->inner_add_yblock = ff_snow_inner_add_yblock;
 #endif
 
+    c->prefetch= just_return;
+
 #ifdef HAVE_MMX
     dsputil_init_mmx(c, avctx);
 #endif
diff -r 91f89a395b28 -r 06f98047ff26 dsputil.h
--- a/dsputil.h	Wed Mar 22 22:08:28 2006 +0000
+++ b/dsputil.h	Thu Mar 23 20:16:36 2006 +0000
@@ -343,6 +343,8 @@
     void (*vertical_compose97i)(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
     void (*horizontal_compose97i)(DWTELEM *b, int width);
     void (*inner_add_yblock)(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+    void (*prefetch)(void *mem, int stride, int h); /* prefetch hint for h addresses starting at mem, spaced stride bytes apart */
 } DSPContext;
 
 void dsputil_static_init(void);
diff -r 91f89a395b28 -r 06f98047ff26 h264.c
--- a/h264.c	Wed Mar 22 22:08:28 2006 +0000
+++ b/h264.c	Thu Mar 23 20:16:36 2006 +0000
@@ -2752,6 +2752,22 @@
                x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
 }
 
+static inline void prefetch_motion(H264Context *h, int list){
+    /* fetch pixels for estimated mv 4 macroblocks ahead
+     * optimized for 64byte cache lines */
+    MpegEncContext * const s = &h->s;
+    const int refn = h->ref_cache[list][scan8[0]];
+    if(refn >= 0){
+        const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
+        const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
+        uint8_t **src= h->ref_list[list][refn].data;
+        int off= mx + (my + (s->mb_x&3)*4)*s->linesize + 64;
+        s->dsp.prefetch(src[0]+off, s->linesize, 4);
+        off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
+        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2); /* stride = distance between planes 1 and 2, so the 2nd fetch lands at src[2]+off */
+    }
+}
+
 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
@@ -2762,6 +2778,8 @@
 
     assert(IS_INTER(mb_type));
 
+    prefetch_motion(h, 0);
+
     if(IS_16X16(mb_type)){
         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
@@ -2833,6 +2851,8 @@
             }
         }
     }
+
+    prefetch_motion(h, 1);
 }
 
 static void decode_init_vlc(H264Context *h){
diff -r 91f89a395b28 -r 06f98047ff26 i386/dsputil_mmx.c
--- a/i386/dsputil_mmx.c	Wed Mar 22 22:08:28 2006 +0000
+++ b/i386/dsputil_mmx.c	Thu Mar 23 20:16:36 2006 +0000
@@ -2489,6 +2489,18 @@
     }
 }
 
+#define PREFETCH(name, op) \
+static void name(void *mem, int stride, int h){ /* h must be >= 1: the loop body runs before h is tested */\
+    const uint8_t *p= mem;\
+    do{\
+        asm volatile(#op" %0" :: "m"(*p));\
+        p+= stride;\
+    }while(--h);\
+}
+PREFETCH(prefetch_mmx2, prefetcht0)
+PREFETCH(prefetch_3dnow, prefetch)
+#undef PREFETCH
+
 #include "h264dsp_mmx.c"
 
 /* external functions, from idct_mmx.c */
@@ -2749,6 +2761,8 @@
         c->h264_idct8_add= ff_h264_idct8_add_mmx;
 
         if (mm_flags & MM_MMXEXT) {
+            c->prefetch = prefetch_mmx2;
+
             c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
             c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
 
@@ -2879,6 +2893,8 @@
             c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
 #endif //CONFIG_ENCODERS
         } else if (mm_flags & MM_3DNOW) {
+            c->prefetch = prefetch_3dnow;
+
             c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
             c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
 