# HG changeset patch
# User conrad
# Date 1271469870 0
# Node ID f7281af560fe5ff8485ec672fe8e7c144124ebb6
# Parent a9e758788a12d1f09413c516ef03e87fcd043ab9
# Reviewer summary (commentary only; no hunk below is altered):
#  * dsputil.h declares ff_vp3_idct_dc_add_c() and adds a vp3_idct_dc_add
#    function-pointer member placed before the vp3 loop-filter pointers.
#  * dsputil.c, arm/dsputil_init_neon.c and x86/dsputil_mmx.c assign the C,
#    NEON and MMX2 implementations to that pointer, each inside an
#    "if (CONFIG_VP3_DECODER)" block.
#  * vp3dsp.c (C version): takes block[0], applies (46341*dc)>>16, then
#    (46341*dc + (8<<16))>>20, and adds the result to every pixel of 8 rows
#    of 8 bytes, clamping through the ff_cropTbl lookup.
#  * arm/vp3dsp_neon.S: loads a signed halfword from [r2], multiplies by
#    46341 with mul and again with smulwt, broadcasts the value into q15 and
#    applies a rounding shift right by 4. It then loads 8 rows of 8 bytes
#    from r0 (stride r1), widen-adds q15, narrows with unsigned saturation
#    and stores the rows back through r3, a copy of the original r0.
#  * x86/vp3dsp_mmx.c: computes dc in C exactly like the C version, then
#    broadcasts it, builds a negated copy, packs both with unsigned
#    saturation and applies paddusb/psubusb to two groups of four rows.
#  * vp3.c: drops the local "intra" flag; the MODE_INTRA branch calls
#    vp3_dequant() with 0, the other branch calls it with 1 and uses
#    idct_add when the return value is non-zero, vp3_idct_dc_add otherwise.
# NOTE(review): vp3_dequant() is not part of this patch; presumably it
#    already returns a value that is zero only for DC-only blocks - confirm.
# NOTE(review): the MMX2 inline asm stores through dest and uses mm0-mm5 but
#    lists no clobbers (no "memory") - confirm this is intentional.
vp3: DC-only IDCT
2-4% faster overall decode

diff -r a9e758788a12 -r f7281af560fe arm/dsputil_init_neon.c
--- a/arm/dsputil_init_neon.c Fri Apr 16 12:21:44 2010 +0000
+++ b/arm/dsputil_init_neon.c Sat Apr 17 02:04:30 2010 +0000
@@ -32,6 +32,7 @@
 void ff_vp3_idct_neon(DCTELEM *data);
 void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
 void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data);
 
 void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
 void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
@@ -294,6 +295,7 @@
     if (CONFIG_VP3_DECODER) {
         c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
         c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
+        c->vp3_idct_dc_add = ff_vp3_idct_dc_add_neon;
     }
 
     c->vector_fmul = ff_vector_fmul_neon;
diff -r a9e758788a12 -r f7281af560fe arm/vp3dsp_neon.S
--- a/arm/vp3dsp_neon.S Fri Apr 16 12:21:44 2010 +0000
+++ b/arm/vp3dsp_neon.S Sat Apr 17 02:04:30 2010 +0000
@@ -374,3 +374,47 @@
     vst1.64 {d7}, [r2,:64], r1
     bx lr
 endfunc
+
+function ff_vp3_idct_dc_add_neon, export=1
+    ldrsh r2, [r2]
+    movw r3, #46341
+    mul r2, r3, r2
+    smulwt r2, r3, r2
+    mov r3, r0
+    vdup.16 q15, r2
+    vrshr.s16 q15, q15, #4
+
+    vld1.8 {d0}, [r0,:64], r1
+    vld1.8 {d1}, [r0,:64], r1
+    vld1.8 {d2}, [r0,:64], r1
+    vaddw.u8 q8, q15, d0
+    vld1.8 {d3}, [r0,:64], r1
+    vaddw.u8 q9, q15, d1
+    vld1.8 {d4}, [r0,:64], r1
+    vaddw.u8 q10, q15, d2
+    vld1.8 {d5}, [r0,:64], r1
+    vaddw.u8 q11, q15, d3
+    vld1.8 {d6}, [r0,:64], r1
+    vaddw.u8 q12, q15, d4
+    vld1.8 {d7}, [r0,:64], r1
+    vaddw.u8 q13, q15, d5
+    vqmovun.s16 d0, q8
+    vaddw.u8 q14, q15, d6
+    vqmovun.s16 d1, q9
+    vaddw.u8 q15, q15, d7
+    vqmovun.s16 d2, q10
+    vst1.8 {d0}, [r3,:64], r1
+    vqmovun.s16 d3, q11
+    vst1.8 {d1}, [r3,:64], r1
+    vqmovun.s16 d4, q12
+    vst1.8 {d2}, [r3,:64], r1
+    vqmovun.s16 d5, q13
+    vst1.8 {d3}, [r3,:64], r1
+    vqmovun.s16 d6, q14
+    vst1.8 {d4}, [r3,:64], r1
+    vqmovun.s16 d7, q15
+    vst1.8 {d5}, [r3,:64], r1
+    vst1.8 {d6}, [r3,:64], r1
+    vst1.8 {d7}, [r3,:64], r1
+    bx lr
+endfunc
diff -r a9e758788a12 -r f7281af560fe dsputil.c
--- a/dsputil.c Fri Apr 16 12:21:44 2010 +0000
+++ b/dsputil.c Sat Apr 17 02:04:30 2010 +0000
@@ -4467,6 +4467,7 @@
     if (CONFIG_VP3_DECODER) {
         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
+        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
     }
     if (CONFIG_VP6_DECODER) {
         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
diff -r a9e758788a12 -r f7281af560fe dsputil.h
--- a/dsputil.h Fri Apr 16 12:21:44 2010 +0000
+++ b/dsputil.h Sat Apr 17 02:04:30 2010 +0000
@@ -86,6 +86,7 @@
 void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
 void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
 void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
 
 void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
 void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
@@ -373,6 +374,7 @@
     void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
     void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);
 
+    void (*vp3_idct_dc_add)(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
     void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
     void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
 
diff -r a9e758788a12 -r f7281af560fe vp3.c
--- a/vp3.c Fri Apr 16 12:21:44 2010 +0000
+++ b/vp3.c Sat Apr 17 02:04:30 2010 +0000
@@ -1395,8 +1395,6 @@
 
             /* transform if this block was coded */
             if (s->all_fragments[i].coding_method != MODE_COPY) {
-                int intra = s->all_fragments[i].coding_method == MODE_INTRA;
-
                 if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) ||
                     (s->all_fragments[i].coding_method == MODE_GOLDEN_MV))
                     motion_source= golden_plane;
@@ -1456,11 +1454,11 @@
                 }
 
                 s->dsp.clear_block(block);
-                vp3_dequant(s, s->all_fragments + i, plane, !intra, block);
 
                 /* invert DCT and place (or add) in final output */
 
                 if (s->all_fragments[i].coding_method == MODE_INTRA) {
+                    vp3_dequant(s, s->all_fragments + i, plane, 0, block);
                     if(s->avctx->idct_algo!=FF_IDCT_VP3)
                         block[0] += 128<<3;
                     s->dsp.idct_put(
@@ -1468,10 +1466,14 @@
                         stride,
                         block);
                 } else {
+                    if (vp3_dequant(s, s->all_fragments + i, plane, 1, block)) {
                     s->dsp.idct_add(
                         output_plane + first_pixel,
                         stride,
                         block);
+                    } else {
+                        s->dsp.vp3_idct_dc_add(output_plane + first_pixel, stride, block);
+                    }
                 }
             } else {
 
diff -r a9e758788a12 -r f7281af560fe vp3dsp.c
--- a/vp3dsp.c Fri Apr 16 12:21:44 2010 +0000
+++ b/vp3dsp.c Sat Apr 17 02:04:30 2010 +0000
@@ -223,6 +223,25 @@
     idct(dest, line_size, block, 2);
 }
 
+void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/){
+    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+    int i, dc = block[0];
+    dc = (46341*dc)>>16;
+    dc = (46341*dc + (8<<16))>>20;
+
+    for(i = 0; i < 8; i++){
+        dest[0] = cm[dest[0]+dc];
+        dest[1] = cm[dest[1]+dc];
+        dest[2] = cm[dest[2]+dc];
+        dest[3] = cm[dest[3]+dc];
+        dest[4] = cm[dest[4]+dc];
+        dest[5] = cm[dest[5]+dc];
+        dest[6] = cm[dest[6]+dc];
+        dest[7] = cm[dest[7]+dc];
+        dest += line_size;
+    }
+}
+
 void ff_vp3_v_loop_filter_c(uint8_t *first_pixel, int stride, int *bounding_values)
 {
     unsigned char *end;
diff -r a9e758788a12 -r f7281af560fe x86/dsputil_mmx.c
--- a/x86/dsputil_mmx.c Fri Apr 16 12:21:44 2010 +0000
+++ b/x86/dsputil_mmx.c Sat Apr 17 02:04:30 2010 +0000
@@ -2653,6 +2653,9 @@
                 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
             }
         }
+        if (CONFIG_VP3_DECODER) {
+            c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
+        }
 
 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
         c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
diff -r a9e758788a12 -r f7281af560fe x86/vp3dsp_mmx.c
--- a/x86/vp3dsp_mmx.c Fri Apr 16 12:21:44 2010 +0000
+++ b/x86/vp3dsp_mmx.c Sat Apr 17 02:04:30 2010 +0000
@@ -395,3 +395,44 @@
     ff_vp3_idct_mmx(block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
+
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
+{
+    int dc = block[0];
+    dc = (46341*dc)>>16;
+    dc = (46341*dc + (8<<16))>>20;
+
+    __asm__ volatile(
+        "movd %3, %%mm0 \n\t"
+        "pshufw $0, %%mm0, %%mm0 \n\t"
+        "pxor %%mm1, %%mm1 \n\t"
+        "psubw %%mm0, %%mm1 \n\t"
+        "packuswb %%mm0, %%mm0 \n\t"
+        "packuswb %%mm1, %%mm1 \n\t"
+
+#define DC_ADD \
+        "movq (%0), %%mm2 \n\t" \
+        "movq (%0,%1), %%mm3 \n\t" \
+        "paddusb %%mm0, %%mm2 \n\t" \
+        "movq (%0,%1,2), %%mm4 \n\t" \
+        "paddusb %%mm0, %%mm3 \n\t" \
+        "movq (%0,%2), %%mm5 \n\t" \
+        "paddusb %%mm0, %%mm4 \n\t" \
+        "paddusb %%mm0, %%mm5 \n\t" \
+        "psubusb %%mm1, %%mm2 \n\t" \
+        "psubusb %%mm1, %%mm3 \n\t" \
+        "movq %%mm2, (%0) \n\t" \
+        "psubusb %%mm1, %%mm4 \n\t" \
+        "movq %%mm3, (%0,%1) \n\t" \
+        "psubusb %%mm1, %%mm5 \n\t" \
+        "movq %%mm4, (%0,%1,2) \n\t" \
+        "movq %%mm5, (%0,%2) \n\t"
+
+        DC_ADD
+        "lea (%0,%1,4), %0 \n\t"
+        DC_ADD
+
+        : "+r"(dest)
+        : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
+    );
+}
diff -r a9e758788a12 -r f7281af560fe x86/vp3dsp_mmx.h
--- a/x86/vp3dsp_mmx.h Fri Apr 16 12:21:44 2010 +0000
+++ b/x86/vp3dsp_mmx.h Sat Apr 17 02:04:30 2010 +0000
@@ -28,6 +28,7 @@
 void ff_vp3_idct_mmx(int16_t *data);
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
 
 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);