# HG changeset patch # User lorenm # Date 1178937685 0 # Node ID 689490842cf55c20998b40b0ded5fef3b2a8af6d # Parent 02199b094850a1a2c7449ada9acaf5d0e78ab7ec factor sum_abs_dctelem out of dct_sad, and simd it. sum_abs_dctelem_* alone: core2: c=186 mmx2=39 sse2=21 ssse3=13 (cycles) k8: c=163 mmx2=33 sse2=31 p4: c=370 mmx2=60 sse2=60 dct_sad including sum_abs_dctelem_*: core2: c=405 mmx2=258 sse2=240 ssse3=232 k8: c=624 mmx2=394 sse2=392 p4: c=849 mmx2=556 sse2=556 diff -r 02199b094850 -r 689490842cf5 dsputil.c --- a/dsputil.c Sat May 12 01:16:06 2007 +0000 +++ b/dsputil.c Sat May 12 02:41:25 2007 +0000 @@ -592,6 +592,14 @@ } } +static int sum_abs_dctelem_c(DCTELEM *block) +{ + int sum=0, i; + for(i=0; i<64; i++) + sum+= FFABS(block[i]); + return sum; +} + #if 0 #define PIXOP2(OPNAME, OP) \ @@ -3385,19 +3393,14 @@ static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ MpegEncContext * const s= (MpegEncContext *)c; - DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); + DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); DCTELEM * const temp= (DCTELEM*)aligned_temp; - int sum=0, i; assert(h==8); s->dsp.diff_pixels(temp, src1, src2, stride); s->dsp.fdct(temp); - - for(i=0; i<64; i++) - sum+= FFABS(temp[i]); - - return sum; + return s->dsp.sum_abs_dctelem(temp); } #ifdef CONFIG_GPL @@ -3905,6 +3908,7 @@ c->add_pixels_clamped = add_pixels_clamped_c; c->add_pixels8 = add_pixels8_c; c->add_pixels4 = add_pixels4_c; + c->sum_abs_dctelem = sum_abs_dctelem_c; c->gmc1 = gmc1_c; c->gmc = ff_gmc_c; c->clear_blocks = clear_blocks_c; diff -r 02199b094850 -r 689490842cf5 dsputil.h --- a/dsputil.h Sat May 12 01:16:06 2007 +0000 +++ b/dsputil.h Sat May 12 02:41:25 2007 +0000 @@ -163,6 +163,7 @@ void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size); void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size); + int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/); /** * translational global motion compensation. */ diff -r 02199b094850 -r 689490842cf5 i386/dsputil_mmx.c --- a/i386/dsputil_mmx.c Sat May 12 01:16:06 2007 +0000 +++ b/i386/dsputil_mmx.c Sat May 12 02:41:25 2007 +0000 @@ -1649,6 +1649,9 @@ "movq "#c", "#o"+16(%1) \n\t"\ "movq "#d", "#o"+24(%1) \n\t"\ +/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to + * about 100k on extreme inputs. But that's very unlikely to occur in natural video, + * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ #define HSUM_MMX(a, t, dst)\ "movq "#a", "#t" \n\t"\ "psrlq $32, "#a" \n\t"\ @@ -1802,6 +1805,71 @@ #undef MMABS_SUM_8x8 #endif +#define DCT_SAD4(m,mm,o)\ + "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ + "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ + "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ + "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ + MMABS_SUM(mm##2, mm##6, mm##0)\ + MMABS_SUM(mm##3, mm##7, mm##1)\ + MMABS_SUM(mm##4, mm##6, mm##0)\ + MMABS_SUM(mm##5, mm##7, mm##1)\ + +#define DCT_SAD_MMX\ + "pxor %%mm0, %%mm0 \n\t"\ + "pxor %%mm1, %%mm1 \n\t"\ + DCT_SAD4(q, %%mm, 0)\ + DCT_SAD4(q, %%mm, 8)\ + DCT_SAD4(q, %%mm, 64)\ + DCT_SAD4(q, %%mm, 72)\ + "paddusw %%mm1, %%mm0 \n\t"\ + HSUM(%%mm0, %%mm1, %0) + +#define DCT_SAD_SSE2\ + "pxor %%xmm0, %%xmm0 \n\t"\ + "pxor %%xmm1, %%xmm1 \n\t"\ + DCT_SAD4(dqa, %%xmm, 0)\ + DCT_SAD4(dqa, %%xmm, 64)\ + "paddusw %%xmm1, %%xmm0 \n\t"\ + HSUM(%%xmm0, %%xmm1, %0) + +#define DCT_SAD_FUNC(cpu) \ +static int sum_abs_dctelem_##cpu(DCTELEM *block){\ + int sum;\ + asm volatile(\ + DCT_SAD\ + :"=r"(sum)\ + :"r"(block)\ + );\ + return sum&0xFFFF;\ +} + +#define DCT_SAD DCT_SAD_MMX +#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) +#define MMABS(a,z) MMABS_MMX(a,z) +DCT_SAD_FUNC(mmx) +#undef MMABS +#undef HSUM + +#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) +#define MMABS(a,z) MMABS_MMX2(a,z) +DCT_SAD_FUNC(mmx2) +#undef HSUM +#undef DCT_SAD + +#define DCT_SAD DCT_SAD_SSE2 +#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) +DCT_SAD_FUNC(sse2) +#undef MMABS + +#ifdef HAVE_SSSE3 +#define MMABS(a,z) MMABS_SSSE3(a,z) +DCT_SAD_FUNC(ssse3) +#undef MMABS +#endif +#undef HSUM +#undef DCT_SAD + static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){ int sum; long i=size; @@ -3298,6 +3366,7 @@ c->add_bytes= add_bytes_mmx; #ifdef CONFIG_ENCODERS c->diff_bytes= diff_bytes_mmx; + c->sum_abs_dctelem= sum_abs_dctelem_mmx; c->hadamard8_diff[0]= hadamard8_diff16_mmx; c->hadamard8_diff[1]= hadamard8_diff_mmx; @@ -3350,6 +3419,7 @@ c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; #ifdef CONFIG_ENCODERS + c->sum_abs_dctelem= sum_abs_dctelem_mmx2; c->hadamard8_diff[0]= hadamard8_diff16_mmx2; c->hadamard8_diff[1]= hadamard8_diff_mmx2; c->vsad[4]= vsad_intra16_mmx2; @@ -3569,12 +3639,14 @@ #ifdef CONFIG_ENCODERS if(mm_flags & MM_SSE2){ + c->sum_abs_dctelem= sum_abs_dctelem_sse2; c->hadamard8_diff[0]= hadamard8_diff16_sse2; c->hadamard8_diff[1]= hadamard8_diff_sse2; } #ifdef HAVE_SSSE3 if(mm_flags & MM_SSSE3){ + c->sum_abs_dctelem= sum_abs_dctelem_ssse3; c->hadamard8_diff[0]= hadamard8_diff16_ssse3; c->hadamard8_diff[1]= hadamard8_diff_ssse3; }