# HG changeset patch # User gpoirier # Date 1143029315 0 # Node ID b77b5e7072d60aee0c408ae4fafe8e4b71451206 # Parent 81cafbc23b8d2c64b1a6d896eb553f1a36ae814f add MMX and SSE versions of ff_snow_inner_add_yblock Patch by Robert Edele < yartrebo AH earthlink POIS net > Original Thread: Date: Mar 22, 2006 3:24 AM Subject: [Ffmpeg-devel] [PATCH] snow mmx + sse2 part 5 diff -r 81cafbc23b8d -r b77b5e7072d6 i386/dsputil_mmx.c --- a/i386/dsputil_mmx.c Tue Mar 21 21:51:07 2006 +0000 +++ b/i386/dsputil_mmx.c Wed Mar 22 12:08:35 2006 +0000 @@ -2569,6 +2569,10 @@ extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width); extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width); extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width); +extern void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); +extern void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); #endif void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) @@ -2962,10 +2966,12 @@ if(mm_flags & MM_SSE2){ c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; + c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; } else{ c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; + c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; } #endif } diff -r 81cafbc23b8d -r b77b5e7072d6 i386/snowdsp_mmx.c --- a/i386/snowdsp_mmx.c Tue Mar 21 21:51:07 2006 +0000 +++ b/i386/snowdsp_mmx.c Wed Mar 22 12:08:35 2006 +0000 @@ -653,3 +653,294 @@ "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): "%"REG_a"","%"REG_b"","%"REG_c""); } + +#define snow_inner_add_yblock_sse2_header \ + DWTELEM * * dst_array = sb->line + src_y;\ + asm volatile(\ + "mov %6, %%"REG_c" \n\t"\ + "mov %5, %%"REG_b" \n\t"\ + "mov %3, %%"REG_S" \n\t"\ + "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ + "pcmpeqd %%xmm3, %%xmm3 \n\t"\ + "pslld $31, %%xmm3 \n\t"\ + "psrld $24, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ + "1: \n\t"\ + "mov %1, %%"REG_D" \n\t"\ + "mov (%%"REG_D"), %%"REG_D" \n\t"\ + "add %2, %%"REG_D" \n\t" + +#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movq (%%"REG_d"), %%"out_reg1" \n\t"\ + "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ + "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ + "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + "pmullw %%xmm0, %%"out_reg1" \n\t"\ + "pmullw %%xmm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movq (%%"REG_d"), %%"out_reg1" \n\t"\ + "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ + "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ + "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + "pmullw %%xmm0, %%"out_reg1" \n\t"\ + "pmullw %%xmm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ + snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ + "paddusw %%xmm2, %%xmm1 \n\t"\ + "paddusw %%xmm6, %%xmm5 \n\t" + +#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ + snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ + "paddusw %%xmm2, %%xmm1 \n\t"\ + "paddusw %%xmm6, %%xmm5 \n\t" + +#define snow_inner_add_yblock_sse2_end_common1\ + "add $32, %%"REG_S" \n\t"\ + "add %%"REG_c", %0 \n\t"\ + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ + "add %%"REG_c", (%%"REG_a") \n\t" + +#define snow_inner_add_yblock_sse2_end_common2\ + "jnz 1b \n\t"\ + :"+m"(dst8),"+m"(dst_array)\ + :\ + "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):\ + "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); + +#define snow_inner_add_yblock_sse2_end_8\ + "sal $1, %%"REG_c" \n\t"\ + "add $"PTR_SIZE"*2, %1 \n\t"\ + snow_inner_add_yblock_sse2_end_common1\ + "sar $1, %%"REG_c" \n\t"\ + "sub $2, %%"REG_b" \n\t"\ + snow_inner_add_yblock_sse2_end_common2 + +#define snow_inner_add_yblock_sse2_end_16\ + "add $"PTR_SIZE"*1, %1 \n\t"\ + snow_inner_add_yblock_sse2_end_common1\ + "dec %%"REG_b" \n\t"\ + snow_inner_add_yblock_sse2_end_common2 + +static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, + int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_sse2_header +snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") +snow_inner_add_yblock_sse2_accum_8("2", "8") +snow_inner_add_yblock_sse2_accum_8("1", "128") +snow_inner_add_yblock_sse2_accum_8("0", "136") + + "mov %0, %%"REG_d" \n\t" + "movdqa (%%"REG_D"), %%xmm0 \n\t" + "movdqa %%xmm1, %%xmm2 \n\t" + + "punpckhwd %%xmm7, %%xmm1 \n\t" + "punpcklwd %%xmm7, %%xmm2 \n\t" + "paddd %%xmm2, %%xmm0 \n\t" + "movdqa 16(%%"REG_D"), %%xmm2 \n\t" + "paddd %%xmm1, %%xmm2 \n\t" + "paddd %%xmm3, %%xmm0 \n\t" + "paddd %%xmm3, %%xmm2 \n\t" + + "mov %1, %%"REG_D" \n\t" + "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" + "add %2, %%"REG_D" \n\t" + + "movdqa (%%"REG_D"), %%xmm4 \n\t" + "movdqa %%xmm5, %%xmm6 \n\t" + "punpckhwd %%xmm7, %%xmm5 \n\t" + "punpcklwd %%xmm7, %%xmm6 \n\t" + "paddd %%xmm6, %%xmm4 \n\t" + "movdqa 16(%%"REG_D"), %%xmm6 \n\t" + "paddd %%xmm5, %%xmm6 \n\t" + "paddd %%xmm3, %%xmm4 \n\t" + "paddd %%xmm3, %%xmm6 \n\t" + + "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ + "packssdw %%xmm2, %%xmm0 \n\t" + "packuswb %%xmm7, %%xmm0 \n\t" + "movq %%xmm0, (%%"REG_d") \n\t" + + "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ + "packssdw %%xmm6, %%xmm4 \n\t" + "packuswb %%xmm7, %%xmm4 \n\t" + "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" +snow_inner_add_yblock_sse2_end_8 +} + +static void inner_add_yblock_bw_16_obmc_32_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, + int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_sse2_header +snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") +snow_inner_add_yblock_sse2_accum_16("2", "16") +snow_inner_add_yblock_sse2_accum_16("1", "512") +snow_inner_add_yblock_sse2_accum_16("0", "528") + + "mov %0, %%"REG_d" \n\t" + "movdqa %%xmm1, %%xmm0 \n\t" + "movdqa %%xmm5, %%xmm4 \n\t" + "punpcklwd %%xmm7, %%xmm0 \n\t" + "paddd (%%"REG_D"), %%xmm0 \n\t" + "punpckhwd %%xmm7, %%xmm1 \n\t" + "paddd 16(%%"REG_D"), %%xmm1 \n\t" + "punpcklwd %%xmm7, %%xmm4 \n\t" + "paddd 32(%%"REG_D"), %%xmm4 \n\t" + "punpckhwd %%xmm7, %%xmm5 \n\t" + "paddd 48(%%"REG_D"), %%xmm5 \n\t" + "paddd %%xmm3, %%xmm0 \n\t" + "paddd %%xmm3, %%xmm1 \n\t" + "paddd %%xmm3, %%xmm4 \n\t" + "paddd %%xmm3, %%xmm5 \n\t" + "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm1 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */ + + "packssdw %%xmm1, %%xmm0 \n\t" + "packssdw %%xmm5, %%xmm4 \n\t" + "packuswb %%xmm4, %%xmm0 \n\t" + + "movdqu %%xmm0, (%%"REG_d") \n\t" + +snow_inner_add_yblock_sse2_end_16 +} + +#define snow_inner_add_yblock_mmx_header \ + DWTELEM * * dst_array = sb->line + src_y;\ + asm volatile(\ + "mov %6, %%"REG_c" \n\t"\ + "mov %5, %%"REG_b" \n\t"\ + "mov %3, %%"REG_S" \n\t"\ + "pxor %%mm7, %%mm7 \n\t" /* 0 */\ + "pcmpeqd %%mm3, %%mm3 \n\t"\ + "pslld $31, %%mm3 \n\t"\ + "psrld $24, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ + "1: \n\t"\ + "mov %1, %%"REG_D" \n\t"\ + "mov (%%"REG_D"), %%"REG_D" \n\t"\ + "add %2, %%"REG_D" \n\t" + +#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ + "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ + "punpcklbw %%mm7, %%"out_reg1" \n\t"\ + "punpcklbw %%mm7, %%"out_reg2" \n\t"\ + "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ + "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "pmullw %%mm0, %%"out_reg1" \n\t"\ + "pmullw %%mm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ + snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ + "paddusw %%mm2, %%mm1 \n\t"\ + "paddusw %%mm6, %%mm5 \n\t" + +#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ + "mov %0, %%"REG_d" \n\t"\ + "movq %%mm1, %%mm0 \n\t"\ + "movq %%mm5, %%mm4 \n\t"\ + "punpcklwd %%mm7, %%mm0 \n\t"\ + "paddd "read_offset"(%%"REG_D"), %%mm0 \n\t"\ + "punpckhwd %%mm7, %%mm1 \n\t"\ + "paddd "read_offset"+8(%%"REG_D"), %%mm1 \n\t"\ + "punpcklwd %%mm7, %%mm4 \n\t"\ + "paddd "read_offset"+16(%%"REG_D"), %%mm4 \n\t"\ + "punpckhwd %%mm7, %%mm5 \n\t"\ + "paddd "read_offset"+24(%%"REG_D"), %%mm5 \n\t"\ + "paddd %%mm3, %%mm0 \n\t"\ + "paddd %%mm3, %%mm1 \n\t"\ + "paddd %%mm3, %%mm4 \n\t"\ + "paddd %%mm3, %%mm5 \n\t"\ + "psrad $8, %%mm0 \n\t"\ + "psrad $8, %%mm1 \n\t"\ + "psrad $8, %%mm4 \n\t"\ + "psrad $8, %%mm5 \n\t"\ +\ + "packssdw %%mm1, %%mm0 \n\t"\ + "packssdw %%mm5, %%mm4 \n\t"\ + "packuswb %%mm4, %%mm0 \n\t"\ + "movq %%mm0, "write_offset"(%%"REG_d") \n\t" + +#define snow_inner_add_yblock_mmx_end(s_step)\ + "add $"s_step", %%"REG_S" \n\t"\ + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ + "add %%"REG_c", (%%"REG_a") \n\t"\ + "add $"PTR_SIZE"*1, %1 \n\t"\ + "add %%"REG_c", %0 \n\t"\ + "dec %%"REG_b" \n\t"\ + "jnz 1b \n\t"\ + :"+m"(dst8),"+m"(dst_array)\ + :\ + "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):\ + "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); + +static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, + int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_mmx_header +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") +snow_inner_add_yblock_mmx_accum("2", "8", "0") +snow_inner_add_yblock_mmx_accum("1", "128", "0") +snow_inner_add_yblock_mmx_accum("0", "136", "0") +snow_inner_add_yblock_mmx_mix("0", "0") +snow_inner_add_yblock_mmx_end("16") +} + +static void inner_add_yblock_bw_16_obmc_32_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, + int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_mmx_header +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") +snow_inner_add_yblock_mmx_accum("2", "16", "0") +snow_inner_add_yblock_mmx_accum("1", "512", "0") +snow_inner_add_yblock_mmx_accum("0", "528", "0") +snow_inner_add_yblock_mmx_mix("0", "0") + +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") +snow_inner_add_yblock_mmx_accum("2", "24", "8") +snow_inner_add_yblock_mmx_accum("1", "520", "8") +snow_inner_add_yblock_mmx_accum("0", "536", "8") +snow_inner_add_yblock_mmx_mix("32", "8") +snow_inner_add_yblock_mmx_end("32") +} + +void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ + + if (b_w == 16) + inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else if (b_w == 8 && obmc_stride == 16) { + if (!(b_h & 1)) + inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else + inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + } else + ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); +} + +void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ + if (b_w == 16) + inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else if (b_w == 8 && obmc_stride == 16) + inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else + ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); +}