changeset 5594:384629ebcb93 libavcodec
avoid overflow in the 3rd lifting step; this now needs mmx2 at minimum
(patch for plain mmx support is welcome ...)
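The trick behind this change is the MMX2/SSE `pavgw` instruction: it computes an unsigned rounded average, (a + b + 1) >> 1, using a wider internal temporary, so the sum of two 16-bit coefficients never has to exist as a 16-bit value. That is also why the MMX path now requires MMX2. A minimal scalar sketch of the idea; the helper name pavgw_scalar and the sample values are illustrative, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of pavgw: widen before adding, which is what the
 * hardware does internally, so a + b cannot wrap around 16 bits. */
static uint16_t pavgw_scalar(uint16_t a, uint16_t b)
{
    return (uint16_t)(((uint32_t)a + (uint32_t)b + 1) >> 1);
}

int main(void)
{
    uint16_t a = 30000, b = 40000;              /* a + b = 70000 > 65535        */

    uint16_t wrapped = (uint16_t)(a + b) >> 1;  /* 16-bit sum wraps: gives 2232 */
    uint16_t avg     = pavgw_scalar(a, b);      /* rounded average: gives 35000 */

    printf("wrapped=%u  pavgw=%u\n", wrapped, avg);
    return 0;
}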
author | michael |
---|---|
date | Sun, 26 Aug 2007 01:11:02 +0000 |
parents | bd015f9ea964 |
children | 9da9e00a04a5 |
files | i386/dsputil_mmx.c i386/snowdsp_mmx.c |
diffstat | 2 files changed, 77 insertions(+), 30 deletions(-) |
--- a/i386/dsputil_mmx.c	Sat Aug 25 19:04:33 2007 +0000
+++ b/i386/dsputil_mmx.c	Sun Aug 26 01:11:02 2007 +0000
@@ -3627,8 +3627,10 @@
             c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
         }
         else{
+            if(mm_flags & MM_MMXEXT){
             c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
             c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+            }
             c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
         }
 #endif
--- a/i386/snowdsp_mmx.c	Sat Aug 25 19:04:33 2007 +0000
+++ b/i386/snowdsp_mmx.c	Sun Aug 26 01:11:02 2007 +0000
@@ -111,22 +111,29 @@
         i = 0;
         asm volatile(
-            "psllw $2, %%xmm7 \n\t"
+            "psllw $13, %%xmm7 \n\t"
+            "pcmpeqw %%xmm6, %%xmm6 \n\t"
+            "psrlw $13, %%xmm6 \n\t"
+            "paddw %%xmm7, %%xmm6 \n\t"
             ::);
         for(; i<w_l-15; i+=16){
             asm volatile(
-                "movdqu (%1), %%xmm1 \n\t"
-                "movdqu 16(%1), %%xmm5 \n\t"
-                "movdqu 2(%1), %%xmm0 \n\t"
-                "movdqu 18(%1), %%xmm4 \n\t" //FIXME try aligned reads and shifts
-                "paddw %%xmm1, %%xmm0 \n\t"
-                "paddw %%xmm5, %%xmm4 \n\t"
-                "paddw %%xmm7, %%xmm0 \n\t"
-                "paddw %%xmm7, %%xmm4 \n\t"
+                "movdqu (%1), %%xmm0 \n\t"
+                "movdqu 16(%1), %%xmm4 \n\t"
+                "movdqu 2(%1), %%xmm1 \n\t"
+                "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
+                "paddw %%xmm6, %%xmm0 \n\t"
+                "paddw %%xmm6, %%xmm4 \n\t"
+                "paddw %%xmm7, %%xmm1 \n\t"
+                "paddw %%xmm7, %%xmm5 \n\t"
+                "pavgw %%xmm1, %%xmm0 \n\t"
+                "pavgw %%xmm5, %%xmm4 \n\t"
+                "psubw %%xmm7, %%xmm0 \n\t"
+                "psubw %%xmm7, %%xmm4 \n\t"
+                "psraw $1, %%xmm0 \n\t"
+                "psraw $1, %%xmm4 \n\t"
                 "movdqa (%0), %%xmm1 \n\t"
                 "movdqa 16(%0), %%xmm5 \n\t"
-                "psraw $2, %%xmm0 \n\t"
-                "psraw $2, %%xmm4 \n\t"
                 "paddw %%xmm1, %%xmm0 \n\t"
                 "paddw %%xmm5, %%xmm4 \n\t"
                 "psraw $2, %%xmm0 \n\t"
@@ -288,18 +295,27 @@
         i = 1;
         b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
         asm volatile(
-            "psllw $2, %%mm7 \n\t"
+            "psllw $13, %%mm7 \n\t"
+            "pcmpeqw %%mm6, %%mm6 \n\t"
+            "psrlw $13, %%mm6 \n\t"
+            "paddw %%mm7, %%mm6 \n\t"
             ::);
         for(; i<w_l-7; i+=8){
             asm volatile(
                 "movq (%1), %%mm0 \n\t"
                 "movq 8(%1), %%mm4 \n\t"
-                "paddw 2(%1), %%mm0 \n\t"
-                "paddw 10(%1), %%mm4 \n\t"
-                "paddw %%mm7, %%mm0 \n\t"
-                "paddw %%mm7, %%mm4 \n\t"
-                "psraw $2, %%mm0 \n\t"
-                "psraw $2, %%mm4 \n\t"
+                "movq 2(%1), %%mm1 \n\t"
+                "movq 10(%1), %%mm5 \n\t"
+                "paddw %%mm6, %%mm0 \n\t"
+                "paddw %%mm6, %%mm4 \n\t"
+                "paddw %%mm7, %%mm1 \n\t"
+                "paddw %%mm7, %%mm5 \n\t"
+                "pavgw %%mm1, %%mm0 \n\t"
+                "pavgw %%mm5, %%mm4 \n\t"
+                "psubw %%mm7, %%mm0 \n\t"
+                "psubw %%mm7, %%mm4 \n\t"
+                "psraw $1, %%mm0 \n\t"
+                "psraw $1, %%mm4 \n\t"
                 "movq (%0), %%mm1 \n\t"
                 "movq 8(%0), %%mm5 \n\t"
                 "paddw %%mm1, %%mm0 \n\t"
@@ -467,16 +483,31 @@
         snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6")
         "mov %2, %%"REG_a" \n\t"
-        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
-        snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
+
+        "pcmpeqw %%xmm7, %%xmm7 \n\t"
+        "pcmpeqw %%xmm5, %%xmm5 \n\t"
+        "psllw $15, %%xmm7 \n\t"
+        "psrlw $13, %%xmm5 \n\t"
+        "paddw %%xmm7, %%xmm5 \n\t"
+        snow_vertical_compose_sse2_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+        "movq (%%"REG_a",%%"REG_d",2), %%xmm1 \n\t"
+        "movq 8(%%"REG_a",%%"REG_d",2), %%xmm3 \n\t"
+        "paddw %%xmm7, %%xmm1 \n\t"
+        "paddw %%xmm7, %%xmm3 \n\t"
+        "pavgw %%xmm1, %%xmm0 \n\t"
+        "pavgw %%xmm3, %%xmm2 \n\t"
+        "movq 16(%%"REG_a",%%"REG_d",2), %%xmm1 \n\t"
+        "movq 24(%%"REG_a",%%"REG_d",2), %%xmm3 \n\t"
+        "paddw %%xmm7, %%xmm1 \n\t"
+        "paddw %%xmm7, %%xmm3 \n\t"
+        "pavgw %%xmm1, %%xmm4 \n\t"
+        "pavgw %%xmm3, %%xmm6 \n\t"
+        snow_vertical_compose_sse2_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
-        "pcmpeqd %%xmm1, %%xmm1 \n\t"
-        "psllw $15, %%xmm1 \n\t"
-        "psrlw $14, %%xmm1 \n\t"
         "mov %1, %%"REG_S" \n\t"
-        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
@@ -569,16 +600,30 @@
         snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6")
         "mov %2, %%"REG_a" \n\t"
-        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
-        snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
+        "pcmpeqw %%mm7, %%mm7 \n\t"
+        "pcmpeqw %%mm5, %%mm5 \n\t"
+        "psllw $15, %%mm7 \n\t"
+        "psrlw $13, %%mm5 \n\t"
+        "paddw %%mm7, %%mm5 \n\t"
+        snow_vertical_compose_mmx_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+        "movq (%%"REG_a",%%"REG_d",2), %%mm1 \n\t"
+        "movq 8(%%"REG_a",%%"REG_d",2), %%mm3 \n\t"
+        "paddw %%mm7, %%mm1 \n\t"
+        "paddw %%mm7, %%mm3 \n\t"
+        "pavgw %%mm1, %%mm0 \n\t"
+        "pavgw %%mm3, %%mm2 \n\t"
+        "movq 16(%%"REG_a",%%"REG_d",2), %%mm1 \n\t"
+        "movq 24(%%"REG_a",%%"REG_d",2), %%mm3 \n\t"
+        "paddw %%mm7, %%mm1 \n\t"
+        "paddw %%mm7, %%mm3 \n\t"
+        "pavgw %%mm1, %%mm4 \n\t"
+        "pavgw %%mm3, %%mm6 \n\t"
+        snow_vertical_compose_sse2_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
-        "pcmpeqw %%mm1, %%mm1 \n\t"
-        "psllw $15, %%mm1 \n\t"
-        "psrlw $14, %%mm1 \n\t"
         "mov %1, %%"REG_S" \n\t"
-        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
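One more detail of the new code above: pavgw averages unsigned words, while the wavelet coefficients are signed, so the patch first re-centers the operands with a bias (the constant built with pcmpeqw/psllw $15 in mm7/xmm7 for the vertical compose), averages, then subtracts the bias before the final shift. A rough scalar model of that bias/average/un-bias pattern; signed_avg, pavgw_scalar and BIAS are illustrative names, and the real code keeps these constants in mm/xmm registers and also folds the lifting-step rounding offset into them:

#include <stdint.h>

static uint16_t pavgw_scalar(uint16_t a, uint16_t b)
{
    return (uint16_t)(((uint32_t)a + (uint32_t)b + 1) >> 1);
}

/* Rounded average of two signed 16-bit coefficients without any 17-bit
 * intermediate: shift both into unsigned range, average with pavgw,
 * shift back.  Equals (a + b + 1) >> 1 for all int16_t inputs. */
static int16_t signed_avg(int16_t a, int16_t b)
{
    const uint16_t BIAS = 0x8000;                 /* the pcmpeqw/psllw $15 constant */
    uint16_t ua = (uint16_t)((uint16_t)a + BIAS); /* re-center into unsigned range  */
    uint16_t ub = (uint16_t)((uint16_t)b + BIAS);
    return (int16_t)(pavgw_scalar(ua, ub) - BIAS);
}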