Mercurial > libavcodec.hg
changeset 3569:c42c03f3b402 libavcodec
convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
please complain if they are slower on sse2 cpus ...
author:   michael
date:     Thu, 10 Aug 2006 20:24:58 +0000
parents:  945caa35ee9a
children: 991ef6ade276
files:    i386/dsputil_mmx.c
diffstat: 1 files changed, 22 insertions(+), 22 deletions(-) [+]
line wrap: on
line diff
--- a/i386/dsputil_mmx.c	Thu Aug 10 19:06:25 2006 +0000
+++ b/i386/dsputil_mmx.c	Thu Aug 10 20:24:58 2006 +0000
@@ -2820,12 +2820,14 @@
     );
     asm volatile("femms");
 }
-static void vector_fmul_reverse_sse2(float *dst, const float *src0, const float *src1, int len){
+static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
     long i = len*4-32;
     asm volatile(
         "1: \n\t"
-        "pshufd $0x1b, 16(%1), %%xmm0 \n\t"
-        "pshufd $0x1b, (%1), %%xmm1 \n\t"
+        "movaps 16(%1), %%xmm0 \n\t"
+        "movaps (%1), %%xmm1 \n\t"
+        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
+        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
         "mulps (%3,%0), %%xmm0 \n\t"
         "mulps 16(%3,%0), %%xmm1 \n\t"
         "movaps %%xmm0, (%2,%0) \n\t"
@@ -2882,7 +2884,7 @@
         ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
     asm volatile("femms");
 }
-static void vector_fmul_add_add_sse2(float *dst, const float *src0, const float *src1,
+static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                      const float *src2, float src3, int len, int step){
     long i;
     if(step == 2 && src3 == 0){
@@ -2896,20 +2898,20 @@
             "mulps 16(%3,%0), %%xmm1 \n\t"
             "addps (%4,%0), %%xmm0 \n\t"
             "addps 16(%4,%0), %%xmm1 \n\t"
-            "movd %%xmm0, (%1) \n\t"
-            "movd %%xmm1, 32(%1) \n\t"
-            "psrldq $4, %%xmm0 \n\t"
-            "psrldq $4, %%xmm1 \n\t"
-            "movd %%xmm0, 8(%1) \n\t"
-            "movd %%xmm1, 40(%1) \n\t"
-            "psrldq $4, %%xmm0 \n\t"
-            "psrldq $4, %%xmm1 \n\t"
-            "movd %%xmm0, 16(%1) \n\t"
-            "movd %%xmm1, 48(%1) \n\t"
-            "psrldq $4, %%xmm0 \n\t"
-            "psrldq $4, %%xmm1 \n\t"
-            "movd %%xmm0, 24(%1) \n\t"
-            "movd %%xmm1, 56(%1) \n\t"
+            "movss %%xmm0, (%1) \n\t"
+            "movss %%xmm1, 32(%1) \n\t"
+            "movhlps %%xmm0, %%xmm2 \n\t"
+            "movhlps %%xmm1, %%xmm3 \n\t"
+            "movss %%xmm2, 16(%1) \n\t"
+            "movss %%xmm3, 48(%1) \n\t"
+            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
+            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
+            "movss %%xmm0, 8(%1) \n\t"
+            "movss %%xmm1, 40(%1) \n\t"
+            "movhlps %%xmm0, %%xmm2 \n\t"
+            "movhlps %%xmm1, %%xmm3 \n\t"
+            "movss %%xmm2, 24(%1) \n\t"
+            "movss %%xmm3, 56(%1) \n\t"
             "sub $64, %1 \n\t"
             "sub $32, %0 \n\t"
             "jge 1b \n\t"
@@ -3403,10 +3405,8 @@
         c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
         c->vector_fmul = vector_fmul_sse;
         c->float_to_int16 = float_to_int16_sse;
-    }
-    if(mm_flags & MM_SSE2){
-        c->vector_fmul_reverse = vector_fmul_reverse_sse2;
-        c->vector_fmul_add_add = vector_fmul_add_add_sse2;
+        c->vector_fmul_reverse = vector_fmul_reverse_sse;
+        c->vector_fmul_add_add = vector_fmul_add_add_sse;
     }
     if(mm_flags & MM_3DNOW)
         c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse2