Mercurial > libavcodec.hg
changeset 10964:abb3b23bda35 libavcodec
Implement an SSE version of scalarproduct_float().
author | alexc |
---|---|
date | Fri, 22 Jan 2010 23:07:58 +0000 |
parents | 81033a080136 |
children | d27deb92257b |
files | x86/dsputil_mmx.c x86/dsputil_yasm.asm |
diffstat | 2 files changed, 29 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/x86/dsputil_mmx.c Fri Jan 22 19:07:44 2010 +0000
+++ b/x86/dsputil_mmx.c Fri Jan 22 23:07:58 2010 +0000
@@ -2510,6 +2510,8 @@
 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 
+float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
+
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
@@ -2965,6 +2967,9 @@
 c->vector_clipf = vector_clipf_sse;
 c->float_to_int16 = float_to_int16_sse;
 c->float_to_int16_interleave = float_to_int16_interleave_sse;
+#if HAVE_YASM
+ c->scalarproduct_float = ff_scalarproduct_float_sse;
+#endif
 }
 if(mm_flags & FF_MM_3DNOW)
 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
--- a/x86/dsputil_yasm.asm Fri Jan 22 19:07:44 2010 +0000
+++ b/x86/dsputil_yasm.asm Fri Jan 22 23:07:58 2010 +0000
@@ -397,3 +397,31 @@
 .unaligned:
     ADD_HFYU_LEFT_LOOP 0
+
+; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
+; Returns the dot product sum(v1[i] * v2[i]) over len floats.
+; Both vectors are read with aligned 16-byte accesses (movaps / mulps mem), so
+; they must be 16-byte aligned. The loop body runs before its exit test and
+; consumes 4 floats per pass, so len must be a positive multiple of 4.
+cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
+    shl     offsetd, 2              ; len -> byte count; the 32-bit write clears the
+                                    ; upper half on x86-64, where the high bits of
+                                    ; an int argument are not guaranteed to be set
+    add     v1q, offsetq            ; point both vectors at their end ...
+    add     v2q, offsetq
+    neg     offsetq                 ; ... and index upwards from -bytes towards 0
+    xorps   xmm0, xmm0              ; four parallel partial sums = 0
+.loop:
+    movaps  xmm1, [v1q+offsetq]
+    mulps   xmm1, [v2q+offsetq]
+    addps   xmm0, xmm1
+    add     offsetq, 16
+    js      .loop                   ; flags come from the add: loop while offset < 0
+    movhlps xmm1, xmm0              ; horizontal add: fold high pair onto low pair
+    addps   xmm0, xmm1
+    movss   xmm1, xmm0
+    shufps  xmm0, xmm0, 1           ; bring lane 1 down to lane 0
+    addss   xmm0, xmm1              ; xmm0[0] = total
+%ifndef ARCH_X86_64
+    movss   r0m, xmm0               ; movss, not movd: movd with an xmm operand is
+                                    ; SSE2 and this function is used on SSE1 CPUs
+    fld     dword r0m               ; 32-bit ABI returns float in st0; the first
+                                    ; argument's stack slot is reused as scratch
+%endif
+    RET