comparison x86/dsputil_yasm.asm @ 10964:abb3b23bda35 libavcodec

Implement an sse version of scalarproduct_float().
author alexc
date Fri, 22 Jan 2010 23:07:58 +0000
parents f0f34732208a
children 980030a3e315
comparison
equal deleted inserted replaced
10963:81033a080136 10964:abb3b23bda35
395 jnz .unaligned 395 jnz .unaligned
396 ADD_HFYU_LEFT_LOOP 1 396 ADD_HFYU_LEFT_LOOP 1
397 .unaligned: 397 .unaligned:
398 ADD_HFYU_LEFT_LOOP 0 398 ADD_HFYU_LEFT_LOOP 0
399 399
400
401 ; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len) -- returns the sum of v1[i]*v2[i]
402 cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset ; offset arrives holding len (element count)
403 shl offsetd, 2 ; len*4 = byte length; the 32-bit write also clears the undefined upper half of an int arg on x86-64
404 add v1q, offsetq ; v1 -> one past the last element
405 add v2q, offsetq ; v2 -> one past the last element
406 neg offsetq ; negative byte offset that counts up towards zero
407 xorps xmm0, xmm0 ; four partial sums = 0
408 .loop: ; body runs at least once and consumes 4 floats per pass: len must be a positive multiple of 4
409 movaps xmm1, [v1q+offsetq] ; aligned load: v1 must be 16-byte aligned
410 mulps xmm1, [v2q+offsetq] ; SSE memory operand: v2 must be 16-byte aligned as well
411 addps xmm0, xmm1 ; accumulate four products
412 add offsetq, 16
413 js .loop ; keep going while the offset is still negative
414 movhlps xmm1, xmm0 ; horizontal sum: fold the upper two lanes onto the lower two
415 addps xmm0, xmm1
416 movss xmm1, xmm0 ; keep lane 0
417 shufps xmm0, xmm0, 1 ; bring lane 1 down to lane 0
418 addss xmm0, xmm1 ; xmm0[0] = total
419 %ifndef ARCH_X86_64
420 movss r0m, xmm0 ; SSE1 store (movd from an xmm register needs SSE2); spill so the result can go to the x87 stack
421 fld dword r0m ; 32-bit builds hand the float result back in st0
422 %endif
423 RET