Mercurial > libavcodec.hg
comparison x86/dsputil_yasm.asm @ 10964:abb3b23bda35 libavcodec
Implement an SSE version of scalarproduct_float().
author | alexc |
---|---|
date | Fri, 22 Jan 2010 23:07:58 +0000 |
parents | f0f34732208a |
children | 980030a3e315 |
comparison
equal
deleted
inserted
replaced
10963:81033a080136 | 10964:abb3b23bda35 |
---|---|
395 jnz .unaligned | 395 jnz .unaligned |
396 ADD_HFYU_LEFT_LOOP 1 | 396 ADD_HFYU_LEFT_LOOP 1 |
397 .unaligned: | 397 .unaligned: |
398 ADD_HFYU_LEFT_LOOP 0 | 398 ADD_HFYU_LEFT_LOOP 0 |
399 | 399 |
400 | |
;-----------------------------------------------------------------------------
; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
;
; Returns the dot product of v1 and v2: the sum of v1[i] * v2[i] over len
; elements.
;
; Requirements visible from the code below:
;   - v1 and v2 must be 16-byte aligned (movaps load / mulps memory operand).
;   - Four floats are consumed per iteration and the loop body always runs
;     at least once, so len is expected to be a positive multiple of 4.
;
; Registers: v1q/v2q = one-past-the-end pointers, offsetq = negative byte
; offset counting up towards zero, xmm0 = four partial sums, xmm1 = scratch.
;
; Return value: xmm0 (low float); on x86-32 it is additionally pushed onto
; the x87 stack, which is where 32-bit C callers read a float result.
;-----------------------------------------------------------------------------
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    ; len is a 32-bit int: the upper half of its 64-bit register is not
    ; guaranteed to be clean on x86-64.  Scaling with a 32-bit shift
    ; zero-extends the result into the full register, so every later
    ; pointer-sized use of offsetq is well defined.
    shl     offsetd, 2              ; offset = len * sizeof(float)
    add     v1q, offsetq            ; v1 = &v1[len]
    add     v2q, offsetq            ; v2 = &v2[len]
    neg     offsetq                 ; offset = -len * sizeof(float)
    xorps   xmm0, xmm0              ; clear the four partial sums
.loop:
    movaps  xmm1, [v1q+offsetq]     ; load 4 floats of v1
    mulps   xmm1, [v2q+offsetq]     ; multiply by 4 floats of v2
    addps   xmm0, xmm1              ; accumulate per-lane
    add     offsetq, 16             ; advance by 4 floats
    js      .loop                   ; continue while offset is still negative

    ; Horizontal reduction of xmm0 = {s0, s1, s2, s3} into a single float.
    movhlps xmm1, xmm0              ; xmm1[0..1] = {s2, s3}
    addps   xmm0, xmm1              ; xmm0[0..1] = {s0+s2, s1+s3}
    movss   xmm1, xmm0              ; xmm1[0]    = s0+s2
    shufps  xmm0, xmm0, 1           ; xmm0[0]    = s1+s3
    addss   xmm0, xmm1              ; xmm0[0]    = s0+s1+s2+s3
%ifndef ARCH_X86_64
    ; 32-bit builds return floats in st0: bounce the result through the
    ; (no longer needed) first argument's stack slot.
    movd    r0m, xmm0
    fld     dword r0m
%endif
    RET