Mercurial > libavcodec.hg
comparison x86/fft_mmx.asm @ 12399:020540442072 libavcodec
Convert ff_imdct_half_sse() to yasm.
This is to avoid split asm sections that attempt to preserve some
registers between sections.
author | alexc |
---|---|
date | Sun, 22 Aug 2010 14:39:58 +0000 |
parents | 6f064ab48463 |
children | f61e22f8cf28 |
comparison
equal
deleted
inserted
replaced
12398:31736ceab3aa | 12399:020540442072 |
---|---|
26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results | 26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results |
27 ; in blocks as convenient to the vector size. | 27 ; in blocks as convenient to the vector size. |
28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) | 28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) |
29 | 29 |
30 %include "x86inc.asm" | 30 %include "x86inc.asm" |
31 | |
32 %ifdef ARCH_X86_64 | | ; 'pointer' reserves one pointer-sized slot: 8 bytes on x86_64...
33 %define pointer resq | |
34 %else | | ; ...and 4 bytes on x86_32, keeping struct offsets ABI-correct.
35 %define pointer resd | |
36 %endif | |
37 | |
38 struc FFTContext | | ; asm mirror of the C FFTContext layout; field order/sizes must match the C struct — TODO confirm against fft.h
39 .nbits: resd 1 | | ; log2 of the FFT size (used to index the dispatch table below)
40 .reverse: resd 1 | |
41 .revtab: pointer 1 | | ; bit-reversal permutation table (16-bit entries, per the movzx word loads below)
42 .tmpbuf: pointer 1 | |
43 .mdctsize: resd 1 | | ; used as a byte offset into the input/twiddle arrays in imdct_half_sse
44 .mdctbits: resd 1 | |
45 .tcos: pointer 1 | | ; cosine twiddle table for the MDCT pre/post rotation
46 .tsin: pointer 1 | | ; sine twiddle table for the MDCT pre/post rotation
47 endstruc | |
31 | 48 |
32 SECTION_RODATA | 49 SECTION_RODATA | ; read-only constants
33 | 50 |
34 %define M_SQRT1_2 0.70710678118654752440 | 51 %define M_SQRT1_2 0.70710678118654752440 | ; 1/sqrt(2)
35 ps_root2: times 4 dd M_SQRT1_2 | 52 ps_root2: times 4 dd M_SQRT1_2 | ; 4x packed-single 1/sqrt(2), 16-byte vector constant
36 %define SECTION_REL - $$ | 443 %define SECTION_REL - $$ | ; PIC: store dispatch-table entries relative to the section start ($$)...
37 %else | 444 %else |
38 %define SECTION_REL | 445 %define SECTION_REL | ; ...non-PIC: entries are absolute, no adjustment needed
39 %endif | 446 %endif |
430 | 447 |
448 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs | | ; %1 = cpu/suffix tag, %2 = nbits register name; calls the fft kernel for 2^%2 points
449 lea r2, [dispatch_tab%1] | | ; r2 = &dispatch_tab<suffix>
450 mov r2, [r2 + (%2q-2)*gprsize] | | ; table starts at nbits==2, hence the -2 index bias
451 %ifdef PIC | |
452 lea r3, [$$] | | ; PIC: entries are SECTION_REL (see %define above), so add the section base back
453 add r2, r3 | |
454 %endif | |
455 call r2 | | ; indirect call into the size-specific fft routine
456 %endmacro ; FFT_DISPATCH | |
457 | |
431 %macro DECL_FFT 2-3 ; nbits, cpu, suffix | 458 %macro DECL_FFT 2-3 ; nbits, cpu, suffix | ; builds dispatch_tab + the public fft_dispatch entry point for one cpu flavour
432 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL | 459 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL | ; accumulate kernel labels, smallest sizes first
433 %if %1==5 | 460 %if %1==5 |
434 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL | 461 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL | ; SSE flavours (nbits==5) also get a 16-point base case
435 %endif | 462 %endif |
436 section .text | 489 section .text | ; NOTE(review): macro middle (lines 436-461/463-488) is elided from this view
437 | 490 |
438 ; On x86_32, this function does the register saving and restoring for all of fft. | 491 ; On x86_32, this function does the register saving and restoring for all of fft. |
439 ; The others pass args in registers and don't spill anything. | 492 ; The others pass args in registers and don't spill anything. |
440 cglobal fft_dispatch%3%2, 2,5,8, z, nbits | 493 cglobal fft_dispatch%3%2, 2,5,8, z, nbits | ; 2 args, 5 GPRs, 8 XMMs; this changeset folds the inline dispatch into FFT_DISPATCH
441 lea r2, [dispatch_tab%3%2] | 494 FFT_DISPATCH %3%2, nbits | ; old open-coded lookup (left column) replaced by the shared macro (right column)
442 mov r2, [r2 + (nbitsq-2)*gprsize] | |
443 %ifdef PIC | |
444 lea r3, [$$] | |
445 add r2, r3 | |
446 %endif | |
447 call r2 | |
448 RET | 495 RET |
449 %endmacro ; DECL_FFT | 496 %endmacro ; DECL_FFT |
450 | 497 |
451 DECL_FFT 5, _sse | 498 DECL_FFT 5, _sse | ; instantiate dispatchers for SSE, 3DNow and 3DNow!ext,
452 DECL_FFT 5, _sse, _interleave | 499 DECL_FFT 5, _sse, _interleave | ; each with a plain and an _interleave output variant
453 DECL_FFT 4, _3dn | 500 DECL_FFT 4, _3dn |
454 DECL_FFT 4, _3dn, _interleave | 501 DECL_FFT 4, _3dn, _interleave |
455 DECL_FFT 4, _3dn2 | 502 DECL_FFT 4, _3dn2 |
456 DECL_FFT 4, _3dn2, _interleave | 503 DECL_FFT 4, _3dn2, _interleave |
483 | 504 |
505 INIT_XMM | | ; x86inc: select the XMM register set for the code below
506 %undef mulps | | ; drop any macro overrides of these mnemonics (presumably installed
507 %undef addps | | ; earlier for mmx/3dnow emulation — TODO confirm) so the code below
508 %undef subps | | ; emits the raw SSE instructions
509 %undef unpcklps | |
510 %undef unpckhps | |
511 | |
512 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 | | ; IMDCT pre-rotation: 4 complex inputs taken from both ends of the input, twiddled by (tcos,tsin); results left in xmm1 (low pair) / xmm0 (high pair)
513 movaps xmm0, [%3+%2*4] | | ; 4 floats from the forward-running end
514 movaps xmm1, [%3+%1*4-0x10] | | ; 4 floats from the mirrored (backward) end
515 movaps xmm2, xmm0 | |
516 shufps xmm0, xmm1, 0x88 | | ; gather even-index floats of both loads
517 shufps xmm1, xmm2, 0x77 | | ; gather odd-index floats (reversed order — see mask)
518 movlps xmm4, [%4+%2*2] | | ; two cosines from the forward end...
519 movlps xmm5, [%5+%2*2+0x0] | | ; ...and two sines
520 movhps xmm4, [%4+%1*2-0x8] | | ; two cosines from the mirrored end
521 movhps xmm5, [%5+%1*2-0x8] | |
522 movaps xmm2, xmm0 | |
523 movaps xmm3, xmm1 | |
524 mulps xmm0, xmm5 | | ; even*sin
525 mulps xmm1, xmm4 | | ; odd*cos
526 mulps xmm2, xmm4 | | ; even*cos
527 mulps xmm3, xmm5 | | ; odd*sin
528 subps xmm1, xmm0 | | ; xmm1 = odd*cos - even*sin
529 addps xmm2, xmm3 | | ; xmm2 = even*cos + odd*sin
530 movaps xmm0, xmm1 | |
531 unpcklps xmm1, xmm2 | | ; re-interleave the two result vectors into (re,im) pairs:
532 unpckhps xmm0, xmm2 | | ; xmm1 = low two pairs, xmm0 = high two pairs
533 %endmacro | |
534 | |
535 %macro PREROTATEW 3 ;addr1, addr2, xmm | | ; scatter one pre-rotated xmm: low 8 bytes (one complex) to %1, high 8 bytes to %2
536 movlps %1, %3 | |
537 movhps %2, %3 | |
538 %endmacro | |
539 | |
540 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 | | ; complex multiply of 4 FFT outputs by twiddles: %4=z base, %5=tcos, %6=tsin; returns %2/%3, clobbers xmm6/xmm7
541 movaps xmm6, [%4+%1*2] | | ; 4 real parts (block layout: {4x re, 4x im}, per header comment)
542 movaps %2, [%4+%1*2+0x10] | | ; 4 imaginary parts
543 movaps %3, xmm6 | |
544 movaps xmm7, %2 | |
545 mulps xmm6, [%5+%1*1] | | ; re*cos
546 mulps %2, [%6+%1*1] | | ; im*sin
547 mulps %3, [%6+%1*1] | | ; re*sin
548 mulps xmm7, [%5+%1*1] | | ; im*cos
549 subps %2, xmm6 | | ; %2 = im*sin - re*cos
550 addps %3, xmm7 | | ; %3 = re*sin + im*cos
551 %endmacro | |
552 | |
553 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 | | ; IMDCT post-rotation + shuffle, in place over z; %1 counts up from negative, %2 counts down, meeting in the middle
554 .post: | |
555 CMUL %1, xmm0, xmm1, %3, %4, %5 | | ; twiddle 4 complex values at offset j...
556 CMUL %2, xmm4, xmm5, %3, %4, %5 | | ; ...and 4 at the mirrored offset k
557 shufps xmm1, xmm1, 0x1b | | ; 0x1b reverses the 4 floats (mirror one half)
558 shufps xmm5, xmm5, 0x1b | |
559 movaps xmm6, xmm4 | |
560 unpckhps xmm4, xmm1 | | ; interleave the two halves back into (re,im) pair order
561 unpcklps xmm6, xmm1 | |
562 movaps xmm2, xmm0 | |
563 unpcklps xmm0, xmm5 | |
564 unpckhps xmm2, xmm5 | |
565 movaps [%3+%2*2], xmm6 | | ; store both ends back into z
566 movaps [%3+%2*2+0x10], xmm4 | |
567 movaps [%3+%1*2], xmm0 | |
568 movaps [%3+%1*2+0x10], xmm2 | |
569 sub %2, 0x10 | | ; k -= 16 bytes (4 floats)
570 add %1, 0x10 | | ; j += 16 bytes
571 jl .post | | ; loop while j is still negative (halves haven't met)
572 %endmacro | |
573 | |
574 cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input | | ; IMDCT half: pre-rotate input into output (bit-reversed), run the FFT in place, post-rotate
575 %ifdef ARCH_X86_64 | | ; x86_64: enough GPRs to keep all three table pointers live
576 %define rrevtab r10 | |
577 %define rtcos r11 | |
578 %define rtsin r12 | |
579 push r10 | | ; preserve the five extra GPRs used below (r13/r14 are
580 push r11 | | ; revtab temporaries in the .pre loop)
581 push r12 | |
582 push r13 | |
583 push r14 | |
584 %else | | ; x86_32: too few registers — rtsin deliberately aliases rrevtab (r6),
585 %define rrevtab r6 | | ; so all three pointers are spilled to the stack below and reloaded
586 %define rtsin r6 | | ; each iteration
587 %define rtcos r5 | |
588 %endif | |
589 mov r3d, [r0+FFTContext.mdctsize] | | ; r3 = mdctsize (byte offset)
590 add r2, r3 | | ; r2 = input + n4 (matches PREROTATER's 'input+n4' argument)
591 shr r3, 1 | |
592 mov rtcos, [r0+FFTContext.tcos] | |
593 mov rtsin, [r0+FFTContext.tsin] | |
594 add rtcos, r3 | | ; point twiddle tables at their midpoints (tcos+n8/tsin+n8)
595 add rtsin, r3 | |
596 %ifndef ARCH_X86_64 | |
597 push rtcos | | ; x86_32 spill layout after all pushes: [esp]=rrevtab,
598 push rtsin | | ; [esp+4]=rtsin, [esp+8]=rtcos (see unspill reads below)
599 %endif | |
600 shr r3, 1 | |
601 mov rrevtab, [r0+FFTContext.revtab] | |
602 add rrevtab, r3 | | ; revtab midpoint, indexed +/- like the twiddles
603 %ifndef ARCH_X86_64 | |
604 push rrevtab | |
605 %endif | |
606 | |
607 sub r3, 4 | | ; r3 = downward counter; r4 = -r3 mirrors it upward
608 %ifdef ARCH_X86_64 | |
609 xor r4, r4 | |
610 sub r4, r3 | | ; r4 = -r3 (hoisted out of the loop; PREROTATEW doesn't clobber r4 on x86_64)
611 %endif | |
612 .pre: | | ; pre-rotation loop: 4 complex values per end per iteration
613 %ifndef ARCH_X86_64 | |
614 ;unspill | |
615 xor r4, r4 | | ; x86_32 must recompute r4 and reload the twiddle pointers every
616 sub r4, r3 | | ; iteration, since r4/r5/r6 are clobbered by the scatter below
617 mov rtsin, [esp+4] | |
618 mov rtcos, [esp+8] | |
619 %endif | |
620 | |
621 PREROTATER r4, r3, r2, rtcos, rtsin | | ; results in xmm0/xmm1
622 %ifdef ARCH_X86_64 | |
623 movzx r5, word [rrevtab+r4*1-4] | | ; fetch four 16-bit bit-reversal indices...
624 movzx r6, word [rrevtab+r4*1-2] | |
625 movzx r13, word [rrevtab+r3*1] | |
626 movzx r14, word [rrevtab+r3*1+2] | |
627 PREROTATEW [r1+r5 *8], [r1+r6 *8], xmm0 | | ; ...and scatter the complex pairs (8 bytes each) into output
628 PREROTATEW [r1+r13*8], [r1+r14*8], xmm1 | |
629 add r4, 4 | |
630 %else | |
631 mov r6, [esp] | | ; r6 = rrevtab (unspill; aliasing forces the reload)
632 movzx r5, word [r6+r4*1-4] | |
633 movzx r4, word [r6+r4*1-2] | | ; note: r4 is consumed here and rebuilt at loop top
634 PREROTATEW [r1+r5*8], [r1+r4*8], xmm0 | |
635 movzx r5, word [r6+r3*1] | |
636 movzx r4, word [r6+r3*1+2] | |
637 PREROTATEW [r1+r5*8], [r1+r4*8], xmm1 | |
638 %endif | |
639 sub r3, 4 | |
640 jns .pre | | ; loop while r3 >= 0
641 | |
642 mov r5, r0 | | ; stash s and output across the FFT call (r5/r6 survive it here)
643 mov r6, r1 | |
644 mov r0, r1 | | ; FFT operates in place on the output buffer
645 mov r1d, [r5+FFTContext.nbits] | |
646 | |
647 FFT_DISPATCH _sse, r1 | | ; in-place FFT of 2^nbits points (clobbers 5 GPRs, 8 XMMs)
648 | |
649 mov r0d, [r5+FFTContext.mdctsize] | |
650 add r6, r0 | | ; r6 = output + n4 = z+n8 base for POSROTATESHUF
651 shr r0, 1 | |
652 %ifndef ARCH_X86_64 | |
653 %define rtcos r2 | | ; x86_32: retarget the twiddle aliases to now-free registers
654 %define rtsin r3 | |
655 mov rtcos, [esp+8] | | ; unspill (same layout as established above)
656 mov rtsin, [esp+4] | |
657 %endif | |
658 neg r0 | | ; r0 = -n8*2: upward counter for POSROTATESHUF
659 mov r1, -16 | |
660 sub r1, r0 | | ; r1 = -16 - r0: matching downward counter
661 POSROTATESHUF r0, r1, r6, rtcos, rtsin | |
662 %ifdef ARCH_X86_64 | |
663 pop r14 | | ; restore in reverse push order
664 pop r13 | |
665 pop r12 | |
666 pop r11 | |
667 pop r10 | |
668 %else | |
669 add esp, 12 | | ; discard the three spilled pointers
670 %endif | |
671 RET |