Mercurial > mplayer.hg
comparison mp3lib/dct64_3dnow.s @ 1271:2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
author | nick |
---|---|
date | Wed, 04 Jul 2001 09:47:56 +0000 |
parents | 03b7e2955a20 |
children | 3a9699d9e7da |
comparison
equal
deleted
inserted
replaced
1270:8a9fa696b77d | 1271:2864e32cd267 |
---|---|
1 # This code was taken from http://www.mpg123.org | 1 # This code was taken from http://www.mpg123.org |
2 # See ChangeLog of mpg123-0.59s-pre.1 for detail | 2 # See ChangeLog of mpg123-0.59s-pre.1 for detail |
3 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> | 3 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> |
4 # Partial 3dnow! optimization by Nick Kurshev | 4 # Partial 3dnow! optimization by Nick Kurshev |
5 # | 5 # |
6 # TODO: finish 3dnow! optimization at least in scalar mode | 6 # TODO: optimize scalar 3dnow! code |
7 # Warning: Phases 7 & 8 are not tested | |
7 # | 8 # |
8 | |
9 .data | |
10 .align 8 | |
11 plus_minus_3dnow: .long 0x00000000, 0x80000000 | |
12 costab: | |
13 .long 1056974725 | |
14 .long 1057056395 | |
15 .long 1057223771 | |
16 .long 1057485416 | |
17 .long 1057855544 | |
18 .long 1058356026 | |
19 .long 1059019886 | |
20 .long 1059897405 | |
21 .long 1061067246 | |
22 .long 1062657950 | |
23 .long 1064892987 | |
24 .long 1066774581 | |
25 .long 1069414683 | |
26 .long 1073984175 | |
27 .long 1079645762 | |
28 .long 1092815430 | |
29 .long 1057005197 | |
30 .long 1057342072 | |
31 .long 1058087743 | |
32 .long 1059427869 | |
33 .long 1061799040 | |
34 .long 1065862217 | |
35 .long 1071413542 | |
36 .long 1084439708 | |
37 .long 1057128951 | |
38 .long 1058664893 | |
39 .long 1063675095 | |
40 .long 1076102863 | |
41 .long 1057655764 | |
42 .long 1067924853 | |
43 .long 1060439283 | |
44 | 9 |
45 .text | 10 .text |
46 | 11 |
47 .align 16 | 12 .align 16 |
48 | 13 |
51 pushl %ebx | 16 pushl %ebx |
52 pushl %esi | 17 pushl %esi |
53 pushl %edi | 18 pushl %edi |
54 subl $256,%esp | 19 subl $256,%esp |
55 movl 280(%esp),%eax | 20 movl 280(%esp),%eax |
56 | |
57 leal 128(%esp),%edx | 21 leal 128(%esp),%edx |
58 movl 272(%esp),%esi | 22 movl 272(%esp),%esi |
59 movl 276(%esp),%edi | 23 movl 276(%esp),%edi |
60 movl $costab,%ebx | 24 movl $costab_mmx,%ebx |
61 orl %ecx,%ecx | 25 orl %ecx,%ecx |
62 movl %esp,%ecx | 26 movl %esp,%ecx |
63 femms | 27 |
64 /* Phase 1*/ | 28 /* Phase 1*/ |
65 movq (%eax), %mm0 | 29 movq (%eax), %mm0 |
66 movq 8(%eax), %mm4 | 30 movq 8(%eax), %mm4 |
67 movq %mm0, %mm3 | 31 movq %mm0, %mm3 |
68 movq %mm4, %mm7 | 32 movq %mm4, %mm7 |
533 psrlq $32, %mm7 | 497 psrlq $32, %mm7 |
534 movd %mm3, 104(%ecx) | 498 movd %mm3, 104(%ecx) |
535 movd %mm7, 120(%ecx) | 499 movd %mm7, 120(%ecx) |
536 | 500 |
537 /* Phase 6. This is the end of the easy road. */ | 501 /* Phase 6. This is the end of the easy road. */ |
538 movl $1, %eax | 502 /* Code below is coded in scalar mode. Should be optimized */ |
539 movd %eax, %mm7 | 503 |
540 pi2fd %mm7, %mm7 | 504 movd 32(%ecx), %mm0 |
541 movq 32(%ecx), %mm0 | 505 pfadd 36(%ecx), %mm0 |
542 punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */ | 506 movd %mm0, 32(%edx) |
507 | |
508 movd 32(%ecx), %mm0 | |
509 pfsub 36(%ecx), %mm0 | |
510 pfmul 120(%ebx),%mm0 | |
511 movd %mm0, 36(%edx) | |
512 | |
513 movd 44(%ecx), %mm0 | |
514 pfsub 40(%ecx), %mm0 | |
515 pfmul 120(%ebx),%mm0 | |
516 | |
517 movd %mm0, 44(%edx) | |
518 pfadd 40(%ecx), %mm0 | |
519 pfadd 44(%ecx), %mm0 | |
520 movd %mm0, 40(%edx) | |
521 | |
522 movd 48(%ecx), %mm3 | |
523 pfsub 52(%ecx), %mm3 | |
524 pfmul 120(%ebx), %mm3 | |
525 | |
526 movd 60(%ecx), %mm2 | |
527 pfsub 56(%ecx), %mm2 | |
528 pfmul 120(%ebx), %mm2 | |
529 movq %mm2, %mm1 | |
530 | |
531 pfadd 56(%ecx), %mm1 | |
532 pfadd 60(%ecx), %mm1 | |
533 movq %mm1, %mm0 | |
534 | |
535 pfadd 48(%ecx), %mm0 | |
536 pfadd 52(%ecx), %mm0 | |
537 movd %mm0, 48(%edx) | |
538 pfadd %mm3, %mm1 | |
539 movd %mm1, 56(%edx) | |
540 movd %mm2, 60(%edx) | |
541 pfadd %mm3, %mm2 | |
542 movd %mm2, 52(%edx) | |
543 | |
544 /*---*/ | |
545 movd 64(%ecx), %mm0 | |
546 pfadd 68(%ecx), %mm0 | |
547 movd %mm0, 64(%edx) | |
548 | |
549 movd 64(%ecx), %mm0 | |
550 pfsub 68(%ecx), %mm0 | |
551 pfmul 120(%ebx), %mm0 | |
552 movd %mm0, 68(%edx) | |
553 | |
554 movd 76(%ecx), %mm0 | |
555 pfsub 72(%ecx), %mm0 | |
556 pfmul 120(%ebx), %mm0 | |
557 movd %mm0, 76(%edx) | |
558 pfadd 72(%ecx), %mm0 | |
559 pfadd 76(%ecx), %mm0 | |
560 movd %mm0, 72(%edx) | |
561 | |
562 movd 92(%ecx), %mm0 | |
563 pfsub 88(%ecx), %mm0 | |
564 pfmul 120(%ebx), %mm0 | |
565 movd %mm0, 92(%edx) | |
566 pfadd 92(%ecx), %mm0 | |
567 pfadd 88(%ecx), %mm0 | |
568 movq %mm0, %mm1 | |
569 | |
570 pfadd 80(%ecx), %mm0 | |
571 pfadd 84(%ecx), %mm0 | |
572 movd %mm0, 80(%edx) | |
573 | |
574 movd 80(%ecx), %mm0 | |
575 pfsub 84(%ecx), %mm0 | |
576 pfmul 120(%ebx), %mm0 | |
577 pfadd %mm0, %mm1 | |
578 pfadd 92(%edx), %mm0 | |
579 movd %mm0, 84(%edx) | |
580 movd %mm1, 88(%edx) | |
581 | |
582 movd 96(%ecx), %mm0 | |
583 pfadd 100(%ecx), %mm0 | |
584 movd %mm0, 96(%edx) | |
585 | |
586 movd 96(%ecx), %mm0 | |
587 pfsub 100(%ecx), %mm0 | |
588 pfmul 120(%ebx), %mm0 | |
589 movd %mm0, 100(%edx) | |
590 | |
591 movd 108(%ecx), %mm0 | |
592 pfsub 104(%ecx), %mm0 | |
593 pfmul 120(%ebx), %mm0 | |
594 movd %mm0, 108(%edx) | |
595 pfadd 104(%ecx), %mm0 | |
596 pfadd 108(%ecx), %mm0 | |
597 movd %mm0, 104(%edx) | |
598 | |
599 movd 124(%ecx), %mm0 | |
600 pfsub 120(%ecx), %mm0 | |
601 pfmul 120(%ebx), %mm0 | |
602 movd %mm0, 124(%edx) | |
603 pfadd 120(%ecx), %mm0 | |
604 pfadd 124(%ecx), %mm0 | |
605 movq %mm0, %mm1 | |
606 | |
607 pfadd 112(%ecx), %mm0 | |
608 pfadd 116(%ecx), %mm0 | |
609 movd %mm0, 112(%edx) | |
610 | |
611 movd 112(%ecx), %mm0 | |
612 pfsub 116(%ecx), %mm0 | |
613 pfmul 120(%ebx), %mm0 | |
614 pfadd %mm0,%mm1 | |
615 pfadd 124(%edx), %mm0 | |
616 movd %mm0, 116(%edx) | |
617 movd %mm1, 120(%edx) | |
618 | |
619 jnz .L01 | |
620 | |
621 /* Phase 7*/ | |
622 /* Code below is coded in scalar mode. Should be optimized */ | |
623 | |
624 movd (%ecx), %mm0 | |
625 pfadd 4(%ecx), %mm0 | |
626 movd %mm0, 1024(%esi) | |
627 | |
628 movd (%ecx), %mm0 | |
629 pfsub 4(%ecx), %mm0 | |
630 pfmul 120(%ebx), %mm0 | |
631 movd %mm0, (%esi) | |
632 movd %mm0, (%edi) | |
633 | |
634 movd 12(%ecx), %mm0 | |
635 pfsub 8(%ecx), %mm0 | |
636 pfmul 120(%ebx), %mm0 | |
637 movd %mm0, 512(%edi) | |
638 pfadd 12(%ecx), %mm0 | |
639 pfadd 8(%ecx), %mm0 | |
640 movd %mm0, 512(%esi) | |
641 | |
642 movd 16(%ecx), %mm0 | |
643 pfsub 20(%ecx), %mm0 | |
644 pfmul 120(%ebx), %mm0 | |
645 movq %mm0, %mm3 | |
646 | |
647 movd 28(%ecx), %mm0 | |
648 pfsub 24(%ecx), %mm0 | |
649 pfmul 120(%ebx), %mm0 | |
650 movd %mm0, 768(%edi) | |
651 movq %mm0, %mm2 | |
652 | |
653 pfadd 24(%ecx), %mm0 | |
654 pfadd 28(%ecx), %mm0 | |
543 movq %mm0, %mm1 | 655 movq %mm0, %mm1 |
544 movq plus_minus_3dnow, %mm6 | 656 |
545 /* n.b.: pfpnacc */ | 657 pfadd 16(%ecx), %mm0 |
546 pxor %mm6, %mm1 | 658 pfadd 20(%ecx), %mm0 |
547 pfacc %mm1, %mm0 | 659 movd %mm0, 768(%esi) |
548 /**/ | 660 pfadd %mm3, %mm1 |
549 pfmul %mm7, %mm0 | 661 movd %mm1, 256(%esi) |
550 movq %mm0, 32(%edx) | 662 pfadd %mm3, %mm2 |
551 femms | 663 movd %mm2, 256(%edi) |
552 | |
553 flds 44(%ecx) | |
554 fsubs 40(%ecx) | |
555 fmuls 120(%ebx) | |
556 | |
557 fsts 44(%edx) | |
558 fadds 40(%ecx) /* pfacc 40(ecx), 56(%ecx) */ | |
559 fadds 44(%ecx) | |
560 fstps 40(%edx) | |
561 | |
562 flds 48(%ecx) | |
563 fsubs 52(%ecx) | |
564 fmuls 120(%ebx) | |
565 | |
566 flds 60(%ecx) | |
567 fsubs 56(%ecx) | |
568 fmuls 120(%ebx) | |
569 | |
570 fld %st(0) | |
571 fadds 56(%ecx) | |
572 fadds 60(%ecx) | |
573 | |
574 fld %st(0) | |
575 fadds 48(%ecx) | |
576 fadds 52(%ecx) | |
577 fstps 48(%edx) | |
578 fadd %st(2) | |
579 fstps 56(%edx) | |
580 fsts 60(%edx) | |
581 faddp %st(1) | |
582 fstps 52(%edx) | |
583 /*---*/ | |
584 flds 64(%ecx) | |
585 fadds 68(%ecx) | |
586 fstps 64(%edx) | |
587 | |
588 flds 64(%ecx) | |
589 fsubs 68(%ecx) | |
590 fmuls 120(%ebx) | |
591 fstps 68(%edx) | |
592 | |
593 flds 76(%ecx) | |
594 fsubs 72(%ecx) | |
595 fmuls 120(%ebx) | |
596 fsts 76(%edx) | |
597 fadds 72(%ecx) | |
598 fadds 76(%ecx) | |
599 fstps 72(%edx) | |
600 | |
601 flds 92(%ecx) | |
602 fsubs 88(%ecx) | |
603 fmuls 120(%ebx) | |
604 fsts 92(%edx) | |
605 fadds 92(%ecx) | |
606 fadds 88(%ecx) | |
607 | |
608 fld %st(0) | |
609 fadds 80(%ecx) | |
610 fadds 84(%ecx) | |
611 fstps 80(%edx) | |
612 | |
613 flds 80(%ecx) | |
614 fsubs 84(%ecx) | |
615 fmuls 120(%ebx) | |
616 fadd %st(0), %st(1) | |
617 fadds 92(%edx) | |
618 fstps 84(%edx) | |
619 fstps 88(%edx) | |
620 | |
621 flds 96(%ecx) | |
622 fadds 100(%ecx) | |
623 fstps 96(%edx) | |
624 | |
625 flds 96(%ecx) | |
626 fsubs 100(%ecx) | |
627 fmuls 120(%ebx) | |
628 fstps 100(%edx) | |
629 | |
630 flds 108(%ecx) | |
631 fsubs 104(%ecx) | |
632 fmuls 120(%ebx) | |
633 fsts 108(%edx) | |
634 fadds 104(%ecx) | |
635 fadds 108(%ecx) | |
636 fstps 104(%edx) | |
637 | |
638 flds 124(%ecx) | |
639 fsubs 120(%ecx) | |
640 fmuls 120(%ebx) | |
641 fsts 124(%edx) | |
642 fadds 120(%ecx) | |
643 fadds 124(%ecx) | |
644 | |
645 fld %st(0) | |
646 fadds 112(%ecx) | |
647 fadds 116(%ecx) | |
648 fstps 112(%edx) | |
649 | |
650 flds 112(%ecx) | |
651 fsubs 116(%ecx) | |
652 fmuls 120(%ebx) | |
653 fadd %st(0),%st(1) | |
654 fadds 124(%edx) | |
655 fstps 116(%edx) | |
656 fstps 120(%edx) | |
657 jnz .L01 | |
658 | |
659 /* Phase 7*/ | |
660 | |
661 flds (%ecx) | |
662 fadds 4(%ecx) | |
663 fstps 1024(%esi) | |
664 | |
665 flds (%ecx) | |
666 fsubs 4(%ecx) | |
667 fmuls 120(%ebx) | |
668 fsts (%esi) | |
669 fstps (%edi) | |
670 | |
671 flds 12(%ecx) | |
672 fsubs 8(%ecx) | |
673 fmuls 120(%ebx) | |
674 fsts 512(%edi) | |
675 fadds 12(%ecx) | |
676 fadds 8(%ecx) | |
677 fstps 512(%esi) | |
678 | |
679 flds 16(%ecx) | |
680 fsubs 20(%ecx) | |
681 fmuls 120(%ebx) | |
682 | |
683 flds 28(%ecx) | |
684 fsubs 24(%ecx) | |
685 fmuls 120(%ebx) | |
686 fsts 768(%edi) | |
687 fld %st(0) | |
688 fadds 24(%ecx) | |
689 fadds 28(%ecx) | |
690 fld %st(0) | |
691 fadds 16(%ecx) | |
692 fadds 20(%ecx) | |
693 fstps 768(%esi) | |
694 fadd %st(2) | |
695 fstps 256(%esi) | |
696 faddp %st(1) | |
697 fstps 256(%edi) | |
698 | 664 |
699 /* Phase 8*/ | 665 /* Phase 8*/ |
700 | 666 |
701 flds 32(%edx) | 667 movq 32(%edx), %mm0 |
702 fadds 48(%edx) | 668 movq 48(%edx), %mm1 |
703 fstps 896(%esi) | 669 pfadd 48(%edx), %mm0 |
704 | 670 pfadd 40(%edx), %mm1 |
705 flds 48(%edx) | 671 movd %mm0, 896(%esi) |
706 fadds 40(%edx) | 672 movd %mm1, 640(%esi) |
707 fstps 640(%esi) | 673 psrlq $32, %mm0 |
708 | 674 psrlq $32, %mm1 |
709 flds 40(%edx) | 675 movd %mm0, 128(%edi) |
710 fadds 56(%edx) | 676 movd %mm1, 384(%edi) |
711 fstps 384(%esi) | 677 |
712 | 678 movd 40(%edx), %mm0 |
713 flds 56(%edx) | 679 pfadd 56(%edx), %mm0 |
714 fadds 36(%edx) | 680 movd %mm0, 384(%esi) |
715 fstps 128(%esi) | 681 |
716 | 682 movd 56(%edx), %mm0 |
717 flds 36(%edx) | 683 pfadd 36(%edx), %mm0 |
718 fadds 52(%edx) | 684 movd %mm0, 128(%esi) |
719 fstps 128(%edi) | 685 |
720 | 686 movd 60(%edx), %mm0 |
721 flds 52(%edx) | 687 movd %mm0, 896(%edi) |
722 fadds 44(%edx) | 688 pfadd 44(%edx), %mm0 |
723 fstps 384(%edi) | 689 movd %mm0, 640(%edi) |
724 | 690 |
725 flds 60(%edx) | 691 movq 96(%edx), %mm0 |
726 fsts 896(%edi) | 692 movq 112(%edx), %mm2 |
727 fadds 44(%edx) | 693 movq 104(%edx), %mm4 |
728 fstps 640(%edi) | 694 pfadd 112(%edx), %mm0 |
729 | 695 pfadd 104(%edx), %mm2 |
730 flds 96(%edx) | 696 pfadd 120(%edx), %mm4 |
731 fadds 112(%edx) | 697 movq %mm0, %mm1 |
732 fld %st(0) | 698 movq %mm2, %mm3 |
733 fadds 64(%edx) | 699 movq %mm4, %mm5 |
734 fstps 960(%esi) | 700 pfadd 64(%edx), %mm0 |
735 fadds 80(%edx) | 701 pfadd 80(%edx), %mm2 |
736 fstps 832(%esi) | 702 pfadd 72(%edx), %mm4 |
737 | 703 movd %mm0, 960(%esi) |
738 flds 112(%edx) | 704 movd %mm2, 704(%esi) |
739 fadds 104(%edx) | 705 movd %mm4, 448(%esi) |
740 fld %st(0) | 706 psrlq $32, %mm0 |
741 fadds 80(%edx) | 707 psrlq $32, %mm2 |
742 fstps 704(%esi) | 708 psrlq $32, %mm4 |
743 fadds 72(%edx) | 709 movd %mm0, 64(%edi) |
744 fstps 576(%esi) | 710 movd %mm2, 320(%edi) |
745 | 711 movd %mm4, 576(%edi) |
746 flds 104(%edx) | 712 pfadd 80(%edx), %mm1 |
747 fadds 120(%edx) | 713 pfadd 72(%edx), %mm3 |
748 fld %st(0) | 714 pfadd 88(%edx), %mm5 |
749 fadds 72(%edx) | 715 movd %mm1, 832(%esi) |
750 fstps 448(%esi) | 716 movd %mm3, 576(%esi) |
751 fadds 88(%edx) | 717 movd %mm5, 320(%esi) |
752 fstps 320(%esi) | 718 psrlq $32, %mm1 |
753 | 719 psrlq $32, %mm3 |
754 flds 120(%edx) | 720 psrlq $32, %mm5 |
755 fadds 100(%edx) | 721 movd %mm1, 192(%edi) |
756 fld %st(0) | 722 movd %mm3, 448(%edi) |
757 fadds 88(%edx) | 723 movd %mm5, 704(%edi) |
758 fstps 192(%esi) | 724 |
759 fadds 68(%edx) | 725 movd 120(%edx), %mm0 |
760 fstps 64(%esi) | 726 pfadd 100(%edx), %mm0 |
761 | 727 movq %mm0, %mm1 |
762 flds 100(%edx) | 728 pfadd 88(%edx), %mm0 |
763 fadds 116(%edx) | 729 movd %mm0, 192(%esi) |
764 fld %st(0) | 730 pfadd 68(%edx), %mm1 |
765 fadds 68(%edx) | 731 movd %mm1, 64(%esi) |
766 fstps 64(%edi) | 732 |
767 fadds 84(%edx) | 733 movd 124(%edx), %mm0 |
768 fstps 192(%edi) | 734 movd %mm0, 960(%edi) |
769 | 735 pfadd 92(%edx), %mm0 |
770 flds 116(%edx) | 736 movd %mm0, 832(%edi) |
771 fadds 108(%edx) | 737 |
772 fld %st(0) | |
773 fadds 84(%edx) | |
774 fstps 320(%edi) | |
775 fadds 76(%edx) | |
776 fstps 448(%edi) | |
777 | |
778 flds 108(%edx) | |
779 fadds 124(%edx) | |
780 fld %st(0) | |
781 fadds 76(%edx) | |
782 fstps 576(%edi) | |
783 fadds 92(%edx) | |
784 fstps 704(%edi) | |
785 | |
786 flds 124(%edx) | |
787 fsts 960(%edi) | |
788 fadds 92(%edx) | |
789 fstps 832(%edi) | |
790 jmp .L_bye | 738 jmp .L_bye |
791 .L01: | 739 .L01: |
792 /* Phase 9*/ | 740 /* Phase 9*/ |
793 | 741 movd (%ecx), %mm0 |
794 flds (%ecx) | 742 pfadd 4(%ecx), %mm0 |
795 fadds 4(%ecx) | 743 pf2id %mm0, %mm0 |
796 fistp 512(%esi) | 744 movd %mm0, %eax |
797 | 745 movw %ax, 512(%esi) |
798 flds (%ecx) | 746 |
799 fsubs 4(%ecx) | 747 movd (%ecx), %mm0 |
800 fmuls 120(%ebx) | 748 pfsub 4(%ecx), %mm0 |
801 | 749 pfmul 120(%ebx), %mm0 |
802 fistp (%esi) | 750 pf2id %mm0, %mm0 |
803 | 751 movd %mm0, %eax |
804 | 752 movw %ax, (%esi) |
805 flds 12(%ecx) | 753 |
806 fsubs 8(%ecx) | 754 movd 12(%ecx), %mm0 |
807 fmuls 120(%ebx) | 755 pfsub 8(%ecx), %mm0 |
808 fist 256(%edi) | 756 pfmul 120(%ebx), %mm0 |
809 fadds 12(%ecx) | 757 pf2id %mm0, %mm7 |
810 fadds 8(%ecx) | 758 movd %mm7, %eax |
811 fistp 256(%esi) | 759 movw %ax, 256(%edi) |
812 | 760 pfadd 12(%ecx), %mm0 |
813 flds 16(%ecx) | 761 pfadd 8(%ecx), %mm0 |
814 fsubs 20(%ecx) | 762 pf2id %mm0, %mm0 |
815 fmuls 120(%ebx) | 763 movd %mm0, %eax |
816 | 764 movw %ax, 256(%esi) |
817 flds 28(%ecx) | 765 |
818 fsubs 24(%ecx) | 766 movd 16(%ecx), %mm0 |
819 fmuls 120(%ebx) | 767 pfsub 20(%ecx), %mm0 |
820 fist 384(%edi) | 768 pfmul 120(%ebx), %mm0 |
821 fld %st(0) | 769 movq %mm0, %mm3 |
822 fadds 24(%ecx) | 770 |
823 fadds 28(%ecx) | 771 movd 28(%ecx), %mm0 |
824 fld %st(0) | 772 pfsub 24(%ecx), %mm0 |
825 fadds 16(%ecx) | 773 pfmul 120(%ebx), %mm0 |
826 fadds 20(%ecx) | 774 pf2id %mm0, %mm7 |
827 fistp 384(%esi) | 775 movd %mm7, %eax |
828 fadd %st(2) | 776 movw %ax, 384(%edi) |
829 fistp 128(%esi) | 777 movq %mm0, %mm2 |
830 faddp %st(1) | 778 |
831 fistp 128(%edi) | 779 pfadd 24(%ecx), %mm0 |
780 pfadd 28(%ecx), %mm0 | |
781 movq %mm0, %mm1 | |
782 pfadd 16(%ecx), %mm0 | |
783 pfadd 20(%ecx), %mm0 | |
784 pf2id %mm0, %mm0 | |
785 movd %mm0, %eax | |
786 movw %ax, 384(%esi) | |
787 pfadd %mm3, %mm1 | |
788 pf2id %mm1, %mm1 | |
789 movd %mm1, %eax | |
790 movw %ax, 128(%esi) | |
791 pfadd %mm3, %mm2 | |
792 pf2id %mm2, %mm2 | |
793 movd %mm2, %eax | |
794 movw %ax, 128(%edi) | |
795 | |
832 | 796 |
833 /* Phase 10*/ | 797 /* Phase 10*/ |
834 | 798 |
835 flds 32(%edx) | 799 movq 32(%edx), %mm0 |
836 fadds 48(%edx) | 800 movq 48(%edx), %mm1 |
837 fistp 448(%esi) | 801 pfadd 48(%edx), %mm0 |
838 | 802 pfadd 40(%edx), %mm1 |
839 flds 48(%edx) | 803 pf2id %mm0, %mm0 |
840 fadds 40(%edx) | 804 pf2id %mm1, %mm1 |
841 fistp 320(%esi) | 805 movd %mm0, %eax |
842 | 806 movd %mm1, %ecx |
843 flds 40(%edx) | 807 movw %ax, 448(%esi) |
844 fadds 56(%edx) | 808 movw %cx, 320(%esi) |
845 fistp 192(%esi) | 809 psrlq $32, %mm0 |
846 | 810 psrlq $32, %mm1 |
847 flds 56(%edx) | 811 movd %mm0, %eax |
848 fadds 36(%edx) | 812 movd %mm1, %ecx |
849 fistp 64(%esi) | 813 movw %ax, 64(%edi) |
850 | 814 movw %cx, 192(%edi) |
851 flds 36(%edx) | 815 |
852 fadds 52(%edx) | 816 movd 40(%edx), %mm0 |
853 fistp 64(%edi) | 817 pfadd 56(%edx), %mm0 |
854 | 818 pf2id %mm0, %mm0 |
855 flds 52(%edx) | 819 movd %mm0, %eax |
856 fadds 44(%edx) | 820 movw %ax, 192(%esi) |
857 fistp 192(%edi) | 821 |
858 | 822 movd 56(%edx), %mm0 |
859 flds 60(%edx) | 823 pfadd 36(%edx), %mm0 |
860 fist 448(%edi) | 824 pf2id %mm0, %mm0 |
861 fadds 44(%edx) | 825 movd %mm0, %eax |
862 fistp 320(%edi) | 826 movw %ax, 64(%esi) |
863 | 827 |
864 flds 96(%edx) | 828 movd 60(%edx), %mm0 |
865 fadds 112(%edx) | 829 pf2id %mm0, %mm7 |
866 fld %st(0) | 830 movd %mm7, %eax |
867 fadds 64(%edx) | 831 movw %ax, 448(%edi) |
868 fistp 480(%esi) | 832 pfadd 44(%edx), %mm0 |
869 fadds 80(%edx) | 833 pf2id %mm0, %mm0 |
870 fistp 416(%esi) | 834 movd %mm0, %eax |
871 | 835 movw %ax, 320(%edi) |
872 flds 112(%edx) | 836 |
873 fadds 104(%edx) | 837 movq 96(%edx), %mm0 |
874 fld %st(0) | 838 movq 112(%edx), %mm2 |
875 fadds 80(%edx) | 839 movq 104(%edx), %mm4 |
876 fistp 352(%esi) | 840 pfadd 112(%edx), %mm0 |
877 fadds 72(%edx) | 841 pfadd 104(%edx), %mm2 |
878 fistp 288(%esi) | 842 pfadd 120(%edx), %mm4 |
879 | 843 movq %mm0, %mm1 |
880 flds 104(%edx) | 844 movq %mm2, %mm3 |
881 fadds 120(%edx) | 845 movq %mm4, %mm5 |
882 fld %st(0) | 846 pfadd 64(%edx), %mm0 |
883 fadds 72(%edx) | 847 pfadd 80(%edx), %mm2 |
884 fistp 224(%esi) | 848 pfadd 72(%edx), %mm4 |
885 fadds 88(%edx) | 849 pf2id %mm0, %mm7 |
886 fistp 160(%esi) | 850 pf2id %mm2, %mm6 |
887 | 851 pf2id %mm4, %mm4 |
888 flds 120(%edx) | 852 movd %mm7, %eax |
889 fadds 100(%edx) | 853 movd %mm6, %ecx |
890 fld %st(0) | 854 movd %mm4, %ebx |
891 fadds 88(%edx) | 855 movw %ax, 480(%esi) |
892 fistp 96(%esi) | 856 movw %cx, 352(%esi) |
893 fadds 68(%edx) | 857 movw %bx, 224(%esi) |
894 fistp 32(%esi) | 858 psrlq $32, %mm7 |
895 | 859 psrlq $32, %mm6 |
896 flds 100(%edx) | 860 psrlq $32, %mm4 |
897 fadds 116(%edx) | 861 movd %mm7, %eax |
898 fld %st(0) | 862 movd %mm6, %ecx |
899 fadds 68(%edx) | 863 movd %mm4, %ebx |
900 fistp 32(%edi) | 864 movw %ax, 32(%edi) |
901 fadds 84(%edx) | 865 movw %cx, 160(%edi) |
902 fistp 96(%edi) | 866 movw %bx, 288(%edi) |
903 | 867 pfadd 80(%edx), %mm1 |
904 flds 116(%edx) | 868 pfadd 72(%edx), %mm3 |
905 fadds 108(%edx) | 869 pfadd 88(%edx), %mm5 |
906 fld %st(0) | 870 pf2id %mm1, %mm1 |
907 fadds 84(%edx) | 871 pf2id %mm3, %mm3 |
908 fistp 160(%edi) | 872 pf2id %mm5, %mm5 |
909 fadds 76(%edx) | 873 movd %mm1, %eax |
910 fistp 224(%edi) | 874 movd %mm3, %ecx |
911 | 875 movd %mm5, %ebx |
912 flds 108(%edx) | 876 movw %ax, 416(%esi) |
913 fadds 124(%edx) | 877 movw %cx, 288(%esi) |
914 fld %st(0) | 878 movw %bx, 160(%esi) |
915 fadds 76(%edx) | 879 psrlq $32, %mm1 |
916 fistp 288(%edi) | 880 psrlq $32, %mm3 |
917 fadds 92(%edx) | 881 psrlq $32, %mm5 |
918 fistp 352(%edi) | 882 movd %mm1, %eax |
919 | 883 movd %mm3, %ecx |
920 flds 124(%edx) | 884 movd %mm5, %ebx |
921 fist 480(%edi) | 885 movw %ax, 96(%edi) |
922 fadds 92(%edx) | 886 movw %cx, 224(%edi) |
923 fistp 416(%edi) | 887 movw %bx, 352(%edi) |
888 | |
889 movd 120(%edx), %mm0 | |
890 pfadd 100(%edx), %mm0 | |
891 movq %mm0, %mm1 | |
892 pfadd 88(%edx), %mm0 | |
893 pf2id %mm0, %mm0 | |
894 movd %mm0, %eax | |
895 movw %ax, 96(%esi) | |
896 pfadd 68(%edx), %mm1 | |
897 pf2id %mm1, %mm1 | |
898 movd %mm1, %eax | |
899 movw %ax, 32(%esi) | |
900 | |
901 movq 124(%edx), %mm0 | |
902 pf2id %mm0, %mm1 | |
903 movd %mm1, %eax | |
904 movw %ax, 480(%edi) | |
905 pfadd 92(%edx), %mm0 | |
906 pf2id %mm0, %mm0 | |
907 movd %mm0, %eax | |
908 movw %ax, 416(%edi) | |
909 | |
924 movsw | 910 movsw |
911 | |
925 .L_bye: | 912 .L_bye: |
926 addl $256,%esp | 913 addl $256,%esp |
914 femms | |
927 popl %edi | 915 popl %edi |
928 popl %esi | 916 popl %esi |
929 popl %ebx | 917 popl %ebx |
930 ret | 918 ret |
931 | 919 |