Mercurial > mplayer.hg
comparison mp3lib/dct64_k7.s @ 1271:2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
field | value |
---|---|
author | nick |
date | Wed, 04 Jul 2001 09:47:56 +0000 |
parents | 03b7e2955a20 |
children | 3a9699d9e7da |
Comparison legend: equal, deleted, inserted, replaced
1270:8a9fa696b77d | 1271:2864e32cd267 |
---|---|
1 # This code was taken from http://www.mpg123.org | 1 # This code was taken from http://www.mpg123.org |
2 # See ChangeLog of mpg123-0.59s-pre.1 for detail | 2 # See ChangeLog of mpg123-0.59s-pre.1 for detail |
3 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> | 3 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> |
4 # Partial 3dnowex-DSP! optimization by Nick Kurshev | 4 # Partial 3dnowex-DSP! optimization by Nick Kurshev |
5 # | 5 # |
6 # TODO: finish 3dnow! optimization at least in scalar mode | 6 # TODO: optimize scalar 3dnow! code |
7 # Warning: Phases 7 & 8 are not tested | |
7 # | 8 # |
8 | |
9 .data | |
10 .align 8 | |
11 plus_minus_3dnow: .long 0x00000000, 0x80000000 | |
12 costab: | |
13 .long 1056974725 | |
14 .long 1057056395 | |
15 .long 1057223771 | |
16 .long 1057485416 | |
17 .long 1057855544 | |
18 .long 1058356026 | |
19 .long 1059019886 | |
20 .long 1059897405 | |
21 .long 1061067246 | |
22 .long 1062657950 | |
23 .long 1064892987 | |
24 .long 1066774581 | |
25 .long 1069414683 | |
26 .long 1073984175 | |
27 .long 1079645762 | |
28 .long 1092815430 | |
29 .long 1057005197 | |
30 .long 1057342072 | |
31 .long 1058087743 | |
32 .long 1059427869 | |
33 .long 1061799040 | |
34 .long 1065862217 | |
35 .long 1071413542 | |
36 .long 1084439708 | |
37 .long 1057128951 | |
38 .long 1058664893 | |
39 .long 1063675095 | |
40 .long 1076102863 | |
41 .long 1057655764 | |
42 .long 1067924853 | |
43 .long 1060439283 | |
44 | 9 |
45 .text | 10 .text |
46 | 11 |
47 .align 16 | 12 .align 16 |
48 | 13 |
55 movl 280(%esp),%eax | 20 movl 280(%esp),%eax |
56 | 21 |
57 leal 128(%esp),%edx | 22 leal 128(%esp),%edx |
58 movl 272(%esp),%esi | 23 movl 272(%esp),%esi |
59 movl 276(%esp),%edi | 24 movl 276(%esp),%edi |
60 movl $costab,%ebx | 25 movl $costab_mmx,%ebx |
61 orl %ecx,%ecx | 26 orl %ecx,%ecx |
62 movl %esp,%ecx | 27 movl %esp,%ecx |
63 femms | 28 |
64 /* Phase 1*/ | 29 /* Phase 1*/ |
65 movq (%eax), %mm0 | 30 movq (%eax), %mm0 |
66 movq 8(%eax), %mm4 | 31 movq 8(%eax), %mm4 |
67 movq %mm0, %mm3 | 32 movq %mm0, %mm3 |
68 movq %mm4, %mm7 | 33 movq %mm4, %mm7 |
405 pswapd %mm7, %mm7 | 370 pswapd %mm7, %mm7 |
406 movq %mm3, 104(%ecx) | 371 movq %mm3, 104(%ecx) |
407 movq %mm7, 120(%ecx) | 372 movq %mm7, 120(%ecx) |
408 | 373 |
409 /* Phase 6. This is the end of easy road. */ | 374 /* Phase 6. This is the end of easy road. */ |
410 movl $1, %eax | 375 /* Code below is coded in scalar mode. Should be optimized */ |
411 movd %eax, %mm7 | 376 |
412 pi2fd %mm7, %mm7 | 377 movd 32(%ecx), %mm0 |
413 movq 32(%ecx), %mm0 | 378 pfadd 36(%ecx), %mm0 |
414 punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */ | 379 movd %mm0, 32(%edx) |
380 | |
381 movd 32(%ecx), %mm0 | |
382 pfsub 36(%ecx), %mm0 | |
383 pfmul 120(%ebx),%mm0 | |
384 movd %mm0, 36(%edx) | |
385 | |
386 movd 44(%ecx), %mm0 | |
387 pfsub 40(%ecx), %mm0 | |
388 pfmul 120(%ebx),%mm0 | |
389 | |
390 movd %mm0, 44(%edx) | |
391 pfadd 40(%ecx), %mm0 | |
392 pfadd 44(%ecx), %mm0 | |
393 movd %mm0, 40(%edx) | |
394 | |
395 movd 48(%ecx), %mm3 | |
396 pfsub 52(%ecx), %mm3 | |
397 pfmul 120(%ebx), %mm3 | |
398 | |
399 movd 60(%ecx), %mm2 | |
400 pfsub 56(%ecx), %mm2 | |
401 pfmul 120(%ebx), %mm2 | |
402 movq %mm2, %mm1 | |
403 | |
404 pfadd 56(%ecx), %mm1 | |
405 pfadd 60(%ecx), %mm1 | |
406 movq %mm1, %mm0 | |
407 | |
408 pfadd 48(%ecx), %mm0 | |
409 pfadd 52(%ecx), %mm0 | |
410 movd %mm0, 48(%edx) | |
411 pfadd %mm3, %mm1 | |
412 movd %mm1, 56(%edx) | |
413 movd %mm2, 60(%edx) | |
414 pfadd %mm3, %mm2 | |
415 movd %mm2, 52(%edx) | |
416 | |
417 /*---*/ | |
418 movd 64(%ecx), %mm0 | |
419 pfadd 68(%ecx), %mm0 | |
420 movd %mm0, 64(%edx) | |
421 | |
422 movd 64(%ecx), %mm0 | |
423 pfsub 68(%ecx), %mm0 | |
424 pfmul 120(%ebx), %mm0 | |
425 movd %mm0, 68(%edx) | |
426 | |
427 movd 76(%ecx), %mm0 | |
428 pfsub 72(%ecx), %mm0 | |
429 pfmul 120(%ebx), %mm0 | |
430 movd %mm0, 76(%edx) | |
431 pfadd 72(%ecx), %mm0 | |
432 pfadd 76(%ecx), %mm0 | |
433 movd %mm0, 72(%edx) | |
434 | |
435 movd 92(%ecx), %mm0 | |
436 pfsub 88(%ecx), %mm0 | |
437 pfmul 120(%ebx), %mm0 | |
438 movd %mm0, 92(%edx) | |
439 pfadd 92(%ecx), %mm0 | |
440 pfadd 88(%ecx), %mm0 | |
441 movq %mm0, %mm1 | |
442 | |
443 pfadd 80(%ecx), %mm0 | |
444 pfadd 84(%ecx), %mm0 | |
445 movd %mm0, 80(%edx) | |
446 | |
447 movd 80(%ecx), %mm0 | |
448 pfsub 84(%ecx), %mm0 | |
449 pfmul 120(%ebx), %mm0 | |
450 pfadd %mm0, %mm1 | |
451 pfadd 92(%edx), %mm0 | |
452 movd %mm0, 84(%edx) | |
453 movd %mm1, 88(%edx) | |
454 | |
455 movd 96(%ecx), %mm0 | |
456 pfadd 100(%ecx), %mm0 | |
457 movd %mm0, 96(%edx) | |
458 | |
459 movd 96(%ecx), %mm0 | |
460 pfsub 100(%ecx), %mm0 | |
461 pfmul 120(%ebx), %mm0 | |
462 movd %mm0, 100(%edx) | |
463 | |
464 movd 108(%ecx), %mm0 | |
465 pfsub 104(%ecx), %mm0 | |
466 pfmul 120(%ebx), %mm0 | |
467 movd %mm0, 108(%edx) | |
468 pfadd 104(%ecx), %mm0 | |
469 pfadd 108(%ecx), %mm0 | |
470 movd %mm0, 104(%edx) | |
471 | |
472 movd 124(%ecx), %mm0 | |
473 pfsub 120(%ecx), %mm0 | |
474 pfmul 120(%ebx), %mm0 | |
475 movd %mm0, 124(%edx) | |
476 pfadd 120(%ecx), %mm0 | |
477 pfadd 124(%ecx), %mm0 | |
478 movq %mm0, %mm1 | |
479 | |
480 pfadd 112(%ecx), %mm0 | |
481 pfadd 116(%ecx), %mm0 | |
482 movd %mm0, 112(%edx) | |
483 | |
484 movd 112(%ecx), %mm0 | |
485 pfsub 116(%ecx), %mm0 | |
486 pfmul 120(%ebx), %mm0 | |
487 pfadd %mm0,%mm1 | |
488 pfadd 124(%edx), %mm0 | |
489 movd %mm0, 116(%edx) | |
490 movd %mm1, 120(%edx) | |
491 | |
492 jnz .L01 | |
493 | |
494 /* Phase 7*/ | |
495 /* Code below is coded in scalar mode. Should be optimized */ | |
496 | |
497 movd (%ecx), %mm0 | |
498 pfadd 4(%ecx), %mm0 | |
499 movd %mm0, 1024(%esi) | |
500 | |
501 movd (%ecx), %mm0 | |
502 pfsub 4(%ecx), %mm0 | |
503 pfmul 120(%ebx), %mm0 | |
504 movd %mm0, (%esi) | |
505 movd %mm0, (%edi) | |
506 | |
507 movd 12(%ecx), %mm0 | |
508 pfsub 8(%ecx), %mm0 | |
509 pfmul 120(%ebx), %mm0 | |
510 movd %mm0, 512(%edi) | |
511 pfadd 12(%ecx), %mm0 | |
512 pfadd 8(%ecx), %mm0 | |
513 movd %mm0, 512(%esi) | |
514 | |
515 movd 16(%ecx), %mm0 | |
516 pfsub 20(%ecx), %mm0 | |
517 pfmul 120(%ebx), %mm0 | |
518 movq %mm0, %mm3 | |
519 | |
520 movd 28(%ecx), %mm0 | |
521 pfsub 24(%ecx), %mm0 | |
522 pfmul 120(%ebx), %mm0 | |
523 movd %mm0, 768(%edi) | |
524 movq %mm0, %mm2 | |
525 | |
526 pfadd 24(%ecx), %mm0 | |
527 pfadd 28(%ecx), %mm0 | |
415 movq %mm0, %mm1 | 528 movq %mm0, %mm1 |
416 movq plus_minus_3dnow, %mm6 | 529 |
417 /* n.b.: pfpnacc */ | 530 pfadd 16(%ecx), %mm0 |
418 pxor %mm6, %mm1 | 531 pfadd 20(%ecx), %mm0 |
419 pfacc %mm1, %mm0 | 532 movd %mm0, 768(%esi) |
420 /**/ | 533 pfadd %mm3, %mm1 |
421 pfmul %mm7, %mm0 | 534 movd %mm1, 256(%esi) |
422 movq %mm0, 32(%edx) | 535 pfadd %mm3, %mm2 |
423 femms | 536 movd %mm2, 256(%edi) |
424 | |
425 flds 44(%ecx) | |
426 fsubs 40(%ecx) | |
427 fmuls 120(%ebx) | |
428 | |
429 fsts 44(%edx) | |
430 fadds 40(%ecx) /* pfacc 40(ecx), 56(%ecx) */ | |
431 fadds 44(%ecx) | |
432 fstps 40(%edx) | |
433 | |
434 flds 48(%ecx) | |
435 fsubs 52(%ecx) | |
436 fmuls 120(%ebx) | |
437 | |
438 flds 60(%ecx) | |
439 fsubs 56(%ecx) | |
440 fmuls 120(%ebx) | |
441 | |
442 fld %st(0) | |
443 fadds 56(%ecx) | |
444 fadds 60(%ecx) | |
445 | |
446 fld %st(0) | |
447 fadds 48(%ecx) | |
448 fadds 52(%ecx) | |
449 fstps 48(%edx) | |
450 fadd %st(2) | |
451 fstps 56(%edx) | |
452 fsts 60(%edx) | |
453 faddp %st(1) | |
454 fstps 52(%edx) | |
455 /*---*/ | |
456 flds 64(%ecx) | |
457 fadds 68(%ecx) | |
458 fstps 64(%edx) | |
459 | |
460 flds 64(%ecx) | |
461 fsubs 68(%ecx) | |
462 fmuls 120(%ebx) | |
463 fstps 68(%edx) | |
464 | |
465 flds 76(%ecx) | |
466 fsubs 72(%ecx) | |
467 fmuls 120(%ebx) | |
468 fsts 76(%edx) | |
469 fadds 72(%ecx) | |
470 fadds 76(%ecx) | |
471 fstps 72(%edx) | |
472 | |
473 flds 92(%ecx) | |
474 fsubs 88(%ecx) | |
475 fmuls 120(%ebx) | |
476 fsts 92(%edx) | |
477 fadds 92(%ecx) | |
478 fadds 88(%ecx) | |
479 | |
480 fld %st(0) | |
481 fadds 80(%ecx) | |
482 fadds 84(%ecx) | |
483 fstps 80(%edx) | |
484 | |
485 flds 80(%ecx) | |
486 fsubs 84(%ecx) | |
487 fmuls 120(%ebx) | |
488 fadd %st(0), %st(1) | |
489 fadds 92(%edx) | |
490 fstps 84(%edx) | |
491 fstps 88(%edx) | |
492 | |
493 flds 96(%ecx) | |
494 fadds 100(%ecx) | |
495 fstps 96(%edx) | |
496 | |
497 flds 96(%ecx) | |
498 fsubs 100(%ecx) | |
499 fmuls 120(%ebx) | |
500 fstps 100(%edx) | |
501 | |
502 flds 108(%ecx) | |
503 fsubs 104(%ecx) | |
504 fmuls 120(%ebx) | |
505 fsts 108(%edx) | |
506 fadds 104(%ecx) | |
507 fadds 108(%ecx) | |
508 fstps 104(%edx) | |
509 | |
510 flds 124(%ecx) | |
511 fsubs 120(%ecx) | |
512 fmuls 120(%ebx) | |
513 fsts 124(%edx) | |
514 fadds 120(%ecx) | |
515 fadds 124(%ecx) | |
516 | |
517 fld %st(0) | |
518 fadds 112(%ecx) | |
519 fadds 116(%ecx) | |
520 fstps 112(%edx) | |
521 | |
522 flds 112(%ecx) | |
523 fsubs 116(%ecx) | |
524 fmuls 120(%ebx) | |
525 fadd %st(0),%st(1) | |
526 fadds 124(%edx) | |
527 fstps 116(%edx) | |
528 fstps 120(%edx) | |
529 jnz .L01 | |
530 | |
531 /* Phase 7*/ | |
532 | |
533 flds (%ecx) | |
534 fadds 4(%ecx) | |
535 fstps 1024(%esi) | |
536 | |
537 flds (%ecx) | |
538 fsubs 4(%ecx) | |
539 fmuls 120(%ebx) | |
540 fsts (%esi) | |
541 fstps (%edi) | |
542 | |
543 flds 12(%ecx) | |
544 fsubs 8(%ecx) | |
545 fmuls 120(%ebx) | |
546 fsts 512(%edi) | |
547 fadds 12(%ecx) | |
548 fadds 8(%ecx) | |
549 fstps 512(%esi) | |
550 | |
551 flds 16(%ecx) | |
552 fsubs 20(%ecx) | |
553 fmuls 120(%ebx) | |
554 | |
555 flds 28(%ecx) | |
556 fsubs 24(%ecx) | |
557 fmuls 120(%ebx) | |
558 fsts 768(%edi) | |
559 fld %st(0) | |
560 fadds 24(%ecx) | |
561 fadds 28(%ecx) | |
562 fld %st(0) | |
563 fadds 16(%ecx) | |
564 fadds 20(%ecx) | |
565 fstps 768(%esi) | |
566 fadd %st(2) | |
567 fstps 256(%esi) | |
568 faddp %st(1) | |
569 fstps 256(%edi) | |
570 | 537 |
571 /* Phase 8*/ | 538 /* Phase 8*/ |
572 | 539 |
573 flds 32(%edx) | 540 movq 32(%edx), %mm0 |
574 fadds 48(%edx) | 541 movq 48(%edx), %mm1 |
575 fstps 896(%esi) | 542 pfadd 48(%edx), %mm0 |
576 | 543 pfadd 40(%edx), %mm1 |
577 flds 48(%edx) | 544 movd %mm0, 896(%esi) |
578 fadds 40(%edx) | 545 movd %mm1, 640(%esi) |
579 fstps 640(%esi) | 546 psrlq $32, %mm0 |
580 | 547 psrlq $32, %mm1 |
581 flds 40(%edx) | 548 movd %mm0, 128(%edi) |
582 fadds 56(%edx) | 549 movd %mm1, 384(%edi) |
583 fstps 384(%esi) | 550 |
584 | 551 movd 40(%edx), %mm0 |
585 flds 56(%edx) | 552 pfadd 56(%edx), %mm0 |
586 fadds 36(%edx) | 553 movd %mm0, 384(%esi) |
587 fstps 128(%esi) | 554 |
588 | 555 movd 56(%edx), %mm0 |
589 flds 36(%edx) | 556 pfadd 36(%edx), %mm0 |
590 fadds 52(%edx) | 557 movd %mm0, 128(%esi) |
591 fstps 128(%edi) | 558 |
592 | 559 movd 60(%edx), %mm0 |
593 flds 52(%edx) | 560 movd %mm0, 896(%edi) |
594 fadds 44(%edx) | 561 pfadd 44(%edx), %mm0 |
595 fstps 384(%edi) | 562 movd %mm0, 640(%edi) |
596 | 563 |
597 flds 60(%edx) | 564 movq 96(%edx), %mm0 |
598 fsts 896(%edi) | 565 movq 112(%edx), %mm2 |
599 fadds 44(%edx) | 566 movq 104(%edx), %mm4 |
600 fstps 640(%edi) | 567 pfadd 112(%edx), %mm0 |
601 | 568 pfadd 104(%edx), %mm2 |
602 flds 96(%edx) | 569 pfadd 120(%edx), %mm4 |
603 fadds 112(%edx) | 570 movq %mm0, %mm1 |
604 fld %st(0) | 571 movq %mm2, %mm3 |
605 fadds 64(%edx) | 572 movq %mm4, %mm5 |
606 fstps 960(%esi) | 573 pfadd 64(%edx), %mm0 |
607 fadds 80(%edx) | 574 pfadd 80(%edx), %mm2 |
608 fstps 832(%esi) | 575 pfadd 72(%edx), %mm4 |
609 | 576 movd %mm0, 960(%esi) |
610 flds 112(%edx) | 577 movd %mm2, 704(%esi) |
611 fadds 104(%edx) | 578 movd %mm4, 448(%esi) |
612 fld %st(0) | 579 psrlq $32, %mm0 |
613 fadds 80(%edx) | 580 psrlq $32, %mm2 |
614 fstps 704(%esi) | 581 psrlq $32, %mm4 |
615 fadds 72(%edx) | 582 movd %mm0, 64(%edi) |
616 fstps 576(%esi) | 583 movd %mm2, 320(%edi) |
617 | 584 movd %mm4, 576(%edi) |
618 flds 104(%edx) | 585 pfadd 80(%edx), %mm1 |
619 fadds 120(%edx) | 586 pfadd 72(%edx), %mm3 |
620 fld %st(0) | 587 pfadd 88(%edx), %mm5 |
621 fadds 72(%edx) | 588 movd %mm1, 832(%esi) |
622 fstps 448(%esi) | 589 movd %mm3, 576(%esi) |
623 fadds 88(%edx) | 590 movd %mm5, 320(%esi) |
624 fstps 320(%esi) | 591 psrlq $32, %mm1 |
625 | 592 psrlq $32, %mm3 |
626 flds 120(%edx) | 593 psrlq $32, %mm5 |
627 fadds 100(%edx) | 594 movd %mm1, 192(%edi) |
628 fld %st(0) | 595 movd %mm3, 448(%edi) |
629 fadds 88(%edx) | 596 movd %mm5, 704(%edi) |
630 fstps 192(%esi) | 597 |
631 fadds 68(%edx) | 598 movd 120(%edx), %mm0 |
632 fstps 64(%esi) | 599 pfadd 100(%edx), %mm0 |
633 | 600 movq %mm0, %mm1 |
634 flds 100(%edx) | 601 pfadd 88(%edx), %mm0 |
635 fadds 116(%edx) | 602 movd %mm0, 192(%esi) |
636 fld %st(0) | 603 pfadd 68(%edx), %mm1 |
637 fadds 68(%edx) | 604 movd %mm1, 64(%esi) |
638 fstps 64(%edi) | 605 |
639 fadds 84(%edx) | 606 movd 124(%edx), %mm0 |
640 fstps 192(%edi) | 607 movd %mm0, 960(%edi) |
641 | 608 pfadd 92(%edx), %mm0 |
642 flds 116(%edx) | 609 movd %mm0, 832(%edi) |
643 fadds 108(%edx) | 610 |
644 fld %st(0) | |
645 fadds 84(%edx) | |
646 fstps 320(%edi) | |
647 fadds 76(%edx) | |
648 fstps 448(%edi) | |
649 | |
650 flds 108(%edx) | |
651 fadds 124(%edx) | |
652 fld %st(0) | |
653 fadds 76(%edx) | |
654 fstps 576(%edi) | |
655 fadds 92(%edx) | |
656 fstps 704(%edi) | |
657 | |
658 flds 124(%edx) | |
659 fsts 960(%edi) | |
660 fadds 92(%edx) | |
661 fstps 832(%edi) | |
662 jmp .L_bye | 611 jmp .L_bye |
663 .L01: | 612 .L01: |
664 /* Phase 9*/ | 613 /* Phase 9*/ |
665 | 614 movd (%ecx), %mm0 |
666 flds (%ecx) | 615 pfadd 4(%ecx), %mm0 |
667 fadds 4(%ecx) | 616 pf2id %mm0, %mm0 |
668 fistp 512(%esi) | 617 movd %mm0, %eax |
669 | 618 movw %ax, 512(%esi) |
670 flds (%ecx) | 619 |
671 fsubs 4(%ecx) | 620 movd (%ecx), %mm0 |
672 fmuls 120(%ebx) | 621 pfsub 4(%ecx), %mm0 |
673 | 622 pfmul 120(%ebx), %mm0 |
674 fistp (%esi) | 623 pf2id %mm0, %mm0 |
675 | 624 movd %mm0, %eax |
676 | 625 movw %ax, (%esi) |
677 flds 12(%ecx) | 626 |
678 fsubs 8(%ecx) | 627 movd 12(%ecx), %mm0 |
679 fmuls 120(%ebx) | 628 pfsub 8(%ecx), %mm0 |
680 fist 256(%edi) | 629 pfmul 120(%ebx), %mm0 |
681 fadds 12(%ecx) | 630 pf2id %mm0, %mm7 |
682 fadds 8(%ecx) | 631 movd %mm7, %eax |
683 fistp 256(%esi) | 632 movw %ax, 256(%edi) |
684 | 633 pfadd 12(%ecx), %mm0 |
685 flds 16(%ecx) | 634 pfadd 8(%ecx), %mm0 |
686 fsubs 20(%ecx) | 635 pf2id %mm0, %mm0 |
687 fmuls 120(%ebx) | 636 movd %mm0, %eax |
688 | 637 movw %ax, 256(%esi) |
689 flds 28(%ecx) | 638 |
690 fsubs 24(%ecx) | 639 movd 16(%ecx), %mm0 |
691 fmuls 120(%ebx) | 640 pfsub 20(%ecx), %mm0 |
692 fist 384(%edi) | 641 pfmul 120(%ebx), %mm0 |
693 fld %st(0) | 642 movq %mm0, %mm3 |
694 fadds 24(%ecx) | 643 |
695 fadds 28(%ecx) | 644 movd 28(%ecx), %mm0 |
696 fld %st(0) | 645 pfsub 24(%ecx), %mm0 |
697 fadds 16(%ecx) | 646 pfmul 120(%ebx), %mm0 |
698 fadds 20(%ecx) | 647 pf2id %mm0, %mm7 |
699 fistp 384(%esi) | 648 movd %mm7, %eax |
700 fadd %st(2) | 649 movw %ax, 384(%edi) |
701 fistp 128(%esi) | 650 movq %mm0, %mm2 |
702 faddp %st(1) | 651 |
703 fistp 128(%edi) | 652 pfadd 24(%ecx), %mm0 |
653 pfadd 28(%ecx), %mm0 | |
654 movq %mm0, %mm1 | |
655 pfadd 16(%ecx), %mm0 | |
656 pfadd 20(%ecx), %mm0 | |
657 pf2id %mm0, %mm0 | |
658 movd %mm0, %eax | |
659 movw %ax, 384(%esi) | |
660 pfadd %mm3, %mm1 | |
661 pf2id %mm1, %mm1 | |
662 movd %mm1, %eax | |
663 movw %ax, 128(%esi) | |
664 pfadd %mm3, %mm2 | |
665 pf2id %mm2, %mm2 | |
666 movd %mm2, %eax | |
667 movw %ax, 128(%edi) | |
668 | |
704 | 669 |
705 /* Phase 10*/ | 670 /* Phase 10*/ |
706 | 671 |
707 flds 32(%edx) | 672 movq 32(%edx), %mm0 |
708 fadds 48(%edx) | 673 movq 48(%edx), %mm1 |
709 fistp 448(%esi) | 674 pfadd 48(%edx), %mm0 |
710 | 675 pfadd 40(%edx), %mm1 |
711 flds 48(%edx) | 676 pf2id %mm0, %mm0 |
712 fadds 40(%edx) | 677 pf2id %mm1, %mm1 |
713 fistp 320(%esi) | 678 movd %mm0, %eax |
714 | 679 movd %mm1, %ecx |
715 flds 40(%edx) | 680 movw %ax, 448(%esi) |
716 fadds 56(%edx) | 681 movw %cx, 320(%esi) |
717 fistp 192(%esi) | 682 psrlq $32, %mm0 |
718 | 683 psrlq $32, %mm1 |
719 flds 56(%edx) | 684 movd %mm0, %eax |
720 fadds 36(%edx) | 685 movd %mm1, %ecx |
721 fistp 64(%esi) | 686 movw %ax, 64(%edi) |
722 | 687 movw %cx, 192(%edi) |
723 flds 36(%edx) | 688 |
724 fadds 52(%edx) | 689 movd 40(%edx), %mm0 |
725 fistp 64(%edi) | 690 pfadd 56(%edx), %mm0 |
726 | 691 pf2id %mm0, %mm0 |
727 flds 52(%edx) | 692 movd %mm0, %eax |
728 fadds 44(%edx) | 693 movw %ax, 192(%esi) |
729 fistp 192(%edi) | 694 |
730 | 695 movd 56(%edx), %mm0 |
731 flds 60(%edx) | 696 pfadd 36(%edx), %mm0 |
732 fist 448(%edi) | 697 pf2id %mm0, %mm0 |
733 fadds 44(%edx) | 698 movd %mm0, %eax |
734 fistp 320(%edi) | 699 movw %ax, 64(%esi) |
735 | 700 |
736 flds 96(%edx) | 701 movd 60(%edx), %mm0 |
737 fadds 112(%edx) | 702 pf2id %mm0, %mm7 |
738 fld %st(0) | 703 movd %mm7, %eax |
739 fadds 64(%edx) | 704 movw %ax, 448(%edi) |
740 fistp 480(%esi) | 705 pfadd 44(%edx), %mm0 |
741 fadds 80(%edx) | 706 pf2id %mm0, %mm0 |
742 fistp 416(%esi) | 707 movd %mm0, %eax |
743 | 708 movw %ax, 320(%edi) |
744 flds 112(%edx) | 709 |
745 fadds 104(%edx) | 710 movq 96(%edx), %mm0 |
746 fld %st(0) | 711 movq 112(%edx), %mm2 |
747 fadds 80(%edx) | 712 movq 104(%edx), %mm4 |
748 fistp 352(%esi) | 713 pfadd 112(%edx), %mm0 |
749 fadds 72(%edx) | 714 pfadd 104(%edx), %mm2 |
750 fistp 288(%esi) | 715 pfadd 120(%edx), %mm4 |
751 | 716 movq %mm0, %mm1 |
752 flds 104(%edx) | 717 movq %mm2, %mm3 |
753 fadds 120(%edx) | 718 movq %mm4, %mm5 |
754 fld %st(0) | 719 pfadd 64(%edx), %mm0 |
755 fadds 72(%edx) | 720 pfadd 80(%edx), %mm2 |
756 fistp 224(%esi) | 721 pfadd 72(%edx), %mm4 |
757 fadds 88(%edx) | 722 pf2id %mm0, %mm7 |
758 fistp 160(%esi) | 723 pf2id %mm2, %mm6 |
759 | 724 pf2id %mm4, %mm4 |
760 flds 120(%edx) | 725 movd %mm7, %eax |
761 fadds 100(%edx) | 726 movd %mm6, %ecx |
762 fld %st(0) | 727 movd %mm4, %ebx |
763 fadds 88(%edx) | 728 movw %ax, 480(%esi) |
764 fistp 96(%esi) | 729 movw %cx, 352(%esi) |
765 fadds 68(%edx) | 730 movw %bx, 224(%esi) |
766 fistp 32(%esi) | 731 psrlq $32, %mm7 |
767 | 732 psrlq $32, %mm6 |
768 flds 100(%edx) | 733 psrlq $32, %mm4 |
769 fadds 116(%edx) | 734 movd %mm7, %eax |
770 fld %st(0) | 735 movd %mm6, %ecx |
771 fadds 68(%edx) | 736 movd %mm4, %ebx |
772 fistp 32(%edi) | 737 movw %ax, 32(%edi) |
773 fadds 84(%edx) | 738 movw %cx, 160(%edi) |
774 fistp 96(%edi) | 739 movw %bx, 288(%edi) |
775 | 740 pfadd 80(%edx), %mm1 |
776 flds 116(%edx) | 741 pfadd 72(%edx), %mm3 |
777 fadds 108(%edx) | 742 pfadd 88(%edx), %mm5 |
778 fld %st(0) | 743 pf2id %mm1, %mm1 |
779 fadds 84(%edx) | 744 pf2id %mm3, %mm3 |
780 fistp 160(%edi) | 745 pf2id %mm5, %mm5 |
781 fadds 76(%edx) | 746 movd %mm1, %eax |
782 fistp 224(%edi) | 747 movd %mm3, %ecx |
783 | 748 movd %mm5, %ebx |
784 flds 108(%edx) | 749 movw %ax, 416(%esi) |
785 fadds 124(%edx) | 750 movw %cx, 288(%esi) |
786 fld %st(0) | 751 movw %bx, 160(%esi) |
787 fadds 76(%edx) | 752 psrlq $32, %mm1 |
788 fistp 288(%edi) | 753 psrlq $32, %mm3 |
789 fadds 92(%edx) | 754 psrlq $32, %mm5 |
790 fistp 352(%edi) | 755 movd %mm1, %eax |
791 | 756 movd %mm3, %ecx |
792 flds 124(%edx) | 757 movd %mm5, %ebx |
793 fist 480(%edi) | 758 movw %ax, 96(%edi) |
794 fadds 92(%edx) | 759 movw %cx, 224(%edi) |
795 fistp 416(%edi) | 760 movw %bx, 352(%edi) |
761 | |
762 movd 120(%edx), %mm0 | |
763 pfadd 100(%edx), %mm0 | |
764 movq %mm0, %mm1 | |
765 pfadd 88(%edx), %mm0 | |
766 pf2id %mm0, %mm0 | |
767 movd %mm0, %eax | |
768 movw %ax, 96(%esi) | |
769 pfadd 68(%edx), %mm1 | |
770 pf2id %mm1, %mm1 | |
771 movd %mm1, %eax | |
772 movw %ax, 32(%esi) | |
773 | |
774 movq 124(%edx), %mm0 | |
775 pf2id %mm0, %mm1 | |
776 movd %mm1, %eax | |
777 movw %ax, 480(%edi) | |
778 pfadd 92(%edx), %mm0 | |
779 pf2id %mm0, %mm0 | |
780 movd %mm0, %eax | |
781 movw %ax, 416(%edi) | |
782 | |
796 movsw | 783 movsw |
784 | |
797 .L_bye: | 785 .L_bye: |
798 addl $256,%esp | 786 addl $256,%esp |
787 femms | |
799 popl %edi | 788 popl %edi |
800 popl %esi | 789 popl %esi |
801 popl %ebx | 790 popl %ebx |
802 ret | 791 ret |
803 | 792 |