Mercurial > mplayer.hg
comparison mp3lib/dct64_k7.s @ 1173:3c53cbf53e7e
Better 3dnow! optimization
author | nickols_k |
---|---|
date | Wed, 20 Jun 2001 07:54:19 +0000 |
parents | ee303142c2e0 |
children | 03b7e2955a20 |
comparison
equal
deleted
inserted
replaced
1172:290353337b44 | 1173:3c53cbf53e7e |
---|---|
7 /// - added new opcodes PSWAPD, PFPNACC | 7 /// - added new opcodes PSWAPD, PFPNACC |
8 /// - decreased number of opcodes (as it was suggested by k7 manual) | 8 /// - decreased number of opcodes (as it was suggested by k7 manual) |
9 /// (using memory reference as operand of instructions) | 9 /// (using memory reference as operand of instructions) |
10 /// - Phase 6 is rewritten with mixing of cpu and mmx opcodes | 10 /// - Phase 6 is rewritten with mixing of cpu and mmx opcodes |
11 /// - change function name for support 3DNowEx! automatic detect | 11 /// - change function name for support 3DNowEx! automatic detect |
12 /// - negation of 3dnow reg was replaced with PXOR 0x8000000000000000, MMi instead | |
13 /// of PFMUL as it was suggested by athlon manual. (Two back-to-back PFMULs | |
14 /// cannot be paired, but PXOR can be). | |
12 /// | 15 /// |
13 /// note: because K7 processors are aggressive out-of-order three-way | 16 /// note: because K7 processors are aggressive out-of-order three-way |
14 /// superscalar ones, instruction order is not significant for them. | 17 /// superscalar ones, instruction order is not significant for them. |
15 /// | 18 /// |
16 /// Modified by Nick Kurshev <nickols_k@mail.ru> | 19 /// Modified by Nick Kurshev <nickols_k@mail.ru> |
19 /// warranties with regard to this program, and in no event shall the | 22 /// warranties with regard to this program, and in no event shall the |
20 /// author of this program be liable for whatever results from the use of | 23 /// author of this program be liable for whatever results from the use of |
21 /// this program. Use it at your own risk. | 24 /// this program. Use it at your own risk. |
22 /// | 25 /// |
23 | 26 |
27 .data | |
28 .align 8 | |
29 plus_minus_3dnow: .long 0x00000000, 0x80000000 | |
30 | |
31 .text | |
24 .globl dct64_3dnowex | 32 .globl dct64_3dnowex |
25 .type dct64_3dnowex,@function | 33 .type dct64_3dnowex,@function |
26 | 34 |
27 /* Discrete Cosine Transform (DCT) for subband synthesis */ | 35 /* Discrete Cosine Transform (DCT) for subband synthesis */ |
28 /* void dct64(real *a,real *b,real *c) */ | 36 /* void dct64(real *a,real *b,real *c) */ |
410 movq %mm4,112(%esi) | 418 movq %mm4,112(%esi) |
411 pswapd %mm5, %mm5 | 419 pswapd %mm5, %mm5 |
412 movq %mm5, 120(%esi) | 420 movq %mm5, 120(%esi) |
413 | 421 |
414 // 5 | 422 // 5 |
415 movl $-1,%eax | 423 movq plus_minus_3dnow, %mm0 /* mm0 = 1.0 | -1.0 */ |
416 movd %eax,%mm1 | |
417 movl $1,%eax | 424 movl $1,%eax |
418 movd %eax,%mm0 | |
419 / L | H | |
420 punpckldq %mm1,%mm0 | |
421 pi2fd %mm0,%mm0 /* mm0 = 1.0 | -1.0 */ | |
422 movd %eax,%mm1 | 425 movd %eax,%mm1 |
423 pi2fd %mm1,%mm1 | 426 pi2fd %mm1,%mm1 |
424 movl pnts+16,%eax | 427 movl pnts+16,%eax |
425 movd 0(%eax),%mm2 | 428 movd 0(%eax),%mm2 |
426 punpckldq %mm2,%mm1 /* mm1 = 1.0 | cos0 */ | 429 punpckldq %mm2,%mm1 /* mm1 = 1.0 | cos0 */ |
431 pfmul %mm1,%mm2 /* mm2 = tmp2[0]+tmp2[1]|(tmp2[0]-tmp2[1])*cos0*/ | 434 pfmul %mm1,%mm2 /* mm2 = tmp2[0]+tmp2[1]|(tmp2[0]-tmp2[1])*cos0*/ |
432 movq %mm2,0(%ebx) /* tmp1[0, 1] = mm2 */ | 435 movq %mm2,0(%ebx) /* tmp1[0, 1] = mm2 */ |
433 movq 8(%esi),%mm4 /* mm4 = tmp2[2] | tmp2[3]*/ | 436 movq 8(%esi),%mm4 /* mm4 = tmp2[2] | tmp2[3]*/ |
434 pfpnacc %mm4, %mm4 | 437 pfpnacc %mm4, %mm4 |
435 pswapd %mm4, %mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/ | 438 pswapd %mm4, %mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/ |
436 pfmul %mm0,%mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/ | 439 pxor %mm0,%mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/ |
437 pfmul %mm1,%mm4 /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/ | 440 pfmul %mm1,%mm4 /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/ |
438 movq %mm4,%mm5 | 441 movq %mm4,%mm5 |
439 psrlq $32,%mm5 /* mm5 = (tmp2[3]-tmp2[2])*cos0 */ | 442 psrlq $32,%mm5 /* mm5 = (tmp2[3]-tmp2[2])*cos0 */ |
440 pfacc %mm5,%mm4 /* mm4 = tmp2[2]+tmp2[3]+(tmp2[3]-tmp2[2])*cos0|(tmp2[3]-tmp2[2])*cos0*/ | 443 pfacc %mm5,%mm4 /* mm4 = tmp2[2]+tmp2[3]+(tmp2[3]-tmp2[2])*cos0|(tmp2[3]-tmp2[2])*cos0*/ |
441 movq %mm4,8(%ebx) /* tmp1[2, 3] = mm4 */ | 444 movq %mm4,8(%ebx) /* tmp1[2, 3] = mm4 */ |
447 pfmul %mm1,%mm2 | 450 pfmul %mm1,%mm2 |
448 movq 24(%esi),%mm4 | 451 movq 24(%esi),%mm4 |
449 pfpnacc %mm4, %mm4 | 452 pfpnacc %mm4, %mm4 |
450 pswapd %mm4, %mm4 | 453 pswapd %mm4, %mm4 |
451 | 454 |
452 pfmul %mm0,%mm4 | 455 pxor %mm0,%mm4 |
453 pfmul %mm1,%mm4 | 456 pfmul %mm1,%mm4 |
454 movq %mm4,%mm5 | 457 movq %mm4,%mm5 |
455 psrlq $32,%mm5 | 458 psrlq $32,%mm5 |
456 pfacc %mm5,%mm4 | 459 pfacc %mm5,%mm4 |
457 movq %mm2,%mm3 | 460 movq %mm2,%mm3 |
468 pfmul %mm1,%mm2 | 471 pfmul %mm1,%mm2 |
469 movq %mm2,32(%ebx) | 472 movq %mm2,32(%ebx) |
470 movq 40(%esi),%mm4 | 473 movq 40(%esi),%mm4 |
471 pfpnacc %mm4, %mm4 | 474 pfpnacc %mm4, %mm4 |
472 pswapd %mm4, %mm4 | 475 pswapd %mm4, %mm4 |
473 pfmul %mm0,%mm4 | 476 pxor %mm0,%mm4 |
474 pfmul %mm1,%mm4 | 477 pfmul %mm1,%mm4 |
475 movq %mm4,%mm5 | 478 movq %mm4,%mm5 |
476 psrlq $32,%mm5 | 479 psrlq $32,%mm5 |
477 pfacc %mm5,%mm4 | 480 pfacc %mm5,%mm4 |
478 movq %mm4,40(%ebx) | 481 movq %mm4,40(%ebx) |
482 pswapd %mm2, %mm2 | 485 pswapd %mm2, %mm2 |
483 pfmul %mm1,%mm2 | 486 pfmul %mm1,%mm2 |
484 movq 56(%esi),%mm4 | 487 movq 56(%esi),%mm4 |
485 pfpnacc %mm4, %mm4 | 488 pfpnacc %mm4, %mm4 |
486 pswapd %mm4, %mm4 | 489 pswapd %mm4, %mm4 |
487 pfmul %mm0,%mm4 | 490 pxor %mm0,%mm4 |
488 pfmul %mm1,%mm4 | 491 pfmul %mm1,%mm4 |
489 movq %mm4,%mm5 | 492 movq %mm4,%mm5 |
490 psrlq $32,%mm5 | 493 psrlq $32,%mm5 |
491 pfacc %mm5,%mm4 | 494 pfacc %mm5,%mm4 |
492 movq %mm2,%mm3 | 495 movq %mm2,%mm3 |
502 pfmul %mm1,%mm2 | 505 pfmul %mm1,%mm2 |
503 movq %mm2,64(%ebx) | 506 movq %mm2,64(%ebx) |
504 movq 72(%esi),%mm4 | 507 movq 72(%esi),%mm4 |
505 pfpnacc %mm4, %mm4 | 508 pfpnacc %mm4, %mm4 |
506 pswapd %mm4, %mm4 | 509 pswapd %mm4, %mm4 |
507 pfmul %mm0,%mm4 | 510 pxor %mm0,%mm4 |
508 pfmul %mm1,%mm4 | 511 pfmul %mm1,%mm4 |
509 movq %mm4,%mm5 | 512 movq %mm4,%mm5 |
510 psrlq $32,%mm5 | 513 psrlq $32,%mm5 |
511 pfacc %mm5,%mm4 | 514 pfacc %mm5,%mm4 |
512 movq %mm4,72(%ebx) | 515 movq %mm4,72(%ebx) |
516 pswapd %mm2, %mm2 | 519 pswapd %mm2, %mm2 |
517 pfmul %mm1,%mm2 | 520 pfmul %mm1,%mm2 |
518 movq 88(%esi),%mm4 | 521 movq 88(%esi),%mm4 |
519 pfpnacc %mm4, %mm4 | 522 pfpnacc %mm4, %mm4 |
520 pswapd %mm4, %mm4 | 523 pswapd %mm4, %mm4 |
521 pfmul %mm0,%mm4 | 524 pxor %mm0,%mm4 |
522 pfmul %mm1,%mm4 | 525 pfmul %mm1,%mm4 |
523 movq %mm4,%mm5 | 526 movq %mm4,%mm5 |
524 psrlq $32,%mm5 | 527 psrlq $32,%mm5 |
525 pfacc %mm5,%mm4 | 528 pfacc %mm5,%mm4 |
526 movq %mm2,%mm3 | 529 movq %mm2,%mm3 |
536 pfmul %mm1,%mm2 | 539 pfmul %mm1,%mm2 |
537 movq %mm2,96(%ebx) | 540 movq %mm2,96(%ebx) |
538 movq 104(%esi),%mm4 | 541 movq 104(%esi),%mm4 |
539 pfpnacc %mm4, %mm4 | 542 pfpnacc %mm4, %mm4 |
540 pswapd %mm4, %mm4 | 543 pswapd %mm4, %mm4 |
541 pfmul %mm0,%mm4 | 544 pxor %mm0,%mm4 |
542 pfmul %mm1,%mm4 | 545 pfmul %mm1,%mm4 |
543 movq %mm4,%mm5 | 546 movq %mm4,%mm5 |
544 psrlq $32,%mm5 | 547 psrlq $32,%mm5 |
545 pfacc %mm5,%mm4 | 548 pfacc %mm5,%mm4 |
546 movq %mm4,104(%ebx) | 549 movq %mm4,104(%ebx) |
550 pswapd %mm2, %mm2 | 553 pswapd %mm2, %mm2 |
551 pfmul %mm1,%mm2 | 554 pfmul %mm1,%mm2 |
552 movq 120(%esi),%mm4 | 555 movq 120(%esi),%mm4 |
553 pfpnacc %mm4, %mm4 | 556 pfpnacc %mm4, %mm4 |
554 pswapd %mm4, %mm4 | 557 pswapd %mm4, %mm4 |
555 pfmul %mm0,%mm4 | 558 pxor %mm0,%mm4 |
556 pfmul %mm1,%mm4 | 559 pfmul %mm1,%mm4 |
557 movq %mm4,%mm5 | 560 movq %mm4,%mm5 |
558 psrlq $32,%mm5 | 561 psrlq $32,%mm5 |
559 pfacc %mm5,%mm4 | 562 pfacc %mm5,%mm4 |
560 movq %mm2,%mm3 | 563 movq %mm2,%mm3 |