comparison mp3lib/dct64_k7.s @ 1173:3c53cbf53e7e

Better 3dnow! optimization
author nickols_k
date Wed, 20 Jun 2001 07:54:19 +0000
parents ee303142c2e0
children 03b7e2955a20
comparison
equal deleted inserted replaced
1172:290353337b44 1173:3c53cbf53e7e
7 /// - added new opcodes PSWAPD, PFPNACC 7 /// - added new opcodes PSWAPD, PFPNACC
8 /// - decreased number of opcodes (as it was suggested by k7 manual) 8 /// - decreased number of opcodes (as it was suggested by k7 manual)
9 /// (using memory reference as operand of instructions) 9 /// (using memory reference as operand of instructions)
10 /// - Phase 6 is rewritten with mixing of cpu and mmx opcodes 10 /// - Phase 6 is rewritten with mixing of cpu and mmx opcodes
11 /// - change function name for support 3DNowEx! automatic detect 11 /// - change function name for support 3DNowEx! automatic detect
12 /// - negation of 3dnow reg was replaced with PXOR 0x80000000, MMi instead
13 /// of PFMUL as it was suggested by athlon manual. (Two not separated PFMUL
14 /// can not be paired, but PXOR can be).
12 /// 15 ///
13 /// note: because K7 processors are an aggressive out-of-order three-way 16 /// note: because K7 processors are an aggressive out-of-order three-way
14 /// superscalar ones instruction order is not significant for them. 17 /// superscalar ones instruction order is not significant for them.
15 /// 18 ///
16 /// Modified by Nick Kurshev <nickols_k@mail.ru> 19 /// Modified by Nick Kurshev <nickols_k@mail.ru>
19 /// warranties with regard to this program, and in no event shall the 22 /// warranties with regard to this program, and in no event shall the
20 /// author of this program be liable to whatever resulted from the use of 23 /// author of this program be liable to whatever resulted from the use of
21 /// this program. Use it at your own risk. 24 /// this program. Use it at your own risk.
22 /// 25 ///
23 26
27 .data
28 .align 8
29 plus_minus_3dnow: .long 0x00000000, 0x80000000
30
31 .text
24 .globl dct64_3dnowex 32 .globl dct64_3dnowex
25 .type dct64_3dnowex,@function 33 .type dct64_3dnowex,@function
26 34
27 /* Discrete Cosine Transform (DCT) for subband synthesis */ 35 /* Discrete Cosine Transform (DCT) for subband synthesis */
28 /* void dct64(real *a,real *b,real *c) */ 36 /* void dct64(real *a,real *b,real *c) */
410 movq %mm4,112(%esi) 418 movq %mm4,112(%esi)
411 pswapd %mm5, %mm5 419 pswapd %mm5, %mm5
412 movq %mm5, 120(%esi) 420 movq %mm5, 120(%esi)
413 421
414 // 5 422 // 5
415 movl $-1,%eax 423 movq plus_minus_3dnow, %mm0 /* mm0 = 1.0 | -1.0 */
416 movd %eax,%mm1
417 movl $1,%eax 424 movl $1,%eax
418 movd %eax,%mm0
419 / L | H
420 punpckldq %mm1,%mm0
421 pi2fd %mm0,%mm0 /* mm0 = 1.0 | -1.0 */
422 movd %eax,%mm1 425 movd %eax,%mm1
423 pi2fd %mm1,%mm1 426 pi2fd %mm1,%mm1
424 movl pnts+16,%eax 427 movl pnts+16,%eax
425 movd 0(%eax),%mm2 428 movd 0(%eax),%mm2
426 punpckldq %mm2,%mm1 /* mm1 = 1.0 | cos0 */ 429 punpckldq %mm2,%mm1 /* mm1 = 1.0 | cos0 */
431 pfmul %mm1,%mm2 /* mm2 = tmp2[0]+tmp2[1]|(tmp2[0]-tmp2[1])*cos0*/ 434 pfmul %mm1,%mm2 /* mm2 = tmp2[0]+tmp2[1]|(tmp2[0]-tmp2[1])*cos0*/
432 movq %mm2,0(%ebx) /* tmp1[0, 1] = mm2 */ 435 movq %mm2,0(%ebx) /* tmp1[0, 1] = mm2 */
433 movq 8(%esi),%mm4 /* mm4 = tmp2[2] | tmp2[3]*/ 436 movq 8(%esi),%mm4 /* mm4 = tmp2[2] | tmp2[3]*/
434 pfpnacc %mm4, %mm4 437 pfpnacc %mm4, %mm4
435 pswapd %mm4, %mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/ 438 pswapd %mm4, %mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/
436 pfmul %mm0,%mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/ 439 pxor %mm0,%mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/
437 pfmul %mm1,%mm4 /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/ 440 pfmul %mm1,%mm4 /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/
438 movq %mm4,%mm5 441 movq %mm4,%mm5
439 psrlq $32,%mm5 /* mm5 = (tmp2[3]-tmp2[2])*cos0 */ 442 psrlq $32,%mm5 /* mm5 = (tmp2[3]-tmp2[2])*cos0 */
440 pfacc %mm5,%mm4 /* mm4 = tmp2[2]+tmp2[3]+(tmp2[3]-tmp2[2])*cos0|(tmp2[3]-tmp2[2])*cos0*/ 443 pfacc %mm5,%mm4 /* mm4 = tmp2[2]+tmp2[3]+(tmp2[3]-tmp2[2])*cos0|(tmp2[3]-tmp2[2])*cos0*/
441 movq %mm4,8(%ebx) /* tmp1[2, 3] = mm4 */ 444 movq %mm4,8(%ebx) /* tmp1[2, 3] = mm4 */
447 pfmul %mm1,%mm2 450 pfmul %mm1,%mm2
448 movq 24(%esi),%mm4 451 movq 24(%esi),%mm4
449 pfpnacc %mm4, %mm4 452 pfpnacc %mm4, %mm4
450 pswapd %mm4, %mm4 453 pswapd %mm4, %mm4
451 454
452 pfmul %mm0,%mm4 455 pxor %mm0,%mm4
453 pfmul %mm1,%mm4 456 pfmul %mm1,%mm4
454 movq %mm4,%mm5 457 movq %mm4,%mm5
455 psrlq $32,%mm5 458 psrlq $32,%mm5
456 pfacc %mm5,%mm4 459 pfacc %mm5,%mm4
457 movq %mm2,%mm3 460 movq %mm2,%mm3
468 pfmul %mm1,%mm2 471 pfmul %mm1,%mm2
469 movq %mm2,32(%ebx) 472 movq %mm2,32(%ebx)
470 movq 40(%esi),%mm4 473 movq 40(%esi),%mm4
471 pfpnacc %mm4, %mm4 474 pfpnacc %mm4, %mm4
472 pswapd %mm4, %mm4 475 pswapd %mm4, %mm4
473 pfmul %mm0,%mm4 476 pxor %mm0,%mm4
474 pfmul %mm1,%mm4 477 pfmul %mm1,%mm4
475 movq %mm4,%mm5 478 movq %mm4,%mm5
476 psrlq $32,%mm5 479 psrlq $32,%mm5
477 pfacc %mm5,%mm4 480 pfacc %mm5,%mm4
478 movq %mm4,40(%ebx) 481 movq %mm4,40(%ebx)
482 pswapd %mm2, %mm2 485 pswapd %mm2, %mm2
483 pfmul %mm1,%mm2 486 pfmul %mm1,%mm2
484 movq 56(%esi),%mm4 487 movq 56(%esi),%mm4
485 pfpnacc %mm4, %mm4 488 pfpnacc %mm4, %mm4
486 pswapd %mm4, %mm4 489 pswapd %mm4, %mm4
487 pfmul %mm0,%mm4 490 pxor %mm0,%mm4
488 pfmul %mm1,%mm4 491 pfmul %mm1,%mm4
489 movq %mm4,%mm5 492 movq %mm4,%mm5
490 psrlq $32,%mm5 493 psrlq $32,%mm5
491 pfacc %mm5,%mm4 494 pfacc %mm5,%mm4
492 movq %mm2,%mm3 495 movq %mm2,%mm3
502 pfmul %mm1,%mm2 505 pfmul %mm1,%mm2
503 movq %mm2,64(%ebx) 506 movq %mm2,64(%ebx)
504 movq 72(%esi),%mm4 507 movq 72(%esi),%mm4
505 pfpnacc %mm4, %mm4 508 pfpnacc %mm4, %mm4
506 pswapd %mm4, %mm4 509 pswapd %mm4, %mm4
507 pfmul %mm0,%mm4 510 pxor %mm0,%mm4
508 pfmul %mm1,%mm4 511 pfmul %mm1,%mm4
509 movq %mm4,%mm5 512 movq %mm4,%mm5
510 psrlq $32,%mm5 513 psrlq $32,%mm5
511 pfacc %mm5,%mm4 514 pfacc %mm5,%mm4
512 movq %mm4,72(%ebx) 515 movq %mm4,72(%ebx)
516 pswapd %mm2, %mm2 519 pswapd %mm2, %mm2
517 pfmul %mm1,%mm2 520 pfmul %mm1,%mm2
518 movq 88(%esi),%mm4 521 movq 88(%esi),%mm4
519 pfpnacc %mm4, %mm4 522 pfpnacc %mm4, %mm4
520 pswapd %mm4, %mm4 523 pswapd %mm4, %mm4
521 pfmul %mm0,%mm4 524 pxor %mm0,%mm4
522 pfmul %mm1,%mm4 525 pfmul %mm1,%mm4
523 movq %mm4,%mm5 526 movq %mm4,%mm5
524 psrlq $32,%mm5 527 psrlq $32,%mm5
525 pfacc %mm5,%mm4 528 pfacc %mm5,%mm4
526 movq %mm2,%mm3 529 movq %mm2,%mm3
536 pfmul %mm1,%mm2 539 pfmul %mm1,%mm2
537 movq %mm2,96(%ebx) 540 movq %mm2,96(%ebx)
538 movq 104(%esi),%mm4 541 movq 104(%esi),%mm4
539 pfpnacc %mm4, %mm4 542 pfpnacc %mm4, %mm4
540 pswapd %mm4, %mm4 543 pswapd %mm4, %mm4
541 pfmul %mm0,%mm4 544 pxor %mm0,%mm4
542 pfmul %mm1,%mm4 545 pfmul %mm1,%mm4
543 movq %mm4,%mm5 546 movq %mm4,%mm5
544 psrlq $32,%mm5 547 psrlq $32,%mm5
545 pfacc %mm5,%mm4 548 pfacc %mm5,%mm4
546 movq %mm4,104(%ebx) 549 movq %mm4,104(%ebx)
550 pswapd %mm2, %mm2 553 pswapd %mm2, %mm2
551 pfmul %mm1,%mm2 554 pfmul %mm1,%mm2
552 movq 120(%esi),%mm4 555 movq 120(%esi),%mm4
553 pfpnacc %mm4, %mm4 556 pfpnacc %mm4, %mm4
554 pswapd %mm4, %mm4 557 pswapd %mm4, %mm4
555 pfmul %mm0,%mm4 558 pxor %mm0,%mm4
556 pfmul %mm1,%mm4 559 pfmul %mm1,%mm4
557 movq %mm4,%mm5 560 movq %mm4,%mm5
558 psrlq $32,%mm5 561 psrlq $32,%mm5
559 pfacc %mm5,%mm4 562 pfacc %mm5,%mm4
560 movq %mm2,%mm3 563 movq %mm2,%mm3