comparison mp3lib/dct64_k7.s @ 781:ee303142c2e0

improvements.
author nickols_k
date Sun, 13 May 2001 14:36:02 +0000
parents 59b0a9ec8604
children 3c53cbf53e7e
comparison: 780:24e4e6e5aa1c -> 781:ee303142c2e0
2 /// Replacement of dct64() with AMD's 3DNowEx(DSP)! SIMD operations support 2 /// Replacement of dct64() with AMD's 3DNowEx(DSP)! SIMD operations support
3 /// 3 ///
4 /// This code is based on 'dct64_3dnow.s' by Syuuhei Kashiyama 4 /// This code is based on 'dct64_3dnow.s' by Syuuhei Kashiyama
5 /// <squash@mb.kcom.ne.jp>; only the following changes have been made: 5 /// <squash@mb.kcom.ne.jp>; only the following changes have been made:
6 /// 6 ///
7 /// - added new opcode PSWAPD 7 /// - added new opcodes PSWAPD, PFPNACC
8 /// - decreased the number of opcodes (as suggested by the K7 manual)
9 /// (using memory references as instruction operands)
10 /// - Phase 6 is rewritten to mix CPU and MMX opcodes
8 /// - changed the function name to support automatic 3DNowEx! detection 11 /// - changed the function name to support automatic 3DNowEx! detection
9 /// 12 ///
10 /// note: because K7 processors are aggressive out-of-order three-way 13 /// note: because K7 processors are aggressive out-of-order three-way
11 /// superscalar CPUs, instruction order is not significant for them. 14 /// superscalar CPUs, instruction order is not significant for them.
12 /// 15 ///
18 /// this program. Use it at your own risk. 21 /// this program. Use it at your own risk.
19 /// 22 ///
20 23
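The two opcodes called out above do the heavy lifting in this revision. A minimal scalar sketch of their packed-float semantics, using an illustrative v2sf pair for the low/high dwords of an MMX register (the struct and helper names are mine, not AMD's):

    typedef struct { float lo, hi; } v2sf;  /* low | high dwords of an MMX register */

    /* PSWAPD: swap the two dwords */
    static v2sf pswapd(v2sf a) { v2sf r = { a.hi, a.lo }; return r; }

    /* PFPNACC src, dst (AT&T order): dst.lo = dst.lo - dst.hi, dst.hi = src.lo + src.hi */
    static v2sf pfpnacc(v2sf dst, v2sf src) {
        v2sf r = { dst.lo - dst.hi, src.lo + src.hi };
        return r;
    }

With src == dst, pfpnacc yields diff|sum in one register; the pswapd that follows it in the code below flips that to sum|diff, which is what lets this revision drop the old movq/pfmul-by-(1.0|-1.0)/pfacc sequence.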
21 .globl dct64_3dnowex 24 .globl dct64_3dnowex
22 .type dct64_3dnowex,@function 25 .type dct64_3dnowex,@function
26
27 /* Discrete Cosine Transform (DCT) for subband synthesis */
28 /* void dct64(real *a,real *b,real *c) */
23 dct64_3dnowex: 29 dct64_3dnowex:
24 subl $256,%esp 30 subl $256,%esp
25 pushl %ebp 31 pushl %ebp
26 pushl %edi 32 pushl %edi
27 pushl %esi 33 pushl %esi
28 pushl %ebx 34 pushl %ebx
29 leal 16(%esp),%ebx 35 leal 16(%esp),%ebx /* ebx -> real tmp1[32] */
30 movl 284(%esp),%edi 36 movl 284(%esp),%edi /* edi -> c */
31 movl 276(%esp),%ebp 37 movl 276(%esp),%ebp /* ebp -> a */
32 movl 280(%esp),%edx 38 movl 280(%esp),%edx /* edx -> b */
33 leal 128(%ebx),%esi 39 leal 128(%ebx),%esi /* esi -> real tmp2[32] */
34 40
35 / femms 41 / femms
36 42
37 // 1 43 // 1
38 movl pnts,%eax 44 movl pnts,%eax
39 movq 0(%edi),%mm0 45
40 movq %mm0,%mm1 46 movq 0(%edi),%mm0 /* mm0 = c[0x00] | c[0x01]*/
41 movd 124(%edi),%mm2 47 movq %mm0,%mm1 /* mm1 = mm0 */
42 punpckldq 120(%edi),%mm2 48 movd 124(%edi),%mm2 /* mm2 = c[0x1f] */
43 movq 0(%eax),%mm3 49 punpckldq 120(%edi),%mm2 /* mm2 = c[0x1f] | c[0x1E] */
44 pfadd %mm2,%mm0 50 pfadd %mm2,%mm0 /* mm0 = c[0x00]+c[0x1F] | c[0x1E]+c[0x01] */
45 movq %mm0,0(%ebx) 51 movq %mm0,0(%ebx) /* tmp[0, 1] = mm0 */
46 pfsub %mm2,%mm1 52 pfsub %mm2,%mm1 /* c[0x00]-c[0x1f] | c[0x01]-c[0x1e] */
47 pfmul %mm3,%mm1 53 pfmul 0(%eax),%mm1 /* (c[0x00]-c[0x1f])*pnts[0]|(c[0x01]-c[0x1e])*pnts[1]*/
48 pswapd %mm1, %mm1 54 pswapd %mm1, %mm1 /* (c[0x01]-c[0x1e])*pnts[1]|(c[0x00]-c[0x1f])*pnts[0]*/
49 movq %mm1, 120(%ebx) 55 movq %mm1, 120(%ebx) /* tmp1[30, 31]=mm1 */
56
50 movq 8(%edi),%mm4 57 movq 8(%edi),%mm4
51 movq %mm4,%mm5 58 movq %mm4,%mm5
52 movd 116(%edi),%mm6 59 movd 116(%edi),%mm6
53 punpckldq 112(%edi),%mm6 60 punpckldq 112(%edi),%mm6
54 movq 8(%eax),%mm7
55 pfadd %mm6,%mm4 61 pfadd %mm6,%mm4
56 movq %mm4,8(%ebx) 62 movq %mm4,8(%ebx)
57 pfsub %mm6,%mm5 63 pfsub %mm6,%mm5
58 pfmul %mm7,%mm5 64 pfmul 8(%eax),%mm5
59 pswapd %mm5, %mm5 65 pswapd %mm5, %mm5
60 movq %mm5, 112(%ebx) 66 movq %mm5, 112(%ebx)
67
61 movq 16(%edi),%mm0 68 movq 16(%edi),%mm0
62 movq %mm0,%mm1 69 movq %mm0,%mm1
63 movd 108(%edi),%mm2 70 movd 108(%edi),%mm2
64 punpckldq 104(%edi),%mm2 71 punpckldq 104(%edi),%mm2
65 movq 16(%eax),%mm3
66 pfadd %mm2,%mm0 72 pfadd %mm2,%mm0
67 movq %mm0,16(%ebx) 73 movq %mm0,16(%ebx)
68 pfsub %mm2,%mm1 74 pfsub %mm2,%mm1
69 pfmul %mm3,%mm1 75 pfmul 16(%eax),%mm1
70 pswapd %mm1, %mm1 76 pswapd %mm1, %mm1
71 movq %mm1, 104(%ebx) 77 movq %mm1, 104(%ebx)
78
72 movq 24(%edi),%mm4 79 movq 24(%edi),%mm4
73 movq %mm4,%mm5 80 movq %mm4,%mm5
74 movd 100(%edi),%mm6 81 movd 100(%edi),%mm6
75 punpckldq 96(%edi),%mm6 82 punpckldq 96(%edi),%mm6
76 movq 24(%eax),%mm7
77 pfadd %mm6,%mm4 83 pfadd %mm6,%mm4
78 movq %mm4,24(%ebx) 84 movq %mm4,24(%ebx)
79 pfsub %mm6,%mm5 85 pfsub %mm6,%mm5
80 pfmul %mm7,%mm5 86 pfmul 24(%eax),%mm5
81 pswapd %mm5, %mm5 87 pswapd %mm5, %mm5
82 movq %mm5, 96(%ebx) 88 movq %mm5, 96(%ebx)
89
83 movq 32(%edi),%mm0 90 movq 32(%edi),%mm0
84 movq %mm0,%mm1 91 movq %mm0,%mm1
85 movd 92(%edi),%mm2 92 movd 92(%edi),%mm2
86 punpckldq 88(%edi),%mm2 93 punpckldq 88(%edi),%mm2
87 movq 32(%eax),%mm3
88 pfadd %mm2,%mm0 94 pfadd %mm2,%mm0
89 movq %mm0,32(%ebx) 95 movq %mm0,32(%ebx)
90 pfsub %mm2,%mm1 96 pfsub %mm2,%mm1
91 pfmul %mm3,%mm1 97 pfmul 32(%eax),%mm1
92 pswapd %mm1, %mm1 98 pswapd %mm1, %mm1
93 movq %mm1, 88(%ebx) 99 movq %mm1, 88(%ebx)
100
94 movq 40(%edi),%mm4 101 movq 40(%edi),%mm4
95 movq %mm4,%mm5 102 movq %mm4,%mm5
96 movd 84(%edi),%mm6 103 movd 84(%edi),%mm6
97 punpckldq 80(%edi),%mm6 104 punpckldq 80(%edi),%mm6
98 movq 40(%eax),%mm7
99 pfadd %mm6,%mm4 105 pfadd %mm6,%mm4
100 movq %mm4,40(%ebx) 106 movq %mm4,40(%ebx)
101 pfsub %mm6,%mm5 107 pfsub %mm6,%mm5
102 pfmul %mm7,%mm5 108 pfmul 40(%eax),%mm5
103 pswapd %mm5, %mm5 109 pswapd %mm5, %mm5
104 movq %mm5, 80(%ebx) 110 movq %mm5, 80(%ebx)
111
105 movq 48(%edi),%mm0 112 movq 48(%edi),%mm0
106 movq %mm0,%mm1 113 movq %mm0,%mm1
107 movd 76(%edi),%mm2 114 movd 76(%edi),%mm2
108 punpckldq 72(%edi),%mm2 115 punpckldq 72(%edi),%mm2
109 movq 48(%eax),%mm3
110 pfadd %mm2,%mm0 116 pfadd %mm2,%mm0
111 movq %mm0,48(%ebx) 117 movq %mm0,48(%ebx)
112 pfsub %mm2,%mm1 118 pfsub %mm2,%mm1
113 pfmul %mm3,%mm1 119 pfmul 48(%eax),%mm1
114 pswapd %mm1, %mm1 120 pswapd %mm1, %mm1
115 movq %mm1, 72(%ebx) 121 movq %mm1, 72(%ebx)
122
116 movq 56(%edi),%mm4 123 movq 56(%edi),%mm4
117 movq %mm4,%mm5 124 movq %mm4,%mm5
118 movd 68(%edi),%mm6 125 movd 68(%edi),%mm6
119 punpckldq 64(%edi),%mm6 126 punpckldq 64(%edi),%mm6
120 movq 56(%eax),%mm7
121 pfadd %mm6,%mm4 127 pfadd %mm6,%mm4
122 movq %mm4,56(%ebx) 128 movq %mm4,56(%ebx)
123 pfsub %mm6,%mm5 129 pfsub %mm6,%mm5
124 pfmul %mm7,%mm5 130 pfmul 56(%eax),%mm5
125 pswapd %mm5, %mm5 131 pswapd %mm5, %mm5
126 movq %mm5, 64(%ebx) 132 movq %mm5, 64(%ebx)
127 133
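Each group of pass "// 1" above is one step of the same butterfly, unrolled eight times with two floats per MMX register; a scalar sketch, assuming real is a 4-byte float and with costab1 as an illustrative name for the table at pnts[0]:

    for (int i = 0; i < 16; i++) {
        tmp1[i]      = c[i] + c[31 - i];                 /* pfadd half */
        tmp1[31 - i] = (c[i] - c[31 - i]) * costab1[i];  /* pfsub/pfmul/pswapd half */
    }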
128 // 2 134 // 2
129 movl pnts+4,%eax 135 movl pnts+4,%eax
130 / 0, 14 136 / 0, 14
131 movq 0(%ebx),%mm0 137 movq 0(%ebx),%mm0 /* mm0 = tmp1[0] | tmp1[1] */
132 movq %mm0,%mm1 138 movq %mm0,%mm1
133 movd 60(%ebx),%mm2 139 movd 60(%ebx),%mm2 /* mm2 = tmp1[0x0F] */
134 punpckldq 56(%ebx),%mm2 140 punpckldq 56(%ebx),%mm2 /* mm2 = tmp1[0x0F] | tmp1[0x0E] */
135 movq 0(%eax),%mm3 141 movq 0(%eax),%mm3 /* mm3 = pnts[0] | pnts[1] */
136 pfadd %mm2,%mm0 142 pfadd %mm2,%mm0 /* mm0 = tmp1[0]+tmp1[0x0F]|tmp1[1]+tmp1[0x0E]*/
137 movq %mm0,0(%esi) 143 movq %mm0,0(%esi) /* tmp2[0, 1] = mm0 */
138 pfsub %mm2,%mm1 144 pfsub %mm2,%mm1 /* mm1 = tmp1[0]-tmp1[0x0F]|tmp1[1]-tmp1[0x0E]*/
139 pfmul %mm3,%mm1 145 pfmul %mm3,%mm1 /* mm1 = (tmp1[0]-tmp1[0x0F])*pnts[0]|(tmp1[1]-tmp1[0x0E])*pnts[1]*/
140 pswapd %mm1, %mm1 146 pswapd %mm1, %mm1 /* mm1 = (tmp1[1]-tmp1[0x0E])*pnts[1]|(tmp1[0]-tmp1[0x0F])*pnts[0]*/
141 movq %mm1, 56(%esi) 147 movq %mm1, 56(%esi) /* tmp2[0x0E, 0x0F] = mm1 */
142 / 16, 30 148 / 16, 30
143 movq 64(%ebx),%mm0 149 movq 64(%ebx),%mm0
144 movq %mm0,%mm1 150 movq %mm0,%mm1
145 movd 124(%ebx),%mm2 151 movd 124(%ebx),%mm2
146 punpckldq 120(%ebx),%mm2 152 punpckldq 120(%ebx),%mm2
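The "/ 0, 14" group above repeats the phase-1 butterfly at half width. Assuming the sibling groups elided from this excerpt continue the same pattern over the first sixteen temporaries (costab2 is an illustrative name for the table at pnts[1]), the scalar form would be:

    for (int j = 0; j < 8; j++) {
        tmp2[j]      = tmp1[j] + tmp1[15 - j];
        tmp2[15 - j] = (tmp1[j] - tmp1[15 - j]) * costab2[j];
    }

Only j = 0 and 1 (the "/ 0, 14" group) are fully visible here, so the loop bound is an assumption.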
312 movq %mm5,104(%ebx) 318 movq %mm5,104(%ebx)
313 pswapd %mm6, %mm6 319 pswapd %mm6, %mm6
314 movq %mm6, 112(%ebx) 320 movq %mm6, 112(%ebx)
315 321
316 // 4 322 // 4
317 movl pnts+12,%eax 323 movl pnts+12,%eax
318 movq 0(%eax),%mm0 324 movq 0(%eax),%mm0 /* mm0 = pnts[3] | pnts[4] */
319 movq 0(%ebx),%mm1 325 movq 0(%ebx),%mm1 /* mm1 = tmp1[0] | tmp1[1] */
320 / 0 326 / 0
321 movq %mm1,%mm2 327 movq %mm1,%mm2
322 movd 12(%ebx),%mm3 328 movd 12(%ebx),%mm3 /* mm3 = tmp1[3] */
323 punpckldq 8(%ebx),%mm3 329 punpckldq 8(%ebx),%mm3 /* mm3 = tmp1[3] | tmp1[2] */
324 pfadd %mm3,%mm1 330 pfadd %mm3,%mm1 /* mm1 = tmp1[0]+tmp1[3] | tmp1[1]+tmp1[2]*/
325 pfsub %mm3,%mm2 331 pfsub %mm3,%mm2 /* mm2 = tmp1[0]-tmp1[3] | tmp1[1]-tmp1[2]*/
326 pfmul %mm0,%mm2 332 pfmul %mm0,%mm2 /* mm2 = (tmp1[0]-tmp1[3])*pnts[3]|(tmp1[1]-tmp1[2])*pnts[4]*/
327 movq %mm1,0(%esi) 333 movq %mm1,0(%esi) /* tmp2[0, 1] = mm1 */
328 pswapd %mm2, %mm2 334 pswapd %mm2, %mm2 /* mm2 = (tmp1[1]-tmp1[2])*pnts[4]|(tmp1[0]-tmp1[3])*pnts[3] */
329 movq %mm2, 8(%esi) 335 movq %mm2, 8(%esi) /* tmp2[2, 3] = mm2 */
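Written out in scalar form, the "/ 0" group just stored (pnts[3] and pnts[4] being the names the code's own comments use for the two floats loaded from the pnts+12 table):

    tmp2[0] = tmp1[0] + tmp1[3];
    tmp2[1] = tmp1[1] + tmp1[2];
    tmp2[2] = (tmp1[1] - tmp1[2]) * pnts[4];
    tmp2[3] = (tmp1[0] - tmp1[3]) * pnts[3];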
330 movq 16(%ebx),%mm4 336 movq 16(%ebx),%mm4
331 / 4 337 / 4
332 movq %mm4,%mm5 338 movq %mm4,%mm5
333 movd 28(%ebx),%mm6 339 movd 28(%ebx),%mm6
334 punpckldq 24(%ebx),%mm6 340 punpckldq 24(%ebx),%mm6
410 movd %eax,%mm1 416 movd %eax,%mm1
411 movl $1,%eax 417 movl $1,%eax
412 movd %eax,%mm0 418 movd %eax,%mm0
413 / L | H 419 / L | H
414 punpckldq %mm1,%mm0 420 punpckldq %mm1,%mm0
415 pi2fd %mm0,%mm0 421 pi2fd %mm0,%mm0 /* mm0 = 1.0 | -1.0 */
416 / 1.0 | -1.0
417 movd %eax,%mm1 422 movd %eax,%mm1
418 pi2fd %mm1,%mm1 423 pi2fd %mm1,%mm1
419 movl pnts+16,%eax 424 movl pnts+16,%eax
420 movd 0(%eax),%mm2 425 movd 0(%eax),%mm2
421 punpckldq %mm2,%mm1 426 punpckldq %mm2,%mm1 /* mm1 = 1.0 | cos0 */
422 / 1.0 | cos0 427 movq 0(%esi),%mm2 /* mm2 = tmp2[0] | tmp2[1] */
423 movq 0(%esi),%mm2
424 / 0 428 / 0
425 movq %mm2,%mm3 429 pfpnacc %mm2, %mm2
426 pfmul %mm0,%mm3 430 pswapd %mm2, %mm2 /* mm2 = tmp2[0]+tmp2[1]|tmp2[0]-tmp2[1]*/
427 pfacc %mm3,%mm2 431 pfmul %mm1,%mm2 /* mm2 = tmp2[0]+tmp2[1]|(tmp2[0]-tmp2[1])*cos0*/
428 pfmul %mm1,%mm2 432 movq %mm2,0(%ebx) /* tmp1[0, 1] = mm2 */
429 movq %mm2,0(%ebx) 433 movq 8(%esi),%mm4 /* mm4 = tmp2[2] | tmp2[3]*/
430 movq 8(%esi),%mm4 434 pfpnacc %mm4, %mm4
431 movq %mm4,%mm5 435 pswapd %mm4, %mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/
432 pfmul %mm0,%mm5 436 pfmul %mm0,%mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/
433 pfacc %mm5,%mm4 437 pfmul %mm1,%mm4 /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/
434 pfmul %mm0,%mm4 438 movq %mm4,%mm5
435 pfmul %mm1,%mm4 439 psrlq $32,%mm5 /* mm5 = (tmp2[3]-tmp2[2])*cos0 */
436 movq %mm4,%mm5 440 pfacc %mm5,%mm4 /* mm4 = tmp2[2]+tmp2[3]+(tmp2[3]-tmp2[2])*cos0|(tmp2[3]-tmp2[2])*cos0*/
437 psrlq $32,%mm5 441 movq %mm4,8(%ebx) /* tmp1[2, 3] = mm4 */
438 pfacc %mm5,%mm4
439 movq %mm4,8(%ebx)
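Net effect of the "/ 0" group above in scalar form, with cos0 the float loaded from the pnts+16 table:

    tmp1[0] = tmp2[0] + tmp2[1];
    tmp1[1] = (tmp2[0] - tmp2[1]) * cos0;
    tmp1[2] = (tmp2[2] + tmp2[3]) + (tmp2[3] - tmp2[2]) * cos0;
    tmp1[3] = (tmp2[3] - tmp2[2]) * cos0;

Here the pfpnacc/pswapd pair replaces the old movq/pfmul/pfacc triple; the psrlq/pfacc tail then folds the scaled difference into tmp1[2].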
440 movq 16(%esi),%mm2 442 movq 16(%esi),%mm2
441 / 4 443 / 4
442 movq %mm2,%mm3 444 pfpnacc %mm2, %mm2
443 pfmul %mm0,%mm3 445 pswapd %mm2, %mm2
444 pfacc %mm3,%mm2 446
445 pfmul %mm1,%mm2 447 pfmul %mm1,%mm2
446 movq 24(%esi),%mm4 448 movq 24(%esi),%mm4
447 movq %mm4,%mm5 449 pfpnacc %mm4, %mm4
448 pfmul %mm0,%mm5 450 pswapd %mm4, %mm4
449 pfacc %mm5,%mm4 451
450 pfmul %mm0,%mm4 452 pfmul %mm0,%mm4
451 pfmul %mm1,%mm4 453 pfmul %mm1,%mm4
452 movq %mm4,%mm5 454 movq %mm4,%mm5
453 psrlq $32,%mm5 455 psrlq $32,%mm5
454 pfacc %mm5,%mm4 456 pfacc %mm5,%mm4
458 pfadd %mm3,%mm4 460 pfadd %mm3,%mm4
459 movq %mm2,16(%ebx) 461 movq %mm2,16(%ebx)
460 movq %mm4,24(%ebx) 462 movq %mm4,24(%ebx)
461 movq 32(%esi),%mm2 463 movq 32(%esi),%mm2
462 / 8 464 / 8
463 movq %mm2,%mm3 465 pfpnacc %mm2, %mm2
464 pfmul %mm0,%mm3 466 pswapd %mm2, %mm2
465 pfacc %mm3,%mm2 467
466 pfmul %mm1,%mm2 468 pfmul %mm1,%mm2
467 movq %mm2,32(%ebx) 469 movq %mm2,32(%ebx)
468 movq 40(%esi),%mm4 470 movq 40(%esi),%mm4
469 movq %mm4,%mm5 471 pfpnacc %mm4, %mm4
470 pfmul %mm0,%mm5 472 pswapd %mm4, %mm4
471 pfacc %mm5,%mm4
472 pfmul %mm0,%mm4 473 pfmul %mm0,%mm4
473 pfmul %mm1,%mm4 474 pfmul %mm1,%mm4
474 movq %mm4,%mm5 475 movq %mm4,%mm5
475 psrlq $32,%mm5 476 psrlq $32,%mm5
476 pfacc %mm5,%mm4 477 pfacc %mm5,%mm4
477 movq %mm4,40(%ebx) 478 movq %mm4,40(%ebx)
478 movq 48(%esi),%mm2 479 movq 48(%esi),%mm2
479 / 12 480 / 12
480 movq %mm2,%mm3 481 pfpnacc %mm2, %mm2
481 pfmul %mm0,%mm3 482 pswapd %mm2, %mm2
482 pfacc %mm3,%mm2
483 pfmul %mm1,%mm2 483 pfmul %mm1,%mm2
484 movq 56(%esi),%mm4 484 movq 56(%esi),%mm4
485 movq %mm4,%mm5 485 pfpnacc %mm4, %mm4
486 pfmul %mm0,%mm5 486 pswapd %mm4, %mm4
487 pfacc %mm5,%mm4
488 pfmul %mm0,%mm4 487 pfmul %mm0,%mm4
489 pfmul %mm1,%mm4 488 pfmul %mm1,%mm4
490 movq %mm4,%mm5 489 movq %mm4,%mm5
491 psrlq $32,%mm5 490 psrlq $32,%mm5
492 pfacc %mm5,%mm4 491 pfacc %mm5,%mm4
496 pfadd %mm3,%mm4 495 pfadd %mm3,%mm4
497 movq %mm2,48(%ebx) 496 movq %mm2,48(%ebx)
498 movq %mm4,56(%ebx) 497 movq %mm4,56(%ebx)
499 movq 64(%esi),%mm2 498 movq 64(%esi),%mm2
500 / 16 499 / 16
501 movq %mm2,%mm3 500 pfpnacc %mm2, %mm2
502 pfmul %mm0,%mm3 501 pswapd %mm2, %mm2
503 pfacc %mm3,%mm2
504 pfmul %mm1,%mm2 502 pfmul %mm1,%mm2
505 movq %mm2,64(%ebx) 503 movq %mm2,64(%ebx)
506 movq 72(%esi),%mm4 504 movq 72(%esi),%mm4
507 movq %mm4,%mm5 505 pfpnacc %mm4, %mm4
508 pfmul %mm0,%mm5 506 pswapd %mm4, %mm4
509 pfacc %mm5,%mm4
510 pfmul %mm0,%mm4 507 pfmul %mm0,%mm4
511 pfmul %mm1,%mm4 508 pfmul %mm1,%mm4
512 movq %mm4,%mm5 509 movq %mm4,%mm5
513 psrlq $32,%mm5 510 psrlq $32,%mm5
514 pfacc %mm5,%mm4 511 pfacc %mm5,%mm4
515 movq %mm4,72(%ebx) 512 movq %mm4,72(%ebx)
516 movq 80(%esi),%mm2 513 movq 80(%esi),%mm2
517 / 20 514 / 20
518 movq %mm2,%mm3 515 pfpnacc %mm2, %mm2
519 pfmul %mm0,%mm3 516 pswapd %mm2, %mm2
520 pfacc %mm3,%mm2
521 pfmul %mm1,%mm2 517 pfmul %mm1,%mm2
522 movq 88(%esi),%mm4 518 movq 88(%esi),%mm4
523 movq %mm4,%mm5 519 pfpnacc %mm4, %mm4
524 pfmul %mm0,%mm5 520 pswapd %mm4, %mm4
525 pfacc %mm5,%mm4
526 pfmul %mm0,%mm4 521 pfmul %mm0,%mm4
527 pfmul %mm1,%mm4 522 pfmul %mm1,%mm4
528 movq %mm4,%mm5 523 movq %mm4,%mm5
529 psrlq $32,%mm5 524 psrlq $32,%mm5
530 pfacc %mm5,%mm4 525 pfacc %mm5,%mm4
534 pfadd %mm3,%mm4 529 pfadd %mm3,%mm4
535 movq %mm2,80(%ebx) 530 movq %mm2,80(%ebx)
536 movq %mm4,88(%ebx) 531 movq %mm4,88(%ebx)
537 movq 96(%esi),%mm2 532 movq 96(%esi),%mm2
538 / 24 533 / 24
539 movq %mm2,%mm3 534 pfpnacc %mm2, %mm2
540 pfmul %mm0,%mm3 535 pswapd %mm2, %mm2
541 pfacc %mm3,%mm2
542 pfmul %mm1,%mm2 536 pfmul %mm1,%mm2
543 movq %mm2,96(%ebx) 537 movq %mm2,96(%ebx)
544 movq 104(%esi),%mm4 538 movq 104(%esi),%mm4
545 movq %mm4,%mm5 539 pfpnacc %mm4, %mm4
546 pfmul %mm0,%mm5 540 pswapd %mm4, %mm4
547 pfacc %mm5,%mm4
548 pfmul %mm0,%mm4 541 pfmul %mm0,%mm4
549 pfmul %mm1,%mm4 542 pfmul %mm1,%mm4
550 movq %mm4,%mm5 543 movq %mm4,%mm5
551 psrlq $32,%mm5 544 psrlq $32,%mm5
552 pfacc %mm5,%mm4 545 pfacc %mm5,%mm4
553 movq %mm4,104(%ebx) 546 movq %mm4,104(%ebx)
554 movq 112(%esi),%mm2 547 movq 112(%esi),%mm2
555 / 28 548 / 28
556 movq %mm2,%mm3 549 pfpnacc %mm2, %mm2
557 pfmul %mm0,%mm3 550 pswapd %mm2, %mm2
558 pfacc %mm3,%mm2
559 pfmul %mm1,%mm2 551 pfmul %mm1,%mm2
560 movq 120(%esi),%mm4 552 movq 120(%esi),%mm4
561 movq %mm4,%mm5 553 pfpnacc %mm4, %mm4
562 pfmul %mm0,%mm5 554 pswapd %mm4, %mm4
563 pfacc %mm5,%mm4
564 pfmul %mm0,%mm4 555 pfmul %mm0,%mm4
565 pfmul %mm1,%mm4 556 pfmul %mm1,%mm4
566 movq %mm4,%mm5 557 movq %mm4,%mm5
567 psrlq $32,%mm5 558 psrlq $32,%mm5
568 pfacc %mm5,%mm4 559 pfacc %mm5,%mm4
572 pfadd %mm3,%mm4 563 pfadd %mm3,%mm4
573 movq %mm2,112(%ebx) 564 movq %mm2,112(%ebx)
574 movq %mm4,120(%ebx) 565 movq %mm4,120(%ebx)
575 566
576 // Phase6 567 // Phase6
577 movl 0(%ebx),%eax 568 movd 0(%ebx),%mm0
578 movl %eax,1024(%ebp) 569 movd %mm0,1024(%ebp)
579 movl 4(%ebx),%eax 570 movl 4(%ebx),%eax
580 movl %eax,0(%ebp) 571 movl %eax,0(%ebp)
581 movl %eax,0(%edx) 572 movl %eax,0(%edx)
582 movl 8(%ebx),%eax 573 movd 8(%ebx),%mm2
583 movl %eax,512(%ebp) 574 movd %mm2,512(%ebp)
584 movl 12(%ebx),%eax 575 movd 12(%ebx),%mm3
585 movl %eax,512(%edx) 576 movd %mm3,512(%edx)
586 577
587 movl 16(%ebx),%eax 578 movl 16(%ebx),%eax
588 movl %eax,768(%ebp) 579 movl %eax,768(%ebp)
589 movl 20(%ebx),%eax 580 movd 20(%ebx),%mm5
590 movl %eax,256(%edx) 581 movd %mm5,256(%edx)
591 582
592 movl 24(%ebx),%eax 583 movd 24(%ebx),%mm6
593 movl %eax,256(%ebp) 584 movd %mm6,256(%ebp)
594 movl 28(%ebx),%eax 585 movd 28(%ebx),%mm7
595 movl %eax,768(%edx) 586 movd %mm7,768(%edx)
596 587
597 movq 32(%ebx),%mm0 588 movq 32(%ebx),%mm0 /* mm0 = tmp1[8] | tmp1[9] */
598 movq 48(%ebx),%mm1 589 movq 48(%ebx),%mm1 /* mm1 = tmp1[12] | tmp1[13] */
599 pfadd %mm1,%mm0 590 pfadd %mm1,%mm0 /* mm0 = tmp1[8]+tmp1[12]| tmp1[9]+tmp1[13]*/
600 movd %mm0,896(%ebp) 591 movd %mm0,896(%ebp) /* a[0xE0] = tmp1[8]+tmp1[12] */
601 psrlq $32,%mm0 592 psrlq $32,%mm0
602 movd %mm0,128(%edx) 593 movd %mm0,128(%edx) /* b[0x20] = tmp1[9]+tmp1[13] */
603 movq 40(%ebx),%mm2 594 movq 40(%ebx),%mm2
604 pfadd %mm2,%mm1 595 pfadd %mm2,%mm1
605 movd %mm1,640(%ebp) 596 movd %mm1,640(%ebp)
606 psrlq $32,%mm1 597 psrlq $32,%mm1
607 movd %mm1,384(%edx) 598 movd %mm1,384(%edx)
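In scalar terms, the Phase 6 fragment above scatters tmp1 into the two output buffers; array indices are the byte offsets divided by 4, assuming real is a 4-byte float:

    a[256] = tmp1[0];
    a[0]   = b[0] = tmp1[1];
    a[128] = tmp1[2];   b[128] = tmp1[3];
    a[192] = tmp1[4];   b[64]  = tmp1[5];
    a[64]  = tmp1[6];   b[192] = tmp1[7];
    a[224] = tmp1[8]  + tmp1[12];
    b[32]  = tmp1[9]  + tmp1[13];
    a[160] = tmp1[12] + tmp1[10];
    b[96]  = tmp1[13] + tmp1[11];

Interleaving plain movl stores with movd/MMX stores is the mix of CPU and MMX opcodes the header describes, presumably so the integer and MMX pipes can issue stores in parallel.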
677 popl %esi 668 popl %esi
678 popl %edi 669 popl %edi
679 popl %ebp 670 popl %ebp
680 addl $256,%esp 671 addl $256,%esp
681 672
682 ret 673 ret $12
683 674