Mercurial > mplayer.hg
comparison mp3lib/dct64_k7.s @ 781:ee303142c2e0
improvements.
author | nickols_k |
---|---|
date | Sun, 13 May 2001 14:36:02 +0000 |
parents | 59b0a9ec8604 |
children | 3c53cbf53e7e |
comparison
equal
deleted
inserted
replaced
780:24e4e6e5aa1c | 781:ee303142c2e0 |
---|---|
2 /// Replacement of dct64() with AMD's 3DNowEx(DSP)! SIMD operations support | 2 /// Replacement of dct64() with AMD's 3DNowEx(DSP)! SIMD operations support |
3 /// | 3 /// |
4 /// This code is based on 'dct64_3dnow.s' by Syuuhei Kashiyama | 4 /// This code is based on 'dct64_3dnow.s' by Syuuhei Kashiyama |
5 /// <squash@mb.kcom.ne.jp>,only some types of changes have been made: | 5 /// <squash@mb.kcom.ne.jp>,only some types of changes have been made: |
6 /// | 6 /// |
7 /// - added new opcode PSWAPD | 7 /// - added new opcodes PSWAPD, PFPNACC |
8 /// - decreased number of opcodes (as it was suggested by k7 manual) | |
9 /// (using memory reference as operand of instructions) | |
10 /// - Phase 6 is rewritten with mixing of cpu and mmx opcodes | |
8 /// - change function name for support 3DNowEx! automatic detect | 11 /// - change function name for support 3DNowEx! automatic detect |
9 /// | 12 /// |
10 /// note: because K7 processors are aggressive out-of-order three-way | 13 /// note: because K7 processors are aggressive out-of-order three-way |
11 /// superscalar ones, instruction order is not significant for them. | 14 /// superscalar ones, instruction order is not significant for them. |
12 /// | 15 /// |
18 /// this program. Use it at your own risk. | 21 /// this program. Use it at your own risk. |
19 /// | 22 /// |
20 | 23 |
21 .globl dct64_3dnowex | 24 .globl dct64_3dnowex |
22 .type dct64_3dnowex,@function | 25 .type dct64_3dnowex,@function |
26 | |
27 /* Discrete Cosine Transform (DCT) for subband synthesis */ | |
28 /* void dct64(real *a,real *b,real *c) */ | |
23 dct64_3dnowex: | 29 dct64_3dnowex: |
24 subl $256,%esp | 30 subl $256,%esp |
25 pushl %ebp | 31 pushl %ebp |
26 pushl %edi | 32 pushl %edi |
27 pushl %esi | 33 pushl %esi |
28 pushl %ebx | 34 pushl %ebx |
29 leal 16(%esp),%ebx | 35 leal 16(%esp),%ebx /* ebx -> real tmp1[32] */ |
30 movl 284(%esp),%edi | 36 movl 284(%esp),%edi /* edi -> c */ |
31 movl 276(%esp),%ebp | 37 movl 276(%esp),%ebp /* ebp -> a */ |
32 movl 280(%esp),%edx | 38 movl 280(%esp),%edx /* edx -> b */ |
33 leal 128(%ebx),%esi | 39 leal 128(%ebx),%esi /* esi -> real tmp2[32] */ |
34 | 40 |
35 / femms | 41 / femms |
36 | 42 |
37 // 1 | 43 // 1 |
38 movl pnts,%eax | 44 movl pnts,%eax |
39 movq 0(%edi),%mm0 | 45 |
40 movq %mm0,%mm1 | 46 movq 0(%edi),%mm0 /* mm0 = c[0x00] | c[0x01]*/ |
41 movd 124(%edi),%mm2 | 47 movq %mm0,%mm1 /* mm1 = mm0 */ |
42 punpckldq 120(%edi),%mm2 | 48 movd 124(%edi),%mm2 /* mm2 = c[0x1f] */ |
43 movq 0(%eax),%mm3 | 49 punpckldq 120(%edi),%mm2 /* mm2 = c[0x1f] | c[0x1E] */ |
44 pfadd %mm2,%mm0 | 50 pfadd %mm2,%mm0 /* mm0 = c[0x00]+c[0x1F] | c[0x1E]+c[0x01] */ |
45 movq %mm0,0(%ebx) | 51 movq %mm0,0(%ebx) /* tmp[0, 1] = mm0 */ |
46 pfsub %mm2,%mm1 | 52 pfsub %mm2,%mm1 /* c[0x00]-c[0x1f] | c[0x01]-c[0x1e] */ |
47 pfmul %mm3,%mm1 | 53 pfmul 0(%eax),%mm1 /* (c[0x00]-c[0x1f])*pnts[0]|(c[0x01]-c[0x1e])*pnts[1]*/ |
48 pswapd %mm1, %mm1 | 54 pswapd %mm1, %mm1 /* (c[0x01]-c[0x1e])*pnts[1]|(c[0x00]-c[0x1f])*pnts[0]*/ |
49 movq %mm1, 120(%ebx) | 55 movq %mm1, 120(%ebx) /* tmp1[30, 31]=mm1 */ |
56 | |
50 movq 8(%edi),%mm4 | 57 movq 8(%edi),%mm4 |
51 movq %mm4,%mm5 | 58 movq %mm4,%mm5 |
52 movd 116(%edi),%mm6 | 59 movd 116(%edi),%mm6 |
53 punpckldq 112(%edi),%mm6 | 60 punpckldq 112(%edi),%mm6 |
54 movq 8(%eax),%mm7 | |
55 pfadd %mm6,%mm4 | 61 pfadd %mm6,%mm4 |
56 movq %mm4,8(%ebx) | 62 movq %mm4,8(%ebx) |
57 pfsub %mm6,%mm5 | 63 pfsub %mm6,%mm5 |
58 pfmul %mm7,%mm5 | 64 pfmul 8(%eax),%mm5 |
59 pswapd %mm5, %mm5 | 65 pswapd %mm5, %mm5 |
60 movq %mm5, 112(%ebx) | 66 movq %mm5, 112(%ebx) |
67 | |
61 movq 16(%edi),%mm0 | 68 movq 16(%edi),%mm0 |
62 movq %mm0,%mm1 | 69 movq %mm0,%mm1 |
63 movd 108(%edi),%mm2 | 70 movd 108(%edi),%mm2 |
64 punpckldq 104(%edi),%mm2 | 71 punpckldq 104(%edi),%mm2 |
65 movq 16(%eax),%mm3 | |
66 pfadd %mm2,%mm0 | 72 pfadd %mm2,%mm0 |
67 movq %mm0,16(%ebx) | 73 movq %mm0,16(%ebx) |
68 pfsub %mm2,%mm1 | 74 pfsub %mm2,%mm1 |
69 pfmul %mm3,%mm1 | 75 pfmul 16(%eax),%mm1 |
70 pswapd %mm1, %mm1 | 76 pswapd %mm1, %mm1 |
71 movq %mm1, 104(%ebx) | 77 movq %mm1, 104(%ebx) |
78 | |
72 movq 24(%edi),%mm4 | 79 movq 24(%edi),%mm4 |
73 movq %mm4,%mm5 | 80 movq %mm4,%mm5 |
74 movd 100(%edi),%mm6 | 81 movd 100(%edi),%mm6 |
75 punpckldq 96(%edi),%mm6 | 82 punpckldq 96(%edi),%mm6 |
76 movq 24(%eax),%mm7 | |
77 pfadd %mm6,%mm4 | 83 pfadd %mm6,%mm4 |
78 movq %mm4,24(%ebx) | 84 movq %mm4,24(%ebx) |
79 pfsub %mm6,%mm5 | 85 pfsub %mm6,%mm5 |
80 pfmul %mm7,%mm5 | 86 pfmul 24(%eax),%mm5 |
81 pswapd %mm5, %mm5 | 87 pswapd %mm5, %mm5 |
82 movq %mm5, 96(%ebx) | 88 movq %mm5, 96(%ebx) |
89 | |
83 movq 32(%edi),%mm0 | 90 movq 32(%edi),%mm0 |
84 movq %mm0,%mm1 | 91 movq %mm0,%mm1 |
85 movd 92(%edi),%mm2 | 92 movd 92(%edi),%mm2 |
86 punpckldq 88(%edi),%mm2 | 93 punpckldq 88(%edi),%mm2 |
87 movq 32(%eax),%mm3 | |
88 pfadd %mm2,%mm0 | 94 pfadd %mm2,%mm0 |
89 movq %mm0,32(%ebx) | 95 movq %mm0,32(%ebx) |
90 pfsub %mm2,%mm1 | 96 pfsub %mm2,%mm1 |
91 pfmul %mm3,%mm1 | 97 pfmul 32(%eax),%mm1 |
92 pswapd %mm1, %mm1 | 98 pswapd %mm1, %mm1 |
93 movq %mm1, 88(%ebx) | 99 movq %mm1, 88(%ebx) |
100 | |
94 movq 40(%edi),%mm4 | 101 movq 40(%edi),%mm4 |
95 movq %mm4,%mm5 | 102 movq %mm4,%mm5 |
96 movd 84(%edi),%mm6 | 103 movd 84(%edi),%mm6 |
97 punpckldq 80(%edi),%mm6 | 104 punpckldq 80(%edi),%mm6 |
98 movq 40(%eax),%mm7 | |
99 pfadd %mm6,%mm4 | 105 pfadd %mm6,%mm4 |
100 movq %mm4,40(%ebx) | 106 movq %mm4,40(%ebx) |
101 pfsub %mm6,%mm5 | 107 pfsub %mm6,%mm5 |
102 pfmul %mm7,%mm5 | 108 pfmul 40(%eax),%mm5 |
103 pswapd %mm5, %mm5 | 109 pswapd %mm5, %mm5 |
104 movq %mm5, 80(%ebx) | 110 movq %mm5, 80(%ebx) |
111 | |
105 movq 48(%edi),%mm0 | 112 movq 48(%edi),%mm0 |
106 movq %mm0,%mm1 | 113 movq %mm0,%mm1 |
107 movd 76(%edi),%mm2 | 114 movd 76(%edi),%mm2 |
108 punpckldq 72(%edi),%mm2 | 115 punpckldq 72(%edi),%mm2 |
109 movq 48(%eax),%mm3 | |
110 pfadd %mm2,%mm0 | 116 pfadd %mm2,%mm0 |
111 movq %mm0,48(%ebx) | 117 movq %mm0,48(%ebx) |
112 pfsub %mm2,%mm1 | 118 pfsub %mm2,%mm1 |
113 pfmul %mm3,%mm1 | 119 pfmul 48(%eax),%mm1 |
114 pswapd %mm1, %mm1 | 120 pswapd %mm1, %mm1 |
115 movq %mm1, 72(%ebx) | 121 movq %mm1, 72(%ebx) |
122 | |
116 movq 56(%edi),%mm4 | 123 movq 56(%edi),%mm4 |
117 movq %mm4,%mm5 | 124 movq %mm4,%mm5 |
118 movd 68(%edi),%mm6 | 125 movd 68(%edi),%mm6 |
119 punpckldq 64(%edi),%mm6 | 126 punpckldq 64(%edi),%mm6 |
120 movq 56(%eax),%mm7 | |
121 pfadd %mm6,%mm4 | 127 pfadd %mm6,%mm4 |
122 movq %mm4,56(%ebx) | 128 movq %mm4,56(%ebx) |
123 pfsub %mm6,%mm5 | 129 pfsub %mm6,%mm5 |
124 pfmul %mm7,%mm5 | 130 pfmul 56(%eax),%mm5 |
125 pswapd %mm5, %mm5 | 131 pswapd %mm5, %mm5 |
126 movq %mm5, 64(%ebx) | 132 movq %mm5, 64(%ebx) |
127 | 133 |
128 // 2 | 134 // 2 |
129 movl pnts+4,%eax | 135 movl pnts+4,%eax |
130 / 0, 14 | 136 / 0, 14 |
131 movq 0(%ebx),%mm0 | 137 movq 0(%ebx),%mm0 /* mm0 = tmp1[0] | tmp1[1] */ |
132 movq %mm0,%mm1 | 138 movq %mm0,%mm1 |
133 movd 60(%ebx),%mm2 | 139 movd 60(%ebx),%mm2 /* mm2 = tmp1[0x0F] */ |
134 punpckldq 56(%ebx),%mm2 | 140 punpckldq 56(%ebx),%mm2 /* mm2 = tmp1[0x0E] | tmp1[0x0F] */ |
135 movq 0(%eax),%mm3 | 141 movq 0(%eax),%mm3 /* mm3 = pnts[0] | pnts[1] */ |
136 pfadd %mm2,%mm0 | 142 pfadd %mm2,%mm0 /* mm0 = tmp1[0]+tmp1[0x0F]|tmp1[1]+tmp1[0x0E]*/ |
137 movq %mm0,0(%esi) | 143 movq %mm0,0(%esi) /* tmp2[0, 1] = mm0 */ |
138 pfsub %mm2,%mm1 | 144 pfsub %mm2,%mm1 /* mm1 = tmp1[0]-tmp1[0x0F]|tmp1[1]-tmp1[0x0E]*/ |
139 pfmul %mm3,%mm1 | 145 pfmul %mm3,%mm1 /* mm1 = (tmp1[0]-tmp1[0x0F])*pnts[0]|(tmp1[1]-tmp1[0x0E])*pnts[1]*/ |
140 pswapd %mm1, %mm1 | 146 pswapd %mm1, %mm1 /* mm1 = (tmp1[1]-tmp1[0x0E])*pnts[1]|(tmp1[0]-tmp1[0x0F])*pnts[0]*/ |
141 movq %mm1, 56(%esi) | 147 movq %mm1, 56(%esi) /* tmp2[0x0E, 0x0F] = mm1 */ |
142 / 16, 30 | 148 / 16, 30 |
143 movq 64(%ebx),%mm0 | 149 movq 64(%ebx),%mm0 |
144 movq %mm0,%mm1 | 150 movq %mm0,%mm1 |
145 movd 124(%ebx),%mm2 | 151 movd 124(%ebx),%mm2 |
146 punpckldq 120(%ebx),%mm2 | 152 punpckldq 120(%ebx),%mm2 |
312 movq %mm5,104(%ebx) | 318 movq %mm5,104(%ebx) |
313 pswapd %mm6, %mm6 | 319 pswapd %mm6, %mm6 |
314 movq %mm6, 112(%ebx) | 320 movq %mm6, 112(%ebx) |
315 | 321 |
316 // 4 | 322 // 4 |
317 movl pnts+12,%eax | 323 movl pnts+12,%eax |
318 movq 0(%eax),%mm0 | 324 movq 0(%eax),%mm0 /* mm0 = pnts[3] | pnts[4] */ |
319 movq 0(%ebx),%mm1 | 325 movq 0(%ebx),%mm1 /* mm1 = tmp1[0] | tmp1[1] */ |
320 / 0 | 326 / 0 |
321 movq %mm1,%mm2 | 327 movq %mm1,%mm2 |
322 movd 12(%ebx),%mm3 | 328 movd 12(%ebx),%mm3 /* mm3 = tmp1[3] */ |
323 punpckldq 8(%ebx),%mm3 | 329 punpckldq 8(%ebx),%mm3 /* mm3 = tmp1[3] | tmp1[2] */ |
324 pfadd %mm3,%mm1 | 330 pfadd %mm3,%mm1 /* mm1 = tmp1[0]+tmp1[3] | tmp1[1]+tmp1[2]*/ |
325 pfsub %mm3,%mm2 | 331 pfsub %mm3,%mm2 /* mm2 = tmp1[0]-tmp1[3] | tmp1[0]-tmp1[2]*/ |
326 pfmul %mm0,%mm2 | 332 pfmul %mm0,%mm2 /* mm2 = tmp1[0]-tmp1[3]*pnts[3]|tmp1[0]-tmp1[2]*pnts[4]*/ |
327 movq %mm1,0(%esi) | 333 movq %mm1,0(%esi) /* tmp2[0, 1] = mm1 */ |
328 pswapd %mm2, %mm2 | 334 pswapd %mm2, %mm2 /* mm2 = tmp1[0]-tmp1[2]*pnts[4]|tmp1[0]-tmp1[3]*pnts[3] */ |
329 movq %mm2, 8(%esi) | 335 movq %mm2, 8(%esi) /* tmp2[2, 3] = mm2 */ |
330 movq 16(%ebx),%mm4 | 336 movq 16(%ebx),%mm4 |
331 / 4 | 337 / 4 |
332 movq %mm4,%mm5 | 338 movq %mm4,%mm5 |
333 movd 28(%ebx),%mm6 | 339 movd 28(%ebx),%mm6 |
334 punpckldq 24(%ebx),%mm6 | 340 punpckldq 24(%ebx),%mm6 |
410 movd %eax,%mm1 | 416 movd %eax,%mm1 |
411 movl $1,%eax | 417 movl $1,%eax |
412 movd %eax,%mm0 | 418 movd %eax,%mm0 |
413 / L | H | 419 / L | H |
414 punpckldq %mm1,%mm0 | 420 punpckldq %mm1,%mm0 |
415 pi2fd %mm0,%mm0 | 421 pi2fd %mm0,%mm0 /* mm0 = 1.0 | -1.0 */ |
416 / 1.0 | -1.0 | |
417 movd %eax,%mm1 | 422 movd %eax,%mm1 |
418 pi2fd %mm1,%mm1 | 423 pi2fd %mm1,%mm1 |
419 movl pnts+16,%eax | 424 movl pnts+16,%eax |
420 movd 0(%eax),%mm2 | 425 movd 0(%eax),%mm2 |
421 punpckldq %mm2,%mm1 | 426 punpckldq %mm2,%mm1 /* mm1 = 1.0 | cos0 */ |
422 / 1.0 | cos0 | 427 movq 0(%esi),%mm2 /* mm2 = tmp2[0] | tmp2[1] */ |
423 movq 0(%esi),%mm2 | |
424 / 0 | 428 / 0 |
425 movq %mm2,%mm3 | 429 pfpnacc %mm2, %mm2 |
426 pfmul %mm0,%mm3 | 430 pswapd %mm2, %mm2 /* mm2 = tmp2[0]+tmp2[1]|tmp2[0]-tmp2[1]*/ |
427 pfacc %mm3,%mm2 | 431 pfmul %mm1,%mm2 /* mm2 = tmp2[0]+tmp2[1]|(tmp2[0]-tmp2[1])*cos0*/ |
428 pfmul %mm1,%mm2 | 432 movq %mm2,0(%ebx) /* tmp1[0, 1] = mm2 */ |
429 movq %mm2,0(%ebx) | 433 movq 8(%esi),%mm4 /* mm4 = tmp2[2] | tmp2[3]*/ |
430 movq 8(%esi),%mm4 | 434 pfpnacc %mm4, %mm4 |
431 movq %mm4,%mm5 | 435 pswapd %mm4, %mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/ |
432 pfmul %mm0,%mm5 | 436 pfmul %mm0,%mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/ |
433 pfacc %mm5,%mm4 | 437 pfmul %mm1,%mm4 /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/ |
434 pfmul %mm0,%mm4 | 438 movq %mm4,%mm5 |
435 pfmul %mm1,%mm4 | 439 psrlq $32,%mm5 /* mm5 = (tmp2[3]-tmp2[2])*cos0 */ |
436 movq %mm4,%mm5 | 440 pfacc %mm5,%mm4 /* mm4 = tmp2[2]+tmp2[3]+(tmp2[3]-tmp2[2])*cos0|(tmp2[3]-tmp2[2])*cos0*/ |
437 psrlq $32,%mm5 | 441 movq %mm4,8(%ebx) /* tmp1[2, 3] = mm4 */ |
438 pfacc %mm5,%mm4 | |
439 movq %mm4,8(%ebx) | |
440 movq 16(%esi),%mm2 | 442 movq 16(%esi),%mm2 |
441 / 4 | 443 / 4 |
442 movq %mm2,%mm3 | 444 pfpnacc %mm2, %mm2 |
443 pfmul %mm0,%mm3 | 445 pswapd %mm2, %mm2 |
444 pfacc %mm3,%mm2 | 446 |
445 pfmul %mm1,%mm2 | 447 pfmul %mm1,%mm2 |
446 movq 24(%esi),%mm4 | 448 movq 24(%esi),%mm4 |
447 movq %mm4,%mm5 | 449 pfpnacc %mm4, %mm4 |
448 pfmul %mm0,%mm5 | 450 pswapd %mm4, %mm4 |
449 pfacc %mm5,%mm4 | 451 |
450 pfmul %mm0,%mm4 | 452 pfmul %mm0,%mm4 |
451 pfmul %mm1,%mm4 | 453 pfmul %mm1,%mm4 |
452 movq %mm4,%mm5 | 454 movq %mm4,%mm5 |
453 psrlq $32,%mm5 | 455 psrlq $32,%mm5 |
454 pfacc %mm5,%mm4 | 456 pfacc %mm5,%mm4 |
458 pfadd %mm3,%mm4 | 460 pfadd %mm3,%mm4 |
459 movq %mm2,16(%ebx) | 461 movq %mm2,16(%ebx) |
460 movq %mm4,24(%ebx) | 462 movq %mm4,24(%ebx) |
461 movq 32(%esi),%mm2 | 463 movq 32(%esi),%mm2 |
462 / 8 | 464 / 8 |
463 movq %mm2,%mm3 | 465 pfpnacc %mm2, %mm2 |
464 pfmul %mm0,%mm3 | 466 pswapd %mm2, %mm2 |
465 pfacc %mm3,%mm2 | 467 |
466 pfmul %mm1,%mm2 | 468 pfmul %mm1,%mm2 |
467 movq %mm2,32(%ebx) | 469 movq %mm2,32(%ebx) |
468 movq 40(%esi),%mm4 | 470 movq 40(%esi),%mm4 |
469 movq %mm4,%mm5 | 471 pfpnacc %mm4, %mm4 |
470 pfmul %mm0,%mm5 | 472 pswapd %mm4, %mm4 |
471 pfacc %mm5,%mm4 | |
472 pfmul %mm0,%mm4 | 473 pfmul %mm0,%mm4 |
473 pfmul %mm1,%mm4 | 474 pfmul %mm1,%mm4 |
474 movq %mm4,%mm5 | 475 movq %mm4,%mm5 |
475 psrlq $32,%mm5 | 476 psrlq $32,%mm5 |
476 pfacc %mm5,%mm4 | 477 pfacc %mm5,%mm4 |
477 movq %mm4,40(%ebx) | 478 movq %mm4,40(%ebx) |
478 movq 48(%esi),%mm2 | 479 movq 48(%esi),%mm2 |
479 / 12 | 480 / 12 |
480 movq %mm2,%mm3 | 481 pfpnacc %mm2, %mm2 |
481 pfmul %mm0,%mm3 | 482 pswapd %mm2, %mm2 |
482 pfacc %mm3,%mm2 | |
483 pfmul %mm1,%mm2 | 483 pfmul %mm1,%mm2 |
484 movq 56(%esi),%mm4 | 484 movq 56(%esi),%mm4 |
485 movq %mm4,%mm5 | 485 pfpnacc %mm4, %mm4 |
486 pfmul %mm0,%mm5 | 486 pswapd %mm4, %mm4 |
487 pfacc %mm5,%mm4 | |
488 pfmul %mm0,%mm4 | 487 pfmul %mm0,%mm4 |
489 pfmul %mm1,%mm4 | 488 pfmul %mm1,%mm4 |
490 movq %mm4,%mm5 | 489 movq %mm4,%mm5 |
491 psrlq $32,%mm5 | 490 psrlq $32,%mm5 |
492 pfacc %mm5,%mm4 | 491 pfacc %mm5,%mm4 |
496 pfadd %mm3,%mm4 | 495 pfadd %mm3,%mm4 |
497 movq %mm2,48(%ebx) | 496 movq %mm2,48(%ebx) |
498 movq %mm4,56(%ebx) | 497 movq %mm4,56(%ebx) |
499 movq 64(%esi),%mm2 | 498 movq 64(%esi),%mm2 |
500 / 16 | 499 / 16 |
501 movq %mm2,%mm3 | 500 pfpnacc %mm2, %mm2 |
502 pfmul %mm0,%mm3 | 501 pswapd %mm2, %mm2 |
503 pfacc %mm3,%mm2 | |
504 pfmul %mm1,%mm2 | 502 pfmul %mm1,%mm2 |
505 movq %mm2,64(%ebx) | 503 movq %mm2,64(%ebx) |
506 movq 72(%esi),%mm4 | 504 movq 72(%esi),%mm4 |
507 movq %mm4,%mm5 | 505 pfpnacc %mm4, %mm4 |
508 pfmul %mm0,%mm5 | 506 pswapd %mm4, %mm4 |
509 pfacc %mm5,%mm4 | |
510 pfmul %mm0,%mm4 | 507 pfmul %mm0,%mm4 |
511 pfmul %mm1,%mm4 | 508 pfmul %mm1,%mm4 |
512 movq %mm4,%mm5 | 509 movq %mm4,%mm5 |
513 psrlq $32,%mm5 | 510 psrlq $32,%mm5 |
514 pfacc %mm5,%mm4 | 511 pfacc %mm5,%mm4 |
515 movq %mm4,72(%ebx) | 512 movq %mm4,72(%ebx) |
516 movq 80(%esi),%mm2 | 513 movq 80(%esi),%mm2 |
517 / 20 | 514 / 20 |
518 movq %mm2,%mm3 | 515 pfpnacc %mm2, %mm2 |
519 pfmul %mm0,%mm3 | 516 pswapd %mm2, %mm2 |
520 pfacc %mm3,%mm2 | |
521 pfmul %mm1,%mm2 | 517 pfmul %mm1,%mm2 |
522 movq 88(%esi),%mm4 | 518 movq 88(%esi),%mm4 |
523 movq %mm4,%mm5 | 519 pfpnacc %mm4, %mm4 |
524 pfmul %mm0,%mm5 | 520 pswapd %mm4, %mm4 |
525 pfacc %mm5,%mm4 | |
526 pfmul %mm0,%mm4 | 521 pfmul %mm0,%mm4 |
527 pfmul %mm1,%mm4 | 522 pfmul %mm1,%mm4 |
528 movq %mm4,%mm5 | 523 movq %mm4,%mm5 |
529 psrlq $32,%mm5 | 524 psrlq $32,%mm5 |
530 pfacc %mm5,%mm4 | 525 pfacc %mm5,%mm4 |
534 pfadd %mm3,%mm4 | 529 pfadd %mm3,%mm4 |
535 movq %mm2,80(%ebx) | 530 movq %mm2,80(%ebx) |
536 movq %mm4,88(%ebx) | 531 movq %mm4,88(%ebx) |
537 movq 96(%esi),%mm2 | 532 movq 96(%esi),%mm2 |
538 / 24 | 533 / 24 |
539 movq %mm2,%mm3 | 534 pfpnacc %mm2, %mm2 |
540 pfmul %mm0,%mm3 | 535 pswapd %mm2, %mm2 |
541 pfacc %mm3,%mm2 | |
542 pfmul %mm1,%mm2 | 536 pfmul %mm1,%mm2 |
543 movq %mm2,96(%ebx) | 537 movq %mm2,96(%ebx) |
544 movq 104(%esi),%mm4 | 538 movq 104(%esi),%mm4 |
545 movq %mm4,%mm5 | 539 pfpnacc %mm4, %mm4 |
546 pfmul %mm0,%mm5 | 540 pswapd %mm4, %mm4 |
547 pfacc %mm5,%mm4 | |
548 pfmul %mm0,%mm4 | 541 pfmul %mm0,%mm4 |
549 pfmul %mm1,%mm4 | 542 pfmul %mm1,%mm4 |
550 movq %mm4,%mm5 | 543 movq %mm4,%mm5 |
551 psrlq $32,%mm5 | 544 psrlq $32,%mm5 |
552 pfacc %mm5,%mm4 | 545 pfacc %mm5,%mm4 |
553 movq %mm4,104(%ebx) | 546 movq %mm4,104(%ebx) |
554 movq 112(%esi),%mm2 | 547 movq 112(%esi),%mm2 |
555 / 28 | 548 / 28 |
556 movq %mm2,%mm3 | 549 pfpnacc %mm2, %mm2 |
557 pfmul %mm0,%mm3 | 550 pswapd %mm2, %mm2 |
558 pfacc %mm3,%mm2 | |
559 pfmul %mm1,%mm2 | 551 pfmul %mm1,%mm2 |
560 movq 120(%esi),%mm4 | 552 movq 120(%esi),%mm4 |
561 movq %mm4,%mm5 | 553 pfpnacc %mm4, %mm4 |
562 pfmul %mm0,%mm5 | 554 pswapd %mm4, %mm4 |
563 pfacc %mm5,%mm4 | |
564 pfmul %mm0,%mm4 | 555 pfmul %mm0,%mm4 |
565 pfmul %mm1,%mm4 | 556 pfmul %mm1,%mm4 |
566 movq %mm4,%mm5 | 557 movq %mm4,%mm5 |
567 psrlq $32,%mm5 | 558 psrlq $32,%mm5 |
568 pfacc %mm5,%mm4 | 559 pfacc %mm5,%mm4 |
572 pfadd %mm3,%mm4 | 563 pfadd %mm3,%mm4 |
573 movq %mm2,112(%ebx) | 564 movq %mm2,112(%ebx) |
574 movq %mm4,120(%ebx) | 565 movq %mm4,120(%ebx) |
575 | 566 |
576 // Phase6 | 567 // Phase6 |
577 movl 0(%ebx),%eax | 568 movd 0(%ebx),%mm0 |
578 movl %eax,1024(%ebp) | 569 movd %mm0,1024(%ebp) |
579 movl 4(%ebx),%eax | 570 movl 4(%ebx),%eax |
580 movl %eax,0(%ebp) | 571 movl %eax,0(%ebp) |
581 movl %eax,0(%edx) | 572 movl %eax,0(%edx) |
582 movl 8(%ebx),%eax | 573 movd 8(%ebx),%mm2 |
583 movl %eax,512(%ebp) | 574 movd %mm2,512(%ebp) |
584 movl 12(%ebx),%eax | 575 movd 12(%ebx),%mm3 |
585 movl %eax,512(%edx) | 576 movd %mm3,512(%edx) |
586 | 577 |
587 movl 16(%ebx),%eax | 578 movl 16(%ebx),%eax |
588 movl %eax,768(%ebp) | 579 movl %eax,768(%ebp) |
589 movl 20(%ebx),%eax | 580 movd 20(%ebx),%mm5 |
590 movl %eax,256(%edx) | 581 movd %mm5,256(%edx) |
591 | 582 |
592 movl 24(%ebx),%eax | 583 movd 24(%ebx),%mm6 |
593 movl %eax,256(%ebp) | 584 movd %mm6,256(%ebp) |
594 movl 28(%ebx),%eax | 585 movd 28(%ebx),%mm7 |
595 movl %eax,768(%edx) | 586 movd %mm7,768(%edx) |
596 | 587 |
597 movq 32(%ebx),%mm0 | 588 movq 32(%ebx),%mm0 /* mm0 = tmp1[8] | tmp1[9] */ |
598 movq 48(%ebx),%mm1 | 589 movq 48(%ebx),%mm1 /* mm1 = tmp1[12] | tmp1[13] */ |
599 pfadd %mm1,%mm0 | 590 pfadd %mm1,%mm0 /* mm0 = tmp1[8]+tmp1[12]| tmp1[9]+tmp1[13]*/ |
600 movd %mm0,896(%ebp) | 591 movd %mm0,896(%ebp) /* a[0xE0] = tmp1[8]+tmp1[12] */ |
601 psrlq $32,%mm0 | 592 psrlq $32,%mm0 |
602 movd %mm0,128(%edx) | 593 movd %mm0,128(%edx) /* a[0x20] = tmp1[9]+tmp1[13] */ |
603 movq 40(%ebx),%mm2 | 594 movq 40(%ebx),%mm2 |
604 pfadd %mm2,%mm1 | 595 pfadd %mm2,%mm1 |
605 movd %mm1,640(%ebp) | 596 movd %mm1,640(%ebp) |
606 psrlq $32,%mm1 | 597 psrlq $32,%mm1 |
607 movd %mm1,384(%edx) | 598 movd %mm1,384(%edx) |
677 popl %esi | 668 popl %esi |
678 popl %edi | 669 popl %edi |
679 popl %ebp | 670 popl %ebp |
680 addl $256,%esp | 671 addl $256,%esp |
681 | 672 |
682 ret | 673 ret $12 |
683 | 674 |