Mercurial > mplayer.hg
comparison mp3lib/dct64_sse.s @ 1394:d9e3f91d6da9
First development version of dct64, mixed with 3dnow/k7 and fpu code.
Phases 1 to 3 seem to be OK already. Report if you get strange sound with this version (clicks or distorted sound that doesn't happen with the MMX-only version). I've tested with approx. 20 MP3 files, which all sounded OK. The speed improvement with this version is still very minimal, because the more CPU-intensive phases 4 and 5 aren't working yet, so I use FPU code for them.
author | atmos4 |
---|---|
date | Fri, 27 Jul 2001 17:25:19 +0000 |
parents | |
children | 8312f4bc8dab |
comparison
equal
deleted
inserted
replaced
1393:5eef9e69b145 | 1394:d9e3f91d6da9 |
---|---|
1 # This code is a translation of dct64_k7.s from MPlayer. | |
2 # Coded by Felix Buenemann <atmosfear at users.sourceforge.net> | |
3 # | |
4 # TODO: - fix phases 4 and 5 (sse) | |
5 # - optimize scalar FPU code? (interleave with sse code) | |
6 # | |
7 | |
8 //.data | |
9 // .align 8 | |
10 //x_plus_minus_3dnow: .long 0x00000000, 0x80000000 | |
11 //plus_1f: .float 1.0 | |
12 | |
13 .text | |
14 | |
15 .align 16 | |
16 | |
17 .global dct64_MMX_sse | |
18 | |
19 dct64_MMX_sse: | |
20 pushl %ebx | |
21 pushl %esi | |
22 pushl %edi | |
23 subl $256,%esp | |
24 movl 280(%esp),%eax | |
25 | |
26 leal 128(%esp),%edx | |
27 movl 272(%esp),%esi | |
28 movl 276(%esp),%edi | |
29 movl $costab_mmx,%ebx | |
30 orl %ecx,%ecx | |
31 movl %esp,%ecx | |
32 | |
33 /* Phase 1 (complete, worx) */ | |
34 | |
35 // [1] Process Block A1 (16 Bytes) | |
36 / movq (%eax), %mm0 | |
37 / movq 8(%eax), %mm4 | |
38 movups (%eax), %xmm0 | |
39 | |
40 // Copy A1 to another register A2 | |
41 / movq %mm0, %mm3 | |
42 / movq %mm4, %mm7 | |
43 movaps %xmm0, %xmm2 | |
44 | |
45 // Process Block B1 (last 16 bytes) | |
46 / movq 120(%eax), %mm1 | |
47 / movq 112(%eax), %mm5 | |
48 movups 112(%eax), %xmm1 | |
49 | |
50 /* The PSWAPD instruction swaps or reverses the upper and lower | |
51 * doublewords of the source operand. PSWAPD mmreg1, mmreg2 | |
52 * performs the following operations: | |
53 * temp = mmreg2 | |
54 * mmreg1[63:32] = temp[31:0 ] | |
55 * mmreg1[31:0 ] = temp[63:32] | |
56 */ | |
57 / pswapd %mm1, %mm1 | |
58 / pswapd %mm5, %mm5 | |
59 // shufps here exchanges a,b,c,d to b,a,d,c in xmm1 (desc ia32-ref p.752) | |
60 //// shufps $177, %xmm1, %xmm1 | |
61 shufps $27, %xmm1, %xmm1 | |
62 | |
63 // Add B1 to A1 | |
64 / pfadd %mm1, %mm0 | |
65 / pfadd %mm5, %mm4 | |
66 addps %xmm1, %xmm0 | |
67 | |
68 // Save Block A1 | |
69 / movq %mm0, (%edx) | |
70 / movq %mm4, 8(%edx) | |
71 movups %xmm0, (%edx) | |
72 | |
73 // Sub B1 from A2 | |
74 / pfsub %mm1, %mm3 | |
75 / pfsub %mm5, %mm7 | |
76 subps %xmm1, %xmm2 | |
77 | |
78 // Mul mem with A2 | |
79 / pfmul (%ebx), %mm3 | |
80 / pfmul 8(%ebx), %mm7 | |
81 movups (%ebx), %xmm7 | |
82 mulps %xmm7, %xmm2 | |
83 | |
84 // Shuffle A2 | |
85 / pswapd %mm3, %mm3 | |
86 / pswapd %mm7, %mm7 | |
87 // I do a,b,c,d -> d,c,b,a to suit order when writing to mem (saves one shufps) | |
88 shufps $27, %xmm2, %xmm2 | |
89 | |
90 // Save A2 to mem (end) | |
91 / movq %mm3, 120(%edx) | |
92 / movq %mm7, 112(%edx) | |
93 movups %xmm2, 112(%edx) | |
94 | |
95 // [2] Process next data block | |
96 / movq 16(%eax), %mm0 | |
97 / movq 24(%eax), %mm4 | |
98 movups 16(%eax), %xmm0 | |
99 | |
100 / movq %mm0, %mm3 | |
101 / movq %mm4, %mm7 | |
102 movaps %xmm0, %xmm2 | |
103 | |
104 / movq 104(%eax), %mm1 | |
105 / movq 96(%eax), %mm5 | |
106 movups 96(%eax), %xmm1 | |
107 | |
108 / pswapd %mm1, %mm1 | |
109 / pswapd %mm5, %mm5 | |
110 //// shufps $177, %xmm1, %xmm1 | |
111 shufps $27, %xmm1, %xmm1 | |
112 | |
113 / pfadd %mm1, %mm0 | |
114 / pfadd %mm5, %mm4 | |
115 addps %xmm1, %xmm0 | |
116 | |
117 / movq %mm0, 16(%edx) | |
118 / movq %mm4, 24(%edx) | |
119 movups %xmm0, 16(%edx) | |
120 | |
121 / pfsub %mm1, %mm3 | |
122 / pfsub %mm5, %mm7 | |
123 subps %xmm1, %xmm2 | |
124 | |
125 / pfmul 16(%ebx), %mm3 | |
126 / pfmul 24(%ebx), %mm7 | |
127 movups 16(%ebx), %xmm7 | |
128 mulps %xmm7, %xmm2 | |
129 | |
130 / pswapd %mm3, %mm3 | |
131 / pswapd %mm7, %mm7 | |
132 shufps $27, %xmm2, %xmm2 | |
133 | |
134 / movq %mm3, 104(%edx) | |
135 / movq %mm7, 96(%edx) | |
136 movups %xmm2, 96(%edx) | |
137 | |
138 // [3] | |
139 / movq 32(%eax), %mm0 | |
140 / movq 40(%eax), %mm4 | |
141 movups 32(%eax), %xmm0 | |
142 | |
143 / movq %mm0, %mm3 | |
144 / movq %mm4, %mm7 | |
145 movaps %xmm0, %xmm2 | |
146 | |
147 / movq 88(%eax), %mm1 | |
148 / movq 80(%eax), %mm5 | |
149 movups 80(%eax), %xmm1 | |
150 | |
151 / pswapd %mm1, %mm1 | |
152 / pswapd %mm5, %mm5 | |
153 //// shufps $177, %xmm1, %xmm1 | |
154 shufps $27, %xmm1, %xmm1 | |
155 | |
156 / pfadd %mm1, %mm0 | |
157 / pfadd %mm5, %mm4 | |
158 addps %xmm1, %xmm0 | |
159 | |
160 / movq %mm0, 32(%edx) | |
161 / movq %mm4, 40(%edx) | |
162 movups %xmm0, 32(%edx) | |
163 | |
164 / pfsub %mm1, %mm3 | |
165 / pfsub %mm5, %mm7 | |
166 subps %xmm1, %xmm2 | |
167 | |
168 / pfmul 32(%ebx), %mm3 | |
169 / pfmul 40(%ebx), %mm7 | |
170 movups 32(%ebx), %xmm7 | |
171 mulps %xmm7, %xmm2 | |
172 | |
173 / pswapd %mm3, %mm3 | |
174 / pswapd %mm7, %mm7 | |
175 shufps $27, %xmm2, %xmm2 | |
176 | |
177 / movq %mm3, 88(%edx) | |
178 / movq %mm7, 80(%edx) | |
179 movups %xmm2, 80(%edx) | |
180 | |
181 // [4] | |
182 / movq 48(%eax), %mm0 | |
183 / movq 56(%eax), %mm4 | |
184 movups 48(%eax), %xmm0 | |
185 | |
186 / movq %mm0, %mm3 | |
187 / movq %mm4, %mm7 | |
188 movaps %xmm0, %xmm2 | |
189 | |
190 / movq 72(%eax), %mm1 | |
191 / movq 64(%eax), %mm5 | |
192 movups 64(%eax), %xmm1 | |
193 | |
194 / pswapd %mm1, %mm1 | |
195 / pswapd %mm5, %mm5 | |
196 //// shufps $177, %xmm1, %xmm1 | |
197 shufps $27, %xmm1, %xmm1 | |
198 | |
199 / pfadd %mm1, %mm0 | |
200 / pfadd %mm5, %mm4 | |
201 addps %xmm1, %xmm0 | |
202 | |
203 / movq %mm0, 48(%edx) | |
204 / movq %mm4, 56(%edx) | |
205 movups %xmm0, 48(%edx) | |
206 | |
207 / pfsub %mm1, %mm3 | |
208 / pfsub %mm5, %mm7 | |
209 subps %xmm1, %xmm2 | |
210 | |
211 / pfmul 48(%ebx), %mm3 | |
212 / pfmul 56(%ebx), %mm7 | |
213 movups 48(%ebx), %xmm7 | |
214 mulps %xmm7, %xmm2 | |
215 | |
216 / pswapd %mm3, %mm3 | |
217 / pswapd %mm7, %mm7 | |
218 shufps $27, %xmm2, %xmm2 | |
219 | |
220 / movq %mm3, 72(%edx) | |
221 / movq %mm7, 64(%edx) | |
222 movups %xmm2, 64(%edx) | |
223 | |
224 | |
225 // phase 1 fpu code | |
226 /* Phase 1*/ | |
227 /* | |
228 flds (%eax) | |
229 leal 128(%esp),%edx | |
230 fadds 124(%eax) | |
231 movl 272(%esp),%esi | |
232 fstps (%edx) | |
233 movl 276(%esp),%edi | |
234 | |
235 flds 4(%eax) | |
236 movl $costab_mmx,%ebx | |
237 fadds 120(%eax) | |
238 orl %ecx,%ecx | |
239 fstps 4(%edx) | |
240 | |
241 flds (%eax) | |
242 movl %esp,%ecx | |
243 fsubs 124(%eax) | |
244 fmuls (%ebx) | |
245 fstps 124(%edx) | |
246 | |
247 flds 4(%eax) | |
248 fsubs 120(%eax) | |
249 fmuls 4(%ebx) | |
250 fstps 120(%edx) | |
251 | |
252 flds 8(%eax) | |
253 fadds 116(%eax) | |
254 fstps 8(%edx) | |
255 | |
256 flds 12(%eax) | |
257 fadds 112(%eax) | |
258 fstps 12(%edx) | |
259 | |
260 flds 8(%eax) | |
261 fsubs 116(%eax) | |
262 fmuls 8(%ebx) | |
263 fstps 116(%edx) | |
264 | |
265 flds 12(%eax) | |
266 fsubs 112(%eax) | |
267 fmuls 12(%ebx) | |
268 fstps 112(%edx) | |
269 | |
270 flds 16(%eax) | |
271 fadds 108(%eax) | |
272 fstps 16(%edx) | |
273 | |
274 flds 20(%eax) | |
275 fadds 104(%eax) | |
276 fstps 20(%edx) | |
277 | |
278 flds 16(%eax) | |
279 fsubs 108(%eax) | |
280 fmuls 16(%ebx) | |
281 fstps 108(%edx) | |
282 | |
283 flds 20(%eax) | |
284 fsubs 104(%eax) | |
285 fmuls 20(%ebx) | |
286 fstps 104(%edx) | |
287 | |
288 flds 24(%eax) | |
289 fadds 100(%eax) | |
290 fstps 24(%edx) | |
291 | |
292 flds 28(%eax) | |
293 fadds 96(%eax) | |
294 fstps 28(%edx) | |
295 | |
296 flds 24(%eax) | |
297 fsubs 100(%eax) | |
298 fmuls 24(%ebx) | |
299 fstps 100(%edx) | |
300 | |
301 flds 28(%eax) | |
302 fsubs 96(%eax) | |
303 fmuls 28(%ebx) | |
304 fstps 96(%edx) | |
305 | |
306 flds 32(%eax) | |
307 fadds 92(%eax) | |
308 fstps 32(%edx) | |
309 | |
310 flds 36(%eax) | |
311 fadds 88(%eax) | |
312 fstps 36(%edx) | |
313 | |
314 flds 32(%eax) | |
315 fsubs 92(%eax) | |
316 fmuls 32(%ebx) | |
317 fstps 92(%edx) | |
318 | |
319 flds 36(%eax) | |
320 fsubs 88(%eax) | |
321 fmuls 36(%ebx) | |
322 fstps 88(%edx) | |
323 | |
324 flds 40(%eax) | |
325 fadds 84(%eax) | |
326 fstps 40(%edx) | |
327 | |
328 flds 44(%eax) | |
329 fadds 80(%eax) | |
330 fstps 44(%edx) | |
331 | |
332 flds 40(%eax) | |
333 fsubs 84(%eax) | |
334 fmuls 40(%ebx) | |
335 fstps 84(%edx) | |
336 | |
337 flds 44(%eax) | |
338 fsubs 80(%eax) | |
339 fmuls 44(%ebx) | |
340 fstps 80(%edx) | |
341 | |
342 flds 48(%eax) | |
343 fadds 76(%eax) | |
344 fstps 48(%edx) | |
345 | |
346 flds 52(%eax) | |
347 fadds 72(%eax) | |
348 fstps 52(%edx) | |
349 | |
350 flds 48(%eax) | |
351 fsubs 76(%eax) | |
352 fmuls 48(%ebx) | |
353 fstps 76(%edx) | |
354 | |
355 flds 52(%eax) | |
356 fsubs 72(%eax) | |
357 fmuls 52(%ebx) | |
358 fstps 72(%edx) | |
359 | |
360 flds 56(%eax) | |
361 fadds 68(%eax) | |
362 fstps 56(%edx) | |
363 | |
364 flds 60(%eax) | |
365 fadds 64(%eax) | |
366 fstps 60(%edx) | |
367 | |
368 flds 56(%eax) | |
369 fsubs 68(%eax) | |
370 fmuls 56(%ebx) | |
371 fstps 68(%edx) | |
372 | |
373 flds 60(%eax) | |
374 fsubs 64(%eax) | |
375 fmuls 60(%ebx) | |
376 fstps 64(%edx) | |
377 */ | |
378 // end phase 1 fpu code | |
379 | |
380 /* Phase 2 (completed, worx) */ | |
381 | |
382 / movq (%edx), %mm0 | |
383 / movq 8(%edx), %mm4 | |
384 movups (%edx), %xmm0 | |
385 | |
386 / movq %mm0, %mm3 | |
387 / movq %mm4, %mm7 | |
388 movaps %xmm0, %xmm2 | |
389 | |
390 / movq 56(%edx), %mm1 | |
391 / movq 48(%edx), %mm5 | |
392 movups 48(%edx), %xmm1 | |
393 | |
394 / pswapd %mm1, %mm1 | |
395 / pswapd %mm5, %mm5 | |
396 //// shufps $177, %xmm1, %xmm1 | |
397 shufps $27, %xmm1, %xmm1 | |
398 | |
399 / pfadd %mm1, %mm0 | |
400 / pfadd %mm5, %mm4 | |
401 addps %xmm1, %xmm0 | |
402 | |
403 / movq %mm0, (%ecx) | |
404 / movq %mm4, 8(%ecx) | |
405 movups %xmm0, (%ecx) | |
406 | |
407 / pfsub %mm1, %mm3 | |
408 / pfsub %mm5, %mm7 | |
409 subps %xmm1, %xmm2 | |
410 | |
411 / pfmul 64(%ebx), %mm3 | |
412 / pfmul 72(%ebx), %mm7 | |
413 movups 64(%ebx), %xmm7 | |
414 mulps %xmm7, %xmm2 | |
415 | |
416 / pswapd %mm3, %mm3 | |
417 / pswapd %mm7, %mm7 | |
418 shufps $27, %xmm2, %xmm2 | |
419 | |
420 / movq %mm3, 56(%ecx) | |
421 / movq %mm7, 48(%ecx) | |
422 movups %xmm2, 48(%ecx) | |
423 | |
424 / movq 16(%edx), %mm0 | |
425 / movq 24(%edx), %mm4 | |
426 movups 16(%edx), %xmm0 | |
427 | |
428 / movq %mm0, %mm3 | |
429 / movq %mm4, %mm7 | |
430 movaps %xmm0, %xmm2 | |
431 | |
432 / movq 40(%edx), %mm1 | |
433 / movq 32(%edx), %mm5 | |
434 movups 32(%edx), %xmm1 | |
435 | |
436 / pswapd %mm1, %mm1 | |
437 / pswapd %mm5, %mm5 | |
438 //// shufps $177, %xmm1, %xmm1 | |
439 shufps $27, %xmm1, %xmm1 | |
440 | |
441 / pfadd %mm1, %mm0 | |
442 / pfadd %mm5, %mm4 | |
443 addps %xmm1, %xmm0 | |
444 | |
445 / movq %mm0, 16(%ecx) | |
446 / movq %mm4, 24(%ecx) | |
447 movups %xmm0, 16(%ecx) | |
448 | |
449 / pfsub %mm1, %mm3 | |
450 / pfsub %mm5, %mm7 | |
451 subps %xmm1, %xmm2 | |
452 | |
453 / pfmul 80(%ebx), %mm3 | |
454 / pfmul 88(%ebx), %mm7 | |
455 movups 80(%ebx), %xmm7 | |
456 mulps %xmm7, %xmm2 | |
457 | |
458 / pswapd %mm3, %mm3 | |
459 / pswapd %mm7, %mm7 | |
460 shufps $27, %xmm2, %xmm2 | |
461 | |
462 / movq %mm3, 40(%ecx) | |
463 / movq %mm7, 32(%ecx) | |
464 movups %xmm2, 32(%ecx) | |
465 | |
466 | |
467 // phase 2 fpu | |
468 /* Phase 2*/ | |
469 /* | |
470 flds (%edx) | |
471 fadds 60(%edx) | |
472 fstps (%ecx) | |
473 | |
474 flds 4(%edx) | |
475 fadds 56(%edx) | |
476 fstps 4(%ecx) | |
477 | |
478 flds (%edx) | |
479 fsubs 60(%edx) | |
480 fmuls 64(%ebx) | |
481 fstps 60(%ecx) | |
482 | |
483 flds 4(%edx) | |
484 fsubs 56(%edx) | |
485 fmuls 68(%ebx) | |
486 fstps 56(%ecx) | |
487 | |
488 flds 8(%edx) | |
489 fadds 52(%edx) | |
490 fstps 8(%ecx) | |
491 | |
492 flds 12(%edx) | |
493 fadds 48(%edx) | |
494 fstps 12(%ecx) | |
495 | |
496 flds 8(%edx) | |
497 fsubs 52(%edx) | |
498 fmuls 72(%ebx) | |
499 fstps 52(%ecx) | |
500 | |
501 flds 12(%edx) | |
502 fsubs 48(%edx) | |
503 fmuls 76(%ebx) | |
504 fstps 48(%ecx) | |
505 | |
506 flds 16(%edx) | |
507 fadds 44(%edx) | |
508 fstps 16(%ecx) | |
509 | |
510 flds 20(%edx) | |
511 fadds 40(%edx) | |
512 fstps 20(%ecx) | |
513 | |
514 flds 16(%edx) | |
515 fsubs 44(%edx) | |
516 fmuls 80(%ebx) | |
517 fstps 44(%ecx) | |
518 | |
519 flds 20(%edx) | |
520 fsubs 40(%edx) | |
521 fmuls 84(%ebx) | |
522 fstps 40(%ecx) | |
523 | |
524 flds 24(%edx) | |
525 fadds 36(%edx) | |
526 fstps 24(%ecx) | |
527 | |
528 flds 28(%edx) | |
529 fadds 32(%edx) | |
530 fstps 28(%ecx) | |
531 | |
532 flds 24(%edx) | |
533 fsubs 36(%edx) | |
534 fmuls 88(%ebx) | |
535 fstps 36(%ecx) | |
536 | |
537 flds 28(%edx) | |
538 fsubs 32(%edx) | |
539 fmuls 92(%ebx) | |
540 fstps 32(%ecx) | |
541 */ | |
542 // end phase 2 fpu | |
543 | |
544 /* Phase 3 (completed, working) */ | |
545 | |
546 / movq 64(%edx), %mm0 | |
547 / movq 72(%edx), %mm4 | |
548 movups 64(%edx), %xmm0 | |
549 | |
550 / movq %mm0, %mm3 | |
551 / movq %mm4, %mm7 | |
552 movaps %xmm0, %xmm2 | |
553 | |
554 / movq 120(%edx), %mm1 | |
555 / movq 112(%edx), %mm5 | |
556 movups 112(%edx), %xmm1 | |
557 | |
558 / pswapd %mm1, %mm1 | |
559 / pswapd %mm5, %mm5 | |
560 //// shufps $177, %xmm1, %xmm1 | |
561 shufps $27, %xmm1, %xmm1 | |
562 | |
563 / pfadd %mm1, %mm0 | |
564 / pfadd %mm5, %mm4 | |
565 addps %xmm1, %xmm0 | |
566 | |
567 / movq %mm0, 64(%ecx) | |
568 / movq %mm4, 72(%ecx) | |
569 movups %xmm0, 64(%ecx) | |
570 | |
571 / pfsubr %mm1, %mm3 | |
572 / pfsubr %mm5, %mm7 | |
573 // optimized (xmm1<->xmm2) | |
574 subps %xmm2, %xmm1 | |
575 | |
576 / pfmul 64(%ebx), %mm3 | |
577 / pfmul 72(%ebx), %mm7 | |
578 movups 64(%ebx), %xmm7 | |
579 mulps %xmm7, %xmm1 | |
580 | |
581 / pswapd %mm3, %mm3 | |
582 / pswapd %mm7, %mm7 | |
583 shufps $27, %xmm1, %xmm1 | |
584 | |
585 / movq %mm3, 120(%ecx) | |
586 / movq %mm7, 112(%ecx) | |
587 movups %xmm1, 112(%ecx) | |
588 | |
589 | |
590 / movq 80(%edx), %mm0 | |
591 / movq 88(%edx), %mm4 | |
592 movups 80(%edx), %xmm0 | |
593 | |
594 / movq %mm0, %mm3 | |
595 / movq %mm4, %mm7 | |
596 movaps %xmm0, %xmm2 | |
597 | |
598 / movq 104(%edx), %mm1 | |
599 / movq 96(%edx), %mm5 | |
600 movups 96(%edx), %xmm1 | |
601 | |
602 / pswapd %mm1, %mm1 | |
603 / pswapd %mm5, %mm5 | |
604 //// shufps $177, %xmm1, %xmm1 | |
605 shufps $27, %xmm1, %xmm1 | |
606 | |
607 / pfadd %mm1, %mm0 | |
608 / pfadd %mm5, %mm4 | |
609 addps %xmm1, %xmm0 | |
610 | |
611 / movq %mm0, 80(%ecx) | |
612 / movq %mm4, 88(%ecx) | |
613 movups %xmm0, 80(%ecx) | |
614 | |
615 / pfsubr %mm1, %mm3 | |
616 / pfsubr %mm5, %mm7 | |
617 // optimized (xmm1<->xmm2) | |
618 subps %xmm2, %xmm1 | |
619 | |
620 / pfmul 80(%ebx), %mm3 | |
621 / pfmul 88(%ebx), %mm7 | |
622 movups 80(%ebx), %xmm7 | |
623 mulps %xmm7, %xmm1 | |
624 | |
625 / pswapd %mm3, %mm3 | |
626 / pswapd %mm7, %mm7 | |
627 shufps $27, %xmm1, %xmm1 | |
628 | |
629 / movq %mm3, 104(%ecx) | |
630 / movq %mm7, 96(%ecx) | |
631 movups %xmm1, 96(%ecx) | |
632 | |
633 | |
634 // phase 3 fpu | |
635 /* Phase 3*/ | |
636 /* | |
637 flds 64(%edx) | |
638 fadds 124(%edx) | |
639 fstps 64(%ecx) | |
640 | |
641 flds 68(%edx) | |
642 fadds 120(%edx) | |
643 fstps 68(%ecx) | |
644 | |
645 flds 124(%edx) | |
646 fsubs 64(%edx) | |
647 fmuls 64(%ebx) | |
648 fstps 124(%ecx) | |
649 | |
650 flds 120(%edx) | |
651 fsubs 68(%edx) | |
652 fmuls 68(%ebx) | |
653 fstps 120(%ecx) | |
654 | |
655 flds 72(%edx) | |
656 fadds 116(%edx) | |
657 fstps 72(%ecx) | |
658 | |
659 flds 76(%edx) | |
660 fadds 112(%edx) | |
661 fstps 76(%ecx) | |
662 | |
663 flds 116(%edx) | |
664 fsubs 72(%edx) | |
665 fmuls 72(%ebx) | |
666 fstps 116(%ecx) | |
667 | |
668 flds 112(%edx) | |
669 fsubs 76(%edx) | |
670 fmuls 76(%ebx) | |
671 fstps 112(%ecx) | |
672 | |
673 flds 80(%edx) | |
674 fadds 108(%edx) | |
675 fstps 80(%ecx) | |
676 | |
677 flds 84(%edx) | |
678 fadds 104(%edx) | |
679 fstps 84(%ecx) | |
680 | |
681 flds 108(%edx) | |
682 fsubs 80(%edx) | |
683 fmuls 80(%ebx) | |
684 fstps 108(%ecx) | |
685 | |
686 flds 104(%edx) | |
687 fsubs 84(%edx) | |
688 fmuls 84(%ebx) | |
689 fstps 104(%ecx) | |
690 | |
691 flds 88(%edx) | |
692 fadds 100(%edx) | |
693 fstps 88(%ecx) | |
694 | |
695 flds 92(%edx) | |
696 fadds 96(%edx) | |
697 fstps 92(%ecx) | |
698 | |
699 flds 100(%edx) | |
700 fsubs 88(%edx) | |
701 fmuls 88(%ebx) | |
702 fstps 100(%ecx) | |
703 | |
704 flds 96(%edx) | |
705 fsubs 92(%edx) | |
706 fmuls 92(%ebx) | |
707 fstps 96(%ecx) | |
708 */ | |
709 // end phase 3 fpu | |
710 | |
711 | |
712 /* Phase 4 (completed, buggy) */ | |
713 /* | |
714 / movq 96(%ebx), %mm2 | |
715 / movq 104(%ebx), %mm6 | |
716 movups 96(%ebx), %xmm4 | |
717 | |
718 | |
719 / movq (%ecx), %mm0 | |
720 / movq 8(%ecx), %mm4 | |
721 movups (%ecx), %xmm0 | |
722 | |
723 / movq %mm0, %mm3 | |
724 / movq %mm4, %mm7 | |
725 movaps %xmm0, %xmm2 | |
726 | |
727 / movq 24(%ecx), %mm1 | |
728 / movq 16(%ecx), %mm5 | |
729 movups 16(%ecx), %xmm1 | |
730 | |
731 / pswapd %mm1, %mm1 | |
732 / pswapd %mm5, %mm5 | |
733 //// shufps $177, %xmm1, %xmm1 | |
734 shufps $27, %xmm1, %xmm1 | |
735 | |
736 / pfadd %mm1, %mm0 | |
737 / pfadd %mm5, %mm4 | |
738 addps %xmm1, %xmm0 | |
739 | |
740 / movq %mm0, (%edx) | |
741 / movq %mm4, 8(%edx) | |
742 movups %xmm0, (%edx) | |
743 | |
744 / pfsub %mm1, %mm3 | |
745 / pfsub %mm5, %mm7 | |
746 subps %xmm1, %xmm2 | |
747 | |
748 / pfmul %mm2, %mm3 | |
749 / pfmul %mm6, %mm7 | |
750 mulps %xmm4, %xmm2 | |
751 | |
752 / pswapd %mm3, %mm3 | |
753 / pswapd %mm7, %mm7 | |
754 shufps $27, %xmm2, %xmm2 | |
755 | |
756 / movq %mm3, 24(%edx) | |
757 / movq %mm7, 16(%edx) | |
758 movups %xmm2, 16(%edx) | |
759 | |
760 / movq 32(%ecx), %mm0 | |
761 / movq 40(%ecx), %mm4 | |
762 movups 32(%ecx), %xmm0 | |
763 | |
764 / movq %mm0, %mm3 | |
765 / movq %mm4, %mm7 | |
766 movaps %xmm0, %xmm2 | |
767 | |
768 / movq 56(%ecx), %mm1 | |
769 / movq 48(%ecx), %mm5 | |
770 movups 48(%ecx), %xmm1 | |
771 | |
772 / pswapd %mm1, %mm1 | |
773 / pswapd %mm5, %mm5 | |
774 //// shufps $177, %xmm1, %xmm1 | |
775 shufps $27, %xmm1, %xmm1 | |
776 | |
777 / pfadd %mm1, %mm0 | |
778 / pfadd %mm5, %mm4 | |
779 addps %xmm1, %xmm0 | |
780 | |
781 / movq %mm0, 32(%edx) | |
782 / movq %mm4, 40(%edx) | |
783 movups %xmm0, 32(%edx) | |
784 | |
785 / pfsubr %mm1, %mm3 | |
786 / pfsubr %mm5, %mm7 | |
787 // Luckily we can swap this (xmm1<->xmm2) | |
788 subps %xmm2, %xmm1 | |
789 | |
790 / pfmul %mm2, %mm3 | |
791 / pfmul %mm6, %mm7 | |
792 mulps %xmm4, %xmm1 | |
793 | |
794 / pswapd %mm3, %mm3 | |
795 / pswapd %mm7, %mm7 | |
796 shufps $27, %xmm1, %xmm1 | |
797 | |
798 / movq %mm3, 56(%edx) | |
799 / movq %mm7, 48(%edx) | |
800 movups %xmm1, 48(%edx) | |
801 | |
802 | |
803 / movq 64(%ecx), %mm0 | |
804 / movq 72(%ecx), %mm4 | |
805 movups 64(%ecx), %xmm0 | |
806 | |
807 / movq %mm0, %mm3 | |
808 / movq %mm4, %mm7 | |
809 movaps %xmm0, %xmm2 | |
810 | |
811 / movq 88(%ecx), %mm1 | |
812 / movq 80(%ecx), %mm5 | |
813 movups 80(%ecx), %xmm1 | |
814 | |
815 / pswapd %mm1, %mm1 | |
816 / pswapd %mm5, %mm5 | |
817 //// shufps $177, %xmm1, %xmm1 | |
818 shufps $27, %xmm1, %xmm1 | |
819 | |
820 / pfadd %mm1, %mm0 | |
821 / pfadd %mm5, %mm4 | |
822 addps %xmm1, %xmm0 | |
823 | |
824 / movq %mm0, 64(%edx) | |
825 / movq %mm4, 72(%edx) | |
826 movups %xmm0, 64(%edx) | |
827 | |
828 / pfsub %mm1, %mm3 | |
829 / pfsub %mm5, %mm7 | |
830 subps %xmm1, %xmm2 | |
831 | |
832 / pfmul %mm2, %mm3 | |
833 / pfmul %mm6, %mm7 | |
834 mulps %xmm4, %xmm2 | |
835 | |
836 / pswapd %mm3, %mm3 | |
837 / pswapd %mm7, %mm7 | |
838 shufps $27, %xmm2, %xmm2 | |
839 | |
840 / movq %mm3, 88(%edx) | |
841 / movq %mm7, 80(%edx) | |
842 movups %xmm2, 80(%edx) | |
843 | |
844 | |
845 / movq 96(%ecx), %mm0 | |
846 / movq 104(%ecx), %mm4 | |
847 movups 96(%ecx), %xmm0 | |
848 | |
849 / movq %mm0, %mm3 | |
850 / movq %mm4, %mm7 | |
851 movaps %xmm0, %xmm2 | |
852 | |
853 / movq 120(%ecx), %mm1 | |
854 / movq 112(%ecx), %mm5 | |
855 movups 112(%ecx), %xmm1 | |
856 | |
857 / pswapd %mm1, %mm1 | |
858 / pswapd %mm5, %mm5 | |
859 //// shufps $177, %xmm1, %xmm1 | |
860 shufps $27, %xmm1, %xmm1 | |
861 | |
862 / pfadd %mm1, %mm0 | |
863 / pfadd %mm5, %mm4 | |
864 addps %xmm1, %xmm0 | |
865 | |
866 / movq %mm0, 96(%edx) | |
867 / movq %mm4, 104(%edx) | |
868 movups %xmm0, 96(%edx) | |
869 | |
870 / pfsubr %mm1, %mm3 | |
871 / pfsubr %mm5, %mm7 | |
872 // This is already optimized, so xmm2 must be swapped with xmm1 for rest of phase | |
873 subps %xmm2, %xmm1 | |
874 | |
875 / pfmul %mm2, %mm3 | |
876 / pfmul %mm6, %mm7 | |
877 mulps %xmm4, %xmm1 | |
878 | |
879 / pswapd %mm3, %mm3 | |
880 / pswapd %mm7, %mm7 | |
881 shufps $27, %xmm1, %xmm1 | |
882 | |
883 / movq %mm3, 120(%edx) | |
884 / movq %mm7, 112(%edx) | |
885 movups %xmm1, 112(%edx) | |
886 */ | |
887 | |
888 // phase 4 fpu code | |
889 /* Phase 4*/ | |
890 | |
891 flds (%ecx) | |
892 fadds 28(%ecx) | |
893 fstps (%edx) | |
894 | |
895 flds (%ecx) | |
896 fsubs 28(%ecx) | |
897 fmuls 96(%ebx) | |
898 fstps 28(%edx) | |
899 | |
900 flds 4(%ecx) | |
901 fadds 24(%ecx) | |
902 fstps 4(%edx) | |
903 | |
904 flds 4(%ecx) | |
905 fsubs 24(%ecx) | |
906 fmuls 100(%ebx) | |
907 fstps 24(%edx) | |
908 | |
909 flds 8(%ecx) | |
910 fadds 20(%ecx) | |
911 fstps 8(%edx) | |
912 | |
913 flds 8(%ecx) | |
914 fsubs 20(%ecx) | |
915 fmuls 104(%ebx) | |
916 fstps 20(%edx) | |
917 | |
918 flds 12(%ecx) | |
919 fadds 16(%ecx) | |
920 fstps 12(%edx) | |
921 | |
922 flds 12(%ecx) | |
923 fsubs 16(%ecx) | |
924 fmuls 108(%ebx) | |
925 fstps 16(%edx) | |
926 | |
927 flds 32(%ecx) | |
928 fadds 60(%ecx) | |
929 fstps 32(%edx) | |
930 | |
931 flds 60(%ecx) | |
932 fsubs 32(%ecx) | |
933 fmuls 96(%ebx) | |
934 fstps 60(%edx) | |
935 | |
936 flds 36(%ecx) | |
937 fadds 56(%ecx) | |
938 fstps 36(%edx) | |
939 | |
940 flds 56(%ecx) | |
941 fsubs 36(%ecx) | |
942 fmuls 100(%ebx) | |
943 fstps 56(%edx) | |
944 | |
945 flds 40(%ecx) | |
946 fadds 52(%ecx) | |
947 fstps 40(%edx) | |
948 | |
949 flds 52(%ecx) | |
950 fsubs 40(%ecx) | |
951 fmuls 104(%ebx) | |
952 fstps 52(%edx) | |
953 | |
954 flds 44(%ecx) | |
955 fadds 48(%ecx) | |
956 fstps 44(%edx) | |
957 | |
958 flds 48(%ecx) | |
959 fsubs 44(%ecx) | |
960 fmuls 108(%ebx) | |
961 fstps 48(%edx) | |
962 | |
963 flds 64(%ecx) | |
964 fadds 92(%ecx) | |
965 fstps 64(%edx) | |
966 | |
967 flds 64(%ecx) | |
968 fsubs 92(%ecx) | |
969 fmuls 96(%ebx) | |
970 fstps 92(%edx) | |
971 | |
972 flds 68(%ecx) | |
973 fadds 88(%ecx) | |
974 fstps 68(%edx) | |
975 | |
976 flds 68(%ecx) | |
977 fsubs 88(%ecx) | |
978 fmuls 100(%ebx) | |
979 fstps 88(%edx) | |
980 | |
981 flds 72(%ecx) | |
982 fadds 84(%ecx) | |
983 fstps 72(%edx) | |
984 | |
985 flds 72(%ecx) | |
986 fsubs 84(%ecx) | |
987 fmuls 104(%ebx) | |
988 fstps 84(%edx) | |
989 | |
990 flds 76(%ecx) | |
991 fadds 80(%ecx) | |
992 fstps 76(%edx) | |
993 | |
994 flds 76(%ecx) | |
995 fsubs 80(%ecx) | |
996 fmuls 108(%ebx) | |
997 fstps 80(%edx) | |
998 | |
999 flds 96(%ecx) | |
1000 fadds 124(%ecx) | |
1001 fstps 96(%edx) | |
1002 | |
1003 flds 124(%ecx) | |
1004 fsubs 96(%ecx) | |
1005 fmuls 96(%ebx) | |
1006 fstps 124(%edx) | |
1007 | |
1008 flds 100(%ecx) | |
1009 fadds 120(%ecx) | |
1010 fstps 100(%edx) | |
1011 | |
1012 flds 120(%ecx) | |
1013 fsubs 100(%ecx) | |
1014 fmuls 100(%ebx) | |
1015 fstps 120(%edx) | |
1016 | |
1017 flds 104(%ecx) | |
1018 fadds 116(%ecx) | |
1019 fstps 104(%edx) | |
1020 | |
1021 flds 116(%ecx) | |
1022 fsubs 104(%ecx) | |
1023 fmuls 104(%ebx) | |
1024 fstps 116(%edx) | |
1025 | |
1026 flds 108(%ecx) | |
1027 fadds 112(%ecx) | |
1028 fstps 108(%edx) | |
1029 | |
1030 flds 112(%ecx) | |
1031 fsubs 108(%ecx) | |
1032 fmuls 108(%ebx) | |
1033 fstps 112(%edx) | |
1034 | |
1035 flds (%edx) | |
1036 fadds 12(%edx) | |
1037 fstps (%ecx) | |
1038 | |
1039 flds (%edx) | |
1040 fsubs 12(%edx) | |
1041 fmuls 112(%ebx) | |
1042 fstps 12(%ecx) | |
1043 | |
1044 flds 4(%edx) | |
1045 fadds 8(%edx) | |
1046 fstps 4(%ecx) | |
1047 | |
1048 flds 4(%edx) | |
1049 fsubs 8(%edx) | |
1050 fmuls 116(%ebx) | |
1051 fstps 8(%ecx) | |
1052 | |
1053 flds 16(%edx) | |
1054 fadds 28(%edx) | |
1055 fstps 16(%ecx) | |
1056 | |
1057 flds 28(%edx) | |
1058 fsubs 16(%edx) | |
1059 fmuls 112(%ebx) | |
1060 fstps 28(%ecx) | |
1061 | |
1062 flds 20(%edx) | |
1063 fadds 24(%edx) | |
1064 fstps 20(%ecx) | |
1065 | |
1066 flds 24(%edx) | |
1067 fsubs 20(%edx) | |
1068 fmuls 116(%ebx) | |
1069 fstps 24(%ecx) | |
1070 | |
1071 flds 32(%edx) | |
1072 fadds 44(%edx) | |
1073 fstps 32(%ecx) | |
1074 | |
1075 flds 32(%edx) | |
1076 fsubs 44(%edx) | |
1077 fmuls 112(%ebx) | |
1078 fstps 44(%ecx) | |
1079 | |
1080 flds 36(%edx) | |
1081 fadds 40(%edx) | |
1082 fstps 36(%ecx) | |
1083 | |
1084 flds 36(%edx) | |
1085 fsubs 40(%edx) | |
1086 fmuls 116(%ebx) | |
1087 fstps 40(%ecx) | |
1088 | |
1089 flds 48(%edx) | |
1090 fadds 60(%edx) | |
1091 fstps 48(%ecx) | |
1092 | |
1093 flds 60(%edx) | |
1094 fsubs 48(%edx) | |
1095 fmuls 112(%ebx) | |
1096 fstps 60(%ecx) | |
1097 | |
1098 flds 52(%edx) | |
1099 fadds 56(%edx) | |
1100 fstps 52(%ecx) | |
1101 | |
1102 flds 56(%edx) | |
1103 fsubs 52(%edx) | |
1104 fmuls 116(%ebx) | |
1105 fstps 56(%ecx) | |
1106 | |
1107 flds 64(%edx) | |
1108 fadds 76(%edx) | |
1109 fstps 64(%ecx) | |
1110 | |
1111 flds 64(%edx) | |
1112 fsubs 76(%edx) | |
1113 fmuls 112(%ebx) | |
1114 fstps 76(%ecx) | |
1115 | |
1116 flds 68(%edx) | |
1117 fadds 72(%edx) | |
1118 fstps 68(%ecx) | |
1119 | |
1120 flds 68(%edx) | |
1121 fsubs 72(%edx) | |
1122 fmuls 116(%ebx) | |
1123 fstps 72(%ecx) | |
1124 | |
1125 flds 80(%edx) | |
1126 fadds 92(%edx) | |
1127 fstps 80(%ecx) | |
1128 | |
1129 flds 92(%edx) | |
1130 fsubs 80(%edx) | |
1131 fmuls 112(%ebx) | |
1132 fstps 92(%ecx) | |
1133 | |
1134 flds 84(%edx) | |
1135 fadds 88(%edx) | |
1136 fstps 84(%ecx) | |
1137 | |
1138 flds 88(%edx) | |
1139 fsubs 84(%edx) | |
1140 fmuls 116(%ebx) | |
1141 fstps 88(%ecx) | |
1142 | |
1143 flds 96(%edx) | |
1144 fadds 108(%edx) | |
1145 fstps 96(%ecx) | |
1146 | |
1147 flds 96(%edx) | |
1148 fsubs 108(%edx) | |
1149 fmuls 112(%ebx) | |
1150 fstps 108(%ecx) | |
1151 | |
1152 flds 100(%edx) | |
1153 fadds 104(%edx) | |
1154 fstps 100(%ecx) | |
1155 | |
1156 flds 100(%edx) | |
1157 fsubs 104(%edx) | |
1158 fmuls 116(%ebx) | |
1159 fstps 104(%ecx) | |
1160 | |
1161 flds 112(%edx) | |
1162 fadds 124(%edx) | |
1163 fstps 112(%ecx) | |
1164 | |
1165 flds 124(%edx) | |
1166 fsubs 112(%edx) | |
1167 fmuls 112(%ebx) | |
1168 fstps 124(%ecx) | |
1169 | |
1170 flds 116(%edx) | |
1171 fadds 120(%edx) | |
1172 fstps 116(%ecx) | |
1173 | |
1174 flds 120(%edx) | |
1175 fsubs 116(%edx) | |
1176 fmuls 116(%ebx) | |
1177 fstps 120(%ecx) | |
1178 | |
1179 // end of phase 4 fpu | |
1180 | |
1181 // below stuff needs to be finished I use FPU code for first | |
1182 /* Phase 5 (completed, crashing) */ | |
1183 /* | |
1184 / movq 112(%ebx), %mm2 | |
1185 // move 8 byte data to (low)high quadword - check this! atmos | |
1186 movlps 112(%ebx), %xmm4 | |
1187 // maybe I need movhlps too to get data into correct quadword | |
1188 movlhps %xmm4, %xmm4 | |
1189 | |
1190 / movq (%edx), %mm0 | |
1191 / movq 16(%edx), %mm4 | |
1192 movups (%edx), %xmm0 | |
1193 | |
1194 / movq %mm0, %mm3 | |
1195 / movq %mm4, %mm7 | |
1196 movaps %xmm0, %xmm2 | |
1197 | |
1198 // hmm? this is strange | |
1199 / movq 8(%edx), %mm1 | |
1200 / movq 24(%edx), %mm5 | |
1201 movlps 8(%edx), %xmm1 | |
1202 movhps 24(%edx), %xmm1 | |
1203 | |
1204 / pswapd %mm1, %mm1 | |
1205 / pswapd %mm5, %mm5 | |
1206 pshufd $177, %xmm1, %xmm1 | |
1207 | |
1208 / pfadd %mm1, %mm0 | |
1209 / pfadd %mm5, %mm4 | |
1210 addps %xmm1, %xmm0 | |
1211 | |
1212 / movq %mm0, (%ecx) | |
1213 / movq %mm4, 16(%ecx) | |
1214 movlps %xmm0, (%ecx) | |
1215 movhps %xmm0, 16(%ecx) | |
1216 | |
1217 / pfsub %mm1, %mm3 | |
1218 / pfsubr %mm5, %mm7 | |
1219 // I need to emulate pfsubr here | |
1220 movaps %xmm1, %xmm3 | |
1221 subps %xmm2, %xmm3 | |
1222 subps %xmm1, %xmm2 | |
1223 // now move correct quadword from reverse substration in xmm3 to correct | |
1224 // quadword in xmm2 and leave other quadword with non-reversed substration untouched | |
1225 /// shufpd $2, %xmm3, %xmm2 | |
1226 // (or $1?) (see ia32-ref p.749) | |
1227 // optimize | |
1228 movq %xmm2, %xmm3 | |
1229 movaps %xmm3, %xmm2 | |
1230 | |
1231 / pfmul %mm2, %mm3 | |
1232 / pfmul %mm2, %mm7 | |
1233 mulps %xmm4, %xmm2 | |
1234 | |
1235 / pswapd %mm3, %mm3 | |
1236 / pswapd %mm7, %mm7 | |
1237 shufps $177, %xmm2, %xmm2 | |
1238 | |
1239 / movq %mm3, 8(%ecx) | |
1240 / movq %mm7, 24(%ecx) | |
1241 movlps %xmm2, 8(%ecx) | |
1242 movhps %xmm2, 24(%ecx) | |
1243 | |
1244 / movq 32(%edx), %mm0 | |
1245 / movq 48(%edx), %mm4 | |
1246 movlps 32(%edx), %xmm0 | |
1247 movhps 48(%edx), %xmm0 | |
1248 | |
1249 / movq %mm0, %mm3 | |
1250 / movq %mm4, %mm7 | |
1251 movaps %xmm0, %xmm2 | |
1252 | |
1253 / movq 40(%edx), %mm1 | |
1254 / movq 56(%edx), %mm5 | |
1255 movlps 40(%edx), %xmm1 | |
1256 movhps 56(%edx), %xmm1 | |
1257 | |
1258 / pswapd %mm1, %mm1 | |
1259 / pswapd %mm5, %mm5 | |
1260 shufps $177, %xmm1, %xmm1 | |
1261 | |
1262 / pfadd %mm1, %mm0 | |
1263 / pfadd %mm5, %mm4 | |
1264 addps %xmm1, %xmm0 | |
1265 | |
1266 / movq %mm0, 32(%ecx) | |
1267 / movq %mm4, 48(%ecx) | |
1268 movlps %xmm0, 32(%ecx) | |
1269 movhps %xmm0, 48(%ecx) | |
1270 | |
1271 / pfsub %mm1, %mm3 | |
1272 / pfsubr %mm5, %mm7 | |
1273 movaps %xmm1, %xmm3 | |
1274 subps %xmm2, %xmm3 | |
1275 subps %xmm1, %xmm2 | |
1276 /// shufpd $2, %xmm3, %xmm2 | |
1277 // (or $1?) | |
1278 // optimize | |
1279 movq %xmm2, %xmm3 | |
1280 movaps %xmm3, %xmm2 | |
1281 | |
1282 / pfmul %mm2, %mm3 | |
1283 / pfmul %mm2, %mm7 | |
1284 mulps %xmm4, %xmm2 | |
1285 | |
1286 / pswapd %mm3, %mm3 | |
1287 / pswapd %mm7, %mm7 | |
1288 shufps $177, %xmm2, %xmm2 | |
1289 | |
1290 / movq %mm3, 40(%ecx) | |
1291 / movq %mm7, 56(%ecx) | |
1292 movlps %xmm2, 40(%ecx) | |
1293 movhps %xmm2, 56(%ecx) | |
1294 | |
1295 | |
1296 / movq 64(%edx), %mm0 | |
1297 / movq 80(%edx), %mm4 | |
1298 movlps 64(%edx), %xmm0 | |
1299 movhps 80(%edx), %xmm0 | |
1300 | |
1301 / movq %mm0, %mm3 | |
1302 / movq %mm4, %mm7 | |
1303 movaps %xmm0, %xmm2 | |
1304 | |
1305 / movq 72(%edx), %mm1 | |
1306 / movq 88(%edx), %mm5 | |
1307 movlps 72(%edx), %xmm1 | |
1308 movhps 88(%edx), %xmm1 | |
1309 | |
1310 / pswapd %mm1, %mm1 | |
1311 / pswapd %mm5, %mm5 | |
1312 shufps $177, %xmm1, %xmm1 | |
1313 | |
1314 / pfadd %mm1, %mm0 | |
1315 / pfadd %mm5, %mm4 | |
1316 addps %xmm1, %xmm0 | |
1317 | |
1318 / movq %mm0, 64(%ecx) | |
1319 / movq %mm4, 80(%ecx) | |
1320 movlps %xmm0, 64(%ecx) | |
1321 movhps %xmm0, 80(%ecx) | |
1322 | |
1323 / pfsub %mm1, %mm3 | |
1324 / pfsubr %mm5, %mm7 | |
1325 movaps %xmm1, %xmm3 | |
1326 subps %xmm2, %xmm3 | |
1327 subps %xmm1, %xmm2 | |
1328 /// shufpd $2, %xmm3, %xmm2 | |
1329 // (or $1?) | |
1330 // optimize | |
1331 movq %xmm2, %xmm3 | |
1332 movaps %xmm3, %xmm2 | |
1333 | |
1334 / pfmul %mm2, %mm3 | |
1335 / pfmul %mm2, %mm7 | |
1336 mulps %xmm4, %xmm2 | |
1337 | |
1338 / pswapd %mm3, %mm3 | |
1339 / pswapd %mm7, %mm7 | |
1340 shufps $177, %xmm2, %xmm2 | |
1341 | |
1342 / movq %mm3, 72(%ecx) | |
1343 / movq %mm7, 88(%ecx) | |
1344 movlps %xmm2, 72(%ecx) | |
1345 movhps %xmm2, 88(%ecx) | |
1346 | |
1347 / movq 96(%edx), %mm0 | |
1348 / movq 112(%edx), %mm4 | |
1349 movups 96(%edx), %xmm0 | |
1350 | |
1351 / movq %mm0, %mm3 | |
1352 / movq %mm4, %mm7 | |
1353 movaps %xmm0, %xmm2 | |
1354 | |
1355 / movq 104(%edx), %mm1 | |
1356 / movq 120(%edx), %mm5 | |
1357 movlps 104(%edx), %xmm1 | |
1358 movhps 120(%edx), %xmm1 | |
1359 | |
1360 / pswapd %mm1, %mm1 | |
1361 / pswapd %mm5, %mm5 | |
1362 shufps $177, %xmm1, %xmm1 | |
1363 | |
1364 / pfadd %mm1, %mm0 | |
1365 / pfadd %mm5, %mm4 | |
1366 addps %xmm1, %xmm0 | |
1367 | |
1368 / movq %mm0, 96(%ecx) | |
1369 / movq %mm4, 112(%ecx) | |
1370 movups %xmm0, 96(%ecx) | |
1371 | |
1372 / pfsub %mm1, %mm3 | |
1373 / pfsubr %mm5, %mm7 | |
1374 movaps %xmm1, %xmm3 | |
1375 subps %xmm2, %xmm3 | |
1376 subps %xmm1, %xmm2 | |
1377 /// shufpd $2, %xmm3, %xmm2 | |
1378 // (or $1?) | |
1379 // optimize | |
1380 movq %xmm2, %xmm3 | |
1381 movaps %xmm3, %xmm2 | |
1382 | |
1383 / pfmul %mm2, %mm3 | |
1384 / pfmul %mm2, %mm7 | |
1385 mulps %xmm4, %xmm2 | |
1386 | |
1387 / pswapd %mm3, %mm3 | |
1388 / pswapd %mm7, %mm7 | |
1389 shufps $177, %xmm2, %xmm2 | |
1390 | |
1391 / movq %mm3, 104(%ecx) | |
1392 / movq %mm7, 120(%ecx) | |
1393 movlps %xmm2, 104(%ecx) | |
1394 movhps %xmm2, 120(%ecx) | |
1395 */ | |
1396 | |
1397 | |
1398 /* Phase 6. This is the end of the easy road. */ | |
1399 /* Code below is coded in scalar mode. Should be optimized */ | |
1400 // | |
1401 // movd plus_1f, %mm6 | |
1402 // punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/ | |
1403 // movq x_plus_minus_3dnow, %mm7 /* mm7 = +1 | -1 */ | |
1404 /* | |
1405 movq 32(%ecx), %mm0 | |
1406 movq 64(%ecx), %mm2 | |
1407 movq %mm0, %mm1 | |
1408 movq %mm2, %mm3 | |
1409 pxor %mm7, %mm1 | |
1410 pxor %mm7, %mm3 | |
1411 pfacc %mm1, %mm0 | |
1412 pfacc %mm3, %mm2 | |
1413 pfmul %mm6, %mm0 | |
1414 pfmul %mm6, %mm2 | |
1415 movq %mm0, 32(%edx) | |
1416 movq %mm2, 64(%edx) | |
1417 | |
1418 movd 44(%ecx), %mm0 | |
1419 movd 40(%ecx), %mm2 | |
1420 movd 120(%ebx), %mm3 | |
1421 punpckldq 76(%ecx), %mm0 | |
1422 punpckldq 72(%ecx), %mm2 | |
1423 punpckldq %mm3, %mm3 | |
1424 movq %mm0, %mm4 | |
1425 movq %mm2, %mm5 | |
1426 pfsub %mm2, %mm0 | |
1427 pfmul %mm3, %mm0 | |
1428 movq %mm0, %mm1 | |
1429 pfadd %mm5, %mm0 | |
1430 pfadd %mm4, %mm0 | |
1431 movq %mm0, %mm2 | |
1432 punpckldq %mm1, %mm0 | |
1433 punpckhdq %mm1, %mm2 | |
1434 movq %mm0, 40(%edx) | |
1435 movq %mm2, 72(%edx) | |
1436 | |
1437 movd 48(%ecx), %mm3 | |
1438 movd 60(%ecx), %mm2 | |
1439 pfsub 52(%ecx), %mm3 | |
1440 pfsub 56(%ecx), %mm2 | |
1441 pfmul 120(%ebx), %mm3 | |
1442 pfmul 120(%ebx), %mm2 | |
1443 movq %mm2, %mm1 | |
1444 | |
1445 pfadd 56(%ecx), %mm1 | |
1446 pfadd 60(%ecx), %mm1 | |
1447 movq %mm1, %mm0 | |
1448 | |
1449 pfadd 48(%ecx), %mm0 | |
1450 pfadd 52(%ecx), %mm0 | |
1451 pfadd %mm3, %mm1 | |
1452 punpckldq %mm2, %mm1 | |
1453 pfadd %mm3, %mm2 | |
1454 punpckldq %mm2, %mm0 | |
1455 movq %mm1, 56(%edx) | |
1456 movq %mm0, 48(%edx) | |
1457 */ | |
1458 /*---*/ | |
1459 /* | |
1460 movd 92(%ecx), %mm1 | |
1461 pfsub 88(%ecx), %mm1 | |
1462 pfmul 120(%ebx), %mm1 | |
1463 movd %mm1, 92(%edx) | |
1464 pfadd 92(%ecx), %mm1 | |
1465 pfadd 88(%ecx), %mm1 | |
1466 movq %mm1, %mm0 | |
1467 | |
1468 pfadd 80(%ecx), %mm0 | |
1469 pfadd 84(%ecx), %mm0 | |
1470 movd %mm0, 80(%edx) | |
1471 | |
1472 movd 80(%ecx), %mm0 | |
1473 pfsub 84(%ecx), %mm0 | |
1474 pfmul 120(%ebx), %mm0 | |
1475 pfadd %mm0, %mm1 | |
1476 pfadd 92(%edx), %mm0 | |
1477 punpckldq %mm1, %mm0 | |
1478 movq %mm0, 84(%edx) | |
1479 | |
1480 movq 96(%ecx), %mm0 | |
1481 movq %mm0, %mm1 | |
1482 pxor %mm7, %mm1 | |
1483 pfacc %mm1, %mm0 | |
1484 pfmul %mm6, %mm0 | |
1485 movq %mm0, 96(%edx) | |
1486 | |
1487 movd 108(%ecx), %mm0 | |
1488 pfsub 104(%ecx), %mm0 | |
1489 pfmul 120(%ebx), %mm0 | |
1490 movd %mm0, 108(%edx) | |
1491 pfadd 104(%ecx), %mm0 | |
1492 pfadd 108(%ecx), %mm0 | |
1493 movd %mm0, 104(%edx) | |
1494 | |
1495 movd 124(%ecx), %mm1 | |
1496 pfsub 120(%ecx), %mm1 | |
1497 pfmul 120(%ebx), %mm1 | |
1498 movd %mm1, 124(%edx) | |
1499 pfadd 120(%ecx), %mm1 | |
1500 pfadd 124(%ecx), %mm1 | |
1501 movq %mm1, %mm0 | |
1502 | |
1503 pfadd 112(%ecx), %mm0 | |
1504 pfadd 116(%ecx), %mm0 | |
1505 movd %mm0, 112(%edx) | |
1506 | |
1507 movd 112(%ecx), %mm0 | |
1508 pfsub 116(%ecx), %mm0 | |
1509 pfmul 120(%ebx), %mm0 | |
1510 pfadd %mm0,%mm1 | |
1511 pfadd 124(%edx), %mm0 | |
1512 punpckldq %mm1, %mm0 | |
1513 movq %mm0, 116(%edx) | |
1514 | |
1515 jnz .L01 | |
1516 */ | |
1517 | |
1518 | |
1519 /* Phase 7*/ | |
1520 /* Code below is coded in scalar mode. Should be optimized */ | |
1521 /* | |
1522 movd (%ecx), %mm0 | |
1523 pfadd 4(%ecx), %mm0 | |
1524 movd %mm0, 1024(%esi) | |
1525 | |
1526 movd (%ecx), %mm0 | |
1527 pfsub 4(%ecx), %mm0 | |
1528 pfmul 120(%ebx), %mm0 | |
1529 movd %mm0, (%esi) | |
1530 movd %mm0, (%edi) | |
1531 | |
1532 movd 12(%ecx), %mm0 | |
1533 pfsub 8(%ecx), %mm0 | |
1534 pfmul 120(%ebx), %mm0 | |
1535 movd %mm0, 512(%edi) | |
1536 pfadd 12(%ecx), %mm0 | |
1537 pfadd 8(%ecx), %mm0 | |
1538 movd %mm0, 512(%esi) | |
1539 | |
1540 movd 16(%ecx), %mm0 | |
1541 pfsub 20(%ecx), %mm0 | |
1542 pfmul 120(%ebx), %mm0 | |
1543 movq %mm0, %mm3 | |
1544 | |
1545 movd 28(%ecx), %mm0 | |
1546 pfsub 24(%ecx), %mm0 | |
1547 pfmul 120(%ebx), %mm0 | |
1548 movd %mm0, 768(%edi) | |
1549 movq %mm0, %mm2 | |
1550 | |
1551 pfadd 24(%ecx), %mm0 | |
1552 pfadd 28(%ecx), %mm0 | |
1553 movq %mm0, %mm1 | |
1554 | |
1555 pfadd 16(%ecx), %mm0 | |
1556 pfadd 20(%ecx), %mm0 | |
1557 movd %mm0, 768(%esi) | |
1558 pfadd %mm3, %mm1 | |
1559 movd %mm1, 256(%esi) | |
1560 pfadd %mm3, %mm2 | |
1561 movd %mm2, 256(%edi) | |
1562 */ | |
1563 | |
1564 | |
1565 /* Phase 8*/ | |
1566 /* | |
1567 movq 32(%edx), %mm0 | |
1568 movq 48(%edx), %mm1 | |
1569 pfadd 48(%edx), %mm0 | |
1570 pfadd 40(%edx), %mm1 | |
1571 movd %mm0, 896(%esi) | |
1572 movd %mm1, 640(%esi) | |
1573 psrlq $32, %mm0 | |
1574 psrlq $32, %mm1 | |
1575 movd %mm0, 128(%edi) | |
1576 movd %mm1, 384(%edi) | |
1577 | |
1578 movd 40(%edx), %mm0 | |
1579 pfadd 56(%edx), %mm0 | |
1580 movd %mm0, 384(%esi) | |
1581 | |
1582 movd 56(%edx), %mm0 | |
1583 pfadd 36(%edx), %mm0 | |
1584 movd %mm0, 128(%esi) | |
1585 | |
1586 movd 60(%edx), %mm0 | |
1587 movd %mm0, 896(%edi) | |
1588 pfadd 44(%edx), %mm0 | |
1589 movd %mm0, 640(%edi) | |
1590 | |
1591 movq 96(%edx), %mm0 | |
1592 movq 112(%edx), %mm2 | |
1593 movq 104(%edx), %mm4 | |
1594 pfadd 112(%edx), %mm0 | |
1595 pfadd 104(%edx), %mm2 | |
1596 pfadd 120(%edx), %mm4 | |
1597 movq %mm0, %mm1 | |
1598 movq %mm2, %mm3 | |
1599 movq %mm4, %mm5 | |
1600 pfadd 64(%edx), %mm0 | |
1601 pfadd 80(%edx), %mm2 | |
1602 pfadd 72(%edx), %mm4 | |
1603 movd %mm0, 960(%esi) | |
1604 movd %mm2, 704(%esi) | |
1605 movd %mm4, 448(%esi) | |
1606 psrlq $32, %mm0 | |
1607 psrlq $32, %mm2 | |
1608 psrlq $32, %mm4 | |
1609 movd %mm0, 64(%edi) | |
1610 movd %mm2, 320(%edi) | |
1611 movd %mm4, 576(%edi) | |
1612 pfadd 80(%edx), %mm1 | |
1613 pfadd 72(%edx), %mm3 | |
1614 pfadd 88(%edx), %mm5 | |
1615 movd %mm1, 832(%esi) | |
1616 movd %mm3, 576(%esi) | |
1617 movd %mm5, 320(%esi) | |
1618 psrlq $32, %mm1 | |
1619 psrlq $32, %mm3 | |
1620 psrlq $32, %mm5 | |
1621 movd %mm1, 192(%edi) | |
1622 movd %mm3, 448(%edi) | |
1623 movd %mm5, 704(%edi) | |
1624 | |
1625 movd 120(%edx), %mm0 | |
1626 pfadd 100(%edx), %mm0 | |
1627 movq %mm0, %mm1 | |
1628 pfadd 88(%edx), %mm0 | |
1629 movd %mm0, 192(%esi) | |
1630 pfadd 68(%edx), %mm1 | |
1631 movd %mm1, 64(%esi) | |
1632 | |
1633 movd 124(%edx), %mm0 | |
1634 movd %mm0, 960(%edi) | |
1635 pfadd 92(%edx), %mm0 | |
1636 movd %mm0, 832(%edi) | |
1637 | |
1638 jmp .L_bye | |
1639 .L01: | |
1640 */ | |
1641 | |
1642 | |
1643 /* Phase 9*/ | |
1644 /* | |
1645 movq (%ecx), %mm0 | |
1646 movq %mm0, %mm1 | |
1647 pxor %mm7, %mm1 | |
1648 pfacc %mm1, %mm0 | |
1649 pfmul %mm6, %mm0 | |
1650 pf2id %mm0, %mm0 | |
1651 movd %mm0, %eax | |
1652 movw %ax, 512(%esi) | |
1653 psrlq $32, %mm0 | |
1654 movd %mm0, %eax | |
1655 movw %ax, (%esi) | |
1656 | |
1657 movd 12(%ecx), %mm0 | |
1658 pfsub 8(%ecx), %mm0 | |
1659 pfmul 120(%ebx), %mm0 | |
1660 pf2id %mm0, %mm7 | |
1661 movd %mm7, %eax | |
1662 movw %ax, 256(%edi) | |
1663 pfadd 12(%ecx), %mm0 | |
1664 pfadd 8(%ecx), %mm0 | |
1665 pf2id %mm0, %mm0 | |
1666 movd %mm0, %eax | |
1667 movw %ax, 256(%esi) | |
1668 | |
1669 movd 16(%ecx), %mm3 | |
1670 pfsub 20(%ecx), %mm3 | |
1671 pfmul 120(%ebx), %mm3 | |
1672 movq %mm3, %mm2 | |
1673 | |
1674 movd 28(%ecx), %mm2 | |
1675 pfsub 24(%ecx), %mm2 | |
1676 pfmul 120(%ebx), %mm2 | |
1677 movq %mm2, %mm1 | |
1678 | |
1679 pf2id %mm2, %mm7 | |
1680 movd %mm7, %eax | |
1681 movw %ax, 384(%edi) | |
1682 | |
1683 pfadd 24(%ecx), %mm1 | |
1684 pfadd 28(%ecx), %mm1 | |
1685 movq %mm1, %mm0 | |
1686 | |
1687 pfadd 16(%ecx), %mm0 | |
1688 pfadd 20(%ecx), %mm0 | |
1689 pf2id %mm0, %mm0 | |
1690 movd %mm0, %eax | |
1691 movw %ax, 384(%esi) | |
1692 pfadd %mm3, %mm1 | |
1693 pf2id %mm1, %mm1 | |
1694 movd %mm1, %eax | |
1695 movw %ax, 128(%esi) | |
1696 pfadd %mm3, %mm2 | |
1697 pf2id %mm2, %mm2 | |
1698 movd %mm2, %eax | |
1699 movw %ax, 128(%edi) | |
1700 */ | |
1701 | |
1702 | |
1703 /* Phase 10*/ | |
1704 /* | |
1705 movq 32(%edx), %mm0 | |
1706 movq 48(%edx), %mm1 | |
1707 pfadd 48(%edx), %mm0 | |
1708 pfadd 40(%edx), %mm1 | |
1709 pf2id %mm0, %mm0 | |
1710 pf2id %mm1, %mm1 | |
1711 movd %mm0, %eax | |
1712 movd %mm1, %ecx | |
1713 movw %ax, 448(%esi) | |
1714 movw %cx, 320(%esi) | |
1715 psrlq $32, %mm0 | |
1716 psrlq $32, %mm1 | |
1717 movd %mm0, %eax | |
1718 movd %mm1, %ecx | |
1719 movw %ax, 64(%edi) | |
1720 movw %cx, 192(%edi) | |
1721 | |
1722 movd 40(%edx), %mm3 | |
1723 movd 56(%edx), %mm4 | |
1724 movd 60(%edx), %mm0 | |
1725 movd 44(%edx), %mm2 | |
1726 movd 120(%edx), %mm5 | |
1727 punpckldq %mm4, %mm3 | |
1728 punpckldq 124(%edx), %mm0 | |
1729 pfadd 100(%edx), %mm5 | |
1730 punpckldq 36(%edx), %mm4 | |
1731 punpckldq 92(%edx), %mm2 | |
1732 movq %mm5, %mm6 | |
1733 pfadd %mm4, %mm3 | |
1734 pf2id %mm0, %mm1 | |
1735 pf2id %mm3, %mm3 | |
1736 pfadd 88(%edx), %mm5 | |
1737 movd %mm1, %eax | |
1738 movd %mm3, %ecx | |
1739 movw %ax, 448(%edi) | |
1740 movw %cx, 192(%esi) | |
1741 pf2id %mm5, %mm5 | |
1742 psrlq $32, %mm1 | |
1743 psrlq $32, %mm3 | |
1744 movd %mm5, %ebx | |
1745 movd %mm1, %eax | |
1746 movd %mm3, %ecx | |
1747 movw %bx, 96(%esi) | |
1748 movw %ax, 480(%edi) | |
1749 movw %cx, 64(%esi) | |
1750 pfadd %mm2, %mm0 | |
1751 pf2id %mm0, %mm0 | |
1752 movd %mm0, %eax | |
1753 pfadd 68(%edx), %mm6 | |
1754 movw %ax, 320(%edi) | |
1755 psrlq $32, %mm0 | |
1756 pf2id %mm6, %mm6 | |
1757 movd %mm0, %eax | |
1758 movd %mm6, %ebx | |
1759 movw %ax, 416(%edi) | |
1760 movw %bx, 32(%esi) | |
1761 | |
1762 movq 96(%edx), %mm0 | |
1763 movq 112(%edx), %mm2 | |
1764 movq 104(%edx), %mm4 | |
1765 pfadd %mm2, %mm0 | |
1766 pfadd %mm4, %mm2 | |
1767 pfadd 120(%edx), %mm4 | |
1768 movq %mm0, %mm1 | |
1769 movq %mm2, %mm3 | |
1770 movq %mm4, %mm5 | |
1771 pfadd 64(%edx), %mm0 | |
1772 pfadd 80(%edx), %mm2 | |
1773 pfadd 72(%edx), %mm4 | |
1774 pf2id %mm0, %mm0 | |
1775 pf2id %mm2, %mm2 | |
1776 pf2id %mm4, %mm4 | |
1777 movd %mm0, %eax | |
1778 movd %mm2, %ecx | |
1779 movd %mm4, %ebx | |
1780 movw %ax, 480(%esi) | |
1781 movw %cx, 352(%esi) | |
1782 movw %bx, 224(%esi) | |
1783 psrlq $32, %mm0 | |
1784 psrlq $32, %mm2 | |
1785 psrlq $32, %mm4 | |
1786 movd %mm0, %eax | |
1787 movd %mm2, %ecx | |
1788 movd %mm4, %ebx | |
1789 movw %ax, 32(%edi) | |
1790 movw %cx, 160(%edi) | |
1791 movw %bx, 288(%edi) | |
1792 pfadd 80(%edx), %mm1 | |
1793 pfadd 72(%edx), %mm3 | |
1794 pfadd 88(%edx), %mm5 | |
1795 pf2id %mm1, %mm1 | |
1796 pf2id %mm3, %mm3 | |
1797 pf2id %mm5, %mm5 | |
1798 movd %mm1, %eax | |
1799 movd %mm3, %ecx | |
1800 movd %mm5, %ebx | |
1801 movw %ax, 416(%esi) | |
1802 movw %cx, 288(%esi) | |
1803 movw %bx, 160(%esi) | |
1804 psrlq $32, %mm1 | |
1805 psrlq $32, %mm3 | |
1806 psrlq $32, %mm5 | |
1807 movd %mm1, %eax | |
1808 movd %mm3, %ecx | |
1809 movd %mm5, %ebx | |
1810 movw %ax, 96(%edi) | |
1811 movw %cx, 224(%edi) | |
1812 movw %bx, 352(%edi) | |
1813 | |
1814 movsw | |
1815 | |
1816 .L_bye: | |
1817 addl $256,%esp | |
1818 / femms | |
1819 emms | |
1820 popl %edi | |
1821 popl %esi | |
1822 popl %ebx | |
1823 ret $12 | |
1824 */ | |
1825 | |
1826 // here comes old fashioned FPU code for the tough parts | |
1827 | |
1828 /* Phase 5*/ | |
1829 | |
# Scalar x87 fallback (the SSE versions of phases 4/5 above are disabled).
# Phase 5: butterfly pass over the upper half (byte offsets 32..124) of the
# first scratch buffer at (%ecx); results are written to the same offsets in
# the second scratch buffer at (%edx).  Per pair the pattern is
#   even slot <- a + b,   odd slot <- (a - b) * 120(%ebx)
# where 120(%ebx) is a costab_mmx entry (presumably the cos(pi/4)-style
# coefficient of the final butterfly stage -- confirm against costab_mmx).
1830 flds 32(%ecx) | |
1831 fadds 36(%ecx) | |
1832 fstps 32(%edx) | |
1833 | |
1834 flds 32(%ecx) | |
1835 fsubs 36(%ecx) | |
1836 fmuls 120(%ebx) | |
1837 fstps 36(%edx) | |
1838 | |
# fsts (no pop) keeps the scaled difference on the x87 stack so the
# two fadds below can reuse it.
1839 flds 44(%ecx) | |
1840 fsubs 40(%ecx) | |
1841 fmuls 120(%ebx) | |
1842 fsts 44(%edx) | |
1843 fadds 40(%ecx) | |
1844 fadds 44(%ecx) | |
1845 fstps 40(%edx) | |
1846 | |
# Four-element group 48..60: two scaled differences are combined with the
# sums; 'fld %st(0)' duplicates the top of the x87 stack so one copy can
# be stored while the other keeps accumulating.
1847 flds 48(%ecx) | |
1848 fsubs 52(%ecx) | |
1849 fmuls 120(%ebx) | |
1850 | |
1851 flds 60(%ecx) | |
1852 fsubs 56(%ecx) | |
1853 fmuls 120(%ebx) | |
1854 fld %st(0) | |
1855 fadds 56(%ecx) | |
1856 fadds 60(%ecx) | |
1857 fld %st(0) | |
1858 fadds 48(%ecx) | |
1859 fadds 52(%ecx) | |
1860 fstps 48(%edx) | |
1861 fadd %st(2) | |
1862 fstps 56(%edx) | |
1863 fsts 60(%edx) | |
1864 faddp %st(1) | |
1865 fstps 52(%edx) | |
1866 | |
1867 flds 64(%ecx) | |
1868 fadds 68(%ecx) | |
1869 fstps 64(%edx) | |
1870 | |
1871 flds 64(%ecx) | |
1872 fsubs 68(%ecx) | |
1873 fmuls 120(%ebx) | |
1874 fstps 68(%edx) | |
1875 | |
1876 flds 76(%ecx) | |
1877 fsubs 72(%ecx) | |
1878 fmuls 120(%ebx) | |
1879 fsts 76(%edx) | |
1880 fadds 72(%ecx) | |
1881 fadds 76(%ecx) | |
1882 fstps 72(%edx) | |
1883 | |
1884 flds 92(%ecx) | |
1885 fsubs 88(%ecx) | |
1886 fmuls 120(%ebx) | |
1887 fsts 92(%edx) | |
1888 fadds 92(%ecx) | |
1889 fadds 88(%ecx) | |
1890 fld %st(0) | |
1891 fadds 80(%ecx) | |
1892 fadds 84(%ecx) | |
1893 fstps 80(%edx) | |
1894 | |
1895 flds 80(%ecx) | |
1896 fsubs 84(%ecx) | |
1897 fmuls 120(%ebx) | |
1898 fadd %st(0), %st(1) | |
1899 fadds 92(%edx) | |
1900 fstps 84(%edx) | |
1901 fstps 88(%edx) | |
1902 | |
1903 flds 96(%ecx) | |
1904 fadds 100(%ecx) | |
1905 fstps 96(%edx) | |
1906 | |
1907 flds 96(%ecx) | |
1908 fsubs 100(%ecx) | |
1909 fmuls 120(%ebx) | |
1910 fstps 100(%edx) | |
1911 | |
1912 flds 108(%ecx) | |
1913 fsubs 104(%ecx) | |
1914 fmuls 120(%ebx) | |
1915 fsts 108(%edx) | |
1916 fadds 104(%ecx) | |
1917 fadds 108(%ecx) | |
1918 fstps 104(%edx) | |
1919 | |
1920 flds 124(%ecx) | |
1921 fsubs 120(%ecx) | |
1922 fmuls 120(%ebx) | |
1923 fsts 124(%edx) | |
1924 fadds 120(%ecx) | |
1925 fadds 124(%ecx) | |
1926 fld %st(0) | |
1927 fadds 112(%ecx) | |
1928 fadds 116(%ecx) | |
1929 fstps 112(%edx) | |
1930 | |
1931 flds 112(%ecx) | |
1932 fsubs 116(%ecx) | |
1933 fmuls 120(%ebx) | |
1934 fadd %st(0),%st(1) | |
1935 fadds 124(%edx) | |
1936 fstps 116(%edx) | |
1937 fstps 120(%edx) | |
# Nothing since the 'orl %ecx,%ecx' at function entry has modified EFLAGS
# (mov/x87/SSE data instructions leave them untouched), so this still tests
# the caller-supplied %ecx: nonzero takes the 16-bit integer output path at
# .L01, zero falls through to the float output path.  NOTE(review): confirm
# the %ecx convention against the caller in decode_MMX.
1938 jnz .L01 | |
1939 | |
1940 | |
1941 /* Phase 6*/ | |
1942 | |
# Float output path, part 1: final butterflies over the low half
# (byte offsets 0..28) of the first scratch buffer at (%ecx), storing
# finished 32-bit float samples directly into the two output banks at
# (%esi) and (%edi).  The offsets (0, 256, 512, 768, 1024) are byte
# offsets -- presumably the synthesis-window sample layout; confirm
# against the caller.
1943 flds (%ecx) | |
1944 fadds 4(%ecx) | |
1945 fstps 1024(%esi) | |
1946 | |
# Sample 0 goes to both banks: fsts stores without popping, fstps then
# stores the same value and pops it.
1947 flds (%ecx) | |
1948 fsubs 4(%ecx) | |
1949 fmuls 120(%ebx) | |
1950 fsts (%esi) | |
1951 fstps (%edi) | |
1952 | |
1953 flds 12(%ecx) | |
1954 fsubs 8(%ecx) | |
1955 fmuls 120(%ebx) | |
1956 fsts 512(%edi) | |
1957 fadds 12(%ecx) | |
1958 fadds 8(%ecx) | |
1959 fstps 512(%esi) | |
1960 | |
# Four-element group 16..28 with the same x87 stack juggling as the
# corresponding group in Phase 5 ('fld %st(0)' duplicates the top).
1961 flds 16(%ecx) | |
1962 fsubs 20(%ecx) | |
1963 fmuls 120(%ebx) | |
1964 | |
1965 flds 28(%ecx) | |
1966 fsubs 24(%ecx) | |
1967 fmuls 120(%ebx) | |
1968 fsts 768(%edi) | |
1969 fld %st(0) | |
1970 fadds 24(%ecx) | |
1971 fadds 28(%ecx) | |
1972 fld %st(0) | |
1973 fadds 16(%ecx) | |
1974 fadds 20(%ecx) | |
1975 fstps 768(%esi) | |
1976 fadd %st(2) | |
1977 fstps 256(%esi) | |
1978 faddp %st(1) | |
1979 fstps 256(%edi) | |
1980 | |
1981 /* Phase 7*/ | |
1982 | |
# Float output path, part 2: combine the Phase-5 results from the second
# scratch buffer at (%edx) into the remaining slots of both output banks,
# then return.  Simple pairs first (x + y stored once per bank):
1983 flds 32(%edx) | |
1984 fadds 48(%edx) | |
1985 fstps 896(%esi) | |
1986 | |
1987 flds 48(%edx) | |
1988 fadds 40(%edx) | |
1989 fstps 640(%esi) | |
1990 | |
1991 flds 40(%edx) | |
1992 fadds 56(%edx) | |
1993 fstps 384(%esi) | |
1994 | |
1995 flds 56(%edx) | |
1996 fadds 36(%edx) | |
1997 fstps 128(%esi) | |
1998 | |
1999 flds 36(%edx) | |
2000 fadds 52(%edx) | |
2001 fstps 128(%edi) | |
2002 | |
2003 flds 52(%edx) | |
2004 fadds 44(%edx) | |
2005 fstps 384(%edi) | |
2006 | |
# 60(%edx) is stored raw (fsts keeps it) and then reused for the sum.
2007 flds 60(%edx) | |
2008 fsts 896(%edi) | |
2009 fadds 44(%edx) | |
2010 fstps 640(%edi) | |
2011 | |
# Each group below computes t = x + y once, duplicates it with
# 'fld %st(0)', and combines it with two different neighbours so the
# shared sum is evaluated only once.
2012 flds 96(%edx) | |
2013 fadds 112(%edx) | |
2014 fld %st(0) | |
2015 fadds 64(%edx) | |
2016 fstps 960(%esi) | |
2017 fadds 80(%edx) | |
2018 fstps 832(%esi) | |
2019 | |
2020 flds 112(%edx) | |
2021 fadds 104(%edx) | |
2022 fld %st(0) | |
2023 fadds 80(%edx) | |
2024 fstps 704(%esi) | |
2025 fadds 72(%edx) | |
2026 fstps 576(%esi) | |
2027 | |
2028 flds 104(%edx) | |
2029 fadds 120(%edx) | |
2030 fld %st(0) | |
2031 fadds 72(%edx) | |
2032 fstps 448(%esi) | |
2033 fadds 88(%edx) | |
2034 fstps 320(%esi) | |
2035 | |
2036 flds 120(%edx) | |
2037 fadds 100(%edx) | |
2038 fld %st(0) | |
2039 fadds 88(%edx) | |
2040 fstps 192(%esi) | |
2041 fadds 68(%edx) | |
2042 fstps 64(%esi) | |
2043 | |
2044 flds 100(%edx) | |
2045 fadds 116(%edx) | |
2046 fld %st(0) | |
2047 fadds 68(%edx) | |
2048 fstps 64(%edi) | |
2049 fadds 84(%edx) | |
2050 fstps 192(%edi) | |
2051 | |
2052 flds 116(%edx) | |
2053 fadds 108(%edx) | |
2054 fld %st(0) | |
2055 fadds 84(%edx) | |
2056 fstps 320(%edi) | |
2057 fadds 76(%edx) | |
2058 fstps 448(%edi) | |
2059 | |
2060 flds 108(%edx) | |
2061 fadds 124(%edx) | |
2062 fld %st(0) | |
2063 fadds 76(%edx) | |
2064 fstps 576(%edi) | |
2065 fadds 92(%edx) | |
2066 fstps 704(%edi) | |
2067 | |
2068 flds 124(%edx) | |
2069 fsts 960(%edi) | |
2070 fadds 92(%edx) | |
2071 fstps 832(%edi) | |
# Float-path epilogue: release the 256-byte stack scratch area and
# restore the callee-saved registers pushed at entry.
# NOTE(review): this path returns with a plain 'ret', while the integer
# path below ends in 'ret $12' (callee pops the three 4-byte arguments).
# One of the two must be wrong unless the caller compensates -- confirm
# against the call site in decode_MMX.
2072 addl $256,%esp | |
2073 popl %edi | |
2074 popl %esi | |
2075 popl %ebx | |
2076 ret | |
# .L01 -- 16-bit integer output path, taken when the caller passed a
# nonzero %ecx (see the 'jnz .L01' after Phase 5 and the
# 'orl %ecx,%ecx' at function entry).
2077 .L01: | |
2078 /* Phase 8*/ | |
2079 | |
# Same butterflies as Phase 6, but results are stored with fist/fistp,
# i.e. rounded to integer under the current FPU rounding mode.  The
# stores carry no size suffix, so the assembler's default integer size
# applies -- presumably 16-bit PCM samples, since the bank offsets here
# are half those of the float path; TODO confirm the emitted operand
# size matches the output buffer's element width.
2080 flds (%ecx) | |
2081 fadds 4(%ecx) | |
2082 fistp 512(%esi) | |
2083 | |
2084 flds (%ecx) | |
2085 fsubs 4(%ecx) | |
2086 fmuls 120(%ebx) | |
2087 | |
# Sample 0 is stored only to (%esi) here; the 'movsw' at the end of
# Phase 9 copies it into the (%edi) bank.
2088 fistp (%esi) | |
2089 | |
2090 | |
# fist (no pop): the scaled difference stays on the x87 stack and is
# reused by the two fadds that follow.
2091 flds 12(%ecx) | |
2092 fsubs 8(%ecx) | |
2093 fmuls 120(%ebx) | |
2094 fist 256(%edi) | |
2095 fadds 12(%ecx) | |
2096 fadds 8(%ecx) | |
2097 fistp 256(%esi) | |
2098 | |
# Four-element group 16..28, mirroring the Phase-6 stack juggling.
2099 flds 16(%ecx) | |
2100 fsubs 20(%ecx) | |
2101 fmuls 120(%ebx) | |
2102 | |
2103 flds 28(%ecx) | |
2104 fsubs 24(%ecx) | |
2105 fmuls 120(%ebx) | |
2106 fist 384(%edi) | |
2107 fld %st(0) | |
2108 fadds 24(%ecx) | |
2109 fadds 28(%ecx) | |
2110 fld %st(0) | |
2111 fadds 16(%ecx) | |
2112 fadds 20(%ecx) | |
2113 fistp 384(%esi) | |
2114 fadd %st(2) | |
2115 fistp 128(%esi) | |
2116 faddp %st(1) | |
2117 fistp 128(%edi) | |
2118 | |
2119 /* Phase 9*/ | |
2120 | |
# Integer-path counterpart of Phase 7: combine the Phase-5 scratch
# values at (%edx) into the remaining slots of both integer output
# banks (offsets are half the float path's, consistent with 16-bit
# samples -- TODO confirm).
2121 flds 32(%edx) | |
2122 fadds 48(%edx) | |
2123 fistp 448(%esi) | |
2124 | |
2125 flds 48(%edx) | |
2126 fadds 40(%edx) | |
2127 fistp 320(%esi) | |
2128 | |
2129 flds 40(%edx) | |
2130 fadds 56(%edx) | |
2131 fistp 192(%esi) | |
2132 | |
2133 flds 56(%edx) | |
2134 fadds 36(%edx) | |
2135 fistp 64(%esi) | |
2136 | |
2137 flds 36(%edx) | |
2138 fadds 52(%edx) | |
2139 fistp 64(%edi) | |
2140 | |
2141 flds 52(%edx) | |
2142 fadds 44(%edx) | |
2143 fistp 192(%edi) | |
2144 | |
# fist (no pop) stores 60(%edx) raw, then the same value is reused for
# the sum with 44(%edx).
2145 flds 60(%edx) | |
2146 fist 448(%edi) | |
2147 fadds 44(%edx) | |
2148 fistp 320(%edi) | |
2149 | |
# Shared-sum groups, as in Phase 7: t = x + y is duplicated with
# 'fld %st(0)' and combined with two different neighbours.
2150 flds 96(%edx) | |
2151 fadds 112(%edx) | |
2152 fld %st(0) | |
2153 fadds 64(%edx) | |
2154 fistp 480(%esi) | |
2155 fadds 80(%edx) | |
2156 fistp 416(%esi) | |
2157 | |
2158 flds 112(%edx) | |
2159 fadds 104(%edx) | |
2160 fld %st(0) | |
2161 fadds 80(%edx) | |
2162 fistp 352(%esi) | |
2163 fadds 72(%edx) | |
2164 fistp 288(%esi) | |
2165 | |
2166 flds 104(%edx) | |
2167 fadds 120(%edx) | |
2168 fld %st(0) | |
2169 fadds 72(%edx) | |
2170 fistp 224(%esi) | |
2171 fadds 88(%edx) | |
2172 fistp 160(%esi) | |
2173 | |
2174 flds 120(%edx) | |
2175 fadds 100(%edx) | |
2176 fld %st(0) | |
2177 fadds 88(%edx) | |
2178 fistp 96(%esi) | |
2179 fadds 68(%edx) | |
2180 fistp 32(%esi) | |
2181 | |
2182 flds 100(%edx) | |
2183 fadds 116(%edx) | |
2184 fld %st(0) | |
2185 fadds 68(%edx) | |
2186 fistp 32(%edi) | |
2187 fadds 84(%edx) | |
2188 fistp 96(%edi) | |
2189 | |
2190 flds 116(%edx) | |
2191 fadds 108(%edx) | |
2192 fld %st(0) | |
2193 fadds 84(%edx) | |
2194 fistp 160(%edi) | |
2195 fadds 76(%edx) | |
2196 fistp 224(%edi) | |
2197 | |
2198 flds 108(%edx) | |
2199 fadds 124(%edx) | |
2200 fld %st(0) | |
2201 fadds 76(%edx) | |
2202 fistp 288(%edi) | |
2203 fadds 92(%edx) | |
2204 fistp 352(%edi) | |
2205 | |
2206 flds 124(%edx) | |
2207 fist 480(%edi) | |
2208 fadds 92(%edx) | |
2209 fistp 416(%edi) | |
# Copy one 16-bit word from (%esi) to (%edi): this duplicates the
# sample written by 'fistp (%esi)' in Phase 8 into the second bank,
# mirroring the float path's dual store 'fsts (%esi); fstps (%edi)'.
# NOTE(review): relies on %esi/%edi still pointing at the bank bases
# (nothing in this path has modified them) and on the direction flag
# being clear -- confirm DF is guaranteed clear at this point.
2210 movsw | |
# Integer-path epilogue.  NOTE(review): 'ret $12' pops the three
# 4-byte arguments, while the float path above returns with a plain
# 'ret' -- confirm which convention the caller expects (see the float
# epilogue).  Also note no emms/femms here; no MMX instructions run on
# this path (only x87 and SSE), so the x87 state is not aliased.
2211 addl $256,%esp | |
2212 popl %edi | |
2213 popl %esi | |
2214 popl %ebx | |
2215 ret $12 | |
2216 | |
2217 // end of FPU stuff | |