/*
 * This code was taken from http://www.mpg123.org
 * See ChangeLog of mpg123-0.59s-pre.1 for details
 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
 * Partial 3dnow! optimization by Nick Kurshev
 *
 * TODO: optimize scalar 3dnow! code
 * Warning: Phases 7 & 8 are not tested
 */
|
|
10 #define real float /* ugly - but only way */
|
|
11
|
5291
|
12 #include "../mangle.h"
|
|
13
|
4148
|
/*
 * Constants that are referenced ONLY from inside the inline-assembly text of
 * dct64_MMX_3dnow.  The compiler cannot see those references, so both objects
 * are marked "used" to keep them from being discarded as unreferenced statics.
 */

/* Bit 63 only: XOR-ing an MMX register with it flips the sign of the upper
 * 32-bit float lane (the asm comments call this "+1 | -1"). */
static unsigned long long int __attribute__((aligned(8), used)) x_plus_minus_3dnow = 0x8000000000000000ULL;

/* 1.0f, loaded into the low lane of mm6 before a cosine coefficient is
 * unpacked into the high lane. */
static float __attribute__((used)) plus_1f = 1.0f;
|
|
16
|
|
17 void __attribute__ (( __stdcall__ )) dct64_MMX_3dnow(real *a,real *b,real *c)
|
|
18 {
|
|
19 char tmp[256];
|
|
20 __asm __volatile(
|
|
21 " movl %2,%%eax\n\t"
|
|
22
|
|
23 " leal 128+%3,%%edx\n\t"
|
|
24 " movl %0,%%esi\n\t"
|
|
25 " movl %1,%%edi\n\t"
|
5291
|
26 " movl $"MANGLE(costab_mmx)",%%ebx\n\t"
|
4148
|
27 " leal %3,%%ecx\n\t"
|
|
28
|
|
29 /* Phase 1*/
|
|
30 " movq (%%eax), %%mm0\n\t"
|
|
31 " movq 8(%%eax), %%mm4\n\t"
|
|
32 " movq %%mm0, %%mm3\n\t"
|
|
33 " movq %%mm4, %%mm7\n\t"
|
|
34 " movq 120(%%eax), %%mm1\n\t"
|
|
35 " movq 112(%%eax), %%mm5\n\t"
|
|
36 /* n.b.: pswapd*/
|
|
37 " movq %%mm1, %%mm2\n\t"
|
|
38 " movq %%mm5, %%mm6\n\t"
|
|
39 " psrlq $32, %%mm1\n\t"
|
|
40 " psrlq $32, %%mm5\n\t"
|
|
41 " punpckldq %%mm2, %%mm1\n\t"
|
|
42 " punpckldq %%mm6, %%mm5\n\t"
|
|
43 /**/
|
|
44 " pfadd %%mm1, %%mm0\n\t"
|
|
45 " pfadd %%mm5, %%mm4\n\t"
|
|
46 " movq %%mm0, (%%edx)\n\t"
|
|
47 " movq %%mm4, 8(%%edx)\n\t"
|
|
48 " pfsub %%mm1, %%mm3\n\t"
|
|
49 " pfsub %%mm5, %%mm7\n\t"
|
|
50 " pfmul (%%ebx), %%mm3\n\t"
|
|
51 " pfmul 8(%%ebx), %%mm7\n\t"
|
|
52 " movd %%mm3, 124(%%edx)\n\t"
|
|
53 " movd %%mm7, 116(%%edx)\n\t"
|
|
54 " psrlq $32, %%mm3\n\t"
|
|
55 " psrlq $32, %%mm7\n\t"
|
|
56 " movd %%mm3, 120(%%edx)\n\t"
|
|
57 " movd %%mm7, 112(%%edx)\n\t"
|
|
58
|
|
59 " movq 16(%%eax), %%mm0\n\t"
|
|
60 " movq 24(%%eax), %%mm4\n\t"
|
|
61 " movq %%mm0, %%mm3\n\t"
|
|
62 " movq %%mm4, %%mm7\n\t"
|
|
63 " movq 104(%%eax), %%mm1\n\t"
|
|
64 " movq 96(%%eax), %%mm5\n\t"
|
|
65 /* n.b.: pswapd*/
|
|
66 " movq %%mm1, %%mm2\n\t"
|
|
67 " movq %%mm5, %%mm6\n\t"
|
|
68 " psrlq $32, %%mm1\n\t"
|
|
69 " psrlq $32, %%mm5\n\t"
|
|
70 " punpckldq %%mm2, %%mm1\n\t"
|
|
71 " punpckldq %%mm6, %%mm5\n\t"
|
|
72 /**/
|
|
73 " pfadd %%mm1, %%mm0\n\t"
|
|
74 " pfadd %%mm5, %%mm4\n\t"
|
|
75 " movq %%mm0, 16(%%edx)\n\t"
|
|
76 " movq %%mm4, 24(%%edx)\n\t"
|
|
77 " pfsub %%mm1, %%mm3\n\t"
|
|
78 " pfsub %%mm5, %%mm7\n\t"
|
|
79 " pfmul 16(%%ebx), %%mm3\n\t"
|
|
80 " pfmul 24(%%ebx), %%mm7\n\t"
|
|
81 " movd %%mm3, 108(%%edx)\n\t"
|
|
82 " movd %%mm7, 100(%%edx)\n\t"
|
|
83 " psrlq $32, %%mm3\n\t"
|
|
84 " psrlq $32, %%mm7\n\t"
|
|
85 " movd %%mm3, 104(%%edx)\n\t"
|
|
86 " movd %%mm7, 96(%%edx)\n\t"
|
|
87
|
|
88 " movq 32(%%eax), %%mm0\n\t"
|
|
89 " movq 40(%%eax), %%mm4\n\t"
|
|
90 " movq %%mm0, %%mm3\n\t"
|
|
91 " movq %%mm4, %%mm7\n\t"
|
|
92 " movq 88(%%eax), %%mm1\n\t"
|
|
93 " movq 80(%%eax), %%mm5\n\t"
|
|
94 /* n.b.: pswapd*/
|
|
95 " movq %%mm1, %%mm2\n\t"
|
|
96 " movq %%mm5, %%mm6\n\t"
|
|
97 " psrlq $32, %%mm1\n\t"
|
|
98 " psrlq $32, %%mm5\n\t"
|
|
99 " punpckldq %%mm2, %%mm1\n\t"
|
|
100 " punpckldq %%mm6, %%mm5\n\t"
|
|
101 /**/
|
|
102 " pfadd %%mm1, %%mm0\n\t"
|
|
103 " pfadd %%mm5, %%mm4\n\t"
|
|
104 " movq %%mm0, 32(%%edx)\n\t"
|
|
105 " movq %%mm4, 40(%%edx)\n\t"
|
|
106 " pfsub %%mm1, %%mm3\n\t"
|
|
107 " pfsub %%mm5, %%mm7\n\t"
|
|
108 " pfmul 32(%%ebx), %%mm3\n\t"
|
|
109 " pfmul 40(%%ebx), %%mm7\n\t"
|
|
110 " movd %%mm3, 92(%%edx)\n\t"
|
|
111 " movd %%mm7, 84(%%edx)\n\t"
|
|
112 " psrlq $32, %%mm3\n\t"
|
|
113 " psrlq $32, %%mm7\n\t"
|
|
114 " movd %%mm3, 88(%%edx)\n\t"
|
|
115 " movd %%mm7, 80(%%edx)\n\t"
|
|
116
|
|
117 " movq 48(%%eax), %%mm0\n\t"
|
|
118 " movq 56(%%eax), %%mm4\n\t"
|
|
119 " movq %%mm0, %%mm3\n\t"
|
|
120 " movq %%mm4, %%mm7\n\t"
|
|
121 " movq 72(%%eax), %%mm1\n\t"
|
|
122 " movq 64(%%eax), %%mm5\n\t"
|
|
123 /* n.b.: pswapd*/
|
|
124 " movq %%mm1, %%mm2\n\t"
|
|
125 " movq %%mm5, %%mm6\n\t"
|
|
126 " psrlq $32, %%mm1\n\t"
|
|
127 " psrlq $32, %%mm5\n\t"
|
|
128 " punpckldq %%mm2, %%mm1\n\t"
|
|
129 " punpckldq %%mm6, %%mm5\n\t"
|
|
130 /**/
|
|
131 " pfadd %%mm1, %%mm0\n\t"
|
|
132 " pfadd %%mm5, %%mm4\n\t"
|
|
133 " movq %%mm0, 48(%%edx)\n\t"
|
|
134 " movq %%mm4, 56(%%edx)\n\t"
|
|
135 " pfsub %%mm1, %%mm3\n\t"
|
|
136 " pfsub %%mm5, %%mm7\n\t"
|
|
137 " pfmul 48(%%ebx), %%mm3\n\t"
|
|
138 " pfmul 56(%%ebx), %%mm7\n\t"
|
|
139 " movd %%mm3, 76(%%edx)\n\t"
|
|
140 " movd %%mm7, 68(%%edx)\n\t"
|
|
141 " psrlq $32, %%mm3\n\t"
|
|
142 " psrlq $32, %%mm7\n\t"
|
|
143 " movd %%mm3, 72(%%edx)\n\t"
|
|
144 " movd %%mm7, 64(%%edx)\n\t"
|
|
145
|
|
146 /* Phase 2*/
|
|
147
|
|
148 " movq (%%edx), %%mm0\n\t"
|
|
149 " movq 8(%%edx), %%mm4\n\t"
|
|
150 " movq %%mm0, %%mm3\n\t"
|
|
151 " movq %%mm4, %%mm7\n\t"
|
|
152 " movq 56(%%edx), %%mm1\n\t"
|
|
153 " movq 48(%%edx), %%mm5\n\t"
|
|
154 /* n.b.: pswapd*/
|
|
155 " movq %%mm1, %%mm2\n\t"
|
|
156 " movq %%mm5, %%mm6\n\t"
|
|
157 " psrlq $32, %%mm1\n\t"
|
|
158 " psrlq $32, %%mm5\n\t"
|
|
159 " punpckldq %%mm2, %%mm1\n\t"
|
|
160 " punpckldq %%mm6, %%mm5\n\t"
|
|
161 /**/
|
|
162 " pfadd %%mm1, %%mm0\n\t"
|
|
163 " pfadd %%mm5, %%mm4\n\t"
|
|
164 " movq %%mm0, (%%ecx)\n\t"
|
|
165 " movq %%mm4, 8(%%ecx)\n\t"
|
|
166 " pfsub %%mm1, %%mm3\n\t"
|
|
167 " pfsub %%mm5, %%mm7\n\t"
|
|
168 " pfmul 64(%%ebx), %%mm3\n\t"
|
|
169 " pfmul 72(%%ebx), %%mm7\n\t"
|
|
170 " movd %%mm3, 60(%%ecx)\n\t"
|
|
171 " movd %%mm7, 52(%%ecx)\n\t"
|
|
172 " psrlq $32, %%mm3\n\t"
|
|
173 " psrlq $32, %%mm7\n\t"
|
|
174 " movd %%mm3, 56(%%ecx)\n\t"
|
|
175 " movd %%mm7, 48(%%ecx)\n\t"
|
|
176
|
|
177 " movq 16(%%edx), %%mm0\n\t"
|
|
178 " movq 24(%%edx), %%mm4\n\t"
|
|
179 " movq %%mm0, %%mm3\n\t"
|
|
180 " movq %%mm4, %%mm7\n\t"
|
|
181 " movq 40(%%edx), %%mm1\n\t"
|
|
182 " movq 32(%%edx), %%mm5\n\t"
|
|
183 /* n.b.: pswapd*/
|
|
184 " movq %%mm1, %%mm2\n\t"
|
|
185 " movq %%mm5, %%mm6\n\t"
|
|
186 " psrlq $32, %%mm1\n\t"
|
|
187 " psrlq $32, %%mm5\n\t"
|
|
188 " punpckldq %%mm2, %%mm1\n\t"
|
|
189 " punpckldq %%mm6, %%mm5\n\t"
|
|
190 /**/
|
|
191 " pfadd %%mm1, %%mm0\n\t"
|
|
192 " pfadd %%mm5, %%mm4\n\t"
|
|
193 " movq %%mm0, 16(%%ecx)\n\t"
|
|
194 " movq %%mm4, 24(%%ecx)\n\t"
|
|
195 " pfsub %%mm1, %%mm3\n\t"
|
|
196 " pfsub %%mm5, %%mm7\n\t"
|
|
197 " pfmul 80(%%ebx), %%mm3\n\t"
|
|
198 " pfmul 88(%%ebx), %%mm7\n\t"
|
|
199 " movd %%mm3, 44(%%ecx)\n\t"
|
|
200 " movd %%mm7, 36(%%ecx)\n\t"
|
|
201 " psrlq $32, %%mm3\n\t"
|
|
202 " psrlq $32, %%mm7\n\t"
|
|
203 " movd %%mm3, 40(%%ecx)\n\t"
|
|
204 " movd %%mm7, 32(%%ecx)\n\t"
|
|
205
|
|
206 /* Phase 3*/
|
|
207
|
|
208 " movq 64(%%edx), %%mm0\n\t"
|
|
209 " movq 72(%%edx), %%mm4\n\t"
|
|
210 " movq %%mm0, %%mm3\n\t"
|
|
211 " movq %%mm4, %%mm7\n\t"
|
|
212 " movq 120(%%edx), %%mm1\n\t"
|
|
213 " movq 112(%%edx), %%mm5\n\t"
|
|
214 /* n.b.: pswapd*/
|
|
215 " movq %%mm1, %%mm2\n\t"
|
|
216 " movq %%mm5, %%mm6\n\t"
|
|
217 " psrlq $32, %%mm1\n\t"
|
|
218 " psrlq $32, %%mm5\n\t"
|
|
219 " punpckldq %%mm2, %%mm1\n\t"
|
|
220 " punpckldq %%mm6, %%mm5\n\t"
|
|
221 /**/
|
|
222 " pfadd %%mm1, %%mm0\n\t"
|
|
223 " pfadd %%mm5, %%mm4\n\t"
|
|
224 " movq %%mm0, 64(%%ecx)\n\t"
|
|
225 " movq %%mm4, 72(%%ecx)\n\t"
|
|
226 " pfsubr %%mm1, %%mm3\n\t"
|
|
227 " pfsubr %%mm5, %%mm7\n\t"
|
|
228 " pfmul 64(%%ebx), %%mm3\n\t"
|
|
229 " pfmul 72(%%ebx), %%mm7\n\t"
|
|
230 " movd %%mm3, 124(%%ecx)\n\t"
|
|
231 " movd %%mm7, 116(%%ecx)\n\t"
|
|
232 " psrlq $32, %%mm3\n\t"
|
|
233 " psrlq $32, %%mm7\n\t"
|
|
234 " movd %%mm3, 120(%%ecx)\n\t"
|
|
235 " movd %%mm7, 112(%%ecx)\n\t"
|
|
236
|
|
237 " movq 80(%%edx), %%mm0\n\t"
|
|
238 " movq 88(%%edx), %%mm4\n\t"
|
|
239 " movq %%mm0, %%mm3\n\t"
|
|
240 " movq %%mm4, %%mm7\n\t"
|
|
241 " movq 104(%%edx), %%mm1\n\t"
|
|
242 " movq 96(%%edx), %%mm5\n\t"
|
|
243 /* n.b.: pswapd*/
|
|
244 " movq %%mm1, %%mm2\n\t"
|
|
245 " movq %%mm5, %%mm6\n\t"
|
|
246 " psrlq $32, %%mm1\n\t"
|
|
247 " psrlq $32, %%mm5\n\t"
|
|
248 " punpckldq %%mm2, %%mm1\n\t"
|
|
249 " punpckldq %%mm6, %%mm5\n\t"
|
|
250 /**/
|
|
251 " pfadd %%mm1, %%mm0\n\t"
|
|
252 " pfadd %%mm5, %%mm4\n\t"
|
|
253 " movq %%mm0, 80(%%ecx)\n\t"
|
|
254 " movq %%mm4, 88(%%ecx)\n\t"
|
|
255 " pfsubr %%mm1, %%mm3\n\t"
|
|
256 " pfsubr %%mm5, %%mm7\n\t"
|
|
257 " pfmul 80(%%ebx), %%mm3\n\t"
|
|
258 " pfmul 88(%%ebx), %%mm7\n\t"
|
|
259 " movd %%mm3, 108(%%ecx)\n\t"
|
|
260 " movd %%mm7, 100(%%ecx)\n\t"
|
|
261 " psrlq $32, %%mm3\n\t"
|
|
262 " psrlq $32, %%mm7\n\t"
|
|
263 " movd %%mm3, 104(%%ecx)\n\t"
|
|
264 " movd %%mm7, 96(%%ecx)\n\t"
|
|
265
|
|
266 /* Phase 4*/
|
|
267
|
|
268 " movq (%%ecx), %%mm0\n\t"
|
|
269 " movq 8(%%ecx), %%mm4\n\t"
|
|
270 " movq %%mm0, %%mm3\n\t"
|
|
271 " movq %%mm4, %%mm7\n\t"
|
|
272 " movq 24(%%ecx), %%mm1\n\t"
|
|
273 " movq 16(%%ecx), %%mm5\n\t"
|
|
274 /* n.b.: pswapd*/
|
|
275 " movq %%mm1, %%mm2\n\t"
|
|
276 " movq %%mm5, %%mm6\n\t"
|
|
277 " psrlq $32, %%mm1\n\t"
|
|
278 " psrlq $32, %%mm5\n\t"
|
|
279 " punpckldq %%mm2, %%mm1\n\t"
|
|
280 " punpckldq %%mm6, %%mm5\n\t"
|
|
281 /**/
|
|
282 " pfadd %%mm1, %%mm0\n\t"
|
|
283 " pfadd %%mm5, %%mm4\n\t"
|
|
284 " movq %%mm0, (%%edx)\n\t"
|
|
285 " movq %%mm4, 8(%%edx)\n\t"
|
|
286 " pfsub %%mm1, %%mm3\n\t"
|
|
287 " pfsub %%mm5, %%mm7\n\t"
|
|
288 " pfmul 96(%%ebx), %%mm3\n\t"
|
|
289 " pfmul 104(%%ebx), %%mm7\n\t"
|
|
290 " movd %%mm3, 28(%%edx)\n\t"
|
|
291 " movd %%mm7, 20(%%edx)\n\t"
|
|
292 " psrlq $32, %%mm3\n\t"
|
|
293 " psrlq $32, %%mm7\n\t"
|
|
294 " movd %%mm3, 24(%%edx)\n\t"
|
|
295 " movd %%mm7, 16(%%edx)\n\t"
|
|
296
|
|
297 " movq 32(%%ecx), %%mm0\n\t"
|
|
298 " movq 40(%%ecx), %%mm4\n\t"
|
|
299 " movq %%mm0, %%mm3\n\t"
|
|
300 " movq %%mm4, %%mm7\n\t"
|
|
301 " movq 56(%%ecx), %%mm1\n\t"
|
|
302 " movq 48(%%ecx), %%mm5\n\t"
|
|
303 /* n.b.: pswapd*/
|
|
304 " movq %%mm1, %%mm2\n\t"
|
|
305 " movq %%mm5, %%mm6\n\t"
|
|
306 " psrlq $32, %%mm1\n\t"
|
|
307 " psrlq $32, %%mm5\n\t"
|
|
308 " punpckldq %%mm2, %%mm1\n\t"
|
|
309 " punpckldq %%mm6, %%mm5\n\t"
|
|
310 /**/
|
|
311 " pfadd %%mm1, %%mm0\n\t"
|
|
312 " pfadd %%mm5, %%mm4\n\t"
|
|
313 " movq %%mm0, 32(%%edx)\n\t"
|
|
314 " movq %%mm4, 40(%%edx)\n\t"
|
|
315 " pfsubr %%mm1, %%mm3\n\t"
|
|
316 " pfsubr %%mm5, %%mm7\n\t"
|
|
317 " pfmul 96(%%ebx), %%mm3\n\t"
|
|
318 " pfmul 104(%%ebx), %%mm7\n\t"
|
|
319 " movd %%mm3, 60(%%edx)\n\t"
|
|
320 " movd %%mm7, 52(%%edx)\n\t"
|
|
321 " psrlq $32, %%mm3\n\t"
|
|
322 " psrlq $32, %%mm7\n\t"
|
|
323 " movd %%mm3, 56(%%edx)\n\t"
|
|
324 " movd %%mm7, 48(%%edx)\n\t"
|
|
325
|
|
326 " movq 64(%%ecx), %%mm0\n\t"
|
|
327 " movq 72(%%ecx), %%mm4\n\t"
|
|
328 " movq %%mm0, %%mm3\n\t"
|
|
329 " movq %%mm4, %%mm7\n\t"
|
|
330 " movq 88(%%ecx), %%mm1\n\t"
|
|
331 " movq 80(%%ecx), %%mm5\n\t"
|
|
332 /* n.b.: pswapd*/
|
|
333 " movq %%mm1, %%mm2\n\t"
|
|
334 " movq %%mm5, %%mm6\n\t"
|
|
335 " psrlq $32, %%mm1\n\t"
|
|
336 " psrlq $32, %%mm5\n\t"
|
|
337 " punpckldq %%mm2, %%mm1\n\t"
|
|
338 " punpckldq %%mm6, %%mm5\n\t"
|
|
339 /**/
|
|
340 " pfadd %%mm1, %%mm0\n\t"
|
|
341 " pfadd %%mm5, %%mm4\n\t"
|
|
342 " movq %%mm0, 64(%%edx)\n\t"
|
|
343 " movq %%mm4, 72(%%edx)\n\t"
|
|
344 " pfsub %%mm1, %%mm3\n\t"
|
|
345 " pfsub %%mm5, %%mm7\n\t"
|
|
346 " pfmul 96(%%ebx), %%mm3\n\t"
|
|
347 " pfmul 104(%%ebx), %%mm7\n\t"
|
|
348 " movd %%mm3, 92(%%edx)\n\t"
|
|
349 " movd %%mm7, 84(%%edx)\n\t"
|
|
350 " psrlq $32, %%mm3\n\t"
|
|
351 " psrlq $32, %%mm7\n\t"
|
|
352 " movd %%mm3, 88(%%edx)\n\t"
|
|
353 " movd %%mm7, 80(%%edx)\n\t"
|
|
354
|
|
355 " movq 96(%%ecx), %%mm0\n\t"
|
|
356 " movq 104(%%ecx), %%mm4\n\t"
|
|
357 " movq %%mm0, %%mm3\n\t"
|
|
358 " movq %%mm4, %%mm7\n\t"
|
|
359 " movq 120(%%ecx), %%mm1\n\t"
|
|
360 " movq 112(%%ecx), %%mm5\n\t"
|
|
361 /* n.b.: pswapd*/
|
|
362 " movq %%mm1, %%mm2\n\t"
|
|
363 " movq %%mm5, %%mm6\n\t"
|
|
364 " psrlq $32, %%mm1\n\t"
|
|
365 " psrlq $32, %%mm5\n\t"
|
|
366 " punpckldq %%mm2, %%mm1\n\t"
|
|
367 " punpckldq %%mm6, %%mm5\n\t"
|
|
368 /**/
|
|
369 " pfadd %%mm1, %%mm0\n\t"
|
|
370 " pfadd %%mm5, %%mm4\n\t"
|
|
371 " movq %%mm0, 96(%%edx)\n\t"
|
|
372 " movq %%mm4, 104(%%edx)\n\t"
|
|
373 " pfsubr %%mm1, %%mm3\n\t"
|
|
374 " pfsubr %%mm5, %%mm7\n\t"
|
|
375 " pfmul 96(%%ebx), %%mm3\n\t"
|
|
376 " pfmul 104(%%ebx), %%mm7\n\t"
|
|
377 " movd %%mm3, 124(%%edx)\n\t"
|
|
378 " movd %%mm7, 116(%%edx)\n\t"
|
|
379 " psrlq $32, %%mm3\n\t"
|
|
380 " psrlq $32, %%mm7\n\t"
|
|
381 " movd %%mm3, 120(%%edx)\n\t"
|
|
382 " movd %%mm7, 112(%%edx)\n\t"
|
|
383
|
|
384 /* Phase 5 */
|
|
385
|
|
386 " movq (%%edx), %%mm0\n\t"
|
|
387 " movq 16(%%edx), %%mm4\n\t"
|
|
388 " movq %%mm0, %%mm3\n\t"
|
|
389 " movq %%mm4, %%mm7\n\t"
|
|
390 " movq 8(%%edx), %%mm1\n\t"
|
|
391 " movq 24(%%edx), %%mm5\n\t"
|
|
392 /* n.b.: pswapd*/
|
|
393 " movq %%mm1, %%mm2\n\t"
|
|
394 " movq %%mm5, %%mm6\n\t"
|
|
395 " psrlq $32, %%mm1\n\t"
|
|
396 " psrlq $32, %%mm5\n\t"
|
|
397 " punpckldq %%mm2, %%mm1\n\t"
|
|
398 " punpckldq %%mm6, %%mm5\n\t"
|
|
399 /**/
|
|
400 " pfadd %%mm1, %%mm0\n\t"
|
|
401 " pfadd %%mm5, %%mm4\n\t"
|
|
402 " movq %%mm0, (%%ecx)\n\t"
|
|
403 " movq %%mm4, 16(%%ecx)\n\t"
|
|
404 " pfsub %%mm1, %%mm3\n\t"
|
|
405 " pfsubr %%mm5, %%mm7\n\t"
|
|
406 " pfmul 112(%%ebx), %%mm3\n\t"
|
|
407 " pfmul 112(%%ebx), %%mm7\n\t"
|
|
408 " movd %%mm3, 12(%%ecx)\n\t"
|
|
409 " movd %%mm7, 28(%%ecx)\n\t"
|
|
410 " psrlq $32, %%mm3\n\t"
|
|
411 " psrlq $32, %%mm7\n\t"
|
|
412 " movd %%mm3, 8(%%ecx)\n\t"
|
|
413 " movd %%mm7, 24(%%ecx)\n\t"
|
|
414
|
|
415 " movq 32(%%edx), %%mm0\n\t"
|
|
416 " movq 48(%%edx), %%mm4\n\t"
|
|
417 " movq %%mm0, %%mm3\n\t"
|
|
418 " movq %%mm4, %%mm7\n\t"
|
|
419 " movq 40(%%edx), %%mm1\n\t"
|
|
420 " movq 56(%%edx), %%mm5\n\t"
|
|
421 /* n.b.: pswapd*/
|
|
422 " movq %%mm1, %%mm2\n\t"
|
|
423 " movq %%mm5, %%mm6\n\t"
|
|
424 " psrlq $32, %%mm1\n\t"
|
|
425 " psrlq $32, %%mm5\n\t"
|
|
426 " punpckldq %%mm2, %%mm1\n\t"
|
|
427 " punpckldq %%mm6, %%mm5\n\t"
|
|
428 /**/
|
|
429 " pfadd %%mm1, %%mm0\n\t"
|
|
430 " pfadd %%mm5, %%mm4\n\t"
|
|
431 " movq %%mm0, 32(%%ecx)\n\t"
|
|
432 " movq %%mm4, 48(%%ecx)\n\t"
|
|
433 " pfsub %%mm1, %%mm3\n\t"
|
|
434 " pfsubr %%mm5, %%mm7\n\t"
|
|
435 " pfmul 112(%%ebx), %%mm3\n\t"
|
|
436 " pfmul 112(%%ebx), %%mm7\n\t"
|
|
437 " movd %%mm3, 44(%%ecx)\n\t"
|
|
438 " movd %%mm7, 60(%%ecx)\n\t"
|
|
439 " psrlq $32, %%mm3\n\t"
|
|
440 " psrlq $32, %%mm7\n\t"
|
|
441 " movd %%mm3, 40(%%ecx)\n\t"
|
|
442 " movd %%mm7, 56(%%ecx)\n\t"
|
|
443
|
|
444 " movq 64(%%edx), %%mm0\n\t"
|
|
445 " movq 80(%%edx), %%mm4\n\t"
|
|
446 " movq %%mm0, %%mm3\n\t"
|
|
447 " movq %%mm4, %%mm7\n\t"
|
|
448 " movq 72(%%edx), %%mm1\n\t"
|
|
449 " movq 88(%%edx), %%mm5\n\t"
|
|
450 /* n.b.: pswapd*/
|
|
451 " movq %%mm1, %%mm2\n\t"
|
|
452 " movq %%mm5, %%mm6\n\t"
|
|
453 " psrlq $32, %%mm1\n\t"
|
|
454 " psrlq $32, %%mm5\n\t"
|
|
455 " punpckldq %%mm2, %%mm1\n\t"
|
|
456 " punpckldq %%mm6, %%mm5\n\t"
|
|
457 /**/
|
|
458 " pfadd %%mm1, %%mm0\n\t"
|
|
459 " pfadd %%mm5, %%mm4\n\t"
|
|
460 " movq %%mm0, 64(%%ecx)\n\t"
|
|
461 " movq %%mm4, 80(%%ecx)\n\t"
|
|
462 " pfsub %%mm1, %%mm3\n\t"
|
|
463 " pfsubr %%mm5, %%mm7\n\t"
|
|
464 " pfmul 112(%%ebx), %%mm3\n\t"
|
|
465 " pfmul 112(%%ebx), %%mm7\n\t"
|
|
466 " movd %%mm3, 76(%%ecx)\n\t"
|
|
467 " movd %%mm7, 92(%%ecx)\n\t"
|
|
468 " psrlq $32, %%mm3\n\t"
|
|
469 " psrlq $32, %%mm7\n\t"
|
|
470 " movd %%mm3, 72(%%ecx)\n\t"
|
|
471 " movd %%mm7, 88(%%ecx)\n\t"
|
|
472
|
|
473 " movq 96(%%edx), %%mm0\n\t"
|
|
474 " movq 112(%%edx), %%mm4\n\t"
|
|
475 " movq %%mm0, %%mm3\n\t"
|
|
476 " movq %%mm4, %%mm7\n\t"
|
|
477 " movq 104(%%edx), %%mm1\n\t"
|
|
478 " movq 120(%%edx), %%mm5\n\t"
|
|
479 /* n.b.: pswapd*/
|
|
480 " movq %%mm1, %%mm2\n\t"
|
|
481 " movq %%mm5, %%mm6\n\t"
|
|
482 " psrlq $32, %%mm1\n\t"
|
|
483 " psrlq $32, %%mm5\n\t"
|
|
484 " punpckldq %%mm2, %%mm1\n\t"
|
|
485 " punpckldq %%mm6, %%mm5\n\t"
|
|
486 /**/
|
|
487 " pfadd %%mm1, %%mm0\n\t"
|
|
488 " pfadd %%mm5, %%mm4\n\t"
|
|
489 " movq %%mm0, 96(%%ecx)\n\t"
|
|
490 " movq %%mm4, 112(%%ecx)\n\t"
|
|
491 " pfsub %%mm1, %%mm3\n\t"
|
|
492 " pfsubr %%mm5, %%mm7\n\t"
|
|
493 " pfmul 112(%%ebx), %%mm3\n\t"
|
|
494 " pfmul 112(%%ebx), %%mm7\n\t"
|
|
495 " movd %%mm3, 108(%%ecx)\n\t"
|
|
496 " movd %%mm7, 124(%%ecx)\n\t"
|
|
497 " psrlq $32, %%mm3\n\t"
|
|
498 " psrlq $32, %%mm7\n\t"
|
|
499 " movd %%mm3, 104(%%ecx)\n\t"
|
|
500 " movd %%mm7, 120(%%ecx)\n\t"
|
|
501
|
|
502 /* Phase 6. This is the end of easy road. */
|
|
503 /* Code below is coded in scalar mode. Should be optimized */
|
|
504
|
5291
|
505 " movd "MANGLE(plus_1f)", %%mm6\n\t"
|
4148
|
506 " punpckldq 120(%%ebx), %%mm6\n\t" /* mm6 = 1.0 | 120(%%ebx)*/
|
|
507 " movq x_plus_minus_3dnow, %%mm7\n\t" /* mm7 = +1 | -1 */
|
|
508
|
|
509 " movq 32(%%ecx), %%mm0\n\t"
|
|
510 " movq 64(%%ecx), %%mm2\n\t"
|
|
511 " movq %%mm0, %%mm1\n\t"
|
|
512 " movq %%mm2, %%mm3\n\t"
|
|
513 " pxor %%mm7, %%mm1\n\t"
|
|
514 " pxor %%mm7, %%mm3\n\t"
|
|
515 " pfacc %%mm1, %%mm0\n\t"
|
|
516 " pfacc %%mm3, %%mm2\n\t"
|
|
517 " pfmul %%mm6, %%mm0\n\t"
|
|
518 " pfmul %%mm6, %%mm2\n\t"
|
|
519 " movq %%mm0, 32(%%edx)\n\t"
|
|
520 " movq %%mm2, 64(%%edx)\n\t"
|
|
521
|
|
522 " movd 44(%%ecx), %%mm0\n\t"
|
|
523 " movd 40(%%ecx), %%mm2\n\t"
|
|
524 " movd 120(%%ebx), %%mm3\n\t"
|
|
525 " punpckldq 76(%%ecx), %%mm0\n\t"
|
|
526 " punpckldq 72(%%ecx), %%mm2\n\t"
|
|
527 " punpckldq %%mm3, %%mm3\n\t"
|
|
528 " movq %%mm0, %%mm4\n\t"
|
|
529 " movq %%mm2, %%mm5\n\t"
|
|
530 " pfsub %%mm2, %%mm0\n\t"
|
|
531 " pfmul %%mm3, %%mm0\n\t"
|
|
532 " movq %%mm0, %%mm1\n\t"
|
|
533 " pfadd %%mm5, %%mm0\n\t"
|
|
534 " pfadd %%mm4, %%mm0\n\t"
|
|
535 " movq %%mm0, %%mm2\n\t"
|
|
536 " punpckldq %%mm1, %%mm0\n\t"
|
|
537 " punpckhdq %%mm1, %%mm2\n\t"
|
|
538 " movq %%mm0, 40(%%edx)\n\t"
|
|
539 " movq %%mm2, 72(%%edx)\n\t"
|
|
540
|
|
541 " movd 48(%%ecx), %%mm3\n\t"
|
|
542 " movd 60(%%ecx), %%mm2\n\t"
|
|
543 " pfsub 52(%%ecx), %%mm3\n\t"
|
|
544 " pfsub 56(%%ecx), %%mm2\n\t"
|
|
545 " pfmul 120(%%ebx), %%mm3\n\t"
|
|
546 " pfmul 120(%%ebx), %%mm2\n\t"
|
|
547 " movq %%mm2, %%mm1\n\t"
|
|
548
|
|
549 " pfadd 56(%%ecx), %%mm1\n\t"
|
|
550 " pfadd 60(%%ecx), %%mm1\n\t"
|
|
551 " movq %%mm1, %%mm0\n\t"
|
|
552
|
|
553 " pfadd 48(%%ecx), %%mm0\n\t"
|
|
554 " pfadd 52(%%ecx), %%mm0\n\t"
|
|
555 " pfadd %%mm3, %%mm1\n\t"
|
|
556 " punpckldq %%mm2, %%mm1\n\t"
|
|
557 " pfadd %%mm3, %%mm2\n\t"
|
|
558 " punpckldq %%mm2, %%mm0\n\t"
|
|
559 " movq %%mm1, 56(%%edx)\n\t"
|
|
560 " movq %%mm0, 48(%%edx)\n\t"
|
|
561
|
|
562 /*---*/
|
|
563
|
|
564 " movd 92(%%ecx), %%mm1\n\t"
|
|
565 " pfsub 88(%%ecx), %%mm1\n\t"
|
|
566 " pfmul 120(%%ebx), %%mm1\n\t"
|
|
567 " movd %%mm1, 92(%%edx)\n\t"
|
|
568 " pfadd 92(%%ecx), %%mm1\n\t"
|
|
569 " pfadd 88(%%ecx), %%mm1\n\t"
|
|
570 " movq %%mm1, %%mm0\n\t"
|
|
571
|
|
572 " pfadd 80(%%ecx), %%mm0\n\t"
|
|
573 " pfadd 84(%%ecx), %%mm0\n\t"
|
|
574 " movd %%mm0, 80(%%edx)\n\t"
|
|
575
|
|
576 " movd 80(%%ecx), %%mm0\n\t"
|
|
577 " pfsub 84(%%ecx), %%mm0\n\t"
|
|
578 " pfmul 120(%%ebx), %%mm0\n\t"
|
|
579 " pfadd %%mm0, %%mm1\n\t"
|
|
580 " pfadd 92(%%edx), %%mm0\n\t"
|
|
581 " punpckldq %%mm1, %%mm0\n\t"
|
|
582 " movq %%mm0, 84(%%edx)\n\t"
|
|
583
|
|
584 " movq 96(%%ecx), %%mm0\n\t"
|
|
585 " movq %%mm0, %%mm1\n\t"
|
|
586 " pxor %%mm7, %%mm1\n\t"
|
|
587 " pfacc %%mm1, %%mm0\n\t"
|
|
588 " pfmul %%mm6, %%mm0\n\t"
|
|
589 " movq %%mm0, 96(%%edx)\n\t"
|
|
590
|
|
591 " movd 108(%%ecx), %%mm0\n\t"
|
|
592 " pfsub 104(%%ecx), %%mm0\n\t"
|
|
593 " pfmul 120(%%ebx), %%mm0\n\t"
|
|
594 " movd %%mm0, 108(%%edx)\n\t"
|
|
595 " pfadd 104(%%ecx), %%mm0\n\t"
|
|
596 " pfadd 108(%%ecx), %%mm0\n\t"
|
|
597 " movd %%mm0, 104(%%edx)\n\t"
|
|
598
|
|
599 " movd 124(%%ecx), %%mm1\n\t"
|
|
600 " pfsub 120(%%ecx), %%mm1\n\t"
|
|
601 " pfmul 120(%%ebx), %%mm1\n\t"
|
|
602 " movd %%mm1, 124(%%edx)\n\t"
|
|
603 " pfadd 120(%%ecx), %%mm1\n\t"
|
|
604 " pfadd 124(%%ecx), %%mm1\n\t"
|
|
605 " movq %%mm1, %%mm0\n\t"
|
|
606
|
|
607 " pfadd 112(%%ecx), %%mm0\n\t"
|
|
608 " pfadd 116(%%ecx), %%mm0\n\t"
|
|
609 " movd %%mm0, 112(%%edx)\n\t"
|
|
610
|
|
611 " movd 112(%%ecx), %%mm0\n\t"
|
|
612 " pfsub 116(%%ecx), %%mm0\n\t"
|
|
613 " pfmul 120(%%ebx), %%mm0\n\t"
|
|
614 " pfadd %%mm0,%%mm1\n\t"
|
|
615 " pfadd 124(%%edx), %%mm0\n\t"
|
|
616 " punpckldq %%mm1, %%mm0\n\t"
|
|
617 " movq %%mm0, 116(%%edx)\n\t"
|
|
618
|
|
619 " jnz .L01\n\t"
|
|
620
|
|
621 /* Phase 7*/
|
|
622 /* Code below is coded in scalar mode. Should be optimized */
|
|
623
|
|
624 " movd (%%ecx), %%mm0\n\t"
|
|
625 " pfadd 4(%%ecx), %%mm0\n\t"
|
|
626 " movd %%mm0, 1024(%%esi)\n\t"
|
|
627
|
|
628 " movd (%%ecx), %%mm0\n\t"
|
|
629 " pfsub 4(%%ecx), %%mm0\n\t"
|
|
630 " pfmul 120(%%ebx), %%mm0\n\t"
|
|
631 " movd %%mm0, (%%esi)\n\t"
|
|
632 " movd %%mm0, (%%edi)\n\t"
|
|
633
|
|
634 " movd 12(%%ecx), %%mm0\n\t"
|
|
635 " pfsub 8(%%ecx), %%mm0\n\t"
|
|
636 " pfmul 120(%%ebx), %%mm0\n\t"
|
|
637 " movd %%mm0, 512(%%edi)\n\t"
|
|
638 " pfadd 12(%%ecx), %%mm0\n\t"
|
|
639 " pfadd 8(%%ecx), %%mm0\n\t"
|
|
640 " movd %%mm0, 512(%%esi)\n\t"
|
|
641
|
|
642 " movd 16(%%ecx), %%mm0\n\t"
|
|
643 " pfsub 20(%%ecx), %%mm0\n\t"
|
|
644 " pfmul 120(%%ebx), %%mm0\n\t"
|
|
645 " movq %%mm0, %%mm3\n\t"
|
|
646
|
|
647 " movd 28(%%ecx), %%mm0\n\t"
|
|
648 " pfsub 24(%%ecx), %%mm0\n\t"
|
|
649 " pfmul 120(%%ebx), %%mm0\n\t"
|
|
650 " movd %%mm0, 768(%%edi)\n\t"
|
|
651 " movq %%mm0, %%mm2\n\t"
|
|
652
|
|
653 " pfadd 24(%%ecx), %%mm0\n\t"
|
|
654 " pfadd 28(%%ecx), %%mm0\n\t"
|
|
655 " movq %%mm0, %%mm1\n\t"
|
|
656
|
|
657 " pfadd 16(%%ecx), %%mm0\n\t"
|
|
658 " pfadd 20(%%ecx), %%mm0\n\t"
|
|
659 " movd %%mm0, 768(%%esi)\n\t"
|
|
660 " pfadd %%mm3, %%mm1\n\t"
|
|
661 " movd %%mm1, 256(%%esi)\n\t"
|
|
662 " pfadd %%mm3, %%mm2\n\t"
|
|
663 " movd %%mm2, 256(%%edi)\n\t"
|
|
664
|
|
665 /* Phase 8*/
|
|
666
|
|
667 " movq 32(%%edx), %%mm0\n\t"
|
|
668 " movq 48(%%edx), %%mm1\n\t"
|
|
669 " pfadd 48(%%edx), %%mm0\n\t"
|
|
670 " pfadd 40(%%edx), %%mm1\n\t"
|
|
671 " movd %%mm0, 896(%%esi)\n\t"
|
|
672 " movd %%mm1, 640(%%esi)\n\t"
|
|
673 " psrlq $32, %%mm0\n\t"
|
|
674 " psrlq $32, %%mm1\n\t"
|
|
675 " movd %%mm0, 128(%%edi)\n\t"
|
|
676 " movd %%mm1, 384(%%edi)\n\t"
|
|
677
|
|
678 " movd 40(%%edx), %%mm0\n\t"
|
|
679 " pfadd 56(%%edx), %%mm0\n\t"
|
|
680 " movd %%mm0, 384(%%esi)\n\t"
|
|
681
|
|
682 " movd 56(%%edx), %%mm0\n\t"
|
|
683 " pfadd 36(%%edx), %%mm0\n\t"
|
|
684 " movd %%mm0, 128(%%esi)\n\t"
|
|
685
|
|
686 " movd 60(%%edx), %%mm0\n\t"
|
|
687 " movd %%mm0, 896(%%edi)\n\t"
|
|
688 " pfadd 44(%%edx), %%mm0\n\t"
|
|
689 " movd %%mm0, 640(%%edi)\n\t"
|
|
690
|
|
691 " movq 96(%%edx), %%mm0\n\t"
|
|
692 " movq 112(%%edx), %%mm2\n\t"
|
|
693 " movq 104(%%edx), %%mm4\n\t"
|
|
694 " pfadd 112(%%edx), %%mm0\n\t"
|
|
695 " pfadd 104(%%edx), %%mm2\n\t"
|
|
696 " pfadd 120(%%edx), %%mm4\n\t"
|
|
697 " movq %%mm0, %%mm1\n\t"
|
|
698 " movq %%mm2, %%mm3\n\t"
|
|
699 " movq %%mm4, %%mm5\n\t"
|
|
700 " pfadd 64(%%edx), %%mm0\n\t"
|
|
701 " pfadd 80(%%edx), %%mm2\n\t"
|
|
702 " pfadd 72(%%edx), %%mm4\n\t"
|
|
703 " movd %%mm0, 960(%%esi)\n\t"
|
|
704 " movd %%mm2, 704(%%esi)\n\t"
|
|
705 " movd %%mm4, 448(%%esi)\n\t"
|
|
706 " psrlq $32, %%mm0\n\t"
|
|
707 " psrlq $32, %%mm2\n\t"
|
|
708 " psrlq $32, %%mm4\n\t"
|
|
709 " movd %%mm0, 64(%%edi)\n\t"
|
|
710 " movd %%mm2, 320(%%edi)\n\t"
|
|
711 " movd %%mm4, 576(%%edi)\n\t"
|
|
712 " pfadd 80(%%edx), %%mm1\n\t"
|
|
713 " pfadd 72(%%edx), %%mm3\n\t"
|
|
714 " pfadd 88(%%edx), %%mm5\n\t"
|
|
715 " movd %%mm1, 832(%%esi)\n\t"
|
|
716 " movd %%mm3, 576(%%esi)\n\t"
|
|
717 " movd %%mm5, 320(%%esi)\n\t"
|
|
718 " psrlq $32, %%mm1\n\t"
|
|
719 " psrlq $32, %%mm3\n\t"
|
|
720 " psrlq $32, %%mm5\n\t"
|
|
721 " movd %%mm1, 192(%%edi)\n\t"
|
|
722 " movd %%mm3, 448(%%edi)\n\t"
|
|
723 " movd %%mm5, 704(%%edi)\n\t"
|
|
724
|
|
725 " movd 120(%%edx), %%mm0\n\t"
|
|
726 " pfadd 100(%%edx), %%mm0\n\t"
|
|
727 " movq %%mm0, %%mm1\n\t"
|
|
728 " pfadd 88(%%edx), %%mm0\n\t"
|
|
729 " movd %%mm0, 192(%%esi)\n\t"
|
|
730 " pfadd 68(%%edx), %%mm1\n\t"
|
|
731 " movd %%mm1, 64(%%esi)\n\t"
|
|
732
|
|
733 " movd 124(%%edx), %%mm0\n\t"
|
|
734 " movd %%mm0, 960(%%edi)\n\t"
|
|
735 " pfadd 92(%%edx), %%mm0\n\t"
|
|
736 " movd %%mm0, 832(%%edi)\n\t"
|
|
737
|
|
738 " jmp .L_bye\n\t"
|
|
739 ".L01:\n\t"
|
|
740 /* Phase 9*/
|
|
741
|
|
742 " movq (%%ecx), %%mm0\n\t"
|
|
743 " movq %%mm0, %%mm1\n\t"
|
|
744 " pxor %%mm7, %%mm1\n\t"
|
|
745 " pfacc %%mm1, %%mm0\n\t"
|
|
746 " pfmul %%mm6, %%mm0\n\t"
|
|
747 " pf2id %%mm0, %%mm0\n\t"
|
|
748 " movd %%mm0, %%eax\n\t"
|
|
749 " movw %%ax, 512(%%esi)\n\t"
|
|
750 " psrlq $32, %%mm0\n\t"
|
|
751 " movd %%mm0, %%eax\n\t"
|
|
752 " movw %%ax, (%%esi)\n\t"
|
|
753
|
|
754 " movd 12(%%ecx), %%mm0\n\t"
|
|
755 " pfsub 8(%%ecx), %%mm0\n\t"
|
|
756 " pfmul 120(%%ebx), %%mm0\n\t"
|
|
757 " pf2id %%mm0, %%mm7\n\t"
|
|
758 " movd %%mm7, %%eax\n\t"
|
|
759 " movw %%ax, 256(%%edi)\n\t"
|
|
760 " pfadd 12(%%ecx), %%mm0\n\t"
|
|
761 " pfadd 8(%%ecx), %%mm0\n\t"
|
|
762 " pf2id %%mm0, %%mm0\n\t"
|
|
763 " movd %%mm0, %%eax\n\t"
|
|
764 " movw %%ax, 256(%%esi)\n\t"
|
|
765
|
|
766 " movd 16(%%ecx), %%mm3\n\t"
|
|
767 " pfsub 20(%%ecx), %%mm3\n\t"
|
|
768 " pfmul 120(%%ebx), %%mm3\n\t"
|
|
769 " movq %%mm3, %%mm2\n\t"
|
|
770
|
|
771 " movd 28(%%ecx), %%mm2\n\t"
|
|
772 " pfsub 24(%%ecx), %%mm2\n\t"
|
|
773 " pfmul 120(%%ebx), %%mm2\n\t"
|
|
774 " movq %%mm2, %%mm1\n\t"
|
|
775
|
|
776 " pf2id %%mm2, %%mm7\n\t"
|
|
777 " movd %%mm7, %%eax\n\t"
|
|
778 " movw %%ax, 384(%%edi)\n\t"
|
|
779
|
|
780 " pfadd 24(%%ecx), %%mm1\n\t"
|
|
781 " pfadd 28(%%ecx), %%mm1\n\t"
|
|
782 " movq %%mm1, %%mm0\n\t"
|
|
783
|
|
784 " pfadd 16(%%ecx), %%mm0\n\t"
|
|
785 " pfadd 20(%%ecx), %%mm0\n\t"
|
|
786 " pf2id %%mm0, %%mm0\n\t"
|
|
787 " movd %%mm0, %%eax\n\t"
|
|
788 " movw %%ax, 384(%%esi)\n\t"
|
|
789 " pfadd %%mm3, %%mm1\n\t"
|
|
790 " pf2id %%mm1, %%mm1\n\t"
|
|
791 " movd %%mm1, %%eax\n\t"
|
|
792 " movw %%ax, 128(%%esi)\n\t"
|
|
793 " pfadd %%mm3, %%mm2\n\t"
|
|
794 " pf2id %%mm2, %%mm2\n\t"
|
|
795 " movd %%mm2, %%eax\n\t"
|
|
796 " movw %%ax, 128(%%edi)\n\t"
|
|
797
|
|
798 /* Phase 10*/
|
|
799
|
|
800 " movq 32(%%edx), %%mm0\n\t"
|
|
801 " movq 48(%%edx), %%mm1\n\t"
|
|
802 " pfadd 48(%%edx), %%mm0\n\t"
|
|
803 " pfadd 40(%%edx), %%mm1\n\t"
|
|
804 " pf2id %%mm0, %%mm0\n\t"
|
|
805 " pf2id %%mm1, %%mm1\n\t"
|
|
806 " movd %%mm0, %%eax\n\t"
|
|
807 " movd %%mm1, %%ecx\n\t"
|
|
808 " movw %%ax, 448(%%esi)\n\t"
|
|
809 " movw %%cx, 320(%%esi)\n\t"
|
|
810 " psrlq $32, %%mm0\n\t"
|
|
811 " psrlq $32, %%mm1\n\t"
|
|
812 " movd %%mm0, %%eax\n\t"
|
|
813 " movd %%mm1, %%ecx\n\t"
|
|
814 " movw %%ax, 64(%%edi)\n\t"
|
|
815 " movw %%cx, 192(%%edi)\n\t"
|
|
816
|
|
817 " movd 40(%%edx), %%mm3\n\t"
|
|
818 " movd 56(%%edx), %%mm4\n\t"
|
|
819 " movd 60(%%edx), %%mm0\n\t"
|
|
820 " movd 44(%%edx), %%mm2\n\t"
|
|
821 " movd 120(%%edx), %%mm5\n\t"
|
|
822 " punpckldq %%mm4, %%mm3\n\t"
|
|
823 " punpckldq 124(%%edx), %%mm0\n\t"
|
|
824 " pfadd 100(%%edx), %%mm5\n\t"
|
|
825 " punpckldq 36(%%edx), %%mm4\n\t"
|
|
826 " punpckldq 92(%%edx), %%mm2\n\t"
|
|
827 " movq %%mm5, %%mm6\n\t"
|
|
828 " pfadd %%mm4, %%mm3\n\t"
|
|
829 " pf2id %%mm0, %%mm1\n\t"
|
|
830 " pf2id %%mm3, %%mm3\n\t"
|
|
831 " pfadd 88(%%edx), %%mm5\n\t"
|
|
832 " movd %%mm1, %%eax\n\t"
|
|
833 " movd %%mm3, %%ecx\n\t"
|
|
834 " movw %%ax, 448(%%edi)\n\t"
|
|
835 " movw %%cx, 192(%%esi)\n\t"
|
|
836 " pf2id %%mm5, %%mm5\n\t"
|
|
837 " psrlq $32, %%mm1\n\t"
|
|
838 " psrlq $32, %%mm3\n\t"
|
|
839 " movd %%mm5, %%ebx\n\t"
|
|
840 " movd %%mm1, %%eax\n\t"
|
|
841 " movd %%mm3, %%ecx\n\t"
|
|
842 " movw %%bx, 96(%%esi)\n\t"
|
|
843 " movw %%ax, 480(%%edi)\n\t"
|
|
844 " movw %%cx, 64(%%esi)\n\t"
|
|
845 " pfadd %%mm2, %%mm0\n\t"
|
|
846 " pf2id %%mm0, %%mm0\n\t"
|
|
847 " movd %%mm0, %%eax\n\t"
|
|
848 " pfadd 68(%%edx), %%mm6\n\t"
|
|
849 " movw %%ax, 320(%%edi)\n\t"
|
|
850 " psrlq $32, %%mm0\n\t"
|
|
851 " pf2id %%mm6, %%mm6\n\t"
|
|
852 " movd %%mm0, %%eax\n\t"
|
|
853 " movd %%mm6, %%ebx\n\t"
|
|
854 " movw %%ax, 416(%%edi)\n\t"
|
|
855 " movw %%bx, 32(%%esi)\n\t"
|
|
856
|
|
857 " movq 96(%%edx), %%mm0\n\t"
|
|
858 " movq 112(%%edx), %%mm2\n\t"
|
|
859 " movq 104(%%edx), %%mm4\n\t"
|
|
860 " pfadd %%mm2, %%mm0\n\t"
|
|
861 " pfadd %%mm4, %%mm2\n\t"
|
|
862 " pfadd 120(%%edx), %%mm4\n\t"
|
|
863 " movq %%mm0, %%mm1\n\t"
|
|
864 " movq %%mm2, %%mm3\n\t"
|
|
865 " movq %%mm4, %%mm5\n\t"
|
|
866 " pfadd 64(%%edx), %%mm0\n\t"
|
|
867 " pfadd 80(%%edx), %%mm2\n\t"
|
|
868 " pfadd 72(%%edx), %%mm4\n\t"
|
|
869 " pf2id %%mm0, %%mm0\n\t"
|
|
870 " pf2id %%mm2, %%mm2\n\t"
|
|
871 " pf2id %%mm4, %%mm4\n\t"
|
|
872 " movd %%mm0, %%eax\n\t"
|
|
873 " movd %%mm2, %%ecx\n\t"
|
|
874 " movd %%mm4, %%ebx\n\t"
|
|
875 " movw %%ax, 480(%%esi)\n\t"
|
|
876 " movw %%cx, 352(%%esi)\n\t"
|
|
877 " movw %%bx, 224(%%esi)\n\t"
|
|
878 " psrlq $32, %%mm0\n\t"
|
|
879 " psrlq $32, %%mm2\n\t"
|
|
880 " psrlq $32, %%mm4\n\t"
|
|
881 " movd %%mm0, %%eax\n\t"
|
|
882 " movd %%mm2, %%ecx\n\t"
|
|
883 " movd %%mm4, %%ebx\n\t"
|
|
884 " movw %%ax, 32(%%edi)\n\t"
|
|
885 " movw %%cx, 160(%%edi)\n\t"
|
|
886 " movw %%bx, 288(%%edi)\n\t"
|
|
887 " pfadd 80(%%edx), %%mm1\n\t"
|
|
888 " pfadd 72(%%edx), %%mm3\n\t"
|
|
889 " pfadd 88(%%edx), %%mm5\n\t"
|
|
890 " pf2id %%mm1, %%mm1\n\t"
|
|
891 " pf2id %%mm3, %%mm3\n\t"
|
|
892 " pf2id %%mm5, %%mm5\n\t"
|
|
893 " movd %%mm1, %%eax\n\t"
|
|
894 " movd %%mm3, %%ecx\n\t"
|
|
895 " movd %%mm5, %%ebx\n\t"
|
|
896 " movw %%ax, 416(%%esi)\n\t"
|
|
897 " movw %%cx, 288(%%esi)\n\t"
|
|
898 " movw %%bx, 160(%%esi)\n\t"
|
|
899 " psrlq $32, %%mm1\n\t"
|
|
900 " psrlq $32, %%mm3\n\t"
|
|
901 " psrlq $32, %%mm5\n\t"
|
|
902 " movd %%mm1, %%eax\n\t"
|
|
903 " movd %%mm3, %%ecx\n\t"
|
|
904 " movd %%mm5, %%ebx\n\t"
|
|
905 " movw %%ax, 96(%%edi)\n\t"
|
|
906 " movw %%cx, 224(%%edi)\n\t"
|
|
907 " movw %%bx, 352(%%edi)\n\t"
|
|
908
|
|
909 " movsw\n\t"
|
|
910
|
|
911 ".L_bye:\n\t"
|
|
912 " femms\n\t"
|
|
913 :
|
|
914 :"m"(a),"m"(b),"m"(c),"m"(tmp[0])
|
|
915 :"memory","%ebx","%esi","%edi");
|
|
916 } |