comparison mp3lib/dct64_3dnow.c @ 4148:3b29772a4fb2

S->C
author nick
date Mon, 14 Jan 2002 10:34:38 +0000
parents
children 421969d55d5f
comparison
equal deleted inserted replaced
4147:4bbdda22003d 4148:3b29772a4fb2
1 /*
2 * This code was taken from http://www.mpg123.org
3 * See ChangeLog of mpg123-0.59s-pre.1 for detail
4 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
5 * Partial 3dnow! optimization by Nick Kurshev
6 *
7 * TODO: optimize scalar 3dnow! code
8 * Warning: Phases 7 & 8 are not tested
9 */
#define real float /* ugly - but only way */

/*
 * Constants used by the inline assembly in dct64_MMX_3dnow().  They are
 * referenced there by symbol name inside the asm text, which the compiler
 * cannot see; the `used` attribute keeps an optimizing build from discarding
 * them as unreferenced statics (which would surface as a link error).
 */

/* Only bit 63 is set: XOR-ing a packed pair of floats with this value flips
 * the sign of the upper element and leaves the lower one untouched. */
static unsigned long long int __attribute__((used, aligned(8))) x_plus_minus_3dnow = 0x8000000000000000ULL;
/* The float constant 1.0, loaded with movd into the low half of an MMX register. */
static float __attribute__((used)) plus_1f = 1.0;
14
15 void __attribute__ (( __stdcall__ )) dct64_MMX_3dnow(real *a,real *b,real *c)
16 {
17 char tmp[256];
18 __asm __volatile(
19 " movl %2,%%eax\n\t"
20
21 " leal 128+%3,%%edx\n\t"
22 " movl %0,%%esi\n\t"
23 " movl %1,%%edi\n\t"
24 " movl $costab_mmx,%%ebx\n\t"
25 " leal %3,%%ecx\n\t"
26
27 /* Phase 1*/
28 " movq (%%eax), %%mm0\n\t"
29 " movq 8(%%eax), %%mm4\n\t"
30 " movq %%mm0, %%mm3\n\t"
31 " movq %%mm4, %%mm7\n\t"
32 " movq 120(%%eax), %%mm1\n\t"
33 " movq 112(%%eax), %%mm5\n\t"
34 /* n.b.: pswapd*/
35 " movq %%mm1, %%mm2\n\t"
36 " movq %%mm5, %%mm6\n\t"
37 " psrlq $32, %%mm1\n\t"
38 " psrlq $32, %%mm5\n\t"
39 " punpckldq %%mm2, %%mm1\n\t"
40 " punpckldq %%mm6, %%mm5\n\t"
41 /**/
42 " pfadd %%mm1, %%mm0\n\t"
43 " pfadd %%mm5, %%mm4\n\t"
44 " movq %%mm0, (%%edx)\n\t"
45 " movq %%mm4, 8(%%edx)\n\t"
46 " pfsub %%mm1, %%mm3\n\t"
47 " pfsub %%mm5, %%mm7\n\t"
48 " pfmul (%%ebx), %%mm3\n\t"
49 " pfmul 8(%%ebx), %%mm7\n\t"
50 " movd %%mm3, 124(%%edx)\n\t"
51 " movd %%mm7, 116(%%edx)\n\t"
52 " psrlq $32, %%mm3\n\t"
53 " psrlq $32, %%mm7\n\t"
54 " movd %%mm3, 120(%%edx)\n\t"
55 " movd %%mm7, 112(%%edx)\n\t"
56
57 " movq 16(%%eax), %%mm0\n\t"
58 " movq 24(%%eax), %%mm4\n\t"
59 " movq %%mm0, %%mm3\n\t"
60 " movq %%mm4, %%mm7\n\t"
61 " movq 104(%%eax), %%mm1\n\t"
62 " movq 96(%%eax), %%mm5\n\t"
63 /* n.b.: pswapd*/
64 " movq %%mm1, %%mm2\n\t"
65 " movq %%mm5, %%mm6\n\t"
66 " psrlq $32, %%mm1\n\t"
67 " psrlq $32, %%mm5\n\t"
68 " punpckldq %%mm2, %%mm1\n\t"
69 " punpckldq %%mm6, %%mm5\n\t"
70 /**/
71 " pfadd %%mm1, %%mm0\n\t"
72 " pfadd %%mm5, %%mm4\n\t"
73 " movq %%mm0, 16(%%edx)\n\t"
74 " movq %%mm4, 24(%%edx)\n\t"
75 " pfsub %%mm1, %%mm3\n\t"
76 " pfsub %%mm5, %%mm7\n\t"
77 " pfmul 16(%%ebx), %%mm3\n\t"
78 " pfmul 24(%%ebx), %%mm7\n\t"
79 " movd %%mm3, 108(%%edx)\n\t"
80 " movd %%mm7, 100(%%edx)\n\t"
81 " psrlq $32, %%mm3\n\t"
82 " psrlq $32, %%mm7\n\t"
83 " movd %%mm3, 104(%%edx)\n\t"
84 " movd %%mm7, 96(%%edx)\n\t"
85
86 " movq 32(%%eax), %%mm0\n\t"
87 " movq 40(%%eax), %%mm4\n\t"
88 " movq %%mm0, %%mm3\n\t"
89 " movq %%mm4, %%mm7\n\t"
90 " movq 88(%%eax), %%mm1\n\t"
91 " movq 80(%%eax), %%mm5\n\t"
92 /* n.b.: pswapd*/
93 " movq %%mm1, %%mm2\n\t"
94 " movq %%mm5, %%mm6\n\t"
95 " psrlq $32, %%mm1\n\t"
96 " psrlq $32, %%mm5\n\t"
97 " punpckldq %%mm2, %%mm1\n\t"
98 " punpckldq %%mm6, %%mm5\n\t"
99 /**/
100 " pfadd %%mm1, %%mm0\n\t"
101 " pfadd %%mm5, %%mm4\n\t"
102 " movq %%mm0, 32(%%edx)\n\t"
103 " movq %%mm4, 40(%%edx)\n\t"
104 " pfsub %%mm1, %%mm3\n\t"
105 " pfsub %%mm5, %%mm7\n\t"
106 " pfmul 32(%%ebx), %%mm3\n\t"
107 " pfmul 40(%%ebx), %%mm7\n\t"
108 " movd %%mm3, 92(%%edx)\n\t"
109 " movd %%mm7, 84(%%edx)\n\t"
110 " psrlq $32, %%mm3\n\t"
111 " psrlq $32, %%mm7\n\t"
112 " movd %%mm3, 88(%%edx)\n\t"
113 " movd %%mm7, 80(%%edx)\n\t"
114
115 " movq 48(%%eax), %%mm0\n\t"
116 " movq 56(%%eax), %%mm4\n\t"
117 " movq %%mm0, %%mm3\n\t"
118 " movq %%mm4, %%mm7\n\t"
119 " movq 72(%%eax), %%mm1\n\t"
120 " movq 64(%%eax), %%mm5\n\t"
121 /* n.b.: pswapd*/
122 " movq %%mm1, %%mm2\n\t"
123 " movq %%mm5, %%mm6\n\t"
124 " psrlq $32, %%mm1\n\t"
125 " psrlq $32, %%mm5\n\t"
126 " punpckldq %%mm2, %%mm1\n\t"
127 " punpckldq %%mm6, %%mm5\n\t"
128 /**/
129 " pfadd %%mm1, %%mm0\n\t"
130 " pfadd %%mm5, %%mm4\n\t"
131 " movq %%mm0, 48(%%edx)\n\t"
132 " movq %%mm4, 56(%%edx)\n\t"
133 " pfsub %%mm1, %%mm3\n\t"
134 " pfsub %%mm5, %%mm7\n\t"
135 " pfmul 48(%%ebx), %%mm3\n\t"
136 " pfmul 56(%%ebx), %%mm7\n\t"
137 " movd %%mm3, 76(%%edx)\n\t"
138 " movd %%mm7, 68(%%edx)\n\t"
139 " psrlq $32, %%mm3\n\t"
140 " psrlq $32, %%mm7\n\t"
141 " movd %%mm3, 72(%%edx)\n\t"
142 " movd %%mm7, 64(%%edx)\n\t"
143
144 /* Phase 2*/
145
146 " movq (%%edx), %%mm0\n\t"
147 " movq 8(%%edx), %%mm4\n\t"
148 " movq %%mm0, %%mm3\n\t"
149 " movq %%mm4, %%mm7\n\t"
150 " movq 56(%%edx), %%mm1\n\t"
151 " movq 48(%%edx), %%mm5\n\t"
152 /* n.b.: pswapd*/
153 " movq %%mm1, %%mm2\n\t"
154 " movq %%mm5, %%mm6\n\t"
155 " psrlq $32, %%mm1\n\t"
156 " psrlq $32, %%mm5\n\t"
157 " punpckldq %%mm2, %%mm1\n\t"
158 " punpckldq %%mm6, %%mm5\n\t"
159 /**/
160 " pfadd %%mm1, %%mm0\n\t"
161 " pfadd %%mm5, %%mm4\n\t"
162 " movq %%mm0, (%%ecx)\n\t"
163 " movq %%mm4, 8(%%ecx)\n\t"
164 " pfsub %%mm1, %%mm3\n\t"
165 " pfsub %%mm5, %%mm7\n\t"
166 " pfmul 64(%%ebx), %%mm3\n\t"
167 " pfmul 72(%%ebx), %%mm7\n\t"
168 " movd %%mm3, 60(%%ecx)\n\t"
169 " movd %%mm7, 52(%%ecx)\n\t"
170 " psrlq $32, %%mm3\n\t"
171 " psrlq $32, %%mm7\n\t"
172 " movd %%mm3, 56(%%ecx)\n\t"
173 " movd %%mm7, 48(%%ecx)\n\t"
174
175 " movq 16(%%edx), %%mm0\n\t"
176 " movq 24(%%edx), %%mm4\n\t"
177 " movq %%mm0, %%mm3\n\t"
178 " movq %%mm4, %%mm7\n\t"
179 " movq 40(%%edx), %%mm1\n\t"
180 " movq 32(%%edx), %%mm5\n\t"
181 /* n.b.: pswapd*/
182 " movq %%mm1, %%mm2\n\t"
183 " movq %%mm5, %%mm6\n\t"
184 " psrlq $32, %%mm1\n\t"
185 " psrlq $32, %%mm5\n\t"
186 " punpckldq %%mm2, %%mm1\n\t"
187 " punpckldq %%mm6, %%mm5\n\t"
188 /**/
189 " pfadd %%mm1, %%mm0\n\t"
190 " pfadd %%mm5, %%mm4\n\t"
191 " movq %%mm0, 16(%%ecx)\n\t"
192 " movq %%mm4, 24(%%ecx)\n\t"
193 " pfsub %%mm1, %%mm3\n\t"
194 " pfsub %%mm5, %%mm7\n\t"
195 " pfmul 80(%%ebx), %%mm3\n\t"
196 " pfmul 88(%%ebx), %%mm7\n\t"
197 " movd %%mm3, 44(%%ecx)\n\t"
198 " movd %%mm7, 36(%%ecx)\n\t"
199 " psrlq $32, %%mm3\n\t"
200 " psrlq $32, %%mm7\n\t"
201 " movd %%mm3, 40(%%ecx)\n\t"
202 " movd %%mm7, 32(%%ecx)\n\t"
203
204 /* Phase 3*/
205
206 " movq 64(%%edx), %%mm0\n\t"
207 " movq 72(%%edx), %%mm4\n\t"
208 " movq %%mm0, %%mm3\n\t"
209 " movq %%mm4, %%mm7\n\t"
210 " movq 120(%%edx), %%mm1\n\t"
211 " movq 112(%%edx), %%mm5\n\t"
212 /* n.b.: pswapd*/
213 " movq %%mm1, %%mm2\n\t"
214 " movq %%mm5, %%mm6\n\t"
215 " psrlq $32, %%mm1\n\t"
216 " psrlq $32, %%mm5\n\t"
217 " punpckldq %%mm2, %%mm1\n\t"
218 " punpckldq %%mm6, %%mm5\n\t"
219 /**/
220 " pfadd %%mm1, %%mm0\n\t"
221 " pfadd %%mm5, %%mm4\n\t"
222 " movq %%mm0, 64(%%ecx)\n\t"
223 " movq %%mm4, 72(%%ecx)\n\t"
224 " pfsubr %%mm1, %%mm3\n\t"
225 " pfsubr %%mm5, %%mm7\n\t"
226 " pfmul 64(%%ebx), %%mm3\n\t"
227 " pfmul 72(%%ebx), %%mm7\n\t"
228 " movd %%mm3, 124(%%ecx)\n\t"
229 " movd %%mm7, 116(%%ecx)\n\t"
230 " psrlq $32, %%mm3\n\t"
231 " psrlq $32, %%mm7\n\t"
232 " movd %%mm3, 120(%%ecx)\n\t"
233 " movd %%mm7, 112(%%ecx)\n\t"
234
235 " movq 80(%%edx), %%mm0\n\t"
236 " movq 88(%%edx), %%mm4\n\t"
237 " movq %%mm0, %%mm3\n\t"
238 " movq %%mm4, %%mm7\n\t"
239 " movq 104(%%edx), %%mm1\n\t"
240 " movq 96(%%edx), %%mm5\n\t"
241 /* n.b.: pswapd*/
242 " movq %%mm1, %%mm2\n\t"
243 " movq %%mm5, %%mm6\n\t"
244 " psrlq $32, %%mm1\n\t"
245 " psrlq $32, %%mm5\n\t"
246 " punpckldq %%mm2, %%mm1\n\t"
247 " punpckldq %%mm6, %%mm5\n\t"
248 /**/
249 " pfadd %%mm1, %%mm0\n\t"
250 " pfadd %%mm5, %%mm4\n\t"
251 " movq %%mm0, 80(%%ecx)\n\t"
252 " movq %%mm4, 88(%%ecx)\n\t"
253 " pfsubr %%mm1, %%mm3\n\t"
254 " pfsubr %%mm5, %%mm7\n\t"
255 " pfmul 80(%%ebx), %%mm3\n\t"
256 " pfmul 88(%%ebx), %%mm7\n\t"
257 " movd %%mm3, 108(%%ecx)\n\t"
258 " movd %%mm7, 100(%%ecx)\n\t"
259 " psrlq $32, %%mm3\n\t"
260 " psrlq $32, %%mm7\n\t"
261 " movd %%mm3, 104(%%ecx)\n\t"
262 " movd %%mm7, 96(%%ecx)\n\t"
263
264 /* Phase 4*/
265
266 " movq (%%ecx), %%mm0\n\t"
267 " movq 8(%%ecx), %%mm4\n\t"
268 " movq %%mm0, %%mm3\n\t"
269 " movq %%mm4, %%mm7\n\t"
270 " movq 24(%%ecx), %%mm1\n\t"
271 " movq 16(%%ecx), %%mm5\n\t"
272 /* n.b.: pswapd*/
273 " movq %%mm1, %%mm2\n\t"
274 " movq %%mm5, %%mm6\n\t"
275 " psrlq $32, %%mm1\n\t"
276 " psrlq $32, %%mm5\n\t"
277 " punpckldq %%mm2, %%mm1\n\t"
278 " punpckldq %%mm6, %%mm5\n\t"
279 /**/
280 " pfadd %%mm1, %%mm0\n\t"
281 " pfadd %%mm5, %%mm4\n\t"
282 " movq %%mm0, (%%edx)\n\t"
283 " movq %%mm4, 8(%%edx)\n\t"
284 " pfsub %%mm1, %%mm3\n\t"
285 " pfsub %%mm5, %%mm7\n\t"
286 " pfmul 96(%%ebx), %%mm3\n\t"
287 " pfmul 104(%%ebx), %%mm7\n\t"
288 " movd %%mm3, 28(%%edx)\n\t"
289 " movd %%mm7, 20(%%edx)\n\t"
290 " psrlq $32, %%mm3\n\t"
291 " psrlq $32, %%mm7\n\t"
292 " movd %%mm3, 24(%%edx)\n\t"
293 " movd %%mm7, 16(%%edx)\n\t"
294
295 " movq 32(%%ecx), %%mm0\n\t"
296 " movq 40(%%ecx), %%mm4\n\t"
297 " movq %%mm0, %%mm3\n\t"
298 " movq %%mm4, %%mm7\n\t"
299 " movq 56(%%ecx), %%mm1\n\t"
300 " movq 48(%%ecx), %%mm5\n\t"
301 /* n.b.: pswapd*/
302 " movq %%mm1, %%mm2\n\t"
303 " movq %%mm5, %%mm6\n\t"
304 " psrlq $32, %%mm1\n\t"
305 " psrlq $32, %%mm5\n\t"
306 " punpckldq %%mm2, %%mm1\n\t"
307 " punpckldq %%mm6, %%mm5\n\t"
308 /**/
309 " pfadd %%mm1, %%mm0\n\t"
310 " pfadd %%mm5, %%mm4\n\t"
311 " movq %%mm0, 32(%%edx)\n\t"
312 " movq %%mm4, 40(%%edx)\n\t"
313 " pfsubr %%mm1, %%mm3\n\t"
314 " pfsubr %%mm5, %%mm7\n\t"
315 " pfmul 96(%%ebx), %%mm3\n\t"
316 " pfmul 104(%%ebx), %%mm7\n\t"
317 " movd %%mm3, 60(%%edx)\n\t"
318 " movd %%mm7, 52(%%edx)\n\t"
319 " psrlq $32, %%mm3\n\t"
320 " psrlq $32, %%mm7\n\t"
321 " movd %%mm3, 56(%%edx)\n\t"
322 " movd %%mm7, 48(%%edx)\n\t"
323
324 " movq 64(%%ecx), %%mm0\n\t"
325 " movq 72(%%ecx), %%mm4\n\t"
326 " movq %%mm0, %%mm3\n\t"
327 " movq %%mm4, %%mm7\n\t"
328 " movq 88(%%ecx), %%mm1\n\t"
329 " movq 80(%%ecx), %%mm5\n\t"
330 /* n.b.: pswapd*/
331 " movq %%mm1, %%mm2\n\t"
332 " movq %%mm5, %%mm6\n\t"
333 " psrlq $32, %%mm1\n\t"
334 " psrlq $32, %%mm5\n\t"
335 " punpckldq %%mm2, %%mm1\n\t"
336 " punpckldq %%mm6, %%mm5\n\t"
337 /**/
338 " pfadd %%mm1, %%mm0\n\t"
339 " pfadd %%mm5, %%mm4\n\t"
340 " movq %%mm0, 64(%%edx)\n\t"
341 " movq %%mm4, 72(%%edx)\n\t"
342 " pfsub %%mm1, %%mm3\n\t"
343 " pfsub %%mm5, %%mm7\n\t"
344 " pfmul 96(%%ebx), %%mm3\n\t"
345 " pfmul 104(%%ebx), %%mm7\n\t"
346 " movd %%mm3, 92(%%edx)\n\t"
347 " movd %%mm7, 84(%%edx)\n\t"
348 " psrlq $32, %%mm3\n\t"
349 " psrlq $32, %%mm7\n\t"
350 " movd %%mm3, 88(%%edx)\n\t"
351 " movd %%mm7, 80(%%edx)\n\t"
352
353 " movq 96(%%ecx), %%mm0\n\t"
354 " movq 104(%%ecx), %%mm4\n\t"
355 " movq %%mm0, %%mm3\n\t"
356 " movq %%mm4, %%mm7\n\t"
357 " movq 120(%%ecx), %%mm1\n\t"
358 " movq 112(%%ecx), %%mm5\n\t"
359 /* n.b.: pswapd*/
360 " movq %%mm1, %%mm2\n\t"
361 " movq %%mm5, %%mm6\n\t"
362 " psrlq $32, %%mm1\n\t"
363 " psrlq $32, %%mm5\n\t"
364 " punpckldq %%mm2, %%mm1\n\t"
365 " punpckldq %%mm6, %%mm5\n\t"
366 /**/
367 " pfadd %%mm1, %%mm0\n\t"
368 " pfadd %%mm5, %%mm4\n\t"
369 " movq %%mm0, 96(%%edx)\n\t"
370 " movq %%mm4, 104(%%edx)\n\t"
371 " pfsubr %%mm1, %%mm3\n\t"
372 " pfsubr %%mm5, %%mm7\n\t"
373 " pfmul 96(%%ebx), %%mm3\n\t"
374 " pfmul 104(%%ebx), %%mm7\n\t"
375 " movd %%mm3, 124(%%edx)\n\t"
376 " movd %%mm7, 116(%%edx)\n\t"
377 " psrlq $32, %%mm3\n\t"
378 " psrlq $32, %%mm7\n\t"
379 " movd %%mm3, 120(%%edx)\n\t"
380 " movd %%mm7, 112(%%edx)\n\t"
381
382 /* Phase 5 */
383
384 " movq (%%edx), %%mm0\n\t"
385 " movq 16(%%edx), %%mm4\n\t"
386 " movq %%mm0, %%mm3\n\t"
387 " movq %%mm4, %%mm7\n\t"
388 " movq 8(%%edx), %%mm1\n\t"
389 " movq 24(%%edx), %%mm5\n\t"
390 /* n.b.: pswapd*/
391 " movq %%mm1, %%mm2\n\t"
392 " movq %%mm5, %%mm6\n\t"
393 " psrlq $32, %%mm1\n\t"
394 " psrlq $32, %%mm5\n\t"
395 " punpckldq %%mm2, %%mm1\n\t"
396 " punpckldq %%mm6, %%mm5\n\t"
397 /**/
398 " pfadd %%mm1, %%mm0\n\t"
399 " pfadd %%mm5, %%mm4\n\t"
400 " movq %%mm0, (%%ecx)\n\t"
401 " movq %%mm4, 16(%%ecx)\n\t"
402 " pfsub %%mm1, %%mm3\n\t"
403 " pfsubr %%mm5, %%mm7\n\t"
404 " pfmul 112(%%ebx), %%mm3\n\t"
405 " pfmul 112(%%ebx), %%mm7\n\t"
406 " movd %%mm3, 12(%%ecx)\n\t"
407 " movd %%mm7, 28(%%ecx)\n\t"
408 " psrlq $32, %%mm3\n\t"
409 " psrlq $32, %%mm7\n\t"
410 " movd %%mm3, 8(%%ecx)\n\t"
411 " movd %%mm7, 24(%%ecx)\n\t"
412
413 " movq 32(%%edx), %%mm0\n\t"
414 " movq 48(%%edx), %%mm4\n\t"
415 " movq %%mm0, %%mm3\n\t"
416 " movq %%mm4, %%mm7\n\t"
417 " movq 40(%%edx), %%mm1\n\t"
418 " movq 56(%%edx), %%mm5\n\t"
419 /* n.b.: pswapd*/
420 " movq %%mm1, %%mm2\n\t"
421 " movq %%mm5, %%mm6\n\t"
422 " psrlq $32, %%mm1\n\t"
423 " psrlq $32, %%mm5\n\t"
424 " punpckldq %%mm2, %%mm1\n\t"
425 " punpckldq %%mm6, %%mm5\n\t"
426 /**/
427 " pfadd %%mm1, %%mm0\n\t"
428 " pfadd %%mm5, %%mm4\n\t"
429 " movq %%mm0, 32(%%ecx)\n\t"
430 " movq %%mm4, 48(%%ecx)\n\t"
431 " pfsub %%mm1, %%mm3\n\t"
432 " pfsubr %%mm5, %%mm7\n\t"
433 " pfmul 112(%%ebx), %%mm3\n\t"
434 " pfmul 112(%%ebx), %%mm7\n\t"
435 " movd %%mm3, 44(%%ecx)\n\t"
436 " movd %%mm7, 60(%%ecx)\n\t"
437 " psrlq $32, %%mm3\n\t"
438 " psrlq $32, %%mm7\n\t"
439 " movd %%mm3, 40(%%ecx)\n\t"
440 " movd %%mm7, 56(%%ecx)\n\t"
441
442 " movq 64(%%edx), %%mm0\n\t"
443 " movq 80(%%edx), %%mm4\n\t"
444 " movq %%mm0, %%mm3\n\t"
445 " movq %%mm4, %%mm7\n\t"
446 " movq 72(%%edx), %%mm1\n\t"
447 " movq 88(%%edx), %%mm5\n\t"
448 /* n.b.: pswapd*/
449 " movq %%mm1, %%mm2\n\t"
450 " movq %%mm5, %%mm6\n\t"
451 " psrlq $32, %%mm1\n\t"
452 " psrlq $32, %%mm5\n\t"
453 " punpckldq %%mm2, %%mm1\n\t"
454 " punpckldq %%mm6, %%mm5\n\t"
455 /**/
456 " pfadd %%mm1, %%mm0\n\t"
457 " pfadd %%mm5, %%mm4\n\t"
458 " movq %%mm0, 64(%%ecx)\n\t"
459 " movq %%mm4, 80(%%ecx)\n\t"
460 " pfsub %%mm1, %%mm3\n\t"
461 " pfsubr %%mm5, %%mm7\n\t"
462 " pfmul 112(%%ebx), %%mm3\n\t"
463 " pfmul 112(%%ebx), %%mm7\n\t"
464 " movd %%mm3, 76(%%ecx)\n\t"
465 " movd %%mm7, 92(%%ecx)\n\t"
466 " psrlq $32, %%mm3\n\t"
467 " psrlq $32, %%mm7\n\t"
468 " movd %%mm3, 72(%%ecx)\n\t"
469 " movd %%mm7, 88(%%ecx)\n\t"
470
471 " movq 96(%%edx), %%mm0\n\t"
472 " movq 112(%%edx), %%mm4\n\t"
473 " movq %%mm0, %%mm3\n\t"
474 " movq %%mm4, %%mm7\n\t"
475 " movq 104(%%edx), %%mm1\n\t"
476 " movq 120(%%edx), %%mm5\n\t"
477 /* n.b.: pswapd*/
478 " movq %%mm1, %%mm2\n\t"
479 " movq %%mm5, %%mm6\n\t"
480 " psrlq $32, %%mm1\n\t"
481 " psrlq $32, %%mm5\n\t"
482 " punpckldq %%mm2, %%mm1\n\t"
483 " punpckldq %%mm6, %%mm5\n\t"
484 /**/
485 " pfadd %%mm1, %%mm0\n\t"
486 " pfadd %%mm5, %%mm4\n\t"
487 " movq %%mm0, 96(%%ecx)\n\t"
488 " movq %%mm4, 112(%%ecx)\n\t"
489 " pfsub %%mm1, %%mm3\n\t"
490 " pfsubr %%mm5, %%mm7\n\t"
491 " pfmul 112(%%ebx), %%mm3\n\t"
492 " pfmul 112(%%ebx), %%mm7\n\t"
493 " movd %%mm3, 108(%%ecx)\n\t"
494 " movd %%mm7, 124(%%ecx)\n\t"
495 " psrlq $32, %%mm3\n\t"
496 " psrlq $32, %%mm7\n\t"
497 " movd %%mm3, 104(%%ecx)\n\t"
498 " movd %%mm7, 120(%%ecx)\n\t"
499
500 /* Phase 6. This is the end of easy road. */
501 /* Code below is coded in scalar mode. Should be optimized */
502
503 " movd plus_1f, %%mm6\n\t"
504 " punpckldq 120(%%ebx), %%mm6\n\t" /* mm6 = 1.0 | 120(%%ebx)*/
505 " movq x_plus_minus_3dnow, %%mm7\n\t" /* mm7 = +1 | -1 */
506
507 " movq 32(%%ecx), %%mm0\n\t"
508 " movq 64(%%ecx), %%mm2\n\t"
509 " movq %%mm0, %%mm1\n\t"
510 " movq %%mm2, %%mm3\n\t"
511 " pxor %%mm7, %%mm1\n\t"
512 " pxor %%mm7, %%mm3\n\t"
513 " pfacc %%mm1, %%mm0\n\t"
514 " pfacc %%mm3, %%mm2\n\t"
515 " pfmul %%mm6, %%mm0\n\t"
516 " pfmul %%mm6, %%mm2\n\t"
517 " movq %%mm0, 32(%%edx)\n\t"
518 " movq %%mm2, 64(%%edx)\n\t"
519
520 " movd 44(%%ecx), %%mm0\n\t"
521 " movd 40(%%ecx), %%mm2\n\t"
522 " movd 120(%%ebx), %%mm3\n\t"
523 " punpckldq 76(%%ecx), %%mm0\n\t"
524 " punpckldq 72(%%ecx), %%mm2\n\t"
525 " punpckldq %%mm3, %%mm3\n\t"
526 " movq %%mm0, %%mm4\n\t"
527 " movq %%mm2, %%mm5\n\t"
528 " pfsub %%mm2, %%mm0\n\t"
529 " pfmul %%mm3, %%mm0\n\t"
530 " movq %%mm0, %%mm1\n\t"
531 " pfadd %%mm5, %%mm0\n\t"
532 " pfadd %%mm4, %%mm0\n\t"
533 " movq %%mm0, %%mm2\n\t"
534 " punpckldq %%mm1, %%mm0\n\t"
535 " punpckhdq %%mm1, %%mm2\n\t"
536 " movq %%mm0, 40(%%edx)\n\t"
537 " movq %%mm2, 72(%%edx)\n\t"
538
539 " movd 48(%%ecx), %%mm3\n\t"
540 " movd 60(%%ecx), %%mm2\n\t"
541 " pfsub 52(%%ecx), %%mm3\n\t"
542 " pfsub 56(%%ecx), %%mm2\n\t"
543 " pfmul 120(%%ebx), %%mm3\n\t"
544 " pfmul 120(%%ebx), %%mm2\n\t"
545 " movq %%mm2, %%mm1\n\t"
546
547 " pfadd 56(%%ecx), %%mm1\n\t"
548 " pfadd 60(%%ecx), %%mm1\n\t"
549 " movq %%mm1, %%mm0\n\t"
550
551 " pfadd 48(%%ecx), %%mm0\n\t"
552 " pfadd 52(%%ecx), %%mm0\n\t"
553 " pfadd %%mm3, %%mm1\n\t"
554 " punpckldq %%mm2, %%mm1\n\t"
555 " pfadd %%mm3, %%mm2\n\t"
556 " punpckldq %%mm2, %%mm0\n\t"
557 " movq %%mm1, 56(%%edx)\n\t"
558 " movq %%mm0, 48(%%edx)\n\t"
559
560 /*---*/
561
562 " movd 92(%%ecx), %%mm1\n\t"
563 " pfsub 88(%%ecx), %%mm1\n\t"
564 " pfmul 120(%%ebx), %%mm1\n\t"
565 " movd %%mm1, 92(%%edx)\n\t"
566 " pfadd 92(%%ecx), %%mm1\n\t"
567 " pfadd 88(%%ecx), %%mm1\n\t"
568 " movq %%mm1, %%mm0\n\t"
569
570 " pfadd 80(%%ecx), %%mm0\n\t"
571 " pfadd 84(%%ecx), %%mm0\n\t"
572 " movd %%mm0, 80(%%edx)\n\t"
573
574 " movd 80(%%ecx), %%mm0\n\t"
575 " pfsub 84(%%ecx), %%mm0\n\t"
576 " pfmul 120(%%ebx), %%mm0\n\t"
577 " pfadd %%mm0, %%mm1\n\t"
578 " pfadd 92(%%edx), %%mm0\n\t"
579 " punpckldq %%mm1, %%mm0\n\t"
580 " movq %%mm0, 84(%%edx)\n\t"
581
582 " movq 96(%%ecx), %%mm0\n\t"
583 " movq %%mm0, %%mm1\n\t"
584 " pxor %%mm7, %%mm1\n\t"
585 " pfacc %%mm1, %%mm0\n\t"
586 " pfmul %%mm6, %%mm0\n\t"
587 " movq %%mm0, 96(%%edx)\n\t"
588
589 " movd 108(%%ecx), %%mm0\n\t"
590 " pfsub 104(%%ecx), %%mm0\n\t"
591 " pfmul 120(%%ebx), %%mm0\n\t"
592 " movd %%mm0, 108(%%edx)\n\t"
593 " pfadd 104(%%ecx), %%mm0\n\t"
594 " pfadd 108(%%ecx), %%mm0\n\t"
595 " movd %%mm0, 104(%%edx)\n\t"
596
597 " movd 124(%%ecx), %%mm1\n\t"
598 " pfsub 120(%%ecx), %%mm1\n\t"
599 " pfmul 120(%%ebx), %%mm1\n\t"
600 " movd %%mm1, 124(%%edx)\n\t"
601 " pfadd 120(%%ecx), %%mm1\n\t"
602 " pfadd 124(%%ecx), %%mm1\n\t"
603 " movq %%mm1, %%mm0\n\t"
604
605 " pfadd 112(%%ecx), %%mm0\n\t"
606 " pfadd 116(%%ecx), %%mm0\n\t"
607 " movd %%mm0, 112(%%edx)\n\t"
608
609 " movd 112(%%ecx), %%mm0\n\t"
610 " pfsub 116(%%ecx), %%mm0\n\t"
611 " pfmul 120(%%ebx), %%mm0\n\t"
612 " pfadd %%mm0,%%mm1\n\t"
613 " pfadd 124(%%edx), %%mm0\n\t"
614 " punpckldq %%mm1, %%mm0\n\t"
615 " movq %%mm0, 116(%%edx)\n\t"
616
617 " jnz .L01\n\t"
618
619 /* Phase 7*/
620 /* Code below is coded in scalar mode. Should be optimized */
621
622 " movd (%%ecx), %%mm0\n\t"
623 " pfadd 4(%%ecx), %%mm0\n\t"
624 " movd %%mm0, 1024(%%esi)\n\t"
625
626 " movd (%%ecx), %%mm0\n\t"
627 " pfsub 4(%%ecx), %%mm0\n\t"
628 " pfmul 120(%%ebx), %%mm0\n\t"
629 " movd %%mm0, (%%esi)\n\t"
630 " movd %%mm0, (%%edi)\n\t"
631
632 " movd 12(%%ecx), %%mm0\n\t"
633 " pfsub 8(%%ecx), %%mm0\n\t"
634 " pfmul 120(%%ebx), %%mm0\n\t"
635 " movd %%mm0, 512(%%edi)\n\t"
636 " pfadd 12(%%ecx), %%mm0\n\t"
637 " pfadd 8(%%ecx), %%mm0\n\t"
638 " movd %%mm0, 512(%%esi)\n\t"
639
640 " movd 16(%%ecx), %%mm0\n\t"
641 " pfsub 20(%%ecx), %%mm0\n\t"
642 " pfmul 120(%%ebx), %%mm0\n\t"
643 " movq %%mm0, %%mm3\n\t"
644
645 " movd 28(%%ecx), %%mm0\n\t"
646 " pfsub 24(%%ecx), %%mm0\n\t"
647 " pfmul 120(%%ebx), %%mm0\n\t"
648 " movd %%mm0, 768(%%edi)\n\t"
649 " movq %%mm0, %%mm2\n\t"
650
651 " pfadd 24(%%ecx), %%mm0\n\t"
652 " pfadd 28(%%ecx), %%mm0\n\t"
653 " movq %%mm0, %%mm1\n\t"
654
655 " pfadd 16(%%ecx), %%mm0\n\t"
656 " pfadd 20(%%ecx), %%mm0\n\t"
657 " movd %%mm0, 768(%%esi)\n\t"
658 " pfadd %%mm3, %%mm1\n\t"
659 " movd %%mm1, 256(%%esi)\n\t"
660 " pfadd %%mm3, %%mm2\n\t"
661 " movd %%mm2, 256(%%edi)\n\t"
662
663 /* Phase 8*/
664
665 " movq 32(%%edx), %%mm0\n\t"
666 " movq 48(%%edx), %%mm1\n\t"
667 " pfadd 48(%%edx), %%mm0\n\t"
668 " pfadd 40(%%edx), %%mm1\n\t"
669 " movd %%mm0, 896(%%esi)\n\t"
670 " movd %%mm1, 640(%%esi)\n\t"
671 " psrlq $32, %%mm0\n\t"
672 " psrlq $32, %%mm1\n\t"
673 " movd %%mm0, 128(%%edi)\n\t"
674 " movd %%mm1, 384(%%edi)\n\t"
675
676 " movd 40(%%edx), %%mm0\n\t"
677 " pfadd 56(%%edx), %%mm0\n\t"
678 " movd %%mm0, 384(%%esi)\n\t"
679
680 " movd 56(%%edx), %%mm0\n\t"
681 " pfadd 36(%%edx), %%mm0\n\t"
682 " movd %%mm0, 128(%%esi)\n\t"
683
684 " movd 60(%%edx), %%mm0\n\t"
685 " movd %%mm0, 896(%%edi)\n\t"
686 " pfadd 44(%%edx), %%mm0\n\t"
687 " movd %%mm0, 640(%%edi)\n\t"
688
689 " movq 96(%%edx), %%mm0\n\t"
690 " movq 112(%%edx), %%mm2\n\t"
691 " movq 104(%%edx), %%mm4\n\t"
692 " pfadd 112(%%edx), %%mm0\n\t"
693 " pfadd 104(%%edx), %%mm2\n\t"
694 " pfadd 120(%%edx), %%mm4\n\t"
695 " movq %%mm0, %%mm1\n\t"
696 " movq %%mm2, %%mm3\n\t"
697 " movq %%mm4, %%mm5\n\t"
698 " pfadd 64(%%edx), %%mm0\n\t"
699 " pfadd 80(%%edx), %%mm2\n\t"
700 " pfadd 72(%%edx), %%mm4\n\t"
701 " movd %%mm0, 960(%%esi)\n\t"
702 " movd %%mm2, 704(%%esi)\n\t"
703 " movd %%mm4, 448(%%esi)\n\t"
704 " psrlq $32, %%mm0\n\t"
705 " psrlq $32, %%mm2\n\t"
706 " psrlq $32, %%mm4\n\t"
707 " movd %%mm0, 64(%%edi)\n\t"
708 " movd %%mm2, 320(%%edi)\n\t"
709 " movd %%mm4, 576(%%edi)\n\t"
710 " pfadd 80(%%edx), %%mm1\n\t"
711 " pfadd 72(%%edx), %%mm3\n\t"
712 " pfadd 88(%%edx), %%mm5\n\t"
713 " movd %%mm1, 832(%%esi)\n\t"
714 " movd %%mm3, 576(%%esi)\n\t"
715 " movd %%mm5, 320(%%esi)\n\t"
716 " psrlq $32, %%mm1\n\t"
717 " psrlq $32, %%mm3\n\t"
718 " psrlq $32, %%mm5\n\t"
719 " movd %%mm1, 192(%%edi)\n\t"
720 " movd %%mm3, 448(%%edi)\n\t"
721 " movd %%mm5, 704(%%edi)\n\t"
722
723 " movd 120(%%edx), %%mm0\n\t"
724 " pfadd 100(%%edx), %%mm0\n\t"
725 " movq %%mm0, %%mm1\n\t"
726 " pfadd 88(%%edx), %%mm0\n\t"
727 " movd %%mm0, 192(%%esi)\n\t"
728 " pfadd 68(%%edx), %%mm1\n\t"
729 " movd %%mm1, 64(%%esi)\n\t"
730
731 " movd 124(%%edx), %%mm0\n\t"
732 " movd %%mm0, 960(%%edi)\n\t"
733 " pfadd 92(%%edx), %%mm0\n\t"
734 " movd %%mm0, 832(%%edi)\n\t"
735
736 " jmp .L_bye\n\t"
737 ".L01:\n\t"
738 /* Phase 9*/
739
740 " movq (%%ecx), %%mm0\n\t"
741 " movq %%mm0, %%mm1\n\t"
742 " pxor %%mm7, %%mm1\n\t"
743 " pfacc %%mm1, %%mm0\n\t"
744 " pfmul %%mm6, %%mm0\n\t"
745 " pf2id %%mm0, %%mm0\n\t"
746 " movd %%mm0, %%eax\n\t"
747 " movw %%ax, 512(%%esi)\n\t"
748 " psrlq $32, %%mm0\n\t"
749 " movd %%mm0, %%eax\n\t"
750 " movw %%ax, (%%esi)\n\t"
751
752 " movd 12(%%ecx), %%mm0\n\t"
753 " pfsub 8(%%ecx), %%mm0\n\t"
754 " pfmul 120(%%ebx), %%mm0\n\t"
755 " pf2id %%mm0, %%mm7\n\t"
756 " movd %%mm7, %%eax\n\t"
757 " movw %%ax, 256(%%edi)\n\t"
758 " pfadd 12(%%ecx), %%mm0\n\t"
759 " pfadd 8(%%ecx), %%mm0\n\t"
760 " pf2id %%mm0, %%mm0\n\t"
761 " movd %%mm0, %%eax\n\t"
762 " movw %%ax, 256(%%esi)\n\t"
763
764 " movd 16(%%ecx), %%mm3\n\t"
765 " pfsub 20(%%ecx), %%mm3\n\t"
766 " pfmul 120(%%ebx), %%mm3\n\t"
767 " movq %%mm3, %%mm2\n\t"
768
769 " movd 28(%%ecx), %%mm2\n\t"
770 " pfsub 24(%%ecx), %%mm2\n\t"
771 " pfmul 120(%%ebx), %%mm2\n\t"
772 " movq %%mm2, %%mm1\n\t"
773
774 " pf2id %%mm2, %%mm7\n\t"
775 " movd %%mm7, %%eax\n\t"
776 " movw %%ax, 384(%%edi)\n\t"
777
778 " pfadd 24(%%ecx), %%mm1\n\t"
779 " pfadd 28(%%ecx), %%mm1\n\t"
780 " movq %%mm1, %%mm0\n\t"
781
782 " pfadd 16(%%ecx), %%mm0\n\t"
783 " pfadd 20(%%ecx), %%mm0\n\t"
784 " pf2id %%mm0, %%mm0\n\t"
785 " movd %%mm0, %%eax\n\t"
786 " movw %%ax, 384(%%esi)\n\t"
787 " pfadd %%mm3, %%mm1\n\t"
788 " pf2id %%mm1, %%mm1\n\t"
789 " movd %%mm1, %%eax\n\t"
790 " movw %%ax, 128(%%esi)\n\t"
791 " pfadd %%mm3, %%mm2\n\t"
792 " pf2id %%mm2, %%mm2\n\t"
793 " movd %%mm2, %%eax\n\t"
794 " movw %%ax, 128(%%edi)\n\t"
795
796 /* Phase 10*/
797
798 " movq 32(%%edx), %%mm0\n\t"
799 " movq 48(%%edx), %%mm1\n\t"
800 " pfadd 48(%%edx), %%mm0\n\t"
801 " pfadd 40(%%edx), %%mm1\n\t"
802 " pf2id %%mm0, %%mm0\n\t"
803 " pf2id %%mm1, %%mm1\n\t"
804 " movd %%mm0, %%eax\n\t"
805 " movd %%mm1, %%ecx\n\t"
806 " movw %%ax, 448(%%esi)\n\t"
807 " movw %%cx, 320(%%esi)\n\t"
808 " psrlq $32, %%mm0\n\t"
809 " psrlq $32, %%mm1\n\t"
810 " movd %%mm0, %%eax\n\t"
811 " movd %%mm1, %%ecx\n\t"
812 " movw %%ax, 64(%%edi)\n\t"
813 " movw %%cx, 192(%%edi)\n\t"
814
815 " movd 40(%%edx), %%mm3\n\t"
816 " movd 56(%%edx), %%mm4\n\t"
817 " movd 60(%%edx), %%mm0\n\t"
818 " movd 44(%%edx), %%mm2\n\t"
819 " movd 120(%%edx), %%mm5\n\t"
820 " punpckldq %%mm4, %%mm3\n\t"
821 " punpckldq 124(%%edx), %%mm0\n\t"
822 " pfadd 100(%%edx), %%mm5\n\t"
823 " punpckldq 36(%%edx), %%mm4\n\t"
824 " punpckldq 92(%%edx), %%mm2\n\t"
825 " movq %%mm5, %%mm6\n\t"
826 " pfadd %%mm4, %%mm3\n\t"
827 " pf2id %%mm0, %%mm1\n\t"
828 " pf2id %%mm3, %%mm3\n\t"
829 " pfadd 88(%%edx), %%mm5\n\t"
830 " movd %%mm1, %%eax\n\t"
831 " movd %%mm3, %%ecx\n\t"
832 " movw %%ax, 448(%%edi)\n\t"
833 " movw %%cx, 192(%%esi)\n\t"
834 " pf2id %%mm5, %%mm5\n\t"
835 " psrlq $32, %%mm1\n\t"
836 " psrlq $32, %%mm3\n\t"
837 " movd %%mm5, %%ebx\n\t"
838 " movd %%mm1, %%eax\n\t"
839 " movd %%mm3, %%ecx\n\t"
840 " movw %%bx, 96(%%esi)\n\t"
841 " movw %%ax, 480(%%edi)\n\t"
842 " movw %%cx, 64(%%esi)\n\t"
843 " pfadd %%mm2, %%mm0\n\t"
844 " pf2id %%mm0, %%mm0\n\t"
845 " movd %%mm0, %%eax\n\t"
846 " pfadd 68(%%edx), %%mm6\n\t"
847 " movw %%ax, 320(%%edi)\n\t"
848 " psrlq $32, %%mm0\n\t"
849 " pf2id %%mm6, %%mm6\n\t"
850 " movd %%mm0, %%eax\n\t"
851 " movd %%mm6, %%ebx\n\t"
852 " movw %%ax, 416(%%edi)\n\t"
853 " movw %%bx, 32(%%esi)\n\t"
854
855 " movq 96(%%edx), %%mm0\n\t"
856 " movq 112(%%edx), %%mm2\n\t"
857 " movq 104(%%edx), %%mm4\n\t"
858 " pfadd %%mm2, %%mm0\n\t"
859 " pfadd %%mm4, %%mm2\n\t"
860 " pfadd 120(%%edx), %%mm4\n\t"
861 " movq %%mm0, %%mm1\n\t"
862 " movq %%mm2, %%mm3\n\t"
863 " movq %%mm4, %%mm5\n\t"
864 " pfadd 64(%%edx), %%mm0\n\t"
865 " pfadd 80(%%edx), %%mm2\n\t"
866 " pfadd 72(%%edx), %%mm4\n\t"
867 " pf2id %%mm0, %%mm0\n\t"
868 " pf2id %%mm2, %%mm2\n\t"
869 " pf2id %%mm4, %%mm4\n\t"
870 " movd %%mm0, %%eax\n\t"
871 " movd %%mm2, %%ecx\n\t"
872 " movd %%mm4, %%ebx\n\t"
873 " movw %%ax, 480(%%esi)\n\t"
874 " movw %%cx, 352(%%esi)\n\t"
875 " movw %%bx, 224(%%esi)\n\t"
876 " psrlq $32, %%mm0\n\t"
877 " psrlq $32, %%mm2\n\t"
878 " psrlq $32, %%mm4\n\t"
879 " movd %%mm0, %%eax\n\t"
880 " movd %%mm2, %%ecx\n\t"
881 " movd %%mm4, %%ebx\n\t"
882 " movw %%ax, 32(%%edi)\n\t"
883 " movw %%cx, 160(%%edi)\n\t"
884 " movw %%bx, 288(%%edi)\n\t"
885 " pfadd 80(%%edx), %%mm1\n\t"
886 " pfadd 72(%%edx), %%mm3\n\t"
887 " pfadd 88(%%edx), %%mm5\n\t"
888 " pf2id %%mm1, %%mm1\n\t"
889 " pf2id %%mm3, %%mm3\n\t"
890 " pf2id %%mm5, %%mm5\n\t"
891 " movd %%mm1, %%eax\n\t"
892 " movd %%mm3, %%ecx\n\t"
893 " movd %%mm5, %%ebx\n\t"
894 " movw %%ax, 416(%%esi)\n\t"
895 " movw %%cx, 288(%%esi)\n\t"
896 " movw %%bx, 160(%%esi)\n\t"
897 " psrlq $32, %%mm1\n\t"
898 " psrlq $32, %%mm3\n\t"
899 " psrlq $32, %%mm5\n\t"
900 " movd %%mm1, %%eax\n\t"
901 " movd %%mm3, %%ecx\n\t"
902 " movd %%mm5, %%ebx\n\t"
903 " movw %%ax, 96(%%edi)\n\t"
904 " movw %%cx, 224(%%edi)\n\t"
905 " movw %%bx, 352(%%edi)\n\t"
906
907 " movsw\n\t"
908
909 ".L_bye:\n\t"
910 " femms\n\t"
911 :
912 :"m"(a),"m"(b),"m"(c),"m"(tmp[0])
913 :"memory","%ebx","%esi","%edi");
914 }