/
/ dct36_3dnow.s - 3DNow! optimized dct36()
/
/ This code is based on 'dct36_3dnow.s' by Syuuhei Kashiyama
/ <squash@mb.kcom.ne.jp>; only two types of changes have been made:
/
/ - remove PREFETCH instruction for speedup
/ - change function name to support 3DNow! automatic detection
/
/ You can find Kashiyama's original 3dnow! support patch
/ (for mpg123-0.59o) at
/ http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese).
/
/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1999
/                    <kim@comtec.co.jp>               - after  1.Apr.1999
/

///
/// Replacement of dct36() with AMD's 3DNow! SIMD operations support
///
/// Syuuhei Kashiyama <squash@mb.kcom.ne.jp>
///
/// The author of this program disclaim whole expressed or implied
/// warranties with regard to this program, and in no event shall the
/// author of this program liable to whatever resulted from the use of
/// this program. Use it at your own risk.
///

/ ---------------------------------------------------------------------
/ dct36_3dnow - 3DNow! implementation of the 36-point DCT step.
/
/ Syntax : GNU as, AT&T operand order, 32-bit x86 + MMX/3DNow!.
/ ABI    : five 32-bit stack arguments read at 8..24(%ebp).
/          %esi and %ebx are saved and restored; %eax, %ecx, %edx and
/          mm0-mm7 are overwritten.  FEMMS is issued on entry and exit.
/
/ Arguments (names below are local to these comments; they presumably
/ map to dct36(inbuf, o1, o2, wintab, tsbuf) of the C version -
/ TODO confirm against the caller):
/   arg1  8(%ebp) -> %eax  in   : 18 floats, read AND modified in place
/   arg2 12(%ebp) -> %esi  prev : 18 floats, read only (added to results)
/   arg3 16(%ebp) -> %ecx  out  : 18 floats, written
/   arg4 20(%ebp) -> %edx  win  : 36 floats, read only (multipliers)
/   arg5 24(%ebp) -> %ebx  ts   : 18 floats written with a stride of
/                                 128 bytes (32 floats)
/
/ External data (defined outside this file, addressed absolutely, so
/ the code is not position independent): COS9[], tfcos36[].
/
/ Register conventions inside the body - each MMX register holds a
/ pair written (low dword, high dword):
/   mm0 = COS9[3] * (in[6],  in[7])      kept for the whole routine
/   mm1 = COS9[6] * (in[12], in[13])     kept for the whole routine
/   mm7 = (1.0, 0.0)                     kept for the whole routine
/   mm2 / mm3 = the two partial sums of the current stage
/   mm4, mm5, mm6 = scratch
/
/ "Output step v" (used nine times, v = 0..8), with mm4 = (s0, s1):
/   (s0, s1) *= (1.0, tfcos36[v])
/   t = s0 + s1                          d = s0 - s1
/   out[9+v]      = t * win[27+v]        out[8-v]      = t * win[26-v]
/   ts[32*(8-v)]  = prev[8-v] + d * win[8-v]
/   ts[32*(9+v)]  = prev[9+v] + d * win[9+v]
/ ---------------------------------------------------------------------
.globl dct36_3dnow
        .type    dct36_3dnow,@function
dct36_3dnow:
/ Prologue.  The 120 bytes reserved here are never referenced below;
/ after the two pushes %esp already equals %ebp-128, so the leal only
/ restates that value.
        pushl    %ebp
        movl     %esp,%ebp
        subl     $120,%esp
        pushl    %esi
        pushl    %ebx
        movl     8(%ebp),%eax
        movl     12(%ebp),%esi
        movl     16(%ebp),%ecx
        movl     20(%ebp),%edx
        movl     24(%ebp),%ebx
        leal     -128(%ebp),%esp

        femms

/ Stage 1: in[i] += in[i-1] for i = 1..17, every sum using the values
/ in[] held on entry (each pair is loaded before the overlapping
/ store is made).
        movq     (%eax),%mm0
        movq     4(%eax),%mm1
        pfadd    %mm1,%mm0
        movq     %mm0,4(%eax)
        psrlq    $32,%mm1
        movq     12(%eax),%mm2
        punpckldq %mm2,%mm1
        pfadd    %mm2,%mm1
        movq     %mm1,12(%eax)
        psrlq    $32,%mm2
        movq     20(%eax),%mm3
        punpckldq %mm3,%mm2
        pfadd    %mm3,%mm2
        movq     %mm2,20(%eax)
        psrlq    $32,%mm3
        movq     28(%eax),%mm4
        punpckldq %mm4,%mm3
        pfadd    %mm4,%mm3
        movq     %mm3,28(%eax)
        psrlq    $32,%mm4
        movq     36(%eax),%mm5
        punpckldq %mm5,%mm4
        pfadd    %mm5,%mm4
        movq     %mm4,36(%eax)
        psrlq    $32,%mm5
        movq     44(%eax),%mm6
        punpckldq %mm6,%mm5
        pfadd    %mm6,%mm5
        movq     %mm5,44(%eax)
        psrlq    $32,%mm6
        movq     52(%eax),%mm7
        punpckldq %mm7,%mm6
        pfadd    %mm7,%mm6
        movq     %mm6,52(%eax)
        psrlq    $32,%mm7
        movq     60(%eax),%mm0
        punpckldq %mm0,%mm7
        pfadd    %mm0,%mm7
        movq     %mm7,60(%eax)
        psrlq    $32,%mm0
        movd     68(%eax),%mm1
        pfadd    %mm1,%mm0
        movd     %mm0,68(%eax)

/ Stage 2: in[i] += in[i-2] for odd i = 3..17, again using the values
/ the odd elements had before this stage started.
        movd     4(%eax),%mm0
        movd     12(%eax),%mm1
        punpckldq %mm1,%mm0
        punpckldq 20(%eax),%mm1
        pfadd    %mm1,%mm0
        movd     %mm0,12(%eax)
        psrlq    $32,%mm0
        movd     %mm0,20(%eax)
        psrlq    $32,%mm1
        movd     28(%eax),%mm2
        punpckldq %mm2,%mm1
        punpckldq 36(%eax),%mm2
        pfadd    %mm2,%mm1
        movd     %mm1,28(%eax)
        psrlq    $32,%mm1
        movd     %mm1,36(%eax)
        psrlq    $32,%mm2
        movd     44(%eax),%mm3
        punpckldq %mm3,%mm2
        punpckldq 52(%eax),%mm3
        pfadd    %mm3,%mm2
        movd     %mm2,44(%eax)
        psrlq    $32,%mm2
        movd     %mm2,52(%eax)
        psrlq    $32,%mm3
        movd     60(%eax),%mm4
        punpckldq %mm4,%mm3
        punpckldq 68(%eax),%mm4
        pfadd    %mm4,%mm3
        movd     %mm3,60(%eax)
        psrlq    $32,%mm3
        movd     %mm3,68(%eax)

/ Constants kept live until the end:
/   mm0 = COS9[3] * (in[6], in[7]),  mm1 = COS9[6] * (in[12], in[13]).
        movq     24(%eax),%mm0
        movq     48(%eax),%mm1
        movd     COS9+12,%mm2
        punpckldq %mm2,%mm2
        movd     COS9+24,%mm3
        punpckldq %mm3,%mm3
        pfmul    %mm2,%mm0
        pfmul    %mm3,%mm1
/ mm7 = (1.0, 0.0): integer 1 converted by pi2fd; movd clears the high
/ dword.  %eax is borrowed for the immediate and restored right away.
        pushl    %eax
        movl     $1,%eax
        movd     %eax,%mm7
        pi2fd    %mm7,%mm7
        popl     %eax

/ Stage A sums, with I(k) = (in[2k], in[2k+1]) and c = COS9:
/   mm2 = I(1)*c[1] + mm0 + I(5)*c[5] + I(7)*c[7]
/   mm3 = I(0) + I(2)*c[2] + I(4)*c[4] + mm1 + I(8)*c[8]
        movq     8(%eax),%mm2
        movd     COS9+4,%mm3
        punpckldq %mm3,%mm3
        pfmul    %mm3,%mm2
        pfadd    %mm0,%mm2
        movq     40(%eax),%mm3
        movd     COS9+20,%mm4
        punpckldq %mm4,%mm4
        pfmul    %mm4,%mm3
        pfadd    %mm3,%mm2
        movq     56(%eax),%mm3
        movd     COS9+28,%mm4
        punpckldq %mm4,%mm4
        pfmul    %mm4,%mm3
        pfadd    %mm3,%mm2
        movq     (%eax),%mm3
        movq     16(%eax),%mm4
        movd     COS9+8,%mm5
        punpckldq %mm5,%mm5
        pfmul    %mm5,%mm4
        pfadd    %mm4,%mm3
        movq     32(%eax),%mm4
        movd     COS9+16,%mm5
        punpckldq %mm5,%mm5
        pfmul    %mm5,%mm4
        pfadd    %mm4,%mm3
        pfadd    %mm1,%mm3
        movq     64(%eax),%mm4
        movd     COS9+32,%mm5
        punpckldq %mm5,%mm5
        pfmul    %mm5,%mm4
        pfadd    %mm4,%mm3

/ Output step v = 0 on mm4 = mm2 + mm3.
        movq     %mm2,%mm4
        pfadd    %mm3,%mm4
        movq     %mm7,%mm5
        punpckldq tfcos36+0,%mm5
        pfmul    %mm5,%mm4
        movq     %mm4,%mm5
        pfacc    %mm5,%mm5
        movd     108(%edx),%mm6
        punpckldq 104(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     %mm5,36(%ecx)
        psrlq    $32,%mm5
        movd     %mm5,32(%ecx)
/ The high dword after pfsub is s0 - s1; punpckhdq broadcasts it.
        movq     %mm4,%mm6
        punpckldq %mm6,%mm5
        pfsub    %mm6,%mm5
        punpckhdq %mm5,%mm5
        movd     32(%edx),%mm6
        punpckldq 36(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     32(%esi),%mm6
        punpckldq 36(%esi),%mm6
        pfadd    %mm6,%mm5
        movd     %mm5,1024(%ebx)
        psrlq    $32,%mm5
        movd     %mm5,1152(%ebx)

/ Output step v = 8 on mm4 = mm3 - mm2.
        movq     %mm3,%mm4
        pfsub    %mm2,%mm4
        movq     %mm7,%mm5
        punpckldq tfcos36+32,%mm5
        pfmul    %mm5,%mm4
        movq     %mm4,%mm5
        pfacc    %mm5,%mm5
        movd     140(%edx),%mm6
        punpckldq 72(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     %mm5,68(%ecx)
        psrlq    $32,%mm5
        movd     %mm5,0(%ecx)
        movq     %mm4,%mm6
        punpckldq %mm6,%mm5
        pfsub    %mm6,%mm5
        punpckhdq %mm5,%mm5
        movd     0(%edx),%mm6
        punpckldq 68(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     0(%esi),%mm6
        punpckldq 68(%esi),%mm6
        pfadd    %mm6,%mm5
        movd     %mm5,0(%ebx)
        psrlq    $32,%mm5
        movd     %mm5,2176(%ebx)

/ Stage B sums:
/   mm2 = (I(1) - I(5) - I(7)) * c[3]
/   mm3 = (I(2) - I(4) - I(8)) * c[6] - I(6) + I(0)
        movq     8(%eax),%mm2
        movq     40(%eax),%mm3
        pfsub    %mm3,%mm2
        movq     56(%eax),%mm3
        pfsub    %mm3,%mm2
        movd     COS9+12,%mm3
        punpckldq %mm3,%mm3
        pfmul    %mm3,%mm2
        movq     16(%eax),%mm3
        movq     32(%eax),%mm4
        pfsub    %mm4,%mm3
        movq     64(%eax),%mm4
        pfsub    %mm4,%mm3
        movd     COS9+24,%mm4
        punpckldq %mm4,%mm4
        pfmul    %mm4,%mm3
        movq     48(%eax),%mm4
        pfsub    %mm4,%mm3
        movq     (%eax),%mm4
        pfadd    %mm4,%mm3

/ Output step v = 1 on mm4 = mm2 + mm3.
        movq     %mm2,%mm4
        pfadd    %mm3,%mm4
        movq     %mm7,%mm5
        punpckldq tfcos36+4,%mm5
        pfmul    %mm5,%mm4
        movq     %mm4,%mm5
        pfacc    %mm5,%mm5
        movd     112(%edx),%mm6
        punpckldq 100(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     %mm5,40(%ecx)
        psrlq    $32,%mm5
        movd     %mm5,28(%ecx)
        movq     %mm4,%mm6
        punpckldq %mm6,%mm5
        pfsub    %mm6,%mm5
        punpckhdq %mm5,%mm5
        movd     28(%edx),%mm6
        punpckldq 40(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     28(%esi),%mm6
        punpckldq 40(%esi),%mm6
        pfadd    %mm6,%mm5
        movd     %mm5,896(%ebx)
        psrlq    $32,%mm5
        movd     %mm5,1280(%ebx)

/ Output step v = 7 on mm4 = mm3 - mm2.
        movq     %mm3,%mm4
        pfsub    %mm2,%mm4
        movq     %mm7,%mm5
        punpckldq tfcos36+28,%mm5
        pfmul    %mm5,%mm4
        movq     %mm4,%mm5
        pfacc    %mm5,%mm5
        movd     136(%edx),%mm6
        punpckldq 76(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     %mm5,64(%ecx)
        psrlq    $32,%mm5
        movd     %mm5,4(%ecx)
        movq     %mm4,%mm6
        punpckldq %mm6,%mm5
        pfsub    %mm6,%mm5
        punpckhdq %mm5,%mm5
        movd     4(%edx),%mm6
        punpckldq 64(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     4(%esi),%mm6
        punpckldq 64(%esi),%mm6
        pfadd    %mm6,%mm5
        movd     %mm5,128(%ebx)
        psrlq    $32,%mm5
        movd     %mm5,2048(%ebx)

/ Stage C sums:
/   mm2 = I(1)*c[5] - mm0 - I(5)*c[7] + I(7)*c[1]
/   mm3 = I(0) - I(2)*c[8] - I(4)*c[2] + mm1 + I(8)*c[4]
        movq     8(%eax),%mm2
        movd     COS9+20,%mm3
        punpckldq %mm3,%mm3
        pfmul    %mm3,%mm2
        pfsub    %mm0,%mm2
        movq     40(%eax),%mm3
        movd     COS9+28,%mm4
        punpckldq %mm4,%mm4
        pfmul    %mm4,%mm3
        pfsub    %mm3,%mm2
        movq     56(%eax),%mm3
        movd     COS9+4,%mm4
        punpckldq %mm4,%mm4
        pfmul    %mm4,%mm3
        pfadd    %mm3,%mm2
        movq     (%eax),%mm3
        movq     16(%eax),%mm4
        movd     COS9+32,%mm5
        punpckldq %mm5,%mm5
        pfmul    %mm5,%mm4
        pfsub    %mm4,%mm3
        movq     32(%eax),%mm4
        movd     COS9+8,%mm5
        punpckldq %mm5,%mm5
        pfmul    %mm5,%mm4
        pfsub    %mm4,%mm3
        pfadd    %mm1,%mm3
        movq     64(%eax),%mm4
        movd     COS9+16,%mm5
        punpckldq %mm5,%mm5
        pfmul    %mm5,%mm4
        pfadd    %mm4,%mm3

/ Output step v = 2 on mm4 = mm2 + mm3.
        movq     %mm2,%mm4
        pfadd    %mm3,%mm4
        movq     %mm7,%mm5
        punpckldq tfcos36+8,%mm5
        pfmul    %mm5,%mm4
        movq     %mm4,%mm5
        pfacc    %mm5,%mm5
        movd     116(%edx),%mm6
        punpckldq 96(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     %mm5,44(%ecx)
        psrlq    $32,%mm5
        movd     %mm5,24(%ecx)
        movq     %mm4,%mm6
        punpckldq %mm6,%mm5
        pfsub    %mm6,%mm5
        punpckhdq %mm5,%mm5
        movd     24(%edx),%mm6
        punpckldq 44(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     24(%esi),%mm6
        punpckldq 44(%esi),%mm6
        pfadd    %mm6,%mm5
        movd     %mm5,768(%ebx)
        psrlq    $32,%mm5
        movd     %mm5,1408(%ebx)

/ Output step v = 6 on mm4 = mm3 - mm2.
        movq     %mm3,%mm4
        pfsub    %mm2,%mm4
        movq     %mm7,%mm5
        punpckldq tfcos36+24,%mm5
        pfmul    %mm5,%mm4
        movq     %mm4,%mm5
        pfacc    %mm5,%mm5
        movd     132(%edx),%mm6
        punpckldq 80(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     %mm5,60(%ecx)
        psrlq    $32,%mm5
        movd     %mm5,8(%ecx)
        movq     %mm4,%mm6
        punpckldq %mm6,%mm5
        pfsub    %mm6,%mm5
        punpckhdq %mm5,%mm5
        movd     8(%edx),%mm6
        punpckldq 60(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     8(%esi),%mm6
        punpckldq 60(%esi),%mm6
        pfadd    %mm6,%mm5
        movd     %mm5,256(%ebx)
        psrlq    $32,%mm5
        movd     %mm5,1920(%ebx)

/ Stage D sums:
/   mm2 = I(1)*c[7] - mm0 + I(5)*c[1] - I(7)*c[5]
/   mm3 = I(0) - I(2)*c[4] + I(4)*c[8] + mm1 - I(8)*c[2]
        movq     8(%eax),%mm2
        movd     COS9+28,%mm3
        punpckldq %mm3,%mm3
        pfmul    %mm3,%mm2
        pfsub    %mm0,%mm2
        movq     40(%eax),%mm3
        movd     COS9+4,%mm4
        punpckldq %mm4,%mm4
        pfmul    %mm4,%mm3
        pfadd    %mm3,%mm2
        movq     56(%eax),%mm3
        movd     COS9+20,%mm4
        punpckldq %mm4,%mm4
        pfmul    %mm4,%mm3
        pfsub    %mm3,%mm2
        movq     (%eax),%mm3
        movq     16(%eax),%mm4
        movd     COS9+16,%mm5
        punpckldq %mm5,%mm5
        pfmul    %mm5,%mm4
        pfsub    %mm4,%mm3
        movq     32(%eax),%mm4
        movd     COS9+32,%mm5
        punpckldq %mm5,%mm5
        pfmul    %mm5,%mm4
        pfadd    %mm4,%mm3
        pfadd    %mm1,%mm3
        movq     64(%eax),%mm4
        movd     COS9+8,%mm5
        punpckldq %mm5,%mm5
        pfmul    %mm5,%mm4
        pfsub    %mm4,%mm3

/ Output step v = 3 on mm4 = mm2 + mm3.
        movq     %mm2,%mm4
        pfadd    %mm3,%mm4
        movq     %mm7,%mm5
        punpckldq tfcos36+12,%mm5
        pfmul    %mm5,%mm4
        movq     %mm4,%mm5
        pfacc    %mm5,%mm5
        movd     120(%edx),%mm6
        punpckldq 92(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     %mm5,48(%ecx)
        psrlq    $32,%mm5
        movd     %mm5,20(%ecx)
        movq     %mm4,%mm6
        punpckldq %mm6,%mm5
        pfsub    %mm6,%mm5
        punpckhdq %mm5,%mm5
        movd     20(%edx),%mm6
        punpckldq 48(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     20(%esi),%mm6
        punpckldq 48(%esi),%mm6
        pfadd    %mm6,%mm5
        movd     %mm5,640(%ebx)
        psrlq    $32,%mm5
        movd     %mm5,1536(%ebx)

/ Output step v = 5 on mm4 = mm3 - mm2.
        movq     %mm3,%mm4
        pfsub    %mm2,%mm4
        movq     %mm7,%mm5
        punpckldq tfcos36+20,%mm5
        pfmul    %mm5,%mm4
        movq     %mm4,%mm5
        pfacc    %mm5,%mm5
        movd     128(%edx),%mm6
        punpckldq 84(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     %mm5,56(%ecx)
        psrlq    $32,%mm5
        movd     %mm5,12(%ecx)
        movq     %mm4,%mm6
        punpckldq %mm6,%mm5
        pfsub    %mm6,%mm5
        punpckhdq %mm5,%mm5
        movd     12(%edx),%mm6
        punpckldq 56(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     12(%esi),%mm6
        punpckldq 56(%esi),%mm6
        pfadd    %mm6,%mm5
        movd     %mm5,384(%ebx)
        psrlq    $32,%mm5
        movd     %mm5,1792(%ebx)

/ Stage E: mm4 = I(0) - I(2) + I(4) - I(6) + I(8), then output step
/ v = 4 (the middle pair, so it is done once only).
        movq     (%eax),%mm4
        movq     16(%eax),%mm3
        pfsub    %mm3,%mm4
        movq     32(%eax),%mm3
        pfadd    %mm3,%mm4
        movq     48(%eax),%mm3
        pfsub    %mm3,%mm4
        movq     64(%eax),%mm3
        pfadd    %mm3,%mm4
        movq     %mm7,%mm5
        punpckldq tfcos36+16,%mm5
        pfmul    %mm5,%mm4
        movq     %mm4,%mm5
        pfacc    %mm5,%mm5
        movd     124(%edx),%mm6
        punpckldq 88(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     %mm5,52(%ecx)
        psrlq    $32,%mm5
        movd     %mm5,16(%ecx)
        movq     %mm4,%mm6
        punpckldq %mm6,%mm5
        pfsub    %mm6,%mm5
        punpckhdq %mm5,%mm5
        movd     16(%edx),%mm6
        punpckldq 52(%edx),%mm6
        pfmul    %mm6,%mm5
        movd     16(%esi),%mm6
        punpckldq 52(%esi),%mm6
        pfadd    %mm6,%mm5
        movd     %mm5,512(%ebx)
        psrlq    $32,%mm5
        movd     %mm5,1664(%ebx)

/ Epilogue: leave MMX state, restore the saved registers (%esp is at
/ %ebp-128 here, where %ebx then %esi were pushed) and drop the frame.
        femms
        popl     %ebx
        popl     %esi
        movl     %ebp,%esp
        popl     %ebp
        ret