Mercurial > mplayer.hg
comparison mp3lib/dct36_3dnow.s @ 1:3b5f5d1c5041
Initial revision
author | arpi_esp |
---|---|
date | Sat, 24 Feb 2001 20:28:24 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:c1bb2c071d63 | 1:3b5f5d1c5041 |
---|---|
1 / | |
2 / dct36_3dnow.s - 3DNow! optimized dct36() | |
3 / | |
4 / This code is based on 'dct36_3dnow.s' by Syuuhei Kashiyama | |
5 / <squash@mb.kcom.ne.jp>; only two types of changes have been made: | |
6 / | |
7 / - removed the PREFETCH instruction for speed | |
8 / - changed the function name to support automatic 3DNow! detection | |
9 / | |
10 / You can find Kashiyama's original 3dnow! support patch | |
11 / (for mpg123-0.59o) at | |
12 / http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). | |
13 / | |
14 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1999 | |
15 / <kim@comtec.co.jp> - after 1.Apr.1999 | |
16 / | |
17 | |
18 /// | |
19 /// Replacement of dct36() with AMD's 3DNow! SIMD operations support | |
20 /// | |
21 /// Syuuhei Kashiyama <squash@mb.kcom.ne.jp> | |
22 /// | |
23 /// The author of this program disclaim whole expressed or implied | |
24 /// warranties with regard to this program, and in no event shall the | |
25 /// author of this program liable to whatever resulted from the use of | |
26 /// this program. Use it at your own risk. | |
27 /// | |
28 | |
29 .globl dct36_3dnow | |
30 .type dct36_3dnow,@function | |
31 dct36_3dnow: | |
32 pushl %ebp | |
33 movl %esp,%ebp | |
34 subl $120,%esp | |
35 pushl %esi | |
36 pushl %ebx | |
37 movl 8(%ebp),%eax | |
38 movl 12(%ebp),%esi | |
39 movl 16(%ebp),%ecx | |
40 movl 20(%ebp),%edx | |
41 movl 24(%ebp),%ebx | |
42 leal -128(%ebp),%esp | |
43 | |
44 femms | |
45 movq (%eax),%mm0 | |
46 movq 4(%eax),%mm1 | |
47 pfadd %mm1,%mm0 | |
48 movq %mm0,4(%eax) | |
49 psrlq $32,%mm1 | |
50 movq 12(%eax),%mm2 | |
51 punpckldq %mm2,%mm1 | |
52 pfadd %mm2,%mm1 | |
53 movq %mm1,12(%eax) | |
54 psrlq $32,%mm2 | |
55 movq 20(%eax),%mm3 | |
56 punpckldq %mm3,%mm2 | |
57 pfadd %mm3,%mm2 | |
58 movq %mm2,20(%eax) | |
59 psrlq $32,%mm3 | |
60 movq 28(%eax),%mm4 | |
61 punpckldq %mm4,%mm3 | |
62 pfadd %mm4,%mm3 | |
63 movq %mm3,28(%eax) | |
64 psrlq $32,%mm4 | |
65 movq 36(%eax),%mm5 | |
66 punpckldq %mm5,%mm4 | |
67 pfadd %mm5,%mm4 | |
68 movq %mm4,36(%eax) | |
69 psrlq $32,%mm5 | |
70 movq 44(%eax),%mm6 | |
71 punpckldq %mm6,%mm5 | |
72 pfadd %mm6,%mm5 | |
73 movq %mm5,44(%eax) | |
74 psrlq $32,%mm6 | |
75 movq 52(%eax),%mm7 | |
76 punpckldq %mm7,%mm6 | |
77 pfadd %mm7,%mm6 | |
78 movq %mm6,52(%eax) | |
79 psrlq $32,%mm7 | |
80 movq 60(%eax),%mm0 | |
81 punpckldq %mm0,%mm7 | |
82 pfadd %mm0,%mm7 | |
83 movq %mm7,60(%eax) | |
84 psrlq $32,%mm0 | |
85 movd 68(%eax),%mm1 | |
86 pfadd %mm1,%mm0 | |
87 movd %mm0,68(%eax) | |
88 movd 4(%eax),%mm0 | |
89 movd 12(%eax),%mm1 | |
90 punpckldq %mm1,%mm0 | |
91 punpckldq 20(%eax),%mm1 | |
92 pfadd %mm1,%mm0 | |
93 movd %mm0,12(%eax) | |
94 psrlq $32,%mm0 | |
95 movd %mm0,20(%eax) | |
96 psrlq $32,%mm1 | |
97 movd 28(%eax),%mm2 | |
98 punpckldq %mm2,%mm1 | |
99 punpckldq 36(%eax),%mm2 | |
100 pfadd %mm2,%mm1 | |
101 movd %mm1,28(%eax) | |
102 psrlq $32,%mm1 | |
103 movd %mm1,36(%eax) | |
104 psrlq $32,%mm2 | |
105 movd 44(%eax),%mm3 | |
106 punpckldq %mm3,%mm2 | |
107 punpckldq 52(%eax),%mm3 | |
108 pfadd %mm3,%mm2 | |
109 movd %mm2,44(%eax) | |
110 psrlq $32,%mm2 | |
111 movd %mm2,52(%eax) | |
112 psrlq $32,%mm3 | |
113 movd 60(%eax),%mm4 | |
114 punpckldq %mm4,%mm3 | |
115 punpckldq 68(%eax),%mm4 | |
116 pfadd %mm4,%mm3 | |
117 movd %mm3,60(%eax) | |
118 psrlq $32,%mm3 | |
119 movd %mm3,68(%eax) | |
120 | |
121 movq 24(%eax),%mm0 | |
122 movq 48(%eax),%mm1 | |
123 movd COS9+12,%mm2 | |
124 punpckldq %mm2,%mm2 | |
125 movd COS9+24,%mm3 | |
126 punpckldq %mm3,%mm3 | |
127 pfmul %mm2,%mm0 | |
128 pfmul %mm3,%mm1 | |
129 pushl %eax | |
130 movl $1,%eax | |
131 movd %eax,%mm7 | |
132 pi2fd %mm7,%mm7 | |
133 popl %eax | |
134 movq 8(%eax),%mm2 | |
135 movd COS9+4,%mm3 | |
136 punpckldq %mm3,%mm3 | |
137 pfmul %mm3,%mm2 | |
138 pfadd %mm0,%mm2 | |
139 movq 40(%eax),%mm3 | |
140 movd COS9+20,%mm4 | |
141 punpckldq %mm4,%mm4 | |
142 pfmul %mm4,%mm3 | |
143 pfadd %mm3,%mm2 | |
144 movq 56(%eax),%mm3 | |
145 movd COS9+28,%mm4 | |
146 punpckldq %mm4,%mm4 | |
147 pfmul %mm4,%mm3 | |
148 pfadd %mm3,%mm2 | |
149 movq (%eax),%mm3 | |
150 movq 16(%eax),%mm4 | |
151 movd COS9+8,%mm5 | |
152 punpckldq %mm5,%mm5 | |
153 pfmul %mm5,%mm4 | |
154 pfadd %mm4,%mm3 | |
155 movq 32(%eax),%mm4 | |
156 movd COS9+16,%mm5 | |
157 punpckldq %mm5,%mm5 | |
158 pfmul %mm5,%mm4 | |
159 pfadd %mm4,%mm3 | |
160 pfadd %mm1,%mm3 | |
161 movq 64(%eax),%mm4 | |
162 movd COS9+32,%mm5 | |
163 punpckldq %mm5,%mm5 | |
164 pfmul %mm5,%mm4 | |
165 pfadd %mm4,%mm3 | |
166 movq %mm2,%mm4 | |
167 pfadd %mm3,%mm4 | |
168 movq %mm7,%mm5 | |
169 punpckldq tfcos36+0,%mm5 | |
170 pfmul %mm5,%mm4 | |
171 movq %mm4,%mm5 | |
172 pfacc %mm5,%mm5 | |
173 movd 108(%edx),%mm6 | |
174 punpckldq 104(%edx),%mm6 | |
175 pfmul %mm6,%mm5 | |
176 movd %mm5,36(%ecx) | |
177 psrlq $32,%mm5 | |
178 movd %mm5,32(%ecx) | |
179 movq %mm4,%mm6 | |
180 punpckldq %mm6,%mm5 | |
181 pfsub %mm6,%mm5 | |
182 punpckhdq %mm5,%mm5 | |
183 movd 32(%edx),%mm6 | |
184 punpckldq 36(%edx),%mm6 | |
185 pfmul %mm6,%mm5 | |
186 movd 32(%esi),%mm6 | |
187 punpckldq 36(%esi),%mm6 | |
188 pfadd %mm6,%mm5 | |
189 movd %mm5,1024(%ebx) | |
190 psrlq $32,%mm5 | |
191 movd %mm5,1152(%ebx) | |
192 movq %mm3,%mm4 | |
193 pfsub %mm2,%mm4 | |
194 movq %mm7,%mm5 | |
195 punpckldq tfcos36+32,%mm5 | |
196 pfmul %mm5,%mm4 | |
197 movq %mm4,%mm5 | |
198 pfacc %mm5,%mm5 | |
199 movd 140(%edx),%mm6 | |
200 punpckldq 72(%edx),%mm6 | |
201 pfmul %mm6,%mm5 | |
202 movd %mm5,68(%ecx) | |
203 psrlq $32,%mm5 | |
204 movd %mm5,0(%ecx) | |
205 movq %mm4,%mm6 | |
206 punpckldq %mm6,%mm5 | |
207 pfsub %mm6,%mm5 | |
208 punpckhdq %mm5,%mm5 | |
209 movd 0(%edx),%mm6 | |
210 punpckldq 68(%edx),%mm6 | |
211 pfmul %mm6,%mm5 | |
212 movd 0(%esi),%mm6 | |
213 punpckldq 68(%esi),%mm6 | |
214 pfadd %mm6,%mm5 | |
215 movd %mm5,0(%ebx) | |
216 psrlq $32,%mm5 | |
217 movd %mm5,2176(%ebx) | |
218 movq 8(%eax),%mm2 | |
219 movq 40(%eax),%mm3 | |
220 pfsub %mm3,%mm2 | |
221 movq 56(%eax),%mm3 | |
222 pfsub %mm3,%mm2 | |
223 movd COS9+12,%mm3 | |
224 punpckldq %mm3,%mm3 | |
225 pfmul %mm3,%mm2 | |
226 movq 16(%eax),%mm3 | |
227 movq 32(%eax),%mm4 | |
228 pfsub %mm4,%mm3 | |
229 movq 64(%eax),%mm4 | |
230 pfsub %mm4,%mm3 | |
231 movd COS9+24,%mm4 | |
232 punpckldq %mm4,%mm4 | |
233 pfmul %mm4,%mm3 | |
234 movq 48(%eax),%mm4 | |
235 pfsub %mm4,%mm3 | |
236 movq (%eax),%mm4 | |
237 pfadd %mm4,%mm3 | |
238 movq %mm2,%mm4 | |
239 pfadd %mm3,%mm4 | |
240 movq %mm7,%mm5 | |
241 punpckldq tfcos36+4,%mm5 | |
242 pfmul %mm5,%mm4 | |
243 movq %mm4,%mm5 | |
244 pfacc %mm5,%mm5 | |
245 movd 112(%edx),%mm6 | |
246 punpckldq 100(%edx),%mm6 | |
247 pfmul %mm6,%mm5 | |
248 movd %mm5,40(%ecx) | |
249 psrlq $32,%mm5 | |
250 movd %mm5,28(%ecx) | |
251 movq %mm4,%mm6 | |
252 punpckldq %mm6,%mm5 | |
253 pfsub %mm6,%mm5 | |
254 punpckhdq %mm5,%mm5 | |
255 movd 28(%edx),%mm6 | |
256 punpckldq 40(%edx),%mm6 | |
257 pfmul %mm6,%mm5 | |
258 movd 28(%esi),%mm6 | |
259 punpckldq 40(%esi),%mm6 | |
260 pfadd %mm6,%mm5 | |
261 movd %mm5,896(%ebx) | |
262 psrlq $32,%mm5 | |
263 movd %mm5,1280(%ebx) | |
264 movq %mm3,%mm4 | |
265 pfsub %mm2,%mm4 | |
266 movq %mm7,%mm5 | |
267 punpckldq tfcos36+28,%mm5 | |
268 pfmul %mm5,%mm4 | |
269 movq %mm4,%mm5 | |
270 pfacc %mm5,%mm5 | |
271 movd 136(%edx),%mm6 | |
272 punpckldq 76(%edx),%mm6 | |
273 pfmul %mm6,%mm5 | |
274 movd %mm5,64(%ecx) | |
275 psrlq $32,%mm5 | |
276 movd %mm5,4(%ecx) | |
277 movq %mm4,%mm6 | |
278 punpckldq %mm6,%mm5 | |
279 pfsub %mm6,%mm5 | |
280 punpckhdq %mm5,%mm5 | |
281 movd 4(%edx),%mm6 | |
282 punpckldq 64(%edx),%mm6 | |
283 pfmul %mm6,%mm5 | |
284 movd 4(%esi),%mm6 | |
285 punpckldq 64(%esi),%mm6 | |
286 pfadd %mm6,%mm5 | |
287 movd %mm5,128(%ebx) | |
288 psrlq $32,%mm5 | |
289 movd %mm5,2048(%ebx) | |
290 | |
291 movq 8(%eax),%mm2 | |
292 movd COS9+20,%mm3 | |
293 punpckldq %mm3,%mm3 | |
294 pfmul %mm3,%mm2 | |
295 pfsub %mm0,%mm2 | |
296 movq 40(%eax),%mm3 | |
297 movd COS9+28,%mm4 | |
298 punpckldq %mm4,%mm4 | |
299 pfmul %mm4,%mm3 | |
300 pfsub %mm3,%mm2 | |
301 movq 56(%eax),%mm3 | |
302 movd COS9+4,%mm4 | |
303 punpckldq %mm4,%mm4 | |
304 pfmul %mm4,%mm3 | |
305 pfadd %mm3,%mm2 | |
306 movq (%eax),%mm3 | |
307 movq 16(%eax),%mm4 | |
308 movd COS9+32,%mm5 | |
309 punpckldq %mm5,%mm5 | |
310 pfmul %mm5,%mm4 | |
311 pfsub %mm4,%mm3 | |
312 movq 32(%eax),%mm4 | |
313 movd COS9+8,%mm5 | |
314 punpckldq %mm5,%mm5 | |
315 pfmul %mm5,%mm4 | |
316 pfsub %mm4,%mm3 | |
317 pfadd %mm1,%mm3 | |
318 movq 64(%eax),%mm4 | |
319 movd COS9+16,%mm5 | |
320 punpckldq %mm5,%mm5 | |
321 pfmul %mm5,%mm4 | |
322 pfadd %mm4,%mm3 | |
323 movq %mm2,%mm4 | |
324 pfadd %mm3,%mm4 | |
325 movq %mm7,%mm5 | |
326 punpckldq tfcos36+8,%mm5 | |
327 pfmul %mm5,%mm4 | |
328 movq %mm4,%mm5 | |
329 pfacc %mm5,%mm5 | |
330 movd 116(%edx),%mm6 | |
331 punpckldq 96(%edx),%mm6 | |
332 pfmul %mm6,%mm5 | |
333 movd %mm5,44(%ecx) | |
334 psrlq $32,%mm5 | |
335 movd %mm5,24(%ecx) | |
336 movq %mm4,%mm6 | |
337 punpckldq %mm6,%mm5 | |
338 pfsub %mm6,%mm5 | |
339 punpckhdq %mm5,%mm5 | |
340 movd 24(%edx),%mm6 | |
341 punpckldq 44(%edx),%mm6 | |
342 pfmul %mm6,%mm5 | |
343 movd 24(%esi),%mm6 | |
344 punpckldq 44(%esi),%mm6 | |
345 pfadd %mm6,%mm5 | |
346 movd %mm5,768(%ebx) | |
347 psrlq $32,%mm5 | |
348 movd %mm5,1408(%ebx) | |
349 movq %mm3,%mm4 | |
350 pfsub %mm2,%mm4 | |
351 movq %mm7,%mm5 | |
352 punpckldq tfcos36+24,%mm5 | |
353 pfmul %mm5,%mm4 | |
354 movq %mm4,%mm5 | |
355 pfacc %mm5,%mm5 | |
356 movd 132(%edx),%mm6 | |
357 punpckldq 80(%edx),%mm6 | |
358 pfmul %mm6,%mm5 | |
359 movd %mm5,60(%ecx) | |
360 psrlq $32,%mm5 | |
361 movd %mm5,8(%ecx) | |
362 movq %mm4,%mm6 | |
363 punpckldq %mm6,%mm5 | |
364 pfsub %mm6,%mm5 | |
365 punpckhdq %mm5,%mm5 | |
366 movd 8(%edx),%mm6 | |
367 punpckldq 60(%edx),%mm6 | |
368 pfmul %mm6,%mm5 | |
369 movd 8(%esi),%mm6 | |
370 punpckldq 60(%esi),%mm6 | |
371 pfadd %mm6,%mm5 | |
372 movd %mm5,256(%ebx) | |
373 psrlq $32,%mm5 | |
374 movd %mm5,1920(%ebx) | |
375 movq 8(%eax),%mm2 | |
376 movd COS9+28,%mm3 | |
377 punpckldq %mm3,%mm3 | |
378 pfmul %mm3,%mm2 | |
379 pfsub %mm0,%mm2 | |
380 movq 40(%eax),%mm3 | |
381 movd COS9+4,%mm4 | |
382 punpckldq %mm4,%mm4 | |
383 pfmul %mm4,%mm3 | |
384 pfadd %mm3,%mm2 | |
385 movq 56(%eax),%mm3 | |
386 movd COS9+20,%mm4 | |
387 punpckldq %mm4,%mm4 | |
388 pfmul %mm4,%mm3 | |
389 pfsub %mm3,%mm2 | |
390 movq (%eax),%mm3 | |
391 movq 16(%eax),%mm4 | |
392 movd COS9+16,%mm5 | |
393 punpckldq %mm5,%mm5 | |
394 pfmul %mm5,%mm4 | |
395 pfsub %mm4,%mm3 | |
396 movq 32(%eax),%mm4 | |
397 movd COS9+32,%mm5 | |
398 punpckldq %mm5,%mm5 | |
399 pfmul %mm5,%mm4 | |
400 pfadd %mm4,%mm3 | |
401 pfadd %mm1,%mm3 | |
402 movq 64(%eax),%mm4 | |
403 movd COS9+8,%mm5 | |
404 punpckldq %mm5,%mm5 | |
405 pfmul %mm5,%mm4 | |
406 pfsub %mm4,%mm3 | |
407 movq %mm2,%mm4 | |
408 pfadd %mm3,%mm4 | |
409 movq %mm7,%mm5 | |
410 punpckldq tfcos36+12,%mm5 | |
411 pfmul %mm5,%mm4 | |
412 movq %mm4,%mm5 | |
413 pfacc %mm5,%mm5 | |
414 movd 120(%edx),%mm6 | |
415 punpckldq 92(%edx),%mm6 | |
416 pfmul %mm6,%mm5 | |
417 movd %mm5,48(%ecx) | |
418 psrlq $32,%mm5 | |
419 movd %mm5,20(%ecx) | |
420 movq %mm4,%mm6 | |
421 punpckldq %mm6,%mm5 | |
422 pfsub %mm6,%mm5 | |
423 punpckhdq %mm5,%mm5 | |
424 movd 20(%edx),%mm6 | |
425 punpckldq 48(%edx),%mm6 | |
426 pfmul %mm6,%mm5 | |
427 movd 20(%esi),%mm6 | |
428 punpckldq 48(%esi),%mm6 | |
429 pfadd %mm6,%mm5 | |
430 movd %mm5,640(%ebx) | |
431 psrlq $32,%mm5 | |
432 movd %mm5,1536(%ebx) | |
433 movq %mm3,%mm4 | |
434 pfsub %mm2,%mm4 | |
435 movq %mm7,%mm5 | |
436 punpckldq tfcos36+20,%mm5 | |
437 pfmul %mm5,%mm4 | |
438 movq %mm4,%mm5 | |
439 pfacc %mm5,%mm5 | |
440 movd 128(%edx),%mm6 | |
441 punpckldq 84(%edx),%mm6 | |
442 pfmul %mm6,%mm5 | |
443 movd %mm5,56(%ecx) | |
444 psrlq $32,%mm5 | |
445 movd %mm5,12(%ecx) | |
446 movq %mm4,%mm6 | |
447 punpckldq %mm6,%mm5 | |
448 pfsub %mm6,%mm5 | |
449 punpckhdq %mm5,%mm5 | |
450 movd 12(%edx),%mm6 | |
451 punpckldq 56(%edx),%mm6 | |
452 pfmul %mm6,%mm5 | |
453 movd 12(%esi),%mm6 | |
454 punpckldq 56(%esi),%mm6 | |
455 pfadd %mm6,%mm5 | |
456 movd %mm5,384(%ebx) | |
457 psrlq $32,%mm5 | |
458 movd %mm5,1792(%ebx) | |
459 | |
460 movq (%eax),%mm4 | |
461 movq 16(%eax),%mm3 | |
462 pfsub %mm3,%mm4 | |
463 movq 32(%eax),%mm3 | |
464 pfadd %mm3,%mm4 | |
465 movq 48(%eax),%mm3 | |
466 pfsub %mm3,%mm4 | |
467 movq 64(%eax),%mm3 | |
468 pfadd %mm3,%mm4 | |
469 movq %mm7,%mm5 | |
470 punpckldq tfcos36+16,%mm5 | |
471 pfmul %mm5,%mm4 | |
472 movq %mm4,%mm5 | |
473 pfacc %mm5,%mm5 | |
474 movd 124(%edx),%mm6 | |
475 punpckldq 88(%edx),%mm6 | |
476 pfmul %mm6,%mm5 | |
477 movd %mm5,52(%ecx) | |
478 psrlq $32,%mm5 | |
479 movd %mm5,16(%ecx) | |
480 movq %mm4,%mm6 | |
481 punpckldq %mm6,%mm5 | |
482 pfsub %mm6,%mm5 | |
483 punpckhdq %mm5,%mm5 | |
484 movd 16(%edx),%mm6 | |
485 punpckldq 52(%edx),%mm6 | |
486 pfmul %mm6,%mm5 | |
487 movd 16(%esi),%mm6 | |
488 punpckldq 52(%esi),%mm6 | |
489 pfadd %mm6,%mm5 | |
490 movd %mm5,512(%ebx) | |
491 psrlq $32,%mm5 | |
492 movd %mm5,1664(%ebx) | |
493 | |
494 femms | |
495 popl %ebx | |
496 popl %esi | |
497 movl %ebp,%esp | |
498 popl %ebp | |
499 ret |