736
|
1 ///
|
|
2 /// Replacement of dct36() with AMD's 3DNowEx(DSP)! SIMD operations support
|
|
3 ///
|
|
4 /// This code based 'dct36_3dnow.s' by Syuuhei Kashiyama
|
|
5 /// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
|
|
6 ///
|
|
7 /// - added new opcode PSWAPD
|
|
8 /// - change function name for support 3DNowEx! automatic detect
|
|
9 ///
|
|
10 /// note: because K7 processors are an aggressive out-of-order three-way
|
|
11 /// superscalar ones instruction order is not significant for them.
|
|
12 ///
|
|
13 /// Modified by Nick Kurshev <nickols_k@mail.ru>
|
|
14 ///
|
|
15 /
|
|
16 / dct36_3dnow.s - 3DNow! optimized dct36()
|
|
17 /
|
|
18 / This code based 'dct36_3dnow.s' by Syuuhei Kashiyama
|
|
19 / <squash@mb.kcom.ne.jp>,only two types of changes have been made:
|
|
20 /
|
|
21 / - remove PREFETCH instruction for speedup
|
|
22 / - change function name for support 3DNow! automatic detect
|
|
23 /
|
|
24 / You can find Kashiyama's original 3dnow! support patch
|
|
25 / (for mpg123-0.59o) at
|
|
26 / http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese).
|
|
27 /
|
|
28 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1999
|
|
29 / <kim@comtec.co.jp> - after 1.Apr.1999
|
|
30 /
|
|
31
|
|
32 ///
|
|
33 /// Replacement of dct36() with AMD's 3DNow! SIMD operations support
|
|
34 ///
|
|
35 /// Syuuhei Kashiyama <squash@mb.kcom.ne.jp>
|
|
36 ///
|
|
37 /// The author of this program disclaim whole expressed or implied
|
|
38 /// warranties with regard to this program, and in no event shall the
|
|
39 /// author of this program liable to whatever resulted from the use of
|
|
40 /// this program. Use it at your own risk.
|
|
41 ///
|
|
42
|
|
/// dct36_3dnowex: straight-line MMX / 3DNow! / extended-3DNow! routine
/// (pfadd, pfsub, pfmul, pfacc, pi2fd, pswapd). 32-bit x86, AT&T syntax.
/// Five pointer arguments are taken from the stack at 8..24(%ebp):
///   arg1 -> %eax : 18 floats, read and rewritten in place (offsets 0..68)
///   arg2 -> %esi : 18 floats, read only, added to the results sent to arg5
///   arg3 -> %ecx : 18 floats, write only (offsets 0..68)
///   arg4 -> %edx : 36 float multipliers, read only (offsets 0..140)
///   arg5 -> %ebx : 18 floats written 128 bytes apart (offsets 0..2176)
/// NOTE(review): presumably these are in / out1 / out2 / window / ts of the
/// C dct36() this replaces - confirm against the caller.
/// COS9 and tfcos36 are float tables defined outside this file view.
/// %esi and %ebx are saved and restored; %eax, %ecx, %edx and %mm0-%mm7 are
/// overwritten. femms is issued on entry and again before returning.
/// Notation used below: in[k] = dword at 4*k(%eax), COS9[k] = dword at
/// COS9+4*k, tfcos36[k] = dword at tfcos36+4*k. Every 64-bit register holds
/// a (low, high) pair of floats, e.g. a movq from 8(%eax) is (in[2], in[3]).
43 .globl dct36_3dnowex
|
|
44 .type dct36_3dnowex,@function
|
|
45 dct36_3dnowex:
|
|
/// Frame setup: 120 bytes reserved, then %esi and %ebx are pushed, so %esp
/// ends up at %ebp-128.
46 pushl %ebp
|
|
47 movl %esp,%ebp
|
|
48 subl $120,%esp
|
|
49 pushl %esi
|
|
50 pushl %ebx
|
|
/// Load the five stack arguments into registers.
51 movl 8(%ebp),%eax
|
|
52 movl 12(%ebp),%esi
|
|
53 movl 16(%ebp),%ecx
|
|
54 movl 20(%ebp),%edx
|
|
55 movl 24(%ebp),%ebx
|
|
/// %esp already equals %ebp-128 after the two pushes; this sets the same value.
56 leal -128(%ebp),%esp
|
|
57
|
|
58 femms
|
|
/// Pass 1 (in place): in[i] = in[i] + in[i-1] for i = 1..17, always using
/// the values from before this pass. Each step loads the next two floats,
/// pairs the carried-over previous element with them and stores two sums.
59 movq (%eax),%mm0
|
|
60 movq 4(%eax),%mm1
|
|
61 pfadd %mm1,%mm0
|
|
62 movq %mm0,4(%eax)
|
|
63 psrlq $32,%mm1
|
|
64 movq 12(%eax),%mm2
|
|
65 punpckldq %mm2,%mm1
|
|
66 pfadd %mm2,%mm1
|
|
67 movq %mm1,12(%eax)
|
|
68 psrlq $32,%mm2
|
|
69 movq 20(%eax),%mm3
|
|
70 punpckldq %mm3,%mm2
|
|
71 pfadd %mm3,%mm2
|
|
72 movq %mm2,20(%eax)
|
|
73 psrlq $32,%mm3
|
|
74 movq 28(%eax),%mm4
|
|
75 punpckldq %mm4,%mm3
|
|
76 pfadd %mm4,%mm3
|
|
77 movq %mm3,28(%eax)
|
|
78 psrlq $32,%mm4
|
|
79 movq 36(%eax),%mm5
|
|
80 punpckldq %mm5,%mm4
|
|
81 pfadd %mm5,%mm4
|
|
82 movq %mm4,36(%eax)
|
|
83 psrlq $32,%mm5
|
|
84 movq 44(%eax),%mm6
|
|
85 punpckldq %mm6,%mm5
|
|
86 pfadd %mm6,%mm5
|
|
87 movq %mm5,44(%eax)
|
|
88 psrlq $32,%mm6
|
|
89 movq 52(%eax),%mm7
|
|
90 punpckldq %mm7,%mm6
|
|
91 pfadd %mm7,%mm6
|
|
92 movq %mm6,52(%eax)
|
|
93 psrlq $32,%mm7
|
|
94 movq 60(%eax),%mm0
|
|
95 punpckldq %mm0,%mm7
|
|
96 pfadd %mm0,%mm7
|
|
97 movq %mm7,60(%eax)
|
|
98 psrlq $32,%mm0
|
|
/// Last element of pass 1: in[17] = in[17] + in[16] (single-dword load/store).
99 movd 68(%eax),%mm1
|
|
100 pfadd %mm1,%mm0
|
|
101 movd %mm0,68(%eax)
|
|
/// Pass 2 (in place, odd elements only): in[i] = in[i] + in[i-2] for
/// i = 3,5,...,17, using the values left by pass 1. Two results per step.
102 movd 4(%eax),%mm0
|
|
103 movd 12(%eax),%mm1
|
|
104 punpckldq %mm1,%mm0
|
|
105 punpckldq 20(%eax),%mm1
|
|
106 pfadd %mm1,%mm0
|
|
107 movd %mm0,12(%eax)
|
|
108 psrlq $32,%mm0
|
|
109 movd %mm0,20(%eax)
|
|
110 psrlq $32,%mm1
|
|
111 movd 28(%eax),%mm2
|
|
112 punpckldq %mm2,%mm1
|
|
113 punpckldq 36(%eax),%mm2
|
|
114 pfadd %mm2,%mm1
|
|
115 movd %mm1,28(%eax)
|
|
116 psrlq $32,%mm1
|
|
117 movd %mm1,36(%eax)
|
|
118 psrlq $32,%mm2
|
|
119 movd 44(%eax),%mm3
|
|
120 punpckldq %mm3,%mm2
|
|
121 punpckldq 52(%eax),%mm3
|
|
122 pfadd %mm3,%mm2
|
|
123 movd %mm2,44(%eax)
|
|
124 psrlq $32,%mm2
|
|
125 movd %mm2,52(%eax)
|
|
126 psrlq $32,%mm3
|
|
127 movd 60(%eax),%mm4
|
|
128 punpckldq %mm4,%mm3
|
|
129 punpckldq 68(%eax),%mm4
|
|
130 pfadd %mm4,%mm3
|
|
131 movd %mm3,60(%eax)
|
|
132 psrlq $32,%mm3
|
|
133 movd %mm3,68(%eax)
|
|
/// Two products that stay live until the end of the routine:
///   mm0 = (in[6], in[7])   * COS9[3]
///   mm1 = (in[12], in[13]) * COS9[6]
134 movq 24(%eax),%mm0
|
|
135 movq 48(%eax),%mm1
|
|
136 movd COS9+12,%mm2
|
|
137 punpckldq %mm2,%mm2
|
|
138 movd COS9+24,%mm3
|
|
139 punpckldq %mm3,%mm3
|
|
140 pfmul %mm2,%mm0
|
|
141 pfmul %mm3,%mm1
|
|
/// mm7 = (1.0f, 0.0f): the integer 1 is converted by pi2fd. %eax is used as
/// scratch for the constant, hence the push/pop around it. mm7 also stays
/// live to the end; it supplies the 1.0 low lane of every (1.0, tfcos36[k])
/// multiplier built below.
142 pushl %eax
|
|
143 movl $1,%eax
|
|
144 movd %eax,%mm7
|
|
145 pi2fd %mm7,%mm7
|
|
146 popl %eax
|
|
/// mm2 = in[2..3]*COS9[1] + mm0 + in[10..11]*COS9[5] + in[14..15]*COS9[7]
147 movq 8(%eax),%mm2
|
|
148 movd COS9+4,%mm3
|
|
149 punpckldq %mm3,%mm3
|
|
150 pfmul %mm3,%mm2
|
|
151 pfadd %mm0,%mm2
|
|
152 movq 40(%eax),%mm3
|
|
153 movd COS9+20,%mm4
|
|
154 punpckldq %mm4,%mm4
|
|
155 pfmul %mm4,%mm3
|
|
156 pfadd %mm3,%mm2
|
|
157 movq 56(%eax),%mm3
|
|
158 movd COS9+28,%mm4
|
|
159 punpckldq %mm4,%mm4
|
|
160 pfmul %mm4,%mm3
|
|
161 pfadd %mm3,%mm2
|
|
/// mm3 = in[0..1] + in[4..5]*COS9[2] + in[8..9]*COS9[4] + mm1
///       + in[16..17]*COS9[8]
162 movq (%eax),%mm3
|
|
163 movq 16(%eax),%mm4
|
|
164 movd COS9+8,%mm5
|
|
165 punpckldq %mm5,%mm5
|
|
166 pfmul %mm5,%mm4
|
|
167 pfadd %mm4,%mm3
|
|
168 movq 32(%eax),%mm4
|
|
169 movd COS9+16,%mm5
|
|
170 punpckldq %mm5,%mm5
|
|
171 pfmul %mm5,%mm4
|
|
172 pfadd %mm4,%mm3
|
|
173 pfadd %mm1,%mm3
|
|
174 movq 64(%eax),%mm4
|
|
175 movd COS9+32,%mm5
|
|
176 punpckldq %mm5,%mm5
|
|
177 pfmul %mm5,%mm4
|
|
178 pfadd %mm4,%mm3
|
|
/// Output step, pattern repeated nine times below with different offsets:
///   mm4 = (a, b) = (mm2 + mm3) * (1.0, tfcos36[0])
///   pfacc puts a+b in both lanes; it is multiplied by the dwords at
///   108/104(%edx). pswapd swaps the lanes so one movq can store the pair:
///   32(%ecx) = (a+b)*[104(%edx)], 36(%ecx) = (a+b)*[108(%edx)].
179 movq %mm2,%mm4
|
|
180 pfadd %mm3,%mm4
|
|
181 movq %mm7,%mm5
|
|
182 punpckldq tfcos36+0,%mm5
|
|
183 pfmul %mm5,%mm4
|
|
184 movq %mm4,%mm5
|
|
185 pfacc %mm5,%mm5
|
|
186 movd 108(%edx),%mm6
|
|
187 punpckldq 104(%edx),%mm6
|
|
188 pfmul %mm6,%mm5
|
|
189 pswapd %mm5, %mm5
|
|
190 movq %mm5, 32(%ecx)
|
|
/// punpckldq / pfsub / punpckhdq leave a-b in both lanes of mm5. Then:
///   1024(%ebx) = [32(%esi)] + (a-b)*[32(%edx)]
///   1152(%ebx) = [36(%esi)] + (a-b)*[36(%edx)]
191 movq %mm4,%mm6
|
|
192 punpckldq %mm6,%mm5
|
|
193 pfsub %mm6,%mm5
|
|
194 punpckhdq %mm5,%mm5
|
|
195 movd 32(%edx),%mm6
|
|
196 punpckldq 36(%edx),%mm6
|
|
197 pfmul %mm6,%mm5
|
|
198 movd 32(%esi),%mm6
|
|
199 punpckldq 36(%esi),%mm6
|
|
200 pfadd %mm6,%mm5
|
|
201 movd %mm5,1024(%ebx)
|
|
202 psrlq $32,%mm5
|
|
203 movd %mm5,1152(%ebx)
|
|
/// Same output step for (a, b) = (mm3 - mm2) * (1.0, tfcos36[8]):
/// a+b goes to 68/0(%ecx) scaled by 140/72(%edx); a-b goes to 0/2176(%ebx)
/// scaled by 0/68(%edx) and added to 0/68(%esi).
204 movq %mm3,%mm4
|
|
205 pfsub %mm2,%mm4
|
|
206 movq %mm7,%mm5
|
|
207 punpckldq tfcos36+32,%mm5
|
|
208 pfmul %mm5,%mm4
|
|
209 movq %mm4,%mm5
|
|
210 pfacc %mm5,%mm5
|
|
211 movd 140(%edx),%mm6
|
|
212 punpckldq 72(%edx),%mm6
|
|
213 pfmul %mm6,%mm5
|
|
214 movd %mm5,68(%ecx)
|
|
215 psrlq $32,%mm5
|
|
216 movd %mm5,0(%ecx)
|
|
217 movq %mm4,%mm6
|
|
218 punpckldq %mm6,%mm5
|
|
219 pfsub %mm6,%mm5
|
|
220 punpckhdq %mm5,%mm5
|
|
221 movd 0(%edx),%mm6
|
|
222 punpckldq 68(%edx),%mm6
|
|
223 pfmul %mm6,%mm5
|
|
224 movd 0(%esi),%mm6
|
|
225 punpckldq 68(%esi),%mm6
|
|
226 pfadd %mm6,%mm5
|
|
227 movd %mm5,0(%ebx)
|
|
228 psrlq $32,%mm5
|
|
229 movd %mm5,2176(%ebx)
|
|
/// mm2 = (in[2..3] - in[10..11] - in[14..15]) * COS9[3]
230 movq 8(%eax),%mm2
|
|
231 movq 40(%eax),%mm3
|
|
232 pfsub %mm3,%mm2
|
|
233 movq 56(%eax),%mm3
|
|
234 pfsub %mm3,%mm2
|
|
235 movd COS9+12,%mm3
|
|
236 punpckldq %mm3,%mm3
|
|
237 pfmul %mm3,%mm2
|
|
/// mm3 = (in[4..5] - in[8..9] - in[16..17]) * COS9[6] - in[12..13] + in[0..1]
238 movq 16(%eax),%mm3
|
|
239 movq 32(%eax),%mm4
|
|
240 pfsub %mm4,%mm3
|
|
241 movq 64(%eax),%mm4
|
|
242 pfsub %mm4,%mm3
|
|
243 movd COS9+24,%mm4
|
|
244 punpckldq %mm4,%mm4
|
|
245 pfmul %mm4,%mm3
|
|
246 movq 48(%eax),%mm4
|
|
247 pfsub %mm4,%mm3
|
|
248 movq (%eax),%mm4
|
|
249 pfadd %mm4,%mm3
|
|
/// Output step for (mm2 + mm3) * (1.0, tfcos36[1]):
/// a+b -> 40/28(%ecx) scaled by 112/100(%edx);
/// a-b -> 896/1280(%ebx) scaled by 28/40(%edx), plus 28/40(%esi).
250 movq %mm2,%mm4
|
|
251 pfadd %mm3,%mm4
|
|
252 movq %mm7,%mm5
|
|
253 punpckldq tfcos36+4,%mm5
|
|
254 pfmul %mm5,%mm4
|
|
255 movq %mm4,%mm5
|
|
256 pfacc %mm5,%mm5
|
|
257 movd 112(%edx),%mm6
|
|
258 punpckldq 100(%edx),%mm6
|
|
259 pfmul %mm6,%mm5
|
|
260 movd %mm5,40(%ecx)
|
|
261 psrlq $32,%mm5
|
|
262 movd %mm5,28(%ecx)
|
|
263 movq %mm4,%mm6
|
|
264 punpckldq %mm6,%mm5
|
|
265 pfsub %mm6,%mm5
|
|
266 punpckhdq %mm5,%mm5
|
|
267 movd 28(%edx),%mm6
|
|
268 punpckldq 40(%edx),%mm6
|
|
269 pfmul %mm6,%mm5
|
|
270 movd 28(%esi),%mm6
|
|
271 punpckldq 40(%esi),%mm6
|
|
272 pfadd %mm6,%mm5
|
|
273 movd %mm5,896(%ebx)
|
|
274 psrlq $32,%mm5
|
|
275 movd %mm5,1280(%ebx)
|
|
/// Output step for (mm3 - mm2) * (1.0, tfcos36[7]):
/// a+b -> 64/4(%ecx) scaled by 136/76(%edx);
/// a-b -> 128/2048(%ebx) scaled by 4/64(%edx), plus 4/64(%esi).
276 movq %mm3,%mm4
|
|
277 pfsub %mm2,%mm4
|
|
278 movq %mm7,%mm5
|
|
279 punpckldq tfcos36+28,%mm5
|
|
280 pfmul %mm5,%mm4
|
|
281 movq %mm4,%mm5
|
|
282 pfacc %mm5,%mm5
|
|
283 movd 136(%edx),%mm6
|
|
284 punpckldq 76(%edx),%mm6
|
|
285 pfmul %mm6,%mm5
|
|
286 movd %mm5,64(%ecx)
|
|
287 psrlq $32,%mm5
|
|
288 movd %mm5,4(%ecx)
|
|
289 movq %mm4,%mm6
|
|
290 punpckldq %mm6,%mm5
|
|
291 pfsub %mm6,%mm5
|
|
292 punpckhdq %mm5,%mm5
|
|
293 movd 4(%edx),%mm6
|
|
294 punpckldq 64(%edx),%mm6
|
|
295 pfmul %mm6,%mm5
|
|
296 movd 4(%esi),%mm6
|
|
297 punpckldq 64(%esi),%mm6
|
|
298 pfadd %mm6,%mm5
|
|
299 movd %mm5,128(%ebx)
|
|
300 psrlq $32,%mm5
|
|
301 movd %mm5,2048(%ebx)
|
|
302
|
|
/// mm2 = in[2..3]*COS9[5] - mm0 - in[10..11]*COS9[7] + in[14..15]*COS9[1]
303 movq 8(%eax),%mm2
|
|
304 movd COS9+20,%mm3
|
|
305 punpckldq %mm3,%mm3
|
|
306 pfmul %mm3,%mm2
|
|
307 pfsub %mm0,%mm2
|
|
308 movq 40(%eax),%mm3
|
|
309 movd COS9+28,%mm4
|
|
310 punpckldq %mm4,%mm4
|
|
311 pfmul %mm4,%mm3
|
|
312 pfsub %mm3,%mm2
|
|
313 movq 56(%eax),%mm3
|
|
314 movd COS9+4,%mm4
|
|
315 punpckldq %mm4,%mm4
|
|
316 pfmul %mm4,%mm3
|
|
317 pfadd %mm3,%mm2
|
|
/// mm3 = in[0..1] - in[4..5]*COS9[8] - in[8..9]*COS9[2] + mm1
///       + in[16..17]*COS9[4]
318 movq (%eax),%mm3
|
|
319 movq 16(%eax),%mm4
|
|
320 movd COS9+32,%mm5
|
|
321 punpckldq %mm5,%mm5
|
|
322 pfmul %mm5,%mm4
|
|
323 pfsub %mm4,%mm3
|
|
324 movq 32(%eax),%mm4
|
|
325 movd COS9+8,%mm5
|
|
326 punpckldq %mm5,%mm5
|
|
327 pfmul %mm5,%mm4
|
|
328 pfsub %mm4,%mm3
|
|
329 pfadd %mm1,%mm3
|
|
330 movq 64(%eax),%mm4
|
|
331 movd COS9+16,%mm5
|
|
332 punpckldq %mm5,%mm5
|
|
333 pfmul %mm5,%mm4
|
|
334 pfadd %mm4,%mm3
|
|
/// Output step for (mm2 + mm3) * (1.0, tfcos36[2]):
/// a+b -> 44/24(%ecx) scaled by 116/96(%edx);
/// a-b -> 768/1408(%ebx) scaled by 24/44(%edx), plus 24/44(%esi).
335 movq %mm2,%mm4
|
|
336 pfadd %mm3,%mm4
|
|
337 movq %mm7,%mm5
|
|
338 punpckldq tfcos36+8,%mm5
|
|
339 pfmul %mm5,%mm4
|
|
340 movq %mm4,%mm5
|
|
341 pfacc %mm5,%mm5
|
|
342 movd 116(%edx),%mm6
|
|
343 punpckldq 96(%edx),%mm6
|
|
344 pfmul %mm6,%mm5
|
|
345 movd %mm5,44(%ecx)
|
|
346 psrlq $32,%mm5
|
|
347 movd %mm5,24(%ecx)
|
|
348 movq %mm4,%mm6
|
|
349 punpckldq %mm6,%mm5
|
|
350 pfsub %mm6,%mm5
|
|
351 punpckhdq %mm5,%mm5
|
|
352 movd 24(%edx),%mm6
|
|
353 punpckldq 44(%edx),%mm6
|
|
354 pfmul %mm6,%mm5
|
|
355 movd 24(%esi),%mm6
|
|
356 punpckldq 44(%esi),%mm6
|
|
357 pfadd %mm6,%mm5
|
|
358 movd %mm5,768(%ebx)
|
|
359 psrlq $32,%mm5
|
|
360 movd %mm5,1408(%ebx)
|
|
/// Output step for (mm3 - mm2) * (1.0, tfcos36[6]):
/// a+b -> 60/8(%ecx) scaled by 132/80(%edx);
/// a-b -> 256/1920(%ebx) scaled by 8/60(%edx), plus 8/60(%esi).
361 movq %mm3,%mm4
|
|
362 pfsub %mm2,%mm4
|
|
363 movq %mm7,%mm5
|
|
364 punpckldq tfcos36+24,%mm5
|
|
365 pfmul %mm5,%mm4
|
|
366 movq %mm4,%mm5
|
|
367 pfacc %mm5,%mm5
|
|
368 movd 132(%edx),%mm6
|
|
369 punpckldq 80(%edx),%mm6
|
|
370 pfmul %mm6,%mm5
|
|
371 movd %mm5,60(%ecx)
|
|
372 psrlq $32,%mm5
|
|
373 movd %mm5,8(%ecx)
|
|
374 movq %mm4,%mm6
|
|
375 punpckldq %mm6,%mm5
|
|
376 pfsub %mm6,%mm5
|
|
377 punpckhdq %mm5,%mm5
|
|
378 movd 8(%edx),%mm6
|
|
379 punpckldq 60(%edx),%mm6
|
|
380 pfmul %mm6,%mm5
|
|
381 movd 8(%esi),%mm6
|
|
382 punpckldq 60(%esi),%mm6
|
|
383 pfadd %mm6,%mm5
|
|
384 movd %mm5,256(%ebx)
|
|
385 psrlq $32,%mm5
|
|
386 movd %mm5,1920(%ebx)
|
|
/// mm2 = in[2..3]*COS9[7] - mm0 + in[10..11]*COS9[1] - in[14..15]*COS9[5]
387 movq 8(%eax),%mm2
|
|
388 movd COS9+28,%mm3
|
|
389 punpckldq %mm3,%mm3
|
|
390 pfmul %mm3,%mm2
|
|
391 pfsub %mm0,%mm2
|
|
392 movq 40(%eax),%mm3
|
|
393 movd COS9+4,%mm4
|
|
394 punpckldq %mm4,%mm4
|
|
395 pfmul %mm4,%mm3
|
|
396 pfadd %mm3,%mm2
|
|
397 movq 56(%eax),%mm3
|
|
398 movd COS9+20,%mm4
|
|
399 punpckldq %mm4,%mm4
|
|
400 pfmul %mm4,%mm3
|
|
401 pfsub %mm3,%mm2
|
|
/// mm3 = in[0..1] - in[4..5]*COS9[4] + in[8..9]*COS9[8] + mm1
///       - in[16..17]*COS9[2]
402 movq (%eax),%mm3
|
|
403 movq 16(%eax),%mm4
|
|
404 movd COS9+16,%mm5
|
|
405 punpckldq %mm5,%mm5
|
|
406 pfmul %mm5,%mm4
|
|
407 pfsub %mm4,%mm3
|
|
408 movq 32(%eax),%mm4
|
|
409 movd COS9+32,%mm5
|
|
410 punpckldq %mm5,%mm5
|
|
411 pfmul %mm5,%mm4
|
|
412 pfadd %mm4,%mm3
|
|
413 pfadd %mm1,%mm3
|
|
414 movq 64(%eax),%mm4
|
|
415 movd COS9+8,%mm5
|
|
416 punpckldq %mm5,%mm5
|
|
417 pfmul %mm5,%mm4
|
|
418 pfsub %mm4,%mm3
|
|
/// Output step for (mm2 + mm3) * (1.0, tfcos36[3]):
/// a+b -> 48/20(%ecx) scaled by 120/92(%edx);
/// a-b -> 640/1536(%ebx) scaled by 20/48(%edx), plus 20/48(%esi).
419 movq %mm2,%mm4
|
|
420 pfadd %mm3,%mm4
|
|
421 movq %mm7,%mm5
|
|
422 punpckldq tfcos36+12,%mm5
|
|
423 pfmul %mm5,%mm4
|
|
424 movq %mm4,%mm5
|
|
425 pfacc %mm5,%mm5
|
|
426 movd 120(%edx),%mm6
|
|
427 punpckldq 92(%edx),%mm6
|
|
428 pfmul %mm6,%mm5
|
|
429 movd %mm5,48(%ecx)
|
|
430 psrlq $32,%mm5
|
|
431 movd %mm5,20(%ecx)
|
|
432 movq %mm4,%mm6
|
|
433 punpckldq %mm6,%mm5
|
|
434 pfsub %mm6,%mm5
|
|
435 punpckhdq %mm5,%mm5
|
|
436 movd 20(%edx),%mm6
|
|
437 punpckldq 48(%edx),%mm6
|
|
438 pfmul %mm6,%mm5
|
|
439 movd 20(%esi),%mm6
|
|
440 punpckldq 48(%esi),%mm6
|
|
441 pfadd %mm6,%mm5
|
|
442 movd %mm5,640(%ebx)
|
|
443 psrlq $32,%mm5
|
|
444 movd %mm5,1536(%ebx)
|
|
/// Output step for (mm3 - mm2) * (1.0, tfcos36[5]):
/// a+b -> 56/12(%ecx) scaled by 128/84(%edx);
/// a-b -> 384/1792(%ebx) scaled by 12/56(%edx), plus 12/56(%esi).
445 movq %mm3,%mm4
|
|
446 pfsub %mm2,%mm4
|
|
447 movq %mm7,%mm5
|
|
448 punpckldq tfcos36+20,%mm5
|
|
449 pfmul %mm5,%mm4
|
|
450 movq %mm4,%mm5
|
|
451 pfacc %mm5,%mm5
|
|
452 movd 128(%edx),%mm6
|
|
453 punpckldq 84(%edx),%mm6
|
|
454 pfmul %mm6,%mm5
|
|
455 movd %mm5,56(%ecx)
|
|
456 psrlq $32,%mm5
|
|
457 movd %mm5,12(%ecx)
|
|
458 movq %mm4,%mm6
|
|
459 punpckldq %mm6,%mm5
|
|
460 pfsub %mm6,%mm5
|
|
461 punpckhdq %mm5,%mm5
|
|
462 movd 12(%edx),%mm6
|
|
463 punpckldq 56(%edx),%mm6
|
|
464 pfmul %mm6,%mm5
|
|
465 movd 12(%esi),%mm6
|
|
466 punpckldq 56(%esi),%mm6
|
|
467 pfadd %mm6,%mm5
|
|
468 movd %mm5,384(%ebx)
|
|
469 psrlq $32,%mm5
|
|
470 movd %mm5,1792(%ebx)
|
|
471
|
|
/// Final (ninth) term, no COS9 factors:
/// mm4 = in[0..1] - in[4..5] + in[8..9] - in[12..13] + in[16..17]
472 movq (%eax),%mm4
|
|
473 movq 16(%eax),%mm3
|
|
474 pfsub %mm3,%mm4
|
|
475 movq 32(%eax),%mm3
|
|
476 pfadd %mm3,%mm4
|
|
477 movq 48(%eax),%mm3
|
|
478 pfsub %mm3,%mm4
|
|
479 movq 64(%eax),%mm3
|
|
480 pfadd %mm3,%mm4
|
|
/// Output step for mm4 * (1.0, tfcos36[4]):
/// a+b -> 52/16(%ecx) scaled by 124/88(%edx);
/// a-b -> 512/1664(%ebx) scaled by 16/52(%edx), plus 16/52(%esi).
481 movq %mm7,%mm5
|
|
482 punpckldq tfcos36+16,%mm5
|
|
483 pfmul %mm5,%mm4
|
|
484 movq %mm4,%mm5
|
|
485 pfacc %mm5,%mm5
|
|
486 movd 124(%edx),%mm6
|
|
487 punpckldq 88(%edx),%mm6
|
|
488 pfmul %mm6,%mm5
|
|
489 movd %mm5,52(%ecx)
|
|
490 psrlq $32,%mm5
|
|
491 movd %mm5,16(%ecx)
|
|
492 movq %mm4,%mm6
|
|
493 punpckldq %mm6,%mm5
|
|
494 pfsub %mm6,%mm5
|
|
495 punpckhdq %mm5,%mm5
|
|
496 movd 16(%edx),%mm6
|
|
497 punpckldq 52(%edx),%mm6
|
|
498 pfmul %mm6,%mm5
|
|
499 movd 16(%esi),%mm6
|
|
500 punpckldq 52(%esi),%mm6
|
|
501 pfadd %mm6,%mm5
|
|
502 movd %mm5,512(%ebx)
|
|
503 psrlq $32,%mm5
|
|
504 movd %mm5,1664(%ebx)
|
|
505
|
|
/// Epilogue: leave MMX state with femms, restore %ebx and %esi in reverse
/// push order, then drop the frame and return.
506 femms
|
|
507 popl %ebx
|
|
508 popl %esi
|
|
509 movl %ebp,%esp
|
|
510 popl %ebp
|
|
511 ret
|