comparison mp3lib/decode_3dnow.s @ 1:3b5f5d1c5041

Initial revision
author arpi_esp
date Sat, 24 Feb 2001 20:28:24 +0000
parents
children
comparison
equal deleted inserted replaced
0:c1bb2c071d63 1:3b5f5d1c5041
1 / synth_1to1_3dnow works the same way as the c version of
2 / synth_1to1. this assembler code is based on 'decode-i586.s'
3 / (by Stefan Bieschewski <stb@acm.org>), two types of changes
4 / have been made:
5 / - use {MMX,3DNow!} instructions to reduce cpu load
6 / - remove unused(?) local symbols
7 /
8 / useful sources of information on optimizing 3DNow! code include:
9 / AMD 3DNow! Technology Manual (Publication #21928)
10 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
11 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
12 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
13 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
14 /
15 / This code was tested only on AMD-K6-2 processor Linux systems,
16 / please tell me:
17 / - whether this code works on other 3DNow! capable processors
18 / (ex.IDT-C6-2) or not
19 / - whether this code works on other OSes or not
20 /
21 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
22 / <kim@comtec.co.jp> - after 1.Apr.1998
23
24 / Enhancements for q-word operation by Michael Hipp
25
# Static storage used by synth_1to1_3dnow.
26 .bss
27 .comm buffs,4352,4    # 4352-byte common block; the routine uses it as two 2176-byte halves (buffs and buffs+2176), each addressed at +0 and +1088
28 .data
29 .align 4
30 bo:    # 32-bit state word; synth_1to1_3dnow stores (bo - 1) & 15 here whenever its second stack argument is 0
31 .long 1    # initial value of bo
# synth_1to1_3dnow -- 32-bit x86, AT&T syntax, using MMX/3DNow! instructions.
#
# Stack arguments as this code reads them (offsets valid after the prologue):
#   32(%esp)  first argument; handed unchanged to dct64_3dnow
#   36(%esp)  second argument; 0 selects buffs and advances bo, non-zero
#             selects buffs+2176 and offsets the output pointer by 2 bytes
#   40(%esp)  third argument; base address of the 16-bit output stores
# Writes 32 16-bit values (16 + 1 + 15), one every 4 bytes.
# Returns 0 in %eax (copied from %edi, which is zeroed at entry and not
# written again here; this assumes dct64_3dnow preserves %edi -- TODO confirm).
# External symbols used: buffs, bo, decwin, dct64_3dnow.
32 .text
33 .globl synth_1to1_3dnow
34 synth_1to1_3dnow:
35 subl $12,%esp    # reserve 12 bytes of locals; only the slot at 16(%esp) is used
36 pushl %ebp    # save the four registers restored in the epilogue
37 pushl %edi
38 pushl %esi
39 pushl %ebx
40 movl 32(%esp),%eax    # eax = first argument (forwarded to dct64_3dnow)
41 movl 40(%esp),%esi    # esi = third argument (output pointer)
42 movl $0,%edi    # edi = 0; becomes the return value
43 movl bo,%ebp    # ebp = current bo
44 cmpl %edi,36(%esp)    # second argument == 0 ?
45 jne .L48    # non-zero: keep bo as is, use the second half of buffs
46 decl %ebp    # zero: bo = (bo - 1) & 15 ...
47 andl $15,%ebp
48 movl %ebp,bo    # ... and write it back
49 movl $buffs,%ecx    # ecx = first half of buffs
50 jmp .L49
51 .L48:
52 addl $2,%esi    # start output 2 bytes (one 16-bit slot) later
53 movl $buffs+2176,%ecx    # ecx = second half of buffs
54 .L49:
55 testl $1,%ebp    # branch on the parity of bo
56 je .L50    # even bo -> .L50
57 movl %ecx,%ebx    # odd bo: ebx = half base (operand pointer for the loops below)
58 movl %ebp,16(%esp)    # local = bo (used later to offset the decwin pointer)
59 pushl %eax    # push first argument of this routine (pushed first for dct64_3dnow)
60 movl 20(%esp),%edx    # reload local (same slot; offset grew by 4 after the push)
61 leal (%ebx,%edx,4),%eax    # eax = base + bo*4
62 pushl %eax    # pushed second
63 movl 24(%esp),%eax    # reload local again (offset grew by 8 after two pushes)
64 incl %eax
65 andl $15,%eax    # eax = (bo + 1) & 15
66 leal 1088(,%eax,4),%eax
67 addl %ebx,%eax    # eax = base + 1088 + ((bo + 1) & 15)*4; pushed at .L74
68 jmp .L74
69 .L50:
70 leal 1088(%ecx),%ebx    # even bo: ebx = half base + 1088 (operand pointer for the loops below)
71 leal 1(%ebp),%edx
72 movl %edx,16(%esp)    # local = bo + 1
73 pushl %eax    # push first argument of this routine (pushed first for dct64_3dnow)
74 leal 1092(%ecx,%ebp,4),%eax    # eax = base + 1088 + (bo + 1)*4
75 pushl %eax    # pushed second
76 leal (%ecx,%ebp,4),%eax    # eax = base + bo*4; pushed at .L74
77 .L74:
78 pushl %eax    # pushed last
79 call dct64_3dnow
80 addl $12,%esp    # drop the three pushed arguments
81 movl 16(%esp),%edx    # edx = local saved above (bo or bo + 1)
82 leal 0(,%edx,4),%edx    # scale to a byte offset (4 bytes per element)
83 movl $decwin+64,%eax
84 movl %eax,%ecx
85 subl %edx,%ecx    # ecx = decwin + 64 - local*4
86 movl $16,%ebp    # loop 1 runs 16 times
87
88 .L55:    # loop 1: 16 floats at (%ecx) times 16 floats at (%ebx), two lanes at a time
89 movq (%ecx),%mm4    # mm4 is the two-lane accumulator
90 movq (%ebx),%mm3
91 movq 8(%ecx),%mm0    # loads for the next pair are interleaved with the previous multiply/add
92 movq 8(%ebx),%mm1
93 pfmul %mm3,%mm4    # mm4 = first pair of products
94
95 movq 16(%ecx),%mm2
96 pfmul %mm1,%mm0
97 movq 16(%ebx),%mm3
98 pfadd %mm0,%mm4
99
100 movq 24(%ecx),%mm0
101 pfmul %mm2,%mm3
102 movq 24(%ebx),%mm1
103 pfadd %mm3,%mm4
104
105 movq 32(%ecx),%mm2
106 pfmul %mm1,%mm0
107 movq 32(%ebx),%mm3
108 pfadd %mm0,%mm4
109
110 movq 40(%ecx),%mm0
111 pfmul %mm2,%mm3
112 movq 40(%ebx),%mm1
113 pfadd %mm3,%mm4
114
115 movq 48(%ecx),%mm2
116 pfmul %mm1,%mm0
117 movq 48(%ebx),%mm3
118 pfadd %mm0,%mm4
119
120 movq 56(%ecx),%mm0
121 pfmul %mm2,%mm3
122 movq 56(%ebx),%mm1
123 pfadd %mm3,%mm4
124
125 pfmul %mm1,%mm0
126 pfadd %mm0,%mm4    # mm4 = per-lane sums of all 8 product pairs
127
128 movq %mm4,%mm0
129 psrlq $32,%mm0    # move the high lane down to the low lane of mm0
130 pfsub %mm0,%mm4    # low lane = (sum of even-index products) - (sum of odd-index products)
131
132 pf2id %mm4,%mm4    # convert float lanes to 32-bit integers
133 movd %mm4,%eax    # eax = low lane
134
135 sar $16,%eax    # arithmetic shift right by 16
136 movw %ax,(%esi)    # store the low 16 bits as the output value
137
138 addl $64,%ebx    # advance the operand pointer by 64 bytes
139 subl $-128,%ecx    # advance the decwin pointer by 128 bytes (-128 fits an 8-bit immediate, +128 does not)
140 addl $4,%esi    # next output slot is 4 bytes on (presumably interleaved 16-bit stereo -- TODO confirm)
141 decl %ebp
142 jnz .L55
143
144 / --- end of loop 1 ---
145
146 movd (%ecx),%mm2    # single value between the loops: only every other float (offsets 0,8,...,56) is used
147 movd (%ebx),%mm1    # movd loads one float into the low lane and clears the high lane
148 pfmul %mm1,%mm2    # mm2 accumulates the sum of the 8 products
149
150 movd 8(%ecx),%mm0
151 movd 8(%ebx),%mm1
152 pfmul %mm0,%mm1
153 pfadd %mm1,%mm2
154
155 movd 16(%ecx),%mm0
156 movd 16(%ebx),%mm1
157 pfmul %mm0,%mm1
158 pfadd %mm1,%mm2
159
160 movd 24(%ecx),%mm0
161 movd 24(%ebx),%mm1
162 pfmul %mm0,%mm1
163 pfadd %mm1,%mm2
164
165 movd 32(%ecx),%mm0
166 movd 32(%ebx),%mm1
167 pfmul %mm0,%mm1
168 pfadd %mm1,%mm2
169
170 movd 40(%ecx),%mm0
171 movd 40(%ebx),%mm1
172 pfmul %mm0,%mm1
173 pfadd %mm1,%mm2
174
175 movd 48(%ecx),%mm0
176 movd 48(%ebx),%mm1
177 pfmul %mm0,%mm1
178 pfadd %mm1,%mm2
179
180 movd 56(%ecx),%mm0
181 movd 56(%ebx),%mm1
182 pfmul %mm0,%mm1
183 pfadd %mm1,%mm2
184
185 pf2id %mm2,%mm2    # convert the sum to a 32-bit integer
186 movd %mm2,%eax
187
188 sar $16,%eax    # same scaling as in loop 1
189
190 movw %ax,(%esi)    # store the low 16 bits
191
192 addl $-64,%ebx    # operand pointer now steps backwards by 64 bytes
193 addl $4,%esi    # next output slot
194 addl $256,%ecx    # advance the decwin pointer by 256 bytes before loop 2
195 movl $15,%ebp    # loop 2 runs 15 times
196
197 .L68:    # loop 2: negated sum of 16 products per output value
198 psubd %mm0,%mm0    # mm0 = 0 (accumulator)
199
200 movq (%ebx),%mm1
201 movq (%ecx),%mm2
202 pfmul %mm1,%mm2
203 pfsub %mm2,%mm0    # accumulator -= product pair
204
205 movq 8(%ebx),%mm3
206 movq 8(%ecx),%mm4
207 pfmul %mm3,%mm4
208 pfsub %mm4,%mm0
209
210 movq 16(%ebx),%mm1
211 movq 16(%ecx),%mm2
212 pfmul %mm1,%mm2
213 pfsub %mm2,%mm0
214
215 movq 24(%ebx),%mm3
216 movq 24(%ecx),%mm4
217 pfmul %mm3,%mm4
218 pfsub %mm4,%mm0
219
220 movq 32(%ebx),%mm1
221 movq 32(%ecx),%mm2
222 pfmul %mm1,%mm2
223 pfsub %mm2,%mm0
224
225 movq 40(%ebx),%mm3
226 movq 40(%ecx),%mm4
227 pfmul %mm3,%mm4
228 pfsub %mm4,%mm0
229
230 movq 48(%ebx),%mm1
231 movq 48(%ecx),%mm2
232 pfmul %mm1,%mm2
233 pfsub %mm2,%mm0
234
235 movq 56(%ebx),%mm3
236 movq 56(%ecx),%mm4
237 pfmul %mm3,%mm4
238 pfsub %mm4,%mm0
239
240 pfacc %mm0,%mm0    # horizontal add: low lane = low + high
241
242 pf2id %mm0,%mm0    # convert to 32-bit integer
243 movd %mm0,%eax
244
245 sar $16,%eax    # same scaling as in loop 1
246
247 movw %ax,(%esi)    # store the low 16 bits
248
249 addl $-64,%ebx    # step the operand pointer back by 64 bytes
250 subl $-128,%ecx    # advance the decwin pointer by 128 bytes
251 addl $4,%esi    # next output slot
252 decl %ebp
253 jnz .L68
254
255 / --- end of loop 2
256
257 femms    # leave MMX/3DNow! state so x87 code can run again
258
259 movl %edi,%eax    # return value = edi (0, set at entry)
260 popl %ebx    # restore saved registers in reverse push order
261 popl %esi
262 popl %edi
263 popl %ebp
264 addl $12,%esp    # release the 12-byte local area
265 ret
265 ret