/
/ mpg123_synth_1to1 works the same way as the c version of this
/ file.  only two types of changes have been made:
/ - reordered floating point instructions to
/   prevent pipeline stalls
/ - made WRITE_SAMPLE use integer instead of
/   (slower) floating point
/ all kinds of x86 processors should benefit from these
/ modifications.
/
/ useful sources of information on optimizing x86 code include:
/
/     Intel Architecture Optimization Manual
/     http://www.intel.com/design/pentium/manuals/242816.htm
/
/     Cyrix 6x86 Instruction Set Summary
/     ftp://ftp.cyrix.com/6x86/6x-dbch6.pdf
/
/     AMD-K5 Processor Software Development
/     http://www.amd.com/products/cpg/techdocs/appnotes/20007e.pdf
/
/ Stefan Bieschewski <stb@acm.org>
/
/ $Id$
/
/ ---------------------------------------------------------------------
/ Static storage for synth_1to1_pent (i386, AT&T syntax).
/ ---------------------------------------------------------------------

/ buffs: 4352 bytes of common storage requested with alignment 4.
/ synth_1to1_pent uses two base addresses inside it, buffs and
/ buffs+2176, and within each base a second block at offset 1088.
	.bss
	.comm	buffs,4352,4

/ bo: 32-bit rolling index, initial value 1.  synth_1to1_pent
/ decrements it modulo 16 and stores it back whenever its second
/ argument is zero.
	.data
	.align	4
bo:
	.long	1

/ .LC0 / .LC1: two 8-byte constants, low 32-bit word first.  Read as
/ little-endian IEEE-754 doubles they are 32767.0 and -32768.0.
/ NOTE(review): no instruction in the visible part of this file
/ references them.  They look like leftovers from a floating point
/ clipping test - confirm before removing.
	.section .rodata
	.align	8
.LC0:
	.long	0x0
	.long	0x40dfffc0
	.align	8
.LC1:
	.long	0x0
	.long	0xc0e00000
	.align	8
40 .text
|
|
41 .globl synth_1to1_pent
|
|
42 synth_1to1_pent:
|
|
43 subl $12,%esp
|
|
44 pushl %ebp
|
|
45 pushl %edi
|
|
46 pushl %esi
|
|
47 pushl %ebx
|
|
48 movl 32(%esp),%eax
|
|
49 movl 40(%esp),%esi
|
|
50 xorl %edi,%edi
|
|
51 movl bo,%ebp
|
|
52 cmpl %edi,36(%esp)
|
|
53 jne .L48
|
|
54 decl %ebp
|
|
55 andl $15,%ebp
|
|
56 movl %ebp,bo
|
|
57 movl $buffs,%ecx
|
|
58 jmp .L49
|
|
59 .L48:
|
|
60 addl $2,%esi
|
|
61 movl $buffs+2176,%ecx
|
|
62 .L49:
|
|
63 testl $1,%ebp
|
|
64 je .L50
|
|
65 movl %ecx,%ebx
|
|
66 movl %ebp,16(%esp)
|
|
67 pushl %eax
|
|
68 movl 20(%esp),%edx
|
|
69 leal (%ebx,%edx,4),%eax
|
|
70 pushl %eax
|
|
71 movl 24(%esp),%eax
|
|
72 incl %eax
|
|
73 andl $15,%eax
|
|
74 leal 1088(,%eax,4),%eax
|
|
75 addl %ebx,%eax
|
|
76 jmp .L74
|
|
77 .L50:
|
|
78 leal 1088(%ecx),%ebx
|
|
79 leal 1(%ebp),%edx
|
|
80 movl %edx,16(%esp)
|
|
81 pushl %eax
|
|
82 leal 1092(%ecx,%ebp,4),%eax
|
|
83 pushl %eax
|
|
84 leal (%ecx,%ebp,4),%eax
|
|
85 .L74:
|
|
86 pushl %eax
|
|
87 call dct64
|
|
88 addl $12,%esp
|
|
89 movl 16(%esp),%edx
|
|
90 leal 0(,%edx,4),%edx
|
|
91 movl $decwin+64,%eax
|
|
92 movl %eax,%ecx
|
|
93 subl %edx,%ecx
|
|
94 movl $16,%ebp
|
|
95 .L55:
|
|
96 flds (%ecx)
|
|
97 fmuls (%ebx)
|
|
98 flds 4(%ecx)
|
|
99 fmuls 4(%ebx)
|
|
100 fxch %st(1)
|
|
101 flds 8(%ecx)
|
|
102 fmuls 8(%ebx)
|
|
103 fxch %st(2)
|
|
104 fsubrp %st,%st(1)
|
|
105 flds 12(%ecx)
|
|
106 fmuls 12(%ebx)
|
|
107 fxch %st(2)
|
|
108 faddp %st,%st(1)
|
|
109 flds 16(%ecx)
|
|
110 fmuls 16(%ebx)
|
|
111 fxch %st(2)
|
|
112 fsubrp %st,%st(1)
|
|
113 flds 20(%ecx)
|
|
114 fmuls 20(%ebx)
|
|
115 fxch %st(2)
|
|
116 faddp %st,%st(1)
|
|
117 flds 24(%ecx)
|
|
118 fmuls 24(%ebx)
|
|
119 fxch %st(2)
|
|
120 fsubrp %st,%st(1)
|
|
121 flds 28(%ecx)
|
|
122 fmuls 28(%ebx)
|
|
123 fxch %st(2)
|
|
124 faddp %st,%st(1)
|
|
125 flds 32(%ecx)
|
|
126 fmuls 32(%ebx)
|
|
127 fxch %st(2)
|
|
128 fsubrp %st,%st(1)
|
|
129 flds 36(%ecx)
|
|
130 fmuls 36(%ebx)
|
|
131 fxch %st(2)
|
|
132 faddp %st,%st(1)
|
|
133 flds 40(%ecx)
|
|
134 fmuls 40(%ebx)
|
|
135 fxch %st(2)
|
|
136 fsubrp %st,%st(1)
|
|
137 flds 44(%ecx)
|
|
138 fmuls 44(%ebx)
|
|
139 fxch %st(2)
|
|
140 faddp %st,%st(1)
|
|
141 flds 48(%ecx)
|
|
142 fmuls 48(%ebx)
|
|
143 fxch %st(2)
|
|
144 fsubrp %st,%st(1)
|
|
145 flds 52(%ecx)
|
|
146 fmuls 52(%ebx)
|
|
147 fxch %st(2)
|
|
148 faddp %st,%st(1)
|
|
149 flds 56(%ecx)
|
|
150 fmuls 56(%ebx)
|
|
151 fxch %st(2)
|
|
152 fsubrp %st,%st(1)
|
|
153 flds 60(%ecx)
|
|
154 fmuls 60(%ebx)
|
|
155 fxch %st(2)
|
|
156 subl $4,%esp
|
|
157 faddp %st,%st(1)
|
|
158 fxch %st(1)
|
|
159 fsubrp %st,%st(1)
|
|
160 fistpl (%esp)
|
|
161 popl %eax
|
|
162 cmpl $32767,%eax
|
|
163 jg 1f
|
|
164 cmpl $-32768,%eax
|
|
165 jl 2f
|
|
166 movw %ax,(%esi)
|
|
167 jmp 4f
|
|
168 1: movw $32767,(%esi)
|
|
169 jmp 3f
|
|
170 2: movw $-32768,(%esi)
|
|
171 3: incl %edi
|
|
172 4:
|
|
173 .L54:
|
|
174 addl $64,%ebx
|
|
175 subl $-128,%ecx
|
|
176 addl $4,%esi
|
|
177 decl %ebp
|
|
178 jnz .L55
|
|
179 flds (%ecx)
|
|
180 fmuls (%ebx)
|
|
181 flds 8(%ecx)
|
|
182 fmuls 8(%ebx)
|
|
183 flds 16(%ecx)
|
|
184 fmuls 16(%ebx)
|
|
185 fxch %st(2)
|
|
186 faddp %st,%st(1)
|
|
187 flds 24(%ecx)
|
|
188 fmuls 24(%ebx)
|
|
189 fxch %st(2)
|
|
190 faddp %st,%st(1)
|
|
191 flds 32(%ecx)
|
|
192 fmuls 32(%ebx)
|
|
193 fxch %st(2)
|
|
194 faddp %st,%st(1)
|
|
195 flds 40(%ecx)
|
|
196 fmuls 40(%ebx)
|
|
197 fxch %st(2)
|
|
198 faddp %st,%st(1)
|
|
199 flds 48(%ecx)
|
|
200 fmuls 48(%ebx)
|
|
201 fxch %st(2)
|
|
202 faddp %st,%st(1)
|
|
203 flds 56(%ecx)
|
|
204 fmuls 56(%ebx)
|
|
205 fxch %st(2)
|
|
206 subl $4,%esp
|
|
207 faddp %st,%st(1)
|
|
208 fxch %st(1)
|
|
209 faddp %st,%st(1)
|
|
210 fistpl (%esp)
|
|
211 popl %eax
|
|
212 cmpl $32767,%eax
|
|
213 jg 1f
|
|
214 cmpl $-32768,%eax
|
|
215 jl 2f
|
|
216 movw %ax,(%esi)
|
|
217 jmp 4f
|
|
218 1: movw $32767,(%esi)
|
|
219 jmp 3f
|
|
220 2: movw $-32768,(%esi)
|
|
221 3: incl %edi
|
|
222 4:
|
|
223 .L62:
|
|
224 addl $-64,%ebx
|
|
225 addl $4,%esi
|
|
226 movl 16(%esp),%edx
|
|
227 leal -128(%ecx,%edx,8),%ecx
|
|
228 movl $15,%ebp
|
|
229 .L68:
|
|
230 flds -4(%ecx)
|
|
231 fchs
|
|
232 fmuls (%ebx)
|
|
233 flds -8(%ecx)
|
|
234 fmuls 4(%ebx)
|
|
235 fxch %st(1)
|
|
236 flds -12(%ecx)
|
|
237 fmuls 8(%ebx)
|
|
238 fxch %st(2)
|
|
239 fsubrp %st,%st(1)
|
|
240 flds -16(%ecx)
|
|
241 fmuls 12(%ebx)
|
|
242 fxch %st(2)
|
|
243 fsubrp %st,%st(1)
|
|
244 flds -20(%ecx)
|
|
245 fmuls 16(%ebx)
|
|
246 fxch %st(2)
|
|
247 fsubrp %st,%st(1)
|
|
248 flds -24(%ecx)
|
|
249 fmuls 20(%ebx)
|
|
250 fxch %st(2)
|
|
251 fsubrp %st,%st(1)
|
|
252 flds -28(%ecx)
|
|
253 fmuls 24(%ebx)
|
|
254 fxch %st(2)
|
|
255 fsubrp %st,%st(1)
|
|
256 flds -32(%ecx)
|
|
257 fmuls 28(%ebx)
|
|
258 fxch %st(2)
|
|
259 fsubrp %st,%st(1)
|
|
260 flds -36(%ecx)
|
|
261 fmuls 32(%ebx)
|
|
262 fxch %st(2)
|
|
263 fsubrp %st,%st(1)
|
|
264 flds -40(%ecx)
|
|
265 fmuls 36(%ebx)
|
|
266 fxch %st(2)
|
|
267 fsubrp %st,%st(1)
|
|
268 flds -44(%ecx)
|
|
269 fmuls 40(%ebx)
|
|
270 fxch %st(2)
|
|
271 fsubrp %st,%st(1)
|
|
272 flds -48(%ecx)
|
|
273 fmuls 44(%ebx)
|
|
274 fxch %st(2)
|
|
275 fsubrp %st,%st(1)
|
|
276 flds -52(%ecx)
|
|
277 fmuls 48(%ebx)
|
|
278 fxch %st(2)
|
|
279 fsubrp %st,%st(1)
|
|
280 flds -56(%ecx)
|
|
281 fmuls 52(%ebx)
|
|
282 fxch %st(2)
|
|
283 fsubrp %st,%st(1)
|
|
284 flds -60(%ecx)
|
|
285 fmuls 56(%ebx)
|
|
286 fxch %st(2)
|
|
287 fsubrp %st,%st(1)
|
|
288 flds (%ecx)
|
|
289 fmuls 60(%ebx)
|
|
290 fxch %st(2)
|
|
291 subl $4,%esp
|
|
292 fsubrp %st,%st(1)
|
|
293 fxch %st(1)
|
|
294 fsubrp %st,%st(1)
|
|
295 fistpl (%esp)
|
|
296 popl %eax
|
|
297 cmpl $32767,%eax
|
|
298 jg 1f
|
|
299 cmpl $-32768,%eax
|
|
300 jl 2f
|
|
301 movw %ax,(%esi)
|
|
302 jmp 4f
|
|
303 1: movw $32767,(%esi)
|
|
304 jmp 3f
|
|
305 2: movw $-32768,(%esi)
|
|
306 3: incl %edi
|
|
307 4:
|
|
308 .L67:
|
|
309 addl $-64,%ebx
|
|
310 addl $-128,%ecx
|
|
311 addl $4,%esi
|
|
312 decl %ebp
|
|
313 jnz .L68
|
|
314 movl %edi,%eax
|
|
315 popl %ebx
|
|
316 popl %esi
|
|
317 popl %edi
|
|
318 popl %ebp
|
|
319 addl $12,%esp
|
|
320 ret
|
|
321
|