Mercurial > mplayer.hg
annotate mp3lib/decode_k7.s @ 1127:ca1bc9edf735
audio_out_format_name prototype added, ao_control_vol_t for aocontrol_get/set_volume added
author | al3x |
---|---|
date | Thu, 14 Jun 2001 15:25:00 +0000 |
parents | 2f0265763322 |
children | 152c7c71a29c |
rev | line source |
---|---|
736 | 1 /// |
2 /// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support | |
3 /// | |
4 /// This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama | |
5 /// <squash@mb.kcom.ne.jp>; only the following changes have been made: | |
6 /// | |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
7 /// - Added new opcode PFNACC |
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
8 /// - decreased number of opcodes (as it was suggested by k7 manual) |
763 | 9 /// (using memory reference as operand of instructions) |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
10 /// - added PREFETCHW opcode. It has different semantics on k7 than on k6-2 |
1053 | 11 /// and saves 15-25 cpu clocks for athlon. |
12 /// - partial unrolling loops for removing slower MOVW insns. | |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
13 /// (Note: probably same operation should be done for decode_3dnow.s) |
736 | 14 /// - change function name for support 3DNowEx! automatic detect |
15 /// | |
16 /// note: because K7 processors are aggressive out-of-order three-way | |
17 /// superscalar ones, instruction order is not significant for them. | |
18 /// | |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
19 /// Benchmark: measured by mplayer on Duron-700: |
1067 | 20 /// 3dNow! optimized code - 1.4% of cpu usage |
21 /// k7 optimized code (without partial loop unrolling) - 1.3% of cpu usage | |
22 /// k7 optimized code - 1.1% of cpu usage | |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
23 /// Note: K6-2 users have a chance with partial loop unrolling |
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
24 /// |
736 | 25 /// Modified by Nick Kurshev <nickols_k@mail.ru> |
26 /// | |
27 / synth_1to1_3dnow works the same way as the c version of | |
28 / synth_1to1. this assembler code based 'decode-i586.s' | |
29 / (by Stefan Bieschewski <stb@acm.org>), two types of changes | |
30 / have been made: | |
31 / - use {MMX,3DNow!} instruction for reduce cpu | |
32 / - remove unused(?) local symbols | |
33 / | |
34 / useful sources of information on optimizing 3DNow! code include: | |
35 / AMD 3DNow! Technology Manual (Publication #21928) | |
36 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf | |
37 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) | |
38 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) | |
39 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf | |
40 / | |
41 / This code was tested only on AMD-K6-2 processor Linux systems, | |
42 / please tell me: | |
43 / - whether this code works on other 3DNow! capable processors | |
44 / (ex.IDT-C6-2) or not | |
45 / - whether this code works on other OSes or not | |
46 / | |
47 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 | |
48 / <kim@comtec.co.jp> - after 1.Apr.1998 | |
49 | |
50 / Enhancements for q-word operation by Michael Hipp | |
51 | |
/* Static storage used by synth_1to1_3dnowex. */
/* buffs: 4352-byte common block. The routine uses buffs when its second */
/* argument is 0 and buffs+2176 otherwise, i.e. two 2176-byte halves. */
52 .bss | |
53 .comm buffs,4352,4 | |
54 .data | |
1053 | 55 .align 8 |
/* null_one: 64-bit mask that keeps the low 16 bits of each 32-bit lane. */
56 null_one: .long 0x0000ffff, 0x0000ffff | |
/* one_null: 64-bit mask that keeps the high 16 bits of each 32-bit lane. */
57 one_null: .long 0xffff0000, 0xffff0000 | |
/* bo: persistent index, initially 1; the routine stores (bo-1)&15 back */
/* into it on every call whose second argument is 0. */
58 bo: .long 1 | |
736 | 59 .text |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
60 /* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */ |
/* synth_1to1_3dnowex: 3DNow!/extended-3DNow! synthesis routine. */
/* Arguments are read from the stack (after the 12-byte local area and the */
/* four register pushes below): */
/*   32(%esp) first argument  -> %eax, forwarded to dct64_3dnowex */
/*   36(%esp) second argument -> compared with 0 to pick the buffs half */
/*   40(%esp) third argument  -> %esi, the output pointer */
/* Returns %eax = %edi, which is set to 0 on entry and never changed. */
/* %ebp, %edi, %esi and %ebx are saved and restored; femms is executed */
/* before returning. */
736 | 61 .globl synth_1to1_3dnowex
62 synth_1to1_3dnowex: | |
63 subl $12,%esp | |
64 pushl %ebp | |
65 pushl %edi | |
66 pushl %esi | |
67 pushl %ebx | |
68 | |
/* Frame from here: 0..15 saved registers, 16..27 locals, 28 return address. */
69 movl 32(%esp),%eax | |
70 movl 40(%esp),%esi | |
71 movl $0,%edi | |
72 movl bo,%ebp | |
73 cmpl %edi,36(%esp) | |
74 jne .L48 | |
/* Second argument == 0: step bo to (bo-1)&15, store it, use first half of buffs. */
75 decl %ebp | |
76 andl $15,%ebp | |
77 movl %ebp,bo | |
78 movl $buffs,%ecx | |
79 jmp .L49 | |
80 .L48: | |
/* Second argument != 0: bo is left unchanged, output pointer is advanced */
/* by 2 bytes and the second half of buffs is used. */
81 addl $2,%esi | |
82 movl $buffs+2176,%ecx | |
83 .L49: | |
/* The parity of bo decides which 1088-byte sub-buffer becomes %ebx */
/* (the operand of the multiply loops) and what is passed to dct64_3dnowex. */
84 testl $1,%ebp | |
85 je .L50 | |
/* Odd bo: %ebx = %ecx, local 16(%esp) = bo. */
/* Pushed: first argument, %ecx+4*bo, %ecx+1088+4*((bo+1)&15). */
86 movl %ecx,%ebx | |
87 movl %ebp,16(%esp) | |
88 pushl %eax | |
89 movl 20(%esp),%edx | |
90 leal (%ebx,%edx,4),%eax | |
91 pushl %eax | |
92 movl 24(%esp),%eax | |
93 incl %eax | |
94 andl $15,%eax | |
95 leal 1088(,%eax,4),%eax | |
96 addl %ebx,%eax | |
97 jmp .L74 | |
98 .L50: | |
/* Even bo: %ebx = %ecx+1088, local 16(%esp) = bo+1. */
/* Pushed: first argument, %ecx+1088+4*(bo+1), %ecx+4*bo. */
99 leal 1088(%ecx),%ebx | |
100 leal 1(%ebp),%edx | |
101 movl %edx,16(%esp) | |
102 pushl %eax | |
103 leal 1092(%ecx,%ebp,4),%eax | |
104 pushl %eax | |
105 leal (%ecx,%ebp,4),%eax | |
106 .L74: | |
107 pushl %eax | |
108 call dct64_3dnowex | |
/* NOTE(review): %esp is not adjusted after the call, yet 16(%esp) below is */
/* meant to be the local stored before the three pushes. That only holds if */
/* dct64_3dnowex removes its own 12 bytes of arguments - confirm in callee. */
/* %ecx = decwin+64 - 4*local. */
109 movl 16(%esp),%edx | |
110 leal 0(,%edx,4),%edx | |
111 movl $decwin+64,%eax | |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
112 movl %eax,%ecx |
736 | 113 subl %edx,%ecx |
1053 | 114 movl $8,%ebp |
115 prefetchw (%esi) | |
736 | 116 |
/* Loop 1: 8 iterations, two outputs per iteration. */
/* %mm0 accumulates packed products of 0..56(%ecx) with 0..56(%ebx); */
/* %mm4 accumulates packed products of 128..184(%ecx) with 64..120(%ebx). */
117 .L55: | |
763 | 118 |
859 | 119 movq (%ecx),%mm0 |
120 pfmul (%ebx),%mm0 | |
1053 | 121 movq 128(%ecx),%mm4 |
122 pfmul 64(%ebx),%mm4 | |
736 | 123 |
859 | 124 movq 8(%ecx),%mm1 |
125 pfmul 8(%ebx),%mm1 | |
126 pfadd %mm1,%mm0 | |
1053 | 127 movq 136(%ecx),%mm5 |
128 pfmul 72(%ebx),%mm5 | |
129 pfadd %mm5,%mm4 | |
736 | 130 |
859 | 131 movq 16(%ebx),%mm2 |
132 pfmul 16(%ecx),%mm2 | |
133 pfadd %mm2,%mm0 | |
1053 | 134 movq 80(%ebx),%mm6 |
135 pfmul 144(%ecx),%mm6 | |
136 pfadd %mm6,%mm4 | |
736 | 137 |
859 | 138 movq 24(%ecx),%mm3 |
139 pfmul 24(%ebx),%mm3 | |
140 pfadd %mm3,%mm0 | |
1053 | 141 movq 152(%ecx),%mm7 |
142 pfmul 88(%ebx),%mm7 | |
143 pfadd %mm7,%mm4 | |
736 | 144 |
1053 | 145 movq 32(%ebx),%mm1 |
146 pfmul 32(%ecx),%mm1 | |
147 pfadd %mm1,%mm0 | |
148 movq 96(%ebx),%mm5 | |
149 pfmul 160(%ecx),%mm5 | |
150 pfadd %mm5,%mm4 | |
736 | 151 |
1053 | 152 movq 40(%ecx),%mm2 |
153 pfmul 40(%ebx),%mm2 | |
154 pfadd %mm2,%mm0 | |
155 movq 168(%ecx),%mm6 | |
156 pfmul 104(%ebx),%mm6 | |
157 pfadd %mm6,%mm4 | |
736 | 158 |
1053 | 159 movq 48(%ebx),%mm3 |
160 pfmul 48(%ecx),%mm3 | |
161 pfadd %mm3,%mm0 | |
162 movq 112(%ebx),%mm7 | |
163 pfmul 176(%ecx),%mm7 | |
164 pfadd %mm7,%mm4 | |
736 | 165 |
1053 | 166 movq 56(%ecx),%mm1 |
167 pfmul 56(%ebx),%mm1 | |
168 pfadd %mm1,%mm0 | |
169 movq 184(%ecx),%mm5 | |
170 pfmul 120(%ebx),%mm5 | |
171 pfadd %mm5,%mm4 | |
736 | 172 |
/* pfnacc: %mm0 = (mm0.lo - mm0.hi, mm4.lo - mm4.hi), i.e. both results. */
/* They are converted to integers, bits 16..31 of each are moved down and */
/* merged into the low 16 bits of each dword at (%esi); the high 16 bits */
/* already in memory are kept (mask one_null). */
1053 | 173 pfnacc %mm4, %mm0 |
174 movq (%esi), %mm1 | |
175 pf2id %mm0, %mm0 | |
176 pand one_null, %mm1 | |
177 psrld $16,%mm0 | |
178 pand null_one, %mm0 | |
179 por %mm0, %mm1 | |
180 movq %mm1,(%esi) | |
181 | |
182 addl $128,%ebx | |
183 addl $256,%ecx | |
184 addl $8,%esi | |
736 | 185 decl %ebp |
186 jnz .L55 | |
187 | |
188 / --- end of loop 1 --- | |
189 | |
1053 | 190 prefetchw (%esi) /* prefetching for writing this block and next loop */ |
191 | |
/* Single output between the loops: movd loads one float per step, so only */
/* the elements at offsets 0,8,...,56 contribute to the low lane, which is */
/* the only lane stored. */
859 | 192 movd (%ecx),%mm0 |
193 pfmul (%ebx),%mm0 | |
736 | 194 |
195 movd 8(%ebx),%mm1 | |
763 | 196 pfmul 8(%ecx),%mm1 |
859 | 197 pfadd %mm1,%mm0 |
736 | 198 |
859 | 199 movd 16(%ebx),%mm2 |
200 pfmul 16(%ecx),%mm2 | |
201 pfadd %mm2,%mm0 | |
736 | 202 |
859 | 203 movd 24(%ebx),%mm3 |
204 pfmul 24(%ecx),%mm3 | |
205 pfadd %mm3,%mm0 | |
736 | 206 |
859 | 207 movd 32(%ebx),%mm4 |
208 pfmul 32(%ecx),%mm4 | |
209 pfadd %mm4,%mm0 | |
736 | 210 |
859 | 211 movd 40(%ebx),%mm5 |
212 pfmul 40(%ecx),%mm5 | |
213 pfadd %mm5,%mm0 | |
736 | 214 |
859 | 215 movd 48(%ebx),%mm6 |
216 pfmul 48(%ecx),%mm6 | |
217 pfadd %mm6,%mm0 | |
736 | 218 |
859 | 219 movd 56(%ebx),%mm7 |
220 pfmul 56(%ecx),%mm7 | |
221 pfadd %mm7,%mm0 | |
736 | 222 |
859 | 223 pf2id %mm0,%mm0 |
224 movd %mm0,%eax | |
736 | 225 |
/* Keep bits 16..31 of the converted value and store them as one 16-bit word. */
226 sar $16,%eax | |
227 | |
228 movw %ax,(%esi) | |
229 | |
/* From here %ebx walks backwards while %ecx keeps advancing. */
1053 | 230 subl $64,%ebx |
736 | 231 addl $4,%esi |
232 addl $256,%ecx | |
1053 | 233 movl $7,%ebp |
736 | 234 |
/* Loop 2: 7 iterations, two outputs per iteration. Both accumulators start */
/* at zero and every product is subtracted. The second output pairs */
/* 128..184(%ecx) with -64..-8(%ebx). */
235 .L68: | |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
236 pxor %mm0, %mm0 |
1053 | 237 pxor %mm4, %mm4 |
238 | |
239 movq (%ecx),%mm1 | |
240 pfmul (%ebx),%mm1 | |
241 pfsub %mm1,%mm0 | |
242 movq 128(%ecx),%mm5 | |
243 pfmul -64(%ebx),%mm5 | |
244 pfsub %mm5,%mm4 | |
245 | |
246 movq 8(%ecx),%mm2 | |
247 pfmul 8(%ebx),%mm2 | |
248 pfsub %mm2,%mm0 | |
249 movq 136(%ecx),%mm6 | |
250 pfmul -56(%ebx),%mm6 | |
251 pfsub %mm6,%mm4 | |
252 | |
253 movq 16(%ecx),%mm3 | |
254 pfmul 16(%ebx),%mm3 | |
255 pfsub %mm3,%mm0 | |
256 movq 144(%ecx),%mm7 | |
257 pfmul -48(%ebx),%mm7 | |
258 pfsub %mm7,%mm4 | |
259 | |
260 movq 24(%ecx),%mm1 | |
261 pfmul 24(%ebx),%mm1 | |
262 pfsub %mm1,%mm0 | |
263 movq 152(%ecx),%mm5 | |
264 pfmul -40(%ebx),%mm5 | |
265 pfsub %mm5,%mm4 | |
266 | |
267 movq 32(%ecx),%mm2 | |
268 pfmul 32(%ebx),%mm2 | |
269 pfsub %mm2,%mm0 | |
270 movq 160(%ecx),%mm6 | |
271 pfmul -32(%ebx),%mm6 | |
272 pfsub %mm6,%mm4 | |
273 | |
274 movq 40(%ecx),%mm3 | |
275 pfmul 40(%ebx),%mm3 | |
276 pfsub %mm3,%mm0 | |
277 movq 168(%ecx),%mm7 | |
278 pfmul -24(%ebx),%mm7 | |
279 pfsub %mm7,%mm4 | |
280 | |
281 movq 48(%ecx),%mm1 | |
282 pfmul 48(%ebx),%mm1 | |
283 pfsub %mm1,%mm0 | |
284 movq 176(%ecx),%mm5 | |
285 pfmul -16(%ebx),%mm5 | |
286 pfsub %mm5,%mm4 | |
287 | |
288 movq 56(%ecx),%mm2 | |
289 pfmul 56(%ebx),%mm2 | |
290 pfsub %mm2,%mm0 | |
291 movq 184(%ecx),%mm6 | |
292 pfmul -8(%ebx),%mm6 | |
293 pfsub %mm6,%mm4 | |
294 | |
/* pfacc: %mm0 = (mm0.lo + mm0.hi, mm4.lo + mm4.hi); then the same */
/* convert / shift / masked merge into (%esi) as in loop 1. */
295 pfacc %mm4,%mm0 | |
296 movq (%esi), %mm1 | |
297 pf2id %mm0, %mm0 | |
298 pand one_null, %mm1 | |
299 psrld $16,%mm0 | |
300 pand null_one, %mm0 | |
301 por %mm0, %mm1 | |
302 movq %mm1,(%esi) | |
303 | |
304 subl $128,%ebx | |
305 addl $256,%ecx | |
306 addl $8,%esi | |
307 decl %ebp | |
308 jnz .L68 | |
309 | |
310 / --- end of loop 2 | |
311 | |
/* Final single output: same negated accumulation over 0..56 of both */
/* pointers, horizontally summed and stored as one 16-bit word. */
312 pxor %mm0, %mm0 | |
736 | 313 |
859 | 314 movq (%ecx),%mm1 |
315 pfmul (%ebx),%mm1 | |
316 pfsub %mm1,%mm0 | |
317 | |
318 movq 8(%ecx),%mm2 | |
319 pfmul 8(%ebx),%mm2 | |
736 | 320 pfsub %mm2,%mm0 |
321 | |
859 | 322 movq 16(%ecx),%mm3 |
323 pfmul 16(%ebx),%mm3 | |
324 pfsub %mm3,%mm0 | |
736 | 325 |
326 movq 24(%ecx),%mm4 | |
763 | 327 pfmul 24(%ebx),%mm4 |
736 | 328 pfsub %mm4,%mm0 |
329 | |
859 | 330 movq 32(%ecx),%mm5 |
331 pfmul 32(%ebx),%mm5 | |
332 pfsub %mm5,%mm0 | |
736 | 333 |
859 | 334 movq 40(%ecx),%mm6 |
335 pfmul 40(%ebx),%mm6 | |
336 pfsub %mm6,%mm0 | |
736 | 337 |
859 | 338 movq 48(%ecx),%mm7 |
339 pfmul 48(%ebx),%mm7 | |
340 pfsub %mm7,%mm0 | |
736 | 341 |
859 | 342 movq 56(%ecx),%mm1 |
343 pfmul 56(%ebx),%mm1 | |
344 pfsub %mm1,%mm0 | |
736 | 345 |
346 pfacc %mm0,%mm0 | |
347 | |
348 pf2id %mm0,%mm0 | |
349 movd %mm0,%eax | |
350 | |
351 sar $16,%eax | |
352 | |
353 movw %ax,(%esi) | |
354 | |
/* Leave MMX/3DNow! state before returning. */
355 femms | |
356 | |
/* Return value is %edi, still 0 from the prologue; registers are restored */
/* in reverse push order and the 12-byte local area is released. */
357 movl %edi,%eax | |
358 popl %ebx | |
359 popl %esi | |
360 popl %edi | |
361 popl %ebp | |
362 addl $12,%esp | |
363 ret |