Mercurial > mplayer.hg
annotate mp3lib/decode_k7.s @ 1063:269780d31b51
added missing 'using namespace std;' for gcc 3.0
author | arpi_esp |
---|---|
date | Sat, 09 Jun 2001 00:10:50 +0000 |
parents | 9b0bb5c805b2 |
children | 2f0265763322 |
rev | line source |
---|---|
736 | 1 /// |
2 /// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support | |
3 /// | |
4 /// This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama | |
5 /// <squash@mb.kcom.ne.jp>; only some types of changes have been made: | |
6 /// | |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
7 /// - Added new opcode PFNACC |
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
8 /// - decreased number of opcodes (as it was suggested by k7 manual) |
763 | 9 /// (using memory reference as operand of instructions) |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
10 /// - added PREFETCHW opcode. It has different semantic on k7 than on k6-2 |
1053 | 11 /// and saves 15-25 cpu clocks for athlon. |
12 /// - partial unrolling loops for removing slower MOVW insns. | |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
13 /// (Note: probably same operation should be done for decode_3dnow.s) |
736 | 14 /// - changed function name to support 3DNowEx! automatic detection |
15 /// | |
16 /// note: because K7 processors are aggressive out-of-order three-way | |
17 /// superscalar ones, instruction order is not significant for them. | |
18 /// | |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
19 /// Benchmark: measured by mplayer on Duron-700: |
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
20 /// 3dNow! optimized code - 1.5% of cpu usage |
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
21 /// k7 optimized code - 1.1% of cpu usage |
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
22 /// Note: K6-2 users have a chance with partial loop unrolling |
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
23 /// |
736 | 24 /// Modified by Nick Kurshev <nickols_k@mail.ru> |
25 /// | |
26 / synth_1to1_3dnow works the same way as the c version of | |
27 / synth_1to1. this assembler code is based on 'decode-i586.s' | |
28 / (by Stefan Bieschewski <stb@acm.org>), two types of changes | |
29 / have been made: | |
30 / - use {MMX,3DNow!} instruction for reduce cpu | |
31 / - remove unused(?) local symbols | |
32 / | |
33 / useful sources of information on optimizing 3DNow! code include: | |
34 / AMD 3DNow! Technology Manual (Publication #21928) | |
35 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf | |
36 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) | |
37 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) | |
38 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf | |
39 / | |
40 / This code was tested only on AMD-K6-2 processor Linux systems, | |
41 / please tell me: | |
42 / - whether this code works on other 3DNow! capable processors | |
43 / (ex.IDT-C6-2) or not | |
44 / - whether this code works on other OSes or not | |
45 / | |
46 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 | |
47 / <kim@comtec.co.jp> - after 1.Apr.1998 | |
48 | |
49 / Enhancements for q-word operation by Michael Hipp | |
50 | |
51 .bss | |
52 .comm buffs,4352,4 /* 4352-byte common block; synth_1to1_3dnowex addresses it as buffs and buffs+2176 */ | |
53 .data | |
1053 | 54 .align 8 |
55 null_one: .long 0x0000ffff, 0x0000ffff /* 64-bit mask selecting the low 16 bits of each dword */ | |
56 one_null: .long 0xffff0000, 0xffff0000 /* 64-bit mask selecting the high 16 bits of each dword */ | |
57 bo: .long 1 /* persistent counter; synth_1to1_3dnowex rewrites it as (bo - 1) & 15 */ | |
736 | 58 .text |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
59 /* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */ |
736 | 60 .globl synth_1to1_3dnowex |
61 synth_1to1_3dnowex: | |
62 subl $12,%esp /* reserve 12 bytes of stack scratch space */ | |
63 pushl %ebp | |
64 pushl %edi | |
65 pushl %esi | |
66 pushl %ebx /* four registers saved; restored in reverse order before ret */ | |
67 /* 12 + 16 bytes now sit below the return address, so stack arguments start at 32(%esp) */ | |
68 movl 32(%esp),%eax /* eax = 1st argument */ | |
69 movl 40(%esp),%esi /* esi = 3rd argument, used below as the store pointer */ | |
70 movl $0,%edi /* edi = 0; copied to eax as the return value at the end */ | |
71 movl bo,%ebp /* ebp = current value of bo */ | |
72 cmpl %edi,36(%esp) /* is the 2nd argument 0 ? */ | |
73 jne .L48 | |
74 decl %ebp /* 2nd argument is 0: bo = (bo - 1) & 15, written back */ | |
75 andl $15,%ebp | |
76 movl %ebp,bo | |
77 movl $buffs,%ecx /* ecx = buffs */ | |
78 jmp .L49 | |
79 .L48: | |
80 addl $2,%esi /* 2nd argument non-zero: store pointer += 2 bytes, bo left unchanged */ | |
81 movl $buffs+2176,%ecx /* ecx = buffs + 2176 */ | |
82 .L49: | |
83 testl $1,%ebp /* branch on the low bit of bo */ | |
84 je .L50 | |
85 movl %ecx,%ebx /* odd bo: ebx = ecx */ | |
86 movl %ebp,16(%esp) /* scratch slot = bo */ | |
87 pushl %eax /* push 1st argument; the scratch slot is now at 20(%esp) */ | |
88 movl 20(%esp),%edx | |
89 leal (%ebx,%edx,4),%eax /* eax = ebx + 4*bo */ | |
90 pushl %eax /* the scratch slot is now at 24(%esp) */ | |
91 movl 24(%esp),%eax | |
92 incl %eax | |
93 andl $15,%eax | |
94 leal 1088(,%eax,4),%eax | |
95 addl %ebx,%eax /* eax = ebx + 1088 + 4*((bo + 1) & 15) */ | |
96 jmp .L74 | |
97 .L50: | |
98 leal 1088(%ecx),%ebx /* even bo: ebx = ecx + 1088 */ | |
99 leal 1(%ebp),%edx | |
100 movl %edx,16(%esp) /* scratch slot = bo + 1 */ | |
101 pushl %eax /* push 1st argument */ | |
102 leal 1092(%ecx,%ebp,4),%eax /* eax = ecx + 1088 + 4*(bo + 1) */ | |
103 pushl %eax | |
104 leal (%ecx,%ebp,4),%eax /* eax = ecx + 4*bo */ | |
105 .L74: | |
106 pushl %eax /* third and last push before the call */ | |
107 call dct64_3dnowex | |
108 movl 16(%esp),%edx /* NOTE(review): no stack adjustment follows the call, so this offset (and the epilogue) is only right if dct64_3dnowex itself removes the three pushed arguments - confirm against its definition */ | |
109 leal 0(,%edx,4),%edx /* edx *= 4 */ | |
110 movl $decwin+64,%eax | |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
111 movl %eax,%ecx |
736 | 112 subl %edx,%ecx /* ecx = decwin + 64 - edx */ |
1053 | 113 movl $8,%ebp /* loop 1 runs 8 times */ |
114 prefetchw (%esi) | |
736 | 115 |
116 .L55: | |
763 | 117 /* per pass: mm0 sums eight packed products of 0..63(%ecx) with 0..63(%ebx); mm4 does the same for 128..191(%ecx) with 64..127(%ebx) */ |
859 | 118 movq (%ecx),%mm0 |
119 pfmul (%ebx),%mm0 | |
1053 | 120 movq 128(%ecx),%mm4 |
121 pfmul 64(%ebx),%mm4 | |
736 | 122 |
859 | 123 movq 8(%ecx),%mm1 |
124 pfmul 8(%ebx),%mm1 | |
125 pfadd %mm1,%mm0 | |
1053 | 126 movq 136(%ecx),%mm5 |
127 pfmul 72(%ebx),%mm5 | |
128 pfadd %mm5,%mm4 | |
736 | 129 |
859 | 130 movq 16(%ebx),%mm2 |
131 pfmul 16(%ecx),%mm2 | |
132 pfadd %mm2,%mm0 | |
1053 | 133 movq 80(%ebx),%mm6 |
134 pfmul 144(%ecx),%mm6 | |
135 pfadd %mm6,%mm4 | |
736 | 136 |
859 | 137 movq 24(%ecx),%mm3 |
138 pfmul 24(%ebx),%mm3 | |
139 pfadd %mm3,%mm0 | |
1053 | 140 movq 152(%ecx),%mm7 |
141 pfmul 88(%ebx),%mm7 | |
142 pfadd %mm7,%mm4 | |
736 | 143 |
1053 | 144 movq 32(%ebx),%mm1 |
145 pfmul 32(%ecx),%mm1 | |
146 pfadd %mm1,%mm0 | |
147 movq 96(%ebx),%mm5 | |
148 pfmul 160(%ecx),%mm5 | |
149 pfadd %mm5,%mm4 | |
736 | 150 |
1053 | 151 movq 40(%ecx),%mm2 |
152 pfmul 40(%ebx),%mm2 | |
153 pfadd %mm2,%mm0 | |
154 movq 168(%ecx),%mm6 | |
155 pfmul 104(%ebx),%mm6 | |
156 pfadd %mm6,%mm4 | |
736 | 157 |
1053 | 158 movq 48(%ebx),%mm3 |
159 pfmul 48(%ecx),%mm3 | |
160 pfadd %mm3,%mm0 | |
161 movq 112(%ebx),%mm7 | |
162 pfmul 176(%ecx),%mm7 | |
163 pfadd %mm7,%mm4 | |
736 | 164 |
1053 | 165 movq 56(%ecx),%mm1 |
166 pfmul 56(%ebx),%mm1 | |
167 pfadd %mm1,%mm0 | |
168 movq 184(%ecx),%mm5 | |
169 pfmul 120(%ebx),%mm5 | |
170 pfadd %mm5,%mm4 | |
736 | 171 |
1053 | 172 pfnacc %mm4, %mm0 /* mm0 = (mm0.lo - mm0.hi, mm4.lo - mm4.hi) */ |
173 movq (%esi), %mm1 /* fetch the 8 bytes currently at the store pointer */ | |
174 pf2id %mm0, %mm0 /* packed float -> packed int32 */ | |
175 pand one_null, %mm1 /* keep only the one_null bits of the old data */ | |
176 psrld $16,%mm0 /* bring the upper 16 bits of each result down */ | |
177 pand null_one, %mm0 | |
178 por %mm0, %mm1 /* merge the new 16-bit values into the old data */ | |
179 movq %mm1,(%esi) | |
180 | |
181 addl $128,%ebx | |
182 addl $256,%ecx | |
183 addl $8,%esi | |
736 | 184 decl %ebp |
185 jnz .L55 | |
186 | |
187 / --- end of loop 1 --- | |
188 | |
1053 | 189 prefetchw (%esi) /* prefetching for writing this block and next loop */ |
190 | |
859 | 191 movd (%ecx),%mm0 /* movd loads one 32-bit value per step here; the upper half of the register is zero */ |
192 pfmul (%ebx),%mm0 | |
736 | 193 |
194 movd 8(%ebx),%mm1 | |
763 | 195 pfmul 8(%ecx),%mm1 |
859 | 196 pfadd %mm1,%mm0 |
736 | 197 |
859 | 198 movd 16(%ebx),%mm2 |
199 pfmul 16(%ecx),%mm2 | |
200 pfadd %mm2,%mm0 | |
736 | 201 |
859 | 202 movd 24(%ebx),%mm3 |
203 pfmul 24(%ecx),%mm3 | |
204 pfadd %mm3,%mm0 | |
736 | 205 |
859 | 206 movd 32(%ebx),%mm4 |
207 pfmul 32(%ecx),%mm4 | |
208 pfadd %mm4,%mm0 | |
736 | 209 |
859 | 210 movd 40(%ebx),%mm5 |
211 pfmul 40(%ecx),%mm5 | |
212 pfadd %mm5,%mm0 | |
736 | 213 |
859 | 214 movd 48(%ebx),%mm6 |
215 pfmul 48(%ecx),%mm6 | |
216 pfadd %mm6,%mm0 | |
736 | 217 |
859 | 218 movd 56(%ebx),%mm7 |
219 pfmul 56(%ecx),%mm7 | |
220 pfadd %mm7,%mm0 | |
736 | 221 |
859 | 222 pf2id %mm0,%mm0 |
223 movd %mm0,%eax | |
736 | 224 |
225 sar $16,%eax /* keep the upper 16 bits of the converted value */ | |
226 | |
227 movw %ax,(%esi) /* store a single 16-bit value */ | |
228 | |
1053 | 229 subl $64,%ebx |
736 | 230 addl $4,%esi |
231 addl $256,%ecx | |
1053 | 232 movl $7,%ebp /* loop 2 runs 7 times */ |
736 | 233 |
234 .L68: | |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
235 pxor %mm0, %mm0 /* both accumulators start at zero; every product is subtracted */ |
1053 | 236 pxor %mm4, %mm4 |
237 | |
238 movq (%ecx),%mm1 | |
239 pfmul (%ebx),%mm1 | |
240 pfsub %mm1,%mm0 | |
241 movq 128(%ecx),%mm5 | |
242 pfmul -64(%ebx),%mm5 | |
243 pfsub %mm5,%mm4 | |
244 | |
245 movq 8(%ecx),%mm2 | |
246 pfmul 8(%ebx),%mm2 | |
247 pfsub %mm2,%mm0 | |
248 movq 136(%ecx),%mm6 | |
249 pfmul -56(%ebx),%mm6 | |
250 pfsub %mm6,%mm4 | |
251 | |
252 movq 16(%ecx),%mm3 | |
253 pfmul 16(%ebx),%mm3 | |
254 pfsub %mm3,%mm0 | |
255 movq 144(%ecx),%mm7 | |
256 pfmul -48(%ebx),%mm7 | |
257 pfsub %mm7,%mm4 | |
258 | |
259 movq 24(%ecx),%mm1 | |
260 pfmul 24(%ebx),%mm1 | |
261 pfsub %mm1,%mm0 | |
262 movq 152(%ecx),%mm5 | |
263 pfmul -40(%ebx),%mm5 | |
264 pfsub %mm5,%mm4 | |
265 | |
266 movq 32(%ecx),%mm2 | |
267 pfmul 32(%ebx),%mm2 | |
268 pfsub %mm2,%mm0 | |
269 movq 160(%ecx),%mm6 | |
270 pfmul -32(%ebx),%mm6 | |
271 pfsub %mm6,%mm4 | |
272 | |
273 movq 40(%ecx),%mm3 | |
274 pfmul 40(%ebx),%mm3 | |
275 pfsub %mm3,%mm0 | |
276 movq 168(%ecx),%mm7 | |
277 pfmul -24(%ebx),%mm7 | |
278 pfsub %mm7,%mm4 | |
279 | |
280 movq 48(%ecx),%mm1 | |
281 pfmul 48(%ebx),%mm1 | |
282 pfsub %mm1,%mm0 | |
283 movq 176(%ecx),%mm5 | |
284 pfmul -16(%ebx),%mm5 | |
285 pfsub %mm5,%mm4 | |
286 | |
287 movq 56(%ecx),%mm2 | |
288 pfmul 56(%ebx),%mm2 | |
289 pfsub %mm2,%mm0 | |
290 movq 184(%ecx),%mm6 | |
291 pfmul -8(%ebx),%mm6 | |
292 pfsub %mm6,%mm4 | |
293 | |
294 pfacc %mm4,%mm0 /* mm0 = (mm0.lo + mm0.hi, mm4.lo + mm4.hi) */ | |
295 movq (%esi), %mm1 /* same 16-bit merge-and-store sequence as in loop 1 */ | |
296 pf2id %mm0, %mm0 | |
297 pand one_null, %mm1 | |
298 psrld $16,%mm0 | |
299 pand null_one, %mm0 | |
300 por %mm0, %mm1 | |
301 movq %mm1,(%esi) | |
302 | |
303 subl $128,%ebx /* ebx steps backwards in this loop */ | |
304 addl $256,%ecx | |
305 addl $8,%esi | |
306 decl %ebp | |
307 jnz .L68 | |
308 | |
309 / --- end of loop 2 | |
310 | |
311 pxor %mm0, %mm0 /* tail: one more negated sum, producing a single value */ | |
736 | 312 |
859 | 313 movq (%ecx),%mm1 |
314 pfmul (%ebx),%mm1 | |
315 pfsub %mm1,%mm0 | |
316 | |
317 movq 8(%ecx),%mm2 | |
318 pfmul 8(%ebx),%mm2 | |
736 | 319 pfsub %mm2,%mm0 |
320 | |
859 | 321 movq 16(%ecx),%mm3 |
322 pfmul 16(%ebx),%mm3 | |
323 pfsub %mm3,%mm0 | |
736 | 324 |
325 movq 24(%ecx),%mm4 | |
763 | 326 pfmul 24(%ebx),%mm4 |
736 | 327 pfsub %mm4,%mm0 |
328 | |
859 | 329 movq 32(%ecx),%mm5 |
330 pfmul 32(%ebx),%mm5 | |
331 pfsub %mm5,%mm0 | |
736 | 332 |
859 | 333 movq 40(%ecx),%mm6 |
334 pfmul 40(%ebx),%mm6 | |
335 pfsub %mm6,%mm0 | |
736 | 336 |
859 | 337 movq 48(%ecx),%mm7 |
338 pfmul 48(%ebx),%mm7 | |
339 pfsub %mm7,%mm0 | |
736 | 340 |
859 | 341 movq 56(%ecx),%mm1 |
342 pfmul 56(%ebx),%mm1 | |
343 pfsub %mm1,%mm0 | |
736 | 344 |
345 pfacc %mm0,%mm0 /* add the two halves of mm0 together */ | |
346 | |
347 pf2id %mm0,%mm0 | |
348 movd %mm0,%eax | |
349 | |
350 sar $16,%eax | |
351 | |
352 movw %ax,(%esi) /* final 16-bit store */ | |
353 | |
354 femms /* leave the MMX/3DNow! state before returning */ | |
355 | |
356 movl %edi,%eax /* return value = edi, which this routine only ever sets to 0 */ | |
357 popl %ebx | |
358 popl %esi | |
359 popl %edi | |
360 popl %ebp | |
361 addl $12,%esp /* release the 12-byte scratch space */ | |
362 ret |