Mercurial > mplayer.hg
annotate mp3lib/decode_k7.s @ 1135:152c7c71a29c
loops alignment
author | nickols_k |
---|---|
date | Sat, 16 Jun 2001 15:24:02 +0000 |
parents | 2f0265763322 |
children |
rev | line source |
---|---|
736 | 1 /// |
2 /// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support | |
3 /// | |
4 /// This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama | |
5 /// <squash@mb.kcom.ne.jp>,only some types of changes have been made: | |
6 /// | |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
7 /// - Added new opcode PFNACC |
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
8 /// - decreased number of opcodes (as it was suggested by k7 manual) |
763 | 9 /// (using memory reference as operand of instructions) |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
10 /// - added PREFETCHW opcode. It has different semantic on k7 than on k6-2 |
1053 | 11 /// and saves 15-25 cpu clocks for athlon. |
12 /// - partial unrolling loops for removing slower MOVW insns. | |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
13 /// (Note: probably same operation should be done for decode_3dnow.s) |
736 | 14 /// - change function name for support 3DNowEx! automatic detect |
1135 | 15 /// - added loops alignment |
736 | 16 /// |
17 /// note: because K7 processors are aggressive out-of-order three-way | |
18 /// superscalar ones, instruction order is not significant for them. | |
19 /// | |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
20 /// Benchmark: measured by mplayer on Duron-700: |
1067 | 21 /// 3dNow! optimized code - 1.4% of cpu usage |
22 /// k7 optimized code (without partial loop unrolling) - 1.3% of cpu usage | |
23 /// k7 optimized code - 1.1% of cpu usage | |
1054
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
24 /// Note: K6-2 users have a chance to benefit from partial loop unrolling |
9b0bb5c805b2
Last minute improvements before release. Added banchmark
nickols_k
parents:
1053
diff
changeset
|
25 /// |
736 | 26 /// Modified by Nick Kurshev <nickols_k@mail.ru> |
27 /// | |
28 / synth_1to1_3dnow works the same way as the c version of | |
29 / synth_1to1. This assembler code is based on 'decode-i586.s' | |
30 / (by Stefan Bieschewski <stb@acm.org>), two types of changes | |
31 / have been made: | |
32 / - use {MMX,3DNow!} instruction for reduce cpu | |
33 / - remove unused(?) local symbols | |
34 / | |
35 / useful sources of information on optimizing 3DNow! code include: | |
36 / AMD 3DNow! Technology Manual (Publication #21928) | |
37 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf | |
38 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) | |
39 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) | |
40 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf | |
41 / | |
42 / This code was tested only on AMD-K6-2 processor Linux systems, | |
43 / please tell me: | |
44 / - whether this code works on other 3DNow! capable processors | |
45 / (ex.IDT-C6-2) or not | |
46 / - whether this code works on other OSes or not | |
47 / | |
48 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 | |
49 / <kim@comtec.co.jp> - after 1.Apr.1998 | |
50 | |
51 / Enhancements for q-word operation by Michael Hipp | |
52 | |
/*
 * Static storage used by synth_1to1_3dnowex.
 *
 *   buffs    - 4352 bytes, 4-byte aligned, zero-initialised common symbol.
 *              The routine uses buffs+0 when its channel argument is 0 and
 *              buffs+2176 otherwise; inside each half it addresses two
 *              sub-areas at +0 and +1088.
 *   null_one - per-dword mask keeping the low 16 bits  (0x0000ffff x 2).
 *   one_null - per-dword mask keeping the high 16 bits (0xffff0000 x 2).
 *   bo       - rolling offset; decremented modulo 16 on every channel-0 call.
 *
 * .p2align is used instead of .align because GAS interprets the .align
 * operand as a byte count on some targets (i386 ELF) and as a power of two
 * on others (i386 a.out); .p2align 3 always means 8-byte alignment, which
 * the two 64-bit masks need for aligned pand access.
 */
.bss
        .comm   buffs,4352,4
.data
        .p2align 3
null_one:       .long 0x0000ffff, 0x0000ffff
one_null:       .long 0xffff0000, 0xffff0000
bo:             .long 1
.text
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
/*
 * int synth_1to1(real *bandPtr, int channel, unsigned char *out)
 *
 * Exported as synth_1to1_3dnowex.  32-bit x86, AT&T syntax, stack-passed
 * arguments; uses MMX, 3DNow! and the 3DNow! extensions (pfnacc, prefetchw).
 *
 * Stack layout after the prologue (12 bytes of locals + 4 saved registers):
 *     0..12(%esp)  saved ebx, esi, edi, ebp
 *     16(%esp)     local: row offset "bo1" chosen below
 *     28(%esp)     return address
 *     32(%esp)     bandPtr
 *     36(%esp)     channel
 *     40(%esp)     out
 *
 * Register roles in the synthesis loops:
 *     ebx  pointer into the dct64 output half selected below
 *     ecx  pointer into decwin (starts at decwin + 64 - 4*bo1)
 *     esi  output pointer; one 16-bit sample is stored every 4 bytes
 *     ebp  loop counter
 *     edi  return value, constant 0
 *
 * Output: 32 samples (16 from loop 1, 1 middle, 14 from loop 2, 1 final),
 * each taken as bits 31..16 of the pf2id result.  No clipping/saturation is
 * done here - presumably decwin is pre-scaled so that this is the intended
 * fixed-point conversion; verify against the decwin initialisation.
 *
 * Returns 0 in eax.  Saves/restores ebx, esi, edi, ebp; leaves the MMX
 * state via femms before returning.
 *
 * Loop heads use .p2align 4 rather than .align 16: GAS treats the .align
 * operand as bytes on some targets and as a power of two on others, whereas
 * .p2align 4 is always 16-byte alignment.
 */
.globl synth_1to1_3dnowex
synth_1to1_3dnowex:
        subl    $12,%esp
        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx

        movl    32(%esp),%eax           /* eax = bandPtr */
        movl    40(%esp),%esi           /* esi = out */
        movl    $0,%edi                 /* edi = 0: return value, also the compare operand below */
        movl    bo,%ebp                 /* ebp = current rolling offset */
        cmpl    %edi,36(%esp)           /* channel == 0 ? */
        jne     .L48
        /* channel 0: step bo backwards modulo 16 and use the first half of buffs */
        decl    %ebp
        andl    $15,%ebp
        movl    %ebp,bo
        movl    $buffs,%ecx
        jmp     .L49
.L48:
        /* other channel: bo is left untouched; write samples 2 bytes further
           into out and use the second half of buffs */
        addl    $2,%esi
        movl    $buffs+2176,%ecx
.L49:
        testl   $1,%ebp
        je      .L50
        /* bo odd: ebx = buf, bo1 = bo.
           Pushed in order: bandPtr, buf + 4*bo, buf + 1088 + 4*((bo+1)&15) */
        movl    %ecx,%ebx
        movl    %ebp,16(%esp)           /* local bo1 = bo */
        pushl   %eax
        movl    20(%esp),%edx           /* same local, shifted by the one push above */
        leal    (%ebx,%edx,4),%eax
        pushl   %eax
        movl    24(%esp),%eax           /* same local, shifted by two pushes */
        incl    %eax
        andl    $15,%eax
        leal    1088(,%eax,4),%eax
        addl    %ebx,%eax
        jmp     .L74
.L50:
        /* bo even: ebx = buf + 1088, bo1 = bo + 1.
           Pushed in order: bandPtr, buf + 1088 + 4*(bo+1), buf + 4*bo */
        leal    1088(%ecx),%ebx
        leal    1(%ebp),%edx
        movl    %edx,16(%esp)           /* local bo1 = bo + 1 */
        pushl   %eax
        leal    1092(%ecx,%ebp,4),%eax
        pushl   %eax
        leal    (%ecx,%ebp,4),%eax
.L74:
        pushl   %eax
        call    dct64_3dnowex
        /* NOTE(review): there is no "addl $12,%esp" after the call, yet the
           16(%esp) load below and the epilogue both assume esp is back at its
           post-prologue value.  That only holds if dct64_3dnowex removes its
           three arguments itself (e.g. "ret $12") - confirm against the
           callee before touching this sequence. */
        movl    16(%esp),%edx           /* edx = bo1 */
        leal    0(,%edx,4),%edx         /* edx = 4*bo1 (byte offset) */
        movl    $decwin+64,%eax
        movl    %eax,%ecx
        subl    %edx,%ecx               /* ecx = decwin + 64 - 4*bo1 */
        movl    $8,%ebp                 /* loop 1: 8 iterations, 2 samples each */
        prefetchw (%esi)
        .p2align 4
.L55:
        /* Two rows per iteration.  mm0 accumulates row A (ecx+0.., ebx+0..),
           mm4 accumulates row B (ecx+128.., ebx+64..).  Each register holds
           two partial sums: low lane = even-index products, high lane =
           odd-index products. */
        movq    (%ecx),%mm0
        pfmul   (%ebx),%mm0
        movq    128(%ecx),%mm4
        pfmul   64(%ebx),%mm4

        movq    8(%ecx),%mm1
        pfmul   8(%ebx),%mm1
        pfadd   %mm1,%mm0
        movq    136(%ecx),%mm5
        pfmul   72(%ebx),%mm5
        pfadd   %mm5,%mm4

        movq    16(%ebx),%mm2
        pfmul   16(%ecx),%mm2
        pfadd   %mm2,%mm0
        movq    80(%ebx),%mm6
        pfmul   144(%ecx),%mm6
        pfadd   %mm6,%mm4

        movq    24(%ecx),%mm3
        pfmul   24(%ebx),%mm3
        pfadd   %mm3,%mm0
        movq    152(%ecx),%mm7
        pfmul   88(%ebx),%mm7
        pfadd   %mm7,%mm4

        movq    32(%ebx),%mm1
        pfmul   32(%ecx),%mm1
        pfadd   %mm1,%mm0
        movq    96(%ebx),%mm5
        pfmul   160(%ecx),%mm5
        pfadd   %mm5,%mm4

        movq    40(%ecx),%mm2
        pfmul   40(%ebx),%mm2
        pfadd   %mm2,%mm0
        movq    168(%ecx),%mm6
        pfmul   104(%ebx),%mm6
        pfadd   %mm6,%mm4

        movq    48(%ebx),%mm3
        pfmul   48(%ecx),%mm3
        pfadd   %mm3,%mm0
        movq    112(%ebx),%mm7
        pfmul   176(%ecx),%mm7
        pfadd   %mm7,%mm4

        movq    56(%ecx),%mm1
        pfmul   56(%ebx),%mm1
        pfadd   %mm1,%mm0
        movq    184(%ecx),%mm5
        pfmul   120(%ebx),%mm5
        pfadd   %mm5,%mm4

        /* mm0 = { A.low - A.high, B.low - B.high }: even minus odd products */
        pfnacc  %mm4, %mm0
        movq    (%esi), %mm1            /* current contents of two 32-bit output slots */
        pf2id   %mm0, %mm0
        pand    one_null, %mm1          /* keep the upper 16 bits of each slot untouched */
        psrld   $16,%mm0                /* sample = bits 31..16 of each integer */
        pand    null_one, %mm0
        por     %mm0, %mm1              /* merge new samples into the low 16 bits */
        movq    %mm1,(%esi)

        addl    $128,%ebx
        addl    $256,%ecx
        addl    $8,%esi
        decl    %ebp
        jnz     .L55

/* --- end of loop 1 --- */

        prefetchw (%esi)        /* prefetching for writing this block and next loop */

        /* Middle sample: only the low lane is used (movd loads one float and
           movd %mm0,%eax reads one back), i.e. the products at byte offsets
           0, 8, ..., 56 are summed. */
        movd    (%ecx),%mm0
        pfmul   (%ebx),%mm0

        movd    8(%ebx),%mm1
        pfmul   8(%ecx),%mm1
        pfadd   %mm1,%mm0

        movd    16(%ebx),%mm2
        pfmul   16(%ecx),%mm2
        pfadd   %mm2,%mm0

        movd    24(%ebx),%mm3
        pfmul   24(%ecx),%mm3
        pfadd   %mm3,%mm0

        movd    32(%ebx),%mm4
        pfmul   32(%ecx),%mm4
        pfadd   %mm4,%mm0

        movd    40(%ebx),%mm5
        pfmul   40(%ecx),%mm5
        pfadd   %mm5,%mm0

        movd    48(%ebx),%mm6
        pfmul   48(%ecx),%mm6
        pfadd   %mm6,%mm0

        movd    56(%ebx),%mm7
        pfmul   56(%ecx),%mm7
        pfadd   %mm7,%mm0

        pf2id   %mm0,%mm0
        movd    %mm0,%eax

        sar     $16,%eax                /* keep bits 31..16 as the 16-bit sample */

        movw    %ax,(%esi)

        subl    $64,%ebx                /* from here on ebx walks backwards */
        addl    $4,%esi
        addl    $256,%ecx
        movl    $7,%ebp                 /* loop 2: 7 iterations, 2 samples each */
        .p2align 4
.L68:
        /* Two rows per iteration, products subtracted from zero.
           mm0: row A (ecx+0.., ebx+0..); mm4: row B (ecx+128.., ebx-64..). */
        pxor    %mm0, %mm0
        pxor    %mm4, %mm4

        movq    (%ecx),%mm1
        pfmul   (%ebx),%mm1
        pfsub   %mm1,%mm0
        movq    128(%ecx),%mm5
        pfmul   -64(%ebx),%mm5
        pfsub   %mm5,%mm4

        movq    8(%ecx),%mm2
        pfmul   8(%ebx),%mm2
        pfsub   %mm2,%mm0
        movq    136(%ecx),%mm6
        pfmul   -56(%ebx),%mm6
        pfsub   %mm6,%mm4

        movq    16(%ecx),%mm3
        pfmul   16(%ebx),%mm3
        pfsub   %mm3,%mm0
        movq    144(%ecx),%mm7
        pfmul   -48(%ebx),%mm7
        pfsub   %mm7,%mm4

        movq    24(%ecx),%mm1
        pfmul   24(%ebx),%mm1
        pfsub   %mm1,%mm0
        movq    152(%ecx),%mm5
        pfmul   -40(%ebx),%mm5
        pfsub   %mm5,%mm4

        movq    32(%ecx),%mm2
        pfmul   32(%ebx),%mm2
        pfsub   %mm2,%mm0
        movq    160(%ecx),%mm6
        pfmul   -32(%ebx),%mm6
        pfsub   %mm6,%mm4

        movq    40(%ecx),%mm3
        pfmul   40(%ebx),%mm3
        pfsub   %mm3,%mm0
        movq    168(%ecx),%mm7
        pfmul   -24(%ebx),%mm7
        pfsub   %mm7,%mm4

        movq    48(%ecx),%mm1
        pfmul   48(%ebx),%mm1
        pfsub   %mm1,%mm0
        movq    176(%ecx),%mm5
        pfmul   -16(%ebx),%mm5
        pfsub   %mm5,%mm4

        movq    56(%ecx),%mm2
        pfmul   56(%ebx),%mm2
        pfsub   %mm2,%mm0
        movq    184(%ecx),%mm6
        pfmul   -8(%ebx),%mm6
        pfsub   %mm6,%mm4

        /* mm0 = { A.low + A.high, B.low + B.high } */
        pfacc   %mm4,%mm0
        movq    (%esi), %mm1            /* same read-modify-write store as in loop 1 */
        pf2id   %mm0, %mm0
        pand    one_null, %mm1
        psrld   $16,%mm0
        pand    null_one, %mm0
        por     %mm0, %mm1
        movq    %mm1,(%esi)

        subl    $128,%ebx
        addl    $256,%ecx
        addl    $8,%esi
        decl    %ebp
        jnz     .L68

/* --- end of loop 2 --- */

        /* Final sample: one more row, all 16 products subtracted from zero. */
        pxor    %mm0, %mm0

        movq    (%ecx),%mm1
        pfmul   (%ebx),%mm1
        pfsub   %mm1,%mm0

        movq    8(%ecx),%mm2
        pfmul   8(%ebx),%mm2
        pfsub   %mm2,%mm0

        movq    16(%ecx),%mm3
        pfmul   16(%ebx),%mm3
        pfsub   %mm3,%mm0

        movq    24(%ecx),%mm4
        pfmul   24(%ebx),%mm4
        pfsub   %mm4,%mm0

        movq    32(%ecx),%mm5
        pfmul   32(%ebx),%mm5
        pfsub   %mm5,%mm0

        movq    40(%ecx),%mm6
        pfmul   40(%ebx),%mm6
        pfsub   %mm6,%mm0

        movq    48(%ecx),%mm7
        pfmul   48(%ebx),%mm7
        pfsub   %mm7,%mm0

        movq    56(%ecx),%mm1
        pfmul   56(%ebx),%mm1
        pfsub   %mm1,%mm0

        pfacc   %mm0,%mm0               /* low lane = low + high */

        pf2id   %mm0,%mm0
        movd    %mm0,%eax

        sar     $16,%eax

        movw    %ax,(%esi)

        femms                           /* leave MMX/3DNow! state before returning */

        movl    %edi,%eax               /* return 0 */
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        addl    $12,%esp
        ret