Mercurial > mplayer.hg
annotate mp3lib/decode_k7.s @ 837:3de4dc1d1d12
"-vo sdl -dga" corrected to "-vo sdl -sdl dga"
author | gabucino |
---|---|
date | Fri, 18 May 2001 20:48:21 +0000 |
parents | ee303142c2e0 |
children | 3560d38486ab |
rev | line source |
---|---|
736 | 1 /// |
2 /// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support | |
3 /// | |
4 /// This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama | |
5 /// <squash@mb.kcom.ne.jp>; only the following changes have been made: | |
6 /// | |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
7 /// - Added new opcode PFNACC |
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
8 /// - decreased number of opcodes (as it was suggested by k7 manual) |
763 | 9 /// (using memory reference as operand of instructions) |
736 | 10 /// - changed the function name to support automatic 3DNowEx! detection |
11 /// | |
12 /// note: because K7 processors are aggressive out-of-order three-way | |
13 /// superscalar ones, instruction order is not significant for them. | |
14 /// | |
15 /// Modified by Nick Kurshev <nickols_k@mail.ru> | |
16 /// | |
17 / synth_1to1_3dnow works the same way as the C version of | |
18 / synth_1to1. This assembler code is based on 'decode-i586.s' | |
19 / (by Stefan Bieschewski <stb@acm.org>); two types of changes | |
20 / have been made: | |
21 / - use {MMX,3DNow!} instructions to reduce CPU load | |
22 / - remove unused(?) local symbols | |
23 / | |
24 / useful sources of information on optimizing 3DNow! code include: | |
25 / AMD 3DNow! Technology Manual (Publication #21928) | |
26 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf | |
27 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) | |
28 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) | |
29 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf | |
30 / | |
31 / This code was tested only on Linux systems with an AMD-K6-2 processor, | |
32 / please tell me: | |
33 / - whether this code works on other 3DNow! capable processors | |
34 / (ex.IDT-C6-2) or not | |
35 / - whether this code works on other OSes or not | |
36 / | |
37 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 | |
38 / <kim@comtec.co.jp> - after 1.Apr.1998 | |
39 | |
40 / Enhancements for q-word operations by Michael Hipp | |
41 | |
/*
 * Storage used by synth_1to1_3dnowex.
 *
 *   buffs - 4352-byte common block, 4-byte aligned.  The routine uses the
 *           half starting at buffs when its channel argument is 0 and the
 *           half starting at buffs+2176 otherwise.
 *   bo    - 32-bit counter, initial value 1.  The routine decrements it
 *           modulo 16 on every call made with channel == 0.
 */
	.bss
	.comm	buffs,4352,4

	.data
	.align	4
bo:	.long	1

	.text
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
/*
 * int synth_1to1(real *bandPtr, int channel, unsigned char *out)
 * exported as synth_1to1_3dnowex
 *
 * AT&T syntax, 32-bit x86, all arguments passed on the stack.
 *
 * Frame after the prologue (12 bytes of locals + 4 saved registers):
 *    0(%esp) .. 12(%esp)  saved %ebx, %esi, %edi, %ebp
 *   16(%esp)              local: window index, reloaded after the dct64 call
 *   32(%esp)              bandPtr
 *   36(%esp)              channel
 *   40(%esp)              out
 *
 * Register roles in the three output stages:
 *   %ebx  pointer into the buffer half filled by dct64_3dnowex
 *   %ecx  pointer into the decwin table
 *   %esi  output pointer; one 16-bit store every 4 bytes
 *   %ebp  loop counter
 *   %edi  return value; cleared on entry and never written again here,
 *         so this routine always returns 0
 *
 * NOTE(review): no stack adjustment follows the call to dct64_3dnowex.
 * The 16(%esp) access after the call and the epilogue only line up if the
 * callee removes its three arguments itself (e.g. "ret $12") and preserves
 * %ebx/%esi/%edi -- confirm against dct64_3dnowex.
 */
	.globl	synth_1to1_3dnowex
synth_1to1_3dnowex:
	subl	$12,%esp
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	32(%esp),%eax		/* eax = bandPtr */
	movl	40(%esp),%esi		/* esi = out */
	xorl	%edi,%edi		/* edi = 0 (returned at the end) */
	movl	bo,%ebp
	cmpl	$0,36(%esp)		/* channel == 0 ? */
	jnz	.Lsecond_channel

	/* channel 0: step bo backwards modulo 16 and use the first half */
	decl	%ebp
	andl	$15,%ebp
	movl	%ebp,bo
	movl	$buffs,%ecx
	jmp	.Lpick_layout

.Lsecond_channel:
	/* other channel: bo is left alone, output starts 2 bytes further on */
	addl	$2,%esi
	movl	$buffs+2176,%ecx

.Lpick_layout:
	testl	$1,%ebp
	jz	.Lbo_even

	/* bo odd: read from buf+0, index = bo */
	movl	%ecx,%ebx
	movl	%ebp,16(%esp)
	pushl	%eax			/* 3rd argument: bandPtr */
	movl	%ebp,%edx
	leal	(%ebx,%ebp,4),%eax
	pushl	%eax			/* 2nd argument: buf + 4*bo */
	leal	1(%ebp),%eax
	andl	$15,%eax
	leal	1088(%ebx,%eax,4),%eax	/* 1st: buf + 1088 + 4*((bo+1)&15) */
	jmp	.Lrun_dct

.Lbo_even:
	/* bo even: read from buf+1088, index = bo+1 */
	leal	1088(%ecx),%ebx
	leal	1(%ebp),%edx
	movl	%edx,16(%esp)
	pushl	%eax			/* 3rd argument: bandPtr */
	leal	4(%ebx,%ebp,4),%eax
	pushl	%eax			/* 2nd argument: buf + 1088 + 4*(bo+1) */
	leal	(%ecx,%ebp,4),%eax	/* 1st argument: buf + 4*bo */

.Lrun_dct:
	pushl	%eax
	call	dct64_3dnowex

	/* ecx = decwin + 64 - 4*index (byte offsets) */
	movl	16(%esp),%edx
	shll	$2,%edx
	movl	$decwin+64,%ecx
	subl	%edx,%ecx
	movl	$16,%ebp

	/*
	 * Stage 1, 16 samples.  Sixteen window/buffer products are summed
	 * two lanes at a time in %mm4; PFNACC then yields (low lane - high
	 * lane), i.e. even-indexed products minus odd-indexed ones.
	 */
.Lfirst_half:
	movq	(%ecx),%mm4
	movq	8(%ecx),%mm0
	pfmul	(%ebx),%mm4

	pfmul	8(%ebx),%mm0
	movq	16(%ebx),%mm3
	pfadd	%mm0,%mm4

	movq	24(%ecx),%mm0
	pfmul	16(%ecx),%mm3
	pfadd	%mm3,%mm4

	pfmul	24(%ebx),%mm0
	movq	32(%ebx),%mm3
	pfadd	%mm0,%mm4

	movq	40(%ecx),%mm0
	pfmul	32(%ecx),%mm3
	pfadd	%mm3,%mm4

	pfmul	40(%ebx),%mm0
	movq	48(%ebx),%mm3
	pfadd	%mm0,%mm4

	movq	56(%ecx),%mm0
	pfmul	48(%ecx),%mm3
	pfadd	%mm3,%mm4

	pfmul	56(%ebx),%mm0
	pfadd	%mm0,%mm4

	pfnacc	%mm4,%mm4

	pf2id	%mm4,%mm4		/* float -> 32-bit integer */
	movd	%mm4,%eax
	sarl	$16,%eax		/* keep the upper 16 bits as the sample */
	movw	%ax,(%esi)

	addl	$4,%esi
	addl	$64,%ebx
	subl	$-128,%ecx		/* +128; -128 fits a one-byte immediate */
	decl	%ebp
	jnz	.Lfirst_half

	/*
	 * Stage 2, one sample: only every second entry is used, so the
	 * scalar (low lane) result of eight products is summed in %mm2.
	 */
	movd	(%ecx),%mm2
	pfmul	(%ebx),%mm2

	movd	8(%ebx),%mm1
	pfmul	8(%ecx),%mm1
	pfadd	%mm1,%mm2

	movd	16(%ebx),%mm1
	pfmul	16(%ecx),%mm1
	pfadd	%mm1,%mm2

	movd	24(%ebx),%mm1
	pfmul	24(%ecx),%mm1
	pfadd	%mm1,%mm2

	movd	32(%ebx),%mm1
	pfmul	32(%ecx),%mm1
	pfadd	%mm1,%mm2

	movd	40(%ebx),%mm1
	pfmul	40(%ecx),%mm1
	pfadd	%mm1,%mm2

	movd	48(%ebx),%mm1
	pfmul	48(%ecx),%mm1
	pfadd	%mm1,%mm2

	movd	56(%ebx),%mm1
	pfmul	56(%ecx),%mm1
	pfadd	%mm1,%mm2

	pf2id	%mm2,%mm2
	movd	%mm2,%eax
	sarl	$16,%eax
	movw	%ax,(%esi)

	addl	$4,%esi
	subl	$64,%ebx		/* buffer pointer now walks backwards */
	addl	$256,%ecx
	movl	$15,%ebp

	/*
	 * Stage 3, 15 samples.  Every product is subtracted from a zeroed
	 * accumulator and PFACC adds the two lanes, giving minus the sum
	 * of all sixteen products.
	 */
.Lsecond_half:
	pxor	%mm0,%mm0

	movq	(%ecx),%mm2
	pfmul	(%ebx),%mm2
	pfsub	%mm2,%mm0

	movq	8(%ecx),%mm4
	pfmul	8(%ebx),%mm4
	pfsub	%mm4,%mm0

	movq	16(%ecx),%mm2
	pfmul	16(%ebx),%mm2
	pfsub	%mm2,%mm0

	movq	24(%ecx),%mm4
	pfmul	24(%ebx),%mm4
	pfsub	%mm4,%mm0

	movq	32(%ecx),%mm2
	pfmul	32(%ebx),%mm2
	pfsub	%mm2,%mm0

	movq	40(%ecx),%mm4
	pfmul	40(%ebx),%mm4
	pfsub	%mm4,%mm0

	movq	48(%ecx),%mm2
	pfmul	48(%ebx),%mm2
	pfsub	%mm2,%mm0

	movq	56(%ecx),%mm4
	pfmul	56(%ebx),%mm4
	pfsub	%mm4,%mm0

	pfacc	%mm0,%mm0

	pf2id	%mm0,%mm0
	movd	%mm0,%eax
	sarl	$16,%eax
	movw	%ax,(%esi)

	addl	$4,%esi
	subl	$64,%ebx
	subl	$-128,%ecx
	decl	%ebp
	jnz	.Lsecond_half

	femms				/* leave MMX/3DNow! state before returning */

	movl	%edi,%eax		/* return value (0) */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$12,%esp
	ret