Mercurial > mplayer.hg
annotate mp3lib/decode_k7.s @ 988:c6f88600d409
Enable to avoid checking version of gcc. New tests of as
author | nickols_k |
---|---|
date | Mon, 04 Jun 2001 09:38:18 +0000 |
parents | 3560d38486ab |
children | 970fbd433564 |
rev | line source |
---|---|
736 | 1 /// |
2 /// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support | |
3 /// | |
4 /// This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama | |
5 /// <squash@mb.kcom.ne.jp>; only the following changes have been made: | |
6 /// | |
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
7 /// - Added new opcode PFNACC |
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
8 /// - decreased number of opcodes (as it was suggested by k7 manual) |
763 | 9 /// (using memory reference as operand of instructions) |
736 | 10 /// - changed the function name to support 3DNowEx! automatic detection |
11 /// | |
12 /// note: because K7 processors are aggressive out-of-order three-way | |
13 /// superscalar ones, instruction order is not significant for them. | |
14 /// | |
15 /// Modified by Nick Kurshev <nickols_k@mail.ru> | |
16 /// | |
17 / synth_1to1_3dnow works the same way as the C version of | |
18 / synth_1to1. This assembler code is based on 'decode-i586.s' | |
19 / (by Stefan Bieschewski <stb@acm.org>), two types of changes | |
20 / have been made: | |
21 / - use {MMX,3DNow!} instructions to reduce CPU usage | |
22 / - remove unused(?) local symbols | |
23 / | |
24 / useful sources of information on optimizing 3DNow! code include: | |
25 / AMD 3DNow! Technology Manual (Publication #21928) | |
26 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf | |
27 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) | |
28 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) | |
29 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf | |
30 / | |
31 / This code was tested only on AMD-K6-2 processor Linux systems, | |
32 / please tell me: | |
33 / - whether this code works on other 3DNow! capable processors | |
34 / (ex.IDT-C6-2) or not | |
35 / - whether this code works on other OSes or not | |
36 / | |
37 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 | |
38 / <kim@comtec.co.jp> - after 1.Apr.1998 | |
39 | |
40 / Enhancements for q-word operation by Michael Hipp | |
41 | |
/*
 * Static state of the synthesis filter (GAS, AT&T syntax, 32-bit x86).
 *
 * buffs: 4352 bytes, 4-byte aligned, zero-initialised common storage.
 *        synth_1to1_3dnowex addresses it as two 2176-byte areas
 *        (buffs for channel 0, buffs+2176 otherwise), each of which it
 *        splits into two 1088-byte halves.
 * bo:    32-bit offset counter, initial value 1.  synth_1to1_3dnowex
 *        decrements it modulo 16 each time it is called for channel 0.
 */
	.bss
	.comm	buffs,4352,4

	.data
	.align	4
bo:
	.long	1

	.text
779
a349510321ff
slight improvements. k7 vs 3dnow already win 12 prcnts
nickols_k
parents:
763
diff
changeset
|
/*
 * int synth_1to1_3dnowex(real *bandPtr, int channel, unsigned char *out)
 *
 * Syntax: GAS, AT&T operand order, 32-bit x86; arguments on the stack.
 *
 * Runs dct64_3dnowex on bandPtr into the rolling buffers in `buffs`, then
 * writes 32 signed 16-bit samples to `out`, one every 4 bytes (when
 * channel != 0 the first sample goes to out+2).
 *
 * In:     4(%esp) bandPtr, 8(%esp) channel, 12(%esp) out  (at entry)
 * Out:    %eax = 0 always (%edi is zeroed once and never modified, so
 *         no clip count is reported)
 * Saves:  %ebx, %esi, %edi, %ebp
 * Clobb:  %eax, %ecx, %edx, %mm0-%mm7, flags; FEMMS is executed on exit
 * Uses:   buffs, bo, decwin, dct64_3dnowex
 *
 * Register roles after the dct64 call:
 *   %ebx  b0      pointer into the half of the buffer read by the window
 *   %ecx  window  pointer into decwin
 *   %esi  out     pointer to the next sample to be written
 *   %ebp          loop counter
 *
 * Stack frame after the prologue (28 bytes below the return address):
 *   0..15(%esp)   saved %ebx, %esi, %edi, %ebp
 *   16(%esp)      bo1 (local)
 *   32/36/40(%esp) bandPtr / channel / out
 */
.globl synth_1to1_3dnowex
synth_1to1_3dnowex:
	subl	$12,%esp
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	32(%esp),%eax		/* eax = bandPtr */
	movl	40(%esp),%esi		/* esi = out */
	xorl	%edi,%edi		/* edi = 0: channel test + return value */
	movl	bo,%ebp
	cmpl	%edi,36(%esp)		/* channel == 0 ? */
	jne	.Lchannel_nonzero

	decl	%ebp			/* channel 0: bo = (bo - 1) & 15 */
	andl	$15,%ebp
	movl	%ebp,bo
	movl	$buffs,%ecx		/* ecx = first 2176-byte area */
	jmp	.Lselect_halves

.Lchannel_nonzero:
	addl	$2,%esi			/* skip one 16-bit sample slot in out */
	movl	$buffs+2176,%ecx	/* ecx = second 2176-byte area */

.Lselect_halves:
	testl	$1,%ebp
	je	.Lbo_even

	/* bo odd: b0 = first half, bo1 = bo,
	 * dct64(second half + ((bo+1)&15), first half + bo, bandPtr) */
	movl	%ecx,%ebx
	movl	%ebp,16(%esp)		/* bo1 = bo */
	pushl	%eax			/* 3rd argument: bandPtr */
	leal	(%ecx,%ebp,4),%eax
	pushl	%eax			/* 2nd argument */
	leal	1(%ebp),%eax
	andl	$15,%eax
	leal	1088(%ecx,%eax,4),%eax	/* 1st argument, pushed below */
	jmp	.Lcall_dct64

.Lbo_even:
	/* bo even: b0 = second half, bo1 = bo + 1,
	 * dct64(first half + bo, second half + bo + 1, bandPtr) */
	leal	1088(%ecx),%ebx
	leal	1(%ebp),%edx
	movl	%edx,16(%esp)		/* bo1 = bo + 1 */
	pushl	%eax			/* 3rd argument: bandPtr */
	leal	1092(%ecx,%ebp,4),%eax
	pushl	%eax			/* 2nd argument */
	leal	(%ecx,%ebp,4),%eax	/* 1st argument, pushed below */

.Lcall_dct64:
	pushl	%eax
	call	dct64_3dnowex
	/* NOTE(review): there is no caller-side stack cleanup here, yet
	 * 16(%esp) below is the bo1 slot only if %esp is back at its
	 * pre-push value.  This relies on dct64_3dnowex removing its 12
	 * bytes of arguments itself (e.g. `ret $12`); that routine is not
	 * visible from this file - confirm. */
	movl	16(%esp),%edx		/* edx = bo1 */
	shll	$2,%edx			/* ... as a byte offset (4 bytes each) */
	movl	$decwin+64,%ecx
	subl	%edx,%ecx		/* window = decwin + 64 - 4*bo1 */
	movl	$16,%ebp

	/*
	 * Loop 1, 16 samples.  Eight packed multiplies of window[] by b0[]
	 * are accumulated in %mm0; PFNACC then leaves (low lane - high lane)
	 * in the low dword, i.e. products at even indices are added and
	 * products at odd indices are subtracted.  PF2ID converts to a
	 * 32-bit integer and the upper 16 bits are stored as the sample.
	 * NOTE(review): presumably decwin is pre-scaled by 65536 so that
	 * the saturating conversion followed by >>16 acts as clipping;
	 * decwin is defined elsewhere - confirm.
	 */
.Lloop_first_half:
	movq	(%ecx),%mm0
	pfmul	(%ebx),%mm0

	movq	8(%ecx),%mm1
	pfmul	8(%ebx),%mm1
	pfadd	%mm1,%mm0

	movq	16(%ebx),%mm2
	pfmul	16(%ecx),%mm2
	pfadd	%mm2,%mm0

	movq	24(%ecx),%mm3
	pfmul	24(%ebx),%mm3
	pfadd	%mm3,%mm0

	movq	32(%ebx),%mm4
	pfmul	32(%ecx),%mm4
	pfadd	%mm4,%mm0

	movq	40(%ecx),%mm5
	pfmul	40(%ebx),%mm5
	pfadd	%mm5,%mm0

	movq	48(%ebx),%mm6
	pfmul	48(%ecx),%mm6
	pfadd	%mm6,%mm0

	movq	56(%ecx),%mm7
	pfmul	56(%ebx),%mm7
	pfadd	%mm7,%mm0

	pfnacc	%mm0,%mm0		/* low = low - high */

	pf2id	%mm0,%mm0
	movd	%mm0,%eax

	sarl	$16,%eax
	movw	%ax,(%esi)

	addl	$64,%ebx		/* b0 += 64 bytes */
	subl	$-128,%ecx		/* window += 128 bytes (-128 fits in imm8) */
	addl	$4,%esi			/* next output slot */
	decl	%ebp
	jnz	.Lloop_first_half

	/*
	 * Middle sample.  MOVD loads only the low dword (upper lane of the
	 * register is zero), so only the products at byte offsets
	 * 0, 8, ..., 56 contribute to the low lane that is stored.
	 */
	movd	(%ecx),%mm0
	pfmul	(%ebx),%mm0

	movd	8(%ebx),%mm1
	pfmul	8(%ecx),%mm1
	pfadd	%mm1,%mm0

	movd	16(%ebx),%mm2
	pfmul	16(%ecx),%mm2
	pfadd	%mm2,%mm0

	movd	24(%ebx),%mm3
	pfmul	24(%ecx),%mm3
	pfadd	%mm3,%mm0

	movd	32(%ebx),%mm4
	pfmul	32(%ecx),%mm4
	pfadd	%mm4,%mm0

	movd	40(%ebx),%mm5
	pfmul	40(%ecx),%mm5
	pfadd	%mm5,%mm0

	movd	48(%ebx),%mm6
	pfmul	48(%ecx),%mm6
	pfadd	%mm6,%mm0

	movd	56(%ebx),%mm7
	pfmul	56(%ecx),%mm7
	pfadd	%mm7,%mm0

	pf2id	%mm0,%mm0
	movd	%mm0,%eax

	sarl	$16,%eax

	movw	%ax,(%esi)

	addl	$-64,%ebx		/* b0 now steps backwards */
	addl	$4,%esi
	addl	$256,%ecx		/* window skips ahead 256 bytes */
	movl	$15,%ebp

	/*
	 * Loop 2, 15 samples.  %mm0 starts at zero and every product is
	 * subtracted; PFACC then adds the two lanes, so the stored value is
	 * minus the sum of all 16 products.  b0 moves down by 64 bytes per
	 * sample while window keeps moving up by 128 bytes.
	 */
.Lloop_second_half:
	pxor	%mm0,%mm0

	movq	(%ecx),%mm1
	pfmul	(%ebx),%mm1
	pfsub	%mm1,%mm0

	movq	8(%ecx),%mm2
	pfmul	8(%ebx),%mm2
	pfsub	%mm2,%mm0

	movq	16(%ecx),%mm3
	pfmul	16(%ebx),%mm3
	pfsub	%mm3,%mm0

	movq	24(%ecx),%mm4
	pfmul	24(%ebx),%mm4
	pfsub	%mm4,%mm0

	movq	32(%ecx),%mm5
	pfmul	32(%ebx),%mm5
	pfsub	%mm5,%mm0

	movq	40(%ecx),%mm6
	pfmul	40(%ebx),%mm6
	pfsub	%mm6,%mm0

	movq	48(%ecx),%mm7
	pfmul	48(%ebx),%mm7
	pfsub	%mm7,%mm0

	movq	56(%ecx),%mm1
	pfmul	56(%ebx),%mm1
	pfsub	%mm1,%mm0

	pfacc	%mm0,%mm0		/* low = low + high */

	pf2id	%mm0,%mm0
	movd	%mm0,%eax

	sarl	$16,%eax

	movw	%ax,(%esi)

	addl	$-64,%ebx
	subl	$-128,%ecx
	addl	$4,%esi
	decl	%ebp
	jnz	.Lloop_second_half

	femms				/* leave MMX/3DNow! state before returning */

	movl	%edi,%eax		/* return 0 */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$12,%esp
	ret