Mercurial > mplayer.hg
comparison mp3lib/decode_3dnow.s @ 1:3b5f5d1c5041
Initial revision
author | arpi_esp |
---|---|
date | Sat, 24 Feb 2001 20:28:24 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:c1bb2c071d63 | 1:3b5f5d1c5041 |
---|---|
1 / synth_1to1_3dnow works the same way as the c version of | |
2 / synth_1to1. this assembler code is based on 'decode-i586.s' | |
3 / (by Stefan Bieschewski <stb@acm.org>), two types of changes | |
4 / have been made: | |
5 / - use {MMX,3DNow!} instructions to reduce CPU load | |
6 / - remove unused(?) local symbols | |
7 / | |
8 / useful sources of information on optimizing 3DNow! code include: | |
9 / AMD 3DNow! Technology Manual (Publication #21928) | |
10 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf | |
11 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) | |
12 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) | |
13 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf | |
14 / | |
15 / This code was tested only on AMD-K6-2 processor Linux systems, | |
16 / please tell me: | |
17 / - whether this code works on other 3DNow! capable processors | |
18 / (ex.IDT-C6-2) or not | |
19 / - whether this code works on other OSes or not | |
20 / | |
21 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 | |
22 / <kim@comtec.co.jp> - after 1.Apr.1998 | |
23 | |
24 / Enhancements for q-word operation by Michael Hipp | |
25 | |
26 .bss | |
/ buffs: 4352 bytes of common (uninitialised) storage, 4-byte aligned.
/ synth_1to1_3dnow addresses it as two 2176-byte halves (buffs and
/ buffs+2176), and inside each half at offsets 0 and 1088.
27 .comm buffs,4352,4 | |
28 .data | |
29 .align 4 | |
/ bo: 32-bit state word, initial value 1. synth_1to1_3dnow reads it on
/ every call and, when its second stack argument is zero, stores back
/ (bo - 1) & 15.
30 bo: | |
31 .long 1 | |
32 .text | |
/ ---------------------------------------------------------------------
/ synth_1to1_3dnow
/
/ 32-bit x86, AT&T syntax, MMX + 3DNow! instructions.
/
/ Stack arguments (offsets valid after the prologue below):
/   32(%esp)  first argument  - not dereferenced here, only forwarded to
/                               dct64_3dnow
/   36(%esp)  second argument - compared with zero; selects which half of
/                               buffs is used and whether bo is stepped
/   40(%esp)  third argument  - output pointer; 32 16-bit values are
/                               stored through it, one every 4 bytes
/                               (starting 2 bytes in when the second
/                               argument is non-zero)
/
/ Returns in %eax the value of %edi, which is set to 0 at entry and not
/ written again by this routine.
/
/ Reads/writes the globals bo and buffs, reads the external table decwin
/ and calls the external routine dct64_3dnow.
/
/ %ebp, %edi, %esi and %ebx are saved and restored. MMX/3DNow! state is
/ released with femms before returning.
/
/ NOTE(review): %ebx, %esi and %edi are live across the call to
/ dct64_3dnow, so that routine is presumably expected to preserve them -
/ confirm against its implementation.
/ ---------------------------------------------------------------------
33 .globl synth_1to1_3dnow | |
34 synth_1to1_3dnow: | |
/ Prologue: reserve 12 bytes of locals, then save four registers.
/ Resulting frame: 0..15(%esp) saved ebx/esi/edi/ebp, 16..27(%esp) locals
/ (only the slot at 16 is used), 28(%esp) return address, 32.. arguments.
35 subl $12,%esp | |
36 pushl %ebp | |
37 pushl %edi | |
38 pushl %esi | |
39 pushl %ebx | |
/ eax = first argument, esi = output pointer, edi = 0 (return value),
/ ebp = current bo.
40 movl 32(%esp),%eax | |
41 movl 40(%esp),%esi | |
42 movl $0,%edi | |
43 movl bo,%ebp | |
/ Second argument == 0 ?  (edi is still zero here)
44 cmpl %edi,36(%esp) | |
45 jne .L48 | |
/ Second argument is zero: bo = (bo - 1) & 15 is written back and the
/ first half of buffs is used.
46 decl %ebp | |
47 andl $15,%ebp | |
48 movl %ebp,bo | |
49 movl $buffs,%ecx | |
50 jmp .L49 | |
51 .L48: | |
/ Second argument is non-zero: bo is left unchanged, the output pointer
/ is advanced by 2 bytes and the second half of buffs is used.
52 addl $2,%esi | |
53 movl $buffs+2176,%ecx | |
54 .L49: | |
/ Branch on the low bit of bo (ebp); ecx = base of the selected half.
55 testl $1,%ebp | |
56 je .L50 | |
/ bo odd: ebx = base, local slot = bo.
57 movl %ecx,%ebx | |
58 movl %ebp,16(%esp) | |
/ Push the first argument; the local slot is now at 20(%esp).
59 pushl %eax | |
60 movl 20(%esp),%edx | |
/ Push base + 4*bo; the local slot is now at 24(%esp).
61 leal (%ebx,%edx,4),%eax | |
62 pushl %eax | |
/ eax = base + 1088 + 4*((bo + 1) & 15); pushed at .L74.
63 movl 24(%esp),%eax | |
64 incl %eax | |
65 andl $15,%eax | |
66 leal 1088(,%eax,4),%eax | |
67 addl %ebx,%eax | |
68 jmp .L74 | |
69 .L50: | |
/ bo even: ebx = base + 1088, local slot = bo + 1.
70 leal 1088(%ecx),%ebx | |
71 leal 1(%ebp),%edx | |
72 movl %edx,16(%esp) | |
/ Push the first argument, then base + 1092 + 4*bo; eax = base + 4*bo
/ is pushed at .L74.
73 pushl %eax | |
74 leal 1092(%ecx,%ebp,4),%eax | |
75 pushl %eax | |
76 leal (%ecx,%ebp,4),%eax | |
77 .L74: | |
/ Third push, call, then drop the three pushed arguments.
78 pushl %eax | |
79 call dct64_3dnow | |
80 addl $12,%esp | |
/ ecx = decwin + 64 - 4*(value saved in the local slot); ebx still holds
/ the buffer pointer chosen above; ebp = 16 iterations for loop 1.
81 movl 16(%esp),%edx | |
82 leal 0(,%edx,4),%edx | |
83 movl $decwin+64,%eax | |
84 movl %eax,%ecx | |
85 subl %edx,%ecx | |
86 movl $16,%ebp | |
87 | |
/ --- loop 1: 16 output values ---
/ Each iteration multiplies eight quadwords (two packed floats each) at
/ (%ecx)+0..56 with the eight quadwords at (%ebx)+0..56 and accumulates
/ the products lane by lane in mm4. Loads for the next pair are
/ interleaved with the multiply/add of the previous pair.
88 .L55: | |
89 movq (%ecx),%mm4 | |
90 movq (%ebx),%mm3 | |
91 movq 8(%ecx),%mm0 | |
92 movq 8(%ebx),%mm1 | |
93 pfmul %mm3,%mm4 | |
94 | |
95 movq 16(%ecx),%mm2 | |
96 pfmul %mm1,%mm0 | |
97 movq 16(%ebx),%mm3 | |
98 pfadd %mm0,%mm4 | |
99 | |
100 movq 24(%ecx),%mm0 | |
101 pfmul %mm2,%mm3 | |
102 movq 24(%ebx),%mm1 | |
103 pfadd %mm3,%mm4 | |
104 | |
105 movq 32(%ecx),%mm2 | |
106 pfmul %mm1,%mm0 | |
107 movq 32(%ebx),%mm3 | |
108 pfadd %mm0,%mm4 | |
109 | |
110 movq 40(%ecx),%mm0 | |
111 pfmul %mm2,%mm3 | |
112 movq 40(%ebx),%mm1 | |
113 pfadd %mm3,%mm4 | |
114 | |
115 movq 48(%ecx),%mm2 | |
116 pfmul %mm1,%mm0 | |
117 movq 48(%ebx),%mm3 | |
118 pfadd %mm0,%mm4 | |
119 | |
120 movq 56(%ecx),%mm0 | |
121 pfmul %mm2,%mm3 | |
122 movq 56(%ebx),%mm1 | |
123 pfadd %mm3,%mm4 | |
124 | |
125 pfmul %mm1,%mm0 | |
126 pfadd %mm0,%mm4 | |
127 | |
/ Low lane of mm4 = (low-lane sum) - (high-lane sum).
128 movq %mm4,%mm0 | |
129 psrlq $32,%mm0 | |
130 pfsub %mm0,%mm4 | |
131 | |
/ Convert to a 32-bit integer and take its low dword into eax.
132 pf2id %mm4,%mm4 | |
133 movd %mm4,%eax | |
134 | |
/ Keep bits 16..31 of the integer and store them as one 16-bit value.
135 sar $16,%eax | |
136 movw %ax,(%esi) | |
137 | |
/ Advance: ebx += 64, ecx += 128, esi += 4.
/ (subl $-128 presumably because -128 fits an 8-bit immediate while
/ +128 does not.)
138 addl $64,%ebx | |
139 subl $-128,%ecx | |
140 addl $4,%esi | |
141 decl %ebp | |
142 jnz .L55 | |
143 | |
144 / --- end of loop 1 --- | |
145 | |
/ Single middle value: eight scalar products of the dwords at offsets
/ 0, 8, 16, ..., 56 from (%ecx) and (%ebx), all added into mm2.
146 movd (%ecx),%mm2 | |
147 movd (%ebx),%mm1 | |
148 pfmul %mm1,%mm2 | |
149 | |
150 movd 8(%ecx),%mm0 | |
151 movd 8(%ebx),%mm1 | |
152 pfmul %mm0,%mm1 | |
153 pfadd %mm1,%mm2 | |
154 | |
155 movd 16(%ecx),%mm0 | |
156 movd 16(%ebx),%mm1 | |
157 pfmul %mm0,%mm1 | |
158 pfadd %mm1,%mm2 | |
159 | |
160 movd 24(%ecx),%mm0 | |
161 movd 24(%ebx),%mm1 | |
162 pfmul %mm0,%mm1 | |
163 pfadd %mm1,%mm2 | |
164 | |
165 movd 32(%ecx),%mm0 | |
166 movd 32(%ebx),%mm1 | |
167 pfmul %mm0,%mm1 | |
168 pfadd %mm1,%mm2 | |
169 | |
170 movd 40(%ecx),%mm0 | |
171 movd 40(%ebx),%mm1 | |
172 pfmul %mm0,%mm1 | |
173 pfadd %mm1,%mm2 | |
174 | |
175 movd 48(%ecx),%mm0 | |
176 movd 48(%ebx),%mm1 | |
177 pfmul %mm0,%mm1 | |
178 pfadd %mm1,%mm2 | |
179 | |
180 movd 56(%ecx),%mm0 | |
181 movd 56(%ebx),%mm1 | |
182 pfmul %mm0,%mm1 | |
183 pfadd %mm1,%mm2 | |
184 | |
/ Same convert / shift / store sequence as in loop 1.
185 pf2id %mm2,%mm2 | |
186 movd %mm2,%eax | |
187 | |
188 sar $16,%eax | |
189 | |
190 movw %ax,(%esi) | |
191 | |
/ Set up loop 2: ebx steps back by 64, ecx jumps forward by 256,
/ esi += 4, ebp = 15 iterations.
/ NOTE(review): the +256 jump presumably relies on how decwin is laid
/ out by its initialisation code, which is not visible here - confirm.
192 addl $-64,%ebx | |
193 addl $4,%esi | |
194 addl $256,%ecx | |
195 movl $15,%ebp | |
196 | |
/ --- loop 2: 15 output values ---
/ mm0 starts at zero and the eight packed products of (%ebx)+0..56 with
/ (%ecx)+0..56 are subtracted from it, so mm0 ends as the negated
/ lane-wise sums.
197 .L68: | |
198 psubd %mm0,%mm0 | |
199 | |
200 movq (%ebx),%mm1 | |
201 movq (%ecx),%mm2 | |
202 pfmul %mm1,%mm2 | |
203 pfsub %mm2,%mm0 | |
204 | |
205 movq 8(%ebx),%mm3 | |
206 movq 8(%ecx),%mm4 | |
207 pfmul %mm3,%mm4 | |
208 pfsub %mm4,%mm0 | |
209 | |
210 movq 16(%ebx),%mm1 | |
211 movq 16(%ecx),%mm2 | |
212 pfmul %mm1,%mm2 | |
213 pfsub %mm2,%mm0 | |
214 | |
215 movq 24(%ebx),%mm3 | |
216 movq 24(%ecx),%mm4 | |
217 pfmul %mm3,%mm4 | |
218 pfsub %mm4,%mm0 | |
219 | |
220 movq 32(%ebx),%mm1 | |
221 movq 32(%ecx),%mm2 | |
222 pfmul %mm1,%mm2 | |
223 pfsub %mm2,%mm0 | |
224 | |
225 movq 40(%ebx),%mm3 | |
226 movq 40(%ecx),%mm4 | |
227 pfmul %mm3,%mm4 | |
228 pfsub %mm4,%mm0 | |
229 | |
230 movq 48(%ebx),%mm1 | |
231 movq 48(%ecx),%mm2 | |
232 pfmul %mm1,%mm2 | |
233 pfsub %mm2,%mm0 | |
234 | |
235 movq 56(%ebx),%mm3 | |
236 movq 56(%ecx),%mm4 | |
237 pfmul %mm3,%mm4 | |
238 pfsub %mm4,%mm0 | |
239 | |
/ Add the two lanes of mm0 together, then convert / shift / store.
240 pfacc %mm0,%mm0 | |
241 | |
242 pf2id %mm0,%mm0 | |
243 movd %mm0,%eax | |
244 | |
245 sar $16,%eax | |
246 | |
247 movw %ax,(%esi) | |
248 | |
/ Advance: ebx -= 64, ecx += 128, esi += 4.
249 addl $-64,%ebx | |
250 subl $-128,%ecx | |
251 addl $4,%esi | |
252 decl %ebp | |
253 jnz .L68 | |
254 | |
255 / --- end of loop 2 | |
256 | |
/ Leave MMX/3DNow! state before returning to the caller.
257 femms | |
258 | |
/ Return value = edi; restore saved registers in reverse push order and
/ release the 12 bytes of locals.
259 movl %edi,%eax | |
260 popl %ebx | |
261 popl %esi | |
262 popl %edi | |
263 popl %ebp | |
264 addl $12,%esp | |
265 ret |