736
|
1 ///
|
|
2 /// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support
|
|
3 ///
|
|
4 /// This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama
|
|
5 /// <squash@mb.kcom.ne.jp>; only some types of changes have been made:
|
|
6 ///
|
763
|
7 /// - decreased number of opcodes (as suggested by the K7 manual)
|
|
8 /// (using memory reference as operand of instructions)
|
736
|
9 /// - changed function name to support automatic 3DNowEx! detection
|
|
10 ///
|
|
11 /// note: because K7 processors are aggressive out-of-order three-way
|
|
12 /// superscalar ones, instruction order is not significant for them.
|
|
13 ///
|
|
14 /// Modified by Nick Kurshev <nickols_k@mail.ru>
|
|
15 ///
|
|
16 / synth_1to1_3dnow works the same way as the c version of
|
|
17 / synth_1to1. this assembler code is based on 'decode-i586.s'
|
|
18 / (by Stefan Bieschewski <stb@acm.org>), two types of changes
|
|
19 / have been made:
|
|
20 / - use {MMX,3DNow!} instructions to reduce CPU load
|
|
21 / - remove unused(?) local symbols
|
|
22 /
|
|
23 / useful sources of information on optimizing 3DNow! code include:
|
|
24 / AMD 3DNow! Technology Manual (Publication #21928)
|
|
25 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
|
|
26 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
|
|
27 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
|
|
28 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
|
|
29 /
|
|
30 / This code was tested only on AMD-K6-2 processor Linux systems,
|
|
31 / please tell me:
|
|
32 / - whether this code works on other 3DNow! capable processors
|
|
33 / (ex.IDT-C6-2) or not
|
|
34 / - whether this code works on other OSes or not
|
|
35 /
|
|
36 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
|
|
37 / <kim@comtec.co.jp> - after 1.Apr.1998
|
|
38
|
|
39 / Enhancements for q-word operation by Michael Hipp
|
|
40
|
|
/// Static storage used by synth_1to1_3dnowex.
/// buffs: 4352 uninitialised bytes declared with .comm (size 4352, third
/// operand 4).  The routine addresses it as two 2176-byte regions (buffs and
/// buffs+2176), and splits each region at offset 1088.
41 .bss
|
|
42 .comm buffs,4352,4
|
|
/// bo: 32-bit word, initial value 1.  synth_1to1_3dnowex replaces it with
/// (bo-1)&15 whenever its second stack argument is zero.
43 .data
|
|
44 .align 4
|
|
45 bo:
|
|
46 .long 1
|
|
/// Switch to the code section for the routine that follows.
47 .text
|
|
/// ---------------------------------------------------------------------------
/// synth_1to1_3dnowex
///
/// Takes three 32-bit arguments on the stack.  After the prologue (12 bytes of
/// locals + four pushed registers) they are found at:
///   32(%esp)  arg1: handed on unchanged as the last argument of dct64_3dnowex
///   36(%esp)  arg2: zero / non-zero selector (presumably the channel index of
///             the C synth_1to1 -- TODO confirm against the C prototype)
///   40(%esp)  arg3: output pointer; 32 16-bit values are stored through it,
///             one every 4 bytes
/// Returns %eax = 0 (copied from %edi, which is zeroed on entry and never
/// modified in this routine).
/// Globals: reads and may update bo, passes addresses inside buffs to
/// dct64_3dnowex and reads them back, reads decwin.
/// Registers: %ebp, %edi, %esi, %ebx are saved and restored.  %ebx, %esi and
/// %edi are used after the call without being reloaded, so dct64_3dnowex is
/// relied on to preserve them.  MMX state is cleared with femms before return.
/// NOTE(review): the decimal number at the start of each line and the lines
/// holding only "|", "736" or "763" look like extraction artefacts rather than
/// assembler input; they are left untouched here.
/// ---------------------------------------------------------------------------
48 .globl synth_1to1_3dnowex
|
|
49 synth_1to1_3dnowex:
|
|
/// Prologue: reserve 12 bytes of locals (only the slot at 16(%esp) is used),
/// then save the four registers that are restored in the epilogue.
50 subl $12,%esp
|
|
51 pushl %ebp
|
|
52 pushl %edi
|
|
53 pushl %esi
|
|
54 pushl %ebx
|
|
55
|
|
/// %eax = arg1, %esi = arg3 (output pointer), %edi = 0 (later the return
/// value), %ebp = current bo.
56 movl 32(%esp),%eax
|
|
57 movl 40(%esp),%esi
|
|
58 movl $0,%edi
|
|
59 movl bo,%ebp
|
|
/// arg2 == 0: step bo to (bo-1)&15, store it back, and use the region at buffs.
60 cmpl %edi,36(%esp)
|
|
61 jne .L48
|
|
62 decl %ebp
|
|
63 andl $15,%ebp
|
|
64 movl %ebp,bo
|
|
65 movl $buffs,%ecx
|
|
66 jmp .L49
|
|
/// arg2 != 0: bo is left as it is, output is shifted by 2 bytes (one 16-bit
/// slot) and the region at buffs+2176 is used.
67 .L48:
|
|
68 addl $2,%esi
|
|
69 movl $buffs+2176,%ecx
|
|
/// Branch on the low bit of bo (in %ebp); %ecx is the selected region base.
70 .L49:
|
|
71 testl $1,%ebp
|
|
72 je .L50
|
|
/// bo odd: %ebx = region base is the buffer read by the loops below, and the
/// local at 16(%esp) = bo.  Arguments are pushed last-to-first, giving
///   dct64_3dnowex(base + 1088 + ((bo+1)&15)*4, base + bo*4, arg1).
/// The local is re-read at 20(%esp) and 24(%esp) because each push moves %esp.
73 movl %ecx,%ebx
|
|
74 movl %ebp,16(%esp)
|
|
75 pushl %eax
|
|
76 movl 20(%esp),%edx
|
|
77 leal (%ebx,%edx,4),%eax
|
|
78 pushl %eax
|
|
79 movl 24(%esp),%eax
|
|
80 incl %eax
|
|
81 andl $15,%eax
|
|
82 leal 1088(,%eax,4),%eax
|
|
83 addl %ebx,%eax
|
|
84 jmp .L74
|
|
/// bo even: %ebx = region base + 1088 is the buffer read by the loops below,
/// and the local at 16(%esp) = bo+1.  The call becomes
///   dct64_3dnowex(base + bo*4, base + 1088 + (bo+1)*4, arg1).
85 .L50:
|
|
86 leal 1088(%ecx),%ebx
|
|
87 leal 1(%ebp),%edx
|
|
88 movl %edx,16(%esp)
|
|
89 pushl %eax
|
|
90 leal 1092(%ecx,%ebp,4),%eax
|
|
91 pushl %eax
|
|
92 leal (%ecx,%ebp,4),%eax
|
|
/// Common tail: push the first argument, call, then drop the three arguments.
93 .L74:
|
|
94 pushl %eax
|
|
95 call dct64_3dnowex
|
|
96 addl $12,%esp
|
|
/// %ecx = decwin + 64 - 4*local (window pointer), %ebp = 16 (loop counter).
97 movl 16(%esp),%edx
|
|
98 leal 0(,%edx,4),%edx
|
|
99 movl $decwin+64,%eax
|
|
100 movl %eax,%ecx
|
|
101 subl %edx,%ecx
|
|
102 movl $16,%ebp
|
|
103
|
|
/// Loop 1, 16 iterations.  Each pass multiplies 16 packed floats at (%ecx)
/// with the 16 at (%ebx), two per quadword, and accumulates the products in
/// %mm4: low lane = even-indexed terms, high lane = odd-indexed terms.
104 .L55:
|
763
|
105
|
736
|
106 movq (%ecx),%mm4
|
|
107 movq 8(%ecx),%mm0
|
763
|
108 pfmul (%ebx),%mm4
|
736
|
109
|
763
|
110 pfmul 8(%ebx),%mm0
|
736
|
111 movq 16(%ebx),%mm3
|
|
112 pfadd %mm0,%mm4
|
|
113
|
|
114 movq 24(%ecx),%mm0
|
763
|
115 pfmul 16(%ecx),%mm3
|
736
|
116 pfadd %mm3,%mm4
|
|
117
|
763
|
118 pfmul 24(%ebx),%mm0
|
736
|
119 movq 32(%ebx),%mm3
|
|
120 pfadd %mm0,%mm4
|
|
121
|
|
122 movq 40(%ecx),%mm0
|
763
|
123 pfmul 32(%ecx),%mm3
|
736
|
124 pfadd %mm3,%mm4
|
|
125
|
763
|
126 pfmul 40(%ebx),%mm0
|
736
|
127 movq 48(%ebx),%mm3
|
|
128 pfadd %mm0,%mm4
|
|
129
|
|
130 movq 56(%ecx),%mm0
|
763
|
131 pfmul 48(%ecx),%mm3
|
736
|
132 pfadd %mm3,%mm4
|
|
133
|
763
|
134 pfmul 56(%ebx),%mm0
|
736
|
135 pfadd %mm0,%mm4
|
|
136
|
|
/// Move the high lane down and subtract: low lane = (even sum) - (odd sum).
137 movq %mm4,%mm0
|
|
138 psrlq $32,%mm0
|
|
139 pfsub %mm0,%mm4
|
|
140
|
|
/// Convert to a 32-bit integer, keep its upper 16 bits and store them as one
/// 16-bit value.  NOTE(review): presumably decwin is pre-scaled by 65536 so
/// that pf2id's saturation doubles as 16-bit clipping -- confirm with the
/// code that builds decwin.
141 pf2id %mm4,%mm4
|
|
142 movd %mm4,%eax
|
|
143
|
|
144 sar $16,%eax
|
|
145 movw %ax,(%esi)
|
|
146
|
|
/// Advance: buffer +64 bytes, window +128 bytes (written as subtracting -128,
/// which fits a sign-extended 8-bit immediate), output +4 bytes.
147 addl $64,%ebx
|
|
148 subl $-128,%ecx
|
|
149 addl $4,%esi
|
|
150 decl %ebp
|
|
151 jnz .L55
|
|
152
|
|
153 / --- end of loop 1 ---
|
|
154
|
|
/// Single value between the two loops: sum of products of the floats at byte
/// offsets 0, 8, ..., 56 (every second float) of window and buffer.  Only the
/// low lane of %mm2 is consumed by the movd below.
155 movd (%ecx),%mm2
|
763
|
156 pfmul (%ebx),%mm2
|
736
|
157
|
|
158 movd 8(%ebx),%mm1
|
763
|
159 pfmul 8(%ecx),%mm1
|
736
|
160 pfadd %mm1,%mm2
|
|
161
|
|
162 movd 16(%ebx),%mm1
|
763
|
163 pfmul 16(%ecx),%mm1
|
736
|
164 pfadd %mm1,%mm2
|
|
165
|
|
166 movd 24(%ebx),%mm1
|
763
|
167 pfmul 24(%ecx),%mm1
|
736
|
168 pfadd %mm1,%mm2
|
|
169
|
|
170 movd 32(%ebx),%mm1
|
763
|
171 pfmul 32(%ecx),%mm1
|
736
|
172 pfadd %mm1,%mm2
|
|
173
|
|
174 movd 40(%ebx),%mm1
|
763
|
175 pfmul 40(%ecx),%mm1
|
736
|
176 pfadd %mm1,%mm2
|
|
177
|
|
178 movd 48(%ebx),%mm1
|
763
|
179 pfmul 48(%ecx),%mm1
|
736
|
180 pfadd %mm1,%mm2
|
|
181
|
|
182 movd 56(%ebx),%mm1
|
763
|
183 pfmul 56(%ecx),%mm1
|
736
|
184 pfadd %mm1,%mm2
|
|
185
|
|
/// Same float -> int32 -> upper-16-bit store as in loop 1.
186 pf2id %mm2,%mm2
|
|
187 movd %mm2,%eax
|
|
188
|
|
189 sar $16,%eax
|
|
190
|
|
191 movw %ax,(%esi)
|
|
192
|
|
/// Set up loop 2: the buffer pointer now steps backwards (-64), the window
/// pointer jumps forward by 256 bytes, output +4, counter = 15.
193 addl $-64,%ebx
|
|
194 addl $4,%esi
|
|
195 addl $256,%ecx
|
|
196 movl $15,%ebp
|
|
197
|
|
/// Loop 2, 15 iterations.  %mm0 starts at zero and every one of the 16
/// products is subtracted from it, so the result is the negated sum.
198 .L68:
|
|
199 psubd %mm0,%mm0
|
|
200
|
|
201 movq (%ecx),%mm2
|
763
|
202 pfmul (%ebx),%mm2
|
736
|
203 pfsub %mm2,%mm0
|
|
204
|
|
205 movq 8(%ecx),%mm4
|
763
|
206 pfmul 8(%ebx),%mm4
|
736
|
207 pfsub %mm4,%mm0
|
|
208
|
|
209 movq 16(%ecx),%mm2
|
763
|
210 pfmul 16(%ebx),%mm2
|
736
|
211 pfsub %mm2,%mm0
|
|
212
|
|
213 movq 24(%ecx),%mm4
|
763
|
214 pfmul 24(%ebx),%mm4
|
736
|
215 pfsub %mm4,%mm0
|
|
216
|
|
217 movq 32(%ecx),%mm2
|
763
|
218 pfmul 32(%ebx),%mm2
|
736
|
219 pfsub %mm2,%mm0
|
|
220
|
|
221 movq 40(%ecx),%mm4
|
763
|
222 pfmul 40(%ebx),%mm4
|
736
|
223 pfsub %mm4,%mm0
|
|
224
|
|
225 movq 48(%ecx),%mm2
|
763
|
226 pfmul 48(%ebx),%mm2
|
736
|
227 pfsub %mm2,%mm0
|
|
228
|
|
229 movq 56(%ecx),%mm4
|
763
|
230 pfmul 56(%ebx),%mm4
|
736
|
231 pfsub %mm4,%mm0
|
|
232
|
|
/// Horizontal add: low lane = low + high, i.e. all 16 terms combined.
233 pfacc %mm0,%mm0
|
|
234
|
|
235 pf2id %mm0,%mm0
|
|
236 movd %mm0,%eax
|
|
237
|
|
238 sar $16,%eax
|
|
239
|
|
240 movw %ax,(%esi)
|
|
241
|
|
/// Advance: buffer -64 bytes, window +128 bytes, output +4 bytes.
242 addl $-64,%ebx
|
|
243 subl $-128,%ecx
|
|
244 addl $4,%esi
|
|
245 decl %ebp
|
|
246 jnz .L68
|
|
247
|
|
248 / --- end of loop 2
|
|
249
|
|
/// Leave MMX/3DNow! state so that x87 code can run again.
250 femms
|
|
251
|
|
/// Return value = %edi (still 0), then undo the prologue in reverse order.
252 movl %edi,%eax
|
|
253 popl %ebx
|
|
254 popl %esi
|
|
255 popl %edi
|
|
256 popl %ebp
|
|
257 addl $12,%esp
|
|
258 ret
|