787
|
1 ///
|
|
2 /// Replacement of synth_1to1() with Intel's SSE SIMD operations support
|
|
3 ///
|
|
4 /// This code based 'decode_k7.s' by Nick Kurshev
|
|
5 /// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
|
|
6 ///
|
|
7 /// - SSE optimization
|
|
8 /// - change function name to support automatic SSE detection
|
|
9 ///
|
|
10 /// Modified by Nick Kurshev <nickols_k@mail.ru>
|
|
11 ///
|
|
12 / synth_1to1_3dnow works the same way as the c version of
|
|
13 / synth_1to1. This assembler code is based on 'decode-i586.s'
|
|
14 / (by Stefan Bieschewski <stb@acm.org>), two types of changes
|
|
15 / have been made:
|
|
16 / - use {MMX,3DNow!} instructions to reduce CPU usage
|
|
17 / - remove unused(?) local symbols
|
|
18 /
|
|
19 / useful sources of information on optimizing 3DNow! code include:
|
|
20 / AMD 3DNow! Technology Manual (Publication #21928)
|
|
21 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
|
|
22 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
|
|
23 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
|
|
24 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
|
|
25 /
|
|
26 / This code was tested only on AMD-K6-2 processor Linux systems,
|
|
27 / please tell me:
|
|
28 / - whether this code works on other 3DNow! capable processors
|
|
29 / (ex.IDT-C6-2) or not
|
|
30 / - whether this code works on other OSes or not
|
|
31 /
|
|
32 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
|
|
33 / <kim@comtec.co.jp> - after 1.Apr.1998
|
|
34
|
|
35 / Enhancements for q-word operation by Michael Hipp
|
|
36
|
|
/*
 * Static storage used by synth_1to1_sse.
 *
 * buffs : 4352-byte common block.  The synthesis routine addresses it as
 *         two 2176-byte regions (offset 0 when channel == 0, offset 2176
 *         otherwise), each of which is split again at +1088.
 * bo    : 32-bit index, initial value 1.  The synthesis routine replaces it
 *         with (bo - 1) & 15 on every call made with channel == 0.
 */
        .bss
        .comm   buffs,4352,4

        .data
        .align  4
bo:
        .long   1
|
|
43 .text
|
|
44 /* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */
|
|
45 .globl synth_1to1_sse
|
|
46 synth_1to1_sse:
|
|
47 subl $12,%esp
|
|
48 pushl %ebp
|
|
49 pushl %edi
|
|
50 pushl %esi
|
|
51 pushl %ebx
|
|
52
|
|
53 movl 32(%esp),%eax
|
|
54 movl 40(%esp),%esi
|
|
55 movl $0,%edi
|
|
56 movl bo,%ebp
|
|
57 cmpl %edi,36(%esp)
|
|
58 jne .L48
|
|
59 decl %ebp
|
|
60 andl $15,%ebp
|
|
61 movl %ebp,bo
|
|
62 movl $buffs,%ecx
|
|
63 jmp .L49
|
|
64 .L48:
|
|
65 addl $2,%esi
|
|
66 movl $buffs+2176,%ecx
|
|
67 .L49:
|
|
68 testl $1,%ebp
|
|
69 je .L50
|
|
70 movl %ecx,%ebx
|
|
71 movl %ebp,16(%esp)
|
|
72 pushl %eax
|
|
73 movl 20(%esp),%edx
|
|
74 leal (%ebx,%edx,4),%eax
|
|
75 pushl %eax
|
|
76 movl 24(%esp),%eax
|
|
77 incl %eax
|
|
78 andl $15,%eax
|
|
79 leal 1088(,%eax,4),%eax
|
|
80 addl %ebx,%eax
|
|
81 jmp .L74
|
|
82 .L50:
|
|
83 leal 1088(%ecx),%ebx
|
|
84 leal 1(%ebp),%edx
|
|
85 movl %edx,16(%esp)
|
|
86 pushl %eax
|
|
87 leal 1092(%ecx,%ebp,4),%eax
|
|
88 pushl %eax
|
|
89 leal (%ecx,%ebp,4),%eax
|
|
90 .L74:
|
|
91 pushl %eax
|
|
92 call dct64
|
|
93 addl $12,%esp
|
|
94 movl 16(%esp),%edx
|
|
95 leal 0(,%edx,4),%edx
|
|
96 movl $decwin+64,%eax
|
|
97 movl %eax,%ecx
|
|
98 subl %edx,%ecx
|
|
99 movl $16,%ebp
|
|
100
|
|
101 .L55:
|
|
102 movups (%ecx), %xmm4
|
|
103 mulps (%ebx), %xmm4
|
|
104 movups 16(%ecx), %xmm0
|
|
105 mulps 16(%ebx), %xmm0
|
|
106 addps %xmm0, %xmm4
|
|
107 movups 32(%ecx), %xmm1
|
|
108 mulps 32(%ebx), %xmm1
|
|
109 addps %xmm1, %xmm4
|
|
110 movups 48(%ecx), %xmm0
|
|
111 mulps 48(%ebx), %xmm0
|
|
112 addps %xmm0, %xmm4
|
|
113 shufps $0xDD, %xmm4, %xmm1 /* fake of pfacc. 3|2|3|2 */
|
|
114 addps %xmm1, %xmm4
|
|
115 shufps $0x55, %xmm4, %xmm1 /* fake of pfnacc. 1|1|1|1 */
|
|
116 subps %xmm1, %xmm4
|
|
117 cvtps2pi %xmm4, %mm4
|
|
118
|
|
119 movd %mm4,%eax
|
|
120
|
|
121 sar $16,%eax
|
|
122 movw %ax,(%esi)
|
|
123
|
|
124 addl $64,%ebx
|
|
125 subl $-128,%ecx
|
|
126 addl $4,%esi
|
|
127 decl %ebp
|
|
128 jnz .L55
|
|
129
|
|
130 / --- end of loop 1 ---
|
|
131
|
|
132 movups (%ecx), %xmm4
|
|
133 mulps (%ebx), %xmm4
|
|
134 movups 16(%ecx), %xmm0
|
|
135 mulps 16(%ebx), %xmm0
|
|
136 addps %xmm0, %xmm4
|
|
137 movups 32(%ecx), %xmm1
|
|
138 mulps 32(%ebx), %xmm1
|
|
139 addps %xmm1, %xmm4
|
|
140 movups 48(%ecx), %xmm0
|
|
141 mulps 48(%ebx), %xmm0
|
|
142 addps %xmm0, %xmm4
|
|
143 shufps $0xDD, %xmm4, %xmm1 /* 3|2|3|2 */
|
|
144 addps %xmm1, %xmm4
|
|
145 cvtps2pi %xmm4, %mm4
|
|
146
|
|
147 movd %mm4, %eax
|
|
148
|
|
149 sar $16,%eax
|
|
150
|
|
151 movw %ax,(%esi)
|
|
152
|
|
153 addl $-64,%ebx
|
|
154 addl $4,%esi
|
|
155 addl $256,%ecx
|
|
156 movl $15,%ebp
|
|
157
|
|
158 .L68:
|
|
159 xorps %xmm3, %xmm3
|
|
160
|
|
161 movups (%ecx), %xmm4
|
|
162 mulps (%ebx), %xmm4
|
|
163 subps %xmm4, %xmm3
|
|
164 movups 16(%ecx), %xmm0
|
|
165 mulps 16(%ebx), %xmm0
|
|
166 subps %xmm0, %xmm3
|
|
167 movups 32(%ecx), %xmm1
|
|
168 mulps 32(%ebx), %xmm1
|
|
169 subps %xmm1, %xmm3
|
|
170 movups 48(%ecx), %xmm0
|
|
171 mulps 48(%ebx), %xmm0
|
|
172 subps %xmm0, %xmm3
|
|
173 shufps $0xDD, %xmm3, %xmm1 /* 3|2|3|2 */
|
|
174 addps %xmm1, %xmm3
|
|
175 shufps $0x55, %xmm3, %xmm1 /* fake of pfacc 1|1|1|1 */
|
|
176 addps %xmm1, %xmm3
|
|
177 cvtps2pi %xmm3, %mm0
|
|
178
|
|
179 movd %mm0,%eax
|
|
180
|
|
181 sar $16,%eax
|
|
182
|
|
183 movw %ax,(%esi)
|
|
184
|
|
185 addl $-64,%ebx
|
|
186 subl $-128,%ecx
|
|
187 addl $4,%esi
|
|
188 decl %ebp
|
|
189 jnz .L68
|
|
190
|
|
191 / --- end of loop 2
|
|
192
|
|
193 emms
|
|
194
|
|
195 movl %edi,%eax
|
|
196 popl %ebx
|
|
197 popl %esi
|
|
198 popl %edi
|
|
199 popl %ebp
|
|
200 addl $12,%esp
|
|
201 ret
|