comparison mp3lib/decode_sse.s @ 787:9bc104531aec

mp3lib sse support - disabled by default
author arpi_esp
date Sun, 13 May 2001 18:30:53 +0000
parents
children 290801346d62
comparison
equal deleted inserted replaced
786:4b6dc49b0cb8 787:9bc104531aec
1 ///
2 /// Replacement of synth_1to1() with Intel's SSE SIMD operations support
3 ///
4 /// This code is based on 'decode_k7.s' by Nick Kurshev
5 /// <squash@mb.kcom.ne.jp>; only the following changes have been made:
6 ///
7 /// - SSE optimization
8 /// - change the function name to support automatic SSE detection
9 ///
10 /// Modified by Nick Kurshev <nickols_k@mail.ru>
11 ///
12 / synth_1to1_3dnow works the same way as the C version of
13 / synth_1to1. This assembler code is based on 'decode-i586.s'
14 / (by Stefan Bieschewski <stb@acm.org>); two types of changes
15 / have been made:
16 / - use {MMX,3DNow!} instructions to reduce CPU load
17 / - remove unused(?) local symbols
18 /
19 / useful sources of information on optimizing 3DNow! code include:
20 / AMD 3DNow! Technology Manual (Publication #21928)
21 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
22 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
23 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
24 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
25 /
26 / This code was tested only on AMD K6-2 processor Linux systems,
27 / please tell me:
28 / - whether this code works on other 3DNow! capable processors
29 / (ex.IDT-C6-2) or not
30 / - whether this code works on other OSes or not
31 /
32 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
33 / <kim@comtec.co.jp> - after 1.Apr.1998
34
35 / Enhancements for q-word operation by Michael Hipp
36
/* Static state used by synth_1to1_sse. */
        .bss
/*
 * buffs: 4352-byte work area.  The synthesis code reads it through
 * mulps memory operands at addresses of the form buffs + {0, 1088,
 * 2176, 3264} + 64*k.  SSE faults (#GP) on a 128-bit memory operand
 * that is not 16-byte aligned, and all of those offsets are multiples
 * of 16, so the base itself must be 16-byte aligned.  The third .comm
 * argument is the alignment (a byte count on ELF targets).
 */
        .comm   buffs,4352,16
        .data
        .align  4
/*
 * bo: rolling offset into buffs, kept in the range 0..15.  It is
 * decremented modulo 16 on every call made with channel == 0.
 */
bo:
        .long   1
        .text
/*
 * int synth_1to1_sse(real *bandPtr, int channel, unsigned char *out)
 *
 * SSE variant of synth_1to1(): calls dct64() to fill part of the static
 * `buffs` area from bandPtr, then forms 32 windowed sums against
 * `decwin` and stores each one as a 16-bit sample.
 *
 * Syntax/ABI: GNU as, AT&T syntax, 32-bit x86, arguments on the stack.
 * In  (offsets valid after the prologue below):
 *       32(%esp) = bandPtr
 *       36(%esp) = channel  (0 selects buffs and advances `bo`;
 *                            non-zero selects buffs+2176 and out+2)
 *       40(%esp) = out      (one 16-bit store every 4 bytes)
 * Out:  eax = 0 always (edi is zeroed on entry and never updated here).
 * Saved/restored: ebx, esi, edi, ebp.
 * Clobbers: eax, ecx, edx, flags, xmm0, xmm1, xmm3, xmm4, mm0, mm4
 *           (emms is executed before returning).
 * Stack: 12 bytes of locals; only the dword at 16(%esp) is used, it
 *        holds the decwin offset index across the dct64 call.
 * Alignment: mulps is used with memory operands at (%ebx)+{0,16,32,48},
 *        so ebx (derived from buffs) must be 16-byte aligned.  The
 *        decwin pointer in ecx may be unaligned and is read with movups.
 *
 * Register roles in the three summation sections:
 *       ebx = current 16-float row inside buffs
 *       ecx = current window position inside decwin
 *       esi = output pointer
 *       ebp = loop counter
 *
 * NOTE(review): cvtps2pi rounds according to MXCSR and yields
 * 0x80000000 for out-of-range inputs of either sign; confirm that the
 * scaled sums cannot exceed the int32 range on the positive side.
 */
        .globl  synth_1to1_sse
synth_1to1_sse:
        subl    $12,%esp                /* locals */
        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx

        movl    32(%esp),%eax           /* eax = bandPtr */
        movl    40(%esp),%esi           /* esi = out */
        xorl    %edi,%edi               /* edi = 0, the return value */
        movl    bo,%ebp                 /* ebp = bo */
        cmpl    %edi,36(%esp)           /* channel == 0 ? */
        jne     .L48

        /* channel 0: step bo backwards modulo 16, use first half of buffs */
        decl    %ebp
        andl    $15,%ebp
        movl    %ebp,bo
        movl    $buffs,%ecx
        jmp     .L49
.L48:
        /* other channel: interleaved output slot, second half of buffs */
        addl    $2,%esi
        movl    $buffs+2176,%ecx
.L49:
        testl   $1,%ebp                 /* bo odd? */
        je      .L50

        /* bo odd: rows are read from ecx+0, decwin index = bo */
        movl    %ecx,%ebx
        movl    %ebp,16(%esp)           /* local = bo */
        pushl   %eax                    /* dct64 arg 3: bandPtr */
        movl    20(%esp),%edx           /* same local, shifted by the push */
        leal    (%ebx,%edx,4),%eax
        pushl   %eax                    /* dct64 arg 2: ebx + 4*bo */
        movl    24(%esp),%eax           /* same local again */
        incl    %eax
        andl    $15,%eax
        leal    1088(,%eax,4),%eax
        addl    %ebx,%eax               /* arg 1: ebx + 1088 + 4*((bo+1)&15) */
        jmp     .L74
.L50:
        /* bo even: rows are read from ecx+1088, decwin index = bo+1 */
        leal    1088(%ecx),%ebx
        leal    1(%ebp),%edx
        movl    %edx,16(%esp)           /* local = bo + 1 */
        pushl   %eax                    /* dct64 arg 3: bandPtr */
        leal    1092(%ecx,%ebp,4),%eax
        pushl   %eax                    /* dct64 arg 2: ecx + 1088 + 4*(bo+1) */
        leal    (%ecx,%ebp,4),%eax      /* arg 1: ecx + 4*bo */
.L74:
        pushl   %eax                    /* dct64 arg 1 */
        call    dct64
        addl    $12,%esp                /* drop the three arguments */

        movl    16(%esp),%edx
        leal    0(,%edx,4),%edx         /* edx = 4 * local */
        movl    $decwin+64,%ecx
        subl    %edx,%ecx               /* ecx = decwin + 64 - 4*local */
        movl    $16,%ebp                /* 16 samples in loop 1 */

        /*
         * Loop 1: xmm4 = [s0,s1,s2,s3], the four lane-wise partial sums
         * of the 16 products window[i]*row[i].  The stored value is
         * (s0 + s2) - (s1 + s3), i.e. even products minus odd products.
         */
.L55:
        movups  (%ecx),%xmm4
        mulps   (%ebx),%xmm4
        movups  16(%ecx),%xmm0
        mulps   16(%ebx),%xmm0
        addps   %xmm0,%xmm4
        movups  32(%ecx),%xmm1
        mulps   32(%ebx),%xmm1
        addps   %xmm1,%xmm4
        movups  48(%ecx),%xmm0
        mulps   48(%ebx),%xmm0
        addps   %xmm0,%xmm4

        /*
         * Horizontal reduction.  shufps takes its two low result lanes
         * from the destination register, so it cannot pull lanes 2/3 of
         * xmm4 into the low half of a scratch register; movhlps can.
         * The upper lanes of xmm1/xmm4 hold don't-care values afterwards.
         */
        movhlps %xmm4,%xmm1             /* xmm1[0..1] = s2, s3 */
        addps   %xmm1,%xmm4             /* xmm4[0] = s0+s2, xmm4[1] = s1+s3 */
        movaps  %xmm4,%xmm1
        shufps  $0x55,%xmm1,%xmm1       /* broadcast lane 1: s1+s3 */
        subss   %xmm1,%xmm4             /* xmm4[0] = (s0+s2) - (s1+s3) */
        cvtps2pi %xmm4,%mm4

        movd    %mm4,%eax

        sar     $16,%eax                /* keep the upper 16 bits */
        movw    %ax,(%esi)

        addl    $64,%ebx                /* next row of 16 floats */
        subl    $-128,%ecx              /* ecx += 128 (fits an 8-bit immediate) */
        addl    $4,%esi
        decl    %ebp
        jnz     .L55

        /* --- end of loop 1 --- */

        /* Single middle sample: s0 + s2, the even-indexed products only. */
        movups  (%ecx),%xmm4
        mulps   (%ebx),%xmm4
        movups  16(%ecx),%xmm0
        mulps   16(%ebx),%xmm0
        addps   %xmm0,%xmm4
        movups  32(%ecx),%xmm1
        mulps   32(%ebx),%xmm1
        addps   %xmm1,%xmm4
        movups  48(%ecx),%xmm0
        mulps   48(%ebx),%xmm0
        addps   %xmm0,%xmm4
        movhlps %xmm4,%xmm1             /* xmm1[0] = s2 */
        addss   %xmm1,%xmm4             /* xmm4[0] = s0 + s2 */
        cvtps2pi %xmm4,%mm4

        movd    %mm4,%eax

        sar     $16,%eax

        movw    %ax,(%esi)

        addl    $-64,%ebx               /* rows are walked backwards from here */
        addl    $4,%esi
        addl    $256,%ecx
        movl    $15,%ebp                /* 15 samples in loop 2 */

        /*
         * Loop 2: xmm3 = -[s0,s1,s2,s3]; the stored value is the sum of
         * all four lanes, i.e. minus the full 16-term dot product.
         */
.L68:
        xorps   %xmm3,%xmm3

        movups  (%ecx),%xmm4
        mulps   (%ebx),%xmm4
        subps   %xmm4,%xmm3
        movups  16(%ecx),%xmm0
        mulps   16(%ebx),%xmm0
        subps   %xmm0,%xmm3
        movups  32(%ecx),%xmm1
        mulps   32(%ebx),%xmm1
        subps   %xmm1,%xmm3
        movups  48(%ecx),%xmm0
        mulps   48(%ebx),%xmm0
        subps   %xmm0,%xmm3
        movhlps %xmm3,%xmm1             /* xmm1[0..1] = lanes 2, 3 */
        addps   %xmm1,%xmm3             /* lane0 += lane2, lane1 += lane3 */
        movaps  %xmm3,%xmm1
        shufps  $0x55,%xmm1,%xmm1       /* broadcast lane 1 */
        addss   %xmm1,%xmm3             /* xmm3[0] = sum of all four lanes */
        cvtps2pi %xmm3,%mm0

        movd    %mm0,%eax

        sar     $16,%eax

        movw    %ax,(%esi)

        addl    $-64,%ebx               /* previous row */
        subl    $-128,%ecx              /* ecx += 128 */
        addl    $4,%esi
        decl    %ebp
        jnz     .L68

        /* --- end of loop 2 --- */

        emms                            /* leave MMX state before returning */

        movl    %edi,%eax               /* return 0 */
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        addl    $12,%esp
        ret