Mercurial > mplayer.hg
annotate mp3lib/decode_sse.s @ 863:290801346d62
Better working version, no silence, but distorted.
author | atmosfear |
---|---|
date | Thu, 24 May 2001 20:13:28 +0000 |
parents | 9bc104531aec |
children | f0a3b5bf2e7a |
rev | line source |
---|---|
787 | 1 /// |
2 /// Replacement of synth_1to1() with Intel's SSE SIMD operations support | |
3 /// | |
4 /// This code is based on 'decode_k7.s' by Nick Kurshev | |
5 /// <squash@mb.kcom.ne.jp>; only some types of changes have been made: | |
6 /// | |
7 /// - SSE optimization | |
8 /// - change function name to support automatic SSE detection | |
9 /// | |
10 /// Modified by Nick Kurshev <nickols_k@mail.ru> | |
11 /// | |
12 / synth_1to1_3dnow works the same way as the c version of | |
13 / synth_1to1. This assembler code is based on 'decode-i586.s' | |
14 / (by Stefan Bieschewski <stb@acm.org>), two types of changes | |
15 / have been made: | |
16 / - use {MMX,3DNow!} instructions to reduce CPU usage | |
17 / - remove unused(?) local symbols | |
18 / | |
19 / useful sources of information on optimizing 3DNow! code include: | |
20 / AMD 3DNow! Technology Manual (Publication #21928) | |
21 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf | |
22 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) | |
23 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) | |
24 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf | |
25 / | |
26 / This code was tested only on AMD-K6-2 processor Linux systems, | |
27 / please tell me: | |
28 / - whether this code works on other 3DNow! capable processors | |
29 / (ex.IDT-C6-2) or not | |
30 / - whether this code works on other OSes or not | |
31 / | |
32 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 | |
33 / <kim@comtec.co.jp> - after 1.Apr.1998 | |
34 | |
35 / Enhancements for q-word operation by Michael Hipp | |
36 | |
.bss
	/*
	 * Synthesis work area: two 2176-byte halves (buffs and buffs+2176),
	 * one per channel.  Aligned to 16 bytes because the loops below read
	 * it through mulps memory operands, which fault on unaligned
	 * addresses; every offset applied to it is a multiple of 16.
	 */
	.comm	buffs,4352,16
.data
	.align 4
bo:
	.long	1			/* rolling buffer index, masked to 0..15 when updated */
	.align 4
sse_s16_max:
	.float	32767.0			/* upper saturation bound for a 16-bit sample */
sse_s16_min:
	.float	-32768.0		/* lower saturation bound for a 16-bit sample */
.text
/*
 * int synth_1to1_sse(real *bandPtr, int channel, unsigned char *out)
 *
 * ABI:    32-bit x86, arguments on the stack, AT&T syntax.
 * In:     bandPtr - passed through unchanged as third argument of dct64
 *         channel - 0 selects buffs and advances bo; non-zero selects
 *                   buffs+2176 and shifts the output pointer by 2 bytes
 *         out     - destination; one 16-bit sample is stored every
 *                   4 bytes, 32 samples in total (16 + 1 + 15)
 * Out:    %eax = value of %edi, which is zeroed on entry and not
 *         modified by this routine afterwards.
 * Saves:  %ebx, %esi, %edi, %ebp (pushed/popped here).
 * Clobb:  %eax, %ecx, %edx, %xmm0, %xmm1, %xmm2, %xmm4, flags,
 *         plus whatever dct64 clobbers.
 * Stack:  12 bytes of locals; the dword at 16(%esp) after the four
 *         pushes holds the window offset index.
 * Needs:  external symbols dct64 and decwin.
 *
 * Register roles inside the loops:
 *         %ebx = current 64-byte row of the synthesis buffer
 *         %ecx = current position in the decwin window
 *         %esi = output pointer, %ebp = loop counter
 *
 * Every sample is clamped to [-32768, 32767] before the float->int
 * conversion so that out-of-range values saturate instead of wrapping
 * when only the low 16 bits are stored.
 */
.globl synth_1to1_sse
synth_1to1_sse:
	subl	$12,%esp
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	32(%esp),%eax		/* bandPtr */
	movl	40(%esp),%esi		/* out */
	xorl	%edi,%edi		/* return value; also the 0 compared against below */
	movl	bo,%ebp
	cmpl	%edi,36(%esp)		/* channel == 0 ? */
	jne	.Lother_channel
	decl	%ebp			/* channel 0: bo = (bo - 1) & 15 */
	andl	$15,%ebp
	movl	%ebp,bo
	movl	$buffs,%ecx
	jmp	.Lbuffer_chosen
.Lother_channel:
	addl	$2,%esi			/* second 16-bit slot of each 4-byte output pair */
	movl	$buffs+2176,%ecx
.Lbuffer_chosen:
	testl	$1,%ebp
	je	.Lbo_even

	/* bo odd: read rows from the first 1088-byte sub-buffer, window index = bo */
	movl	%ecx,%ebx
	movl	%ebp,16(%esp)
	pushl	%eax			/* dct64 arg 3: bandPtr */
	movl	20(%esp),%edx		/* reload bo (slot moved by the push) */
	leal	(%ebx,%edx,4),%eax
	pushl	%eax			/* dct64 arg 2: base + bo*4 */
	movl	24(%esp),%eax		/* reload bo again (two pushes) */
	incl	%eax
	andl	$15,%eax
	leal	1088(,%eax,4),%eax
	addl	%ebx,%eax		/* dct64 arg 1: base + 1088 + ((bo+1)&15)*4 */
	jmp	.Lcall_dct64

.Lbo_even:
	/* bo even: read rows from the second sub-buffer, window index = bo + 1 */
	leal	1088(%ecx),%ebx
	leal	1(%ebp),%edx
	movl	%edx,16(%esp)
	pushl	%eax			/* dct64 arg 3: bandPtr */
	leal	1092(%ecx,%ebp,4),%eax
	pushl	%eax			/* dct64 arg 2: base + 1088 + (bo+1)*4 */
	leal	(%ecx,%ebp,4),%eax	/* dct64 arg 1: base + bo*4 */

.Lcall_dct64:
	pushl	%eax
	call	dct64
	addl	$12,%esp
	emms				/* kept from the original: reset x87/MMX state after dct64 */

	movl	16(%esp),%edx		/* window index saved above */
	leal	0(,%edx,4),%edx		/* ... as a byte offset */
	movl	$decwin+64,%eax
	movl	%eax,%ecx
	subl	%edx,%ecx		/* %ecx = decwin + 64 - 4*index */
	movl	$16,%ebp

	/*
	 * Loop 1: 16 samples.  Each is a 16-tap dot product of a window row
	 * with a buffer row where even taps are added and odd taps are
	 * subtracted (the "pfnacc" of the 3DNow! original).
	 */
.Lloop1:
	movups	(%ecx),%xmm4
	mulps	(%ebx),%xmm4
	movups	16(%ecx),%xmm0
	mulps	16(%ebx),%xmm0
	addps	%xmm0,%xmm4
	movups	32(%ecx),%xmm1
	mulps	32(%ebx),%xmm1
	addps	%xmm1,%xmm4
	movups	48(%ecx),%xmm0
	mulps	48(%ebx),%xmm0
	addps	%xmm0,%xmm4		/* xmm4 = s0 | s1 | s2 | s3 (per-lane sums) */

	movhlps	%xmm4,%xmm1		/* xmm1 low half = s2 | s3 */
	addps	%xmm1,%xmm4		/* lane0 = s0+s2, lane1 = s1+s3 */
	/*
	 * shufps takes its two low result lanes from the DESTINATION, so
	 * the broadcast of lane 1 must shuffle a copy with itself.
	 */
	movaps	%xmm4,%xmm1
	shufps	$0x55,%xmm1,%xmm1	/* every lane = s1+s3 */
	subss	%xmm1,%xmm4		/* lane0 = (s0+s2) - (s1+s3) */

	minss	sse_s16_max,%xmm4	/* saturate to the 16-bit range */
	maxss	sse_s16_min,%xmm4
	cvtss2si %xmm4,%eax
	movw	%ax,(%esi)

	addl	$64,%ebx		/* next buffer row */
	subl	$-128,%ecx		/* window += 128 bytes (imm8 encoding) */
	addl	$4,%esi
	decl	%ebp
	jnz	.Lloop1

	/*
	 * Middle sample: only lanes 0 and 2 (the even taps) contribute.
	 */
	movups	(%ecx),%xmm4
	mulps	(%ebx),%xmm4
	movups	16(%ecx),%xmm0
	mulps	16(%ebx),%xmm0
	addps	%xmm0,%xmm4
	movups	32(%ecx),%xmm1
	mulps	32(%ebx),%xmm1
	addps	%xmm1,%xmm4
	movups	48(%ecx),%xmm0
	mulps	48(%ebx),%xmm0
	addps	%xmm0,%xmm4

	movhlps	%xmm4,%xmm1		/* xmm1 low half = s2 | s3 */
	addps	%xmm1,%xmm4		/* lane0 = s0+s2 */

	minss	sse_s16_max,%xmm4
	maxss	sse_s16_min,%xmm4
	cvtss2si %xmm4,%eax
	movw	%ax,(%esi)

	addl	$-64,%ebx		/* buffer rows are walked backwards from here on */
	addl	$4,%esi
	addl	$256,%ecx
	movl	$15,%ebp

	/*
	 * Loop 2: 15 samples.  Each is the negated sum of all 16 tap
	 * products (the "pfacc" of the 3DNow! original).
	 * NOTE(review): the window pointer keeps moving forward here while
	 * the buffer pointer moves backward; confirm this matches the
	 * layout of decwin.
	 */
.Lloop2:
	xorps	%xmm0,%xmm0		/* accumulator = 0 */

	movups	(%ecx),%xmm2
	mulps	(%ebx),%xmm2
	subps	%xmm2,%xmm0

	movups	16(%ecx),%xmm2
	mulps	16(%ebx),%xmm2
	subps	%xmm2,%xmm0

	movups	32(%ecx),%xmm2
	mulps	32(%ebx),%xmm2
	subps	%xmm2,%xmm0

	movups	48(%ecx),%xmm2
	mulps	48(%ebx),%xmm2
	subps	%xmm2,%xmm0		/* xmm0 = t0 | t1 | t2 | t3 (negated lane sums) */

	movhlps	%xmm0,%xmm1		/* xmm1 low half = t2 | t3 */
	addps	%xmm1,%xmm0		/* lane0 = t0+t2, lane1 = t1+t3 */
	movaps	%xmm0,%xmm1
	shufps	$0x55,%xmm1,%xmm1	/* every lane = t1+t3 */
	addss	%xmm1,%xmm0		/* lane0 = t0+t1+t2+t3 */

	minss	sse_s16_max,%xmm0
	maxss	sse_s16_min,%xmm0
	cvtss2si %xmm0,%eax
	movw	%ax,(%esi)

	addl	$-64,%ebx
	subl	$-128,%ecx
	addl	$4,%esi
	decl	%ebp
	jnz	.Lloop2

	emms				/* kept from the original; no MMX register is used above */

	movl	%edi,%eax
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$12,%esp
	ret