Mercurial > mplayer.hg
annotate mp3lib/decode_sse.s @ 956:a6cecd9a1bad
'-ao' switch (including '-ao help'), fixing Arpi's bug (short name 'null' for both of oss and null driver ;)
author | lgb |
---|---|
date | Sun, 03 Jun 2001 00:24:49 +0000 |
parents | 3d45e3bcd2bd |
children |
rev | line source |
---|---|
787 | 1 /// |
2 /// Replacement of synth_1to1() with Intel's SSE SIMD operations support | |
3 /// | |
4 /// This code is based on 'decode_k7.s' by Nick Kurshev | |
5 /// <squash@mb.kcom.ne.jp>,only some types of changes have been made: | |
6 /// | |
7 /// - SSE optimization | |
8 /// - change function name for support SSE automatic detect | |
9 /// | |
10 /// Modified by Nick Kurshev <nickols_k@mail.ru> | |
11 /// | |
12 / synth_1to1_3dnow works the same way as the C version of | |
13 / synth_1to1. This assembler code is based on 'decode-i586.s' | |
14 / (by Stefan Bieschewski <stb@acm.org>), two types of changes | |
15 / have been made: | |
16 / - use {MMX,3DNow!} instructions to reduce CPU usage | |
17 / - remove unused(?) local symbols | |
18 / | |
19 / useful sources of information on optimizing 3DNow! code include: | |
20 / AMD 3DNow! Technology Manual (Publication #21928) | |
21 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf | |
22 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) | |
23 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) | |
24 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf | |
25 / | |
26 / This code was tested only on AMD-K6-2 processor Linux systems, | |
27 / please tell me: | |
28 / - whether this code works on other 3DNow! capable processors | |
29 / (ex.IDT-C6-2) or not | |
30 / - whether this code works on other OSes or not | |
31 / | |
32 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 | |
33 / <kim@comtec.co.jp> - after 1.Apr.1998 | |
34 | |
35 / Enhancements for q-word operation by Michael Hipp | |
36 | |
37 .bss | |
38 .comm buffs,4352,4 | |
39 .data | |
40 .align 4 | |
41 bo: | |
42 .long 1 | |
43 .text | |
44 /* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */ | |
45 .globl synth_1to1_sse | |
46 synth_1to1_sse: | |
47 subl $12,%esp | |
48 pushl %ebp | |
49 pushl %edi | |
50 pushl %esi | |
51 pushl %ebx | |
52 | |
53 movl 32(%esp),%eax | |
54 movl 40(%esp),%esi | |
55 movl $0,%edi | |
56 movl bo,%ebp | |
57 cmpl %edi,36(%esp) | |
58 jne .L48 | |
59 decl %ebp | |
60 andl $15,%ebp | |
61 movl %ebp,bo | |
62 movl $buffs,%ecx | |
63 jmp .L49 | |
64 .L48: | |
65 addl $2,%esi | |
66 movl $buffs+2176,%ecx | |
67 .L49: | |
68 testl $1,%ebp | |
69 je .L50 | |
70 movl %ecx,%ebx | |
71 movl %ebp,16(%esp) | |
72 pushl %eax | |
73 movl 20(%esp),%edx | |
74 leal (%ebx,%edx,4),%eax | |
75 pushl %eax | |
76 movl 24(%esp),%eax | |
77 incl %eax | |
78 andl $15,%eax | |
79 leal 1088(,%eax,4),%eax | |
80 addl %ebx,%eax | |
81 jmp .L74 | |
82 .L50: | |
83 leal 1088(%ecx),%ebx | |
84 leal 1(%ebp),%edx | |
85 movl %edx,16(%esp) | |
86 pushl %eax | |
87 leal 1092(%ecx,%ebp,4),%eax | |
88 pushl %eax | |
89 leal (%ecx,%ebp,4),%eax | |
90 .L74: | |
91 pushl %eax | |
92 call dct64 | |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
93 addl $12, %esp |
787 | 94 movl 16(%esp),%edx |
95 leal 0(,%edx,4),%edx | |
96 movl $decwin+64,%eax | |
97 movl %eax,%ecx | |
98 subl %edx,%ecx | |
99 movl $16,%ebp | |
100 | |
101 .L55: | |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
102 movups (%ecx), %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
103 mulps (%ebx), %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
104 movups 16(%ecx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
105 mulps 16(%ebx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
106 addps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
107 movups 32(%ecx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
108 mulps 32(%ebx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
109 addps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
110 movups 48(%ecx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
111 mulps 48(%ebx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
112 addps %xmm1, %xmm0 |
872
3d45e3bcd2bd
Replaced wrond direction floating -> integer conversion.
atmosfear
parents:
864
diff
changeset
|
113 /* pfnacc -> PFNACC mmreg1, mmreg2 performs the following operations: */ |
3d45e3bcd2bd
Replaced wrond direction floating -> integer conversion.
atmosfear
parents:
864
diff
changeset
|
114 /* temp = mmreg2 */ |
3d45e3bcd2bd
Replaced wrond direction floating -> integer conversion.
atmosfear
parents:
864
diff
changeset
|
115 /* mmreg1[31:0] = mmreg1[31:0] - mmreg1[63:32] */ |
3d45e3bcd2bd
Replaced wrond direction floating -> integer conversion.
atmosfear
parents:
864
diff
changeset
|
116 /* mmreg1[63:32]= temp [31:0] - temp[63:32] */ |
3d45e3bcd2bd
Replaced wrond direction floating -> integer conversion.
atmosfear
parents:
864
diff
changeset
|
117 /* save difference of mmreg1's low-word and high-word into mmreg1's low-word */ |
3d45e3bcd2bd
Replaced wrond direction floating -> integer conversion.
atmosfear
parents:
864
diff
changeset
|
118 /* save difference of mmreg2's low-word and high-word into mmreg1's high-word */ |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
119 movhlps %xmm0, %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
120 addps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
121 movaps %xmm0, %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
122 shufps $0x55, %xmm1, %xmm1 /* fake of pfnacc. 1|1|1|1 */ |
872
3d45e3bcd2bd
Replaced wrond direction floating -> integer conversion.
atmosfear
parents:
864
diff
changeset
|
123 |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
124 subss %xmm1, %xmm0 |
872
3d45e3bcd2bd
Replaced wrond direction floating -> integer conversion.
atmosfear
parents:
864
diff
changeset
|
125 cvtss2si %xmm0, %eax |
787 | 126 |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
127 / sar $16,%eax |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
128 movw %ax,(%esi) |
787 | 129 |
130 addl $64,%ebx | |
131 subl $-128,%ecx | |
132 addl $4,%esi | |
133 decl %ebp | |
134 jnz .L55 | |
135 | |
136 / --- end of loop 1 --- | |
137 | |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
138 movups (%ecx), %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
139 mulps (%ebx), %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
140 movups 16(%ecx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
141 mulps 16(%ebx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
142 addps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
143 movups 32(%ecx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
144 mulps 32(%ebx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
145 addps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
146 movups 48(%ecx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
147 mulps 48(%ebx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
148 addps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
149 movhlps %xmm0, %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
150 addss %xmm1, %xmm0 |
872
3d45e3bcd2bd
Replaced wrond direction floating -> integer conversion.
atmosfear
parents:
864
diff
changeset
|
151 cvtss2si %xmm0, %eax |
787 | 152 |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
153 / sar $16,%eax |
787 | 154 |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
155 movw %ax,(%esi) |
787 | 156 |
157 addl $-64,%ebx | |
158 addl $4,%esi | |
159 addl $256,%ecx | |
160 movl $15,%ebp | |
161 | |
162 .L68: | |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
163 xorps %xmm0, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
164 movups (%ecx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
165 mulps (%ebx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
166 subps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
167 movups 16(%ecx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
168 mulps 16(%ebx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
169 subps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
170 movups 32(%ecx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
171 mulps 32(%ebx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
172 subps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
173 movups 48(%ecx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
174 mulps 48(%ebx), %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
175 subps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
176 movhlps %xmm0, %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
177 subps %xmm1, %xmm0 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
178 movaps %xmm0, %xmm1 |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
179 shufps $0x55, %xmm1, %xmm1 /* fake of pfacc 1|1|1|1 */ |
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
180 addss %xmm1, %xmm0 |
872
3d45e3bcd2bd
Replaced wrond direction floating -> integer conversion.
atmosfear
parents:
864
diff
changeset
|
181 cvtss2si %xmm0, %eax |
787 | 182 |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
183 / sar $16,%eax |
787 | 184 |
864
f0a3b5bf2e7a
This version is slightly better then previous, hard to decide.
atmosfear
parents:
863
diff
changeset
|
185 movw %ax,(%esi) |
787 | 186 |
187 addl $-64,%ebx | |
188 subl $-128,%ecx | |
189 addl $4,%esi | |
190 decl %ebp | |
191 jnz .L68 | |
192 | |
193 / --- end of loop 2 | |
194 | |
195 movl %edi,%eax | |
196 popl %ebx | |
197 popl %esi | |
198 popl %edi | |
199 popl %ebp | |
200 addl $12,%esp | |
201 ret |