Mercurial > mplayer.hg
annotate mp3lib/decode_sse.s @ 871:ab94c4cf96d8
Continue implementation of ASF streaming.
author | bertrand |
---|---|
date | Fri, 25 May 2001 14:01:21 +0000 |
parents | f0a3b5bf2e7a |
children | 3d45e3bcd2bd |
rev | line source |
---|---|
787 | 1 /// |
2 /// Replacement of synth_1to1() with Intel's SSE SIMD operations support | |
3 /// | |
4 /// This code is based on 'decode_k7.s' by Nick Kurshev | |
5 /// <squash@mb.kcom.ne.jp>,only some types of changes have been made: | |
6 /// | |
7 /// - SSE optimization | |
8 /// - change the function name to support automatic SSE detection | |
9 /// | |
10 /// Modified by Nick Kurshev <nickols_k@mail.ru> | |
11 /// | |
12 / synth_1to1_3dnow works the same way as the c version of | |
13 / synth_1to1. This assembler code is based on 'decode-i586.s' | |
14 / (by Stefan Bieschewski <stb@acm.org>), two types of changes | |
15 / have been made: | |
16 / - use {MMX,3DNow!} instructions to reduce CPU load | |
17 / - remove unused(?) local symbols | |
18 / | |
19 / useful sources of information on optimizing 3DNow! code include: | |
20 / AMD 3DNow! Technology Manual (Publication #21928) | |
21 / English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf | |
22 / (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf) | |
23 / AMD-K6-2 Processor Code Optimization Application Note (Publication #21924) | |
24 / English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf | |
25 / | |
26 / This code was tested only on AMD-K6-2 processor Linux systems, | |
27 / please tell me: | |
28 / - whether this code works on other 3DNow! capable processors | |
29 / (ex.IDT-C6-2) or not | |
30 / - whether this code works on other OSes or not | |
31 / | |
32 / by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998 | |
33 / <kim@comtec.co.jp> - after 1.Apr.1998 | |
34 | |
35 / Enhancements for q-word operation by Michael Hipp | |
36 | |
/*
 * Static state used by synth_1to1_sse.
 *
 * buffs: 4352 bytes of common (zero-initialised) storage.  The synthesis
 *        routine uses it as two 2176-byte halves (buffs and buffs+2176,
 *        selected by the channel argument), each consisting of two
 *        1088-byte sub-buffers.
 *
 *        The alignment is 16 bytes, not 4: rows of this buffer are used as
 *        memory operands of mulps, which raises a general-protection fault
 *        unless the address is 16-byte aligned.  Every offset applied to
 *        the base (1088, 2176, +/-64) is a multiple of 16, so aligning the
 *        base is sufficient.  (The third .comm operand is a byte alignment
 *        on ELF targets.)
 *
 * bo:    rotating buffer offset, initially 1; synth_1to1_sse decrements it
 *        modulo 16 each time it is called for channel 0.
 */
	.bss
	.comm	buffs,4352,16
	.data
	.align	4
bo:
	.long	1
/*-----------------------------------------------------------------------
 * int synth_1to1_sse(real *bandPtr, int channel, unsigned char *out)
 *
 * SSE version of synth_1to1(): runs dct64 on bandPtr into the rotating
 * synthesis buffer, then windows the buffer against decwin and writes
 * 32 signed 16-bit samples to out (16 + 1 + 15, one every 4 bytes; for
 * channel != 0 the first sample is written at out+2).
 *
 * Syntax: GNU as, AT&T.   ABI: 32-bit x86, all arguments on the stack.
 * Externals: dct64 (called with three stack arguments), decwin, buffs, bo.
 * Returns:  %eax = 0 (this implementation does not count clipped samples).
 *
 * Stack frame after the prologue (12 bytes of locals + 4 saved registers):
 *     16(%esp)  bo1, the window phase, kept here across the dct64 call
 *     28(%esp)  return address
 *     32(%esp)  bandPtr
 *     36(%esp)  channel
 *     40(%esp)  out
 *
 * Register roles in the windowing loops:
 *     %ebx  current 64-byte row of the synthesis buffer; used as a mulps
 *           memory operand, so it must be 16-byte aligned (buffs itself
 *           must therefore be 16-byte aligned)
 *     %ecx  current position in decwin (loaded with movups, may be
 *           unaligned)
 *     %esi  output pointer
 *     %ebp  loop counter
 *     %edi  return value (stays 0)
 *     %eax, %edx, %xmm0, %xmm1  scratch
 *-----------------------------------------------------------------------*/

/*
 * xmm0 = lane-wise sum over the 16 taps at (%ecx) x (%ebx):
 * lane k accumulates products of elements k, k+4, k+8 and k+12.
 */
.macro	SSE_DOT16
	movups	(%ecx),%xmm0
	mulps	(%ebx),%xmm0
	movups	16(%ecx),%xmm1
	mulps	16(%ebx),%xmm1
	addps	%xmm1,%xmm0
	movups	32(%ecx),%xmm1
	mulps	32(%ebx),%xmm1
	addps	%xmm1,%xmm0
	movups	48(%ecx),%xmm1
	mulps	48(%ebx),%xmm1
	addps	%xmm1,%xmm0
.endm

/*
 * Truncate the scalar in xmm0 to an integer, saturate it to the signed
 * 16-bit range and store it at (%esi).  Without the clamp only the low
 * 16 bits of the 32-bit result would be stored and loud samples would
 * wrap around.  (A float outside the 32-bit range converts to
 * 0x80000000 and is therefore stored as -32768.)
 * Clobbers %eax and the flags.
 */
.macro	SSE_STORE_S16
	cvttss2si %xmm0,%eax
	cmpl	$32767,%eax
	jle	1f
	movl	$32767,%eax
1:
	cmpl	$-32768,%eax
	jge	2f
	movl	$-32768,%eax
2:
	movw	%ax,(%esi)
.endm

	.text
	.globl	synth_1to1_sse
synth_1to1_sse:
	subl	$12,%esp		/* local scratch; 16(%esp) holds bo1 */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	32(%esp),%eax		/* eax = bandPtr */
	movl	40(%esp),%esi		/* esi = out */
	xorl	%edi,%edi		/* edi = 0: return value and zero for the compare */
	movl	bo,%ebp
	cmpl	%edi,36(%esp)		/* channel == 0 ? */
	jne	.Lsse_other_channel

	/* channel 0: advance the rotating offset, bo = (bo - 1) & 15 */
	decl	%ebp
	andl	$15,%ebp
	movl	%ebp,bo
	movl	$buffs,%ecx		/* first half of buffs */
	jmp	.Lsse_have_buffer

.Lsse_other_channel:
	addl	$2,%esi			/* samples for this channel start at out+2 */
	movl	$buffs+2176,%ecx	/* second half of buffs */

.Lsse_have_buffer:
	testl	$1,%ebp
	je	.Lsse_bo_even

	/* bo odd: window from sub-buffer 0, bo1 = bo;
	 * dct64(buf + 1088 + 4*((bo+1)&15), buf + 4*bo, bandPtr) */
	movl	%ecx,%ebx
	movl	%ebp,16(%esp)		/* bo1 = bo */
	pushl	%eax			/* arg 3: bandPtr */
	leal	(%ebx,%ebp,4),%eax
	pushl	%eax			/* arg 2 */
	leal	1(%ebp),%eax
	andl	$15,%eax
	leal	1088(%ebx,%eax,4),%eax	/* arg 1, pushed below */
	jmp	.Lsse_call_dct64

.Lsse_bo_even:
	/* bo even: window from sub-buffer 1, bo1 = bo + 1;
	 * dct64(buf + 4*bo, buf + 1088 + 4*(bo+1), bandPtr) */
	leal	1088(%ecx),%ebx
	leal	1(%ebp),%edx
	movl	%edx,16(%esp)		/* bo1 = bo + 1 */
	pushl	%eax			/* arg 3: bandPtr */
	leal	1092(%ecx,%ebp,4),%eax
	pushl	%eax			/* arg 2 */
	leal	(%ecx,%ebp,4),%eax	/* arg 1, pushed below */

.Lsse_call_dct64:
	pushl	%eax
	call	dct64
	addl	$12,%esp		/* drop the three arguments */

	movl	16(%esp),%edx		/* edx = bo1 */
	shll	$2,%edx			/* ... as a byte offset */
	movl	$decwin+64,%ecx
	subl	%edx,%ecx		/* ecx = decwin + 64 - 4*bo1 */
	movl	$16,%ebp

	/* --- loop 1: 16 samples, taps combined with alternating sign --- */
.Lsse_loop1:
	SSE_DOT16
	movhlps	%xmm0,%xmm1
	addps	%xmm1,%xmm0		/* lane0 = s0+s2, lane1 = s1+s3 */
	movaps	%xmm0,%xmm1
	shufps	$0x55,%xmm1,%xmm1	/* broadcast lane 1 (stand-in for pfnacc) */
	subss	%xmm1,%xmm0		/* (s0+s2) - (s1+s3) */
	SSE_STORE_S16

	addl	$64,%ebx
	subl	$-128,%ecx		/* ecx += 128 */
	addl	$4,%esi
	decl	%ebp
	jnz	.Lsse_loop1

	/* --- middle sample: only lanes 0 and 2 contribute --- */
	SSE_DOT16
	movhlps	%xmm0,%xmm1
	addss	%xmm1,%xmm0		/* s0 + s2 */
	SSE_STORE_S16

	addl	$-64,%ebx
	addl	$4,%esi
	addl	$256,%ecx
	movl	$15,%ebp

	/* --- loop 2: 15 samples, every tap subtracted, buffer walked backwards --- */
.Lsse_loop2:
	xorps	%xmm0,%xmm0
	movups	(%ecx),%xmm1
	mulps	(%ebx),%xmm1
	subps	%xmm1,%xmm0
	movups	16(%ecx),%xmm1
	mulps	16(%ebx),%xmm1
	subps	%xmm1,%xmm0
	movups	32(%ecx),%xmm1
	mulps	32(%ebx),%xmm1
	subps	%xmm1,%xmm0
	movups	48(%ecx),%xmm1
	mulps	48(%ebx),%xmm1
	subps	%xmm1,%xmm0		/* lanes = -s0, -s1, -s2, -s3 */
	movhlps	%xmm0,%xmm1
	/* addps, not subps: the lanes are already negated, and subtracting
	 * the upper pair here would flip the sign of s2 and s3. */
	addps	%xmm1,%xmm0		/* lane0 = -(s0+s2), lane1 = -(s1+s3) */
	movaps	%xmm0,%xmm1
	shufps	$0x55,%xmm1,%xmm1	/* broadcast lane 1 (stand-in for pfacc) */
	addss	%xmm1,%xmm0		/* -(s0+s1+s2+s3) */
	SSE_STORE_S16

	addl	$-64,%ebx
	subl	$-128,%ecx		/* ecx += 128 */
	addl	$4,%esi
	decl	%ebp
	jnz	.Lsse_loop2

	/* --- epilogue --- */
	movl	%edi,%eax		/* return 0 */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$12,%esp
	ret