annotate mp3lib/decode_MMX.c @ 23342:e070d7f61e9a

Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
author zuxy
date Mon, 21 May 2007 01:47:27 +0000
parents 8092494fc92c
children ccb70d86d797
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4142
nick
parents:
diff changeset
1 /*
nick
parents:
diff changeset
2 * this code comes under GPL
nick
parents:
diff changeset
3 * This code was taken from http://www.mpg123.org
nick
parents:
diff changeset
4 * See ChangeLog of mpg123-0.59s-pre.1 for detail
nick
parents:
diff changeset
5 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
nick
parents:
diff changeset
6 *
nick
parents:
diff changeset
7 * Local ChangeLog:
nick
parents:
diff changeset
8 * - Partial loops unrolling and removing MOVW insn from loops
nick
parents:
diff changeset
9 */
16989
e7a129082fda Unify include paths, -I.. is in CFLAGS.
diego
parents: 12292
diff changeset
10 #include "config.h"
e7a129082fda Unify include paths, -I.. is in CFLAGS.
diego
parents: 12292
diff changeset
11 #include "mangle.h"
4142
nick
parents:
diff changeset
12 #define real float /* ugly - but only way */
nick
parents:
diff changeset
13
23342
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
14 extern short mp3lib_decwins[];
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
15 extern void (*dct64_MMX_func)(short*, short*, real*);
12292
114f3d149324 attribute_used for gcc3.4
alex
parents: 11266
diff changeset
16 static unsigned long long attribute_used __attribute__((aligned(8))) null_one = 0x0000ffff0000ffffULL;
114f3d149324 attribute_used for gcc3.4
alex
parents: 11266
diff changeset
17 static unsigned long long attribute_used __attribute__((aligned(8))) one_null = 0xffff0000ffff0000ULL;
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents: 16989
diff changeset
/*
 * Constant table with external linkage, 16-byte aligned, 31 entries.
 *
 * The entries form five increasing runs of 16, 8, 4, 2 and 1 values.
 * The first entry, 1056974725 == 0x3F002785, reads as roughly 0.5006 when
 * interpreted as an IEEE-754 single; presumably every entry is such a bit
 * pattern consumed by the dct64 routines - TODO confirm against the users
 * of this table.
 * NOTE(review): on LP64 targets `unsigned long` is 8 bytes wide, so the
 * in-memory layout differs from a packed array of 32-bit values - verify
 * before relying on this table in an AMD64 port.
 */
unsigned long __attribute__((aligned(16))) costab_mmx[] =
{
    /* run of 16 */
    1056974725, 1057056395, 1057223771, 1057485416,
    1057855544, 1058356026, 1059019886, 1059897405,
    1061067246, 1062657950, 1064892987, 1066774581,
    1069414683, 1073984175, 1079645762, 1092815430,
    /* run of 8 */
    1057005197, 1057342072, 1058087743, 1059427869,
    1061799040, 1065862217, 1071413542, 1084439708,
    /* run of 4 */
    1057128951, 1058664893, 1063675095, 1076102863,
    /* run of 2 */
    1057655764, 1067924853,
    /* single entry */
    1060439283,
};
nick
parents:
diff changeset
52
23342
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
53 int synth_1to1_MMX(real *bandPtr, int channel, short *samples)
4142
nick
parents:
diff changeset
54 {
23342
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
55 static short buffs[2][2][0x110] __attribute__((aligned(8)));
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
56 static int bo = 1;
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
57 short *b0, (*buf)[0x110], *a, *b;
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
58 short* window;
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
59 int bo1, i = 8;
11244
b91bd88dafea 100l to Nick
alex
parents: 11240
diff changeset
60
23342
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
61 if (channel == 0) {
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
62 bo = (bo - 1) & 0xf;
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
63 buf = buffs[1];
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
64 } else {
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
65 samples++;
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
66 buf = buffs[0];
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
67 }
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
68
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
69 if (bo & 1) {
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
70 b0 = buf[1];
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
71 bo1 = bo + 1;
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
72 a = buf[0] + bo;
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
73 b = buf[1] + ((bo + 1) & 0xf);
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
74 } else {
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
75 b0 = buf[0];
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
76 bo1 = bo;
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
77 b = buf[0] + bo;
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
78 a = buf[1] + ((bo + 1) & 0xf);
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
79 }
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
80
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
81 dct64_MMX_func(a, b, bandPtr);
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
82 window = mp3lib_decwins + 16 - bo1;
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
83 //printf("DEBUG: channel %d, bo %d, off %d\n", channel, bo, 16 - bo1);
4142
nick
parents:
diff changeset
84 __asm __volatile(
21046
ba5087cb0bd4 Use ASMALIGN macro for better compatibility and remove SYS_DARWIN
reimar
parents: 21040
diff changeset
85 ASMALIGN(4)
4608
b9c19fe73850 Fix according to strange bugreport
nick
parents: 4322
diff changeset
86 ".L03:\n\t"
4142
nick
parents:
diff changeset
87 "movq (%%edx),%%mm0\n\t"
nick
parents:
diff changeset
88 "movq 64(%%edx),%%mm4\n\t"
nick
parents:
diff changeset
89 "pmaddwd (%%esi),%%mm0\n\t"
nick
parents:
diff changeset
90 "pmaddwd 32(%%esi),%%mm4\n\t"
nick
parents:
diff changeset
91 "movq 8(%%edx),%%mm1\n\t"
nick
parents:
diff changeset
92 "movq 72(%%edx),%%mm5\n\t"
nick
parents:
diff changeset
93 "pmaddwd 8(%%esi),%%mm1\n\t"
nick
parents:
diff changeset
94 "pmaddwd 40(%%esi),%%mm5\n\t"
nick
parents:
diff changeset
95 "movq 16(%%edx),%%mm2\n\t"
nick
parents:
diff changeset
96 "movq 80(%%edx),%%mm6\n\t"
nick
parents:
diff changeset
97 "pmaddwd 16(%%esi),%%mm2\n\t"
nick
parents:
diff changeset
98 "pmaddwd 48(%%esi),%%mm6\n\t"
nick
parents:
diff changeset
99 "movq 24(%%edx),%%mm3\n\t"
nick
parents:
diff changeset
100 "movq 88(%%edx),%%mm7\n\t"
nick
parents:
diff changeset
101 "pmaddwd 24(%%esi),%%mm3\n\t"
nick
parents:
diff changeset
102 "pmaddwd 56(%%esi),%%mm7\n\t"
nick
parents:
diff changeset
103 "paddd %%mm1,%%mm0\n\t"
nick
parents:
diff changeset
104 "paddd %%mm5,%%mm4\n\t"
nick
parents:
diff changeset
105 "paddd %%mm2,%%mm0\n\t"
nick
parents:
diff changeset
106 "paddd %%mm6,%%mm4\n\t"
nick
parents:
diff changeset
107 "paddd %%mm3,%%mm0\n\t"
nick
parents:
diff changeset
108 "paddd %%mm7,%%mm4\n\t"
nick
parents:
diff changeset
109 "movq %%mm0,%%mm1\n\t"
nick
parents:
diff changeset
110 "movq %%mm4,%%mm5\n\t"
nick
parents:
diff changeset
111 "psrlq $32,%%mm1\n\t"
nick
parents:
diff changeset
112 "psrlq $32,%%mm5\n\t"
nick
parents:
diff changeset
113 "paddd %%mm1,%%mm0\n\t"
nick
parents:
diff changeset
114 "paddd %%mm5,%%mm4\n\t"
nick
parents:
diff changeset
115 "psrad $13,%%mm0\n\t"
nick
parents:
diff changeset
116 "psrad $13,%%mm4\n\t"
nick
parents:
diff changeset
117 "packssdw %%mm0,%%mm0\n\t"
nick
parents:
diff changeset
118 "packssdw %%mm4,%%mm4\n\t"
nick
parents:
diff changeset
119
nick
parents:
diff changeset
120 "movq (%%edi), %%mm1\n\t"
nick
parents:
diff changeset
121 "punpckldq %%mm4, %%mm0\n\t"
4246
3f677202418b mangling in mp3lib + stdcall undefined fix with cygwin
atmos4
parents: 4142
diff changeset
122 "pand "MANGLE(one_null)", %%mm1\n\t"
3f677202418b mangling in mp3lib + stdcall undefined fix with cygwin
atmos4
parents: 4142
diff changeset
123 "pand "MANGLE(null_one)", %%mm0\n\t"
4142
nick
parents:
diff changeset
124 "por %%mm0, %%mm1\n\t"
nick
parents:
diff changeset
125 "movq %%mm1,(%%edi)\n\t"
nick
parents:
diff changeset
126
nick
parents:
diff changeset
127 "leal 64(%%esi),%%esi\n\t"
nick
parents:
diff changeset
128 "leal 128(%%edx),%%edx\n\t"
nick
parents:
diff changeset
129 "leal 8(%%edi),%%edi\n\t"
nick
parents:
diff changeset
130
nick
parents:
diff changeset
131 "decl %%ecx\n\t"
4608
b9c19fe73850 Fix according to strange bugreport
nick
parents: 4322
diff changeset
132 "jnz .L03\n\t"
4142
nick
parents:
diff changeset
133
nick
parents:
diff changeset
134 "movq (%%edx),%%mm0\n\t"
nick
parents:
diff changeset
135 "pmaddwd (%%esi),%%mm0\n\t"
nick
parents:
diff changeset
136 "movq 8(%%edx),%%mm1\n\t"
nick
parents:
diff changeset
137 "pmaddwd 8(%%esi),%%mm1\n\t"
nick
parents:
diff changeset
138 "movq 16(%%edx),%%mm2\n\t"
nick
parents:
diff changeset
139 "pmaddwd 16(%%esi),%%mm2\n\t"
nick
parents:
diff changeset
140 "movq 24(%%edx),%%mm3\n\t"
nick
parents:
diff changeset
141 "pmaddwd 24(%%esi),%%mm3\n\t"
nick
parents:
diff changeset
142 "paddd %%mm1,%%mm0\n\t"
nick
parents:
diff changeset
143 "paddd %%mm2,%%mm0\n\t"
nick
parents:
diff changeset
144 "paddd %%mm3,%%mm0\n\t"
nick
parents:
diff changeset
145 "movq %%mm0,%%mm1\n\t"
nick
parents:
diff changeset
146 "psrlq $32,%%mm1\n\t"
nick
parents:
diff changeset
147 "paddd %%mm1,%%mm0\n\t"
nick
parents:
diff changeset
148 "psrad $13,%%mm0\n\t"
nick
parents:
diff changeset
149 "packssdw %%mm0,%%mm0\n\t"
nick
parents:
diff changeset
150 "movd %%mm0,%%eax\n\t"
nick
parents:
diff changeset
151 "movw %%ax, (%%edi)\n\t"
nick
parents:
diff changeset
152 "leal 32(%%esi),%%esi\n\t"
nick
parents:
diff changeset
153 "leal 64(%%edx),%%edx\n\t"
nick
parents:
diff changeset
154 "leal 4(%%edi),%%edi\n\t"
nick
parents:
diff changeset
155
nick
parents:
diff changeset
156 "subl $64,%%esi\n\t"
nick
parents:
diff changeset
157 "movl $7,%%ecx\n\t"
21046
ba5087cb0bd4 Use ASMALIGN macro for better compatibility and remove SYS_DARWIN
reimar
parents: 21040
diff changeset
158 ASMALIGN(4)
4608
b9c19fe73850 Fix according to strange bugreport
nick
parents: 4322
diff changeset
159 ".L04:\n\t"
4142
nick
parents:
diff changeset
160 "movq (%%edx),%%mm0\n\t"
nick
parents:
diff changeset
161 "movq 64(%%edx),%%mm4\n\t"
nick
parents:
diff changeset
162 "pmaddwd (%%esi),%%mm0\n\t"
nick
parents:
diff changeset
163 "pmaddwd -32(%%esi),%%mm4\n\t"
nick
parents:
diff changeset
164 "movq 8(%%edx),%%mm1\n\t"
nick
parents:
diff changeset
165 "movq 72(%%edx),%%mm5\n\t"
nick
parents:
diff changeset
166 "pmaddwd 8(%%esi),%%mm1\n\t"
nick
parents:
diff changeset
167 "pmaddwd -24(%%esi),%%mm5\n\t"
nick
parents:
diff changeset
168 "movq 16(%%edx),%%mm2\n\t"
nick
parents:
diff changeset
169 "movq 80(%%edx),%%mm6\n\t"
nick
parents:
diff changeset
170 "pmaddwd 16(%%esi),%%mm2\n\t"
nick
parents:
diff changeset
171 "pmaddwd -16(%%esi),%%mm6\n\t"
nick
parents:
diff changeset
172 "movq 24(%%edx),%%mm3\n\t"
nick
parents:
diff changeset
173 "movq 88(%%edx),%%mm7\n\t"
nick
parents:
diff changeset
174 "pmaddwd 24(%%esi),%%mm3\n\t"
nick
parents:
diff changeset
175 "pmaddwd -8(%%esi),%%mm7\n\t"
nick
parents:
diff changeset
176 "paddd %%mm1,%%mm0\n\t"
nick
parents:
diff changeset
177 "paddd %%mm5,%%mm4\n\t"
nick
parents:
diff changeset
178 "paddd %%mm2,%%mm0\n\t"
nick
parents:
diff changeset
179 "paddd %%mm6,%%mm4\n\t"
nick
parents:
diff changeset
180 "paddd %%mm3,%%mm0\n\t"
nick
parents:
diff changeset
181 "paddd %%mm7,%%mm4\n\t"
nick
parents:
diff changeset
182 "movq %%mm0,%%mm1\n\t"
nick
parents:
diff changeset
183 "movq %%mm4,%%mm5\n\t"
nick
parents:
diff changeset
184 "psrlq $32,%%mm1\n\t"
nick
parents:
diff changeset
185 "psrlq $32,%%mm5\n\t"
nick
parents:
diff changeset
186 "paddd %%mm0,%%mm1\n\t"
nick
parents:
diff changeset
187 "paddd %%mm4,%%mm5\n\t"
nick
parents:
diff changeset
188 "psrad $13,%%mm1\n\t"
nick
parents:
diff changeset
189 "psrad $13,%%mm5\n\t"
nick
parents:
diff changeset
190 "packssdw %%mm1,%%mm1\n\t"
nick
parents:
diff changeset
191 "packssdw %%mm5,%%mm5\n\t"
nick
parents:
diff changeset
192 "psubd %%mm0,%%mm0\n\t"
nick
parents:
diff changeset
193 "psubd %%mm4,%%mm4\n\t"
nick
parents:
diff changeset
194 "psubsw %%mm1,%%mm0\n\t"
nick
parents:
diff changeset
195 "psubsw %%mm5,%%mm4\n\t"
nick
parents:
diff changeset
196
nick
parents:
diff changeset
197 "movq (%%edi), %%mm1\n\t"
nick
parents:
diff changeset
198 "punpckldq %%mm4, %%mm0\n\t"
4246
3f677202418b mangling in mp3lib + stdcall undefined fix with cygwin
atmos4
parents: 4142
diff changeset
199 "pand "MANGLE(one_null)", %%mm1\n\t"
3f677202418b mangling in mp3lib + stdcall undefined fix with cygwin
atmos4
parents: 4142
diff changeset
200 "pand "MANGLE(null_one)", %%mm0\n\t"
4142
nick
parents:
diff changeset
201 "por %%mm0, %%mm1\n\t"
nick
parents:
diff changeset
202 "movq %%mm1,(%%edi)\n\t"
nick
parents:
diff changeset
203
nick
parents:
diff changeset
204 "subl $64,%%esi\n\t"
nick
parents:
diff changeset
205 "addl $128,%%edx\n\t"
nick
parents:
diff changeset
206 "leal 8(%%edi),%%edi\n\t"
nick
parents:
diff changeset
207 "decl %%ecx\n\t"
4608
b9c19fe73850 Fix according to strange bugreport
nick
parents: 4322
diff changeset
208 "jnz .L04\n\t"
4142
nick
parents:
diff changeset
209
nick
parents:
diff changeset
210 "movq (%%edx),%%mm0\n\t"
nick
parents:
diff changeset
211 "pmaddwd (%%esi),%%mm0\n\t"
nick
parents:
diff changeset
212 "movq 8(%%edx),%%mm1\n\t"
nick
parents:
diff changeset
213 "pmaddwd 8(%%esi),%%mm1\n\t"
nick
parents:
diff changeset
214 "movq 16(%%edx),%%mm2\n\t"
nick
parents:
diff changeset
215 "pmaddwd 16(%%esi),%%mm2\n\t"
nick
parents:
diff changeset
216 "movq 24(%%edx),%%mm3\n\t"
nick
parents:
diff changeset
217 "pmaddwd 24(%%esi),%%mm3\n\t"
nick
parents:
diff changeset
218 "paddd %%mm1,%%mm0\n\t"
nick
parents:
diff changeset
219 "paddd %%mm2,%%mm0\n\t"
nick
parents:
diff changeset
220 "paddd %%mm3,%%mm0\n\t"
nick
parents:
diff changeset
221 "movq %%mm0,%%mm1\n\t"
nick
parents:
diff changeset
222 "psrlq $32,%%mm1\n\t"
nick
parents:
diff changeset
223 "paddd %%mm0,%%mm1\n\t"
nick
parents:
diff changeset
224 "psrad $13,%%mm1\n\t"
nick
parents:
diff changeset
225 "packssdw %%mm1,%%mm1\n\t"
nick
parents:
diff changeset
226 "psubd %%mm0,%%mm0\n\t"
nick
parents:
diff changeset
227 "psubsw %%mm1,%%mm0\n\t"
nick
parents:
diff changeset
228 "movd %%mm0,%%eax\n\t"
nick
parents:
diff changeset
229 "movw %%ax,(%%edi)\n\t"
nick
parents:
diff changeset
230 "emms\n\t"
23342
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
231 :"+c"(i), "+d"(window), "+S"(b0), "+D"(samples)
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
232 :
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
233 :"memory", "%eax");
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
234 return 0;
4142
nick
parents:
diff changeset
235 }
23342
e070d7f61e9a Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents: 22375
diff changeset
236