Mercurial > mplayer.hg
annotate mp3lib/decode_MMX.c @ 23358:ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
author | zuxy |
---|---|
date | Tue, 22 May 2007 14:00:39 +0000 |
parents | e070d7f61e9a |
children | 57a99b0631b2 |
rev | line source |
---|---|
4142 | 1 /* |
2 * this code comes under GPL | |
3 * This code was taken from http://www.mpg123.org | |
4 * See ChangeLog of mpg123-0.59s-pre.1 for detail | |
5 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> | |
6 * | |
7 * Local ChangeLog: | |
8 * - Partial loop unrolling and removal of MOVW instructions from loops | |
9 */ | |
16989 | 10 #include "config.h" |
11 #include "mangle.h" | |
4142 | 12 #define real float /* ugly - but only way */ |
13 | |
23342
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
14 extern short mp3lib_decwins[]; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
15 extern void (*dct64_MMX_func)(short*, short*, real*); |
/* 64-bit, 8-byte-aligned bit masks referenced by name (through MANGLE) from the
 * inline assembly in synth_1to1_MMX.
 *   null_one: keeps the low 16 bits of each 32-bit half of a quadword.
 *   one_null: keeps the high 16 bits of each 32-bit half of a quadword.
 * NOTE(review): attribute_used is defined outside this file; presumably it stops
 * the compiler from discarding these statics, which are only referenced from
 * asm text - confirm. */
12292 | 16 static unsigned long long attribute_used __attribute__((aligned(8))) null_one = 0x0000ffff0000ffffULL; |
17 static unsigned long long attribute_used __attribute__((aligned(8))) one_null = 0xffff0000ffff0000ULL; | |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
16989
diff
changeset
|
/* costab_mmx: externally visible, 16-byte-aligned table of 31 integer constants.
 * NOTE(review): the values look like IEEE-754 single-precision bit patterns
 * (e.g. 1056974725 == 0x3F002785, roughly 0.5006), presumably cosine factors
 * consumed by the dct64 routines in groups of 16+8+4+2+1 - confirm against the
 * code that reads this table.
 * NOTE(review): the element type is unsigned long, which is 8 bytes on LP64
 * targets; any reader that steps through the table in 4-byte units would see a
 * different layout there - confirm before building for AMD64. */
18 unsigned long __attribute__((aligned(16))) costab_mmx[] = |
4142 | 19 { |
20 1056974725, | |
21 1057056395, | |
22 1057223771, | |
23 1057485416, | |
24 1057855544, | |
25 1058356026, | |
26 1059019886, | |
27 1059897405, | |
28 1061067246, | |
29 1062657950, | |
30 1064892987, | |
31 1066774581, | |
32 1069414683, | |
33 1073984175, | |
34 1079645762, | |
35 1092815430, | |
36 1057005197, | |
37 1057342072, | |
38 1058087743, | |
39 1059427869, | |
40 1061799040, | |
41 1065862217, | |
42 1071413542, | |
43 1084439708, | |
44 1057128951, | |
45 1058664893, | |
46 1063675095, | |
47 1076102863, | |
48 1057655764, | |
49 1067924853, | |
50 1060439283, | |
51 }; | |
52 | |
23342
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
53 int synth_1to1_MMX(real *bandPtr, int channel, short *samples) |
4142 | 54 { |
23342
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
55 static short buffs[2][2][0x110] __attribute__((aligned(8))); |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
56 static int bo = 1; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
57 short *b0, (*buf)[0x110], *a, *b; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
58 short* window; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
59 int bo1, i = 8; |
11244 | 60 |
23342
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
61 if (channel == 0) { |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
62 bo = (bo - 1) & 0xf; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
63 buf = buffs[1]; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
64 } else { |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
65 samples++; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
66 buf = buffs[0]; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
67 } |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
68 |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
69 if (bo & 1) { |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
70 b0 = buf[1]; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
71 bo1 = bo + 1; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
72 a = buf[0] + bo; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
73 b = buf[1] + ((bo + 1) & 0xf); |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
74 } else { |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
75 b0 = buf[0]; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
76 bo1 = bo; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
77 b = buf[0] + bo; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
78 a = buf[1] + ((bo + 1) & 0xf); |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
79 } |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
80 |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
81 dct64_MMX_func(a, b, bandPtr); |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
82 window = mp3lib_decwins + 16 - bo1; |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
83 //printf("DEBUG: channel %d, bo %d, off %d\n", channel, bo, 16 - bo1); |
4142 | 84 __asm __volatile( |
21046
ba5087cb0bd4
Use ASMALIGN macro for better compatibility and remove SYS_DARWIN
reimar
parents:
21040
diff
changeset
|
85 ASMALIGN(4) |
4608 | 86 ".L03:\n\t" |
4142 | 87 "movq (%%edx),%%mm0\n\t" |
88 "movq 64(%%edx),%%mm4\n\t" | |
89 "pmaddwd (%%esi),%%mm0\n\t" | |
90 "pmaddwd 32(%%esi),%%mm4\n\t" | |
91 "movq 8(%%edx),%%mm1\n\t" | |
92 "movq 72(%%edx),%%mm5\n\t" | |
93 "pmaddwd 8(%%esi),%%mm1\n\t" | |
94 "pmaddwd 40(%%esi),%%mm5\n\t" | |
95 "movq 16(%%edx),%%mm2\n\t" | |
96 "movq 80(%%edx),%%mm6\n\t" | |
97 "pmaddwd 16(%%esi),%%mm2\n\t" | |
98 "pmaddwd 48(%%esi),%%mm6\n\t" | |
99 "movq 24(%%edx),%%mm3\n\t" | |
100 "movq 88(%%edx),%%mm7\n\t" | |
101 "pmaddwd 24(%%esi),%%mm3\n\t" | |
102 "pmaddwd 56(%%esi),%%mm7\n\t" | |
103 "paddd %%mm1,%%mm0\n\t" | |
104 "paddd %%mm5,%%mm4\n\t" | |
105 "paddd %%mm2,%%mm0\n\t" | |
106 "paddd %%mm6,%%mm4\n\t" | |
107 "paddd %%mm3,%%mm0\n\t" | |
108 "paddd %%mm7,%%mm4\n\t" | |
109 "movq %%mm0,%%mm1\n\t" | |
110 "movq %%mm4,%%mm5\n\t" | |
111 "psrlq $32,%%mm1\n\t" | |
112 "psrlq $32,%%mm5\n\t" | |
113 "paddd %%mm1,%%mm0\n\t" | |
114 "paddd %%mm5,%%mm4\n\t" | |
115 "psrad $13,%%mm0\n\t" | |
116 "psrad $13,%%mm4\n\t" | |
117 "packssdw %%mm0,%%mm0\n\t" | |
118 "packssdw %%mm4,%%mm4\n\t" | |
119 | |
120 "movq (%%edi), %%mm1\n\t" | |
121 "punpckldq %%mm4, %%mm0\n\t" | |
4246
3f677202418b
mangling in mp3lib + stdcall undefined fix with cygwin
atmos4
parents:
4142
diff
changeset
|
122 "pand "MANGLE(one_null)", %%mm1\n\t" |
3f677202418b
mangling in mp3lib + stdcall undefined fix with cygwin
atmos4
parents:
4142
diff
changeset
|
123 "pand "MANGLE(null_one)", %%mm0\n\t" |
4142 | 124 "por %%mm0, %%mm1\n\t" |
125 "movq %%mm1,(%%edi)\n\t" | |
126 | |
23358
ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
zuxy
parents:
23342
diff
changeset
|
127 "add $64,%%esi\n\t" |
ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
zuxy
parents:
23342
diff
changeset
|
128 "add $128,%%edx\n\t" |
ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
zuxy
parents:
23342
diff
changeset
|
129 "add $8,%%edi\n\t" |
4142 | 130 |
131 "decl %%ecx\n\t" | |
4608 | 132 "jnz .L03\n\t" |
4142 | 133 |
134 "movq (%%edx),%%mm0\n\t" | |
135 "pmaddwd (%%esi),%%mm0\n\t" | |
136 "movq 8(%%edx),%%mm1\n\t" | |
137 "pmaddwd 8(%%esi),%%mm1\n\t" | |
138 "movq 16(%%edx),%%mm2\n\t" | |
139 "pmaddwd 16(%%esi),%%mm2\n\t" | |
140 "movq 24(%%edx),%%mm3\n\t" | |
141 "pmaddwd 24(%%esi),%%mm3\n\t" | |
142 "paddd %%mm1,%%mm0\n\t" | |
143 "paddd %%mm2,%%mm0\n\t" | |
144 "paddd %%mm3,%%mm0\n\t" | |
145 "movq %%mm0,%%mm1\n\t" | |
146 "psrlq $32,%%mm1\n\t" | |
147 "paddd %%mm1,%%mm0\n\t" | |
148 "psrad $13,%%mm0\n\t" | |
149 "packssdw %%mm0,%%mm0\n\t" | |
150 "movd %%mm0,%%eax\n\t" | |
151 "movw %%ax, (%%edi)\n\t" | |
23358
ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
zuxy
parents:
23342
diff
changeset
|
152 "sub $32,%%esi\n\t" |
ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
zuxy
parents:
23342
diff
changeset
|
153 "add $64,%%edx\n\t" |
ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
zuxy
parents:
23342
diff
changeset
|
154 "add $4,%%edi\n\t" |
ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
zuxy
parents:
23342
diff
changeset
|
155 |
4142 | 156 "movl $7,%%ecx\n\t" |
21046
ba5087cb0bd4
Use ASMALIGN macro for better compatibility and remove SYS_DARWIN
reimar
parents:
21040
diff
changeset
|
157 ASMALIGN(4) |
4608 | 158 ".L04:\n\t" |
4142 | 159 "movq (%%edx),%%mm0\n\t" |
160 "movq 64(%%edx),%%mm4\n\t" | |
161 "pmaddwd (%%esi),%%mm0\n\t" | |
162 "pmaddwd -32(%%esi),%%mm4\n\t" | |
163 "movq 8(%%edx),%%mm1\n\t" | |
164 "movq 72(%%edx),%%mm5\n\t" | |
165 "pmaddwd 8(%%esi),%%mm1\n\t" | |
166 "pmaddwd -24(%%esi),%%mm5\n\t" | |
167 "movq 16(%%edx),%%mm2\n\t" | |
168 "movq 80(%%edx),%%mm6\n\t" | |
169 "pmaddwd 16(%%esi),%%mm2\n\t" | |
170 "pmaddwd -16(%%esi),%%mm6\n\t" | |
171 "movq 24(%%edx),%%mm3\n\t" | |
172 "movq 88(%%edx),%%mm7\n\t" | |
173 "pmaddwd 24(%%esi),%%mm3\n\t" | |
174 "pmaddwd -8(%%esi),%%mm7\n\t" | |
175 "paddd %%mm1,%%mm0\n\t" | |
176 "paddd %%mm5,%%mm4\n\t" | |
177 "paddd %%mm2,%%mm0\n\t" | |
178 "paddd %%mm6,%%mm4\n\t" | |
179 "paddd %%mm3,%%mm0\n\t" | |
180 "paddd %%mm7,%%mm4\n\t" | |
181 "movq %%mm0,%%mm1\n\t" | |
182 "movq %%mm4,%%mm5\n\t" | |
183 "psrlq $32,%%mm1\n\t" | |
184 "psrlq $32,%%mm5\n\t" | |
185 "paddd %%mm0,%%mm1\n\t" | |
186 "paddd %%mm4,%%mm5\n\t" | |
187 "psrad $13,%%mm1\n\t" | |
188 "psrad $13,%%mm5\n\t" | |
189 "packssdw %%mm1,%%mm1\n\t" | |
190 "packssdw %%mm5,%%mm5\n\t" | |
191 "psubd %%mm0,%%mm0\n\t" | |
192 "psubd %%mm4,%%mm4\n\t" | |
193 "psubsw %%mm1,%%mm0\n\t" | |
194 "psubsw %%mm5,%%mm4\n\t" | |
195 | |
196 "movq (%%edi), %%mm1\n\t" | |
197 "punpckldq %%mm4, %%mm0\n\t" | |
4246
3f677202418b
mangling in mp3lib + stdcall undefined fix with cygwin
atmos4
parents:
4142
diff
changeset
|
198 "pand "MANGLE(one_null)", %%mm1\n\t" |
3f677202418b
mangling in mp3lib + stdcall undefined fix with cygwin
atmos4
parents:
4142
diff
changeset
|
199 "pand "MANGLE(null_one)", %%mm0\n\t" |
4142 | 200 "por %%mm0, %%mm1\n\t" |
201 "movq %%mm1,(%%edi)\n\t" | |
202 | |
23358
ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
zuxy
parents:
23342
diff
changeset
|
203 "sub $64,%%esi\n\t" |
ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
zuxy
parents:
23342
diff
changeset
|
204 "add $128,%%edx\n\t" |
ccb70d86d797
Replace hardcoded 32-bit leal with equivalent add/sub so pointer arithmetic will be 64-bit under AMD64
zuxy
parents:
23342
diff
changeset
|
205 "add $8,%%edi\n\t" |
4142 | 206 "decl %%ecx\n\t" |
4608 | 207 "jnz .L04\n\t" |
4142 | 208 |
209 "movq (%%edx),%%mm0\n\t" | |
210 "pmaddwd (%%esi),%%mm0\n\t" | |
211 "movq 8(%%edx),%%mm1\n\t" | |
212 "pmaddwd 8(%%esi),%%mm1\n\t" | |
213 "movq 16(%%edx),%%mm2\n\t" | |
214 "pmaddwd 16(%%esi),%%mm2\n\t" | |
215 "movq 24(%%edx),%%mm3\n\t" | |
216 "pmaddwd 24(%%esi),%%mm3\n\t" | |
217 "paddd %%mm1,%%mm0\n\t" | |
218 "paddd %%mm2,%%mm0\n\t" | |
219 "paddd %%mm3,%%mm0\n\t" | |
220 "movq %%mm0,%%mm1\n\t" | |
221 "psrlq $32,%%mm1\n\t" | |
222 "paddd %%mm0,%%mm1\n\t" | |
223 "psrad $13,%%mm1\n\t" | |
224 "packssdw %%mm1,%%mm1\n\t" | |
225 "psubd %%mm0,%%mm0\n\t" | |
226 "psubsw %%mm1,%%mm0\n\t" | |
227 "movd %%mm0,%%eax\n\t" | |
228 "movw %%ax,(%%edi)\n\t" | |
229 "emms\n\t" | |
23342
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
230 :"+c"(i), "+d"(window), "+S"(b0), "+D"(samples) |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
231 : |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
232 :"memory", "%eax"); |
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
233 return 0; |
4142 | 234 } |
23342
e070d7f61e9a
Rewrite generic code in decode_MMX.c in C for easier AMD64 port. Slightly faster than original assembly.
zuxy
parents:
22375
diff
changeset
|
235 |