comparison mp3lib/dct64_sse.c @ 18932:69c665e91946

Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III. Currently only used on CPUs that _only_ support SSE (otherwise 3DNow* variants are tried first). Patch by The Mighty Zuxy Meng %zuxy * meng $ gmail * com%. Original thread: Date: Jun 21, 2006 10:20 AM; Subject: [MPlayer-dev-eng] [PATCH] SSE version of DCT64 for mp3lib
author gpoirier
date Fri, 07 Jul 2006 14:04:07 +0000
parents
children c7d3523c74ee
comparison
equal deleted inserted replaced
18931:da91dc728556 18932:69c665e91946
1 /*
2 * Discrete Cosine Transform (DCT) for SSE
3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com>
4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c
5 * and mp3lib/dct64_MMX.c
6 */
7
8 /* NOTE: The following code is suboptimal! It can be improved (at least) by
9
10 1. Replace all movups by movaps. (Can Parameter c be always aligned on
11 a 16-byte boundary?)
12
13 2. Rewritten using intrinsics. (GCC generally optimizes intrinsics
14 better. However, when __m128 locals are involved, GCC may
15 produce bad code that uses movaps to access a stack not aligned
16 on a 16-byte boundary, which leads to run-time crashes.)
17
18 */
19
/* mp3lib's floating-point sample type. */
20 typedef float real;
21
/* Cosine coefficient table defined elsewhere in mp3lib; it must really be
   16-byte aligned, because the asm below loads from it with movaps. */
22 extern float __attribute__((aligned(16))) costab_mmx[];
23
/*
 * Sign-bit masks for SSE xorps: each lane is either 0 (keep the sign of
 * the float in that lane) or the IEEE-754 sign bit (flip it).
 *
 * Written as (int) 0x80000000U instead of the original 1 << 31: shifting
 * a 1 into the sign bit of a signed int is undefined behavior in C
 * (CERT INT34-C), while converting 0x80000000U to int is merely
 * implementation-defined and yields the same bit pattern on every
 * two's-complement target this code runs on.
 */
static const int ppnn[4] __attribute__((aligned(16))) =
{ 0, 0, (int) 0x80000000U, (int) 0x80000000U };   /* (+,+,-,-) */

static const int pnpn[4] __attribute__((aligned(16))) =
{ 0, (int) 0x80000000U, 0, (int) 0x80000000U };   /* (+,-,+,-) */

static const int nnnn[4] __attribute__((aligned(16))) =
{ (int) 0x80000000U, (int) 0x80000000U,
  (int) 0x80000000U, (int) 0x80000000U };         /* (-,-,-,-) */
32
/*
 * dct64_sse - SSE implementation of mp3lib's dct64, the fast cosine
 * transform used by the MPEG audio subband synthesis filterbank.
 *
 * a, b: the two output sample windows.  They are written through short
 *       pointers (out0/out1) by the x87 stage at the bottom, i.e. the
 *       results are stored as rounded, saturated 16-bit samples.
 * c:    input of 32 floats, c[0]..c[31]; loaded with movups, so it does
 *       not need to be 16-byte aligned.
 *
 * NOTE(review): b1 and b2 are static scratch buffers, so this function
 * is neither reentrant nor thread-safe -- confirm all callers invoke it
 * from a single thread.
 *
 * NOTE(review): several stages load constants into xmm registers in one
 * asm() statement and rely on them surviving into the asm() of a
 * following loop.  GCC gives no guarantee that registers persist across
 * separate asm statements; this only works while the compiler emits no
 * SSE code of its own in between.  Fragile -- verify on each compiler
 * upgrade.
 */
33 void dct64_sse(real *a,real *b,real *c)
34 {
35 static real __attribute__ ((aligned(16))) b1[0x20];
36 static real __attribute__ ((aligned(16))) b2[0x20];
37 static real const one = 1.f;
38
39 short *out0 = (short*)a;
40 short *out1 = (short*)b;
41
/* Stage 1: length-32 butterflies, four lanes at a time.  For j = 0..15:
   b1[j] = c[j] + c[31-j]; b1[31-j] = (c[j] - c[31-j]) * costab[j].
   shufps $27 reverses the four floats in a register so the mirrored
   operand lines up element-wise. */
42 {
43 real *costab = costab_mmx;
44 int i;
45
46 for (i = 0; i < 0x20 / 2; i += 4)
47 {
48 asm(
49 "movaps %2, %%xmm3\n\t"
50 "shufps $27, %%xmm3, %%xmm3\n\t"
51 "movups %3, %%xmm1\n\t"
52 "movaps %%xmm1, %%xmm4\n\t"
53 "movups %4, %%xmm2\n\t"
54 "shufps $27, %%xmm4, %%xmm4\n\t"
55 "movaps %%xmm2, %%xmm0\n\t"
56 "shufps $27, %%xmm0, %%xmm0\n\t"
57 "addps %%xmm0, %%xmm1\n\t"
58 "movaps %%xmm1, %0\n\t"
59 "subps %%xmm2, %%xmm4\n\t"
60 "mulps %%xmm3, %%xmm4\n\t"
61 "movaps %%xmm4, %1\n\t"
62 :"=m"(*(b1 + i)), "=m"(*(b1 + 0x1c - i))
63 :"m"(*(costab + i)), "m"(*(c + i)), "m"(*(c + 0x1c - i))
64 );
65 }
66 }
67
/* Stage 2: length-16 butterflies within each half of b1, into b2.  Sums
   go to the lower 8 floats of each 16-float group, reversed differences
   to the upper 8; the cosine weighting of the differences is deferred
   to stage 3. */
68 {
69 int i;
70
71 for (i = 0; i < 0x20; i += 0x10)
72 {
73 asm(
74 "movaps %4, %%xmm1\n\t"
75 "movaps %5, %%xmm3\n\t"
76 "movaps %6, %%xmm4\n\t"
77 "movaps %7, %%xmm6\n\t"
78 "movaps %%xmm1, %%xmm7\n\t"
79 "shufps $27, %%xmm7, %%xmm7\n\t"
80 "movaps %%xmm3, %%xmm5\n\t"
81 "shufps $27, %%xmm5, %%xmm5\n\t"
82 "movaps %%xmm4, %%xmm2\n\t"
83 "shufps $27, %%xmm2, %%xmm2\n\t"
84 "movaps %%xmm6, %%xmm0\n\t"
85 "shufps $27, %%xmm0, %%xmm0\n\t"
86 "addps %%xmm0, %%xmm1\n\t"
87 "movaps %%xmm1, %0\n\t"
88 "addps %%xmm2, %%xmm3\n\t"
89 "movaps %%xmm3, %1\n\t"
90 "subps %%xmm4, %%xmm5\n\t"
91 "movaps %%xmm5, %2\n\t"
92 "subps %%xmm6, %%xmm7\n\t"
93 "movaps %%xmm7, %3\n\t"
94 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)), "=m"(*(b2 + i + 8)), "=m"(*(b2 + i + 12))
95 :"m"(*(b1 + i)), "m"(*(b1 + i + 4)), "m"(*(b1 + i + 8)), "m"(*(b1 + i + 12))
96 );
97 }
98 }
99
/* Stage 3: apply the deferred cosine weights (reversed costab_mmx[16..23])
   to the difference quads b2[8..15] and b2[24..31]; the products for the
   upper group are negated, computed as 0 - x by subps from a register
   zeroed with xorps. */
100 {
101 real *costab = costab_mmx + 16;
102 asm(
103 "movaps %4, %%xmm0\n\t"
104 "movaps %5, %%xmm1\n\t"
105 "movaps %8, %%xmm4\n\t"
106 "xorps %%xmm6, %%xmm6\n\t"
107 "shufps $27, %%xmm4, %%xmm4\n\t"
108 "mulps %%xmm4, %%xmm1\n\t"
109 "movaps %9, %%xmm2\n\t"
110 "xorps %%xmm7, %%xmm7\n\t"
111 "shufps $27, %%xmm2, %%xmm2\n\t"
112 "mulps %%xmm2, %%xmm0\n\t"
113 "movaps %%xmm0, %0\n\t"
114 "movaps %%xmm1, %1\n\t"
115 "movaps %6, %%xmm3\n\t"
116 "mulps %%xmm2, %%xmm3\n\t"
117 "subps %%xmm3, %%xmm6\n\t"
118 "movaps %%xmm6, %2\n\t"
119 "movaps %7, %%xmm5\n\t"
120 "mulps %%xmm4, %%xmm5\n\t"
121 "subps %%xmm5, %%xmm7\n\t"
122 "movaps %%xmm7, %3\n\t"
123 :"=m"(*(b2 + 8)), "=m"(*(b2 + 0xc)), "=m"(*(b2 + 0x18)), "=m"(*(b2 + 0x1c))
124 :"m"(*(b2 + 8)), "m"(*(b2 + 0xc)), "m"(*(b2 + 0x18)), "m"(*(b2 + 0x1c)), "m"(*costab), "m"(*(costab + 4))
125 );
126 }
127
/* Stage 4: length-8 butterflies from b2 back into b1, weighted by
   reversed costab_mmx[24..27].  xmm6 is XORed with the nnnn all-sign-bits
   mask (held in xmm5) on every iteration, so it alternates between
   "keep" and "negate": the weighted difference term flips sign on every
   second 8-float group.  xmm0/xmm5/xmm6 come from the preceding setup
   asm -- see the register-carryover note at the top. */
128 {
129 real *costab = costab_mmx + 24;
130 int i;
131
132 asm(
133 "movaps %0, %%xmm0\n\t"
134 "shufps $27, %%xmm0, %%xmm0\n\t"
135 "movaps %1, %%xmm5\n\t"
136 "movaps %%xmm5, %%xmm6\n\t"
137 :
138 :"m"(*costab), "m"(*nnnn)
139 );
140
141 for (i = 0; i < 0x20; i += 8)
142 {
143 asm(
144 "movaps %2, %%xmm2\n\t"
145 "movaps %3, %%xmm3\n\t"
146 "movaps %%xmm2, %%xmm4\n\t"
147 "xorps %%xmm5, %%xmm6\n\t"
148 "shufps $27, %%xmm4, %%xmm4\n\t"
149 "movaps %%xmm3, %%xmm1\n\t"
150 "shufps $27, %%xmm1, %%xmm1\n\t"
151 "addps %%xmm1, %%xmm2\n\t"
152 "movaps %%xmm2, %0\n\t"
153 "subps %%xmm3, %%xmm4\n\t"
154 "xorps %%xmm6, %%xmm4\n\t"
155 "mulps %%xmm0, %%xmm4\n\t"
156 "movaps %%xmm4, %1\n\t"
157 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
158 :"m"(*(b2 + i)), "=m"(*(b2 + i + 4))
159 );
160 }
161 }
162
/* Stage 5: length-4 butterflies within each quad of b1, into b2.  The
   setup asm interleaves 1.0, costab_mmx[28] and costab_mmx[29] into a
   multiplier vector in xmm0 (lane order follows unpcklps interleaving)
   and loads the ppnn (+,+,-,-) sign mask into xmm2; the loop forms
   sum/difference lanes with shufps swizzles, applies the mask with
   xorps, and scales.  Same cross-asm register dependence as above. */
163 {
164 int i;
165
166 asm(
167 "movss %0, %%xmm1\n\t"
168 "movss %1, %%xmm0\n\t"
169 "movaps %%xmm1, %%xmm3\n\t"
170 "unpcklps %%xmm0, %%xmm3\n\t"
171 "movss %2, %%xmm2\n\t"
172 "movaps %%xmm1, %%xmm0\n\t"
173 "unpcklps %%xmm2, %%xmm0\n\t"
174 "unpcklps %%xmm3, %%xmm0\n\t"
175 "movaps %3, %%xmm2\n\t"
176 :
177 :"m"(one), "m"(costab_mmx[28]), "m"(costab_mmx[29]), "m"(*ppnn)
178 );
179
180 for (i = 0; i < 0x20; i += 8)
181 {
182 asm(
183 "movaps %2, %%xmm3\n\t"
184 "movaps %%xmm3, %%xmm4\n\t"
185 "shufps $20, %%xmm4, %%xmm4\n\t"
186 "shufps $235, %%xmm3, %%xmm3\n\t"
187 "xorps %%xmm2, %%xmm3\n\t"
188 "addps %%xmm3, %%xmm4\n\t"
189 "mulps %%xmm0, %%xmm4\n\t"
190 "movaps %%xmm4, %0\n\t"
191 "movaps %3, %%xmm6\n\t"
192 "movaps %%xmm6, %%xmm5\n\t"
193 "shufps $27, %%xmm5, %%xmm5\n\t"
194 "xorps %%xmm2, %%xmm5\n\t"
195 "addps %%xmm5, %%xmm6\n\t"
196 "mulps %%xmm0, %%xmm6\n\t"
197 "movaps %%xmm6, %1\n\t"
198 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4))
199 :"m"(*(b1 + i)), "m"(*(b1 + i + 4))
200 );
201 }
202 }
203
/* Stage 6: length-2 butterflies with the pnpn (+,-,+,-) mask.  The
   setup asm builds the multiplier {1, costab_mmx[30], 1, costab_mmx[30]}
   in xmm2 (it reuses xmm1, which still holds 1.0 from the stage-5 setup
   -- register-carryover caveat again) and loads pnpn into xmm0.  The
   loop starts at i = 8: b2[0..7] is consumed directly by the x87 output
   code below.  The asm is volatile with a memory clobber so its stores
   to b1 are complete before the plain-C cleanup loop reads them. */
204 {
205 int i;
206 asm(
207 "movss %0, %%xmm0\n\t"
208 "movaps %%xmm1, %%xmm2\n\t"
209 "movaps %%xmm0, %%xmm7\n\t"
210 "unpcklps %%xmm1, %%xmm2\n\t"
211 "unpcklps %%xmm0, %%xmm7\n\t"
212 "movaps %1, %%xmm0\n\t"
213 "unpcklps %%xmm7, %%xmm2\n\t"
214 :
215 :"m"(costab_mmx[30]), "m"(*pnpn)
216 );
217
218 for (i = 0x8; i < 0x20; i += 8)
219 {
220 asm volatile (
221 "movaps %2, %%xmm1\n\t"
222 "movaps %%xmm1, %%xmm3\n\t"
223 "shufps $224, %%xmm3, %%xmm3\n\t"
224 "shufps $181, %%xmm1, %%xmm1\n\t"
225 "xorps %%xmm0, %%xmm1\n\t"
226 "addps %%xmm1, %%xmm3\n\t"
227 "mulps %%xmm2, %%xmm3\n\t"
228 "movaps %%xmm3, %0\n\t"
229 "movaps %3, %%xmm4\n\t"
230 "movaps %%xmm4, %%xmm5\n\t"
231 "shufps $224, %%xmm5, %%xmm5\n\t"
232 "shufps $181, %%xmm4, %%xmm4\n\t"
233 "xorps %%xmm0, %%xmm4\n\t"
234 "addps %%xmm4, %%xmm5\n\t"
235 "mulps %%xmm2, %%xmm5\n\t"
236 "movaps %%xmm5, %1\n\t"
237 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
238 :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
239 :"memory"
240 );
241 }
/* Final recombination of the odd-indexed terms inside each 8-float
   group of b1, done in plain C. */
242 for (i = 0x8; i < 0x20; i += 8)
243 {
244 b1[i + 2] += b1[i + 3];
245 b1[i + 6] += b1[i + 7];
246 b1[i + 4] += b1[i + 6];
247 b1[i + 6] += b1[i + 5];
248 b1[i + 5] += b1[i + 7];
249 }
250 }
251
252 #if 0
253 /* Reference C code */
254
255 /*
256 Should run faster than x87 asm, given that the compiler is sane.
257 However, the C code doesn't round with saturation (0x7fff for too
258 large positive float, 0x8000 for too small negative float). You
259 can hear the difference if you listen carefully.
260 */
261
262 out0[256] = (short)(b2[0] + b2[1]);
263 out0[0] = (short)((b2[0] - b2[1]) * costab_mmx[30]);
264 out1[128] = (short)((b2[3] - b2[2]) * costab_mmx[30]);
265 out0[128] = (short)((b2[3] - b2[2]) * costab_mmx[30] + b2[3] + b2[2]);
266 out1[192] = (short)((b2[7] - b2[6]) * costab_mmx[30]);
267 out0[192] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + b2[4] + b2[5]);
268 out0[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + (b2[4] - b2[5]) * costab_mmx[30]);
269 out1[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + (b2[4] - b2[5]) * costab_mmx[30]);
270
271 out0[224] = (short)(b1[8] + b1[12]);
272 out0[160] = (short)(b1[12] + b1[10]);
273 out0[96] = (short)(b1[10] + b1[14]);
274 out0[32] = (short)(b1[14] + b1[9]);
275 out1[32] = (short)(b1[9] + b1[13]);
276 out1[96] = (short)(b1[13] + b1[11]);
/* NOTE(review): out1[222] below looks like a typo for out1[224] (it
   breaks the 32-sample stride of the neighboring stores), but this
   reference branch is compiled out. */
277 out1[222] = (short)b1[15];
278 out1[160] = (short)(b1[15] + b1[11]);
279 out0[240] = (short)(b1[24] + b1[28] + b1[16]);
280 out0[208] = (short)(b1[24] + b1[28] + b1[20]);
281 out0[176] = (short)(b1[28] + b1[26] + b1[20]);
282 out0[144] = (short)(b1[28] + b1[26] + b1[18]);
283 out0[112] = (short)(b1[26] + b1[30] + b1[18]);
284 out0[80] = (short)(b1[26] + b1[30] + b1[22]);
285 out0[48] = (short)(b1[30] + b1[25] + b1[22]);
286 out0[16] = (short)(b1[30] + b1[25] + b1[17]);
287 out1[16] = (short)(b1[25] + b1[29] + b1[17]);
288 out1[48] = (short)(b1[25] + b1[29] + b1[21]);
289 out1[80] = (short)(b1[29] + b1[27] + b1[21]);
290 out1[112] = (short)(b1[29] + b1[27] + b1[19]);
291 out1[144] = (short)(b1[27] + b1[31] + b1[19]);
292 out1[176] = (short)(b1[27] + b1[31] + b1[23]);
293 out1[240] = (short)(b1[31]);
294 out1[208] = (short)(b1[31] + b1[23]);
295
296 #else
297 /*
298 To do saturation efficiently in x86 we can use fist(t)(p),
299 pf2iw, or packssdw. We use fist(p) here.
300 */
/* Output stage.  Operands: %0 = costab_mmx[30] (kept on the x87 stack
   as the butterfly scale factor), %1 = b1, %2 = b2, %3 = a, %4 = b.
   All displacements are byte offsets: 4 bytes per float on %1/%2,
   2 bytes per short on %3/%4 -- e.g. "fistp 512(%3)" stores out0[256].
   fist/fistp convert with the FPU's rounding and saturation, which the
   reference C code above lacks. */
301 asm(
302 "flds %0\n\t"
303 "flds (%2)\n\t"
304 "fadds 4(%2)\n\t"
305 "fistp 512(%3)\n\t"
306
307 "flds (%2)\n\t"
308 "fsubs 4(%2)\n\t"
309 "fmul %%st(1)\n\t"
310 "fistp (%3)\n\t"
311
312 "flds 12(%2)\n\t"
313 "fsubs 8(%2)\n\t"
314 "fmul %%st(1)\n\t"
315 "fist 256(%4)\n\t"
316 "fadds 12(%2)\n\t"
317 "fadds 8(%2)\n\t"
318 "fistp 256(%3)\n\t"
319
320 "flds 16(%2)\n\t"
321 "fsubs 20(%2)\n\t"
322 "fmul %%st(1)\n\t"
323
324 "flds 28(%2)\n\t"
325 "fsubs 24(%2)\n\t"
326 "fmul %%st(2)\n\t"
327 "fist 384(%4)\n\t"
328 "fld %%st(0)\n\t"
329 "fadds 24(%2)\n\t"
330 "fadds 28(%2)\n\t"
331 "fld %%st(0)\n\t"
332 "fadds 16(%2)\n\t"
333 "fadds 20(%2)\n\t"
334 "fistp 384(%3)\n\t"
335 "fadd %%st(2)\n\t"
336 "fistp 128(%3)\n\t"
337 "faddp %%st(1)\n\t"
338 "fistp 128(%4)\n\t"
339
340 "flds 32(%1)\n\t"
341 "fadds 48(%1)\n\t"
342 "fistp 448(%3)\n\t"
343
344 "flds 48(%1)\n\t"
345 "fadds 40(%1)\n\t"
346 "fistp 320(%3)\n\t"
347
348 "flds 40(%1)\n\t"
349 "fadds 56(%1)\n\t"
350 "fistp 192(%3)\n\t"
351
352 "flds 56(%1)\n\t"
353 "fadds 36(%1)\n\t"
354 "fistp 64(%3)\n\t"
355
356 "flds 36(%1)\n\t"
357 "fadds 52(%1)\n\t"
358 "fistp 64(%4)\n\t"
359
360 "flds 52(%1)\n\t"
361 "fadds 44(%1)\n\t"
362 "fistp 192(%4)\n\t"
363
364 "flds 60(%1)\n\t"
365 "fist 448(%4)\n\t"
366 "fadds 44(%1)\n\t"
367 "fistp 320(%4)\n\t"
368
369 "flds 96(%1)\n\t"
370 "fadds 112(%1)\n\t"
371 "fld %%st(0)\n\t"
372 "fadds 64(%1)\n\t"
373 "fistp 480(%3)\n\t"
374 "fadds 80(%1)\n\t"
375 "fistp 416(%3)\n\t"
376
377 "flds 112(%1)\n\t"
378 "fadds 104(%1)\n\t"
379 "fld %%st(0)\n\t"
380 "fadds 80(%1)\n\t"
381 "fistp 352(%3)\n\t"
382 "fadds 72(%1)\n\t"
383 "fistp 288(%3)\n\t"
384
385 "flds 104(%1)\n\t"
386 "fadds 120(%1)\n\t"
387 "fld %%st(0)\n\t"
388 "fadds 72(%1)\n\t"
389 "fistp 224(%3)\n\t"
390 "fadds 88(%1)\n\t"
391 "fistp 160(%3)\n\t"
392
393 "flds 120(%1)\n\t"
394 "fadds 100(%1)\n\t"
395 "fld %%st(0)\n\t"
396 "fadds 88(%1)\n\t"
397 "fistp 96(%3)\n\t"
398 "fadds 68(%1)\n\t"
399 "fistp 32(%3)\n\t"
400
401 "flds 100(%1)\n\t"
402 "fadds 116(%1)\n\t"
403 "fld %%st(0)\n\t"
404 "fadds 68(%1)\n\t"
405 "fistp 32(%4)\n\t"
406 "fadds 84(%1)\n\t"
407 "fistp 96(%4)\n\t"
408
409 "flds 116(%1)\n\t"
410 "fadds 108(%1)\n\t"
411 "fld %%st(0)\n\t"
412 "fadds 84(%1)\n\t"
413 "fistp 160(%4)\n\t"
414 "fadds 76(%1)\n\t"
415 "fistp 224(%4)\n\t"
416
417 "flds 108(%1)\n\t"
418 "fadds 124(%1)\n\t"
419 "fld %%st(0)\n\t"
420 "fadds 76(%1)\n\t"
421 "fistp 288(%4)\n\t"
422 "fadds 92(%1)\n\t"
423 "fistp 352(%4)\n\t"
424
425 "flds 124(%1)\n\t"
426 "fist 480(%4)\n\t"
427 "fadds 92(%1)\n\t"
428 "fistp 416(%4)\n\t"
/* Pop the costab_mmx[30] factor still left on the x87 stack. */
429 "ffreep %%st(0)\n\t"
430 :
431 :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(a), "r"(b)
432 :"memory"
433 );
434 #endif
/* Duplicate sample 0 into the second output window. */
435 out1[0] = out0[0];
436 }
437