Mercurial > mplayer.hg
annotate mp3lib/dct64_sse.c @ 25934:f0e227951c06
Make sure the -wid window does not get notified when we destroy our attached child window.
Previous behaviour seems to cause QT to do something stupid which makes
DestroyWindow hang (SMPlayer is an application where this happened).
author | reimar |
---|---|
date | Sat, 09 Feb 2008 14:47:10 +0000 |
parents | 2095f98cf0fa |
children | 08d18fe9da52 |
rev | line source |
---|---|
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
1 /* |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
2 * Discrete Cosine Transform (DCT) for SSE |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com> |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c |
25350
2095f98cf0fa
cosmetics: Remove ugly and inconsistent uppercasing from filenames.
diego
parents:
23465
diff
changeset
|
5 * and mp3lib/dct64_mmx.c |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
6 */ |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
7 |
23464
1b1fdac4a68c
Align output pointer so that we can use movaps instead of movups in dct64_sse;
zuxy
parents:
23441
diff
changeset
|
8 #include <libavutil/mem.h> |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
9 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
10 typedef float real; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
11 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
12 extern float __attribute__((aligned(16))) costab_mmx[]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
13 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
14 static const int ppnn[4] __attribute__((aligned(16))) = |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
15 { 0, 0, 1 << 31, 1 << 31 }; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
16 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
17 static const int pnpn[4] __attribute__((aligned(16))) = |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
18 { 0, 1 << 31, 0, 1 << 31 }; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
19 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
20 static const int nnnn[4] __attribute__((aligned(16))) = |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
21 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
22 |
23441 | 23 void dct64_sse(short *out0,short *out1,real *c) |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
24 { |
23465
a1a699833dcf
Change some static temporary vars to automatic ones because mingw32 binutils
zuxy
parents:
23464
diff
changeset
|
25 DECLARE_ALIGNED(16, real, b1[0x20]); |
a1a699833dcf
Change some static temporary vars to automatic ones because mingw32 binutils
zuxy
parents:
23464
diff
changeset
|
26 DECLARE_ALIGNED(16, real, b2[0x20]); |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
27 static real const one = 1.f; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
28 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
29 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
30 real *costab = costab_mmx; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
31 int i; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
32 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
33 for (i = 0; i < 0x20 / 2; i += 4) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
34 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
35 asm( |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
36 "movaps %2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
37 "shufps $27, %%xmm3, %%xmm3\n\t" |
23464
1b1fdac4a68c
Align output pointer so that we can use movaps instead of movups in dct64_sse;
zuxy
parents:
23441
diff
changeset
|
38 "movaps %3, %%xmm1\n\t" |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
39 "movaps %%xmm1, %%xmm4\n\t" |
23464
1b1fdac4a68c
Align output pointer so that we can use movaps instead of movups in dct64_sse;
zuxy
parents:
23441
diff
changeset
|
40 "movaps %4, %%xmm2\n\t" |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
41 "shufps $27, %%xmm4, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
42 "movaps %%xmm2, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
43 "shufps $27, %%xmm0, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
44 "addps %%xmm0, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
45 "movaps %%xmm1, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
46 "subps %%xmm2, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
47 "mulps %%xmm3, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
48 "movaps %%xmm4, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
49 :"=m"(*(b1 + i)), "=m"(*(b1 + 0x1c - i)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
50 :"m"(*(costab + i)), "m"(*(c + i)), "m"(*(c + 0x1c - i)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
51 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
52 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
53 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
54 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
55 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
56 int i; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
57 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
58 for (i = 0; i < 0x20; i += 0x10) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
59 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
60 asm( |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
61 "movaps %4, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
62 "movaps %5, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
63 "movaps %6, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
64 "movaps %7, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
65 "movaps %%xmm1, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
66 "shufps $27, %%xmm7, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
67 "movaps %%xmm3, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
68 "shufps $27, %%xmm5, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
69 "movaps %%xmm4, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
70 "shufps $27, %%xmm2, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
71 "movaps %%xmm6, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
72 "shufps $27, %%xmm0, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
73 "addps %%xmm0, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
74 "movaps %%xmm1, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
75 "addps %%xmm2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
76 "movaps %%xmm3, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
77 "subps %%xmm4, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
78 "movaps %%xmm5, %2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
79 "subps %%xmm6, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
80 "movaps %%xmm7, %3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
81 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)), "=m"(*(b2 + i + 8)), "=m"(*(b2 + i + 12)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
82 :"m"(*(b1 + i)), "m"(*(b1 + i + 4)), "m"(*(b1 + i + 8)), "m"(*(b1 + i + 12)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
83 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
84 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
85 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
86 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
87 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
88 real *costab = costab_mmx + 16; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
89 asm( |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
90 "movaps %4, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
91 "movaps %5, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
92 "movaps %8, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
93 "xorps %%xmm6, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
94 "shufps $27, %%xmm4, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
95 "mulps %%xmm4, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
96 "movaps %9, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
97 "xorps %%xmm7, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
98 "shufps $27, %%xmm2, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
99 "mulps %%xmm2, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
100 "movaps %%xmm0, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
101 "movaps %%xmm1, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
102 "movaps %6, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
103 "mulps %%xmm2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
104 "subps %%xmm3, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
105 "movaps %%xmm6, %2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
106 "movaps %7, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
107 "mulps %%xmm4, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
108 "subps %%xmm5, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
109 "movaps %%xmm7, %3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
110 :"=m"(*(b2 + 8)), "=m"(*(b2 + 0xc)), "=m"(*(b2 + 0x18)), "=m"(*(b2 + 0x1c)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
111 :"m"(*(b2 + 8)), "m"(*(b2 + 0xc)), "m"(*(b2 + 0x18)), "m"(*(b2 + 0x1c)), "m"(*costab), "m"(*(costab + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
112 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
113 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
114 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
115 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
116 real *costab = costab_mmx + 24; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
117 int i; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
118 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
119 asm( |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
120 "movaps %0, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
121 "shufps $27, %%xmm0, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
122 "movaps %1, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
123 "movaps %%xmm5, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
124 : |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
125 :"m"(*costab), "m"(*nnnn) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
126 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
127 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
128 for (i = 0; i < 0x20; i += 8) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
129 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
130 asm( |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
131 "movaps %2, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
132 "movaps %3, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
133 "movaps %%xmm2, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
134 "xorps %%xmm5, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
135 "shufps $27, %%xmm4, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
136 "movaps %%xmm3, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
137 "shufps $27, %%xmm1, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
138 "addps %%xmm1, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
139 "movaps %%xmm2, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
140 "subps %%xmm3, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
141 "xorps %%xmm6, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
142 "mulps %%xmm0, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
143 "movaps %%xmm4, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
144 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
145 :"m"(*(b2 + i)), "m"(*(b2 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
146 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
147 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
148 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
149 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
150 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
151 int i; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
152 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
153 asm( |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
154 "movss %0, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
155 "movss %1, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
156 "movaps %%xmm1, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
157 "unpcklps %%xmm0, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
158 "movss %2, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
159 "movaps %%xmm1, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
160 "unpcklps %%xmm2, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
161 "unpcklps %%xmm3, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
162 "movaps %3, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
163 : |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
164 :"m"(one), "m"(costab_mmx[28]), "m"(costab_mmx[29]), "m"(*ppnn) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
165 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
166 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
167 for (i = 0; i < 0x20; i += 8) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
168 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
169 asm( |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
170 "movaps %2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
171 "movaps %%xmm3, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
172 "shufps $20, %%xmm4, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
173 "shufps $235, %%xmm3, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
174 "xorps %%xmm2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
175 "addps %%xmm3, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
176 "mulps %%xmm0, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
177 "movaps %%xmm4, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
178 "movaps %3, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
179 "movaps %%xmm6, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
180 "shufps $27, %%xmm5, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
181 "xorps %%xmm2, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
182 "addps %%xmm5, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
183 "mulps %%xmm0, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
184 "movaps %%xmm6, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
185 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
186 :"m"(*(b1 + i)), "m"(*(b1 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
187 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
188 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
189 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
190 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
191 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
192 int i; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
193 asm( |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
194 "movss %0, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
195 "movaps %%xmm1, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
196 "movaps %%xmm0, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
197 "unpcklps %%xmm1, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
198 "unpcklps %%xmm0, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
199 "movaps %1, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
200 "unpcklps %%xmm7, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
201 : |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
202 :"m"(costab_mmx[30]), "m"(*pnpn) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
203 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
204 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
205 for (i = 0x8; i < 0x20; i += 8) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
206 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
207 asm volatile ( |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
208 "movaps %2, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
209 "movaps %%xmm1, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
210 "shufps $224, %%xmm3, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
211 "shufps $181, %%xmm1, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
212 "xorps %%xmm0, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
213 "addps %%xmm1, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
214 "mulps %%xmm2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
215 "movaps %%xmm3, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
216 "movaps %3, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
217 "movaps %%xmm4, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
218 "shufps $224, %%xmm5, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
219 "shufps $181, %%xmm4, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
220 "xorps %%xmm0, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
221 "addps %%xmm4, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
222 "mulps %%xmm2, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
223 "movaps %%xmm5, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
224 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
225 :"m"(*(b2 + i)), "m"(*(b2 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
226 :"memory" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
227 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
228 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
229 for (i = 0x8; i < 0x20; i += 8) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
230 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
231 b1[i + 2] += b1[i + 3]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
232 b1[i + 6] += b1[i + 7]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
233 b1[i + 4] += b1[i + 6]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
234 b1[i + 6] += b1[i + 5]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
235 b1[i + 5] += b1[i + 7]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
236 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
237 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
238 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
239 #if 0 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
240 /* Reference C code */ |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
241 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
242 /* |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
243 Should run faster than x87 asm, given that the compiler is sane. |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
244 However, the C code doesn't round with saturation (0x7fff for too |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
245 large positive float, 0x8000 for too small negative float). You |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
246 can hear the difference if you listen carefully. |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
247 */ |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
248 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
249 out0[256] = (short)(b2[0] + b2[1]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
250 out0[0] = (short)((b2[0] - b2[1]) * costab_mmx[30]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
251 out1[128] = (short)((b2[3] - b2[2]) * costab_mmx[30]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
252 out0[128] = (short)((b2[3] - b2[2]) * costab_mmx[30] + b2[3] + b2[2]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
253 out1[192] = (short)((b2[7] - b2[6]) * costab_mmx[30]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
254 out0[192] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + b2[4] + b2[5]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
255 out0[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + (b2[4] - b2[5]) * costab_mmx[30]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
256 out1[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + (b2[4] - b2[5]) * costab_mmx[30]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
257 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
258 out0[224] = (short)(b1[8] + b1[12]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
259 out0[160] = (short)(b1[12] + b1[10]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
260 out0[96] = (short)(b1[10] + b1[14]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
261 out0[32] = (short)(b1[14] + b1[9]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
262 out1[32] = (short)(b1[9] + b1[13]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
263 out1[96] = (short)(b1[13] + b1[11]); |
23323 | 264 out1[224] = (short)b1[15]; |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
265 out1[160] = (short)(b1[15] + b1[11]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
266 out0[240] = (short)(b1[24] + b1[28] + b1[16]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
267 out0[208] = (short)(b1[24] + b1[28] + b1[20]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
268 out0[176] = (short)(b1[28] + b1[26] + b1[20]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
269 out0[144] = (short)(b1[28] + b1[26] + b1[18]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
270 out0[112] = (short)(b1[26] + b1[30] + b1[18]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
271 out0[80] = (short)(b1[26] + b1[30] + b1[22]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
272 out0[48] = (short)(b1[30] + b1[25] + b1[22]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
273 out0[16] = (short)(b1[30] + b1[25] + b1[17]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
274 out1[16] = (short)(b1[25] + b1[29] + b1[17]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
275 out1[48] = (short)(b1[25] + b1[29] + b1[21]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
276 out1[80] = (short)(b1[29] + b1[27] + b1[21]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
277 out1[112] = (short)(b1[29] + b1[27] + b1[19]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
278 out1[144] = (short)(b1[27] + b1[31] + b1[19]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
279 out1[176] = (short)(b1[27] + b1[31] + b1[23]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
280 out1[240] = (short)(b1[31]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
281 out1[208] = (short)(b1[31] + b1[23]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
282 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
283 #else |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
284 /* |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
285 To do saturation efficiently in x86 we can use fist(t)(p), |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
286 pf2iw, or packssdw. We use fist(p) here. |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
287 */ |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
288 asm( |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
289 "flds %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
290 "flds (%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
291 "fadds 4(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
292 "fistp 512(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
293 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
294 "flds (%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
295 "fsubs 4(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
296 "fmul %%st(1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
297 "fistp (%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
298 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
299 "flds 12(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
300 "fsubs 8(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
301 "fmul %%st(1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
302 "fist 256(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
303 "fadds 12(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
304 "fadds 8(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
305 "fistp 256(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
306 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
307 "flds 16(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
308 "fsubs 20(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
309 "fmul %%st(1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
310 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
311 "flds 28(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
312 "fsubs 24(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
313 "fmul %%st(2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
314 "fist 384(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
315 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
316 "fadds 24(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
317 "fadds 28(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
318 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
319 "fadds 16(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
320 "fadds 20(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
321 "fistp 384(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
322 "fadd %%st(2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
323 "fistp 128(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
324 "faddp %%st(1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
325 "fistp 128(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
326 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
327 "flds 32(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
328 "fadds 48(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
329 "fistp 448(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
330 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
331 "flds 48(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
332 "fadds 40(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
333 "fistp 320(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
334 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
335 "flds 40(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
336 "fadds 56(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
337 "fistp 192(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
338 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
339 "flds 56(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
340 "fadds 36(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
341 "fistp 64(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
342 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
343 "flds 36(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
344 "fadds 52(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
345 "fistp 64(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
346 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
347 "flds 52(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
348 "fadds 44(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
349 "fistp 192(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
350 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
351 "flds 60(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
352 "fist 448(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
353 "fadds 44(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
354 "fistp 320(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
355 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
356 "flds 96(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
357 "fadds 112(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
358 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
359 "fadds 64(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
360 "fistp 480(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
361 "fadds 80(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
362 "fistp 416(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
363 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
364 "flds 112(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
365 "fadds 104(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
366 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
367 "fadds 80(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
368 "fistp 352(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
369 "fadds 72(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
370 "fistp 288(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
371 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
372 "flds 104(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
373 "fadds 120(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
374 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
375 "fadds 72(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
376 "fistp 224(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
377 "fadds 88(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
378 "fistp 160(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
379 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
380 "flds 120(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
381 "fadds 100(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
382 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
383 "fadds 88(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
384 "fistp 96(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
385 "fadds 68(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
386 "fistp 32(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
387 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
388 "flds 100(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
389 "fadds 116(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
390 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
391 "fadds 68(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
392 "fistp 32(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
393 "fadds 84(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
394 "fistp 96(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
395 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
396 "flds 116(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
397 "fadds 108(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
398 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
399 "fadds 84(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
400 "fistp 160(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
401 "fadds 76(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
402 "fistp 224(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
403 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
404 "flds 108(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
405 "fadds 124(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
406 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
407 "fadds 76(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
408 "fistp 288(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
409 "fadds 92(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
410 "fistp 352(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
411 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
412 "flds 124(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
413 "fist 480(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
414 "fadds 92(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
415 "fistp 416(%4)\n\t" |
21047
ca38f523c848
directly insert ffreep %%st(0) opcode for compatibility with old
reimar
parents:
21040
diff
changeset
|
416 ".byte 0xdf, 0xc0\n\t" // ffreep %%st(0) |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
417 : |
23441 | 418 :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(out0), "r"(out1) |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
419 :"memory" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
420 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
421 #endif |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
422 out1[0] = out0[0]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
423 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
424 |