annotate mp3lib/dct64_sse.c @ 23441:1b739c2dc613

Correct dct64 functions' declarations
author zuxy
date Mon, 04 Jun 2007 03:05:16 +0000
parents e30b3f6bab3f
children 1b1fdac4a68c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
1 /*
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
2 * Discrete Cosine Tansform (DCT) for SSE
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com>
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
5 * and mp3lib/dct64_MMX.c
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
6 */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
7
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
8 /* NOTE: The following code is suboptimal! It can be improved (at least) by
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
9
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
10 1. Replace all movups by movaps. (Can Parameter c be always aligned on
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
11 a 16-byte boundary?)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
12
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
13 2. Rewritten using intrinsics. (GCC generally optimizes intrinsics
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
14 better. However, when __m128 locals are involved, GCC may
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
15 produce bad code that uses movaps to access a stack not aligned
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
16 on a 16-byte boundary, which leads to run-time crashes.)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
17
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
18 */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
19
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
20 typedef float real;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
21
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
22 extern float __attribute__((aligned(16))) costab_mmx[];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
23
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
24 static const int ppnn[4] __attribute__((aligned(16))) =
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
25 { 0, 0, 1 << 31, 1 << 31 };
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
26
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
27 static const int pnpn[4] __attribute__((aligned(16))) =
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
28 { 0, 1 << 31, 0, 1 << 31 };
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
29
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
30 static const int nnnn[4] __attribute__((aligned(16))) =
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
31 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
32
23441
1b739c2dc613 Correct dct64 functions' declarations
zuxy
parents: 23323
diff changeset
33 void dct64_sse(short *out0,short *out1,real *c)
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
34 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
35 static real __attribute__ ((aligned(16))) b1[0x20];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
36 static real __attribute__ ((aligned(16))) b2[0x20];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
37 static real const one = 1.f;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
38
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
39 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
40 real *costab = costab_mmx;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
41 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
42
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
43 for (i = 0; i < 0x20 / 2; i += 4)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
44 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
45 asm(
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
46 "movaps %2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
47 "shufps $27, %%xmm3, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
48 "movups %3, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
49 "movaps %%xmm1, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
50 "movups %4, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
51 "shufps $27, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
52 "movaps %%xmm2, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
53 "shufps $27, %%xmm0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
54 "addps %%xmm0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
55 "movaps %%xmm1, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
56 "subps %%xmm2, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
57 "mulps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
58 "movaps %%xmm4, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
59 :"=m"(*(b1 + i)), "=m"(*(b1 + 0x1c - i))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
60 :"m"(*(costab + i)), "m"(*(c + i)), "m"(*(c + 0x1c - i))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
61 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
62 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
63 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
64
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
65 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
66 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
67
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
68 for (i = 0; i < 0x20; i += 0x10)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
69 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
70 asm(
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
71 "movaps %4, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
72 "movaps %5, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
73 "movaps %6, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
74 "movaps %7, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
75 "movaps %%xmm1, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
76 "shufps $27, %%xmm7, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
77 "movaps %%xmm3, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
78 "shufps $27, %%xmm5, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
79 "movaps %%xmm4, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
80 "shufps $27, %%xmm2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
81 "movaps %%xmm6, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
82 "shufps $27, %%xmm0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
83 "addps %%xmm0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
84 "movaps %%xmm1, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
85 "addps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
86 "movaps %%xmm3, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
87 "subps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
88 "movaps %%xmm5, %2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
89 "subps %%xmm6, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
90 "movaps %%xmm7, %3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
91 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)), "=m"(*(b2 + i + 8)), "=m"(*(b2 + i + 12))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
92 :"m"(*(b1 + i)), "m"(*(b1 + i + 4)), "m"(*(b1 + i + 8)), "m"(*(b1 + i + 12))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
93 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
94 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
95 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
96
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
97 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
98 real *costab = costab_mmx + 16;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
99 asm(
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
100 "movaps %4, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
101 "movaps %5, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
102 "movaps %8, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
103 "xorps %%xmm6, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
104 "shufps $27, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
105 "mulps %%xmm4, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
106 "movaps %9, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
107 "xorps %%xmm7, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
108 "shufps $27, %%xmm2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
109 "mulps %%xmm2, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
110 "movaps %%xmm0, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
111 "movaps %%xmm1, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
112 "movaps %6, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
113 "mulps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
114 "subps %%xmm3, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
115 "movaps %%xmm6, %2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
116 "movaps %7, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
117 "mulps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
118 "subps %%xmm5, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
119 "movaps %%xmm7, %3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
120 :"=m"(*(b2 + 8)), "=m"(*(b2 + 0xc)), "=m"(*(b2 + 0x18)), "=m"(*(b2 + 0x1c))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
121 :"m"(*(b2 + 8)), "m"(*(b2 + 0xc)), "m"(*(b2 + 0x18)), "m"(*(b2 + 0x1c)), "m"(*costab), "m"(*(costab + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
122 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
123 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
124
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
125 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
126 real *costab = costab_mmx + 24;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
127 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
128
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
129 asm(
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
130 "movaps %0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
131 "shufps $27, %%xmm0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
132 "movaps %1, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
133 "movaps %%xmm5, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
134 :
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
135 :"m"(*costab), "m"(*nnnn)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
136 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
137
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
138 for (i = 0; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
139 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
140 asm(
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
141 "movaps %2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
142 "movaps %3, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
143 "movaps %%xmm2, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
144 "xorps %%xmm5, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
145 "shufps $27, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
146 "movaps %%xmm3, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
147 "shufps $27, %%xmm1, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
148 "addps %%xmm1, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
149 "movaps %%xmm2, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
150 "subps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
151 "xorps %%xmm6, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
152 "mulps %%xmm0, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
153 "movaps %%xmm4, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
154 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
155 :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
156 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
157 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
158 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
159
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
160 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
161 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
162
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
163 asm(
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
164 "movss %0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
165 "movss %1, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
166 "movaps %%xmm1, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
167 "unpcklps %%xmm0, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
168 "movss %2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
169 "movaps %%xmm1, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
170 "unpcklps %%xmm2, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
171 "unpcklps %%xmm3, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
172 "movaps %3, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
173 :
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
174 :"m"(one), "m"(costab_mmx[28]), "m"(costab_mmx[29]), "m"(*ppnn)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
175 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
176
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
177 for (i = 0; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
178 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
179 asm(
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
180 "movaps %2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
181 "movaps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
182 "shufps $20, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
183 "shufps $235, %%xmm3, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
184 "xorps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
185 "addps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
186 "mulps %%xmm0, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
187 "movaps %%xmm4, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
188 "movaps %3, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
189 "movaps %%xmm6, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
190 "shufps $27, %%xmm5, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
191 "xorps %%xmm2, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
192 "addps %%xmm5, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
193 "mulps %%xmm0, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
194 "movaps %%xmm6, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
195 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
196 :"m"(*(b1 + i)), "m"(*(b1 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
197 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
198 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
199 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
200
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
201 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
202 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
203 asm(
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
204 "movss %0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
205 "movaps %%xmm1, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
206 "movaps %%xmm0, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
207 "unpcklps %%xmm1, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
208 "unpcklps %%xmm0, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
209 "movaps %1, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
210 "unpcklps %%xmm7, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
211 :
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
212 :"m"(costab_mmx[30]), "m"(*pnpn)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
213 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
214
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
215 for (i = 0x8; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
216 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
217 asm volatile (
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
218 "movaps %2, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
219 "movaps %%xmm1, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
220 "shufps $224, %%xmm3, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
221 "shufps $181, %%xmm1, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
222 "xorps %%xmm0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
223 "addps %%xmm1, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
224 "mulps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
225 "movaps %%xmm3, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
226 "movaps %3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
227 "movaps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
228 "shufps $224, %%xmm5, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
229 "shufps $181, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
230 "xorps %%xmm0, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
231 "addps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
232 "mulps %%xmm2, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
233 "movaps %%xmm5, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
234 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
235 :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
236 :"memory"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
237 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
238 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
239 for (i = 0x8; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
240 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
241 b1[i + 2] += b1[i + 3];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
242 b1[i + 6] += b1[i + 7];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
243 b1[i + 4] += b1[i + 6];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
244 b1[i + 6] += b1[i + 5];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
245 b1[i + 5] += b1[i + 7];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
246 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
247 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
248
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
249 #if 0
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
250 /* Reference C code */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
251
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
252 /*
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
253 Should run faster than x87 asm, given that the compiler is sane.
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
254 However, the C code dosen't round with saturation (0x7fff for too
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
255 large positive float, 0x8000 for too small negative float). You
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
256 can hear the difference if you listen carefully.
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
257 */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
258
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
259 out0[256] = (short)(b2[0] + b2[1]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
260 out0[0] = (short)((b2[0] - b2[1]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
261 out1[128] = (short)((b2[3] - b2[2]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
262 out0[128] = (short)((b2[3] - b2[2]) * costab_mmx[30] + b2[3] + b2[2]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
263 out1[192] = (short)((b2[7] - b2[6]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
264 out0[192] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + b2[4] + b2[5]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
265 out0[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + (b2[4] - b2[5]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
266 out1[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + (b2[4] - b2[5]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
267
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
268 out0[224] = (short)(b1[8] + b1[12]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
269 out0[160] = (short)(b1[12] + b1[10]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
270 out0[96] = (short)(b1[10] + b1[14]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
271 out0[32] = (short)(b1[14] + b1[9]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
272 out1[32] = (short)(b1[9] + b1[13]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
273 out1[96] = (short)(b1[13] + b1[11]);
23323
e30b3f6bab3f Fix bug in reference code
zuxy
parents: 21047
diff changeset
274 out1[224] = (short)b1[15];
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
275 out1[160] = (short)(b1[15] + b1[11]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
276 out0[240] = (short)(b1[24] + b1[28] + b1[16]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
277 out0[208] = (short)(b1[24] + b1[28] + b1[20]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
278 out0[176] = (short)(b1[28] + b1[26] + b1[20]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
279 out0[144] = (short)(b1[28] + b1[26] + b1[18]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
280 out0[112] = (short)(b1[26] + b1[30] + b1[18]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
281 out0[80] = (short)(b1[26] + b1[30] + b1[22]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
282 out0[48] = (short)(b1[30] + b1[25] + b1[22]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
283 out0[16] = (short)(b1[30] + b1[25] + b1[17]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
284 out1[16] = (short)(b1[25] + b1[29] + b1[17]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
285 out1[48] = (short)(b1[25] + b1[29] + b1[21]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
286 out1[80] = (short)(b1[29] + b1[27] + b1[21]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
287 out1[112] = (short)(b1[29] + b1[27] + b1[19]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
288 out1[144] = (short)(b1[27] + b1[31] + b1[19]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
289 out1[176] = (short)(b1[27] + b1[31] + b1[23]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
290 out1[240] = (short)(b1[31]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
291 out1[208] = (short)(b1[31] + b1[23]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
292
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
293 #else
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
294 /*
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
295 To do saturation efficiently in x86 we can use fist(t)(p),
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
296 pf2iw, or packssdw. We use fist(p) here.
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
297 */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
298 asm(
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
299 "flds %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
300 "flds (%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
301 "fadds 4(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
302 "fistp 512(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
303
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
304 "flds (%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
305 "fsubs 4(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
306 "fmul %%st(1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
307 "fistp (%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
308
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
309 "flds 12(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
310 "fsubs 8(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
311 "fmul %%st(1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
312 "fist 256(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
313 "fadds 12(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
314 "fadds 8(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
315 "fistp 256(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
316
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
317 "flds 16(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
318 "fsubs 20(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
319 "fmul %%st(1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
320
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
321 "flds 28(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
322 "fsubs 24(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
323 "fmul %%st(2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
324 "fist 384(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
325 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
326 "fadds 24(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
327 "fadds 28(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
328 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
329 "fadds 16(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
330 "fadds 20(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
331 "fistp 384(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
332 "fadd %%st(2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
333 "fistp 128(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
334 "faddp %%st(1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
335 "fistp 128(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
336
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
337 "flds 32(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
338 "fadds 48(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
339 "fistp 448(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
340
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
341 "flds 48(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
342 "fadds 40(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
343 "fistp 320(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
344
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
345 "flds 40(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
346 "fadds 56(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
347 "fistp 192(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
348
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
349 "flds 56(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
350 "fadds 36(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
351 "fistp 64(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
352
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
353 "flds 36(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
354 "fadds 52(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
355 "fistp 64(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
356
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
357 "flds 52(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
358 "fadds 44(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
359 "fistp 192(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
360
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
361 "flds 60(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
362 "fist 448(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
363 "fadds 44(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
364 "fistp 320(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
365
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
366 "flds 96(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
367 "fadds 112(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
368 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
369 "fadds 64(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
370 "fistp 480(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
371 "fadds 80(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
372 "fistp 416(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
373
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
374 "flds 112(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
375 "fadds 104(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
376 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
377 "fadds 80(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
378 "fistp 352(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
379 "fadds 72(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
380 "fistp 288(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
381
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
382 "flds 104(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
383 "fadds 120(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
384 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
385 "fadds 72(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
386 "fistp 224(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
387 "fadds 88(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
388 "fistp 160(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
389
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
390 "flds 120(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
391 "fadds 100(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
392 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
393 "fadds 88(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
394 "fistp 96(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
395 "fadds 68(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
396 "fistp 32(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
397
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
398 "flds 100(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
399 "fadds 116(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
400 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
401 "fadds 68(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
402 "fistp 32(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
403 "fadds 84(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
404 "fistp 96(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
405
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
406 "flds 116(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
407 "fadds 108(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
408 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
409 "fadds 84(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
410 "fistp 160(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
411 "fadds 76(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
412 "fistp 224(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
413
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
414 "flds 108(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
415 "fadds 124(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
416 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
417 "fadds 76(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
418 "fistp 288(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
419 "fadds 92(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
420 "fistp 352(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
421
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
422 "flds 124(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
423 "fist 480(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
424 "fadds 92(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
425 "fistp 416(%4)\n\t"
21047
ca38f523c848 directly insert ffreep %%st(0) opcode for compatibility with old
reimar
parents: 21040
diff changeset
426 ".byte 0xdf, 0xc0\n\t" // ffreep %%st(0)
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
427 :
23441
1b739c2dc613 Correct dct64 functions' declarations
zuxy
parents: 23323
diff changeset
428 :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(out0), "r"(out1)
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
429 :"memory"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
430 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
431 #endif
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
432 out1[0] = out0[0];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
433 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
434