annotate mp3lib/dct64_sse.c @ 29265:8f7e539305a0

Restore old license file after whitespace removal in previous commit. Legal stuff is very fragile and shouldn't be changed, even for whitespace cosmetics.
author bircoph
date Wed, 13 May 2009 18:42:38 +0000
parents 50c9bf3e41cf
children 347d152a5cfa
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
1 /*
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
2 * Discrete Cosine Transform (DCT) for SSE
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com>
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c
25350
2095f98cf0fa cosmetics: Remove ugly and inconsistent uppercasing from filenames.
diego
parents: 23465
diff changeset
5 * and mp3lib/dct64_mmx.c
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
6 */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
7
28328
50c9bf3e41cf DECLARE_ALIGNED was moved in FFmpeg.
diego
parents: 27754
diff changeset
8 #include "libavutil/internal.h"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
9
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
10 typedef float real;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
11
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
12 extern float __attribute__((aligned(16))) costab_mmx[];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
13
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
14 static const int ppnn[4] __attribute__((aligned(16))) =
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
15 { 0, 0, 1 << 31, 1 << 31 };
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
16
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
17 static const int pnpn[4] __attribute__((aligned(16))) =
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
18 { 0, 1 << 31, 0, 1 << 31 };
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
19
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
20 static const int nnnn[4] __attribute__((aligned(16))) =
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
21 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
22
23441
1b739c2dc613 Correct dct64 functions' declarations
zuxy
parents: 23323
diff changeset
23 void dct64_sse(short *out0,short *out1,real *c)
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
24 {
23465
a1a699833dcf Change some static temporary vars to automatic ones because mingw32 binutils
zuxy
parents: 23464
diff changeset
25 DECLARE_ALIGNED(16, real, b1[0x20]);
a1a699833dcf Change some static temporary vars to automatic ones because mingw32 binutils
zuxy
parents: 23464
diff changeset
26 DECLARE_ALIGNED(16, real, b2[0x20]);
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
27 static real const one = 1.f;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
28
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
29 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
30 real *costab = costab_mmx;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
31 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
32
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
33 for (i = 0; i < 0x20 / 2; i += 4)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
34 {
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
35 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
36 "movaps %2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
37 "shufps $27, %%xmm3, %%xmm3\n\t"
23464
1b1fdac4a68c Align output pointer so that we can use movaps instead of movups in dct64_sse;
zuxy
parents: 23441
diff changeset
38 "movaps %3, %%xmm1\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
39 "movaps %%xmm1, %%xmm4\n\t"
23464
1b1fdac4a68c Align output pointer so that we can use movaps instead of movups in dct64_sse;
zuxy
parents: 23441
diff changeset
40 "movaps %4, %%xmm2\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
41 "shufps $27, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
42 "movaps %%xmm2, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
43 "shufps $27, %%xmm0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
44 "addps %%xmm0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
45 "movaps %%xmm1, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
46 "subps %%xmm2, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
47 "mulps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
48 "movaps %%xmm4, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
49 :"=m"(*(b1 + i)), "=m"(*(b1 + 0x1c - i))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
50 :"m"(*(costab + i)), "m"(*(c + i)), "m"(*(c + 0x1c - i))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
51 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
52 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
53 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
54
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
55 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
56 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
57
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
58 for (i = 0; i < 0x20; i += 0x10)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
59 {
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
60 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
61 "movaps %4, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
62 "movaps %5, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
63 "movaps %6, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
64 "movaps %7, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
65 "movaps %%xmm1, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
66 "shufps $27, %%xmm7, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
67 "movaps %%xmm3, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
68 "shufps $27, %%xmm5, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
69 "movaps %%xmm4, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
70 "shufps $27, %%xmm2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
71 "movaps %%xmm6, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
72 "shufps $27, %%xmm0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
73 "addps %%xmm0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
74 "movaps %%xmm1, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
75 "addps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
76 "movaps %%xmm3, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
77 "subps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
78 "movaps %%xmm5, %2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
79 "subps %%xmm6, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
80 "movaps %%xmm7, %3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
81 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)), "=m"(*(b2 + i + 8)), "=m"(*(b2 + i + 12))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
82 :"m"(*(b1 + i)), "m"(*(b1 + i + 4)), "m"(*(b1 + i + 8)), "m"(*(b1 + i + 12))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
83 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
84 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
85 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
86
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
87 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
88 real *costab = costab_mmx + 16;
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
89 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
90 "movaps %4, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
91 "movaps %5, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
92 "movaps %8, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
93 "xorps %%xmm6, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
94 "shufps $27, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
95 "mulps %%xmm4, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
96 "movaps %9, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
97 "xorps %%xmm7, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
98 "shufps $27, %%xmm2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
99 "mulps %%xmm2, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
100 "movaps %%xmm0, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
101 "movaps %%xmm1, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
102 "movaps %6, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
103 "mulps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
104 "subps %%xmm3, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
105 "movaps %%xmm6, %2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
106 "movaps %7, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
107 "mulps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
108 "subps %%xmm5, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
109 "movaps %%xmm7, %3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
110 :"=m"(*(b2 + 8)), "=m"(*(b2 + 0xc)), "=m"(*(b2 + 0x18)), "=m"(*(b2 + 0x1c))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
111 :"m"(*(b2 + 8)), "m"(*(b2 + 0xc)), "m"(*(b2 + 0x18)), "m"(*(b2 + 0x1c)), "m"(*costab), "m"(*(costab + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
112 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
113 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
114
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
115 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
116 real *costab = costab_mmx + 24;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
117 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
118
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
119 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
120 "movaps %0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
121 "shufps $27, %%xmm0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
122 "movaps %1, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
123 "movaps %%xmm5, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
124 :
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
125 :"m"(*costab), "m"(*nnnn)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
126 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
127
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
128 for (i = 0; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
129 {
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
130 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
131 "movaps %2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
132 "movaps %3, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
133 "movaps %%xmm2, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
134 "xorps %%xmm5, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
135 "shufps $27, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
136 "movaps %%xmm3, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
137 "shufps $27, %%xmm1, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
138 "addps %%xmm1, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
139 "movaps %%xmm2, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
140 "subps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
141 "xorps %%xmm6, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
142 "mulps %%xmm0, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
143 "movaps %%xmm4, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
144 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
145 :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
146 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
147 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
148 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
149
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
150 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
151 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
152
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
153 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
154 "movss %0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
155 "movss %1, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
156 "movaps %%xmm1, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
157 "unpcklps %%xmm0, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
158 "movss %2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
159 "movaps %%xmm1, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
160 "unpcklps %%xmm2, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
161 "unpcklps %%xmm3, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
162 "movaps %3, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
163 :
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
164 :"m"(one), "m"(costab_mmx[28]), "m"(costab_mmx[29]), "m"(*ppnn)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
165 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
166
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
167 for (i = 0; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
168 {
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
169 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
170 "movaps %2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
171 "movaps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
172 "shufps $20, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
173 "shufps $235, %%xmm3, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
174 "xorps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
175 "addps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
176 "mulps %%xmm0, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
177 "movaps %%xmm4, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
178 "movaps %3, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
179 "movaps %%xmm6, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
180 "shufps $27, %%xmm5, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
181 "xorps %%xmm2, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
182 "addps %%xmm5, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
183 "mulps %%xmm0, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
184 "movaps %%xmm6, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
185 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
186 :"m"(*(b1 + i)), "m"(*(b1 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
187 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
188 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
189 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
190
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
191 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
192 int i;
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
193 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
194 "movss %0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
195 "movaps %%xmm1, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
196 "movaps %%xmm0, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
197 "unpcklps %%xmm1, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
198 "unpcklps %%xmm0, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
199 "movaps %1, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
200 "unpcklps %%xmm7, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
201 :
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
202 :"m"(costab_mmx[30]), "m"(*pnpn)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
203 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
204
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
205 for (i = 0x8; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
206 {
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
207 __asm__ volatile (
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
208 "movaps %2, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
209 "movaps %%xmm1, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
210 "shufps $224, %%xmm3, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
211 "shufps $181, %%xmm1, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
212 "xorps %%xmm0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
213 "addps %%xmm1, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
214 "mulps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
215 "movaps %%xmm3, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
216 "movaps %3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
217 "movaps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
218 "shufps $224, %%xmm5, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
219 "shufps $181, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
220 "xorps %%xmm0, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
221 "addps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
222 "mulps %%xmm2, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
223 "movaps %%xmm5, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
224 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
225 :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
226 :"memory"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
227 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
228 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
229 for (i = 0x8; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
230 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
231 b1[i + 2] += b1[i + 3];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
232 b1[i + 6] += b1[i + 7];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
233 b1[i + 4] += b1[i + 6];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
234 b1[i + 6] += b1[i + 5];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
235 b1[i + 5] += b1[i + 7];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
236 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
237 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
238
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
239 #if 0
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
240 /* Reference C code */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
241
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
242 /*
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
243 Should run faster than x87 asm, given that the compiler is sane.
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
244 However, the C code doesn't round with saturation (0x7fff for too
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
245 large positive float, 0x8000 for too small negative float). You
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
246 can hear the difference if you listen carefully.
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
247 */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
248
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
249 out0[256] = (short)(b2[0] + b2[1]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
250 out0[0] = (short)((b2[0] - b2[1]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
251 out1[128] = (short)((b2[3] - b2[2]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
252 out0[128] = (short)((b2[3] - b2[2]) * costab_mmx[30] + b2[3] + b2[2]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
253 out1[192] = (short)((b2[7] - b2[6]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
254 out0[192] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + b2[4] + b2[5]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
255 out0[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + (b2[4] - b2[5]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
256 out1[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + (b2[4] - b2[5]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
257
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
258 out0[224] = (short)(b1[8] + b1[12]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
259 out0[160] = (short)(b1[12] + b1[10]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
260 out0[96] = (short)(b1[10] + b1[14]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
261 out0[32] = (short)(b1[14] + b1[9]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
262 out1[32] = (short)(b1[9] + b1[13]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
263 out1[96] = (short)(b1[13] + b1[11]);
23323
e30b3f6bab3f Fix bug in reference code
zuxy
parents: 21047
diff changeset
264 out1[224] = (short)b1[15];
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
265 out1[160] = (short)(b1[15] + b1[11]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
266 out0[240] = (short)(b1[24] + b1[28] + b1[16]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
267 out0[208] = (short)(b1[24] + b1[28] + b1[20]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
268 out0[176] = (short)(b1[28] + b1[26] + b1[20]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
269 out0[144] = (short)(b1[28] + b1[26] + b1[18]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
270 out0[112] = (short)(b1[26] + b1[30] + b1[18]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
271 out0[80] = (short)(b1[26] + b1[30] + b1[22]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
272 out0[48] = (short)(b1[30] + b1[25] + b1[22]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
273 out0[16] = (short)(b1[30] + b1[25] + b1[17]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
274 out1[16] = (short)(b1[25] + b1[29] + b1[17]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
275 out1[48] = (short)(b1[25] + b1[29] + b1[21]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
276 out1[80] = (short)(b1[29] + b1[27] + b1[21]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
277 out1[112] = (short)(b1[29] + b1[27] + b1[19]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
278 out1[144] = (short)(b1[27] + b1[31] + b1[19]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
279 out1[176] = (short)(b1[27] + b1[31] + b1[23]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
280 out1[240] = (short)(b1[31]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
281 out1[208] = (short)(b1[31] + b1[23]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
282
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
283 #else
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
284 /*
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
285 To do saturation efficiently in x86 we can use fist(t)(p),
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
286 pf2iw, or packssdw. We use fist(p) here.
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
287 */
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
288 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
289 "flds %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
290 "flds (%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
291 "fadds 4(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
292 "fistp 512(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
293
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
294 "flds (%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
295 "fsubs 4(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
296 "fmul %%st(1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
297 "fistp (%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
298
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
299 "flds 12(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
300 "fsubs 8(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
301 "fmul %%st(1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
302 "fist 256(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
303 "fadds 12(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
304 "fadds 8(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
305 "fistp 256(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
306
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
307 "flds 16(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
308 "fsubs 20(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
309 "fmul %%st(1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
310
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
311 "flds 28(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
312 "fsubs 24(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
313 "fmul %%st(2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
314 "fist 384(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
315 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
316 "fadds 24(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
317 "fadds 28(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
318 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
319 "fadds 16(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
320 "fadds 20(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
321 "fistp 384(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
322 "fadd %%st(2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
323 "fistp 128(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
324 "faddp %%st(1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
325 "fistp 128(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
326
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
327 "flds 32(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
328 "fadds 48(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
329 "fistp 448(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
330
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
331 "flds 48(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
332 "fadds 40(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
333 "fistp 320(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
334
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
335 "flds 40(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
336 "fadds 56(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
337 "fistp 192(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
338
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
339 "flds 56(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
340 "fadds 36(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
341 "fistp 64(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
342
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
343 "flds 36(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
344 "fadds 52(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
345 "fistp 64(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
346
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
347 "flds 52(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
348 "fadds 44(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
349 "fistp 192(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
350
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
351 "flds 60(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
352 "fist 448(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
353 "fadds 44(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
354 "fistp 320(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
355
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
356 "flds 96(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
357 "fadds 112(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
358 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
359 "fadds 64(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
360 "fistp 480(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
361 "fadds 80(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
362 "fistp 416(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
363
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
364 "flds 112(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
365 "fadds 104(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
366 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
367 "fadds 80(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
368 "fistp 352(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
369 "fadds 72(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
370 "fistp 288(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
371
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
372 "flds 104(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
373 "fadds 120(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
374 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
375 "fadds 72(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
376 "fistp 224(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
377 "fadds 88(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
378 "fistp 160(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
379
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
380 "flds 120(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
381 "fadds 100(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
382 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
383 "fadds 88(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
384 "fistp 96(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
385 "fadds 68(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
386 "fistp 32(%3)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
387
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
388 "flds 100(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
389 "fadds 116(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
390 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
391 "fadds 68(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
392 "fistp 32(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
393 "fadds 84(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
394 "fistp 96(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
395
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
396 "flds 116(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
397 "fadds 108(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
398 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
399 "fadds 84(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
400 "fistp 160(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
401 "fadds 76(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
402 "fistp 224(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
403
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
404 "flds 108(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
405 "fadds 124(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
406 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
407 "fadds 76(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
408 "fistp 288(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
409 "fadds 92(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
410 "fistp 352(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
411
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
412 "flds 124(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
413 "fist 480(%4)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
414 "fadds 92(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
415 "fistp 416(%4)\n\t"
21047
ca38f523c848 directly insert ffreep %%st(0) opcode for compatibility with old
reimar
parents: 21040
diff changeset
416 ".byte 0xdf, 0xc0\n\t" // ffreep %%st(0)
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
417 :
23441
1b739c2dc613 Correct dct64 functions' declarations
zuxy
parents: 23323
diff changeset
418 :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(out0), "r"(out1)
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
419 :"memory"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
420 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
421 #endif
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
422 out1[0] = out0[0];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
423 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
424