annotate mp3lib/dct64_sse.c @ 33080:60b2e408bd78

Allow number sign as comment character. This is practice in some skins and should not lead to an error message now that unknown items are reported. Update documentation.
author ib
date Thu, 31 Mar 2011 10:13:47 +0000
parents 741d36324244
children 5e58893a6478
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
1 /*
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
2 * Discrete Cosine Tansform (DCT) for SSE
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com>
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c
25350
2095f98cf0fa cosmetics: Remove ugly and inconsistent uppercasing from filenames.
diego
parents: 23465
diff changeset
5 * and mp3lib/dct64_mmx.c
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
6 */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
7
31502
c95e08f87c70 Include internal.h last to avoid a conflict between its dprintf()
reimar
parents: 30835
diff changeset
8 #include "libavutil/mem.h"
30167
347d152a5cfa Refactor real --> float #define to a typedef in a common header.
diego
parents: 28328
diff changeset
9 #include "mpg123.h"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
10
32303
cad51466b42e Revert accidentally committed changes.
diego
parents: 32302
diff changeset
11 extern float __attribute__((aligned(16))) costab_mmx[];
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
12
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
13 static const int ppnn[4] __attribute__((aligned(16))) =
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
14 { 0, 0, 1 << 31, 1 << 31 };
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
15
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
16 static const int pnpn[4] __attribute__((aligned(16))) =
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
17 { 0, 1 << 31, 0, 1 << 31 };
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
18
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
19 static const int nnnn[4] __attribute__((aligned(16))) =
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
20 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
21
23441
1b739c2dc613 Correct dct64 functions' declarations
zuxy
parents: 23323
diff changeset
22 void dct64_sse(short *out0,short *out1,real *c)
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
23 {
23465
a1a699833dcf Change some static temporary vars to automatic ones because mingw32 binutils
zuxy
parents: 23464
diff changeset
24 DECLARE_ALIGNED(16, real, b1[0x20]);
a1a699833dcf Change some static temporary vars to automatic ones because mingw32 binutils
zuxy
parents: 23464
diff changeset
25 DECLARE_ALIGNED(16, real, b2[0x20]);
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
26 static real const one = 1.f;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
27
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
28 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
29 real *costab = costab_mmx;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
30 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
31
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
32 for (i = 0; i < 0x20 / 2; i += 4)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
33 {
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
34 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
35 "movaps %2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
36 "shufps $27, %%xmm3, %%xmm3\n\t"
23464
1b1fdac4a68c Align output pointer so that we can use movaps instead of movups in dct64_sse;
zuxy
parents: 23441
diff changeset
37 "movaps %3, %%xmm1\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
38 "movaps %%xmm1, %%xmm4\n\t"
23464
1b1fdac4a68c Align output pointer so that we can use movaps instead of movups in dct64_sse;
zuxy
parents: 23441
diff changeset
39 "movaps %4, %%xmm2\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
40 "shufps $27, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
41 "movaps %%xmm2, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
42 "shufps $27, %%xmm0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
43 "addps %%xmm0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
44 "movaps %%xmm1, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
45 "subps %%xmm2, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
46 "mulps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
47 "movaps %%xmm4, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
48 :"=m"(*(b1 + i)), "=m"(*(b1 + 0x1c - i))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
49 :"m"(*(costab + i)), "m"(*(c + i)), "m"(*(c + 0x1c - i))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
50 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
51 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
52 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
53
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
54 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
55 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
56
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
57 for (i = 0; i < 0x20; i += 0x10)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
58 {
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
59 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
60 "movaps %4, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
61 "movaps %5, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
62 "movaps %6, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
63 "movaps %7, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
64 "movaps %%xmm1, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
65 "shufps $27, %%xmm7, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
66 "movaps %%xmm3, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
67 "shufps $27, %%xmm5, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
68 "movaps %%xmm4, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
69 "shufps $27, %%xmm2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
70 "movaps %%xmm6, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
71 "shufps $27, %%xmm0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
72 "addps %%xmm0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
73 "movaps %%xmm1, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
74 "addps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
75 "movaps %%xmm3, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
76 "subps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
77 "movaps %%xmm5, %2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
78 "subps %%xmm6, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
79 "movaps %%xmm7, %3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
80 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)), "=m"(*(b2 + i + 8)), "=m"(*(b2 + i + 12))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
81 :"m"(*(b1 + i)), "m"(*(b1 + i + 4)), "m"(*(b1 + i + 8)), "m"(*(b1 + i + 12))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
82 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
83 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
84 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
85
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
86 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
87 real *costab = costab_mmx + 16;
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
88 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
89 "movaps %4, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
90 "movaps %5, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
91 "movaps %8, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
92 "xorps %%xmm6, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
93 "shufps $27, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
94 "mulps %%xmm4, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
95 "movaps %9, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
96 "xorps %%xmm7, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
97 "shufps $27, %%xmm2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
98 "mulps %%xmm2, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
99 "movaps %%xmm0, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
100 "movaps %%xmm1, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
101 "movaps %6, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
102 "mulps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
103 "subps %%xmm3, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
104 "movaps %%xmm6, %2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
105 "movaps %7, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
106 "mulps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
107 "subps %%xmm5, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
108 "movaps %%xmm7, %3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
109 :"=m"(*(b2 + 8)), "=m"(*(b2 + 0xc)), "=m"(*(b2 + 0x18)), "=m"(*(b2 + 0x1c))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
110 :"m"(*(b2 + 8)), "m"(*(b2 + 0xc)), "m"(*(b2 + 0x18)), "m"(*(b2 + 0x1c)), "m"(*costab), "m"(*(costab + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
111 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
112 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
113
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
114 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
115 real *costab = costab_mmx + 24;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
116 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
117
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
118 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
119 "movaps %0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
120 "shufps $27, %%xmm0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
121 "movaps %1, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
122 "movaps %%xmm5, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
123 :
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
124 :"m"(*costab), "m"(*nnnn)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
125 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
126
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
127 for (i = 0; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
128 {
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
129 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
130 "movaps %2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
131 "movaps %3, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
132 "movaps %%xmm2, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
133 "xorps %%xmm5, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
134 "shufps $27, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
135 "movaps %%xmm3, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
136 "shufps $27, %%xmm1, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
137 "addps %%xmm1, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
138 "movaps %%xmm2, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
139 "subps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
140 "xorps %%xmm6, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
141 "mulps %%xmm0, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
142 "movaps %%xmm4, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
143 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
144 :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
145 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
146 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
147 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
148
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
149 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
150 int i;
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
151
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
152 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
153 "movss %0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
154 "movss %1, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
155 "movaps %%xmm1, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
156 "unpcklps %%xmm0, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
157 "movss %2, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
158 "movaps %%xmm1, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
159 "unpcklps %%xmm2, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
160 "unpcklps %%xmm3, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
161 "movaps %3, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
162 :
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
163 :"m"(one), "m"(costab_mmx[28]), "m"(costab_mmx[29]), "m"(*ppnn)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
164 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
165
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
166 for (i = 0; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
167 {
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
168 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
169 "movaps %2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
170 "movaps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
171 "shufps $20, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
172 "shufps $235, %%xmm3, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
173 "xorps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
174 "addps %%xmm3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
175 "mulps %%xmm0, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
176 "movaps %%xmm4, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
177 "movaps %3, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
178 "movaps %%xmm6, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
179 "shufps $27, %%xmm5, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
180 "xorps %%xmm2, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
181 "addps %%xmm5, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
182 "mulps %%xmm0, %%xmm6\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
183 "movaps %%xmm6, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
184 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
185 :"m"(*(b1 + i)), "m"(*(b1 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
186 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
187 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
188 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
189
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
190 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
191 int i;
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
192 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
193 "movss %0, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
194 "movaps %%xmm1, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
195 "movaps %%xmm0, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
196 "unpcklps %%xmm1, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
197 "unpcklps %%xmm0, %%xmm7\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
198 "movaps %1, %%xmm0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
199 "unpcklps %%xmm7, %%xmm2\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
200 :
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
201 :"m"(costab_mmx[30]), "m"(*pnpn)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
202 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
203
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
204 for (i = 0x8; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
205 {
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
206 __asm__ volatile (
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
207 "movaps %2, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
208 "movaps %%xmm1, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
209 "shufps $224, %%xmm3, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
210 "shufps $181, %%xmm1, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
211 "xorps %%xmm0, %%xmm1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
212 "addps %%xmm1, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
213 "mulps %%xmm2, %%xmm3\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
214 "movaps %%xmm3, %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
215 "movaps %3, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
216 "movaps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
217 "shufps $224, %%xmm5, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
218 "shufps $181, %%xmm4, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
219 "xorps %%xmm0, %%xmm4\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
220 "addps %%xmm4, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
221 "mulps %%xmm2, %%xmm5\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
222 "movaps %%xmm5, %1\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
223 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
224 :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
225 :"memory"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
226 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
227 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
228 for (i = 0x8; i < 0x20; i += 8)
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
229 {
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
230 b1[i + 2] += b1[i + 3];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
231 b1[i + 6] += b1[i + 7];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
232 b1[i + 4] += b1[i + 6];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
233 b1[i + 6] += b1[i + 5];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
234 b1[i + 5] += b1[i + 7];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
235 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
236 }
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
237
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
238 #if 0
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
239 /* Reference C code */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
240
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
241 /*
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
242 Should run faster than x87 asm, given that the compiler is sane.
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
243 However, the C code dosen't round with saturation (0x7fff for too
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
244 large positive float, 0x8000 for too small negative float). You
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
245 can hear the difference if you listen carefully.
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
246 */
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
247
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
248 out0[256] = (short)(b2[0] + b2[1]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
249 out0[0] = (short)((b2[0] - b2[1]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
250 out1[128] = (short)((b2[3] - b2[2]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
251 out0[128] = (short)((b2[3] - b2[2]) * costab_mmx[30] + b2[3] + b2[2]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
252 out1[192] = (short)((b2[7] - b2[6]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
253 out0[192] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + b2[4] + b2[5]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
254 out0[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + (b2[4] - b2[5]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
255 out1[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + (b2[4] - b2[5]) * costab_mmx[30]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
256
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
257 out0[224] = (short)(b1[8] + b1[12]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
258 out0[160] = (short)(b1[12] + b1[10]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
259 out0[96] = (short)(b1[10] + b1[14]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
260 out0[32] = (short)(b1[14] + b1[9]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
261 out1[32] = (short)(b1[9] + b1[13]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
262 out1[96] = (short)(b1[13] + b1[11]);
23323
e30b3f6bab3f Fix bug in reference code
zuxy
parents: 21047
diff changeset
263 out1[224] = (short)b1[15];
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
264 out1[160] = (short)(b1[15] + b1[11]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
265 out0[240] = (short)(b1[24] + b1[28] + b1[16]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
266 out0[208] = (short)(b1[24] + b1[28] + b1[20]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
267 out0[176] = (short)(b1[28] + b1[26] + b1[20]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
268 out0[144] = (short)(b1[28] + b1[26] + b1[18]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
269 out0[112] = (short)(b1[26] + b1[30] + b1[18]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
270 out0[80] = (short)(b1[26] + b1[30] + b1[22]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
271 out0[48] = (short)(b1[30] + b1[25] + b1[22]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
272 out0[16] = (short)(b1[30] + b1[25] + b1[17]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
273 out1[16] = (short)(b1[25] + b1[29] + b1[17]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
274 out1[48] = (short)(b1[25] + b1[29] + b1[21]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
275 out1[80] = (short)(b1[29] + b1[27] + b1[21]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
276 out1[112] = (short)(b1[29] + b1[27] + b1[19]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
277 out1[144] = (short)(b1[27] + b1[31] + b1[19]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
278 out1[176] = (short)(b1[27] + b1[31] + b1[23]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
279 out1[240] = (short)(b1[31]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
280 out1[208] = (short)(b1[31] + b1[23]);
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
281
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
282 #else
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
283 /*
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
284 To do saturation efficiently in x86 we can use fist(p)s,
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
285 pf2iw, or packssdw. We use fist(p)s here.
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
286 */
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25350
diff changeset
287 __asm__(
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
288 "flds %0\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
289 "flds (%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
290 "fadds 4(%2)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
291 "fistps 512(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
292
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
293 "flds (%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
294 "fsubs 4(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
295 "fmul %%st(1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
296 "fistps (%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
297
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
298 "flds 12(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
299 "fsubs 8(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
300 "fmul %%st(1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
301 "fists 256(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
302 "fadds 12(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
303 "fadds 8(%2)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
304 "fistps 256(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
305
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
306 "flds 16(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
307 "fsubs 20(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
308 "fmul %%st(1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
309
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
310 "flds 28(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
311 "fsubs 24(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
312 "fmul %%st(2)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
313 "fists 384(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
314 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
315 "fadds 24(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
316 "fadds 28(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
317 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
318 "fadds 16(%2)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
319 "fadds 20(%2)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
320 "fistps 384(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
321 "fadd %%st(2)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
322 "fistps 128(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
323 "faddp %%st(1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
324 "fistps 128(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
325
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
326 "flds 32(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
327 "fadds 48(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
328 "fistps 448(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
329
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
330 "flds 48(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
331 "fadds 40(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
332 "fistps 320(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
333
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
334 "flds 40(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
335 "fadds 56(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
336 "fistps 192(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
337
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
338 "flds 56(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
339 "fadds 36(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
340 "fistps 64(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
341
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
342 "flds 36(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
343 "fadds 52(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
344 "fistps 64(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
345
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
346 "flds 52(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
347 "fadds 44(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
348 "fistps 192(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
349
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
350 "flds 60(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
351 "fists 448(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
352 "fadds 44(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
353 "fistps 320(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
354
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
355 "flds 96(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
356 "fadds 112(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
357 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
358 "fadds 64(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
359 "fistps 480(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
360 "fadds 80(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
361 "fistps 416(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
362
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
363 "flds 112(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
364 "fadds 104(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
365 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
366 "fadds 80(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
367 "fistps 352(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
368 "fadds 72(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
369 "fistps 288(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
370
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
371 "flds 104(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
372 "fadds 120(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
373 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
374 "fadds 72(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
375 "fistps 224(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
376 "fadds 88(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
377 "fistps 160(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
378
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
379 "flds 120(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
380 "fadds 100(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
381 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
382 "fadds 88(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
383 "fistps 96(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
384 "fadds 68(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
385 "fistps 32(%3)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
386
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
387 "flds 100(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
388 "fadds 116(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
389 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
390 "fadds 68(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
391 "fistps 32(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
392 "fadds 84(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
393 "fistps 96(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
394
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
395 "flds 116(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
396 "fadds 108(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
397 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
398 "fadds 84(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
399 "fistps 160(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
400 "fadds 76(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
401 "fistps 224(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
402
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
403 "flds 108(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
404 "fadds 124(%1)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
405 "fld %%st(0)\n\t"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
406 "fadds 76(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
407 "fistps 288(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
408 "fadds 92(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
409 "fistps 352(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
410
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
411 "flds 124(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
412 "fists 480(%4)\n\t"
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
413 "fadds 92(%1)\n\t"
32526
741d36324244 Use fist(p)s instead of fist(p), fixes compilation with clang.
cehoyos
parents: 32303
diff changeset
414 "fistps 416(%4)\n\t"
21047
ca38f523c848 directly insert ffreep %%st(0) opcode for compatibility with old
reimar
parents: 21040
diff changeset
415 ".byte 0xdf, 0xc0\n\t" // ffreep %%st(0)
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
416 :
23441
1b739c2dc613 Correct dct64 functions' declarations
zuxy
parents: 23323
diff changeset
417 :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(out0), "r"(out1)
18932
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
418 :"memory"
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
419 );
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
420 #endif
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
421 out1[0] = out0[0];
69c665e91946 Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff changeset
422 }