Mercurial > mplayer.hg
annotate mp3lib/dct64_sse.c @ 32282:606e4157cd4c
Split alloc and init of context so that parameters can be set in the context
instead of requireing being passed through function parameters. This also
makes sws work with AVOptions.
author | michael |
---|---|
date | Sun, 26 Sep 2010 19:33:57 +0000 |
parents | 9b4d2a805d63 |
children | 6d3212dd47b2 |
rev | line source |
---|---|
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
1 /* |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
2 * Discrete Cosine Tansform (DCT) for SSE |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com> |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c |
25350
2095f98cf0fa
cosmetics: Remove ugly and inconsistent uppercasing from filenames.
diego
parents:
23465
diff
changeset
|
5 * and mp3lib/dct64_mmx.c |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
6 */ |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
7 |
31502
c95e08f87c70
Include internal.h last to avoid a conflict between its dprintf()
reimar
parents:
30835
diff
changeset
|
8 #include "libavutil/mem.h" |
30167
347d152a5cfa
Refactor real --> float #define to a typedef in a common header.
diego
parents:
28328
diff
changeset
|
9 #include "mpg123.h" |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
10 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
11 extern float __attribute__((aligned(16))) costab_mmx[]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
12 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
13 static const int ppnn[4] __attribute__((aligned(16))) = |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
14 { 0, 0, 1 << 31, 1 << 31 }; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
15 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
16 static const int pnpn[4] __attribute__((aligned(16))) = |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
17 { 0, 1 << 31, 0, 1 << 31 }; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
18 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
19 static const int nnnn[4] __attribute__((aligned(16))) = |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
20 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
21 |
23441 | 22 void dct64_sse(short *out0,short *out1,real *c) |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
23 { |
23465
a1a699833dcf
Change some static temporary vars to automatic ones because mingw32 binutils
zuxy
parents:
23464
diff
changeset
|
24 DECLARE_ALIGNED(16, real, b1[0x20]); |
a1a699833dcf
Change some static temporary vars to automatic ones because mingw32 binutils
zuxy
parents:
23464
diff
changeset
|
25 DECLARE_ALIGNED(16, real, b2[0x20]); |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
26 static real const one = 1.f; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
27 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
28 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
29 real *costab = costab_mmx; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
30 int i; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
31 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
32 for (i = 0; i < 0x20 / 2; i += 4) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
33 { |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25350
diff
changeset
|
34 __asm__( |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
35 "movaps %2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
36 "shufps $27, %%xmm3, %%xmm3\n\t" |
23464
1b1fdac4a68c
Align output pointer so that we can use movaps instead of movups in dct64_sse;
zuxy
parents:
23441
diff
changeset
|
37 "movaps %3, %%xmm1\n\t" |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
38 "movaps %%xmm1, %%xmm4\n\t" |
23464
1b1fdac4a68c
Align output pointer so that we can use movaps instead of movups in dct64_sse;
zuxy
parents:
23441
diff
changeset
|
39 "movaps %4, %%xmm2\n\t" |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
40 "shufps $27, %%xmm4, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
41 "movaps %%xmm2, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
42 "shufps $27, %%xmm0, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
43 "addps %%xmm0, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
44 "movaps %%xmm1, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
45 "subps %%xmm2, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
46 "mulps %%xmm3, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
47 "movaps %%xmm4, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
48 :"=m"(*(b1 + i)), "=m"(*(b1 + 0x1c - i)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
49 :"m"(*(costab + i)), "m"(*(c + i)), "m"(*(c + 0x1c - i)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
50 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
51 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
52 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
53 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
54 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
55 int i; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
56 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
57 for (i = 0; i < 0x20; i += 0x10) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
58 { |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25350
diff
changeset
|
59 __asm__( |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
60 "movaps %4, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
61 "movaps %5, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
62 "movaps %6, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
63 "movaps %7, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
64 "movaps %%xmm1, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
65 "shufps $27, %%xmm7, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
66 "movaps %%xmm3, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
67 "shufps $27, %%xmm5, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
68 "movaps %%xmm4, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
69 "shufps $27, %%xmm2, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
70 "movaps %%xmm6, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
71 "shufps $27, %%xmm0, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
72 "addps %%xmm0, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
73 "movaps %%xmm1, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
74 "addps %%xmm2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
75 "movaps %%xmm3, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
76 "subps %%xmm4, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
77 "movaps %%xmm5, %2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
78 "subps %%xmm6, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
79 "movaps %%xmm7, %3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
80 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)), "=m"(*(b2 + i + 8)), "=m"(*(b2 + i + 12)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
81 :"m"(*(b1 + i)), "m"(*(b1 + i + 4)), "m"(*(b1 + i + 8)), "m"(*(b1 + i + 12)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
82 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
83 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
84 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
85 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
86 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
87 real *costab = costab_mmx + 16; |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25350
diff
changeset
|
88 __asm__( |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
89 "movaps %4, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
90 "movaps %5, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
91 "movaps %8, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
92 "xorps %%xmm6, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
93 "shufps $27, %%xmm4, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
94 "mulps %%xmm4, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
95 "movaps %9, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
96 "xorps %%xmm7, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
97 "shufps $27, %%xmm2, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
98 "mulps %%xmm2, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
99 "movaps %%xmm0, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
100 "movaps %%xmm1, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
101 "movaps %6, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
102 "mulps %%xmm2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
103 "subps %%xmm3, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
104 "movaps %%xmm6, %2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
105 "movaps %7, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
106 "mulps %%xmm4, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
107 "subps %%xmm5, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
108 "movaps %%xmm7, %3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
109 :"=m"(*(b2 + 8)), "=m"(*(b2 + 0xc)), "=m"(*(b2 + 0x18)), "=m"(*(b2 + 0x1c)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
110 :"m"(*(b2 + 8)), "m"(*(b2 + 0xc)), "m"(*(b2 + 0x18)), "m"(*(b2 + 0x1c)), "m"(*costab), "m"(*(costab + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
111 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
112 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
113 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
114 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
115 real *costab = costab_mmx + 24; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
116 int i; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
117 |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25350
diff
changeset
|
118 __asm__( |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
119 "movaps %0, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
120 "shufps $27, %%xmm0, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
121 "movaps %1, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
122 "movaps %%xmm5, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
123 : |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
124 :"m"(*costab), "m"(*nnnn) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
125 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
126 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
127 for (i = 0; i < 0x20; i += 8) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
128 { |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25350
diff
changeset
|
129 __asm__( |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
130 "movaps %2, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
131 "movaps %3, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
132 "movaps %%xmm2, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
133 "xorps %%xmm5, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
134 "shufps $27, %%xmm4, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
135 "movaps %%xmm3, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
136 "shufps $27, %%xmm1, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
137 "addps %%xmm1, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
138 "movaps %%xmm2, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
139 "subps %%xmm3, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
140 "xorps %%xmm6, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
141 "mulps %%xmm0, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
142 "movaps %%xmm4, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
143 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
144 :"m"(*(b2 + i)), "m"(*(b2 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
145 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
146 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
147 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
148 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
149 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
150 int i; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
151 |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25350
diff
changeset
|
152 __asm__( |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
153 "movss %0, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
154 "movss %1, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
155 "movaps %%xmm1, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
156 "unpcklps %%xmm0, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
157 "movss %2, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
158 "movaps %%xmm1, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
159 "unpcklps %%xmm2, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
160 "unpcklps %%xmm3, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
161 "movaps %3, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
162 : |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
163 :"m"(one), "m"(costab_mmx[28]), "m"(costab_mmx[29]), "m"(*ppnn) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
164 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
165 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
166 for (i = 0; i < 0x20; i += 8) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
167 { |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25350
diff
changeset
|
168 __asm__( |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
169 "movaps %2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
170 "movaps %%xmm3, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
171 "shufps $20, %%xmm4, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
172 "shufps $235, %%xmm3, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
173 "xorps %%xmm2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
174 "addps %%xmm3, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
175 "mulps %%xmm0, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
176 "movaps %%xmm4, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
177 "movaps %3, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
178 "movaps %%xmm6, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
179 "shufps $27, %%xmm5, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
180 "xorps %%xmm2, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
181 "addps %%xmm5, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
182 "mulps %%xmm0, %%xmm6\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
183 "movaps %%xmm6, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
184 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
185 :"m"(*(b1 + i)), "m"(*(b1 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
186 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
187 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
188 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
189 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
190 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
191 int i; |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25350
diff
changeset
|
192 __asm__( |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
193 "movss %0, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
194 "movaps %%xmm1, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
195 "movaps %%xmm0, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
196 "unpcklps %%xmm1, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
197 "unpcklps %%xmm0, %%xmm7\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
198 "movaps %1, %%xmm0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
199 "unpcklps %%xmm7, %%xmm2\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
200 : |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
201 :"m"(costab_mmx[30]), "m"(*pnpn) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
202 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
203 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
204 for (i = 0x8; i < 0x20; i += 8) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
205 { |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25350
diff
changeset
|
206 __asm__ volatile ( |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
207 "movaps %2, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
208 "movaps %%xmm1, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
209 "shufps $224, %%xmm3, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
210 "shufps $181, %%xmm1, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
211 "xorps %%xmm0, %%xmm1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
212 "addps %%xmm1, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
213 "mulps %%xmm2, %%xmm3\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
214 "movaps %%xmm3, %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
215 "movaps %3, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
216 "movaps %%xmm4, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
217 "shufps $224, %%xmm5, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
218 "shufps $181, %%xmm4, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
219 "xorps %%xmm0, %%xmm4\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
220 "addps %%xmm4, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
221 "mulps %%xmm2, %%xmm5\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
222 "movaps %%xmm5, %1\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
223 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
224 :"m"(*(b2 + i)), "m"(*(b2 + i + 4)) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
225 :"memory" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
226 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
227 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
228 for (i = 0x8; i < 0x20; i += 8) |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
229 { |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
230 b1[i + 2] += b1[i + 3]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
231 b1[i + 6] += b1[i + 7]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
232 b1[i + 4] += b1[i + 6]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
233 b1[i + 6] += b1[i + 5]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
234 b1[i + 5] += b1[i + 7]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
235 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
236 } |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
237 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
238 #if 0 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
239 /* Reference C code */ |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
240 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
241 /* |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
242 Should run faster than x87 asm, given that the compiler is sane. |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
243 However, the C code dosen't round with saturation (0x7fff for too |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
244 large positive float, 0x8000 for too small negative float). You |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
245 can hear the difference if you listen carefully. |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
246 */ |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
247 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
248 out0[256] = (short)(b2[0] + b2[1]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
249 out0[0] = (short)((b2[0] - b2[1]) * costab_mmx[30]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
250 out1[128] = (short)((b2[3] - b2[2]) * costab_mmx[30]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
251 out0[128] = (short)((b2[3] - b2[2]) * costab_mmx[30] + b2[3] + b2[2]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
252 out1[192] = (short)((b2[7] - b2[6]) * costab_mmx[30]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
253 out0[192] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + b2[4] + b2[5]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
254 out0[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + (b2[4] - b2[5]) * costab_mmx[30]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
255 out1[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + (b2[4] - b2[5]) * costab_mmx[30]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
256 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
257 out0[224] = (short)(b1[8] + b1[12]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
258 out0[160] = (short)(b1[12] + b1[10]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
259 out0[96] = (short)(b1[10] + b1[14]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
260 out0[32] = (short)(b1[14] + b1[9]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
261 out1[32] = (short)(b1[9] + b1[13]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
262 out1[96] = (short)(b1[13] + b1[11]); |
23323 | 263 out1[224] = (short)b1[15]; |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
264 out1[160] = (short)(b1[15] + b1[11]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
265 out0[240] = (short)(b1[24] + b1[28] + b1[16]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
266 out0[208] = (short)(b1[24] + b1[28] + b1[20]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
267 out0[176] = (short)(b1[28] + b1[26] + b1[20]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
268 out0[144] = (short)(b1[28] + b1[26] + b1[18]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
269 out0[112] = (short)(b1[26] + b1[30] + b1[18]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
270 out0[80] = (short)(b1[26] + b1[30] + b1[22]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
271 out0[48] = (short)(b1[30] + b1[25] + b1[22]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
272 out0[16] = (short)(b1[30] + b1[25] + b1[17]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
273 out1[16] = (short)(b1[25] + b1[29] + b1[17]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
274 out1[48] = (short)(b1[25] + b1[29] + b1[21]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
275 out1[80] = (short)(b1[29] + b1[27] + b1[21]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
276 out1[112] = (short)(b1[29] + b1[27] + b1[19]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
277 out1[144] = (short)(b1[27] + b1[31] + b1[19]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
278 out1[176] = (short)(b1[27] + b1[31] + b1[23]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
279 out1[240] = (short)(b1[31]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
280 out1[208] = (short)(b1[31] + b1[23]); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
281 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
282 #else |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
283 /* |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
284 To do saturation efficiently in x86 we can use fist(t)(p), |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
285 pf2iw, or packssdw. We use fist(p) here. |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
286 */ |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25350
diff
changeset
|
287 __asm__( |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
288 "flds %0\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
289 "flds (%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
290 "fadds 4(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
291 "fistp 512(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
292 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
293 "flds (%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
294 "fsubs 4(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
295 "fmul %%st(1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
296 "fistp (%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
297 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
298 "flds 12(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
299 "fsubs 8(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
300 "fmul %%st(1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
301 "fist 256(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
302 "fadds 12(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
303 "fadds 8(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
304 "fistp 256(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
305 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
306 "flds 16(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
307 "fsubs 20(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
308 "fmul %%st(1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
309 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
310 "flds 28(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
311 "fsubs 24(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
312 "fmul %%st(2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
313 "fist 384(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
314 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
315 "fadds 24(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
316 "fadds 28(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
317 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
318 "fadds 16(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
319 "fadds 20(%2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
320 "fistp 384(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
321 "fadd %%st(2)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
322 "fistp 128(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
323 "faddp %%st(1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
324 "fistp 128(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
325 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
326 "flds 32(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
327 "fadds 48(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
328 "fistp 448(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
329 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
330 "flds 48(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
331 "fadds 40(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
332 "fistp 320(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
333 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
334 "flds 40(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
335 "fadds 56(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
336 "fistp 192(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
337 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
338 "flds 56(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
339 "fadds 36(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
340 "fistp 64(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
341 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
342 "flds 36(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
343 "fadds 52(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
344 "fistp 64(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
345 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
346 "flds 52(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
347 "fadds 44(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
348 "fistp 192(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
349 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
350 "flds 60(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
351 "fist 448(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
352 "fadds 44(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
353 "fistp 320(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
354 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
355 "flds 96(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
356 "fadds 112(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
357 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
358 "fadds 64(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
359 "fistp 480(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
360 "fadds 80(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
361 "fistp 416(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
362 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
363 "flds 112(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
364 "fadds 104(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
365 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
366 "fadds 80(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
367 "fistp 352(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
368 "fadds 72(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
369 "fistp 288(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
370 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
371 "flds 104(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
372 "fadds 120(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
373 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
374 "fadds 72(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
375 "fistp 224(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
376 "fadds 88(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
377 "fistp 160(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
378 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
379 "flds 120(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
380 "fadds 100(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
381 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
382 "fadds 88(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
383 "fistp 96(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
384 "fadds 68(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
385 "fistp 32(%3)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
386 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
387 "flds 100(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
388 "fadds 116(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
389 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
390 "fadds 68(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
391 "fistp 32(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
392 "fadds 84(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
393 "fistp 96(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
394 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
395 "flds 116(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
396 "fadds 108(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
397 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
398 "fadds 84(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
399 "fistp 160(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
400 "fadds 76(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
401 "fistp 224(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
402 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
403 "flds 108(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
404 "fadds 124(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
405 "fld %%st(0)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
406 "fadds 76(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
407 "fistp 288(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
408 "fadds 92(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
409 "fistp 352(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
410 |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
411 "flds 124(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
412 "fist 480(%4)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
413 "fadds 92(%1)\n\t" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
414 "fistp 416(%4)\n\t" |
21047
ca38f523c848
directly insert ffreep %%st(0) opcode for compatibility with old
reimar
parents:
21040
diff
changeset
|
415 ".byte 0xdf, 0xc0\n\t" // ffreep %%st(0) |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
416 : |
23441 | 417 :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(out0), "r"(out1) |
18932
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
418 :"memory" |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
419 ); |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
420 #endif |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
421 out1[0] = out0[0]; |
69c665e91946
Add dct64_sse, a replacement for dct64_MMX. About 60% faster on its author's Pentium III
gpoirier
parents:
diff
changeset
|
422 } |