Mercurial > mplayer.hg
annotate libvo/aclib.c @ 33546:b0f904bc6cb0
Use the same yasm check as FFmpeg.
author | cehoyos |
---|---|
date | Thu, 16 Jun 2011 21:37:21 +0000 |
parents | 807fce7a4bb3 |
children | 4e2f4bd081ce |
rev | line source |
---|---|
28446
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
1 /* |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
2 * aclib - advanced C library ;) |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
3 * Functions which improve and expand the standard C library, see aclib_template.c. |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
4 * This file only contains runtime CPU detection and config option stuff. |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
5 * runtime CPU detection by Michael Niedermayer (michaelni@gmx.at) |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
6 * |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
7 * This file is part of MPlayer. |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
8 * |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
9 * MPlayer is free software; you can redistribute it and/or modify |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
10 * it under the terms of the GNU General Public License as published by |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
11 * the Free Software Foundation; either version 2 of the License, or |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
12 * (at your option) any later version. |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
13 * |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
14 * MPlayer is distributed in the hope that it will be useful, |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
17 * GNU General Public License for more details. |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
18 * |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
19 * You should have received a copy of the GNU General Public License along |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
20 * with MPlayer; if not, write to the Free Software Foundation, Inc., |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
22 */ |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
23 |
12650
ac3fd2ff2561
Unify the config.h #include, use "config.h" instead of "../config.h"
diego
parents:
12492
diff
changeset
|
24 #include "config.h" |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
25 #include <stddef.h> |
30135
807fce7a4bb3
Do not assume that "long" is the size of a register.
reimar
parents:
29114
diff
changeset
|
26 #include <stdint.h> |
21982
fa66a03e8920
Include string.h to make sure memcpy is not used without prototype
reimar
parents:
20577
diff
changeset
|
27 #include <string.h> |
13787
e047e70a9767
Handle "xxx.h" vs "../xxx.h" include paths in a consistent way.
diego
parents:
13720
diff
changeset
|
28 #include "cpudetect.h" |
8123
9fc45fe0d444
*HUGE* set of compiler warning fixes, unused variables removal
arpi
parents:
7072
diff
changeset
|
29 #include "fastmemcpy.h" |
30135
807fce7a4bb3
Do not assume that "long" is the size of a register.
reimar
parents:
29114
diff
changeset
|
30 #include "libavutil/x86_cpu.h" |
12492
4b8417674f1c
fix crash due to fast_memcpy calling itself instead of libc memcpy
reimar
parents:
8127
diff
changeset
|
31 #undef memcpy |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
32 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
33 #define BLOCK_SIZE 4096 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
34 #define CONFUSION_FACTOR 0 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
35 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :) |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
36 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
37 //#define STATISTICS |
3393 | 38 |
39 //Note: we have MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one | |
40 //Plain C versions | |
29114
06540eb5ef6a
Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents:
28921
diff
changeset
|
41 //#if !HAVE_MMX || CONFIG_RUNTIME_CPUDETECT |
3393 | 42 //#define COMPILE_C |
43 //#endif | |
44 | |
28921
62f0032e736a
Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents:
28448
diff
changeset
|
45 #if ARCH_X86 |
3393 | 46 |
29114
06540eb5ef6a
Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents:
28921
diff
changeset
|
47 #if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT |
3393 | 48 #define COMPILE_MMX |
49 #endif | |
50 | |
29114
06540eb5ef6a
Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents:
28921
diff
changeset
|
51 #if (HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT |
3393 | 52 #define COMPILE_MMX2 |
53 #endif | |
54 | |
29114
06540eb5ef6a
Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents:
28921
diff
changeset
|
55 #if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT |
3393 | 56 #define COMPILE_3DNOW |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
57 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
58 |
29114
06540eb5ef6a
Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents:
28921
diff
changeset
|
59 #if HAVE_SSE2 || CONFIG_RUNTIME_CPUDETECT |
5208 | 60 #define COMPILE_SSE |
61 #endif | |
62 | |
3393 | 63 #undef HAVE_MMX |
64 #undef HAVE_MMX2 | |
28335 | 65 #undef HAVE_AMD3DNOW |
5208 | 66 #undef HAVE_SSE |
67 #undef HAVE_SSE2 | |
28290 | 68 #define HAVE_MMX 0 |
69 #define HAVE_MMX2 0 | |
28335 | 70 #define HAVE_AMD3DNOW 0 |
28290 | 71 #define HAVE_SSE 0 |
72 #define HAVE_SSE2 0 | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
73 /* |
3393 | 74 #ifdef COMPILE_C |
75 #undef HAVE_MMX | |
76 #undef HAVE_MMX2 | |
28335 | 77 #undef HAVE_AMD3DNOW |
28290 | 78 #undef HAVE_SSE |
79 #undef HAVE_SSE2 | |
80 #define HAVE_MMX 0 | |
81 #define HAVE_MMX2 0 | |
28335 | 82 #define HAVE_AMD3DNOW 0 |
28290 | 83 #define HAVE_SSE 0 |
84 #define HAVE_SSE2 0 | |
3393 | 85 #define RENAME(a) a ## _C |
86 #include "aclib_template.c" | |
87 #endif | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
88 */ |
3393 | 89 //MMX versions |
90 #ifdef COMPILE_MMX | |
91 #undef RENAME | |
28290 | 92 #undef HAVE_MMX |
3393 | 93 #undef HAVE_MMX2 |
28335 | 94 #undef HAVE_AMD3DNOW |
5208 | 95 #undef HAVE_SSE |
96 #undef HAVE_SSE2 | |
28290 | 97 #define HAVE_MMX 1 |
98 #define HAVE_MMX2 0 | |
28335 | 99 #define HAVE_AMD3DNOW 0 |
28290 | 100 #define HAVE_SSE 0 |
101 #define HAVE_SSE2 0 | |
3393 | 102 #define RENAME(a) a ## _MMX |
103 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
104 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
105 |
3393 | 106 //MMX2 versions |
107 #ifdef COMPILE_MMX2 | |
108 #undef RENAME | |
28290 | 109 #undef HAVE_MMX |
110 #undef HAVE_MMX2 | |
28335 | 111 #undef HAVE_AMD3DNOW |
5208 | 112 #undef HAVE_SSE |
113 #undef HAVE_SSE2 | |
28290 | 114 #define HAVE_MMX 1 |
115 #define HAVE_MMX2 1 | |
28335 | 116 #define HAVE_AMD3DNOW 0 |
28290 | 117 #define HAVE_SSE 0 |
118 #define HAVE_SSE2 0 | |
3393 | 119 #define RENAME(a) a ## _MMX2 |
120 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
121 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
122 |
3393 | 123 //3DNOW versions |
124 #ifdef COMPILE_3DNOW | |
125 #undef RENAME | |
28290 | 126 #undef HAVE_MMX |
3393 | 127 #undef HAVE_MMX2 |
28335 | 128 #undef HAVE_AMD3DNOW |
5208 | 129 #undef HAVE_SSE |
130 #undef HAVE_SSE2 | |
28290 | 131 #define HAVE_MMX 1 |
132 #define HAVE_MMX2 0 | |
28335 | 133 #define HAVE_AMD3DNOW 1 |
28290 | 134 #define HAVE_SSE 0 |
135 #define HAVE_SSE2 0 | |
3393 | 136 #define RENAME(a) a ## _3DNow |
137 #include "aclib_template.c" | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
138 #endif |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
139 |
5208 | 140 //SSE versions (only used on SSE2 cpus) |
141 #ifdef COMPILE_SSE | |
142 #undef RENAME | |
28290 | 143 #undef HAVE_MMX |
144 #undef HAVE_MMX2 | |
28335 | 145 #undef HAVE_AMD3DNOW |
28290 | 146 #undef HAVE_SSE |
147 #undef HAVE_SSE2 | |
148 #define HAVE_MMX 1 | |
149 #define HAVE_MMX2 1 | |
28335 | 150 #define HAVE_AMD3DNOW 0 |
28290 | 151 #define HAVE_SSE 1 |
152 #define HAVE_SSE2 1 | |
5208 | 153 #define RENAME(a) a ## _SSE |
154 #include "aclib_template.c" | |
155 #endif | |
156 | |
28921
62f0032e736a
Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents:
28448
diff
changeset
|
157 #endif /* ARCH_X86 */ |
3393 | 158 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
159 |
23523
273aa6124f66
avoid utter breakage on non-x86, patch from Chris Roccati <roccati@at@pobox.dot.com>
lu_zero
parents:
21982
diff
changeset
|
160 #undef fast_memcpy |
/*
 * fast_memcpy - copy len bytes from `from` to `to` and return `to`.
 *
 * With CONFIG_RUNTIME_CPUDETECT on ARCH_X86, the copy routine is chosen at
 * run time from gCpuCaps, testing SSE2, then MMX2, then 3DNow, then MMX. The
 * trailing `else` just before the inner #endif binds to the plain memcpy()
 * call that follows it, so memcpy() is the fallback when no flag is set and
 * the only statement executed on non-x86 builds.
 *
 * Without runtime detection, exactly one call is selected by the preprocessor
 * from the HAVE_* macros. NOTE(review): on x86 those macros are redefined
 * earlier in this file around each aclib_template.c inclusion, so here they
 * hold the values left by the last included variant, not necessarily the
 * ones from config.h - confirm before changing the order of those sections.
 *
 * The return values of the variant routines are ignored; `to` is always
 * returned.
 */
7072 | 161 void * fast_memcpy(void * to, const void * from, size_t len) |
3393 | 162 { |
29114
06540eb5ef6a
Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents:
28921
diff
changeset
|
163 #if CONFIG_RUNTIME_CPUDETECT |
28921
62f0032e736a
Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents:
28448
diff
changeset
|
164 #if ARCH_X86 |
3393 | 165 // ordered by speed, fastest first |
5208 | 166 if(gCpuCaps.hasSSE2) |
167 fast_memcpy_SSE(to, from, len); | |
168 else if(gCpuCaps.hasMMX2) | |
3393 | 169 fast_memcpy_MMX2(to, from, len); |
170 else if(gCpuCaps.has3DNow) | |
171 fast_memcpy_3DNow(to, from, len); | |
172 else if(gCpuCaps.hasMMX) | |
173 fast_memcpy_MMX(to, from, len); | |
174 else | |
28921
62f0032e736a
Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents:
28448
diff
changeset
|
175 #endif |
3393 | 176 memcpy(to, from, len); // prior to MMX we use the standard memcpy |
177 #else | |
28290 | 178 #if HAVE_SSE2 |
5208 | 179 fast_memcpy_SSE(to, from, len); |
28290 | 180 #elif HAVE_MMX2 |
3393 | 181 fast_memcpy_MMX2(to, from, len); |
28335 | 182 #elif HAVE_AMD3DNOW |
3393 | 183 fast_memcpy_3DNow(to, from, len); |
28290 | 184 #elif HAVE_MMX |
3393 | 185 fast_memcpy_MMX(to, from, len); |
186 #else | |
187 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
188 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
189 |
29114
06540eb5ef6a
Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents:
28921
diff
changeset
|
190 #endif //!CONFIG_RUNTIME_CPUDETECT |
5543
c75f75806af1
memcpy must return destination ptr patch by Adam <adam@cfar.umd.edu>
michael
parents:
5208
diff
changeset
|
191 return to; |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
192 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
193 |
8127
e7153e62a7f4
On non-x86 platforms, memcpy was re-implemented in mplayer and was called
jkeil
parents:
8123
diff
changeset
|
194 #undef mem2agpcpy |
/*
 * mem2agpcpy - copy len bytes from `from` to `to` and return `to`.
 *
 * Dispatch mirrors fast_memcpy but calls the mem2agpcpy_* variants. With
 * CONFIG_RUNTIME_CPUDETECT on ARCH_X86, gCpuCaps is tested for SSE2, then
 * MMX2, then 3DNow, then MMX; the trailing `else` before the inner #endif
 * binds to the memcpy() call after it, which is therefore the fallback and
 * the only statement executed on non-x86 builds.
 *
 * Without runtime detection, exactly one call is selected by the preprocessor
 * from the HAVE_* macros. NOTE(review): on x86 those macros are redefined
 * earlier in this file around each aclib_template.c inclusion, so here they
 * hold the values left by the last included variant - confirm before
 * reordering those sections.
 *
 * The return values of the variant routines are ignored; `to` is always
 * returned.
 */
7072 | 195 void * mem2agpcpy(void * to, const void * from, size_t len) |
4681 | 196 { |
29114
06540eb5ef6a
Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents:
28921
diff
changeset
|
197 #if CONFIG_RUNTIME_CPUDETECT |
28921
62f0032e736a
Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents:
28448
diff
changeset
|
198 #if ARCH_X86 |
4681 | 199 // ordered by speed, fastest first |
5208 | 200 if(gCpuCaps.hasSSE2) |
201 mem2agpcpy_SSE(to, from, len); | |
202 else if(gCpuCaps.hasMMX2) | |
4681 | 203 mem2agpcpy_MMX2(to, from, len); |
204 else if(gCpuCaps.has3DNow) | |
205 mem2agpcpy_3DNow(to, from, len); | |
206 else if(gCpuCaps.hasMMX) | |
207 mem2agpcpy_MMX(to, from, len); | |
208 else | |
28921
62f0032e736a
Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents:
28448
diff
changeset
|
209 #endif |
4681 | 210 memcpy(to, from, len); // prior to MMX we use the standard memcpy |
211 #else | |
28290 | 212 #if HAVE_SSE2 |
5208 | 213 mem2agpcpy_SSE(to, from, len); |
28290 | 214 #elif HAVE_MMX2 |
4681 | 215 mem2agpcpy_MMX2(to, from, len); |
28335 | 216 #elif HAVE_AMD3DNOW |
4681 | 217 mem2agpcpy_3DNow(to, from, len); |
28290 | 218 #elif HAVE_MMX |
4681 | 219 mem2agpcpy_MMX(to, from, len); |
220 #else | |
221 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
222 #endif | |
223 | |
29114
06540eb5ef6a
Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents:
28921
diff
changeset
|
224 #endif //!CONFIG_RUNTIME_CPUDETECT |
8123
9fc45fe0d444
*HUGE* set of compiler warning fixes, unused variables removal
arpi
parents:
7072
diff
changeset
|
225 return to; |
4681 | 226 } |