Mercurial > mplayer.hg
annotate libvo/aclib.c @ 28329:ed42e982e79f
Fix compilation after DECLARE_ASM_CONST/DECLARE_ALIGNED moving within FFmpeg.
author | diego |
---|---|
date | Sun, 25 Jan 2009 22:34:26 +0000 |
parents | af0b0ae25b84 |
children | 31287e75b5d8 |
rev | line source |
---|---|
12650
ac3fd2ff2561
Unify the config.h #include, use "config.h" instead of "../config.h"
diego
parents:
12492
diff
changeset
|
1 #include "config.h" |
27341
e7c989f7a7c9
Start unifying names of internal preprocessor directives.
diego
parents:
23523
diff
changeset
|
2 #ifdef CONFIG_FASTMEMCPY |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
3 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
4 /* |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
5 aclib - advanced C library ;) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
6 This file contains functions which improve and expand standard C-library |
3393 | 7 see aclib_template.c ... this file only contains runtime cpu detection and config options stuff |
8 runtime cpu detection by michael niedermayer (michaelni@gmx.at) is under GPL | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
9 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
10 #include <stddef.h> |
21982
fa66a03e8920
Include string.h to make sure memcpy is not used without prototype
reimar
parents:
20577
diff
changeset
|
11 #include <string.h> |
13787
e047e70a9767
Handle "xxx.h" vs "../xxx.h" include paths in a consistent way.
diego
parents:
13720
diff
changeset
|
12 #include "cpudetect.h" |
8123
9fc45fe0d444
*HUGE* set of compiler warning fixes, unused variables removal
arpi
parents:
7072
diff
changeset
|
13 #include "fastmemcpy.h" |
12492
4b8417674f1c
fix crash due to fast_memcpy calling itself instead of libc memcpy
reimar
parents:
8127
diff
changeset
|
14 #undef memcpy |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
15 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
16 #define BLOCK_SIZE 4096 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
17 #define CONFUSION_FACTOR 0 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
18 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :) |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
19 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
20 //#define STATISTICS |
28290 | 21 #if ARCH_X86 |
3393 | 22 #define CAN_COMPILE_X86_ASM |
23 #endif | |
24 | |
25 //Note: we have MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one | |
26 //Plain C versions | |
28292
d6001126678f
More #ifdef HAVE_MMX etc. missed by earlier search.
reimar
parents:
28290
diff
changeset
|
27 //#if !HAVE_MMX || defined (RUNTIME_CPUDETECT) |
3393 | 28 //#define COMPILE_C |
29 //#endif | |
30 | |
31 #ifdef CAN_COMPILE_X86_ASM | |
32 | |
28290 | 33 #if (HAVE_MMX && !HAVE_3DNOW && !HAVE_MMX2) || defined (RUNTIME_CPUDETECT) |
3393 | 34 #define COMPILE_MMX |
35 #endif | |
36 | |
28290 | 37 #if (HAVE_MMX2 && !HAVE_SSE2) || defined (RUNTIME_CPUDETECT) |
3393 | 38 #define COMPILE_MMX2 |
39 #endif | |
40 | |
28290 | 41 #if (HAVE_3DNOW && !HAVE_MMX2) || defined (RUNTIME_CPUDETECT) |
3393 | 42 #define COMPILE_3DNOW |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
43 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
44 |
28290 | 45 #if HAVE_SSE2 || defined (RUNTIME_CPUDETECT) |
5208 | 46 #define COMPILE_SSE |
47 #endif | |
48 | |
3393 | 49 #undef HAVE_MMX |
50 #undef HAVE_MMX2 | |
51 #undef HAVE_3DNOW | |
5208 | 52 #undef HAVE_SSE |
53 #undef HAVE_SSE2 | |
28290 | 54 #define HAVE_MMX 0 |
55 #define HAVE_MMX2 0 | |
56 #define HAVE_3DNOW 0 | |
57 #define HAVE_SSE 0 | |
58 #define HAVE_SSE2 0 | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
59 /* |
3393 | 60 #ifdef COMPILE_C |
61 #undef HAVE_MMX | |
62 #undef HAVE_MMX2 | |
63 #undef HAVE_3DNOW | |
28290 | 64 #undef HAVE_SSE |
65 #undef HAVE_SSE2 | |
66 #define HAVE_MMX 0 | |
67 #define HAVE_MMX2 0 | |
68 #define HAVE_3DNOW 0 | |
69 #define HAVE_SSE 0 | |
70 #define HAVE_SSE2 0 | |
3393 | 71 #define RENAME(a) a ## _C |
72 #include "aclib_template.c" | |
73 #endif | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
74 */ |
3393 | 75 //MMX versions |
76 #ifdef COMPILE_MMX | |
77 #undef RENAME | |
28290 | 78 #undef HAVE_MMX |
3393 | 79 #undef HAVE_MMX2 |
80 #undef HAVE_3DNOW | |
5208 | 81 #undef HAVE_SSE |
82 #undef HAVE_SSE2 | |
28290 | 83 #define HAVE_MMX 1 |
84 #define HAVE_MMX2 0 | |
85 #define HAVE_3DNOW 0 | |
86 #define HAVE_SSE 0 | |
87 #define HAVE_SSE2 0 | |
3393 | 88 #define RENAME(a) a ## _MMX |
89 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
90 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
91 |
3393 | 92 //MMX2 versions |
93 #ifdef COMPILE_MMX2 | |
94 #undef RENAME | |
28290 | 95 #undef HAVE_MMX |
96 #undef HAVE_MMX2 | |
3393 | 97 #undef HAVE_3DNOW |
5208 | 98 #undef HAVE_SSE |
99 #undef HAVE_SSE2 | |
28290 | 100 #define HAVE_MMX 1 |
101 #define HAVE_MMX2 1 | |
102 #define HAVE_3DNOW 0 | |
103 #define HAVE_SSE 0 | |
104 #define HAVE_SSE2 0 | |
3393 | 105 #define RENAME(a) a ## _MMX2 |
106 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
107 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
108 |
3393 | 109 //3DNOW versions |
110 #ifdef COMPILE_3DNOW | |
111 #undef RENAME | |
28290 | 112 #undef HAVE_MMX |
3393 | 113 #undef HAVE_MMX2 |
28290 | 114 #undef HAVE_3DNOW |
5208 | 115 #undef HAVE_SSE |
116 #undef HAVE_SSE2 | |
28290 | 117 #define HAVE_MMX 1 |
118 #define HAVE_MMX2 0 | |
119 #define HAVE_3DNOW 1 | |
120 #define HAVE_SSE 0 | |
121 #define HAVE_SSE2 0 | |
3393 | 122 #define RENAME(a) a ## _3DNow |
123 #include "aclib_template.c" | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
124 #endif |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
125 |
5208 | 126 //SSE versions (only used on SSE2 cpus) |
127 #ifdef COMPILE_SSE | |
128 #undef RENAME | |
28290 | 129 #undef HAVE_MMX |
130 #undef HAVE_MMX2 | |
5208 | 131 #undef HAVE_3DNOW |
28290 | 132 #undef HAVE_SSE |
133 #undef HAVE_SSE2 | |
134 #define HAVE_MMX 1 | |
135 #define HAVE_MMX2 1 | |
136 #define HAVE_3DNOW 0 | |
137 #define HAVE_SSE 1 | |
138 #define HAVE_SSE2 1 | |
5208 | 139 #define RENAME(a) a ## _SSE |
140 #include "aclib_template.c" | |
141 #endif | |
142 | |
3393 | 143 #endif // CAN_COMPILE_X86_ASM |
144 | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
145 |
23523
273aa6124f66
avoid utter breakage on non-x86, patch from Chris Roccati <roccati@at@pobox.dot.com>
lu_zero
parents:
21982
diff
changeset
|
146 #undef fast_memcpy |
7072 | 147 void * fast_memcpy(void * to, const void * from, size_t len) |
3393 | 148 { |
149 #ifdef RUNTIME_CPUDETECT | |
150 #ifdef CAN_COMPILE_X86_ASM | |
151 // ordered by speed, fastest first | |
5208 | 152 if(gCpuCaps.hasSSE2) |
153 fast_memcpy_SSE(to, from, len); | |
154 else if(gCpuCaps.hasMMX2) | |
3393 | 155 fast_memcpy_MMX2(to, from, len); |
156 else if(gCpuCaps.has3DNow) | |
157 fast_memcpy_3DNow(to, from, len); | |
158 else if(gCpuCaps.hasMMX) | |
159 fast_memcpy_MMX(to, from, len); | |
160 else | |
161 #endif //CAN_COMPILE_X86_ASM | |
162 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
163 #else | |
28290 | 164 #if HAVE_SSE2 |
5208 | 165 fast_memcpy_SSE(to, from, len); |
28290 | 166 #elif HAVE_MMX2 |
3393 | 167 fast_memcpy_MMX2(to, from, len); |
28290 | 168 #elif HAVE_3DNOW |
3393 | 169 fast_memcpy_3DNow(to, from, len); |
28290 | 170 #elif HAVE_MMX |
3393 | 171 fast_memcpy_MMX(to, from, len); |
172 #else | |
173 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
174 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
175 |
3393 | 176 #endif //!RUNTIME_CPUDETECT |
5543
c75f75806af1
memcpy must return destination ptr patch by Adam <adam@cfar.umd.edu>
michael
parents:
5208
diff
changeset
|
177 return to; |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
178 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
179 |
8127
e7153e62a7f4
On non-x86 platforms, memcpy was re-implemented in mplayer and was called
jkeil
parents:
8123
diff
changeset
|
180 #undef mem2agpcpy |
7072 | 181 void * mem2agpcpy(void * to, const void * from, size_t len) |
4681 | 182 { |
183 #ifdef RUNTIME_CPUDETECT | |
184 #ifdef CAN_COMPILE_X86_ASM | |
185 // ordered by speed, fastest first | |
5208 | 186 if(gCpuCaps.hasSSE2) |
187 mem2agpcpy_SSE(to, from, len); | |
188 else if(gCpuCaps.hasMMX2) | |
4681 | 189 mem2agpcpy_MMX2(to, from, len); |
190 else if(gCpuCaps.has3DNow) | |
191 mem2agpcpy_3DNow(to, from, len); | |
192 else if(gCpuCaps.hasMMX) | |
193 mem2agpcpy_MMX(to, from, len); | |
194 else | |
195 #endif //CAN_COMPILE_X86_ASM | |
196 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
197 #else | |
28290 | 198 #if HAVE_SSE2 |
5208 | 199 mem2agpcpy_SSE(to, from, len); |
28290 | 200 #elif HAVE_MMX2 |
4681 | 201 mem2agpcpy_MMX2(to, from, len); |
28290 | 202 #elif HAVE_3DNOW |
4681 | 203 mem2agpcpy_3DNow(to, from, len); |
28290 | 204 #elif HAVE_MMX |
4681 | 205 mem2agpcpy_MMX(to, from, len); |
206 #else | |
207 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
208 #endif | |
209 | |
210 #endif //!RUNTIME_CPUDETECT | |
8123
9fc45fe0d444
*HUGE* set of compiler warning fixes, unused variables removal
arpi
parents:
7072
diff
changeset
|
211 return to; |
4681 | 212 } |
213 | |
27341
e7c989f7a7c9
Start unifying names of internal preprocessor directives.
diego
parents:
23523
diff
changeset
|
214 #endif /* CONFIG_FASTMEMCPY */ |