Mercurial > mplayer.hg
annotate libvo/aclib.c @ 27985:1c77b86d355d
Remove a ColorFill that is not necessary since the surface it is used
on has exactly the same size as the video image and the video will
be copied into it before it is used the first time.
author | reimar |
---|---|
date | Mon, 24 Nov 2008 09:46:23 +0000 |
parents | e7c989f7a7c9 |
children | 25337a2147e7 |
rev | line source |
---|---|
12650
ac3fd2ff2561
Unify the config.h #include, use "config.h" instead of "../config.h"
diego
parents:
12492
diff
changeset
|
1 #include "config.h" |
27341
e7c989f7a7c9
Start unifying names of internal preprocessor directives.
diego
parents:
23523
diff
changeset
|
2 #ifdef CONFIG_FASTMEMCPY |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
3 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
4 /* |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
5 aclib - advanced C library ;) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
6 This file contains functions which improve and expand standard C-library |
3393 | 7 see aclib_template.c ... this file only contains runtime cpu detection and config options stuff |
8 runtime cpu detection by michael niedermayer (michaelni@gmx.at) is under GPL | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
9 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
10 #include <stddef.h> |
21982
fa66a03e8920
Include string.h to make sure memcpy is not used without prototype
reimar
parents:
20577
diff
changeset
|
11 #include <string.h> |
13787
e047e70a9767
Handle "xxx.h" vs "../xxx.h" include paths in a consistent way.
diego
parents:
13720
diff
changeset
|
12 #include "cpudetect.h" |
8123
9fc45fe0d444
*HUGE* set of compiler warning fixes, unused variables removal
arpi
parents:
7072
diff
changeset
|
13 #include "fastmemcpy.h" |
12492
4b8417674f1c
fix crash due to fast_memcpy calling itself instead of libc memcpy
reimar
parents:
8127
diff
changeset
|
14 #undef memcpy |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
15 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
16 #define BLOCK_SIZE 4096 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
17 #define CONFUSION_FACTOR 0 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
18 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :) |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
19 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
20 //#define STATISTICS |
20577 | 21 #ifdef ARCH_X86 |
3393 | 22 #define CAN_COMPILE_X86_ASM |
23 #endif | |
24 | |
25 //Note: we have MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one | |
26 //Plain C versions | |
27 //#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) | |
28 //#define COMPILE_C | |
29 //#endif | |
30 | |
31 #ifdef CAN_COMPILE_X86_ASM | |
32 | |
33 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) | |
34 #define COMPILE_MMX | |
35 #endif | |
36 | |
5208 | 37 #if (defined (HAVE_MMX2) && !defined (HAVE_SSE2)) || defined (RUNTIME_CPUDETECT) |
3393 | 38 #define COMPILE_MMX2 |
39 #endif | |
40 | |
41 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) | |
42 #define COMPILE_3DNOW | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
43 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
44 |
5208 | 45 #if defined (HAVE_SSE2) || defined (RUNTIME_CPUDETECT) |
46 #define COMPILE_SSE | |
47 #endif | |
48 | |
3393 | 49 #undef HAVE_MMX |
50 #undef HAVE_MMX2 | |
51 #undef HAVE_3DNOW | |
5208 | 52 #undef HAVE_SSE |
53 #undef HAVE_SSE2 | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
54 /* |
3393 | 55 #ifdef COMPILE_C |
56 #undef HAVE_MMX | |
57 #undef HAVE_MMX2 | |
58 #undef HAVE_3DNOW | |
59 #undef ARCH_X86 | |
60 #define RENAME(a) a ## _C | |
61 #include "aclib_template.c" | |
62 #endif | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
63 */ |
3393 | 64 //MMX versions |
65 #ifdef COMPILE_MMX | |
66 #undef RENAME | |
67 #define HAVE_MMX | |
68 #undef HAVE_MMX2 | |
69 #undef HAVE_3DNOW | |
5208 | 70 #undef HAVE_SSE |
71 #undef HAVE_SSE2 | |
3393 | 72 #define RENAME(a) a ## _MMX |
73 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
74 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
75 |
3393 | 76 //MMX2 versions |
77 #ifdef COMPILE_MMX2 | |
78 #undef RENAME | |
79 #define HAVE_MMX | |
80 #define HAVE_MMX2 | |
81 #undef HAVE_3DNOW | |
5208 | 82 #undef HAVE_SSE |
83 #undef HAVE_SSE2 | |
3393 | 84 #define RENAME(a) a ## _MMX2 |
85 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
86 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
87 |
3393 | 88 //3DNOW versions |
89 #ifdef COMPILE_3DNOW | |
90 #undef RENAME | |
91 #define HAVE_MMX | |
92 #undef HAVE_MMX2 | |
93 #define HAVE_3DNOW | |
5208 | 94 #undef HAVE_SSE |
95 #undef HAVE_SSE2 | |
3393 | 96 #define RENAME(a) a ## _3DNow |
97 #include "aclib_template.c" | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
98 #endif |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
99 |
5208 | 100 //SSE versions (only used on SSE2 cpus) |
101 #ifdef COMPILE_SSE | |
102 #undef RENAME | |
103 #define HAVE_MMX | |
104 #define HAVE_MMX2 | |
105 #undef HAVE_3DNOW | |
106 #define HAVE_SSE | |
107 #define HAVE_SSE2 | |
108 #define RENAME(a) a ## _SSE | |
109 #include "aclib_template.c" | |
110 #endif | |
111 | |
3393 | 112 #endif // CAN_COMPILE_X86_ASM |
113 | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
114 |
23523
273aa6124f66
avoid utter breakage on non-x86, patch from Chris Roccati <roccati@at@pobox.dot.com>
lu_zero
parents:
21982
diff
changeset
|
115 #undef fast_memcpy |
7072 | 116 void * fast_memcpy(void * to, const void * from, size_t len) |
3393 | 117 { |
118 #ifdef RUNTIME_CPUDETECT | |
119 #ifdef CAN_COMPILE_X86_ASM | |
120 // ordered by speed, fastest first | |
5208 | 121 if(gCpuCaps.hasSSE2) |
122 fast_memcpy_SSE(to, from, len); | |
123 else if(gCpuCaps.hasMMX2) | |
3393 | 124 fast_memcpy_MMX2(to, from, len); |
125 else if(gCpuCaps.has3DNow) | |
126 fast_memcpy_3DNow(to, from, len); | |
127 else if(gCpuCaps.hasMMX) | |
128 fast_memcpy_MMX(to, from, len); | |
129 else | |
130 #endif //CAN_COMPILE_X86_ASM | |
131 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
132 #else | |
5208 | 133 #ifdef HAVE_SSE2 |
134 fast_memcpy_SSE(to, from, len); | |
135 #elif defined (HAVE_MMX2) | |
3393 | 136 fast_memcpy_MMX2(to, from, len); |
137 #elif defined (HAVE_3DNOW) | |
138 fast_memcpy_3DNow(to, from, len); | |
139 #elif defined (HAVE_MMX) | |
140 fast_memcpy_MMX(to, from, len); | |
141 #else | |
142 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
143 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
144 |
3393 | 145 #endif //!RUNTIME_CPUDETECT |
5543
c75f75806af1
memcpy must return destination ptr patch by Adam <adam@cfar.umd.edu>
michael
parents:
5208
diff
changeset
|
146 return to; |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
147 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
148 |
8127
e7153e62a7f4
On non-x86 platforms, memcpy was re-implemented in mplayer and was called
jkeil
parents:
8123
diff
changeset
|
149 #undef mem2agpcpy |
7072 | 150 void * mem2agpcpy(void * to, const void * from, size_t len) |
4681 | 151 { |
152 #ifdef RUNTIME_CPUDETECT | |
153 #ifdef CAN_COMPILE_X86_ASM | |
154 // ordered by speed, fastest first | |
5208 | 155 if(gCpuCaps.hasSSE2) |
156 mem2agpcpy_SSE(to, from, len); | |
157 else if(gCpuCaps.hasMMX2) | |
4681 | 158 mem2agpcpy_MMX2(to, from, len); |
159 else if(gCpuCaps.has3DNow) | |
160 mem2agpcpy_3DNow(to, from, len); | |
161 else if(gCpuCaps.hasMMX) | |
162 mem2agpcpy_MMX(to, from, len); | |
163 else | |
164 #endif //CAN_COMPILE_X86_ASM | |
165 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
166 #else | |
5208 | 167 #ifdef HAVE_SSE2 |
168 mem2agpcpy_SSE(to, from, len); | |
169 #elif defined (HAVE_MMX2) | |
4681 | 170 mem2agpcpy_MMX2(to, from, len); |
171 #elif defined (HAVE_3DNOW) | |
172 mem2agpcpy_3DNow(to, from, len); | |
173 #elif defined (HAVE_MMX) | |
174 mem2agpcpy_MMX(to, from, len); | |
175 #else | |
176 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
177 #endif | |
178 | |
179 #endif //!RUNTIME_CPUDETECT | |
8123
9fc45fe0d444
*HUGE* set of compiler warning fixes, unused variables removal
arpi
parents:
7072
diff
changeset
|
180 return to; |
4681 | 181 } |
182 | |
27341
e7c989f7a7c9
Start unifying names of internal preprocessor directives.
diego
parents:
23523
diff
changeset
|
183 #endif /* CONFIG_FASTMEMCPY */ |