Mercurial > mplayer.hg
annotate libvo/aclib.c @ 15952:7a33ae1f8e6d
--Patch by Stefan '1stein' Schuermans <1stein@schuermans.info>:
the bugfix of the "grayscale" output scheme introduced a bug in the header
writer for the stream output, this patch corrects that
author | rik |
---|---|
date | Sun, 10 Jul 2005 12:35:43 +0000 |
parents | e047e70a9767 |
children | 6289755ce7c7 |
rev | line source |
---|---|
12650
ac3fd2ff2561
Unify the config.h #include, use "config.h" instead of "../config.h"
diego
parents:
12492
diff
changeset
|
1 #include "config.h" |
3393 | 2 #ifdef USE_FASTMEMCPY |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
3 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
4 /* |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
5 aclib - advanced C library ;) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
6 This file contains functions which improve and expand standard C-library |
3393 | 7 see aclib_template.c ... this file only contains runtime cpu detection and config options stuff |
8 runtime cpu detection by michael niedermayer (michaelni@gmx.at) is under GPL | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
9 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
10 #include <stddef.h> |
13787
e047e70a9767
Handle "xxx.h" vs "../xxx.h" include paths in a consistent way.
diego
parents:
13720
diff
changeset
|
11 #include "cpudetect.h" |
8123
9fc45fe0d444
*HUGE* set of compiler warning fixes, unused variables removal
arpi
parents:
7072
diff
changeset
|
12 #include "fastmemcpy.h" |
12492
4b8417674f1c
fix crash due to fast_memcpy calling itself instead of libc memcpy
reimar
parents:
8127
diff
changeset
|
13 #undef memcpy |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
14 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
15 #define BLOCK_SIZE 4096 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
16 #define CONFUSION_FACTOR 0 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
17 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :) |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
18 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
19 //#define STATISTICS |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12650
diff
changeset
|
20 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
3393 | 21 #define CAN_COMPILE_X86_ASM |
22 #endif | |
23 | |
24 //Note: we have MMX, MMX2 and 3DNOW versions; there is no combined 3DNOW+MMX2 one | |
25 //Plain C versions | |
26 //#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) | |
27 //#define COMPILE_C | |
28 //#endif | |
29 | |
30 #ifdef CAN_COMPILE_X86_ASM | |
31 | |
32 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) | |
33 #define COMPILE_MMX | |
34 #endif | |
35 | |
5208 | 36 #if (defined (HAVE_MMX2) && !defined (HAVE_SSE2)) || defined (RUNTIME_CPUDETECT) |
3393 | 37 #define COMPILE_MMX2 |
38 #endif | |
39 | |
40 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) | |
41 #define COMPILE_3DNOW | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
42 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
43 |
5208 | 44 #if defined (HAVE_SSE2) || defined (RUNTIME_CPUDETECT) |
45 #define COMPILE_SSE | |
46 #endif | |
47 | |
3393 | 48 #undef HAVE_MMX |
49 #undef HAVE_MMX2 | |
50 #undef HAVE_3DNOW | |
5208 | 51 #undef HAVE_SSE |
52 #undef HAVE_SSE2 | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
53 /* |
3393 | 54 #ifdef COMPILE_C |
55 #undef HAVE_MMX | |
56 #undef HAVE_MMX2 | |
57 #undef HAVE_3DNOW | |
58 #undef ARCH_X86 | |
59 #define RENAME(a) a ## _C | |
60 #include "aclib_template.c" | |
61 #endif | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
62 */ |
3393 | 63 //MMX versions |
64 #ifdef COMPILE_MMX | |
65 #undef RENAME | |
66 #define HAVE_MMX | |
67 #undef HAVE_MMX2 | |
68 #undef HAVE_3DNOW | |
5208 | 69 #undef HAVE_SSE |
70 #undef HAVE_SSE2 | |
3393 | 71 #define RENAME(a) a ## _MMX |
72 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
73 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
74 |
3393 | 75 //MMX2 versions |
76 #ifdef COMPILE_MMX2 | |
77 #undef RENAME | |
78 #define HAVE_MMX | |
79 #define HAVE_MMX2 | |
80 #undef HAVE_3DNOW | |
5208 | 81 #undef HAVE_SSE |
82 #undef HAVE_SSE2 | |
3393 | 83 #define RENAME(a) a ## _MMX2 |
84 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
85 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
86 |
3393 | 87 //3DNOW versions |
88 #ifdef COMPILE_3DNOW | |
89 #undef RENAME | |
90 #define HAVE_MMX | |
91 #undef HAVE_MMX2 | |
92 #define HAVE_3DNOW | |
5208 | 93 #undef HAVE_SSE |
94 #undef HAVE_SSE2 | |
3393 | 95 #define RENAME(a) a ## _3DNow |
96 #include "aclib_template.c" | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
97 #endif |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
98 |
5208 | 99 //SSE versions (only used on SSE2 cpus) |
100 #ifdef COMPILE_SSE | |
101 #undef RENAME | |
102 #define HAVE_MMX | |
103 #define HAVE_MMX2 | |
104 #undef HAVE_3DNOW | |
105 #define HAVE_SSE | |
106 #define HAVE_SSE2 | |
107 #define RENAME(a) a ## _SSE | |
108 #include "aclib_template.c" | |
109 #endif | |
110 | |
3393 | 111 #endif // CAN_COMPILE_X86_ASM |
112 | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
113 |
7072 | 114 void * fast_memcpy(void * to, const void * from, size_t len) |
3393 | 115 { |
116 #ifdef RUNTIME_CPUDETECT | |
117 #ifdef CAN_COMPILE_X86_ASM | |
118 // ordered by speed, fastest first | |
5208 | 119 if(gCpuCaps.hasSSE2) |
120 fast_memcpy_SSE(to, from, len); | |
121 else if(gCpuCaps.hasMMX2) | |
3393 | 122 fast_memcpy_MMX2(to, from, len); |
123 else if(gCpuCaps.has3DNow) | |
124 fast_memcpy_3DNow(to, from, len); | |
125 else if(gCpuCaps.hasMMX) | |
126 fast_memcpy_MMX(to, from, len); | |
127 else | |
128 #endif //CAN_COMPILE_X86_ASM | |
129 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
130 #else | |
5208 | 131 #ifdef HAVE_SSE2 |
132 fast_memcpy_SSE(to, from, len); | |
133 #elif defined (HAVE_MMX2) | |
3393 | 134 fast_memcpy_MMX2(to, from, len); |
135 #elif defined (HAVE_3DNOW) | |
136 fast_memcpy_3DNow(to, from, len); | |
137 #elif defined (HAVE_MMX) | |
138 fast_memcpy_MMX(to, from, len); | |
139 #else | |
140 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
141 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
142 |
3393 | 143 #endif //!RUNTIME_CPUDETECT |
5543
c75f75806af1
memcpy must return destination ptr patch by Adam <adam@cfar.umd.edu>
michael
parents:
5208
diff
changeset
|
144 return to; |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
145 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
146 |
8127
e7153e62a7f4
On non-x86 platforms, memcpy was re-implemented in mplayer and was called
jkeil
parents:
8123
diff
changeset
|
147 #undef mem2agpcpy |
7072 | 148 void * mem2agpcpy(void * to, const void * from, size_t len) |
4681 | 149 { |
150 #ifdef RUNTIME_CPUDETECT | |
151 #ifdef CAN_COMPILE_X86_ASM | |
152 // ordered by speed, fastest first | |
5208 | 153 if(gCpuCaps.hasSSE2) |
154 mem2agpcpy_SSE(to, from, len); | |
155 else if(gCpuCaps.hasMMX2) | |
4681 | 156 mem2agpcpy_MMX2(to, from, len); |
157 else if(gCpuCaps.has3DNow) | |
158 mem2agpcpy_3DNow(to, from, len); | |
159 else if(gCpuCaps.hasMMX) | |
160 mem2agpcpy_MMX(to, from, len); | |
161 else | |
162 #endif //CAN_COMPILE_X86_ASM | |
163 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
164 #else | |
5208 | 165 #ifdef HAVE_SSE2 |
166 mem2agpcpy_SSE(to, from, len); | |
167 #elif defined (HAVE_MMX2) | |
4681 | 168 mem2agpcpy_MMX2(to, from, len); |
169 #elif defined (HAVE_3DNOW) | |
170 mem2agpcpy_3DNow(to, from, len); | |
171 #elif defined (HAVE_MMX) | |
172 mem2agpcpy_MMX(to, from, len); | |
173 #else | |
174 memcpy(to, from, len); // prior to MMX we use the standard memcpy | |
175 #endif | |
176 | |
177 #endif //!RUNTIME_CPUDETECT | |
8123
9fc45fe0d444
*HUGE* set of compiler warning fixes, unused variables removal
arpi
parents:
7072
diff
changeset
|
178 return to; |
4681 | 179 } |
180 | |
181 #endif /* use fastmemcpy */ | |
182 |