Mercurial > mplayer.hg
annotate libvo/aclib.c @ 6539:79b536a37e40
better planar support, chroma subsampling support and Y8/Y800 support
author | alex |
---|---|
date | Sun, 23 Jun 2002 21:08:31 +0000 |
parents | c75f75806af1 |
children | 113d66d78967 |
rev | line source |
---|---|
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
1 #include "../config.h" |
3393 | 2 #ifdef USE_FASTMEMCPY |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
3 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
4 /* |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
5 aclib - advanced C library ;) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
6 This file contains functions which improve and expand standard C-library |
3393 | 7 See aclib_template.c; this file only contains the runtime CPU detection and the configuration-option handling. |
8 runtime cpu detection by michael niedermayer (michaelni@gmx.at) is under GPL | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
9 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
10 #include <stddef.h> |
3393 | 11 #include "../cpudetect.h" |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
12 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
13 #define BLOCK_SIZE 4096 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
14 #define CONFUSION_FACTOR 0 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
15 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :) |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
16 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
17 //#define STATISTICS |
// CAN_COMPILE_X86_ASM is set only when ARCH_X86 is defined (presumably by ../config.h).
// It gates both the template instantiations and the runtime-dispatch branches further down.
3393 | 18 #ifdef ARCH_X86 |
19 #define CAN_COMPILE_X86_ASM | |
20 #endif | |
21 | |
22 //Note: we have MMX, MMX2 and 3DNOW versions; there is no combined 3DNOW+MMX2 one | |
23 //Plain C versions | |
24 //#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) | |
25 //#define COMPILE_C | |
26 //#endif | |
27 | |
28 #ifdef CAN_COMPILE_X86_ASM | |
29 | |
// Decide which template variants get compiled.
// With RUNTIME_CPUDETECT every variant is built; otherwise only the variants
// matching the configured HAVE_* macros are selected.
// Plain MMX: only when neither 3DNow nor MMX2 is configured.
30 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) | |
31 #define COMPILE_MMX | |
32 #endif | |
33 | |
// MMX2: skipped when SSE2 is configured.
5208 | 34 #if (defined (HAVE_MMX2) && !defined (HAVE_SSE2)) || defined (RUNTIME_CPUDETECT) |
3393 | 35 #define COMPILE_MMX2 |
36 #endif | |
37 | |
// 3DNow: skipped when MMX2 is configured.
38 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) | |
39 #define COMPILE_3DNOW | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
40 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
41 |
// SSE variant: selected by HAVE_SSE2 (not HAVE_SSE).
5208 | 42 #if defined (HAVE_SSE2) || defined (RUNTIME_CPUDETECT) |
43 #define COMPILE_SSE | |
44 #endif | |
45 | |
// Clear the configured feature macros so that every instantiation that follows
// can define its own set before including the template.
3393 | 46 #undef HAVE_MMX |
47 #undef HAVE_MMX2 | |
48 #undef HAVE_3DNOW | |
5208 | 49 #undef HAVE_SSE |
50 #undef HAVE_SSE2 | |
3393 | 51 #undef ARCH_X86 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
52 /* |
3393 | 53 #ifdef COMPILE_C |
54 #undef HAVE_MMX | |
55 #undef HAVE_MMX2 | |
56 #undef HAVE_3DNOW | |
57 #undef ARCH_X86 | |
58 #define RENAME(a) a ## _C | |
59 #include "aclib_template.c" | |
60 #endif | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
61 */ |
3393 | 62 //MMX versions |
// Includes the template with HAVE_MMX as the only feature macro defined.
// RENAME() appends _MMX; presumably the template wraps its function names in
// RENAME, yielding the fast_memcpy_MMX / mem2agpcpy_MMX symbols called by the
// dispatchers in this file (confirm against aclib_template.c).
63 #ifdef COMPILE_MMX | |
64 #undef RENAME | |
65 #define HAVE_MMX | |
66 #undef HAVE_MMX2 | |
67 #undef HAVE_3DNOW | |
5208 | 68 #undef HAVE_SSE |
69 #undef HAVE_SSE2 | |
3393 | 70 #define ARCH_X86 |
71 #define RENAME(a) a ## _MMX | |
72 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
73 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
74 |
3393 | 75 //MMX2 versions |
// Includes the template with HAVE_MMX and HAVE_MMX2 defined (3DNow/SSE/SSE2 undefined).
// RENAME() appends _MMX2; presumably the template wraps its function names in
// RENAME, yielding the fast_memcpy_MMX2 / mem2agpcpy_MMX2 symbols called by the
// dispatchers in this file (confirm against aclib_template.c).
76 #ifdef COMPILE_MMX2 | |
77 #undef RENAME | |
78 #define HAVE_MMX | |
79 #define HAVE_MMX2 | |
80 #undef HAVE_3DNOW | |
5208 | 81 #undef HAVE_SSE |
82 #undef HAVE_SSE2 | |
3393 | 83 #define ARCH_X86 |
84 #define RENAME(a) a ## _MMX2 | |
85 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
86 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
87 |
3393 | 88 //3DNOW versions |
// Includes the template with HAVE_MMX and HAVE_3DNOW defined (MMX2/SSE/SSE2 undefined).
// RENAME() appends _3DNow (mixed case); presumably the template wraps its function
// names in RENAME, yielding the fast_memcpy_3DNow / mem2agpcpy_3DNow symbols called
// by the dispatchers in this file (confirm against aclib_template.c).
89 #ifdef COMPILE_3DNOW | |
90 #undef RENAME | |
91 #define HAVE_MMX | |
92 #undef HAVE_MMX2 | |
93 #define HAVE_3DNOW | |
5208 | 94 #undef HAVE_SSE |
95 #undef HAVE_SSE2 | |
3393 | 96 #define ARCH_X86 |
97 #define RENAME(a) a ## _3DNow | |
98 #include "aclib_template.c" | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
99 #endif |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
100 |
5208 | 101 //SSE versions (only used on SSE2 cpus) |
// Includes the template with HAVE_MMX, HAVE_MMX2, HAVE_SSE and HAVE_SSE2 defined
// (3DNow undefined). RENAME() appends _SSE; presumably the template wraps its function
// names in RENAME, yielding the fast_memcpy_SSE / mem2agpcpy_SSE symbols that the
// dispatchers in this file call when SSE2 is available (confirm against aclib_template.c).
102 #ifdef COMPILE_SSE | |
103 #undef RENAME | |
104 #define HAVE_MMX | |
105 #define HAVE_MMX2 | |
106 #undef HAVE_3DNOW | |
107 #define HAVE_SSE | |
108 #define HAVE_SSE2 | |
109 #define ARCH_X86 | |
110 #define RENAME(a) a ## _SSE | |
111 #include "aclib_template.c" | |
112 #endif | |
113 | |
3393 | 114 #endif // CAN_COMPILE_X86_ASM |
115 | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
116 |
/**
 * Copy len bytes from `from` to `to`, dispatching to the fastest variant.
 *
 * With RUNTIME_CPUDETECT the variant is picked on every call from the
 * gCpuCaps flags, in the order SSE2, MMX2, 3DNow, MMX. Without it the
 * variant is fixed at compile time by the HAVE_* macros in effect at this
 * point of the file. When no accelerated variant applies, the standard
 * memcpy is used.
 *
 * Deliberately not declared `inline`: under C99/C11 rules a plain `inline`
 * definition does not emit an external symbol, so callers in other
 * translation units could fail to link.
 *
 * @param to    destination buffer
 * @param from  source buffer
 * @param len   number of bytes to copy
 * @return      `to`, mirroring the memcpy contract
 */
void *fast_memcpy(void *to, const void *from, size_t len)
{
#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
    // ordered by speed, fastest first
    if (gCpuCaps.hasSSE2) {
        fast_memcpy_SSE(to, from, len);
    } else if (gCpuCaps.hasMMX2) {
        fast_memcpy_MMX2(to, from, len);
    } else if (gCpuCaps.has3DNow) {
        fast_memcpy_3DNow(to, from, len);
    } else if (gCpuCaps.hasMMX) {
        fast_memcpy_MMX(to, from, len);
    } else
#endif //CAN_COMPILE_X86_ASM
    {
        // the braces keep this fallback well-formed with or without the chain above
        memcpy(to, from, len); // prior to MMX we use the standard memcpy
    }
#else
#ifdef HAVE_SSE2
    fast_memcpy_SSE(to, from, len);
#elif defined (HAVE_MMX2)
    fast_memcpy_MMX2(to, from, len);
#elif defined (HAVE_3DNOW)
    fast_memcpy_3DNow(to, from, len);
#elif defined (HAVE_MMX)
    fast_memcpy_MMX(to, from, len);
#else
    memcpy(to, from, len); // prior to MMX we use the standard memcpy
#endif

#endif //!RUNTIME_CPUDETECT
    return to;
}
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
149 |
/**
 * Copy len bytes from `from` to `to`, dispatching to the fastest
 * mem2agpcpy_* variant. Judging by the name, this is the copy routine meant
 * for system-memory to AGP-memory transfers (verify against callers).
 *
 * With RUNTIME_CPUDETECT the variant is picked on every call from the
 * gCpuCaps flags, in the order SSE2, MMX2, 3DNow, MMX. Without it the
 * variant is fixed at compile time by the HAVE_* macros in effect at this
 * point of the file. When no accelerated variant applies, the standard
 * memcpy is used.
 *
 * Fix: the function is declared to return void * but previously had no
 * return statement, so using its result was undefined behaviour. It now
 * returns `to`, like memcpy.
 *
 * Deliberately not declared `inline`: under C99/C11 rules a plain `inline`
 * definition does not emit an external symbol, so callers in other
 * translation units could fail to link.
 *
 * @param to    destination buffer
 * @param from  source buffer
 * @param len   number of bytes to copy
 * @return      `to`, mirroring the memcpy contract
 */
void *mem2agpcpy(void *to, const void *from, size_t len)
{
#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
    // ordered by speed, fastest first
    if (gCpuCaps.hasSSE2) {
        mem2agpcpy_SSE(to, from, len);
    } else if (gCpuCaps.hasMMX2) {
        mem2agpcpy_MMX2(to, from, len);
    } else if (gCpuCaps.has3DNow) {
        mem2agpcpy_3DNow(to, from, len);
    } else if (gCpuCaps.hasMMX) {
        mem2agpcpy_MMX(to, from, len);
    } else
#endif //CAN_COMPILE_X86_ASM
    {
        // the braces keep this fallback well-formed with or without the chain above
        memcpy(to, from, len); // prior to MMX we use the standard memcpy
    }
#else
#ifdef HAVE_SSE2
    mem2agpcpy_SSE(to, from, len);
#elif defined (HAVE_MMX2)
    mem2agpcpy_MMX2(to, from, len);
#elif defined (HAVE_3DNOW)
    mem2agpcpy_3DNow(to, from, len);
#elif defined (HAVE_MMX)
    mem2agpcpy_MMX(to, from, len);
#else
    memcpy(to, from, len); // prior to MMX we use the standard memcpy
#endif

#endif //!RUNTIME_CPUDETECT
    return to;
}
181 | |
182 #endif /* use fastmemcpy */ | |
183 |