Mercurial > mplayer.hg
annotate libvo/aclib.c @ 11296:86916e46d445
different / faster / simpler "quantization"
filtered images look like with the old quantization (to me at least) if anyone notices a difference then tell me ASAP
author | michael |
---|---|
date | Mon, 27 Oct 2003 21:12:29 +0000 |
parents | e7153e62a7f4 |
children | 4b8417674f1c |
rev | line source |
---|---|
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
1 #include "../config.h" |
3393 | 2 #ifdef USE_FASTMEMCPY |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
3 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
4 /* |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
5 aclib - advanced C library ;) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
6 This file contains functions which improve and expand the standard C library |
3393 | 7 see aclib_template.c ... this file only contains runtime cpu detection and config options stuff |
8 runtime cpu detection by michael niedermayer (michaelni@gmx.at) is under GPL | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
9 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
10 #include <stddef.h> |
3393 | 11 #include "../cpudetect.h" |
8123
9fc45fe0d444
*HUGE* set of compiler warning fixes, unused variables removal
arpi
parents:
7072
diff
changeset
|
12 #include "fastmemcpy.h" |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
13 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
14 #define BLOCK_SIZE 4096 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
15 #define CONFUSION_FACTOR 0 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
16 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :) |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
17 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
18 //#define STATISTICS |
3393 | 19 #ifdef ARCH_X86 |
20 #define CAN_COMPILE_X86_ASM | |
21 #endif | |
22 | |
23 //Note: we have MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one | |
24 //Plain C versions | |
25 //#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) | |
26 //#define COMPILE_C | |
27 //#endif | |
28 | |
29 #ifdef CAN_COMPILE_X86_ASM | |
30 | |
31 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) | |
32 #define COMPILE_MMX | |
33 #endif | |
34 | |
5208 | 35 #if (defined (HAVE_MMX2) && !defined (HAVE_SSE2)) || defined (RUNTIME_CPUDETECT) |
3393 | 36 #define COMPILE_MMX2 |
37 #endif | |
38 | |
39 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) | |
40 #define COMPILE_3DNOW | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
41 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
42 |
5208 | 43 #if defined (HAVE_SSE2) || defined (RUNTIME_CPUDETECT) |
44 #define COMPILE_SSE | |
45 #endif | |
46 | |
3393 | 47 #undef HAVE_MMX |
48 #undef HAVE_MMX2 | |
49 #undef HAVE_3DNOW | |
5208 | 50 #undef HAVE_SSE |
51 #undef HAVE_SSE2 | |
3393 | 52 #undef ARCH_X86 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
53 /* |
3393 | 54 #ifdef COMPILE_C |
55 #undef HAVE_MMX | |
56 #undef HAVE_MMX2 | |
57 #undef HAVE_3DNOW | |
58 #undef ARCH_X86 | |
59 #define RENAME(a) a ## _C | |
60 #include "aclib_template.c" | |
61 #endif | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
62 */ |
3393 | 63 //MMX versions |
64 #ifdef COMPILE_MMX | |
65 #undef RENAME | |
66 #define HAVE_MMX | |
67 #undef HAVE_MMX2 | |
68 #undef HAVE_3DNOW | |
5208 | 69 #undef HAVE_SSE |
70 #undef HAVE_SSE2 | |
3393 | 71 #define ARCH_X86 |
72 #define RENAME(a) a ## _MMX | |
73 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
74 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
75 |
3393 | 76 //MMX2 versions |
77 #ifdef COMPILE_MMX2 | |
78 #undef RENAME | |
79 #define HAVE_MMX | |
80 #define HAVE_MMX2 | |
81 #undef HAVE_3DNOW | |
5208 | 82 #undef HAVE_SSE |
83 #undef HAVE_SSE2 | |
3393 | 84 #define ARCH_X86 |
85 #define RENAME(a) a ## _MMX2 | |
86 #include "aclib_template.c" | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
87 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
88 |
3393 | 89 //3DNOW versions |
90 #ifdef COMPILE_3DNOW | |
91 #undef RENAME | |
92 #define HAVE_MMX | |
93 #undef HAVE_MMX2 | |
94 #define HAVE_3DNOW | |
5208 | 95 #undef HAVE_SSE |
96 #undef HAVE_SSE2 | |
3393 | 97 #define ARCH_X86 |
98 #define RENAME(a) a ## _3DNow | |
99 #include "aclib_template.c" | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
100 #endif |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
101 |
5208 | 102 //SSE versions (only used on SSE2 cpus) |
103 #ifdef COMPILE_SSE | |
104 #undef RENAME | |
105 #define HAVE_MMX | |
106 #define HAVE_MMX2 | |
107 #undef HAVE_3DNOW | |
108 #define HAVE_SSE | |
109 #define HAVE_SSE2 | |
110 #define ARCH_X86 | |
111 #define RENAME(a) a ## _SSE | |
112 #include "aclib_template.c" | |
113 #endif | |
114 | |
3393 | 115 #endif // CAN_COMPILE_X86_ASM |
116 | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
117 |
7072 | 118 void * fast_memcpy(void * to, const void * from, size_t len) |
3393 | 119 { |
120 #ifdef RUNTIME_CPUDETECT | |
121 #ifdef CAN_COMPILE_X86_ASM | |
122 // runtime dispatch on gCpuCaps: try the copy variants in order of speed, fastest first | |
5208 | 123 if(gCpuCaps.hasSSE2) |
124 fast_memcpy_SSE(to, from, len); | |
125 else if(gCpuCaps.hasMMX2) | |
3393 | 126 fast_memcpy_MMX2(to, from, len); |
127 else if(gCpuCaps.has3DNow) | |
128 fast_memcpy_3DNow(to, from, len); | |
129 else if(gCpuCaps.hasMMX) | |
130 fast_memcpy_MMX(to, from, len); | |
131 else | |
132 #endif //CAN_COMPILE_X86_ASM | |
133 memcpy(to, from, len); // no MMX-class capability detected (or no x86 asm compiled in): use the standard memcpy | |
134 #else | |
5208 | 135 #ifdef HAVE_SSE2 |
136 fast_memcpy_SSE(to, from, len); | |
137 #elif defined (HAVE_MMX2) | |
3393 | 138 fast_memcpy_MMX2(to, from, len); |
139 #elif defined (HAVE_3DNOW) | |
140 fast_memcpy_3DNow(to, from, len); | |
141 #elif defined (HAVE_MMX) | |
142 fast_memcpy_MMX(to, from, len); | |
143 #else | |
144 memcpy(to, from, len); // none of HAVE_SSE2/HAVE_MMX2/HAVE_3DNOW/HAVE_MMX is defined: use the standard memcpy | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
145 #endif |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
146 |
3393 | 147 #endif //!RUNTIME_CPUDETECT |
5543
c75f75806af1
memcpy must return destination ptr patch by Adam <adam@cfar.umd.edu>
michael
parents:
5208
diff
changeset
|
148 return to; |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
149 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
150 |
8127
e7153e62a7f4
On non-x86 platforms, memcpy was re-implemented in mplayer and was called
jkeil
parents:
8123
diff
changeset
|
151 #undef mem2agpcpy |
7072 | 152 void * mem2agpcpy(void * to, const void * from, size_t len) |
4681 | 153 { |
154 #ifdef RUNTIME_CPUDETECT | |
155 #ifdef CAN_COMPILE_X86_ASM | |
156 // runtime dispatch on gCpuCaps: try the copy variants in order of speed, fastest first | |
5208 | 157 if(gCpuCaps.hasSSE2) |
158 mem2agpcpy_SSE(to, from, len); | |
159 else if(gCpuCaps.hasMMX2) | |
4681 | 160 mem2agpcpy_MMX2(to, from, len); |
161 else if(gCpuCaps.has3DNow) | |
162 mem2agpcpy_3DNow(to, from, len); | |
163 else if(gCpuCaps.hasMMX) | |
164 mem2agpcpy_MMX(to, from, len); | |
165 else | |
166 #endif //CAN_COMPILE_X86_ASM | |
167 memcpy(to, from, len); // no MMX-class capability detected (or no x86 asm compiled in): use the standard memcpy | |
168 #else | |
5208 | 169 #ifdef HAVE_SSE2 |
170 mem2agpcpy_SSE(to, from, len); | |
171 #elif defined (HAVE_MMX2) | |
4681 | 172 mem2agpcpy_MMX2(to, from, len); |
173 #elif defined (HAVE_3DNOW) | |
174 mem2agpcpy_3DNow(to, from, len); | |
175 #elif defined (HAVE_MMX) | |
176 mem2agpcpy_MMX(to, from, len); | |
177 #else | |
178 memcpy(to, from, len); // none of HAVE_SSE2/HAVE_MMX2/HAVE_3DNOW/HAVE_MMX is defined: use the standard memcpy | |
179 #endif | |
180 | |
181 #endif //!RUNTIME_CPUDETECT | |
8123
9fc45fe0d444
*HUGE* set of compiler warning fixes, unused variables removal
arpi
parents:
7072
diff
changeset
|
182 return to; |
4681 | 183 } |
184 | |
185 #endif /* use fastmemcpy */ | |
186 |