annotate libvo/aclib.c @ 37195:ac6c37d85d65 default tip

configure: Fix initialization of variable def_local_aligned_32 It contiained the #define of HAVE_LOCAL_ALIGNED_16 instead of HAVE_LOCAL_ALIGNED_32.
author al
date Sun, 28 Sep 2014 18:38:41 +0000
parents 4e2f4bd081ce
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
28446
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
1 /*
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
2 * aclib - advanced C library ;)
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
3 * Functions which improve and expand the standard C library, see aclib_template.c.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
4 * This file only contains runtime CPU detection and config option stuff.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
5 * runtime CPU detection by Michael Niedermayer (michaelni@gmx.at)
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
6 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
7 * This file is part of MPlayer.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
8 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
9 * MPlayer is free software; you can redistribute it and/or modify
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
10 * it under the terms of the GNU General Public License as published by
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
11 * the Free Software Foundation; either version 2 of the License, or
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
12 * (at your option) any later version.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
13 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
14 * MPlayer is distributed in the hope that it will be useful,
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
17 * GNU General Public License for more details.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
18 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
19 * You should have received a copy of the GNU General Public License along
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
20 * with MPlayer; if not, write to the Free Software Foundation, Inc.,
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
22 */
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
23
12650
ac3fd2ff2561 Unify the config.h #include, use "config.h" instead of "../config.h"
diego
parents: 12492
diff changeset
24 #include "config.h"
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
25 #include <stddef.h>
30135
807fce7a4bb3 Do not assume that "long" is the size of a register.
reimar
parents: 29114
diff changeset
26 #include <stdint.h>
21982
fa66a03e8920 Include string.h to make sure memcpy is not used without prototype
reimar
parents: 20577
diff changeset
27 #include <string.h>
13787
e047e70a9767 Handle "xxx.h" vs "../xxx.h" include paths in a consistent way.
diego
parents: 13720
diff changeset
28 #include "cpudetect.h"
8123
9fc45fe0d444 *HUGE* set of compiler warning fixes, unused variables removal
arpi
parents: 7072
diff changeset
29 #include "fastmemcpy.h"
12492
4b8417674f1c fix crash due to fast_memcpy calling itself instead of libc memcpy
reimar
parents: 8127
diff changeset
30 #undef memcpy
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
31
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
32 #define BLOCK_SIZE 4096
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
33 #define CONFUSION_FACTOR 0
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
34 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
35
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
36 //#define STATISTICS
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
37
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
38 //Note: we have MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
39 //Plain C versions
29114
06540eb5ef6a Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents: 28921
diff changeset
40 //#if !HAVE_MMX || CONFIG_RUNTIME_CPUDETECT
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
41 //#define COMPILE_C
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
42 //#endif
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
43
28921
62f0032e736a Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents: 28448
diff changeset
44 #if ARCH_X86
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
45
29114
06540eb5ef6a Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents: 28921
diff changeset
46 #if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
47 #define COMPILE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
48 #endif
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
49
29114
06540eb5ef6a Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents: 28921
diff changeset
50 #if (HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
51 #define COMPILE_MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
52 #endif
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
53
29114
06540eb5ef6a Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents: 28921
diff changeset
54 #if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
55 #define COMPILE_3DNOW
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
56 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
57
29114
06540eb5ef6a Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents: 28921
diff changeset
58 #if HAVE_SSE2 || CONFIG_RUNTIME_CPUDETECT
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
59 #define COMPILE_SSE
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
60 #endif
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
61
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
62 #undef HAVE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
63 #undef HAVE_MMX2
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
64 #undef HAVE_AMD3DNOW
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
65 #undef HAVE_SSE
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
66 #undef HAVE_SSE2
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
67 #define HAVE_MMX 0
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
68 #define HAVE_MMX2 0
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
69 #define HAVE_AMD3DNOW 0
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
70 #define HAVE_SSE 0
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
71 #define HAVE_SSE2 0
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
72 /*
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
73 #ifdef COMPILE_C
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
74 #undef HAVE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
75 #undef HAVE_MMX2
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
76 #undef HAVE_AMD3DNOW
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
77 #undef HAVE_SSE
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
78 #undef HAVE_SSE2
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
79 #define HAVE_MMX 0
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
80 #define HAVE_MMX2 0
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
81 #define HAVE_AMD3DNOW 0
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
82 #define HAVE_SSE 0
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
83 #define HAVE_SSE2 0
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
84 #define RENAME(a) a ## _C
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
85 #include "aclib_template.c"
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
86 #endif
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
87 */
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
88 //MMX versions
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
89 #ifdef COMPILE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
90 #undef RENAME
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
91 #undef HAVE_MMX
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
92 #undef HAVE_MMX2
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
93 #undef HAVE_AMD3DNOW
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
94 #undef HAVE_SSE
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
95 #undef HAVE_SSE2
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
96 #define HAVE_MMX 1
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
97 #define HAVE_MMX2 0
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
98 #define HAVE_AMD3DNOW 0
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
99 #define HAVE_SSE 0
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
100 #define HAVE_SSE2 0
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
101 #define RENAME(a) a ## _MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
102 #include "aclib_template.c"
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
103 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
104
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
105 //MMX2 versions
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
106 #ifdef COMPILE_MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
107 #undef RENAME
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
108 #undef HAVE_MMX
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
109 #undef HAVE_MMX2
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
110 #undef HAVE_AMD3DNOW
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
111 #undef HAVE_SSE
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
112 #undef HAVE_SSE2
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
113 #define HAVE_MMX 1
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
114 #define HAVE_MMX2 1
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
115 #define HAVE_AMD3DNOW 0
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
116 #define HAVE_SSE 0
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
117 #define HAVE_SSE2 0
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
118 #define RENAME(a) a ## _MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
119 #include "aclib_template.c"
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
120 #endif
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
121
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
122 //3DNOW versions
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
123 #ifdef COMPILE_3DNOW
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
124 #undef RENAME
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
125 #undef HAVE_MMX
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
126 #undef HAVE_MMX2
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
127 #undef HAVE_AMD3DNOW
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
128 #undef HAVE_SSE
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
129 #undef HAVE_SSE2
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
130 #define HAVE_MMX 1
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
131 #define HAVE_MMX2 0
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
132 #define HAVE_AMD3DNOW 1
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
133 #define HAVE_SSE 0
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
134 #define HAVE_SSE2 0
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
135 #define RENAME(a) a ## _3DNow
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
136 #include "aclib_template.c"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
137 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
138
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
139 //SSE versions (only used on SSE2 cpus)
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
140 #ifdef COMPILE_SSE
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
141 #undef RENAME
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
142 #undef HAVE_MMX
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
143 #undef HAVE_MMX2
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
144 #undef HAVE_AMD3DNOW
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
145 #undef HAVE_SSE
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
146 #undef HAVE_SSE2
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
147 #define HAVE_MMX 1
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
148 #define HAVE_MMX2 1
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
149 #define HAVE_AMD3DNOW 0
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
150 #define HAVE_SSE 1
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
151 #define HAVE_SSE2 1
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
152 #define RENAME(a) a ## _SSE
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
153 #include "aclib_template.c"
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
154 #endif
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
155
28921
62f0032e736a Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents: 28448
diff changeset
156 #endif /* ARCH_X86 */
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
157
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
158
23523
273aa6124f66 avoid utter breakage on non-x86, patch from Chris Roccati <roccati@at@pobox.dot.com>
lu_zero
parents: 21982
diff changeset
159 #undef fast_memcpy
7072
113d66d78967 removed nonsense 'inline'
arpi
parents: 5543
diff changeset
160 void * fast_memcpy(void * to, const void * from, size_t len)
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
161 {
29114
06540eb5ef6a Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents: 28921
diff changeset
162 #if CONFIG_RUNTIME_CPUDETECT
28921
62f0032e736a Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents: 28448
diff changeset
163 #if ARCH_X86
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
164 // ordered per speed fasterst first
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
165 if(gCpuCaps.hasSSE2)
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
166 fast_memcpy_SSE(to, from, len);
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
167 else if(gCpuCaps.hasMMX2)
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
168 fast_memcpy_MMX2(to, from, len);
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
169 else if(gCpuCaps.has3DNow)
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
170 fast_memcpy_3DNow(to, from, len);
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
171 else if(gCpuCaps.hasMMX)
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
172 fast_memcpy_MMX(to, from, len);
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
173 else
28921
62f0032e736a Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents: 28448
diff changeset
174 #endif
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
175 memcpy(to, from, len); // prior to mmx we use the standart memcpy
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
176 #else
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
177 #if HAVE_SSE2
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
178 fast_memcpy_SSE(to, from, len);
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
179 #elif HAVE_MMX2
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
180 fast_memcpy_MMX2(to, from, len);
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
181 #elif HAVE_AMD3DNOW
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
182 fast_memcpy_3DNow(to, from, len);
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
183 #elif HAVE_MMX
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
184 fast_memcpy_MMX(to, from, len);
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
185 #else
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
186 memcpy(to, from, len); // prior to mmx we use the standart memcpy
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
187 #endif
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
188
29114
06540eb5ef6a Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents: 28921
diff changeset
189 #endif //!CONFIG_RUNTIME_CPUDETECT
5543
c75f75806af1 memcpy must return destination ptr patch by Adam <adam@cfar.umd.edu>
michael
parents: 5208
diff changeset
190 return to;
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
191 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
192
8127
e7153e62a7f4 On non-x86 platforms, memcpy was re-implemented in mplayer and was called
jkeil
parents: 8123
diff changeset
193 #undef mem2agpcpy
7072
113d66d78967 removed nonsense 'inline'
arpi
parents: 5543
diff changeset
194 void * mem2agpcpy(void * to, const void * from, size_t len)
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
195 {
29114
06540eb5ef6a Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents: 28921
diff changeset
196 #if CONFIG_RUNTIME_CPUDETECT
28921
62f0032e736a Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents: 28448
diff changeset
197 #if ARCH_X86
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
198 // ordered per speed fasterst first
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
199 if(gCpuCaps.hasSSE2)
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
200 mem2agpcpy_SSE(to, from, len);
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
201 else if(gCpuCaps.hasMMX2)
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
202 mem2agpcpy_MMX2(to, from, len);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
203 else if(gCpuCaps.has3DNow)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
204 mem2agpcpy_3DNow(to, from, len);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
205 else if(gCpuCaps.hasMMX)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
206 mem2agpcpy_MMX(to, from, len);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
207 else
28921
62f0032e736a Get rid of pointless preprocessor condition indirection and use ARCH_X86
diego
parents: 28448
diff changeset
208 #endif
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
209 memcpy(to, from, len); // prior to mmx we use the standart memcpy
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
210 #else
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
211 #if HAVE_SSE2
5208
b08228af4098 fixing runtime cpudetect with pre SSE cpus
michael
parents: 4681
diff changeset
212 mem2agpcpy_SSE(to, from, len);
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
213 #elif HAVE_MMX2
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
214 mem2agpcpy_MMX2(to, from, len);
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28302
diff changeset
215 #elif HAVE_AMD3DNOW
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
216 mem2agpcpy_3DNow(to, from, len);
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27341
diff changeset
217 #elif HAVE_MMX
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
218 mem2agpcpy_MMX(to, from, len);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
219 #else
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
220 memcpy(to, from, len); // prior to mmx we use the standart memcpy
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
221 #endif
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
222
29114
06540eb5ef6a Rename RUNTIME_CPUDETECT to CONFIG_RUNTIME_CPUDETECT and always define it.
ramiro
parents: 28921
diff changeset
223 #endif //!CONFIG_RUNTIME_CPUDETECT
8123
9fc45fe0d444 *HUGE* set of compiler warning fixes, unused variables removal
arpi
parents: 7072
diff changeset
224 return to;
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
225 }