Mercurial > mplayer.hg
annotate libvo/aclib.c @ 4515:4064940f3f9b
optimization
author | nick |
---|---|
date | Sun, 03 Feb 2002 15:16:46 +0000 |
parents | 3624cd351618 |
children | 8db59073127e |
rev | line source |
---|---|
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
1 #include "../config.h" |
3393 | 2 #ifdef USE_FASTMEMCPY |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
3 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
4 /* |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
5 aclib - advanced C library ;) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
6 This file contains functions which improve and expand standard C-library |
3393 | 7 see aclib_template.c ... this file only contains runtime cpu detection and config options stuff |
8 runtime cpu detection by michael niedermayer (michaelni@gmx.at) is under GPL | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
9 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
10 #include <stddef.h> |
3393 | 11 #include "../cpudetect.h" |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
12 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
/*
 * Tuning knobs. Feel free to fine-tune these two values; it might be
 * possible to get some speedup with them.
 * NOTE(review): nothing in this file reads them directly -- presumably they
 * are consumed by aclib_template.c, which is included below. Confirm there
 * before changing them.
 */
#define BLOCK_SIZE 4096
#define CONFUSION_FACTOR 0

/* STATISTICS is left undefined; remove the comment marker to define it. */
//#define STATISTICS

/* The x86 assembly variants are only built when the target is x86. */
#if defined(ARCH_X86)
#define CAN_COMPILE_X86_ASM
#endif

/*
 * Available variants: MMX, MMX2 and 3DNOW. There is no combined
 * 3DNOW+MMX2 variant.
 *
 * The plain C variant is currently disabled; the directives that would
 * select it are kept here for reference:
 *
 *   #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
 *   #define COMPILE_C
 *   #endif
 */

#if defined(CAN_COMPILE_X86_ASM)

/*
 * Decide which variants to build. With RUNTIME_CPUDETECT every variant is
 * built; otherwise exactly the one matching the configured HAVE_* macros.
 */
#if defined(RUNTIME_CPUDETECT) || (defined(HAVE_MMX) && !defined(HAVE_3DNOW) && !defined(HAVE_MMX2))
#define COMPILE_MMX
#endif

#if defined(RUNTIME_CPUDETECT) || defined(HAVE_MMX2)
#define COMPILE_MMX2
#endif

#if defined(RUNTIME_CPUDETECT) || (defined(HAVE_3DNOW) && !defined(HAVE_MMX2))
#define COMPILE_3DNOW
#endif

/*
 * Drop the configured feature macros. Each variant section below defines
 * exactly the set it wants before including the template.
 */
#undef ARCH_X86
#undef HAVE_3DNOW
#undef HAVE_MMX2
#undef HAVE_MMX

/*
 * Disabled plain C variant, kept for reference:
 *
 *   #ifdef COMPILE_C
 *   #undef HAVE_MMX
 *   #undef HAVE_MMX2
 *   #undef HAVE_3DNOW
 *   #undef ARCH_X86
 *   #define RENAME(a) a ## _C
 *   #include "aclib_template.c"
 *   #endif
 */

/* MMX variant: template symbols get the _MMX suffix. */
#if defined(COMPILE_MMX)
#undef RENAME
#define RENAME(a) a ## _MMX
#define ARCH_X86
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#include "aclib_template.c"
#endif

/* MMX2 variant: template symbols get the _MMX2 suffix. */
#if defined(COMPILE_MMX2)
#undef RENAME
#define RENAME(a) a ## _MMX2
#define ARCH_X86
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#include "aclib_template.c"
#endif

/* 3DNOW variant: template symbols get the _3DNow suffix. */
#if defined(COMPILE_3DNOW)
#undef RENAME
#define RENAME(a) a ## _3DNow
#define ARCH_X86
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#include "aclib_template.c"
#endif

#endif /* CAN_COMPILE_X86_ASM */
91 | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
92 |
/*
 * Copy len bytes from `from` to `to` using the fastest copy routine
 * available.
 *
 * With RUNTIME_CPUDETECT the routine is chosen on every call from gCpuCaps
 * (MMX2 first, then 3DNow, then MMX). Without it the choice is fixed at
 * compile time by HAVE_MMX2 / HAVE_3DNOW / HAVE_MMX. When no accelerated
 * variant applies, the standard memcpy is used.
 *
 * Returns `to`, the same value memcpy returns.
 *
 * This is deliberately not declared `inline`: under C99 and later a plain
 * `inline` definition provides no external definition, so callers in other
 * translation units would fail to link.
 *
 * NOTE(review): no <string.h> include is visible in this file; confirm that
 * memcpy is declared through config.h or aclib_template.c.
 */
void *fast_memcpy(void *to, const void *from, size_t len)
{
#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
    // ordered by speed, fastest first
    if (gCpuCaps.hasMMX2) {
        fast_memcpy_MMX2(to, from, len);
    } else if (gCpuCaps.has3DNow) {
        fast_memcpy_3DNow(to, from, len);
    } else if (gCpuCaps.hasMMX) {
        fast_memcpy_MMX(to, from, len);
    } else
#endif //CAN_COMPILE_X86_ASM
    {
        memcpy(to, from, len); // CPUs without MMX use the standard memcpy
    }
#else
#ifdef HAVE_MMX2
    fast_memcpy_MMX2(to, from, len);
#elif defined (HAVE_3DNOW)
    fast_memcpy_3DNow(to, from, len);
#elif defined (HAVE_MMX)
    fast_memcpy_MMX(to, from, len);
#else
    memcpy(to, from, len); // without MMX we use the standard memcpy
#endif

#endif //!RUNTIME_CPUDETECT
    // Every path copies into `to`; hand it back so the declared void *
    // result is always defined.
    return to;
}
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
120 |
3393 | 121 #endif /* use fastmemcpy */ |