annotate libvo/aclib.c @ 4515:4064940f3f9b

optimization
author nick
date Sun, 03 Feb 2002 15:16:46 +0000
parents 3624cd351618
children 8db59073127e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
1 #include "../config.h"
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
2 #ifdef USE_FASTMEMCPY
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
3
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
4 /*
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
5 aclib - advanced C library ;)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
6 This file contains functions which improve and extend the standard C library
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
7 see aclib_template.c ... this file only contains runtime cpu detection and config options stuff
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
8 runtime cpu detection by michael niedermayer (michaelni@gmx.at) is under GPL
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
9 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
10 #include <stddef.h>
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
11 #include "../cpudetect.h"
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
12
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
13 #define BLOCK_SIZE 4096
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
14 #define CONFUSION_FACTOR 0
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
15 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
16
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
17 //#define STATISTICS
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
18
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
19 #ifdef ARCH_X86
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
20 #define CAN_COMPILE_X86_ASM
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
21 #endif
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
22
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
23 //Note: we have MMX, MMX2 and 3DNOW versions; there is no combined 3DNOW+MMX2 one
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
24 //Plain C versions
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
25 //#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
26 //#define COMPILE_C
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
27 //#endif
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
28
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
29 #ifdef CAN_COMPILE_X86_ASM
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
30
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
31 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
32 #define COMPILE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
33 #endif
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
34
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
35 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
36 #define COMPILE_MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
37 #endif
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
38
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
39 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
40 #define COMPILE_3DNOW
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
41 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
42
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
43 #undef HAVE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
44 #undef HAVE_MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
45 #undef HAVE_3DNOW
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
46 #undef ARCH_X86
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
47 /*
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
48 #ifdef COMPILE_C
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
49 #undef HAVE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
50 #undef HAVE_MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
51 #undef HAVE_3DNOW
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
52 #undef ARCH_X86
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
53 #define RENAME(a) a ## _C
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
54 #include "aclib_template.c"
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
55 #endif
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
56 */
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
57 //MMX versions
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
58 #ifdef COMPILE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
59 #undef RENAME
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
60 #define HAVE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
61 #undef HAVE_MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
62 #undef HAVE_3DNOW
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
63 #define ARCH_X86
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
64 #define RENAME(a) a ## _MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
65 #include "aclib_template.c"
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
66 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
67
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
68 //MMX2 versions
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
69 #ifdef COMPILE_MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
70 #undef RENAME
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
71 #define HAVE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
72 #define HAVE_MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
73 #undef HAVE_3DNOW
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
74 #define ARCH_X86
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
75 #define RENAME(a) a ## _MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
76 #include "aclib_template.c"
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
77 #endif
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
78
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
79 //3DNOW versions
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
80 #ifdef COMPILE_3DNOW
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
81 #undef RENAME
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
82 #define HAVE_MMX
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
83 #undef HAVE_MMX2
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
84 #define HAVE_3DNOW
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
85 #define ARCH_X86
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
86 #define RENAME(a) a ## _3DNow
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
87 #include "aclib_template.c"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
88 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
89
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
90 #endif // CAN_COMPILE_X86_ASM
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
91
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
92
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
/**
 * Copy len bytes from `from` to `to` using the fastest available variant.
 *
 * With RUNTIME_CPUDETECT the variant is picked on every call from the
 * gCpuCaps flags, tried in the order MMX2, 3DNow, MMX.  Without it the
 * variant is fixed at compile time by whichever of HAVE_MMX2, HAVE_3DNOW
 * or HAVE_MMX is defined at this point in the file.  If none applies, the
 * standard memcpy is used.
 *
 * @param to    destination buffer
 * @param from  source buffer
 * @param len   number of bytes to copy
 * @return      `to`, mirroring the memcpy contract (previously the function
 *              fell off the end without returning anything)
 *
 * Not declared `inline`: this is the externally visible entry point, and a
 * plain `inline` definition does not emit an external symbol under C99/C11.
 */
void * fast_memcpy(void * to, const void * from, size_t len)
{
#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
	// ordered by speed, fastest first
	if(gCpuCaps.hasMMX2)
		fast_memcpy_MMX2(to, from, len);
	else if(gCpuCaps.has3DNow)
		fast_memcpy_3DNow(to, from, len);
	else if(gCpuCaps.hasMMX)
		fast_memcpy_MMX(to, from, len);
	else
#endif //CAN_COMPILE_X86_ASM
		memcpy(to, from, len); // prior to MMX we use the standard memcpy
#else
#ifdef HAVE_MMX2
	fast_memcpy_MMX2(to, from, len);
#elif defined (HAVE_3DNOW)
	fast_memcpy_3DNow(to, from, len);
#elif defined (HAVE_MMX)
	fast_memcpy_MMX(to, from, len);
#else
	memcpy(to, from, len); // prior to MMX we use the standard memcpy
#endif

#endif //!RUNTIME_CPUDETECT
	return to;
}
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
120
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
121 #endif /* use fastmemcpy */