comparison libvo/fastmemcpy.h @ 698:f0fbf1a9bf31

Moving fast_memcpy to separate file (Size optimization)
author nickols_k
date Sat, 05 May 2001 13:30:00 +0000
parents 32697fe58055
children dc6cdb87229a
697:6612e7cac0aa | 698:f0fbf1a9bf31
1 #ifndef __MPLAYER_MEMCPY | 1 #ifndef __MPLAYER_MEMCPY
2 #define __MPLAYER_MEMCPY 1 | 2 #define __MPLAYER_MEMCPY 1
3 | 3
4 #ifdef USE_FASTMEMCPY | 4 #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX )
5 #include <stddef.h> | 5 #include <stddef.h>
6 | 6
7 // Enable this code if the SSE version works (and is faster) for you! | 7 extern void * fast_memcpy(void * to, const void * from, size_t len);
8 #if 0
9 #ifdef HAVE_SSE
10 #define FASTMEMCPY_SSE
11 #endif
12 #endif
13
14 /*
15 This part of the code was taken from Linux 2.4.3 and slightly modified
16 for the MMX, MMX2 and SSE instruction sets. I did this because Linux copies
17 page-aligned blocks, while MPlayer works with weakly-ordered data that the
18 original routines cannot speed up. Only using PREFETCHNTA and MOVNTQ together has an effect!
19
20 From the IA-32 Intel Architecture Software Developer's Manual, Volume 1,
21
22 Order Number 245470:
23 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
24
25 Data referenced by a program can be temporal (data will be used again) or
26 non-temporal (data will be referenced once and not reused in the immediate
27 future). To make efficient use of the processor's caches, it is generally
28 desirable to cache temporal data and not cache non-temporal data. Overloading
29 the processor's caches with non-temporal data is sometimes referred to as
30 "polluting the caches".
31 The non-temporal data is written to memory with Write-Combining semantics.
32
33 The PREFETCHh instruction permits a program to load data into the processor
34 at a suggested cache level, so that it is closer to the processor's load and
35 store unit when it is needed. If the data is already present in a level of
36 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
37 will not result in any data movement.
38 But we should use PREFETCHNTA: it fetches non-temporal data into a location
39 close to the processor, minimizing cache pollution.
40
41 The MOVNTQ (store quadword using non-temporal hint) instruction stores
42 packed integer data from an MMX register to memory, using a non-temporal hint.
43 The MOVNTPS (store packed single-precision floating-point values using
44 non-temporal hint) instruction stores packed floating-point data from an
45 XMM register to memory, using a non-temporal hint.
46
47 The SFENCE (Store Fence) instruction controls write ordering by creating a
48 fence for memory store operations. This instruction guarantees that the results
49 of every store instruction that precedes the store fence in program order are
50 globally visible before any store instruction that follows the fence. The
51 SFENCE instruction provides an efficient way of ensuring ordering between
52 procedures that produce weakly-ordered data and procedures that consume that
53 data.
54
55 If you have questions, please contact me: Nick Kurshev <nickols_k@mail.ru>.
56 */
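/*
  A minimal illustrative sketch (not taken from the original sources) of the
  pattern described above: PREFETCHNTA ahead of the reads, non-temporal
  stores for the writes, and SFENCE at the end, written with the SSE
  intrinsics from <xmmintrin.h>. The name nt_copy_aligned64 is hypothetical,
  and it assumes both buffers are 16-byte aligned and the size is a multiple
  of 64 bytes; the real fast_memcpy below handles the general case.
*/
#if 0 /* illustration only, never compiled */
#include <xmmintrin.h>

static void nt_copy_aligned64(float *dst, const float *src, size_t bytes)
{
    size_t i;
    __m128 a, b, c, d;
    for (i = 0; i < bytes; i += 64) {
        /* pull the next chunk towards the CPU without polluting the caches */
        _mm_prefetch((const char *)src + i + 320, _MM_HINT_NTA);
        a = _mm_load_ps(src + i / 4);
        b = _mm_load_ps(src + i / 4 + 4);
        c = _mm_load_ps(src + i / 4 + 8);
        d = _mm_load_ps(src + i / 4 + 12);
        /* movntps: non-temporal stores that bypass the cache */
        _mm_stream_ps(dst + i / 4,      a);
        _mm_stream_ps(dst + i / 4 + 4,  b);
        _mm_stream_ps(dst + i / 4 + 8,  c);
        _mm_stream_ps(dst + i / 4 + 12, d);
    }
    /* make the weakly-ordered stores globally visible before returning */
    _mm_sfence();
}
#endif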
57
58 // 3dnow memcpy support from kernel 2.4.2
59 // by Pontscho/fresh!mindworkz
60
61 #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX )
62
63 #undef HAVE_MMX1
64 #if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(FASTMEMCPY_SSE)
65 /* means: MMX v.1. Note: since we added alignment of the destination, this
66 speeds up memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus the
67 standard (non-MMX-optimized) version.
68 Note: on K6-2+ it speeds up memory copying by up to 25% and
69 on K7 and P3 by about 500% (5 times). */
70 #define HAVE_MMX1
71 #endif
72
73
74 #undef HAVE_K6_2PLUS
75 #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
76 #define HAVE_K6_2PLUS
77 #endif
78
79 /* for small memory blocks (<256 bytes) this version is faster */
80 #define small_memcpy(to,from,n)\
81 {\
82 __asm__ __volatile__(\
83 "rep; movsb"\
84 :"=D"(to), "=S"(from), "=c"(n)\
85 /* It's the most portable way to notify the compiler */\
86 /* that edi, esi and ecx are clobbered in the asm block. */\
87 /* Thanks to A'rpi for the hint! */\
88 :"0" (to), "1" (from),"2" (n)\
89 : "memory");\
90 }
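/*
  For illustration (not original code): the portable C equivalent of
  small_memcpy is simply

      unsigned char *d = to;
      const unsigned char *s = from;
      while (n--) *d++ = *s++;

  The "0"/"1"/"2" input constraints tie the inputs to the same registers as
  the "=D"/"=S"/"=c" outputs, so the compiler knows that EDI, ESI and ECX are
  both consumed and overwritten by the rep movsb.
*/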
91
92 #ifdef FASTMEMCPY_SSE
93 #define MMREG_SIZE 16
94 #else
95 #define MMREG_SIZE 8
96 #endif
97
98 /* Small defines (for readability only) ;) */
99 #ifdef HAVE_K6_2PLUS
100 #define PREFETCH "prefetch"
101 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
102 #define EMMS "femms"
103 #else
104 #define PREFETCH "prefetchnta"
105 #define EMMS "emms"
106 #endif
107
108 #ifdef HAVE_MMX2
109 #define MOVNTQ "movntq"
110 #else
111 #define MOVNTQ "movq"
112 #endif
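/*
  For reference, the selections above give roughly the following instruction
  mix once the asm strings are concatenated (illustration only):
      K6-2 / K6-III (3DNow!, no MMX2):  PREFETCH = "prefetch",    MOVNTQ = "movq",   EMMS = "femms"
      MMX2 / SSE capable CPUs:          PREFETCH = "prefetchnta", MOVNTQ = "movntq", EMMS = "emms"
      plain MMX (HAVE_MMX1):            prefetch is skipped,      MOVNTQ = "movq",   EMMS = "emms"
*/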
113
114 inline static void * fast_memcpy(void * to, const void * from, size_t len)
115 {
116 void *retval;
117 int i;
118 retval = to;
119 if(len >= 0x200) /* use the block path only for copies of at least 512 bytes */
120 {
121 register unsigned long int delta;
122 /* Align destination to an MMREG_SIZE boundary */
123 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
124 if(delta)
125 {
126 delta=MMREG_SIZE-delta;
127 len -= delta;
128 small_memcpy(to, from, delta);
129 }
130 i = len >> 6; /* len/64 */
131 len&=63;
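/*
  Worked example with hypothetical values (MMREG_SIZE == 8, to == 0x1003,
  len == 768):
      delta = 0x1003 & 7 = 3;  delta = 8 - 3 = 5   (5 bytes copied by
                                                    small_memcpy, "to" is then
                                                    8-byte aligned)
      len   = 768 - 5   = 763
      i     = 763 >> 6  = 11                       (eleven 64-byte blocks)
      len   = 763 & 63  = 59                       (59-byte tail handled at
                                                    the end of the function)
*/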
132
133 #ifndef HAVE_MMX1
134 __asm__ __volatile__ (
135 PREFETCH" (%0)\n"
136 PREFETCH" 64(%0)\n"
137 PREFETCH" 128(%0)\n"
138 PREFETCH" 192(%0)\n"
139 PREFETCH" 256(%0)\n"
140 : : "r" (from) );
141 #endif
142 /*
143 This algorithm is most effective when the code sequentially
144 reads and writes blocks that have the size of a cache line.
145 The size of a cache line is processor-dependent.
146 It will, however, be at least 32 bytes on any processor.
147 It would be better for the number of read and write
148 instructions to be a multiple of the number of the
149 processor's decoders, but that is not always possible.
150 */
151 #ifdef FASTMEMCPY_SSE /* Only P3 (maybe Cyrix III) */
152 if(((unsigned long)from) & 15)
153 /* if SRC is misaligned */
154 for(; i>0; i--)
155 {
156 __asm__ __volatile__ (
157 PREFETCH" 320(%0)\n"
158 "movups (%0), %%xmm0\n"
159 "movups 16(%0), %%xmm1\n"
160 "movntps %%xmm0, (%1)\n"
161 "movntps %%xmm1, 16(%1)\n"
162 "movups 32(%0), %%xmm0\n"
163 "movups 48(%0), %%xmm1\n"
164 "movntps %%xmm0, 32(%1)\n"
165 "movntps %%xmm1, 48(%1)\n"
166 :: "r" (from), "r" (to) : "memory");
167 from = ((const unsigned char *)from)+64;
168 to = ((unsigned char *)to)+64;
169 }
170 else
171 /*
172 Only if SRC is aligned on a 16-byte boundary.
173 This allows the use of movaps instead of movups; movaps requires the data
174 to be aligned, otherwise a general-protection exception (#GP) is generated.
175 */
176 for(; i>0; i--)
177 {
178 __asm__ __volatile__ (
179 PREFETCH" 320(%0)\n"
180 "movaps (%0), %%xmm0\n"
181 "movaps 16(%0), %%xmm1\n"
182 "movntps %%xmm0, (%1)\n"
183 "movntps %%xmm1, 16(%1)\n"
184 "movaps 32(%0), %%xmm0\n"
185 "movaps 48(%0), %%xmm1\n"
186 "movntps %%xmm0, 32(%1)\n"
187 "movntps %%xmm1, 48(%1)\n"
188 :: "r" (from), "r" (to) : "memory");
189 from = ((const unsigned char *)from)+64;
190 to = ((unsigned char *)to)+64;
191 }
192 #else
193 for(; i>0; i--)
194 {
195 __asm__ __volatile__ (
196 #ifndef HAVE_MMX1
197 PREFETCH" 320(%0)\n"
198 #endif
199 "movq (%0), %%mm0\n"
200 "movq 8(%0), %%mm1\n"
201 "movq 16(%0), %%mm2\n"
202 "movq 24(%0), %%mm3\n"
203 MOVNTQ" %%mm0, (%1)\n"
204 MOVNTQ" %%mm1, 8(%1)\n"
205 MOVNTQ" %%mm2, 16(%1)\n"
206 MOVNTQ" %%mm3, 24(%1)\n"
207 "movq 32(%0), %%mm0\n"
208 "movq 40(%0), %%mm1\n"
209 "movq 48(%0), %%mm2\n"
210 "movq 56(%0), %%mm3\n"
211 MOVNTQ" %%mm0, 32(%1)\n"
212 MOVNTQ" %%mm1, 40(%1)\n"
213 MOVNTQ" %%mm2, 48(%1)\n"
214 MOVNTQ" %%mm3, 56(%1)\n"
215 :: "r" (from), "r" (to) : "memory");
216 from = ((const unsigned char *)from)+64;
217 to = ((unsigned char *)to)+64;
218 }
219 #endif /* Have SSE */
220 #ifdef HAVE_MMX2
221 /* since movntq is weakly-ordered, an "sfence"
222 * is needed to become ordered again. */
223 __asm__ __volatile__ ("sfence":::"memory");
224 #endif
225 #ifndef FASTMEMCPY_SSE
226 /* allows the FPU to be used again */
227 __asm__ __volatile__ (EMMS:::"memory");
228 #endif
229 }
230 /*
231 * Now do the tail of the block
232 */
233 if(len) small_memcpy(to, from, len);
234 return retval;
235 }
236 #define memcpy(a,b,c) fast_memcpy(a,b,c) | 8 #define memcpy(a,b,c) fast_memcpy(a,b,c)
237 #undef small_memcpy
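/*
  Usage sketch (hypothetical caller, illustrative names): with USE_FASTMEMCPY
  and one of HAVE_MMX / HAVE_3DNOW / HAVE_MMX2 defined, every plain memcpy()
  call in a file that includes this header is redirected by the macro above:

      #include "fastmemcpy.h"

      memcpy(dst_frame, src_frame, frame_size);
          ...compiles as...
      fast_memcpy(dst_frame, src_frame, frame_size);
*/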
238 | 9
239 #endif | 10 #endif
240 | 11
241 #endif | 12 #endif
242
243 #endif