comparison libvo/fastmemcpy.h @ 698:f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
author | nickols_k |
---|---|
date | Sat, 05 May 2001 13:30:00 +0000 |
parents | 32697fe58055 |
children | dc6cdb87229a |
697:6612e7cac0aa | 698:f0fbf1a9bf31 |
---|---|
1 #ifndef __MPLAYER_MEMCPY | 1 #ifndef __MPLAYER_MEMCPY |
2 #define __MPLAYER_MEMCPY 1 | 2 #define __MPLAYER_MEMCPY 1 |
3 | 3 |
4 #ifdef USE_FASTMEMCPY | 4 #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) |
5 #include <stddef.h> | 5 #include <stddef.h> |
6 | 6 |
7 // Enable this code if the SSE version works (is faster) for you! | 7 extern void * fast_memcpy(void * to, const void * from, size_t len); |
8 #if 0 | |
9 #ifdef HAVE_SSE | |
10 #define FASTMEMCPY_SSE | |
11 #endif | |
12 #endif | |
13 | |
14 /* | |
15 This part of the code was taken from Linux-2.4.3 and slightly modified |
16 for the MMX, MMX2 and SSE instruction sets. I have done this since Linux uses page-aligned |
17 blocks, whereas MPlayer uses weakly ordered data, and the original sources could not |
18 speed it up. Only using PREFETCHNTA and MOVNTQ together has an effect! |
19 | |
20 From IA-32 Intel Architecture Software Developer's Manual Volume 1, |
21 | |
22 Order Number 245470: | |
23 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions" | |
24 | |
25 Data referenced by a program can be temporal (data will be used again) or | |
26 non-temporal (data will be referenced once and not reused in the immediate | |
27 future). To make efficient use of the processor's caches, it is generally | |
28 desirable to cache temporal data and not cache non-temporal data. Overloading | |
29 the processor's caches with non-temporal data is sometimes referred to as | |
30 "polluting the caches". | |
31 The non-temporal data is written to memory with Write-Combining semantics. | |
32 | |
33 The PREFETCHh instructions permit a program to load data into the processor |
34 at a suggested cache level, so that it is closer to the processor's load and |
35 store unit when it is needed. If the data is already present in a level of |
36 the cache hierarchy that is closer to the processor, the PREFETCHh instruction |
37 will not result in any data movement. |
38 But we should use PREFETCHNTA: it fetches non-temporal data into a location |
39 close to the processor, minimizing cache pollution. |
40 | |
41 The MOVNTQ (store quadword using non-temporal hint) instruction stores | |
42 packed integer data from an MMX register to memory, using a non-temporal hint. | |
43 The MOVNTPS (store packed single-precision floating-point values using | |
44 non-temporal hint) instruction stores packed floating-point data from an | |
45 XMM register to memory, using a non-temporal hint. | |
46 | |
47 The SFENCE (Store Fence) instruction controls write ordering by creating a | |
48 fence for memory store operations. This instruction guarantees that the results | |
49 of every store instruction that precedes the store fence in program order are |
50 globally visible before any store instruction that follows the fence. The | |
51 SFENCE instruction provides an efficient way of ensuring ordering between | |
52 procedures that produce weakly-ordered data and procedures that consume that | |
53 data. | |
54 | |
55 If you have questions, please contact me: Nick Kurshev <nickols_k@mail.ru>. |
56 */ | |
57 | |
58 // 3dnow memcpy support from kernel 2.4.2 | |
59 // by Pontscho/fresh!mindworkz | |
60 | |
61 #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) | |
62 | |
63 #undef HAVE_MMX1 | |
64 #if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(FASTMEMCPY_SSE) | |
65 /* means: MMX v.1. Note: since we added alignment of the destination, this speeds |
66 up memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus the |
67 standard (non-MMX-optimized) version. |
68 Note: on K6-2+ it speeds up memory copying by up to 25%, and |
69 on K7 and P3 by about 500% (5 times). */ |
70 #define HAVE_MMX1 | |
71 #endif | |
72 | |
73 | |
74 #undef HAVE_K6_2PLUS | |
75 #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW) | |
76 #define HAVE_K6_2PLUS | |
77 #endif | |
78 | |
79 /* for small memory blocks (<256 bytes) this version is faster */ | |
80 #define small_memcpy(to,from,n)\ | |
81 {\ | |
82 __asm__ __volatile__(\ | |
83 "rep; movsb"\ | |
84 :"=D"(to), "=S"(from), "=c"(n)\ | |
85 /* It's the most portable way to notify the compiler */\ |
86 /* that edi, esi and ecx are clobbered in the asm block. */\ |
87 /* Thanks to A'rpi for the hint!!! */\ |
88 :"0" (to), "1" (from),"2" (n)\ | |
89 : "memory");\ | |
90 } | |
91 | |
92 #ifdef FASTMEMCPY_SSE | |
93 #define MMREG_SIZE 16 | |
94 #else | |
95 #define MMREG_SIZE 8 | |
96 #endif | |
97 | |
98 /* Small defines (for readability only) ;) */ | |
99 #ifdef HAVE_K6_2PLUS | |
100 #define PREFETCH "prefetch" | |
101 /* On K6, femms is faster than emms. On K7, femms maps directly onto emms. */ |
102 #define EMMS "femms" | |
103 #else | |
104 #define PREFETCH "prefetchnta" | |
105 #define EMMS "emms" | |
106 #endif | |
107 | |
108 #ifdef HAVE_MMX2 | |
109 #define MOVNTQ "movntq" | |
110 #else | |
111 #define MOVNTQ "movq" | |
112 #endif | |
113 | |
114 inline static void * fast_memcpy(void * to, const void * from, size_t len) | |
115 { | |
116 void *retval; | |
117 int i; | |
118 retval = to; | |
119 if(len >= 0x200) /* only for blocks of 512 bytes or more */ |
120 { | |
121 register unsigned long int delta; | |
122 /* Align destination to an MMREG_SIZE boundary */ |
123 delta = ((unsigned long int)to)&(MMREG_SIZE-1); | |
124 if(delta) | |
125 { | |
126 delta=MMREG_SIZE-delta; | |
127 len -= delta; | |
128 small_memcpy(to, from, delta); | |
129 } | |
130 i = len >> 6; /* len/64 */ | |
131 len&=63; | |
132 | |
133 #ifndef HAVE_MMX1 | |
134 __asm__ __volatile__ ( | |
135 PREFETCH" (%0)\n" | |
136 PREFETCH" 64(%0)\n" | |
137 PREFETCH" 128(%0)\n" | |
138 PREFETCH" 192(%0)\n" | |
139 PREFETCH" 256(%0)\n" | |
140 : : "r" (from) ); | |
141 #endif | |
142 /* |
143 This algorithm is most effective when the code sequentially |
144 reads and writes blocks that are the size of a cache line. |
145 The size of a cache line is processor-dependent. |
146 It will, however, be at least 32 bytes on any processor. |
147 It would be better for the number of instructions that |
148 perform the reads and writes to be a multiple of the number of |
149 the processor's decoders, but that is not always possible. |
150 */ |
151 #ifdef FASTMEMCPY_SSE /* Only P3 (maybe Cyrix III) */ |
152 if(((unsigned long)from) & 15) | |
153 /* if SRC is misaligned */ | |
154 for(; i>0; i--) | |
155 { | |
156 __asm__ __volatile__ ( | |
157 PREFETCH" 320(%0)\n" | |
158 "movups (%0), %%xmm0\n" | |
159 "movups 16(%0), %%xmm1\n" | |
160 "movntps %%xmm0, (%1)\n" | |
161 "movntps %%xmm1, 16(%1)\n" | |
162 "movups 32(%0), %%xmm0\n" | |
163 "movups 48(%0), %%xmm1\n" | |
164 "movntps %%xmm0, 32(%1)\n" | |
165 "movntps %%xmm1, 48(%1)\n" | |
166 :: "r" (from), "r" (to) : "memory"); | |
167 from = (const unsigned char *)from + 64; |
168 to = (unsigned char *)to + 64; |
169 } | |
170 else | |
171 /* |
172 Only if SRC is aligned on a 16-byte boundary. |
173 This allows the use of movaps instead of movups; movaps requires the data |
174 to be aligned, otherwise a general-protection exception (#GP) is generated. |
175 */ |
176 for(; i>0; i--) | |
177 { | |
178 __asm__ __volatile__ ( | |
179 PREFETCH" 320(%0)\n" | |
180 "movaps (%0), %%xmm0\n" | |
181 "movaps 16(%0), %%xmm1\n" | |
182 "movntps %%xmm0, (%1)\n" | |
183 "movntps %%xmm1, 16(%1)\n" | |
184 "movaps 32(%0), %%xmm0\n" | |
185 "movaps 48(%0), %%xmm1\n" | |
186 "movntps %%xmm0, 32(%1)\n" | |
187 "movntps %%xmm1, 48(%1)\n" | |
188 :: "r" (from), "r" (to) : "memory"); | |
189 from = (const unsigned char *)from + 64; |
190 to = (unsigned char *)to + 64; |
191 } | |
192 #else | |
193 for(; i>0; i--) | |
194 { | |
195 __asm__ __volatile__ ( | |
196 #ifndef HAVE_MMX1 | |
197 PREFETCH" 320(%0)\n" | |
198 #endif | |
199 "movq (%0), %%mm0\n" | |
200 "movq 8(%0), %%mm1\n" | |
201 "movq 16(%0), %%mm2\n" | |
202 "movq 24(%0), %%mm3\n" | |
203 MOVNTQ" %%mm0, (%1)\n" | |
204 MOVNTQ" %%mm1, 8(%1)\n" | |
205 MOVNTQ" %%mm2, 16(%1)\n" | |
206 MOVNTQ" %%mm3, 24(%1)\n" | |
207 "movq 32(%0), %%mm0\n" | |
208 "movq 40(%0), %%mm1\n" | |
209 "movq 48(%0), %%mm2\n" | |
210 "movq 56(%0), %%mm3\n" | |
211 MOVNTQ" %%mm0, 32(%1)\n" | |
212 MOVNTQ" %%mm1, 40(%1)\n" | |
213 MOVNTQ" %%mm2, 48(%1)\n" | |
214 MOVNTQ" %%mm3, 56(%1)\n" | |
215 :: "r" (from), "r" (to) : "memory"); | |
216 from = (const unsigned char *)from + 64; |
217 to = (unsigned char *)to + 64; |
218 } | |
219 #endif /* Have SSE */ | |
220 #ifdef HAVE_MMX2 | |
221 /* since movntq is weakly-ordered, an "sfence" |
222 * is needed to make the writes ordered again. */ |
223 __asm__ __volatile__ ("sfence":::"memory"); | |
224 #endif | |
225 #ifndef FASTMEMCPY_SSE | |
226 /* re-enable FPU use */ |
227 __asm__ __volatile__ (EMMS:::"memory"); | |
228 #endif | |
229 } | |
230 /* | |
231 * Now do the tail of the block | |
232 */ | |
233 if(len) small_memcpy(to, from, len); | |
234 return retval; | |
235 } | |
236 #define memcpy(a,b,c) fast_memcpy(a,b,c) | 8 #define memcpy(a,b,c) fast_memcpy(a,b,c) |
237 #undef small_memcpy | |
238 | 9 |
239 #endif | 10 #endif |
240 | 11 |
241 #endif | 12 #endif |
242 | |
243 #endif |
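
With this change, fastmemcpy.h shrinks to the extern declaration and the `#define memcpy(a,b,c) fast_memcpy(a,b,c)` remap, so the copy loops are compiled once in a separate file instead of being inlined into every includer (the size optimization named in the commit message). A minimal sketch of the caller side under that arrangement; the file name `any_caller.c` and the function `copy_frame` are illustrative, not MPlayer code:

```c
/* any_caller.c -- hypothetical translation unit consuming the new header. */
#include <string.h>       /* libc memcpy prototype (also provides size_t) */
#include "fastmemcpy.h"   /* on MMX/MMX2/3DNow! builds: memcpy -> fast_memcpy */

void copy_frame(void *dst, const void *src, size_t n)
{
    /* Expands to fast_memcpy(dst, src, n); the definition is linked in
     * from the new separate file rather than compiled here. */
    memcpy(dst, src, n);
}
```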
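The comment block removed by this change describes the PREFETCHNTA + MOVNTQ/MOVNTPS + SFENCE pattern. A minimal standalone sketch of the same idea using SSE intrinsics instead of inline asm, assuming 16-byte-aligned buffers and a length that is a multiple of 64; `stream_copy64` is a hypothetical name, not part of MPlayer:

```c
#include <stddef.h>
#include <xmmintrin.h>  /* _mm_prefetch, _mm_load_ps, _mm_stream_ps, _mm_sfence */

/* Copy len bytes with non-temporal stores, bypassing the caches on the
 * destination side -- the intrinsics equivalent of the aligned SSE loop
 * in the left column. Preconditions: 16-byte-aligned pointers, len % 64 == 0. */
static void stream_copy64(void *to, const void *from, size_t len)
{
    float *d = (float *)to;
    const float *s = (const float *)from;
    size_t i;
    for (i = 0; i < len / 64; i++) {
        /* prefetchnta: pull the source in well ahead, non-temporally */
        _mm_prefetch((const char *)s + 320, _MM_HINT_NTA);
        __m128 r0 = _mm_load_ps(s +  0);   /* movaps: aligned loads */
        __m128 r1 = _mm_load_ps(s +  4);
        __m128 r2 = _mm_load_ps(s +  8);
        __m128 r3 = _mm_load_ps(s + 12);
        _mm_stream_ps(d +  0, r0);         /* movntps: write around the cache */
        _mm_stream_ps(d +  4, r1);
        _mm_stream_ps(d +  8, r2);
        _mm_stream_ps(d + 12, r3);
        s += 16;
        d += 16;
    }
    _mm_sfence();  /* non-temporal stores are weakly ordered; fence before use */
}
```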
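The clobber note inside small_memcpy documents a general extended-asm idiom: tying each output to its input register tells GCC that EDI, ESI and ECX are both consumed and overwritten by `rep movsb`. A standalone sketch of that idiom, assuming GCC-style inline asm on x86; the function name is hypothetical:

```c
#include <stddef.h>

/* Byte copy via "rep movsb". The "0"/"1"/"2" input constraints tie each
 * input to the corresponding output, so the compiler knows rDI, rSI and
 * rCX hold the arguments on entry and are clobbered on exit. */
static inline void small_memcpy_sketch(void *to, const void *from, size_t n)
{
    __asm__ __volatile__(
        "rep; movsb"
        : "=D"(to), "=S"(from), "=c"(n)   /* outputs: updated pointers/count */
        : "0"(to), "1"(from), "2"(n)      /* inputs tied to the outputs */
        : "memory");
}
```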