comparison libvo/aclib.c @ 698:f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
author      nickols_k
date        Sat, 05 May 2001 13:30:00 +0000
parents
children    5b69dabe5823
comparing 697:6612e7cac0aa with 698:f0fbf1a9bf31
#include "../config.h"

#ifdef USE_FASTMEMCPY
/*
  aclib - advanced C library ;)
  This file contains functions which improve on and extend the standard C library.
*/

#include <stddef.h>

/* The SSE code path is disabled by default. If the SSE version works
   (and is faster) for you, change this #if 1 to #if 0 to enable it. */
#if 1
#undef HAVE_SSE
#endif


/*
  I took this part of the code from Linux 2.4.3 and slightly modified it
  for the MMX, MMX2 and SSE instruction sets. I did this because Linux
  copies page-aligned blocks, while mplayer works with weakly-ordered data
  that the original sources cannot speed up. Only PREFETCHNTA and MOVNTQ
  used together have an effect!

  From the IA-32 Intel Architecture Software Developer's Manual Volume 1,
  Order Number 245470:
  "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

  Data referenced by a program can be temporal (data will be used again) or
  non-temporal (data will be referenced once and not reused in the immediate
  future). To make efficient use of the processor's caches, it is generally
  desirable to cache temporal data and not cache non-temporal data. Overloading
  the processor's caches with non-temporal data is sometimes referred to as
  "polluting the caches".
  The non-temporal data is written to memory with Write-Combining semantics.

  The PREFETCHh instructions permit a program to load data into the processor
  at a suggested cache level, so that it is closer to the processor's load and
  store unit when it is needed. If the data is already present in a level of
  the cache hierarchy that is closer to the processor, the PREFETCHh instruction
  will not result in any data movement.
  But we should use PREFETCHNTA: it fetches non-temporal data into a location
  close to the processor, minimizing cache pollution.

  The MOVNTQ (store quadword using non-temporal hint) instruction stores
  packed integer data from an MMX register to memory, using a non-temporal hint.
  The MOVNTPS (store packed single-precision floating-point values using
  non-temporal hint) instruction stores packed floating-point data from an
  XMM register to memory, using a non-temporal hint.

  The SFENCE (Store Fence) instruction controls write ordering by creating a
  fence for memory store operations. This instruction guarantees that the results
  of every store instruction that precedes the store fence in program order are
  globally visible before any store instruction that follows the fence. The
  SFENCE instruction provides an efficient way of ensuring ordering between
  procedures that produce weakly-ordered data and procedures that consume that
  data.

  If you have questions, please contact me: Nick Kurshev <nickols_k@mail.ru>.
*/
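
/*
  A minimal illustrative sketch (not part of this file) of the same idea
  expressed with SSE intrinsics: prefetch ahead with a non-temporal hint,
  stream the stores past the cache, then fence. It assumes a 16-byte-aligned
  src and a length that is a multiple of 16 bytes; the function name is
  hypothetical.
*/
#if 0
#include <xmmintrin.h>
static void nt_copy_sketch(void *dst, const void *src, size_t len)
{
	float *d = (float *)dst;
	const float *s = (const float *)src;
	size_t i;
	for(i = 0; i < len/16; i++)
	{
		_mm_prefetch((const char *)s + 320, _MM_HINT_NTA); /* prefetchnta */
		_mm_stream_ps(d, _mm_load_ps(s)); /* movaps load, movntps store */
		s += 4;
		d += 4;
	}
	_mm_sfence(); /* make the weakly-ordered stores globally visible */
}
#endif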

// 3dnow memcpy support from kernel 2.4.2
// by Pontscho/fresh!mindworkz

#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX )

#undef HAVE_MMX1
#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
/* Means: MMX v.1. Note: since we added alignment of the destination, this
   speeds up memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus
   the standard (non-MMX-optimized) version.
   Note: on K6-2+ it speeds up memory copying by up to 25%, and
   on K7 and P3 by about 500% (5 times). */
#define HAVE_MMX1
#endif


#undef HAVE_K6_2PLUS
#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
#define HAVE_K6_2PLUS
#endif

/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
register unsigned long int dummy;\
__asm__ __volatile__(\
	"rep; movsb"\
	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
/* This is the most portable way to notify the compiler */\
/* that edi, esi and ecx are clobbered in the asm block. */\
/* Thanks to A'rpi for the hint!!! */\
	:"0" (to), "1" (from),"2" (n)\
	: "memory");\
}
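/* In the macro above, "rep movsb" copies ECX bytes from [ESI] to [EDI],
   advancing both pointers as it goes; the "0"/"1"/"2" input constraints
   preload those registers from to, from and n. The direction flag is
   assumed clear (a forward copy), which the ABI guarantees on entry. */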

#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Small defines (for readability only) ;) */
#ifdef HAVE_K6_2PLUS
#define PREFETCH "prefetch"
/* On K6, femms is faster than emms. On K7, femms maps directly onto emms. */
#define EMMS "femms"
#else
#define PREFETCH "prefetchnta"
#define EMMS "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

#ifdef HAVE_MMX1
#define MIN_LEN 0x800 /* 2K blocks */
#else
#define MIN_LEN 0x40 /* 64-byte blocks */
#endif
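/* A plausible reading of these thresholds (an assumption, not from the
   original sources): the plain-MMX path has neither prefetch nor
   non-temporal stores, so its setup overhead only pays off for large
   (2K+) blocks; with prefetch available, one 64-byte unrolled iteration
   is already worthwhile. */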

void * fast_memcpy(void * to, const void * from, size_t len)
{
	void *retval;
	size_t i;
	retval = to;
#ifndef HAVE_MMX1
	/* PREFETCH has an effect even for the MOVSB instruction ;) */
	__asm__ __volatile__ (
		PREFETCH" (%0)\n"
		PREFETCH" 64(%0)\n"
		PREFETCH" 128(%0)\n"
		PREFETCH" 192(%0)\n"
		PREFETCH" 256(%0)\n"
		: : "r" (from) );
#endif
	if(len >= MIN_LEN)
	{
		register unsigned long int delta;
		/* Align the destination to an MMREG_SIZE boundary (e.g. with
		   MMREG_SIZE 8 and a destination address ending in ...5, delta
		   becomes 3 and those 3 bytes are copied bytewise first). */
		delta = ((unsigned long int)to)&(MMREG_SIZE-1);
		if(delta)
		{
			delta=MMREG_SIZE-delta;
			len -= delta;
			small_memcpy(to, from, delta);
		}
		i = len >> 6; /* len/64 */
		len&=63;
		/*
		   This algorithm is most effective when the code reads and
		   writes cache-line-sized blocks back to back. The cache-line
		   size is processor-dependent, but it is at least 32 bytes on
		   any of these processors. Ideally the number of read and
		   write instructions would be a multiple of the number of the
		   processor's decoders, but that is not always possible.
		*/
#ifdef HAVE_SSE /* Only P3 (maybe Cyrix3) */
	if(((unsigned long)from) & 15)
	/* if SRC is misaligned */
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		PREFETCH" 320(%0)\n"
		"movups (%0), %%xmm0\n"
		"movups 16(%0), %%xmm1\n"
		"movups 32(%0), %%xmm2\n"
		"movups 48(%0), %%xmm3\n"
		"movntps %%xmm0, (%1)\n"
		"movntps %%xmm1, 16(%1)\n"
		"movntps %%xmm2, 32(%1)\n"
		"movntps %%xmm3, 48(%1)\n"
		:: "r" (from), "r" (to) : "memory");
		from = (const unsigned char *)from + 64;
		to = (unsigned char *)to + 64;
	}
	else
	/*
	   Only if SRC is aligned on a 16-byte boundary.
	   This allows movaps to be used instead of movups; movaps requires
	   the data to be aligned, or a general-protection exception (#GP)
	   is generated.
	*/
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		PREFETCH" 320(%0)\n"
		"movaps (%0), %%xmm0\n"
		"movaps 16(%0), %%xmm1\n"
		"movaps 32(%0), %%xmm2\n"
		"movaps 48(%0), %%xmm3\n"
		"movntps %%xmm0, (%1)\n"
		"movntps %%xmm1, 16(%1)\n"
		"movntps %%xmm2, 32(%1)\n"
		"movntps %%xmm3, 48(%1)\n"
		:: "r" (from), "r" (to) : "memory");
		from = (const unsigned char *)from + 64;
		to = (unsigned char *)to + 64;
	}
#else
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
#ifndef HAVE_MMX1
		PREFETCH" 320(%0)\n"
#endif
		"movq (%0), %%mm0\n"
		"movq 8(%0), %%mm1\n"
		"movq 16(%0), %%mm2\n"
		"movq 24(%0), %%mm3\n"
		"movq 32(%0), %%mm4\n"
		"movq 40(%0), %%mm5\n"
		"movq 48(%0), %%mm6\n"
		"movq 56(%0), %%mm7\n"
		MOVNTQ" %%mm0, (%1)\n"
		MOVNTQ" %%mm1, 8(%1)\n"
		MOVNTQ" %%mm2, 16(%1)\n"
		MOVNTQ" %%mm3, 24(%1)\n"
		MOVNTQ" %%mm4, 32(%1)\n"
		MOVNTQ" %%mm5, 40(%1)\n"
		MOVNTQ" %%mm6, 48(%1)\n"
		MOVNTQ" %%mm7, 56(%1)\n"
		:: "r" (from), "r" (to) : "memory");
		from = (const unsigned char *)from + 64;
		to = (unsigned char *)to + 64;
	}
#endif /* Have SSE */
#ifdef HAVE_MMX2
		/* since movntq is weakly-ordered, an "sfence" is needed
		 * to make the stores ordered again. */
		__asm__ __volatile__ ("sfence":::"memory");
#endif
#ifndef HAVE_SSE
		/* allows the FPU to be used again */
		__asm__ __volatile__ (EMMS:::"memory");
#endif
	}
	/*
	 * Now do the tail of the block
	 */
	if(len) small_memcpy(to, from, len);
	return retval;
}
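
/*
  A minimal usage sketch (hypothetical names, not part of this file):
  fast_memcpy has the same contract as memcpy, e.g. for copying a decoded
  video frame into a weakly-ordered framebuffer.
*/
#if 0
#include <stdlib.h>
void example(void)
{
	size_t frame_size = 720 * 576 * 2; /* hypothetical packed YUY2 frame */
	unsigned char *src = malloc(frame_size);
	unsigned char *dst = malloc(frame_size);
	if(src && dst)
		fast_memcpy(dst, src, frame_size); /* drop-in memcpy replacement */
	free(src);
	free(dst);
}
#endif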


#endif /* #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) */
#endif /* USE_FASTMEMCPY */