comparison libvo/aclib.c @ 698:f0fbf1a9bf31

Moving fast_memcpy to separate file (Size optimization)
author nickols_k
date Sat, 05 May 2001 13:30:00 +0000
parents
children 5b69dabe5823
#include "../config.h"

#ifdef USE_FASTMEMCPY
/*
  aclib - advanced C library ;)
  This file contains functions which improve and expand the standard C library.
*/

#include <stddef.h>

/* HAVE_SSE is undefined (the SSE path is disabled) by default; change the
   "#if 1" below to "#if 0" to enable the SSE version if it works (and is
   faster) for you. */
#if 1
#undef HAVE_SSE
#endif


/*
  This part of the code was taken by me from Linux-2.4.3 and slightly modified
  for the MMX, MMX2 and SSE instruction sets. I did this because Linux works on
  page-aligned blocks, while MPlayer uses weakly-ordered data, so the original
  sources could not speed it up. Only using PREFETCHNTA and MOVNTQ together
  has an effect!

  From the IA-32 Intel Architecture Software Developer's Manual Volume 1,
  Order Number 245470:
  "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

  Data referenced by a program can be temporal (data will be used again) or
  non-temporal (data will be referenced once and not reused in the immediate
  future). To make efficient use of the processor's caches, it is generally
  desirable to cache temporal data and not cache non-temporal data. Overloading
  the processor's caches with non-temporal data is sometimes referred to as
  "polluting the caches".
  Non-temporal data is written to memory with write-combining semantics.

  The PREFETCHh instruction permits a program to load data into the processor
  at a suggested cache level, so that it is closer to the processor's load and
  store unit when it is needed. If the data is already present in a level of
  the cache hierarchy that is closer to the processor, the PREFETCHh instruction
  will not result in any data movement.
  Here we should use PREFETCHNTA: it fetches non-temporal data into a location
  close to the processor, minimizing cache pollution.

  The MOVNTQ (store quadword using non-temporal hint) instruction stores
  packed integer data from an MMX register to memory, using a non-temporal hint.
  The MOVNTPS (store packed single-precision floating-point values using
  non-temporal hint) instruction stores packed floating-point data from an
  XMM register to memory, using a non-temporal hint.

  The SFENCE (Store Fence) instruction controls write ordering by creating a
  fence for memory store operations. This instruction guarantees that the
  results of every store instruction that precedes the store fence in program
  order are globally visible before any store instruction that follows the
  fence. The SFENCE instruction provides an efficient way of ensuring ordering
  between procedures that produce weakly-ordered data and procedures that
  consume that data.

  If you have questions, please contact me: Nick Kurshev: nickols_k@mail.ru.
*/
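
/* A minimal sketch (not built) distilling the pattern described above,
   assuming an MMX2-capable CPU and GCC inline assembly: PREFETCHNTA the
   source, stream it out with MOVNTQ, then finish with SFENCE/EMMS. The name
   nt_copy_sketch is hypothetical; the real fast_memcpy() below adds
   destination alignment, deeper unrolling and tail handling. Here 'n' must be
   a multiple of 8. */
#if 0
static void nt_copy_sketch(void *dst, const void *src, size_t n)
{
    size_t i;
    for (i = 0; i < n; i += 8)
        __asm__ __volatile__ (
            "prefetchnta 320(%0)\n"   /* fetch ahead without polluting caches */
            "movq (%0), %%mm0\n"      /* load 8 bytes into an MMX register */
            "movntq %%mm0, (%1)\n"    /* store them with a non-temporal hint */
            :: "r" ((const unsigned char *)src + i),
               "r" ((unsigned char *)dst + i)
            : "memory");
    __asm__ __volatile__ ("sfence\n\t" "emms\n\t" ::: "memory");
}
#endif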

// 3dnow memcpy support from kernel 2.4.2
// by Pontscho/fresh!mindworkz

#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX )

#undef HAVE_MMX1
#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
/*  This means MMX v.1. Note: since we added alignment of the destination, this
    speeds up memory copying on the Pentium MMX, Celeron-1 and P2 by up to 12%
    versus the standard (non-MMX-optimized) version.
    Note: on the K6-2+ it speeds up memory copying by up to 25%, and
    on the K7 and P3 by about 500% (5 times). */
#define HAVE_MMX1
#endif


#undef HAVE_K6_2PLUS
#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
#define HAVE_K6_2PLUS
#endif

/* For small memory blocks (<256 bytes) this version is faster. */
#define small_memcpy(to,from,n)\
{\
    register unsigned long int dummy;\
    __asm__ __volatile__(\
        "rep; movsb"\
        :"=&D"(to), "=&S"(from), "=&c"(dummy)\
        /* It's the most portable way to notify the compiler */\
        /* that edi, esi and ecx are clobbered in the asm block. */\
        /* Thanks to A'rpi for the hint!!! */\
        :"0" (to), "1" (from), "2" (n)\
        : "memory");\
}
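
/* Note: 'to' and 'from' are also the asm output operands ("=&D"/"=&S"), so
   small_memcpy() advances the caller's pointers by 'n' bytes as a side effect
   (rep movsb leaves edi/esi pointing just past the copied block).
   fast_memcpy() below relies on this when it aligns the destination and then
   keeps copying from the already-advanced pointers. */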

#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Small defines (for readability only) ;) */
#ifdef HAVE_K6_2PLUS
#define PREFETCH "prefetch"
/* On the K6 femms is faster than emms. On the K7 femms maps directly to emms. */
#define EMMS "femms"
#else
#define PREFETCH "prefetchnta"
#define EMMS "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

#ifdef HAVE_MMX1
#define MIN_LEN 0x800  /* 2K blocks */
#else
#define MIN_LEN 0x40   /* 64-byte blocks */
#endif

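/* Overview of fast_memcpy() below: 1) prefetch the first ~320 bytes of the
   source (unless plain MMX1); 2) for blocks of at least MIN_LEN bytes, align
   the destination to MMREG_SIZE with small_memcpy(), then copy 64-byte chunks
   with MMX/SSE loads and non-temporal stores while prefetching 320 bytes
   ahead of the source; 3) issue SFENCE (MMX2) and EMMS to restore ordering and
   FPU state; 4) copy the remaining tail with small_memcpy(). */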
void * fast_memcpy(void * to, const void * from, size_t len)
{
    void *retval;
    size_t i;
    retval = to;
#ifndef HAVE_MMX1
    /* PREFETCH has an effect even for the MOVSB instruction ;) */
    __asm__ __volatile__ (
        PREFETCH" (%0)\n"
        PREFETCH" 64(%0)\n"
        PREFETCH" 128(%0)\n"
        PREFETCH" 192(%0)\n"
        PREFETCH" 256(%0)\n"
        : : "r" (from) );
#endif
    if(len >= MIN_LEN)
    {
        register unsigned long int delta;
        /* Align the destination to an MMREG_SIZE boundary */
        delta = ((unsigned long int)to)&(MMREG_SIZE-1);
        if(delta)
        {
            delta=MMREG_SIZE-delta;
            len -= delta;
            small_memcpy(to, from, delta);
        }
        i = len >> 6; /* len/64 */
        len&=63;
        /*
           This algorithm is most effective when the code sequentially reads
           and writes blocks the size of a cache line. The cache-line size is
           processor-dependent; it is, however, at least 32 bytes on any of
           these processors. Ideally the number of load and store instructions
           would be a multiple of the number of the processor's decoders, but
           that is not always possible.
        */
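        /* Each loop iteration below moves one 64-byte chunk: 8 movq loads plus
           8 MOVNTQ stores (or 4 movups/movaps loads plus 4 movntps stores in
           the SSE path), i.e. one or two whole cache lines per pass for the
           >=32-byte line sizes mentioned above. */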
#ifdef HAVE_SSE /* Only P3 (and maybe Cyrix III) */
        if(((unsigned long)from) & 15)
        /* if SRC is misaligned */
        for(; i>0; i--)
        {
            __asm__ __volatile__ (
                PREFETCH" 320(%0)\n"
                "movups (%0), %%xmm0\n"
                "movups 16(%0), %%xmm1\n"
                "movups 32(%0), %%xmm2\n"
                "movups 48(%0), %%xmm3\n"
                "movntps %%xmm0, (%1)\n"
                "movntps %%xmm1, 16(%1)\n"
                "movntps %%xmm2, 32(%1)\n"
                "movntps %%xmm3, 48(%1)\n"
                :: "r" (from), "r" (to) : "memory");
            /* advance both pointers by one 64-byte block */
            from = (const unsigned char *)from + 64;
            to   = (unsigned char *)to + 64;
        }
        else
        /*
           Only if SRC is aligned on a 16-byte boundary.
           This allows the use of movaps instead of movups; movaps requires the
           data to be aligned, otherwise a general-protection exception (#GP)
           is generated.
        */
        for(; i>0; i--)
        {
            __asm__ __volatile__ (
                PREFETCH" 320(%0)\n"
                "movaps (%0), %%xmm0\n"
                "movaps 16(%0), %%xmm1\n"
                "movaps 32(%0), %%xmm2\n"
                "movaps 48(%0), %%xmm3\n"
                "movntps %%xmm0, (%1)\n"
                "movntps %%xmm1, 16(%1)\n"
                "movntps %%xmm2, 32(%1)\n"
                "movntps %%xmm3, 48(%1)\n"
                :: "r" (from), "r" (to) : "memory");
            /* advance both pointers by one 64-byte block */
            from = (const unsigned char *)from + 64;
            to   = (unsigned char *)to + 64;
        }
#else
        for(; i>0; i--)
        {
            __asm__ __volatile__ (
#ifndef HAVE_MMX1
                PREFETCH" 320(%0)\n"
#endif
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
            /* advance both pointers by one 64-byte block */
            from = (const unsigned char *)from + 64;
            to   = (unsigned char *)to + 64;
        }
#endif /* HAVE_SSE */
#ifdef HAVE_MMX2
        /* Since movntq stores are weakly ordered, an "sfence" is needed to
         * make them globally visible (ordered) again. */
        __asm__ __volatile__ ("sfence":::"memory");
#endif
#ifndef HAVE_SSE
        /* Re-enable use of the FPU */
        __asm__ __volatile__ (EMMS:::"memory");
#endif
    }
    /*
     * Now do the tail of the block
     */
    if(len) small_memcpy(to, from, len);
    return retval;
}
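
/* Illustrative usage sketch only (not built): the helper name and frame
   parameters below are hypothetical and not taken from any particular vo
   driver. It just shows the intended call pattern: one large copy per frame
   line, which is where the non-temporal stores above pay off. */
#if 0
static void copy_frame_example(unsigned char *dst, const unsigned char *src,
                               int dst_stride, int src_stride,
                               int bytes_per_line, int height)
{
    int y;
    for (y = 0; y < height; y++)
        fast_memcpy(dst + y * dst_stride, src + y * src_stride, bytes_per_line);
}
#endif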


#endif /* #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) */
#endif /* USE_FASTMEMCPY */