annotate libvo/aclib.c @ 698:f0fbf1a9bf31

Moving fast_memcpy to separate file (Size optimization)
author nickols_k
date Sat, 05 May 2001 13:30:00 +0000
parents
children 5b69dabe5823
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
1 #include "../config.h"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
3 #ifdef USE_FASTMEMCPY
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
4 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
5 aclib - advanced C library ;)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
6 This file contains functions which improve and expand standard C-library
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
7 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
8
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
9 #include <stddef.h>
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
10
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
11 /* Enable this code, if SSE version works (faster) for you! */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
12 #if 1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
13 #undef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
14 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
15
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
16
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
17 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
18 This part of code was taken by me from Linux-2.4.3 and slightly modified
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
19 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
20 blocks but mplayer uses weakly ordered data and original sources can not
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
21 speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
22
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
23 >From IA-32 Intel Architecture Software Developer's Manual Volume 1,
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
24
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
25 Order Number 245470:
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
26 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
27
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
28 Data referenced by a program can be temporal (data will be used again) or
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
29 non-temporal (data will be referenced once and not reused in the immediate
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
30 future). To make efficient use of the processor's caches, it is generally
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
31 desirable to cache temporal data and not cache non-temporal data. Overloading
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
32 the processor's caches with non-temporal data is sometimes referred to as
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
33 "polluting the caches".
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
34 The non-temporal data is written to memory with Write-Combining semantics.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
35
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
36 The PREFETCHh instructions permits a program to load data into the processor
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
37 at a suggested cache level, so that it is closer to the processors load and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
38 store unit when it is needed. If the data is already present in a level of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
39 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
40 will not result in any data movement.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
41 But we should you PREFETCHNTA: Non-temporal data fetch data into location
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
42 close to the processor, minimizing cache pollution.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
43
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
44 The MOVNTQ (store quadword using non-temporal hint) instruction stores
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
45 packed integer data from an MMX register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
46 The MOVNTPS (store packed single-precision floating-point values using
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
47 non-temporal hint) instruction stores packed floating-point data from an
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
48 XMM register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
49
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
50 The SFENCE (Store Fence) instruction controls write ordering by creating a
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
51 fence for memory store operations. This instruction guarantees that the results
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
52 of every store instruction that precedes the store fence in program order is
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
53 globally visible before any store instruction that follows the fence. The
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
54 SFENCE instruction provides an efficient way of ensuring ordering between
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
55 procedures that produce weakly-ordered data and procedures that consume that
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
56 data.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
57
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
58 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
59 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
60
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
61 // 3dnow memcpy support from kernel 2.4.2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
62 // by Pontscho/fresh!mindworkz
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
63
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
64 #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX )
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
65
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
66 #undef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
67 #if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
68 /* means: mmx v.1. Note: Since we added alignment of destinition it speedups
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
69 of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
70 standard (non MMX-optimized) version.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
71 Note: on K6-2+ it speedups memory copying upto 25% and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
72 on K7 and P3 about 500% (5 times). */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
73 #define HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
74 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
75
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
76
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
77 #undef HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
78 #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
79 #define HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
80 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
81
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
82 /* for small memory blocks (<256 bytes) this version is faster */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
83 #define small_memcpy(to,from,n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
84 {\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
85 register unsigned long int dummy;\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
86 __asm__ __volatile__(\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
87 "rep; movsb"\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
88 :"=&D"(to), "=&S"(from), "=&c"(dummy)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
89 /* It's most portable way to notify compiler */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
90 /* that edi, esi and ecx are clobbered in asm block. */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
91 /* Thanks to A'rpi for hint!!! */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
92 :"0" (to), "1" (from),"2" (n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
93 : "memory");\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
94 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
95
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
96 #ifdef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
97 #define MMREG_SIZE 16
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
98 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
99 #define MMREG_SIZE 8
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
100 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
101
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
102 /* Small defines (for readability only) ;) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
103 #ifdef HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
104 #define PREFETCH "prefetch"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
105 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
106 #define EMMS "femms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
107 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
108 #define PREFETCH "prefetchnta"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
109 #define EMMS "emms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
110 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
111
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
112 #ifdef HAVE_MMX2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
113 #define MOVNTQ "movntq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
114 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
115 #define MOVNTQ "movq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
116 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
117
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
118 #ifdef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
119 #define MIN_LEN 0x800 /* 2K blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
120 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
121 #define MIN_LEN 0x40 /* 64-byte blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
122 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
123
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
124 void * fast_memcpy(void * to, const void * from, size_t len)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
125 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
126 void *retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
127 size_t i;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
128 retval = to;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
129 #ifndef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
130 /* PREFETCH has effect even for MOVSB instruction ;) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
131 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
132 PREFETCH" (%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
133 PREFETCH" 64(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
134 PREFETCH" 128(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
135 PREFETCH" 192(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
136 PREFETCH" 256(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
137 : : "r" (from) );
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
138 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
139 if(len >= MIN_LEN)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
140 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
141 register unsigned long int delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
142 /* Align destinition to MMREG_SIZE -boundary */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
143 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
144 if(delta)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
145 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
146 delta=MMREG_SIZE-delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
147 len -= delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
148 small_memcpy(to, from, delta);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
149 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
150 i = len >> 6; /* len/64 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
151 len&=63;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
152 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
153 This algorithm is top effective when the code consequently
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
154 reads and writes blocks which have size of cache line.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
155 Size of cache line is processor-dependent.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
156 It will, however, be a minimum of 32 bytes on any processors.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
157 It would be better to have a number of instructions which
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
158 perform reading and writing to be multiple to a number of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
159 processor's decoders, but it's not always possible.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
160 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
161 #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
162 if(((unsigned long)from) & 15)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
163 /* if SRC is misaligned */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
164 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
165 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
166 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
167 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
168 "movups (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
169 "movups 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
170 "movups 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
171 "movups 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
172 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
173 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
174 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
175 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
176 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
177 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
178 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
179 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
180 else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
181 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
182 Only if SRC is aligned on 16-byte boundary.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
183 It allows to use movaps instead of movups, which required data
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
184 to be aligned or a general-protection exception (#GP) is generated.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
185 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
186 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
187 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
188 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
189 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
190 "movaps (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
191 "movaps 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
192 "movaps 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
193 "movaps 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
194 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
195 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
196 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
197 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
198 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
199 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
200 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
201 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
202 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
203 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
204 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
205 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
206 #ifndef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
207 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
208 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
209 "movq (%0), %%mm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
210 "movq 8(%0), %%mm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
211 "movq 16(%0), %%mm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
212 "movq 24(%0), %%mm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
213 "movq 32(%0), %%mm4\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
214 "movq 40(%0), %%mm5\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
215 "movq 48(%0), %%mm6\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
216 "movq 56(%0), %%mm7\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
217 MOVNTQ" %%mm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
218 MOVNTQ" %%mm1, 8(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
219 MOVNTQ" %%mm2, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
220 MOVNTQ" %%mm3, 24(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
221 MOVNTQ" %%mm4, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
222 MOVNTQ" %%mm5, 40(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
223 MOVNTQ" %%mm6, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
224 MOVNTQ" %%mm7, 56(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
225 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
226 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
227 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
228 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
229 #endif /* Have SSE */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
230 #ifdef HAVE_MMX2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
231 /* since movntq is weakly-ordered, a "sfence"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
232 * is needed to become ordered again. */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
233 __asm__ __volatile__ ("sfence":::"memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
234 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
235 #ifndef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
236 /* enables to use FPU */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
237 __asm__ __volatile__ (EMMS:::"memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
238 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
239 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
240 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
241 * Now do the tail of the block
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
242 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
243 if(len) small_memcpy(to, from, len);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
244 return retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
245 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
246
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
247
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
248 #endif /* #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
249 #endif /* USE_FASTMEMCPY */