annotate libvo/aclib.c @ 2537:28d30f50d89c

memalign 64
author nick
date Mon, 29 Oct 2001 18:25:07 +0000
parents 5b69dabe5823
children 99f6db3255aa
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
1 #include "../config.h"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
3 #ifdef USE_FASTMEMCPY
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
4 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
5 aclib - advanced C library ;)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
6 This file contains functions which improve and expand standard C-library
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
7 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
8
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
9 #include <stddef.h>
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
10
1123
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
11 #ifndef HAVE_SSE2
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
12 /*
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
13 P3 processor has only one SSE decoder so can execute only 1 sse insn per
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
14 cpu clock, but it has 3 mmx decoders (include load/store unit)
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
15 and executes 3 mmx insns per cpu clock.
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
16 P4 processor has some chances, but after reading:
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
17 http://www.emulators.com/pentium4.htm
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
18 I have doubts. Anyway SSE2 version of this code can be written better.
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
19 */
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
20 #undef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
21 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
22
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
23
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
24 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
25 This part of code was taken by me from Linux-2.4.3 and slightly modified
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
26 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
27 blocks but mplayer uses weakly ordered data and original sources can not
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
28 speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
29
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
30 >From IA-32 Intel Architecture Software Developer's Manual Volume 1,
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
31
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
32 Order Number 245470:
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
33 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
34
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
35 Data referenced by a program can be temporal (data will be used again) or
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
36 non-temporal (data will be referenced once and not reused in the immediate
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
37 future). To make efficient use of the processor's caches, it is generally
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
38 desirable to cache temporal data and not cache non-temporal data. Overloading
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
39 the processor's caches with non-temporal data is sometimes referred to as
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
40 "polluting the caches".
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
41 The non-temporal data is written to memory with Write-Combining semantics.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
42
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
43 The PREFETCHh instructions permits a program to load data into the processor
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
44 at a suggested cache level, so that it is closer to the processors load and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
45 store unit when it is needed. If the data is already present in a level of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
46 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
47 will not result in any data movement.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
48 But we should you PREFETCHNTA: Non-temporal data fetch data into location
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
49 close to the processor, minimizing cache pollution.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
50
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
51 The MOVNTQ (store quadword using non-temporal hint) instruction stores
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
52 packed integer data from an MMX register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
53 The MOVNTPS (store packed single-precision floating-point values using
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
54 non-temporal hint) instruction stores packed floating-point data from an
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
55 XMM register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
56
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
57 The SFENCE (Store Fence) instruction controls write ordering by creating a
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
58 fence for memory store operations. This instruction guarantees that the results
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
59 of every store instruction that precedes the store fence in program order is
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
60 globally visible before any store instruction that follows the fence. The
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
61 SFENCE instruction provides an efficient way of ensuring ordering between
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
62 procedures that produce weakly-ordered data and procedures that consume that
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
63 data.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
64
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
65 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
66 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
67
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
68 // 3dnow memcpy support from kernel 2.4.2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
69 // by Pontscho/fresh!mindworkz
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
70
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
71 #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX )
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
72
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
73 #undef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
74 #if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
75 /* means: mmx v.1. Note: Since we added alignment of destinition it speedups
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
76 of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
77 standard (non MMX-optimized) version.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
78 Note: on K6-2+ it speedups memory copying upto 25% and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
79 on K7 and P3 about 500% (5 times). */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
80 #define HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
81 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
82
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
83
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
84 #undef HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
85 #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
86 #define HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
87 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
88
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
89 /* for small memory blocks (<256 bytes) this version is faster */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
90 #define small_memcpy(to,from,n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
91 {\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
92 register unsigned long int dummy;\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
93 __asm__ __volatile__(\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
94 "rep; movsb"\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
95 :"=&D"(to), "=&S"(from), "=&c"(dummy)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
96 /* It's most portable way to notify compiler */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
97 /* that edi, esi and ecx are clobbered in asm block. */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
98 /* Thanks to A'rpi for hint!!! */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
99 :"0" (to), "1" (from),"2" (n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
100 : "memory");\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
101 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
102
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
103 #ifdef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
104 #define MMREG_SIZE 16
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
105 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
106 #define MMREG_SIZE 8
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
107 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
108
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
109 /* Small defines (for readability only) ;) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
110 #ifdef HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
111 #define PREFETCH "prefetch"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
112 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
113 #define EMMS "femms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
114 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
115 #define PREFETCH "prefetchnta"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
116 #define EMMS "emms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
117 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
118
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
119 #ifdef HAVE_MMX2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
120 #define MOVNTQ "movntq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
121 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
122 #define MOVNTQ "movq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
123 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
124
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
125 #ifdef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
126 #define MIN_LEN 0x800 /* 2K blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
127 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
128 #define MIN_LEN 0x40 /* 64-byte blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
129 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
130
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
131 void * fast_memcpy(void * to, const void * from, size_t len)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
132 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
133 void *retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
134 size_t i;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
135 retval = to;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
136 #ifndef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
137 /* PREFETCH has effect even for MOVSB instruction ;) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
138 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
139 PREFETCH" (%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
140 PREFETCH" 64(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
141 PREFETCH" 128(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
142 PREFETCH" 192(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
143 PREFETCH" 256(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
144 : : "r" (from) );
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
145 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
146 if(len >= MIN_LEN)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
147 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
148 register unsigned long int delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
149 /* Align destinition to MMREG_SIZE -boundary */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
150 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
151 if(delta)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
152 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
153 delta=MMREG_SIZE-delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
154 len -= delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
155 small_memcpy(to, from, delta);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
156 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
157 i = len >> 6; /* len/64 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
158 len&=63;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
159 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
160 This algorithm is top effective when the code consequently
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
161 reads and writes blocks which have size of cache line.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
162 Size of cache line is processor-dependent.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
163 It will, however, be a minimum of 32 bytes on any processors.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
164 It would be better to have a number of instructions which
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
165 perform reading and writing to be multiple to a number of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
166 processor's decoders, but it's not always possible.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
167 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
168 #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
169 if(((unsigned long)from) & 15)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
170 /* if SRC is misaligned */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
171 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
172 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
173 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
174 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
175 "movups (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
176 "movups 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
177 "movups 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
178 "movups 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
179 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
180 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
181 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
182 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
183 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
184 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
185 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
186 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
187 else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
188 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
189 Only if SRC is aligned on 16-byte boundary.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
190 It allows to use movaps instead of movups, which required data
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
191 to be aligned or a general-protection exception (#GP) is generated.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
192 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
193 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
194 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
195 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
196 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
197 "movaps (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
198 "movaps 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
199 "movaps 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
200 "movaps 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
201 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
202 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
203 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
204 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
205 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
206 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
207 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
208 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
209 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
210 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
211 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
212 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
213 #ifndef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
214 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
215 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
216 "movq (%0), %%mm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
217 "movq 8(%0), %%mm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
218 "movq 16(%0), %%mm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
219 "movq 24(%0), %%mm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
220 "movq 32(%0), %%mm4\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
221 "movq 40(%0), %%mm5\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
222 "movq 48(%0), %%mm6\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
223 "movq 56(%0), %%mm7\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
224 MOVNTQ" %%mm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
225 MOVNTQ" %%mm1, 8(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
226 MOVNTQ" %%mm2, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
227 MOVNTQ" %%mm3, 24(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
228 MOVNTQ" %%mm4, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
229 MOVNTQ" %%mm5, 40(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
230 MOVNTQ" %%mm6, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
231 MOVNTQ" %%mm7, 56(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
232 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
233 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
234 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
235 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
236 #endif /* Have SSE */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
237 #ifdef HAVE_MMX2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
238 /* since movntq is weakly-ordered, a "sfence"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
239 * is needed to become ordered again. */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
240 __asm__ __volatile__ ("sfence":::"memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
241 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
242 #ifndef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
243 /* enables to use FPU */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
244 __asm__ __volatile__ (EMMS:::"memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
245 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
246 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
247 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
248 * Now do the tail of the block
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
249 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
250 if(len) small_memcpy(to, from, len);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
251 return retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
252 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
253
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
254
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
255 #endif /* #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
256 #endif /* USE_FASTMEMCPY */