Mercurial > mplayer.hg
annotate libvo/aclib.c @ 803:b25a887b6054
sub splitting patch applied by Vlada V.Dubsky@sh.cvut.cz
author | arpi_esp |
---|---|
date | Mon, 14 May 2001 22:08:12 +0000 |
parents | f0fbf1a9bf31 |
children | 5b69dabe5823 |
rev | line source |
---|---|
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
1 #include "../config.h" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
2 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
3 #ifdef USE_FASTMEMCPY |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
4 /* |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
5 aclib - advanced C library ;) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
6 This file contains functions which improve and expand standard C-library |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
7 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
8 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
9 #include <stddef.h> |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
10 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
11 /* Enable this code, if SSE version works (faster) for you! */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
12 #if 1 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
13 #undef HAVE_SSE |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
14 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
15 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
16 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
17 /* |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
18 This part of code was taken by me from Linux-2.4.3 and slightly modified |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
19 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
20 blocks but mplayer uses weakly ordered data and original sources can not |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
21 speedup them. Only using PREFETCHNTA and MOVNTQ together have effect! |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
22 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
23 >From IA-32 Intel Architecture Software Developer's Manual Volume 1, |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
24 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
25 Order Number 245470: |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
26 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
27 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
28 Data referenced by a program can be temporal (data will be used again) or |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
29 non-temporal (data will be referenced once and not reused in the immediate |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
30 future). To make efficient use of the processor's caches, it is generally |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
31 desirable to cache temporal data and not cache non-temporal data. Overloading |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
32 the processor's caches with non-temporal data is sometimes referred to as |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
33 "polluting the caches". |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
34 The non-temporal data is written to memory with Write-Combining semantics. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
35 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
36 The PREFETCHh instructions permits a program to load data into the processor |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
37 at a suggested cache level, so that it is closer to the processors load and |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
38 store unit when it is needed. If the data is already present in a level of |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
39 the cache hierarchy that is closer to the processor, the PREFETCHh instruction |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
40 will not result in any data movement. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
41 But we should you PREFETCHNTA: Non-temporal data fetch data into location |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
42 close to the processor, minimizing cache pollution. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
43 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
44 The MOVNTQ (store quadword using non-temporal hint) instruction stores |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
45 packed integer data from an MMX register to memory, using a non-temporal hint. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
46 The MOVNTPS (store packed single-precision floating-point values using |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
47 non-temporal hint) instruction stores packed floating-point data from an |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
48 XMM register to memory, using a non-temporal hint. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
49 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
50 The SFENCE (Store Fence) instruction controls write ordering by creating a |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
51 fence for memory store operations. This instruction guarantees that the results |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
52 of every store instruction that precedes the store fence in program order is |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
53 globally visible before any store instruction that follows the fence. The |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
54 SFENCE instruction provides an efficient way of ensuring ordering between |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
55 procedures that produce weakly-ordered data and procedures that consume that |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
56 data. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
57 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
58 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
59 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
60 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
61 // 3dnow memcpy support from kernel 2.4.2 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
62 // by Pontscho/fresh!mindworkz |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
63 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
64 #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
65 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
66 #undef HAVE_MMX1 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
67 #if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
68 /* means: mmx v.1. Note: Since we added alignment of destinition it speedups |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
69 of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
70 standard (non MMX-optimized) version. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
71 Note: on K6-2+ it speedups memory copying upto 25% and |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
72 on K7 and P3 about 500% (5 times). */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
73 #define HAVE_MMX1 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
74 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
75 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
76 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
77 #undef HAVE_K6_2PLUS |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
78 #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
79 #define HAVE_K6_2PLUS |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
80 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
81 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
82 /* for small memory blocks (<256 bytes) this version is faster */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
83 #define small_memcpy(to,from,n)\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
84 {\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
85 register unsigned long int dummy;\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
86 __asm__ __volatile__(\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
87 "rep; movsb"\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
88 :"=&D"(to), "=&S"(from), "=&c"(dummy)\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
89 /* It's most portable way to notify compiler */\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
90 /* that edi, esi and ecx are clobbered in asm block. */\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
91 /* Thanks to A'rpi for hint!!! */\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
92 :"0" (to), "1" (from),"2" (n)\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
93 : "memory");\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
94 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
95 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
96 #ifdef HAVE_SSE |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
97 #define MMREG_SIZE 16 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
98 #else |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
99 #define MMREG_SIZE 8 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
100 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
101 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
102 /* Small defines (for readability only) ;) */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
103 #ifdef HAVE_K6_2PLUS |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
104 #define PREFETCH "prefetch" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
105 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
106 #define EMMS "femms" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
107 #else |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
108 #define PREFETCH "prefetchnta" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
109 #define EMMS "emms" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
110 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
111 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
112 #ifdef HAVE_MMX2 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
113 #define MOVNTQ "movntq" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
114 #else |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
115 #define MOVNTQ "movq" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
116 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
117 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
118 #ifdef HAVE_MMX1 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
119 #define MIN_LEN 0x800 /* 2K blocks */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
120 #else |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
121 #define MIN_LEN 0x40 /* 64-byte blocks */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
122 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
123 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
124 void * fast_memcpy(void * to, const void * from, size_t len) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
125 { |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
126 void *retval; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
127 size_t i; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
128 retval = to; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
129 #ifndef HAVE_MMX1 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
130 /* PREFETCH has effect even for MOVSB instruction ;) */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
131 __asm__ __volatile__ ( |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
132 PREFETCH" (%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
133 PREFETCH" 64(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
134 PREFETCH" 128(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
135 PREFETCH" 192(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
136 PREFETCH" 256(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
137 : : "r" (from) ); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
138 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
139 if(len >= MIN_LEN) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
140 { |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
141 register unsigned long int delta; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
142 /* Align destinition to MMREG_SIZE -boundary */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
143 delta = ((unsigned long int)to)&(MMREG_SIZE-1); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
144 if(delta) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
145 { |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
146 delta=MMREG_SIZE-delta; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
147 len -= delta; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
148 small_memcpy(to, from, delta); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
149 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
150 i = len >> 6; /* len/64 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
151 len&=63; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
152 /* |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
153 This algorithm is top effective when the code consequently |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
154 reads and writes blocks which have size of cache line. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
155 Size of cache line is processor-dependent. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
156 It will, however, be a minimum of 32 bytes on any processors. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
157 It would be better to have a number of instructions which |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
158 perform reading and writing to be multiple to a number of |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
159 processor's decoders, but it's not always possible. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
160 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
161 #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
162 if(((unsigned long)from) & 15) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
163 /* if SRC is misaligned */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
164 for(; i>0; i--) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
165 { |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
166 __asm__ __volatile__ ( |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
167 PREFETCH" 320(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
168 "movups (%0), %%xmm0\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
169 "movups 16(%0), %%xmm1\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
170 "movups 32(%0), %%xmm2\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
171 "movups 48(%0), %%xmm3\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
172 "movntps %%xmm0, (%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
173 "movntps %%xmm1, 16(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
174 "movntps %%xmm2, 32(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
175 "movntps %%xmm3, 48(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
176 :: "r" (from), "r" (to) : "memory"); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
177 ((const unsigned char *)from)+=64; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
178 ((unsigned char *)to)+=64; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
179 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
180 else |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
181 /* |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
182 Only if SRC is aligned on 16-byte boundary. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
183 It allows to use movaps instead of movups, which required data |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
184 to be aligned or a general-protection exception (#GP) is generated. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
185 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
186 for(; i>0; i--) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
187 { |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
188 __asm__ __volatile__ ( |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
189 PREFETCH" 320(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
190 "movaps (%0), %%xmm0\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
191 "movaps 16(%0), %%xmm1\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
192 "movaps 32(%0), %%xmm2\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
193 "movaps 48(%0), %%xmm3\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
194 "movntps %%xmm0, (%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
195 "movntps %%xmm1, 16(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
196 "movntps %%xmm2, 32(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
197 "movntps %%xmm3, 48(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
198 :: "r" (from), "r" (to) : "memory"); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
199 ((const unsigned char *)from)+=64; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
200 ((unsigned char *)to)+=64; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
201 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
202 #else |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
203 for(; i>0; i--) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
204 { |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
205 __asm__ __volatile__ ( |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
206 #ifndef HAVE_MMX1 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
207 PREFETCH" 320(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
208 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
209 "movq (%0), %%mm0\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
210 "movq 8(%0), %%mm1\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
211 "movq 16(%0), %%mm2\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
212 "movq 24(%0), %%mm3\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
213 "movq 32(%0), %%mm4\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
214 "movq 40(%0), %%mm5\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
215 "movq 48(%0), %%mm6\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
216 "movq 56(%0), %%mm7\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
217 MOVNTQ" %%mm0, (%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
218 MOVNTQ" %%mm1, 8(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
219 MOVNTQ" %%mm2, 16(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
220 MOVNTQ" %%mm3, 24(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
221 MOVNTQ" %%mm4, 32(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
222 MOVNTQ" %%mm5, 40(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
223 MOVNTQ" %%mm6, 48(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
224 MOVNTQ" %%mm7, 56(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
225 :: "r" (from), "r" (to) : "memory"); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
226 ((const unsigned char *)from)+=64; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
227 ((unsigned char *)to)+=64; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
228 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
229 #endif /* Have SSE */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
230 #ifdef HAVE_MMX2 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
231 /* since movntq is weakly-ordered, a "sfence" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
232 * is needed to become ordered again. */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
233 __asm__ __volatile__ ("sfence":::"memory"); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
234 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
235 #ifndef HAVE_SSE |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
236 /* enables to use FPU */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
237 __asm__ __volatile__ (EMMS:::"memory"); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
238 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
239 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
240 /* |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
241 * Now do the tail of the block |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
242 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
243 if(len) small_memcpy(to, from, len); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
244 return retval; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
245 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
246 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
247 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
248 #endif /* #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
249 #endif /* USE_FASTMEMCPY */ |