annotate libvo/aclib_template.c @ 13047:4e613c87c2f5

spelling: big-endian and little-endian
author diego
date Wed, 18 Aug 2004 19:00:05 +0000
parents 113d66d78967
children 821f464b4d90
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
1 /*
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
2 aclib - advanced C library ;)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
3 This file contains functions which improve and expand standard C-library
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
4 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
5
1123
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
6 #ifndef HAVE_SSE2
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
7 /*
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
8 P3 processor has only one SSE decoder so can execute only 1 sse insn per
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
9 cpu clock, but it has 3 mmx decoders (including the load/store unit)
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
10 and executes 3 mmx insns per cpu clock.
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
11 P4 processor has some chances, but after reading:
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
12 http://www.emulators.com/pentium4.htm
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
13 I have doubts. Anyway SSE2 version of this code can be written better.
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
14 */
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
15 #undef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
16 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
17
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
18
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
19 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
20 This part of code was taken by me from Linux-2.4.3 and slightly modified
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
21 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
22 blocks but mplayer uses weakly ordered data and original sources can not
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
23 speed them up. Only using PREFETCHNTA and MOVNTQ together has an effect!
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
24
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
25 >From IA-32 Intel Architecture Software Developer's Manual Volume 1,
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
26
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
27 Order Number 245470:
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
28 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
29
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
30 Data referenced by a program can be temporal (data will be used again) or
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
31 non-temporal (data will be referenced once and not reused in the immediate
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
32 future). To make efficient use of the processor's caches, it is generally
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
33 desirable to cache temporal data and not cache non-temporal data. Overloading
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
34 the processor's caches with non-temporal data is sometimes referred to as
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
35 "polluting the caches".
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
36 The non-temporal data is written to memory with Write-Combining semantics.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
37
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
38 The PREFETCHh instructions permit a program to load data into the processor
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
39 at a suggested cache level, so that it is closer to the processor's load and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
40 store unit when it is needed. If the data is already present in a level of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
41 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
42 will not result in any data movement.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
43 But we should use PREFETCHNTA: it fetches non-temporal data into a location
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
44 close to the processor, minimizing cache pollution.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
45
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
46 The MOVNTQ (store quadword using non-temporal hint) instruction stores
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
47 packed integer data from an MMX register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
48 The MOVNTPS (store packed single-precision floating-point values using
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
49 non-temporal hint) instruction stores packed floating-point data from an
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
50 XMM register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
51
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
52 The SFENCE (Store Fence) instruction controls write ordering by creating a
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
53 fence for memory store operations. This instruction guarantees that the results
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
54 of every store instruction that precedes the store fence in program order is
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
55 globally visible before any store instruction that follows the fence. The
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
56 SFENCE instruction provides an efficient way of ensuring ordering between
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
57 procedures that produce weakly-ordered data and procedures that consume that
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
58 data.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
59
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
60 If you have questions, please contact me: Nick Kurshev: nickols_k@mail.ru.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
61 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
62
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
63 // 3dnow memcpy support from kernel 2.4.2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
64 // by Pontscho/fresh!mindworkz
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
65
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
66
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
67 #undef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
68 #if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
69 /* means: mmx v.1. Note: Since we added alignment of the destination, this gives a speedup
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
70 of memory copying on PentMMX, Celeron-1 and P2 up to 12% versus
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
71 standard (non MMX-optimized) version.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
72 Note: on K6-2+ it speeds up memory copying by up to 25% and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
73 on K7 and P3 about 500% (5 times). */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
74 #define HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
75 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
76
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
77
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
78 #undef HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
79 #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
80 #define HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
81 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
82
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
83 /* For small memory blocks (<256 bytes) this version is faster.
   Expands to a braced block that copies n bytes from `from` to `to` with
   rep movsb. `to` and `from` are bound to edi/esi as both inputs and
   outputs, so they must be modifiable lvalues and are written back by the
   asm; with the direction flag clear they end up just past the copied
   bytes. n is loaded into ecx and the final count goes to a scratch local. */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
84 #define small_memcpy(to,from,n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
85 {\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
86 register unsigned long int dummy;\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
87 __asm__ __volatile__(\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
88 "rep; movsb"\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
89 :"=&D"(to), "=&S"(from), "=&c"(dummy)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
90 /* Declaring the registers as outputs is the most portable way to tell */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
91 /* the compiler that edi, esi and ecx are clobbered by the asm block. */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
92 /* Thanks to A'rpi for the hint!!! */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
93 :"0" (to), "1" (from),"2" (n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
94 : "memory");\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
95 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
96
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
97 #undef MMREG_SIZE
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
98 #ifdef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
99 #define MMREG_SIZE 16
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
100 #else
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
101 #define MMREG_SIZE 64 //8
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
102 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
103
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
104 #undef PREFETCH
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
105 #undef EMMS
5660
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
106
5662
663ca5050f7e prefer prefetchnta if its available
michael
parents: 5660
diff changeset
107 #ifdef HAVE_MMX2
663ca5050f7e prefer prefetchnta if its available
michael
parents: 5660
diff changeset
108 #define PREFETCH "prefetchnta"
663ca5050f7e prefer prefetchnta if its available
michael
parents: 5660
diff changeset
109 #elif defined ( HAVE_3DNOW )
5660
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
110 #define PREFETCH "prefetch"
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
111 #else
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
112 #define PREFETCH "/nop"
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
113 #endif
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
114
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
115 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
5660
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
116 #ifdef HAVE_3DNOW
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
117 #define EMMS "femms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
118 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
119 #define EMMS "emms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
120 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
121
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
122 #undef MOVNTQ
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
123 #ifdef HAVE_MMX2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
124 #define MOVNTQ "movntq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
125 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
126 #define MOVNTQ "movq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
127 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
128
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
129 #undef MIN_LEN
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
130 #ifdef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
131 #define MIN_LEN 0x800 /* 2K blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
132 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
133 #define MIN_LEN 0x40 /* 64-byte blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
134 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
135
7072
113d66d78967 removed nonsense 'inline'
arpi
parents: 5662
diff changeset
136 static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
137 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
138 void *retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
139 size_t i;
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
140 retval = to;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
141 #ifdef STATISTICS
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
142 {
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
143 static int freq[33];
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
144 static int t=0;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
145 int i;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
146 for(i=0; len>(1<<i); i++);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
147 freq[i]++;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
148 t++;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
149 if(1024*1024*1024 % t == 0)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
150 for(i=0; i<32; i++)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
151 printf("freq < %8d %4d\n", 1<<i, freq[i]);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
152 }
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
153 #endif
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
154 #ifndef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
155 /* PREFETCH has effect even for MOVSB instruction ;) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
156 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
157 PREFETCH" (%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
158 PREFETCH" 64(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
159 PREFETCH" 128(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
160 PREFETCH" 192(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
161 PREFETCH" 256(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
162 : : "r" (from) );
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
163 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
164 if(len >= MIN_LEN)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
165 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
166 register unsigned long int delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
167 /* Align destinition to MMREG_SIZE -boundary */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
168 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
169 if(delta)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
170 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
171 delta=MMREG_SIZE-delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
172 len -= delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
173 small_memcpy(to, from, delta);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
174 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
175 i = len >> 6; /* len/64 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
176 len&=63;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
177 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
178 This algorithm is top effective when the code consequently
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
179 reads and writes blocks which have size of cache line.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
180 Size of cache line is processor-dependent.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
181 It will, however, be a minimum of 32 bytes on any processors.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
182 It would be better to have a number of instructions which
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
183 perform reading and writing to be multiple to a number of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
184 processor's decoders, but it's not always possible.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
185 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
186 #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
187 if(((unsigned long)from) & 15)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
188 /* if SRC is misaligned */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
189 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
190 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
191 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
192 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
193 "movups (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
194 "movups 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
195 "movups 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
196 "movups 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
197 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
198 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
199 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
200 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
201 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
202 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
203 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
204 }
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
205 else
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
206 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
207 Only if SRC is aligned on 16-byte boundary.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
208 It allows the use of movaps instead of movups; movaps requires data
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
209 to be aligned or a general-protection exception (#GP) is generated.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
210 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
211 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
212 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
213 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
214 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
215 "movaps (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
216 "movaps 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
217 "movaps 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
218 "movaps 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
219 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
220 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
221 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
222 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
223 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
224 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
225 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
226 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
227 #else
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
228 // Align destination at BLOCK_SIZE boundary
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
229 for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
230 {
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
231 __asm__ __volatile__ (
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
232 #ifndef HAVE_MMX1
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
233 PREFETCH" 320(%0)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
234 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
235 "movq (%0), %%mm0\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
236 "movq 8(%0), %%mm1\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
237 "movq 16(%0), %%mm2\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
238 "movq 24(%0), %%mm3\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
239 "movq 32(%0), %%mm4\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
240 "movq 40(%0), %%mm5\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
241 "movq 48(%0), %%mm6\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
242 "movq 56(%0), %%mm7\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
243 MOVNTQ" %%mm0, (%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
244 MOVNTQ" %%mm1, 8(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
245 MOVNTQ" %%mm2, 16(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
246 MOVNTQ" %%mm3, 24(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
247 MOVNTQ" %%mm4, 32(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
248 MOVNTQ" %%mm5, 40(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
249 MOVNTQ" %%mm6, 48(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
250 MOVNTQ" %%mm7, 56(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
251 :: "r" (from), "r" (to) : "memory");
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
252 ((const unsigned char *)from)+=64;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
253 ((unsigned char *)to)+=64;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
254 }
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
255
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
256 // printf(" %d %d\n", (int)from&1023, (int)to&1023);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
257 // Pure Assembly cuz gcc is a bit unpredictable ;)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
258 if(i>=BLOCK_SIZE/64)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
259 asm volatile(
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
260 "xorl %%eax, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
261 ".balign 16 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
262 "1: \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
263 "movl (%0, %%eax), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
264 "movl 32(%0, %%eax), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
265 "movl 64(%0, %%eax), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
266 "movl 96(%0, %%eax), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
267 "addl $128, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
268 "cmpl %3, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
269 " jb 1b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
270
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
271 "xorl %%eax, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
272
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
273 ".balign 16 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
274 "2: \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
275 "movq (%0, %%eax), %%mm0\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
276 "movq 8(%0, %%eax), %%mm1\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
277 "movq 16(%0, %%eax), %%mm2\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
278 "movq 24(%0, %%eax), %%mm3\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
279 "movq 32(%0, %%eax), %%mm4\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
280 "movq 40(%0, %%eax), %%mm5\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
281 "movq 48(%0, %%eax), %%mm6\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
282 "movq 56(%0, %%eax), %%mm7\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
283 MOVNTQ" %%mm0, (%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
284 MOVNTQ" %%mm1, 8(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
285 MOVNTQ" %%mm2, 16(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
286 MOVNTQ" %%mm3, 24(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
287 MOVNTQ" %%mm4, 32(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
288 MOVNTQ" %%mm5, 40(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
289 MOVNTQ" %%mm6, 48(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
290 MOVNTQ" %%mm7, 56(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
291 "addl $64, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
292 "cmpl %3, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
293 "jb 2b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
294
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
295 #if CONFUSION_FACTOR > 0
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
296 // a few percent speedup on out of order executing CPUs
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
297 "movl %5, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
298 "2: \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
299 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
300 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
301 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
302 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
303 "decl %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
304 " jnz 2b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
305 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
306
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
307 "xorl %%eax, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
308 "addl %3, %0 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
309 "addl %3, %1 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
310 "subl %4, %2 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
311 "cmpl %4, %2 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
312 " jae 1b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
313 : "+r" (from), "+r" (to), "+r" (i)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
314 : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
315 : "%eax", "%ebx"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
316 );
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
317
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
318 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
319 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
320 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
321 #ifndef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
322 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
323 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
324 "movq (%0), %%mm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
325 "movq 8(%0), %%mm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
326 "movq 16(%0), %%mm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
327 "movq 24(%0), %%mm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
328 "movq 32(%0), %%mm4\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
329 "movq 40(%0), %%mm5\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
330 "movq 48(%0), %%mm6\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
331 "movq 56(%0), %%mm7\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
332 MOVNTQ" %%mm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
333 MOVNTQ" %%mm1, 8(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
334 MOVNTQ" %%mm2, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
335 MOVNTQ" %%mm3, 24(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
336 MOVNTQ" %%mm4, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
337 MOVNTQ" %%mm5, 40(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
338 MOVNTQ" %%mm6, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
339 MOVNTQ" %%mm7, 56(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
340 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
341 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
342 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
343 }
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
344
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
345 #endif /* Have SSE */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
346 #ifdef HAVE_MMX2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
347 /* since movntq is weakly-ordered, a "sfence"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
348 * is needed to become ordered again. */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
349 __asm__ __volatile__ ("sfence":::"memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
350 #endif
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
351 #ifndef HAVE_SSE
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
352 /* enables to use FPU */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
353 __asm__ __volatile__ (EMMS:::"memory");
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
354 #endif
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
355 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
356 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
357 * Now do the tail of the block
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
358 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
359 if(len) small_memcpy(to, from, len);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
360 return retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
361 }
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
362
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
363 /**
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
364 * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
365 */
7072
113d66d78967 removed nonsense 'inline'
arpi
parents: 5662
diff changeset
366 static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
367 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
368 void *retval;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
369 size_t i;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
370 retval = to;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
371 #ifdef STATISTICS
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
372 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
373 static int freq[33];
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
374 static int t=0;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
375 int i;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
376 for(i=0; len>(1<<i); i++);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
377 freq[i]++;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
378 t++;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
379 if(1024*1024*1024 % t == 0)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
380 for(i=0; i<32; i++)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
381 printf("mem2agp freq < %8d %4d\n", 1<<i, freq[i]);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
382 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
383 #endif
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
384 if(len >= MIN_LEN)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
385 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
386 register unsigned long int delta;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
387 /* Align destinition to MMREG_SIZE -boundary */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
388 delta = ((unsigned long int)to)&7;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
389 if(delta)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
390 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
391 delta=8-delta;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
392 len -= delta;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
393 small_memcpy(to, from, delta);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
394 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
395 i = len >> 6; /* len/64 */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
396 len &= 63;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
397 /*
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
398 This algorithm is top effective when the code consequently
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
399 reads and writes blocks which have size of cache line.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
400 Size of cache line is processor-dependent.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
401 It will, however, be a minimum of 32 bytes on any processors.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
402 It would be better to have a number of instructions which
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
403 perform reading and writing to be multiple to a number of
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
404 processor's decoders, but it's not always possible.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
405 */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
406 for(; i>0; i--)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
407 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
408 __asm__ __volatile__ (
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
409 PREFETCH" 320(%0)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
410 "movq (%0), %%mm0\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
411 "movq 8(%0), %%mm1\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
412 "movq 16(%0), %%mm2\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
413 "movq 24(%0), %%mm3\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
414 "movq 32(%0), %%mm4\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
415 "movq 40(%0), %%mm5\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
416 "movq 48(%0), %%mm6\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
417 "movq 56(%0), %%mm7\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
418 MOVNTQ" %%mm0, (%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
419 MOVNTQ" %%mm1, 8(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
420 MOVNTQ" %%mm2, 16(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
421 MOVNTQ" %%mm3, 24(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
422 MOVNTQ" %%mm4, 32(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
423 MOVNTQ" %%mm5, 40(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
424 MOVNTQ" %%mm6, 48(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
425 MOVNTQ" %%mm7, 56(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
426 :: "r" (from), "r" (to) : "memory");
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
427 ((const unsigned char *)from)+=64;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
428 ((unsigned char *)to)+=64;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
429 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
430 #ifdef HAVE_MMX2
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
431 /* since movntq is weakly-ordered, a "sfence"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
432 * is needed to become ordered again. */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
433 __asm__ __volatile__ ("sfence":::"memory");
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
434 #endif
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
435 /* enables to use FPU */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
436 __asm__ __volatile__ (EMMS:::"memory");
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
437 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
438 /*
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
439 * Now do the tail of the block
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
440 */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
441 if(len) small_memcpy(to, from, len);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
442 return retval;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
443 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
444