annotate libvo/aclib_template.c @ 28496:147df13f0d46

Replace a mention of MPlayer by MEncoder in the MEncoder section.
author diego
date Wed, 11 Feb 2009 16:46:45 +0000
parents 7681eab10aea
children 7eb282a13214
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
1 /*
28446
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
2 * aclib - advanced C library ;)
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
3 * functions which improve and expand the standard C library
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
4 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
5 * This file is part of MPlayer.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
6 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
7 * MPlayer is free software; you can redistribute it and/or modify
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
8 * it under the terms of the GNU General Public License as published by
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
9 * the Free Software Foundation; either version 2 of the License, or
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
10 * (at your option) any later version.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
11 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
12 * MPlayer is distributed in the hope that it will be useful,
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
15 * GNU General Public License for more details.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
16 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
17 * You should have received a copy of the GNU General Public License along
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
18 * with MPlayer; if not, write to the Free Software Foundation, Inc.,
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
20 */
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
21
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
22 #if !HAVE_SSE2
1123
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
23 /*
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
24 P3 processor has only one SSE decoder so can execute only 1 sse insn per
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
25 cpu clock, but it has 3 mmx decoders (include load/store unit)
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
26 and executes 3 mmx insns per cpu clock.
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
27 P4 processor has some chances, but after reading:
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
28 http://www.emulators.com/pentium4.htm
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
29 I have doubts. Anyway SSE2 version of this code can be written better.
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
30 */
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
31 #undef HAVE_SSE
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
32 #define HAVE_SSE 0
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
33 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
34
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
35
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
36 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
37 This part of code was taken by me from Linux-2.4.3 and slightly modified
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
38 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
39 blocks but mplayer uses weakly ordered data and original sources can not
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
40 speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
41
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
42 >From IA-32 Intel Architecture Software Developer's Manual Volume 1,
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
43
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
44 Order Number 245470:
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
45 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
46
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
47 Data referenced by a program can be temporal (data will be used again) or
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
48 non-temporal (data will be referenced once and not reused in the immediate
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
49 future). To make efficient use of the processor's caches, it is generally
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
50 desirable to cache temporal data and not cache non-temporal data. Overloading
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
51 the processor's caches with non-temporal data is sometimes referred to as
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
52 "polluting the caches".
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
53 The non-temporal data is written to memory with Write-Combining semantics.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
54
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
55 The PREFETCHh instructions permits a program to load data into the processor
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
56 at a suggested cache level, so that it is closer to the processors load and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
57 store unit when it is needed. If the data is already present in a level of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
58 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
59 will not result in any data movement.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
60 But we should you PREFETCHNTA: Non-temporal data fetch data into location
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
61 close to the processor, minimizing cache pollution.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
62
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
63 The MOVNTQ (store quadword using non-temporal hint) instruction stores
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
64 packed integer data from an MMX register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
65 The MOVNTPS (store packed single-precision floating-point values using
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
66 non-temporal hint) instruction stores packed floating-point data from an
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
67 XMM register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
68
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
69 The SFENCE (Store Fence) instruction controls write ordering by creating a
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
70 fence for memory store operations. This instruction guarantees that the results
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
71 of every store instruction that precedes the store fence in program order is
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
72 globally visible before any store instruction that follows the fence. The
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
73 SFENCE instruction provides an efficient way of ensuring ordering between
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
74 procedures that produce weakly-ordered data and procedures that consume that
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
75 data.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
76
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
77 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
78 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
79
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
80 // 3dnow memcpy support from kernel 2.4.2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
81 // by Pontscho/fresh!mindworkz
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
82
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
83
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
84 #undef HAVE_ONLY_MMX1
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28290
diff changeset
85 #if HAVE_MMX && !HAVE_MMX2 && !HAVE_AMD3DNOW && !HAVE_SSE
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
86 /* means: mmx v.1. Note: Since we added alignment of destinition it speedups
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
87 of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
88 standard (non MMX-optimized) version.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
89 Note: on K6-2+ it speedups memory copying upto 25% and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
90 on K7 and P3 about 500% (5 times). */
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
91 #define HAVE_ONLY_MMX1
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
92 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
93
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
94
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
95 #undef HAVE_K6_2PLUS
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28290
diff changeset
96 #if !HAVE_MMX2 && HAVE_AMD3DNOW
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
97 #define HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
98 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
99
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
100 /* for small memory blocks (<256 bytes) this version is faster */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
101 #define small_memcpy(to,from,n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
102 {\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
103 register unsigned long int dummy;\
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
104 __asm__ volatile(\
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
105 "rep; movsb"\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
106 :"=&D"(to), "=&S"(from), "=&c"(dummy)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
107 /* It's most portable way to notify compiler */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
108 /* that edi, esi and ecx are clobbered in asm block. */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
109 /* Thanks to A'rpi for hint!!! */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
110 :"0" (to), "1" (from),"2" (n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
111 : "memory");\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
112 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
113
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
114 #undef MMREG_SIZE
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
115 #if HAVE_SSE
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
116 #define MMREG_SIZE 16
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
117 #else
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
118 #define MMREG_SIZE 64 //8
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
119 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
120
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
121 #undef PREFETCH
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
122 #undef EMMS
5660
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
123
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
124 #if HAVE_MMX2
5662
663ca5050f7e prefer prefetchnta if its available
michael
parents: 5660
diff changeset
125 #define PREFETCH "prefetchnta"
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28290
diff changeset
126 #elif HAVE_AMD3DNOW
5660
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
127 #define PREFETCH "prefetch"
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
128 #else
25973
ef4297ed0d12 libvo: change asm syntax to use ASMALIGN and " # nop"
uau
parents: 23378
diff changeset
129 #define PREFETCH " # nop"
5660
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
130 #endif
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
131
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
132 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28290
diff changeset
133 #if HAVE_AMD3DNOW
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
134 #define EMMS "femms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
135 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
136 #define EMMS "emms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
137 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
138
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
139 #undef MOVNTQ
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
140 #if HAVE_MMX2
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
141 #define MOVNTQ "movntq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
142 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
143 #define MOVNTQ "movq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
144 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
145
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
146 #undef MIN_LEN
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
147 #ifdef HAVE_ONLY_MMX1
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
148 #define MIN_LEN 0x800 /* 2K blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
149 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
150 #define MIN_LEN 0x40 /* 64-byte blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
151 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
152
7072
113d66d78967 removed nonsense 'inline'
arpi
parents: 5662
diff changeset
153 static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
154 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
155 void *retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
156 size_t i;
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
157 retval = to;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
158 #ifdef STATISTICS
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
159 {
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
160 static int freq[33];
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
161 static int t=0;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
162 int i;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
163 for(i=0; len>(1<<i); i++);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
164 freq[i]++;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
165 t++;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
166 if(1024*1024*1024 % t == 0)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
167 for(i=0; i<32; i++)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
168 printf("freq < %8d %4d\n", 1<<i, freq[i]);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
169 }
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
170 #endif
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
171 #ifndef HAVE_ONLY_MMX1
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
172 /* PREFETCH has effect even for MOVSB instruction ;) */
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
173 __asm__ volatile (
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
174 PREFETCH" (%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
175 PREFETCH" 64(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
176 PREFETCH" 128(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
177 PREFETCH" 192(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
178 PREFETCH" 256(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
179 : : "r" (from) );
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
180 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
181 if(len >= MIN_LEN)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
182 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
183 register unsigned long int delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
184 /* Align destinition to MMREG_SIZE -boundary */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
185 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
186 if(delta)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
187 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
188 delta=MMREG_SIZE-delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
189 len -= delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
190 small_memcpy(to, from, delta);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
191 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
192 i = len >> 6; /* len/64 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
193 len&=63;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
194 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
195 This algorithm is top effective when the code consequently
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
196 reads and writes blocks which have size of cache line.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
197 Size of cache line is processor-dependent.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
198 It will, however, be a minimum of 32 bytes on any processors.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
199 It would be better to have a number of instructions which
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
200 perform reading and writing to be multiple to a number of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
201 processor's decoders, but it's not always possible.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
202 */
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
203 #if HAVE_SSE /* Only P3 (may be Cyrix3) */
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
204 if(((unsigned long)from) & 15)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
205 /* if SRC is misaligned */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
206 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
207 {
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
208 __asm__ volatile (
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
209 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
210 "movups (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
211 "movups 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
212 "movups 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
213 "movups 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
214 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
215 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
216 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
217 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
218 :: "r" (from), "r" (to) : "memory");
14565
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
219 from=((const unsigned char *) from)+64;
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
220 to=((unsigned char *)to)+64;
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
221 }
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
222 else
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
223 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
224 Only if SRC is aligned on 16-byte boundary.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
225 It allows to use movaps instead of movups, which required data
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
226 to be aligned or a general-protection exception (#GP) is generated.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
227 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
228 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
229 {
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
230 __asm__ volatile (
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
231 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
232 "movaps (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
233 "movaps 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
234 "movaps 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
235 "movaps 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
236 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
237 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
238 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
239 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
240 :: "r" (from), "r" (to) : "memory");
14565
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
241 from=((const unsigned char *)from)+64;
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
242 to=((unsigned char *)to)+64;
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
243 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
244 #else
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
245 // Align destination at BLOCK_SIZE boundary
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
246 for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
247 {
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
248 __asm__ volatile (
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
249 #ifndef HAVE_ONLY_MMX1
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
250 PREFETCH" 320(%0)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
251 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
252 "movq (%0), %%mm0\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
253 "movq 8(%0), %%mm1\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
254 "movq 16(%0), %%mm2\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
255 "movq 24(%0), %%mm3\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
256 "movq 32(%0), %%mm4\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
257 "movq 40(%0), %%mm5\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
258 "movq 48(%0), %%mm6\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
259 "movq 56(%0), %%mm7\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
260 MOVNTQ" %%mm0, (%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
261 MOVNTQ" %%mm1, 8(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
262 MOVNTQ" %%mm2, 16(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
263 MOVNTQ" %%mm3, 24(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
264 MOVNTQ" %%mm4, 32(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
265 MOVNTQ" %%mm5, 40(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
266 MOVNTQ" %%mm6, 48(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
267 MOVNTQ" %%mm7, 56(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
268 :: "r" (from), "r" (to) : "memory");
15639
f26450da61a1 More gcc-4.0 fixes
gpoirier
parents: 14565
diff changeset
269 from=((const unsigned char *)from)+64;
f26450da61a1 More gcc-4.0 fixes
gpoirier
parents: 14565
diff changeset
270 to=((unsigned char *)to)+64;
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
271 }
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
272
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
273 // printf(" %d %d\n", (int)from&1023, (int)to&1023);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
274 // Pure Assembly cuz gcc is a bit unpredictable ;)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
275 if(i>=BLOCK_SIZE/64)
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25973
diff changeset
276 __asm__ volatile(
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
277 "xor %%"REG_a", %%"REG_a" \n\t"
25973
ef4297ed0d12 libvo: change asm syntax to use ASMALIGN and " # nop"
uau
parents: 23378
diff changeset
278 ASMALIGN(4)
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
279 "1: \n\t"
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
280 "movl (%0, %%"REG_a"), %%ebx \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
281 "movl 32(%0, %%"REG_a"), %%ebx \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
282 "movl 64(%0, %%"REG_a"), %%ebx \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
283 "movl 96(%0, %%"REG_a"), %%ebx \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
284 "add $128, %%"REG_a" \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
285 "cmp %3, %%"REG_a" \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
286 " jb 1b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
287
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
288 "xor %%"REG_a", %%"REG_a" \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
289
25973
ef4297ed0d12 libvo: change asm syntax to use ASMALIGN and " # nop"
uau
parents: 23378
diff changeset
290 ASMALIGN(4)
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
291 "2: \n\t"
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
292 "movq (%0, %%"REG_a"), %%mm0\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
293 "movq 8(%0, %%"REG_a"), %%mm1\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
294 "movq 16(%0, %%"REG_a"), %%mm2\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
295 "movq 24(%0, %%"REG_a"), %%mm3\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
296 "movq 32(%0, %%"REG_a"), %%mm4\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
297 "movq 40(%0, %%"REG_a"), %%mm5\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
298 "movq 48(%0, %%"REG_a"), %%mm6\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
299 "movq 56(%0, %%"REG_a"), %%mm7\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
300 MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
301 MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
302 MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
303 MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
304 MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
305 MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
306 MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
307 MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
308 "add $64, %%"REG_a" \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
309 "cmp %3, %%"REG_a" \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
310 "jb 2b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
311
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
312 #if CONFUSION_FACTOR > 0
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
313 // a few percent speedup on out of order executing CPUs
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
314 "mov %5, %%"REG_a" \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
315 "2: \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
316 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
317 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
318 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
319 "movl (%0), %%ebx \n\t"
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
320 "dec %%"REG_a" \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
321 " jnz 2b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
322 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
323
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
324 "xor %%"REG_a", %%"REG_a" \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
325 "add %3, %0 \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
326 "add %3, %1 \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
327 "sub %4, %2 \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
328 "cmp %4, %2 \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
329 " jae 1b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
330 : "+r" (from), "+r" (to), "+r" (i)
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
331 : "r" ((long)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((long)CONFUSION_FACTOR)
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
332 : "%"REG_a, "%ebx"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
333 );
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
334
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
335 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
336 {
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
337 __asm__ volatile (
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
338 #ifndef HAVE_ONLY_MMX1
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
339 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
340 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
341 "movq (%0), %%mm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
342 "movq 8(%0), %%mm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
343 "movq 16(%0), %%mm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
344 "movq 24(%0), %%mm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
345 "movq 32(%0), %%mm4\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
346 "movq 40(%0), %%mm5\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
347 "movq 48(%0), %%mm6\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
348 "movq 56(%0), %%mm7\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
349 MOVNTQ" %%mm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
350 MOVNTQ" %%mm1, 8(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
351 MOVNTQ" %%mm2, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
352 MOVNTQ" %%mm3, 24(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
353 MOVNTQ" %%mm4, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
354 MOVNTQ" %%mm5, 40(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
355 MOVNTQ" %%mm6, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
356 MOVNTQ" %%mm7, 56(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
357 :: "r" (from), "r" (to) : "memory");
15639
f26450da61a1 More gcc-4.0 fixes
gpoirier
parents: 14565
diff changeset
358 from=((const unsigned char *)from)+64;
f26450da61a1 More gcc-4.0 fixes
gpoirier
parents: 14565
diff changeset
359 to=((unsigned char *)to)+64;
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
360 }
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
361
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
362 #endif /* Have SSE */
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
363 #if HAVE_MMX2
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
364 /* since movntq is weakly-ordered, a "sfence"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
365 * is needed to become ordered again. */
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
366 __asm__ volatile ("sfence":::"memory");
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
367 #endif
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
368 #if !HAVE_SSE
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
369 /* enables to use FPU */
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
370 __asm__ volatile (EMMS:::"memory");
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
371 #endif
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
372 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
373 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
374 * Now do the tail of the block
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
375 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
376 if(len) small_memcpy(to, from, len);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
377 return retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
378 }
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
379
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
380 /**
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
381 * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
382 */
7072
113d66d78967 removed nonsense 'inline'
arpi
parents: 5662
diff changeset
383 static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
384 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
385 void *retval;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
386 size_t i;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
387 retval = to;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
388 #ifdef STATISTICS
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
389 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
390 static int freq[33];
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
391 static int t=0;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
392 int i;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
393 for(i=0; len>(1<<i); i++);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
394 freq[i]++;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
395 t++;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
396 if(1024*1024*1024 % t == 0)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
397 for(i=0; i<32; i++)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
398 printf("mem2agp freq < %8d %4d\n", 1<<i, freq[i]);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
399 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
400 #endif
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
401 if(len >= MIN_LEN)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
402 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
403 register unsigned long int delta;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
404 /* Align destinition to MMREG_SIZE -boundary */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
405 delta = ((unsigned long int)to)&7;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
406 if(delta)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
407 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
408 delta=8-delta;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
409 len -= delta;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
410 small_memcpy(to, from, delta);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
411 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
412 i = len >> 6; /* len/64 */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
413 len &= 63;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
414 /*
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
415 This algorithm is top effective when the code consequently
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
416 reads and writes blocks which have size of cache line.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
417 Size of cache line is processor-dependent.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
418 It will, however, be a minimum of 32 bytes on any processors.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
419 It would be better to have a number of instructions which
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
420 perform reading and writing to be multiple to a number of
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
421 processor's decoders, but it's not always possible.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
422 */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
423 for(; i>0; i--)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
424 {
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
425 __asm__ volatile (
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
426 PREFETCH" 320(%0)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
427 "movq (%0), %%mm0\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
428 "movq 8(%0), %%mm1\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
429 "movq 16(%0), %%mm2\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
430 "movq 24(%0), %%mm3\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
431 "movq 32(%0), %%mm4\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
432 "movq 40(%0), %%mm5\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
433 "movq 48(%0), %%mm6\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
434 "movq 56(%0), %%mm7\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
435 MOVNTQ" %%mm0, (%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
436 MOVNTQ" %%mm1, 8(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
437 MOVNTQ" %%mm2, 16(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
438 MOVNTQ" %%mm3, 24(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
439 MOVNTQ" %%mm4, 32(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
440 MOVNTQ" %%mm5, 40(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
441 MOVNTQ" %%mm6, 48(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
442 MOVNTQ" %%mm7, 56(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
443 :: "r" (from), "r" (to) : "memory");
14565
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
444 from=((const unsigned char *)from)+64;
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
445 to=((unsigned char *)to)+64;
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
446 }
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
447 #if HAVE_MMX2
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
448 /* since movntq is weakly-ordered, a "sfence"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
449 * is needed to become ordered again. */
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
450 __asm__ volatile ("sfence":::"memory");
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
451 #endif
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
452 /* enables to use FPU */
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
453 __asm__ volatile (EMMS:::"memory");
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
454 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
455 /*
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
456 * Now do the tail of the block
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
457 */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
458 if(len) small_memcpy(to, from, len);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
459 return retval;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
460 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
461