annotate libvo/aclib_template.c @ 34390:9082a0976655

Remove options cdrom_device and dvd_device from the Win32 GUI. Windows determines these devices by directly checking all drives for type CDROM (and may handle even multiple of them).
author ib
date Sat, 31 Dec 2011 13:15:30 +0000
parents 4e2f4bd081ce
children b4ce15212bfc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
1 /*
28446
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
2 * aclib - advanced C library ;)
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
3 * functions which improve and expand the standard C library
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
4 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
5 * This file is part of MPlayer.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
6 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
7 * MPlayer is free software; you can redistribute it and/or modify
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
8 * it under the terms of the GNU General Public License as published by
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
9 * the Free Software Foundation; either version 2 of the License, or
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
10 * (at your option) any later version.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
11 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
12 * MPlayer is distributed in the hope that it will be useful,
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
15 * GNU General Public License for more details.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
16 *
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
17 * You should have received a copy of the GNU General Public License along
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
18 * with MPlayer; if not, write to the Free Software Foundation, Inc.,
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
7681eab10aea Add standard license headers, unify header formatting.
diego
parents: 28335
diff changeset
20 */
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
21
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
22 #if !HAVE_SSE2
1123
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
23 /*
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
24 P3 processor has only one SSE decoder so can execute only 1 sse insn per
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
25 cpu clock, but it has 3 mmx decoders (include load/store unit)
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
26 and executes 3 mmx insns per cpu clock.
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
27 P4 processor has some chances, but after reading:
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
28 http://www.emulators.com/pentium4.htm
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
29 I have doubts. Anyway SSE2 version of this code can be written better.
5b69dabe5823 Issues about P3 performance and SSE2 support.
nickols_k
parents: 698
diff changeset
30 */
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
31 #undef HAVE_SSE
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
32 #define HAVE_SSE 0
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
33 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
34
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
35
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
36 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
37 This part of code was taken by me from Linux-2.4.3 and slightly modified
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
38 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
39 blocks but mplayer uses weakly ordered data and original sources can not
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
40 speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
41
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
42 >From IA-32 Intel Architecture Software Developer's Manual Volume 1,
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
43
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
44 Order Number 245470:
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
45 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
46
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
47 Data referenced by a program can be temporal (data will be used again) or
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
48 non-temporal (data will be referenced once and not reused in the immediate
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
49 future). To make efficient use of the processor's caches, it is generally
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
50 desirable to cache temporal data and not cache non-temporal data. Overloading
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
51 the processor's caches with non-temporal data is sometimes referred to as
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
52 "polluting the caches".
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
53 The non-temporal data is written to memory with Write-Combining semantics.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
54
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
55 The PREFETCHh instructions permits a program to load data into the processor
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
56 at a suggested cache level, so that it is closer to the processors load and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
57 store unit when it is needed. If the data is already present in a level of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
58 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
59 will not result in any data movement.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
60 But we should you PREFETCHNTA: Non-temporal data fetch data into location
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
61 close to the processor, minimizing cache pollution.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
62
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
63 The MOVNTQ (store quadword using non-temporal hint) instruction stores
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
64 packed integer data from an MMX register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
65 The MOVNTPS (store packed single-precision floating-point values using
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
66 non-temporal hint) instruction stores packed floating-point data from an
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
67 XMM register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
68
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
69 The SFENCE (Store Fence) instruction controls write ordering by creating a
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
70 fence for memory store operations. This instruction guarantees that the results
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
71 of every store instruction that precedes the store fence in program order is
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
72 globally visible before any store instruction that follows the fence. The
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
73 SFENCE instruction provides an efficient way of ensuring ordering between
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
74 procedures that produce weakly-ordered data and procedures that consume that
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
75 data.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
76
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
77 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
78 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
79
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
80 // 3dnow memcpy support from kernel 2.4.2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
81 // by Pontscho/fresh!mindworkz
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
82
34239
4e2f4bd081ce libvo: Move x86_cpu.h #include to aclib_template, where it is actually used.
diego
parents: 30633
diff changeset
83 #include "libavutil/x86_cpu.h"
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
84
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
85 #undef HAVE_ONLY_MMX1
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28290
diff changeset
86 #if HAVE_MMX && !HAVE_MMX2 && !HAVE_AMD3DNOW && !HAVE_SSE
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
87 /* means: mmx v.1. Note: Since we added alignment of destinition it speedups
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
88 of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
89 standard (non MMX-optimized) version.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
90 Note: on K6-2+ it speedups memory copying upto 25% and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
91 on K7 and P3 about 500% (5 times). */
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
92 #define HAVE_ONLY_MMX1
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
93 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
94
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
95
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
96 #undef HAVE_K6_2PLUS
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28290
diff changeset
97 #if !HAVE_MMX2 && HAVE_AMD3DNOW
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
98 #define HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
99 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
100
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
101 /* for small memory blocks (<256 bytes) this version is faster */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
102 #define small_memcpy(to,from,n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
103 {\
30135
807fce7a4bb3 Do not assume that "long" is the size of a register.
reimar
parents: 29645
diff changeset
104 register x86_reg dummy;\
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
105 __asm__ volatile(\
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
106 "rep; movsb"\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
107 :"=&D"(to), "=&S"(from), "=&c"(dummy)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
108 /* It's most portable way to notify compiler */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
109 /* that edi, esi and ecx are clobbered in asm block. */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
110 /* Thanks to A'rpi for hint!!! */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
111 :"0" (to), "1" (from),"2" (n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
112 : "memory");\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
113 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
114
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
115 #undef MMREG_SIZE
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
116 #if HAVE_SSE
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
117 #define MMREG_SIZE 16
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
118 #else
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
119 #define MMREG_SIZE 64 //8
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
120 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
121
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
122 #undef PREFETCH
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
123 #undef EMMS
5660
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
124
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
125 #if HAVE_MMX2
5662
663ca5050f7e prefer prefetchnta if its available
michael
parents: 5660
diff changeset
126 #define PREFETCH "prefetchnta"
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28290
diff changeset
127 #elif HAVE_AMD3DNOW
5660
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
128 #define PREFETCH "prefetch"
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
129 #else
25973
ef4297ed0d12 libvo: change asm syntax to use ASMALIGN and " # nop"
uau
parents: 23378
diff changeset
130 #define PREFETCH " # nop"
5660
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
131 #endif
4dcc7af65eec pre mmx2/3dnow fix
michael
parents: 4684
diff changeset
132
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
133 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
28335
31287e75b5d8 HAVE_3DNOW --> HAVE_AMD3DNOW
diego
parents: 28290
diff changeset
134 #if HAVE_AMD3DNOW
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
135 #define EMMS "femms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
136 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
137 #define EMMS "emms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
138 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
139
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
140 #undef MOVNTQ
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
141 #if HAVE_MMX2
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
142 #define MOVNTQ "movntq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
143 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
144 #define MOVNTQ "movq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
145 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
146
3393
3624cd351618 runtime cpu detection
michael
parents: 3077
diff changeset
147 #undef MIN_LEN
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
148 #ifdef HAVE_ONLY_MMX1
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
149 #define MIN_LEN 0x800 /* 2K blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
150 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
151 #define MIN_LEN 0x40 /* 64-byte blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
152 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
153
7072
113d66d78967 removed nonsense 'inline'
arpi
parents: 5662
diff changeset
154 static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
155 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
156 void *retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
157 size_t i;
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
158 retval = to;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
159 #ifdef STATISTICS
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
160 {
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
161 static int freq[33];
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
162 static int t=0;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
163 int i;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
164 for(i=0; len>(1<<i); i++);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
165 freq[i]++;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
166 t++;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
167 if(1024*1024*1024 % t == 0)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
168 for(i=0; i<32; i++)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
169 printf("freq < %8d %4d\n", 1<<i, freq[i]);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
170 }
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
171 #endif
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
172 #ifndef HAVE_ONLY_MMX1
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
173 /* PREFETCH has effect even for MOVSB instruction ;) */
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
174 __asm__ volatile (
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
175 PREFETCH" (%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
176 PREFETCH" 64(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
177 PREFETCH" 128(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
178 PREFETCH" 192(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
179 PREFETCH" 256(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
180 : : "r" (from) );
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
181 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
182 if(len >= MIN_LEN)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
183 {
30135
807fce7a4bb3 Do not assume that "long" is the size of a register.
reimar
parents: 29645
diff changeset
184 register x86_reg delta;
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
185 /* Align destinition to MMREG_SIZE -boundary */
30135
807fce7a4bb3 Do not assume that "long" is the size of a register.
reimar
parents: 29645
diff changeset
186 delta = ((intptr_t)to)&(MMREG_SIZE-1);
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
187 if(delta)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
188 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
189 delta=MMREG_SIZE-delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
190 len -= delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
191 small_memcpy(to, from, delta);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
192 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
193 i = len >> 6; /* len/64 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
194 len&=63;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
195 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
196 This algorithm is top effective when the code consequently
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
197 reads and writes blocks which have size of cache line.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
198 Size of cache line is processor-dependent.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
199 It will, however, be a minimum of 32 bytes on any processors.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
200 It would be better to have a number of instructions which
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
201 perform reading and writing to be multiple to a number of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
202 processor's decoders, but it's not always possible.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
203 */
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
204 #if HAVE_SSE /* Only P3 (may be Cyrix3) */
30135
807fce7a4bb3 Do not assume that "long" is the size of a register.
reimar
parents: 29645
diff changeset
205 if(((intptr_t)from) & 15)
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
206 /* if SRC is misaligned */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
207 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
208 {
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
209 __asm__ volatile (
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
210 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
211 "movups (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
212 "movups 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
213 "movups 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
214 "movups 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
215 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
216 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
217 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
218 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
219 :: "r" (from), "r" (to) : "memory");
14565
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
220 from=((const unsigned char *) from)+64;
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
221 to=((unsigned char *)to)+64;
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
222 }
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
223 else
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
224 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
225 Only if SRC is aligned on 16-byte boundary.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
226 It allows to use movaps instead of movups, which required data
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
227 to be aligned or a general-protection exception (#GP) is generated.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
228 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
229 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
230 {
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
231 __asm__ volatile (
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
232 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
233 "movaps (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
234 "movaps 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
235 "movaps 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
236 "movaps 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
237 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
238 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
239 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
240 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
241 :: "r" (from), "r" (to) : "memory");
14565
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
242 from=((const unsigned char *)from)+64;
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
243 to=((unsigned char *)to)+64;
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
244 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
245 #else
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
246 // Align destination at BLOCK_SIZE boundary
30135
807fce7a4bb3 Do not assume that "long" is the size of a register.
reimar
parents: 29645
diff changeset
247 for(; ((intptr_t)to & (BLOCK_SIZE-1)) && i>0; i--)
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
248 {
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
249 __asm__ volatile (
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
250 #ifndef HAVE_ONLY_MMX1
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
251 PREFETCH" 320(%0)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
252 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
253 "movq (%0), %%mm0\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
254 "movq 8(%0), %%mm1\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
255 "movq 16(%0), %%mm2\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
256 "movq 24(%0), %%mm3\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
257 "movq 32(%0), %%mm4\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
258 "movq 40(%0), %%mm5\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
259 "movq 48(%0), %%mm6\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
260 "movq 56(%0), %%mm7\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
261 MOVNTQ" %%mm0, (%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
262 MOVNTQ" %%mm1, 8(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
263 MOVNTQ" %%mm2, 16(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
264 MOVNTQ" %%mm3, 24(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
265 MOVNTQ" %%mm4, 32(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
266 MOVNTQ" %%mm5, 40(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
267 MOVNTQ" %%mm6, 48(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
268 MOVNTQ" %%mm7, 56(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
269 :: "r" (from), "r" (to) : "memory");
15639
f26450da61a1 More gcc-4.0 fixes
gpoirier
parents: 14565
diff changeset
270 from=((const unsigned char *)from)+64;
f26450da61a1 More gcc-4.0 fixes
gpoirier
parents: 14565
diff changeset
271 to=((unsigned char *)to)+64;
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
272 }
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
273
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
274 // printf(" %d %d\n", (int)from&1023, (int)to&1023);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
275 // Pure Assembly cuz gcc is a bit unpredictable ;)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
276 if(i>=BLOCK_SIZE/64)
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 25973
diff changeset
277 __asm__ volatile(
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
278 "xor %%"REG_a", %%"REG_a" \n\t"
25973
ef4297ed0d12 libvo: change asm syntax to use ASMALIGN and " # nop"
uau
parents: 23378
diff changeset
279 ASMALIGN(4)
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
280 "1: \n\t"
29645
7eb282a13214 Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents: 28446
diff changeset
281 "movl (%0, %%"REG_a"), %%ecx \n\t"
7eb282a13214 Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents: 28446
diff changeset
282 "movl 32(%0, %%"REG_a"), %%ecx \n\t"
7eb282a13214 Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents: 28446
diff changeset
283 "movl 64(%0, %%"REG_a"), %%ecx \n\t"
7eb282a13214 Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents: 28446
diff changeset
284 "movl 96(%0, %%"REG_a"), %%ecx \n\t"
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
285 "add $128, %%"REG_a" \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
286 "cmp %3, %%"REG_a" \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
287 " jb 1b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
288
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
289 "xor %%"REG_a", %%"REG_a" \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
290
25973
ef4297ed0d12 libvo: change asm syntax to use ASMALIGN and " # nop"
uau
parents: 23378
diff changeset
291 ASMALIGN(4)
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
292 "2: \n\t"
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
293 "movq (%0, %%"REG_a"), %%mm0\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
294 "movq 8(%0, %%"REG_a"), %%mm1\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
295 "movq 16(%0, %%"REG_a"), %%mm2\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
296 "movq 24(%0, %%"REG_a"), %%mm3\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
297 "movq 32(%0, %%"REG_a"), %%mm4\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
298 "movq 40(%0, %%"REG_a"), %%mm5\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
299 "movq 48(%0, %%"REG_a"), %%mm6\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
300 "movq 56(%0, %%"REG_a"), %%mm7\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
301 MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
302 MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
303 MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
304 MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
305 MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
306 MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
307 MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
308 MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
309 "add $64, %%"REG_a" \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
310 "cmp %3, %%"REG_a" \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
311 "jb 2b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
312
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
313 #if CONFUSION_FACTOR > 0
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
314 // a few percent speedup on out of order executing CPUs
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
315 "mov %5, %%"REG_a" \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
316 "2: \n\t"
29645
7eb282a13214 Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents: 28446
diff changeset
317 "movl (%0), %%ecx \n\t"
7eb282a13214 Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents: 28446
diff changeset
318 "movl (%0), %%ecx \n\t"
7eb282a13214 Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents: 28446
diff changeset
319 "movl (%0), %%ecx \n\t"
7eb282a13214 Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents: 28446
diff changeset
320 "movl (%0), %%ecx \n\t"
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
321 "dec %%"REG_a" \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
322 " jnz 2b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
323 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
324
13720
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
325 "xor %%"REG_a", %%"REG_a" \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
326 "add %3, %0 \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
327 "add %3, %1 \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
328 "sub %4, %2 \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents: 7072
diff changeset
329 "cmp %4, %2 \n\t"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
330 " jae 1b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
331 : "+r" (from), "+r" (to), "+r" (i)
30135
807fce7a4bb3 Do not assume that "long" is the size of a register.
reimar
parents: 29645
diff changeset
332 : "r" ((x86_reg)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((x86_reg)CONFUSION_FACTOR)
29645
7eb282a13214 Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents: 28446
diff changeset
333 : "%"REG_a, "%ecx"
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
334 );
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
335
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
336 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
337 {
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
338 __asm__ volatile (
23378
ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents: 15639
diff changeset
339 #ifndef HAVE_ONLY_MMX1
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
340 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
341 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
342 "movq (%0), %%mm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
343 "movq 8(%0), %%mm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
344 "movq 16(%0), %%mm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
345 "movq 24(%0), %%mm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
346 "movq 32(%0), %%mm4\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
347 "movq 40(%0), %%mm5\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
348 "movq 48(%0), %%mm6\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
349 "movq 56(%0), %%mm7\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
350 MOVNTQ" %%mm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
351 MOVNTQ" %%mm1, 8(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
352 MOVNTQ" %%mm2, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
353 MOVNTQ" %%mm3, 24(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
354 MOVNTQ" %%mm4, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
355 MOVNTQ" %%mm5, 40(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
356 MOVNTQ" %%mm6, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
357 MOVNTQ" %%mm7, 56(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
358 :: "r" (from), "r" (to) : "memory");
15639
f26450da61a1 More gcc-4.0 fixes
gpoirier
parents: 14565
diff changeset
359 from=((const unsigned char *)from)+64;
f26450da61a1 More gcc-4.0 fixes
gpoirier
parents: 14565
diff changeset
360 to=((unsigned char *)to)+64;
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
361 }
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
362
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
363 #endif /* Have SSE */
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
364 #if HAVE_MMX2
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
365 /* since movntq is weakly-ordered, a "sfence"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
366 * is needed to become ordered again. */
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
367 __asm__ volatile ("sfence":::"memory");
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
368 #endif
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
369 #if !HAVE_SSE
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
370 /* enables to use FPU */
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
371 __asm__ volatile (EMMS:::"memory");
3077
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents: 1123
diff changeset
372 #endif
698
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
373 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
374 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
375 * Now do the tail of the block
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
376 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
377 if(len) small_memcpy(to, from, len);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
378 return retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff changeset
379 }
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
380
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
381 /**
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
382 * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
383 */
7072
113d66d78967 removed nonsense 'inline'
arpi
parents: 5662
diff changeset
384 static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
385 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
386 void *retval;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
387 size_t i;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
388 retval = to;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
389 #ifdef STATISTICS
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
390 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
391 static int freq[33];
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
392 static int t=0;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
393 int i;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
394 for(i=0; len>(1<<i); i++);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
395 freq[i]++;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
396 t++;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
397 if(1024*1024*1024 % t == 0)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
398 for(i=0; i<32; i++)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
399 printf("mem2agp freq < %8d %4d\n", 1<<i, freq[i]);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
400 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
401 #endif
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
402 if(len >= MIN_LEN)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
403 {
30135
807fce7a4bb3 Do not assume that "long" is the size of a register.
reimar
parents: 29645
diff changeset
404 register x86_reg delta;
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
405 /* Align destinition to MMREG_SIZE -boundary */
30135
807fce7a4bb3 Do not assume that "long" is the size of a register.
reimar
parents: 29645
diff changeset
406 delta = ((intptr_t)to)&7;
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
407 if(delta)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
408 {
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
409 delta=8-delta;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
410 len -= delta;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
411 small_memcpy(to, from, delta);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
412 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
413 i = len >> 6; /* len/64 */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
414 len &= 63;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
415 /*
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
416 This algorithm is top effective when the code consequently
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
417 reads and writes blocks which have size of cache line.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
418 Size of cache line is processor-dependent.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
419 It will, however, be a minimum of 32 bytes on any processors.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
420 It would be better to have a number of instructions which
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
421 perform reading and writing to be multiple to a number of
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
422 processor's decoders, but it's not always possible.
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
423 */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
424 for(; i>0; i--)
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
425 {
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
426 __asm__ volatile (
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
427 PREFETCH" 320(%0)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
428 "movq (%0), %%mm0\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
429 "movq 8(%0), %%mm1\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
430 "movq 16(%0), %%mm2\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
431 "movq 24(%0), %%mm3\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
432 "movq 32(%0), %%mm4\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
433 "movq 40(%0), %%mm5\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
434 "movq 48(%0), %%mm6\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
435 "movq 56(%0), %%mm7\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
436 MOVNTQ" %%mm0, (%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
437 MOVNTQ" %%mm1, 8(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
438 MOVNTQ" %%mm2, 16(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
439 MOVNTQ" %%mm3, 24(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
440 MOVNTQ" %%mm4, 32(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
441 MOVNTQ" %%mm5, 40(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
442 MOVNTQ" %%mm6, 48(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
443 MOVNTQ" %%mm7, 56(%1)\n"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
444 :: "r" (from), "r" (to) : "memory");
14565
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
445 from=((const unsigned char *)from)+64;
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents: 13720
diff changeset
446 to=((unsigned char *)to)+64;
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
447 }
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27757
diff changeset
448 #if HAVE_MMX2
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
449 /* since movntq is weakly-ordered, a "sfence"
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
450 * is needed to become ordered again. */
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
451 __asm__ volatile ("sfence":::"memory");
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
452 #endif
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
453 /* enables to use FPU */
27757
b5a46071062a Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents: 27754
diff changeset
454 __asm__ volatile (EMMS:::"memory");
4681
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
455 }
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
456 /*
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
457 * Now do the tail of the block
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
458 */
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
459 if(len) small_memcpy(to, from, len);
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
460 return retval;
8db59073127e mem2agpcpy()
michael
parents: 3393
diff changeset
461 }