Mercurial > mplayer.hg
annotate libvo/aclib_template.c @ 33028:038ff73f9894
Enable both frame and slice threading.
author | reimar |
---|---|
date | Sat, 26 Mar 2011 13:00:11 +0000 |
parents | 32725ca88fed |
children | 4e2f4bd081ce |
rev | line source |
---|---|
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
1 /* |
28446
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
2 * aclib - advanced C library ;) |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
3 * functions which improve and expand the standard C library |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
4 * |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
5 * This file is part of MPlayer. |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
6 * |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
7 * MPlayer is free software; you can redistribute it and/or modify |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
8 * it under the terms of the GNU General Public License as published by |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
9 * the Free Software Foundation; either version 2 of the License, or |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
10 * (at your option) any later version. |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
11 * |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
12 * MPlayer is distributed in the hope that it will be useful, |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
15 * GNU General Public License for more details. |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
16 * |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
17 * You should have received a copy of the GNU General Public License along |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
18 * with MPlayer; if not, write to the Free Software Foundation, Inc., |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
7681eab10aea
Add standard license headers, unify header formatting.
diego
parents:
28335
diff
changeset
|
20 */ |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
21 |
28290 | 22 #if !HAVE_SSE2 |
1123 | 23 /* |
24 P3 processor has only one SSE decoder so can execute only 1 sse insn per | |
25 cpu clock, but it has 3 mmx decoders (include load/store unit) | |
26 and executes 3 mmx insns per cpu clock. | |
27 P4 processor has some chances, but after reading: | |
28 http://www.emulators.com/pentium4.htm | |
29 I have doubts. Anyway SSE2 version of this code can be written better. | |
30 */ | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
31 #undef HAVE_SSE |
28290 | 32 #define HAVE_SSE 0 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
33 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
34 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
35 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
36 /* |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
37 This part of code was taken by me from Linux-2.4.3 and slightly modified |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
38 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
39 blocks but mplayer uses weakly ordered data and original sources can not |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
40 speedup them. Only using PREFETCHNTA and MOVNTQ together have effect! |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
41 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
42 >From IA-32 Intel Architecture Software Developer's Manual Volume 1, |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
43 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
44 Order Number 245470: |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
45 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
46 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
47 Data referenced by a program can be temporal (data will be used again) or |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
48 non-temporal (data will be referenced once and not reused in the immediate |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
49 future). To make efficient use of the processor's caches, it is generally |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
50 desirable to cache temporal data and not cache non-temporal data. Overloading |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
51 the processor's caches with non-temporal data is sometimes referred to as |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
52 "polluting the caches". |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
53 The non-temporal data is written to memory with Write-Combining semantics. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
54 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
55 The PREFETCHh instructions permits a program to load data into the processor |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
56 at a suggested cache level, so that it is closer to the processors load and |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
57 store unit when it is needed. If the data is already present in a level of |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
58 the cache hierarchy that is closer to the processor, the PREFETCHh instruction |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
59 will not result in any data movement. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
60 But we should you PREFETCHNTA: Non-temporal data fetch data into location |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
61 close to the processor, minimizing cache pollution. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
62 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
63 The MOVNTQ (store quadword using non-temporal hint) instruction stores |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
64 packed integer data from an MMX register to memory, using a non-temporal hint. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
65 The MOVNTPS (store packed single-precision floating-point values using |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
66 non-temporal hint) instruction stores packed floating-point data from an |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
67 XMM register to memory, using a non-temporal hint. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
68 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
69 The SFENCE (Store Fence) instruction controls write ordering by creating a |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
70 fence for memory store operations. This instruction guarantees that the results |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
71 of every store instruction that precedes the store fence in program order is |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
72 globally visible before any store instruction that follows the fence. The |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
73 SFENCE instruction provides an efficient way of ensuring ordering between |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
74 procedures that produce weakly-ordered data and procedures that consume that |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
75 data. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
76 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
77 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
78 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
79 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
80 // 3dnow memcpy support from kernel 2.4.2 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
81 // by Pontscho/fresh!mindworkz |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
82 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
83 |
23378
ef54df9f07d3
HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents:
15639
diff
changeset
|
84 #undef HAVE_ONLY_MMX1 |
28335 | 85 #if HAVE_MMX && !HAVE_MMX2 && !HAVE_AMD3DNOW && !HAVE_SSE |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
86 /* means: mmx v.1. Note: Since we added alignment of destinition it speedups |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
87 of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
88 standard (non MMX-optimized) version. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
89 Note: on K6-2+ it speedups memory copying upto 25% and |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
90 on K7 and P3 about 500% (5 times). */ |
23378
ef54df9f07d3
HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents:
15639
diff
changeset
|
91 #define HAVE_ONLY_MMX1 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
92 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
93 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
94 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
95 #undef HAVE_K6_2PLUS |
28335 | 96 #if !HAVE_MMX2 && HAVE_AMD3DNOW |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
97 #define HAVE_K6_2PLUS |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
98 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
99 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
100 /* for small memory blocks (<256 bytes) this version is faster */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
101 #define small_memcpy(to,from,n)\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
102 {\ |
30135
807fce7a4bb3
Do not assume that "long" is the size of a register.
reimar
parents:
29645
diff
changeset
|
103 register x86_reg dummy;\ |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
104 __asm__ volatile(\ |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
105 "rep; movsb"\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
106 :"=&D"(to), "=&S"(from), "=&c"(dummy)\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
107 /* It's most portable way to notify compiler */\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
108 /* that edi, esi and ecx are clobbered in asm block. */\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
109 /* Thanks to A'rpi for hint!!! */\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
110 :"0" (to), "1" (from),"2" (n)\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
111 : "memory");\ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
112 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
113 |
3393 | 114 #undef MMREG_SIZE |
28290 | 115 #if HAVE_SSE |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
116 #define MMREG_SIZE 16 |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
117 #else |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
118 #define MMREG_SIZE 64 //8 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
119 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
120 |
3393 | 121 #undef PREFETCH |
122 #undef EMMS | |
5660 | 123 |
28290 | 124 #if HAVE_MMX2 |
5662 | 125 #define PREFETCH "prefetchnta" |
28335 | 126 #elif HAVE_AMD3DNOW |
5660 | 127 #define PREFETCH "prefetch" |
128 #else | |
25973
ef4297ed0d12
libvo: change asm syntax to use ASMALIGN and " # nop"
uau
parents:
23378
diff
changeset
|
129 #define PREFETCH " # nop" |
5660 | 130 #endif |
131 | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
132 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */ |
28335 | 133 #if HAVE_AMD3DNOW |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
134 #define EMMS "femms" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
135 #else |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
136 #define EMMS "emms" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
137 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
138 |
3393 | 139 #undef MOVNTQ |
28290 | 140 #if HAVE_MMX2 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
141 #define MOVNTQ "movntq" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
142 #else |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
143 #define MOVNTQ "movq" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
144 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
145 |
3393 | 146 #undef MIN_LEN |
23378
ef54df9f07d3
HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents:
15639
diff
changeset
|
147 #ifdef HAVE_ONLY_MMX1 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
148 #define MIN_LEN 0x800 /* 2K blocks */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
149 #else |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
150 #define MIN_LEN 0x40 /* 64-byte blocks */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
151 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
152 |
7072 | 153 static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len) |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
154 { |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
155 void *retval; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
156 size_t i; |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
157 retval = to; |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
158 #ifdef STATISTICS |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
159 { |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
160 static int freq[33]; |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
161 static int t=0; |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
162 int i; |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
163 for(i=0; len>(1<<i); i++); |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
164 freq[i]++; |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
165 t++; |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
166 if(1024*1024*1024 % t == 0) |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
167 for(i=0; i<32; i++) |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
168 printf("freq < %8d %4d\n", 1<<i, freq[i]); |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
169 } |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
170 #endif |
23378
ef54df9f07d3
HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents:
15639
diff
changeset
|
171 #ifndef HAVE_ONLY_MMX1 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
172 /* PREFETCH has effect even for MOVSB instruction ;) */ |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
173 __asm__ volatile ( |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
174 PREFETCH" (%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
175 PREFETCH" 64(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
176 PREFETCH" 128(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
177 PREFETCH" 192(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
178 PREFETCH" 256(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
179 : : "r" (from) ); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
180 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
181 if(len >= MIN_LEN) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
182 { |
30135
807fce7a4bb3
Do not assume that "long" is the size of a register.
reimar
parents:
29645
diff
changeset
|
183 register x86_reg delta; |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
184 /* Align destinition to MMREG_SIZE -boundary */ |
30135
807fce7a4bb3
Do not assume that "long" is the size of a register.
reimar
parents:
29645
diff
changeset
|
185 delta = ((intptr_t)to)&(MMREG_SIZE-1); |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
186 if(delta) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
187 { |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
188 delta=MMREG_SIZE-delta; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
189 len -= delta; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
190 small_memcpy(to, from, delta); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
191 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
192 i = len >> 6; /* len/64 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
193 len&=63; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
194 /* |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
195 This algorithm is top effective when the code consequently |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
196 reads and writes blocks which have size of cache line. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
197 Size of cache line is processor-dependent. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
198 It will, however, be a minimum of 32 bytes on any processors. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
199 It would be better to have a number of instructions which |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
200 perform reading and writing to be multiple to a number of |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
201 processor's decoders, but it's not always possible. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
202 */ |
28290 | 203 #if HAVE_SSE /* Only P3 (may be Cyrix3) */ |
30135
807fce7a4bb3
Do not assume that "long" is the size of a register.
reimar
parents:
29645
diff
changeset
|
204 if(((intptr_t)from) & 15) |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
205 /* if SRC is misaligned */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
206 for(; i>0; i--) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
207 { |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
208 __asm__ volatile ( |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
209 PREFETCH" 320(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
210 "movups (%0), %%xmm0\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
211 "movups 16(%0), %%xmm1\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
212 "movups 32(%0), %%xmm2\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
213 "movups 48(%0), %%xmm3\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
214 "movntps %%xmm0, (%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
215 "movntps %%xmm1, 16(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
216 "movntps %%xmm2, 32(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
217 "movntps %%xmm3, 48(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
218 :: "r" (from), "r" (to) : "memory"); |
14565
1a13df0d4fc2
Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents:
13720
diff
changeset
|
219 from=((const unsigned char *) from)+64; |
1a13df0d4fc2
Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents:
13720
diff
changeset
|
220 to=((unsigned char *)to)+64; |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
221 } |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
222 else |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
223 /* |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
224 Only if SRC is aligned on 16-byte boundary. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
225 It allows to use movaps instead of movups, which required data |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
226 to be aligned or a general-protection exception (#GP) is generated. |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
227 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
228 for(; i>0; i--) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
229 { |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
230 __asm__ volatile ( |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
231 PREFETCH" 320(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
232 "movaps (%0), %%xmm0\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
233 "movaps 16(%0), %%xmm1\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
234 "movaps 32(%0), %%xmm2\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
235 "movaps 48(%0), %%xmm3\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
236 "movntps %%xmm0, (%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
237 "movntps %%xmm1, 16(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
238 "movntps %%xmm2, 32(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
239 "movntps %%xmm3, 48(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
240 :: "r" (from), "r" (to) : "memory"); |
14565
1a13df0d4fc2
Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents:
13720
diff
changeset
|
241 from=((const unsigned char *)from)+64; |
1a13df0d4fc2
Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents:
13720
diff
changeset
|
242 to=((unsigned char *)to)+64; |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
243 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
244 #else |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
245 // Align destination at BLOCK_SIZE boundary |
30135
807fce7a4bb3
Do not assume that "long" is the size of a register.
reimar
parents:
29645
diff
changeset
|
246 for(; ((intptr_t)to & (BLOCK_SIZE-1)) && i>0; i--) |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
247 { |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
248 __asm__ volatile ( |
23378
ef54df9f07d3
HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents:
15639
diff
changeset
|
249 #ifndef HAVE_ONLY_MMX1 |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
250 PREFETCH" 320(%0)\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
251 #endif |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
252 "movq (%0), %%mm0\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
253 "movq 8(%0), %%mm1\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
254 "movq 16(%0), %%mm2\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
255 "movq 24(%0), %%mm3\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
256 "movq 32(%0), %%mm4\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
257 "movq 40(%0), %%mm5\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
258 "movq 48(%0), %%mm6\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
259 "movq 56(%0), %%mm7\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
260 MOVNTQ" %%mm0, (%1)\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
261 MOVNTQ" %%mm1, 8(%1)\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
262 MOVNTQ" %%mm2, 16(%1)\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
263 MOVNTQ" %%mm3, 24(%1)\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
264 MOVNTQ" %%mm4, 32(%1)\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
265 MOVNTQ" %%mm5, 40(%1)\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
266 MOVNTQ" %%mm6, 48(%1)\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
267 MOVNTQ" %%mm7, 56(%1)\n" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
268 :: "r" (from), "r" (to) : "memory"); |
15639 | 269 from=((const unsigned char *)from)+64; |
270 to=((unsigned char *)to)+64; | |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
271 } |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
272 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
273 // printf(" %d %d\n", (int)from&1023, (int)to&1023); |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
274 // Pure Assembly cuz gcc is a bit unpredictable ;) |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
275 if(i>=BLOCK_SIZE/64) |
27754
08d18fe9da52
Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents:
25973
diff
changeset
|
276 __asm__ volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
277 "xor %%"REG_a", %%"REG_a" \n\t" |
25973
ef4297ed0d12
libvo: change asm syntax to use ASMALIGN and " # nop"
uau
parents:
23378
diff
changeset
|
278 ASMALIGN(4) |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
279 "1: \n\t" |
29645
7eb282a13214
Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents:
28446
diff
changeset
|
280 "movl (%0, %%"REG_a"), %%ecx \n\t" |
7eb282a13214
Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents:
28446
diff
changeset
|
281 "movl 32(%0, %%"REG_a"), %%ecx \n\t" |
7eb282a13214
Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents:
28446
diff
changeset
|
282 "movl 64(%0, %%"REG_a"), %%ecx \n\t" |
7eb282a13214
Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents:
28446
diff
changeset
|
283 "movl 96(%0, %%"REG_a"), %%ecx \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
284 "add $128, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
285 "cmp %3, %%"REG_a" \n\t" |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
286 " jb 1b \n\t" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
287 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
288 "xor %%"REG_a", %%"REG_a" \n\t" |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
289 |
25973
ef4297ed0d12
libvo: change asm syntax to use ASMALIGN and " # nop"
uau
parents:
23378
diff
changeset
|
290 ASMALIGN(4) |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
291 "2: \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
292 "movq (%0, %%"REG_a"), %%mm0\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
293 "movq 8(%0, %%"REG_a"), %%mm1\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
294 "movq 16(%0, %%"REG_a"), %%mm2\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
295 "movq 24(%0, %%"REG_a"), %%mm3\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
296 "movq 32(%0, %%"REG_a"), %%mm4\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
297 "movq 40(%0, %%"REG_a"), %%mm5\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
298 "movq 48(%0, %%"REG_a"), %%mm6\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
299 "movq 56(%0, %%"REG_a"), %%mm7\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
300 MOVNTQ" %%mm0, (%1, %%"REG_a")\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
301 MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
302 MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
303 MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
304 MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
305 MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
306 MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
307 MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
308 "add $64, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
309 "cmp %3, %%"REG_a" \n\t" |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
310 "jb 2b \n\t" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
311 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
312 #if CONFUSION_FACTOR > 0 |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
313 // a few percent speedup on out of order executing CPUs |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
314 "mov %5, %%"REG_a" \n\t" |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
315 "2: \n\t" |
29645
7eb282a13214
Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents:
28446
diff
changeset
|
316 "movl (%0), %%ecx \n\t" |
7eb282a13214
Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents:
28446
diff
changeset
|
317 "movl (%0), %%ecx \n\t" |
7eb282a13214
Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents:
28446
diff
changeset
|
318 "movl (%0), %%ecx \n\t" |
7eb282a13214
Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents:
28446
diff
changeset
|
319 "movl (%0), %%ecx \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
320 "dec %%"REG_a" \n\t" |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
321 " jnz 2b \n\t" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
322 #endif |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
323 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
324 "xor %%"REG_a", %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
325 "add %3, %0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
326 "add %3, %1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
327 "sub %4, %2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
7072
diff
changeset
|
328 "cmp %4, %2 \n\t" |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
329 " jae 1b \n\t" |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
330 : "+r" (from), "+r" (to), "+r" (i) |
30135
807fce7a4bb3
Do not assume that "long" is the size of a register.
reimar
parents:
29645
diff
changeset
|
331 : "r" ((x86_reg)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((x86_reg)CONFUSION_FACTOR) |
29645
7eb282a13214
Use ecx instead of ebx to avoid unnecessary issues with PIC.
reimar
parents:
28446
diff
changeset
|
332 : "%"REG_a, "%ecx" |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
333 ); |
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
334 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
335 for(; i>0; i--) |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
336 { |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
337 __asm__ volatile ( |
23378
ef54df9f07d3
HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...)
michael
parents:
15639
diff
changeset
|
338 #ifndef HAVE_ONLY_MMX1 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
339 PREFETCH" 320(%0)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
340 #endif |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
341 "movq (%0), %%mm0\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
342 "movq 8(%0), %%mm1\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
343 "movq 16(%0), %%mm2\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
344 "movq 24(%0), %%mm3\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
345 "movq 32(%0), %%mm4\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
346 "movq 40(%0), %%mm5\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
347 "movq 48(%0), %%mm6\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
348 "movq 56(%0), %%mm7\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
349 MOVNTQ" %%mm0, (%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
350 MOVNTQ" %%mm1, 8(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
351 MOVNTQ" %%mm2, 16(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
352 MOVNTQ" %%mm3, 24(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
353 MOVNTQ" %%mm4, 32(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
354 MOVNTQ" %%mm5, 40(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
355 MOVNTQ" %%mm6, 48(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
356 MOVNTQ" %%mm7, 56(%1)\n" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
357 :: "r" (from), "r" (to) : "memory"); |
15639 | 358 from=((const unsigned char *)from)+64; |
359 to=((unsigned char *)to)+64; | |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
360 } |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
361 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
362 #endif /* Have SSE */ |
28290 | 363 #if HAVE_MMX2 |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
364 /* since movntq is weakly-ordered, a "sfence" |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
365 * is needed to become ordered again. */ |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
366 __asm__ volatile ("sfence":::"memory"); |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
367 #endif |
28290 | 368 #if !HAVE_SSE |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
369 /* enables to use FPU */ |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
370 __asm__ volatile (EMMS:::"memory"); |
3077
99f6db3255aa
10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)
michael
parents:
1123
diff
changeset
|
371 #endif |
698
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
372 } |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
373 /* |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
374 * Now do the tail of the block |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
375 */ |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
376 if(len) small_memcpy(to, from, len); |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
377 return retval; |
f0fbf1a9bf31
Moving fast_memcpy to separate file (Size optimization)
nickols_k
parents:
diff
changeset
|
378 } |
4681 | 379 |
380 /** | |
381 * special copy routine for mem -> agp/pci copy (based upon fast_memcpy) | |
382 */ | |
7072 | 383 static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len) |
4681 | 384 { |
385 void *retval; | |
386 size_t i; | |
387 retval = to; | |
388 #ifdef STATISTICS | |
389 { | |
390 static int freq[33]; | |
391 static int t=0; | |
392 int i; | |
393 for(i=0; len>(1<<i); i++); | |
394 freq[i]++; | |
395 t++; | |
396 if(1024*1024*1024 % t == 0) | |
397 for(i=0; i<32; i++) | |
398 printf("mem2agp freq < %8d %4d\n", 1<<i, freq[i]); | |
399 } | |
400 #endif | |
401 if(len >= MIN_LEN) | |
402 { | |
30135
807fce7a4bb3
Do not assume that "long" is the size of a register.
reimar
parents:
29645
diff
changeset
|
403 register x86_reg delta; |
4681 | 404 /* Align destinition to MMREG_SIZE -boundary */ |
30135
807fce7a4bb3
Do not assume that "long" is the size of a register.
reimar
parents:
29645
diff
changeset
|
405 delta = ((intptr_t)to)&7; |
4681 | 406 if(delta) |
407 { | |
408 delta=8-delta; | |
409 len -= delta; | |
410 small_memcpy(to, from, delta); | |
411 } | |
412 i = len >> 6; /* len/64 */ | |
413 len &= 63; | |
414 /* | |
415 This algorithm is top effective when the code consequently | |
416 reads and writes blocks which have size of cache line. | |
417 Size of cache line is processor-dependent. | |
418 It will, however, be a minimum of 32 bytes on any processors. | |
419 It would be better to have a number of instructions which | |
420 perform reading and writing to be multiple to a number of | |
421 processor's decoders, but it's not always possible. | |
422 */ | |
423 for(; i>0; i--) | |
424 { | |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
425 __asm__ volatile ( |
4681 | 426 PREFETCH" 320(%0)\n" |
427 "movq (%0), %%mm0\n" | |
428 "movq 8(%0), %%mm1\n" | |
429 "movq 16(%0), %%mm2\n" | |
430 "movq 24(%0), %%mm3\n" | |
431 "movq 32(%0), %%mm4\n" | |
432 "movq 40(%0), %%mm5\n" | |
433 "movq 48(%0), %%mm6\n" | |
434 "movq 56(%0), %%mm7\n" | |
435 MOVNTQ" %%mm0, (%1)\n" | |
436 MOVNTQ" %%mm1, 8(%1)\n" | |
437 MOVNTQ" %%mm2, 16(%1)\n" | |
438 MOVNTQ" %%mm3, 24(%1)\n" | |
439 MOVNTQ" %%mm4, 32(%1)\n" | |
440 MOVNTQ" %%mm5, 40(%1)\n" | |
441 MOVNTQ" %%mm6, 48(%1)\n" | |
442 MOVNTQ" %%mm7, 56(%1)\n" | |
443 :: "r" (from), "r" (to) : "memory"); | |
14565
1a13df0d4fc2
Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents:
13720
diff
changeset
|
444 from=((const unsigned char *)from)+64; |
1a13df0d4fc2
Make this file compile with gcc-4.0.0. The old code was invalid C.
gpoirier
parents:
13720
diff
changeset
|
445 to=((unsigned char *)to)+64; |
4681 | 446 } |
28290 | 447 #if HAVE_MMX2 |
4681 | 448 /* since movntq is weakly-ordered, a "sfence" |
449 * is needed to become ordered again. */ | |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
450 __asm__ volatile ("sfence":::"memory"); |
4681 | 451 #endif |
452 /* enables to use FPU */ | |
27757
b5a46071062a
Replace all occurrences of '__volatile__' and '__volatile' by plain 'volatile'.
diego
parents:
27754
diff
changeset
|
453 __asm__ volatile (EMMS:::"memory"); |
4681 | 454 } |
455 /* | |
456 * Now do the tail of the block | |
457 */ | |
458 if(len) small_memcpy(to, from, len); | |
459 return retval; | |
460 } |