mplayer.hg: libvo/aclib.c annotate

annotate libvo/aclib.c @ 3195:62d797a16f72

unistd.h required at least by FreeBSD

author	nexus
date	Thu, 29 Nov 2001 13:43:52 +0000
parents	99f6db3255aa
children	3624cd351618

rev	line source
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	1 #include "../config.h"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	3 #ifdef USE_FASTMEMCPY
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	4 /*
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	5 aclib - advanced C library ;)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	6 This file contains functions which improve and expand standard C-library
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	7 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	8
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	9 #include <stddef.h>
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	10
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	11 #define BLOCK_SIZE 4096
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	12 #define CONFUSION_FACTOR 0
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	13 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	14
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	15 //#define STATISTICS
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	16
1123 5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	17 #ifndef HAVE_SSE2
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	18 /*
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	19 P3 processor has only one SSE decoder so can execute only 1 sse insn per
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	20 cpu clock, but it has 3 mmx decoders (include load/store unit)
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	21 and executes 3 mmx insns per cpu clock.
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	22 P4 processor has some chances, but after reading:
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	23 http://www.emulators.com/pentium4.htm
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	24 I have doubts. Anyway SSE2 version of this code can be written better.
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	25 */
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	26 #undef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	27 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	28
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	29
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	30 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	31 This part of code was taken by me from Linux-2.4.3 and slightly modified
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	32 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	33 blocks but mplayer uses weakly ordered data and original sources can not
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	34 speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	35
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	36 >From IA-32 Intel Architecture Software Developer's Manual Volume 1,
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	37
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	38 Order Number 245470:
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	39 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	40
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	41 Data referenced by a program can be temporal (data will be used again) or
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	42 non-temporal (data will be referenced once and not reused in the immediate
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	43 future). To make efficient use of the processor's caches, it is generally
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	44 desirable to cache temporal data and not cache non-temporal data. Overloading
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	45 the processor's caches with non-temporal data is sometimes referred to as
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	46 "polluting the caches".
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	47 The non-temporal data is written to memory with Write-Combining semantics.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	48
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	49 The PREFETCHh instructions permits a program to load data into the processor
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	50 at a suggested cache level, so that it is closer to the processors load and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	51 store unit when it is needed. If the data is already present in a level of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	52 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	53 will not result in any data movement.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	54 But we should you PREFETCHNTA: Non-temporal data fetch data into location
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	55 close to the processor, minimizing cache pollution.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	56
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	57 The MOVNTQ (store quadword using non-temporal hint) instruction stores
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	58 packed integer data from an MMX register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	59 The MOVNTPS (store packed single-precision floating-point values using
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	60 non-temporal hint) instruction stores packed floating-point data from an
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	61 XMM register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	62
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	63 The SFENCE (Store Fence) instruction controls write ordering by creating a
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	64 fence for memory store operations. This instruction guarantees that the results
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	65 of every store instruction that precedes the store fence in program order is
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	66 globally visible before any store instruction that follows the fence. The
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	67 SFENCE instruction provides an efficient way of ensuring ordering between
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	68 procedures that produce weakly-ordered data and procedures that consume that
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	69 data.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	70
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	71 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	72 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	73
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	74 // 3dnow memcpy support from kernel 2.4.2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	75 // by Pontscho/fresh!mindworkz
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	76
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	77 #if defined( HAVE_MMX2 ) \|\| defined( HAVE_3DNOW ) \|\| defined( HAVE_MMX )
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	78
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	79 #undef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	80 #if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	81 /* means: mmx v.1. Note: Since we added alignment of destinition it speedups
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	82 of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	83 standard (non MMX-optimized) version.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	84 Note: on K6-2+ it speedups memory copying upto 25% and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	85 on K7 and P3 about 500% (5 times). */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	86 #define HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	87 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	88
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	89
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	90 #undef HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	91 #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	92 #define HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	93 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	94
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	95 /* for small memory blocks (<256 bytes) this version is faster */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	96 #define small_memcpy(to,from,n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	97 {\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	98 register unsigned long int dummy;\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	99 __asm__ __volatile__(\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	100 "rep; movsb"\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	101 :"=&D"(to), "=&S"(from), "=&c"(dummy)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	102 /* It's most portable way to notify compiler */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	103 /* that edi, esi and ecx are clobbered in asm block. */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	104 /* Thanks to A'rpi for hint!!! */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	105 :"0" (to), "1" (from),"2" (n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	106 : "memory");\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	107 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	108
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	109 #ifdef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	110 #define MMREG_SIZE 16
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	111 #else
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	112 #define MMREG_SIZE 64 //8
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	113 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	114
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	115 /* Small defines (for readability only) ;) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	116 #ifdef HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	117 #define PREFETCH "prefetch"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	118 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	119 #define EMMS "femms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	120 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	121 #define PREFETCH "prefetchnta"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	122 #define EMMS "emms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	123 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	124
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	125 #ifdef HAVE_MMX2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	126 #define MOVNTQ "movntq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	127 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	128 #define MOVNTQ "movq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	129 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	130
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	131 #ifdef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	132 #define MIN_LEN 0x800 /* 2K blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	133 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	134 #define MIN_LEN 0x40 /* 64-byte blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	135 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	136
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	137 void * fast_memcpy(void * to, const void * from, size_t len)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	138 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	139 void *retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	140 size_t i;
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	141 retval = to;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	142 #ifdef STATISTICS
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	143 {
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	144 static int freq[33];
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	145 static int t=0;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	146 int i;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	147 for(i=0; len>(1<<i); i++);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	148 freq[i]++;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	149 t++;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	150 if(102410241024 % t == 0)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	151 for(i=0; i<32; i++)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	152 printf("freq < %8d %4d\n", 1<<i, freq[i]);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	153 }
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	154 #endif
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	155 #ifndef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	156 /* PREFETCH has effect even for MOVSB instruction ;) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	157 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	158 PREFETCH" (%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	159 PREFETCH" 64(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	160 PREFETCH" 128(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	161 PREFETCH" 192(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	162 PREFETCH" 256(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	163 : : "r" (from) );
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	164 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	165 if(len >= MIN_LEN)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	166 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	167 register unsigned long int delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	168 /* Align destinition to MMREG_SIZE -boundary */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	169 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	170 if(delta)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	171 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	172 delta=MMREG_SIZE-delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	173 len -= delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	174 small_memcpy(to, from, delta);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	175 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	176 i = len >> 6; /* len/64 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	177 len&=63;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	178 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	179 This algorithm is top effective when the code consequently
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	180 reads and writes blocks which have size of cache line.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	181 Size of cache line is processor-dependent.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	182 It will, however, be a minimum of 32 bytes on any processors.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	183 It would be better to have a number of instructions which
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	184 perform reading and writing to be multiple to a number of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	185 processor's decoders, but it's not always possible.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	186 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	187 #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	188 if(((unsigned long)from) & 15)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	189 /* if SRC is misaligned */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	190 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	191 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	192 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	193 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	194 "movups (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	195 "movups 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	196 "movups 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	197 "movups 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	198 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	199 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	200 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	201 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	202 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	203 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	204 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	205 }
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	206 else
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	207 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	208 Only if SRC is aligned on 16-byte boundary.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	209 It allows to use movaps instead of movups, which required data
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	210 to be aligned or a general-protection exception (#GP) is generated.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	211 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	212 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	213 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	214 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	215 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	216 "movaps (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	217 "movaps 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	218 "movaps 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	219 "movaps 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	220 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	221 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	222 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	223 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	224 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	225 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	226 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	227 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	228 #else
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	229 // Align destination at BLOCK_SIZE boundary
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	230 for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	231 {
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	232 __asm__ __volatile__ (
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	233 #ifndef HAVE_MMX1
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	234 PREFETCH" 320(%0)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	235 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	236 "movq (%0), %%mm0\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	237 "movq 8(%0), %%mm1\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	238 "movq 16(%0), %%mm2\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	239 "movq 24(%0), %%mm3\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	240 "movq 32(%0), %%mm4\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	241 "movq 40(%0), %%mm5\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	242 "movq 48(%0), %%mm6\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	243 "movq 56(%0), %%mm7\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	244 MOVNTQ" %%mm0, (%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	245 MOVNTQ" %%mm1, 8(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	246 MOVNTQ" %%mm2, 16(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	247 MOVNTQ" %%mm3, 24(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	248 MOVNTQ" %%mm4, 32(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	249 MOVNTQ" %%mm5, 40(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	250 MOVNTQ" %%mm6, 48(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	251 MOVNTQ" %%mm7, 56(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	252 :: "r" (from), "r" (to) : "memory");
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	253 ((const unsigned char *)from)+=64;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	254 ((unsigned char *)to)+=64;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	255 }
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	256
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	257 // printf(" %d %d\n", (int)from&1023, (int)to&1023);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	258 // Pure Assembly cuz gcc is a bit unpredictable ;)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	259 if(i>=BLOCK_SIZE/64)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	260 asm volatile(
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	261 "xorl %%eax, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	262 ".balign 16 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	263 "1: \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	264 "movl (%0, %%eax), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	265 "movl 32(%0, %%eax), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	266 "movl 64(%0, %%eax), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	267 "movl 96(%0, %%eax), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	268 "addl $128, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	269 "cmpl %3, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	270 " jb 1b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	271
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	272 "xorl %%eax, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	273
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	274 ".balign 16 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	275 "2: \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	276 "movq (%0, %%eax), %%mm0\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	277 "movq 8(%0, %%eax), %%mm1\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	278 "movq 16(%0, %%eax), %%mm2\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	279 "movq 24(%0, %%eax), %%mm3\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	280 "movq 32(%0, %%eax), %%mm4\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	281 "movq 40(%0, %%eax), %%mm5\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	282 "movq 48(%0, %%eax), %%mm6\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	283 "movq 56(%0, %%eax), %%mm7\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	284 MOVNTQ" %%mm0, (%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	285 MOVNTQ" %%mm1, 8(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	286 MOVNTQ" %%mm2, 16(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	287 MOVNTQ" %%mm3, 24(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	288 MOVNTQ" %%mm4, 32(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	289 MOVNTQ" %%mm5, 40(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	290 MOVNTQ" %%mm6, 48(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	291 MOVNTQ" %%mm7, 56(%1, %%eax)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	292 "addl $64, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	293 "cmpl %3, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	294 "jb 2b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	295
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	296 #if CONFUSION_FACTOR > 0
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	297 // a few percent speedup on out of order executing CPUs
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	298 "movl %5, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	299 "2: \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	300 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	301 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	302 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	303 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	304 "decl %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	305 " jnz 2b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	306 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	307
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	308 "xorl %%eax, %%eax \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	309 "addl %3, %0 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	310 "addl %3, %1 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	311 "subl %4, %2 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	312 "cmpl %4, %2 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	313 " jae 1b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	314 : "+r" (from), "+r" (to), "+r" (i)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	315 : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	316 : "%eax", "%ebx"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	317 );
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	318
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	319 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	320 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	321 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	322 #ifndef HAVE_MMX1
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	323 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	324 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	325 "movq (%0), %%mm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	326 "movq 8(%0), %%mm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	327 "movq 16(%0), %%mm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	328 "movq 24(%0), %%mm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	329 "movq 32(%0), %%mm4\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	330 "movq 40(%0), %%mm5\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	331 "movq 48(%0), %%mm6\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	332 "movq 56(%0), %%mm7\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	333 MOVNTQ" %%mm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	334 MOVNTQ" %%mm1, 8(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	335 MOVNTQ" %%mm2, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	336 MOVNTQ" %%mm3, 24(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	337 MOVNTQ" %%mm4, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	338 MOVNTQ" %%mm5, 40(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	339 MOVNTQ" %%mm6, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	340 MOVNTQ" %%mm7, 56(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	341 :: "r" (from), "r" (to) : "memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	342 ((const unsigned char *)from)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	343 ((unsigned char *)to)+=64;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	344 }
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	345
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	346 #endif /* Have SSE */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	347 #ifdef HAVE_MMX2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	348 /* since movntq is weakly-ordered, a "sfence"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	349 * is needed to become ordered again. */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	350 __asm__ __volatile__ ("sfence":::"memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	351 #endif
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	352 #ifndef HAVE_SSE
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	353 /* enables to use FPU */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	354 __asm__ __volatile__ (EMMS:::"memory");
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	355 #endif
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	356 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	357 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	358 * Now do the tail of the block
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	359 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	360 if(len) small_memcpy(to, from, len);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	361 return retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	362 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	363
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	364
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	365 #endif /* #if defined( HAVE_MMX2 ) \|\| defined( HAVE_3DNOW ) \|\| defined( HAVE_MMX ) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	366 #endif /* USE_FASTMEMCPY */

Mercurial > mplayer.hg

annotate libvo/aclib.c @ 3195:62d797a16f72