mplayer.hg: libvo/aclib_template.c annotate

annotate libvo/aclib_template.c @ 25194:e816d546c4fe

ao_null: Make duration of "buffered" audio constant. Choose the "buffer size" for the amount of audio the driver accepts so that it corresponds to about 0.2 seconds of playback based on the number of channels, sample size and samplerate.

author	uau
date	Sat, 01 Dec 2007 01:39:39 +0000
parents	ef54df9f07d3
children	ef4297ed0d12

rev	line source
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	1 /*
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	2 aclib - advanced C library ;)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	3 This file contains functions which improve and expand standard C-library
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	4 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	5
1123 5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	6 #ifndef HAVE_SSE2
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	7 /*
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	8 P3 processor has only one SSE decoder, so it can execute only 1 sse insn per
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	9 cpu clock, but it has 3 mmx decoders (including the load/store unit)
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	10 and executes 3 mmx insns per cpu clock.
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	11 P4 processor has some chances, but after reading:
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	12 http://www.emulators.com/pentium4.htm
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	13 I have doubts. Anyway SSE2 version of this code can be written better.
5b69dabe5823 Issues about P3 performance and SSE2 support. nickols_k parents: 698 diff changeset	14 */
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	15 #undef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	16 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	17
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	18
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	19 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	20 This part of code was taken by me from Linux-2.4.3 and slightly modified
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	21 for the MMX, MMX2 and SSE instruction sets. I did this because Linux uses page-aligned
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	22 blocks, but mplayer uses weakly ordered data, and the original sources cannot
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	23 speed them up. Only using PREFETCHNTA and MOVNTQ together has an effect!
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	24
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	25 From IA-32 Intel Architecture Software Developer's Manual Volume 1,
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	26
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	27 Order Number 245470:
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	28 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	29
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	30 Data referenced by a program can be temporal (data will be used again) or
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	31 non-temporal (data will be referenced once and not reused in the immediate
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	32 future). To make efficient use of the processor's caches, it is generally
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	33 desirable to cache temporal data and not cache non-temporal data. Overloading
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	34 the processor's caches with non-temporal data is sometimes referred to as
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	35 "polluting the caches".
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	36 The non-temporal data is written to memory with Write-Combining semantics.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	37
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	38 The PREFETCHh instructions permit a program to load data into the processor
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	39 at a suggested cache level, so that it is closer to the processor's load and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	40 store unit when it is needed. If the data is already present in a level of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	41 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	42 will not result in any data movement.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	43 But we should use PREFETCHNTA: it fetches non-temporal data into a location
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	44 close to the processor, minimizing cache pollution.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	45
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	46 The MOVNTQ (store quadword using non-temporal hint) instruction stores
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	47 packed integer data from an MMX register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	48 The MOVNTPS (store packed single-precision floating-point values using
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	49 non-temporal hint) instruction stores packed floating-point data from an
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	50 XMM register to memory, using a non-temporal hint.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	51
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	52 The SFENCE (Store Fence) instruction controls write ordering by creating a
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	53 fence for memory store operations. This instruction guarantees that the results
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	54 of every store instruction that precedes the store fence in program order is
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	55 globally visible before any store instruction that follows the fence. The
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	56 SFENCE instruction provides an efficient way of ensuring ordering between
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	57 procedures that produce weakly-ordered data and procedures that consume that
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	58 data.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	59
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	60 If you have questions please contact me: Nick Kurshev: nickols_k@mail.ru.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	61 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	62
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	63 // 3dnow memcpy support from kernel 2.4.2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	64 // by Pontscho/fresh!mindworkz
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	65
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	66
23378 ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...) michael parents: 15639 diff changeset	67 #undef HAVE_ONLY_MMX1
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	68 #if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	69 /* means: mmx v.1. Note: Since we added alignment of the destination, it speeds up
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	70 memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	71 the standard (non MMX-optimized) version.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	72 Note: on K6-2+ it speeds up memory copying by up to 25% and
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	73 on K7 and P3 by about 500% (5 times). */
23378 ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...) michael parents: 15639 diff changeset	74 #define HAVE_ONLY_MMX1
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	75 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	76
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	77
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	78 #undef HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	79 #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	80 #define HAVE_K6_2PLUS
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	81 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	82
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	83 /* small_memcpy(to,from,n): copies n bytes from 'from' to 'to' with "rep; movsb"; for small memory blocks (<256 bytes) this version is faster. 'to' and 'from' are also asm output operands, so the caller's variables are overwritten with the advanced pointers left in edi/esi. */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	84 #define small_memcpy(to,from,n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	85 {\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	86 register unsigned long int dummy;\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	87 __asm__ __volatile__(\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	88 "rep; movsb"\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	89 :"=&D"(to), "=&S"(from), "=&c"(dummy)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	90 /* Declaring edi, esi and ecx as outputs is the most portable way to tell the compiler */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	91 /* that these registers are clobbered in the asm block. */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	92 /* Thanks to A'rpi for the hint!!! */\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	93 :"0" (to), "1" (from),"2" (n)\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	94 : "memory");\
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	95 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	96
3393 3624cd351618 runtime cpu detection michael parents: 3077 diff changeset	97 #undef MMREG_SIZE
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	98 #ifdef HAVE_SSE
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	99 #define MMREG_SIZE 16
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	100 #else
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	101 #define MMREG_SIZE 64 //8
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	102 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	103
3393 3624cd351618 runtime cpu detection michael parents: 3077 diff changeset	104 #undef PREFETCH
3624cd351618 runtime cpu detection michael parents: 3077 diff changeset	105 #undef EMMS
5660 4dcc7af65eec pre mmx2/3dnow fix michael parents: 4684 diff changeset	106
5662 663ca5050f7e prefer prefetchnta if its available michael parents: 5660 diff changeset	107 #ifdef HAVE_MMX2
663ca5050f7e prefer prefetchnta if its available michael parents: 5660 diff changeset	108 #define PREFETCH "prefetchnta"
663ca5050f7e prefer prefetchnta if its available michael parents: 5660 diff changeset	109 #elif defined ( HAVE_3DNOW )
5660 4dcc7af65eec pre mmx2/3dnow fix michael parents: 4684 diff changeset	110 #define PREFETCH "prefetch"
4dcc7af65eec pre mmx2/3dnow fix michael parents: 4684 diff changeset	111 #else
4dcc7af65eec pre mmx2/3dnow fix michael parents: 4684 diff changeset	112 #define PREFETCH "/nop"
4dcc7af65eec pre mmx2/3dnow fix michael parents: 4684 diff changeset	113 #endif
4dcc7af65eec pre mmx2/3dnow fix michael parents: 4684 diff changeset	114
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	115 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
5660 4dcc7af65eec pre mmx2/3dnow fix michael parents: 4684 diff changeset	116 #ifdef HAVE_3DNOW
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	117 #define EMMS "femms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	118 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	119 #define EMMS "emms"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	120 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	121
3393 3624cd351618 runtime cpu detection michael parents: 3077 diff changeset	122 #undef MOVNTQ
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	123 #ifdef HAVE_MMX2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	124 #define MOVNTQ "movntq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	125 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	126 #define MOVNTQ "movq"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	127 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	128
3393 3624cd351618 runtime cpu detection michael parents: 3077 diff changeset	129 #undef MIN_LEN
23378 ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...) michael parents: 15639 diff changeset	130 #ifdef HAVE_ONLY_MMX1
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	131 #define MIN_LEN 0x800 /* 2K blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	132 #else
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	133 #define MIN_LEN 0x40 /* 64-byte blocks */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	134 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	135
7072 113d66d78967 removed nonsense 'inline' arpi parents: 5662 diff changeset	136 static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	137 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	138 void *retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	139 size_t i;
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	140 retval = to;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	141 #ifdef STATISTICS
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	142 {
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	143 static int freq[33];
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	144 static int t=0;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	145 int i;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	146 for(i=0; len>(1<<i); i++);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	147 freq[i]++;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	148 t++;
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	149 if(102410241024 % t == 0)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	150 for(i=0; i<32; i++)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	151 printf("freq < %8d %4d\n", 1<<i, freq[i]);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	152 }
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	153 #endif
23378 ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...) michael parents: 15639 diff changeset	154 #ifndef HAVE_ONLY_MMX1
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	155 /* PREFETCH has effect even for MOVSB instruction ;) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	156 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	157 PREFETCH" (%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	158 PREFETCH" 64(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	159 PREFETCH" 128(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	160 PREFETCH" 192(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	161 PREFETCH" 256(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	162 : : "r" (from) );
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	163 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	164 if(len >= MIN_LEN)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	165 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	166 register unsigned long int delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	167 /* Align destinition to MMREG_SIZE -boundary */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	168 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	169 if(delta)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	170 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	171 delta=MMREG_SIZE-delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	172 len -= delta;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	173 small_memcpy(to, from, delta);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	174 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	175 i = len >> 6; /* len/64 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	176 len&=63;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	177 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	178 This algorithm is top effective when the code consequently
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	179 reads and writes blocks which have size of cache line.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	180 Size of cache line is processor-dependent.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	181 It will, however, be a minimum of 32 bytes on any processors.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	182 It would be better to have a number of instructions which
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	183 perform reading and writing to be multiple to a number of
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	184 processor's decoders, but it's not always possible.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	185 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	186 #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	187 if(((unsigned long)from) & 15)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	188 /* if SRC is misaligned */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	189 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	190 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	191 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	192 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	193 "movups (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	194 "movups 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	195 "movups 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	196 "movups 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	197 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	198 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	199 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	200 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	201 :: "r" (from), "r" (to) : "memory");
14565 1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C. gpoirier parents: 13720 diff changeset	202 from=((const unsigned char *) from)+64;
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C. gpoirier parents: 13720 diff changeset	203 to=((unsigned char *)to)+64;
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	204 }
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	205 else
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	206 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	207 Only if SRC is aligned on 16-byte boundary.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	208 This allows using movaps instead of movups; movaps requires its data
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	209 to be aligned, otherwise a general-protection exception (#GP) is generated.
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	210 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	211 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	212 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	213 __asm__ __volatile__ (
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	214 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	215 "movaps (%0), %%xmm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	216 "movaps 16(%0), %%xmm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	217 "movaps 32(%0), %%xmm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	218 "movaps 48(%0), %%xmm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	219 "movntps %%xmm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	220 "movntps %%xmm1, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	221 "movntps %%xmm2, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	222 "movntps %%xmm3, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	223 :: "r" (from), "r" (to) : "memory");
14565 1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C. gpoirier parents: 13720 diff changeset	224 from=((const unsigned char *)from)+64;
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C. gpoirier parents: 13720 diff changeset	225 to=((unsigned char *)to)+64;
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	226 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	227 #else
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	228 // Align destination at BLOCK_SIZE boundary
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	229 for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	230 {
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	231 __asm__ __volatile__ (
23378 ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...) michael parents: 15639 diff changeset	232 #ifndef HAVE_ONLY_MMX1
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	233 PREFETCH" 320(%0)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	234 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	235 "movq (%0), %%mm0\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	236 "movq 8(%0), %%mm1\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	237 "movq 16(%0), %%mm2\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	238 "movq 24(%0), %%mm3\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	239 "movq 32(%0), %%mm4\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	240 "movq 40(%0), %%mm5\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	241 "movq 48(%0), %%mm6\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	242 "movq 56(%0), %%mm7\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	243 MOVNTQ" %%mm0, (%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	244 MOVNTQ" %%mm1, 8(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	245 MOVNTQ" %%mm2, 16(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	246 MOVNTQ" %%mm3, 24(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	247 MOVNTQ" %%mm4, 32(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	248 MOVNTQ" %%mm5, 40(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	249 MOVNTQ" %%mm6, 48(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	250 MOVNTQ" %%mm7, 56(%1)\n"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	251 :: "r" (from), "r" (to) : "memory");
15639 f26450da61a1 More gcc-4.0 fixes gpoirier parents: 14565 diff changeset	252 from=((const unsigned char *)from)+64;
f26450da61a1 More gcc-4.0 fixes gpoirier parents: 14565 diff changeset	253 to=((unsigned char *)to)+64;
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	254 }
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	255
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	256 // printf(" %d %d\n", (int)from&1023, (int)to&1023);
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	257 // Pure Assembly cuz gcc is a bit unpredictable ;)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	258 if(i>=BLOCK_SIZE/64)
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	259 asm volatile(
13720 821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	260 "xor %%"REG_a", %%"REG_a" \n\t"
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	261 ".balign 16 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	262 "1: \n\t"
13720 821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	263 "movl (%0, %%"REG_a"), %%ebx \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	264 "movl 32(%0, %%"REG_a"), %%ebx \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	265 "movl 64(%0, %%"REG_a"), %%ebx \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	266 "movl 96(%0, %%"REG_a"), %%ebx \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	267 "add $128, %%"REG_a" \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	268 "cmp %3, %%"REG_a" \n\t"
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	269 " jb 1b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	270
13720 821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	271 "xor %%"REG_a", %%"REG_a" \n\t"
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	272
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	273 ".balign 16 \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	274 "2: \n\t"
13720 821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	275 "movq (%0, %%"REG_a"), %%mm0\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	276 "movq 8(%0, %%"REG_a"), %%mm1\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	277 "movq 16(%0, %%"REG_a"), %%mm2\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	278 "movq 24(%0, %%"REG_a"), %%mm3\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	279 "movq 32(%0, %%"REG_a"), %%mm4\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	280 "movq 40(%0, %%"REG_a"), %%mm5\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	281 "movq 48(%0, %%"REG_a"), %%mm6\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	282 "movq 56(%0, %%"REG_a"), %%mm7\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	283 MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	284 MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	285 MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	286 MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	287 MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	288 MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	289 MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	290 MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	291 "add $64, %%"REG_a" \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	292 "cmp %3, %%"REG_a" \n\t"
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	293 "jb 2b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	294
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	295 #if CONFUSION_FACTOR > 0
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	296 // a few percent speedup on out of order executing CPUs
13720 821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	297 "mov %5, %%"REG_a" \n\t"
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	298 "2: \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	299 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	300 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	301 "movl (%0), %%ebx \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	302 "movl (%0), %%ebx \n\t"
13720 821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	303 "dec %%"REG_a" \n\t"
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	304 " jnz 2b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	305 #endif
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	306
13720 821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	307 "xor %%"REG_a", %%"REG_a" \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	308 "add %3, %0 \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	309 "add %3, %1 \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	310 "sub %4, %2 \n\t"
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	311 "cmp %4, %2 \n\t"
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	312 " jae 1b \n\t"
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	313 : "+r" (from), "+r" (to), "+r" (i)
13720 821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	314 : "r" ((long)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((long)CONFUSION_FACTOR)
821f464b4d90 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 aurel parents: 7072 diff changeset	315 : "%"REG_a, "%ebx"
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	316 );
99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	317
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	318 for(; i>0; i--)
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	319 {
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	320 __asm__ __volatile__ (
23378 ef54df9f07d3 HAVE_MMX1 -> HAVE_ONLY_MMX1 (makes more sense ...) michael parents: 15639 diff changeset	321 #ifndef HAVE_ONLY_MMX1
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	322 PREFETCH" 320(%0)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	323 #endif
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	324 "movq (%0), %%mm0\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	325 "movq 8(%0), %%mm1\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	326 "movq 16(%0), %%mm2\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	327 "movq 24(%0), %%mm3\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	328 "movq 32(%0), %%mm4\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	329 "movq 40(%0), %%mm5\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	330 "movq 48(%0), %%mm6\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	331 "movq 56(%0), %%mm7\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	332 MOVNTQ" %%mm0, (%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	333 MOVNTQ" %%mm1, 8(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	334 MOVNTQ" %%mm2, 16(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	335 MOVNTQ" %%mm3, 24(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	336 MOVNTQ" %%mm4, 32(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	337 MOVNTQ" %%mm5, 40(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	338 MOVNTQ" %%mm6, 48(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	339 MOVNTQ" %%mm7, 56(%1)\n"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	340 :: "r" (from), "r" (to) : "memory");
15639 f26450da61a1 More gcc-4.0 fixes gpoirier parents: 14565 diff changeset	341 from=((const unsigned char *)from)+64;
f26450da61a1 More gcc-4.0 fixes gpoirier parents: 14565 diff changeset	342 to=((unsigned char *)to)+64;
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	343 }
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	344
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	345 #endif /* Have SSE */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	346 #ifdef HAVE_MMX2
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	347 /* since movntq is weakly-ordered, a "sfence"
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	348 * is needed to become ordered again. */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	349 __asm__ __volatile__ ("sfence":::"memory");
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	350 #endif
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	351 #ifndef HAVE_SSE
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	352 /* enables to use FPU */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	353 __asm__ __volatile__ (EMMS:::"memory");
3077 99f6db3255aa 10-20% faster fastmemcpy :) on my p3 at least but the algo is mostly from "amd athlon processor x86 code optimization guide" so it should be faster for amd chips too, but i fear it might be slower for mem->vram copies (someone should check that, i cant) ... there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR) michael parents: 1123 diff changeset	354 #endif
698 f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	355 }
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	356 /*
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	357 * Now do the tail of the block
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	358 */
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	359 if(len) small_memcpy(to, from, len);
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	360 return retval;
f0fbf1a9bf31 Moving fast_memcpy to separate file (Size optimization) nickols_k parents: diff changeset	361 }
4681 8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	362
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	363 /**
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	364 * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	365 */
7072 113d66d78967 removed nonsense 'inline' arpi parents: 5662 diff changeset	366 static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
4681 8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	367 {
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	368 void *retval;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	369 size_t i;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	370 retval = to;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	371 #ifdef STATISTICS
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	372 {
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	373 static int freq[33];
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	374 static int t=0;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	375 int i;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	376 for(i=0; len>(1<<i); i++);
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	377 freq[i]++;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	378 t++;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	379 if(102410241024 % t == 0)
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	380 for(i=0; i<32; i++)
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	381 printf("mem2agp freq < %8d %4d\n", 1<<i, freq[i]);
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	382 }
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	383 #endif
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	384 if(len >= MIN_LEN)
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	385 {
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	386 register unsigned long int delta;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	387 /* Align destinition to MMREG_SIZE -boundary */
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	388 delta = ((unsigned long int)to)&7;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	389 if(delta)
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	390 {
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	391 delta=8-delta;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	392 len -= delta;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	393 small_memcpy(to, from, delta);
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	394 }
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	395 i = len >> 6; /* len/64 */
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	396 len &= 63;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	397 /*
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	398 This algorithm is top effective when the code consequently
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	399 reads and writes blocks which have size of cache line.
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	400 Size of cache line is processor-dependent.
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	401 It will, however, be a minimum of 32 bytes on any processors.
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	402 It would be better to have a number of instructions which
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	403 perform reading and writing to be multiple to a number of
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	404 processor's decoders, but it's not always possible.
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	405 */
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	406 for(; i>0; i--)
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	407 {
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	408 __asm__ __volatile__ (
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	409 PREFETCH" 320(%0)\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	410 "movq (%0), %%mm0\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	411 "movq 8(%0), %%mm1\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	412 "movq 16(%0), %%mm2\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	413 "movq 24(%0), %%mm3\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	414 "movq 32(%0), %%mm4\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	415 "movq 40(%0), %%mm5\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	416 "movq 48(%0), %%mm6\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	417 "movq 56(%0), %%mm7\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	418 MOVNTQ" %%mm0, (%1)\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	419 MOVNTQ" %%mm1, 8(%1)\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	420 MOVNTQ" %%mm2, 16(%1)\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	421 MOVNTQ" %%mm3, 24(%1)\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	422 MOVNTQ" %%mm4, 32(%1)\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	423 MOVNTQ" %%mm5, 40(%1)\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	424 MOVNTQ" %%mm6, 48(%1)\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	425 MOVNTQ" %%mm7, 56(%1)\n"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	426 :: "r" (from), "r" (to) : "memory");
14565 1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C. gpoirier parents: 13720 diff changeset	427 from=((const unsigned char *)from)+64;
1a13df0d4fc2 Make this file compile with gcc-4.0.0. The old code was invalid C. gpoirier parents: 13720 diff changeset	428 to=((unsigned char *)to)+64;
4681 8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	429 }
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	430 #ifdef HAVE_MMX2
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	431 /* since movntq is weakly-ordered, a "sfence"
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	432 * is needed to become ordered again. */
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	433 __asm__ __volatile__ ("sfence":::"memory");
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	434 #endif
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	435 /* enables to use FPU */
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	436 __asm__ __volatile__ (EMMS:::"memory");
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	437 }
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	438 /*
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	439 * Now do the tail of the block
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	440 */
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	441 if(len) small_memcpy(to, from, len);
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	442 return retval;
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	443 }
8db59073127e mem2agpcpy() michael parents: 3393 diff changeset	444

Mercurial > mplayer.hg

annotate libvo/aclib_template.c @ 25194:e816d546c4fe