changeset 376:63c47ec706cd
P3 fixes...
author       arpi_esp
date         Thu, 12 Apr 2001 14:40:10 +0000
parents      28cc43e013fd
children     8ad594a8c94c
files        libvo/fastmemcpy.h
diffstat     1 files changed, 24 insertions(+), 30 deletions(-)
--- a/libvo/fastmemcpy.h	Thu Apr 12 02:24:41 2001 +0000
+++ b/libvo/fastmemcpy.h	Thu Apr 12 14:40:10 2001 +0000
@@ -2,31 +2,19 @@
  This part of code was taken by from Linux-2.4.3 and slightly modified
 for MMX2 instruction set. I have done it since linux uses page aligned
 blocks but mplayer uses weakly ordered data and original sources can not
-speedup their. Only using prefetch and movntq together have effect!
+speedup their. Only using prefetchnta and movntq together have effect!
 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
 */
+#ifdef HAVE_MMX2
+/* for small memory blocks (<256 bytes) this version is faster */
+#define small_memcpy(to,from,n)\
+{\
+__asm__ __volatile__(\
+	"rep ; movsb\n"\
+	::"D" (to), "S" (from),"c" (n)\
+	: "memory");\
+}
 
-#ifndef HAVE_MMX2
-//static inline void * __memcpy(void * to, const void * from, unsigned n)
-inline static void * fast_memcpy(void * to, const void * from, unsigned n)
-{
-int d0, d1, d2;
-__asm__ __volatile__(
-	"rep ; movsl\n\t"
-	"testb $2,%b4\n\t"
-	"je 1f\n\t"
-	"movsw\n"
-	"1:\ttestb $1,%b4\n\t"
-	"je 2f\n\t"
-	"movsb\n"
-	"2:"
-	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
-	:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
-	: "memory");
-return (to);
-}
-#else
-//inline static void *__memcpy_mmx2(void *to, const void *from, unsigned len)
 inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 {
 	void *p;
@@ -36,12 +24,15 @@
 	{
 	  p = to;
 	  i = len >> 6; /* len/64 */
+	  len&=63;
+
 	  __asm__ __volatile__ (
-	        "1: prefetch (%0)\n"		/* This set is 28 bytes */
-	        "   prefetch 64(%0)\n"
-	        "   prefetch 128(%0)\n"
-	        "   prefetch 192(%0)\n"
-	        "   prefetch 256(%0)\n"
+	        "1: prefetchnta (%0)\n"		/* This set is 28 bytes */
+	        "   prefetchnta 64(%0)\n"
+	        "   prefetchnta 128(%0)\n"
+	        "   prefetchnta 192(%0)\n"
+	        "   prefetchnta 256(%0)\n"
+#if 0
 	        "2:  \n"
 	        ".section .fixup, \"ax\"\n"
 	        "3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
@@ -51,13 +42,14 @@
 	        "	.align 4\n"
 	        "	.long 1b, 3b\n"
 	        ".previous"
+#endif
 	        : : "r" (from) );
 
 
 	  for(; i>0; i--)
 	  {
 	    __asm__ __volatile__ (
-	        "1: prefetch 320(%0)\n"
+	        "1: prefetchnta 320(%0)\n"
 	        "2: movq (%0), %%mm0\n"
 	        "   movq 8(%0), %%mm1\n"
 	        "   movq 16(%0), %%mm2\n"
@@ -74,6 +66,7 @@
 	        "   movntq %%mm1, 40(%1)\n"
 	        "   movntq %%mm2, 48(%1)\n"
 	        "   movntq %%mm3, 56(%1)\n"
+#if 0
 	        ".section .fixup, \"ax\"\n"
 	        "3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
 	        "   jmp 2b\n"
@@ -82,6 +75,7 @@
 	        "	.align 4\n"
 	        "	.long 1b, 3b\n"
 	        ".previous"
+#endif
 	        : : "r" (from), "r" (to) : "memory");
 	        from+=64;
 	        to+=64;
@@ -91,10 +85,10 @@
 	/*
 	 *	Now do the tail of the block
 	 */
-	memcpy(to, from, len&63);
+	small_memcpy(to, from, len);
 	return p;
 }
+#define memcpy(a,b,c) fast_memcpy(a,b,c)
 #endif
-#define memcpy(a,b,c) fast_memcpy(a,b,c)
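
The copy loop in this patch hinges on two instructions working together: prefetchnta pulls the source toward the core with a non-temporal hint, so it does not displace useful cache lines, and movntq writes the destination through write-combining buffers, bypassing the cache entirely. Since the copied video data is not re-read soon, neither side is worth caching, which is why the header comment states that only prefetchnta and movntq combined give a speedup. For orientation only, the following is a minimal sketch of the same idea written with SSE2 intrinsics rather than the patch's MMX2 inline assembly; stream_memcpy is a hypothetical name, and the sketch assumes an SSE2-capable CPU and a 16-byte-aligned destination (the patch's movntq works on 8-byte MMX registers instead).

#include <emmintrin.h>   /* _mm_loadu_si128, _mm_stream_si128 (SSE2) */
#include <xmmintrin.h>   /* _mm_prefetch, _MM_HINT_NTA, _mm_sfence */
#include <stddef.h>
#include <string.h>

/* Hypothetical sketch, not part of the patch: non-temporal copy of
   64-byte blocks with a plain tail copy, mirroring fast_memcpy above. */
static void *stream_memcpy(void *to, const void *from, size_t len)
{
    char *d = (char *)to;
    const char *s = (const char *)from;
    size_t blocks = len >> 6;   /* number of 64-byte blocks, as in i = len >> 6 */
    len &= 63;                  /* bytes left over for the tail copy */

    for (; blocks > 0; blocks--) {
        /* pull the source in ahead of use without polluting the cache,
           the role prefetchnta 320(%0) plays in the patch */
        _mm_prefetch(s + 320, _MM_HINT_NTA);
        for (int i = 0; i < 64; i += 16) {
            __m128i v = _mm_loadu_si128((const __m128i *)(s + i));
            /* streaming store: write-combined, bypasses the cache,
               the role movntq plays in the patch */
            _mm_stream_si128((__m128i *)(d + i), v);
        }
        s += 64;
        d += 64;
    }
    _mm_sfence();        /* order streaming stores before ordinary stores */
    memcpy(d, s, len);   /* tail (<64 bytes), like small_memcpy in the patch */
    return to;
}

As in the patch, the sub-64-byte tail is handled by an ordinary byte copy; the new small_memcpy macro does this with rep ; movsb, which is cheap for short runs. The .fixup/.previous blocks the patch wraps in #if 0 look like Linux kernel exception-table machinery carried over from the original kernel source, presumably disabled here because it has no meaning in a userspace build.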