diff libvo/fastmemcpy.h @ 376:63c47ec706cd

P3 fixes...
author arpi_esp
date Thu, 12 Apr 2001 14:40:10 +0000
parents baf3fe20eb23
children 90a50c8e15b8
--- a/libvo/fastmemcpy.h	Thu Apr 12 02:24:41 2001 +0000
+++ b/libvo/fastmemcpy.h	Thu Apr 12 14:40:10 2001 +0000
@@ -2,31 +2,19 @@
  This part of code was taken by from Linux-2.4.3 and slightly modified
 for MMX2 instruction set. I have done it since linux uses page aligned
 blocks but mplayer uses weakly ordered data and original sources can not
-speedup their. Only using prefetch and movntq together have effect! 
+speedup their. Only using prefetchnta and movntq together have effect! 
 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
 */
+#ifdef HAVE_MMX2
+/* for small memory blocks (<256 bytes) this version is faster */
+#define small_memcpy(to,from,n)\
+{\
+__asm__ __volatile__(\
+	"rep ; movsb\n"\
+	::"D" (to), "S" (from),"c" (n)\
+	: "memory");\
+}
 
-#ifndef HAVE_MMX2
-//static inline void * __memcpy(void * to, const void * from, unsigned n)
-inline static void * fast_memcpy(void * to, const void * from, unsigned n)
-{
-int d0, d1, d2;
-__asm__ __volatile__(
-	"rep ; movsl\n\t"
-	"testb $2,%b4\n\t"
-	"je 1f\n\t"
-	"movsw\n"
-	"1:\ttestb $1,%b4\n\t"
-	"je 2f\n\t"
-	"movsb\n"
-	"2:"
-	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
-	:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
-	: "memory");
-return (to);
-}
-#else
-//inline static void *__memcpy_mmx2(void *to, const void *from, unsigned len)
 inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 {
 	void *p;
@@ -36,12 +24,15 @@
 	{
   	  p = to;
 	  i = len >> 6; /* len/64 */
+	  len&=63;
+	  
 	__asm__ __volatile__ (
-		"1: prefetch (%0)\n"		/* This set is 28 bytes */
-		"   prefetch 64(%0)\n"
-		"   prefetch 128(%0)\n"
-		"   prefetch 192(%0)\n"
-		"   prefetch 256(%0)\n"
+		"1: prefetchnta (%0)\n"		/* This set is 28 bytes */
+		"   prefetchnta 64(%0)\n"
+		"   prefetchnta 128(%0)\n"
+		"   prefetchnta 192(%0)\n"
+		"   prefetchnta 256(%0)\n"
+#if 0		
 		"2:  \n"
 		".section .fixup, \"ax\"\n"
 		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
@@ -51,13 +42,14 @@
 		"	.align 4\n"
 		"	.long 1b, 3b\n"
 		".previous"
+#endif		
 		: : "r" (from) );
 		
 	
 	for(; i>0; i--)
 	{
 		__asm__ __volatile__ (
-		"1:  prefetch 320(%0)\n"
+		"1:  prefetchnta 320(%0)\n"
 		"2:  movq (%0), %%mm0\n"
 		"  movq 8(%0), %%mm1\n"
 		"  movq 16(%0), %%mm2\n"
@@ -74,6 +66,7 @@
 		"  movntq %%mm1, 40(%1)\n"
 		"  movntq %%mm2, 48(%1)\n"
 		"  movntq %%mm3, 56(%1)\n"
+#if 0		
 		".section .fixup, \"ax\"\n"
 		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
 		"   jmp 2b\n"
@@ -82,6 +75,7 @@
 		"	.align 4\n"
 		"	.long 1b, 3b\n"
 		".previous"
+#endif		
 		: : "r" (from), "r" (to) : "memory");
 		from+=64;
 		to+=64;
@@ -91,10 +85,10 @@
 	/*
 	 *	Now do the tail of the block
 	 */
-	memcpy(to, from, len&63);
+	small_memcpy(to, from, len);
 	return p;
 }
+#define memcpy(a,b,c) fast_memcpy(a,b,c)
 #endif
 
-#define memcpy(a,b,c) fast_memcpy(a,b,c)
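
The hunks above are easier to follow outside of diff form. Below is a hypothetical, self-contained reconstruction of the copy structure the patched fast_memcpy implements: prefetchnta ahead of the reads, movntq non-temporal stores in 64-byte blocks, and a rep-movsb tail in place of the earlier memcpy() call. The function name streaming_copy_sketch and the sfence/emms cleanup are assumptions of this sketch and do not appear in the hunks above; it only builds on x86 with an MMX2/SSE-capable toolchain.

#include <stddef.h>

/*
 * Hypothetical reconstruction (not part of the patch): the copy structure
 * that the patched fast_memcpy implements.  prefetchnta pulls source data
 * in without polluting the cache, movntq streams 64-byte blocks to the
 * destination with non-temporal stores, and the remaining (len & 63) bytes
 * are copied with the same rep-movsb idea as small_memcpy().  The name
 * streaming_copy_sketch and the sfence/emms cleanup are assumptions of
 * this sketch; they are not shown in the hunks above.
 */
static void *streaming_copy_sketch(void *to, const void *from, size_t len)
{
    void *p = to;                 /* original destination, returned like memcpy */
    size_t i = len >> 6;          /* number of whole 64-byte blocks */
    len &= 63;                    /* bytes left over for the tail copy */

    for (; i > 0; i--) {
        __asm__ __volatile__ (
            "prefetchnta 320(%0)\n\t"   /* prefetch well ahead of the reads */
            "movq    (%0), %%mm0\n\t"
            "movq   8(%0), %%mm1\n\t"
            "movq  16(%0), %%mm2\n\t"
            "movq  24(%0), %%mm3\n\t"
            "movntq %%mm0,   (%1)\n\t"  /* non-temporal stores bypass the cache */
            "movntq %%mm1,  8(%1)\n\t"
            "movntq %%mm2, 16(%1)\n\t"
            "movntq %%mm3, 24(%1)\n\t"
            "movq  32(%0), %%mm0\n\t"
            "movq  40(%0), %%mm1\n\t"
            "movq  48(%0), %%mm2\n\t"
            "movq  56(%0), %%mm3\n\t"
            "movntq %%mm0, 32(%1)\n\t"
            "movntq %%mm1, 40(%1)\n\t"
            "movntq %%mm2, 48(%1)\n\t"
            "movntq %%mm3, 56(%1)\n\t"
            : : "r" (from), "r" (to)
            : "memory");          /* MMX registers not listed, as in the original */
        from = (const char *)from + 64;
        to   = (char *)to + 64;
    }

    /* assumed cleanup: drain write-combining buffers and leave MMX state */
    __asm__ __volatile__ ("sfence\n\temms" ::: "memory");

    /* tail: copy the last len (< 64) bytes, same idea as small_memcpy() */
    __asm__ __volatile__ (
        "rep ; movsb"
        : "+D" (to), "+S" (from), "+c" (len)
        : : "memory");
    return p;
}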
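
A usage note on the relocated macro: the patch moves #define memcpy(a,b,c) fast_memcpy(a,b,c) inside the HAVE_MMX2 branch and deletes the rep-movsl fallback, so non-MMX2 builds now get the plain libc memcpy. The caller below is a hypothetical illustration (copy_frame and the include form are not taken from the patch):

/*
 * Hypothetical caller: a file that includes fastmemcpy.h after string.h
 * has its three-argument memcpy() calls rewritten by the macro -- after
 * this patch only when HAVE_MMX2 is defined; otherwise plain libc memcpy
 * is used, since the rep-movsl fallback was removed.
 */
#include <string.h>
#include "fastmemcpy.h"

static void copy_frame(unsigned char *dst, const unsigned char *src, unsigned n)
{
    memcpy(dst, src, n);   /* -> fast_memcpy(dst, src, n) under HAVE_MMX2 */
}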