# HG changeset patch
# User michael
# Date 1006458038 0
# Node ID 99f6db3255aa6e28e5ecdefdb6645784f00b6089
# Parent  bfc881c0e591e4291ba92b8121bd9291aa1c2e92
10-20% faster fastmemcpy :) on my p3 at least
but the algo is mostly from "amd athlon processor x86 code optimization guide"
so it should be faster for amd chips too, but i fear it might be slower for
mem->vram copies (someone should check that, i cant) ...
there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)

diff -r bfc881c0e591 -r 99f6db3255aa libvo/aclib.c
--- a/libvo/aclib.c	Thu Nov 22 19:22:49 2001 +0000
+++ b/libvo/aclib.c	Thu Nov 22 19:40:38 2001 +0000
@@ -1,13 +1,19 @@
 #include "../config.h"
 
 #ifdef USE_FASTMEMCPY
-/* 
+/*
    aclib - advanced C library ;)
    This file contains functions which improve and expand standard C-library
 */
 #include <stddef.h>
 
+#define BLOCK_SIZE 4096
+#define CONFUSION_FACTOR 0
+//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
+
+//#define STATISTICS
+
 #ifndef HAVE_SSE2
 /*
    P3 processor has only one SSE decoder so can execute only 1 sse insn per
@@ -103,7 +109,7 @@
 #ifdef HAVE_SSE
 #define MMREG_SIZE 16
 #else
-#define MMREG_SIZE 8
+#define MMREG_SIZE 64 //8
 #endif
 
 /* Small defines (for readability only) ;) */
@@ -132,7 +138,20 @@
 {
 	void *retval;
 	size_t i;
-  	retval = to;
+	retval = to;
+#ifdef STATISTICS
+	{
+		static int freq[33];
+		static int t=0;
+		int i;
+		for(i=0; len>(1<<i); i++);
+		freq[i]++;
+		t++;
+		if(1024*1024*1024 % t == 0)
+			for(i=0; i<32; i++)
+				printf("freq < %8d %4d\n", 1<<i, freq[i]);
+	}
+#endif
 #ifndef HAVE_MMX1
         /* PREFETCH has effect even for MOVSB instruction ;) */
 	__asm__ __volatile__ (
@@ ... @@
+	// Align destination at BLOCK_SIZE boundary
+	for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
+	{
+		__asm__ __volatile__ (
+#ifndef HAVE_MMX1
+		PREFETCH" 320(%0)\n"
+#endif
+		"movq (%0), %%mm0\n"
+		"movq 8(%0), %%mm1\n"
+		"movq 16(%0), %%mm2\n"
+		"movq 24(%0), %%mm3\n"
+		"movq 32(%0), %%mm4\n"
+		"movq 40(%0), %%mm5\n"
+		"movq 48(%0), %%mm6\n"
+		"movq 56(%0), %%mm7\n"
+		MOVNTQ" %%mm0, (%1)\n"
+		MOVNTQ" %%mm1, 8(%1)\n"
+		MOVNTQ" %%mm2, 16(%1)\n"
+		MOVNTQ" %%mm3, 24(%1)\n"
+		MOVNTQ" %%mm4, 32(%1)\n"
+		MOVNTQ" %%mm5, 40(%1)\n"
+		MOVNTQ" %%mm6, 48(%1)\n"
+		MOVNTQ" %%mm7, 56(%1)\n"
+		:: "r" (from), "r" (to) : "memory");
+		((const unsigned char *)from)+=64;
+		((unsigned char *)to)+=64;
+	}
+
+//	printf(" %d %d\n", (int)from&1023, (int)to&1023);
+	// Pure Assembly cuz gcc is a bit unpredictable ;)
+	if(i>=BLOCK_SIZE/64)
+		asm volatile(
+			"xorl %%eax, %%eax	\n\t"
+			".balign 16		\n\t"
+			"1:			\n\t"
+				"movl (%0, %%eax), %%ebx	\n\t"
+				"movl 32(%0, %%eax), %%ebx	\n\t"
+				"movl 64(%0, %%eax), %%ebx	\n\t"
+				"movl 96(%0, %%eax), %%ebx	\n\t"
+				"addl $128, %%eax		\n\t"
+				"cmpl %3, %%eax			\n\t"
+				" jb 1b				\n\t"
+
+			"xorl %%eax, %%eax	\n\t"
+
+			".balign 16		\n\t"
+			"2:			\n\t"
+				"movq (%0, %%eax), %%mm0\n"
+				"movq 8(%0, %%eax), %%mm1\n"
+				"movq 16(%0, %%eax), %%mm2\n"
+				"movq 24(%0, %%eax), %%mm3\n"
+				"movq 32(%0, %%eax), %%mm4\n"
+				"movq 40(%0, %%eax), %%mm5\n"
+				"movq 48(%0, %%eax), %%mm6\n"
+				"movq 56(%0, %%eax), %%mm7\n"
+				MOVNTQ" %%mm0, (%1, %%eax)\n"
+				MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+				MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+				MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+				MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+				MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+				MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+				MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+				"addl $64, %%eax	\n\t"
+				"cmpl %3, %%eax		\n\t"
+				"jb 2b			\n\t"
+
+#if CONFUSION_FACTOR > 0
+	// a few percent speedup on out of order executing CPUs
+			"movl %5, %%eax		\n\t"
+				"2:			\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"decl %%eax		\n\t"
+				" jnz 2b		\n\t"
+#endif
+
+			"xorl %%eax, %%eax	\n\t"
+			"addl %3, %0		\n\t"
+			"addl %3, %1		\n\t"
+			"subl %4, %2		\n\t"
+			"cmpl %4, %2		\n\t"
+			" jae 1b		\n\t"
+				: "+r" (from), "+r" (to), "+r" (i)
+				: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
+				: "%eax", "%ebx"
+		);
+
 	for(; i>0; i--)
 	{
 		__asm__ __volatile__ (
@@ -233,16 +342,17 @@
 		((const unsigned char *)from)+=64;
 		((unsigned char *)to)+=64;
 	}
+
 #endif /* Have SSE */
 #ifdef HAVE_MMX2
                 /* since movntq is weakly-ordered, a "sfence"
		 * is needed to become ordered again. */
		__asm__ __volatile__ ("sfence":::"memory");
 #endif
-#ifndef HAVE_SSE 
+#ifndef HAVE_SSE
 	/* enables to use FPU */
 	__asm__ __volatile__ (EMMS:::"memory");
-#endif 
+#endif
 }
 /*
  * Now do the tail of the block
diff -r bfc881c0e591 -r 99f6db3255aa libvo/aclib_template.c
--- a/libvo/aclib_template.c	Thu Nov 22 19:22:49 2001 +0000
+++ b/libvo/aclib_template.c	Thu Nov 22 19:40:38 2001 +0000
@@ -1,13 +1,19 @@
 #include "../config.h"
 
 #ifdef USE_FASTMEMCPY
-/* 
+/*
    aclib - advanced C library ;)
    This file contains functions which improve and expand standard C-library
 */
 #include <stddef.h>
 
+#define BLOCK_SIZE 4096
+#define CONFUSION_FACTOR 0
+//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
+
+//#define STATISTICS
+
 #ifndef HAVE_SSE2
 /*
    P3 processor has only one SSE decoder so can execute only 1 sse insn per
@@ -103,7 +109,7 @@
 #ifdef HAVE_SSE
 #define MMREG_SIZE 16
 #else
-#define MMREG_SIZE 8
+#define MMREG_SIZE 64 //8
 #endif
 
 /* Small defines (for readability only) ;) */
@@ -132,7 +138,20 @@
 {
	void *retval;
	size_t i;
-  	retval = to;
+	retval = to;
+#ifdef STATISTICS
+	{
+		static int freq[33];
+		static int t=0;
+		int i;
+		for(i=0; len>(1<<i); i++);
+		freq[i]++;
+		t++;
+		if(1024*1024*1024 % t == 0)
+			for(i=0; i<32; i++)
+				printf("freq < %8d %4d\n", 1<<i, freq[i]);
+	}
+#endif
 #ifndef HAVE_MMX1
         /* PREFETCH has effect even for MOVSB instruction ;) */
	__asm__ __volatile__ (
@@ ... @@
+	// Align destination at BLOCK_SIZE boundary
+	for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
+	{
+		__asm__ __volatile__ (
+#ifndef HAVE_MMX1
+		PREFETCH" 320(%0)\n"
+#endif
+		"movq (%0), %%mm0\n"
+		"movq 8(%0), %%mm1\n"
+		"movq 16(%0), %%mm2\n"
+		"movq 24(%0), %%mm3\n"
+		"movq 32(%0), %%mm4\n"
+		"movq 40(%0), %%mm5\n"
+		"movq 48(%0), %%mm6\n"
+		"movq 56(%0), %%mm7\n"
+		MOVNTQ" %%mm0, (%1)\n"
+		MOVNTQ" %%mm1, 8(%1)\n"
+		MOVNTQ" %%mm2, 16(%1)\n"
+		MOVNTQ" %%mm3, 24(%1)\n"
+		MOVNTQ" %%mm4, 32(%1)\n"
+		MOVNTQ" %%mm5, 40(%1)\n"
+		MOVNTQ" %%mm6, 48(%1)\n"
+		MOVNTQ" %%mm7, 56(%1)\n"
+		:: "r" (from), "r" (to) : "memory");
+		((const unsigned char *)from)+=64;
+		((unsigned char *)to)+=64;
+	}
+
+//	printf(" %d %d\n", (int)from&1023, (int)to&1023);
+	// Pure Assembly cuz gcc is a bit unpredictable ;)
+	if(i>=BLOCK_SIZE/64)
+		asm volatile(
+			"xorl %%eax, %%eax	\n\t"
+			".balign 16		\n\t"
+			"1:			\n\t"
+				"movl (%0, %%eax), %%ebx	\n\t"
+				"movl 32(%0, %%eax), %%ebx	\n\t"
+				"movl 64(%0, %%eax), %%ebx	\n\t"
+				"movl 96(%0, %%eax), %%ebx	\n\t"
+				"addl $128, %%eax		\n\t"
+				"cmpl %3, %%eax			\n\t"
+				" jb 1b				\n\t"
+
+			"xorl %%eax, %%eax	\n\t"
+
+			".balign 16		\n\t"
+			"2:			\n\t"
+				"movq (%0, %%eax), %%mm0\n"
+				"movq 8(%0, %%eax), %%mm1\n"
+				"movq 16(%0, %%eax), %%mm2\n"
+				"movq 24(%0, %%eax), %%mm3\n"
+				"movq 32(%0, %%eax), %%mm4\n"
+				"movq 40(%0, %%eax), %%mm5\n"
+				"movq 48(%0, %%eax), %%mm6\n"
+				"movq 56(%0, %%eax), %%mm7\n"
+				MOVNTQ" %%mm0, (%1, %%eax)\n"
+				MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+				MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+				MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+				MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+				MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+				MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+				MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+				"addl $64, %%eax	\n\t"
+				"cmpl %3, %%eax		\n\t"
+				"jb 2b			\n\t"
+
+#if CONFUSION_FACTOR > 0
+	// a few percent speedup on out of order executing CPUs
+			"movl %5, %%eax		\n\t"
+				"2:			\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"decl %%eax		\n\t"
+				" jnz 2b		\n\t"
+#endif
+
+			"xorl %%eax, %%eax	\n\t"
+			"addl %3, %0		\n\t"
+			"addl %3, %1		\n\t"
+			"subl %4, %2		\n\t"
+			"cmpl %4, %2		\n\t"
+			" jae 1b		\n\t"
+				: "+r" (from), "+r" (to), "+r" (i)
+				: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
+				: "%eax", "%ebx"
+		);
+
 	for(; i>0; i--)
 	{
		__asm__ __volatile__ (
@@ -233,16 +342,17 @@
 		((const unsigned char *)from)+=64;
 		((unsigned char *)to)+=64;
 	}
+
 #endif /* Have SSE */
 #ifdef HAVE_MMX2
                 /* since movntq is weakly-ordered, a "sfence"
		 * is needed to become ordered again. */
		__asm__ __volatile__ ("sfence":::"memory");
 #endif
-#ifndef HAVE_SSE 
+#ifndef HAVE_SSE
 	/* enables to use FPU */
 	__asm__ __volatile__ (EMMS:::"memory");
-#endif 
+#endif
 }
 /*
  * Now do the tail of the block