# HG changeset patch
# User michael
# Date 1006458038 0
# Node ID 99f6db3255aa6e28e5ecdefdb6645784f00b6089
# Parent  bfc881c0e591e4291ba92b8121bd9291aa1c2e92
10-20% faster fastmemcpy :) on my p3 at least
but the algo is mostly from "amd athlon processor x86 code optimization guide"
so it should be faster for amd chips too, but i fear it might be slower for
mem->vram copies (someone should check that, i cant) ...
there are 2 #defines to finetune it (BLOCK_SIZE & CONFUSION_FACTOR)

diff -r bfc881c0e591 -r 99f6db3255aa libvo/aclib.c
--- a/libvo/aclib.c	Thu Nov 22 19:22:49 2001 +0000
+++ b/libvo/aclib.c	Thu Nov 22 19:40:38 2001 +0000
@@ -1,13 +1,19 @@
 #include "../config.h"
 
 #ifdef USE_FASTMEMCPY
-/* 
+/*
    aclib - advanced C library ;)
    This file contains functions which improve and expand standard C-library
 */
 #include <stddef.h>
 
+#define BLOCK_SIZE 4096
+#define CONFUSION_FACTOR 0
+//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
+
+//#define STATISTICS
+
 #ifndef HAVE_SSE2
 /*
    P3 processor has only one SSE decoder so can execute only 1 sse insn per
@@ -103,7 +109,7 @@
 #ifdef HAVE_SSE
 #define MMREG_SIZE 16
 #else
-#define MMREG_SIZE 8
+#define MMREG_SIZE 64 //8
 #endif
 
 /* Small defines (for readability only) ;) */
@@ -132,7 +138,20 @@
 {
 	void *retval;
 	size_t i;
-  	retval = to;
+	retval = to;
+#ifdef STATISTICS
+	{
+		static int freq[33];
+		static int t=0;
+		int i;
+		for(i=0; len>(1<<i); i++);
+		freq[i]++;
+		t++;
+		if(1024*1024*1024 % t == 0)
+			for(i=0; i<32; i++)
+				printf("freq < %8d %4d\n", 1<<i, freq[i]);
+	}
+#endif
 #ifndef HAVE_MMX1
         /* PREFETCH has effect even for MOVSB instruction ;) */
 	__asm__ __volatile__ (
@@ ... @@
+	// Align destination at BLOCK_SIZE boundary
+	for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
+	{
+		__asm__ __volatile__ (
+#ifndef HAVE_MMX1
+		PREFETCH" 320(%0)\n"
+#endif
+		"movq (%0), %%mm0\n"
+		"movq 8(%0), %%mm1\n"
+		"movq 16(%0), %%mm2\n"
+		"movq 24(%0), %%mm3\n"
+		"movq 32(%0), %%mm4\n"
+		"movq 40(%0), %%mm5\n"
+		"movq 48(%0), %%mm6\n"
+		"movq 56(%0), %%mm7\n"
+		MOVNTQ" %%mm0, (%1)\n"
+		MOVNTQ" %%mm1, 8(%1)\n"
+		MOVNTQ" %%mm2, 16(%1)\n"
+		MOVNTQ" %%mm3, 24(%1)\n"
+		MOVNTQ" %%mm4, 32(%1)\n"
+		MOVNTQ" %%mm5, 40(%1)\n"
+		MOVNTQ" %%mm6, 48(%1)\n"
+		MOVNTQ" %%mm7, 56(%1)\n"
+		:: "r" (from), "r" (to) : "memory");
+		((const unsigned char *)from)+=64;
+		((unsigned char *)to)+=64;
+	}
+
+//	printf(" %d %d\n", (int)from&1023, (int)to&1023);
+	// Pure Assembly cuz gcc is a bit unpredictable ;)
+	if(i>=BLOCK_SIZE/64)
+		asm volatile(
+			"xorl %%eax, %%eax	\n\t"
+			".balign 16		\n\t"
+			"1:			\n\t"
+				"movl (%0, %%eax), %%ebx	\n\t"
+				"movl 32(%0, %%eax), %%ebx	\n\t"
+				"movl 64(%0, %%eax), %%ebx	\n\t"
+				"movl 96(%0, %%eax), %%ebx	\n\t"
+				"addl $128, %%eax		\n\t"
+				"cmpl %3, %%eax			\n\t"
+				" jb 1b				\n\t"
+
+			"xorl %%eax, %%eax	\n\t"
+
+			".balign 16		\n\t"
+			"2:			\n\t"
+				"movq (%0, %%eax), %%mm0\n"
+				"movq 8(%0, %%eax), %%mm1\n"
+				"movq 16(%0, %%eax), %%mm2\n"
+				"movq 24(%0, %%eax), %%mm3\n"
+				"movq 32(%0, %%eax), %%mm4\n"
+				"movq 40(%0, %%eax), %%mm5\n"
+				"movq 48(%0, %%eax), %%mm6\n"
+				"movq 56(%0, %%eax), %%mm7\n"
+				MOVNTQ" %%mm0, (%1, %%eax)\n"
+				MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+				MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+				MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+				MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+				MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+				MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+				MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+				"addl $64, %%eax	\n\t"
+				"cmpl %3, %%eax		\n\t"
+				"jb 2b			\n\t"
+
+#if CONFUSION_FACTOR > 0
+	// a few percent speedup on out of order executing CPUs
+			"movl %5, %%eax		\n\t"
+				"2:			\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"decl %%eax		\n\t"
+				" jnz 2b		\n\t"
+#endif
+
+			"xorl %%eax, %%eax	\n\t"
+			"addl %3, %0		\n\t"
+			"addl %3, %1		\n\t"
+			"subl %4, %2		\n\t"
+			"cmpl %4, %2		\n\t"
+			" jae 1b		\n\t"
+				: "+r" (from), "+r" (to), "+r" (i)
+				: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
+				: "%eax", "%ebx"
+		);
+
 	for(; i>0; i--)
 	{
 		__asm__ __volatile__ (
@@ -233,16 +342,17 @@
 		((const unsigned char *)from)+=64;
 		((unsigned char *)to)+=64;
 	}
+
 #endif /* Have SSE */
 #ifdef HAVE_MMX2
                 /* since movntq is weakly-ordered, a "sfence"
		 * is needed to become ordered again. */
		__asm__ __volatile__ ("sfence":::"memory");
 #endif
-#ifndef HAVE_SSE 
+#ifndef HAVE_SSE
 	/* enables to use FPU */
 	__asm__ __volatile__ (EMMS:::"memory");
-#endif 
+#endif
 }
 /*
  * Now do the tail of the block
diff -r bfc881c0e591 -r 99f6db3255aa libvo/aclib_template.c
--- a/libvo/aclib_template.c	Thu Nov 22 19:22:49 2001 +0000
+++ b/libvo/aclib_template.c	Thu Nov 22 19:40:38 2001 +0000
@@ -1,13 +1,19 @@
 #include "../config.h"
 
 #ifdef USE_FASTMEMCPY
-/* 
+/*
    aclib - advanced C library ;)
    This file contains functions which improve and expand standard C-library
 */
 #include <stddef.h>
 
+#define BLOCK_SIZE 4096
+#define CONFUSION_FACTOR 0
+//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
+
+//#define STATISTICS
+
 #ifndef HAVE_SSE2
 /*
    P3 processor has only one SSE decoder so can execute only 1 sse insn per
@@ -103,7 +109,7 @@
 #ifdef HAVE_SSE
 #define MMREG_SIZE 16
 #else
-#define MMREG_SIZE 8
+#define MMREG_SIZE 64 //8
 #endif
 
 /* Small defines (for readability only) ;) */
@@ -132,7 +138,20 @@
 {
	void *retval;
	size_t i;
-  	retval = to;
+	retval = to;
+#ifdef STATISTICS
+	{
+		static int freq[33];
+		static int t=0;
+		int i;
+		for(i=0; len>(1<<i); i++);
+		freq[i]++;
+		t++;
+		if(1024*1024*1024 % t == 0)
+			for(i=0; i<32; i++)
+				printf("freq < %8d %4d\n", 1<<i, freq[i]);
+	}
+#endif
 #ifndef HAVE_MMX1
         /* PREFETCH has effect even for MOVSB instruction ;) */
	__asm__ __volatile__ (
@@ ... @@
+	// Align destination at BLOCK_SIZE boundary
+	for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
+	{
+		__asm__ __volatile__ (
+#ifndef HAVE_MMX1
+		PREFETCH" 320(%0)\n"
+#endif
+		"movq (%0), %%mm0\n"
+		"movq 8(%0), %%mm1\n"
+		"movq 16(%0), %%mm2\n"
+		"movq 24(%0), %%mm3\n"
+		"movq 32(%0), %%mm4\n"
+		"movq 40(%0), %%mm5\n"
+		"movq 48(%0), %%mm6\n"
+		"movq 56(%0), %%mm7\n"
+		MOVNTQ" %%mm0, (%1)\n"
+		MOVNTQ" %%mm1, 8(%1)\n"
+		MOVNTQ" %%mm2, 16(%1)\n"
+		MOVNTQ" %%mm3, 24(%1)\n"
+		MOVNTQ" %%mm4, 32(%1)\n"
+		MOVNTQ" %%mm5, 40(%1)\n"
+		MOVNTQ" %%mm6, 48(%1)\n"
+		MOVNTQ" %%mm7, 56(%1)\n"
+		:: "r" (from), "r" (to) : "memory");
+		((const unsigned char *)from)+=64;
+		((unsigned char *)to)+=64;
+	}
+
+//	printf(" %d %d\n", (int)from&1023, (int)to&1023);
+	// Pure Assembly cuz gcc is a bit unpredictable ;)
+	if(i>=BLOCK_SIZE/64)
+		asm volatile(
+			"xorl %%eax, %%eax	\n\t"
+			".balign 16		\n\t"
+			"1:			\n\t"
+				"movl (%0, %%eax), %%ebx	\n\t"
+				"movl 32(%0, %%eax), %%ebx	\n\t"
+				"movl 64(%0, %%eax), %%ebx	\n\t"
+				"movl 96(%0, %%eax), %%ebx	\n\t"
+				"addl $128, %%eax		\n\t"
+				"cmpl %3, %%eax			\n\t"
+				" jb 1b				\n\t"
+
+			"xorl %%eax, %%eax	\n\t"
+
+			".balign 16		\n\t"
+			"2:			\n\t"
+				"movq (%0, %%eax), %%mm0\n"
+				"movq 8(%0, %%eax), %%mm1\n"
+				"movq 16(%0, %%eax), %%mm2\n"
+				"movq 24(%0, %%eax), %%mm3\n"
+				"movq 32(%0, %%eax), %%mm4\n"
+				"movq 40(%0, %%eax), %%mm5\n"
+				"movq 48(%0, %%eax), %%mm6\n"
+				"movq 56(%0, %%eax), %%mm7\n"
+				MOVNTQ" %%mm0, (%1, %%eax)\n"
+				MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+				MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+				MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+				MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+				MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+				MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+				MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+				"addl $64, %%eax	\n\t"
+				"cmpl %3, %%eax		\n\t"
+				"jb 2b			\n\t"
+
+#if CONFUSION_FACTOR > 0
+	// a few percent speedup on out of order executing CPUs
+			"movl %5, %%eax		\n\t"
+				"2:			\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"movl (%0), %%ebx	\n\t"
+				"decl %%eax		\n\t"
+				" jnz 2b		\n\t"
+#endif
+
+			"xorl %%eax, %%eax	\n\t"
+			"addl %3, %0		\n\t"
+			"addl %3, %1		\n\t"
+			"subl %4, %2		\n\t"
+			"cmpl %4, %2		\n\t"
+			" jae 1b		\n\t"
+				: "+r" (from), "+r" (to), "+r" (i)
+				: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
+				: "%eax", "%ebx"
+		);
+
 	for(; i>0; i--)
 	{
		__asm__ __volatile__ (
@@ -233,16 +342,17 @@
 		((const unsigned char *)from)+=64;
 		((unsigned char *)to)+=64;
 	}
+
 #endif /* Have SSE */
 #ifdef HAVE_MMX2
                 /* since movntq is weakly-ordered, a "sfence"
		 * is needed to become ordered again. */
		__asm__ __volatile__ ("sfence":::"memory");
 #endif
-#ifndef HAVE_SSE 
+#ifndef HAVE_SSE
 	/* enables to use FPU */
 	__asm__ __volatile__ (EMMS:::"memory");
-#endif 
+#endif
 }
 /*
  * Now do the tail of the block