# HG changeset patch
# User atmosfear
# Date 987271004 0
# Node ID 90a50c8e15b8e3084f064e00d430102442b5a795
# Parent b61c5c4484f8227ad227bae6ddd6c4c22929db0c
- applied SSE patch by Nick Kurshev

diff -r b61c5c4484f8 -r 90a50c8e15b8 libvo/fastmemcpy.h
--- a/libvo/fastmemcpy.h	Sat Apr 14 17:55:20 2001 +0000
+++ b/libvo/fastmemcpy.h	Sat Apr 14 17:56:44 2001 +0000
@@ -27,60 +27,57 @@
 	len&=63;
 	
 	__asm__ __volatile__ (
-	"1: prefetchnta (%0)\n"		/* This set is 28 bytes */
-	"   prefetchnta 64(%0)\n"
-	"   prefetchnta 128(%0)\n"
-	"   prefetchnta 192(%0)\n"
-	"   prefetchnta 256(%0)\n"
-#if 0
-	"2:  \n"
-	".section .fixup, \"ax\"\n"
-	"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
-	"   jmp 2b\n"
-	".previous\n"
-	".section __ex_table,\"a\"\n"
-	"	.align 4\n"
-	"	.long 1b, 3b\n"
-	".previous"
-#endif
+	"prefetchnta (%0)\n"
+	"prefetchnta 64(%0)\n"
+	"prefetchnta 128(%0)\n"
+	"prefetchnta 192(%0)\n"
+	"prefetchnta 256(%0)\n"
 	: : "r" (from) );
-		
-	
+	/*
+	   This algorithm works best when the code reads and writes
+	   blocks that are exactly one cache line in size.
+	   The cache line size is processor-dependent, but it is at
+	   least 32 bytes on any of these processors.
+	   Ideally the number of load/store instructions would also be
+	   a multiple of the number of the processor's decoders, but
+	   that is not always possible.
+	*/
 	for(; i>0; i--)
 	{
 		__asm__ __volatile__ (
-		"1:  prefetchnta 320(%0)\n"
-		"2:  movq (%0), %%mm0\n"
-		"    movq 8(%0), %%mm1\n"
-		"    movq 16(%0), %%mm2\n"
-		"    movq 24(%0), %%mm3\n"
-		"    movntq %%mm0, (%1)\n"
-		"    movntq %%mm1, 8(%1)\n"
-		"    movntq %%mm2, 16(%1)\n"
-		"    movntq %%mm3, 24(%1)\n"
-		"    movq 32(%0), %%mm0\n"
-		"    movq 40(%0), %%mm1\n"
-		"    movq 48(%0), %%mm2\n"
-		"    movq 56(%0), %%mm3\n"
-		"    movntq %%mm0, 32(%1)\n"
-		"    movntq %%mm1, 40(%1)\n"
-		"    movntq %%mm2, 48(%1)\n"
-		"    movntq %%mm3, 56(%1)\n"
-#if 0
-		".section .fixup, \"ax\"\n"
-		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
-		"   jmp 2b\n"
-		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 1b, 3b\n"
-		".previous"
-#endif
-		: : "r" (from), "r" (to) : "memory");
+		"prefetchnta 320(%0)\n"
+#ifdef HAVE_SSE /* Only P3 (and maybe Cyrix III) */
+		"movups (%0), %%xmm0\n"
+		"movups 16(%0), %%xmm1\n"
+		"movntps %%xmm0, (%1)\n"
+		"movntps %%xmm1, 16(%1)\n"
+		"movups 32(%0), %%xmm0\n"
+		"movups 48(%0), %%xmm1\n"
+		"movntps %%xmm0, 32(%1)\n"
+		"movntps %%xmm1, 48(%1)\n"
+#else /* Only K7 (and maybe others) */
+		"movq (%0), %%mm0\n"
+		"movq 8(%0), %%mm1\n"
+		"movq 16(%0), %%mm2\n"
+		"movq 24(%0), %%mm3\n"
+		"movntq %%mm0, (%1)\n"
+		"movntq %%mm1, 8(%1)\n"
+		"movntq %%mm2, 16(%1)\n"
+		"movntq %%mm3, 24(%1)\n"
+		"movq 32(%0), %%mm0\n"
+		"movq 40(%0), %%mm1\n"
+		"movq 48(%0), %%mm2\n"
+		"movq 56(%0), %%mm3\n"
+		"movntq %%mm0, 32(%1)\n"
+		"movntq %%mm1, 40(%1)\n"
+		"movntq %%mm2, 48(%1)\n"
+		"movntq %%mm3, 56(%1)\n"
+#endif
+		:: "r" (from), "r" (to) : "memory");
 		from+=64;
 		to+=64;
 	}
-	__asm__ __volatile__ ("emms":::"memory");
+	__asm__ __volatile__ ("emms":::"memory");
 	}
 	/*
 	 *	Now do the tail of the block
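
For readers who want to see the technique outside of inline assembly, below is a minimal C sketch (not part of the changeset) of the same idea the new HAVE_SSE branch implements: prefetch the source with prefetchnta, read each 64-byte block with unaligned SSE loads, and write it with non-temporal (cache-bypassing) stores so the copied frame data does not evict the working set from the cache. The function name sse_stream_copy is hypothetical; like movntps in the patch, _mm_stream_ps requires a 16-byte-aligned destination, and the closing sfence is added here for completeness even though the patched code shown above does not issue one.

/*
 * Sketch only: the cache-line-at-a-time non-temporal copy of the HAVE_SSE
 * branch, expressed with SSE intrinsics instead of inline assembly.
 * Assumes `to` is 16-byte aligned (the same assumption movntps makes).
 */
#include <stddef.h>
#include <string.h>
#include <xmmintrin.h>  /* _mm_loadu_ps, _mm_stream_ps, _mm_prefetch, _mm_sfence */

static void sse_stream_copy(void *to, const void *from, size_t len)
{
	const char *src = (const char *)from;
	char *dst = (char *)to;
	size_t blocks = len >> 6;            /* whole 64-byte blocks */

	for (; blocks > 0; blocks--) {
		/* Prefetch well ahead of the read position; prefetch hints
		 * never fault, so running past the end of the source is safe. */
		_mm_prefetch(src + 320, _MM_HINT_NTA);

		/* Unaligned loads from the source, non-temporal stores to the
		 * 16-byte-aligned destination: 64 bytes per iteration, mirroring
		 * the movups/movntps pairs in the patch. */
		__m128 a = _mm_loadu_ps((const float *)(src +  0));
		__m128 b = _mm_loadu_ps((const float *)(src + 16));
		_mm_stream_ps((float *)(dst +  0), a);
		_mm_stream_ps((float *)(dst + 16), b);
		a = _mm_loadu_ps((const float *)(src + 32));
		b = _mm_loadu_ps((const float *)(src + 48));
		_mm_stream_ps((float *)(dst + 32), a);
		_mm_stream_ps((float *)(dst + 48), b);

		src += 64;
		dst += 64;
	}

	/* Make the streaming stores globally visible before the tail copy. */
	_mm_sfence();

	/* Tail: copy the remaining len & 63 bytes conventionally. */
	memcpy(dst, src, len & 63);
}

Compiled with -msse on a 32-bit target (or on any x86-64 compiler, where SSE is always available), a caller would invoke it as sse_stream_copy(dst_frame, src_frame, frame_bytes) with a 16-byte-aligned dst_frame, which matches how fast_memcpy is used on video buffers in libvo.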