#ifndef __MPLAYER_MEMCPY
#define __MPLAYER_MEMCPY

/*
 This part of the code was taken from Linux-2.4.3 and slightly modified
for the MMX2 and SSE instruction sets. I have done this since Linux uses
page-aligned blocks, but MPlayer uses weakly ordered data, and the original
routines cannot speed it up. Only using PREFETCHNTA and MOVNTQ together has
an effect!

From the IA-32 Intel Architecture Software Developer's Manual Volume 1,
Order Number 245470:
"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

Data referenced by a program can be temporal (data will be used again) or
non-temporal (data will be referenced once and not reused in the immediate
future). To make efficient use of the processor's caches, it is generally
desirable to cache temporal data and not cache non-temporal data. Overloading
the processor's caches with non-temporal data is sometimes referred to as
"polluting the caches".
Non-temporal data is written to memory with Write-Combining semantics.

The PREFETCHh instructions permit a program to load data into the processor
at a suggested cache level, so that it is closer to the processor's load and
store units when it is needed. If the data is already present in a level of
the cache hierarchy that is closer to the processor, the PREFETCHh instruction
will not result in any data movement.
But we should use PREFETCHNTA: it fetches non-temporal data into a location
close to the processor, minimizing cache pollution.

The MOVNTQ (store quadword using non-temporal hint) instruction stores
packed integer data from an MMX register to memory, using a non-temporal hint.
The MOVNTPS (store packed single-precision floating-point values using
non-temporal hint) instruction stores packed floating-point data from an
XMM register to memory, using a non-temporal hint.

The SFENCE (Store Fence) instruction controls write ordering by creating a
fence for memory store operations. This instruction guarantees that the results
of every store instruction that precedes the store fence in program order are
globally visible before any store instruction that follows the fence. The
SFENCE instruction provides an efficient way of ensuring ordering between
procedures that produce weakly-ordered data and procedures that consume that
data.

If you have questions please contact me: Nick Kurshev: nickols_k@mail.ru.
*/
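
/*
 Illustrative sketch (not part of the original header; the function and flag
 names are hypothetical): the SFENCE paragraph above matters when one routine
 produces data with non-temporal stores (for example via fast_memcpy below)
 and another routine consumes it. Because MOVNTQ/MOVNTPS stores are weakly
 ordered, the producer must be fenced before it publishes a "data is ready"
 flag, otherwise the consumer could see the flag before the copied data is
 globally visible.
*/
#if 0
static volatile int frame_ready;	/* hypothetical "data published" flag */

static void produce_frame(void *dst, const void *src, unsigned len)
{
	fast_memcpy(dst, src, len);	/* large copies use weakly ordered movntq/movntps stores */
	/* the non-temporal path of fast_memcpy ends with sfence, so the flag
	   store below cannot become globally visible before the copied data */
	frame_ready = 1;
}
#endif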

// 3dnow memcpy support from kernel 2.4.2
// by Pontscho/fresh!mindworkz

#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW )

#undef HAVE_K6_2PLUS
#if !defined( HAVE_MMX2 ) && defined( HAVE_3DNOW )
#define HAVE_K6_2PLUS
#endif

/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
register unsigned long int dummy;\
__asm__ __volatile__(\
	"rep ; movsb\n"\
	:"=&D" (to), "=&S" (from), "=&c" (dummy)\
	/* the output constraints tell gcc that edi, esi and ecx are modified */\
	:"0" (to), "1" (from), "2" (n)\
	: "memory");\
}
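
/*
 Illustrative sketch (not part of the original header; the helper name is
 hypothetical): what small_memcpy() does, written in plain C. "rep ; movsb"
 copies ECX bytes from [ESI] to [EDI] one byte at a time, advancing both
 pointers as it goes.
*/
#if 0
static void small_memcpy_c(void *to, const void *from, unsigned n)
{
	unsigned char *d = to;
	const unsigned char *s = from;
	while (n--)
		*d++ = *s++;
}
#endif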

inline static void * fast_memcpy(void * to, const void * from, unsigned len)
{
	/* keep the original destination: `to' is advanced below, but memcpy()
	   must return the pointer it was given */
	void *p = to;
	int i;

#ifdef HAVE_SSE /* Only P3 (maybe Cyrix III) */
//	printf("fastmemcpy_pre(0x%X,0x%X,0x%X)\n",to,from,len);
	// Align dest to 16-byte boundary:
	if((unsigned long)to&15){
		int len2=16-((unsigned long)to&15);
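		/* Worked example (illustrative): if to == 0x1003 then
		   ((unsigned long)to & 15) == 3, so len2 == 13 and the 13-byte
		   "rep movsb" below advances the destination to 0x1010, a
		   16-byte boundary, before the movntps loop. */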
		if(len>len2){
			len-=len2;
			__asm__ __volatile__(
				"rep ; movsb\n"
				:"=D" (to), "=S" (from)
				: "D" (to), "S" (from),"c" (len2)
				: "memory");
		}
	}
//	printf("fastmemcpy(0x%X,0x%X,0x%X)\n",to,from,len);
#endif

	if(len >= 0x200) /* 512-byte blocks */
	{
		i = len >> 6; /* len/64 */
		len&=63;

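		/* Worked example (illustrative): len == 1000 gives i == 15 blocks
		   of 64 bytes (960 bytes, copied by the loops below) and leaves
		   len == 40 bytes for the trailing small_memcpy(). */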
		__asm__ __volatile__ (
#ifdef HAVE_K6_2PLUS
			"prefetch (%0)\n"
			"prefetch 64(%0)\n"
			"prefetch 128(%0)\n"
			"prefetch 192(%0)\n"
			"prefetch 256(%0)\n"
#else /* K7, P3, CyrixIII */
			"prefetchnta (%0)\n"
			"prefetchnta 64(%0)\n"
			"prefetchnta 128(%0)\n"
			"prefetchnta 192(%0)\n"
			"prefetchnta 256(%0)\n"
#endif
			: : "r" (from) );
		/*
		   This algorithm is most effective when the code reads and writes
		   blocks that are the size of a cache line. The cache line size is
		   processor-dependent, but it will be at least 32 bytes on any of
		   these processors. It would also be better for the number of read
		   and write instructions to be a multiple of the number of the
		   processor's decoders, but that is not always possible.
		*/
#ifdef HAVE_SSE /* Only P3 (maybe Cyrix III) */
		if(((unsigned long)from) & 15)
		/* if SRC is misaligned */
		for(; i>0; i--)
		{
			__asm__ __volatile__ (
				"prefetchnta 320(%0)\n"
				"movups (%0), %%xmm0\n"
				"movups 16(%0), %%xmm1\n"
				"movntps %%xmm0, (%1)\n"
				"movntps %%xmm1, 16(%1)\n"
				"movups 32(%0), %%xmm0\n"
				"movups 48(%0), %%xmm1\n"
				"movntps %%xmm0, 32(%1)\n"
				"movntps %%xmm1, 48(%1)\n"
				:: "r" (from), "r" (to) : "memory");
			from+=64;
			to+=64;
		}
		else
		/*
		   Only if SRC is aligned on a 16-byte boundary: this allows movaps
		   to be used instead of movups. movaps requires its data to be
		   aligned, otherwise a general-protection exception (#GP) is
		   generated.
		*/
		for(; i>0; i--)
		{
			__asm__ __volatile__ (
				"prefetchnta 320(%0)\n"
				"movaps (%0), %%xmm0\n"
				"movaps 16(%0), %%xmm1\n"
				"movntps %%xmm0, (%1)\n"
				"movntps %%xmm1, 16(%1)\n"
				"movaps 32(%0), %%xmm0\n"
				"movaps 48(%0), %%xmm1\n"
				"movntps %%xmm0, 32(%1)\n"
				"movntps %%xmm1, 48(%1)\n"
				:: "r" (from), "r" (to) : "memory");
			from+=64;
			to+=64;
		}
#else
		for(; i>0; i--)
		{
			__asm__ __volatile__ (
#ifdef HAVE_K6_2PLUS
				"prefetch 320(%0)\n"
#else
				"prefetchnta 320(%0)\n"
#endif
#ifdef HAVE_K6_2PLUS
				"movq (%0), %%mm0\n"
				"movq 8(%0), %%mm1\n"
				"movq 16(%0), %%mm2\n"
				"movq 24(%0), %%mm3\n"
				"movq %%mm0, (%1)\n"
				"movq %%mm1, 8(%1)\n"
				"movq %%mm2, 16(%1)\n"
				"movq %%mm3, 24(%1)\n"
				"movq 32(%0), %%mm0\n"
				"movq 40(%0), %%mm1\n"
				"movq 48(%0), %%mm2\n"
				"movq 56(%0), %%mm3\n"
				"movq %%mm0, 32(%1)\n"
				"movq %%mm1, 40(%1)\n"
				"movq %%mm2, 48(%1)\n"
				"movq %%mm3, 56(%1)\n"
#else /* K7 */
				"movq (%0), %%mm0\n"
				"movq 8(%0), %%mm1\n"
				"movq 16(%0), %%mm2\n"
				"movq 24(%0), %%mm3\n"
				"movntq %%mm0, (%1)\n"
				"movntq %%mm1, 8(%1)\n"
				"movntq %%mm2, 16(%1)\n"
				"movntq %%mm3, 24(%1)\n"
				"movq 32(%0), %%mm0\n"
				"movq 40(%0), %%mm1\n"
				"movq 48(%0), %%mm2\n"
				"movq 56(%0), %%mm3\n"
				"movntq %%mm0, 32(%1)\n"
				"movntq %%mm1, 40(%1)\n"
				"movntq %%mm2, 48(%1)\n"
				"movntq %%mm3, 56(%1)\n"
#endif
				:: "r" (from), "r" (to) : "memory");
			from+=64;
			to+=64;
		}
#endif /* HAVE_SSE */
#ifdef HAVE_K6_2PLUS
		/* On K6 femms is faster than emms.
		   On K7 femms is directly mapped to emms. */
		__asm__ __volatile__ ("femms":::"memory");
#else /* K7, P3, CyrixIII */
		/* Since movntq is weakly ordered, an "sfence" is needed to make
		 * the stores ordered (globally visible) again. */
		__asm__ __volatile__ ("sfence":::"memory");
#ifndef HAVE_SSE
		/* emms clears the MMX state so the FPU can be used again */
		__asm__ __volatile__ ("emms":::"memory");
#endif
#endif
	}
	/*
	 * Now do the tail of the block
	 */
	small_memcpy(to, from, len);
	return p;
}
#define memcpy(a,b,c) fast_memcpy(a,b,c)
#undef small_memcpy
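
/*
 Usage sketch (illustrative; the include name and function are hypothetical):
 a file that includes this header with HAVE_MMX2 or HAVE_3DNOW defined gets
 its ordinary memcpy() calls redirected to fast_memcpy() by the macro above.
*/
#if 0
#include "fastmemcpy.h"

static void copy_video_line(void *dst, const void *src, unsigned bytes)
{
	memcpy(dst, src, bytes);	/* expands to fast_memcpy(dst, src, bytes) */
}
#endif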

#endif /* defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) */

#endif /* __MPLAYER_MEMCPY */
|