#ifndef __MPLAYER_MEMCPY
#define __MPLAYER_MEMCPY

/*
   This part of the code was taken from Linux-2.4.3 and slightly modified
   for the MMX2 instruction set. I did this because Linux works with
   page-aligned blocks, while MPlayer uses weakly ordered data, so the
   original sources could not speed it up. Only using prefetchnta and
   movntq together has an effect!
   If you have questions, please contact me: Nick Kurshev: nickols_k@mail.ru.
*/

// 3dnow memcpy support from kernel 2.4.2
// by Pontscho/fresh!mindworkz

#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW )

/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
    __asm__ __volatile__(\
        "rep ; movsb\n"\
        ::"D" (to), "S" (from), "c" (n)\
        : "memory");\
}

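/*
 * In small_memcpy() above, the "D", "S" and "c" constraints place `to`, `from`
 * and `n` in EDI, ESI and ECX, the registers that `rep movsb` implicitly uses;
 * the "memory" clobber keeps the compiler from reordering memory accesses
 * around the copy.
 */
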
inline static void * fast_memcpy(void * to, const void * from, unsigned len)
{
    void *p = to;   /* memcpy() always returns the destination pointer */
    int i;

    if(len >= 0x200) /* 512-byte blocks */
    {
        i = len >> 6; /* len/64 */
        len &= 63;

        __asm__ __volatile__ (
#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
            "prefetch (%0)\n"
            "prefetch 64(%0)\n"
            "prefetch 128(%0)\n"
            "prefetch 192(%0)\n"
            "prefetch 256(%0)\n"
#else
            "prefetchnta (%0)\n"
            "prefetchnta 64(%0)\n"
            "prefetchnta 128(%0)\n"
            "prefetchnta 192(%0)\n"
            "prefetchnta 256(%0)\n"
#endif
            : : "r" (from) );
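        /*
           prefetch is the 3DNow! form and prefetchnta the SSE form of the
           same idea: pull the first cache lines of the source toward the CPU
           before the copy loop starts, with prefetchnta additionally hinting
           that the data is non-temporal so it causes less cache pollution.
        */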
        /*
           This algorithm is most effective when the code sequentially reads
           and writes cache-line-sized blocks. The cache-line size is
           processor-dependent, but it is at least 32 bytes on any current
           processor. Ideally the number of load and store instructions would
           be a multiple of the number of the processor's decoders, but that
           is not always possible.
        */
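        /*
           For example, with 64-byte cache lines each iteration of the loop
           below streams exactly one line: eight movq loads and eight movntq
           (or movq) stores, or four movups loads and four movntps stores on
           the SSE path.
        */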
        for(; i>0; i--)
        {
            __asm__ __volatile__ (
#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
            "prefetch 320(%0)\n"
#else
            "prefetchnta 320(%0)\n"
#endif
#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
            "movups (%0), %%xmm0\n"
            "movups 16(%0), %%xmm1\n"
            "movntps %%xmm0, (%1)\n"
            "movntps %%xmm1, 16(%1)\n"
            "movups 32(%0), %%xmm0\n"
            "movups 48(%0), %%xmm1\n"
            "movntps %%xmm0, 32(%1)\n"
            "movntps %%xmm1, 48(%1)\n"
#else /* Only K7 (may be other) */
#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
            "movq (%0), %%mm0\n"
            "movq 8(%0), %%mm1\n"
            "movq 16(%0), %%mm2\n"
            "movq 24(%0), %%mm3\n"
            "movq %%mm0, (%1)\n"
            "movq %%mm1, 8(%1)\n"
            "movq %%mm2, 16(%1)\n"
            "movq %%mm3, 24(%1)\n"
            "movq 32(%0), %%mm0\n"
            "movq 40(%0), %%mm1\n"
            "movq 48(%0), %%mm2\n"
            "movq 56(%0), %%mm3\n"
            "movq %%mm0, 32(%1)\n"
            "movq %%mm1, 40(%1)\n"
            "movq %%mm2, 48(%1)\n"
            "movq %%mm3, 56(%1)\n"
#else
98 "movq (%0), %%mm0\n"
|
|
99 "movq 8(%0), %%mm1\n"
|
|
100 "movq 16(%0), %%mm2\n"
|
|
101 "movq 24(%0), %%mm3\n"
|
|
102 "movntq %%mm0, (%1)\n"
|
|
103 "movntq %%mm1, 8(%1)\n"
|
|
104 "movntq %%mm2, 16(%1)\n"
|
|
105 "movntq %%mm3, 24(%1)\n"
|
|
106 "movq 32(%0), %%mm0\n"
|
|
107 "movq 40(%0), %%mm1\n"
|
|
108 "movq 48(%0), %%mm2\n"
|
|
109 "movq 56(%0), %%mm3\n"
|
|
110 "movntq %%mm0, 32(%1)\n"
|
|
111 "movntq %%mm1, 40(%1)\n"
|
|
112 "movntq %%mm2, 48(%1)\n"
|
|
113 "movntq %%mm3, 56(%1)\n"
|
|
114 #endif
|
477
|
115 #endif
|
409
|
116 :: "r" (from), "r" (to) : "memory");
|
358
|
            from += 64;
            to += 64;
        }
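        /* femms (the faster 3DNow! variant) and emms clear the MMX register
           state so that any x87 FPU code executed afterwards is safe. */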
#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
        __asm__ __volatile__ ("femms":::"memory");
#else
        __asm__ __volatile__ ("emms":::"memory");
#endif
    }
    /*
     * Now do the tail of the block
     */
#if 0
    small_memcpy(to, from, len);
#else
    __asm__ __volatile__ (
        "shrl $1,%%ecx\n"
        "jnc 1f\n"
        "movsb\n"
        "1:\n"
        "shrl $1,%%ecx\n"
        "jnc 2f\n"
        "movsw\n"
        "2:\n"
        "rep ; movsl\n"
        ::"D" (to), "S" (from), "c" (len)
        : "memory");
#endif
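    /*
     * Roughly equivalent C for the tail copy above (illustration only): the
     * first shift tests bit 0 of len and copies one odd byte, the second
     * tests bit 1 and copies one 16-bit word, and rep movsl then copies the
     * remaining len/4 dwords:
     *
     *     char *d = to; const char *s = from;
     *     if (len & 1) *d++ = *s++;
     *     if (len & 2) { *d++ = *s++; *d++ = *s++; }
     *     for (i = len >> 2; i > 0; i--) { d[0]=s[0]; d[1]=s[1]; d[2]=s[2]; d[3]=s[3]; d += 4; s += 4; }
     */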
    return p;
}
#define memcpy(a,b,c) fast_memcpy(a,b,c)

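/*
 * Usage sketch (names are placeholders): a file that includes this header
 * with HAVE_MMX2 or HAVE_3DNOW defined gets the streaming copy transparently,
 * since plain calls such as
 *
 *     memcpy(dst_frame, src_frame, frame_size);
 *
 * now expand to fast_memcpy(dst_frame, src_frame, frame_size).
 */
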
#endif

#endif