Mercurial > mplayer.hg
annotate mmx.h @ 16995:f0247f8df2fa
Small fixes...
author | ptt |
---|---|
date | Wed, 16 Nov 2005 19:11:03 +0000 |
parents | f03a8d54e5f9 |
children |
rev | line source |
---|---|
2509 | 1 /* mmx.h |
2 | |
3 MultiMedia eXtensions GCC interface library for IA32. | |
4 | |
5 To use this library, simply include this header file | |
6 and compile with GCC. You MUST have inlining enabled | |
7 in order for mmx_ok() to work; this can be done by | |
8 simply using -O on the GCC command line. | |
9 | |
10 Compiling with -DMMX_TRACE will cause detailed trace | |
11 output to be sent to stderr for each mmx operation. | |
12 This adds lots of code, and obviously slows execution to | |
13 a crawl, but can be very useful for debugging. | |
14 | |
15 THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY | |
16 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT | |
17 LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY | |
18 AND FITNESS FOR ANY PARTICULAR PURPOSE. | |
19 | |
20 1997-99 by H. Dietz and R. Fisher | |
21 | |
22 Notes: | |
23 It appears that the latest gas has the pand problem fixed, therefore | |
24 I'll undefine BROKEN_PAND by default. | |
25 */ | |
26 | |
27 #ifndef _MMX_H | |
28 #define _MMX_H | |
29 | |
30 | |
31 /* Warning: at this writing, the version of GAS packaged | |
32 with most Linux distributions does not handle the | |
33 parallel AND operation mnemonic correctly. If the | |
34 symbol BROKEN_PAND is defined, a slower alternative | |
35 coding will be used. If execution of mmxtest results | |
36 in an illegal instruction fault, define this symbol. | |
37 */ | |
38 #undef BROKEN_PAND | |
39 | |
40 | |
41 /* The type of a value that fits in an MMX register | |
42 (note that long long constant values MUST be suffixed | |
43 by LL and unsigned long long values by ULL, lest | |
44 they be truncated by the compiler) | |
45 */ | |
/* One 64-bit MMX register image.  Every member overlays the same
   eight bytes, so a value stored through one member can be read
   back through any other lane layout.
*/
typedef union {
	long long		q;	/* Quadword (64-bit) value */
	unsigned long long	uq;	/* Unsigned Quadword */
	int			d[2];	/* 2 Doubleword (32-bit) values */
	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
	short			w[4];	/* 4 Word (16-bit) values */
	unsigned short		uw[4];	/* 4 Unsigned Word */
	char			b[8];	/* 8 Byte (8-bit) values */
	unsigned char		ub[8];	/* 8 Unsigned Byte */
	float			s[2];	/* 2 Single-precision (32-bit) values */
} __attribute__ ((aligned (8))) mmx_t;	/* On an 8-byte (64-bit) boundary */
57 | |
58 | |
59 | |
60 /* Function to test if multimedia instructions are supported... | |
61 */ | |
/* Private helper for mm_support(): execute CPUID for `leaf' and store
   the four result registers through the given pointers.
*/
static inline void
mm_cpuid(unsigned int leaf, unsigned int *eax, unsigned int *ebx,
	 unsigned int *ecx, unsigned int *edx)
{
#if defined(__i386__)
	/* %ebx may be the PIC base register on IA32, so it is swapped
	   out around the instruction instead of being named directly. */
	__asm__ __volatile__ (
		"xchgl %%ebx, %1\n\t"
		"cpuid\n\t"
		"xchgl %%ebx, %1"
		: "=a" (*eax), "=&r" (*ebx), "=c" (*ecx), "=d" (*edx)
		: "0" (leaf));
#elif defined(__x86_64__)
	__asm__ __volatile__ (
		"cpuid"
		: "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
		: "0" (leaf));
#else
	(void) leaf;
	*eax = *ebx = *ecx = *edx = 0;
#endif
}

/* Returns 1 if MMX instructions are supported,
   3 if Cyrix MMX and Extended MMX instructions are supported
   5 if AMD MMX and 3DNow! instructions are supported
   0 if hardware does not support any of these

   Declared static inline so that the function still links when the
   compiler decides not to inline it.  Vendors other than AMD and
   Cyrix (Intel included) are answered from standard CPUID level 1.
*/
static inline int
mm_support(void)
{
#if defined(__i386__) || defined(__x86_64__)
	unsigned int max_std, max_ext;
	unsigned int eax, ebx, ecx, edx;

#if defined(__i386__)
	/* See if CPUID instruction is supported: it is only if the ID
	   bit (bit 21) of EFLAGS can be toggled.  x86-64 processors
	   always have CPUID, so the probe is IA32 only. */
	{
		unsigned int before, after;

		__asm__ __volatile__ (
			"pushfl\n\t"
			"popl %0\n\t"
			"movl %0, %1\n\t"
			"xorl $0x200000, %1\n\t"
			"pushl %1\n\t"
			"popfl\n\t"
			"pushfl\n\t"
			"popl %1"
			: "=&r" (before), "=&r" (after)
			: /* no input */
			: "cc");
		if (((before ^ after) & 0x200000u) == 0)
			return 0;		/* CPUID not supported */
	}
#endif

	/* Standard level 0: highest standard level in eax, vendor
	   string in ebx:edx:ecx */
	mm_cpuid(0, &max_std, &ebx, &ecx, &edx);

	if (ebx == 0x68747541u && edx == 0x69746e65u && ecx == 0x444d4163u) {
		/* "AuthenticAMD": prefer the extended feature flags.
		   The comparison must be unsigned; a part without
		   extended levels reports a small value here. */
		mm_cpuid(0x80000000u, &max_ext, &ebx, &ecx, &edx);
		if (max_ext >= 0x80000001u) {
			mm_cpuid(0x80000001u, &eax, &ebx, &ecx, &edx);
			if ((edx & 0x00800000u) == 0)
				return 0;	/* MMX not supported */
			if (edx & 0x80000000u)
				return 5;	/* 3DNow! and MMX Supported */
			return 1;		/* MMX Supported */
		}
		/* No extended levels: use standard CPUID instead */
	} else if (ebx == 0x69727943u && edx == 0x736e4978u &&
		   ecx == 0x64616574u) {
		/* "CyrixInstead".  The value of CPUID/80000001 for the
		   6x86MX is undefined according to the Cyrix CPU
		   Detection Guide, so extended levels are only trusted
		   on the part that reports standard level 2. */
		if (max_std == 2) {
			mm_cpuid(0x80000001u, &eax, &ebx, &ecx, &edx);
			/* Feature flags are returned in edx */
			if ((edx & 0x00800000u) == 0)
				return 0;	/* MMX not supported */
			if (edx & 0x01000000u)
				return 3;	/* EMMX and MMX Supported */
			return 1;		/* MMX Supported */
		}
		/* Otherwise use standard CPUID instead */
	}

	/* Standard feature flags: level 1, edx bit 23 is MMX */
	if (max_std < 1)
		return 0;
	mm_cpuid(1, &eax, &ebx, &ecx, &edx);
	return (edx & 0x00800000u) ? 1 : 0;
#else
	/* Not an x86 target: nothing supported */
	return 0;
#endif
}
218 | |
219 /* Function to test if mmx instructions are supported... | |
220 */ | |
/* Returns 1 if MMX instructions are supported, 0 otherwise.
   This is bit 0 of the mm_support() result.  Declared static inline
   so that it links even when the compiler does not inline it.
*/
static inline int
mmx_ok(void)
{
	return mm_support() & 0x1;
}
227 | |
228 | |
229 /* Helper functions for the instruction macros that follow... | |
230 (note that memory-to-register, m2r, instructions are nearly | |
231 as efficient as register-to-register, r2r, instructions; | |
232 however, memory-to-memory instructions are really simulated | |
233 as a convenience, and are only 1/3 as efficient) | |
234 */ | |
235 #ifdef MMX_TRACE | |
236 | |
237 /* Include the stuff for printing a trace to stderr... | |
238 */ | |
239 | |
240 #include <stdio.h> | |
241 | |
/* Tracing form of "op $imm, %reg".  Prints the immediate and the
   register contents before the operation and the register contents
   after it.  The trace goes to stderr.  The operand text is passed
   through %s so that a '%' in an argument cannot corrupt the format.
*/
#define	mmx_i2r(op, imm, reg) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace.uq = (imm); \
		fprintf(stderr, "%s_i2r(%s=0x%08x%08x, ", #op, #imm, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, "%s=0x%08x%08x) => ", #reg, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "i" (imm)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, "%s=0x%08x%08x\n", #reg, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)
262 | |
/* Tracing form of "op mem, %reg".  `mem' must be an mmx_t lvalue.
   Prints the memory operand and the register contents before the
   operation and the register contents after it.  The trace goes to
   stderr.  The operand uses the "m" constraint, as in the non-tracing
   macro, so it is always passed to the instruction as memory.
*/
#define	mmx_m2r(op, mem, reg) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace = (mem); \
		fprintf(stderr, "%s_m2r(%s=0x%08x%08x, ", #op, #mem, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, "%s=0x%08x%08x) => ", #reg, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "m" (mem)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, "%s=0x%08x%08x\n", #reg, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)
283 | |
/* Tracing form of "op %reg, mem".  `mem' must be an mmx_t lvalue.
   Prints the register and the old memory contents before the
   operation and the new memory contents after it.  The trace goes
   to stderr.
*/
#define	mmx_r2m(op, reg, mem) \
	do { \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, "%s_r2m(%s=0x%08x%08x, ", #op, #reg, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace = (mem); \
		fprintf(stderr, "%s=0x%08x%08x) => ", #mem, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %%" #reg ", %0" \
				      : "=m" (mem) \
				      : /* nothing */ ); \
		mmx_trace = (mem); \
		fprintf(stderr, "%s=0x%08x%08x\n", #mem, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)
302 | |
/* Tracing form of "op %regs, %regd".  Prints both registers before
   the operation and the destination register after it.  The trace
   goes to stderr.  The operation itself is a basic asm statement
   (no operands), which is why its register names take a single '%'.
*/
#define	mmx_r2r(op, regs, regd) \
	do { \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #regs ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, "%s_r2r(%s=0x%08x%08x, ", #op, #regs, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, "%s=0x%08x%08x) => ", #regd, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, "%s=0x%08x%08x\n", #regd, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)
323 | |
/* Tracing form of the simulated memory-to-memory operation
   memd = memd op mems.  Both operands must be mmx_t lvalues.
   Prints both operands before the operation and the destination
   after it.  The trace goes to stderr.  %mm0 is used as scratch.
   The destination is a read-write ("+m") operand because the first
   movq loads its previous contents.
*/
#define	mmx_m2m(op, mems, memd) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace = (mems); \
		fprintf(stderr, "%s_m2m(%s=0x%08x%08x, ", #op, #mems, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace = (memd); \
		fprintf(stderr, "%s=0x%08x%08x) => ", #memd, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
				      #op " %1, %%mm0\n\t" \
				      "movq %%mm0, %0" \
				      : "+m" (memd) \
				      : "m" (mems)); \
		mmx_trace = (memd); \
		fprintf(stderr, "%s=0x%08x%08x\n", #memd, \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)
342 | |
343 #else | |
344 | |
345 /* These macros are a lot simpler without the tracing... | |
346 */ | |
347 | |
/* "op $imm, %reg": apply `op' with a compile-time constant source
   operand to MMX register `reg'. */
#define	mmx_i2r(op, imm, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "i" (imm) )
2509 | 352 |
/* "op mem, %reg": apply `op' with memory source operand `mem' to
   MMX register `reg'. */
#define	mmx_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "m" (mem))
2509 | 357 |
/* "op %reg, mem": apply `op' from MMX register `reg' to memory
   destination `mem', which is treated as write-only. */
#define	mmx_r2m(op, reg, mem) \
	__asm__ __volatile__ (#op " %%" #reg ", %0" \
			      : "=m" (mem) \
			      : /* nothing */ )
362 | |
/* "op %regs, %regd": register-to-register operation.  This is a basic
   asm statement (no operands), so register names take a single '%'. */
#define	mmx_r2r(op, regs, regd) \
	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
365 | |
/* Simulated memory-to-memory operation: memd = memd op mems.
   %mm0 is used as scratch and is overwritten.  The destination is a
   read-write ("+m") operand because the first movq loads its previous
   contents; declaring it write-only would let the compiler discard
   the store that initialised it.
*/
#define	mmx_m2m(op, mems, memd) \
	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
			      #op " %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "+m" (memd) \
			      : "m" (mems))
2509 | 372 |
373 #endif | |
374 | |
375 | |
376 /* 1x64 MOVe Quadword | |
377 (this is both a load and a store... | |
378 in fact, it is the only way to store) | |
379 */ | |
/* movq in its load, store and register-copy forms, plus a
   memory-to-memory copy that goes through %mm0 (which is
   overwritten).
*/
#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
#define	movq(vars, vard) \
	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=m" (vard) \
			      : "m" (vars))
2509 | 388 |
389 | |
390 /* 1x32 MOVe Doubleword | |
391 (like movq, this is both load and store... | |
392 but is most useful for moving things between | |
393 mmx registers and ordinary registers) | |
394 */ | |
/* movd in its load, store and register-copy forms, plus a
   memory-to-memory copy of one doubleword that goes through %mm0
   (which is overwritten).
*/
#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
#define	movd(vars, vard) \
	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
			      "movd %%mm0, %0" \
			      : "=m" (vard) \
			      : "m" (vars))
2509 | 403 |
404 | |
405 /* 2x32, 4x16, and 8x8 Parallel ADDs | |
406 */ | |
#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
#define	paddd(vars, vard)	mmx_m2m(paddd, vars, vard)

#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
#define	paddw(vars, vard)	mmx_m2m(paddw, vars, vard)

#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)
#define	paddb(vars, vard)	mmx_m2m(paddb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
*/
#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
#define	paddsw(vars, vard)	mmx_m2m(paddsw, vars, vard)

#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
#define	paddsb(vars, vard)	mmx_m2m(paddsb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
*/
#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
#define	paddusw(vars, vard)	mmx_m2m(paddusw, vars, vard)

#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
#define	paddusb(vars, vard)	mmx_m2m(paddusb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel SUBs
*/
#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
#define	psubd(vars, vard)	mmx_m2m(psubd, vars, vard)

#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
#define	psubw(vars, vard)	mmx_m2m(psubw, vars, vard)

#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
#define	psubb(vars, vard)	mmx_m2m(psubb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
*/
#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
#define	psubsw(vars, vard)	mmx_m2m(psubsw, vars, vard)

#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
#define	psubsb(vars, vard)	mmx_m2m(psubsb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
*/
#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
#define	psubusw_r2r(regs, regd)	mmx_r2r(psubusw, regs, regd)
#define	psubusw(vars, vard)	mmx_m2m(psubusw, vars, vard)

#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
#define	psubusb(vars, vard)	mmx_m2m(psubusb, vars, vard)


/*	4x16 Parallel MULs giving Low 4x16 portions of results
*/
#define	pmullw_m2r(var, reg)	mmx_m2r(pmullw, var, reg)
#define	pmullw_r2r(regs, regd)	mmx_r2r(pmullw, regs, regd)
#define	pmullw(vars, vard)	mmx_m2m(pmullw, vars, vard)


/*	4x16 Parallel MULs giving High 4x16 portions of results
*/
#define	pmulhw_m2r(var, reg)	mmx_m2r(pmulhw, var, reg)
#define	pmulhw_r2r(regs, regd)	mmx_r2r(pmulhw, regs, regd)
#define	pmulhw(vars, vard)	mmx_m2m(pmulhw, vars, vard)


/*	4x16->2x32 Parallel Mul-ADD
	(muls like pmullw, then adds adjacent 16-bit fields
	 in the multiply result to make the final 2x32 result)
*/
#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
#define	pmaddwd(vars, vard)	mmx_m2m(pmaddwd, vars, vard)
500 | |
501 | |
502 /* 1x64 bitwise AND | |
503 */ | |
/* With BROKEN_PAND defined, AND is synthesised from two pandn
   operations: the first, against all ones, complements the
   destination; the second complements it again and ANDs in the
   source.  Each compound form is wrapped in do/while(0) so that it
   behaves as a single statement.
*/
#ifdef	BROKEN_PAND
#define	pand_m2r(var, reg) \
	do { \
		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
		mmx_m2r(pandn, var, reg); \
	} while (0)
#define	pand_r2r(regs, regd) \
	do { \
		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
		mmx_r2r(pandn, regs, regd); \
	} while (0)
#define	pand(vars, vard) \
	do { \
		movq_m2r(vard, mm0); \
		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
		mmx_m2r(pandn, vars, mm0); \
		movq_r2m(mm0, vard); \
	} while (0)
#else
#define	pand_m2r(var, reg)	mmx_m2r(pand, var, reg)
#define	pand_r2r(regs, regd)	mmx_r2r(pand, regs, regd)
#define	pand(vars, vard)	mmx_m2m(pand, vars, vard)
#endif
527 | |
528 | |
529 /* 1x64 bitwise AND with Not the destination | |
530 */ | |
#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)


/*	1x64 bitwise OR
*/
#define	por_m2r(var, reg)	mmx_m2r(por, var, reg)
#define	por_r2r(regs, regd)	mmx_r2r(por, regs, regd)
#define	por(vars, vard)	mmx_m2m(por, vars, vard)


/*	1x64 bitwise eXclusive OR
*/
#define	pxor_m2r(var, reg)	mmx_m2r(pxor, var, reg)
#define	pxor_r2r(regs, regd)	mmx_r2r(pxor, regs, regd)
#define	pxor(vars, vard)	mmx_m2m(pxor, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
	(resulting fields are either 0 or -1)
*/
#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)

#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)

#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
	(resulting fields are either 0 or -1)
*/
#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)

#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)

#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)
580 | |
581 | |
582 /* 1x64, 2x32, and 4x16 Parallel Shift Left Logical | |
583 */ | |
#define	psllq_i2r(imm, reg)	mmx_i2r(psllq, imm, reg)
#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)

#define	pslld_i2r(imm, reg)	mmx_i2r(pslld, imm, reg)
#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)

#define	psllw_i2r(imm, reg)	mmx_i2r(psllw, imm, reg)
#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)


/*	1x64, 2x32, and 4x16 Parallel Shift Right Logical
*/
#define	psrlq_i2r(imm, reg)	mmx_i2r(psrlq, imm, reg)
#define	psrlq_m2r(var, reg)	mmx_m2r(psrlq, var, reg)
#define	psrlq_r2r(regs, regd)	mmx_r2r(psrlq, regs, regd)
#define	psrlq(vars, vard)	mmx_m2m(psrlq, vars, vard)

#define	psrld_i2r(imm, reg)	mmx_i2r(psrld, imm, reg)
#define	psrld_m2r(var, reg)	mmx_m2r(psrld, var, reg)
#define	psrld_r2r(regs, regd)	mmx_r2r(psrld, regs, regd)
#define	psrld(vars, vard)	mmx_m2m(psrld, vars, vard)

#define	psrlw_i2r(imm, reg)	mmx_i2r(psrlw, imm, reg)
#define	psrlw_m2r(var, reg)	mmx_m2r(psrlw, var, reg)
#define	psrlw_r2r(regs, regd)	mmx_r2r(psrlw, regs, regd)
#define	psrlw(vars, vard)	mmx_m2m(psrlw, vars, vard)


/*	2x32 and 4x16 Parallel Shift Right Arithmetic
*/
#define	psrad_i2r(imm, reg)	mmx_i2r(psrad, imm, reg)
#define	psrad_m2r(var, reg)	mmx_m2r(psrad, var, reg)
#define	psrad_r2r(regs, regd)	mmx_r2r(psrad, regs, regd)
#define	psrad(vars, vard)	mmx_m2m(psrad, vars, vard)

#define	psraw_i2r(imm, reg)	mmx_i2r(psraw, imm, reg)
#define	psraw_m2r(var, reg)	mmx_m2r(psraw, var, reg)
#define	psraw_r2r(regs, regd)	mmx_r2r(psraw, regs, regd)
#define	psraw(vars, vard)	mmx_m2m(psraw, vars, vard)
629 | |
630 | |
631 /* 2x32->4x16 and 4x16->8x8 PACK and Signed Saturate | |
632 (packs source and dest fields into dest in that order) | |
633 */ | |
#define	packssdw_m2r(var, reg)	mmx_m2r(packssdw, var, reg)
#define	packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
#define	packssdw(vars, vard)	mmx_m2m(packssdw, vars, vard)

#define	packsswb_m2r(var, reg)	mmx_m2r(packsswb, var, reg)
#define	packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
#define	packsswb(vars, vard)	mmx_m2m(packsswb, vars, vard)


/*	4x16->8x8 PACK and Unsigned Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packuswb_m2r(var, reg)	mmx_m2r(packuswb, var, reg)
#define	packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
#define	packuswb(vars, vard)	mmx_m2m(packuswb, vars, vard)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
	(interleaves low half of dest with low half of source
	 as padding in each result field)
*/
#define	punpckldq_m2r(var, reg)	mmx_m2r(punpckldq, var, reg)
#define	punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
#define	punpckldq(vars, vard)	mmx_m2m(punpckldq, vars, vard)

#define	punpcklwd_m2r(var, reg)	mmx_m2r(punpcklwd, var, reg)
#define	punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
#define	punpcklwd(vars, vard)	mmx_m2m(punpcklwd, vars, vard)

#define	punpcklbw_m2r(var, reg)	mmx_m2r(punpcklbw, var, reg)
#define	punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
#define	punpcklbw(vars, vard)	mmx_m2m(punpcklbw, vars, vard)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
	(interleaves high half of dest with high half of source
	 as padding in each result field)
*/
#define	punpckhdq_m2r(var, reg)	mmx_m2r(punpckhdq, var, reg)
#define	punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
#define	punpckhdq(vars, vard)	mmx_m2m(punpckhdq, vars, vard)

#define	punpckhwd_m2r(var, reg)	mmx_m2r(punpckhwd, var, reg)
#define	punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
#define	punpckhwd(vars, vard)	mmx_m2m(punpckhwd, vars, vard)

#define	punpckhbw_m2r(var, reg)	mmx_m2r(punpckhbw, var, reg)
#define	punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
#define	punpckhbw(vars, vard)	mmx_m2m(punpckhbw, vars, vard)
683 | |
684 | |
685 /* Empty MMx State | |
686 (used to clean-up when going from mmx to float use | |
687 of the registers that are shared by both; note that | |
688 there is no float-to-mmx operation needed, because | |
689 only the float tag word info is corruptible) | |
690 */ | |
/* emms() issues the emms instruction.  With MMX_TRACE defined it first
   writes a trace line to stderr, like the other traced operations;
   that form is wrapped in do/while(0) so it behaves as a single
   statement.
*/
#ifdef	MMX_TRACE

#define	emms() \
	do { \
		fprintf(stderr, "emms()\n"); \
		__asm__ __volatile__ ("emms"); \
	} while (0)

#else

#define	emms()			__asm__ __volatile__ ("emms")

#endif
704 | |
705 #endif | |
706 |