Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 444:a5edef76dac6 libavcodec
* new mmx code - based upon http://aggregate.org/MAGIC
for now it's rather a sneak preview (new functions are nearly 100% faster)
author | kabi |
---|---|
date | Wed, 29 May 2002 14:29:48 +0000 |
parents | fe58fe638f9b |
children | 62c01dbdc1e0 |
comparison
equal
deleted
inserted
replaced
443:63467327c06c | 444:a5edef76dac6 |
---|---|
47 /* external functions, from idct_mmx.c */ | 47 /* external functions, from idct_mmx.c */ |
48 void ff_mmx_idct(DCTELEM *block); | 48 void ff_mmx_idct(DCTELEM *block); |
49 void ff_mmxext_idct(DCTELEM *block); | 49 void ff_mmxext_idct(DCTELEM *block); |
50 | 50 |
51 /* pixel operations */ | 51 /* pixel operations */ |
52 static const uint64_t mm_bfe __attribute__ ((aligned(8))) = 0xfefefefefefefefeULL; | |
52 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; | 53 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; |
53 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | 54 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; |
54 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; | 55 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; |
55 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; | 56 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; |
56 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | 57 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; |
60 | 61 |
61 #ifndef PIC | 62 #ifndef PIC |
62 #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone)) | 63 #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone)) |
63 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) | 64 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) |
64 #define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t" | 65 #define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t" |
66 #define MOVQ_BFE(regd) "movq "MANGLE(mm_bfe)", "#regd" \n\t" | |
65 #else | 67 #else |
66 // for shared library it's better to use this way for accessing constants | 68 // for shared library it's better to use this way for accessing constants |
67 // pcmpeqd -> -1 | 69 // pcmpeqd -> -1 |
68 #define MOVQ_WONE(regd) \ | 70 #define MOVQ_WONE(regd) \ |
69 __asm __volatile ( \ | 71 __asm __volatile ( \ |
78 | 80 |
79 #define MOVQ_BONE(regd) \ | 81 #define MOVQ_BONE(regd) \ |
80 "pcmpeqd " #regd ", " #regd " \n\t" \ | 82 "pcmpeqd " #regd ", " #regd " \n\t" \ |
81 "psrlw $15, " #regd " \n\t"\ | 83 "psrlw $15, " #regd " \n\t"\ |
82 "packuswb " #regd ", " #regd " \n\t" | 84 "packuswb " #regd ", " #regd " \n\t" |
85 | |
86 #define MOVQ_BFE(regd) \ | |
87 "pcmpeqd " #regd ", " #regd " \n\t"\ | |
88 "paddb " #regd ", " #regd " \n\t" | |
83 #endif | 89 #endif |
90 | |
91 // using mm6 as temporary and for the output result | |
92 // first argument is unmodified and second is trashed | |
93 // mm7 is supposed to contain 0xfefefefefefefefe | |
94 #define PAVG_MMX_NO_RND(rega, regb) \ | |
95 "movq " #rega ", %%mm6 \n\t"\ | |
96 "pand " #regb ", %%mm6 \n\t"\ | |
97 "pxor " #rega ", " #regb " \n\t"\ | |
98 "pand %%mm7, " #regb " \n\t"\ | |
99 "psrlq $1, " #regb " \n\t"\ | |
100 "paddb " #regb ", %%mm6 \n\t" | |
101 | |
102 #define PAVG_MMX(rega, regb) \ | |
103 "movq " #rega ", %%mm6 \n\t"\ | |
104 "por " #regb ", %%mm6 \n\t"\ | |
105 "pxor " #rega ", " #regb " \n\t"\ | |
106 "pand %%mm7, " #regb " \n\t"\ | |
107 "psrlq $1, " #regb " \n\t"\ | |
108 "psubb " #regb ", %%mm6 \n\t" | |
84 | 109 |
85 | 110 |
86 /***********************************/ | 111 /***********************************/ |
87 /* 3Dnow specific */ | 112 /* 3Dnow specific */ |
88 | 113 |
289 : "r"(line_size) | 314 : "r"(line_size) |
290 : "%eax", "memory" | 315 : "%eax", "memory" |
291 ); | 316 ); |
292 } | 317 } |
293 | 318 |
319 // it still has to be checked whether it's better to have bigger | |
320 // unrolled code also on Celerons - for now yes | |
321 #define LONG_UNROLL 1 | |
294 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 322 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
295 { | 323 { |
324 #if 0 | |
296 UINT8 *p; | 325 UINT8 *p; |
297 const UINT8 *pix; | 326 const UINT8 *pix; |
298 p = block; | 327 p = block; |
299 pix = pixels; | 328 pix = pixels; |
300 MOVQ_ZERO(mm7); | 329 MOVQ_ZERO(mm7); |
301 MOVQ_WONE(mm4); | 330 MOVQ_WONE(mm4); |
302 JUMPALIGN(); | 331 JUMPALIGN(); |
303 do { | 332 do { |
304 __asm __volatile( | 333 __asm __volatile( |
305 "movq %1, %%mm0\n\t" | 334 "movq %1, %%mm0\n\t" |
306 "movq 1%1, %%mm1\n\t" | 335 "movq 1%1, %%mm1\n\t" |
307 "movq %%mm0, %%mm2\n\t" | 336 "movq %%mm0, %%mm2\n\t" |
308 "movq %%mm1, %%mm3\n\t" | 337 "movq %%mm1, %%mm3\n\t" |
309 "punpcklbw %%mm7, %%mm0\n\t" | 338 "punpcklbw %%mm7, %%mm0\n\t" |
318 "psrlw $1, %%mm2\n\t" | 347 "psrlw $1, %%mm2\n\t" |
319 "packuswb %%mm2, %%mm0\n\t" | 348 "packuswb %%mm2, %%mm0\n\t" |
320 "movq %%mm0, %0\n\t" | 349 "movq %%mm0, %0\n\t" |
321 :"=m"(*p) | 350 :"=m"(*p) |
322 :"m"(*pix) | 351 :"m"(*pix) |
323 :"memory"); | 352 :"memory"); |
324 pix += line_size; p += line_size; | 353 pix += line_size; p += line_size; |
325 } while (--h); | 354 } while (--h); |
355 #else | |
356 __asm __volatile( | |
357 MOVQ_BFE(%%mm7) | |
358 "lea (%3, %3), %%eax \n\t" | |
359 ".balign 8 \n\t" | |
360 "1: \n\t" | |
361 "movq (%1), %%mm0 \n\t" | |
362 "movq (%1, %3), %%mm2 \n\t" | |
363 "movq 1(%1), %%mm1 \n\t" | |
364 "movq 1(%1, %3), %%mm3 \n\t" | |
365 PAVG_MMX(%%mm0, %%mm1) | |
366 "movq %%mm6, (%2) \n\t" | |
367 PAVG_MMX(%%mm2, %%mm3) | |
368 "movq %%mm6, (%2, %3) \n\t" | |
369 "addl %%eax, %1 \n\t" | |
370 "addl %%eax, %2 \n\t" | |
371 #if LONG_UNROLL | |
372 "movq (%1), %%mm0 \n\t" | |
373 "movq (%1, %3), %%mm2 \n\t" | |
374 "movq 1(%1), %%mm1 \n\t" | |
375 "movq 1(%1, %3), %%mm3 \n\t" | |
376 PAVG_MMX(%%mm0, %%mm1) | |
377 "movq %%mm6, (%2) \n\t" | |
378 PAVG_MMX(%%mm2, %%mm3) | |
379 "movq %%mm6, (%2, %3) \n\t" | |
380 "addl %%eax, %1 \n\t" | |
381 "addl %%eax, %2 \n\t" | |
382 "subl $4, %0 \n\t" | |
383 #else | |
384 "subl $2, %0 \n\t" | |
385 #endif | |
386 "jnz 1b \n\t" | |
387 :"+g"(h), "+S"(pixels), "+D"(block) | |
388 :"r"(line_size) | |
389 :"eax", "memory"); | |
390 #endif | |
326 } | 391 } |
327 | 392 |
328 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 393 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
329 { | 394 { |
395 #if 0 | |
330 UINT8 *p; | 396 UINT8 *p; |
331 const UINT8 *pix; | 397 const UINT8 *pix; |
332 p = block; | 398 p = block; |
333 pix = pixels; | 399 pix = pixels; |
334 MOVQ_ZERO(mm7); | 400 MOVQ_ZERO(mm7); |
357 "m"(*(pix+line_size)) | 423 "m"(*(pix+line_size)) |
358 :"memory"); | 424 :"memory"); |
359 pix += line_size; | 425 pix += line_size; |
360 p += line_size; | 426 p += line_size; |
361 } while (--h); | 427 } while (--h); |
428 #else | |
429 __asm __volatile( | |
430 MOVQ_BFE(%%mm7) | |
431 "lea (%3, %3), %%eax \n\t" | |
432 "movq (%1), %%mm0 \n\t" | |
433 ".balign 8 \n\t" | |
434 "1: \n\t" | |
435 "movq (%1, %3), %%mm1 \n\t" | |
436 "movq (%1, %%eax),%%mm2 \n\t" | |
437 PAVG_MMX(%%mm1, %%mm0) | |
438 "movq %%mm6, (%2) \n\t" | |
439 PAVG_MMX(%%mm2, %%mm1) | |
440 "movq %%mm6, (%2, %3) \n\t" | |
441 "addl %%eax, %1 \n\t" | |
442 "addl %%eax, %2 \n\t" | |
443 #ifdef LONG_UNROLL | |
444 "movq (%1, %3), %%mm1 \n\t" | |
445 "movq (%1, %%eax),%%mm0 \n\t" | |
446 PAVG_MMX(%%mm1, %%mm2) | |
447 "movq %%mm6, (%2) \n\t" | |
448 PAVG_MMX(%%mm0, %%mm1) | |
449 "movq %%mm6, (%2, %3) \n\t" | |
450 "addl %%eax, %1 \n\t" | |
451 "addl %%eax, %2 \n\t" | |
452 "subl $4, %0 \n\t" | |
453 #else | |
454 "subl $2, %0 \n\t" | |
455 #endif | |
456 "jnz 1b \n\t" | |
457 :"+g"(h), "+S"(pixels), "+D"(block) | |
458 :"r"(line_size) | |
459 :"eax", "memory"); | |
460 #endif | |
461 | |
462 | |
362 } | 463 } |
363 | 464 |
364 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 465 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
365 { | 466 { |
366 UINT8 *p; | 467 UINT8 *p; |