Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 445:62c01dbdc1e0 libavcodec
* code with the new PAVGB for MMX-only CPUs split into a separate file
and compiled in the same way as _avg.h
* PAVG_MMX macros now also accept an output parameter
* implemented a faster put_pixels_xy2, but it has slightly lower precision.
However, there is no visible difference in image quality - it can
easily be switched back if needed (#if 0 #endif) - please check
author | kabi |
---|---|
date | Wed, 29 May 2002 17:16:22 +0000 |
parents | a5edef76dac6 |
children | efe0c0d40577 |
comparison
equal
deleted
inserted
replaced
444:a5edef76dac6 | 445:62c01dbdc1e0 |
---|---|
89 #endif | 89 #endif |
90 | 90 |
91 // using mm6 as temporary and for the output result | 91 // using mm6 as temporary and for the output result |
92 // first argument is unmodifed and second is trashed | 92 // first argument is unmodifed and second is trashed |
93 // mm7 is supposed to contain 0xfefefefefefefefe | 93 // mm7 is supposed to contain 0xfefefefefefefefe |
94 #define PAVG_MMX_NO_RND(rega, regb) \ | 94 #define PAVGB_MMX_NO_RND(rega, regb, regr) \ |
95 "movq " #rega ", %%mm6 \n\t"\ | 95 "movq " #rega ", " #regr " \n\t"\ |
96 "pand " #regb ", %%mm6 \n\t"\ | 96 "pand " #regb ", " #regr " \n\t"\ |
97 "pxor " #rega ", " #regb " \n\t"\ | 97 "pxor " #rega ", " #regb " \n\t"\ |
98 "pand %%mm7, " #regb " \n\t"\ | 98 "pand %%mm7, " #regb " \n\t"\ |
99 "psrlq $1, " #regb " \n\t"\ | 99 "psrlq $1, " #regb " \n\t"\ |
100 "paddb " #regb ", %%mm6 \n\t" | 100 "paddb " #regb ", " #regr " \n\t" |
101 | 101 |
102 #define PAVG_MMX(rega, regb) \ | 102 #define PAVGB_MMX(rega, regb, regr) \ |
103 "movq " #rega ", %%mm6 \n\t"\ | 103 "movq " #rega ", " #regr " \n\t"\ |
104 "por " #regb ", %%mm6 \n\t"\ | 104 "por " #regb ", " #regr " \n\t"\ |
105 "pxor " #rega ", " #regb " \n\t"\ | 105 "pxor " #rega ", " #regb " \n\t"\ |
106 "pand %%mm7, " #regb " \n\t"\ | 106 "pand %%mm7, " #regb " \n\t"\ |
107 "psrlq $1, " #regb " \n\t"\ | 107 "psrlq $1, " #regb " \n\t"\ |
108 "psubb " #regb ", %%mm6 \n\t" | 108 "psubb " #regb ", " #regr " \n\t" |
109 | 109 |
110 /***********************************/ | |
111 /* MMX no rounding */ | |
112 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx | |
113 | |
114 #define PAVGB(a, b) PAVGB_MMX_NO_RND(a, b, %%mm6) | |
115 #define PAVGBR(a, b, c) PAVGB_MMX_NO_RND(a, b, c) | |
116 #include "dsputil_mmx_rnd.h" | |
117 | |
118 #undef DEF | |
119 #undef PAVGB | |
120 #undef PAVGBR | |
121 /***********************************/ | |
122 /* MMX rounding */ | |
123 | |
124 #define DEF(x, y) x ## _ ## y ##_mmx | |
125 | |
126 #define PAVGB(a, b) PAVGB_MMX(a, b, %%mm6) | |
127 #define PAVGBR(a, b, c) PAVGB_MMX(a, b, c) | |
128 #include "dsputil_mmx_rnd.h" | |
129 | |
130 #undef DEF | |
131 #undef PAVGB | |
132 #undef PAVGBR | |
110 | 133 |
111 /***********************************/ | 134 /***********************************/ |
112 /* 3Dnow specific */ | 135 /* 3Dnow specific */ |
113 | 136 |
114 #define DEF(x) x ## _3dnow | 137 #define DEF(x) x ## _3dnow |
314 : "r"(line_size) | 337 : "r"(line_size) |
315 : "%eax", "memory" | 338 : "%eax", "memory" |
316 ); | 339 ); |
317 } | 340 } |
318 | 341 |
319 // will have to be check if it's better to have bigger | |
320 // unrolled code also on Celerons - for now yes | |
321 #define LONG_UNROLL 1 | |
322 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
323 { | |
324 #if 0 | 342 #if 0 |
325 UINT8 *p; | |
326 const UINT8 *pix; | |
327 p = block; | |
328 pix = pixels; | |
329 MOVQ_ZERO(mm7); | |
330 MOVQ_WONE(mm4); | |
331 JUMPALIGN(); | |
332 do { | |
333 __asm __volatile( | |
334 "movq %1, %%mm0\n\t" | |
335 "movq 1%1, %%mm1\n\t" | |
336 "movq %%mm0, %%mm2\n\t" | |
337 "movq %%mm1, %%mm3\n\t" | |
338 "punpcklbw %%mm7, %%mm0\n\t" | |
339 "punpcklbw %%mm7, %%mm1\n\t" | |
340 "punpckhbw %%mm7, %%mm2\n\t" | |
341 "punpckhbw %%mm7, %%mm3\n\t" | |
342 "paddusw %%mm1, %%mm0\n\t" | |
343 "paddusw %%mm3, %%mm2\n\t" | |
344 "paddusw %%mm4, %%mm0\n\t" | |
345 "paddusw %%mm4, %%mm2\n\t" | |
346 "psrlw $1, %%mm0\n\t" | |
347 "psrlw $1, %%mm2\n\t" | |
348 "packuswb %%mm2, %%mm0\n\t" | |
349 "movq %%mm0, %0\n\t" | |
350 :"=m"(*p) | |
351 :"m"(*pix) | |
352 :"memory"); | |
353 pix += line_size; p += line_size; | |
354 } while (--h); | |
355 #else | |
356 __asm __volatile( | |
357 MOVQ_BFE(%%mm7) | |
358 "lea (%3, %3), %%eax \n\t" | |
359 ".balign 8 \n\t" | |
360 "1: \n\t" | |
361 "movq (%1), %%mm0 \n\t" | |
362 "movq (%1, %3), %%mm2 \n\t" | |
363 "movq 1(%1), %%mm1 \n\t" | |
364 "movq 1(%1, %3), %%mm3 \n\t" | |
365 PAVG_MMX(%%mm0, %%mm1) | |
366 "movq %%mm6, (%2) \n\t" | |
367 PAVG_MMX(%%mm2, %%mm3) | |
368 "movq %%mm6, (%2, %3) \n\t" | |
369 "addl %%eax, %1 \n\t" | |
370 "addl %%eax, %2 \n\t" | |
371 #if LONG_UNROLL | |
372 "movq (%1), %%mm0 \n\t" | |
373 "movq (%1, %3), %%mm2 \n\t" | |
374 "movq 1(%1), %%mm1 \n\t" | |
375 "movq 1(%1, %3), %%mm3 \n\t" | |
376 PAVG_MMX(%%mm0, %%mm1) | |
377 "movq %%mm6, (%2) \n\t" | |
378 PAVG_MMX(%%mm2, %%mm3) | |
379 "movq %%mm6, (%2, %3) \n\t" | |
380 "addl %%eax, %1 \n\t" | |
381 "addl %%eax, %2 \n\t" | |
382 "subl $4, %0 \n\t" | |
383 #else | |
384 "subl $2, %0 \n\t" | |
385 #endif | |
386 "jnz 1b \n\t" | |
387 :"+g"(h), "+S"(pixels), "+D"(block) | |
388 :"r"(line_size) | |
389 :"eax", "memory"); | |
390 #endif | |
391 } | |
392 | |
393 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
394 { | |
395 #if 0 | |
396 UINT8 *p; | |
397 const UINT8 *pix; | |
398 p = block; | |
399 pix = pixels; | |
400 MOVQ_ZERO(mm7); | |
401 MOVQ_WONE(mm4); | |
402 JUMPALIGN(); | |
403 do { | |
404 __asm __volatile( | |
405 "movq %1, %%mm0\n\t" | |
406 "movq %2, %%mm1\n\t" | |
407 "movq %%mm0, %%mm2\n\t" | |
408 "movq %%mm1, %%mm3\n\t" | |
409 "punpcklbw %%mm7, %%mm0\n\t" | |
410 "punpcklbw %%mm7, %%mm1\n\t" | |
411 "punpckhbw %%mm7, %%mm2\n\t" | |
412 "punpckhbw %%mm7, %%mm3\n\t" | |
413 "paddusw %%mm1, %%mm0\n\t" | |
414 "paddusw %%mm3, %%mm2\n\t" | |
415 "paddusw %%mm4, %%mm0\n\t" | |
416 "paddusw %%mm4, %%mm2\n\t" | |
417 "psrlw $1, %%mm0\n\t" | |
418 "psrlw $1, %%mm2\n\t" | |
419 "packuswb %%mm2, %%mm0\n\t" | |
420 "movq %%mm0, %0\n\t" | |
421 :"=m"(*p) | |
422 :"m"(*pix), | |
423 "m"(*(pix+line_size)) | |
424 :"memory"); | |
425 pix += line_size; | |
426 p += line_size; | |
427 } while (--h); | |
428 #else | |
429 __asm __volatile( | |
430 MOVQ_BFE(%%mm7) | |
431 "lea (%3, %3), %%eax \n\t" | |
432 "movq (%1), %%mm0 \n\t" | |
433 ".balign 8 \n\t" | |
434 "1: \n\t" | |
435 "movq (%1, %3), %%mm1 \n\t" | |
436 "movq (%1, %%eax),%%mm2 \n\t" | |
437 PAVG_MMX(%%mm1, %%mm0) | |
438 "movq %%mm6, (%2) \n\t" | |
439 PAVG_MMX(%%mm2, %%mm1) | |
440 "movq %%mm6, (%2, %3) \n\t" | |
441 "addl %%eax, %1 \n\t" | |
442 "addl %%eax, %2 \n\t" | |
443 #ifdef LONG_UNROLL | |
444 "movq (%1, %3), %%mm1 \n\t" | |
445 "movq (%1, %%eax),%%mm0 \n\t" | |
446 PAVG_MMX(%%mm1, %%mm2) | |
447 "movq %%mm6, (%2) \n\t" | |
448 PAVG_MMX(%%mm0, %%mm1) | |
449 "movq %%mm6, (%2, %3) \n\t" | |
450 "addl %%eax, %1 \n\t" | |
451 "addl %%eax, %2 \n\t" | |
452 "subl $4, %0 \n\t" | |
453 #else | |
454 "subl $2, %0 \n\t" | |
455 #endif | |
456 "jnz 1b \n\t" | |
457 :"+g"(h), "+S"(pixels), "+D"(block) | |
458 :"r"(line_size) | |
459 :"eax", "memory"); | |
460 #endif | |
461 | |
462 | |
463 } | |
464 | |
465 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 343 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
466 { | 344 { |
467 UINT8 *p; | 345 UINT8 *p; |
468 const UINT8 *pix; | 346 const UINT8 *pix; |
469 p = block; | 347 p = block; |
508 pix += line_size; | 386 pix += line_size; |
509 p += line_size; | 387 p += line_size; |
510 } while(--h); | 388 } while(--h); |
511 } | 389 } |
512 | 390 |
513 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
514 { | |
515 UINT8 *p; | |
516 const UINT8 *pix; | |
517 p = block; | |
518 pix = pixels; | |
519 MOVQ_ZERO(mm7); | |
520 do { | |
521 __asm __volatile( | |
522 "movq %1, %%mm0\n\t" | |
523 "movq 1%1, %%mm1\n\t" | |
524 "movq %%mm0, %%mm2\n\t" | |
525 "movq %%mm1, %%mm3\n\t" | |
526 "punpcklbw %%mm7, %%mm0\n\t" | |
527 "punpcklbw %%mm7, %%mm1\n\t" | |
528 "punpckhbw %%mm7, %%mm2\n\t" | |
529 "punpckhbw %%mm7, %%mm3\n\t" | |
530 "paddusw %%mm1, %%mm0\n\t" | |
531 "paddusw %%mm3, %%mm2\n\t" | |
532 "psrlw $1, %%mm0\n\t" | |
533 "psrlw $1, %%mm2\n\t" | |
534 "packuswb %%mm2, %%mm0\n\t" | |
535 "movq %%mm0, %0\n\t" | |
536 :"=m"(*p) | |
537 :"m"(*pix) | |
538 :"memory"); | |
539 pix += line_size; | |
540 p += line_size; | |
541 } while (--h); | |
542 } | |
543 | |
544 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
545 { | |
546 UINT8 *p; | |
547 const UINT8 *pix; | |
548 p = block; | |
549 pix = pixels; | |
550 MOVQ_ZERO(mm7); | |
551 JUMPALIGN(); | |
552 do { | |
553 __asm __volatile( | |
554 "movq %1, %%mm0\n\t" | |
555 "movq %2, %%mm1\n\t" | |
556 "movq %%mm0, %%mm2\n\t" | |
557 "movq %%mm1, %%mm3\n\t" | |
558 "punpcklbw %%mm7, %%mm0\n\t" | |
559 "punpcklbw %%mm7, %%mm1\n\t" | |
560 "punpckhbw %%mm7, %%mm2\n\t" | |
561 "punpckhbw %%mm7, %%mm3\n\t" | |
562 "paddusw %%mm1, %%mm0\n\t" | |
563 "paddusw %%mm3, %%mm2\n\t" | |
564 "psrlw $1, %%mm0\n\t" | |
565 "psrlw $1, %%mm2\n\t" | |
566 "packuswb %%mm2, %%mm0\n\t" | |
567 "movq %%mm0, %0\n\t" | |
568 :"=m"(*p) | |
569 :"m"(*pix), | |
570 "m"(*(pix+line_size)) | |
571 :"memory"); | |
572 pix += line_size; | |
573 p += line_size; | |
574 } while(--h); | |
575 } | |
576 | |
577 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | 391 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) |
578 { | 392 { |
579 UINT8 *p; | 393 UINT8 *p; |
580 const UINT8 *pix; | 394 const UINT8 *pix; |
581 p = block; | 395 p = block; |
619 :"memory"); | 433 :"memory"); |
620 pix += line_size; | 434 pix += line_size; |
621 p += line_size; | 435 p += line_size; |
622 } while(--h); | 436 } while(--h); |
623 } | 437 } |
624 | 438 #endif |
625 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 439 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
626 { | 440 { |
627 UINT8 *p; | 441 UINT8 *p; |
628 const UINT8 *pix; | 442 const UINT8 *pix; |
629 p = block; | 443 p = block; |