comparison i386/dsputil_mmx.c @ 445:62c01dbdc1e0 libavcodec

* code with the new PAVGB for MMX-only CPUs split into a separate file and compiled in the same way as _avg.h * PAVG_MMX macros now also accept an output parameter * implemented a faster put_pixels_xy2, but it has slightly lower precision. There is no visible difference in image quality, though - it can easily be switched back if needed (#if 0 #endif) - please check
author kabi
date Wed, 29 May 2002 17:16:22 +0000
parents a5edef76dac6
children efe0c0d40577
comparison
equal deleted inserted replaced
444:a5edef76dac6 445:62c01dbdc1e0
89 #endif 89 #endif
90 90
91 // using mm6 as temporary and for the output result 91 // using mm6 as temporary and for the output result
92 // first argument is unmodifed and second is trashed 92 // first argument is unmodifed and second is trashed
93 // mm7 is supposed to contain 0xfefefefefefefefe 93 // mm7 is supposed to contain 0xfefefefefefefefe
94 #define PAVG_MMX_NO_RND(rega, regb) \ 94 #define PAVGB_MMX_NO_RND(rega, regb, regr) \
95 "movq " #rega ", %%mm6 \n\t"\ 95 "movq " #rega ", " #regr " \n\t"\
96 "pand " #regb ", %%mm6 \n\t"\ 96 "pand " #regb ", " #regr " \n\t"\
97 "pxor " #rega ", " #regb " \n\t"\ 97 "pxor " #rega ", " #regb " \n\t"\
98 "pand %%mm7, " #regb " \n\t"\ 98 "pand %%mm7, " #regb " \n\t"\
99 "psrlq $1, " #regb " \n\t"\ 99 "psrlq $1, " #regb " \n\t"\
100 "paddb " #regb ", %%mm6 \n\t" 100 "paddb " #regb ", " #regr " \n\t"
101 101
102 #define PAVG_MMX(rega, regb) \ 102 #define PAVGB_MMX(rega, regb, regr) \
103 "movq " #rega ", %%mm6 \n\t"\ 103 "movq " #rega ", " #regr " \n\t"\
104 "por " #regb ", %%mm6 \n\t"\ 104 "por " #regb ", " #regr " \n\t"\
105 "pxor " #rega ", " #regb " \n\t"\ 105 "pxor " #rega ", " #regb " \n\t"\
106 "pand %%mm7, " #regb " \n\t"\ 106 "pand %%mm7, " #regb " \n\t"\
107 "psrlq $1, " #regb " \n\t"\ 107 "psrlq $1, " #regb " \n\t"\
108 "psubb " #regb ", %%mm6 \n\t" 108 "psubb " #regb ", " #regr " \n\t"
109 109
110 /***********************************/
111 /* MMX no rounding */
112 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
113
114 #define PAVGB(a, b) PAVGB_MMX_NO_RND(a, b, %%mm6)
115 #define PAVGBR(a, b, c) PAVGB_MMX_NO_RND(a, b, c)
116 #include "dsputil_mmx_rnd.h"
117
118 #undef DEF
119 #undef PAVGB
120 #undef PAVGBR
121 /***********************************/
122 /* MMX rounding */
123
124 #define DEF(x, y) x ## _ ## y ##_mmx
125
126 #define PAVGB(a, b) PAVGB_MMX(a, b, %%mm6)
127 #define PAVGBR(a, b, c) PAVGB_MMX(a, b, c)
128 #include "dsputil_mmx_rnd.h"
129
130 #undef DEF
131 #undef PAVGB
132 #undef PAVGBR
110 133
111 /***********************************/ 134 /***********************************/
112 /* 3Dnow specific */ 135 /* 3Dnow specific */
113 136
114 #define DEF(x) x ## _3dnow 137 #define DEF(x) x ## _3dnow
314 : "r"(line_size) 337 : "r"(line_size)
315 : "%eax", "memory" 338 : "%eax", "memory"
316 ); 339 );
317 } 340 }
318 341
319 // will have to be check if it's better to have bigger
320 // unrolled code also on Celerons - for now yes
321 #define LONG_UNROLL 1
322 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
323 {
324 #if 0 342 #if 0
325 UINT8 *p;
326 const UINT8 *pix;
327 p = block;
328 pix = pixels;
329 MOVQ_ZERO(mm7);
330 MOVQ_WONE(mm4);
331 JUMPALIGN();
332 do {
333 __asm __volatile(
334 "movq %1, %%mm0\n\t"
335 "movq 1%1, %%mm1\n\t"
336 "movq %%mm0, %%mm2\n\t"
337 "movq %%mm1, %%mm3\n\t"
338 "punpcklbw %%mm7, %%mm0\n\t"
339 "punpcklbw %%mm7, %%mm1\n\t"
340 "punpckhbw %%mm7, %%mm2\n\t"
341 "punpckhbw %%mm7, %%mm3\n\t"
342 "paddusw %%mm1, %%mm0\n\t"
343 "paddusw %%mm3, %%mm2\n\t"
344 "paddusw %%mm4, %%mm0\n\t"
345 "paddusw %%mm4, %%mm2\n\t"
346 "psrlw $1, %%mm0\n\t"
347 "psrlw $1, %%mm2\n\t"
348 "packuswb %%mm2, %%mm0\n\t"
349 "movq %%mm0, %0\n\t"
350 :"=m"(*p)
351 :"m"(*pix)
352 :"memory");
353 pix += line_size; p += line_size;
354 } while (--h);
355 #else
356 __asm __volatile(
357 MOVQ_BFE(%%mm7)
358 "lea (%3, %3), %%eax \n\t"
359 ".balign 8 \n\t"
360 "1: \n\t"
361 "movq (%1), %%mm0 \n\t"
362 "movq (%1, %3), %%mm2 \n\t"
363 "movq 1(%1), %%mm1 \n\t"
364 "movq 1(%1, %3), %%mm3 \n\t"
365 PAVG_MMX(%%mm0, %%mm1)
366 "movq %%mm6, (%2) \n\t"
367 PAVG_MMX(%%mm2, %%mm3)
368 "movq %%mm6, (%2, %3) \n\t"
369 "addl %%eax, %1 \n\t"
370 "addl %%eax, %2 \n\t"
371 #if LONG_UNROLL
372 "movq (%1), %%mm0 \n\t"
373 "movq (%1, %3), %%mm2 \n\t"
374 "movq 1(%1), %%mm1 \n\t"
375 "movq 1(%1, %3), %%mm3 \n\t"
376 PAVG_MMX(%%mm0, %%mm1)
377 "movq %%mm6, (%2) \n\t"
378 PAVG_MMX(%%mm2, %%mm3)
379 "movq %%mm6, (%2, %3) \n\t"
380 "addl %%eax, %1 \n\t"
381 "addl %%eax, %2 \n\t"
382 "subl $4, %0 \n\t"
383 #else
384 "subl $2, %0 \n\t"
385 #endif
386 "jnz 1b \n\t"
387 :"+g"(h), "+S"(pixels), "+D"(block)
388 :"r"(line_size)
389 :"eax", "memory");
390 #endif
391 }
392
393 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
394 {
395 #if 0
396 UINT8 *p;
397 const UINT8 *pix;
398 p = block;
399 pix = pixels;
400 MOVQ_ZERO(mm7);
401 MOVQ_WONE(mm4);
402 JUMPALIGN();
403 do {
404 __asm __volatile(
405 "movq %1, %%mm0\n\t"
406 "movq %2, %%mm1\n\t"
407 "movq %%mm0, %%mm2\n\t"
408 "movq %%mm1, %%mm3\n\t"
409 "punpcklbw %%mm7, %%mm0\n\t"
410 "punpcklbw %%mm7, %%mm1\n\t"
411 "punpckhbw %%mm7, %%mm2\n\t"
412 "punpckhbw %%mm7, %%mm3\n\t"
413 "paddusw %%mm1, %%mm0\n\t"
414 "paddusw %%mm3, %%mm2\n\t"
415 "paddusw %%mm4, %%mm0\n\t"
416 "paddusw %%mm4, %%mm2\n\t"
417 "psrlw $1, %%mm0\n\t"
418 "psrlw $1, %%mm2\n\t"
419 "packuswb %%mm2, %%mm0\n\t"
420 "movq %%mm0, %0\n\t"
421 :"=m"(*p)
422 :"m"(*pix),
423 "m"(*(pix+line_size))
424 :"memory");
425 pix += line_size;
426 p += line_size;
427 } while (--h);
428 #else
429 __asm __volatile(
430 MOVQ_BFE(%%mm7)
431 "lea (%3, %3), %%eax \n\t"
432 "movq (%1), %%mm0 \n\t"
433 ".balign 8 \n\t"
434 "1: \n\t"
435 "movq (%1, %3), %%mm1 \n\t"
436 "movq (%1, %%eax),%%mm2 \n\t"
437 PAVG_MMX(%%mm1, %%mm0)
438 "movq %%mm6, (%2) \n\t"
439 PAVG_MMX(%%mm2, %%mm1)
440 "movq %%mm6, (%2, %3) \n\t"
441 "addl %%eax, %1 \n\t"
442 "addl %%eax, %2 \n\t"
443 #ifdef LONG_UNROLL
444 "movq (%1, %3), %%mm1 \n\t"
445 "movq (%1, %%eax),%%mm0 \n\t"
446 PAVG_MMX(%%mm1, %%mm2)
447 "movq %%mm6, (%2) \n\t"
448 PAVG_MMX(%%mm0, %%mm1)
449 "movq %%mm6, (%2, %3) \n\t"
450 "addl %%eax, %1 \n\t"
451 "addl %%eax, %2 \n\t"
452 "subl $4, %0 \n\t"
453 #else
454 "subl $2, %0 \n\t"
455 #endif
456 "jnz 1b \n\t"
457 :"+g"(h), "+S"(pixels), "+D"(block)
458 :"r"(line_size)
459 :"eax", "memory");
460 #endif
461
462
463 }
464
465 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) 343 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
466 { 344 {
467 UINT8 *p; 345 UINT8 *p;
468 const UINT8 *pix; 346 const UINT8 *pix;
469 p = block; 347 p = block;
508 pix += line_size; 386 pix += line_size;
509 p += line_size; 387 p += line_size;
510 } while(--h); 388 } while(--h);
511 } 389 }
512 390
513 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
514 {
515 UINT8 *p;
516 const UINT8 *pix;
517 p = block;
518 pix = pixels;
519 MOVQ_ZERO(mm7);
520 do {
521 __asm __volatile(
522 "movq %1, %%mm0\n\t"
523 "movq 1%1, %%mm1\n\t"
524 "movq %%mm0, %%mm2\n\t"
525 "movq %%mm1, %%mm3\n\t"
526 "punpcklbw %%mm7, %%mm0\n\t"
527 "punpcklbw %%mm7, %%mm1\n\t"
528 "punpckhbw %%mm7, %%mm2\n\t"
529 "punpckhbw %%mm7, %%mm3\n\t"
530 "paddusw %%mm1, %%mm0\n\t"
531 "paddusw %%mm3, %%mm2\n\t"
532 "psrlw $1, %%mm0\n\t"
533 "psrlw $1, %%mm2\n\t"
534 "packuswb %%mm2, %%mm0\n\t"
535 "movq %%mm0, %0\n\t"
536 :"=m"(*p)
537 :"m"(*pix)
538 :"memory");
539 pix += line_size;
540 p += line_size;
541 } while (--h);
542 }
543
544 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
545 {
546 UINT8 *p;
547 const UINT8 *pix;
548 p = block;
549 pix = pixels;
550 MOVQ_ZERO(mm7);
551 JUMPALIGN();
552 do {
553 __asm __volatile(
554 "movq %1, %%mm0\n\t"
555 "movq %2, %%mm1\n\t"
556 "movq %%mm0, %%mm2\n\t"
557 "movq %%mm1, %%mm3\n\t"
558 "punpcklbw %%mm7, %%mm0\n\t"
559 "punpcklbw %%mm7, %%mm1\n\t"
560 "punpckhbw %%mm7, %%mm2\n\t"
561 "punpckhbw %%mm7, %%mm3\n\t"
562 "paddusw %%mm1, %%mm0\n\t"
563 "paddusw %%mm3, %%mm2\n\t"
564 "psrlw $1, %%mm0\n\t"
565 "psrlw $1, %%mm2\n\t"
566 "packuswb %%mm2, %%mm0\n\t"
567 "movq %%mm0, %0\n\t"
568 :"=m"(*p)
569 :"m"(*pix),
570 "m"(*(pix+line_size))
571 :"memory");
572 pix += line_size;
573 p += line_size;
574 } while(--h);
575 }
576
577 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) 391 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
578 { 392 {
579 UINT8 *p; 393 UINT8 *p;
580 const UINT8 *pix; 394 const UINT8 *pix;
581 p = block; 395 p = block;
619 :"memory"); 433 :"memory");
620 pix += line_size; 434 pix += line_size;
621 p += line_size; 435 p += line_size;
622 } while(--h); 436 } while(--h);
623 } 437 }
624 438 #endif
625 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) 439 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
626 { 440 {
627 UINT8 *p; 441 UINT8 *p;
628 const UINT8 *pix; 442 const UINT8 *pix;
629 p = block; 443 p = block;