Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 651:45e8f39fda50 libavcodec
put/avg_pixels16
fixing 2 small qpel bugs
author | michaelni |
---|---|
date | Wed, 11 Sep 2002 12:39:53 +0000 |
parents | 47a8964ba5cd |
children | 894b61908734 |
comparison (legend: equal, deleted, inserted, replaced)
650:ef4a33aad86e | 651:45e8f39fda50 |
---|---|
341 pix += line_size*2; | 341 pix += line_size*2; |
342 p += 16; | 342 p += 16; |
343 } while (--i); | 343 } while (--i); |
344 } | 344 } |
345 | 345 |
346 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 346 static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
347 { | 347 { |
348 __asm __volatile( | 348 __asm __volatile( |
349 "lea (%3, %3), %%eax \n\t" | 349 "lea (%3, %3), %%eax \n\t" |
350 ".balign 8 \n\t" | 350 ".balign 8 \n\t" |
351 "1: \n\t" | 351 "1: \n\t" |
367 : "r"(line_size) | 367 : "r"(line_size) |
368 : "%eax", "memory" | 368 : "%eax", "memory" |
369 ); | 369 ); |
370 } | 370 } |
371 | 371 |
372 static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
373 { | |
374 __asm __volatile( | |
375 "lea (%3, %3), %%eax \n\t" | |
376 ".balign 8 \n\t" | |
377 "1: \n\t" | |
378 "movq (%1), %%mm0 \n\t" | |
379 "movq 8(%1), %%mm4 \n\t" | |
380 "movq (%1, %3), %%mm1 \n\t" | |
381 "movq 8(%1, %3), %%mm5 \n\t" | |
382 "movq %%mm0, (%2) \n\t" | |
383 "movq %%mm4, 8(%2) \n\t" | |
384 "movq %%mm1, (%2, %3) \n\t" | |
385 "movq %%mm5, 8(%2, %3) \n\t" | |
386 "addl %%eax, %1 \n\t" | |
387 "addl %%eax, %2 \n\t" | |
388 "movq (%1), %%mm0 \n\t" | |
389 "movq 8(%1), %%mm4 \n\t" | |
390 "movq (%1, %3), %%mm1 \n\t" | |
391 "movq 8(%1, %3), %%mm5 \n\t" | |
392 "movq %%mm0, (%2) \n\t" | |
393 "movq %%mm4, 8(%2) \n\t" | |
394 "movq %%mm1, (%2, %3) \n\t" | |
395 "movq %%mm5, 8(%2, %3) \n\t" | |
396 "addl %%eax, %1 \n\t" | |
397 "addl %%eax, %2 \n\t" | |
398 "subl $4, %0 \n\t" | |
399 "jnz 1b \n\t" | |
400 : "+g"(h), "+r" (pixels), "+r" (block) | |
401 : "r"(line_size) | |
402 : "%eax", "memory" | |
403 ); | |
404 } | |
405 | |
372 static void clear_blocks_mmx(DCTELEM *blocks) | 406 static void clear_blocks_mmx(DCTELEM *blocks) |
373 { | 407 { |
374 __asm __volatile( | 408 __asm __volatile( |
375 "pxor %%mm7, %%mm7 \n\t" | 409 "pxor %%mm7, %%mm7 \n\t" |
376 "movl $-128*6, %%eax \n\t" | 410 "movl $-128*6, %%eax \n\t" |
422 pix_abs8x8 = pix_abs8x8_mmx; | 456 pix_abs8x8 = pix_abs8x8_mmx; |
423 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | 457 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; |
424 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | 458 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; |
425 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | 459 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; |
426 | 460 |
427 put_pixels_tab[0] = put_pixels_mmx; | 461 put_pixels_tab[0][0] = put_pixels16_mmx; |
428 put_pixels_tab[1] = put_pixels_x2_mmx; | 462 put_pixels_tab[0][1] = put_pixels16_x2_mmx; |
429 put_pixels_tab[2] = put_pixels_y2_mmx; | 463 put_pixels_tab[0][2] = put_pixels16_y2_mmx; |
430 put_pixels_tab[3] = put_pixels_xy2_mmx; | 464 put_pixels_tab[0][3] = put_pixels16_xy2_mmx; |
431 | 465 |
432 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | 466 put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx; |
433 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | 467 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; |
434 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | 468 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; |
435 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | 469 put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx; |
436 | 470 |
437 avg_pixels_tab[0] = avg_pixels_mmx; | 471 avg_pixels_tab[0][0] = avg_pixels16_mmx; |
438 avg_pixels_tab[1] = avg_pixels_x2_mmx; | 472 avg_pixels_tab[0][1] = avg_pixels16_x2_mmx; |
439 avg_pixels_tab[2] = avg_pixels_y2_mmx; | 473 avg_pixels_tab[0][2] = avg_pixels16_y2_mmx; |
440 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | 474 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; |
441 | 475 |
442 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | 476 avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx; |
443 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | 477 avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx; |
444 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | 478 avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx; |
445 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | 479 avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx; |
480 | |
481 put_pixels_tab[1][0] = put_pixels8_mmx; | |
482 put_pixels_tab[1][1] = put_pixels8_x2_mmx; | |
483 put_pixels_tab[1][2] = put_pixels8_y2_mmx; | |
484 put_pixels_tab[1][3] = put_pixels8_xy2_mmx; | |
485 | |
486 put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx; | |
487 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; | |
488 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; | |
489 put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; | |
490 | |
491 avg_pixels_tab[1][0] = avg_pixels8_mmx; | |
492 avg_pixels_tab[1][1] = avg_pixels8_x2_mmx; | |
493 avg_pixels_tab[1][2] = avg_pixels8_y2_mmx; | |
494 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; | |
495 | |
496 avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx; | |
497 avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx; | |
498 avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; | |
499 avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; | |
446 | 500 |
447 if (mm_flags & MM_MMXEXT) { | 501 if (mm_flags & MM_MMXEXT) { |
448 pix_abs16x16 = pix_abs16x16_mmx2; | 502 pix_abs16x16 = pix_abs16x16_mmx2; |
449 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | 503 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; |
450 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | 504 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; |
453 pix_abs8x8 = pix_abs8x8_mmx2; | 507 pix_abs8x8 = pix_abs8x8_mmx2; |
454 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | 508 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; |
455 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | 509 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; |
456 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | 510 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; |
457 | 511 |
458 put_pixels_tab[1] = put_pixels_x2_mmx2; | 512 put_pixels_tab[0][1] = put_pixels16_x2_mmx2; |
459 put_pixels_tab[2] = put_pixels_y2_mmx2; | 513 put_pixels_tab[0][2] = put_pixels16_y2_mmx2; |
460 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; | 514 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; |
461 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; | 515 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; |
462 | 516 |
463 avg_pixels_tab[0] = avg_pixels_mmx2; | 517 avg_pixels_tab[0][0] = avg_pixels16_mmx2; |
464 avg_pixels_tab[1] = avg_pixels_x2_mmx2; | 518 avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; |
465 avg_pixels_tab[2] = avg_pixels_y2_mmx2; | 519 avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; |
466 avg_pixels_tab[3] = avg_pixels_xy2_mmx2; | 520 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; |
521 | |
522 put_pixels_tab[1][1] = put_pixels8_x2_mmx2; | |
523 put_pixels_tab[1][2] = put_pixels8_y2_mmx2; | |
524 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; | |
525 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; | |
526 | |
527 avg_pixels_tab[1][0] = avg_pixels8_mmx2; | |
528 avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; | |
529 avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; | |
530 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; | |
467 } else if (mm_flags & MM_3DNOW) { | 531 } else if (mm_flags & MM_3DNOW) { |
468 put_pixels_tab[1] = put_pixels_x2_3dnow; | 532 put_pixels_tab[0][1] = put_pixels16_x2_3dnow; |
469 put_pixels_tab[2] = put_pixels_y2_3dnow; | 533 put_pixels_tab[0][2] = put_pixels16_y2_3dnow; |
470 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; | 534 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; |
471 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; | 535 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; |
472 | 536 |
473 avg_pixels_tab[0] = avg_pixels_3dnow; | 537 avg_pixels_tab[0][0] = avg_pixels16_3dnow; |
474 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | 538 avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; |
475 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | 539 avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; |
476 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | 540 avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; |
541 | |
542 put_pixels_tab[1][1] = put_pixels8_x2_3dnow; | |
543 put_pixels_tab[1][2] = put_pixels8_y2_3dnow; | |
544 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; | |
545 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; | |
546 | |
547 avg_pixels_tab[1][0] = avg_pixels8_3dnow; | |
548 avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; | |
549 avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; | |
550 avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; | |
477 } | 551 } |
478 | 552 |
479 /* idct */ | 553 /* idct */ |
480 if (mm_flags & MM_MMXEXT) { | 554 if (mm_flags & MM_MMXEXT) { |
481 ff_idct = ff_mmxext_idct; | 555 ff_idct = ff_mmxext_idct; |
550 this function should be kept as small as possible because it is | 624 this function should be kept as small as possible because it is |
551 always difficult to test automatically non bit exact cases. */ | 625 always difficult to test automatically non bit exact cases. */ |
552 void dsputil_set_bit_exact_mmx(void) | 626 void dsputil_set_bit_exact_mmx(void) |
553 { | 627 { |
554 if (mm_flags & MM_MMX) { | 628 if (mm_flags & MM_MMX) { |
629 | |
630 /* MMX2 & 3DNOW */ | |
631 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; | |
632 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; | |
633 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; | |
634 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; | |
635 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; | |
636 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; | |
637 | |
555 if (mm_flags & MM_MMXEXT) { | 638 if (mm_flags & MM_MMXEXT) { |
556 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
557 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
558 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
559 | |
560 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | 639 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; |
561 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | 640 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; |
562 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; | 641 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
563 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | 642 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; |
564 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | 643 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; |
565 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | 644 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; |
566 } else if (mm_flags & MM_3DNOW) { | |
567 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
568 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
569 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
570 } | 645 } |
571 #ifdef SIMPLE_IDCT | 646 #ifdef SIMPLE_IDCT |
572 if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx) | 647 if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx) |
573 ff_idct_put= bit_exact_idct_put; | 648 ff_idct_put= bit_exact_idct_put; |
574 #endif | 649 #endif |