comparison i386/dsputil_mmx.c @ 651:45e8f39fda50 libavcodec

put/avg_pixels16 fixing 2 small qpel bugs
author michaelni
date Wed, 11 Sep 2002 12:39:53 +0000
parents 47a8964ba5cd
children 894b61908734
comparison
equal deleted inserted replaced
650:ef4a33aad86e 651:45e8f39fda50
341 pix += line_size*2; 341 pix += line_size*2;
342 p += 16; 342 p += 16;
343 } while (--i); 343 } while (--i);
344 } 344 }
345 345
346 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) 346 static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
347 { 347 {
348 __asm __volatile( 348 __asm __volatile(
349 "lea (%3, %3), %%eax \n\t" 349 "lea (%3, %3), %%eax \n\t"
350 ".balign 8 \n\t" 350 ".balign 8 \n\t"
351 "1: \n\t" 351 "1: \n\t"
367 : "r"(line_size) 367 : "r"(line_size)
368 : "%eax", "memory" 368 : "%eax", "memory"
369 ); 369 );
370 } 370 }
371 371
/*
 * Copy a 16-byte-wide block of pixel rows from `pixels` to `block` with MMX
 * 64-bit moves (two movq loads/stores per row).
 *
 * Asm operand mapping: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * line_size is used as the byte offset between consecutive rows for both the
 * source and the destination.
 *
 * Each loop iteration copies four rows (two groups of two rows) and then
 * subtracts 4 from h; the loop only exits when h reaches exactly zero, so h
 * is expected to be a non-zero multiple of 4.
 *
 * The pointers are advanced with 32-bit `addl`, and %eax is used as scratch
 * (declared in the clobber list together with "memory").
 * NOTE(review): no `emms` is issued here -- presumably the caller is
 * responsible for leaving MMX state; confirm against the callers.
 */
372 static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
373 {
374 __asm __volatile(
375 "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size: stride for a pair of rows */
376 ".balign 8 \n\t"
377 "1: \n\t"
378 "movq (%1), %%mm0 \n\t" /* row 0, bytes 0..7 */
379 "movq 8(%1), %%mm4 \n\t" /* row 0, bytes 8..15 */
380 "movq (%1, %3), %%mm1 \n\t" /* row 1, bytes 0..7 */
381 "movq 8(%1, %3), %%mm5 \n\t" /* row 1, bytes 8..15 */
382 "movq %%mm0, (%2) \n\t"
383 "movq %%mm4, 8(%2) \n\t"
384 "movq %%mm1, (%2, %3) \n\t"
385 "movq %%mm5, 8(%2, %3) \n\t"
386 "addl %%eax, %1 \n\t" /* advance source by two rows */
387 "addl %%eax, %2 \n\t" /* advance destination by two rows */
388 "movq (%1), %%mm0 \n\t" /* same two-row copy again for rows 2 and 3 */
389 "movq 8(%1), %%mm4 \n\t"
390 "movq (%1, %3), %%mm1 \n\t"
391 "movq 8(%1, %3), %%mm5 \n\t"
392 "movq %%mm0, (%2) \n\t"
393 "movq %%mm4, 8(%2) \n\t"
394 "movq %%mm1, (%2, %3) \n\t"
395 "movq %%mm5, 8(%2, %3) \n\t"
396 "addl %%eax, %1 \n\t"
397 "addl %%eax, %2 \n\t"
398 "subl $4, %0 \n\t" /* four rows done in this iteration */
399 "jnz 1b \n\t" /* repeat until h == 0 */
400 : "+g"(h), "+r" (pixels), "+r" (block)
401 : "r"(line_size)
402 : "%eax", "memory"
403 );
404 }
405
372 static void clear_blocks_mmx(DCTELEM *blocks) 406 static void clear_blocks_mmx(DCTELEM *blocks)
373 { 407 {
374 __asm __volatile( 408 __asm __volatile(
375 "pxor %%mm7, %%mm7 \n\t" 409 "pxor %%mm7, %%mm7 \n\t"
376 "movl $-128*6, %%eax \n\t" 410 "movl $-128*6, %%eax \n\t"
422 pix_abs8x8 = pix_abs8x8_mmx; 456 pix_abs8x8 = pix_abs8x8_mmx;
423 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; 457 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
424 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; 458 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
425 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; 459 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
426 460
427 put_pixels_tab[0] = put_pixels_mmx; 461 put_pixels_tab[0][0] = put_pixels16_mmx;
428 put_pixels_tab[1] = put_pixels_x2_mmx; 462 put_pixels_tab[0][1] = put_pixels16_x2_mmx;
429 put_pixels_tab[2] = put_pixels_y2_mmx; 463 put_pixels_tab[0][2] = put_pixels16_y2_mmx;
430 put_pixels_tab[3] = put_pixels_xy2_mmx; 464 put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
431 465
432 put_no_rnd_pixels_tab[0] = put_pixels_mmx; 466 put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
433 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; 467 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
434 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; 468 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
435 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; 469 put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
436 470
437 avg_pixels_tab[0] = avg_pixels_mmx; 471 avg_pixels_tab[0][0] = avg_pixels16_mmx;
438 avg_pixels_tab[1] = avg_pixels_x2_mmx; 472 avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
439 avg_pixels_tab[2] = avg_pixels_y2_mmx; 473 avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
440 avg_pixels_tab[3] = avg_pixels_xy2_mmx; 474 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
441 475
442 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; 476 avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
443 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; 477 avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
444 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; 478 avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
445 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; 479 avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
480
481 put_pixels_tab[1][0] = put_pixels8_mmx;
482 put_pixels_tab[1][1] = put_pixels8_x2_mmx;
483 put_pixels_tab[1][2] = put_pixels8_y2_mmx;
484 put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
485
486 put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
487 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
488 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
489 put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
490
491 avg_pixels_tab[1][0] = avg_pixels8_mmx;
492 avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
493 avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
494 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
495
496 avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
497 avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
498 avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
499 avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
446 500
447 if (mm_flags & MM_MMXEXT) { 501 if (mm_flags & MM_MMXEXT) {
448 pix_abs16x16 = pix_abs16x16_mmx2; 502 pix_abs16x16 = pix_abs16x16_mmx2;
449 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; 503 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
450 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; 504 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
453 pix_abs8x8 = pix_abs8x8_mmx2; 507 pix_abs8x8 = pix_abs8x8_mmx2;
454 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; 508 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
455 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; 509 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
456 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; 510 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
457 511
458 put_pixels_tab[1] = put_pixels_x2_mmx2; 512 put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
459 put_pixels_tab[2] = put_pixels_y2_mmx2; 513 put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
460 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; 514 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
461 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; 515 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
462 516
463 avg_pixels_tab[0] = avg_pixels_mmx2; 517 avg_pixels_tab[0][0] = avg_pixels16_mmx2;
464 avg_pixels_tab[1] = avg_pixels_x2_mmx2; 518 avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
465 avg_pixels_tab[2] = avg_pixels_y2_mmx2; 519 avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
466 avg_pixels_tab[3] = avg_pixels_xy2_mmx2; 520 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
521
522 put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
523 put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
524 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
525 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
526
527 avg_pixels_tab[1][0] = avg_pixels8_mmx2;
528 avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
529 avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
530 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
467 } else if (mm_flags & MM_3DNOW) { 531 } else if (mm_flags & MM_3DNOW) {
468 put_pixels_tab[1] = put_pixels_x2_3dnow; 532 put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
469 put_pixels_tab[2] = put_pixels_y2_3dnow; 533 put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
470 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; 534 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
471 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; 535 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
472 536
473 avg_pixels_tab[0] = avg_pixels_3dnow; 537 avg_pixels_tab[0][0] = avg_pixels16_3dnow;
474 avg_pixels_tab[1] = avg_pixels_x2_3dnow; 538 avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
475 avg_pixels_tab[2] = avg_pixels_y2_3dnow; 539 avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
476 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; 540 avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
541
542 put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
543 put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
544 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
545 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
546
547 avg_pixels_tab[1][0] = avg_pixels8_3dnow;
548 avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
549 avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
550 avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
477 } 551 }
478 552
479 /* idct */ 553 /* idct */
480 if (mm_flags & MM_MMXEXT) { 554 if (mm_flags & MM_MMXEXT) {
481 ff_idct = ff_mmxext_idct; 555 ff_idct = ff_mmxext_idct;
550 this function should be kept as small as possible because it is 624 this function should be kept as small as possible because it is
551 always difficult to test automatically non bit exact cases. */ 625 always difficult to test automatically non bit exact cases. */
552 void dsputil_set_bit_exact_mmx(void) 626 void dsputil_set_bit_exact_mmx(void)
553 { 627 {
554 if (mm_flags & MM_MMX) { 628 if (mm_flags & MM_MMX) {
629
630 /* MMX2 & 3DNOW */
631 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
632 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
633 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
634 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
635 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
636 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
637
555 if (mm_flags & MM_MMXEXT) { 638 if (mm_flags & MM_MMXEXT) {
556 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
557 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
558 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
559
560 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; 639 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
561 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; 640 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
562 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; 641 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
563 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; 642 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
564 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; 643 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
565 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; 644 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
566 } else if (mm_flags & MM_3DNOW) {
567 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
568 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
569 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
570 } 645 }
571 #ifdef SIMPLE_IDCT 646 #ifdef SIMPLE_IDCT
572 if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx) 647 if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx)
573 ff_idct_put= bit_exact_idct_put; 648 ff_idct_put= bit_exact_idct_put;
574 #endif 649 #endif