comparison i386/snowdsp_mmx.c @ 3566:b63ef6fcbc70 libavcodec

Fix x86 SIMD asm and PIC (position-independent code), patch from Martin von Gagern <Martin.vGagern@gmx.net>
author lu_zero
date Thu, 10 Aug 2006 16:05:29 +0000
parents e0927bc44a10
children c8c591fe26f8
comparison
equal deleted inserted replaced
3565:f086f8868bb6 3566:b63ef6fcbc70
461 asm volatile ( 461 asm volatile (
462 "jmp 2f \n\t" 462 "jmp 2f \n\t"
463 "1: \n\t" 463 "1: \n\t"
464 464
465 "mov %6, %%"REG_a" \n\t" 465 "mov %6, %%"REG_a" \n\t"
466 "mov %4, %%"REG_b" \n\t" 466 "mov %4, %%"REG_S" \n\t"
467 467
468 snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6") 468 snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6")
469 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") 469 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
470 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") 470 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
471 snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\ 471 snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\
472 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") 472 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
473 473
480 snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6") 480 snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6")
481 snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7") 481 snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7")
482 snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") 482 snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
483 snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7") 483 snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7")
484 "mov %3, %%"REG_c" \n\t" 484 "mov %3, %%"REG_c" \n\t"
485 snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6") 485 snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6")
486 snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7") 486 snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7")
487 snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") 487 snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
488 snow_vertical_compose_sse2_store(REG_b,"xmm0","xmm2","xmm4","xmm6") 488 snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6")
489 "mov %2, %%"REG_a" \n\t" 489 "mov %2, %%"REG_a" \n\t"
490 snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7") 490 snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7")
491 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") 491 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
492 snow_vertical_compose_sse2_sll("2","xmm1","xmm3","xmm5","xmm7")\ 492 snow_vertical_compose_sse2_sll("2","xmm1","xmm3","xmm5","xmm7")\
493 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") 493 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
494 494
495 "pcmpeqd %%xmm1, %%xmm1 \n\t" 495 "pcmpeqd %%xmm1, %%xmm1 \n\t"
496 "pslld $31, %%xmm1 \n\t" 496 "pslld $31, %%xmm1 \n\t"
497 "psrld $28, %%xmm1 \n\t" 497 "psrld $28, %%xmm1 \n\t"
498 "mov %1, %%"REG_b" \n\t" 498 "mov %1, %%"REG_S" \n\t"
499 499
500 snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6") 500 snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
501 snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6") 501 snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6")
502 snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6") 502 snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
503 snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6") 503 snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
504 snow_vertical_compose_sse2_add(REG_b,"xmm0","xmm2","xmm4","xmm6") 504 snow_vertical_compose_sse2_add(REG_S,"xmm0","xmm2","xmm4","xmm6")
505 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") 505 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
506 snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\ 506 snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\
507 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") 507 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
508 snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6") 508 snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
509 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") 509 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
513 "sub $16, %%"REG_d" \n\t" 513 "sub $16, %%"REG_d" \n\t"
514 "jge 1b \n\t" 514 "jge 1b \n\t"
515 :"+d"(i) 515 :"+d"(i)
516 : 516 :
517 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): 517 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
518 "%"REG_a"","%"REG_b"","%"REG_c""); 518 "%"REG_a"","%"REG_S"","%"REG_c"");
519 } 519 }
520 520
521 #define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ 521 #define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
522 ""op" (%%"r",%%"REG_d",4), %%"t0" \n\t"\ 522 ""op" (%%"r",%%"REG_d",4), %%"t0" \n\t"\
523 ""op" 8(%%"r",%%"REG_d",4), %%"t1" \n\t"\ 523 ""op" 8(%%"r",%%"REG_d",4), %%"t1" \n\t"\
568 asm volatile( 568 asm volatile(
569 "jmp 2f \n\t" 569 "jmp 2f \n\t"
570 "1: \n\t" 570 "1: \n\t"
571 571
572 "mov %6, %%"REG_a" \n\t" 572 "mov %6, %%"REG_a" \n\t"
573 "mov %4, %%"REG_b" \n\t" 573 "mov %4, %%"REG_S" \n\t"
574 574
575 snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6") 575 snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6")
576 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") 576 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
577 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") 577 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
578 snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6") 578 snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
579 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") 579 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
580 580
587 snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6") 587 snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6")
588 snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7") 588 snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7")
589 snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") 589 snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
590 snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7") 590 snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7")
591 "mov %3, %%"REG_c" \n\t" 591 "mov %3, %%"REG_c" \n\t"
592 snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6") 592 snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6")
593 snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7") 593 snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7")
594 snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") 594 snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
595 snow_vertical_compose_mmx_store(REG_b,"mm0","mm2","mm4","mm6") 595 snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6")
596 "mov %2, %%"REG_a" \n\t" 596 "mov %2, %%"REG_a" \n\t"
597 snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7") 597 snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7")
598 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") 598 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
599 snow_vertical_compose_mmx_sll("2","mm1","mm3","mm5","mm7") 599 snow_vertical_compose_mmx_sll("2","mm1","mm3","mm5","mm7")
600 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") 600 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
601 601
602 "pcmpeqd %%mm1, %%mm1 \n\t" 602 "pcmpeqd %%mm1, %%mm1 \n\t"
603 "pslld $31, %%mm1 \n\t" 603 "pslld $31, %%mm1 \n\t"
604 "psrld $28, %%mm1 \n\t" 604 "psrld $28, %%mm1 \n\t"
605 "mov %1, %%"REG_b" \n\t" 605 "mov %1, %%"REG_S" \n\t"
606 606
607 snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6") 607 snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
608 snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6") 608 snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6")
609 snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6") 609 snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
610 snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6") 610 snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
611 snow_vertical_compose_mmx_add(REG_b,"mm0","mm2","mm4","mm6") 611 snow_vertical_compose_mmx_add(REG_S,"mm0","mm2","mm4","mm6")
612 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") 612 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
613 snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6") 613 snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
614 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") 614 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
615 snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6") 615 snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
616 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") 616 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
620 "sub $8, %%"REG_d" \n\t" 620 "sub $8, %%"REG_d" \n\t"
621 "jge 1b \n\t" 621 "jge 1b \n\t"
622 :"+d"(i) 622 :"+d"(i)
623 : 623 :
624 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): 624 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
625 "%"REG_a"","%"REG_b"","%"REG_c""); 625 "%"REG_a"","%"REG_S"","%"REG_c"");
626 } 626 }
627 627
628 #define snow_inner_add_yblock_sse2_header \ 628 #define snow_inner_add_yblock_sse2_header \
629 DWTELEM * * dst_array = sb->line + src_y;\ 629 DWTELEM * * dst_array = sb->line + src_y;\
630 long tmp;\
630 asm volatile(\ 631 asm volatile(\
631 "mov %6, %%"REG_c" \n\t"\ 632 "mov %7, %%"REG_c" \n\t"\
632 "mov %5, %%"REG_b" \n\t"\ 633 "mov %6, %2 \n\t"\
633 "mov %3, %%"REG_S" \n\t"\ 634 "mov %4, %%"REG_S" \n\t"\
634 "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ 635 "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
635 "pcmpeqd %%xmm3, %%xmm3 \n\t"\ 636 "pcmpeqd %%xmm3, %%xmm3 \n\t"\
636 "pslld $31, %%xmm3 \n\t"\ 637 "pslld $31, %%xmm3 \n\t"\
637 "psrld $24, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ 638 "psrld $24, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
638 "1: \n\t"\ 639 "1: \n\t"\
639 "mov %1, %%"REG_D" \n\t"\ 640 "mov %1, %%"REG_D" \n\t"\
640 "mov (%%"REG_D"), %%"REG_D" \n\t"\ 641 "mov (%%"REG_D"), %%"REG_D" \n\t"\
641 "add %2, %%"REG_D" \n\t" 642 "add %3, %%"REG_D" \n\t"
642 643
643 #define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ 644 #define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
644 "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ 645 "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
645 "movq (%%"REG_d"), %%"out_reg1" \n\t"\ 646 "movq (%%"REG_d"), %%"out_reg1" \n\t"\
646 "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ 647 "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
684 "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ 685 "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
685 "add %%"REG_c", (%%"REG_a") \n\t" 686 "add %%"REG_c", (%%"REG_a") \n\t"
686 687
687 #define snow_inner_add_yblock_sse2_end_common2\ 688 #define snow_inner_add_yblock_sse2_end_common2\
688 "jnz 1b \n\t"\ 689 "jnz 1b \n\t"\
689 :"+m"(dst8),"+m"(dst_array)\ 690 :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
690 :\ 691 :\
691 "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ 692 "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
692 "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); 693 "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
693 694
694 #define snow_inner_add_yblock_sse2_end_8\ 695 #define snow_inner_add_yblock_sse2_end_8\
695 "sal $1, %%"REG_c" \n\t"\ 696 "sal $1, %%"REG_c" \n\t"\
696 "add $"PTR_SIZE"*2, %1 \n\t"\ 697 "add $"PTR_SIZE"*2, %1 \n\t"\
697 snow_inner_add_yblock_sse2_end_common1\ 698 snow_inner_add_yblock_sse2_end_common1\
698 "sar $1, %%"REG_c" \n\t"\ 699 "sar $1, %%"REG_c" \n\t"\
699 "sub $2, %%"REG_b" \n\t"\ 700 "sub $2, %2 \n\t"\
700 snow_inner_add_yblock_sse2_end_common2 701 snow_inner_add_yblock_sse2_end_common2
701 702
702 #define snow_inner_add_yblock_sse2_end_16\ 703 #define snow_inner_add_yblock_sse2_end_16\
703 "add $"PTR_SIZE"*1, %1 \n\t"\ 704 "add $"PTR_SIZE"*1, %1 \n\t"\
704 snow_inner_add_yblock_sse2_end_common1\ 705 snow_inner_add_yblock_sse2_end_common1\
705 "dec %%"REG_b" \n\t"\ 706 "dec %2 \n\t"\
706 snow_inner_add_yblock_sse2_end_common2 707 snow_inner_add_yblock_sse2_end_common2
707 708
708 static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, 709 static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
709 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ 710 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
710 snow_inner_add_yblock_sse2_header 711 snow_inner_add_yblock_sse2_header
725 "paddd %%xmm3, %%xmm0 \n\t" 726 "paddd %%xmm3, %%xmm0 \n\t"
726 "paddd %%xmm3, %%xmm2 \n\t" 727 "paddd %%xmm3, %%xmm2 \n\t"
727 728
728 "mov %1, %%"REG_D" \n\t" 729 "mov %1, %%"REG_D" \n\t"
729 "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" 730 "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
730 "add %2, %%"REG_D" \n\t" 731 "add %3, %%"REG_D" \n\t"
731 732
732 "movdqa (%%"REG_D"), %%xmm4 \n\t" 733 "movdqa (%%"REG_D"), %%xmm4 \n\t"
733 "movdqa %%xmm5, %%xmm6 \n\t" 734 "movdqa %%xmm5, %%xmm6 \n\t"
734 "punpckhwd %%xmm7, %%xmm5 \n\t" 735 "punpckhwd %%xmm7, %%xmm5 \n\t"
735 "punpcklwd %%xmm7, %%xmm6 \n\t" 736 "punpcklwd %%xmm7, %%xmm6 \n\t"
790 snow_inner_add_yblock_sse2_end_16 791 snow_inner_add_yblock_sse2_end_16
791 } 792 }
792 793
793 #define snow_inner_add_yblock_mmx_header \ 794 #define snow_inner_add_yblock_mmx_header \
794 DWTELEM * * dst_array = sb->line + src_y;\ 795 DWTELEM * * dst_array = sb->line + src_y;\
796 long tmp;\
795 asm volatile(\ 797 asm volatile(\
796 "mov %6, %%"REG_c" \n\t"\ 798 "mov %7, %%"REG_c" \n\t"\
797 "mov %5, %%"REG_b" \n\t"\ 799 "mov %6, %2 \n\t"\
798 "mov %3, %%"REG_S" \n\t"\ 800 "mov %4, %%"REG_S" \n\t"\
799 "pxor %%mm7, %%mm7 \n\t" /* 0 */\ 801 "pxor %%mm7, %%mm7 \n\t" /* 0 */\
800 "pcmpeqd %%mm3, %%mm3 \n\t"\ 802 "pcmpeqd %%mm3, %%mm3 \n\t"\
801 "pslld $31, %%mm3 \n\t"\ 803 "pslld $31, %%mm3 \n\t"\
802 "psrld $24, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ 804 "psrld $24, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
803 "1: \n\t"\ 805 "1: \n\t"\
804 "mov %1, %%"REG_D" \n\t"\ 806 "mov %1, %%"REG_D" \n\t"\
805 "mov (%%"REG_D"), %%"REG_D" \n\t"\ 807 "mov (%%"REG_D"), %%"REG_D" \n\t"\
806 "add %2, %%"REG_D" \n\t" 808 "add %3, %%"REG_D" \n\t"
807 809
808 #define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ 810 #define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
809 "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ 811 "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
810 "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ 812 "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
811 "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ 813 "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
855 "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ 857 "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
856 "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ 858 "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
857 "add %%"REG_c", (%%"REG_a") \n\t"\ 859 "add %%"REG_c", (%%"REG_a") \n\t"\
858 "add $"PTR_SIZE"*1, %1 \n\t"\ 860 "add $"PTR_SIZE"*1, %1 \n\t"\
859 "add %%"REG_c", %0 \n\t"\ 861 "add %%"REG_c", %0 \n\t"\
860 "dec %%"REG_b" \n\t"\ 862 "dec %2 \n\t"\
861 "jnz 1b \n\t"\ 863 "jnz 1b \n\t"\
862 :"+m"(dst8),"+m"(dst_array)\ 864 :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
863 :\ 865 :\
864 "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ 866 "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
865 "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); 867 "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
866 868
867 static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, 869 static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
868 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ 870 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
869 snow_inner_add_yblock_mmx_header 871 snow_inner_add_yblock_mmx_header
870 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") 872 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")