Mercurial > libavcodec.hg
comparison i386/snowdsp_mmx.c @ 3566:b63ef6fcbc70 libavcodec
Fix x86 SIMD asm and PIC, patch from Martin von Gagern <Martin.vGagern@gmx.net>
field | value |
---|---|
author | lu_zero |
date | Thu, 10 Aug 2006 16:05:29 +0000 |
parents | e0927bc44a10 |
children | c8c591fe26f8 |
comparison
legend: equal, deleted, inserted, replaced
3565:f086f8868bb6 | 3566:b63ef6fcbc70 |
---|---|
461 asm volatile ( | 461 asm volatile ( |
462 "jmp 2f \n\t" | 462 "jmp 2f \n\t" |
463 "1: \n\t" | 463 "1: \n\t" |
464 | 464 |
465 "mov %6, %%"REG_a" \n\t" | 465 "mov %6, %%"REG_a" \n\t" |
466 "mov %4, %%"REG_b" \n\t" | 466 "mov %4, %%"REG_S" \n\t" |
467 | 467 |
468 snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6") | 468 snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6") |
469 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") | 469 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") |
470 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") | 470 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
471 snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\ | 471 snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\ |
472 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") | 472 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
473 | 473 |
480 snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6") | 480 snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6") |
481 snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7") | 481 snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7") |
482 snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") | 482 snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
483 snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7") | 483 snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7") |
484 "mov %3, %%"REG_c" \n\t" | 484 "mov %3, %%"REG_c" \n\t" |
485 snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6") | 485 snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6") |
486 snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7") | 486 snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7") |
487 snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") | 487 snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
488 snow_vertical_compose_sse2_store(REG_b,"xmm0","xmm2","xmm4","xmm6") | 488 snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6") |
489 "mov %2, %%"REG_a" \n\t" | 489 "mov %2, %%"REG_a" \n\t" |
490 snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7") | 490 snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7") |
491 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") | 491 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") |
492 snow_vertical_compose_sse2_sll("2","xmm1","xmm3","xmm5","xmm7")\ | 492 snow_vertical_compose_sse2_sll("2","xmm1","xmm3","xmm5","xmm7")\ |
493 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") | 493 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
494 | 494 |
495 "pcmpeqd %%xmm1, %%xmm1 \n\t" | 495 "pcmpeqd %%xmm1, %%xmm1 \n\t" |
496 "pslld $31, %%xmm1 \n\t" | 496 "pslld $31, %%xmm1 \n\t" |
497 "psrld $28, %%xmm1 \n\t" | 497 "psrld $28, %%xmm1 \n\t" |
498 "mov %1, %%"REG_b" \n\t" | 498 "mov %1, %%"REG_S" \n\t" |
499 | 499 |
500 snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6") | 500 snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6") |
501 snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6") | 501 snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6") |
502 snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6") | 502 snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6") |
503 snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6") | 503 snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6") |
504 snow_vertical_compose_sse2_add(REG_b,"xmm0","xmm2","xmm4","xmm6") | 504 snow_vertical_compose_sse2_add(REG_S,"xmm0","xmm2","xmm4","xmm6") |
505 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") | 505 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
506 snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\ | 506 snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\ |
507 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") | 507 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
508 snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6") | 508 snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6") |
509 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") | 509 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") |
513 "sub $16, %%"REG_d" \n\t" | 513 "sub $16, %%"REG_d" \n\t" |
514 "jge 1b \n\t" | 514 "jge 1b \n\t" |
515 :"+d"(i) | 515 :"+d"(i) |
516 : | 516 : |
517 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): | 517 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): |
518 "%"REG_a"","%"REG_b"","%"REG_c""); | 518 "%"REG_a"","%"REG_S"","%"REG_c""); |
519 } | 519 } |
520 | 520 |
521 #define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ | 521 #define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ |
522 ""op" (%%"r",%%"REG_d",4), %%"t0" \n\t"\ | 522 ""op" (%%"r",%%"REG_d",4), %%"t0" \n\t"\ |
523 ""op" 8(%%"r",%%"REG_d",4), %%"t1" \n\t"\ | 523 ""op" 8(%%"r",%%"REG_d",4), %%"t1" \n\t"\ |
568 asm volatile( | 568 asm volatile( |
569 "jmp 2f \n\t" | 569 "jmp 2f \n\t" |
570 "1: \n\t" | 570 "1: \n\t" |
571 | 571 |
572 "mov %6, %%"REG_a" \n\t" | 572 "mov %6, %%"REG_a" \n\t" |
573 "mov %4, %%"REG_b" \n\t" | 573 "mov %4, %%"REG_S" \n\t" |
574 | 574 |
575 snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6") | 575 snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6") |
576 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") | 576 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") |
577 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") | 577 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
578 snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6") | 578 snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6") |
579 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") | 579 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
580 | 580 |
587 snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6") | 587 snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6") |
588 snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7") | 588 snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7") |
589 snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") | 589 snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
590 snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7") | 590 snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7") |
591 "mov %3, %%"REG_c" \n\t" | 591 "mov %3, %%"REG_c" \n\t" |
592 snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6") | 592 snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6") |
593 snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7") | 593 snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7") |
594 snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") | 594 snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
595 snow_vertical_compose_mmx_store(REG_b,"mm0","mm2","mm4","mm6") | 595 snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6") |
596 "mov %2, %%"REG_a" \n\t" | 596 "mov %2, %%"REG_a" \n\t" |
597 snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7") | 597 snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7") |
598 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") | 598 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") |
599 snow_vertical_compose_mmx_sll("2","mm1","mm3","mm5","mm7") | 599 snow_vertical_compose_mmx_sll("2","mm1","mm3","mm5","mm7") |
600 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") | 600 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
601 | 601 |
602 "pcmpeqd %%mm1, %%mm1 \n\t" | 602 "pcmpeqd %%mm1, %%mm1 \n\t" |
603 "pslld $31, %%mm1 \n\t" | 603 "pslld $31, %%mm1 \n\t" |
604 "psrld $28, %%mm1 \n\t" | 604 "psrld $28, %%mm1 \n\t" |
605 "mov %1, %%"REG_b" \n\t" | 605 "mov %1, %%"REG_S" \n\t" |
606 | 606 |
607 snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6") | 607 snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6") |
608 snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6") | 608 snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6") |
609 snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6") | 609 snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6") |
610 snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6") | 610 snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6") |
611 snow_vertical_compose_mmx_add(REG_b,"mm0","mm2","mm4","mm6") | 611 snow_vertical_compose_mmx_add(REG_S,"mm0","mm2","mm4","mm6") |
612 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") | 612 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
613 snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6") | 613 snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6") |
614 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") | 614 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
615 snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6") | 615 snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6") |
616 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") | 616 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") |
620 "sub $8, %%"REG_d" \n\t" | 620 "sub $8, %%"REG_d" \n\t" |
621 "jge 1b \n\t" | 621 "jge 1b \n\t" |
622 :"+d"(i) | 622 :"+d"(i) |
623 : | 623 : |
624 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): | 624 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): |
625 "%"REG_a"","%"REG_b"","%"REG_c""); | 625 "%"REG_a"","%"REG_S"","%"REG_c""); |
626 } | 626 } |
627 | 627 |
628 #define snow_inner_add_yblock_sse2_header \ | 628 #define snow_inner_add_yblock_sse2_header \ |
629 DWTELEM * * dst_array = sb->line + src_y;\ | 629 DWTELEM * * dst_array = sb->line + src_y;\ |
630 long tmp;\ | |
630 asm volatile(\ | 631 asm volatile(\ |
631 "mov %6, %%"REG_c" \n\t"\ | 632 "mov %7, %%"REG_c" \n\t"\ |
632 "mov %5, %%"REG_b" \n\t"\ | 633 "mov %6, %2 \n\t"\ |
633 "mov %3, %%"REG_S" \n\t"\ | 634 "mov %4, %%"REG_S" \n\t"\ |
634 "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ | 635 "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ |
635 "pcmpeqd %%xmm3, %%xmm3 \n\t"\ | 636 "pcmpeqd %%xmm3, %%xmm3 \n\t"\ |
636 "pslld $31, %%xmm3 \n\t"\ | 637 "pslld $31, %%xmm3 \n\t"\ |
637 "psrld $24, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ | 638 "psrld $24, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ |
638 "1: \n\t"\ | 639 "1: \n\t"\ |
639 "mov %1, %%"REG_D" \n\t"\ | 640 "mov %1, %%"REG_D" \n\t"\ |
640 "mov (%%"REG_D"), %%"REG_D" \n\t"\ | 641 "mov (%%"REG_D"), %%"REG_D" \n\t"\ |
641 "add %2, %%"REG_D" \n\t" | 642 "add %3, %%"REG_D" \n\t" |
642 | 643 |
643 #define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ | 644 #define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ |
644 "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ | 645 "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ |
645 "movq (%%"REG_d"), %%"out_reg1" \n\t"\ | 646 "movq (%%"REG_d"), %%"out_reg1" \n\t"\ |
646 "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ | 647 "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ |
684 "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ | 685 "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ |
685 "add %%"REG_c", (%%"REG_a") \n\t" | 686 "add %%"REG_c", (%%"REG_a") \n\t" |
686 | 687 |
687 #define snow_inner_add_yblock_sse2_end_common2\ | 688 #define snow_inner_add_yblock_sse2_end_common2\ |
688 "jnz 1b \n\t"\ | 689 "jnz 1b \n\t"\ |
689 :"+m"(dst8),"+m"(dst_array)\ | 690 :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ |
690 :\ | 691 :\ |
691 "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ | 692 "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ |
692 "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); | 693 "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); |
693 | 694 |
694 #define snow_inner_add_yblock_sse2_end_8\ | 695 #define snow_inner_add_yblock_sse2_end_8\ |
695 "sal $1, %%"REG_c" \n\t"\ | 696 "sal $1, %%"REG_c" \n\t"\ |
696 "add $"PTR_SIZE"*2, %1 \n\t"\ | 697 "add $"PTR_SIZE"*2, %1 \n\t"\ |
697 snow_inner_add_yblock_sse2_end_common1\ | 698 snow_inner_add_yblock_sse2_end_common1\ |
698 "sar $1, %%"REG_c" \n\t"\ | 699 "sar $1, %%"REG_c" \n\t"\ |
699 "sub $2, %%"REG_b" \n\t"\ | 700 "sub $2, %2 \n\t"\ |
700 snow_inner_add_yblock_sse2_end_common2 | 701 snow_inner_add_yblock_sse2_end_common2 |
701 | 702 |
702 #define snow_inner_add_yblock_sse2_end_16\ | 703 #define snow_inner_add_yblock_sse2_end_16\ |
703 "add $"PTR_SIZE"*1, %1 \n\t"\ | 704 "add $"PTR_SIZE"*1, %1 \n\t"\ |
704 snow_inner_add_yblock_sse2_end_common1\ | 705 snow_inner_add_yblock_sse2_end_common1\ |
705 "dec %%"REG_b" \n\t"\ | 706 "dec %2 \n\t"\ |
706 snow_inner_add_yblock_sse2_end_common2 | 707 snow_inner_add_yblock_sse2_end_common2 |
707 | 708 |
708 static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, | 709 static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, |
709 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ | 710 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
710 snow_inner_add_yblock_sse2_header | 711 snow_inner_add_yblock_sse2_header |
725 "paddd %%xmm3, %%xmm0 \n\t" | 726 "paddd %%xmm3, %%xmm0 \n\t" |
726 "paddd %%xmm3, %%xmm2 \n\t" | 727 "paddd %%xmm3, %%xmm2 \n\t" |
727 | 728 |
728 "mov %1, %%"REG_D" \n\t" | 729 "mov %1, %%"REG_D" \n\t" |
729 "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" | 730 "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" |
730 "add %2, %%"REG_D" \n\t" | 731 "add %3, %%"REG_D" \n\t" |
731 | 732 |
732 "movdqa (%%"REG_D"), %%xmm4 \n\t" | 733 "movdqa (%%"REG_D"), %%xmm4 \n\t" |
733 "movdqa %%xmm5, %%xmm6 \n\t" | 734 "movdqa %%xmm5, %%xmm6 \n\t" |
734 "punpckhwd %%xmm7, %%xmm5 \n\t" | 735 "punpckhwd %%xmm7, %%xmm5 \n\t" |
735 "punpcklwd %%xmm7, %%xmm6 \n\t" | 736 "punpcklwd %%xmm7, %%xmm6 \n\t" |
790 snow_inner_add_yblock_sse2_end_16 | 791 snow_inner_add_yblock_sse2_end_16 |
791 } | 792 } |
792 | 793 |
793 #define snow_inner_add_yblock_mmx_header \ | 794 #define snow_inner_add_yblock_mmx_header \ |
794 DWTELEM * * dst_array = sb->line + src_y;\ | 795 DWTELEM * * dst_array = sb->line + src_y;\ |
796 long tmp;\ | |
795 asm volatile(\ | 797 asm volatile(\ |
796 "mov %6, %%"REG_c" \n\t"\ | 798 "mov %7, %%"REG_c" \n\t"\ |
797 "mov %5, %%"REG_b" \n\t"\ | 799 "mov %6, %2 \n\t"\ |
798 "mov %3, %%"REG_S" \n\t"\ | 800 "mov %4, %%"REG_S" \n\t"\ |
799 "pxor %%mm7, %%mm7 \n\t" /* 0 */\ | 801 "pxor %%mm7, %%mm7 \n\t" /* 0 */\ |
800 "pcmpeqd %%mm3, %%mm3 \n\t"\ | 802 "pcmpeqd %%mm3, %%mm3 \n\t"\ |
801 "pslld $31, %%mm3 \n\t"\ | 803 "pslld $31, %%mm3 \n\t"\ |
802 "psrld $24, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ | 804 "psrld $24, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ |
803 "1: \n\t"\ | 805 "1: \n\t"\ |
804 "mov %1, %%"REG_D" \n\t"\ | 806 "mov %1, %%"REG_D" \n\t"\ |
805 "mov (%%"REG_D"), %%"REG_D" \n\t"\ | 807 "mov (%%"REG_D"), %%"REG_D" \n\t"\ |
806 "add %2, %%"REG_D" \n\t" | 808 "add %3, %%"REG_D" \n\t" |
807 | 809 |
808 #define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ | 810 #define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ |
809 "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ | 811 "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ |
810 "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ | 812 "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ |
811 "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ | 813 "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ |
855 "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ | 857 "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ |
856 "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ | 858 "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ |
857 "add %%"REG_c", (%%"REG_a") \n\t"\ | 859 "add %%"REG_c", (%%"REG_a") \n\t"\ |
858 "add $"PTR_SIZE"*1, %1 \n\t"\ | 860 "add $"PTR_SIZE"*1, %1 \n\t"\ |
859 "add %%"REG_c", %0 \n\t"\ | 861 "add %%"REG_c", %0 \n\t"\ |
860 "dec %%"REG_b" \n\t"\ | 862 "dec %2 \n\t"\ |
861 "jnz 1b \n\t"\ | 863 "jnz 1b \n\t"\ |
862 :"+m"(dst8),"+m"(dst_array)\ | 864 :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ |
863 :\ | 865 :\ |
864 "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ | 866 "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ |
865 "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); | 867 "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); |
866 | 868 |
867 static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, | 869 static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, |
868 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ | 870 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
869 snow_inner_add_yblock_mmx_header | 871 snow_inner_add_yblock_mmx_header |
870 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") | 872 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") |