Mercurial > libavcodec.hg
comparison x86/h264dsp_mmx.c @ 10955:fdddf3d4238f libavcodec
Use two separate memory arguments since 8+() is invalid gas syntax
author | conrad |
---|---|
date | Thu, 21 Jan 2010 09:46:57 +0000 |
parents | eb9a2581f50e |
children | 34a65026fa06 |
comparison
legend: equal | deleted | inserted | replaced
10954:d7ef6611a49e | 10955:fdddf3d4238f |
---|---|
615 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ | 615 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ |
616 "movq %%mm1, "#tmp" \n\t"\ | 616 "movq %%mm1, "#tmp" \n\t"\ |
617 "pavgb %%mm2, "#tmp" \n\t"\ | 617 "pavgb %%mm2, "#tmp" \n\t"\ |
618 "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ | 618 "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ |
619 "pxor "q2addr", "#tmp" \n\t"\ | 619 "pxor "q2addr", "#tmp" \n\t"\ |
620 "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ | 620 "pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ |
621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ | 621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ |
622 "movq "#p1", "#tmp" \n\t"\ | 622 "movq "#p1", "#tmp" \n\t"\ |
623 "psubusb "#tc0", "#tmp" \n\t"\ | 623 "psubusb "#tc0", "#tmp" \n\t"\ |
624 "paddusb "#p1", "#tc0" \n\t"\ | 624 "paddusb "#p1", "#tc0" \n\t"\ |
625 "pmaxub "#tmp", "#q2" \n\t"\ | 625 "pmaxub "#tmp", "#q2" \n\t"\ |
629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) | 629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) |
630 { | 630 { |
631 DECLARE_ALIGNED_8(uint64_t, tmp0[2]); | 631 DECLARE_ALIGNED_8(uint64_t, tmp0[2]); |
632 | 632 |
633 __asm__ volatile( | 633 __asm__ volatile( |
634 "movq (%1,%3), %%mm0 \n\t" //p1 | 634 "movq (%2,%4), %%mm0 \n\t" //p1 |
635 "movq (%1,%3,2), %%mm1 \n\t" //p0 | 635 "movq (%2,%4,2), %%mm1 \n\t" //p0 |
636 "movq (%2), %%mm2 \n\t" //q0 | 636 "movq (%3), %%mm2 \n\t" //q0 |
637 "movq (%2,%3), %%mm3 \n\t" //q1 | 637 "movq (%3,%4), %%mm3 \n\t" //q1 |
638 H264_DEBLOCK_MASK(%6, %7) | 638 H264_DEBLOCK_MASK(%7, %8) |
639 | 639 |
640 "movd %5, %%mm4 \n\t" | 640 "movd %6, %%mm4 \n\t" |
641 "punpcklbw %%mm4, %%mm4 \n\t" | 641 "punpcklbw %%mm4, %%mm4 \n\t" |
642 "punpcklwd %%mm4, %%mm4 \n\t" | 642 "punpcklwd %%mm4, %%mm4 \n\t" |
643 "pcmpeqb %%mm3, %%mm3 \n\t" | 643 "pcmpeqb %%mm3, %%mm3 \n\t" |
644 "movq %%mm4, %%mm6 \n\t" | 644 "movq %%mm4, %%mm6 \n\t" |
645 "pcmpgtb %%mm3, %%mm4 \n\t" | 645 "pcmpgtb %%mm3, %%mm4 \n\t" |
646 "movq %%mm6, 8+%0 \n\t" | 646 "movq %%mm6, %1 \n\t" |
647 "pand %%mm4, %%mm7 \n\t" | 647 "pand %%mm4, %%mm7 \n\t" |
648 "movq %%mm7, %0 \n\t" | 648 "movq %%mm7, %0 \n\t" |
649 | 649 |
650 /* filter p1 */ | 650 /* filter p1 */ |
651 "movq (%1), %%mm3 \n\t" //p2 | 651 "movq (%2), %%mm3 \n\t" //p2 |
652 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 | 652 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 |
653 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta | 653 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta |
654 "pand 8+%0, %%mm7 \n\t" // mask & tc0 | 654 "pand %1, %%mm7 \n\t" // mask & tc0 |
655 "movq %%mm7, %%mm4 \n\t" | 655 "movq %%mm7, %%mm4 \n\t" |
656 "psubb %%mm6, %%mm7 \n\t" | 656 "psubb %%mm6, %%mm7 \n\t" |
657 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 | 657 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 |
658 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) | 658 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4) |
659 | 659 |
660 /* filter q1 */ | 660 /* filter q1 */ |
661 "movq (%2,%3,2), %%mm4 \n\t" //q2 | 661 "movq (%3,%4,2), %%mm4 \n\t" //q2 |
662 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 | 662 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 |
663 "pand %0, %%mm6 \n\t" | 663 "pand %0, %%mm6 \n\t" |
664 "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then | 664 "movq %1, %%mm5 \n\t" // can be merged with the and below but is slower then |
665 "pand %%mm6, %%mm5 \n\t" | 665 "pand %%mm6, %%mm5 \n\t" |
666 "psubb %%mm6, %%mm7 \n\t" | 666 "psubb %%mm6, %%mm7 \n\t" |
667 "movq (%2,%3), %%mm3 \n\t" | 667 "movq (%3,%4), %%mm3 \n\t" |
668 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6) | 668 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6) |
669 | 669 |
670 /* filter p0, q0 */ | 670 /* filter p0, q0 */ |
671 H264_DEBLOCK_P0_Q0(%8, unused) | 671 H264_DEBLOCK_P0_Q0(%9, unused) |
672 "movq %%mm1, (%1,%3,2) \n\t" | 672 "movq %%mm1, (%2,%4,2) \n\t" |
673 "movq %%mm2, (%2) \n\t" | 673 "movq %%mm2, (%3) \n\t" |
674 | 674 |
675 : "=m"(*tmp0) | 675 : "=m"(tmp0[0]), "=m"(tmp0[1]) |
676 : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), | 676 : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), |
677 "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), | 677 "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), |
678 "m"(ff_bone) | 678 "m"(ff_bone) |
679 ); | 679 ); |
680 } | 680 } |