comparison x86/h264dsp_mmx.c @ 10955:fdddf3d4238f libavcodec

Use two separate memory operands for the two halves of tmp0, since "8+%0" can expand to "8+(reg)", which is invalid gas syntax
author conrad
date Thu, 21 Jan 2010 09:46:57 +0000
parents eb9a2581f50e
children 34a65026fa06
comparison
equal deleted inserted replaced
10954:d7ef6611a49e 10955:fdddf3d4238f
615 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ 615 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
616 "movq %%mm1, "#tmp" \n\t"\ 616 "movq %%mm1, "#tmp" \n\t"\
617 "pavgb %%mm2, "#tmp" \n\t"\ 617 "pavgb %%mm2, "#tmp" \n\t"\
618 "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ 618 "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\
619 "pxor "q2addr", "#tmp" \n\t"\ 619 "pxor "q2addr", "#tmp" \n\t"\
620 "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ 620 "pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\
621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ 621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
622 "movq "#p1", "#tmp" \n\t"\ 622 "movq "#p1", "#tmp" \n\t"\
623 "psubusb "#tc0", "#tmp" \n\t"\ 623 "psubusb "#tc0", "#tmp" \n\t"\
624 "paddusb "#p1", "#tc0" \n\t"\ 624 "paddusb "#p1", "#tc0" \n\t"\
625 "pmaxub "#tmp", "#q2" \n\t"\ 625 "pmaxub "#tmp", "#q2" \n\t"\
629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
630 { 630 {
631 DECLARE_ALIGNED_8(uint64_t, tmp0[2]); 631 DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
632 632
633 __asm__ volatile( 633 __asm__ volatile(
634 "movq (%1,%3), %%mm0 \n\t" //p1 634 "movq (%2,%4), %%mm0 \n\t" //p1
635 "movq (%1,%3,2), %%mm1 \n\t" //p0 635 "movq (%2,%4,2), %%mm1 \n\t" //p0
636 "movq (%2), %%mm2 \n\t" //q0 636 "movq (%3), %%mm2 \n\t" //q0
637 "movq (%2,%3), %%mm3 \n\t" //q1 637 "movq (%3,%4), %%mm3 \n\t" //q1
638 H264_DEBLOCK_MASK(%6, %7) 638 H264_DEBLOCK_MASK(%7, %8)
639 639
640 "movd %5, %%mm4 \n\t" 640 "movd %6, %%mm4 \n\t"
641 "punpcklbw %%mm4, %%mm4 \n\t" 641 "punpcklbw %%mm4, %%mm4 \n\t"
642 "punpcklwd %%mm4, %%mm4 \n\t" 642 "punpcklwd %%mm4, %%mm4 \n\t"
643 "pcmpeqb %%mm3, %%mm3 \n\t" 643 "pcmpeqb %%mm3, %%mm3 \n\t"
644 "movq %%mm4, %%mm6 \n\t" 644 "movq %%mm4, %%mm6 \n\t"
645 "pcmpgtb %%mm3, %%mm4 \n\t" 645 "pcmpgtb %%mm3, %%mm4 \n\t"
646 "movq %%mm6, 8+%0 \n\t" 646 "movq %%mm6, %1 \n\t"
647 "pand %%mm4, %%mm7 \n\t" 647 "pand %%mm4, %%mm7 \n\t"
648 "movq %%mm7, %0 \n\t" 648 "movq %%mm7, %0 \n\t"
649 649
650 /* filter p1 */ 650 /* filter p1 */
651 "movq (%1), %%mm3 \n\t" //p2 651 "movq (%2), %%mm3 \n\t" //p2
652 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 652 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
653 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta 653 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta
654 "pand 8+%0, %%mm7 \n\t" // mask & tc0 654 "pand %1, %%mm7 \n\t" // mask & tc0
655 "movq %%mm7, %%mm4 \n\t" 655 "movq %%mm7, %%mm4 \n\t"
656 "psubb %%mm6, %%mm7 \n\t" 656 "psubb %%mm6, %%mm7 \n\t"
657 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 657 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0
658 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) 658 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)
659 659
660 /* filter q1 */ 660 /* filter q1 */
661 "movq (%2,%3,2), %%mm4 \n\t" //q2 661 "movq (%3,%4,2), %%mm4 \n\t" //q2
662 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 662 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
663 "pand %0, %%mm6 \n\t" 663 "pand %0, %%mm6 \n\t"
664 "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then 664 "movq %1, %%mm5 \n\t" // can be merged with the and below but is slower then
665 "pand %%mm6, %%mm5 \n\t" 665 "pand %%mm6, %%mm5 \n\t"
666 "psubb %%mm6, %%mm7 \n\t" 666 "psubb %%mm6, %%mm7 \n\t"
667 "movq (%2,%3), %%mm3 \n\t" 667 "movq (%3,%4), %%mm3 \n\t"
668 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6) 668 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)
669 669
670 /* filter p0, q0 */ 670 /* filter p0, q0 */
671 H264_DEBLOCK_P0_Q0(%8, unused) 671 H264_DEBLOCK_P0_Q0(%9, unused)
672 "movq %%mm1, (%1,%3,2) \n\t" 672 "movq %%mm1, (%2,%4,2) \n\t"
673 "movq %%mm2, (%2) \n\t" 673 "movq %%mm2, (%3) \n\t"
674 674
675 : "=m"(*tmp0) 675 : "=m"(tmp0[0]), "=m"(tmp0[1])
676 : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), 676 : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
677 "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), 677 "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
678 "m"(ff_bone) 678 "m"(ff_bone)
679 ); 679 );
680 } 680 }