comparison x86/h264dsp_mmx.c @ 12454:f4355cd85faa libavcodec

Port latest x264 deblock asm (before they moved to using NV12 as internal format), LGPL'ed with permission from Jason and Loren. This includes mmx2 code, so remove inline asm from h264dsp_mmx.c accordingly.
author rbultje
date Fri, 03 Sep 2010 16:52:46 +0000
parents 4c3e6ff1237e
children a5ddb39627fd
12453:35e1de8243c6 12454:f4355cd85faa
546 546 }
547 547 #endif
548 548
549 549 /***********************************/
550 550 /* deblocking */
551
552 // out: o = |x-y|>a
553 // clobbers: t
554 #define DIFF_GT_MMX(x,y,a,o,t)\
555 "movq "#y", "#t" \n\t"\
556 "movq "#x", "#o" \n\t"\
557 "psubusb "#x", "#t" \n\t"\
558 "psubusb "#y", "#o" \n\t"\
559 "por "#t", "#o" \n\t"\
560 "psubusb "#a", "#o" \n\t"
561
562 // out: o = |x-y|>a
563 // clobbers: t
564 #define DIFF_GT2_MMX(x,y,a,o,t)\
565 "movq "#y", "#t" \n\t"\
566 "movq "#x", "#o" \n\t"\
567 "psubusb "#x", "#t" \n\t"\
568 "psubusb "#y", "#o" \n\t"\
569 "psubusb "#a", "#t" \n\t"\
570 "psubusb "#a", "#o" \n\t"\
571 "pcmpeqb "#t", "#o" \n\t"\
572
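For reference, a scalar sketch of the saturated-arithmetic trick behind DIFF_GT_MMX: psubusb computes max(0, a-b) per byte, OR-ing both orders yields |x-y|, and one more saturating subtract leaves a nonzero byte exactly when |x-y| > a (DIFF_GT2_MMX additionally runs pcmpeqb, turning the per-byte result into an all-ones/all-zero mask suitable for pand). The helper names below are illustrative, not from the source.

#include <stdint.h>

/* scalar equivalent of psubusb: unsigned saturating subtract */
static uint8_t psubusb_scalar(uint8_t a, uint8_t b)
{
    return a > b ? (uint8_t)(a - b) : 0;
}

/* nonzero exactly when |x - y| > a, mirroring DIFF_GT_MMX */
static uint8_t diff_gt_scalar(uint8_t x, uint8_t y, uint8_t a)
{
    uint8_t t = psubusb_scalar(y, x);   /* max(0, y - x) */
    uint8_t o = psubusb_scalar(x, y);   /* max(0, x - y) */
    o |= t;                             /* |x - y| */
    return psubusb_scalar(o, a);        /* > a ? nonzero : 0 */
}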
573 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
574 // out: mm5=beta-1, mm7=mask
575 // clobbers: mm4,mm6
576 #define H264_DEBLOCK_MASK(alpha1, beta1) \
577 "pshufw $0, "#alpha1", %%mm4 \n\t"\
578 "pshufw $0, "#beta1 ", %%mm5 \n\t"\
579 "packuswb %%mm4, %%mm4 \n\t"\
580 "packuswb %%mm5, %%mm5 \n\t"\
581 DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
582 DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
583 "por %%mm4, %%mm7 \n\t"\
584 DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
585 "por %%mm4, %%mm7 \n\t"\
586 "pxor %%mm6, %%mm6 \n\t"\
587 "pcmpeqb %%mm6, %%mm7 \n\t"
588
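In scalar terms, H264_DEBLOCK_MASK builds a 0xFF/0x00 byte mask for the standard H.264 edge decision; since alpha1/beta1 hold alpha-1 and beta-1, testing "difference > alpha-1" is the same as "difference >= alpha". A minimal sketch of the per-pixel condition the mask encodes (function name is illustrative):

#include <stdint.h>

/* 1 where the edge gets filtered */
static int deblock_mask_scalar(uint8_t p1, uint8_t p0,
                               uint8_t q0, uint8_t q1,
                               int alpha, int beta)
{
    int dp0q0 = p0 > q0 ? p0 - q0 : q0 - p0;
    int dp1p0 = p1 > p0 ? p1 - p0 : p0 - p1;
    int dq1q0 = q1 > q0 ? q1 - q0 : q0 - q1;
    return dp0q0 < alpha && dp1p0 < beta && dq1q0 < beta;
}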
589 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
590 // out: mm1=p0' mm2=q0'
591 // clobbers: mm0,3-6
592 #define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
593 "movq %%mm1 , %%mm5 \n\t"\
594 "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\
595 "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\
596 "pcmpeqb %%mm4 , %%mm4 \n\t"\
597 "pxor %%mm4 , %%mm3 \n\t"\
598 "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
599 "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
600 "pxor %%mm1 , %%mm4 \n\t"\
601 "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
602 "pavgb %%mm5 , %%mm3 \n\t"\
603 "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\
604 "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
605 "psubusb %%mm3 , %%mm6 \n\t"\
606 "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
607 "pminub %%mm7 , %%mm6 \n\t"\
608 "pminub %%mm7 , %%mm3 \n\t"\
609 "psubusb %%mm6 , %%mm1 \n\t"\
610 "psubusb %%mm3 , %%mm2 \n\t"\
611 "paddusb %%mm3 , %%mm1 \n\t"\
612 "paddusb %%mm6 , %%mm2 \n\t"
613
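The pavgb/psubusb sequence above evaluates the H.264 normal-mode update of p0 and q0 entirely in 8-bit registers; below is a plain-C sketch of the quantity it computes (the standard delta, clipped to +/-tc, where tc here is the already per-pixel-adjusted value held in mm7). This is a readability aid, not the bit-exact asm path, and the names are illustrative.

#include <stdint.h>

static uint8_t clip_uint8_sketch(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* p0/q0 update from the H.264 normal filter; arithmetic right shift assumed */
static void filter_p0_q0_scalar(uint8_t *p0, uint8_t *q0,
                                uint8_t p1, uint8_t q1, int tc)
{
    int d = ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3;
    if (d < -tc) d = -tc;
    if (d >  tc) d =  tc;
    int np0 = clip_uint8_sketch(*p0 + d);
    int nq0 = clip_uint8_sketch(*q0 - d);
    *p0 = (uint8_t)np0;
    *q0 = (uint8_t)nq0;
}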
614 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %9=ff_bone
615 // out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
616 // clobbers: q2, tmp, tc0
617 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
618 "movq %%mm1, "#tmp" \n\t"\
619 "pavgb %%mm2, "#tmp" \n\t"\
620 "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\
621 "pxor "q2addr", "#tmp" \n\t"\
622 "pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\
623 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
624 "movq "#p1", "#tmp" \n\t"\
625 "psubusb "#tc0", "#tmp" \n\t"\
626 "paddusb "#p1", "#tc0" \n\t"\
627 "pmaxub "#tmp", "#q2" \n\t"\
628 "pminub "#tc0", "#q2" \n\t"\
629 "movq "#q2", "q1addr" \n\t"
630
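A scalar rendering of the comment above H264_DEBLOCK_Q1: the outer pixel is replaced by (q2 + ((p0+q0+1)>>1)) >> 1, clipped to q1 +/- tc0; the same macro serves the p side with p2/p1 substituted at the call site. Function name is illustrative.

#include <stdint.h>

static uint8_t deblock_q1_scalar(uint8_t p0, uint8_t q0,
                                 uint8_t q1, uint8_t q2, int tc0)
{
    int v  = (q2 + ((p0 + q0 + 1) >> 1)) >> 1;
    int lo = q1 - tc0;
    int hi = q1 + tc0;
    if (v < lo) v = lo;
    if (v > hi) v = hi;
    return (uint8_t)v;
}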
631 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
632 {
633 DECLARE_ALIGNED(8, uint64_t, tmp0)[2];
634
635 __asm__ volatile(
636 "movq (%2,%4), %%mm0 \n\t" //p1
637 "movq (%2,%4,2), %%mm1 \n\t" //p0
638 "movq (%3), %%mm2 \n\t" //q0
639 "movq (%3,%4), %%mm3 \n\t" //q1
640 H264_DEBLOCK_MASK(%7, %8)
641
642 "movd %6, %%mm4 \n\t"
643 "punpcklbw %%mm4, %%mm4 \n\t"
644 "punpcklwd %%mm4, %%mm4 \n\t"
645 "pcmpeqb %%mm3, %%mm3 \n\t"
646 "movq %%mm4, %%mm6 \n\t"
647 "pcmpgtb %%mm3, %%mm4 \n\t"
648 "movq %%mm6, %1 \n\t"
649 "pand %%mm4, %%mm7 \n\t"
650 "movq %%mm7, %0 \n\t"
651
652 /* filter p1 */
653 "movq (%2), %%mm3 \n\t" //p2
654 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
655 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta
656 "pand %1, %%mm7 \n\t" // mask & tc0
657 "movq %%mm7, %%mm4 \n\t"
658 "psubb %%mm6, %%mm7 \n\t"
659 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0
660 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)
661
662 /* filter q1 */
663 "movq (%3,%4,2), %%mm4 \n\t" //q2
664 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
665 "pand %0, %%mm6 \n\t"
666 "movq %1, %%mm5 \n\t" // can be merged with the and below but is slower then
667 "pand %%mm6, %%mm5 \n\t"
668 "psubb %%mm6, %%mm7 \n\t"
669 "movq (%3,%4), %%mm3 \n\t"
670 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)
671
672 /* filter p0, q0 */
673 H264_DEBLOCK_P0_Q0(%9, unused)
674 "movq %%mm1, (%2,%4,2) \n\t"
675 "movq %%mm2, (%3) \n\t"
676
677 : "=m"(tmp0[0]), "=m"(tmp0[1])
678 : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
679 "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
680 "m"(ff_bone)
681 );
682 }
683
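Inside h264_loop_filter_luma_mmx2, the movd/punpcklbw/punpcklwd/pcmpgtb sequence expands the two int8_t tc0 entries covering this 8-pixel edge into per-pixel data: each entry is replicated across its 4-pixel group, and groups with tc0 == -1 are masked out. A scalar sketch of that expansion (names are illustrative):

#include <stdint.h>

static void expand_tc0_scalar(const int8_t *tc0,
                              int8_t tc_per_pixel[8], uint8_t enabled[8])
{
    for (int i = 0; i < 8; i++) {
        tc_per_pixel[i] = tc0[i / 4];                    /* replicate per 4-pixel group */
        enabled[i]      = tc0[i / 4] >= 0 ? 0xFF : 0x00; /* tc0 == -1 disables the group */
    }
}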
684 static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
685 {
686 if((tc0[0] & tc0[1]) >= 0)
687 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
688 if((tc0[2] & tc0[3]) >= 0)
689 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
690 }
691 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
692 {
693 //FIXME: could cut some load/stores by merging transpose with filter
694 // also, it only needs to transpose 6x8
695 DECLARE_ALIGNED(8, uint8_t, trans)[8*8];
696 int i;
697 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
698 if((tc0[0] & tc0[1]) < 0)
699 continue;
700 transpose4x4(trans, pix-4, 8, stride);
701 transpose4x4(trans +4*8, pix, 8, stride);
702 transpose4x4(trans+4, pix-4+4*stride, 8, stride);
703 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
704 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
705 transpose4x4(pix-2, trans +2*8, stride, 8);
706 transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
707 }
708 }
709
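The horizontal luma filter above reuses the vertical one by transposing the edge into an 8x8 scratch buffer, filtering, and transposing the rows that can change (p1, p0, q0, q1) back, as the FIXME notes. A plain-C equivalent of the 4x4 byte transpose it relies on, with the argument order assumed from the call sites (dst, src, dst_stride, src_stride):

#include <stdint.h>

static void transpose4x4_c(uint8_t *dst, const uint8_t *src,
                           int dst_stride, int src_stride)
{
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            dst[i * dst_stride + j] = src[j * src_stride + i];
}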
710 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
711 {
712 __asm__ volatile(
713 "movq (%0), %%mm0 \n\t" //p1
714 "movq (%0,%2), %%mm1 \n\t" //p0
715 "movq (%1), %%mm2 \n\t" //q0
716 "movq (%1,%2), %%mm3 \n\t" //q1
717 H264_DEBLOCK_MASK(%4, %5)
718 "movd %3, %%mm6 \n\t"
719 "punpcklbw %%mm6, %%mm6 \n\t"
720 "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask
721 H264_DEBLOCK_P0_Q0(%6, %7)
722 "movq %%mm1, (%0,%2) \n\t"
723 "movq %%mm2, (%1) \n\t"
724
725 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
726 "r"(*(uint32_t*)tc0),
727 "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
728 );
729 }
730
731 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
732 {
733 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
734 }
735
736 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
737 {
738 //FIXME: could cut some load/stores by merging transpose with filter
739 DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
740 transpose4x4(trans, pix-2, 8, stride);
741 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
742 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
743 transpose4x4(pix-2, trans, stride, 8);
744 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
745 }
746
747 // p0 = (p0 + q1 + 2*p1 + 2) >> 2
748 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \
749 "movq "#p0", %%mm4 \n\t"\
750 "pxor "#q1", %%mm4 \n\t"\
751 "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\
752 "pavgb "#q1", "#p0" \n\t"\
753 "psubusb %%mm4, "#p0" \n\t"\
754 "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
755
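H264_FILTER_CHROMA4 is the chroma-intra update p0' = (2*p1 + p0 + q1 + 2) >> 2; the two pavgb averages plus the (p0^q1)&1 parity correction reach that value without any intermediate exceeding 8 bits (pavgb computes (a+b+1)>>1, and subtracting the parity bit turns the first average into a floor). A scalar sketch of both routes, with illustrative names:

#include <stdint.h>

static uint8_t pavgb_scalar(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1);
}

/* direct form of the comment above the macro */
static uint8_t filter_chroma4_scalar(uint8_t p0, uint8_t p1, uint8_t q1)
{
    return (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2);
}

/* the pavgb-based route the asm takes; matches the direct form for all 8-bit inputs */
static uint8_t filter_chroma4_pavgb(uint8_t p0, uint8_t p1, uint8_t q1)
{
    uint8_t avg = (uint8_t)(pavgb_scalar(p0, q1) - ((p0 ^ q1) & 1)); /* floor((p0+q1)/2) */
    return pavgb_scalar(p1, avg);
}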
756 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
757 {
758 __asm__ volatile(
759 "movq (%0), %%mm0 \n\t"
760 "movq (%0,%2), %%mm1 \n\t"
761 "movq (%1), %%mm2 \n\t"
762 "movq (%1,%2), %%mm3 \n\t"
763 H264_DEBLOCK_MASK(%3, %4)
764 "movq %%mm1, %%mm5 \n\t"
765 "movq %%mm2, %%mm6 \n\t"
766 H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
767 H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
768 "psubb %%mm5, %%mm1 \n\t"
769 "psubb %%mm6, %%mm2 \n\t"
770 "pand %%mm7, %%mm1 \n\t"
771 "pand %%mm7, %%mm2 \n\t"
772 "paddb %%mm5, %%mm1 \n\t"
773 "paddb %%mm6, %%mm2 \n\t"
774 "movq %%mm1, (%0,%2) \n\t"
775 "movq %%mm2, (%1) \n\t"
776 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
777 "m"(alpha1), "m"(beta1), "m"(ff_bone)
778 );
779 }
780
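The psubb/pand/paddb tail of h264_loop_filter_chroma_intra_mmx2 is a branch-free select: out = orig + ((filtered - orig) & mask), so bytes where the mask is 0x00 keep their original value and bytes where it is 0xFF take the filtered one. Scalar sketch (the wrap-around arithmetic is intentional, matching psubb/paddb; the name is illustrative):

#include <stdint.h>

static uint8_t masked_select_u8(uint8_t orig, uint8_t filtered, uint8_t mask)
{
    return (uint8_t)(orig + ((uint8_t)(filtered - orig) & mask));
}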
781 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
782 {
783 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
784 }
785
786 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
787 {
788 //FIXME: could cut some load/stores by merging transpose with filter
789 DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
790 transpose4x4(trans, pix-2, 8, stride);
791 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
792 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
793 transpose4x4(pix-2, trans, stride, 8);
794 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
795 }
796 551
797 552 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
798 553 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
799 554 int dir;
800 555 __asm__ volatile(
916 671 ::"r"(bS[0])
917 672 :"memory"
918 673 );
919 674 }
920 675
676 #define LF_FUNC(DIR, TYPE, OPT) \
677 void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
678 int alpha, int beta, int8_t *tc0);
679 #define LF_IFUNC(DIR, TYPE, OPT) \
680 void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
681 int alpha, int beta);
682
683 LF_FUNC (h, chroma, mmxext)
684 LF_IFUNC(h, chroma_intra, mmxext)
685 LF_FUNC (v, chroma, mmxext)
686 LF_IFUNC(v, chroma_intra, mmxext)
687
688 LF_FUNC (h, luma, mmxext)
689 LF_IFUNC(h, luma_intra, mmxext)
690 #if HAVE_YASM && ARCH_X86_32
691 LF_FUNC (v8, luma, mmxext)
692 static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
693 {
694 if((tc0[0] & tc0[1]) >= 0)
695 ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
696 if((tc0[2] & tc0[3]) >= 0)
697 ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
698 }
699 LF_IFUNC(v8, luma_intra, mmxext)
700 static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
701 {
702 ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
703 ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
704 }
705 #endif
706
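The (tc0[0] & tc0[1]) >= 0 guard in ff_x264_deblock_v_luma_mmxext (and in the removed mmx2 wrappers above) relies on tc0 entries being either -1, meaning "do not filter this 4-pixel group", or a small non-negative clip value: the bitwise AND is negative only when both entries are negative, so a half-edge is skipped only when both of its groups are disabled. An equivalent explicit test, for illustration only:

#include <stdint.h>

static int half_edge_enabled(const int8_t *tc0)
{
    /* same condition as (tc0[0] & tc0[1]) >= 0 for int8_t inputs */
    return !(tc0[0] < 0 && tc0[1] < 0);
}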
707 LF_FUNC (h, luma, sse2)
708 LF_IFUNC(h, luma_intra, sse2)
709 LF_FUNC (v, luma, sse2)
710 LF_IFUNC(v, luma_intra, sse2)
711
921 712 /***********************************/
922 713 /* weighted prediction */
923 714
924 715 #define H264_WEIGHT(W, H, OPT) \
925 716 void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
947 738 H264_BIWEIGHT_MMX_SSE( 8, 4)
948 739 H264_BIWEIGHT_MMX ( 4, 8)
949 740 H264_BIWEIGHT_MMX ( 4, 4)
950 741 H264_BIWEIGHT_MMX ( 4, 2)
951 742
952 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
953 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
954 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
955 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
956 void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
957
958 #if HAVE_YASM && ARCH_X86_32
959 void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
960 static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
961 {
962 ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
963 ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
964 }
965 #endif
966
967 743 void ff_h264dsp_init_x86(H264DSPContext *c)
968 744 {
969 745 int mm_flags = mm_support();
970 746
971 747 if (mm_flags & FF_MM_MMX) {
985 761 c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
986 762 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
987 763 c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
988 764 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
989 765
990 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
991 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
992 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
993 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
994 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
995 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
996 766 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
997 767 }
998 768 if(mm_flags & FF_MM_SSE2){
999 769 c->h264_idct8_add = ff_h264_idct8_add_sse2;
1000 770 c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
1001 771 }
1002 772
1003 773 #if HAVE_YASM
1004 774 if (mm_flags & FF_MM_MMX2){
775 c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext;
776 c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext;
777 c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext;
778 c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext;
1005 779 #if ARCH_X86_32
780 c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext;
781 c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext;
1006 782 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
1007 783 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
1008 784 #endif
1009 785 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
1010 786 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;