Mercurial > libavcodec.hg
comparison x86/h264dsp_mmx.c @ 12454:f4355cd85faa libavcodec
Port latest x264 deblock asm (before they moved to using NV12 as internal
format), LGPL'ed with permission from Jason and Loren. This includes mmx2
code, so remove inline asm from h264dsp_mmx.c accordingly.
author | rbultje |
---|---|
date | Fri, 03 Sep 2010 16:52:46 +0000 |
parents | 4c3e6ff1237e |
children | a5ddb39627fd |
comparison
equal
deleted
inserted
replaced
12453:35e1de8243c6 | 12454:f4355cd85faa |
---|---|
546 } | 546 } |
547 #endif | 547 #endif |
548 | 548 |
549 /***********************************/ | 549 /***********************************/ |
550 /* deblocking */ | 550 /* deblocking */ |
551 | |
552 // out: o = |x-y|>a | |
553 // clobbers: t | |
554 #define DIFF_GT_MMX(x,y,a,o,t)\ | |
555 "movq "#y", "#t" \n\t"\ | |
556 "movq "#x", "#o" \n\t"\ | |
557 "psubusb "#x", "#t" \n\t"\ | |
558 "psubusb "#y", "#o" \n\t"\ | |
559 "por "#t", "#o" \n\t"\ | |
560 "psubusb "#a", "#o" \n\t" | |
561 | |
562 // out: o = |x-y|>a | |
563 // clobbers: t | |
564 #define DIFF_GT2_MMX(x,y,a,o,t)\ | |
565 "movq "#y", "#t" \n\t"\ | |
566 "movq "#x", "#o" \n\t"\ | |
567 "psubusb "#x", "#t" \n\t"\ | |
568 "psubusb "#y", "#o" \n\t"\ | |
569 "psubusb "#a", "#t" \n\t"\ | |
570 "psubusb "#a", "#o" \n\t"\ | |
571 "pcmpeqb "#t", "#o" \n\t"\ | |
572 | |
573 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 | |
574 // out: mm5=beta-1, mm7=mask | |
575 // clobbers: mm4,mm6 | |
576 #define H264_DEBLOCK_MASK(alpha1, beta1) \ | |
577 "pshufw $0, "#alpha1", %%mm4 \n\t"\ | |
578 "pshufw $0, "#beta1 ", %%mm5 \n\t"\ | |
579 "packuswb %%mm4, %%mm4 \n\t"\ | |
580 "packuswb %%mm5, %%mm5 \n\t"\ | |
581 DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\ | |
582 DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\ | |
583 "por %%mm4, %%mm7 \n\t"\ | |
584 DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\ | |
585 "por %%mm4, %%mm7 \n\t"\ | |
586 "pxor %%mm6, %%mm6 \n\t"\ | |
587 "pcmpeqb %%mm6, %%mm7 \n\t" | |
588 | |
589 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) | |
590 // out: mm1=p0' mm2=q0' | |
591 // clobbers: mm0,3-6 | |
592 #define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ | |
593 "movq %%mm1 , %%mm5 \n\t"\ | |
594 "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ | |
595 "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ | |
596 "pcmpeqb %%mm4 , %%mm4 \n\t"\ | |
597 "pxor %%mm4 , %%mm3 \n\t"\ | |
598 "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ | |
599 "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ | |
600 "pxor %%mm1 , %%mm4 \n\t"\ | |
601 "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ | |
602 "pavgb %%mm5 , %%mm3 \n\t"\ | |
603 "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ | |
604 "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ | |
605 "psubusb %%mm3 , %%mm6 \n\t"\ | |
606 "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ | |
607 "pminub %%mm7 , %%mm6 \n\t"\ | |
608 "pminub %%mm7 , %%mm3 \n\t"\ | |
609 "psubusb %%mm6 , %%mm1 \n\t"\ | |
610 "psubusb %%mm3 , %%mm2 \n\t"\ | |
611 "paddusb %%mm3 , %%mm1 \n\t"\ | |
612 "paddusb %%mm6 , %%mm2 \n\t" | |
613 | |
614 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone | |
615 // out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) | |
616 // clobbers: q2, tmp, tc0 | |
617 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ | |
618 "movq %%mm1, "#tmp" \n\t"\ | |
619 "pavgb %%mm2, "#tmp" \n\t"\ | |
620 "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ | |
621 "pxor "q2addr", "#tmp" \n\t"\ | |
622 "pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ | |
623 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ | |
624 "movq "#p1", "#tmp" \n\t"\ | |
625 "psubusb "#tc0", "#tmp" \n\t"\ | |
626 "paddusb "#p1", "#tc0" \n\t"\ | |
627 "pmaxub "#tmp", "#q2" \n\t"\ | |
628 "pminub "#tc0", "#q2" \n\t"\ | |
629 "movq "#q2", "q1addr" \n\t" | |
630 | |
631 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) | |
632 { | |
633 DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; | |
634 | |
635 __asm__ volatile( | |
636 "movq (%2,%4), %%mm0 \n\t" //p1 | |
637 "movq (%2,%4,2), %%mm1 \n\t" //p0 | |
638 "movq (%3), %%mm2 \n\t" //q0 | |
639 "movq (%3,%4), %%mm3 \n\t" //q1 | |
640 H264_DEBLOCK_MASK(%7, %8) | |
641 | |
642 "movd %6, %%mm4 \n\t" | |
643 "punpcklbw %%mm4, %%mm4 \n\t" | |
644 "punpcklwd %%mm4, %%mm4 \n\t" | |
645 "pcmpeqb %%mm3, %%mm3 \n\t" | |
646 "movq %%mm4, %%mm6 \n\t" | |
647 "pcmpgtb %%mm3, %%mm4 \n\t" | |
648 "movq %%mm6, %1 \n\t" | |
649 "pand %%mm4, %%mm7 \n\t" | |
650 "movq %%mm7, %0 \n\t" | |
651 | |
652 /* filter p1 */ | |
653 "movq (%2), %%mm3 \n\t" //p2 | |
654 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 | |
655 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta | |
656 "pand %1, %%mm7 \n\t" // mask & tc0 | |
657 "movq %%mm7, %%mm4 \n\t" | |
658 "psubb %%mm6, %%mm7 \n\t" | |
659 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 | |
660 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4) | |
661 | |
662 /* filter q1 */ | |
663 "movq (%3,%4,2), %%mm4 \n\t" //q2 | |
664 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 | |
665 "pand %0, %%mm6 \n\t" | |
666 "movq %1, %%mm5 \n\t" // can be merged with the and below but is slower than | |
667 "pand %%mm6, %%mm5 \n\t" | |
668 "psubb %%mm6, %%mm7 \n\t" | |
669 "movq (%3,%4), %%mm3 \n\t" | |
670 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6) | |
671 | |
672 /* filter p0, q0 */ | |
673 H264_DEBLOCK_P0_Q0(%9, unused) | |
674 "movq %%mm1, (%2,%4,2) \n\t" | |
675 "movq %%mm2, (%3) \n\t" | |
676 | |
677 : "=m"(tmp0[0]), "=m"(tmp0[1]) | |
678 : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), | |
679 "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), | |
680 "m"(ff_bone) | |
681 ); | |
682 } | |
683 | |
684 static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | |
685 { | |
686 if((tc0[0] & tc0[1]) >= 0) | |
687 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); | |
688 if((tc0[2] & tc0[3]) >= 0) | |
689 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); | |
690 } | |
691 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | |
692 { | |
693 //FIXME: could cut some load/stores by merging transpose with filter | |
694 // also, it only needs to transpose 6x8 | |
695 DECLARE_ALIGNED(8, uint8_t, trans)[8*8]; | |
696 int i; | |
697 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { | |
698 if((tc0[0] & tc0[1]) < 0) | |
699 continue; | |
700 transpose4x4(trans, pix-4, 8, stride); | |
701 transpose4x4(trans +4*8, pix, 8, stride); | |
702 transpose4x4(trans+4, pix-4+4*stride, 8, stride); | |
703 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); | |
704 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); | |
705 transpose4x4(pix-2, trans +2*8, stride, 8); | |
706 transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8); | |
707 } | |
708 } | |
709 | |
710 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) | |
711 { | |
712 __asm__ volatile( | |
713 "movq (%0), %%mm0 \n\t" //p1 | |
714 "movq (%0,%2), %%mm1 \n\t" //p0 | |
715 "movq (%1), %%mm2 \n\t" //q0 | |
716 "movq (%1,%2), %%mm3 \n\t" //q1 | |
717 H264_DEBLOCK_MASK(%4, %5) | |
718 "movd %3, %%mm6 \n\t" | |
719 "punpcklbw %%mm6, %%mm6 \n\t" | |
720 "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask | |
721 H264_DEBLOCK_P0_Q0(%6, %7) | |
722 "movq %%mm1, (%0,%2) \n\t" | |
723 "movq %%mm2, (%1) \n\t" | |
724 | |
725 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), | |
726 "r"(*(uint32_t*)tc0), | |
727 "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F) | |
728 ); | |
729 } | |
730 | |
731 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | |
732 { | |
733 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); | |
734 } | |
735 | |
736 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | |
737 { | |
738 //FIXME: could cut some load/stores by merging transpose with filter | |
739 DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; | |
740 transpose4x4(trans, pix-2, 8, stride); | |
741 transpose4x4(trans+4, pix-2+4*stride, 8, stride); | |
742 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); | |
743 transpose4x4(pix-2, trans, stride, 8); | |
744 transpose4x4(pix-2+4*stride, trans+4, stride, 8); | |
745 } | |
746 | |
747 // p0 = (p0 + q1 + 2*p1 + 2) >> 2 | |
748 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \ | |
749 "movq "#p0", %%mm4 \n\t"\ | |
750 "pxor "#q1", %%mm4 \n\t"\ | |
751 "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ | |
752 "pavgb "#q1", "#p0" \n\t"\ | |
753 "psubusb %%mm4, "#p0" \n\t"\ | |
754 "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ | |
755 | |
756 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) | |
757 { | |
758 __asm__ volatile( | |
759 "movq (%0), %%mm0 \n\t" | |
760 "movq (%0,%2), %%mm1 \n\t" | |
761 "movq (%1), %%mm2 \n\t" | |
762 "movq (%1,%2), %%mm3 \n\t" | |
763 H264_DEBLOCK_MASK(%3, %4) | |
764 "movq %%mm1, %%mm5 \n\t" | |
765 "movq %%mm2, %%mm6 \n\t" | |
766 H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' | |
767 H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' | |
768 "psubb %%mm5, %%mm1 \n\t" | |
769 "psubb %%mm6, %%mm2 \n\t" | |
770 "pand %%mm7, %%mm1 \n\t" | |
771 "pand %%mm7, %%mm2 \n\t" | |
772 "paddb %%mm5, %%mm1 \n\t" | |
773 "paddb %%mm6, %%mm2 \n\t" | |
774 "movq %%mm1, (%0,%2) \n\t" | |
775 "movq %%mm2, (%1) \n\t" | |
776 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), | |
777 "m"(alpha1), "m"(beta1), "m"(ff_bone) | |
778 ); | |
779 } | |
780 | |
781 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) | |
782 { | |
783 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); | |
784 } | |
785 | |
786 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) | |
787 { | |
788 //FIXME: could cut some load/stores by merging transpose with filter | |
789 DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; | |
790 transpose4x4(trans, pix-2, 8, stride); | |
791 transpose4x4(trans+4, pix-2+4*stride, 8, stride); | |
792 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); | |
793 transpose4x4(pix-2, trans, stride, 8); | |
794 transpose4x4(pix-2+4*stride, trans+4, stride, 8); | |
795 } | |
796 | 551 |
797 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], | 552 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], |
798 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { | 553 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { |
799 int dir; | 554 int dir; |
800 __asm__ volatile( | 555 __asm__ volatile( |
916 ::"r"(bS[0]) | 671 ::"r"(bS[0]) |
917 :"memory" | 672 :"memory" |
918 ); | 673 ); |
919 } | 674 } |
920 | 675 |
676 #define LF_FUNC(DIR, TYPE, OPT) \ | |
677 void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ | |
678 int alpha, int beta, int8_t *tc0); | |
679 #define LF_IFUNC(DIR, TYPE, OPT) \ | |
680 void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ | |
681 int alpha, int beta); | |
682 | |
683 LF_FUNC (h, chroma, mmxext) | |
684 LF_IFUNC(h, chroma_intra, mmxext) | |
685 LF_FUNC (v, chroma, mmxext) | |
686 LF_IFUNC(v, chroma_intra, mmxext) | |
687 | |
688 LF_FUNC (h, luma, mmxext) | |
689 LF_IFUNC(h, luma_intra, mmxext) | |
690 #if HAVE_YASM && ARCH_X86_32 | |
691 LF_FUNC (v8, luma, mmxext) | |
692 static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | |
693 { | |
694 if((tc0[0] & tc0[1]) >= 0) | |
695 ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0); | |
696 if((tc0[2] & tc0[3]) >= 0) | |
697 ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2); | |
698 } | |
699 LF_IFUNC(v8, luma_intra, mmxext) | |
700 static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) | |
701 { | |
702 ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); | |
703 ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); | |
704 } | |
705 #endif | |
706 | |
707 LF_FUNC (h, luma, sse2) | |
708 LF_IFUNC(h, luma_intra, sse2) | |
709 LF_FUNC (v, luma, sse2) | |
710 LF_IFUNC(v, luma_intra, sse2) | |
711 | |
921 /***********************************/ | 712 /***********************************/ |
922 /* weighted prediction */ | 713 /* weighted prediction */ |
923 | 714 |
924 #define H264_WEIGHT(W, H, OPT) \ | 715 #define H264_WEIGHT(W, H, OPT) \ |
925 void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ | 716 void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ |
947 H264_BIWEIGHT_MMX_SSE( 8, 4) | 738 H264_BIWEIGHT_MMX_SSE( 8, 4) |
948 H264_BIWEIGHT_MMX ( 4, 8) | 739 H264_BIWEIGHT_MMX ( 4, 8) |
949 H264_BIWEIGHT_MMX ( 4, 4) | 740 H264_BIWEIGHT_MMX ( 4, 4) |
950 H264_BIWEIGHT_MMX ( 4, 2) | 741 H264_BIWEIGHT_MMX ( 4, 2) |
951 | 742 |
952 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); | |
953 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); | |
954 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); | |
955 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); | |
956 void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); | |
957 | |
958 #if HAVE_YASM && ARCH_X86_32 | |
959 void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); | |
960 static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) | |
961 { | |
962 ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); | |
963 ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); | |
964 } | |
965 #endif | |
966 | |
967 void ff_h264dsp_init_x86(H264DSPContext *c) | 743 void ff_h264dsp_init_x86(H264DSPContext *c) |
968 { | 744 { |
969 int mm_flags = mm_support(); | 745 int mm_flags = mm_support(); |
970 | 746 |
971 if (mm_flags & FF_MM_MMX) { | 747 if (mm_flags & FF_MM_MMX) { |
985 c->h264_idct_add16 = ff_h264_idct_add16_mmx2; | 761 c->h264_idct_add16 = ff_h264_idct_add16_mmx2; |
986 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; | 762 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; |
987 c->h264_idct_add8 = ff_h264_idct_add8_mmx2; | 763 c->h264_idct_add8 = ff_h264_idct_add8_mmx2; |
988 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; | 764 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; |
989 | 765 |
990 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; | |
991 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; | |
992 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; | |
993 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; | |
994 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; | |
995 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; | |
996 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; | 766 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; |
997 } | 767 } |
998 if(mm_flags & FF_MM_SSE2){ | 768 if(mm_flags & FF_MM_SSE2){ |
999 c->h264_idct8_add = ff_h264_idct8_add_sse2; | 769 c->h264_idct8_add = ff_h264_idct8_add_sse2; |
1000 c->h264_idct8_add4= ff_h264_idct8_add4_sse2; | 770 c->h264_idct8_add4= ff_h264_idct8_add4_sse2; |
1001 } | 771 } |
1002 | 772 |
1003 #if HAVE_YASM | 773 #if HAVE_YASM |
1004 if (mm_flags & FF_MM_MMX2){ | 774 if (mm_flags & FF_MM_MMX2){ |
775 c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext; | |
776 c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext; | |
777 c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext; | |
778 c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext; | |
1005 #if ARCH_X86_32 | 779 #if ARCH_X86_32 |
780 c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext; | |
781 c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext; | |
1006 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; | 782 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; |
1007 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; | 783 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; |
1008 #endif | 784 #endif |
1009 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; | 785 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; |
1010 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; | 786 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; |