Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 959:3ec070eef24a libavcodec
qpel in b frames bugfixes
author | michaelni |
---|---|
date | Sun, 05 Jan 2003 20:59:29 +0000 |
parents | 9bb668034ecf |
children | f8c5babc7b4e |
comparison
equal
deleted
inserted
replaced
958:9bb668034ecf | 959:3ec070eef24a |
---|---|
649 return sum&0xFFFF; | 649 return sum&0xFFFF; |
650 } | 650 } |
651 | 651 |
652 WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx) | 652 WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx) |
653 | 653 |
654 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d) | |
655 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d) | |
656 | |
654 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ | 657 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ |
655 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ | 658 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ |
656 "movq " #pw_20 ", %%mm4 \n\t" /* 20 */\ | 659 "movq " #pw_20 ", %%mm4 \n\t" /* 20 */\ |
657 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ | 660 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ |
658 "movq "#in7", " #m3 " \n\t" /* d */\ | 661 "movq "#in7", " #m3 " \n\t" /* d */\ |
670 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ | 673 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ |
671 "psraw $5, %%mm5 \n\t"\ | 674 "psraw $5, %%mm5 \n\t"\ |
672 "packuswb %%mm5, %%mm5 \n\t"\ | 675 "packuswb %%mm5, %%mm5 \n\t"\ |
673 OP(%%mm5, out, %%mm7, d) | 676 OP(%%mm5, out, %%mm7, d) |
674 | 677 |
675 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP)\ | 678 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ |
676 void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | 679 void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
677 uint64_t temp;\ | 680 uint64_t temp;\ |
678 \ | 681 \ |
679 asm volatile(\ | 682 asm volatile(\ |
680 "pushl %0 \n\t"\ | 683 "pushl %0 \n\t"\ |
736 "paddw %8, %%mm1 \n\t"\ | 739 "paddw %8, %%mm1 \n\t"\ |
737 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ | 740 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ |
738 "psraw $5, %%mm3 \n\t"\ | 741 "psraw $5, %%mm3 \n\t"\ |
739 "movq %7, %%mm1 \n\t"\ | 742 "movq %7, %%mm1 \n\t"\ |
740 "packuswb %%mm3, %%mm1 \n\t"\ | 743 "packuswb %%mm3, %%mm1 \n\t"\ |
741 OP(%%mm1, (%1),%%mm4, q)\ | 744 OP_MMX2(%%mm1, (%1),%%mm4, q)\ |
742 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ | 745 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ |
743 \ | 746 \ |
744 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ | 747 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ |
745 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ | 748 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ |
746 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ | 749 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ |
782 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ | 785 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ |
783 "paddw %8, %%mm4 \n\t"\ | 786 "paddw %8, %%mm4 \n\t"\ |
784 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ | 787 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ |
785 "psraw $5, %%mm4 \n\t"\ | 788 "psraw $5, %%mm4 \n\t"\ |
786 "packuswb %%mm4, %%mm0 \n\t"\ | 789 "packuswb %%mm4, %%mm0 \n\t"\ |
787 OP(%%mm0, 8(%1), %%mm4, q)\ | 790 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ |
788 \ | 791 \ |
789 "addl %3, %0 \n\t"\ | 792 "addl %3, %0 \n\t"\ |
790 "addl %4, %1 \n\t"\ | 793 "addl %4, %1 \n\t"\ |
791 "decl %2 \n\t"\ | 794 "decl %2 \n\t"\ |
792 " jnz 1b \n\t"\ | 795 " jnz 1b \n\t"\ |
826 "paddw %2, %%mm0 \n\t"\ | 829 "paddw %2, %%mm0 \n\t"\ |
827 "paddw %2, %%mm1 \n\t"\ | 830 "paddw %2, %%mm1 \n\t"\ |
828 "psraw $5, %%mm0 \n\t"\ | 831 "psraw $5, %%mm0 \n\t"\ |
829 "psraw $5, %%mm1 \n\t"\ | 832 "psraw $5, %%mm1 \n\t"\ |
830 "packuswb %%mm1, %%mm0 \n\t"\ | 833 "packuswb %%mm1, %%mm0 \n\t"\ |
831 OP(%%mm0, (%1), %%mm1, q)\ | 834 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ |
832 "movq 16(%0), %%mm0 \n\t"\ | 835 "movq 16(%0), %%mm0 \n\t"\ |
833 "movq 24(%0), %%mm1 \n\t"\ | 836 "movq 24(%0), %%mm1 \n\t"\ |
834 "paddw %2, %%mm0 \n\t"\ | 837 "paddw %2, %%mm0 \n\t"\ |
835 "paddw %2, %%mm1 \n\t"\ | 838 "paddw %2, %%mm1 \n\t"\ |
836 "psraw $5, %%mm0 \n\t"\ | 839 "psraw $5, %%mm0 \n\t"\ |
837 "psraw $5, %%mm1 \n\t"\ | 840 "psraw $5, %%mm1 \n\t"\ |
838 "packuswb %%mm1, %%mm0 \n\t"\ | 841 "packuswb %%mm1, %%mm0 \n\t"\ |
839 OP(%%mm0, 8(%1), %%mm1, q)\ | 842 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ |
840 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ | 843 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ |
841 );\ | 844 );\ |
842 dst+=dstStride;\ | 845 dst+=dstStride;\ |
843 src+=srcStride;\ | 846 src+=srcStride;\ |
844 }\ | 847 }\ |
845 }\ | 848 }\ |
846 \ | 849 \ |
847 void OPNAME ## mpeg4_qpel16_v_lowpass_mmx(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 850 void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
851 uint64_t temp;\ | |
852 \ | |
853 asm volatile(\ | |
854 "pushl %0 \n\t"\ | |
855 "pushl %1 \n\t"\ | |
856 "pushl %2 \n\t"\ | |
857 "pxor %%mm7, %%mm7 \n\t"\ | |
858 "1: \n\t"\ | |
859 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
860 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
861 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
862 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
863 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
864 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
865 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
866 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
867 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
868 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
869 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
870 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
871 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
872 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
873 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
874 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
875 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
876 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
877 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
878 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
879 "pmullw %6, %%mm6 \n\t" /* 3c - 6b */\ | |
880 "paddw %%mm4, %%mm0 \n\t" /* a */\ | |
881 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
882 "pmullw %5, %%mm0 \n\t" /* 20a */\ | |
883 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ | |
884 "paddw %8, %%mm6 \n\t"\ | |
885 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
886 "psraw $5, %%mm0 \n\t"\ | |
887 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ | |
888 \ | |
889 "movd 5(%0), %%mm5 \n\t" /* FGHI */\ | |
890 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ | |
891 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ | |
892 "paddw %%mm5, %%mm1 \n\t" /* a */\ | |
893 "paddw %%mm6, %%mm2 \n\t" /* b */\ | |
894 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ | |
895 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ | |
896 "paddw %%mm6, %%mm3 \n\t" /* c */\ | |
897 "paddw %%mm5, %%mm4 \n\t" /* d */\ | |
898 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
899 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
900 "pmullw %5, %%mm1 \n\t" /* 20a */\ | |
901 "pmullw %6, %%mm3 \n\t" /* 3c - 6b */\ | |
902 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ | |
903 "paddw %8, %%mm1 \n\t"\ | |
904 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ | |
905 "psraw $5, %%mm3 \n\t"\ | |
906 "packuswb %%mm3, %%mm0 \n\t"\ | |
907 OP_MMX2(%%mm0, (%1), %%mm4, q)\ | |
908 \ | |
909 "addl %3, %0 \n\t"\ | |
910 "addl %4, %1 \n\t"\ | |
911 "decl %2 \n\t"\ | |
912 " jnz 1b \n\t"\ | |
913 "popl %2 \n\t"\ | |
914 "popl %1 \n\t"\ | |
915 "popl %0 \n\t"\ | |
916 :: "r"(src), "r"(dst), "r"(h),\ | |
917 "r"(srcStride), "r"(dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\ | |
918 );\ | |
919 }\ | |
920 \ | |
921 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
922 int i;\ | |
923 int16_t temp[8];\ | |
924 /* quick HACK, XXX FIXME MUST be optimized */\ | |
925 for(i=0; i<h; i++)\ | |
926 {\ | |
927 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
928 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
929 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
930 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
931 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
932 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ | |
933 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ | |
934 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ | |
935 asm volatile(\ | |
936 "movq (%0), %%mm0 \n\t"\ | |
937 "movq 8(%0), %%mm1 \n\t"\ | |
938 "paddw %2, %%mm0 \n\t"\ | |
939 "paddw %2, %%mm1 \n\t"\ | |
940 "psraw $5, %%mm0 \n\t"\ | |
941 "psraw $5, %%mm1 \n\t"\ | |
942 "packuswb %%mm1, %%mm0 \n\t"\ | |
943 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ | |
944 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ | |
945 );\ | |
946 dst+=dstStride;\ | |
947 src+=srcStride;\ | |
948 }\ | |
949 } | |
950 | |
951 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ | |
952 \ | |
953 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
848 uint64_t temp[17*4];\ | 954 uint64_t temp[17*4];\ |
849 uint64_t *temp_ptr= temp;\ | 955 uint64_t *temp_ptr= temp;\ |
850 int count= 17;\ | 956 int count= 17;\ |
851 \ | 957 \ |
852 /*FIXME unroll */\ | 958 /*FIXME unroll */\ |
924 \ | 1030 \ |
925 :: "r"(temp_ptr), "r"(dst), "r"(count),\ | 1031 :: "r"(temp_ptr), "r"(dst), "r"(count),\ |
926 "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*dstStride)\ | 1032 "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*dstStride)\ |
927 );\ | 1033 );\ |
928 }\ | 1034 }\ |
929 void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | 1035 \ |
930 uint64_t temp;\ | 1036 void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
931 \ | |
932 asm volatile(\ | |
933 "pushl %0 \n\t"\ | |
934 "pushl %1 \n\t"\ | |
935 "pushl %2 \n\t"\ | |
936 "pxor %%mm7, %%mm7 \n\t"\ | |
937 "1: \n\t"\ | |
938 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
939 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
940 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
941 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
942 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
943 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
944 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
945 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
946 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
947 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
948 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
949 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
950 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
951 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
952 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
953 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
954 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
955 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
956 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
957 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
958 "pmullw %6, %%mm6 \n\t" /* 3c - 6b */\ | |
959 "paddw %%mm4, %%mm0 \n\t" /* a */\ | |
960 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
961 "pmullw %5, %%mm0 \n\t" /* 20a */\ | |
962 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ | |
963 "paddw %8, %%mm6 \n\t"\ | |
964 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
965 "psraw $5, %%mm0 \n\t"\ | |
966 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ | |
967 \ | |
968 "movd 5(%0), %%mm5 \n\t" /* FGHI */\ | |
969 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ | |
970 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ | |
971 "paddw %%mm5, %%mm1 \n\t" /* a */\ | |
972 "paddw %%mm6, %%mm2 \n\t" /* b */\ | |
973 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ | |
974 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ | |
975 "paddw %%mm6, %%mm3 \n\t" /* c */\ | |
976 "paddw %%mm5, %%mm4 \n\t" /* d */\ | |
977 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
978 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
979 "pmullw %5, %%mm1 \n\t" /* 20a */\ | |
980 "pmullw %6, %%mm3 \n\t" /* 3c - 6b */\ | |
981 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ | |
982 "paddw %8, %%mm1 \n\t"\ | |
983 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ | |
984 "psraw $5, %%mm3 \n\t"\ | |
985 "packuswb %%mm3, %%mm0 \n\t"\ | |
986 OP(%%mm0, (%1), %%mm4, q)\ | |
987 \ | |
988 "addl %3, %0 \n\t"\ | |
989 "addl %4, %1 \n\t"\ | |
990 "decl %2 \n\t"\ | |
991 " jnz 1b \n\t"\ | |
992 "popl %2 \n\t"\ | |
993 "popl %1 \n\t"\ | |
994 "popl %0 \n\t"\ | |
995 :: "r"(src), "r"(dst), "r"(h),\ | |
996 "r"(srcStride), "r"(dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\ | |
997 );\ | |
998 }\ | |
999 \ | |
1000 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1001 int i;\ | |
1002 int16_t temp[8];\ | |
1003 /* quick HACK, XXX FIXME MUST be optimized */\ | |
1004 for(i=0; i<h; i++)\ | |
1005 {\ | |
1006 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
1007 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
1008 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
1009 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
1010 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
1011 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ | |
1012 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ | |
1013 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ | |
1014 asm volatile(\ | |
1015 "movq (%0), %%mm0 \n\t"\ | |
1016 "movq 8(%0), %%mm1 \n\t"\ | |
1017 "paddw %2, %%mm0 \n\t"\ | |
1018 "paddw %2, %%mm1 \n\t"\ | |
1019 "psraw $5, %%mm0 \n\t"\ | |
1020 "psraw $5, %%mm1 \n\t"\ | |
1021 "packuswb %%mm1, %%mm0 \n\t"\ | |
1022 OP(%%mm0, (%1), %%mm1, q)\ | |
1023 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ | |
1024 );\ | |
1025 dst+=dstStride;\ | |
1026 src+=srcStride;\ | |
1027 }\ | |
1028 }\ | |
1029 \ | |
1030 void OPNAME ## mpeg4_qpel8_v_lowpass_mmx(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1031 uint64_t temp[9*4];\ | 1037 uint64_t temp[9*4];\ |
1032 uint64_t *temp_ptr= temp;\ | 1038 uint64_t *temp_ptr= temp;\ |
1033 int count= 9;\ | 1039 int count= 9;\ |
1034 \ | 1040 \ |
1035 /*FIXME unroll */\ | 1041 /*FIXME unroll */\ |
1087 "popl %0 \n\t"\ | 1093 "popl %0 \n\t"\ |
1088 \ | 1094 \ |
1089 :: "r"(temp_ptr), "r"(dst), "r"(count),\ | 1095 :: "r"(temp_ptr), "r"(dst), "r"(count),\ |
1090 "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-6*dstStride)\ | 1096 "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-6*dstStride)\ |
1091 );\ | 1097 );\ |
1092 } | 1098 }\ |
1093 | |
1094 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ | |
1095 \ | 1099 \ |
1096 static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\ | 1100 static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\ |
1097 put_pixels8_mmx(dst, src, stride, 8);\ | 1101 OPNAME ## pixels8_mmx(dst, src, stride, 8);\ |
1098 }\ | 1102 }\ |
1099 \ | 1103 \ |
1100 static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1104 static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1101 uint64_t temp[32];\ | 1105 uint64_t temp[32];\ |
1102 uint8_t * const half= (uint8_t*)temp;\ | 1106 uint8_t * const half= (uint8_t*)temp;\ |
1116 }\ | 1120 }\ |
1117 \ | 1121 \ |
1118 static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1122 static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1119 uint64_t temp[32];\ | 1123 uint64_t temp[32];\ |
1120 uint8_t * const half= (uint8_t*)temp;\ | 1124 uint8_t * const half= (uint8_t*)temp;\ |
1121 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(half, src, 8, stride);\ | 1125 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
1122 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\ | 1126 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\ |
1123 }\ | 1127 }\ |
1124 \ | 1128 \ |
1125 static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1129 static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1126 OPNAME ## mpeg4_qpel8_v_lowpass_mmx(dst, src, stride, stride);\ | 1130 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
1127 }\ | 1131 }\ |
1128 \ | 1132 \ |
1129 static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1133 static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1130 uint64_t temp[32];\ | 1134 uint64_t temp[32];\ |
1131 uint8_t * const half= (uint8_t*)temp;\ | 1135 uint8_t * const half= (uint8_t*)temp;\ |
1132 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(half, src, 8, stride);\ | 1136 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
1133 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\ | 1137 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\ |
1134 }\ | 1138 }\ |
1135 static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1139 static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1136 uint64_t half[8*2 + 8*2 + 18*2];\ | 1140 uint64_t half[8*2 + 8*2 + 18*2];\ |
1137 uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\ | 1141 uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\ |
1138 uint8_t * const halfV= ((uint8_t*)half);\ | 1142 uint8_t * const halfV= ((uint8_t*)half);\ |
1139 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ | 1143 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ |
1140 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | 1144 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
1141 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\ | 1145 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\ |
1142 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ | 1146 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
1143 OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\ | 1147 OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\ |
1144 }\ | 1148 }\ |
1145 static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1149 static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1146 uint64_t half[8*2 + 8*2 + 18*2];\ | 1150 uint64_t half[8*2 + 8*2 + 18*2];\ |
1147 uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\ | 1151 uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\ |
1148 uint8_t * const halfV= ((uint8_t*)half);\ | 1152 uint8_t * const halfV= ((uint8_t*)half);\ |
1149 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ | 1153 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ |
1150 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | 1154 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
1151 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\ | 1155 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\ |
1152 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ | 1156 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
1153 OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\ | 1157 OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\ |
1154 }\ | 1158 }\ |
1155 static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1159 static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1156 uint64_t half[8*2 + 8*2 + 9*2];\ | 1160 uint64_t half[8*2 + 8*2 + 9*2];\ |
1157 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ | 1161 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ |
1158 uint8_t * const halfV= ((uint8_t*)half);\ | 1162 uint8_t * const halfV= ((uint8_t*)half);\ |
1159 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ | 1163 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ |
1160 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | 1164 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
1161 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\ | 1165 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\ |
1162 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ | 1166 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
1163 OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\ | 1167 OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\ |
1164 }\ | 1168 }\ |
1165 static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1169 static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1166 uint64_t half[8*2 + 8*2 + 9*2];\ | 1170 uint64_t half[8*2 + 8*2 + 9*2];\ |
1167 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ | 1171 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ |
1168 uint8_t * const halfV= ((uint8_t*)half);\ | 1172 uint8_t * const halfV= ((uint8_t*)half);\ |
1169 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ | 1173 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ |
1170 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src , 8, stride, 9);\ | 1174 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src , 8, stride, 9);\ |
1171 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\ | 1175 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\ |
1172 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ | 1176 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
1173 OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\ | 1177 OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\ |
1174 }\ | 1178 }\ |
1175 static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1179 static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1176 uint64_t half[8*2 + 9*2];\ | 1180 uint64_t half[8*2 + 9*2];\ |
1177 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | 1181 uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
1178 uint8_t * const halfHV= ((uint8_t*)half);\ | 1182 uint8_t * const halfHV= ((uint8_t*)half);\ |
1179 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | 1183 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
1180 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ | 1184 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
1181 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ | 1185 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ |
1182 }\ | 1186 }\ |
1183 static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1187 static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1184 uint64_t half[8*2 + 9*2];\ | 1188 uint64_t half[8*2 + 9*2];\ |
1185 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | 1189 uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
1186 uint8_t * const halfHV= ((uint8_t*)half);\ | 1190 uint8_t * const halfHV= ((uint8_t*)half);\ |
1187 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | 1191 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
1188 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ | 1192 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
1189 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ | 1193 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ |
1190 }\ | 1194 }\ |
1191 static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1195 static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1192 uint64_t half[8*2 + 8*2 + 9*2];\ | 1196 uint64_t half[8*2 + 8*2 + 9*2];\ |
1193 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ | 1197 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ |
1194 uint8_t * const halfV= ((uint8_t*)half);\ | 1198 uint8_t * const halfV= ((uint8_t*)half);\ |
1195 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ | 1199 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ |
1196 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | 1200 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
1197 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\ | 1201 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\ |
1198 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ | 1202 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
1199 OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\ | 1203 OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\ |
1200 }\ | 1204 }\ |
1201 static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1205 static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1202 uint64_t half[8*2 + 8*2 + 9*2];\ | 1206 uint64_t half[8*2 + 8*2 + 9*2];\ |
1203 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ | 1207 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ |
1204 uint8_t * const halfV= ((uint8_t*)half);\ | 1208 uint8_t * const halfV= ((uint8_t*)half);\ |
1205 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ | 1209 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ |
1206 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | 1210 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
1207 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\ | 1211 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\ |
1208 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ | 1212 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
1209 OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\ | 1213 OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\ |
1210 }\ | 1214 }\ |
1211 static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1215 static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1212 uint64_t half[9*2];\ | 1216 uint64_t half[9*2];\ |
1213 uint8_t * const halfH= ((uint8_t*)half);\ | 1217 uint8_t * const halfH= ((uint8_t*)half);\ |
1214 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | 1218 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
1215 OPNAME ## mpeg4_qpel8_v_lowpass_mmx(dst, halfH, stride, 8);\ | 1219 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
1216 }\ | 1220 }\ |
1217 static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\ | 1221 static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\ |
1218 put_pixels16_mmx(dst, src, stride, 16);\ | 1222 OPNAME ## pixels16_mmx(dst, src, stride, 16);\ |
1219 }\ | 1223 }\ |
1220 \ | 1224 \ |
1221 static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1225 static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1222 uint64_t temp[32];\ | 1226 uint64_t temp[32];\ |
1223 uint8_t * const half= (uint8_t*)temp;\ | 1227 uint8_t * const half= (uint8_t*)temp;\ |
1237 }\ | 1241 }\ |
1238 \ | 1242 \ |
1239 static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1243 static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1240 uint64_t temp[32];\ | 1244 uint64_t temp[32];\ |
1241 uint8_t * const half= (uint8_t*)temp;\ | 1245 uint8_t * const half= (uint8_t*)temp;\ |
1242 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(half, src, 16, stride);\ | 1246 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
1243 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\ | 1247 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\ |
1244 }\ | 1248 }\ |
1245 \ | 1249 \ |
1246 static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1250 static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1247 OPNAME ## mpeg4_qpel16_v_lowpass_mmx(dst, src, stride, stride);\ | 1251 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
1248 }\ | 1252 }\ |
1249 \ | 1253 \ |
1250 static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1254 static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1251 uint64_t temp[32];\ | 1255 uint64_t temp[32];\ |
1252 uint8_t * const half= (uint8_t*)temp;\ | 1256 uint8_t * const half= (uint8_t*)temp;\ |
1253 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(half, src, 16, stride);\ | 1257 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
1254 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\ | 1258 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\ |
1255 }\ | 1259 }\ |
1256 static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1260 static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1257 uint64_t half[16*2 + 16*2 + 18*2];\ | 1261 uint64_t half[16*2 + 16*2 + 18*2];\ |
1258 uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\ | 1262 uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\ |
1259 uint8_t * const halfV= ((uint8_t*)half);\ | 1263 uint8_t * const halfV= ((uint8_t*)half);\ |
1260 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ | 1264 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ |
1261 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | 1265 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
1262 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\ | 1266 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\ |
1263 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ | 1267 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
1264 OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\ | 1268 OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\ |
1265 }\ | 1269 }\ |
1266 static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1270 static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1267 uint64_t half[16*2 + 16*2 + 18*2];\ | 1271 uint64_t half[16*2 + 16*2 + 18*2];\ |
1268 uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\ | 1272 uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\ |
1269 uint8_t * const halfV= ((uint8_t*)half);\ | 1273 uint8_t * const halfV= ((uint8_t*)half);\ |
1270 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ | 1274 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ |
1271 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | 1275 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
1272 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\ | 1276 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\ |
1273 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ | 1277 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
1274 OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\ | 1278 OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\ |
1275 }\ | 1279 }\ |
1276 static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1280 static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1277 uint64_t half[16*2 + 16*2 + 17*2];\ | 1281 uint64_t half[16*2 + 16*2 + 17*2];\ |
1278 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ | 1282 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ |
1279 uint8_t * const halfV= ((uint8_t*)half);\ | 1283 uint8_t * const halfV= ((uint8_t*)half);\ |
1280 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ | 1284 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ |
1281 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | 1285 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
1282 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\ | 1286 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\ |
1283 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ | 1287 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
1284 OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\ | 1288 OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\ |
1285 }\ | 1289 }\ |
1286 static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1290 static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1287 uint64_t half[16*2 + 16*2 + 17*2];\ | 1291 uint64_t half[16*2 + 16*2 + 17*2];\ |
1288 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ | 1292 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ |
1289 uint8_t * const halfV= ((uint8_t*)half);\ | 1293 uint8_t * const halfV= ((uint8_t*)half);\ |
1290 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ | 1294 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ |
1291 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src , 16, stride, 17);\ | 1295 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src , 16, stride, 17);\ |
1292 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\ | 1296 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\ |
1293 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ | 1297 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
1294 OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\ | 1298 OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\ |
1295 }\ | 1299 }\ |
1296 static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1300 static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1297 uint64_t half[16*2 + 17*2];\ | 1301 uint64_t half[16*2 + 17*2];\ |
1298 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | 1302 uint8_t * const halfH= ((uint8_t*)half) + 256;\ |
1299 uint8_t * const halfHV= ((uint8_t*)half);\ | 1303 uint8_t * const halfHV= ((uint8_t*)half);\ |
1300 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | 1304 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
1301 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ | 1305 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
1302 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\ | 1306 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\ |
1303 }\ | 1307 }\ |
1304 static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1308 static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1305 uint64_t half[16*2 + 17*2];\ | 1309 uint64_t half[16*2 + 17*2];\ |
1306 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | 1310 uint8_t * const halfH= ((uint8_t*)half) + 256;\ |
1307 uint8_t * const halfHV= ((uint8_t*)half);\ | 1311 uint8_t * const halfHV= ((uint8_t*)half);\ |
1308 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | 1312 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
1309 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ | 1313 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
1310 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ | 1314 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ |
1311 }\ | 1315 }\ |
1312 static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1316 static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1313 uint64_t half[16*2 + 16*2 + 17*2];\ | 1317 uint64_t half[16*2 + 16*2 + 17*2];\ |
1314 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ | 1318 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ |
1315 uint8_t * const halfV= ((uint8_t*)half);\ | 1319 uint8_t * const halfV= ((uint8_t*)half);\ |
1316 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ | 1320 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ |
1317 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | 1321 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
1318 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\ | 1322 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\ |
1319 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ | 1323 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
1320 OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\ | 1324 OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\ |
1321 }\ | 1325 }\ |
1322 static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1326 static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1323 uint64_t half[16*2 + 16*2 + 17*2];\ | 1327 uint64_t half[16*2 + 16*2 + 17*2];\ |
1324 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ | 1328 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ |
1325 uint8_t * const halfV= ((uint8_t*)half);\ | 1329 uint8_t * const halfV= ((uint8_t*)half);\ |
1326 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ | 1330 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ |
1327 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | 1331 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
1328 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\ | 1332 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\ |
1329 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ | 1333 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
1330 OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\ | 1334 OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\ |
1331 }\ | 1335 }\ |
1332 static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ | 1336 static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ |
1333 uint64_t half[17*2];\ | 1337 uint64_t half[17*2];\ |
1334 uint8_t * const halfH= ((uint8_t*)half);\ | 1338 uint8_t * const halfH= ((uint8_t*)half);\ |
1335 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | 1339 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
1336 OPNAME ## mpeg4_qpel16_v_lowpass_mmx(dst, halfH, stride, 16);\ | 1340 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
1337 } | 1341 } |
1338 | 1342 |
1339 | 1343 |
1340 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" | 1344 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" |
1341 #define AVG_OP(a,b,temp, size) \ | 1345 #define AVG_3DNOW_OP(a,b,temp, size) \ |
1342 "mov" #size " " #b ", " #temp " \n\t"\ | 1346 "mov" #size " " #b ", " #temp " \n\t"\ |
1343 "pavgusb " #temp ", " #a " \n\t"\ | 1347 "pavgusb " #temp ", " #a " \n\t"\ |
1344 "mov" #size " " #a ", " #b " \n\t" | 1348 "mov" #size " " #a ", " #b " \n\t" |
1345 | 1349 #define AVG_MMX2_OP(a,b,temp, size) \ |
1346 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP) | |
1347 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_OP) | |
1348 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP) | |
1349 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) | |
1350 QPEL_OP(avg_ , ff_pw_16, _ , AVG_OP, 3dnow) | |
1351 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) | |
1352 | |
1353 #undef AVG_OP | |
1354 #define AVG_OP(a,b,temp, size) \ | |
1355 "mov" #size " " #b ", " #temp " \n\t"\ | 1350 "mov" #size " " #b ", " #temp " \n\t"\ |
1356 "pavgb " #temp ", " #a " \n\t"\ | 1351 "pavgb " #temp ", " #a " \n\t"\ |
1357 "mov" #size " " #a ", " #b " \n\t" | 1352 "mov" #size " " #a ", " #b " \n\t" |
1353 | |
1354 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) | |
1355 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) | |
1356 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) | |
1357 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) | |
1358 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) | |
1359 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) | |
1358 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) | 1360 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) |
1359 QPEL_OP(avg_ , ff_pw_16, _ , AVG_OP, mmx2) | 1361 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) |
1360 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) | 1362 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) |
1361 | 1363 |
1362 #if 0 | 1364 #if 0 |
1363 static void just_return() { return; } | 1365 static void just_return() { return; } |
1364 #endif | 1366 #endif |
1483 | 1485 |
1484 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; | 1486 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; |
1485 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; | 1487 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; |
1486 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; | 1488 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; |
1487 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; | 1489 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; |
1490 | |
1488 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) | 1491 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) |
1489 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2) | 1492 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2) |
1490 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2) | 1493 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2) |
1491 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2) | 1494 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2) |
1492 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2) | 1495 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2) |