comparison i386/dsputil_mmx.c @ 959:3ec070eef24a libavcodec

qpel in b frames bugfixes
author michaelni
date Sun, 05 Jan 2003 20:59:29 +0000
parents 9bb668034ecf
children f8c5babc7b4e
comparison
equal deleted inserted replaced
958:9bb668034ecf 959:3ec070eef24a
649 return sum&0xFFFF; 649 return sum&0xFFFF;
650 } 650 }
651 651
652 WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx) 652 WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
653 653
654 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
655 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
656
654 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ 657 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
655 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ 658 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
656 "movq " #pw_20 ", %%mm4 \n\t" /* 20 */\ 659 "movq " #pw_20 ", %%mm4 \n\t" /* 20 */\
657 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ 660 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
658 "movq "#in7", " #m3 " \n\t" /* d */\ 661 "movq "#in7", " #m3 " \n\t" /* d */\
670 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ 673 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
671 "psraw $5, %%mm5 \n\t"\ 674 "psraw $5, %%mm5 \n\t"\
672 "packuswb %%mm5, %%mm5 \n\t"\ 675 "packuswb %%mm5, %%mm5 \n\t"\
673 OP(%%mm5, out, %%mm7, d) 676 OP(%%mm5, out, %%mm7, d)
674 677
675 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP)\ 678 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
676 void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 679 void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
677 uint64_t temp;\ 680 uint64_t temp;\
678 \ 681 \
679 asm volatile(\ 682 asm volatile(\
680 "pushl %0 \n\t"\ 683 "pushl %0 \n\t"\
736 "paddw %8, %%mm1 \n\t"\ 739 "paddw %8, %%mm1 \n\t"\
737 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ 740 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
738 "psraw $5, %%mm3 \n\t"\ 741 "psraw $5, %%mm3 \n\t"\
739 "movq %7, %%mm1 \n\t"\ 742 "movq %7, %%mm1 \n\t"\
740 "packuswb %%mm3, %%mm1 \n\t"\ 743 "packuswb %%mm3, %%mm1 \n\t"\
741 OP(%%mm1, (%1),%%mm4, q)\ 744 OP_MMX2(%%mm1, (%1),%%mm4, q)\
742 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ 745 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
743 \ 746 \
744 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ 747 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
745 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ 748 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
746 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ 749 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
782 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ 785 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
783 "paddw %8, %%mm4 \n\t"\ 786 "paddw %8, %%mm4 \n\t"\
784 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ 787 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
785 "psraw $5, %%mm4 \n\t"\ 788 "psraw $5, %%mm4 \n\t"\
786 "packuswb %%mm4, %%mm0 \n\t"\ 789 "packuswb %%mm4, %%mm0 \n\t"\
787 OP(%%mm0, 8(%1), %%mm4, q)\ 790 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
788 \ 791 \
789 "addl %3, %0 \n\t"\ 792 "addl %3, %0 \n\t"\
790 "addl %4, %1 \n\t"\ 793 "addl %4, %1 \n\t"\
791 "decl %2 \n\t"\ 794 "decl %2 \n\t"\
792 " jnz 1b \n\t"\ 795 " jnz 1b \n\t"\
826 "paddw %2, %%mm0 \n\t"\ 829 "paddw %2, %%mm0 \n\t"\
827 "paddw %2, %%mm1 \n\t"\ 830 "paddw %2, %%mm1 \n\t"\
828 "psraw $5, %%mm0 \n\t"\ 831 "psraw $5, %%mm0 \n\t"\
829 "psraw $5, %%mm1 \n\t"\ 832 "psraw $5, %%mm1 \n\t"\
830 "packuswb %%mm1, %%mm0 \n\t"\ 833 "packuswb %%mm1, %%mm0 \n\t"\
831 OP(%%mm0, (%1), %%mm1, q)\ 834 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
832 "movq 16(%0), %%mm0 \n\t"\ 835 "movq 16(%0), %%mm0 \n\t"\
833 "movq 24(%0), %%mm1 \n\t"\ 836 "movq 24(%0), %%mm1 \n\t"\
834 "paddw %2, %%mm0 \n\t"\ 837 "paddw %2, %%mm0 \n\t"\
835 "paddw %2, %%mm1 \n\t"\ 838 "paddw %2, %%mm1 \n\t"\
836 "psraw $5, %%mm0 \n\t"\ 839 "psraw $5, %%mm0 \n\t"\
837 "psraw $5, %%mm1 \n\t"\ 840 "psraw $5, %%mm1 \n\t"\
838 "packuswb %%mm1, %%mm0 \n\t"\ 841 "packuswb %%mm1, %%mm0 \n\t"\
839 OP(%%mm0, 8(%1), %%mm1, q)\ 842 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
840 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ 843 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
841 );\ 844 );\
842 dst+=dstStride;\ 845 dst+=dstStride;\
843 src+=srcStride;\ 846 src+=srcStride;\
844 }\ 847 }\
845 }\ 848 }\
846 \ 849 \
847 void OPNAME ## mpeg4_qpel16_v_lowpass_mmx(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 850 void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
851 uint64_t temp;\
852 \
853 asm volatile(\
854 "pushl %0 \n\t"\
855 "pushl %1 \n\t"\
856 "pushl %2 \n\t"\
857 "pxor %%mm7, %%mm7 \n\t"\
858 "1: \n\t"\
859 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
860 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
861 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
862 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
863 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
864 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
865 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
866 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
867 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
868 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
869 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
870 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
871 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
872 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
873 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
874 "paddw %%mm3, %%mm5 \n\t" /* b */\
875 "paddw %%mm2, %%mm6 \n\t" /* c */\
876 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
877 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
878 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
879 "pmullw %6, %%mm6 \n\t" /* 3c - 6b */\
880 "paddw %%mm4, %%mm0 \n\t" /* a */\
881 "paddw %%mm1, %%mm5 \n\t" /* d */\
882 "pmullw %5, %%mm0 \n\t" /* 20a */\
883 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
884 "paddw %8, %%mm6 \n\t"\
885 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
886 "psraw $5, %%mm0 \n\t"\
887 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
888 \
889 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
890 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
891 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
892 "paddw %%mm5, %%mm1 \n\t" /* a */\
893 "paddw %%mm6, %%mm2 \n\t" /* b */\
894 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
895 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
896 "paddw %%mm6, %%mm3 \n\t" /* c */\
897 "paddw %%mm5, %%mm4 \n\t" /* d */\
898 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
899 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
900 "pmullw %5, %%mm1 \n\t" /* 20a */\
901 "pmullw %6, %%mm3 \n\t" /* 3c - 6b */\
902 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
903 "paddw %8, %%mm1 \n\t"\
904 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
905 "psraw $5, %%mm3 \n\t"\
906 "packuswb %%mm3, %%mm0 \n\t"\
907 OP_MMX2(%%mm0, (%1), %%mm4, q)\
908 \
909 "addl %3, %0 \n\t"\
910 "addl %4, %1 \n\t"\
911 "decl %2 \n\t"\
912 " jnz 1b \n\t"\
913 "popl %2 \n\t"\
914 "popl %1 \n\t"\
915 "popl %0 \n\t"\
916 :: "r"(src), "r"(dst), "r"(h),\
917 "r"(srcStride), "r"(dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\
918 );\
919 }\
920 \
921 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
922 int i;\
923 int16_t temp[8];\
924 /* quick HACK, XXX FIXME MUST be optimized */\
925 for(i=0; i<h; i++)\
926 {\
927 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
928 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
929 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
930 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
931 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
932 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
933 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
934 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
935 asm volatile(\
936 "movq (%0), %%mm0 \n\t"\
937 "movq 8(%0), %%mm1 \n\t"\
938 "paddw %2, %%mm0 \n\t"\
939 "paddw %2, %%mm1 \n\t"\
940 "psraw $5, %%mm0 \n\t"\
941 "psraw $5, %%mm1 \n\t"\
942 "packuswb %%mm1, %%mm0 \n\t"\
943 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
944 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
945 );\
946 dst+=dstStride;\
947 src+=srcStride;\
948 }\
949 }
950
951 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
952 \
953 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
848 uint64_t temp[17*4];\ 954 uint64_t temp[17*4];\
849 uint64_t *temp_ptr= temp;\ 955 uint64_t *temp_ptr= temp;\
850 int count= 17;\ 956 int count= 17;\
851 \ 957 \
852 /*FIXME unroll */\ 958 /*FIXME unroll */\
924 \ 1030 \
925 :: "r"(temp_ptr), "r"(dst), "r"(count),\ 1031 :: "r"(temp_ptr), "r"(dst), "r"(count),\
926 "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*dstStride)\ 1032 "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*dstStride)\
927 );\ 1033 );\
928 }\ 1034 }\
929 void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1035 \
930 uint64_t temp;\ 1036 void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
931 \
932 asm volatile(\
933 "pushl %0 \n\t"\
934 "pushl %1 \n\t"\
935 "pushl %2 \n\t"\
936 "pxor %%mm7, %%mm7 \n\t"\
937 "1: \n\t"\
938 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
939 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
940 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
941 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
942 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
943 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
944 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
945 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
946 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
947 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
948 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
949 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
950 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
951 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
952 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
953 "paddw %%mm3, %%mm5 \n\t" /* b */\
954 "paddw %%mm2, %%mm6 \n\t" /* c */\
955 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
956 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
957 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
958 "pmullw %6, %%mm6 \n\t" /* 3c - 6b */\
959 "paddw %%mm4, %%mm0 \n\t" /* a */\
960 "paddw %%mm1, %%mm5 \n\t" /* d */\
961 "pmullw %5, %%mm0 \n\t" /* 20a */\
962 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
963 "paddw %8, %%mm6 \n\t"\
964 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
965 "psraw $5, %%mm0 \n\t"\
966 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
967 \
968 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
969 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
970 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
971 "paddw %%mm5, %%mm1 \n\t" /* a */\
972 "paddw %%mm6, %%mm2 \n\t" /* b */\
973 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
974 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
975 "paddw %%mm6, %%mm3 \n\t" /* c */\
976 "paddw %%mm5, %%mm4 \n\t" /* d */\
977 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
978 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
979 "pmullw %5, %%mm1 \n\t" /* 20a */\
980 "pmullw %6, %%mm3 \n\t" /* 3c - 6b */\
981 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
982 "paddw %8, %%mm1 \n\t"\
983 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
984 "psraw $5, %%mm3 \n\t"\
985 "packuswb %%mm3, %%mm0 \n\t"\
986 OP(%%mm0, (%1), %%mm4, q)\
987 \
988 "addl %3, %0 \n\t"\
989 "addl %4, %1 \n\t"\
990 "decl %2 \n\t"\
991 " jnz 1b \n\t"\
992 "popl %2 \n\t"\
993 "popl %1 \n\t"\
994 "popl %0 \n\t"\
995 :: "r"(src), "r"(dst), "r"(h),\
996 "r"(srcStride), "r"(dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\
997 );\
998 }\
999 \
1000 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1001 int i;\
1002 int16_t temp[8];\
1003 /* quick HACK, XXX FIXME MUST be optimized */\
1004 for(i=0; i<h; i++)\
1005 {\
1006 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1007 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1008 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1009 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1010 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1011 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1012 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1013 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1014 asm volatile(\
1015 "movq (%0), %%mm0 \n\t"\
1016 "movq 8(%0), %%mm1 \n\t"\
1017 "paddw %2, %%mm0 \n\t"\
1018 "paddw %2, %%mm1 \n\t"\
1019 "psraw $5, %%mm0 \n\t"\
1020 "psraw $5, %%mm1 \n\t"\
1021 "packuswb %%mm1, %%mm0 \n\t"\
1022 OP(%%mm0, (%1), %%mm1, q)\
1023 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1024 );\
1025 dst+=dstStride;\
1026 src+=srcStride;\
1027 }\
1028 }\
1029 \
1030 void OPNAME ## mpeg4_qpel8_v_lowpass_mmx(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1031 uint64_t temp[9*4];\ 1037 uint64_t temp[9*4];\
1032 uint64_t *temp_ptr= temp;\ 1038 uint64_t *temp_ptr= temp;\
1033 int count= 9;\ 1039 int count= 9;\
1034 \ 1040 \
1035 /*FIXME unroll */\ 1041 /*FIXME unroll */\
1087 "popl %0 \n\t"\ 1093 "popl %0 \n\t"\
1088 \ 1094 \
1089 :: "r"(temp_ptr), "r"(dst), "r"(count),\ 1095 :: "r"(temp_ptr), "r"(dst), "r"(count),\
1090 "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-6*dstStride)\ 1096 "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-6*dstStride)\
1091 );\ 1097 );\
1092 } 1098 }\
1093
1094 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1095 \ 1099 \
1096 static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\ 1100 static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
1097 put_pixels8_mmx(dst, src, stride, 8);\ 1101 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
1098 }\ 1102 }\
1099 \ 1103 \
1100 static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1104 static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1101 uint64_t temp[32];\ 1105 uint64_t temp[32];\
1102 uint8_t * const half= (uint8_t*)temp;\ 1106 uint8_t * const half= (uint8_t*)temp;\
1116 }\ 1120 }\
1117 \ 1121 \
1118 static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1122 static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1119 uint64_t temp[32];\ 1123 uint64_t temp[32];\
1120 uint8_t * const half= (uint8_t*)temp;\ 1124 uint8_t * const half= (uint8_t*)temp;\
1121 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(half, src, 8, stride);\ 1125 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1122 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\ 1126 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1123 }\ 1127 }\
1124 \ 1128 \
1125 static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1129 static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1126 OPNAME ## mpeg4_qpel8_v_lowpass_mmx(dst, src, stride, stride);\ 1130 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
1127 }\ 1131 }\
1128 \ 1132 \
1129 static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1133 static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1130 uint64_t temp[32];\ 1134 uint64_t temp[32];\
1131 uint8_t * const half= (uint8_t*)temp;\ 1135 uint8_t * const half= (uint8_t*)temp;\
1132 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(half, src, 8, stride);\ 1136 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1133 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\ 1137 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
1134 }\ 1138 }\
1135 static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1139 static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1136 uint64_t half[8*2 + 8*2 + 18*2];\ 1140 uint64_t half[8*2 + 8*2 + 18*2];\
1137 uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\ 1141 uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
1138 uint8_t * const halfV= ((uint8_t*)half);\ 1142 uint8_t * const halfV= ((uint8_t*)half);\
1139 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ 1143 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1140 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1144 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1141 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\ 1145 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
1142 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ 1146 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1143 OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\ 1147 OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\
1144 }\ 1148 }\
1145 static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1149 static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1146 uint64_t half[8*2 + 8*2 + 18*2];\ 1150 uint64_t half[8*2 + 8*2 + 18*2];\
1147 uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\ 1151 uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
1148 uint8_t * const halfV= ((uint8_t*)half);\ 1152 uint8_t * const halfV= ((uint8_t*)half);\
1149 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ 1153 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1150 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1154 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1151 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\ 1155 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
1152 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ 1156 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1153 OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\ 1157 OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\
1154 }\ 1158 }\
1155 static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1159 static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1156 uint64_t half[8*2 + 8*2 + 9*2];\ 1160 uint64_t half[8*2 + 8*2 + 9*2];\
1157 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ 1161 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1158 uint8_t * const halfV= ((uint8_t*)half);\ 1162 uint8_t * const halfV= ((uint8_t*)half);\
1159 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ 1163 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1160 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1164 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1161 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\ 1165 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
1162 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ 1166 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1163 OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\ 1167 OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\
1164 }\ 1168 }\
1165 static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1169 static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1166 uint64_t half[8*2 + 8*2 + 9*2];\ 1170 uint64_t half[8*2 + 8*2 + 9*2];\
1167 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ 1171 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1168 uint8_t * const halfV= ((uint8_t*)half);\ 1172 uint8_t * const halfV= ((uint8_t*)half);\
1169 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ 1173 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1170 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src , 8, stride, 9);\ 1174 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src , 8, stride, 9);\
1171 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\ 1175 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
1172 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ 1176 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1173 OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\ 1177 OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\
1174 }\ 1178 }\
1175 static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1179 static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1176 uint64_t half[8*2 + 9*2];\ 1180 uint64_t half[8*2 + 9*2];\
1177 uint8_t * const halfH= ((uint8_t*)half) + 64;\ 1181 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1178 uint8_t * const halfHV= ((uint8_t*)half);\ 1182 uint8_t * const halfHV= ((uint8_t*)half);\
1179 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1183 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1180 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ 1184 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1181 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ 1185 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1182 }\ 1186 }\
1183 static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1187 static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1184 uint64_t half[8*2 + 9*2];\ 1188 uint64_t half[8*2 + 9*2];\
1185 uint8_t * const halfH= ((uint8_t*)half) + 64;\ 1189 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1186 uint8_t * const halfHV= ((uint8_t*)half);\ 1190 uint8_t * const halfHV= ((uint8_t*)half);\
1187 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1191 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1188 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ 1192 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1189 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ 1193 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1190 }\ 1194 }\
1191 static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1195 static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1192 uint64_t half[8*2 + 8*2 + 9*2];\ 1196 uint64_t half[8*2 + 8*2 + 9*2];\
1193 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ 1197 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1194 uint8_t * const halfV= ((uint8_t*)half);\ 1198 uint8_t * const halfV= ((uint8_t*)half);\
1195 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ 1199 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1196 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1200 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1197 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\ 1201 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
1198 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ 1202 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1199 OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\ 1203 OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
1200 }\ 1204 }\
1201 static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1205 static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1202 uint64_t half[8*2 + 8*2 + 9*2];\ 1206 uint64_t half[8*2 + 8*2 + 9*2];\
1203 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ 1207 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1204 uint8_t * const halfV= ((uint8_t*)half);\ 1208 uint8_t * const halfV= ((uint8_t*)half);\
1205 uint8_t * const halfHV= ((uint8_t*)half) + 64;\ 1209 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1206 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1210 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1207 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\ 1211 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
1208 put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\ 1212 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1209 OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\ 1213 OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
1210 }\ 1214 }\
1211 static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1215 static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1212 uint64_t half[9*2];\ 1216 uint64_t half[9*2];\
1213 uint8_t * const halfH= ((uint8_t*)half);\ 1217 uint8_t * const halfH= ((uint8_t*)half);\
1214 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1218 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1215 OPNAME ## mpeg4_qpel8_v_lowpass_mmx(dst, halfH, stride, 8);\ 1219 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1216 }\ 1220 }\
1217 static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\ 1221 static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
1218 put_pixels16_mmx(dst, src, stride, 16);\ 1222 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
1219 }\ 1223 }\
1220 \ 1224 \
1221 static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1225 static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1222 uint64_t temp[32];\ 1226 uint64_t temp[32];\
1223 uint8_t * const half= (uint8_t*)temp;\ 1227 uint8_t * const half= (uint8_t*)temp;\
1237 }\ 1241 }\
1238 \ 1242 \
1239 static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1243 static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1240 uint64_t temp[32];\ 1244 uint64_t temp[32];\
1241 uint8_t * const half= (uint8_t*)temp;\ 1245 uint8_t * const half= (uint8_t*)temp;\
1242 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(half, src, 16, stride);\ 1246 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1243 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\ 1247 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1244 }\ 1248 }\
1245 \ 1249 \
1246 static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1250 static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1247 OPNAME ## mpeg4_qpel16_v_lowpass_mmx(dst, src, stride, stride);\ 1251 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
1248 }\ 1252 }\
1249 \ 1253 \
1250 static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1254 static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1251 uint64_t temp[32];\ 1255 uint64_t temp[32];\
1252 uint8_t * const half= (uint8_t*)temp;\ 1256 uint8_t * const half= (uint8_t*)temp;\
1253 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(half, src, 16, stride);\ 1257 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1254 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\ 1258 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
1255 }\ 1259 }\
1256 static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1260 static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1257 uint64_t half[16*2 + 16*2 + 18*2];\ 1261 uint64_t half[16*2 + 16*2 + 18*2];\
1258 uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\ 1262 uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
1259 uint8_t * const halfV= ((uint8_t*)half);\ 1263 uint8_t * const halfV= ((uint8_t*)half);\
1260 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ 1264 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1261 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1265 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1262 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\ 1266 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
1263 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ 1267 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1264 OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\ 1268 OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\
1265 }\ 1269 }\
1266 static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1270 static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1267 uint64_t half[16*2 + 16*2 + 18*2];\ 1271 uint64_t half[16*2 + 16*2 + 18*2];\
1268 uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\ 1272 uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
1269 uint8_t * const halfV= ((uint8_t*)half);\ 1273 uint8_t * const halfV= ((uint8_t*)half);\
1270 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ 1274 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1271 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1275 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1272 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\ 1276 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
1273 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ 1277 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1274 OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\ 1278 OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\
1275 }\ 1279 }\
1276 static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1280 static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1277 uint64_t half[16*2 + 16*2 + 17*2];\ 1281 uint64_t half[16*2 + 16*2 + 17*2];\
1278 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ 1282 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1279 uint8_t * const halfV= ((uint8_t*)half);\ 1283 uint8_t * const halfV= ((uint8_t*)half);\
1280 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ 1284 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1281 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1285 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1282 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\ 1286 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
1283 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ 1287 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1284 OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\ 1288 OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\
1285 }\ 1289 }\
1286 static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1290 static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1287 uint64_t half[16*2 + 16*2 + 17*2];\ 1291 uint64_t half[16*2 + 16*2 + 17*2];\
1288 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ 1292 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1289 uint8_t * const halfV= ((uint8_t*)half);\ 1293 uint8_t * const halfV= ((uint8_t*)half);\
1290 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ 1294 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1291 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src , 16, stride, 17);\ 1295 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src , 16, stride, 17);\
1292 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\ 1296 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
1293 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ 1297 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1294 OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\ 1298 OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\
1295 }\ 1299 }\
1296 static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1300 static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1297 uint64_t half[16*2 + 17*2];\ 1301 uint64_t half[16*2 + 17*2];\
1298 uint8_t * const halfH= ((uint8_t*)half) + 256;\ 1302 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1299 uint8_t * const halfHV= ((uint8_t*)half);\ 1303 uint8_t * const halfHV= ((uint8_t*)half);\
1300 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1304 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1301 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ 1305 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1302 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\ 1306 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1303 }\ 1307 }\
1304 static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1308 static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1305 uint64_t half[16*2 + 17*2];\ 1309 uint64_t half[16*2 + 17*2];\
1306 uint8_t * const halfH= ((uint8_t*)half) + 256;\ 1310 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1307 uint8_t * const halfHV= ((uint8_t*)half);\ 1311 uint8_t * const halfHV= ((uint8_t*)half);\
1308 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1312 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1309 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ 1313 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1310 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ 1314 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1311 }\ 1315 }\
1312 static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1316 static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1313 uint64_t half[16*2 + 16*2 + 17*2];\ 1317 uint64_t half[16*2 + 16*2 + 17*2];\
1314 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ 1318 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1315 uint8_t * const halfV= ((uint8_t*)half);\ 1319 uint8_t * const halfV= ((uint8_t*)half);\
1316 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ 1320 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1317 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1321 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1318 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\ 1322 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
1319 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ 1323 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1320 OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\ 1324 OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
1321 }\ 1325 }\
1322 static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1326 static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1323 uint64_t half[16*2 + 16*2 + 17*2];\ 1327 uint64_t half[16*2 + 16*2 + 17*2];\
1324 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ 1328 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1325 uint8_t * const halfV= ((uint8_t*)half);\ 1329 uint8_t * const halfV= ((uint8_t*)half);\
1326 uint8_t * const halfHV= ((uint8_t*)half) + 256;\ 1330 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1327 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1331 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1328 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\ 1332 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
1329 put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\ 1333 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1330 OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\ 1334 OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
1331 }\ 1335 }\
1332 static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 1336 static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1333 uint64_t half[17*2];\ 1337 uint64_t half[17*2];\
1334 uint8_t * const halfH= ((uint8_t*)half);\ 1338 uint8_t * const halfH= ((uint8_t*)half);\
1335 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1339 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1336 OPNAME ## mpeg4_qpel16_v_lowpass_mmx(dst, halfH, stride, 16);\ 1340 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1337 } 1341 }
1338 1342
1339 1343
1340 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" 1344 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
1341 #define AVG_OP(a,b,temp, size) \ 1345 #define AVG_3DNOW_OP(a,b,temp, size) \
1342 "mov" #size " " #b ", " #temp " \n\t"\ 1346 "mov" #size " " #b ", " #temp " \n\t"\
1343 "pavgusb " #temp ", " #a " \n\t"\ 1347 "pavgusb " #temp ", " #a " \n\t"\
1344 "mov" #size " " #a ", " #b " \n\t" 1348 "mov" #size " " #a ", " #b " \n\t"
1345 1349 #define AVG_MMX2_OP(a,b,temp, size) \
1346 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP)
1347 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_OP)
1348 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
1349 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1350 QPEL_OP(avg_ , ff_pw_16, _ , AVG_OP, 3dnow)
1351 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1352
1353 #undef AVG_OP
1354 #define AVG_OP(a,b,temp, size) \
1355 "mov" #size " " #b ", " #temp " \n\t"\ 1350 "mov" #size " " #b ", " #temp " \n\t"\
1356 "pavgb " #temp ", " #a " \n\t"\ 1351 "pavgb " #temp ", " #a " \n\t"\
1357 "mov" #size " " #a ", " #b " \n\t" 1352 "mov" #size " " #a ", " #b " \n\t"
1353
1354 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1355 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1356 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1357 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1358 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1359 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1358 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) 1360 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1359 QPEL_OP(avg_ , ff_pw_16, _ , AVG_OP, mmx2) 1361 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1360 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) 1362 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1361 1363
1362 #if 0 1364 #if 0
1363 static void just_return() { return; } 1365 static void just_return() { return; }
1364 #endif 1366 #endif
1483 1485
1484 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; 1486 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
1485 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; 1487 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
1486 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; 1488 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1487 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; 1489 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
1490
1488 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) 1491 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
1489 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2) 1492 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
1490 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2) 1493 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
1491 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2) 1494 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
1492 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2) 1495 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)