comparison i386/dsputilenc_mmx.c @ 7220:a94b2cf78a2e libavcodec

Make LOAD4/STORE4 macros more generic. Patch by Victor Pollex (victor pollex web de). Original thread: "[PATCH] mmx implementation of vc-1 inverse transformations", Date: 06/21/2008 03:37 PM.
author benoit
date Tue, 08 Jul 2008 09:24:11 +0000
parents 51c80db5905c
children cecd3150f82b
comparison
equal deleted inserted replaced
7219:f72ef5b28253 7220:a94b2cf78a2e
996 "movdqa (%1), %%xmm2 \n\t"\ 996 "movdqa (%1), %%xmm2 \n\t"\
997 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\ 997 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
998 "paddusw %%xmm1, %%xmm0 \n\t" 998 "paddusw %%xmm1, %%xmm0 \n\t"
999 #endif 999 #endif
1000 1000
1001 #define LOAD4(o, a, b, c, d)\
1002 "movq "#o"(%1), "#a" \n\t"\
1003 "movq "#o"+8(%1), "#b" \n\t"\
1004 "movq "#o"+16(%1), "#c" \n\t"\
1005 "movq "#o"+24(%1), "#d" \n\t"\
1006
1007 #define STORE4(o, a, b, c, d)\
1008 "movq "#a", "#o"(%1) \n\t"\
1009 "movq "#b", "#o"+8(%1) \n\t"\
1010 "movq "#c", "#o"+16(%1) \n\t"\
1011 "movq "#d", "#o"+24(%1) \n\t"\
1012
1013 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to 1001 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1014 * about 100k on extreme inputs. But that's very unlikely to occur in natural video, 1002 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1015 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ 1003 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1016 #define HSUM_MMX(a, t, dst)\ 1004 #define HSUM_MMX(a, t, dst)\
1017 "movq "#a", "#t" \n\t"\ 1005 "movq "#a", "#t" \n\t"\
1051 HADAMARD48\ 1039 HADAMARD48\
1052 \ 1040 \
1053 "movq %%mm7, 96(%1) \n\t"\ 1041 "movq %%mm7, 96(%1) \n\t"\
1054 \ 1042 \
1055 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ 1043 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1056 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\ 1044 STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
1057 \ 1045 \
1058 "movq 96(%1), %%mm7 \n\t"\ 1046 "movq 96(%1), %%mm7 \n\t"\
1059 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ 1047 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1060 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\ 1048 STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
1061 \ 1049 \
1062 : "=r" (sum)\ 1050 : "=r" (sum)\
1063 : "r"(temp)\ 1051 : "r"(temp)\
1064 );\ 1052 );\
1065 \ 1053 \
1069 HADAMARD48\ 1057 HADAMARD48\
1070 \ 1058 \
1071 "movq %%mm7, 96(%1) \n\t"\ 1059 "movq %%mm7, 96(%1) \n\t"\
1072 \ 1060 \
1073 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ 1061 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1074 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\ 1062 STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
1075 \ 1063 \
1076 "movq 96(%1), %%mm7 \n\t"\ 1064 "movq 96(%1), %%mm7 \n\t"\
1077 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ 1065 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1078 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ 1066 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1079 "movq %%mm6, %%mm7 \n\t"\ 1067 "movq %%mm6, %%mm7 \n\t"\
1080 "movq %%mm0, %%mm6 \n\t"\ 1068 "movq %%mm0, %%mm6 \n\t"\
1081 \ 1069 \
1082 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\ 1070 LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
1083 \ 1071 \
1084 HADAMARD48\ 1072 HADAMARD48\
1085 "movq %%mm7, 64(%1) \n\t"\ 1073 "movq %%mm7, 64(%1) \n\t"\
1086 MMABS(%%mm0, %%mm7)\ 1074 MMABS(%%mm0, %%mm7)\
1087 MMABS(%%mm1, %%mm7)\ 1075 MMABS(%%mm1, %%mm7)\
1093 "movq 64(%1), %%mm2 \n\t"\ 1081 "movq 64(%1), %%mm2 \n\t"\
1094 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ 1082 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1095 "paddusw %%mm1, %%mm0 \n\t"\ 1083 "paddusw %%mm1, %%mm0 \n\t"\
1096 "movq %%mm0, 64(%1) \n\t"\ 1084 "movq %%mm0, 64(%1) \n\t"\
1097 \ 1085 \
1098 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\ 1086 LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
1099 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\ 1087 LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
1100 \ 1088 \
1101 HADAMARD48\ 1089 HADAMARD48\
1102 "movq %%mm7, (%1) \n\t"\ 1090 "movq %%mm7, (%1) \n\t"\
1103 MMABS(%%mm0, %%mm7)\ 1091 MMABS(%%mm0, %%mm7)\
1104 MMABS(%%mm1, %%mm7)\ 1092 MMABS(%%mm1, %%mm7)\