Mercurial > libavcodec.hg
comparison i386/dsputilenc_mmx.c @ 7220:a94b2cf78a2e libavcodec
Make LOAD4/STORE4 macros more generic.
Patch by Victor Pollex (victor pollex web de)
Original thread: [PATCH] mmx implementation of vc-1 inverse transformations
Date: 06/21/2008 03:37 PM
author | benoit |
---|---|
date | Tue, 08 Jul 2008 09:24:11 +0000 |
parents | 51c80db5905c |
children | cecd3150f82b |
comparison
equal
deleted
inserted
replaced
7219:f72ef5b28253 | 7220:a94b2cf78a2e |
---|---|
996 "movdqa (%1), %%xmm2 \n\t"\ | 996 "movdqa (%1), %%xmm2 \n\t"\ |
997 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\ | 997 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\ |
998 "paddusw %%xmm1, %%xmm0 \n\t" | 998 "paddusw %%xmm1, %%xmm0 \n\t" |
999 #endif | 999 #endif |
1000 | 1000 |
1001 #define LOAD4(o, a, b, c, d)\ | |
1002 "movq "#o"(%1), "#a" \n\t"\ | |
1003 "movq "#o"+8(%1), "#b" \n\t"\ | |
1004 "movq "#o"+16(%1), "#c" \n\t"\ | |
1005 "movq "#o"+24(%1), "#d" \n\t"\ | |
1006 | |
1007 #define STORE4(o, a, b, c, d)\ | |
1008 "movq "#a", "#o"(%1) \n\t"\ | |
1009 "movq "#b", "#o"+8(%1) \n\t"\ | |
1010 "movq "#c", "#o"+16(%1) \n\t"\ | |
1011 "movq "#d", "#o"+24(%1) \n\t"\ | |
1012 | |
1013 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to | 1001 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to |
1014 * about 100k on extreme inputs. But that's very unlikely to occur in natural video, | 1002 * about 100k on extreme inputs. But that's very unlikely to occur in natural video, |
1015 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ | 1003 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ |
1016 #define HSUM_MMX(a, t, dst)\ | 1004 #define HSUM_MMX(a, t, dst)\ |
1017 "movq "#a", "#t" \n\t"\ | 1005 "movq "#a", "#t" \n\t"\ |
1051 HADAMARD48\ | 1039 HADAMARD48\ |
1052 \ | 1040 \ |
1053 "movq %%mm7, 96(%1) \n\t"\ | 1041 "movq %%mm7, 96(%1) \n\t"\ |
1054 \ | 1042 \ |
1055 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ | 1043 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ |
1056 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\ | 1044 STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ |
1057 \ | 1045 \ |
1058 "movq 96(%1), %%mm7 \n\t"\ | 1046 "movq 96(%1), %%mm7 \n\t"\ |
1059 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ | 1047 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ |
1060 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\ | 1048 STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\ |
1061 \ | 1049 \ |
1062 : "=r" (sum)\ | 1050 : "=r" (sum)\ |
1063 : "r"(temp)\ | 1051 : "r"(temp)\ |
1064 );\ | 1052 );\ |
1065 \ | 1053 \ |
1069 HADAMARD48\ | 1057 HADAMARD48\ |
1070 \ | 1058 \ |
1071 "movq %%mm7, 96(%1) \n\t"\ | 1059 "movq %%mm7, 96(%1) \n\t"\ |
1072 \ | 1060 \ |
1073 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ | 1061 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ |
1074 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\ | 1062 STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ |
1075 \ | 1063 \ |
1076 "movq 96(%1), %%mm7 \n\t"\ | 1064 "movq 96(%1), %%mm7 \n\t"\ |
1077 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ | 1065 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ |
1078 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ | 1066 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ |
1079 "movq %%mm6, %%mm7 \n\t"\ | 1067 "movq %%mm6, %%mm7 \n\t"\ |
1080 "movq %%mm0, %%mm6 \n\t"\ | 1068 "movq %%mm0, %%mm6 \n\t"\ |
1081 \ | 1069 \ |
1082 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\ | 1070 LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ |
1083 \ | 1071 \ |
1084 HADAMARD48\ | 1072 HADAMARD48\ |
1085 "movq %%mm7, 64(%1) \n\t"\ | 1073 "movq %%mm7, 64(%1) \n\t"\ |
1086 MMABS(%%mm0, %%mm7)\ | 1074 MMABS(%%mm0, %%mm7)\ |
1087 MMABS(%%mm1, %%mm7)\ | 1075 MMABS(%%mm1, %%mm7)\ |
1093 "movq 64(%1), %%mm2 \n\t"\ | 1081 "movq 64(%1), %%mm2 \n\t"\ |
1094 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ | 1082 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ |
1095 "paddusw %%mm1, %%mm0 \n\t"\ | 1083 "paddusw %%mm1, %%mm0 \n\t"\ |
1096 "movq %%mm0, 64(%1) \n\t"\ | 1084 "movq %%mm0, 64(%1) \n\t"\ |
1097 \ | 1085 \ |
1098 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\ | 1086 LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ |
1099 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\ | 1087 LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\ |
1100 \ | 1088 \ |
1101 HADAMARD48\ | 1089 HADAMARD48\ |
1102 "movq %%mm7, (%1) \n\t"\ | 1090 "movq %%mm7, (%1) \n\t"\ |
1103 MMABS(%%mm0, %%mm7)\ | 1091 MMABS(%%mm0, %%mm7)\ |
1104 MMABS(%%mm1, %%mm7)\ | 1092 MMABS(%%mm1, %%mm7)\ |