comparison x86/h264dsp_mmx.c @ 12450:3941687b4fa9 libavcodec
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) into h264_qpel_mmx.c,
which is still #included in dsputil_mmx.c and is part of DSPContext, and h264dsp_mmx.c,
which represents H264DSPContext and is now compiled on its own.
author | rbultje |
---|---|
date | Wed, 01 Sep 2010 20:48:59 +0000 |
parents | 33ecda76b2f2 |
children | 4c3e6ff1237e |
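
In outline, the split described above looks as follows (an illustrative sketch; the DSPContext side is paraphrased rather than quoted, and only ff_h264dsp_init_x86 and the identifiers visible in the hunks below are taken from this diff):

/* h264_qpel_mmx.c: the qpel motion-compensation kernels deleted below.
 * It stays #included by dsputil_mmx.c, which keeps registering the kernels
 * in DSPContext, roughly:
 *     c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_mmx2;
 */

/* h264dsp_mmx.c: now compiled as its own object; it only fills in
 * H264DSPContext, through the init function added at the bottom of this diff. */
void ff_h264dsp_init_x86(H264DSPContext *c)
{
    int mm_flags = mm_support();
    if (mm_flags & FF_MM_MMX) {
        c->h264_idct_add = ff_h264_idct_add_mmx;
        /* ... IDCT, loop-filter and weighted-prediction entries as below ... */
    }
}

Callers keep reaching motion compensation through DSPContext and the loop filter / weighted prediction through H264DSPContext; only the file layout and build wiring change.
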
12449:3bca212d6f51 | 12450:3941687b4fa9 |
---|---|
16 * You should have received a copy of the GNU Lesser General Public | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with FFmpeg; if not, write to the Free Software | 17 * License along with FFmpeg; if not, write to the Free Software |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 */ | 19 */ |
20 | 20 |
21 #include "libavutil/x86_cpu.h" | |
22 #include "libavcodec/h264dsp.h" | |
21 #include "dsputil_mmx.h" | 23 #include "dsputil_mmx.h" |
22 #include "libavcodec/h264pred.h" | |
23 | 24 |
24 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; | 25 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; |
25 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; | 26 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; |
26 | 27 |
27 /***********************************/ | 28 /***********************************/ |
916 :"memory" | 917 :"memory" |
917 ); | 918 ); |
918 } | 919 } |
919 | 920 |
920 /***********************************/ | 921 /***********************************/ |
921 /* motion compensation */ | |
922 | |
923 #define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ | |
924 "mov"#q" "#C", "#T" \n\t"\ | |
925 "mov"#d" (%0), "#F" \n\t"\ | |
926 "paddw "#D", "#T" \n\t"\ | |
927 "psllw $2, "#T" \n\t"\ | |
928 "psubw "#B", "#T" \n\t"\ | |
929 "psubw "#E", "#T" \n\t"\ | |
930 "punpcklbw "#Z", "#F" \n\t"\ | |
931 "pmullw %4, "#T" \n\t"\ | |
932 "paddw %5, "#A" \n\t"\ | |
933 "add %2, %0 \n\t"\ | |
934 "paddw "#F", "#A" \n\t"\ | |
935 "paddw "#A", "#T" \n\t"\ | |
936 "psraw $5, "#T" \n\t"\ | |
937 "packuswb "#T", "#T" \n\t"\ | |
938 OP(T, (%1), A, d)\ | |
939 "add %3, %1 \n\t" | |
940 | |
941 #define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ | |
942 "mov"#q" "#C", "#T" \n\t"\ | |
943 "mov"#d" (%0), "#F" \n\t"\ | |
944 "paddw "#D", "#T" \n\t"\ | |
945 "psllw $2, "#T" \n\t"\ | |
946 "paddw %4, "#A" \n\t"\ | |
947 "psubw "#B", "#T" \n\t"\ | |
948 "psubw "#E", "#T" \n\t"\ | |
949 "punpcklbw "#Z", "#F" \n\t"\ | |
950 "pmullw %3, "#T" \n\t"\ | |
951 "paddw "#F", "#A" \n\t"\ | |
952 "add %2, %0 \n\t"\ | |
953 "paddw "#A", "#T" \n\t"\ | |
954 "mov"#q" "#T", "#OF"(%1) \n\t" | |
955 | |
956 #define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) | |
957 #define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) | |
958 #define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) | |
959 #define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) | |
960 | |
961 | |
962 #define QPEL_H264(OPNAME, OP, MMX)\ | |
963 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
964 int h=4;\ | |
965 \ | |
966 __asm__ volatile(\ | |
967 "pxor %%mm7, %%mm7 \n\t"\ | |
968 "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\ | |
969 "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ | |
970 "1: \n\t"\ | |
971 "movd -1(%0), %%mm1 \n\t"\ | |
972 "movd (%0), %%mm2 \n\t"\ | |
973 "movd 1(%0), %%mm3 \n\t"\ | |
974 "movd 2(%0), %%mm0 \n\t"\ | |
975 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
976 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
977 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
978 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
979 "paddw %%mm0, %%mm1 \n\t"\ | |
980 "paddw %%mm3, %%mm2 \n\t"\ | |
981 "movd -2(%0), %%mm0 \n\t"\ | |
982 "movd 3(%0), %%mm3 \n\t"\ | |
983 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
984 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
985 "paddw %%mm3, %%mm0 \n\t"\ | |
986 "psllw $2, %%mm2 \n\t"\ | |
987 "psubw %%mm1, %%mm2 \n\t"\ | |
988 "pmullw %%mm4, %%mm2 \n\t"\ | |
989 "paddw %%mm5, %%mm0 \n\t"\ | |
990 "paddw %%mm2, %%mm0 \n\t"\ | |
991 "psraw $5, %%mm0 \n\t"\ | |
992 "packuswb %%mm0, %%mm0 \n\t"\ | |
993 OP(%%mm0, (%1),%%mm6, d)\ | |
994 "add %3, %0 \n\t"\ | |
995 "add %4, %1 \n\t"\ | |
996 "decl %2 \n\t"\ | |
997 " jnz 1b \n\t"\ | |
998 : "+a"(src), "+c"(dst), "+g"(h)\ | |
999 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ | |
1000 : "memory"\ | |
1001 );\ | |
1002 }\ | |
1003 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ | |
1004 int h=4;\ | |
1005 __asm__ volatile(\ | |
1006 "pxor %%mm7, %%mm7 \n\t"\ | |
1007 "movq %0, %%mm4 \n\t"\ | |
1008 "movq %1, %%mm5 \n\t"\ | |
1009 :: "m"(ff_pw_5), "m"(ff_pw_16)\ | |
1010 );\ | |
1011 do{\ | |
1012 __asm__ volatile(\ | |
1013 "movd -1(%0), %%mm1 \n\t"\ | |
1014 "movd (%0), %%mm2 \n\t"\ | |
1015 "movd 1(%0), %%mm3 \n\t"\ | |
1016 "movd 2(%0), %%mm0 \n\t"\ | |
1017 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1018 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1019 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
1020 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1021 "paddw %%mm0, %%mm1 \n\t"\ | |
1022 "paddw %%mm3, %%mm2 \n\t"\ | |
1023 "movd -2(%0), %%mm0 \n\t"\ | |
1024 "movd 3(%0), %%mm3 \n\t"\ | |
1025 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1026 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
1027 "paddw %%mm3, %%mm0 \n\t"\ | |
1028 "psllw $2, %%mm2 \n\t"\ | |
1029 "psubw %%mm1, %%mm2 \n\t"\ | |
1030 "pmullw %%mm4, %%mm2 \n\t"\ | |
1031 "paddw %%mm5, %%mm0 \n\t"\ | |
1032 "paddw %%mm2, %%mm0 \n\t"\ | |
1033 "movd (%2), %%mm3 \n\t"\ | |
1034 "psraw $5, %%mm0 \n\t"\ | |
1035 "packuswb %%mm0, %%mm0 \n\t"\ | |
1036 PAVGB" %%mm3, %%mm0 \n\t"\ | |
1037 OP(%%mm0, (%1),%%mm6, d)\ | |
1038 "add %4, %0 \n\t"\ | |
1039 "add %4, %1 \n\t"\ | |
1040 "add %3, %2 \n\t"\ | |
1041 : "+a"(src), "+c"(dst), "+d"(src2)\ | |
1042 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ | |
1043 : "memory"\ | |
1044 );\ | |
1045 }while(--h);\ | |
1046 }\ | |
1047 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1048 src -= 2*srcStride;\ | |
1049 __asm__ volatile(\ | |
1050 "pxor %%mm7, %%mm7 \n\t"\ | |
1051 "movd (%0), %%mm0 \n\t"\ | |
1052 "add %2, %0 \n\t"\ | |
1053 "movd (%0), %%mm1 \n\t"\ | |
1054 "add %2, %0 \n\t"\ | |
1055 "movd (%0), %%mm2 \n\t"\ | |
1056 "add %2, %0 \n\t"\ | |
1057 "movd (%0), %%mm3 \n\t"\ | |
1058 "add %2, %0 \n\t"\ | |
1059 "movd (%0), %%mm4 \n\t"\ | |
1060 "add %2, %0 \n\t"\ | |
1061 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1062 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1063 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1064 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
1065 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
1066 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | |
1067 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | |
1068 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | |
1069 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | |
1070 \ | |
1071 : "+a"(src), "+c"(dst)\ | |
1072 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
1073 : "memory"\ | |
1074 );\ | |
1075 }\ | |
1076 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
1077 int h=4;\ | |
1078 int w=3;\ | |
1079 src -= 2*srcStride+2;\ | |
1080 while(w--){\ | |
1081 __asm__ volatile(\ | |
1082 "pxor %%mm7, %%mm7 \n\t"\ | |
1083 "movd (%0), %%mm0 \n\t"\ | |
1084 "add %2, %0 \n\t"\ | |
1085 "movd (%0), %%mm1 \n\t"\ | |
1086 "add %2, %0 \n\t"\ | |
1087 "movd (%0), %%mm2 \n\t"\ | |
1088 "add %2, %0 \n\t"\ | |
1089 "movd (%0), %%mm3 \n\t"\ | |
1090 "add %2, %0 \n\t"\ | |
1091 "movd (%0), %%mm4 \n\t"\ | |
1092 "add %2, %0 \n\t"\ | |
1093 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1094 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1095 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1096 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
1097 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
1098 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ | |
1099 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ | |
1100 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ | |
1101 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ | |
1102 \ | |
1103 : "+a"(src)\ | |
1104 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
1105 : "memory"\ | |
1106 );\ | |
1107 tmp += 4;\ | |
1108 src += 4 - 9*srcStride;\ | |
1109 }\ | |
1110 tmp -= 3*4;\ | |
1111 __asm__ volatile(\ | |
1112 "1: \n\t"\ | |
1113 "movq (%0), %%mm0 \n\t"\ | |
1114 "paddw 10(%0), %%mm0 \n\t"\ | |
1115 "movq 2(%0), %%mm1 \n\t"\ | |
1116 "paddw 8(%0), %%mm1 \n\t"\ | |
1117 "movq 4(%0), %%mm2 \n\t"\ | |
1118 "paddw 6(%0), %%mm2 \n\t"\ | |
1119 "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ | |
1120 "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ | |
1121 "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ | |
1122 "paddsw %%mm2, %%mm0 \n\t"\ | |
1123 "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\ | |
1124 "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\ | |
1125 "psraw $6, %%mm0 \n\t"\ | |
1126 "packuswb %%mm0, %%mm0 \n\t"\ | |
1127 OP(%%mm0, (%1),%%mm7, d)\ | |
1128 "add $24, %0 \n\t"\ | |
1129 "add %3, %1 \n\t"\ | |
1130 "decl %2 \n\t"\ | |
1131 " jnz 1b \n\t"\ | |
1132 : "+a"(tmp), "+c"(dst), "+g"(h)\ | |
1133 : "S"((x86_reg)dstStride)\ | |
1134 : "memory"\ | |
1135 );\ | |
1136 }\ | |
1137 \ | |
1138 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1139 int h=8;\ | |
1140 __asm__ volatile(\ | |
1141 "pxor %%mm7, %%mm7 \n\t"\ | |
1142 "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ | |
1143 "1: \n\t"\ | |
1144 "movq (%0), %%mm0 \n\t"\ | |
1145 "movq 1(%0), %%mm2 \n\t"\ | |
1146 "movq %%mm0, %%mm1 \n\t"\ | |
1147 "movq %%mm2, %%mm3 \n\t"\ | |
1148 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1149 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
1150 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1151 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1152 "paddw %%mm2, %%mm0 \n\t"\ | |
1153 "paddw %%mm3, %%mm1 \n\t"\ | |
1154 "psllw $2, %%mm0 \n\t"\ | |
1155 "psllw $2, %%mm1 \n\t"\ | |
1156 "movq -1(%0), %%mm2 \n\t"\ | |
1157 "movq 2(%0), %%mm4 \n\t"\ | |
1158 "movq %%mm2, %%mm3 \n\t"\ | |
1159 "movq %%mm4, %%mm5 \n\t"\ | |
1160 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1161 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1162 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
1163 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
1164 "paddw %%mm4, %%mm2 \n\t"\ | |
1165 "paddw %%mm3, %%mm5 \n\t"\ | |
1166 "psubw %%mm2, %%mm0 \n\t"\ | |
1167 "psubw %%mm5, %%mm1 \n\t"\ | |
1168 "pmullw %%mm6, %%mm0 \n\t"\ | |
1169 "pmullw %%mm6, %%mm1 \n\t"\ | |
1170 "movd -2(%0), %%mm2 \n\t"\ | |
1171 "movd 7(%0), %%mm5 \n\t"\ | |
1172 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1173 "punpcklbw %%mm7, %%mm5 \n\t"\ | |
1174 "paddw %%mm3, %%mm2 \n\t"\ | |
1175 "paddw %%mm5, %%mm4 \n\t"\ | |
1176 "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ | |
1177 "paddw %%mm5, %%mm2 \n\t"\ | |
1178 "paddw %%mm5, %%mm4 \n\t"\ | |
1179 "paddw %%mm2, %%mm0 \n\t"\ | |
1180 "paddw %%mm4, %%mm1 \n\t"\ | |
1181 "psraw $5, %%mm0 \n\t"\ | |
1182 "psraw $5, %%mm1 \n\t"\ | |
1183 "packuswb %%mm1, %%mm0 \n\t"\ | |
1184 OP(%%mm0, (%1),%%mm5, q)\ | |
1185 "add %3, %0 \n\t"\ | |
1186 "add %4, %1 \n\t"\ | |
1187 "decl %2 \n\t"\ | |
1188 " jnz 1b \n\t"\ | |
1189 : "+a"(src), "+c"(dst), "+g"(h)\ | |
1190 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ | |
1191 : "memory"\ | |
1192 );\ | |
1193 }\ | |
1194 \ | |
1195 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ | |
1196 int h=8;\ | |
1197 __asm__ volatile(\ | |
1198 "pxor %%mm7, %%mm7 \n\t"\ | |
1199 "movq %0, %%mm6 \n\t"\ | |
1200 :: "m"(ff_pw_5)\ | |
1201 );\ | |
1202 do{\ | |
1203 __asm__ volatile(\ | |
1204 "movq (%0), %%mm0 \n\t"\ | |
1205 "movq 1(%0), %%mm2 \n\t"\ | |
1206 "movq %%mm0, %%mm1 \n\t"\ | |
1207 "movq %%mm2, %%mm3 \n\t"\ | |
1208 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1209 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
1210 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1211 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1212 "paddw %%mm2, %%mm0 \n\t"\ | |
1213 "paddw %%mm3, %%mm1 \n\t"\ | |
1214 "psllw $2, %%mm0 \n\t"\ | |
1215 "psllw $2, %%mm1 \n\t"\ | |
1216 "movq -1(%0), %%mm2 \n\t"\ | |
1217 "movq 2(%0), %%mm4 \n\t"\ | |
1218 "movq %%mm2, %%mm3 \n\t"\ | |
1219 "movq %%mm4, %%mm5 \n\t"\ | |
1220 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1221 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1222 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
1223 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
1224 "paddw %%mm4, %%mm2 \n\t"\ | |
1225 "paddw %%mm3, %%mm5 \n\t"\ | |
1226 "psubw %%mm2, %%mm0 \n\t"\ | |
1227 "psubw %%mm5, %%mm1 \n\t"\ | |
1228 "pmullw %%mm6, %%mm0 \n\t"\ | |
1229 "pmullw %%mm6, %%mm1 \n\t"\ | |
1230 "movd -2(%0), %%mm2 \n\t"\ | |
1231 "movd 7(%0), %%mm5 \n\t"\ | |
1232 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1233 "punpcklbw %%mm7, %%mm5 \n\t"\ | |
1234 "paddw %%mm3, %%mm2 \n\t"\ | |
1235 "paddw %%mm5, %%mm4 \n\t"\ | |
1236 "movq %5, %%mm5 \n\t"\ | |
1237 "paddw %%mm5, %%mm2 \n\t"\ | |
1238 "paddw %%mm5, %%mm4 \n\t"\ | |
1239 "paddw %%mm2, %%mm0 \n\t"\ | |
1240 "paddw %%mm4, %%mm1 \n\t"\ | |
1241 "psraw $5, %%mm0 \n\t"\ | |
1242 "psraw $5, %%mm1 \n\t"\ | |
1243 "movq (%2), %%mm4 \n\t"\ | |
1244 "packuswb %%mm1, %%mm0 \n\t"\ | |
1245 PAVGB" %%mm4, %%mm0 \n\t"\ | |
1246 OP(%%mm0, (%1),%%mm5, q)\ | |
1247 "add %4, %0 \n\t"\ | |
1248 "add %4, %1 \n\t"\ | |
1249 "add %3, %2 \n\t"\ | |
1250 : "+a"(src), "+c"(dst), "+d"(src2)\ | |
1251 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ | |
1252 "m"(ff_pw_16)\ | |
1253 : "memory"\ | |
1254 );\ | |
1255 }while(--h);\ | |
1256 }\ | |
1257 \ | |
1258 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1259 int w= 2;\ | |
1260 src -= 2*srcStride;\ | |
1261 \ | |
1262 while(w--){\ | |
1263 __asm__ volatile(\ | |
1264 "pxor %%mm7, %%mm7 \n\t"\ | |
1265 "movd (%0), %%mm0 \n\t"\ | |
1266 "add %2, %0 \n\t"\ | |
1267 "movd (%0), %%mm1 \n\t"\ | |
1268 "add %2, %0 \n\t"\ | |
1269 "movd (%0), %%mm2 \n\t"\ | |
1270 "add %2, %0 \n\t"\ | |
1271 "movd (%0), %%mm3 \n\t"\ | |
1272 "add %2, %0 \n\t"\ | |
1273 "movd (%0), %%mm4 \n\t"\ | |
1274 "add %2, %0 \n\t"\ | |
1275 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1276 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1277 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1278 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
1279 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
1280 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | |
1281 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | |
1282 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | |
1283 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | |
1284 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ | |
1285 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ | |
1286 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | |
1287 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | |
1288 \ | |
1289 : "+a"(src), "+c"(dst)\ | |
1290 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
1291 : "memory"\ | |
1292 );\ | |
1293 if(h==16){\ | |
1294 __asm__ volatile(\ | |
1295 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | |
1296 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | |
1297 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ | |
1298 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ | |
1299 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | |
1300 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | |
1301 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | |
1302 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | |
1303 \ | |
1304 : "+a"(src), "+c"(dst)\ | |
1305 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
1306 : "memory"\ | |
1307 );\ | |
1308 }\ | |
1309 src += 4-(h+5)*srcStride;\ | |
1310 dst += 4-h*dstStride;\ | |
1311 }\ | |
1312 }\ | |
1313 static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ | |
1314 int w = (size+8)>>2;\ | |
1315 src -= 2*srcStride+2;\ | |
1316 while(w--){\ | |
1317 __asm__ volatile(\ | |
1318 "pxor %%mm7, %%mm7 \n\t"\ | |
1319 "movd (%0), %%mm0 \n\t"\ | |
1320 "add %2, %0 \n\t"\ | |
1321 "movd (%0), %%mm1 \n\t"\ | |
1322 "add %2, %0 \n\t"\ | |
1323 "movd (%0), %%mm2 \n\t"\ | |
1324 "add %2, %0 \n\t"\ | |
1325 "movd (%0), %%mm3 \n\t"\ | |
1326 "add %2, %0 \n\t"\ | |
1327 "movd (%0), %%mm4 \n\t"\ | |
1328 "add %2, %0 \n\t"\ | |
1329 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1330 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1331 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1332 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
1333 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
1334 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ | |
1335 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ | |
1336 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\ | |
1337 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ | |
1338 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ | |
1339 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ | |
1340 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ | |
1341 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ | |
1342 : "+a"(src)\ | |
1343 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
1344 : "memory"\ | |
1345 );\ | |
1346 if(size==16){\ | |
1347 __asm__ volatile(\ | |
1348 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ | |
1349 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ | |
1350 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ | |
1351 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ | |
1352 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ | |
1353 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ | |
1354 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ | |
1355 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ | |
1356 : "+a"(src)\ | |
1357 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
1358 : "memory"\ | |
1359 );\ | |
1360 }\ | |
1361 tmp += 4;\ | |
1362 src += 4 - (size+5)*srcStride;\ | |
1363 }\ | |
1364 }\ | |
1365 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ | |
1366 int w = size>>4;\ | |
1367 do{\ | |
1368 int h = size;\ | |
1369 __asm__ volatile(\ | |
1370 "1: \n\t"\ | |
1371 "movq (%0), %%mm0 \n\t"\ | |
1372 "movq 8(%0), %%mm3 \n\t"\ | |
1373 "movq 2(%0), %%mm1 \n\t"\ | |
1374 "movq 10(%0), %%mm4 \n\t"\ | |
1375 "paddw %%mm4, %%mm0 \n\t"\ | |
1376 "paddw %%mm3, %%mm1 \n\t"\ | |
1377 "paddw 18(%0), %%mm3 \n\t"\ | |
1378 "paddw 16(%0), %%mm4 \n\t"\ | |
1379 "movq 4(%0), %%mm2 \n\t"\ | |
1380 "movq 12(%0), %%mm5 \n\t"\ | |
1381 "paddw 6(%0), %%mm2 \n\t"\ | |
1382 "paddw 14(%0), %%mm5 \n\t"\ | |
1383 "psubw %%mm1, %%mm0 \n\t"\ | |
1384 "psubw %%mm4, %%mm3 \n\t"\ | |
1385 "psraw $2, %%mm0 \n\t"\ | |
1386 "psraw $2, %%mm3 \n\t"\ | |
1387 "psubw %%mm1, %%mm0 \n\t"\ | |
1388 "psubw %%mm4, %%mm3 \n\t"\ | |
1389 "paddsw %%mm2, %%mm0 \n\t"\ | |
1390 "paddsw %%mm5, %%mm3 \n\t"\ | |
1391 "psraw $2, %%mm0 \n\t"\ | |
1392 "psraw $2, %%mm3 \n\t"\ | |
1393 "paddw %%mm2, %%mm0 \n\t"\ | |
1394 "paddw %%mm5, %%mm3 \n\t"\ | |
1395 "psraw $6, %%mm0 \n\t"\ | |
1396 "psraw $6, %%mm3 \n\t"\ | |
1397 "packuswb %%mm3, %%mm0 \n\t"\ | |
1398 OP(%%mm0, (%1),%%mm7, q)\ | |
1399 "add $48, %0 \n\t"\ | |
1400 "add %3, %1 \n\t"\ | |
1401 "decl %2 \n\t"\ | |
1402 " jnz 1b \n\t"\ | |
1403 : "+a"(tmp), "+c"(dst), "+g"(h)\ | |
1404 : "S"((x86_reg)dstStride)\ | |
1405 : "memory"\ | |
1406 );\ | |
1407 tmp += 8 - size*24;\ | |
1408 dst += 8 - size*dstStride;\ | |
1409 }while(w--);\ | |
1410 }\ | |
1411 \ | |
1412 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1413 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ | |
1414 }\ | |
1415 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1416 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ | |
1417 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ | |
1418 }\ | |
1419 \ | |
1420 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1421 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ | |
1422 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ | |
1423 src += 8*srcStride;\ | |
1424 dst += 8*dstStride;\ | |
1425 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ | |
1426 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ | |
1427 }\ | |
1428 \ | |
1429 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ | |
1430 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ | |
1431 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ | |
1432 src += 8*dstStride;\ | |
1433 dst += 8*dstStride;\ | |
1434 src2 += 8*src2Stride;\ | |
1435 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ | |
1436 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ | |
1437 }\ | |
1438 \ | |
1439 static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ | |
1440 put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ | |
1441 OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ | |
1442 }\ | |
1443 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
1444 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ | |
1445 }\ | |
1446 \ | |
1447 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
1448 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ | |
1449 }\ | |
1450 \ | |
1451 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ | |
1452 {\ | |
1453 __asm__ volatile(\ | |
1454 "movq (%1), %%mm0 \n\t"\ | |
1455 "movq 24(%1), %%mm1 \n\t"\ | |
1456 "psraw $5, %%mm0 \n\t"\ | |
1457 "psraw $5, %%mm1 \n\t"\ | |
1458 "packuswb %%mm0, %%mm0 \n\t"\ | |
1459 "packuswb %%mm1, %%mm1 \n\t"\ | |
1460 PAVGB" (%0), %%mm0 \n\t"\ | |
1461 PAVGB" (%0,%3), %%mm1 \n\t"\ | |
1462 OP(%%mm0, (%2), %%mm4, d)\ | |
1463 OP(%%mm1, (%2,%4), %%mm5, d)\ | |
1464 "lea (%0,%3,2), %0 \n\t"\ | |
1465 "lea (%2,%4,2), %2 \n\t"\ | |
1466 "movq 48(%1), %%mm0 \n\t"\ | |
1467 "movq 72(%1), %%mm1 \n\t"\ | |
1468 "psraw $5, %%mm0 \n\t"\ | |
1469 "psraw $5, %%mm1 \n\t"\ | |
1470 "packuswb %%mm0, %%mm0 \n\t"\ | |
1471 "packuswb %%mm1, %%mm1 \n\t"\ | |
1472 PAVGB" (%0), %%mm0 \n\t"\ | |
1473 PAVGB" (%0,%3), %%mm1 \n\t"\ | |
1474 OP(%%mm0, (%2), %%mm4, d)\ | |
1475 OP(%%mm1, (%2,%4), %%mm5, d)\ | |
1476 :"+a"(src8), "+c"(src16), "+d"(dst)\ | |
1477 :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\ | |
1478 :"memory");\ | |
1479 }\ | |
1480 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ | |
1481 {\ | |
1482 do{\ | |
1483 __asm__ volatile(\ | |
1484 "movq (%1), %%mm0 \n\t"\ | |
1485 "movq 8(%1), %%mm1 \n\t"\ | |
1486 "movq 48(%1), %%mm2 \n\t"\ | |
1487 "movq 8+48(%1), %%mm3 \n\t"\ | |
1488 "psraw $5, %%mm0 \n\t"\ | |
1489 "psraw $5, %%mm1 \n\t"\ | |
1490 "psraw $5, %%mm2 \n\t"\ | |
1491 "psraw $5, %%mm3 \n\t"\ | |
1492 "packuswb %%mm1, %%mm0 \n\t"\ | |
1493 "packuswb %%mm3, %%mm2 \n\t"\ | |
1494 PAVGB" (%0), %%mm0 \n\t"\ | |
1495 PAVGB" (%0,%3), %%mm2 \n\t"\ | |
1496 OP(%%mm0, (%2), %%mm5, q)\ | |
1497 OP(%%mm2, (%2,%4), %%mm5, q)\ | |
1498 ::"a"(src8), "c"(src16), "d"(dst),\ | |
1499 "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ | |
1500 :"memory");\ | |
1501 src8 += 2L*src8Stride;\ | |
1502 src16 += 48;\ | |
1503 dst += 2L*dstStride;\ | |
1504 }while(h-=2);\ | |
1505 }\ | |
1506 static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ | |
1507 {\ | |
1508 OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ | |
1509 OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ | |
1510 }\ | |
1511 | |
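
(Aside on the arithmetic in the *_hv_lowpass second pass above: the comments /* a-b (abccba) */ through /* (a-5*b+20*c)/16 */ describe a multiplier-free evaluation of the 6-tap weights, where a, b and c are the three symmetric tap-pair sums. In exact arithmetic, i.e. ignoring the truncation of each psraw, the identity being used is

\[
\frac{\frac{a-b}{4} - b + c}{4} + c \;=\; \frac{a-b}{16} - \frac{b}{4} + \frac{5c}{4} \;=\; \frac{a - 5b + 20c}{16}
\]

and the final psraw $6 appears to complete the usual (x + 512) >> 10 normalisation of the separable [1, -5, 20, 20, -5, 1] filter, the 512 coming from the +16 rounding that the first pass adds to every column result.)
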
1512 | |
1513 #if ARCH_X86_64 | |
1514 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ | |
1515 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ | |
1516 int h=16;\ | |
1517 __asm__ volatile(\ | |
1518 "pxor %%xmm15, %%xmm15 \n\t"\ | |
1519 "movdqa %6, %%xmm14 \n\t"\ | |
1520 "movdqa %7, %%xmm13 \n\t"\ | |
1521 "1: \n\t"\ | |
1522 "lddqu 6(%0), %%xmm1 \n\t"\ | |
1523 "lddqu -2(%0), %%xmm7 \n\t"\ | |
1524 "movdqa %%xmm1, %%xmm0 \n\t"\ | |
1525 "punpckhbw %%xmm15, %%xmm1 \n\t"\ | |
1526 "punpcklbw %%xmm15, %%xmm0 \n\t"\ | |
1527 "punpcklbw %%xmm15, %%xmm7 \n\t"\ | |
1528 "movdqa %%xmm1, %%xmm2 \n\t"\ | |
1529 "movdqa %%xmm0, %%xmm6 \n\t"\ | |
1530 "movdqa %%xmm1, %%xmm3 \n\t"\ | |
1531 "movdqa %%xmm0, %%xmm8 \n\t"\ | |
1532 "movdqa %%xmm1, %%xmm4 \n\t"\ | |
1533 "movdqa %%xmm0, %%xmm9 \n\t"\ | |
1534 "movdqa %%xmm0, %%xmm12 \n\t"\ | |
1535 "movdqa %%xmm1, %%xmm11 \n\t"\ | |
1536 "palignr $10,%%xmm0, %%xmm11\n\t"\ | |
1537 "palignr $10,%%xmm7, %%xmm12\n\t"\ | |
1538 "palignr $2, %%xmm0, %%xmm4 \n\t"\ | |
1539 "palignr $2, %%xmm7, %%xmm9 \n\t"\ | |
1540 "palignr $4, %%xmm0, %%xmm3 \n\t"\ | |
1541 "palignr $4, %%xmm7, %%xmm8 \n\t"\ | |
1542 "palignr $6, %%xmm0, %%xmm2 \n\t"\ | |
1543 "palignr $6, %%xmm7, %%xmm6 \n\t"\ | |
1544 "paddw %%xmm0 ,%%xmm11 \n\t"\ | |
1545 "palignr $8, %%xmm0, %%xmm1 \n\t"\ | |
1546 "palignr $8, %%xmm7, %%xmm0 \n\t"\ | |
1547 "paddw %%xmm12,%%xmm7 \n\t"\ | |
1548 "paddw %%xmm3, %%xmm2 \n\t"\ | |
1549 "paddw %%xmm8, %%xmm6 \n\t"\ | |
1550 "paddw %%xmm4, %%xmm1 \n\t"\ | |
1551 "paddw %%xmm9, %%xmm0 \n\t"\ | |
1552 "psllw $2, %%xmm2 \n\t"\ | |
1553 "psllw $2, %%xmm6 \n\t"\ | |
1554 "psubw %%xmm1, %%xmm2 \n\t"\ | |
1555 "psubw %%xmm0, %%xmm6 \n\t"\ | |
1556 "paddw %%xmm13,%%xmm11 \n\t"\ | |
1557 "paddw %%xmm13,%%xmm7 \n\t"\ | |
1558 "pmullw %%xmm14,%%xmm2 \n\t"\ | |
1559 "pmullw %%xmm14,%%xmm6 \n\t"\ | |
1560 "lddqu (%2), %%xmm3 \n\t"\ | |
1561 "paddw %%xmm11,%%xmm2 \n\t"\ | |
1562 "paddw %%xmm7, %%xmm6 \n\t"\ | |
1563 "psraw $5, %%xmm2 \n\t"\ | |
1564 "psraw $5, %%xmm6 \n\t"\ | |
1565 "packuswb %%xmm2,%%xmm6 \n\t"\ | |
1566 "pavgb %%xmm3, %%xmm6 \n\t"\ | |
1567 OP(%%xmm6, (%1), %%xmm4, dqa)\ | |
1568 "add %5, %0 \n\t"\ | |
1569 "add %5, %1 \n\t"\ | |
1570 "add %4, %2 \n\t"\ | |
1571 "decl %3 \n\t"\ | |
1572 "jg 1b \n\t"\ | |
1573 : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ | |
1574 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ | |
1575 "m"(ff_pw_5), "m"(ff_pw_16)\ | |
1576 : "memory"\ | |
1577 );\ | |
1578 } | |
1579 #else // ARCH_X86_64 | |
1580 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ | |
1581 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ | |
1582 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ | |
1583 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ | |
1584 src += 8*dstStride;\ | |
1585 dst += 8*dstStride;\ | |
1586 src2 += 8*src2Stride;\ | |
1587 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ | |
1588 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ | |
1589 } | |
1590 #endif // ARCH_X86_64 | |
1591 | |
1592 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ | |
1593 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ | |
1594 int h=8;\ | |
1595 __asm__ volatile(\ | |
1596 "pxor %%xmm7, %%xmm7 \n\t"\ | |
1597 "movdqa %0, %%xmm6 \n\t"\ | |
1598 :: "m"(ff_pw_5)\ | |
1599 );\ | |
1600 do{\ | |
1601 __asm__ volatile(\ | |
1602 "lddqu -2(%0), %%xmm1 \n\t"\ | |
1603 "movdqa %%xmm1, %%xmm0 \n\t"\ | |
1604 "punpckhbw %%xmm7, %%xmm1 \n\t"\ | |
1605 "punpcklbw %%xmm7, %%xmm0 \n\t"\ | |
1606 "movdqa %%xmm1, %%xmm2 \n\t"\ | |
1607 "movdqa %%xmm1, %%xmm3 \n\t"\ | |
1608 "movdqa %%xmm1, %%xmm4 \n\t"\ | |
1609 "movdqa %%xmm1, %%xmm5 \n\t"\ | |
1610 "palignr $2, %%xmm0, %%xmm4 \n\t"\ | |
1611 "palignr $4, %%xmm0, %%xmm3 \n\t"\ | |
1612 "palignr $6, %%xmm0, %%xmm2 \n\t"\ | |
1613 "palignr $8, %%xmm0, %%xmm1 \n\t"\ | |
1614 "palignr $10,%%xmm0, %%xmm5 \n\t"\ | |
1615 "paddw %%xmm5, %%xmm0 \n\t"\ | |
1616 "paddw %%xmm3, %%xmm2 \n\t"\ | |
1617 "paddw %%xmm4, %%xmm1 \n\t"\ | |
1618 "psllw $2, %%xmm2 \n\t"\ | |
1619 "movq (%2), %%xmm3 \n\t"\ | |
1620 "psubw %%xmm1, %%xmm2 \n\t"\ | |
1621 "paddw %5, %%xmm0 \n\t"\ | |
1622 "pmullw %%xmm6, %%xmm2 \n\t"\ | |
1623 "paddw %%xmm0, %%xmm2 \n\t"\ | |
1624 "psraw $5, %%xmm2 \n\t"\ | |
1625 "packuswb %%xmm2, %%xmm2 \n\t"\ | |
1626 "pavgb %%xmm3, %%xmm2 \n\t"\ | |
1627 OP(%%xmm2, (%1), %%xmm4, q)\ | |
1628 "add %4, %0 \n\t"\ | |
1629 "add %4, %1 \n\t"\ | |
1630 "add %3, %2 \n\t"\ | |
1631 : "+a"(src), "+c"(dst), "+d"(src2)\ | |
1632 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ | |
1633 "m"(ff_pw_16)\ | |
1634 : "memory"\ | |
1635 );\ | |
1636 }while(--h);\ | |
1637 }\ | |
1638 QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ | |
1639 \ | |
1640 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1641 int h=8;\ | |
1642 __asm__ volatile(\ | |
1643 "pxor %%xmm7, %%xmm7 \n\t"\ | |
1644 "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ | |
1645 "1: \n\t"\ | |
1646 "lddqu -2(%0), %%xmm1 \n\t"\ | |
1647 "movdqa %%xmm1, %%xmm0 \n\t"\ | |
1648 "punpckhbw %%xmm7, %%xmm1 \n\t"\ | |
1649 "punpcklbw %%xmm7, %%xmm0 \n\t"\ | |
1650 "movdqa %%xmm1, %%xmm2 \n\t"\ | |
1651 "movdqa %%xmm1, %%xmm3 \n\t"\ | |
1652 "movdqa %%xmm1, %%xmm4 \n\t"\ | |
1653 "movdqa %%xmm1, %%xmm5 \n\t"\ | |
1654 "palignr $2, %%xmm0, %%xmm4 \n\t"\ | |
1655 "palignr $4, %%xmm0, %%xmm3 \n\t"\ | |
1656 "palignr $6, %%xmm0, %%xmm2 \n\t"\ | |
1657 "palignr $8, %%xmm0, %%xmm1 \n\t"\ | |
1658 "palignr $10,%%xmm0, %%xmm5 \n\t"\ | |
1659 "paddw %%xmm5, %%xmm0 \n\t"\ | |
1660 "paddw %%xmm3, %%xmm2 \n\t"\ | |
1661 "paddw %%xmm4, %%xmm1 \n\t"\ | |
1662 "psllw $2, %%xmm2 \n\t"\ | |
1663 "psubw %%xmm1, %%xmm2 \n\t"\ | |
1664 "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\ | |
1665 "pmullw %%xmm6, %%xmm2 \n\t"\ | |
1666 "paddw %%xmm0, %%xmm2 \n\t"\ | |
1667 "psraw $5, %%xmm2 \n\t"\ | |
1668 "packuswb %%xmm2, %%xmm2 \n\t"\ | |
1669 OP(%%xmm2, (%1), %%xmm4, q)\ | |
1670 "add %3, %0 \n\t"\ | |
1671 "add %4, %1 \n\t"\ | |
1672 "decl %2 \n\t"\ | |
1673 " jnz 1b \n\t"\ | |
1674 : "+a"(src), "+c"(dst), "+g"(h)\ | |
1675 : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ | |
1676 : "memory"\ | |
1677 );\ | |
1678 }\ | |
1679 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1680 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ | |
1681 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ | |
1682 src += 8*srcStride;\ | |
1683 dst += 8*dstStride;\ | |
1684 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ | |
1685 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ | |
1686 }\ | |
1687 | |
1688 #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ | |
1689 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1690 src -= 2*srcStride;\ | |
1691 \ | |
1692 __asm__ volatile(\ | |
1693 "pxor %%xmm7, %%xmm7 \n\t"\ | |
1694 "movq (%0), %%xmm0 \n\t"\ | |
1695 "add %2, %0 \n\t"\ | |
1696 "movq (%0), %%xmm1 \n\t"\ | |
1697 "add %2, %0 \n\t"\ | |
1698 "movq (%0), %%xmm2 \n\t"\ | |
1699 "add %2, %0 \n\t"\ | |
1700 "movq (%0), %%xmm3 \n\t"\ | |
1701 "add %2, %0 \n\t"\ | |
1702 "movq (%0), %%xmm4 \n\t"\ | |
1703 "add %2, %0 \n\t"\ | |
1704 "punpcklbw %%xmm7, %%xmm0 \n\t"\ | |
1705 "punpcklbw %%xmm7, %%xmm1 \n\t"\ | |
1706 "punpcklbw %%xmm7, %%xmm2 \n\t"\ | |
1707 "punpcklbw %%xmm7, %%xmm3 \n\t"\ | |
1708 "punpcklbw %%xmm7, %%xmm4 \n\t"\ | |
1709 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ | |
1710 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ | |
1711 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ | |
1712 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ | |
1713 QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ | |
1714 QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ | |
1715 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ | |
1716 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ | |
1717 \ | |
1718 : "+a"(src), "+c"(dst)\ | |
1719 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
1720 : "memory"\ | |
1721 );\ | |
1722 if(h==16){\ | |
1723 __asm__ volatile(\ | |
1724 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ | |
1725 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ | |
1726 QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ | |
1727 QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ | |
1728 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ | |
1729 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ | |
1730 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ | |
1731 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ | |
1732 \ | |
1733 : "+a"(src), "+c"(dst)\ | |
1734 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
1735 : "memory"\ | |
1736 );\ | |
1737 }\ | |
1738 }\ | |
1739 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1740 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ | |
1741 }\ | |
1742 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1743 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ | |
1744 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ | |
1745 } | |
1746 | |
1747 static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ | |
1748 int w = (size+8)>>3; | |
1749 src -= 2*srcStride+2; | |
1750 while(w--){ | |
1751 __asm__ volatile( | |
1752 "pxor %%xmm7, %%xmm7 \n\t" | |
1753 "movq (%0), %%xmm0 \n\t" | |
1754 "add %2, %0 \n\t" | |
1755 "movq (%0), %%xmm1 \n\t" | |
1756 "add %2, %0 \n\t" | |
1757 "movq (%0), %%xmm2 \n\t" | |
1758 "add %2, %0 \n\t" | |
1759 "movq (%0), %%xmm3 \n\t" | |
1760 "add %2, %0 \n\t" | |
1761 "movq (%0), %%xmm4 \n\t" | |
1762 "add %2, %0 \n\t" | |
1763 "punpcklbw %%xmm7, %%xmm0 \n\t" | |
1764 "punpcklbw %%xmm7, %%xmm1 \n\t" | |
1765 "punpcklbw %%xmm7, %%xmm2 \n\t" | |
1766 "punpcklbw %%xmm7, %%xmm3 \n\t" | |
1767 "punpcklbw %%xmm7, %%xmm4 \n\t" | |
1768 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) | |
1769 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) | |
1770 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) | |
1771 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) | |
1772 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) | |
1773 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) | |
1774 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) | |
1775 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) | |
1776 : "+a"(src) | |
1777 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) | |
1778 : "memory" | |
1779 ); | |
1780 if(size==16){ | |
1781 __asm__ volatile( | |
1782 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) | |
1783 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) | |
1784 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) | |
1785 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) | |
1786 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) | |
1787 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) | |
1788 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) | |
1789 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) | |
1790 : "+a"(src) | |
1791 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) | |
1792 : "memory" | |
1793 ); | |
1794 } | |
1795 tmp += 8; | |
1796 src += 8 - (size+5)*srcStride; | |
1797 } | |
1798 } | |
1799 | |
1800 #define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ | |
1801 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ | |
1802 int h = size;\ | |
1803 if(size == 16){\ | |
1804 __asm__ volatile(\ | |
1805 "1: \n\t"\ | |
1806 "movdqa 32(%0), %%xmm4 \n\t"\ | |
1807 "movdqa 16(%0), %%xmm5 \n\t"\ | |
1808 "movdqa (%0), %%xmm7 \n\t"\ | |
1809 "movdqa %%xmm4, %%xmm3 \n\t"\ | |
1810 "movdqa %%xmm4, %%xmm2 \n\t"\ | |
1811 "movdqa %%xmm4, %%xmm1 \n\t"\ | |
1812 "movdqa %%xmm4, %%xmm0 \n\t"\ | |
1813 "palignr $10, %%xmm5, %%xmm0 \n\t"\ | |
1814 "palignr $8, %%xmm5, %%xmm1 \n\t"\ | |
1815 "palignr $6, %%xmm5, %%xmm2 \n\t"\ | |
1816 "palignr $4, %%xmm5, %%xmm3 \n\t"\ | |
1817 "palignr $2, %%xmm5, %%xmm4 \n\t"\ | |
1818 "paddw %%xmm5, %%xmm0 \n\t"\ | |
1819 "paddw %%xmm4, %%xmm1 \n\t"\ | |
1820 "paddw %%xmm3, %%xmm2 \n\t"\ | |
1821 "movdqa %%xmm5, %%xmm6 \n\t"\ | |
1822 "movdqa %%xmm5, %%xmm4 \n\t"\ | |
1823 "movdqa %%xmm5, %%xmm3 \n\t"\ | |
1824 "palignr $8, %%xmm7, %%xmm4 \n\t"\ | |
1825 "palignr $2, %%xmm7, %%xmm6 \n\t"\ | |
1826 "palignr $10, %%xmm7, %%xmm3 \n\t"\ | |
1827 "paddw %%xmm6, %%xmm4 \n\t"\ | |
1828 "movdqa %%xmm5, %%xmm6 \n\t"\ | |
1829 "palignr $6, %%xmm7, %%xmm5 \n\t"\ | |
1830 "palignr $4, %%xmm7, %%xmm6 \n\t"\ | |
1831 "paddw %%xmm7, %%xmm3 \n\t"\ | |
1832 "paddw %%xmm6, %%xmm5 \n\t"\ | |
1833 \ | |
1834 "psubw %%xmm1, %%xmm0 \n\t"\ | |
1835 "psubw %%xmm4, %%xmm3 \n\t"\ | |
1836 "psraw $2, %%xmm0 \n\t"\ | |
1837 "psraw $2, %%xmm3 \n\t"\ | |
1838 "psubw %%xmm1, %%xmm0 \n\t"\ | |
1839 "psubw %%xmm4, %%xmm3 \n\t"\ | |
1840 "paddw %%xmm2, %%xmm0 \n\t"\ | |
1841 "paddw %%xmm5, %%xmm3 \n\t"\ | |
1842 "psraw $2, %%xmm0 \n\t"\ | |
1843 "psraw $2, %%xmm3 \n\t"\ | |
1844 "paddw %%xmm2, %%xmm0 \n\t"\ | |
1845 "paddw %%xmm5, %%xmm3 \n\t"\ | |
1846 "psraw $6, %%xmm0 \n\t"\ | |
1847 "psraw $6, %%xmm3 \n\t"\ | |
1848 "packuswb %%xmm0, %%xmm3 \n\t"\ | |
1849 OP(%%xmm3, (%1), %%xmm7, dqa)\ | |
1850 "add $48, %0 \n\t"\ | |
1851 "add %3, %1 \n\t"\ | |
1852 "decl %2 \n\t"\ | |
1853 " jnz 1b \n\t"\ | |
1854 : "+a"(tmp), "+c"(dst), "+g"(h)\ | |
1855 : "S"((x86_reg)dstStride)\ | |
1856 : "memory"\ | |
1857 );\ | |
1858 }else{\ | |
1859 __asm__ volatile(\ | |
1860 "1: \n\t"\ | |
1861 "movdqa 16(%0), %%xmm1 \n\t"\ | |
1862 "movdqa (%0), %%xmm0 \n\t"\ | |
1863 "movdqa %%xmm1, %%xmm2 \n\t"\ | |
1864 "movdqa %%xmm1, %%xmm3 \n\t"\ | |
1865 "movdqa %%xmm1, %%xmm4 \n\t"\ | |
1866 "movdqa %%xmm1, %%xmm5 \n\t"\ | |
1867 "palignr $10, %%xmm0, %%xmm5 \n\t"\ | |
1868 "palignr $8, %%xmm0, %%xmm4 \n\t"\ | |
1869 "palignr $6, %%xmm0, %%xmm3 \n\t"\ | |
1870 "palignr $4, %%xmm0, %%xmm2 \n\t"\ | |
1871 "palignr $2, %%xmm0, %%xmm1 \n\t"\ | |
1872 "paddw %%xmm5, %%xmm0 \n\t"\ | |
1873 "paddw %%xmm4, %%xmm1 \n\t"\ | |
1874 "paddw %%xmm3, %%xmm2 \n\t"\ | |
1875 "psubw %%xmm1, %%xmm0 \n\t"\ | |
1876 "psraw $2, %%xmm0 \n\t"\ | |
1877 "psubw %%xmm1, %%xmm0 \n\t"\ | |
1878 "paddw %%xmm2, %%xmm0 \n\t"\ | |
1879 "psraw $2, %%xmm0 \n\t"\ | |
1880 "paddw %%xmm2, %%xmm0 \n\t"\ | |
1881 "psraw $6, %%xmm0 \n\t"\ | |
1882 "packuswb %%xmm0, %%xmm0 \n\t"\ | |
1883 OP(%%xmm0, (%1), %%xmm7, q)\ | |
1884 "add $48, %0 \n\t"\ | |
1885 "add %3, %1 \n\t"\ | |
1886 "decl %2 \n\t"\ | |
1887 " jnz 1b \n\t"\ | |
1888 : "+a"(tmp), "+c"(dst), "+g"(h)\ | |
1889 : "S"((x86_reg)dstStride)\ | |
1890 : "memory"\ | |
1891 );\ | |
1892 }\ | |
1893 } | |
1894 | |
1895 #define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ | |
1896 static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ | |
1897 put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ | |
1898 OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ | |
1899 }\ | |
1900 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
1901 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ | |
1902 }\ | |
1903 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
1904 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ | |
1905 }\ | |
1906 | |
1907 #define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 | |
1908 #define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 | |
1909 #define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 | |
1910 #define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 | |
1911 #define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 | |
1912 #define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 | |
1913 #define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 | |
1914 #define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 | |
1915 | |
1916 #define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 | |
1917 #define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 | |
1918 #define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 | |
1919 #define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 | |
1920 #define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 | |
1921 #define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 | |
1922 #define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 | |
1923 #define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 | |
1924 | |
1925 #define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 | |
1926 #define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 | |
1927 #define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 | |
1928 #define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 | |
1929 | |
1930 #define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 | |
1931 #define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 | |
1932 #define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 | |
1933 #define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 | |
1934 | |
1935 #define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 | |
1936 #define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 | |
1937 | |
1938 #define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ | |
1939 H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ | |
1940 H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ | |
1941 H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ | |
1942 H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ | |
1943 | |
1944 static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ | |
1945 put_pixels16_sse2(dst, src, stride, 16); | |
1946 } | |
1947 static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ | |
1948 avg_pixels16_sse2(dst, src, stride, 16); | |
1949 } | |
1950 #define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 | |
1951 #define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 | |
1952 | |
1953 #define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ | |
1954 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ | |
1955 OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ | |
1956 }\ | |
1957 | |
1958 #define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ | |
1959 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1960 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ | |
1961 }\ | |
1962 \ | |
1963 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1964 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ | |
1965 }\ | |
1966 \ | |
1967 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1968 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ | |
1969 }\ | |
1970 | |
1971 #define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ | |
1972 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1973 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ | |
1974 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ | |
1975 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ | |
1976 }\ | |
1977 \ | |
1978 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1979 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ | |
1980 }\ | |
1981 \ | |
1982 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1983 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ | |
1984 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ | |
1985 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ | |
1986 }\ | |
1987 | |
1988 #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ | |
1989 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1990 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ | |
1991 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ | |
1992 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ | |
1993 }\ | |
1994 \ | |
1995 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1996 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ | |
1997 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ | |
1998 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ | |
1999 }\ | |
2000 \ | |
2001 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2002 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ | |
2003 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ | |
2004 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ | |
2005 }\ | |
2006 \ | |
2007 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2008 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ | |
2009 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ | |
2010 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ | |
2011 }\ | |
2012 \ | |
2013 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2014 DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ | |
2015 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ | |
2016 }\ | |
2017 \ | |
2018 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2019 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ | |
2020 uint8_t * const halfHV= temp;\ | |
2021 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ | |
2022 assert(((int)temp & 7) == 0);\ | |
2023 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ | |
2024 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ | |
2025 }\ | |
2026 \ | |
2027 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2028 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ | |
2029 uint8_t * const halfHV= temp;\ | |
2030 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ | |
2031 assert(((int)temp & 7) == 0);\ | |
2032 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ | |
2033 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ | |
2034 }\ | |
2035 \ | |
2036 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2037 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ | |
2038 uint8_t * const halfHV= temp;\ | |
2039 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ | |
2040 assert(((int)temp & 7) == 0);\ | |
2041 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ | |
2042 OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ | |
2043 }\ | |
2044 \ | |
2045 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2046 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ | |
2047 uint8_t * const halfHV= temp;\ | |
2048 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ | |
2049 assert(((int)temp & 7) == 0);\ | |
2050 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ | |
2051 OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ | |
2052 }\ | |
2053 | |
2054 #define H264_MC_4816(MMX)\ | |
2055 H264_MC(put_, 4, MMX, 8)\ | |
2056 H264_MC(put_, 8, MMX, 8)\ | |
2057 H264_MC(put_, 16,MMX, 8)\ | |
2058 H264_MC(avg_, 4, MMX, 8)\ | |
2059 H264_MC(avg_, 8, MMX, 8)\ | |
2060 H264_MC(avg_, 16,MMX, 8)\ | |
2061 | |
2062 #define H264_MC_816(QPEL, XMM)\ | |
2063 QPEL(put_, 8, XMM, 16)\ | |
2064 QPEL(put_, 16,XMM, 16)\ | |
2065 QPEL(avg_, 8, XMM, 16)\ | |
2066 QPEL(avg_, 16,XMM, 16)\ | |
2067 | |
2068 | |
2069 #define AVG_3DNOW_OP(a,b,temp, size) \ | |
2070 "mov" #size " " #b ", " #temp " \n\t"\ | |
2071 "pavgusb " #temp ", " #a " \n\t"\ | |
2072 "mov" #size " " #a ", " #b " \n\t" | |
2073 #define AVG_MMX2_OP(a,b,temp, size) \ | |
2074 "mov" #size " " #b ", " #temp " \n\t"\ | |
2075 "pavgb " #temp ", " #a " \n\t"\ | |
2076 "mov" #size " " #a ", " #b " \n\t" | |
2077 | |
2078 #define PAVGB "pavgusb" | |
2079 QPEL_H264(put_, PUT_OP, 3dnow) | |
2080 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) | |
2081 #undef PAVGB | |
2082 #define PAVGB "pavgb" | |
2083 QPEL_H264(put_, PUT_OP, mmx2) | |
2084 QPEL_H264(avg_, AVG_MMX2_OP, mmx2) | |
2085 QPEL_H264_V_XMM(put_, PUT_OP, sse2) | |
2086 QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) | |
2087 QPEL_H264_HV_XMM(put_, PUT_OP, sse2) | |
2088 QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) | |
2089 #if HAVE_SSSE3 | |
2090 QPEL_H264_H_XMM(put_, PUT_OP, ssse3) | |
2091 QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) | |
2092 QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) | |
2093 QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) | |
2094 QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) | |
2095 QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) | |
2096 #endif | |
2097 #undef PAVGB | |
2098 | |
2099 H264_MC_4816(3dnow) | |
2100 H264_MC_4816(mmx2) | |
2101 H264_MC_816(H264_MC_V, sse2) | |
2102 H264_MC_816(H264_MC_HV, sse2) | |
2103 #if HAVE_SSSE3 | |
2104 H264_MC_816(H264_MC_H, ssse3) | |
2105 H264_MC_816(H264_MC_HV, ssse3) | |
2106 #endif | |
2107 | |
2108 /***********************************/ | |
2109 /* weighted prediction */ | 922 /* weighted prediction */ |
2110 | 923 |
2111 static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) | 924 static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) |
2112 { | 925 { |
2113 int x, y; | 926 int x, y; |
2215 int offset); | 1028 int offset); |
2216 | 1029 |
2217 void ff_h264_biweight_16x16_ssse3(uint8_t *dst, uint8_t *src, int stride, | 1030 void ff_h264_biweight_16x16_ssse3(uint8_t *dst, uint8_t *src, int stride, |
2218 int log2_denom, int weightd, int weights, | 1031 int log2_denom, int weightd, int weights, |
2219 int offset); | 1032 int offset); |
| 1033 |
| 1034 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); |
| 1035 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); |
| 1036 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); |
| 1037 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); |
| 1038 void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); |
| 1039 |
| 1040 #if HAVE_YASM && ARCH_X86_32 |
| 1041 void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); |
| 1042 static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) |
| 1043 { |
| 1044 ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); |
| 1045 ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); |
| 1046 } |
| 1047 #endif |
| 1048 |
| 1049 void ff_h264dsp_init_x86(H264DSPContext *c) |
| 1050 { |
| 1051 int mm_flags = mm_support(); |
| 1052 |
| 1053 if (mm_flags & FF_MM_MMX) { |
| 1054 c->h264_idct_dc_add= |
| 1055 c->h264_idct_add= ff_h264_idct_add_mmx; |
| 1056 c->h264_idct8_dc_add= |
| 1057 c->h264_idct8_add= ff_h264_idct8_add_mmx; |
| 1058 |
| 1059 c->h264_idct_add16 = ff_h264_idct_add16_mmx; |
| 1060 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; |
| 1061 c->h264_idct_add8 = ff_h264_idct_add8_mmx; |
| 1062 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; |
| 1063 |
| 1064 if (mm_flags & FF_MM_MMX2) { |
| 1065 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; |
| 1066 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; |
| 1067 c->h264_idct_add16 = ff_h264_idct_add16_mmx2; |
| 1068 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; |
| 1069 c->h264_idct_add8 = ff_h264_idct_add8_mmx2; |
| 1070 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; |
| 1071 |
| 1072 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; |
| 1073 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; |
| 1074 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; |
| 1075 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; |
| 1076 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; |
| 1077 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; |
| 1078 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; |
| 1079 |
| 1080 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; |
| 1081 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; |
| 1082 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; |
| 1083 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; |
| 1084 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; |
| 1085 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; |
| 1086 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; |
| 1087 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; |
| 1088 |
| 1089 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; |
| 1090 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; |
| 1091 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; |
| 1092 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; |
| 1093 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; |
| 1094 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; |
| 1095 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; |
| 1096 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; |
| 1097 } |
| 1098 if(mm_flags & FF_MM_SSE2){ |
| 1099 c->h264_idct8_add = ff_h264_idct8_add_sse2; |
| 1100 c->h264_idct8_add4= ff_h264_idct8_add4_sse2; |
| 1101 } |
| 1102 |
| 1103 #if HAVE_YASM |
| 1104 if (mm_flags & FF_MM_MMX2){ |
| 1105 #if ARCH_X86_32 |
| 1106 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; |
| 1107 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; |
| 1108 #endif |
| 1109 if( mm_flags&FF_MM_SSE2 ){ |
| 1110 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; |
| 1111 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; |
| 1112 #if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110 |
| 1113 c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; |
| 1114 c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; |
| 1115 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; |
| 1116 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; |
| 1117 #endif |
| 1118 #if CONFIG_GPL |
| 1119 c->h264_idct_add16 = ff_h264_idct_add16_sse2; |
| 1120 c->h264_idct_add8 = ff_h264_idct_add8_sse2; |
| 1121 c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; |
| 1122 #endif |
| 1123 } |
| 1124 if ( mm_flags&FF_MM_SSSE3 ){ |
| 1125 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; |
| 1126 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; |
| 1127 } |
| 1128 } |
| 1129 #endif |
| 1130 } |
| 1131 } |