Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12086:d780ae746855 libavcodec
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
author | rbultje |
---|---|
date | Sat, 03 Jul 2010 19:26:30 +0000 |
parents | 8527154f6e81 |
children | b246b214c2e9 |
comparison
equal
deleted
inserted
replaced
12085:8454bb880008 | 12086:d780ae746855 |
---|---|
144 | 144 |
145 pw_20091: times 4 dw 20091 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | 146 pw_17734: times 4 dw 17734 |
147 | 147 |
148 cextern pw_3 | 148 cextern pw_3 |
149 cextern pb_3 | |
149 cextern pw_4 | 150 cextern pw_4 |
151 cextern pb_4 | |
150 cextern pw_64 | 152 cextern pw_64 |
153 cextern pb_80 | |
154 cextern pb_F8 | |
155 cextern pb_FE | |
151 | 156 |
152 SECTION .text | 157 SECTION .text |
153 | 158 |
154 ;----------------------------------------------------------------------------- | 159 ;----------------------------------------------------------------------------- |
155 ; subpel MC functions: | 160 ; subpel MC functions: |
1061 add r0, 2*16*4 | 1066 add r0, 2*16*4 |
1062 SCATTER_WHT 2 | 1067 SCATTER_WHT 2 |
1063 add r0, 2*16*4 | 1068 add r0, 2*16*4 |
1064 SCATTER_WHT 3 | 1069 SCATTER_WHT 3 |
1065 RET | 1070 RET |
1071 | |
1072 ;----------------------------------------------------------------------------- | |
1073 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); | |
1074 ;----------------------------------------------------------------------------- | |
1075 | |
1076 ; macro called with 7 mm register indexes as argument, and 4 regular registers | |
1077 ; | |
1078 ; first 4 mm registers will carry the transposed pixel data | |
1079 ; the other three are scratchspace (one would be sufficient, but this allows | |
1080 ; for more spreading/pipelining and thus faster execution on OOE CPUs) | |
1081 ; | |
1082 ; first two regular registers are buf+4*stride and buf+5*stride | |
1083 ; third is -stride, fourth is +stride | |
1084 %macro READ_8x4_INTERLEAVED 11 | |
1085 ; interleave 8 (A-H) rows of 4 pixels each | |
1086 movd m%1, [%8+%10*4] ; A0-3 | |
1087 movd m%5, [%9+%10*4] ; B0-3 | |
1088 movd m%2, [%8+%10*2] ; C0-3 | |
1089 movd m%6, [%8+%10] ; D0-3 | |
1090 movd m%3, [%8] ; E0-3 | |
1091 movd m%7, [%9] ; F0-3 | |
1092 movd m%4, [%9+%11] ; G0-3 | |
1093 punpcklbw m%1, m%5 ; A/B interleaved | |
1094 movd m%5, [%9+%11*2] ; H0-3 | |
1095 punpcklbw m%2, m%6 ; C/D interleaved | |
1096 punpcklbw m%3, m%7 ; E/F interleaved | |
1097 punpcklbw m%4, m%5 ; G/H interleaved | |
1098 %endmacro | |
1099 | |
1100 ; macro called with 7 mm register indexes as argument, and 5 regular registers | |
1101 ; first 11 mean the same as READ_8x4_INTERLEAVED above | |
1102 ; fifth regular register is scratchspace to reach the bottom 8 rows, it | |
1103 ; will be set to second regular register + 8*stride at the end | |
1104 %macro READ_16x4_INTERLEAVED 12 | |
1105 ; transpose 16 (A-P) rows of 4 pixels each | |
1106 lea %12, [r0+8*r2] | |
1107 | |
1108 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M | |
1109 movd m%1, [%8+%10*4] ; A0-3 | |
1110 movd m%3, [%12+%10*4] ; I0-3 | |
1111 movd m%2, [%8+%10*2] ; C0-3 | |
1112 movd m%4, [%12+%10*2] ; K0-3 | |
1113 movd m%6, [%8+%10] ; D0-3 | |
1114 movd m%5, [%12+%10] ; L0-3 | |
1115 movd m%7, [%12] ; M0-3 | |
1116 add %12, %11 | |
1117 punpcklbw m%1, m%3 ; A/I | |
1118 movd m%3, [%8] ; E0-3 | |
1119 punpcklbw m%2, m%4 ; C/K | |
1120 punpcklbw m%6, m%5 ; D/L | |
1121 punpcklbw m%3, m%7 ; E/M | |
1122 punpcklbw m%2, m%6 ; C/D/K/L interleaved | |
1123 | |
1124 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P | |
1125 movd m%5, [%9+%10*4] ; B0-3 | |
1126 movd m%4, [%12+%10*4] ; J0-3 | |
1127 movd m%7, [%9] ; F0-3 | |
1128 movd m%6, [%12] ; N0-3 | |
1129 punpcklbw m%5, m%4 ; B/J | |
1130 punpcklbw m%7, m%6 ; F/N | |
1131 punpcklbw m%1, m%5 ; A/B/I/J interleaved | |
1132 punpcklbw m%3, m%7 ; E/F/M/N interleaved | |
1133 movd m%4, [%9+%11] ; G0-3 | |
1134 movd m%6, [%12+%11] ; O0-3 | |
1135 movd m%5, [%9+%11*2] ; H0-3 | |
1136 movd m%7, [%12+%11*2] ; P0-3 | |
1137 punpcklbw m%4, m%6 ; G/O | |
1138 punpcklbw m%5, m%7 ; H/P | |
1139 punpcklbw m%4, m%5 ; G/H/O/P interleaved | |
1140 %endmacro | |
1141 | |
1142 ; write 4 mm registers of 2 dwords each | |
1143 ; first four arguments are mm register indexes containing source data | |
1144 ; last four are registers containing buf+4*stride, buf+5*stride, | |
1145 ; -stride and +stride | |
1146 %macro WRITE_4x2D 8 | |
1147 ; write out (2 dwords per register) | |
1148 movd [%5+%7*4], m%1 | |
1149 movd [%5+%7*2], m%2 | |
1150 movd [%5], m%3 | |
1151 movd [%6+%8], m%4 | |
1152 punpckhdq m%1, m%1 | |
1153 punpckhdq m%2, m%2 | |
1154 punpckhdq m%3, m%3 | |
1155 punpckhdq m%4, m%4 | |
1156 movd [%6+%7*4], m%1 | |
1157 movd [%5+%7], m%2 | |
1158 movd [%6], m%3 | |
1159 movd [%6+%8*2], m%4 | |
1160 %endmacro | |
1161 | |
1162 ; write 4 xmm registers of 4 dwords each | |
1163 ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular | |
1164 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride | |
1165 ; we add 1*stride to the third regular register in the process | |
1166 %macro WRITE_4x4D 9 | |
1167 ; write out (4 dwords per register), start with dwords zero | |
1168 movd [%5+%8*4], m%1 | |
1169 movd [%5], m%2 | |
1170 movd [%5+%9*4], m%3 | |
1171 movd [%5+%9*8], m%4 | |
1172 | |
1173 ; store dwords 1 | |
1174 psrldq m%1, 4 | |
1175 psrldq m%2, 4 | |
1176 psrldq m%3, 4 | |
1177 psrldq m%4, 4 | |
1178 movd [%6+%8*4], m%1 | |
1179 movd [%6], m%2 | |
1180 movd [%6+%9*4], m%3 | |
1181 movd [%6+%9*8], m%4 | |
1182 | |
1183 ; write dwords 2 | |
1184 psrldq m%1, 4 | |
1185 psrldq m%2, 4 | |
1186 psrldq m%3, 4 | |
1187 psrldq m%4, 4 | |
1188 movd [%5+%8*2], m%1 | |
1189 movd [%6+%9], m%2 | |
1190 movd [%7+%8*2], m%3 | |
1191 movd [%7+%9*2], m%4 | |
1192 add %7, %9 | |
1193 | |
1194 ; store dwords 3 | |
1195 psrldq m%1, 4 | |
1196 psrldq m%2, 4 | |
1197 psrldq m%3, 4 | |
1198 psrldq m%4, 4 | |
1199 movd [%5+%8], m%1 | |
1200 movd [%6+%9*2], m%2 | |
1201 movd [%7+%8*2], m%3 | |
1202 movd [%7+%9*2], m%4 | |
1203 %endmacro | |
1204 | |
1205 %macro SIMPLE_LOOPFILTER 3 | |
1206 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 | |
1207 %ifidn %2, h | |
1208 mov r5, rsp ; backup stack pointer | |
1209 and rsp, ~(mmsize-1) ; align stack | |
1210 %endif | |
1211 %if mmsize == 8 ; mmx/mmxext | |
1212 mov r3, 2 | |
1213 %endif | |
1214 | |
1215 ; splat register with "flim" | |
1216 movd m7, r2 | |
1217 punpcklbw m7, m7 | |
1218 %if mmsize == 16 ; sse2 | |
1219 punpcklwd m7, m7 | |
1220 pshufd m7, m7, 0x0 | |
1221 %elifidn %1, mmx | |
1222 punpcklwd m7, m7 | |
1223 punpckldq m7, m7 | |
1224 %else ; mmxext | |
1225 pshufw m7, m7, 0x0 | |
1226 %endif | |
1227 | |
1228 ; set up indexes to address 4 rows | |
1229 mov r2, r1 | |
1230 neg r1 | |
1231 %ifidn %2, h | |
1232 lea r0, [r0+4*r2-2] | |
1233 sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1 | |
1234 %endif | |
1235 | |
1236 %if mmsize == 8 ; mmx / mmxext | |
1237 .next8px | |
1238 %endif | |
1239 %ifidn %2, v | |
1240 ; read 4 half/full rows of pixels | |
1241 mova m0, [r0+r1*2] ; p1 | |
1242 mova m1, [r0+r1] ; p0 | |
1243 mova m2, [r0] ; q0 | |
1244 mova m3, [r0+r2] ; q1 | |
1245 %else ; h | |
1246 lea r4, [r0+r2] | |
1247 | |
1248 %if mmsize == 8 ; mmx/mmxext | |
1249 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 | |
1250 %else ; sse2 | |
1251 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 | |
1252 %endif | |
1253 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1254 | |
1255 mova [rsp], m0 ; store p1 | |
1256 mova [rsp+mmsize], m3 ; store q1 | |
1257 %endif | |
1258 | |
1259 ; simple_limit | |
1260 mova m5, m2 ; m5=backup of q0 | |
1261 mova m6, m1 ; m6=backup of p0 | |
1262 psubusb m1, m2 ; p0-q0 | |
1263 psubusb m2, m6 ; q0-p0 | |
1264 por m1, m2 ; FFABS(p0-q0) | |
1265 paddusb m1, m1 ; m1=FFABS(p0-q0)*2 | |
1266 | |
1267 mova m4, m3 | |
1268 mova m2, m0 | |
1269 psubusb m3, m0 ; q1-p1 | |
1270 psubusb m0, m4 ; p1-q1 | |
1271 por m3, m0 ; FFABS(p1-q1) | |
1272 mova m0, [pb_80] | |
1273 pxor m2, m0 | |
1274 pxor m4, m0 | |
1275 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below | |
1276 pand m3, [pb_FE] | |
1277 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed | |
1278 paddusb m3, m1 | |
1279 psubusb m3, m7 | |
1280 pxor m1, m1 | |
1281 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) | |
1282 | |
1283 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) | |
1284 mova m4, m5 | |
1285 pxor m5, m0 | |
1286 pxor m0, m6 | |
1287 psubsb m5, m0 ; q0-p0 (signed) | |
1288 paddsb m2, m5 | |
1289 paddsb m2, m5 | |
1290 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) | |
1291 pand m2, m3 ; apply filter mask (m3) | |
1292 | |
1293 mova m3, [pb_F8] | |
1294 mova m1, m2 | |
1295 paddsb m2, [pb_4] ; f1<<3=a+4 | |
1296 paddsb m1, [pb_3] ; f2<<3=a+3 | |
1297 pand m2, m3 | |
1298 pand m1, m3 ; cache f2<<3 | |
1299 | |
1300 pxor m0, m0 | |
1301 pxor m3, m3 | |
1302 pcmpgtb m0, m2 ; which values are <0? | |
1303 psubb m3, m2 ; -f1<<3 | |
1304 psrlq m2, 3 ; +f1 | |
1305 psrlq m3, 3 ; -f1 | |
1306 pand m3, m0 | |
1307 pandn m0, m2 | |
1308 psubusb m4, m0 | |
1309 paddusb m4, m3 ; q0-f1 | |
1310 | |
1311 pxor m0, m0 | |
1312 pxor m3, m3 | |
1313 pcmpgtb m0, m1 ; which values are <0? | |
1314 psubb m3, m1 ; -f2<<3 | |
1315 psrlq m1, 3 ; +f2 | |
1316 psrlq m3, 3 ; -f2 | |
1317 pand m3, m0 | |
1318 pandn m0, m1 | |
1319 paddusb m6, m0 | |
1320 psubusb m6, m3 ; p0+f2 | |
1321 | |
1322 ; store | |
1323 %ifidn %2, v | |
1324 mova [r0], m4 | |
1325 mova [r0+r1], m6 | |
1326 %else ; h | |
1327 mova m0, [rsp] ; p1 | |
1328 SWAP 2, 4 ; p0 | |
1329 SWAP 1, 6 ; q0 | |
1330 mova m3, [rsp+mmsize] ; q1 | |
1331 | |
1332 TRANSPOSE4x4B 0, 1, 2, 3, 4 | |
1333 %if mmsize == 16 ; sse2 | |
1334 add r3, r1 ; change from r4*8*stride to r0+8*stride | |
1335 WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2 | |
1336 %else ; mmx/mmxext | |
1337 WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 | |
1338 %endif | |
1339 %endif | |
1340 | |
1341 %if mmsize == 8 ; mmx/mmxext | |
1342 ; next 8 pixels | |
1343 %ifidn %2, v | |
1344 add r0, 8 ; advance 8 cols = pixels | |
1345 %else ; h | |
1346 lea r0, [r0+r2*8] ; advance 8 rows = lines | |
1347 %endif | |
1348 dec r3 | |
1349 jg .next8px | |
1350 %ifidn %2, v | |
1351 REP_RET | |
1352 %else ; h | |
1353 mov rsp, r5 ; restore stack pointer | |
1354 RET | |
1355 %endif | |
1356 %else ; sse2 | |
1357 %ifidn %2, h | |
1358 mov rsp, r5 ; restore stack pointer | |
1359 %endif | |
1360 RET | |
1361 %endif | |
1362 %endmacro | |
1363 | |
1364 INIT_MMX | |
1365 SIMPLE_LOOPFILTER mmx, v, 4 | |
1366 SIMPLE_LOOPFILTER mmx, h, 6 | |
1367 SIMPLE_LOOPFILTER mmxext, v, 4 | |
1368 SIMPLE_LOOPFILTER mmxext, h, 6 | |
1369 INIT_XMM | |
1370 SIMPLE_LOOPFILTER sse2, v, 3 | |
1371 SIMPLE_LOOPFILTER sse2, h, 6 |