comparison postproc/postprocess.c @ 2416:25e7342b5171

and another +2% speedup
author michael
date Tue, 23 Oct 2001 12:05:34 +0000
parents 32e733ec8a88
children 85cda20c530f
comparison
equal deleted inserted replaced
2415:58ea110b4036 2416:25e7342b5171
334 // "movd %%mm0, (%1, %2, 4)\n\t" 334 // "movd %%mm0, (%1, %2, 4)\n\t"
335 "movd %%mm0, %0 \n\t" 335 "movd %%mm0, %0 \n\t"
336 : "=r" (isOk) 336 : "=r" (isOk)
337 : "r" (src), "r" (stride) 337 : "r" (src), "r" (stride)
338 ); 338 );
339 return isOk ? 1 : 0; 339 return isOk;
340 #else 340 #else
341 341
342 int isOk2= 1; 342 int isOk2= 1;
343 int x; 343 int x;
344 src+= stride*3; 344 src+= stride*3;
1301 // src++; 1301 // src++;
1302 int numEq= 0; 1302 int numEq= 0;
1303 #ifdef HAVE_MMX 1303 #ifdef HAVE_MMX
1304 asm volatile ( 1304 asm volatile (
1305 // "int $3 \n\t" 1305 // "int $3 \n\t"
1306 "pushl %1\n\t" 1306 "leal (%1, %2), %%ecx \n\t"
1307 "leal (%%ecx, %2, 4), %%ebx \n\t"
1308 // 0 1 2 3 4 5 6 7 8 9
1309 // %1 ecx ecx+%2 ecx+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
1307 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F 1310 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
1308 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D 1311 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
1312 "pxor %%mm0, %%mm0 \n\t"
1313 "movl %1, %%eax \n\t"
1314 "andl $0x1F, %%eax \n\t"
1315 "cmpl $24, %%eax \n\t"
1309 "leal tempBlock, %%eax \n\t" 1316 "leal tempBlock, %%eax \n\t"
1310 "pxor %%mm0, %%mm0 \n\t" 1317 "jb 1f \n\t"
1311 1318
1312 #define HDC_CHECK_AND_CPY(i) \ 1319 #define HDC_CHECK_AND_CPY(src, dst) \
1313 "movq -4(%1), %%mm2 \n\t"\ 1320 "movd " #src ", %%mm2 \n\t"\
1314 "psrlq $32, %%mm2 \n\t"\ 1321 "punpckldq 4" #src ", %%mm2 \n\t" /* (%1) */\
1315 "punpckldq 4(%1), %%mm2 \n\t" /* (%1) */\
1316 "movq %%mm2, %%mm1 \n\t"\ 1322 "movq %%mm2, %%mm1 \n\t"\
1317 "psrlq $8, %%mm2 \n\t"\ 1323 "psrlq $8, %%mm2 \n\t"\
1318 "psubb %%mm1, %%mm2 \n\t"\ 1324 "psubb %%mm1, %%mm2 \n\t"\
1319 "paddb %%mm7, %%mm2 \n\t"\ 1325 "paddb %%mm7, %%mm2 \n\t"\
1320 "pcmpgtb %%mm6, %%mm2 \n\t"\ 1326 "pcmpgtb %%mm6, %%mm2 \n\t"\
1321 "paddb %%mm2, %%mm0 \n\t"\ 1327 "paddb %%mm2, %%mm0 \n\t"\
1322 "movq %%mm1," #i "(%%eax) \n\t" 1328 "movq %%mm1," #dst "(%%eax) \n\t"
1323 1329
1324 HDC_CHECK_AND_CPY(0) 1330 HDC_CHECK_AND_CPY((%1),0)
1325 "addl %2, %1 \n\t" 1331 HDC_CHECK_AND_CPY((%%ecx),8)
1326 HDC_CHECK_AND_CPY(8) 1332 HDC_CHECK_AND_CPY((%%ecx, %2),16)
1327 "addl %2, %1 \n\t" 1333 HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
1328 HDC_CHECK_AND_CPY(16) 1334 HDC_CHECK_AND_CPY((%1, %2, 4),32)
1329 "addl %2, %1 \n\t" 1335 HDC_CHECK_AND_CPY((%%ebx),40)
1330 HDC_CHECK_AND_CPY(24) 1336 HDC_CHECK_AND_CPY((%%ebx, %2),48)
1331 "addl %2, %1 \n\t" 1337 HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
1332 HDC_CHECK_AND_CPY(32) 1338 "jmp 2f \n\t"
1333 "addl %2, %1 \n\t" 1339 "1: \n\t"
1334 HDC_CHECK_AND_CPY(40) 1340 // src does not cross a 32-byte cache line, so don't waste time with alignment
1335 "addl %2, %1 \n\t" 1341 #define HDC_CHECK_AND_CPY2(src, dst) \
1336 HDC_CHECK_AND_CPY(48) 1342 "movq " #src ", %%mm2 \n\t"\
1337 "addl %2, %1 \n\t" 1343 "movq " #src ", %%mm1 \n\t"\
1338 HDC_CHECK_AND_CPY(56) 1344 "psrlq $8, %%mm2 \n\t"\
1339 1345 "psubb %%mm1, %%mm2 \n\t"\
1346 "paddb %%mm7, %%mm2 \n\t"\
1347 "pcmpgtb %%mm6, %%mm2 \n\t"\
1348 "paddb %%mm2, %%mm0 \n\t"\
1349 "movq %%mm1," #dst "(%%eax) \n\t"
1350
1351 HDC_CHECK_AND_CPY2((%1),0)
1352 HDC_CHECK_AND_CPY2((%%ecx),8)
1353 HDC_CHECK_AND_CPY2((%%ecx, %2),16)
1354 HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
1355 HDC_CHECK_AND_CPY2((%1, %2, 4),32)
1356 HDC_CHECK_AND_CPY2((%%ebx),40)
1357 HDC_CHECK_AND_CPY2((%%ebx, %2),48)
1358 HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
1359 "2: \n\t"
1340 "psllq $8, %%mm0 \n\t" // remove dummy value 1360 "psllq $8, %%mm0 \n\t" // remove dummy value
1341 "movq %%mm0, %%mm1 \n\t" 1361 "movq %%mm0, %%mm1 \n\t"
1342 "psrlw $8, %%mm0 \n\t" 1362 "psrlw $8, %%mm0 \n\t"
1343 "paddb %%mm1, %%mm0 \n\t" 1363 "paddb %%mm1, %%mm0 \n\t"
1344 "movq %%mm0, %%mm1 \n\t" 1364 "movq %%mm0, %%mm1 \n\t"
1345 "psrlq $16, %%mm0 \n\t" 1365 "psrlq $16, %%mm0 \n\t"
1346 "paddb %%mm1, %%mm0 \n\t" 1366 "paddb %%mm1, %%mm0 \n\t"
1347 "movq %%mm0, %%mm1 \n\t" 1367 "movq %%mm0, %%mm1 \n\t"
1348 "psrlq $32, %%mm0 \n\t" 1368 "psrlq $32, %%mm0 \n\t"
1349 "paddb %%mm1, %%mm0 \n\t" 1369 "paddb %%mm1, %%mm0 \n\t"
1350 "popl %1\n\t"
1351 "movd %%mm0, %0 \n\t" 1370 "movd %%mm0, %0 \n\t"
1352 : "=r" (numEq) 1371 : "=r" (numEq)
1353 : "r" (src), "r" (stride) 1372 : "r" (src), "r" (stride)
1354 : "%eax" 1373 : "%eax", "%ebx", "%ecx"
1355 ); 1374 );
1356 // printf("%d\n", numEq); 1375 // printf("%d\n", numEq);
1357 numEq= (256 - (numEq & 0xFF)) &0xFF; 1376 numEq= (256 - numEq) &0xFF;
1358 #else 1377 #else
1359 int y; 1378 int y;
1360 for(y=0; y<BLOCK_SIZE; y++) 1379 for(y=0; y<BLOCK_SIZE; y++)
1361 { 1380 {
1362 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; 1381 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;