comparison ppc/dsputil_altivec.c @ 1951:2599b8444831 libavcodec

better hadamard8_diff16 in AltiVec, and more patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
author michael
date Thu, 22 Apr 2004 13:21:59 +0000
parents 66215baae7b9
children 96cfc6393b1f
comparison
equal deleted inserted replaced
1950:a3c60fa850dc 1951:2599b8444831
1304 1304
1305 int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ 1305 int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1306 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); 1306 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
1307 int sum; 1307 int sum;
1308 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); 1308 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
1309 register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
1310 register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1311 #ifdef CONFIG_DARWIN
1309 { 1312 {
1310 const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0); 1313 register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
1311 #ifdef CONFIG_DARWIN 1314 register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
1312 const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1); 1315 register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
1313 const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1); 1316 register const_vector unsigned char perm1 = (const_vector unsigned char)
1314 const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
1315 const_vector unsigned char perm1 = (const_vector unsigned char)
1316 (0x02, 0x03, 0x00, 0x01, 1317 (0x02, 0x03, 0x00, 0x01,
1317 0x06, 0x07, 0x04, 0x05, 1318 0x06, 0x07, 0x04, 0x05,
1318 0x0A, 0x0B, 0x08, 0x09, 1319 0x0A, 0x0B, 0x08, 0x09,
1319 0x0E, 0x0F, 0x0C, 0x0D); 1320 0x0E, 0x0F, 0x0C, 0x0D);
1320 const_vector unsigned char perm2 = (const_vector unsigned char) 1321 register const_vector unsigned char perm2 = (const_vector unsigned char)
1321 (0x04, 0x05, 0x06, 0x07, 1322 (0x04, 0x05, 0x06, 0x07,
1322 0x00, 0x01, 0x02, 0x03, 1323 0x00, 0x01, 0x02, 0x03,
1323 0x0C, 0x0D, 0x0E, 0x0F, 1324 0x0C, 0x0D, 0x0E, 0x0F,
1324 0x08, 0x09, 0x0A, 0x0B); 1325 0x08, 0x09, 0x0A, 0x0B);
1325 const_vector unsigned char perm3 = (const_vector unsigned char) 1326 register const_vector unsigned char perm3 = (const_vector unsigned char)
1326 (0x08, 0x09, 0x0A, 0x0B, 1327 (0x08, 0x09, 0x0A, 0x0B,
1327 0x0C, 0x0D, 0x0E, 0x0F, 1328 0x0C, 0x0D, 0x0E, 0x0F,
1328 0x00, 0x01, 0x02, 0x03, 1329 0x00, 0x01, 0x02, 0x03,
1329 0x04, 0x05, 0x06, 0x07); 1330 0x04, 0x05, 0x06, 0x07);
1330 #else 1331 #else
1331 const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1}; 1332 register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
1332 const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1}; 1333 register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
1333 const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1}; 1334 register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
1334 const_vector unsigned char perm1 = (const_vector unsigned char) 1335 register const_vector unsigned char perm1 = (const_vector unsigned char)
1335 {0x02, 0x03, 0x00, 0x01, 1336 {0x02, 0x03, 0x00, 0x01,
1336 0x06, 0x07, 0x04, 0x05, 1337 0x06, 0x07, 0x04, 0x05,
1337 0x0A, 0x0B, 0x08, 0x09, 1338 0x0A, 0x0B, 0x08, 0x09,
1338 0x0E, 0x0F, 0x0C, 0x0D}; 1339 0x0E, 0x0F, 0x0C, 0x0D};
1339 const_vector unsigned char perm2 = (const_vector unsigned char) 1340 register const_vector unsigned char perm2 = (const_vector unsigned char)
1340 {0x04, 0x05, 0x06, 0x07, 1341 {0x04, 0x05, 0x06, 0x07,
1341 0x00, 0x01, 0x02, 0x03, 1342 0x00, 0x01, 0x02, 0x03,
1342 0x0C, 0x0D, 0x0E, 0x0F, 1343 0x0C, 0x0D, 0x0E, 0x0F,
1343 0x08, 0x09, 0x0A, 0x0B}; 1344 0x08, 0x09, 0x0A, 0x0B};
1344 const_vector unsigned char perm3 = (const_vector unsigned char) 1345 register const_vector unsigned char perm3 = (const_vector unsigned char)
1345 {0x08, 0x09, 0x0A, 0x0B, 1346 {0x08, 0x09, 0x0A, 0x0B,
1346 0x0C, 0x0D, 0x0E, 0x0F, 1347 0x0C, 0x0D, 0x0E, 0x0F,
1347 0x00, 0x01, 0x02, 0x03, 1348 0x00, 0x01, 0x02, 0x03,
1348 0x04, 0x05, 0x06, 0x07}; 1349 0x04, 0x05, 0x06, 0x07};
1349 #endif 1350 #endif
1350 1351
1351 #define ONEITERBUTTERFLY(i, res) \ 1352 #define ONEITERBUTTERFLY(i, res) \
1352 { \ 1353 { \
1353 vector unsigned char src1, src2, srcO; \ 1354 register vector unsigned char src1, src2, srcO; \
1354 vector unsigned char dst1, dst2, dstO; \ 1355 register vector unsigned char dst1, dst2, dstO; \
1355 src1 = vec_ld(stride * i, src); \ 1356 src1 = vec_ld(stride * i, src); \
1356 if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \ 1357 if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
1357 src2 = vec_ld((stride * i) + 16, src); \ 1358 src2 = vec_ld((stride * i) + 16, src); \
1358 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ 1359 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
1359 dst1 = vec_ld(stride * i, dst); \ 1360 dst1 = vec_ld(stride * i, dst); \
1360 if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \ 1361 if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \
1361 dst2 = vec_ld((stride * i) + 16, dst); \ 1362 dst2 = vec_ld((stride * i) + 16, dst); \
1362 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ 1363 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
1363 /* promote the unsigned chars to signed shorts */ \ 1364 /* promote the unsigned chars to signed shorts */ \
1364 /* we're in the 8x8 function, we only care for the first 8 */ \ 1365 /* we're in the 8x8 function, we only care for the first 8 */ \
1365 vector signed short srcV = \ 1366 register vector signed short srcV = \
1366 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \ 1367 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
1367 vector signed short dstV = \ 1368 register vector signed short dstV = \
1368 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \ 1369 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
1369 /* substractions inside the first butterfly */ \ 1370 /* substractions inside the first butterfly */ \
1370 vector signed short but0 = vec_sub(srcV, dstV); \ 1371 register vector signed short but0 = vec_sub(srcV, dstV); \
1371 vector signed short op1 = vec_perm(but0, but0, perm1); \ 1372 register vector signed short op1 = vec_perm(but0, but0, perm1); \
1372 vector signed short but1 = vec_mladd(but0, vprod1, op1); \ 1373 register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
1373 vector signed short op2 = vec_perm(but1, but1, perm2); \ 1374 register vector signed short op2 = vec_perm(but1, but1, perm2); \
1374 vector signed short but2 = vec_mladd(but1, vprod2, op2); \ 1375 register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
1375 vector signed short op3 = vec_perm(but2, but2, perm3); \ 1376 register vector signed short op3 = vec_perm(but2, but2, perm3); \
1376 res = vec_mladd(but2, vprod3, op3); \ 1377 res = vec_mladd(but2, vprod3, op3); \
1377 } 1378 }
1378 vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1379 ONEITERBUTTERFLY(0, temp0); 1379 ONEITERBUTTERFLY(0, temp0);
1380 ONEITERBUTTERFLY(1, temp1); 1380 ONEITERBUTTERFLY(1, temp1);
1381 ONEITERBUTTERFLY(2, temp2); 1381 ONEITERBUTTERFLY(2, temp2);
1382 ONEITERBUTTERFLY(3, temp3); 1382 ONEITERBUTTERFLY(3, temp3);
1383 ONEITERBUTTERFLY(4, temp4); 1383 ONEITERBUTTERFLY(4, temp4);
1384 ONEITERBUTTERFLY(5, temp5); 1384 ONEITERBUTTERFLY(5, temp5);
1385 ONEITERBUTTERFLY(6, temp6); 1385 ONEITERBUTTERFLY(6, temp6);
1386 ONEITERBUTTERFLY(7, temp7); 1386 ONEITERBUTTERFLY(7, temp7);
1387 }
1387 #undef ONEITERBUTTERFLY 1388 #undef ONEITERBUTTERFLY
1388 { 1389 {
1389 vector signed int vsum; 1390 register vector signed int vsum;
1390 vector signed short line0 = vec_add(temp0, temp1); 1391 register vector signed short line0 = vec_add(temp0, temp1);
1391 vector signed short line1 = vec_sub(temp0, temp1); 1392 register vector signed short line1 = vec_sub(temp0, temp1);
1392 vector signed short line2 = vec_add(temp2, temp3); 1393 register vector signed short line2 = vec_add(temp2, temp3);
1393 vector signed short line3 = vec_sub(temp2, temp3); 1394 register vector signed short line3 = vec_sub(temp2, temp3);
1394 vector signed short line4 = vec_add(temp4, temp5); 1395 register vector signed short line4 = vec_add(temp4, temp5);
1395 vector signed short line5 = vec_sub(temp4, temp5); 1396 register vector signed short line5 = vec_sub(temp4, temp5);
1396 vector signed short line6 = vec_add(temp6, temp7); 1397 register vector signed short line6 = vec_add(temp6, temp7);
1397 vector signed short line7 = vec_sub(temp6, temp7); 1398 register vector signed short line7 = vec_sub(temp6, temp7);
1398 1399
1399 vector signed short line0B = vec_add(line0, line2); 1400 register vector signed short line0B = vec_add(line0, line2);
1400 vector signed short line2B = vec_sub(line0, line2); 1401 register vector signed short line2B = vec_sub(line0, line2);
1401 vector signed short line1B = vec_add(line1, line3); 1402 register vector signed short line1B = vec_add(line1, line3);
1402 vector signed short line3B = vec_sub(line1, line3); 1403 register vector signed short line3B = vec_sub(line1, line3);
1403 vector signed short line4B = vec_add(line4, line6); 1404 register vector signed short line4B = vec_add(line4, line6);
1404 vector signed short line6B = vec_sub(line4, line6); 1405 register vector signed short line6B = vec_sub(line4, line6);
1405 vector signed short line5B = vec_add(line5, line7); 1406 register vector signed short line5B = vec_add(line5, line7);
1406 vector signed short line7B = vec_sub(line5, line7); 1407 register vector signed short line7B = vec_sub(line5, line7);
1407 1408
1408 vector signed short line0C = vec_add(line0B, line4B); 1409 register vector signed short line0C = vec_add(line0B, line4B);
1409 vector signed short line4C = vec_sub(line0B, line4B); 1410 register vector signed short line4C = vec_sub(line0B, line4B);
1410 vector signed short line1C = vec_add(line1B, line5B); 1411 register vector signed short line1C = vec_add(line1B, line5B);
1411 vector signed short line5C = vec_sub(line1B, line5B); 1412 register vector signed short line5C = vec_sub(line1B, line5B);
1412 vector signed short line2C = vec_add(line2B, line6B); 1413 register vector signed short line2C = vec_add(line2B, line6B);
1413 vector signed short line6C = vec_sub(line2B, line6B); 1414 register vector signed short line6C = vec_sub(line2B, line6B);
1414 vector signed short line3C = vec_add(line3B, line7B); 1415 register vector signed short line3C = vec_add(line3B, line7B);
1415 vector signed short line7C = vec_sub(line3B, line7B); 1416 register vector signed short line7C = vec_sub(line3B, line7B);
1416 1417
1417 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); 1418 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1418 vsum = vec_sum4s(vec_abs(line1C), vsum); 1419 vsum = vec_sum4s(vec_abs(line1C), vsum);
1419 vsum = vec_sum4s(vec_abs(line2C), vsum); 1420 vsum = vec_sum4s(vec_abs(line2C), vsum);
1420 vsum = vec_sum4s(vec_abs(line3C), vsum); 1421 vsum = vec_sum4s(vec_abs(line3C), vsum);
1421 vsum = vec_sum4s(vec_abs(line4C), vsum); 1422 vsum = vec_sum4s(vec_abs(line4C), vsum);
1422 vsum = vec_sum4s(vec_abs(line5C), vsum); 1423 vsum = vec_sum4s(vec_abs(line5C), vsum);
1423 vsum = vec_sum4s(vec_abs(line6C), vsum); 1424 vsum = vec_sum4s(vec_abs(line6C), vsum);
1424 vsum = vec_sum4s(vec_abs(line7C), vsum); 1425 vsum = vec_sum4s(vec_abs(line7C), vsum);
1425 vsum = vec_sums(vsum, (vector signed int)vzero); 1426 vsum = vec_sums(vsum, (vector signed int)vzero);
1426 vsum = vec_splat(vsum, 3); 1427 vsum = vec_splat(vsum, 3);
1427 vec_ste(vsum, 0, &sum); 1428 vec_ste(vsum, 0, &sum);
1428 }
1429 } 1429 }
1430 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); 1430 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
1431 return sum; 1431 return sum;
1432 }
1433
1434 /*
1435 16x8 works with 16 elements; this avoids replicating
1436 loads, and gives the compiler more room for scheduling.
1437 It's only used from inside hadamard8_diff16_altivec.
1438
1439 Unfortunately, it seems gcc-3.3 is a bit dumb, and
1440 the compiled code has a LOT of spill code; it seems
1441 gcc (unlike xlc) cannot keep everything in registers
1442 by itself. The following code includes hand-made
1443 register allocation. It's not clean, but on
1444 a 7450 the resulting code is much faster (best case
1445 falls from 700+ cycles to 550).
1446
1447 xlc doesn't add spill code, but it doesn't know how to
1448 schedule for the 7450, and its code isn't much faster than
1449 gcc-3.3 on the 7450 (but uses 25% fewer instructions...)
1450
1451 On the 970, the hand-made RA is still a win (around 690
1452 vs. around 780), but xlc goes to around 660 on the
1453 regular C code...
1454 */
1455
1456 static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
1457 int sum;
1458 register vector signed short
1459 temp0 asm ("v0"),
1460 temp1 asm ("v1"),
1461 temp2 asm ("v2"),
1462 temp3 asm ("v3"),
1463 temp4 asm ("v4"),
1464 temp5 asm ("v5"),
1465 temp6 asm ("v6"),
1466 temp7 asm ("v7");
1467 register vector signed short
1468 temp0S asm ("v8"),
1469 temp1S asm ("v9"),
1470 temp2S asm ("v10"),
1471 temp3S asm ("v11"),
1472 temp4S asm ("v12"),
1473 temp5S asm ("v13"),
1474 temp6S asm ("v14"),
1475 temp7S asm ("v15");
1476 register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
1477 {
1478 #ifdef CONFIG_DARWIN
1479 register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
1480 register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
1481 register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
1482 register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
1483 (0x02, 0x03, 0x00, 0x01,
1484 0x06, 0x07, 0x04, 0x05,
1485 0x0A, 0x0B, 0x08, 0x09,
1486 0x0E, 0x0F, 0x0C, 0x0D);
1487 register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
1488 (0x04, 0x05, 0x06, 0x07,
1489 0x00, 0x01, 0x02, 0x03,
1490 0x0C, 0x0D, 0x0E, 0x0F,
1491 0x08, 0x09, 0x0A, 0x0B);
1492 register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
1493 (0x08, 0x09, 0x0A, 0x0B,
1494 0x0C, 0x0D, 0x0E, 0x0F,
1495 0x00, 0x01, 0x02, 0x03,
1496 0x04, 0x05, 0x06, 0x07);
1497 #else
1498 register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
1499 register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
1500 register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
1501 register const_vector unsigned char perm1 = (const_vector unsigned char)
1502 {0x02, 0x03, 0x00, 0x01,
1503 0x06, 0x07, 0x04, 0x05,
1504 0x0A, 0x0B, 0x08, 0x09,
1505 0x0E, 0x0F, 0x0C, 0x0D};
1506 register const_vector unsigned char perm2 = (const_vector unsigned char)
1507 {0x04, 0x05, 0x06, 0x07,
1508 0x00, 0x01, 0x02, 0x03,
1509 0x0C, 0x0D, 0x0E, 0x0F,
1510 0x08, 0x09, 0x0A, 0x0B};
1511 register const_vector unsigned char perm3 = (const_vector unsigned char)
1512 {0x08, 0x09, 0x0A, 0x0B,
1513 0x0C, 0x0D, 0x0E, 0x0F,
1514 0x00, 0x01, 0x02, 0x03,
1515 0x04, 0x05, 0x06, 0x07};
1516 #endif
1517 #define ONEITERBUTTERFLY(i, res1, res2) \
1518 { \
1519 register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
1520 register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
1521 src1 = vec_ld(stride * i, src); \
1522 src2 = vec_ld((stride * i) + 16, src); \
1523 register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
1524 dst1 = vec_ld(stride * i, dst); \
1525 dst2 = vec_ld((stride * i) + 16, dst); \
1526 register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
1527 /* promote the unsigned chars to signed shorts */ \
1528 register vector signed short srcV asm ("v24") = \
1529 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
1530 register vector signed short dstV asm ("v25") = \
1531 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
1532 register vector signed short srcW asm ("v26") = \
1533 (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
1534 register vector signed short dstW asm ("v27") = \
1535 (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
1536 /* substractions inside the first butterfly */ \
1537 register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
1538 register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
1539 register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
1540 register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
1541 register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
1542 register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
1543 register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
1544 register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
1545 register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
1546 register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
1547 register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
1548 res1 = vec_mladd(but2, vprod3, op3); \
1549 register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
1550 res2 = vec_mladd(but2S, vprod3, op3S); \
1551 }
1552 ONEITERBUTTERFLY(0, temp0, temp0S);
1553 ONEITERBUTTERFLY(1, temp1, temp1S);
1554 ONEITERBUTTERFLY(2, temp2, temp2S);
1555 ONEITERBUTTERFLY(3, temp3, temp3S);
1556 ONEITERBUTTERFLY(4, temp4, temp4S);
1557 ONEITERBUTTERFLY(5, temp5, temp5S);
1558 ONEITERBUTTERFLY(6, temp6, temp6S);
1559 ONEITERBUTTERFLY(7, temp7, temp7S);
1560 }
1561 #undef ONEITERBUTTERFLY
1562 {
1563 register vector signed int vsum;
1564 register vector signed short line0 = vec_add(temp0, temp1);
1565 register vector signed short line1 = vec_sub(temp0, temp1);
1566 register vector signed short line2 = vec_add(temp2, temp3);
1567 register vector signed short line3 = vec_sub(temp2, temp3);
1568 register vector signed short line4 = vec_add(temp4, temp5);
1569 register vector signed short line5 = vec_sub(temp4, temp5);
1570 register vector signed short line6 = vec_add(temp6, temp7);
1571 register vector signed short line7 = vec_sub(temp6, temp7);
1572
1573 register vector signed short line0B = vec_add(line0, line2);
1574 register vector signed short line2B = vec_sub(line0, line2);
1575 register vector signed short line1B = vec_add(line1, line3);
1576 register vector signed short line3B = vec_sub(line1, line3);
1577 register vector signed short line4B = vec_add(line4, line6);
1578 register vector signed short line6B = vec_sub(line4, line6);
1579 register vector signed short line5B = vec_add(line5, line7);
1580 register vector signed short line7B = vec_sub(line5, line7);
1581
1582 register vector signed short line0C = vec_add(line0B, line4B);
1583 register vector signed short line4C = vec_sub(line0B, line4B);
1584 register vector signed short line1C = vec_add(line1B, line5B);
1585 register vector signed short line5C = vec_sub(line1B, line5B);
1586 register vector signed short line2C = vec_add(line2B, line6B);
1587 register vector signed short line6C = vec_sub(line2B, line6B);
1588 register vector signed short line3C = vec_add(line3B, line7B);
1589 register vector signed short line7C = vec_sub(line3B, line7B);
1590
1591 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1592 vsum = vec_sum4s(vec_abs(line1C), vsum);
1593 vsum = vec_sum4s(vec_abs(line2C), vsum);
1594 vsum = vec_sum4s(vec_abs(line3C), vsum);
1595 vsum = vec_sum4s(vec_abs(line4C), vsum);
1596 vsum = vec_sum4s(vec_abs(line5C), vsum);
1597 vsum = vec_sum4s(vec_abs(line6C), vsum);
1598 vsum = vec_sum4s(vec_abs(line7C), vsum);
1599
1600 register vector signed short line0S = vec_add(temp0S, temp1S);
1601 register vector signed short line1S = vec_sub(temp0S, temp1S);
1602 register vector signed short line2S = vec_add(temp2S, temp3S);
1603 register vector signed short line3S = vec_sub(temp2S, temp3S);
1604 register vector signed short line4S = vec_add(temp4S, temp5S);
1605 register vector signed short line5S = vec_sub(temp4S, temp5S);
1606 register vector signed short line6S = vec_add(temp6S, temp7S);
1607 register vector signed short line7S = vec_sub(temp6S, temp7S);
1608
1609 register vector signed short line0BS = vec_add(line0S, line2S);
1610 register vector signed short line2BS = vec_sub(line0S, line2S);
1611 register vector signed short line1BS = vec_add(line1S, line3S);
1612 register vector signed short line3BS = vec_sub(line1S, line3S);
1613 register vector signed short line4BS = vec_add(line4S, line6S);
1614 register vector signed short line6BS = vec_sub(line4S, line6S);
1615 register vector signed short line5BS = vec_add(line5S, line7S);
1616 register vector signed short line7BS = vec_sub(line5S, line7S);
1617
1618 register vector signed short line0CS = vec_add(line0BS, line4BS);
1619 register vector signed short line4CS = vec_sub(line0BS, line4BS);
1620 register vector signed short line1CS = vec_add(line1BS, line5BS);
1621 register vector signed short line5CS = vec_sub(line1BS, line5BS);
1622 register vector signed short line2CS = vec_add(line2BS, line6BS);
1623 register vector signed short line6CS = vec_sub(line2BS, line6BS);
1624 register vector signed short line3CS = vec_add(line3BS, line7BS);
1625 register vector signed short line7CS = vec_sub(line3BS, line7BS);
1626
1627 vsum = vec_sum4s(vec_abs(line0CS), vsum);
1628 vsum = vec_sum4s(vec_abs(line1CS), vsum);
1629 vsum = vec_sum4s(vec_abs(line2CS), vsum);
1630 vsum = vec_sum4s(vec_abs(line3CS), vsum);
1631 vsum = vec_sum4s(vec_abs(line4CS), vsum);
1632 vsum = vec_sum4s(vec_abs(line5CS), vsum);
1633 vsum = vec_sum4s(vec_abs(line6CS), vsum);
1634 vsum = vec_sum4s(vec_abs(line7CS), vsum);
1635 vsum = vec_sums(vsum, (vector signed int)vzero);
1636 vsum = vec_splat(vsum, 3);
1637 vec_ste(vsum, 0, &sum);
1638 }
1639 return sum;
1640 }
1641
1642 int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1643 POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
1644 int score;
1645 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
1646 score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
1647 if (h==16) {
1648 dst += 8*stride;
1649 src += 8*stride;
1650 score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
1651 }
1652 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
1653 return score;
1432 } 1654 }
1433 1655
1434 int has_altivec(void) 1656 int has_altivec(void)
1435 { 1657 {
1436 #ifdef CONFIG_DARWIN 1658 #ifdef CONFIG_DARWIN