Mercurial > libavcodec.hg
comparison ppc/dsputil_altivec.c @ 1951:2599b8444831 libavcodec
better hadamard8_diff16 in AltiVec, and more patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
author | michael |
---|---|
date | Thu, 22 Apr 2004 13:21:59 +0000 |
parents | 66215baae7b9 |
children | 96cfc6393b1f |
comparison
equal
deleted
inserted
replaced
1950:a3c60fa850dc | 1951:2599b8444831 |
---|---|
1304 | 1304 |
1305 int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ | 1305 int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ |
1306 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); | 1306 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); |
1307 int sum; | 1307 int sum; |
1308 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); | 1308 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); |
1309 register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0); | |
1310 register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | |
1311 #ifdef CONFIG_DARWIN | |
1309 { | 1312 { |
1310 const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0); | 1313 register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1); |
1311 #ifdef CONFIG_DARWIN | 1314 register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1); |
1312 const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1); | 1315 register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1); |
1313 const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1); | 1316 register const_vector unsigned char perm1 = (const_vector unsigned char) |
1314 const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1); | |
1315 const_vector unsigned char perm1 = (const_vector unsigned char) | |
1316 (0x02, 0x03, 0x00, 0x01, | 1317 (0x02, 0x03, 0x00, 0x01, |
1317 0x06, 0x07, 0x04, 0x05, | 1318 0x06, 0x07, 0x04, 0x05, |
1318 0x0A, 0x0B, 0x08, 0x09, | 1319 0x0A, 0x0B, 0x08, 0x09, |
1319 0x0E, 0x0F, 0x0C, 0x0D); | 1320 0x0E, 0x0F, 0x0C, 0x0D); |
1320 const_vector unsigned char perm2 = (const_vector unsigned char) | 1321 register const_vector unsigned char perm2 = (const_vector unsigned char) |
1321 (0x04, 0x05, 0x06, 0x07, | 1322 (0x04, 0x05, 0x06, 0x07, |
1322 0x00, 0x01, 0x02, 0x03, | 1323 0x00, 0x01, 0x02, 0x03, |
1323 0x0C, 0x0D, 0x0E, 0x0F, | 1324 0x0C, 0x0D, 0x0E, 0x0F, |
1324 0x08, 0x09, 0x0A, 0x0B); | 1325 0x08, 0x09, 0x0A, 0x0B); |
1325 const_vector unsigned char perm3 = (const_vector unsigned char) | 1326 register const_vector unsigned char perm3 = (const_vector unsigned char) |
1326 (0x08, 0x09, 0x0A, 0x0B, | 1327 (0x08, 0x09, 0x0A, 0x0B, |
1327 0x0C, 0x0D, 0x0E, 0x0F, | 1328 0x0C, 0x0D, 0x0E, 0x0F, |
1328 0x00, 0x01, 0x02, 0x03, | 1329 0x00, 0x01, 0x02, 0x03, |
1329 0x04, 0x05, 0x06, 0x07); | 1330 0x04, 0x05, 0x06, 0x07); |
1330 #else | 1331 #else |
1331 const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1}; | 1332 register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1}; |
1332 const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1}; | 1333 register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1}; |
1333 const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1}; | 1334 register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1}; |
1334 const_vector unsigned char perm1 = (const_vector unsigned char) | 1335 register const_vector unsigned char perm1 = (const_vector unsigned char) |
1335 {0x02, 0x03, 0x00, 0x01, | 1336 {0x02, 0x03, 0x00, 0x01, |
1336 0x06, 0x07, 0x04, 0x05, | 1337 0x06, 0x07, 0x04, 0x05, |
1337 0x0A, 0x0B, 0x08, 0x09, | 1338 0x0A, 0x0B, 0x08, 0x09, |
1338 0x0E, 0x0F, 0x0C, 0x0D}; | 1339 0x0E, 0x0F, 0x0C, 0x0D}; |
1339 const_vector unsigned char perm2 = (const_vector unsigned char) | 1340 register const_vector unsigned char perm2 = (const_vector unsigned char) |
1340 {0x04, 0x05, 0x06, 0x07, | 1341 {0x04, 0x05, 0x06, 0x07, |
1341 0x00, 0x01, 0x02, 0x03, | 1342 0x00, 0x01, 0x02, 0x03, |
1342 0x0C, 0x0D, 0x0E, 0x0F, | 1343 0x0C, 0x0D, 0x0E, 0x0F, |
1343 0x08, 0x09, 0x0A, 0x0B}; | 1344 0x08, 0x09, 0x0A, 0x0B}; |
1344 const_vector unsigned char perm3 = (const_vector unsigned char) | 1345 register const_vector unsigned char perm3 = (const_vector unsigned char) |
1345 {0x08, 0x09, 0x0A, 0x0B, | 1346 {0x08, 0x09, 0x0A, 0x0B, |
1346 0x0C, 0x0D, 0x0E, 0x0F, | 1347 0x0C, 0x0D, 0x0E, 0x0F, |
1347 0x00, 0x01, 0x02, 0x03, | 1348 0x00, 0x01, 0x02, 0x03, |
1348 0x04, 0x05, 0x06, 0x07}; | 1349 0x04, 0x05, 0x06, 0x07}; |
1349 #endif | 1350 #endif |
1350 | 1351 |
1351 #define ONEITERBUTTERFLY(i, res) \ | 1352 #define ONEITERBUTTERFLY(i, res) \ |
1352 { \ | 1353 { \ |
1353 vector unsigned char src1, src2, srcO; \ | 1354 register vector unsigned char src1, src2, srcO; \ |
1354 vector unsigned char dst1, dst2, dstO; \ | 1355 register vector unsigned char dst1, dst2, dstO; \ |
1355 src1 = vec_ld(stride * i, src); \ | 1356 src1 = vec_ld(stride * i, src); \ |
1356 if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \ | 1357 if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \ |
1357 src2 = vec_ld((stride * i) + 16, src); \ | 1358 src2 = vec_ld((stride * i) + 16, src); \ |
1358 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ | 1359 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ |
1359 dst1 = vec_ld(stride * i, dst); \ | 1360 dst1 = vec_ld(stride * i, dst); \ |
1360 if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \ | 1361 if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \ |
1361 dst2 = vec_ld((stride * i) + 16, dst); \ | 1362 dst2 = vec_ld((stride * i) + 16, dst); \ |
1362 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ | 1363 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ |
1363 /* promote the unsigned chars to signed shorts */ \ | 1364 /* promote the unsigned chars to signed shorts */ \ |
1364 /* we're in the 8x8 function, we only care for the first 8 */ \ | 1365 /* we're in the 8x8 function, we only care for the first 8 */ \ |
1365 vector signed short srcV = \ | 1366 register vector signed short srcV = \ |
1366 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \ | 1367 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \ |
1367 vector signed short dstV = \ | 1368 register vector signed short dstV = \ |
1368 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \ | 1369 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \ |
1369 /* subtractions inside the first butterfly */ \ | 1370 /* subtractions inside the first butterfly */ \ |
1370 vector signed short but0 = vec_sub(srcV, dstV); \ | 1371 register vector signed short but0 = vec_sub(srcV, dstV); \ |
1371 vector signed short op1 = vec_perm(but0, but0, perm1); \ | 1372 register vector signed short op1 = vec_perm(but0, but0, perm1); \ |
1372 vector signed short but1 = vec_mladd(but0, vprod1, op1); \ | 1373 register vector signed short but1 = vec_mladd(but0, vprod1, op1); \ |
1373 vector signed short op2 = vec_perm(but1, but1, perm2); \ | 1374 register vector signed short op2 = vec_perm(but1, but1, perm2); \ |
1374 vector signed short but2 = vec_mladd(but1, vprod2, op2); \ | 1375 register vector signed short but2 = vec_mladd(but1, vprod2, op2); \ |
1375 vector signed short op3 = vec_perm(but2, but2, perm3); \ | 1376 register vector signed short op3 = vec_perm(but2, but2, perm3); \ |
1376 res = vec_mladd(but2, vprod3, op3); \ | 1377 res = vec_mladd(but2, vprod3, op3); \ |
1377 } | 1378 } |
1378 vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | |
1379 ONEITERBUTTERFLY(0, temp0); | 1379 ONEITERBUTTERFLY(0, temp0); |
1380 ONEITERBUTTERFLY(1, temp1); | 1380 ONEITERBUTTERFLY(1, temp1); |
1381 ONEITERBUTTERFLY(2, temp2); | 1381 ONEITERBUTTERFLY(2, temp2); |
1382 ONEITERBUTTERFLY(3, temp3); | 1382 ONEITERBUTTERFLY(3, temp3); |
1383 ONEITERBUTTERFLY(4, temp4); | 1383 ONEITERBUTTERFLY(4, temp4); |
1384 ONEITERBUTTERFLY(5, temp5); | 1384 ONEITERBUTTERFLY(5, temp5); |
1385 ONEITERBUTTERFLY(6, temp6); | 1385 ONEITERBUTTERFLY(6, temp6); |
1386 ONEITERBUTTERFLY(7, temp7); | 1386 ONEITERBUTTERFLY(7, temp7); |
1387 } | |
1387 #undef ONEITERBUTTERFLY | 1388 #undef ONEITERBUTTERFLY |
1388 { | 1389 { |
1389 vector signed int vsum; | 1390 register vector signed int vsum; |
1390 vector signed short line0 = vec_add(temp0, temp1); | 1391 register vector signed short line0 = vec_add(temp0, temp1); |
1391 vector signed short line1 = vec_sub(temp0, temp1); | 1392 register vector signed short line1 = vec_sub(temp0, temp1); |
1392 vector signed short line2 = vec_add(temp2, temp3); | 1393 register vector signed short line2 = vec_add(temp2, temp3); |
1393 vector signed short line3 = vec_sub(temp2, temp3); | 1394 register vector signed short line3 = vec_sub(temp2, temp3); |
1394 vector signed short line4 = vec_add(temp4, temp5); | 1395 register vector signed short line4 = vec_add(temp4, temp5); |
1395 vector signed short line5 = vec_sub(temp4, temp5); | 1396 register vector signed short line5 = vec_sub(temp4, temp5); |
1396 vector signed short line6 = vec_add(temp6, temp7); | 1397 register vector signed short line6 = vec_add(temp6, temp7); |
1397 vector signed short line7 = vec_sub(temp6, temp7); | 1398 register vector signed short line7 = vec_sub(temp6, temp7); |
1398 | 1399 |
1399 vector signed short line0B = vec_add(line0, line2); | 1400 register vector signed short line0B = vec_add(line0, line2); |
1400 vector signed short line2B = vec_sub(line0, line2); | 1401 register vector signed short line2B = vec_sub(line0, line2); |
1401 vector signed short line1B = vec_add(line1, line3); | 1402 register vector signed short line1B = vec_add(line1, line3); |
1402 vector signed short line3B = vec_sub(line1, line3); | 1403 register vector signed short line3B = vec_sub(line1, line3); |
1403 vector signed short line4B = vec_add(line4, line6); | 1404 register vector signed short line4B = vec_add(line4, line6); |
1404 vector signed short line6B = vec_sub(line4, line6); | 1405 register vector signed short line6B = vec_sub(line4, line6); |
1405 vector signed short line5B = vec_add(line5, line7); | 1406 register vector signed short line5B = vec_add(line5, line7); |
1406 vector signed short line7B = vec_sub(line5, line7); | 1407 register vector signed short line7B = vec_sub(line5, line7); |
1407 | 1408 |
1408 vector signed short line0C = vec_add(line0B, line4B); | 1409 register vector signed short line0C = vec_add(line0B, line4B); |
1409 vector signed short line4C = vec_sub(line0B, line4B); | 1410 register vector signed short line4C = vec_sub(line0B, line4B); |
1410 vector signed short line1C = vec_add(line1B, line5B); | 1411 register vector signed short line1C = vec_add(line1B, line5B); |
1411 vector signed short line5C = vec_sub(line1B, line5B); | 1412 register vector signed short line5C = vec_sub(line1B, line5B); |
1412 vector signed short line2C = vec_add(line2B, line6B); | 1413 register vector signed short line2C = vec_add(line2B, line6B); |
1413 vector signed short line6C = vec_sub(line2B, line6B); | 1414 register vector signed short line6C = vec_sub(line2B, line6B); |
1414 vector signed short line3C = vec_add(line3B, line7B); | 1415 register vector signed short line3C = vec_add(line3B, line7B); |
1415 vector signed short line7C = vec_sub(line3B, line7B); | 1416 register vector signed short line7C = vec_sub(line3B, line7B); |
1416 | 1417 |
1417 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); | 1418 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); |
1418 vsum = vec_sum4s(vec_abs(line1C), vsum); | 1419 vsum = vec_sum4s(vec_abs(line1C), vsum); |
1419 vsum = vec_sum4s(vec_abs(line2C), vsum); | 1420 vsum = vec_sum4s(vec_abs(line2C), vsum); |
1420 vsum = vec_sum4s(vec_abs(line3C), vsum); | 1421 vsum = vec_sum4s(vec_abs(line3C), vsum); |
1421 vsum = vec_sum4s(vec_abs(line4C), vsum); | 1422 vsum = vec_sum4s(vec_abs(line4C), vsum); |
1422 vsum = vec_sum4s(vec_abs(line5C), vsum); | 1423 vsum = vec_sum4s(vec_abs(line5C), vsum); |
1423 vsum = vec_sum4s(vec_abs(line6C), vsum); | 1424 vsum = vec_sum4s(vec_abs(line6C), vsum); |
1424 vsum = vec_sum4s(vec_abs(line7C), vsum); | 1425 vsum = vec_sum4s(vec_abs(line7C), vsum); |
1425 vsum = vec_sums(vsum, (vector signed int)vzero); | 1426 vsum = vec_sums(vsum, (vector signed int)vzero); |
1426 vsum = vec_splat(vsum, 3); | 1427 vsum = vec_splat(vsum, 3); |
1427 vec_ste(vsum, 0, &sum); | 1428 vec_ste(vsum, 0, &sum); |
1428 } | |
1429 } | 1429 } |
1430 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); | 1430 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); |
1431 return sum; | 1431 return sum; |
1432 } | |
1433 | |
1434 /* | |
1435 16x8 works with 16 elements; this avoids replicating | |
1436 loads, and gives the compiler more room for scheduling. | |
1437 It's only used from inside hadamard8_diff16_altivec. | |
1438 | |
1439 Unfortunately, it seems gcc-3.3 is a bit dumb, and | |
1440 the compiled code has a LOT of spill code, it seems | |
1441 gcc (unlike xlc) cannot keep everything in registers | |
1442 by itself. The following code includes hand-made | |
1443 register allocation. It's not clean, but on | |
1444 a 7450 the resulting code is much faster (best case | |
1445 falls from 700+ cycles to 550). | |
1446 | |
1447 xlc doesn't add spill code, but it doesn't know how to | |
1448 schedule for the 7450, and its code isn't much faster than | |
1449 gcc-3.3 on the 7450 (but uses 25% fewer instructions...) | |
1450 | |
1451 On the 970, the hand-made register allocation is still a win (around 690 | |
1452 vs. around 780), but xlc goes to around 660 on the | |
1453 regular C code... | |
1454 */ | |
1455 | |
1456 static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { | |
1457 int sum; | |
1458 register vector signed short | |
1459 temp0 asm ("v0"), | |
1460 temp1 asm ("v1"), | |
1461 temp2 asm ("v2"), | |
1462 temp3 asm ("v3"), | |
1463 temp4 asm ("v4"), | |
1464 temp5 asm ("v5"), | |
1465 temp6 asm ("v6"), | |
1466 temp7 asm ("v7"); | |
1467 register vector signed short | |
1468 temp0S asm ("v8"), | |
1469 temp1S asm ("v9"), | |
1470 temp2S asm ("v10"), | |
1471 temp3S asm ("v11"), | |
1472 temp4S asm ("v12"), | |
1473 temp5S asm ("v13"), | |
1474 temp6S asm ("v14"), | |
1475 temp7S asm ("v15"); | |
1476 register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0); | |
1477 { | |
1478 #ifdef CONFIG_DARWIN | |
1479 register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1); | |
1480 register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1); | |
1481 register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1); | |
1482 register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char) | |
1483 (0x02, 0x03, 0x00, 0x01, | |
1484 0x06, 0x07, 0x04, 0x05, | |
1485 0x0A, 0x0B, 0x08, 0x09, | |
1486 0x0E, 0x0F, 0x0C, 0x0D); | |
1487 register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char) | |
1488 (0x04, 0x05, 0x06, 0x07, | |
1489 0x00, 0x01, 0x02, 0x03, | |
1490 0x0C, 0x0D, 0x0E, 0x0F, | |
1491 0x08, 0x09, 0x0A, 0x0B); | |
1492 register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char) | |
1493 (0x08, 0x09, 0x0A, 0x0B, | |
1494 0x0C, 0x0D, 0x0E, 0x0F, | |
1495 0x00, 0x01, 0x02, 0x03, | |
1496 0x04, 0x05, 0x06, 0x07); | |
1497 #else | |
1498 register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1}; | |
1499 register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1}; | |
1500 register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1}; | |
1501 register const_vector unsigned char perm1 = (const_vector unsigned char) | |
1502 {0x02, 0x03, 0x00, 0x01, | |
1503 0x06, 0x07, 0x04, 0x05, | |
1504 0x0A, 0x0B, 0x08, 0x09, | |
1505 0x0E, 0x0F, 0x0C, 0x0D}; | |
1506 register const_vector unsigned char perm2 = (const_vector unsigned char) | |
1507 {0x04, 0x05, 0x06, 0x07, | |
1508 0x00, 0x01, 0x02, 0x03, | |
1509 0x0C, 0x0D, 0x0E, 0x0F, | |
1510 0x08, 0x09, 0x0A, 0x0B}; | |
1511 register const_vector unsigned char perm3 = (const_vector unsigned char) | |
1512 {0x08, 0x09, 0x0A, 0x0B, | |
1513 0x0C, 0x0D, 0x0E, 0x0F, | |
1514 0x00, 0x01, 0x02, 0x03, | |
1515 0x04, 0x05, 0x06, 0x07}; | |
1516 #endif | |
1517 #define ONEITERBUTTERFLY(i, res1, res2) \ | |
1518 { \ | |
1519 register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \ | |
1520 register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \ | |
1521 src1 = vec_ld(stride * i, src); \ | |
1522 src2 = vec_ld((stride * i) + 16, src); \ | |
1523 register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ | |
1524 dst1 = vec_ld(stride * i, dst); \ | |
1525 dst2 = vec_ld((stride * i) + 16, dst); \ | |
1526 register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ | |
1527 /* promote the unsigned chars to signed shorts */ \ | |
1528 register vector signed short srcV asm ("v24") = \ | |
1529 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \ | |
1530 register vector signed short dstV asm ("v25") = \ | |
1531 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \ | |
1532 register vector signed short srcW asm ("v26") = \ | |
1533 (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \ | |
1534 register vector signed short dstW asm ("v27") = \ | |
1535 (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \ | |
1536 /* subtractions inside the first butterfly */ \ | |
1537 register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \ | |
1538 register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \ | |
1539 register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \ | |
1540 register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \ | |
1541 register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \ | |
1542 register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \ | |
1543 register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \ | |
1544 register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \ | |
1545 register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \ | |
1546 register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \ | |
1547 register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \ | |
1548 res1 = vec_mladd(but2, vprod3, op3); \ | |
1549 register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \ | |
1550 res2 = vec_mladd(but2S, vprod3, op3S); \ | |
1551 } | |
1552 ONEITERBUTTERFLY(0, temp0, temp0S); | |
1553 ONEITERBUTTERFLY(1, temp1, temp1S); | |
1554 ONEITERBUTTERFLY(2, temp2, temp2S); | |
1555 ONEITERBUTTERFLY(3, temp3, temp3S); | |
1556 ONEITERBUTTERFLY(4, temp4, temp4S); | |
1557 ONEITERBUTTERFLY(5, temp5, temp5S); | |
1558 ONEITERBUTTERFLY(6, temp6, temp6S); | |
1559 ONEITERBUTTERFLY(7, temp7, temp7S); | |
1560 } | |
1561 #undef ONEITERBUTTERFLY | |
1562 { | |
1563 register vector signed int vsum; | |
1564 register vector signed short line0 = vec_add(temp0, temp1); | |
1565 register vector signed short line1 = vec_sub(temp0, temp1); | |
1566 register vector signed short line2 = vec_add(temp2, temp3); | |
1567 register vector signed short line3 = vec_sub(temp2, temp3); | |
1568 register vector signed short line4 = vec_add(temp4, temp5); | |
1569 register vector signed short line5 = vec_sub(temp4, temp5); | |
1570 register vector signed short line6 = vec_add(temp6, temp7); | |
1571 register vector signed short line7 = vec_sub(temp6, temp7); | |
1572 | |
1573 register vector signed short line0B = vec_add(line0, line2); | |
1574 register vector signed short line2B = vec_sub(line0, line2); | |
1575 register vector signed short line1B = vec_add(line1, line3); | |
1576 register vector signed short line3B = vec_sub(line1, line3); | |
1577 register vector signed short line4B = vec_add(line4, line6); | |
1578 register vector signed short line6B = vec_sub(line4, line6); | |
1579 register vector signed short line5B = vec_add(line5, line7); | |
1580 register vector signed short line7B = vec_sub(line5, line7); | |
1581 | |
1582 register vector signed short line0C = vec_add(line0B, line4B); | |
1583 register vector signed short line4C = vec_sub(line0B, line4B); | |
1584 register vector signed short line1C = vec_add(line1B, line5B); | |
1585 register vector signed short line5C = vec_sub(line1B, line5B); | |
1586 register vector signed short line2C = vec_add(line2B, line6B); | |
1587 register vector signed short line6C = vec_sub(line2B, line6B); | |
1588 register vector signed short line3C = vec_add(line3B, line7B); | |
1589 register vector signed short line7C = vec_sub(line3B, line7B); | |
1590 | |
1591 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); | |
1592 vsum = vec_sum4s(vec_abs(line1C), vsum); | |
1593 vsum = vec_sum4s(vec_abs(line2C), vsum); | |
1594 vsum = vec_sum4s(vec_abs(line3C), vsum); | |
1595 vsum = vec_sum4s(vec_abs(line4C), vsum); | |
1596 vsum = vec_sum4s(vec_abs(line5C), vsum); | |
1597 vsum = vec_sum4s(vec_abs(line6C), vsum); | |
1598 vsum = vec_sum4s(vec_abs(line7C), vsum); | |
1599 | |
1600 register vector signed short line0S = vec_add(temp0S, temp1S); | |
1601 register vector signed short line1S = vec_sub(temp0S, temp1S); | |
1602 register vector signed short line2S = vec_add(temp2S, temp3S); | |
1603 register vector signed short line3S = vec_sub(temp2S, temp3S); | |
1604 register vector signed short line4S = vec_add(temp4S, temp5S); | |
1605 register vector signed short line5S = vec_sub(temp4S, temp5S); | |
1606 register vector signed short line6S = vec_add(temp6S, temp7S); | |
1607 register vector signed short line7S = vec_sub(temp6S, temp7S); | |
1608 | |
1609 register vector signed short line0BS = vec_add(line0S, line2S); | |
1610 register vector signed short line2BS = vec_sub(line0S, line2S); | |
1611 register vector signed short line1BS = vec_add(line1S, line3S); | |
1612 register vector signed short line3BS = vec_sub(line1S, line3S); | |
1613 register vector signed short line4BS = vec_add(line4S, line6S); | |
1614 register vector signed short line6BS = vec_sub(line4S, line6S); | |
1615 register vector signed short line5BS = vec_add(line5S, line7S); | |
1616 register vector signed short line7BS = vec_sub(line5S, line7S); | |
1617 | |
1618 register vector signed short line0CS = vec_add(line0BS, line4BS); | |
1619 register vector signed short line4CS = vec_sub(line0BS, line4BS); | |
1620 register vector signed short line1CS = vec_add(line1BS, line5BS); | |
1621 register vector signed short line5CS = vec_sub(line1BS, line5BS); | |
1622 register vector signed short line2CS = vec_add(line2BS, line6BS); | |
1623 register vector signed short line6CS = vec_sub(line2BS, line6BS); | |
1624 register vector signed short line3CS = vec_add(line3BS, line7BS); | |
1625 register vector signed short line7CS = vec_sub(line3BS, line7BS); | |
1626 | |
1627 vsum = vec_sum4s(vec_abs(line0CS), vsum); | |
1628 vsum = vec_sum4s(vec_abs(line1CS), vsum); | |
1629 vsum = vec_sum4s(vec_abs(line2CS), vsum); | |
1630 vsum = vec_sum4s(vec_abs(line3CS), vsum); | |
1631 vsum = vec_sum4s(vec_abs(line4CS), vsum); | |
1632 vsum = vec_sum4s(vec_abs(line5CS), vsum); | |
1633 vsum = vec_sum4s(vec_abs(line6CS), vsum); | |
1634 vsum = vec_sum4s(vec_abs(line7CS), vsum); | |
1635 vsum = vec_sums(vsum, (vector signed int)vzero); | |
1636 vsum = vec_splat(vsum, 3); | |
1637 vec_ste(vsum, 0, &sum); | |
1638 } | |
1639 return sum; | |
1640 } | |
1641 | |
1642 int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ | |
1643 POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1); | |
1644 int score; | |
1645 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1); | |
1646 score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); | |
1647 if (h==16) { | |
1648 dst += 8*stride; | |
1649 src += 8*stride; | |
1650 score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); | |
1651 } | |
1652 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1); | |
1653 return score; | |
1432 } | 1654 } |
1433 | 1655 |
1434 int has_altivec(void) | 1656 int has_altivec(void) |
1435 { | 1657 { |
1436 #ifdef CONFIG_DARWIN | 1658 #ifdef CONFIG_DARWIN |