Mercurial > libavcodec.hg
comparison: x86/vp8dsp.asm @ 12168:b246b214c2e9 (libavcodec)
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
author:   rbultje
date:     Thu, 15 Jul 2010 23:02:34 +0000
parents:  d780ae746855
children: c47ddb7df424
comparing revision 12167:69bbfd8f2ba5 with 12168:b246b214c2e9
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | 143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 |
144 | 144 |
145 pw_20091: times 4 dw 20091 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | 146 pw_17734: times 4 dw 17734 |
147 | 147 |
148 cextern pb_1 | |
148 cextern pw_3 | 149 cextern pw_3 |
149 cextern pb_3 | 150 cextern pb_3 |
150 cextern pw_4 | 151 cextern pw_4 |
151 cextern pb_4 | 152 cextern pb_4 |
152 cextern pw_64 | 153 cextern pw_64 |
1200 movd [%6+%9*2], m%2 | 1201 movd [%6+%9*2], m%2 |
1201 movd [%7+%8*2], m%3 | 1202 movd [%7+%8*2], m%3 |
1202 movd [%7+%9*2], m%4 | 1203 movd [%7+%9*2], m%4 |
1203 %endmacro | 1204 %endmacro |
1204 | 1205 |
;-----------------------------------------------------------------------------
; SPLATB_REG dst, gpr, opt
; Broadcast the low byte of GPR %2 into every byte lane of SIMD register %1.
; %3 names the cpu flavour (mmx/mmxext) used to pick the mmsize==8 codepath.
;-----------------------------------------------------------------------------
%macro SPLATB_REG 3
    movd       %1, %2
    punpcklbw  %1, %1                   ; byte -> duplicated in each word lane
%if mmsize == 8
%ifidn %3, mmx
    punpcklwd  %1, %1                   ; plain mmx: widen by repeated unpacks
    punpckldq  %1, %1
%else
    pshufw     %1, %1, 0x0              ; mmxext: one shuffle does the rest
%endif
%else
    punpcklwd  %1, %1                   ; sse2 (mmsize == 16)
    pshufd     %1, %1, 0x0
%endif
%endmacro
1219 | |
1205 %macro SIMPLE_LOOPFILTER 3 | 1220 %macro SIMPLE_LOOPFILTER 3 |
1206 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 | 1221 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 |
1207 %ifidn %2, h | 1222 %ifidn %2, h |
1208 mov r5, rsp ; backup stack pointer | 1223 mov r5, rsp ; backup stack pointer |
1209 and rsp, ~(mmsize-1) ; align stack | 1224 and rsp, ~(mmsize-1) ; align stack |
1210 %endif | 1225 %endif |
1211 %if mmsize == 8 ; mmx/mmxext | 1226 %if mmsize == 8 ; mmx/mmxext |
1212 mov r3, 2 | 1227 mov r3, 2 |
1213 %endif | 1228 %endif |
1214 | 1229 SPLATB_REG m7, r2, %1 ; splat "flim" into register |
1215 ; splat register with "flim" | |
1216 movd m7, r2 | |
1217 punpcklbw m7, m7 | |
1218 %if mmsize == 16 ; sse2 | |
1219 punpcklwd m7, m7 | |
1220 pshufd m7, m7, 0x0 | |
1221 %elifidn %1, mmx | |
1222 punpcklwd m7, m7 | |
1223 punpckldq m7, m7 | |
1224 %else ; mmxext | |
1225 pshufw m7, m7, 0x0 | |
1226 %endif | |
1227 | 1230 |
1228 ; set up indexes to address 4 rows | 1231 ; set up indexes to address 4 rows |
1229 mov r2, r1 | 1232 mov r2, r1 |
1230 neg r1 | 1233 neg r1 |
1231 %ifidn %2, h | 1234 %ifidn %2, h |
; Instantiate vp8_{v,h}_loop_filter_simple_<opt>; args are
; (opt, direction, gpr count) — presumably per the x86inc cglobal
; convention, TODO confirm against the SIMPLE_LOOPFILTER definition above.
SIMPLE_LOOPFILTER mmxext, v, 4
SIMPLE_LOOPFILTER mmxext, h, 6
INIT_XMM                                ; switch to 16-byte xmm registers
SIMPLE_LOOPFILTER sse2, v, 3
SIMPLE_LOOPFILTER sse2, h, 6
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; INNER_LOOPFILTER opt, dir, num_gpr, num_xmm
; Emits vp8_<dir>_loop_filter16_inner_<opt>(uint8_t *dst, int stride,
;                                           int flimE, int flimI, int hev_thr)
; %1 = cpu flavour (mmx/mmxext/sse2), %2 = direction (v/h),
; %3/%4 = register counts passed to cglobal (GPR / xmm; x86inc convention —
;         TODO confirm against x86inc.asm).
; On entry: r0=dst, r1=stride, r2=flimE, r3=flimI, r4=hev_thresh.
; x86-64 sse2 keeps the splatted thresholds in m9-m12 (no stack spill);
; everywhere else they are spilled to an aligned stack scratch area.
;-----------------------------------------------------------------------------
%macro INNER_LOOPFILTER 4
cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
    SPLATB_REG       m0, r2, %1        ; E
    SPLATB_REG       m1, r3, %1        ; I
    SPLATB_REG       m2, r4, %1        ; hev_thresh

    ; align stack
    mov              r4, rsp           ; backup stack pointer
    and             rsp, ~(mmsize-1)   ; align stack
%ifidn %2, v
    sub             rsp, mmsize * 4    ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                       ;               [3]=hev() result
%else ; h
    sub             rsp, mmsize * 6    ; extra storage space for transposes
%endif

%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]

    mova         flim_E, m0
    mova         flim_I, m1
    mova        hev_thr, m2

%else ; sse2 on x86-64

%define flim_E   m9
%define flim_I   m10
%define hev_thr  m11
%define mask_res m12

    ; splat function arguments
    SPLATB_REG   flim_E, r2, %1        ; E
    SPLATB_REG   flim_I, r3, %1        ; I
    SPLATB_REG  hev_thr, r4, %1        ; hev_thresh
%endif

%if mmsize == 8 ; mmx/mmxext
    mov              r5, 2             ; 8px at a time -> two passes over 16px
%endif
    mov              r2, r1            ; r2 = +stride
    neg              r1                ; r1 = -stride (rows above the edge)
%ifidn %2, h
    lea              r0, [r0+4*r2-4]   ; h: start 4 rows down, 4px left of edge
%endif

%if mmsize == 8
.next8px
%endif
    ; read
    lea              r3, [r0+r2]
%ifidn %2, v
    mova             m0, [r0+r1*4]     ; p3
    mova             m1, [r3+r1*4]     ; p2
    mova             m2, [r0+r1*2]     ; p1
    mova             m5, [r3]          ; q1
    mova             m6, [r3+r2]       ; q2
    mova             m7, [r3+r2*2]     ; q3
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [r0+r1*4]
    movu             m1, [r3+r1*4]
    movu             m2, [r0+r1*2]
    movu             m3, [r0+r1]
    movu             m4, [r0]
    movu             m5, [r3]
    movu             m6, [r3+r2]

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m13
    SWAP              1, 13            ; x86-64: park row in a spare register
%else
    mova [rsp+mmsize*4], m1            ; x86-32: spill to stack scratch
%endif
    movu             m7, [r3+r2*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1       ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1       ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1       ; q2/q3
%ifdef m13
    SWAP              1, 13
    SWAP              2, 13
%else
    mova             m1, [rsp+mmsize*4]
    mova [rsp+mmsize*4], m2            ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2       ; p1/p0
%ifdef m14
    SWAP              5, 14
%else
    mova [rsp+mmsize*5], m5            ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
    lea              r5, [r0+r2*8]

    ; read 16 rows of 8px each, interleave
    movh             m0, [r0+r1*4]
    movh             m1, [r5+r1*4]
    movh             m2, [r0+r1*2]
    movh             m5, [r5+r1*2]
    movh             m3, [r0+r1]
    movh             m6, [r5+r1]
    movh             m4, [r0]
    movh             m7, [r5]
    punpcklbw        m0, m1            ; A/I
    punpcklbw        m2, m5            ; C/K
    punpcklbw        m3, m6            ; D/L
    punpcklbw        m4, m7            ; E/M

    add              r5, r2
    movh             m1, [r3+r1*4]
    movh             m6, [r5+r1*4]
    movh             m5, [r3]
    movh             m7, [r5]
    punpcklbw        m1, m6            ; B/J
    punpcklbw        m5, m7            ; F/N
    movh             m6, [r3+r2]
    movh             m7, [r5+r2]
    punpcklbw        m6, m7            ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m13
    SWAP              1, 13
%else
    mova [rsp+mmsize*4], m1
%endif
    movh             m7, [r3+r2*2]
    movh             m1, [r5+r2*2]
    punpcklbw        m7, m1            ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1       ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1       ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1       ; q2/q3
%ifdef m13
    SWAP              1, 13
    SWAP              2, 13
%else
    mova             m1, [rsp+mmsize*4]
    mova [rsp+mmsize*4], m2            ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2       ; p1/p0
%ifdef m14
    SWAP              5, 14
%else
    mova [rsp+mmsize*5], m5            ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    ; (abs(a-b) built from the two saturating differences OR'd together)
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0            ; p2-p3
    psubusb          m0, m1            ; p3-p2
    por              m0, m4            ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1            ; p1-p2
    psubusb          m1, m2            ; p2-p1
    por              m1, m4            ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7            ; q2-q3
    psubusb          m7, m6            ; q3-q2
    por              m7, m4            ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6            ; q1-q2
    psubusb          m6, m5            ; q2-q1
    por              m6, m4            ; abs(q2-q1)

%ifidn %1, mmx
    ; no pmaxub on plain mmx: compare each abs() against I individually
%ifdef m10
    SWAP              4, 10
%else
    mova             m4, [rsp+mmsize]  ; reload I
%endif
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3            ; abs(p3-p2) <= I
    pcmpeqb          m1, m3            ; abs(p2-p1) <= I
    pcmpeqb          m7, m3            ; abs(q3-q2) <= I
    pcmpeqb          m6, m3            ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    ; fold all abs() into one running max; a single compare vs I comes later
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3             ; now m7 is zero
%ifidn %2, v
    mova             m3, [r0+r1]       ; p0
%elifdef m14
    SWAP              3, 14
%else
    mova             m3, [rsp+mmsize*5]
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3            ; p1-p0
    psubusb          m6, m2            ; p0-p1
    por              m1, m6            ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7            ; abs(p1-p0) <= I
    pcmpeqb          m6, m7            ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
%ifdef m12
    SWAP              6, 12
%else
    mova [rsp+mmsize*3], m6
%endif
%else ; mmxext/sse2
    pmaxub           m0, m1            ; max_I
    SWAP              1, 4             ; max_hev_thresh
%endif

    SWAP              6, 4             ; now m6 is I
%ifidn %2, v
    mova             m4, [r0]          ; q0
%elifdef m13
    SWAP              4, 13
%else
    mova             m4, [rsp+mmsize*4]
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5            ; q0-q1
    psubusb          m7, m4            ; q1-q0
    por              m1, m7            ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6            ; abs(q1-q0) <= I
    pcmpeqb          m7, m6            ; abs(q1-q0) <= hev_thresh
%ifdef m12
    SWAP              6, 12
%else
    mova             m6, [rsp+mmsize*3]
%endif
    pand             m0, m1            ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7            ; max(abs(..)) <= I
    pcmpeqb          m6, m7            ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova [rsp+mmsize*3], m6            ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4            ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4            ; p0-q0
    psubusb          m6, m3            ; q0-p0
    por              m1, m6            ; abs(q0-p0)
    paddusb          m1, m1            ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5            ; p1-q1
    psubusb          m6, m2            ; q1-p1
    por              m7, m6            ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]       ; clear LSBs so the 64-bit shift below
    psrlq            m7, 1             ; abs(q1-p1)/2  (per-byte halve)
    paddusb          m7, m1            ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6            ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7            ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    ; xor with 0x80 maps unsigned pixels to signed range for psubsb/paddsb
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7            ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7            ; (signed) p1-q1
    mova             m7, mask_res
    pandn            m7, m6
    paddsb           m7, m1
    paddsb           m7, m1
    paddsb           m7, m1            ; 3*(q0-p0)+is4tap?(p1-q1)

    pand             m7, m0
    mova             m1, [pb_F8]       ; keep top 5 bits: value stays a
    mova             m6, m7            ; multiple of 8 across the >>3 below
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    ; split into positive/negative magnitudes so the unsigned 64-bit shift
    ; and saturating add/sub apply the signed filter value correctly
    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3             ; +f2
    psrlq            m0, 3             ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1            ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3             ; +f1
    psrlq            m1, 3             ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1            ; q0-f1

%ifdef m12
    SWAP              6, 12
%else
    mova             m6, [rsp+mmsize*3] ; reload hev mask
%endif
%ifidn %1, mmx
    mova             m7, [pb_1]
%else ; mmxext/sse2
    pxor             m7, m7
%endif
    pand             m0, m6
    pand             m1, m6
%ifidn %1, mmx
    ; a = (f1+1)>>1 without pavgb: add 1, then halve with masked shifts
    paddusb          m0, m7
    pand             m1, [pb_FE]
    pandn            m7, m0
    psrlq            m1, 1
    psrlq            m7, 1
    SWAP              0, 7
%else ; mmxext/sse2
    psubusb          m1, [pb_1]
    pavgb            m0, m7            ; a
    pavgb            m1, m7            ; -a
%endif
    psubusb          m5, m0
    psubusb          m2, m1
    paddusb          m5, m1            ; q1-a
    paddusb          m2, m0            ; p1+a

    ; store
%ifidn %2, v
    ; rows: p1 at dst-2*stride, p0 at dst-stride, q0 at dst, q1 at dst+stride
    mova    [r0+r1*2], m2
    mova      [r0+r1], m3
    mova         [r0], m4
    mova      [r0+r2], m5
%else ; h
    add              r0, 2
    add              r3, 2

    ; 4x8/16 transpose
    TRANSPOSE4x4B     2, 3, 4, 5, 6

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        2, 3, 4, 5, r0, r3, r1, r2
%else ; sse2 (h)
    lea              r5, [r5+r1+2]
    WRITE_4x4D        2, 3, 4, 5, r0, r3, r5, r1, r2
%endif
%endif

%if mmsize == 8
%ifidn %2, h
    lea              r0, [r0+8*r2-2]   ; advance 8 rows, undo the +2 column shift
%else ; v
    add              r0, 8             ; advance 8 columns
%endif
    dec              r5
    jg .next8px
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, r4            ; restore stack pointer
    RET
%else ; sse2 on x86-64
    REP_RET
%endif
%endmacro
1814 | |
; Instantiate the inner loop filters; args are (opt, direction, gpr count,
; xmm count) — presumably per the x86inc cglobal convention, TODO confirm.
INIT_MMX
INNER_LOOPFILTER mmx, v, 6, 8
INNER_LOOPFILTER mmx, h, 6, 8
INNER_LOOPFILTER mmxext, v, 6, 8
INNER_LOOPFILTER mmxext, h, 6, 8
INIT_XMM                                ; switch to 16-byte xmm registers
INNER_LOOPFILTER sse2, v, 5, 13
INNER_LOOPFILTER sse2, h, 6, 15