comparison: libavcodec/x86/vp8dsp.asm @ changeset 12168:b246b214c2e9

VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.

author    rbultje
date      Thu, 15 Jul 2010 23:02:34 +0000
parents   d780ae746855
children  c47ddb7df424
compared against parent 12167:69bbfd8f2ba5
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
144
145 pw_20091: times 4 dw 20091
146 pw_17734: times 4 dw 17734
147
148 cextern pb_1
149 cextern pw_3
150 cextern pb_3
151 cextern pw_4
152 cextern pb_4
153 cextern pw_64
1201 movd [%6+%9*2], m%2
1202 movd [%7+%8*2], m%3
1203 movd [%7+%9*2], m%4
1204 %endmacro
1205
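; SPLATB_REG %1=dst mmreg, %2=gp reg holding the byte, %3=cpu name:
; broadcasts the low byte of %2 into every byte lane of %1 -- pshufd on
; SSE2, pshufw on MMXEXT, punpcklwd/punpckldq on plain MMX.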
1206 %macro SPLATB_REG 3
1207 movd %1, %2
1208 punpcklbw %1, %1
1209 %if mmsize == 16 ; sse2
1210 punpcklwd %1, %1
1211 pshufd %1, %1, 0x0
1212 %elifidn %3, mmx
1213 punpcklwd %1, %1
1214 punpckldq %1, %1
1215 %else ; mmxext
1216 pshufw %1, %1, 0x0
1217 %endif
1218 %endmacro
1219
1220 %macro SIMPLE_LOOPFILTER 3
1221 cglobal vp8_%2_loop_filter_simple_%1, 3, %3
1222 %ifidn %2, h
1223 mov r5, rsp ; backup stack pointer
1224 and rsp, ~(mmsize-1) ; align stack
1225 %endif
1226 %if mmsize == 8 ; mmx/mmxext
1227 mov r3, 2
1228 %endif
1229 SPLATB_REG m7, r2, %1 ; splat "flim" into register

deleted (old lines 1215-1226), now provided by the SPLATB_REG macro above:
    ; splat register with "flim"
    movd m7, r2
    punpcklbw m7, m7
    %if mmsize == 16 ; sse2
    punpcklwd m7, m7
    pshufd m7, m7, 0x0
    %elifidn %1, mmx
    punpcklwd m7, m7
    punpckldq m7, m7
    %else ; mmxext
    pshufw m7, m7, 0x0
    %endif

1230
1231 ; set up indexes to address 4 rows
1232 mov r2, r1
1233 neg r1
1234 %ifidn %2, h
1370 SIMPLE_LOOPFILTER mmxext, v, 4
1371 SIMPLE_LOOPFILTER mmxext, h, 6
1372 INIT_XMM
1373 SIMPLE_LOOPFILTER sse2, v, 3
1374 SIMPLE_LOOPFILTER sse2, h, 6
1375
1376 ;-----------------------------------------------------------------------------
1377 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, int stride,
1378 ; int flimE, int flimI, int hev_thr);
1379 ;-----------------------------------------------------------------------------
1380
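; For reference, a rough scalar sketch in C (not part of this changeset; all
; names are illustrative) of what the SIMD below computes per filtered edge,
; loosely following the C reference in vp8dsp.c: build the normal_limit and
; high-edge-variance masks, then apply the common filter step.
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static int8_t  clip_i8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }
;   static uint8_t clip_u8(int v) { return v <    0 ?    0 : v > 255 ? 255 : v; }
;
;   /* px[] = { p3, p2, p1, p0, q0, q1, q2, q3 } across the filtered edge */
;   static void inner_filter_edge(uint8_t *px, int E, int I, int hev_thresh)
;   {
;       int p3 = px[0], p2 = px[1], p1 = px[2], p0 = px[3];
;       int q0 = px[4], q1 = px[5], q2 = px[6], q3 = px[7];
;
;       /* normal_limit: simple_limit plus all neighbouring deltas <= I */
;       if (2 * abs(p0 - q0) + (abs(p1 - q1) >> 1) > E ||
;           abs(p3 - p2) > I || abs(p2 - p1) > I || abs(p1 - p0) > I ||
;           abs(q3 - q2) > I || abs(q2 - q1) > I || abs(q1 - q0) > I)
;           return;
;
;       /* high edge variance: p1-q1 feeds the filter, p1/q1 stay untouched */
;       int hev = abs(p1 - p0) > hev_thresh || abs(q1 - q0) > hev_thresh;
;
;       int a  = clip_i8(3 * (q0 - p0) + (hev ? clip_i8(p1 - q1) : 0));
;       int f1 = clip_i8(a + 4) >> 3;          /* subtracted from q0 */
;       int f2 = clip_i8(a + 3) >> 3;          /* added to p0 */
;       px[3] = clip_u8(p0 + f2);
;       px[4] = clip_u8(q0 - f1);
;       if (!hev) {                            /* otherwise also adjust p1/q1 */
;           a = (f1 + 1) >> 1;
;           px[2] = clip_u8(p1 + a);
;           px[5] = clip_u8(q1 - a);
;       }
;   }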
1381 %macro INNER_LOOPFILTER 4
1382 cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
1383 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
1384 ; splat function arguments
1385 SPLATB_REG m0, r2, %1 ; E
1386 SPLATB_REG m1, r3, %1 ; I
1387 SPLATB_REG m2, r4, %1 ; hev_thresh
1388
1389 ; align stack
1390 mov r4, rsp ; backup stack pointer
1391 and rsp, ~(mmsize-1) ; align stack
1392 %ifidn %2, v
1393 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
1394 ; [3]=hev() result
1395 %else ; h
1396 sub rsp, mmsize * 6 ; extra storage space for transposes
1397 %endif
1398
1399 %define flim_E [rsp]
1400 %define flim_I [rsp+mmsize]
1401 %define hev_thr [rsp+mmsize*2]
1402 %define mask_res [rsp+mmsize*3]
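; (in the h case, [rsp+mmsize*4] and [rsp+mmsize*5] additionally serve as
;  scratch during the 8x8/8x16 transpose and then hold q0 and p0)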
1403
1404 mova flim_E, m0
1405 mova flim_I, m1
1406 mova hev_thr, m2
1407
1408 %else ; sse2 on x86-64
1409
1410 %define flim_E m9
1411 %define flim_I m10
1412 %define hev_thr m11
1413 %define mask_res m12
1414
1415 ; splat function arguments
1416 SPLATB_REG flim_E, r2, %1 ; E
1417 SPLATB_REG flim_I, r3, %1 ; I
1418 SPLATB_REG hev_thr, r4, %1 ; hev_thresh
1419 %endif
1420
1421 %if mmsize == 8 ; mmx/mmxext
1422 mov r5, 2
1423 %endif
1424 mov r2, r1
1425 neg r1
1426 %ifidn %2, h
1427 lea r0, [r0+4*r2-4]
1428 %endif
1429
1430 %if mmsize == 8
1431 .next8px
1432 %endif
1433 ; read
1434 lea r3, [r0+r2]
1435 %ifidn %2, v
1436 mova m0, [r0+r1*4] ; p3
1437 mova m1, [r3+r1*4] ; p2
1438 mova m2, [r0+r1*2] ; p1
1439 mova m5, [r3] ; q1
1440 mova m6, [r3+r2] ; q2
1441 mova m7, [r3+r2*2] ; q3
1442 %elif mmsize == 8 ; mmx/mmxext (h)
1443 ; read 8 rows of 8px each
1444 movu m0, [r0+r1*4]
1445 movu m1, [r3+r1*4]
1446 movu m2, [r0+r1*2]
1447 movu m3, [r0+r1]
1448 movu m4, [r0]
1449 movu m5, [r3]
1450 movu m6, [r3+r2]
1451
1452 ; 8x8 transpose
1453 TRANSPOSE4x4B 0, 1, 2, 3, 7
1454 %ifdef m13
1455 SWAP 1, 13
1456 %else
1457 mova [rsp+mmsize*4], m1
1458 %endif
1459 movu m7, [r3+r2*2]
1460 TRANSPOSE4x4B 4, 5, 6, 7, 1
1461 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1462 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1463 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1464 %ifdef m13
1465 SWAP 1, 13
1466 SWAP 2, 13
1467 %else
1468 mova m1, [rsp+mmsize*4]
1469 mova [rsp+mmsize*4], m2 ; store q0
1470 %endif
1471 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1472 %ifdef m14
1473 SWAP 5, 14
1474 %else
1475 mova [rsp+mmsize*5], m5 ; store p0
1476 %endif
1477 SWAP 1, 4
1478 SWAP 2, 4
1479 SWAP 6, 3
1480 SWAP 5, 3
1481 %else ; sse2 (h)
1482 lea r5, [r0+r2*8]
1483
1484 ; read 16 rows of 8px each, interleave
1485 movh m0, [r0+r1*4]
1486 movh m1, [r5+r1*4]
1487 movh m2, [r0+r1*2]
1488 movh m5, [r5+r1*2]
1489 movh m3, [r0+r1]
1490 movh m6, [r5+r1]
1491 movh m4, [r0]
1492 movh m7, [r5]
1493 punpcklbw m0, m1 ; A/I
1494 punpcklbw m2, m5 ; C/K
1495 punpcklbw m3, m6 ; D/L
1496 punpcklbw m4, m7 ; E/M
1497
1498 add r5, r2
1499 movh m1, [r3+r1*4]
1500 movh m6, [r5+r1*4]
1501 movh m5, [r3]
1502 movh m7, [r5]
1503 punpcklbw m1, m6 ; B/J
1504 punpcklbw m5, m7 ; F/N
1505 movh m6, [r3+r2]
1506 movh m7, [r5+r2]
1507 punpcklbw m6, m7 ; G/O
1508
1509 ; 8x16 transpose
1510 TRANSPOSE4x4B 0, 1, 2, 3, 7
1511 %ifdef m13
1512 SWAP 1, 13
1513 %else
1514 mova [rsp+mmsize*4], m1
1515 %endif
1516 movh m7, [r3+r2*2]
1517 movh m1, [r5+r2*2]
1518 punpcklbw m7, m1 ; H/P
1519 TRANSPOSE4x4B 4, 5, 6, 7, 1
1520 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1521 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1522 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1523 %ifdef m13
1524 SWAP 1, 13
1525 SWAP 2, 13
1526 %else
1527 mova m1, [rsp+mmsize*4]
1528 mova [rsp+mmsize*4], m2 ; store q0
1529 %endif
1530 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1531 %ifdef m14
1532 SWAP 5, 14
1533 %else
1534 mova [rsp+mmsize*5], m5 ; store p0
1535 %endif
1536 SWAP 1, 4
1537 SWAP 2, 4
1538 SWAP 6, 3
1539 SWAP 5, 3
1540 %endif
1541
1542 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
1543 mova m4, m1
1544 SWAP 4, 1
1545 psubusb m4, m0 ; p2-p3
1546 psubusb m0, m1 ; p3-p2
1547 por m0, m4 ; abs(p3-p2)
1548
1549 mova m4, m2
1550 SWAP 4, 2
1551 psubusb m4, m1 ; p1-p2
1552 psubusb m1, m2 ; p2-p1
1553 por m1, m4 ; abs(p2-p1)
1554
1555 mova m4, m6
1556 SWAP 4, 6
1557 psubusb m4, m7 ; q2-q3
1558 psubusb m7, m6 ; q3-q2
1559 por m7, m4 ; abs(q3-q2)
1560
1561 mova m4, m5
1562 SWAP 4, 5
1563 psubusb m4, m6 ; q1-q2
1564 psubusb m6, m5 ; q2-q1
1565 por m6, m4 ; abs(q2-q1)
1566
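; plain MMX has no pmaxub, so collapse the "<= I" checks with
; psubusb/pcmpeqb/pand instead of taking a byte-wise maximum first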
1567 %ifidn %1, mmx
1568 %ifdef m10
1569 SWAP 4, 10
1570 %else
1571 mova m4, [rsp+mmsize]
1572 %endif
1573 pxor m3, m3
1574 psubusb m0, m4
1575 psubusb m1, m4
1576 psubusb m7, m4
1577 psubusb m6, m4
1578 pcmpeqb m0, m3 ; abs(p3-p2) <= I
1579 pcmpeqb m1, m3 ; abs(p2-p1) <= I
1580 pcmpeqb m7, m3 ; abs(q3-q2) <= I
1581 pcmpeqb m6, m3 ; abs(q2-q1) <= I
1582 pand m0, m1
1583 pand m7, m6
1584 pand m0, m7
1585 %else ; mmxext/sse2
1586 pmaxub m0, m1
1587 pmaxub m6, m7
1588 pmaxub m0, m6
1589 %endif
1590
1591 ; normal_limit and high_edge_variance for p1-p0, q1-q0
1592 SWAP 7, 3 ; now m7 is zero
1593 %ifidn %2, v
1594 mova m3, [r0+r1] ; p0
1595 %elifdef m14
1596 SWAP 3, 14
1597 %else
1598 mova m3, [rsp+mmsize*5]
1599 %endif
1600
1601 mova m1, m2
1602 SWAP 1, 2
1603 mova m6, m3
1604 SWAP 3, 6
1605 psubusb m1, m3 ; p1-p0
1606 psubusb m6, m2 ; p0-p1
1607 por m1, m6 ; abs(p1-p0)
1608 %ifidn %1, mmx
1609 mova m6, m1
1610 psubusb m1, m4
1611 psubusb m6, hev_thr
1612 pcmpeqb m1, m7 ; abs(p1-p0) <= I
1613 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
1614 pand m0, m1
1615 %ifdef m12
1616 SWAP 6, 12
1617 %else
1618 mova [rsp+mmsize*3], m6
1619 %endif
1620 %else ; mmxext/sse2
1621 pmaxub m0, m1 ; max_I
1622 SWAP 1, 4 ; max_hev_thresh
1623 %endif
1624
1625 SWAP 6, 4 ; now m6 is I
1626 %ifidn %2, v
1627 mova m4, [r0] ; q0
1628 %elifdef m13
1629 SWAP 4, 13
1630 %else
1631 mova m4, [rsp+mmsize*4]
1632 %endif
1633 mova m1, m4
1634 SWAP 1, 4
1635 mova m7, m5
1636 SWAP 7, 5
1637 psubusb m1, m5 ; q0-q1
1638 psubusb m7, m4 ; q1-q0
1639 por m1, m7 ; abs(q1-q0)
1640 %ifidn %1, mmx
1641 mova m7, m1
1642 psubusb m1, m6
1643 psubusb m7, hev_thr
1644 pxor m6, m6
1645 pcmpeqb m1, m6 ; abs(q1-q0) <= I
1646 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
1647 %ifdef m12
1648 SWAP 6, 12
1649 %else
1650 mova m6, [rsp+mmsize*3]
1651 %endif
1652 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
1653 pand m6, m7
1654 %else ; mmxext/sse2
1655 pxor m7, m7
1656 pmaxub m0, m1
1657 pmaxub m6, m1
1658 psubusb m0, flim_I
1659 psubusb m6, hev_thr
1660 pcmpeqb m0, m7 ; max(abs(..)) <= I
1661 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
1662 %endif
1663 %ifdef m12
1664 SWAP 6, 12
1665 %else
1666 mova [rsp+mmsize*3], m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
1667 %endif
1668
1669 ; simple_limit
1670 mova m1, m3
1671 SWAP 1, 3
1672 mova m6, m4 ; keep copies of p0/q0 around for later use
1673 SWAP 6, 4
1674 psubusb m1, m4 ; p0-q0
1675 psubusb m6, m3 ; q0-p0
1676 por m1, m6 ; abs(q0-p0)
1677 paddusb m1, m1 ; m1=2*abs(q0-p0)
1678
1679 mova m7, m2
1680 SWAP 7, 2
1681 mova m6, m5
1682 SWAP 6, 5
1683 psubusb m7, m5 ; p1-q1
1684 psubusb m6, m2 ; q1-p1
1685 por m7, m6 ; abs(q1-p1)
1686 pxor m6, m6
1687 pand m7, [pb_FE]
1688 psrlq m7, 1 ; abs(q1-p1)/2
1689 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
1690 psubusb m7, flim_E
1691 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
1692 pand m0, m7 ; normal_limit result
1693
1694 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
1695 %ifdef m8 ; x86-64 && sse2
1696 mova m8, [pb_80]
1697 %define pb_80_var m8
1698 %else ; x86-32 or mmx/mmxext
1699 %define pb_80_var [pb_80]
1700 %endif
1701 mova m1, m4
1702 mova m7, m3
1703 pxor m1, pb_80_var
1704 pxor m7, pb_80_var
1705 psubsb m1, m7 ; (signed) q0-p0
1706 mova m6, m2
1707 mova m7, m5
1708 pxor m6, pb_80_var
1709 pxor m7, pb_80_var
1710 psubsb m6, m7 ; (signed) p1-q1
1711 mova m7, mask_res
1712 pandn m7, m6
1713 paddsb m7, m1
1714 paddsb m7, m1
1715 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
1716
1717 pand m7, m0
1718 mova m1, [pb_F8]
1719 mova m6, m7
1720 paddsb m7, [pb_3]
1721 paddsb m6, [pb_4]
1722 pand m7, m1
1723 pand m6, m1
1724
1725 pxor m1, m1
1726 pxor m0, m0
1727 pcmpgtb m1, m7
1728 psubb m0, m7
1729 psrlq m7, 3 ; +f2
1730 psrlq m0, 3 ; -f2
1731 pand m0, m1
1732 pandn m1, m7
1733 psubusb m3, m0
1734 paddusb m3, m1 ; p0+f2
1735
1736 pxor m1, m1
1737 pxor m0, m0
1738 pcmpgtb m0, m6
1739 psubb m1, m6
1740 psrlq m6, 3 ; +f1
1741 psrlq m1, 3 ; -f1
1742 pand m1, m0
1743 pandn m0, m6
1744 psubusb m4, m0
1745 paddusb m4, m1 ; q0-f1
1746
1747 %ifdef m12
1748 SWAP 6, 12
1749 %else
1750 mova m6, [rsp+mmsize*3]
1751 %endif
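; the (f1+1)>>1 rounding for the p1/q1 update: mmxext/sse2 get it from
; pavgb against zero, plain MMX rebuilds it with paddusb/pand/psrlq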
1752 %ifidn %1, mmx
1753 mova m7, [pb_1]
1754 %else ; mmxext/sse2
1755 pxor m7, m7
1756 %endif
1757 pand m0, m6
1758 pand m1, m6
1759 %ifidn %1, mmx
1760 paddusb m0, m7
1761 pand m1, [pb_FE]
1762 pandn m7, m0
1763 psrlq m1, 1
1764 psrlq m7, 1
1765 SWAP 0, 7
1766 %else ; mmxext/sse2
1767 psubusb m1, [pb_1]
1768 pavgb m0, m7 ; a
1769 pavgb m1, m7 ; -a
1770 %endif
1771 psubusb m5, m0
1772 psubusb m2, m1
1773 paddusb m5, m1 ; q1-a
1774 paddusb m2, m0 ; p1+a
1775
1776 ; store
1777 %ifidn %2, v
1778 mova [r0+r1*2], m2
1779 mova [r0+r1], m3
1780 mova [r0], m4
1781 mova [r0+r2], m5
1782 %else ; h
1783 add r0, 2
1784 add r3, 2
1785
1786 ; 4x8/16 transpose
1787 TRANSPOSE4x4B 2, 3, 4, 5, 6
1788
1789 %if mmsize == 8 ; mmx/mmxext (h)
1790 WRITE_4x2D 2, 3, 4, 5, r0, r3, r1, r2
1791 %else ; sse2 (h)
1792 lea r5, [r5+r1+2]
1793 WRITE_4x4D 2, 3, 4, 5, r0, r3, r5, r1, r2
1794 %endif
1795 %endif
1796
1797 %if mmsize == 8
1798 %ifidn %2, h
1799 lea r0, [r0+8*r2-2]
1800 %else ; v
1801 add r0, 8
1802 %endif
1803 dec r5
1804 jg .next8px
1805 %endif
1806
1807 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
1808 mov rsp, r4 ; restore stack pointer
1809 RET
1810 %else ; sse2 on x86-64
1811 REP_RET
1812 %endif
1813 %endmacro
1814
1815 INIT_MMX
1816 INNER_LOOPFILTER mmx, v, 6, 8
1817 INNER_LOOPFILTER mmx, h, 6, 8
1818 INNER_LOOPFILTER mmxext, v, 6, 8
1819 INNER_LOOPFILTER mmxext, h, 6, 8
1820 INIT_XMM
1821 INNER_LOOPFILTER sse2, v, 5, 13
1822 INNER_LOOPFILTER sse2, h, 6, 15
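; The functions above are exported (with the usual ff_ prefix added by cglobal)
; as ff_vp8_{v,h}_loop_filter16_inner_{mmx,mmxext,sse2} and are intended to be
; hooked up from the x86 VP8 DSP init code. Purely as an illustrative sketch --
; the real VP8DSPContext field and CPU-flag names in vp8dsp-init.c may differ:
;
;   if (mm_flags & FF_MM_SSE2) {
;       c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_sse2;
;       c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_sse2;
;   }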