Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12174:57038190cc5f libavcodec
Give x86 r%d registers names; this will simplify the implementation of the chroma
inner loopfilter, and it also allows us to save one register on x86-64/sse2.
author | rbultje |
---|---|
date | Fri, 16 Jul 2010 19:38:10 +0000 |
parents | c47ddb7df424 |
children | eda3a4e59ec5 |
comparison
equal
deleted
inserted
replaced
12173:c47ddb7df424 | 12174:57038190cc5f |
---|---|
1378 ; int flimE, int flimI, int hev_thr); | 1378 ; int flimE, int flimI, int hev_thr); |
1379 ;----------------------------------------------------------------------------- | 1379 ;----------------------------------------------------------------------------- |
1380 | 1380 |
1381 %macro INNER_LOOPFILTER 4 | 1381 %macro INNER_LOOPFILTER 4 |
1382 cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4 | 1382 cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4 |
1383 %define dst_reg r0 | |
1384 %define mstride_reg r1 | |
1385 %define stride_reg r2 | |
1386 %define E_reg r2 | |
1387 %define I_reg r3 | |
1388 %define hev_thr_reg r4 | |
1389 %ifdef m8 ; x86-64, sse2 | |
1390 %define dst8_reg r4 | |
1391 %elif mmsize == 16 ; x86-32, sse2 | |
1392 %define dst8_reg r5 | |
1393 %else ; x86-32, mmx/mmxext | |
1394 %define cnt_reg r5 | |
1395 %endif | |
1396 %define stride_reg E_reg | |
1397 %define dst2_reg I_reg | |
1398 %ifndef m8 | |
1399 %define stack_reg hev_thr_reg | |
1400 %endif | |
1401 | |
1383 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 | 1402 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
1384 ; splat function arguments | 1403 ; splat function arguments |
1385 SPLATB_REG m0, r2, %1 ; E | 1404 SPLATB_REG m0, E_reg, %1 ; E |
1386 SPLATB_REG m1, r3, %1 ; I | 1405 SPLATB_REG m1, I_reg, %1 ; I |
1387 SPLATB_REG m2, r4, %1 ; hev_thresh | 1406 SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh |
1388 | 1407 |
1389 ; align stack | 1408 ; align stack |
1390 mov r4, rsp ; backup stack pointer | 1409 mov stack_reg, rsp ; backup stack pointer |
1391 and rsp, ~(mmsize-1) ; align stack | 1410 and rsp, ~(mmsize-1) ; align stack |
1392 %ifidn %2, v | 1411 %ifidn %2, v |
1393 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr | 1412 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
1394 ; [3]=hev() result | 1413 ; [3]=hev() result |
1395 %else ; h | 1414 %else ; h |
1411 %define flim_I m10 | 1430 %define flim_I m10 |
1412 %define hev_thr m11 | 1431 %define hev_thr m11 |
1413 %define mask_res m12 | 1432 %define mask_res m12 |
1414 | 1433 |
1415 ; splat function arguments | 1434 ; splat function arguments |
1416 SPLATB_REG flim_E, r2, %1 ; E | 1435 SPLATB_REG flim_E, E_reg, %1 ; E |
1417 SPLATB_REG flim_I, r3, %1 ; I | 1436 SPLATB_REG flim_I, I_reg, %1 ; I |
1418 SPLATB_REG hev_thr, r4, %1 ; hev_thresh | 1437 SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh |
1419 %endif | 1438 %endif |
1420 | 1439 |
1421 %if mmsize == 8 ; mmx/mmxext | 1440 %if mmsize == 8 ; mmx/mmxext |
1422 mov r5, 2 | 1441 mov cnt_reg, 2 |
1423 %endif | 1442 %endif |
1424 mov r2, r1 | 1443 mov stride_reg, mstride_reg |
1425 neg r1 | 1444 neg mstride_reg |
1426 %ifidn %2, h | 1445 %ifidn %2, h |
1427 lea r0, [r0+4*r2-4] | 1446 lea dst_reg, [dst_reg + stride_reg*4-4] |
1428 %endif | 1447 %endif |
1429 | 1448 |
1430 %if mmsize == 8 | 1449 %if mmsize == 8 |
1431 .next8px | 1450 .next8px |
1432 %endif | 1451 %endif |
1433 ; read | 1452 ; read |
1434 lea r3, [r0+r2] | 1453 lea dst2_reg, [dst_reg + stride_reg] |
1435 %ifidn %2, v | 1454 %ifidn %2, v |
1436 mova m0, [r0+r1*4] ; p3 | 1455 mova m0, [dst_reg +mstride_reg*4] ; p3 |
1437 mova m1, [r3+r1*4] ; p2 | 1456 mova m1, [dst2_reg+mstride_reg*4] ; p2 |
1438 mova m2, [r0+r1*2] ; p1 | 1457 mova m2, [dst_reg +mstride_reg*2] ; p1 |
1439 mova m5, [r3] ; q1 | 1458 mova m5, [dst2_reg] ; q1 |
1440 mova m6, [r3+r2] ; q2 | 1459 mova m6, [dst2_reg+ stride_reg] ; q2 |
1441 mova m7, [r3+r2*2] ; q3 | 1460 mova m7, [dst2_reg+ stride_reg*2] ; q3 |
1442 %elif mmsize == 8 ; mmx/mmxext (h) | 1461 %elif mmsize == 8 ; mmx/mmxext (h) |
1443 ; read 8 rows of 8px each | 1462 ; read 8 rows of 8px each |
1444 movu m0, [r0+r1*4] | 1463 movu m0, [dst_reg +mstride_reg*4] |
1445 movu m1, [r3+r1*4] | 1464 movu m1, [dst2_reg+mstride_reg*4] |
1446 movu m2, [r0+r1*2] | 1465 movu m2, [dst_reg +mstride_reg*2] |
1447 movu m3, [r0+r1] | 1466 movu m3, [dst_reg +mstride_reg] |
1448 movu m4, [r0] | 1467 movu m4, [dst_reg] |
1449 movu m5, [r3] | 1468 movu m5, [dst2_reg] |
1450 movu m6, [r3+r2] | 1469 movu m6, [dst2_reg+ stride_reg] |
1451 | 1470 |
1452 ; 8x8 transpose | 1471 ; 8x8 transpose |
1453 TRANSPOSE4x4B 0, 1, 2, 3, 7 | 1472 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
1454 %ifdef m13 | 1473 %ifdef m13 |
1455 SWAP 1, 13 | 1474 SWAP 1, 13 |
1456 %else | 1475 %else |
1457 mova [rsp+mmsize*4], m1 | 1476 mova [rsp+mmsize*4], m1 |
1458 %endif | 1477 %endif |
1459 movu m7, [r3+r2*2] | 1478 movu m7, [dst2_reg+ stride_reg*2] |
1460 TRANSPOSE4x4B 4, 5, 6, 7, 1 | 1479 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
1461 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 | 1480 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
1462 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 | 1481 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
1463 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 | 1482 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
1464 %ifdef m13 | 1483 %ifdef m13 |
1477 SWAP 1, 4 | 1496 SWAP 1, 4 |
1478 SWAP 2, 4 | 1497 SWAP 2, 4 |
1479 SWAP 6, 3 | 1498 SWAP 6, 3 |
1480 SWAP 5, 3 | 1499 SWAP 5, 3 |
1481 %else ; sse2 (h) | 1500 %else ; sse2 (h) |
1482 lea r5, [r0+r2*8] | 1501 lea dst8_reg, [dst_reg + stride_reg*8] |
1483 | 1502 |
1484 ; read 16 rows of 8px each, interleave | 1503 ; read 16 rows of 8px each, interleave |
1485 movh m0, [r0+r1*4] | 1504 movh m0, [dst_reg +mstride_reg*4] |
1486 movh m1, [r5+r1*4] | 1505 movh m1, [dst8_reg+mstride_reg*4] |
1487 movh m2, [r0+r1*2] | 1506 movh m2, [dst_reg +mstride_reg*2] |
1488 movh m5, [r5+r1*2] | 1507 movh m5, [dst8_reg+mstride_reg*2] |
1489 movh m3, [r0+r1] | 1508 movh m3, [dst_reg +mstride_reg] |
1490 movh m6, [r5+r1] | 1509 movh m6, [dst8_reg+mstride_reg] |
1491 movh m4, [r0] | 1510 movh m4, [dst_reg] |
1492 movh m7, [r5] | 1511 movh m7, [dst8_reg] |
1493 punpcklbw m0, m1 ; A/I | 1512 punpcklbw m0, m1 ; A/I |
1494 punpcklbw m2, m5 ; C/K | 1513 punpcklbw m2, m5 ; C/K |
1495 punpcklbw m3, m6 ; D/L | 1514 punpcklbw m3, m6 ; D/L |
1496 punpcklbw m4, m7 ; E/M | 1515 punpcklbw m4, m7 ; E/M |
1497 | 1516 |
1498 add r5, r2 | 1517 add dst8_reg, stride_reg |
1499 movh m1, [r3+r1*4] | 1518 movh m1, [dst2_reg+mstride_reg*4] |
1500 movh m6, [r5+r1*4] | 1519 movh m6, [dst8_reg+mstride_reg*4] |
1501 movh m5, [r3] | 1520 movh m5, [dst2_reg] |
1502 movh m7, [r5] | 1521 movh m7, [dst8_reg] |
1503 punpcklbw m1, m6 ; B/J | 1522 punpcklbw m1, m6 ; B/J |
1504 punpcklbw m5, m7 ; F/N | 1523 punpcklbw m5, m7 ; F/N |
1505 movh m6, [r3+r2] | 1524 movh m6, [dst2_reg+ stride_reg] |
1506 movh m7, [r5+r2] | 1525 movh m7, [dst8_reg+ stride_reg] |
1507 punpcklbw m6, m7 ; G/O | 1526 punpcklbw m6, m7 ; G/O |
1508 | 1527 |
1509 ; 8x16 transpose | 1528 ; 8x16 transpose |
1510 TRANSPOSE4x4B 0, 1, 2, 3, 7 | 1529 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
1511 %ifdef m13 | 1530 %ifdef m13 |
1512 SWAP 1, 13 | 1531 SWAP 1, 13 |
1513 %else | 1532 %else |
1514 mova [rsp+mmsize*4], m1 | 1533 mova [rsp+mmsize*4], m1 |
1515 %endif | 1534 %endif |
1516 movh m7, [r3+r2*2] | 1535 movh m7, [dst2_reg+ stride_reg*2] |
1517 movh m1, [r5+r2*2] | 1536 movh m1, [dst8_reg+ stride_reg*2] |
1518 punpcklbw m7, m1 ; H/P | 1537 punpcklbw m7, m1 ; H/P |
1519 TRANSPOSE4x4B 4, 5, 6, 7, 1 | 1538 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
1520 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 | 1539 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
1521 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 | 1540 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
1522 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 | 1541 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
1589 %endif | 1608 %endif |
1590 | 1609 |
1591 ; normal_limit and high_edge_variance for p1-p0, q1-q0 | 1610 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
1592 SWAP 7, 3 ; now m7 is zero | 1611 SWAP 7, 3 ; now m7 is zero |
1593 %ifidn %2, v | 1612 %ifidn %2, v |
1594 mova m3, [r0+r1] ; p0 | 1613 mova m3, [dst_reg +mstride_reg] ; p0 |
1595 %elifdef m14 | 1614 %elifdef m14 |
1596 SWAP 3, 14 | 1615 SWAP 3, 14 |
1597 %else | 1616 %else |
1598 mova m3, [rsp+mmsize*5] | 1617 mova m3, [rsp+mmsize*5] |
1599 %endif | 1618 %endif |
1622 SWAP 1, 4 ; max_hev_thresh | 1641 SWAP 1, 4 ; max_hev_thresh |
1623 %endif | 1642 %endif |
1624 | 1643 |
1625 SWAP 6, 4 ; now m6 is I | 1644 SWAP 6, 4 ; now m6 is I |
1626 %ifidn %2, v | 1645 %ifidn %2, v |
1627 mova m4, [r0] ; q0 | 1646 mova m4, [dst_reg] ; q0 |
1628 %elifdef m13 | 1647 %elifdef m13 |
1629 SWAP 4, 13 | 1648 SWAP 4, 13 |
1630 %else | 1649 %else |
1631 mova m4, [rsp+mmsize*4] | 1650 mova m4, [rsp+mmsize*4] |
1632 %endif | 1651 %endif |
1773 paddusb m5, m1 ; q1-a | 1792 paddusb m5, m1 ; q1-a |
1774 paddusb m2, m0 ; p1+a | 1793 paddusb m2, m0 ; p1+a |
1775 | 1794 |
1776 ; store | 1795 ; store |
1777 %ifidn %2, v | 1796 %ifidn %2, v |
1778 mova [r0+r1*2], m2 | 1797 mova [dst_reg+mstride_reg*2], m2 |
1779 mova [r0+r1], m3 | 1798 mova [dst_reg+mstride_reg ], m3 |
1780 mova [r0], m4 | 1799 mova [dst_reg], m4 |
1781 mova [r0+r2], m5 | 1800 mova [dst_reg+ stride_reg ], m5 |
1782 %else ; h | 1801 %else ; h |
1783 add r0, 2 | 1802 add dst_reg, 2 |
1784 add r3, 2 | 1803 add dst2_reg, 2 |
1785 | 1804 |
1786 ; 4x8/16 transpose | 1805 ; 4x8/16 transpose |
1787 TRANSPOSE4x4B 2, 3, 4, 5, 6 | 1806 TRANSPOSE4x4B 2, 3, 4, 5, 6 |
1788 | 1807 |
1789 %if mmsize == 8 ; mmx/mmxext (h) | 1808 %if mmsize == 8 ; mmx/mmxext (h) |
1790 WRITE_4x2D 2, 3, 4, 5, r0, r3, r1, r2 | 1809 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg |
1791 %else ; sse2 (h) | 1810 %else ; sse2 (h) |
1792 lea r5, [r5+r1+2] | 1811 lea r5, [r5+r1+2] |
1793 WRITE_4x4D 2, 3, 4, 5, r0, r3, r5, r1, r2 | 1812 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg |
1794 %endif | 1813 %endif |
1795 %endif | 1814 %endif |
1796 | 1815 |
1797 %if mmsize == 8 | 1816 %if mmsize == 8 |
1798 %ifidn %2, h | 1817 %ifidn %2, h |
1799 lea r0, [r0+8*r2-2] | 1818 lea dst_reg, [dst_reg + stride_reg*8-2] |
1800 %else ; v | 1819 %else ; v |
1801 add r0, 8 | 1820 add dst_reg, 8 |
1802 %endif | 1821 %endif |
1803 dec r5 | 1822 dec cnt_reg |
1804 jg .next8px | 1823 jg .next8px |
1805 %endif | 1824 %endif |
1806 | 1825 |
1807 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext | 1826 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
1808 mov rsp, r4 ; restore stack pointer | 1827 mov rsp, stack_reg ; restore stack pointer |
1809 %endif | 1828 %endif |
1810 RET | 1829 RET |
1811 %endmacro | 1830 %endmacro |
1812 | 1831 |
1813 INIT_MMX | 1832 INIT_MMX |
1815 INNER_LOOPFILTER mmx, h, 6, 8 | 1834 INNER_LOOPFILTER mmx, h, 6, 8 |
1816 INNER_LOOPFILTER mmxext, v, 6, 8 | 1835 INNER_LOOPFILTER mmxext, v, 6, 8 |
1817 INNER_LOOPFILTER mmxext, h, 6, 8 | 1836 INNER_LOOPFILTER mmxext, h, 6, 8 |
1818 INIT_XMM | 1837 INIT_XMM |
1819 INNER_LOOPFILTER sse2, v, 5, 13 | 1838 INNER_LOOPFILTER sse2, v, 5, 13 |
1839 %ifdef m8 | |
1840 INNER_LOOPFILTER sse2, h, 5, 15 | |
1841 %else | |
1820 INNER_LOOPFILTER sse2, h, 6, 15 | 1842 INNER_LOOPFILTER sse2, h, 6, 15 |
1843 %endif |