comparison x86/vp8dsp.asm @ 12174:57038190cc5f libavcodec

Give the x86 r%d registers names; this will simplify the implementation of the chroma inner loopfilter, and it also allows us to save one register on x86-64/sse2.
author rbultje
date Fri, 16 Jul 2010 19:38:10 +0000
parents c47ddb7df424
children eda3a4e59ec5
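The renaming is purely textual: cglobal (from x86inc.asm) hands the function its arguments in the numbered GPRs r0-r5, and each %define in the hunks below simply attaches a readable name to one of them; once an argument such as flimE or hev_thresh has been splatted into an mm register, its GPR can be re-aliased under a new name and reused. A minimal sketch of the idiom, not part of this changeset (example_%1, its 3-argument signature and the reuse of E_reg's slot as a loop counter are illustrative only; SPLATB_REG is used here as elsewhere in this file):

%macro EXAMPLE_FN 1                ; %1 = cpu variant, as in INNER_LOOPFILTER
cglobal example_%1, 3, 3           ; hypothetical: 3 arguments, 3 GPRs
%define dst_reg    r0              ; arg 0: destination pointer
%define stride_reg r1              ; arg 1: stride
%define E_reg      r2              ; arg 2: threshold, read exactly once
    SPLATB_REG m0, E_reg, %1       ; threshold now lives in an mm register...
%define cnt_reg    E_reg           ; ...so its GPR can carry a new name/role
    mov        cnt_reg, 2
.next8px
    ; filter one 8px block at [dst_reg], rows addressed via stride_reg
    add        dst_reg, 8          ; step to the second 8px half
    dec        cnt_reg
    jg         .next8px
    RET
%endmacro

INIT_MMX
EXAMPLE_FN mmx

The same re-aliasing is what saves the register on x86-64/sse2: there dst8_reg is bound to r4, the register that held hev_thr (new line 1390), so the sse2 horizontal filter can be declared with 5 GPRs instead of 6 (see the %ifdef m8 instantiation at new lines 1839-1841).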
comparison of 12173:c47ddb7df424 (parent) with 12174:57038190cc5f
old line numbers on the left, new on the right; '-' marks deleted lines, '+' marks inserted lines
1378 1378   ; int flimE, int flimI, int hev_thr);
1379 1379   ;-----------------------------------------------------------------------------
1380 1380
1381 1381   %macro INNER_LOOPFILTER 4
1382 1382   cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
     1383 + %define dst_reg r0
     1384 + %define mstride_reg r1
     1385 + %define stride_reg r2
     1386 + %define E_reg r2
     1387 + %define I_reg r3
     1388 + %define hev_thr_reg r4
     1389 + %ifdef m8 ; x86-64, sse2
     1390 + %define dst8_reg r4
     1391 + %elif mmsize == 16 ; x86-32, sse2
     1392 + %define dst8_reg r5
     1393 + %else ; x86-32, mmx/mmxext
     1394 + %define cnt_reg r5
     1395 + %endif
     1396 + %define stride_reg E_reg
     1397 + %define dst2_reg I_reg
     1398 + %ifndef m8
     1399 + %define stack_reg hev_thr_reg
     1400 + %endif
     1401 +
1383 1402   %ifndef m8 ; mmx/mmxext or sse2 on x86-32
1384 1403   ; splat function arguments
1385      - SPLATB_REG m0, r2, %1 ; E
     1404 + SPLATB_REG m0, E_reg, %1 ; E
1386      - SPLATB_REG m1, r3, %1 ; I
     1405 + SPLATB_REG m1, I_reg, %1 ; I
1387      - SPLATB_REG m2, r4, %1 ; hev_thresh
     1406 + SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh
1388 1407
1389 1408   ; align stack
1390      - mov r4, rsp ; backup stack pointer
     1409 + mov stack_reg, rsp ; backup stack pointer
1391 1410   and rsp, ~(mmsize-1) ; align stack
1392 1411   %ifidn %2, v
1393 1412   sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
1394 1413   ; [3]=hev() result
1395 1414   %else ; h

1411 1430   %define flim_I m10
1412 1431   %define hev_thr m11
1413 1432   %define mask_res m12
1414 1433
1415 1434   ; splat function arguments
1416      - SPLATB_REG flim_E, r2, %1 ; E
     1435 + SPLATB_REG flim_E, E_reg, %1 ; E
1417      - SPLATB_REG flim_I, r3, %1 ; I
     1436 + SPLATB_REG flim_I, I_reg, %1 ; I
1418      - SPLATB_REG hev_thr, r4, %1 ; hev_thresh
     1437 + SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh
1419 1438   %endif
1420 1439
1421 1440   %if mmsize == 8 ; mmx/mmxext
1422      - mov r5, 2
     1441 + mov cnt_reg, 2
1423 1442   %endif
1424      - mov r2, r1
     1443 + mov stride_reg, mstride_reg
1425      - neg r1
     1444 + neg mstride_reg
1426 1445   %ifidn %2, h
1427      - lea r0, [r0+4*r2-4]
     1446 + lea dst_reg, [dst_reg + stride_reg*4-4]
1428 1447   %endif
1429 1448
1430 1449   %if mmsize == 8
1431 1450   .next8px
1432 1451   %endif
1433 1452   ; read
1434      - lea r3, [r0+r2]
     1453 + lea dst2_reg, [dst_reg + stride_reg]
1435 1454   %ifidn %2, v
1436      - mova m0, [r0+r1*4] ; p3
     1455 + mova m0, [dst_reg +mstride_reg*4] ; p3
1437      - mova m1, [r3+r1*4] ; p2
     1456 + mova m1, [dst2_reg+mstride_reg*4] ; p2
1438      - mova m2, [r0+r1*2] ; p1
     1457 + mova m2, [dst_reg +mstride_reg*2] ; p1
1439      - mova m5, [r3] ; q1
     1458 + mova m5, [dst2_reg] ; q1
1440      - mova m6, [r3+r2] ; q2
     1459 + mova m6, [dst2_reg+ stride_reg] ; q2
1441      - mova m7, [r3+r2*2] ; q3
     1460 + mova m7, [dst2_reg+ stride_reg*2] ; q3
1442 1461   %elif mmsize == 8 ; mmx/mmxext (h)
1443 1462   ; read 8 rows of 8px each
1444      - movu m0, [r0+r1*4]
     1463 + movu m0, [dst_reg +mstride_reg*4]
1445      - movu m1, [r3+r1*4]
     1464 + movu m1, [dst2_reg+mstride_reg*4]
1446      - movu m2, [r0+r1*2]
     1465 + movu m2, [dst_reg +mstride_reg*2]
1447      - movu m3, [r0+r1]
     1466 + movu m3, [dst_reg +mstride_reg]
1448      - movu m4, [r0]
     1467 + movu m4, [dst_reg]
1449      - movu m5, [r3]
     1468 + movu m5, [dst2_reg]
1450      - movu m6, [r3+r2]
     1469 + movu m6, [dst2_reg+ stride_reg]
1451 1470
1452 1471   ; 8x8 transpose
1453 1472   TRANSPOSE4x4B 0, 1, 2, 3, 7
1454 1473   %ifdef m13
1455 1474   SWAP 1, 13
1456 1475   %else
1457 1476   mova [rsp+mmsize*4], m1
1458 1477   %endif
1459      - movu m7, [r3+r2*2]
     1478 + movu m7, [dst2_reg+ stride_reg*2]
1460 1479   TRANSPOSE4x4B 4, 5, 6, 7, 1
1461 1480   SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1462 1481   SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1463 1482   SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1464 1483   %ifdef m13

1477 1496   SWAP 1, 4
1478 1497   SWAP 2, 4
1479 1498   SWAP 6, 3
1480 1499   SWAP 5, 3
1481 1500   %else ; sse2 (h)
1482      - lea r5, [r0+r2*8]
     1501 + lea dst8_reg, [dst_reg + stride_reg*8]
1483 1502
1484 1503   ; read 16 rows of 8px each, interleave
1485      - movh m0, [r0+r1*4]
     1504 + movh m0, [dst_reg +mstride_reg*4]
1486      - movh m1, [r5+r1*4]
     1505 + movh m1, [dst8_reg+mstride_reg*4]
1487      - movh m2, [r0+r1*2]
     1506 + movh m2, [dst_reg +mstride_reg*2]
1488      - movh m5, [r5+r1*2]
     1507 + movh m5, [dst8_reg+mstride_reg*2]
1489      - movh m3, [r0+r1]
     1508 + movh m3, [dst_reg +mstride_reg]
1490      - movh m6, [r5+r1]
     1509 + movh m6, [dst8_reg+mstride_reg]
1491      - movh m4, [r0]
     1510 + movh m4, [dst_reg]
1492      - movh m7, [r5]
     1511 + movh m7, [dst8_reg]
1493 1512   punpcklbw m0, m1 ; A/I
1494 1513   punpcklbw m2, m5 ; C/K
1495 1514   punpcklbw m3, m6 ; D/L
1496 1515   punpcklbw m4, m7 ; E/M
1497 1516
1498      - add r5, r2
     1517 + add dst8_reg, stride_reg
1499      - movh m1, [r3+r1*4]
     1518 + movh m1, [dst2_reg+mstride_reg*4]
1500      - movh m6, [r5+r1*4]
     1519 + movh m6, [dst8_reg+mstride_reg*4]
1501      - movh m5, [r3]
     1520 + movh m5, [dst2_reg]
1502      - movh m7, [r5]
     1521 + movh m7, [dst8_reg]
1503 1522   punpcklbw m1, m6 ; B/J
1504 1523   punpcklbw m5, m7 ; F/N
1505      - movh m6, [r3+r2]
     1524 + movh m6, [dst2_reg+ stride_reg]
1506      - movh m7, [r5+r2]
     1525 + movh m7, [dst8_reg+ stride_reg]
1507 1526   punpcklbw m6, m7 ; G/O
1508 1527
1509 1528   ; 8x16 transpose
1510 1529   TRANSPOSE4x4B 0, 1, 2, 3, 7
1511 1530   %ifdef m13
1512 1531   SWAP 1, 13
1513 1532   %else
1514 1533   mova [rsp+mmsize*4], m1
1515 1534   %endif
1516      - movh m7, [r3+r2*2]
     1535 + movh m7, [dst2_reg+ stride_reg*2]
1517      - movh m1, [r5+r2*2]
     1536 + movh m1, [dst8_reg+ stride_reg*2]
1518 1537   punpcklbw m7, m1 ; H/P
1519 1538   TRANSPOSE4x4B 4, 5, 6, 7, 1
1520 1539   SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1521 1540   SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1522 1541   SBUTTERFLY dq, 3, 7, 1 ; q2/q3

1589 1608   %endif
1590 1609
1591 1610   ; normal_limit and high_edge_variance for p1-p0, q1-q0
1592 1611   SWAP 7, 3 ; now m7 is zero
1593 1612   %ifidn %2, v
1594      - mova m3, [r0+r1] ; p0
     1613 + mova m3, [dst_reg +mstride_reg] ; p0
1595 1614   %elifdef m14
1596 1615   SWAP 3, 14
1597 1616   %else
1598 1617   mova m3, [rsp+mmsize*5]
1599 1618   %endif

1622 1641   SWAP 1, 4 ; max_hev_thresh
1623 1642   %endif
1624 1643
1625 1644   SWAP 6, 4 ; now m6 is I
1626 1645   %ifidn %2, v
1627      - mova m4, [r0] ; q0
     1646 + mova m4, [dst_reg] ; q0
1628 1647   %elifdef m13
1629 1648   SWAP 4, 13
1630 1649   %else
1631 1650   mova m4, [rsp+mmsize*4]
1632 1651   %endif

1773 1792   paddusb m5, m1 ; q1-a
1774 1793   paddusb m2, m0 ; p1+a
1775 1794
1776 1795   ; store
1777 1796   %ifidn %2, v
1778      - mova [r0+r1*2], m2
     1797 + mova [dst_reg+mstride_reg*2], m2
1779      - mova [r0+r1], m3
     1798 + mova [dst_reg+mstride_reg ], m3
1780      - mova [r0], m4
     1799 + mova [dst_reg], m4
1781      - mova [r0+r2], m5
     1800 + mova [dst_reg+ stride_reg ], m5
1782 1801   %else ; h
1783      - add r0, 2
     1802 + add dst_reg, 2
1784      - add r3, 2
     1803 + add dst2_reg, 2
1785 1804
1786 1805   ; 4x8/16 transpose
1787 1806   TRANSPOSE4x4B 2, 3, 4, 5, 6
1788 1807
1789 1808   %if mmsize == 8 ; mmx/mmxext (h)
1790      - WRITE_4x2D 2, 3, 4, 5, r0, r3, r1, r2
     1809 + WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
1791 1810   %else ; sse2 (h)
1792 1811   lea r5, [r5+r1+2]
1793      - WRITE_4x4D 2, 3, 4, 5, r0, r3, r5, r1, r2
     1812 + WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg
1794 1813   %endif
1795 1814   %endif
1796 1815
1797 1816   %if mmsize == 8
1798 1817   %ifidn %2, h
1799      - lea r0, [r0+8*r2-2]
     1818 + lea dst_reg, [dst_reg + stride_reg*8-2]
1800 1819   %else ; v
1801      - add r0, 8
     1820 + add dst_reg, 8
1802 1821   %endif
1803      - dec r5
     1822 + dec cnt_reg
1804 1823   jg .next8px
1805 1824   %endif
1806 1825
1807 1826   %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
1808      - mov rsp, r4 ; restore stack pointer
     1827 + mov rsp, stack_reg ; restore stack pointer
1809 1828   %endif
1810 1829   RET
1811 1830   %endmacro
1812 1831
1813 1832   INIT_MMX
1815 1834   INNER_LOOPFILTER mmx, h, 6, 8
1816 1835   INNER_LOOPFILTER mmxext, v, 6, 8
1817 1836   INNER_LOOPFILTER mmxext, h, 6, 8
1818 1837   INIT_XMM
1819 1838   INNER_LOOPFILTER sse2, v, 5, 13
     1839 + %ifdef m8
     1840 + INNER_LOOPFILTER sse2, h, 5, 15
     1841 + %else
1820 1842   INNER_LOOPFILTER sse2, h, 6, 15
     1843 + %endif