Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12195:e7847fcff0f4 libavcodec
Be more efficient with registers or stack memory. Saves 8/16 bytes of stack
for x86-32, or 2 MM registers on x86-64.
author | rbultje |
---|---|
date | Mon, 19 Jul 2010 21:45:36 +0000 |
parents | 80b142c2e9f7 |
children | 552c7c10bc73 |
comparison
equal
deleted
inserted
replaced
12194:80b142c2e9f7 | 12195:e7847fcff0f4 |
---|---|
1409 and rsp, ~(mmsize-1) ; align stack | 1409 and rsp, ~(mmsize-1) ; align stack |
1410 %ifidn %2, v | 1410 %ifidn %2, v |
1411 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr | 1411 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
1412 ; [3]=hev() result | 1412 ; [3]=hev() result |
1413 %else ; h | 1413 %else ; h |
1414 sub rsp, mmsize * 6 ; extra storage space for transposes | 1414 sub rsp, mmsize * 5 ; extra storage space for transposes |
1415 %endif | 1415 %endif |
1416 | 1416 |
1417 %define flim_E [rsp] | 1417 %define flim_E [rsp] |
1418 %define flim_I [rsp+mmsize] | 1418 %define flim_I [rsp+mmsize] |
1419 %define hev_thr [rsp+mmsize*2] | 1419 %define hev_thr [rsp+mmsize*2] |
1468 movu m6, [dst2_reg+ stride_reg] | 1468 movu m6, [dst2_reg+ stride_reg] |
1469 | 1469 |
1470 ; 8x8 transpose | 1470 ; 8x8 transpose |
1471 TRANSPOSE4x4B 0, 1, 2, 3, 7 | 1471 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
1472 %ifdef m13 | 1472 %ifdef m13 |
1473 SWAP 1, 13 | 1473 SWAP 1, 8 |
1474 %else | 1474 %else |
1475 mova [rsp+mmsize*4], m1 | 1475 mova [rsp+mmsize*4], m1 |
1476 %endif | 1476 %endif |
1477 movu m7, [dst2_reg+ stride_reg*2] | 1477 movu m7, [dst2_reg+ stride_reg*2] |
1478 TRANSPOSE4x4B 4, 5, 6, 7, 1 | 1478 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
1479 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 | 1479 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
1480 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 | 1480 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
1481 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 | 1481 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
1482 %ifdef m13 | 1482 %ifdef m13 |
1483 SWAP 1, 13 | 1483 SWAP 1, 8 |
1484 SWAP 2, 13 | 1484 SWAP 2, 8 |
1485 %else | 1485 %else |
1486 mova m1, [rsp+mmsize*4] | 1486 mova m1, [rsp+mmsize*4] |
1487 mova [rsp+mmsize*4], m2 ; store q0 | 1487 mova [rsp+mmsize*4], m2 ; store q0 |
1488 %endif | 1488 %endif |
1489 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 | 1489 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
1490 %ifdef m14 | 1490 %ifdef m14 |
1491 SWAP 5, 14 | 1491 SWAP 5, 12 |
1492 %else | 1492 %else |
1493 mova [rsp+mmsize*5], m5 ; store p0 | 1493 mova [rsp+mmsize*3], m5 ; store p0 |
1494 %endif | 1494 %endif |
1495 SWAP 1, 4 | 1495 SWAP 1, 4 |
1496 SWAP 2, 4 | 1496 SWAP 2, 4 |
1497 SWAP 6, 3 | 1497 SWAP 6, 3 |
1498 SWAP 5, 3 | 1498 SWAP 5, 3 |
1525 punpcklbw m6, m7 ; G/O | 1525 punpcklbw m6, m7 ; G/O |
1526 | 1526 |
1527 ; 8x16 transpose | 1527 ; 8x16 transpose |
1528 TRANSPOSE4x4B 0, 1, 2, 3, 7 | 1528 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
1529 %ifdef m13 | 1529 %ifdef m13 |
1530 SWAP 1, 13 | 1530 SWAP 1, 8 |
1531 %else | 1531 %else |
1532 mova [rsp+mmsize*4], m1 | 1532 mova [rsp+mmsize*4], m1 |
1533 %endif | 1533 %endif |
1534 movh m7, [dst2_reg+ stride_reg*2] | 1534 movh m7, [dst2_reg+ stride_reg*2] |
1535 movh m1, [dst8_reg+ stride_reg*2] | 1535 movh m1, [dst8_reg+ stride_reg*2] |
1537 TRANSPOSE4x4B 4, 5, 6, 7, 1 | 1537 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
1538 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 | 1538 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
1539 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 | 1539 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
1540 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 | 1540 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
1541 %ifdef m13 | 1541 %ifdef m13 |
1542 SWAP 1, 13 | 1542 SWAP 1, 8 |
1543 SWAP 2, 13 | 1543 SWAP 2, 8 |
1544 %else | 1544 %else |
1545 mova m1, [rsp+mmsize*4] | 1545 mova m1, [rsp+mmsize*4] |
1546 mova [rsp+mmsize*4], m2 ; store q0 | 1546 mova [rsp+mmsize*4], m2 ; store q0 |
1547 %endif | 1547 %endif |
1548 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 | 1548 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
1549 %ifdef m14 | 1549 %ifdef m14 |
1550 SWAP 5, 14 | 1550 SWAP 5, 12 |
1551 %else | 1551 %else |
1552 mova [rsp+mmsize*5], m5 ; store p0 | 1552 mova [rsp+mmsize*3], m5 ; store p0 |
1553 %endif | 1553 %endif |
1554 SWAP 1, 4 | 1554 SWAP 1, 4 |
1555 SWAP 2, 4 | 1555 SWAP 2, 4 |
1556 SWAP 6, 3 | 1556 SWAP 6, 3 |
1557 SWAP 5, 3 | 1557 SWAP 5, 3 |
1609 ; normal_limit and high_edge_variance for p1-p0, q1-q0 | 1609 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
1610 SWAP 7, 3 ; now m7 is zero | 1610 SWAP 7, 3 ; now m7 is zero |
1611 %ifidn %2, v | 1611 %ifidn %2, v |
1612 mova m3, [dst_reg +mstride_reg] ; p0 | 1612 mova m3, [dst_reg +mstride_reg] ; p0 |
1613 %elifdef m14 | 1613 %elifdef m14 |
1614 SWAP 3, 14 | 1614 SWAP 3, 12 |
1615 %else | 1615 %else |
1616 mova m3, [rsp+mmsize*5] | 1616 mova m3, [rsp+mmsize*3] |
1617 %endif | 1617 %endif |
1618 | 1618 |
1619 mova m1, m2 | 1619 mova m1, m2 |
1620 SWAP 1, 2 | 1620 SWAP 1, 2 |
1621 mova m6, m3 | 1621 mova m6, m3 |
1642 | 1642 |
1643 SWAP 6, 4 ; now m6 is I | 1643 SWAP 6, 4 ; now m6 is I |
1644 %ifidn %2, v | 1644 %ifidn %2, v |
1645 mova m4, [dst_reg] ; q0 | 1645 mova m4, [dst_reg] ; q0 |
1646 %elifdef m13 | 1646 %elifdef m13 |
1647 SWAP 4, 13 | 1647 SWAP 4, 8 |
1648 %else | 1648 %else |
1649 mova m4, [rsp+mmsize*4] | 1649 mova m4, [rsp+mmsize*4] |
1650 %endif | 1650 %endif |
1651 mova m1, m4 | 1651 mova m1, m4 |
1652 SWAP 1, 4 | 1652 SWAP 1, 4 |
1834 INNER_LOOPFILTER mmxext, v, 6, 8 | 1834 INNER_LOOPFILTER mmxext, v, 6, 8 |
1835 INNER_LOOPFILTER mmxext, h, 6, 8 | 1835 INNER_LOOPFILTER mmxext, h, 6, 8 |
1836 INIT_XMM | 1836 INIT_XMM |
1837 INNER_LOOPFILTER sse2, v, 5, 13 | 1837 INNER_LOOPFILTER sse2, v, 5, 13 |
1838 %ifdef m8 | 1838 %ifdef m8 |
1839 INNER_LOOPFILTER sse2, h, 5, 15 | 1839 INNER_LOOPFILTER sse2, h, 5, 13 |
1840 %else | 1840 %else |
1841 INNER_LOOPFILTER sse2, h, 6, 15 | 1841 INNER_LOOPFILTER sse2, h, 6, 13 |
1842 %endif | 1842 %endif |