comparison x86/vp8dsp.asm @ 12195:e7847fcff0f4 libavcodec

Be more efficient with registers or stack memory. Saves 8/16 bytes stack for x86-32, or 2 MM registers on x86-64.
author rbultje
date Mon, 19 Jul 2010 21:45:36 +0000
parents 80b142c2e9f7
children 552c7c10bc73
comparison	12194:80b142c2e9f7 -> 12195:e7847fcff0f4
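The change is two halves of one spill strategy for the transpose scratch values in the horizontal filter. On x86-64 they are parked in spare XMM registers, and the patch renumbers those from m13/m14 down to m8/m12, so the highest register the function touches drops by two (visible in the last hunk, where the register count passed to INNER_LOOPFILTER falls from 15 to 13). On x86-32 the same values are spilled to stack slots past the flim_E/flim_I/hev_thr area; moving p0 from [rsp+mmsize*5] into [rsp+mmsize*3] lets the h path allocate mmsize*5 instead of mmsize*6 bytes of scratch, the 8/16-byte saving from the commit message. A minimal sketch of the pattern, not the patched code itself, assuming the x86inc.asm conventions this file already uses (mN aliases, SWAP, mmsize) and using the same %ifdef m8 test the file uses further down as its x86-64 check:

; Illustrative sketch only -- the register indices 8/12 and the stack offsets
; mmsize*3/mmsize*4 are the ones visible in the patch, the rest is filler.
%ifdef m8                        ; m8..m15 exist, i.e. x86-64
    SWAP             1, 8        ; park a transpose intermediate in m8
    SWAP             5, 12       ; keep p0 live in m12 instead of memory
%else                            ; x86-32: only m0..m7, spill to the stack
    mova [rsp+mmsize*4], m1      ; scratch slot, later reused for q0
    mova [rsp+mmsize*3], m5      ; p0 slot (was [rsp+mmsize*5] before)
%endif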
--- x86/vp8dsp.asm	12194:80b142c2e9f7
+++ x86/vp8dsp.asm	12195:e7847fcff0f4
@@ -1409,11 +1409,11 @@
     and          rsp, ~(mmsize-1)  ; align stack
 %ifidn %2, v
     sub          rsp, mmsize * 4   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                    ;               [3]=hev() result
 %else ; h
-    sub          rsp, mmsize * 6   ; extra storage space for transposes
+    sub          rsp, mmsize * 5   ; extra storage space for transposes
 %endif
 
 %define flim_E   [rsp]
 %define flim_I   [rsp+mmsize]
 %define hev_thr  [rsp+mmsize*2]
@@ -1468,31 +1468,31 @@
     movu         m6, [dst2_reg+ stride_reg]
 
     ; 8x8 transpose
     TRANSPOSE4x4B 0, 1, 2, 3, 7
 %ifdef m13
-    SWAP          1, 13
+    SWAP          1, 8
 %else
     mova [rsp+mmsize*4], m1
 %endif
     movu         m7, [dst2_reg+ stride_reg*2]
     TRANSPOSE4x4B 4, 5, 6, 7, 1
     SBUTTERFLY   dq, 0, 4, 1       ; p3/p2
     SBUTTERFLY   dq, 2, 6, 1       ; q0/q1
     SBUTTERFLY   dq, 3, 7, 1       ; q2/q3
 %ifdef m13
-    SWAP          1, 13
-    SWAP          2, 13
+    SWAP          1, 8
+    SWAP          2, 8
 %else
     mova         m1, [rsp+mmsize*4]
     mova [rsp+mmsize*4], m2        ; store q0
 %endif
     SBUTTERFLY   dq, 1, 5, 2       ; p1/p0
 %ifdef m14
-    SWAP          5, 14
+    SWAP          5, 12
 %else
-    mova [rsp+mmsize*5], m5        ; store p0
+    mova [rsp+mmsize*3], m5        ; store p0
 %endif
     SWAP          1, 4
     SWAP          2, 4
     SWAP          6, 3
     SWAP          5, 3
@@ -1525,11 +1525,11 @@
     punpcklbw    m6, m7            ; G/O
 
     ; 8x16 transpose
     TRANSPOSE4x4B 0, 1, 2, 3, 7
 %ifdef m13
-    SWAP          1, 13
+    SWAP          1, 8
 %else
     mova [rsp+mmsize*4], m1
 %endif
     movh         m7, [dst2_reg+ stride_reg*2]
     movh         m1, [dst8_reg+ stride_reg*2]
@@ -1537,21 +1537,21 @@
     TRANSPOSE4x4B 4, 5, 6, 7, 1
     SBUTTERFLY   dq, 0, 4, 1       ; p3/p2
     SBUTTERFLY   dq, 2, 6, 1       ; q0/q1
     SBUTTERFLY   dq, 3, 7, 1       ; q2/q3
 %ifdef m13
-    SWAP          1, 13
-    SWAP          2, 13
+    SWAP          1, 8
+    SWAP          2, 8
 %else
     mova         m1, [rsp+mmsize*4]
     mova [rsp+mmsize*4], m2        ; store q0
 %endif
     SBUTTERFLY   dq, 1, 5, 2       ; p1/p0
 %ifdef m14
-    SWAP          5, 14
+    SWAP          5, 12
 %else
-    mova [rsp+mmsize*5], m5        ; store p0
+    mova [rsp+mmsize*3], m5        ; store p0
 %endif
     SWAP          1, 4
     SWAP          2, 4
     SWAP          6, 3
     SWAP          5, 3
@@ -1609,13 +1609,13 @@
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP          7, 3             ; now m7 is zero
 %ifidn %2, v
     mova         m3, [dst_reg +mstride_reg] ; p0
 %elifdef m14
-    SWAP          3, 14
+    SWAP          3, 12
 %else
-    mova         m3, [rsp+mmsize*5]
+    mova         m3, [rsp+mmsize*3]
 %endif
 
     mova         m1, m2
     SWAP          1, 2
     mova         m6, m3
@@ -1642,11 +1642,11 @@
 
     SWAP          6, 4             ; now m6 is I
 %ifidn %2, v
     mova         m4, [dst_reg]     ; q0
 %elifdef m13
-    SWAP          4, 13
+    SWAP          4, 8
 %else
     mova         m4, [rsp+mmsize*4]
 %endif
     mova         m1, m4
     SWAP          1, 4
@@ -1834,9 +1834,9 @@
 INNER_LOOPFILTER mmxext, v, 6, 8
 INNER_LOOPFILTER mmxext, h, 6, 8
 INIT_XMM
 INNER_LOOPFILTER sse2, v, 5, 13
 %ifdef m8
-INNER_LOOPFILTER sse2, h, 5, 15
+INNER_LOOPFILTER sse2, h, 5, 13
 %else
-INNER_LOOPFILTER sse2, h, 6, 15
+INNER_LOOPFILTER sse2, h, 6, 13
 %endif
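On that last hunk: the final INNER_LOOPFILTER argument is evidently the number of SIMD registers the generated function declares (the mmxext variants pass 8, i.e. m0..m7), and renumbering the spill registers from m13/m14 to m8/m12 is what allows the sse2 h variants to drop from 15 to 13. Assuming that count is forwarded to x86inc's cglobal, which this excerpt does not show, the practical win on Win64 is fewer callee-saved XMM registers spilled in the prologue. A hypothetical declaration, only to show where such a count ends up:

; Hypothetical example, not a declaration taken from vp8dsp.asm: with
; x86inc.asm, the third numeric cglobal argument is the XMM register count,
; which on Win64 decides how many of the callee-saved xmm6..xmm15 get
; saved/restored around the function body.
cglobal some_loop_filter, 5, 6, 13   ; 5 args, 6 GPRs, 13 XMM regs (m0..m12)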