comparison x86/vp8dsp.asm @ 12334:435319d67bd8 libavcodec
Use word-writing instead of dword-writing (with two cached but otherwise
unchanged bytes) in the horizontal simple loopfilter. This makes the filter
quite a bit faster in itself (~30 cycles less on Core1), probably mostly
because we don't need a complex 4x4 transpose, but only a simple byte
interleave. Also allows using pextrw on SSE4, which speeds things up even more
(e.g. 25% faster on Core i7).
author | rbultje |
date | Sat, 31 Jul 2010 23:13:15 +0000 |
parents | 7fb91885433c |
children | 2d15f62f4f8a |
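To make the change concrete, here is a minimal C model of the new horizontal store path. This is a sketch only: `write_edge_words` and its pointer layout are hypothetical illustrations, not FFmpeg code; the committed implementation is the WRITE_2x4W/WRITE_8W macros in the diff below.

```c
#include <stdint.h>
#include <stddef.h>

/* The horizontal simple loopfilter modifies only the two pixels straddling
 * the edge (p0 and q0) in each row. Interleaving the filtered p0 and q0
 * vectors bytewise (punpcklbw / SBUTTERFLY in the asm) yields one p0/q0
 * word per row, so a single 16-bit store per row suffices. The old code
 * instead transposed a full 4x4 block and wrote dwords, carrying two
 * cached but otherwise unchanged bytes in every store. */
static void write_edge_words(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t p0[8], const uint8_t q0[8])
{
    for (int i = 0; i < 8; i++) {
        uint8_t *row = dst + i * stride - 1; /* dst points at q0's column */
        row[0] = p0[i];  /* these two byte writes model one word-sized    */
        row[1] = q0[i];  /* mov [mem], reg16 (or pextrw [mem], xmm, imm)  */
    }
}
```

The SSE4 speedup comes from pextrw's memory-destination form, which is new in SSE4.1: the SSE2 form can only extract into a general-purpose register, forcing the movd/shr/mov round-trips visible in WRITE_8W_SSE2.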
12333:97219f0fa018 | 12334:435319d67bd8 |
---|---|
1352 movd [%6+%9*2], m%2 | 1352 movd [%6+%9*2], m%2 |
1353 movd [%7+%8*2], m%3 | 1353 movd [%7+%8*2], m%3 |
1354 movd [%7+%9*2], m%4 | 1354 movd [%7+%9*2], m%4 |
1355 %endmacro | 1355 %endmacro |
1356 | 1356 |
| 1357 ; write 4 or 8 words in the mmx/xmm registers as 8 lines |
| 1358 ; 1 and 2 are the registers to write, this can be the same (for SSE2) |
| 1359 ; for pre-SSE4: |
| 1360 ; 3 is a general-purpose register that we will clobber |
| 1361 ; for SSE4: |
| 1362 ; 3 is a pointer to the destination's 5th line |
| 1363 ; 4 is a pointer to the destination's 4th line |
| 1364 ; 5/6 is -stride and +stride |
| 1365 %macro WRITE_2x4W 6 |
| 1366 movd %3, %1 |
| 1367 punpckhdq %1, %1 |
| 1368 mov [%4+%5*4], %3w |
| 1369 shr %3, 16 |
| 1370 add %4, %6 |
| 1371 mov [%4+%5*4], %3w |
| 1372 |
| 1373 movd %3, %1 |
| 1374 add %4, %5 |
| 1375 mov [%4+%5*2], %3w |
| 1376 shr %3, 16 |
| 1377 mov [%4+%5 ], %3w |
| 1378 |
| 1379 movd %3, %2 |
| 1380 punpckhdq %2, %2 |
| 1381 mov [%4 ], %3w |
| 1382 shr %3, 16 |
| 1383 mov [%4+%6 ], %3w |
| 1384 |
| 1385 movd %3, %2 |
| 1386 add %4, %6 |
| 1387 mov [%4+%6 ], %3w |
| 1388 shr %3, 16 |
| 1389 mov [%4+%6*2], %3w |
| 1390 add %4, %5 |
| 1391 %endmacro |
| 1392 |
| 1393 %macro WRITE_8W_SSE2 5 |
| 1394 movd %2, %1 |
| 1395 psrldq %1, 4 |
| 1396 mov [%3+%4*4], %2w |
| 1397 shr %2, 16 |
| 1398 add %3, %5 |
| 1399 mov [%3+%4*4], %2w |
| 1400 |
| 1401 movd %2, %1 |
| 1402 psrldq %1, 4 |
| 1403 add %3, %4 |
| 1404 mov [%3+%4*2], %2w |
| 1405 shr %2, 16 |
| 1406 mov [%3+%4 ], %2w |
| 1407 |
| 1408 movd %2, %1 |
| 1409 psrldq %1, 4 |
| 1410 mov [%3 ], %2w |
| 1411 shr %2, 16 |
| 1412 mov [%3+%5 ], %2w |
| 1413 |
| 1414 movd %2, %1 |
| 1415 add %3, %5 |
| 1416 mov [%3+%5 ], %2w |
| 1417 shr %2, 16 |
| 1418 mov [%3+%5*2], %2w |
| 1419 %endmacro |
| 1420 |
| 1421 %macro WRITE_8W_SSE4 5 |
| 1422 pextrw [%3+%4*4], %1, 0 |
| 1423 pextrw [%2+%4*4], %1, 1 |
| 1424 pextrw [%3+%4*2], %1, 2 |
| 1425 pextrw [%3+%4 ], %1, 3 |
| 1426 pextrw [%3 ], %1, 4 |
| 1427 pextrw [%2 ], %1, 5 |
| 1428 pextrw [%2+%5 ], %1, 6 |
| 1429 pextrw [%2+%5*2], %1, 7 |
| 1430 %endmacro |
| 1431 |
1357 %macro SPLATB_REG_MMX 2-3 | 1432 %macro SPLATB_REG_MMX 2-3 |
1358 movd %1, %2 | 1433 movd %1, %2 |
1359 punpcklbw %1, %1 | 1434 punpcklbw %1, %1 |
1360 punpcklwd %1, %1 | 1435 punpcklwd %1, %1 |
1361 punpckldq %1, %1 | 1436 punpckldq %1, %1 |
1379 pshufb %1, %3 | 1454 pshufb %1, %3 |
1380 %endmacro | 1455 %endmacro |
1381 | 1456 |
1382 %macro SIMPLE_LOOPFILTER 3 | 1457 %macro SIMPLE_LOOPFILTER 3 |
1383 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 | 1458 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 |
1384 %ifidn %2, h | |
1385 mov r5, rsp ; backup stack pointer | |
1386 and rsp, ~(mmsize-1) ; align stack | |
1387 %endif | |
1388 %if mmsize == 8 ; mmx/mmxext | 1459 %if mmsize == 8 ; mmx/mmxext |
1389 mov r3, 2 | 1460 mov r3, 2 |
1390 %endif | 1461 %endif |
1391 %ifnidn %1, sse2 | 1462 %ifnidn %1, sse2 |
1392 %if mmsize == 16 | 1463 %if mmsize == 16 |
1398 ; set up indexes to address 4 rows | 1469 ; set up indexes to address 4 rows |
1399 mov r2, r1 | 1470 mov r2, r1 |
1400 neg r1 | 1471 neg r1 |
1401 %ifidn %2, h | 1472 %ifidn %2, h |
1402 lea r0, [r0+4*r2-2] | 1473 lea r0, [r0+4*r2-2] |
1403 sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1 | |
1404 %endif | 1474 %endif |
1405 | 1475 |
1406 %if mmsize == 8 ; mmx / mmxext | 1476 %if mmsize == 8 ; mmx / mmxext |
1407 .next8px | 1477 .next8px |
1408 %endif | 1478 %endif |
1419 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 | 1489 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 |
1420 %else ; sse2 | 1490 %else ; sse2 |
1421 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 | 1491 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 |
1422 %endif | 1492 %endif |
1423 TRANSPOSE4x4W 0, 1, 2, 3, 4 | 1493 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
1424 | |
1425 mova [rsp], m0 ; store p1 | |
1426 mova [rsp+mmsize], m3 ; store q1 | |
1427 %endif | 1494 %endif |
1428 | 1495 |
1429 ; simple_limit | 1496 ; simple_limit |
1430 mova m5, m2 ; m5=backup of q0 | 1497 mova m5, m2 ; m5=backup of q0 |
1431 mova m6, m1 ; m6=backup of p0 | 1498 mova m6, m1 ; m6=backup of p0 |
1492 ; store | 1559 ; store |
1493 %ifidn %2, v | 1560 %ifidn %2, v |
1494 mova [r0], m4 | 1561 mova [r0], m4 |
1495 mova [r0+r1], m6 | 1562 mova [r0+r1], m6 |
1496 %else ; h | 1563 %else ; h |
1497 mova m0, [rsp] ; p1 | 1564 inc r0 |
1498 SWAP 2, 4 ; p0 | 1565 SBUTTERFLY bw, 6, 4, 0 |
1499 SWAP 1, 6 ; q0 | 1566 |
1500 mova m3, [rsp+mmsize] ; q1 | |
1501 | |
1502 TRANSPOSE4x4B 0, 1, 2, 3, 4 | |
1503 %if mmsize == 16 ; sse2 | 1567 %if mmsize == 16 ; sse2 |
1504 add r3, r1 ; change from r4*8*stride to r0+8*stride | 1568 %ifidn %1, sse4 |
1505 WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16 | 1569 inc r4 |
| 1570 %endif |
| 1571 WRITE_8W m6, r4, r0, r1, r2 |
| 1572 lea r4, [r3+r1+1] |
| 1573 %ifidn %1, sse4 |
| 1574 inc r3 |
| 1575 %endif |
| 1576 WRITE_8W m4, r3, r4, r1, r2 |
1506 %else ; mmx/mmxext | 1577 %else ; mmx/mmxext |
1507 WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 | 1578 WRITE_2x4W m6, m4, r4, r0, r1, r2 |
1508 %endif | 1579 %endif |
1509 %endif | 1580 %endif |
1510 | 1581 |
1511 %if mmsize == 8 ; mmx/mmxext | 1582 %if mmsize == 8 ; mmx/mmxext |
1512 ; next 8 pixels | 1583 ; next 8 pixels |
1513 %ifidn %2, v | 1584 %ifidn %2, v |
1514 add r0, 8 ; advance 8 cols = pixels | 1585 add r0, 8 ; advance 8 cols = pixels |
1515 %else ; h | 1586 %else ; h |
1516 lea r0, [r0+r2*8] ; advance 8 rows = lines | 1587 lea r0, [r0+r2*8-1] ; advance 8 rows = lines |
1517 %endif | 1588 %endif |
1518 dec r3 | 1589 dec r3 |
1519 jg .next8px | 1590 jg .next8px |
1520 %ifidn %2, v | |
1521 REP_RET | 1591 REP_RET |
1522 %else ; h | |
1523 mov rsp, r5 ; restore stack pointer | |
1524 RET | |
1525 %endif | |
1526 %else ; sse2 | 1592 %else ; sse2 |
1527 %ifidn %2, h | |
1528 mov rsp, r5 ; restore stack pointer | |
1529 %endif | |
1530 RET | 1593 RET |
1531 %endif | 1594 %endif |
1532 %endmacro | 1595 %endmacro |
1533 | 1596 |
1534 INIT_MMX | 1597 INIT_MMX |
1535 %define SPLATB_REG SPLATB_REG_MMX | 1598 %define SPLATB_REG SPLATB_REG_MMX |
1536 SIMPLE_LOOPFILTER mmx, v, 4 | 1599 SIMPLE_LOOPFILTER mmx, v, 4 |
1537 SIMPLE_LOOPFILTER mmx, h, 6 | 1600 SIMPLE_LOOPFILTER mmx, h, 5 |
1538 %define SPLATB_REG SPLATB_REG_MMXEXT | 1601 %define SPLATB_REG SPLATB_REG_MMXEXT |
1539 SIMPLE_LOOPFILTER mmxext, v, 4 | 1602 SIMPLE_LOOPFILTER mmxext, v, 4 |
1540 SIMPLE_LOOPFILTER mmxext, h, 6 | 1603 SIMPLE_LOOPFILTER mmxext, h, 5 |
1541 INIT_XMM | 1604 INIT_XMM |
1542 %define SPLATB_REG SPLATB_REG_SSE2 | 1605 %define SPLATB_REG SPLATB_REG_SSE2 |
| 1606 %define WRITE_8W WRITE_8W_SSE2 |
1543 SIMPLE_LOOPFILTER sse2, v, 3 | 1607 SIMPLE_LOOPFILTER sse2, v, 3 |
1544 SIMPLE_LOOPFILTER sse2, h, 6 | 1608 SIMPLE_LOOPFILTER sse2, h, 5 |
1545 %define SPLATB_REG SPLATB_REG_SSSE3 | 1609 %define SPLATB_REG SPLATB_REG_SSSE3 |
1546 SIMPLE_LOOPFILTER ssse3, v, 3 | 1610 SIMPLE_LOOPFILTER ssse3, v, 3 |
1547 SIMPLE_LOOPFILTER ssse3, h, 6 | 1611 SIMPLE_LOOPFILTER ssse3, h, 5 |
| 1612 %define WRITE_8W WRITE_8W_SSE4 |
| 1613 SIMPLE_LOOPFILTER sse4, h, 5 |
1548 | 1614 |
1549 ;----------------------------------------------------------------------------- | 1615 ;----------------------------------------------------------------------------- |
1550 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, | 1616 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
1551 ; int flimE, int flimI, int hev_thr); | 1617 ; int flimE, int flimI, int hev_thr); |
1552 ;----------------------------------------------------------------------------- | 1618 ;----------------------------------------------------------------------------- |
2073 ;----------------------------------------------------------------------------- | 2139 ;----------------------------------------------------------------------------- |
2074 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, | 2140 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
2075 ; int flimE, int flimI, int hev_thr); | 2141 ; int flimE, int flimI, int hev_thr); |
2076 ;----------------------------------------------------------------------------- | 2142 ;----------------------------------------------------------------------------- |
2077 | 2143 |
2078 ; write 4 or 8 words in the mmx/xmm registers as 8 lines | |
2079 ; 1 and 2 are the registers to write, this can be the same (for SSE2) | |
2080 ; for pre-SSE4: | |
2081 ; 3 is a general-purpose register that we will clobber | |
2082 ; for SSE4: | |
2083 ; 3 is a pointer to the destination's 5th line | |
2084 ; 4 is a pointer to the destination's 4th line | |
2085 ; 5/6 is -stride and +stride | |
2086 %macro WRITE_2x4W 6 | |
2087 movd %3, %1 | |
2088 punpckhdq %1, %1 | |
2089 mov [%4+%5*4], %3w | |
2090 shr %3, 16 | |
2091 add %4, %6 | |
2092 mov [%4+%5*4], %3w | |
2093 | |
2094 movd %3, %1 | |
2095 add %4, %5 | |
2096 mov [%4+%5*2], %3w | |
2097 shr %3, 16 | |
2098 mov [%4+%5 ], %3w | |
2099 | |
2100 movd %3, %2 | |
2101 punpckhdq %2, %2 | |
2102 mov [%4 ], %3w | |
2103 shr %3, 16 | |
2104 mov [%4+%6 ], %3w | |
2105 | |
2106 movd %3, %2 | |
2107 add %4, %6 | |
2108 mov [%4+%6 ], %3w | |
2109 shr %3, 16 | |
2110 mov [%4+%6*2], %3w | |
2111 add %4, %5 | |
2112 %endmacro | |
2113 | |
2114 %macro WRITE_8W_SSE2 5 | |
2115 movd %2, %1 | |
2116 psrldq %1, 4 | |
2117 mov [%3+%4*4], %2w | |
2118 shr %2, 16 | |
2119 add %3, %5 | |
2120 mov [%3+%4*4], %2w | |
2121 | |
2122 movd %2, %1 | |
2123 psrldq %1, 4 | |
2124 add %3, %4 | |
2125 mov [%3+%4*2], %2w | |
2126 shr %2, 16 | |
2127 mov [%3+%4 ], %2w | |
2128 | |
2129 movd %2, %1 | |
2130 psrldq %1, 4 | |
2131 mov [%3 ], %2w | |
2132 shr %2, 16 | |
2133 mov [%3+%5 ], %2w | |
2134 | |
2135 movd %2, %1 | |
2136 add %3, %5 | |
2137 mov [%3+%5 ], %2w | |
2138 shr %2, 16 | |
2139 mov [%3+%5*2], %2w | |
2140 %endmacro | |
2141 | |
2142 %macro WRITE_8W_SSE4 5 | |
2143 pextrw [%3+%4*4], %1, 0 | |
2144 pextrw [%2+%4*4], %1, 1 | |
2145 pextrw [%3+%4*2], %1, 2 | |
2146 pextrw [%3+%4 ], %1, 3 | |
2147 pextrw [%3 ], %1, 4 | |
2148 pextrw [%2 ], %1, 5 | |
2149 pextrw [%2+%5 ], %1, 6 | |
2150 pextrw [%2+%5*2], %1, 7 | |
2151 %endmacro | |
2152 | |
2153 %macro MBEDGE_LOOPFILTER 5 | 2144 %macro MBEDGE_LOOPFILTER 5 |
2154 %if %4 == 8 ; chroma | 2145 %if %4 == 8 ; chroma |
2155 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 | 2146 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 |
2156 %define dst8_reg r1 | 2147 %define dst8_reg r1 |
2157 %define mstride_reg r2 | 2148 %define mstride_reg r2 |