libavcodec: comparison of x86/vp8dsp.asm @ 12334:435319d67bd8

Use word-writing instead of dword-writing (with two cached but otherwise unchanged bytes) in the horizontal simple loopfilter. This makes the filter itself quite a bit faster (~30 cycles less on Core1), probably mostly because we no longer need a complex 4x4 transpose, only a simple byte interleave. It also allows using pextrw on SSE4, which speeds things up even more (e.g. 25% faster on Core i7). A short C sketch of the idea follows the comparison header below.
author rbultje
date Sat, 31 Jul 2010 23:13:15 +0000
parents 7fb91885433c
children 2d15f62f4f8a
comparison: 12333:97219f0fa018 (left column) vs 12334:435319d67bd8 (right column)
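To make the commit message concrete: in the horizontal simple loopfilter only the two pixels straddling the edge (p0 and q0) change, so each output line needs exactly two adjacent bytes. The following plain-C sketch is an illustration only, not part of the patch (the helper name and parameters are made up); it shows why a per-line word store merely needs the p0 and q0 bytes interleaved, whereas per-line dword stores would first require the old 4x4 byte transpose.

/* Hypothetical C model of the new store strategy: write one 16-bit word
 * (the interleaved p0/q0 pair) per line instead of a 4-byte dword.
 * 'edge' points at the q0 column, so each pair lands at edge - 1. */
#include <stdint.h>
#include <string.h>

static void store_edge_words(uint8_t *edge, int stride,
                             const uint8_t p0[8], const uint8_t q0[8])
{
    for (int i = 0; i < 8; i++) {
        uint8_t pair[2] = { p0[i], q0[i] };       /* byte interleave   */
        memcpy(edge + i * stride - 1, pair, 2);   /* one word per line */
    }
}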
1352 movd [%6+%9*2], m%2 1352 movd [%6+%9*2], m%2
1353 movd [%7+%8*2], m%3 1353 movd [%7+%8*2], m%3
1354 movd [%7+%9*2], m%4 1354 movd [%7+%9*2], m%4
1355 %endmacro 1355 %endmacro
1356 1356
1357 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
1358 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
1359 ; for pre-SSE4:
1360 ; 3 is a general-purpose register that we will clobber
1361 ; for SSE4:
1362 ; 3 is a pointer to the destination's 5th line
1363 ; 4 is a pointer to the destination's 4th line
1364 ; 5/6 is -stride and +stride
1365 %macro WRITE_2x4W 6
1366 movd %3, %1
1367 punpckhdq %1, %1
1368 mov [%4+%5*4], %3w
1369 shr %3, 16
1370 add %4, %6
1371 mov [%4+%5*4], %3w
1372
1373 movd %3, %1
1374 add %4, %5
1375 mov [%4+%5*2], %3w
1376 shr %3, 16
1377 mov [%4+%5 ], %3w
1378
1379 movd %3, %2
1380 punpckhdq %2, %2
1381 mov [%4 ], %3w
1382 shr %3, 16
1383 mov [%4+%6 ], %3w
1384
1385 movd %3, %2
1386 add %4, %6
1387 mov [%4+%6 ], %3w
1388 shr %3, 16
1389 mov [%4+%6*2], %3w
1390 add %4, %5
1391 %endmacro
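As a reading aid, here is a rough C model of the store pattern WRITE_2x4W implements; it is an illustration only (the helper name and the uint64_t register model are mine, not the patch's). It assumes the destination pointer addresses the fifth of the eight output lines, so the stores cover dst-4*stride through dst+3*stride, matching the addressing in the macro body.

/* Rough C model of WRITE_2x4W: word i of 'lo' goes to line i-4,
 * word i of 'hi' goes to line i. */
#include <stdint.h>
#include <string.h>

static void write_2x4w_model(uint8_t *dst, int stride,
                             uint64_t lo, uint64_t hi)
{
    for (int i = 0; i < 4; i++) {
        uint16_t w = (uint16_t)(lo >> (16 * i));
        memcpy(dst + (i - 4) * stride, &w, 2);   /* lines -4 .. -1 */
    }
    for (int i = 0; i < 4; i++) {
        uint16_t w = (uint16_t)(hi >> (16 * i));
        memcpy(dst + i * stride, &w, 2);         /* lines 0 .. +3  */
    }
}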
1392
1393 %macro WRITE_8W_SSE2 5
1394 movd %2, %1
1395 psrldq %1, 4
1396 mov [%3+%4*4], %2w
1397 shr %2, 16
1398 add %3, %5
1399 mov [%3+%4*4], %2w
1400
1401 movd %2, %1
1402 psrldq %1, 4
1403 add %3, %4
1404 mov [%3+%4*2], %2w
1405 shr %2, 16
1406 mov [%3+%4 ], %2w
1407
1408 movd %2, %1
1409 psrldq %1, 4
1410 mov [%3 ], %2w
1411 shr %2, 16
1412 mov [%3+%5 ], %2w
1413
1414 movd %2, %1
1415 add %3, %5
1416 mov [%3+%5 ], %2w
1417 shr %2, 16
1418 mov [%3+%5*2], %2w
1419 %endmacro
1420
1421 %macro WRITE_8W_SSE4 5
1422 pextrw [%3+%4*4], %1, 0
1423 pextrw [%2+%4*4], %1, 1
1424 pextrw [%3+%4*2], %1, 2
1425 pextrw [%3+%4 ], %1, 3
1426 pextrw [%3 ], %1, 4
1427 pextrw [%2 ], %1, 5
1428 pextrw [%2+%5 ], %1, 6
1429 pextrw [%2+%5*2], %1, 7
1430 %endmacro
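The SSE4 variant leans on the memory-destination form of pextrw that SSE4.1 introduced: each word is stored straight from the xmm register, with no movd/shr round trip through a general-purpose register. Below is an intrinsics-level sketch of the same idea, illustration only (the helper name and the single base pointer are mine; the macro itself takes two pointers into the middle of the block). _mm_extract_epi16 is the intrinsic behind pextrw, and with SSE4.1 enabled a compiler may fuse each extract+store into a single pextrw to memory.

#include <stdint.h>
#include <string.h>
#include <emmintrin.h>

/* Store the 8 words of 'words' to 8 consecutive lines; 'dst' addresses
 * the first line in this sketch. */
static void write_8w_model(uint8_t *dst, int stride, __m128i words)
{
    uint16_t w;
    w = (uint16_t)_mm_extract_epi16(words, 0); memcpy(dst + 0 * stride, &w, 2);
    w = (uint16_t)_mm_extract_epi16(words, 1); memcpy(dst + 1 * stride, &w, 2);
    w = (uint16_t)_mm_extract_epi16(words, 2); memcpy(dst + 2 * stride, &w, 2);
    w = (uint16_t)_mm_extract_epi16(words, 3); memcpy(dst + 3 * stride, &w, 2);
    w = (uint16_t)_mm_extract_epi16(words, 4); memcpy(dst + 4 * stride, &w, 2);
    w = (uint16_t)_mm_extract_epi16(words, 5); memcpy(dst + 5 * stride, &w, 2);
    w = (uint16_t)_mm_extract_epi16(words, 6); memcpy(dst + 6 * stride, &w, 2);
    w = (uint16_t)_mm_extract_epi16(words, 7); memcpy(dst + 7 * stride, &w, 2);
}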
1431
1357 %macro SPLATB_REG_MMX 2-3 1432 %macro SPLATB_REG_MMX 2-3
1358 movd %1, %2 1433 movd %1, %2
1359 punpcklbw %1, %1 1434 punpcklbw %1, %1
1360 punpcklwd %1, %1 1435 punpcklwd %1, %1
1361 punpckldq %1, %1 1436 punpckldq %1, %1
1379 pshufb %1, %3 1454 pshufb %1, %3
1380 %endmacro 1455 %endmacro
1381 1456
1382 %macro SIMPLE_LOOPFILTER 3 1457 %macro SIMPLE_LOOPFILTER 3
1383 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 1458 cglobal vp8_%2_loop_filter_simple_%1, 3, %3
1384 %ifidn %2, h
1385 mov r5, rsp ; backup stack pointer
1386 and rsp, ~(mmsize-1) ; align stack
1387 %endif
1388 %if mmsize == 8 ; mmx/mmxext 1459 %if mmsize == 8 ; mmx/mmxext
1389 mov r3, 2 1460 mov r3, 2
1390 %endif 1461 %endif
1391 %ifnidn %1, sse2 1462 %ifnidn %1, sse2
1392 %if mmsize == 16 1463 %if mmsize == 16
1398 ; set up indexes to address 4 rows 1469 ; set up indexes to address 4 rows
1399 mov r2, r1 1470 mov r2, r1
1400 neg r1 1471 neg r1
1401 %ifidn %2, h 1472 %ifidn %2, h
1402 lea r0, [r0+4*r2-2] 1473 lea r0, [r0+4*r2-2]
1403 sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1
1404 %endif 1474 %endif
1405 1475
1406 %if mmsize == 8 ; mmx / mmxext 1476 %if mmsize == 8 ; mmx / mmxext
1407 .next8px 1477 .next8px
1408 %endif 1478 %endif
1419 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 1489 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
1420 %else ; sse2 1490 %else ; sse2
1421 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 1491 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
1422 %endif 1492 %endif
1423 TRANSPOSE4x4W 0, 1, 2, 3, 4 1493 TRANSPOSE4x4W 0, 1, 2, 3, 4
1424
1425 mova [rsp], m0 ; store p1
1426 mova [rsp+mmsize], m3 ; store q1
1427 %endif 1494 %endif
1428 1495
1429 ; simple_limit 1496 ; simple_limit
1430 mova m5, m2 ; m5=backup of q0 1497 mova m5, m2 ; m5=backup of q0
1431 mova m6, m1 ; m6=backup of p0 1498 mova m6, m1 ; m6=backup of p0
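For context, the "; simple_limit" comment above marks the VP8 simple-filter edge test; the computation itself falls in lines elided from this hunk. In scalar C it amounts to roughly the following (illustration only; E stands for the filter limit passed in by the caller).

#include <stdlib.h>

/* VP8 simple loopfilter threshold: the edge is filtered only when
 * 2*|p0-q0| + |p1-q1|/2 does not exceed the limit E. */
static int simple_limit(int p1, int p0, int q0, int q1, int E)
{
    return 2 * abs(p0 - q0) + (abs(p1 - q1) >> 1) <= E;
}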
1492 ; store 1559 ; store
1493 %ifidn %2, v 1560 %ifidn %2, v
1494 mova [r0], m4 1561 mova [r0], m4
1495 mova [r0+r1], m6 1562 mova [r0+r1], m6
1496 %else ; h 1563 %else ; h
1497 mova m0, [rsp] ; p1 1564 inc r0
1498 SWAP 2, 4 ; p0 1565 SBUTTERFLY bw, 6, 4, 0
1499 SWAP 1, 6 ; q0 1566
1500 mova m3, [rsp+mmsize] ; q1
1501
1502 TRANSPOSE4x4B 0, 1, 2, 3, 4
1503 %if mmsize == 16 ; sse2 1567 %if mmsize == 16 ; sse2
1504 add r3, r1 ; change from r4*8*stride to r0+8*stride 1568 %ifidn %1, sse4
1505 WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16 1569 inc r4
1570 %endif
1571 WRITE_8W m6, r4, r0, r1, r2
1572 lea r4, [r3+r1+1]
1573 %ifidn %1, sse4
1574 inc r3
1575 %endif
1576 WRITE_8W m4, r3, r4, r1, r2
1506 %else ; mmx/mmxext 1577 %else ; mmx/mmxext
1507 WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 1578 WRITE_2x4W m6, m4, r4, r0, r1, r2
1508 %endif 1579 %endif
1509 %endif 1580 %endif
1510 1581
1511 %if mmsize == 8 ; mmx/mmxext 1582 %if mmsize == 8 ; mmx/mmxext
1512 ; next 8 pixels 1583 ; next 8 pixels
1513 %ifidn %2, v 1584 %ifidn %2, v
1514 add r0, 8 ; advance 8 cols = pixels 1585 add r0, 8 ; advance 8 cols = pixels
1515 %else ; h 1586 %else ; h
1516 lea r0, [r0+r2*8] ; advance 8 rows = lines 1587 lea r0, [r0+r2*8-1] ; advance 8 rows = lines
1517 %endif 1588 %endif
1518 dec r3 1589 dec r3
1519 jg .next8px 1590 jg .next8px
1520 %ifidn %2, v
1521 REP_RET 1591 REP_RET
1522 %else ; h
1523 mov rsp, r5 ; restore stack pointer
1524 RET
1525 %endif
1526 %else ; sse2 1592 %else ; sse2
1527 %ifidn %2, h
1528 mov rsp, r5 ; restore stack pointer
1529 %endif
1530 RET 1593 RET
1531 %endif 1594 %endif
1532 %endmacro 1595 %endmacro
1533 1596
1534 INIT_MMX 1597 INIT_MMX
1535 %define SPLATB_REG SPLATB_REG_MMX 1598 %define SPLATB_REG SPLATB_REG_MMX
1536 SIMPLE_LOOPFILTER mmx, v, 4 1599 SIMPLE_LOOPFILTER mmx, v, 4
1537 SIMPLE_LOOPFILTER mmx, h, 6 1600 SIMPLE_LOOPFILTER mmx, h, 5
1538 %define SPLATB_REG SPLATB_REG_MMXEXT 1601 %define SPLATB_REG SPLATB_REG_MMXEXT
1539 SIMPLE_LOOPFILTER mmxext, v, 4 1602 SIMPLE_LOOPFILTER mmxext, v, 4
1540 SIMPLE_LOOPFILTER mmxext, h, 6 1603 SIMPLE_LOOPFILTER mmxext, h, 5
1541 INIT_XMM 1604 INIT_XMM
1542 %define SPLATB_REG SPLATB_REG_SSE2 1605 %define SPLATB_REG SPLATB_REG_SSE2
1606 %define WRITE_8W WRITE_8W_SSE2
1543 SIMPLE_LOOPFILTER sse2, v, 3 1607 SIMPLE_LOOPFILTER sse2, v, 3
1544 SIMPLE_LOOPFILTER sse2, h, 6 1608 SIMPLE_LOOPFILTER sse2, h, 5
1545 %define SPLATB_REG SPLATB_REG_SSSE3 1609 %define SPLATB_REG SPLATB_REG_SSSE3
1546 SIMPLE_LOOPFILTER ssse3, v, 3 1610 SIMPLE_LOOPFILTER ssse3, v, 3
1547 SIMPLE_LOOPFILTER ssse3, h, 6 1611 SIMPLE_LOOPFILTER ssse3, h, 5
1612 %define WRITE_8W WRITE_8W_SSE4
1613 SIMPLE_LOOPFILTER sse4, h, 5
1548 1614
1549 ;----------------------------------------------------------------------------- 1615 ;-----------------------------------------------------------------------------
1550 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, 1616 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
1551 ; int flimE, int flimI, int hev_thr); 1617 ; int flimE, int flimI, int hev_thr);
1552 ;----------------------------------------------------------------------------- 1618 ;-----------------------------------------------------------------------------
2073 ;----------------------------------------------------------------------------- 2139 ;-----------------------------------------------------------------------------
2074 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, 2140 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
2075 ; int flimE, int flimI, int hev_thr); 2141 ; int flimE, int flimI, int hev_thr);
2076 ;----------------------------------------------------------------------------- 2142 ;-----------------------------------------------------------------------------
2077 2143
2078 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
2079 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
2080 ; for pre-SSE4:
2081 ; 3 is a general-purpose register that we will clobber
2082 ; for SSE4:
2083 ; 3 is a pointer to the destination's 5th line
2084 ; 4 is a pointer to the destination's 4th line
2085 ; 5/6 is -stride and +stride
2086 %macro WRITE_2x4W 6
2087 movd %3, %1
2088 punpckhdq %1, %1
2089 mov [%4+%5*4], %3w
2090 shr %3, 16
2091 add %4, %6
2092 mov [%4+%5*4], %3w
2093
2094 movd %3, %1
2095 add %4, %5
2096 mov [%4+%5*2], %3w
2097 shr %3, 16
2098 mov [%4+%5 ], %3w
2099
2100 movd %3, %2
2101 punpckhdq %2, %2
2102 mov [%4 ], %3w
2103 shr %3, 16
2104 mov [%4+%6 ], %3w
2105
2106 movd %3, %2
2107 add %4, %6
2108 mov [%4+%6 ], %3w
2109 shr %3, 16
2110 mov [%4+%6*2], %3w
2111 add %4, %5
2112 %endmacro
2113
2114 %macro WRITE_8W_SSE2 5
2115 movd %2, %1
2116 psrldq %1, 4
2117 mov [%3+%4*4], %2w
2118 shr %2, 16
2119 add %3, %5
2120 mov [%3+%4*4], %2w
2121
2122 movd %2, %1
2123 psrldq %1, 4
2124 add %3, %4
2125 mov [%3+%4*2], %2w
2126 shr %2, 16
2127 mov [%3+%4 ], %2w
2128
2129 movd %2, %1
2130 psrldq %1, 4
2131 mov [%3 ], %2w
2132 shr %2, 16
2133 mov [%3+%5 ], %2w
2134
2135 movd %2, %1
2136 add %3, %5
2137 mov [%3+%5 ], %2w
2138 shr %2, 16
2139 mov [%3+%5*2], %2w
2140 %endmacro
2141
2142 %macro WRITE_8W_SSE4 5
2143 pextrw [%3+%4*4], %1, 0
2144 pextrw [%2+%4*4], %1, 1
2145 pextrw [%3+%4*2], %1, 2
2146 pextrw [%3+%4 ], %1, 3
2147 pextrw [%3 ], %1, 4
2148 pextrw [%2 ], %1, 5
2149 pextrw [%2+%5 ], %1, 6
2150 pextrw [%2+%5*2], %1, 7
2151 %endmacro
2152
2153 %macro MBEDGE_LOOPFILTER 5 2144 %macro MBEDGE_LOOPFILTER 5
2154 %if %4 == 8 ; chroma 2145 %if %4 == 8 ; chroma
2155 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 2146 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
2156 %define dst8_reg r1 2147 %define dst8_reg r1
2157 %define mstride_reg r2 2148 %define mstride_reg r2