comparison: x86/vp8dsp.asm @ 12196:552c7c10bc73 (libavcodec)

Implement chroma (width=8) inner loopfilter MMX/MMX2/SSE2 functions.
author:   rbultje
date:     Mon, 19 Jul 2010 21:53:28 +0000
parents:  e7847fcff0f4
children: 677570e65a75
--- a/x86/vp8dsp.asm    (12195:e7847fcff0f4)
+++ b/x86/vp8dsp.asm    (12196:552c7c10bc73)
@@ -1162,44 +1162,61 @@
 
 ; write 4 xmm registers of 4 dwords each
 ; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
 ; we add 1*stride to the third regular registry in the process
-%macro WRITE_4x4D 9
+; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
+; same memory region), or 8 if they cover two separate buffers (third one points to
+; a different memory region than the first two), allowing for more optimal code for
+; the 16-width case
+%macro WRITE_4x4D 10
 ; write out (4 dwords per register), start with dwords zero
     movd    [%5+%8*4], m%1
     movd    [%5],      m%2
-    movd    [%5+%9*4], m%3
-    movd    [%5+%9*8], m%4
+    movd    [%7+%8*4], m%3
+    movd    [%7],      m%4
 
 ; store dwords 1
     psrldq  m%1, 4
     psrldq  m%2, 4
     psrldq  m%3, 4
     psrldq  m%4, 4
     movd    [%6+%8*4], m%1
     movd    [%6],      m%2
+%if %10 == 16
     movd    [%6+%9*4], m%3
-    movd    [%6+%9*8], m%4
+%endif
+    movd    [%7+%9],   m%4
 
 ; write dwords 2
     psrldq  m%1, 4
     psrldq  m%2, 4
+%if %10 == 8
+    movd    [%5+%8*2], m%1
+    movd    %5, m%3
+%endif
     psrldq  m%3, 4
     psrldq  m%4, 4
+%if %10 == 16
     movd    [%5+%8*2], m%1
+%endif
     movd    [%6+%9],   m%2
     movd    [%7+%8*2], m%3
     movd    [%7+%9*2], m%4
     add      %7, %9
 
; store dwords 3
     psrldq  m%1, 4
     psrldq  m%2, 4
     psrldq  m%3, 4
     psrldq  m%4, 4
+%if %10 == 8
+    mov     [%7+%8*4], %5
+    movd    [%6+%8*2], m%1
+%else
     movd    [%5+%8],   m%1
+%endif
     movd    [%6+%9*2], m%2
     movd    [%7+%8*2], m%3
     movd    [%7+%9*2], m%4
 %endmacro
 
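For orientation, here is a rough C model of what WRITE_4x4D does with its new 10th argument. It is only a sketch with invented buffer and parameter names: the four vector registers passed in hold transposed data, so dword k of register n is the 4-byte group belonging to output row 4*n + k, and the macro scatters those dwords back to memory, either into one 16-row region (argument == 16, luma) or split across two 8-row regions (argument == 8, chroma U and V).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct { uint32_t dw[4]; } vec4x32;   /* stand-in for one XMM register */

/* Illustrative model only, not the actual implementation; the asm receives
 * pre-offset pointers, so the base pointers here are purely for illustration. */
static void write_4x4d_model(const vec4x32 reg[4],
                             uint8_t *buf,                    /* width == 16: one buffer */
                             uint8_t *buf_u, uint8_t *buf_v,  /* width == 8: U and V     */
                             ptrdiff_t stride, int width)
{
    for (int n = 0; n < 4; n++) {
        for (int k = 0; k < 4; k++) {
            int row = 4 * n + k;                  /* row this dword belongs to */
            uint8_t *dst = width == 16 ? buf   + row       * stride
                         : row < 8     ? buf_u + row       * stride
                                       : buf_v + (row - 8) * stride;
            memcpy(dst, &reg[n].dw[k], 4);        /* one movd-sized store */
        }
    }
}

The extra argument lets the luma case keep the cheaper single-region addressing, while the chroma case gets the two-buffer routing sketched above, which is what the "more optimal code for the 16-width case" remark in the macro comment refers to.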
@@ -1333,11 +1350,11 @@
     mova          m3, [rsp+mmsize]    ; q1
 
     TRANSPOSE4x4B  0, 1, 2, 3, 4
 %if mmsize == 16 ; sse2
     add           r3, r1              ; change from r4*8*stride to r0+8*stride
-    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2
+    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16
 %else ; mmx/mmxext
     WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
 %endif
 %endif
 
@@ -1372,28 +1389,37 @@
 INIT_XMM
 SIMPLE_LOOPFILTER sse2, v, 3
 SIMPLE_LOOPFILTER sse2, h, 6
 
 ;-----------------------------------------------------------------------------
-; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, int stride,
+; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                            int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------
 
-%macro INNER_LOOPFILTER 4
-cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %4
-%define dst_reg     r0
+%macro INNER_LOOPFILTER 5
+%if %4 == 8 ; chroma
+cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
+%define dst8_reg    r1
+%define mstride_reg r2
+%define E_reg       r3
+%define I_reg       r4
+%define hev_thr_reg r5
+%else ; luma
+cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %define mstride_reg r1
 %define E_reg       r2
 %define I_reg       r3
 %define hev_thr_reg r4
 %ifdef m8 ; x86-64, sse2
 %elif mmsize == 16 ; x86-32, sse2
 %define dst8_reg    r5
 %else ; x86-32, mmx/mmxext
 %define cnt_reg     r5
 %endif
+%endif
+%define dst_reg     r0
 %define stride_reg  E_reg
 %define dst2_reg    I_reg
 %ifndef m8
 %define stack_reg   hev_thr_reg
 %endif
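Spelled out in C, the bracketed [uint8_t *v,] in the prototype comment above means the new chroma entry points take a second destination pointer (U and V are each only 8 pixels wide, so one call filters both planes), while the luma entry points keep the old signature. A sketch of the resulting prototypes, using the argument names from the comment; the exported symbol names are generated from the cglobal lines (e.g. with the sse2 suffix and the usual ff_ prefix), so treat the exact names here as an assumption:

#include <stdint.h>

/* sketch only, derived from the prototype comment above */
void ff_vp8_v_loop_filter16y_inner_sse2(uint8_t *dst, int stride,
                                        int flimE, int flimI, int hev_thr);
void ff_vp8_v_loop_filter8uv_inner_sse2(uint8_t *dstU, uint8_t *dstV, int stride,
                                        int flimE, int flimI, int hev_thr);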
@@ -1434,31 +1460,49 @@
     SPLATB_REG   flim_E, E_reg, %1       ; E
     SPLATB_REG   flim_I, I_reg, %1       ; I
     SPLATB_REG  hev_thr, hev_thr_reg, %1 ; hev_thresh
 %endif
 
-%if mmsize == 8 ; mmx/mmxext
+%if mmsize == 8 && %4 == 16 ; mmx/mmxext
     mov         cnt_reg, 2
 %endif
     mov      stride_reg, mstride_reg
     neg     mstride_reg
 %ifidn %2, h
     lea         dst_reg, [dst_reg + stride_reg*4-4]
+%if %4 == 8
+    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
+%endif
 %endif
 
 %if mmsize == 8
 .next8px
 %endif
     ; read
     lea        dst2_reg, [dst_reg + stride_reg]
 %ifidn %2, v
-    mova             m0, [dst_reg +mstride_reg*4] ; p3
-    mova             m1, [dst2_reg+mstride_reg*4] ; p2
-    mova             m2, [dst_reg +mstride_reg*2] ; p1
-    mova             m5, [dst2_reg]               ; q1
-    mova             m6, [dst2_reg+ stride_reg]   ; q2
-    mova             m7, [dst2_reg+ stride_reg*2] ; q3
+%if %4 == 8 && mmsize == 16
+%define movrow movh
+%else
+%define movrow mova
+%endif
+    movrow           m0, [dst_reg +mstride_reg*4] ; p3
+    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
+    movrow           m2, [dst_reg +mstride_reg*2] ; p1
+    movrow           m5, [dst2_reg]               ; q1
+    movrow           m6, [dst2_reg+ stride_reg]   ; q2
+    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
+%if mmsize == 16 && %4 == 8
+    movhps           m0, [dst8_reg+mstride_reg*4]
+    movhps           m2, [dst8_reg+mstride_reg*2]
+    add        dst8_reg, stride_reg
+    movhps           m1, [dst8_reg+mstride_reg*4]
+    movhps           m5, [dst8_reg]
+    movhps           m6, [dst8_reg+ stride_reg]
+    movhps           m7, [dst8_reg+ stride_reg*2]
+    add        dst8_reg, mstride_reg
+%endif
 %elif mmsize == 8 ; mmx/mmxext (h)
     ; read 8 rows of 8px each
     movu             m0, [dst_reg +mstride_reg*4]
     movu             m1, [dst2_reg+mstride_reg*4]
     movu             m2, [dst_reg +mstride_reg*2]
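The new SSE2 vertical chroma path above is built around one packing idea: each 8-pixel U row is loaded into the low half of an xmm register (movrow expands to movh there) and the matching V row into the high half (movhps), so both chroma planes run through the same 16-pixel-wide filter code. A minimal C picture of that packing, with helper and parameter names made up for illustration:

#include <stdint.h>
#include <string.h>

/* Build one 16-byte register image from an 8px U row and an 8px V row,
 * mirroring the movh/movhps pairs above. Illustration only. */
static void pack_uv_row(uint8_t row16[16], const uint8_t *u_row, const uint8_t *v_row)
{
    memcpy(row16,     u_row, 8);   /* low  half <- U plane row (movh)   */
    memcpy(row16 + 8, v_row, 8);   /* high half <- V plane row (movhps) */
}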
@@ -1495,11 +1539,13 @@
     SWAP              1, 4
     SWAP              2, 4
     SWAP              6, 3
     SWAP              5, 3
 %else ; sse2 (h)
+%if %4 == 16
     lea        dst8_reg, [dst_reg + stride_reg*8]
+%endif
 
     ; read 16 rows of 8px each, interleave
     movh             m0, [dst_reg +mstride_reg*4]
     movh             m1, [dst8_reg+mstride_reg*4]
     movh             m2, [dst_reg +mstride_reg*2]
@@ -1607,11 +1653,14 @@
 %endif
 
 ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP              7, 3           ; now m7 is zero
 %ifidn %2, v
-    mova             m3, [dst_reg +mstride_reg] ; p0
+    movrow           m3, [dst_reg +mstride_reg] ; p0
+%if mmsize == 16 && %4 == 8
+    movhps           m3, [dst8_reg+mstride_reg]
+%endif
 %elifdef m14
     SWAP              3, 12
 %else
     mova             m3, [rsp+mmsize*3]
 %endif
@@ -1640,11 +1689,14 @@
     SWAP              1, 4           ; max_hev_thresh
 %endif
 
     SWAP              6, 4           ; now m6 is I
 %ifidn %2, v
-    mova             m4, [dst_reg]   ; q0
+    movrow           m4, [dst_reg]   ; q0
+%if mmsize == 16 && %4 == 8
+    movhps           m4, [dst8_reg]
+%endif
 %elifdef m13
     SWAP              4, 8
 %else
     mova             m4, [rsp+mmsize*4]
 %endif
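For readers following the mask logic, the "normal_limit" and "high_edge_variance" names above refer to the per-pixel decisions of the VP8 normal loop filter. A scalar C restatement of those conditions, following the VP8 specification (RFC 6386) rather than this file, looks roughly like this:

#include <stdlib.h>

/* filter only where the edge step is small enough (E) and the neighbourhood is smooth (I) */
static int normal_limit(int p3, int p2, int p1, int p0,
                        int q0, int q1, int q2, int q3, int E, int I)
{
    return abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= E &&
           abs(p3 - p2) <= I && abs(p2 - p1) <= I && abs(p1 - p0) <= I &&
           abs(q3 - q2) <= I && abs(q2 - q1) <= I && abs(q1 - q0) <= I;
}

/* high edge variance: if set, only p0/q0 are adjusted; otherwise p1/q1 are filtered too */
static int high_edge_variance(int p1, int p0, int q0, int q1, int thresh)
{
    return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
}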
@@ -1791,52 +1843,76 @@
     paddusb          m5, m1          ; q1-a
     paddusb          m2, m0          ; p1+a
 
     ; store
 %ifidn %2, v
-    mova [dst_reg+mstride_reg*2], m2
-    mova [dst_reg+mstride_reg ], m3
-    mova [dst_reg], m4
-    mova [dst_reg+ stride_reg ], m5
+    movrow [dst_reg +mstride_reg*2], m2
+    movrow [dst_reg +mstride_reg ], m3
+    movrow [dst_reg], m4
+    movrow [dst_reg + stride_reg ], m5
+%if mmsize == 16 && %4 == 8
+    movhps [dst8_reg+mstride_reg*2], m2
+    movhps [dst8_reg+mstride_reg ], m3
+    movhps [dst8_reg], m4
+    movhps [dst8_reg+ stride_reg ], m5
+%endif
 %else ; h
     add         dst_reg, 2
     add        dst2_reg, 2
 
     ; 4x8/16 transpose
     TRANSPOSE4x4B     2, 3, 4, 5, 6
 
 %if mmsize == 8 ; mmx/mmxext (h)
     WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
     lea        dst8_reg, [dst8_reg+mstride_reg+2]
-    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg
+    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
 %endif
 %endif
 
 %if mmsize == 8
+%if %4 == 8 ; chroma
+%ifidn %2, h
+    sub         dst_reg, 2
+%endif
+    cmp         dst_reg, dst8_reg
+    mov         dst_reg, dst8_reg
+    jnz .next8px
+%else
 %ifidn %2, h
     lea         dst_reg, [dst_reg + stride_reg*8-2]
 %else ; v
     add         dst_reg, 8
 %endif
     dec         cnt_reg
     jg .next8px
 %endif
+%endif
 
 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
     mov             rsp, stack_reg   ; restore stack pointer
 %endif
     RET
 %endmacro
 
 INIT_MMX
-INNER_LOOPFILTER mmx, v, 6, 8
-INNER_LOOPFILTER mmx, h, 6, 8
-INNER_LOOPFILTER mmxext, v, 6, 8
-INNER_LOOPFILTER mmxext, h, 6, 8
+INNER_LOOPFILTER mmx, v, 6, 16, 8
+INNER_LOOPFILTER mmx, h, 6, 16, 8
+INNER_LOOPFILTER mmxext, v, 6, 16, 8
+INNER_LOOPFILTER mmxext, h, 6, 16, 8
+
+INNER_LOOPFILTER mmx, v, 6, 8, 8
+INNER_LOOPFILTER mmx, h, 6, 8, 8
+INNER_LOOPFILTER mmxext, v, 6, 8, 8
+INNER_LOOPFILTER mmxext, h, 6, 8, 8
+
 INIT_XMM
-INNER_LOOPFILTER sse2, v, 5, 13
+INNER_LOOPFILTER sse2, v, 5, 16, 13
 %ifdef m8
-INNER_LOOPFILTER sse2, h, 5, 13
+INNER_LOOPFILTER sse2, h, 5, 16, 13
 %else
-INNER_LOOPFILTER sse2, h, 6, 13
+INNER_LOOPFILTER sse2, h, 6, 16, 13
 %endif
+
+INNER_LOOPFILTER sse2, v, 6, 8, 13
+INNER_LOOPFILTER sse2, h, 6, 8, 13
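A final note on how the new chroma instantiations above iterate. In the SSE2 versions the U and V rows share one xmm register, so a single pass covers both planes; in the MMX/MMXEXT versions the same 8-pixel-wide body simply runs twice, first on the U pointer and then on the V pointer, using the cmp/mov/jnz trio added before the jump back to .next8px. A C-like control-flow sketch of that MMX/MMXEXT case, with a hypothetical helper standing in for one pass of the loop body:

#include <stdint.h>

/* hypothetical stand-in for one 8px-wide pass of the .next8px body */
static void filter_8px_block(uint8_t *dst, int stride, int E, int I, int hev_thresh)
{
    (void)dst; (void)stride; (void)E; (void)I; (void)hev_thresh;
}

/* control-flow sketch only, not actual decoder code */
static void inner_loopfilter8uv_mmx_model(uint8_t *dst_u, uint8_t *dst_v,
                                          int stride, int E, int I, int hev_thresh)
{
    uint8_t *dst = dst_u;
    for (;;) {
        filter_8px_block(dst, stride, E, I, hev_thresh);
        if (dst == dst_v)   /* second pass: V plane done, cmp/jnz falls through */
            break;
        dst = dst_v;        /* first pass: switch from the U to the V plane */
    }
}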