Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12266:48d6738904a9 libavcodec
Fix SPLATB_REG mess. It used to be an if/elseif/elseif/elseif spaghetti; this
splits it into small optimization-specific macros which are selected for each
DSP function. The advantage of this approach is that the sse4 functions now
use the ssse3 codepath without needing an explicit sse4 codepath.
author | rbultje |
---|---|
date | Sat, 24 Jul 2010 19:33:05 +0000 |
parents | c7f6ddcc5c01 |
children | 259988e7ad0f |
comparison
equal
deleted
inserted
replaced
12265:b120f1854e35 | 12266:48d6738904a9 |
---|---|
1358 movd [%6+%9*2], m%2 | 1358 movd [%6+%9*2], m%2 |
1359 movd [%7+%8*2], m%3 | 1359 movd [%7+%8*2], m%3 |
1360 movd [%7+%9*2], m%4 | 1360 movd [%7+%9*2], m%4 |
1361 %endmacro | 1361 %endmacro |
1362 | 1362 |
1363 %macro SPLATB_REG 3-4 | 1363 %macro SPLATB_REG_MMX 2-3 |
1364 movd %1, %2 | 1364 movd %1, %2 |
1365 %ifidn %3, ssse3 | |
1366 pshufb %1, %4 | |
1367 %else | |
1368 punpcklbw %1, %1 | 1365 punpcklbw %1, %1 |
1369 %if mmsize == 16 ; sse2 | 1366 punpcklwd %1, %1 |
1367 punpckldq %1, %1 | |
1368 %endmacro | |
1369 | |
1370 %macro SPLATB_REG_MMXEXT 2-3 | |
1371 movd %1, %2 | |
1372 punpcklbw %1, %1 | |
1373 pshufw %1, %1, 0x0 | |
1374 %endmacro | |
1375 | |
1376 %macro SPLATB_REG_SSE2 2-3 | |
1377 movd %1, %2 | |
1378 punpcklbw %1, %1 | |
1370 pshuflw %1, %1, 0x0 | 1379 pshuflw %1, %1, 0x0 |
1371 punpcklqdq %1, %1 | 1380 punpcklqdq %1, %1 |
1372 %elifidn %3, mmx | 1381 %endmacro |
1373 punpcklwd %1, %1 | 1382 |
1374 punpckldq %1, %1 | 1383 %macro SPLATB_REG_SSSE3 3 |
1375 %else ; mmxext | 1384 movd %1, %2 |
1376 pshufw %1, %1, 0x0 | 1385 pshufb %1, %3 |
1377 %endif | |
1378 %endif | |
1379 %endmacro | 1386 %endmacro |
1380 | 1387 |
1381 %macro SIMPLE_LOOPFILTER 3 | 1388 %macro SIMPLE_LOOPFILTER 3 |
1382 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 | 1389 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 |
1383 %ifidn %2, h | 1390 %ifidn %2, h |
1385 and rsp, ~(mmsize-1) ; align stack | 1392 and rsp, ~(mmsize-1) ; align stack |
1386 %endif | 1393 %endif |
1387 %if mmsize == 8 ; mmx/mmxext | 1394 %if mmsize == 8 ; mmx/mmxext |
1388 mov r3, 2 | 1395 mov r3, 2 |
1389 %endif | 1396 %endif |
1390 %ifidn %1, ssse3 | 1397 %ifnidn %1, sse2 && mmsize == 16 |
1391 pxor m0, m0 | 1398 pxor m0, m0 |
1392 %endif | 1399 %endif |
1393 SPLATB_REG m7, r2, %1, m0 ; splat "flim" into register | 1400 SPLATB_REG m7, r2, m0 ; splat "flim" into register |
1394 | 1401 |
1395 ; set up indexes to address 4 rows | 1402 ; set up indexes to address 4 rows |
1396 mov r2, r1 | 1403 mov r2, r1 |
1397 neg r1 | 1404 neg r1 |
1398 %ifidn %2, h | 1405 %ifidn %2, h |
1527 RET | 1534 RET |
1528 %endif | 1535 %endif |
1529 %endmacro | 1536 %endmacro |
1530 | 1537 |
1531 INIT_MMX | 1538 INIT_MMX |
1539 %define SPLATB_REG SPLATB_REG_MMX | |
1532 SIMPLE_LOOPFILTER mmx, v, 4 | 1540 SIMPLE_LOOPFILTER mmx, v, 4 |
1533 SIMPLE_LOOPFILTER mmx, h, 6 | 1541 SIMPLE_LOOPFILTER mmx, h, 6 |
1542 %define SPLATB_REG SPLATB_REG_MMXEXT | |
1534 SIMPLE_LOOPFILTER mmxext, v, 4 | 1543 SIMPLE_LOOPFILTER mmxext, v, 4 |
1535 SIMPLE_LOOPFILTER mmxext, h, 6 | 1544 SIMPLE_LOOPFILTER mmxext, h, 6 |
1536 INIT_XMM | 1545 INIT_XMM |
1546 %define SPLATB_REG SPLATB_REG_SSE2 | |
1537 SIMPLE_LOOPFILTER sse2, v, 3 | 1547 SIMPLE_LOOPFILTER sse2, v, 3 |
1538 SIMPLE_LOOPFILTER sse2, h, 6 | 1548 SIMPLE_LOOPFILTER sse2, h, 6 |
1549 %define SPLATB_REG SPLATB_REG_SSSE3 | |
1539 SIMPLE_LOOPFILTER ssse3, v, 3 | 1550 SIMPLE_LOOPFILTER ssse3, v, 3 |
1540 SIMPLE_LOOPFILTER ssse3, h, 6 | 1551 SIMPLE_LOOPFILTER ssse3, h, 6 |
1541 | 1552 |
1542 ;----------------------------------------------------------------------------- | 1553 ;----------------------------------------------------------------------------- |
1543 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, | 1554 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
1571 %define dst2_reg I_reg | 1582 %define dst2_reg I_reg |
1572 %ifndef m8 | 1583 %ifndef m8 |
1573 %define stack_reg hev_thr_reg | 1584 %define stack_reg hev_thr_reg |
1574 %endif | 1585 %endif |
1575 | 1586 |
1576 %ifidn %1, ssse3 | 1587 %ifnidn %1, sse2 && mmsize == 16 |
1577 pxor m7, m7 | 1588 pxor m7, m7 |
1578 %endif | 1589 %endif |
1579 | 1590 |
1580 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 | 1591 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
1581 ; splat function arguments | 1592 ; splat function arguments |
1582 SPLATB_REG m0, E_reg, %1, m7 ; E | 1593 SPLATB_REG m0, E_reg, m7 ; E |
1583 SPLATB_REG m1, I_reg, %1, m7 ; I | 1594 SPLATB_REG m1, I_reg, m7 ; I |
1584 SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh | 1595 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
1585 | 1596 |
1586 ; align stack | 1597 ; align stack |
1587 mov stack_reg, rsp ; backup stack pointer | 1598 mov stack_reg, rsp ; backup stack pointer |
1588 and rsp, ~(mmsize-1) ; align stack | 1599 and rsp, ~(mmsize-1) ; align stack |
1589 %ifidn %2, v | 1600 %ifidn %2, v |
1612 %define mask_res m12 | 1623 %define mask_res m12 |
1613 %define p0backup m12 | 1624 %define p0backup m12 |
1614 %define q0backup m8 | 1625 %define q0backup m8 |
1615 | 1626 |
1616 ; splat function arguments | 1627 ; splat function arguments |
1617 SPLATB_REG flim_E, E_reg, %1, m7 ; E | 1628 SPLATB_REG flim_E, E_reg, m7 ; E |
1618 SPLATB_REG flim_I, I_reg, %1, m7 ; I | 1629 SPLATB_REG flim_I, I_reg, m7 ; I |
1619 SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh | 1630 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
1620 %endif | 1631 %endif |
1621 | 1632 |
1622 %if mmsize == 8 && %4 == 16 ; mmx/mmxext | 1633 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
1623 mov cnt_reg, 2 | 1634 mov cnt_reg, 2 |
1624 %endif | 1635 %endif |
2026 %endif | 2037 %endif |
2027 RET | 2038 RET |
2028 %endmacro | 2039 %endmacro |
2029 | 2040 |
2030 INIT_MMX | 2041 INIT_MMX |
2042 %define SPLATB_REG SPLATB_REG_MMX | |
2031 INNER_LOOPFILTER mmx, v, 6, 16, 0 | 2043 INNER_LOOPFILTER mmx, v, 6, 16, 0 |
2032 INNER_LOOPFILTER mmx, h, 6, 16, 0 | 2044 INNER_LOOPFILTER mmx, h, 6, 16, 0 |
2045 INNER_LOOPFILTER mmx, v, 6, 8, 0 | |
2046 INNER_LOOPFILTER mmx, h, 6, 8, 0 | |
2047 | |
2048 %define SPLATB_REG SPLATB_REG_MMXEXT | |
2033 INNER_LOOPFILTER mmxext, v, 6, 16, 0 | 2049 INNER_LOOPFILTER mmxext, v, 6, 16, 0 |
2034 INNER_LOOPFILTER mmxext, h, 6, 16, 0 | 2050 INNER_LOOPFILTER mmxext, h, 6, 16, 0 |
2035 | |
2036 INNER_LOOPFILTER mmx, v, 6, 8, 0 | |
2037 INNER_LOOPFILTER mmx, h, 6, 8, 0 | |
2038 INNER_LOOPFILTER mmxext, v, 6, 8, 0 | 2051 INNER_LOOPFILTER mmxext, v, 6, 8, 0 |
2039 INNER_LOOPFILTER mmxext, h, 6, 8, 0 | 2052 INNER_LOOPFILTER mmxext, h, 6, 8, 0 |
2040 | 2053 |
2041 INIT_XMM | 2054 INIT_XMM |
2055 %define SPLATB_REG SPLATB_REG_SSE2 | |
2042 INNER_LOOPFILTER sse2, v, 5, 16, 13 | 2056 INNER_LOOPFILTER sse2, v, 5, 16, 13 |
2043 %ifdef m8 | 2057 %ifdef m8 |
2044 INNER_LOOPFILTER sse2, h, 5, 16, 13 | 2058 INNER_LOOPFILTER sse2, h, 5, 16, 13 |
2045 %else | 2059 %else |
2046 INNER_LOOPFILTER sse2, h, 6, 16, 13 | 2060 INNER_LOOPFILTER sse2, h, 6, 16, 13 |
2047 %endif | 2061 %endif |
2048 INNER_LOOPFILTER sse2, v, 6, 8, 13 | 2062 INNER_LOOPFILTER sse2, v, 6, 8, 13 |
2049 INNER_LOOPFILTER sse2, h, 6, 8, 13 | 2063 INNER_LOOPFILTER sse2, h, 6, 8, 13 |
2050 | 2064 |
2065 %define SPLATB_REG SPLATB_REG_SSSE3 | |
2051 INNER_LOOPFILTER ssse3, v, 5, 16, 13 | 2066 INNER_LOOPFILTER ssse3, v, 5, 16, 13 |
2052 %ifdef m8 | 2067 %ifdef m8 |
2053 INNER_LOOPFILTER ssse3, h, 5, 16, 13 | 2068 INNER_LOOPFILTER ssse3, h, 5, 16, 13 |
2054 %else | 2069 %else |
2055 INNER_LOOPFILTER ssse3, h, 6, 16, 13 | 2070 INNER_LOOPFILTER ssse3, h, 6, 16, 13 |
2150 %define dst2_reg I_reg | 2165 %define dst2_reg I_reg |
2151 %ifndef m8 | 2166 %ifndef m8 |
2152 %define stack_reg hev_thr_reg | 2167 %define stack_reg hev_thr_reg |
2153 %endif | 2168 %endif |
2154 | 2169 |
2155 %ifidn %1, ssse3 | 2170 %ifnidn %1, sse2 && mmsize == 16 |
2156 pxor m7, m7 | 2171 pxor m7, m7 |
2157 %endif | 2172 %endif |
2158 | 2173 |
2159 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 | 2174 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
2160 ; splat function arguments | 2175 ; splat function arguments |
2161 SPLATB_REG m0, E_reg, %1, m7 ; E | 2176 SPLATB_REG m0, E_reg, m7 ; E |
2162 SPLATB_REG m1, I_reg, %1, m7 ; I | 2177 SPLATB_REG m1, I_reg, m7 ; I |
2163 SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh | 2178 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
2164 | 2179 |
2165 ; align stack | 2180 ; align stack |
2166 mov stack_reg, rsp ; backup stack pointer | 2181 mov stack_reg, rsp ; backup stack pointer |
2167 and rsp, ~(mmsize-1) ; align stack | 2182 and rsp, ~(mmsize-1) ; align stack |
2168 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr | 2183 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
2198 %define p2backup m13 | 2213 %define p2backup m13 |
2199 %define q2backup m14 | 2214 %define q2backup m14 |
2200 %define lim_sign m15 | 2215 %define lim_sign m15 |
2201 | 2216 |
2202 ; splat function arguments | 2217 ; splat function arguments |
2203 SPLATB_REG flim_E, E_reg, %1, m7 ; E | 2218 SPLATB_REG flim_E, E_reg, m7 ; E |
2204 SPLATB_REG flim_I, I_reg, %1, m7 ; I | 2219 SPLATB_REG flim_I, I_reg, m7 ; I |
2205 SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh | 2220 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
2206 %endif | 2221 %endif |
2207 | 2222 |
2208 %if mmsize == 8 && %4 == 16 ; mmx/mmxext | 2223 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
2209 mov cnt_reg, 2 | 2224 mov cnt_reg, 2 |
2210 %endif | 2225 %endif |
2694 %endif | 2709 %endif |
2695 RET | 2710 RET |
2696 %endmacro | 2711 %endmacro |
2697 | 2712 |
2698 INIT_MMX | 2713 INIT_MMX |
2714 %define SPLATB_REG SPLATB_REG_MMX | |
2699 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 | 2715 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 |
2700 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 | 2716 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 |
2717 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 | |
2718 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 | |
2719 | |
2720 %define SPLATB_REG SPLATB_REG_MMXEXT | |
2701 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 | 2721 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 |
2702 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 | 2722 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 |
2703 | |
2704 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 | |
2705 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 | |
2706 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 | 2723 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 |
2707 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 | 2724 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 |
2708 | 2725 |
2709 INIT_XMM | 2726 INIT_XMM |
2727 %define SPLATB_REG SPLATB_REG_SSE2 | |
2710 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16 | 2728 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16 |
2711 %ifdef m8 | 2729 %ifdef m8 |
2712 MBEDGE_LOOPFILTER sse2, h, 5, 16, 16 | 2730 MBEDGE_LOOPFILTER sse2, h, 5, 16, 16 |
2713 %else | 2731 %else |
2714 MBEDGE_LOOPFILTER sse2, h, 6, 16, 16 | 2732 MBEDGE_LOOPFILTER sse2, h, 6, 16, 16 |
2715 %endif | 2733 %endif |
2716 MBEDGE_LOOPFILTER sse2, v, 6, 8, 16 | 2734 MBEDGE_LOOPFILTER sse2, v, 6, 8, 16 |
2717 MBEDGE_LOOPFILTER sse2, h, 6, 8, 16 | 2735 MBEDGE_LOOPFILTER sse2, h, 6, 8, 16 |
2718 | 2736 |
2737 %define SPLATB_REG SPLATB_REG_SSSE3 | |
2719 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16 | 2738 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16 |
2720 %ifdef m8 | 2739 %ifdef m8 |
2721 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16 | 2740 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16 |
2722 %else | 2741 %else |
2723 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16 | 2742 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16 |