comparison x86/vp8dsp.asm @ 12266:48d6738904a9 libavcodec

Fix SPLATB_REG mess. It used to be an if/elseif/elseif/elseif spaghetti, so this splits it into small optimization-specific macros which are selected for each DSP function. The advantage of this approach is that the sse4 functions can now also use the ssse3 codepath without needing an explicit sse4 codepath.
author rbultje
date Sat, 24 Jul 2010 19:33:05 +0000
parents c7f6ddcc5c01
children 259988e7ad0f
comparison
equal deleted inserted replaced
12265:b120f1854e35 12266:48d6738904a9
1358 movd [%6+%9*2], m%2 1358 movd [%6+%9*2], m%2
1359 movd [%7+%8*2], m%3 1359 movd [%7+%8*2], m%3
1360 movd [%7+%9*2], m%4 1360 movd [%7+%9*2], m%4
1361 %endmacro 1361 %endmacro
1362 1362
1363 %macro SPLATB_REG 3-4 1363 %macro SPLATB_REG_MMX 2-3
1364 movd %1, %2 1364 movd %1, %2
1365 %ifidn %3, ssse3
1366 pshufb %1, %4
1367 %else
1368 punpcklbw %1, %1 1365 punpcklbw %1, %1
1369 %if mmsize == 16 ; sse2 1366 punpcklwd %1, %1
1367 punpckldq %1, %1
1368 %endmacro
1369
1370 %macro SPLATB_REG_MMXEXT 2-3
1371 movd %1, %2
1372 punpcklbw %1, %1
1373 pshufw %1, %1, 0x0
1374 %endmacro
1375
1376 %macro SPLATB_REG_SSE2 2-3
1377 movd %1, %2
1378 punpcklbw %1, %1
1370 pshuflw %1, %1, 0x0 1379 pshuflw %1, %1, 0x0
1371 punpcklqdq %1, %1 1380 punpcklqdq %1, %1
1372 %elifidn %3, mmx 1381 %endmacro
1373 punpcklwd %1, %1 1382
1374 punpckldq %1, %1 1383 %macro SPLATB_REG_SSSE3 3
1375 %else ; mmxext 1384 movd %1, %2
1376 pshufw %1, %1, 0x0 1385 pshufb %1, %3
1377 %endif
1378 %endif
1379 %endmacro 1386 %endmacro
1380 1387
1381 %macro SIMPLE_LOOPFILTER 3 1388 %macro SIMPLE_LOOPFILTER 3
1382 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 1389 cglobal vp8_%2_loop_filter_simple_%1, 3, %3
1383 %ifidn %2, h 1390 %ifidn %2, h
1385 and rsp, ~(mmsize-1) ; align stack 1392 and rsp, ~(mmsize-1) ; align stack
1386 %endif 1393 %endif
1387 %if mmsize == 8 ; mmx/mmxext 1394 %if mmsize == 8 ; mmx/mmxext
1388 mov r3, 2 1395 mov r3, 2
1389 %endif 1396 %endif
1390 %ifidn %1, ssse3 1397 %ifnidn %1, sse2 && mmsize == 16
1391 pxor m0, m0 1398 pxor m0, m0
1392 %endif 1399 %endif
1393 SPLATB_REG m7, r2, %1, m0 ; splat "flim" into register 1400 SPLATB_REG m7, r2, m0 ; splat "flim" into register
1394 1401
1395 ; set up indexes to address 4 rows 1402 ; set up indexes to address 4 rows
1396 mov r2, r1 1403 mov r2, r1
1397 neg r1 1404 neg r1
1398 %ifidn %2, h 1405 %ifidn %2, h
1527 RET 1534 RET
1528 %endif 1535 %endif
1529 %endmacro 1536 %endmacro
1530 1537
1531 INIT_MMX 1538 INIT_MMX
1539 %define SPLATB_REG SPLATB_REG_MMX
1532 SIMPLE_LOOPFILTER mmx, v, 4 1540 SIMPLE_LOOPFILTER mmx, v, 4
1533 SIMPLE_LOOPFILTER mmx, h, 6 1541 SIMPLE_LOOPFILTER mmx, h, 6
1542 %define SPLATB_REG SPLATB_REG_MMXEXT
1534 SIMPLE_LOOPFILTER mmxext, v, 4 1543 SIMPLE_LOOPFILTER mmxext, v, 4
1535 SIMPLE_LOOPFILTER mmxext, h, 6 1544 SIMPLE_LOOPFILTER mmxext, h, 6
1536 INIT_XMM 1545 INIT_XMM
1546 %define SPLATB_REG SPLATB_REG_SSE2
1537 SIMPLE_LOOPFILTER sse2, v, 3 1547 SIMPLE_LOOPFILTER sse2, v, 3
1538 SIMPLE_LOOPFILTER sse2, h, 6 1548 SIMPLE_LOOPFILTER sse2, h, 6
1549 %define SPLATB_REG SPLATB_REG_SSSE3
1539 SIMPLE_LOOPFILTER ssse3, v, 3 1550 SIMPLE_LOOPFILTER ssse3, v, 3
1540 SIMPLE_LOOPFILTER ssse3, h, 6 1551 SIMPLE_LOOPFILTER ssse3, h, 6
1541 1552
1542 ;----------------------------------------------------------------------------- 1553 ;-----------------------------------------------------------------------------
1543 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, 1554 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
1571 %define dst2_reg I_reg 1582 %define dst2_reg I_reg
1572 %ifndef m8 1583 %ifndef m8
1573 %define stack_reg hev_thr_reg 1584 %define stack_reg hev_thr_reg
1574 %endif 1585 %endif
1575 1586
1576 %ifidn %1, ssse3 1587 %ifnidn %1, sse2 && mmsize == 16
1577 pxor m7, m7 1588 pxor m7, m7
1578 %endif 1589 %endif
1579 1590
1580 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 1591 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
1581 ; splat function arguments 1592 ; splat function arguments
1582 SPLATB_REG m0, E_reg, %1, m7 ; E 1593 SPLATB_REG m0, E_reg, m7 ; E
1583 SPLATB_REG m1, I_reg, %1, m7 ; I 1594 SPLATB_REG m1, I_reg, m7 ; I
1584 SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh 1595 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh
1585 1596
1586 ; align stack 1597 ; align stack
1587 mov stack_reg, rsp ; backup stack pointer 1598 mov stack_reg, rsp ; backup stack pointer
1588 and rsp, ~(mmsize-1) ; align stack 1599 and rsp, ~(mmsize-1) ; align stack
1589 %ifidn %2, v 1600 %ifidn %2, v
1612 %define mask_res m12 1623 %define mask_res m12
1613 %define p0backup m12 1624 %define p0backup m12
1614 %define q0backup m8 1625 %define q0backup m8
1615 1626
1616 ; splat function arguments 1627 ; splat function arguments
1617 SPLATB_REG flim_E, E_reg, %1, m7 ; E 1628 SPLATB_REG flim_E, E_reg, m7 ; E
1618 SPLATB_REG flim_I, I_reg, %1, m7 ; I 1629 SPLATB_REG flim_I, I_reg, m7 ; I
1619 SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh 1630 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh
1620 %endif 1631 %endif
1621 1632
1622 %if mmsize == 8 && %4 == 16 ; mmx/mmxext 1633 %if mmsize == 8 && %4 == 16 ; mmx/mmxext
1623 mov cnt_reg, 2 1634 mov cnt_reg, 2
1624 %endif 1635 %endif
2026 %endif 2037 %endif
2027 RET 2038 RET
2028 %endmacro 2039 %endmacro
2029 2040
2030 INIT_MMX 2041 INIT_MMX
2042 %define SPLATB_REG SPLATB_REG_MMX
2031 INNER_LOOPFILTER mmx, v, 6, 16, 0 2043 INNER_LOOPFILTER mmx, v, 6, 16, 0
2032 INNER_LOOPFILTER mmx, h, 6, 16, 0 2044 INNER_LOOPFILTER mmx, h, 6, 16, 0
2045 INNER_LOOPFILTER mmx, v, 6, 8, 0
2046 INNER_LOOPFILTER mmx, h, 6, 8, 0
2047
2048 %define SPLATB_REG SPLATB_REG_MMXEXT
2033 INNER_LOOPFILTER mmxext, v, 6, 16, 0 2049 INNER_LOOPFILTER mmxext, v, 6, 16, 0
2034 INNER_LOOPFILTER mmxext, h, 6, 16, 0 2050 INNER_LOOPFILTER mmxext, h, 6, 16, 0
2035
2036 INNER_LOOPFILTER mmx, v, 6, 8, 0
2037 INNER_LOOPFILTER mmx, h, 6, 8, 0
2038 INNER_LOOPFILTER mmxext, v, 6, 8, 0 2051 INNER_LOOPFILTER mmxext, v, 6, 8, 0
2039 INNER_LOOPFILTER mmxext, h, 6, 8, 0 2052 INNER_LOOPFILTER mmxext, h, 6, 8, 0
2040 2053
2041 INIT_XMM 2054 INIT_XMM
2055 %define SPLATB_REG SPLATB_REG_SSE2
2042 INNER_LOOPFILTER sse2, v, 5, 16, 13 2056 INNER_LOOPFILTER sse2, v, 5, 16, 13
2043 %ifdef m8 2057 %ifdef m8
2044 INNER_LOOPFILTER sse2, h, 5, 16, 13 2058 INNER_LOOPFILTER sse2, h, 5, 16, 13
2045 %else 2059 %else
2046 INNER_LOOPFILTER sse2, h, 6, 16, 13 2060 INNER_LOOPFILTER sse2, h, 6, 16, 13
2047 %endif 2061 %endif
2048 INNER_LOOPFILTER sse2, v, 6, 8, 13 2062 INNER_LOOPFILTER sse2, v, 6, 8, 13
2049 INNER_LOOPFILTER sse2, h, 6, 8, 13 2063 INNER_LOOPFILTER sse2, h, 6, 8, 13
2050 2064
2065 %define SPLATB_REG SPLATB_REG_SSSE3
2051 INNER_LOOPFILTER ssse3, v, 5, 16, 13 2066 INNER_LOOPFILTER ssse3, v, 5, 16, 13
2052 %ifdef m8 2067 %ifdef m8
2053 INNER_LOOPFILTER ssse3, h, 5, 16, 13 2068 INNER_LOOPFILTER ssse3, h, 5, 16, 13
2054 %else 2069 %else
2055 INNER_LOOPFILTER ssse3, h, 6, 16, 13 2070 INNER_LOOPFILTER ssse3, h, 6, 16, 13
2150 %define dst2_reg I_reg 2165 %define dst2_reg I_reg
2151 %ifndef m8 2166 %ifndef m8
2152 %define stack_reg hev_thr_reg 2167 %define stack_reg hev_thr_reg
2153 %endif 2168 %endif
2154 2169
2155 %ifidn %1, ssse3 2170 %ifnidn %1, sse2 && mmsize == 16
2156 pxor m7, m7 2171 pxor m7, m7
2157 %endif 2172 %endif
2158 2173
2159 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 2174 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
2160 ; splat function arguments 2175 ; splat function arguments
2161 SPLATB_REG m0, E_reg, %1, m7 ; E 2176 SPLATB_REG m0, E_reg, m7 ; E
2162 SPLATB_REG m1, I_reg, %1, m7 ; I 2177 SPLATB_REG m1, I_reg, m7 ; I
2163 SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh 2178 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh
2164 2179
2165 ; align stack 2180 ; align stack
2166 mov stack_reg, rsp ; backup stack pointer 2181 mov stack_reg, rsp ; backup stack pointer
2167 and rsp, ~(mmsize-1) ; align stack 2182 and rsp, ~(mmsize-1) ; align stack
2168 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr 2183 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
2198 %define p2backup m13 2213 %define p2backup m13
2199 %define q2backup m14 2214 %define q2backup m14
2200 %define lim_sign m15 2215 %define lim_sign m15
2201 2216
2202 ; splat function arguments 2217 ; splat function arguments
2203 SPLATB_REG flim_E, E_reg, %1, m7 ; E 2218 SPLATB_REG flim_E, E_reg, m7 ; E
2204 SPLATB_REG flim_I, I_reg, %1, m7 ; I 2219 SPLATB_REG flim_I, I_reg, m7 ; I
2205 SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh 2220 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh
2206 %endif 2221 %endif
2207 2222
2208 %if mmsize == 8 && %4 == 16 ; mmx/mmxext 2223 %if mmsize == 8 && %4 == 16 ; mmx/mmxext
2209 mov cnt_reg, 2 2224 mov cnt_reg, 2
2210 %endif 2225 %endif
2694 %endif 2709 %endif
2695 RET 2710 RET
2696 %endmacro 2711 %endmacro
2697 2712
2698 INIT_MMX 2713 INIT_MMX
2714 %define SPLATB_REG SPLATB_REG_MMX
2699 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 2715 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0
2700 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 2716 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0
2717 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0
2718 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0
2719
2720 %define SPLATB_REG SPLATB_REG_MMXEXT
2701 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 2721 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
2702 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 2722 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
2703
2704 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0
2705 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0
2706 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 2723 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
2707 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 2724 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0
2708 2725
2709 INIT_XMM 2726 INIT_XMM
2727 %define SPLATB_REG SPLATB_REG_SSE2
2710 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16 2728 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16
2711 %ifdef m8 2729 %ifdef m8
2712 MBEDGE_LOOPFILTER sse2, h, 5, 16, 16 2730 MBEDGE_LOOPFILTER sse2, h, 5, 16, 16
2713 %else 2731 %else
2714 MBEDGE_LOOPFILTER sse2, h, 6, 16, 16 2732 MBEDGE_LOOPFILTER sse2, h, 6, 16, 16
2715 %endif 2733 %endif
2716 MBEDGE_LOOPFILTER sse2, v, 6, 8, 16 2734 MBEDGE_LOOPFILTER sse2, v, 6, 8, 16
2717 MBEDGE_LOOPFILTER sse2, h, 6, 8, 16 2735 MBEDGE_LOOPFILTER sse2, h, 6, 8, 16
2718 2736
2737 %define SPLATB_REG SPLATB_REG_SSSE3
2719 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16 2738 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16
2720 %ifdef m8 2739 %ifdef m8
2721 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16 2740 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16
2722 %else 2741 %else
2723 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16 2742 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16