comparison: x86/vp8dsp.asm @ 12272:dd90555c98fd (libavcodec)
Split the pextrw macro-spaghetti into several opt-specific macros; this will
make future optimizations (imagine an sse5) much easier. Also fix a bug where
we used the direction (%2) rather than the optimization (%1) to enable this,
which means it was never actually used...
author:   rbultje
date:     Mon, 26 Jul 2010 13:50:59 +0000
parents:  259988e7ad0f
children: 1d207bb5cd29
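For context on the refactor described in the commit message: the old WRITE_8W
picked its code path at expansion time by comparing a textual optimization
parameter with %ifidn, and the call sites passed the wrong argument into that
slot, so the pextrw branch was never assembled. The new scheme defines one
macro per optimization and binds the name once with %define before
instantiation. The following is a minimal sketch of both patterns in NASM
macro syntax; the macro and register names are hypothetical, not taken from
the patch.

; Old pattern: one macro that branches on a textual "opt" parameter.
; If a caller passes the direction ("v"/"h") instead of the optimization
; name, the test silently never matches and the pextrw path is dead
; code -- the bug this commit fixes.
%macro STORE_WORD 3 ; dst, xmmreg, opt
%ifidn %3, sse4
    pextrw [%1], %2, 0      ; never assembled when %3 is "v" or "h"
%else
    movd   eax, %2          ; clobbers eax, as WRITE_8W clobbers %3
    mov    [%1], ax
%endif
%endmacro

; New pattern: one macro per optimization, selected once with %define,
; so every call site stays identical across variants.
%macro STORE_WORD_SSE2 2 ; dst, xmmreg
    movd   eax, %2
    mov    [%1], ax
%endmacro

%macro STORE_WORD_SSE4 2 ; dst, xmmreg
    pextrw [%1], %2, 0
%endmacro

%define STORE_WORD STORE_WORD_SSE2 ; rebind to ..._SSE4 for SSE4 builds

The %define indirection keeps call sites unchanged across variants, so
supporting a new instruction set (the commit message's hypothetical sse5)
only needs one more macro and one more %define line.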
diff -r b805a2660a00 -r dd90555c98fd x86/vp8dsp.asm
--- a/x86/vp8dsp.asm	(12271:b805a2660a00)
+++ b/x86/vp8dsp.asm	(12272:dd90555c98fd)
@@ -2083,61 +2083,75 @@
 ; 3 is a general-purpose register that we will clobber
 ; for SSE4:
 ; 3 is a pointer to the destination's 5th line
 ; 4 is a pointer to the destination's 4th line
 ; 5/6 is -stride and +stride
-; 7 is optimization string
-%macro WRITE_8W 7
-%ifidn %7, sse4
-    pextrw [%4+%5*4], %1, 0
-    pextrw [%3+%5*4], %1, 1
-    pextrw [%4+%5*2], %1, 2
-    pextrw [%4+%5  ], %1, 3
-    pextrw [%4     ], %1, 4
-    pextrw [%3     ], %1, 5
-    pextrw [%3+%6  ], %1, 6
-    pextrw [%3+%6*2], %1, 7
-%else
+%macro WRITE_2x4W 6
     movd    %3, %1
-%if mmsize == 8
     punpckhdq %1, %1
-%else
-    psrldq  %1, 4
-%endif
     mov     [%4+%5*4], %3w
     shr     %3, 16
     add     %4, %6
     mov     [%4+%5*4], %3w

     movd    %3, %1
-%if mmsize == 16
-    psrldq  %1, 4
-%endif
     add     %4, %5
     mov     [%4+%5*2], %3w
     shr     %3, 16
     mov     [%4+%5  ], %3w

     movd    %3, %2
-%if mmsize == 8
     punpckhdq %2, %2
-%else
-    psrldq  %2, 4
-%endif
     mov     [%4     ], %3w
     shr     %3, 16
     mov     [%4+%6  ], %3w

     movd    %3, %2
     add     %4, %6
     mov     [%4+%6  ], %3w
     shr     %3, 16
     mov     [%4+%6*2], %3w
-%if mmsize == 8
     add     %4, %5
-%endif
-%endif
+%endmacro
+
+%macro WRITE_8W_SSE2 5
+    movd    %2, %1
+    psrldq  %1, 4
+    mov     [%3+%4*4], %2w
+    shr     %2, 16
+    add     %3, %5
+    mov     [%3+%4*4], %2w
+
+    movd    %2, %1
+    psrldq  %1, 4
+    add     %3, %4
+    mov     [%3+%4*2], %2w
+    shr     %2, 16
+    mov     [%3+%4  ], %2w
+
+    movd    %2, %1
+    psrldq  %1, 4
+    mov     [%3     ], %2w
+    shr     %2, 16
+    mov     [%3+%5  ], %2w
+
+    movd    %2, %1
+    add     %3, %5
+    mov     [%3+%5  ], %2w
+    shr     %2, 16
+    mov     [%3+%5*2], %2w
+%endmacro
+
+%macro WRITE_8W_SSE4 5
+    pextrw [%3+%4*4], %1, 0
+    pextrw [%2+%4*4], %1, 1
+    pextrw [%3+%4*2], %1, 2
+    pextrw [%3+%4  ], %1, 3
+    pextrw [%3     ], %1, 4
+    pextrw [%2     ], %1, 5
+    pextrw [%2+%5  ], %1, 6
+    pextrw [%2+%5*2], %1, 7
 %endmacro

 %macro MBEDGE_LOOPFILTER 5
 %if %4 == 8 ; chroma
 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
@@ -2669,21 +2683,24 @@
     SBUTTERFLY bw, 5, 6, 0

 %if mmsize == 8 ; mmx/mmxext (h)
     WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
     add     dst_reg, 4
-    WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4
+    WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
     lea     dst8_reg, [dst8_reg+mstride_reg+1]
     WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
     lea     dst_reg, [dst2_reg+mstride_reg+4]
     lea     dst8_reg, [dst8_reg+mstride_reg+4]
-    WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg, %2
-%ifidn %2, sse4
+%ifidn %1, sse4
+    add     dst2_reg, 4
+%endif
+    WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg
+%ifidn %1, sse4
     lea     dst2_reg, [dst8_reg+ stride_reg]
 %endif
-    WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2
+    WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
 %endif
 %endif

 %if mmsize == 8
 %if %4 == 8 ; chroma
@@ -2723,10 +2740,11 @@
 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0

 INIT_XMM
 %define SPLATB_REG SPLATB_REG_SSE2
+%define WRITE_8W   WRITE_8W_SSE2
 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16
 %ifdef m8
 MBEDGE_LOOPFILTER sse2, h, 5, 16, 16
 %else
 MBEDGE_LOOPFILTER sse2, h, 6, 16, 16
@@ -2742,10 +2760,11 @@
 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16
 %endif
 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16
 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16

+%define WRITE_8W WRITE_8W_SSE4
 %ifdef m8
 MBEDGE_LOOPFILTER sse4, h, 5, 16, 16
 %else
 MBEDGE_LOOPFILTER sse4, h, 6, 16, 16
 %endif
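A note on why the two WRITE_8W variants look so different: SSE4.1's pextrw
accepts a memory destination, so each 16-bit lane of the XMM register can be
scattered straight to its destination row, while the SSE2 fallback has to
bounce pairs of lanes through a general-purpose register. A rough,
self-contained illustration follows; the registers are arbitrary and not
taken from the patch.

; SSE4.1: one instruction per lane, memory operand allowed.
    pextrw [rdi],     xmm0, 0   ; store lane 0
    pextrw [rdi+rsi], xmm0, 1   ; store lane 1

; SSE2 equivalent: extract 32 bits, split them into two 16-bit
; stores, then shift the next pair of lanes into position.
    movd   eax, xmm0            ; lanes 0-1 into eax
    mov    [rdi], ax            ; lane 0
    shr    eax, 16
    mov    [rdi+rsi], ax        ; lane 1
    psrldq xmm0, 4              ; discard lanes 0-1; lanes 2-3 are next

This is why WRITE_8W_SSE4 is eight straight-line pextrw instructions while
WRITE_8W_SSE2 needs movd/shr/psrldq bookkeeping between stores.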