comparison x86/vp8dsp.asm @ 12272:dd90555c98fd libavcodec

Split the pextrw macro spaghetti into several opt-specific macros; this will make future optimizations (imagine an SSE5) much easier. Also fix a bug where we used the direction (%2) rather than the optimization (%1) to enable the SSE4 path, which means it was never actually used... (a condensed sketch of the dispatch change follows the changeset header below.)
author rbultje
date Mon, 26 Jul 2010 13:50:59 +0000
parents 259988e7ad0f
children 1d207bb5cd29
comparison
12271:b805a2660a00 12272:dd90555c98fd
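For readability, here is a condensed sketch of the dispatch change described in the commit message above. The macro bodies are elided and the argument list simplified, so this is illustrative only and not assemblable on its own; the real code is in the listing below.

    ; before: one WRITE_8W macro selected the pextrw path with a runtime
    ; %ifidn on its last argument, but the SSE2/SSE4 call sites passed the
    ; direction (%2, "v"/"h") there instead of the opt name (%1), so the
    ; comparison against sse4 never matched
    %macro WRITE_8W 7                ; %7 was intended to be the opt string
    %ifidn %7, sse4                  ; never matched: callers passed %2, not %1
        ; pextrw-to-memory stores (SSE4.1)
    %else
        ; movd/shr/mov fallback
    %endif
    %endmacro

    ; after: one store macro per opt level, bound by name before each
    ; MBEDGE_LOOPFILTER instantiation, so no per-expansion string compare
    %define WRITE_8W WRITE_8W_SSE2
    MBEDGE_LOOPFILTER sse2, h, 6, 16, 16
    %define WRITE_8W WRITE_8W_SSE4
    MBEDGE_LOOPFILTER sse4, h, 6, 16, 16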
2083 ; 3 is a general-purpose register that we will clobber 2083 ; 3 is a general-purpose register that we will clobber
2084 ; for SSE4: 2084 ; for SSE4:
2085 ; 3 is a pointer to the destination's 5th line 2085 ; 3 is a pointer to the destination's 5th line
2086 ; 4 is a pointer to the destination's 4th line 2086 ; 4 is a pointer to the destination's 4th line
2087 ; 5/6 is -stride and +stride 2087 ; 5/6 is -stride and +stride
2088 ; 7 is optimization string 2088 %macro WRITE_2x4W 6
2089 %macro WRITE_8W 7
2090 %ifidn %7, sse4
2091 pextrw [%4+%5*4], %1, 0
2092 pextrw [%3+%5*4], %1, 1
2093 pextrw [%4+%5*2], %1, 2
2094 pextrw [%4+%5 ], %1, 3
2095 pextrw [%4 ], %1, 4
2096 pextrw [%3 ], %1, 5
2097 pextrw [%3+%6 ], %1, 6
2098 pextrw [%3+%6*2], %1, 7
2099 %else
2100 movd %3, %1 2089 movd %3, %1
2101 %if mmsize == 8
2102 punpckhdq %1, %1 2090 punpckhdq %1, %1
2103 %else
2104 psrldq %1, 4
2105 %endif
2106 mov [%4+%5*4], %3w 2091 mov [%4+%5*4], %3w
2107 shr %3, 16 2092 shr %3, 16
2108 add %4, %6 2093 add %4, %6
2109 mov [%4+%5*4], %3w 2094 mov [%4+%5*4], %3w
2110 2095
2111 movd %3, %1 2096 movd %3, %1
2112 %if mmsize == 16
2113 psrldq %1, 4
2114 %endif
2115 add %4, %5 2097 add %4, %5
2116 mov [%4+%5*2], %3w 2098 mov [%4+%5*2], %3w
2117 shr %3, 16 2099 shr %3, 16
2118 mov [%4+%5 ], %3w 2100 mov [%4+%5 ], %3w
2119 2101
2120 movd %3, %2 2102 movd %3, %2
2121 %if mmsize == 8
2122 punpckhdq %2, %2 2103 punpckhdq %2, %2
2123 %else
2124 psrldq %2, 4
2125 %endif
2126 mov [%4 ], %3w 2104 mov [%4 ], %3w
2127 shr %3, 16 2105 shr %3, 16
2128 mov [%4+%6 ], %3w 2106 mov [%4+%6 ], %3w
2129 2107
2130 movd %3, %2 2108 movd %3, %2
2131 add %4, %6 2109 add %4, %6
2132 mov [%4+%6 ], %3w 2110 mov [%4+%6 ], %3w
2133 shr %3, 16 2111 shr %3, 16
2134 mov [%4+%6*2], %3w 2112 mov [%4+%6*2], %3w
2135 %if mmsize == 8
2136 add %4, %5 2113 add %4, %5
2137 %endif 2114 %endmacro
2138 %endif 2115
2116 %macro WRITE_8W_SSE2 5
2117 movd %2, %1
2118 psrldq %1, 4
2119 mov [%3+%4*4], %2w
2120 shr %2, 16
2121 add %3, %5
2122 mov [%3+%4*4], %2w
2123
2124 movd %2, %1
2125 psrldq %1, 4
2126 add %3, %4
2127 mov [%3+%4*2], %2w
2128 shr %2, 16
2129 mov [%3+%4 ], %2w
2130
2131 movd %2, %1
2132 psrldq %1, 4
2133 mov [%3 ], %2w
2134 shr %2, 16
2135 mov [%3+%5 ], %2w
2136
2137 movd %2, %1
2138 add %3, %5
2139 mov [%3+%5 ], %2w
2140 shr %2, 16
2141 mov [%3+%5*2], %2w
2142 %endmacro
2143
2144 %macro WRITE_8W_SSE4 5
2145 pextrw [%3+%4*4], %1, 0
2146 pextrw [%2+%4*4], %1, 1
2147 pextrw [%3+%4*2], %1, 2
2148 pextrw [%3+%4 ], %1, 3
2149 pextrw [%3 ], %1, 4
2150 pextrw [%2 ], %1, 5
2151 pextrw [%2+%5 ], %1, 6
2152 pextrw [%2+%5*2], %1, 7
2139 %endmacro 2153 %endmacro
2140 2154
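A note on why the direct-store path is gated on SSE4: pextrw with a memory destination only exists from SSE4.1 onward, which is what WRITE_8W_SSE4 relies on; the SSE2/MMX forms of pextrw can only write to a general-purpose register, so WRITE_8W_SSE2 and WRITE_2x4W bounce each dword through a scratch GPR instead. A minimal sketch of the two store patterns for the first two rows, using hypothetical x264asm-style names (m0, r1, dstq, strideq) rather than the macro arguments above:

    ; SSE4.1: each 16-bit lane can be stored straight to memory
    pextrw [dstq+strideq*0], m0, 0
    pextrw [dstq+strideq*1], m0, 1

    ; SSE2/MMX: extract a dword, then split it into two 16-bit stores
    movd   r1d, m0                    ; lanes 0 and 1 in one GPR
    mov    [dstq+strideq*0], r1w      ; low word  -> row 0
    shr    r1d, 16
    mov    [dstq+strideq*1], r1w      ; high word -> row 1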
2141 %macro MBEDGE_LOOPFILTER 5 2155 %macro MBEDGE_LOOPFILTER 5
2142 %if %4 == 8 ; chroma 2156 %if %4 == 8 ; chroma
2143 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 2157 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
2669 SBUTTERFLY bw, 5, 6, 0 2683 SBUTTERFLY bw, 5, 6, 0
2670 2684
2671 %if mmsize == 8 ; mmx/mmxext (h) 2685 %if mmsize == 8 ; mmx/mmxext (h)
2672 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg 2686 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
2673 add dst_reg, 4 2687 add dst_reg, 4
2674 WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4 2688 WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
2675 %else ; sse2 (h) 2689 %else ; sse2 (h)
2676 lea dst8_reg, [dst8_reg+mstride_reg+1] 2690 lea dst8_reg, [dst8_reg+mstride_reg+1]
2677 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 2691 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
2678 lea dst_reg, [dst2_reg+mstride_reg+4] 2692 lea dst_reg, [dst2_reg+mstride_reg+4]
2679 lea dst8_reg, [dst8_reg+mstride_reg+4] 2693 lea dst8_reg, [dst8_reg+mstride_reg+4]
2680 WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg, %2 2694 %ifidn %1, sse4
2681 %ifidn %2, sse4 2695 add dst2_reg, 4
2696 %endif
2697 WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg
2698 %ifidn %1, sse4
2682 lea dst2_reg, [dst8_reg+ stride_reg] 2699 lea dst2_reg, [dst8_reg+ stride_reg]
2683 %endif 2700 %endif
2684 WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2 2701 WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
2685 %endif 2702 %endif
2686 %endif 2703 %endif
2687 2704
2688 %if mmsize == 8 2705 %if mmsize == 8
2689 %if %4 == 8 ; chroma 2706 %if %4 == 8 ; chroma
2723 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 2740 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
2724 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 2741 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0
2725 2742
2726 INIT_XMM 2743 INIT_XMM
2727 %define SPLATB_REG SPLATB_REG_SSE2 2744 %define SPLATB_REG SPLATB_REG_SSE2
2745 %define WRITE_8W WRITE_8W_SSE2
2728 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16 2746 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16
2729 %ifdef m8 2747 %ifdef m8
2730 MBEDGE_LOOPFILTER sse2, h, 5, 16, 16 2748 MBEDGE_LOOPFILTER sse2, h, 5, 16, 16
2731 %else 2749 %else
2732 MBEDGE_LOOPFILTER sse2, h, 6, 16, 16 2750 MBEDGE_LOOPFILTER sse2, h, 6, 16, 16
2742 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16 2760 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16
2743 %endif 2761 %endif
2744 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16 2762 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16
2745 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16 2763 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16
2746 2764
2765 %define WRITE_8W WRITE_8W_SSE4
2747 %ifdef m8 2766 %ifdef m8
2748 MBEDGE_LOOPFILTER sse4, h, 5, 16, 16 2767 MBEDGE_LOOPFILTER sse4, h, 5, 16, 16
2749 %else 2768 %else
2750 MBEDGE_LOOPFILTER sse4, h, 6, 16, 16 2769 MBEDGE_LOOPFILTER sse4, h, 6, 16, 16
2751 %endif 2770 %endif