Mercurial > libavcodec.hg
comparison x86/dsputil_mmx.c @ 10300:4d1b9ca628fc libavcodec
Drop unused args from vector_fmul_add_add, simplify code, and rename
The src3 and step arguments to vector_fmul_add_add() are always zero
and one, respectively. This removes these arguments from the function,
simplifies the code accordingly, and renames the function to better
match the new operation.
author | mru |
---|---|
date | Sun, 27 Sep 2009 16:51:54 +0000 |
parents | 3b61bc6ce377 |
children | 02798c603744 |
comparison
equal
deleted
inserted
replaced
10299:a1654cd1b5b9 | 10300:4d1b9ca628fc |
---|---|
2123 :"+r"(i), "+r"(src1) | 2123 :"+r"(i), "+r"(src1) |
2124 :"r"(dst), "r"(src0) | 2124 :"r"(dst), "r"(src0) |
2125 ); | 2125 ); |
2126 } | 2126 } |
2127 | 2127 |
2128 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1, | 2128 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1, |
2129 const float *src2, int src3, int len, int step){ | 2129 const float *src2, int len){ |
2130 x86_reg i = (len-4)*4; | 2130 x86_reg i = (len-4)*4; |
2131 if(step == 2 && src3 == 0){ | |
2132 dst += (len-4)*2; | |
2133 __asm__ volatile( | |
2134 "1: \n\t" | |
2135 "movq (%2,%0), %%mm0 \n\t" | |
2136 "movq 8(%2,%0), %%mm1 \n\t" | |
2137 "pfmul (%3,%0), %%mm0 \n\t" | |
2138 "pfmul 8(%3,%0), %%mm1 \n\t" | |
2139 "pfadd (%4,%0), %%mm0 \n\t" | |
2140 "pfadd 8(%4,%0), %%mm1 \n\t" | |
2141 "movd %%mm0, (%1) \n\t" | |
2142 "movd %%mm1, 16(%1) \n\t" | |
2143 "psrlq $32, %%mm0 \n\t" | |
2144 "psrlq $32, %%mm1 \n\t" | |
2145 "movd %%mm0, 8(%1) \n\t" | |
2146 "movd %%mm1, 24(%1) \n\t" | |
2147 "sub $32, %1 \n\t" | |
2148 "sub $16, %0 \n\t" | |
2149 "jge 1b \n\t" | |
2150 :"+r"(i), "+r"(dst) | |
2151 :"r"(src0), "r"(src1), "r"(src2) | |
2152 :"memory" | |
2153 ); | |
2154 } | |
2155 else if(step == 1 && src3 == 0){ | |
2156 __asm__ volatile( | 2131 __asm__ volatile( |
2157 "1: \n\t" | 2132 "1: \n\t" |
2158 "movq (%2,%0), %%mm0 \n\t" | 2133 "movq (%2,%0), %%mm0 \n\t" |
2159 "movq 8(%2,%0), %%mm1 \n\t" | 2134 "movq 8(%2,%0), %%mm1 \n\t" |
2160 "pfmul (%3,%0), %%mm0 \n\t" | 2135 "pfmul (%3,%0), %%mm0 \n\t" |
2167 "jge 1b \n\t" | 2142 "jge 1b \n\t" |
2168 :"+r"(i) | 2143 :"+r"(i) |
2169 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) | 2144 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) |
2170 :"memory" | 2145 :"memory" |
2171 ); | 2146 ); |
2172 } | |
2173 else | |
2174 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); | |
2175 __asm__ volatile("femms"); | 2147 __asm__ volatile("femms"); |
2176 } | 2148 } |
2177 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1, | 2149 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1, |
2178 const float *src2, int src3, int len, int step){ | 2150 const float *src2, int len){ |
2179 x86_reg i = (len-8)*4; | 2151 x86_reg i = (len-8)*4; |
2180 if(step == 2 && src3 == 0){ | |
2181 dst += (len-8)*2; | |
2182 __asm__ volatile( | |
2183 "1: \n\t" | |
2184 "movaps (%2,%0), %%xmm0 \n\t" | |
2185 "movaps 16(%2,%0), %%xmm1 \n\t" | |
2186 "mulps (%3,%0), %%xmm0 \n\t" | |
2187 "mulps 16(%3,%0), %%xmm1 \n\t" | |
2188 "addps (%4,%0), %%xmm0 \n\t" | |
2189 "addps 16(%4,%0), %%xmm1 \n\t" | |
2190 "movss %%xmm0, (%1) \n\t" | |
2191 "movss %%xmm1, 32(%1) \n\t" | |
2192 "movhlps %%xmm0, %%xmm2 \n\t" | |
2193 "movhlps %%xmm1, %%xmm3 \n\t" | |
2194 "movss %%xmm2, 16(%1) \n\t" | |
2195 "movss %%xmm3, 48(%1) \n\t" | |
2196 "shufps $0xb1, %%xmm0, %%xmm0 \n\t" | |
2197 "shufps $0xb1, %%xmm1, %%xmm1 \n\t" | |
2198 "movss %%xmm0, 8(%1) \n\t" | |
2199 "movss %%xmm1, 40(%1) \n\t" | |
2200 "movhlps %%xmm0, %%xmm2 \n\t" | |
2201 "movhlps %%xmm1, %%xmm3 \n\t" | |
2202 "movss %%xmm2, 24(%1) \n\t" | |
2203 "movss %%xmm3, 56(%1) \n\t" | |
2204 "sub $64, %1 \n\t" | |
2205 "sub $32, %0 \n\t" | |
2206 "jge 1b \n\t" | |
2207 :"+r"(i), "+r"(dst) | |
2208 :"r"(src0), "r"(src1), "r"(src2) | |
2209 :"memory" | |
2210 ); | |
2211 } | |
2212 else if(step == 1 && src3 == 0){ | |
2213 __asm__ volatile( | 2152 __asm__ volatile( |
2214 "1: \n\t" | 2153 "1: \n\t" |
2215 "movaps (%2,%0), %%xmm0 \n\t" | 2154 "movaps (%2,%0), %%xmm0 \n\t" |
2216 "movaps 16(%2,%0), %%xmm1 \n\t" | 2155 "movaps 16(%2,%0), %%xmm1 \n\t" |
2217 "mulps (%3,%0), %%xmm0 \n\t" | 2156 "mulps (%3,%0), %%xmm0 \n\t" |
2224 "jge 1b \n\t" | 2163 "jge 1b \n\t" |
2225 :"+r"(i) | 2164 :"+r"(i) |
2226 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) | 2165 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) |
2227 :"memory" | 2166 :"memory" |
2228 ); | 2167 ); |
2229 } | |
2230 else | |
2231 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); | |
2232 } | 2168 } |
2233 | 2169 |
2234 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, | 2170 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, |
2235 const float *win, float add_bias, int len){ | 2171 const float *win, float add_bias, int len){ |
2236 #if HAVE_6REGS | 2172 #if HAVE_6REGS |
3075 if(mm_flags & FF_MM_SSE){ | 3011 if(mm_flags & FF_MM_SSE){ |
3076 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; | 3012 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; |
3077 c->ac3_downmix = ac3_downmix_sse; | 3013 c->ac3_downmix = ac3_downmix_sse; |
3078 c->vector_fmul = vector_fmul_sse; | 3014 c->vector_fmul = vector_fmul_sse; |
3079 c->vector_fmul_reverse = vector_fmul_reverse_sse; | 3015 c->vector_fmul_reverse = vector_fmul_reverse_sse; |
3080 c->vector_fmul_add_add = vector_fmul_add_add_sse; | 3016 c->vector_fmul_add = vector_fmul_add_sse; |
3081 c->vector_fmul_window = vector_fmul_window_sse; | 3017 c->vector_fmul_window = vector_fmul_window_sse; |
3082 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | 3018 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; |
3083 c->vector_clipf = vector_clipf_sse; | 3019 c->vector_clipf = vector_clipf_sse; |
3084 c->float_to_int16 = float_to_int16_sse; | 3020 c->float_to_int16 = float_to_int16_sse; |
3085 c->float_to_int16_interleave = float_to_int16_interleave_sse; | 3021 c->float_to_int16_interleave = float_to_int16_interleave_sse; |
3086 } | 3022 } |
3087 if(mm_flags & FF_MM_3DNOW) | 3023 if(mm_flags & FF_MM_3DNOW) |
3088 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse | 3024 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse |
3089 if(mm_flags & FF_MM_SSE2){ | 3025 if(mm_flags & FF_MM_SSE2){ |
3090 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; | 3026 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; |
3091 c->float_to_int16 = float_to_int16_sse2; | 3027 c->float_to_int16 = float_to_int16_sse2; |
3092 c->float_to_int16_interleave = float_to_int16_interleave_sse2; | 3028 c->float_to_int16_interleave = float_to_int16_interleave_sse2; |
3093 c->add_int16 = add_int16_sse2; | 3029 c->add_int16 = add_int16_sse2; |