comparison x86/dsputil_mmx.c @ 10300:4d1b9ca628fc libavcodec

Drop unused args from vector_fmul_add_add, simplify code, and rename. The src3 and step arguments to vector_fmul_add_add() are always zero and one, respectively. This removes these arguments from the function, simplifies the code accordingly, and renames the function to better match the new operation.
author mru
date Sun, 27 Sep 2009 16:51:54 +0000
parents 3b61bc6ce377
children 02798c603744
comparison
equal deleted inserted replaced
10299:a1654cd1b5b9 10300:4d1b9ca628fc
2123 :"+r"(i), "+r"(src1) 2123 :"+r"(i), "+r"(src1)
2124 :"r"(dst), "r"(src0) 2124 :"r"(dst), "r"(src0)
2125 ); 2125 );
2126 } 2126 }
2127 2127
2128 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1, 2128 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2129 const float *src2, int src3, int len, int step){ 2129 const float *src2, int len){
2130 x86_reg i = (len-4)*4; 2130 x86_reg i = (len-4)*4;
2131 if(step == 2 && src3 == 0){
2132 dst += (len-4)*2;
2133 __asm__ volatile(
2134 "1: \n\t"
2135 "movq (%2,%0), %%mm0 \n\t"
2136 "movq 8(%2,%0), %%mm1 \n\t"
2137 "pfmul (%3,%0), %%mm0 \n\t"
2138 "pfmul 8(%3,%0), %%mm1 \n\t"
2139 "pfadd (%4,%0), %%mm0 \n\t"
2140 "pfadd 8(%4,%0), %%mm1 \n\t"
2141 "movd %%mm0, (%1) \n\t"
2142 "movd %%mm1, 16(%1) \n\t"
2143 "psrlq $32, %%mm0 \n\t"
2144 "psrlq $32, %%mm1 \n\t"
2145 "movd %%mm0, 8(%1) \n\t"
2146 "movd %%mm1, 24(%1) \n\t"
2147 "sub $32, %1 \n\t"
2148 "sub $16, %0 \n\t"
2149 "jge 1b \n\t"
2150 :"+r"(i), "+r"(dst)
2151 :"r"(src0), "r"(src1), "r"(src2)
2152 :"memory"
2153 );
2154 }
2155 else if(step == 1 && src3 == 0){
2156 __asm__ volatile( 2131 __asm__ volatile(
2157 "1: \n\t" 2132 "1: \n\t"
2158 "movq (%2,%0), %%mm0 \n\t" 2133 "movq (%2,%0), %%mm0 \n\t"
2159 "movq 8(%2,%0), %%mm1 \n\t" 2134 "movq 8(%2,%0), %%mm1 \n\t"
2160 "pfmul (%3,%0), %%mm0 \n\t" 2135 "pfmul (%3,%0), %%mm0 \n\t"
2167 "jge 1b \n\t" 2142 "jge 1b \n\t"
2168 :"+r"(i) 2143 :"+r"(i)
2169 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) 2144 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2170 :"memory" 2145 :"memory"
2171 ); 2146 );
2172 }
2173 else
2174 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
2175 __asm__ volatile("femms"); 2147 __asm__ volatile("femms");
2176 } 2148 }
2177 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1, 2149 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2178 const float *src2, int src3, int len, int step){ 2150 const float *src2, int len){
2179 x86_reg i = (len-8)*4; 2151 x86_reg i = (len-8)*4;
2180 if(step == 2 && src3 == 0){
2181 dst += (len-8)*2;
2182 __asm__ volatile(
2183 "1: \n\t"
2184 "movaps (%2,%0), %%xmm0 \n\t"
2185 "movaps 16(%2,%0), %%xmm1 \n\t"
2186 "mulps (%3,%0), %%xmm0 \n\t"
2187 "mulps 16(%3,%0), %%xmm1 \n\t"
2188 "addps (%4,%0), %%xmm0 \n\t"
2189 "addps 16(%4,%0), %%xmm1 \n\t"
2190 "movss %%xmm0, (%1) \n\t"
2191 "movss %%xmm1, 32(%1) \n\t"
2192 "movhlps %%xmm0, %%xmm2 \n\t"
2193 "movhlps %%xmm1, %%xmm3 \n\t"
2194 "movss %%xmm2, 16(%1) \n\t"
2195 "movss %%xmm3, 48(%1) \n\t"
2196 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
2197 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
2198 "movss %%xmm0, 8(%1) \n\t"
2199 "movss %%xmm1, 40(%1) \n\t"
2200 "movhlps %%xmm0, %%xmm2 \n\t"
2201 "movhlps %%xmm1, %%xmm3 \n\t"
2202 "movss %%xmm2, 24(%1) \n\t"
2203 "movss %%xmm3, 56(%1) \n\t"
2204 "sub $64, %1 \n\t"
2205 "sub $32, %0 \n\t"
2206 "jge 1b \n\t"
2207 :"+r"(i), "+r"(dst)
2208 :"r"(src0), "r"(src1), "r"(src2)
2209 :"memory"
2210 );
2211 }
2212 else if(step == 1 && src3 == 0){
2213 __asm__ volatile( 2152 __asm__ volatile(
2214 "1: \n\t" 2153 "1: \n\t"
2215 "movaps (%2,%0), %%xmm0 \n\t" 2154 "movaps (%2,%0), %%xmm0 \n\t"
2216 "movaps 16(%2,%0), %%xmm1 \n\t" 2155 "movaps 16(%2,%0), %%xmm1 \n\t"
2217 "mulps (%3,%0), %%xmm0 \n\t" 2156 "mulps (%3,%0), %%xmm0 \n\t"
2224 "jge 1b \n\t" 2163 "jge 1b \n\t"
2225 :"+r"(i) 2164 :"+r"(i)
2226 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) 2165 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2227 :"memory" 2166 :"memory"
2228 ); 2167 );
2229 }
2230 else
2231 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
2232 } 2168 }
2233 2169
2234 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, 2170 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2235 const float *win, float add_bias, int len){ 2171 const float *win, float add_bias, int len){
2236 #if HAVE_6REGS 2172 #if HAVE_6REGS
3075 if(mm_flags & FF_MM_SSE){ 3011 if(mm_flags & FF_MM_SSE){
3076 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; 3012 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3077 c->ac3_downmix = ac3_downmix_sse; 3013 c->ac3_downmix = ac3_downmix_sse;
3078 c->vector_fmul = vector_fmul_sse; 3014 c->vector_fmul = vector_fmul_sse;
3079 c->vector_fmul_reverse = vector_fmul_reverse_sse; 3015 c->vector_fmul_reverse = vector_fmul_reverse_sse;
3080 c->vector_fmul_add_add = vector_fmul_add_add_sse; 3016 c->vector_fmul_add = vector_fmul_add_sse;
3081 c->vector_fmul_window = vector_fmul_window_sse; 3017 c->vector_fmul_window = vector_fmul_window_sse;
3082 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; 3018 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
3083 c->vector_clipf = vector_clipf_sse; 3019 c->vector_clipf = vector_clipf_sse;
3084 c->float_to_int16 = float_to_int16_sse; 3020 c->float_to_int16 = float_to_int16_sse;
3085 c->float_to_int16_interleave = float_to_int16_interleave_sse; 3021 c->float_to_int16_interleave = float_to_int16_interleave_sse;
3086 } 3022 }
3087 if(mm_flags & FF_MM_3DNOW) 3023 if(mm_flags & FF_MM_3DNOW)
3088 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse 3024 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
3089 if(mm_flags & FF_MM_SSE2){ 3025 if(mm_flags & FF_MM_SSE2){
3090 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; 3026 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
3091 c->float_to_int16 = float_to_int16_sse2; 3027 c->float_to_int16 = float_to_int16_sse2;
3092 c->float_to_int16_interleave = float_to_int16_interleave_sse2; 3028 c->float_to_int16_interleave = float_to_int16_interleave_sse2;
3093 c->add_int16 = add_int16_sse2; 3029 c->add_int16 = add_int16_sse2;