Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 7564:7cf793954871 libavcodec
simd int->float
20% faster ac3 if downmixing, 15% if not
author | lorenm |
---|---|
date | Wed, 13 Aug 2008 23:35:40 +0000 |
parents | 8390efaa0c03 |
children | 474c7ae4b431 |
comparison
legend: equal | deleted | inserted | replaced
7563:8390efaa0c03 | 7564:7cf793954871 |
---|---|
2190 }else | 2190 }else |
2191 #endif | 2191 #endif |
2192 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); | 2192 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); |
2193 } | 2193 } |
2194 | 2194 |
2195 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) | |
2196 { | |
2197 x86_reg i = -4*len; | |
2198 asm volatile( | |
2199 "movss %3, %%xmm4 \n" | |
2200 "shufps $0, %%xmm4, %%xmm4 \n" | |
2201 "1: \n" | |
2202 "cvtpi2ps (%2,%0), %%xmm0 \n" | |
2203 "cvtpi2ps 8(%2,%0), %%xmm1 \n" | |
2204 "cvtpi2ps 16(%2,%0), %%xmm2 \n" | |
2205 "cvtpi2ps 24(%2,%0), %%xmm3 \n" | |
2206 "movlhps %%xmm1, %%xmm0 \n" | |
2207 "movlhps %%xmm3, %%xmm2 \n" | |
2208 "mulps %%xmm4, %%xmm0 \n" | |
2209 "mulps %%xmm4, %%xmm2 \n" | |
2210 "movaps %%xmm0, (%1,%0) \n" | |
2211 "movaps %%xmm2, 16(%1,%0) \n" | |
2212 "add $32, %0 \n" | |
2213 "jl 1b \n" | |
2214 :"+r"(i) | |
2215 :"r"(dst+len), "r"(src+len), "xm"(mul) | |
2216 ); | |
2217 } | |
2218 | |
2219 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) | |
2220 { | |
2221 x86_reg i = -4*len; | |
2222 asm volatile( | |
2223 "movss %3, %%xmm4 \n" | |
2224 "shufps $0, %%xmm4, %%xmm4 \n" | |
2225 "1: \n" | |
2226 "cvtdq2ps (%2,%0), %%xmm0 \n" | |
2227 "cvtdq2ps 16(%2,%0), %%xmm1 \n" | |
2228 "mulps %%xmm4, %%xmm0 \n" | |
2229 "mulps %%xmm4, %%xmm1 \n" | |
2230 "movaps %%xmm0, (%1,%0) \n" | |
2231 "movaps %%xmm1, 16(%1,%0) \n" | |
2232 "add $32, %0 \n" | |
2233 "jl 1b \n" | |
2234 :"+r"(i) | |
2235 :"r"(dst+len), "r"(src+len), "xm"(mul) | |
2236 ); | |
2237 } | |
2238 | |
2195 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ | 2239 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ |
2196 // not bit-exact: pf2id uses different rounding than C and SSE | 2240 // not bit-exact: pf2id uses different rounding than C and SSE |
2197 asm volatile( | 2241 asm volatile( |
2198 "add %0 , %0 \n\t" | 2242 "add %0 , %0 \n\t" |
2199 "lea (%2,%0,2) , %2 \n\t" | 2243 "lea (%2,%0,2) , %2 \n\t" |
2784 c->ac3_downmix = ac3_downmix_sse; | 2828 c->ac3_downmix = ac3_downmix_sse; |
2785 c->vector_fmul = vector_fmul_sse; | 2829 c->vector_fmul = vector_fmul_sse; |
2786 c->vector_fmul_reverse = vector_fmul_reverse_sse; | 2830 c->vector_fmul_reverse = vector_fmul_reverse_sse; |
2787 c->vector_fmul_add_add = vector_fmul_add_add_sse; | 2831 c->vector_fmul_add_add = vector_fmul_add_add_sse; |
2788 c->vector_fmul_window = vector_fmul_window_sse; | 2832 c->vector_fmul_window = vector_fmul_window_sse; |
2833 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | |
2789 c->float_to_int16 = float_to_int16_sse; | 2834 c->float_to_int16 = float_to_int16_sse; |
2790 c->float_to_int16_interleave = float_to_int16_interleave_sse; | 2835 c->float_to_int16_interleave = float_to_int16_interleave_sse; |
2791 } | 2836 } |
2792 if(mm_flags & MM_3DNOW) | 2837 if(mm_flags & MM_3DNOW) |
2793 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse | 2838 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse |
2794 if(mm_flags & MM_SSE2){ | 2839 if(mm_flags & MM_SSE2){ |
2840 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; | |
2795 c->float_to_int16 = float_to_int16_sse2; | 2841 c->float_to_int16 = float_to_int16_sse2; |
2796 c->float_to_int16_interleave = float_to_int16_interleave_sse2; | 2842 c->float_to_int16_interleave = float_to_int16_interleave_sse2; |
2797 c->add_int16 = add_int16_sse2; | 2843 c->add_int16 = add_int16_sse2; |
2798 c->sub_int16 = sub_int16_sse2; | 2844 c->sub_int16 = sub_int16_sse2; |
2799 c->scalarproduct_int16 = scalarproduct_int16_sse2; | 2845 c->scalarproduct_int16 = scalarproduct_int16_sse2; |