comparison i386/dsputil_mmx.c @ 7564:7cf793954871 libavcodec

simd int->float 20% faster ac3 if downmixing, 15% if not
author lorenm
date Wed, 13 Aug 2008 23:35:40 +0000
parents 8390efaa0c03
children 474c7ae4b431
comparison
equal deleted inserted replaced
7563:8390efaa0c03 7564:7cf793954871
2190 }else 2190 }else
2191 #endif 2191 #endif
2192 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); 2192 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
2193 } 2193 }
2194 2194
2195 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
2196 {
2197 x86_reg i = -4*len;
2198 asm volatile(
2199 "movss %3, %%xmm4 \n"
2200 "shufps $0, %%xmm4, %%xmm4 \n"
2201 "1: \n"
2202 "cvtpi2ps (%2,%0), %%xmm0 \n"
2203 "cvtpi2ps 8(%2,%0), %%xmm1 \n"
2204 "cvtpi2ps 16(%2,%0), %%xmm2 \n"
2205 "cvtpi2ps 24(%2,%0), %%xmm3 \n"
2206 "movlhps %%xmm1, %%xmm0 \n"
2207 "movlhps %%xmm3, %%xmm2 \n"
2208 "mulps %%xmm4, %%xmm0 \n"
2209 "mulps %%xmm4, %%xmm2 \n"
2210 "movaps %%xmm0, (%1,%0) \n"
2211 "movaps %%xmm2, 16(%1,%0) \n"
2212 "add $32, %0 \n"
2213 "jl 1b \n"
2214 :"+r"(i)
2215 :"r"(dst+len), "r"(src+len), "xm"(mul)
2216 );
2217 }
2218
2219 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
2220 {
2221 x86_reg i = -4*len;
2222 asm volatile(
2223 "movss %3, %%xmm4 \n"
2224 "shufps $0, %%xmm4, %%xmm4 \n"
2225 "1: \n"
2226 "cvtdq2ps (%2,%0), %%xmm0 \n"
2227 "cvtdq2ps 16(%2,%0), %%xmm1 \n"
2228 "mulps %%xmm4, %%xmm0 \n"
2229 "mulps %%xmm4, %%xmm1 \n"
2230 "movaps %%xmm0, (%1,%0) \n"
2231 "movaps %%xmm1, 16(%1,%0) \n"
2232 "add $32, %0 \n"
2233 "jl 1b \n"
2234 :"+r"(i)
2235 :"r"(dst+len), "r"(src+len), "xm"(mul)
2236 );
2237 }
2238
2195 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ 2239 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
2196 // not bit-exact: pf2id uses different rounding than C and SSE 2240 // not bit-exact: pf2id uses different rounding than C and SSE
2197 asm volatile( 2241 asm volatile(
2198 "add %0 , %0 \n\t" 2242 "add %0 , %0 \n\t"
2199 "lea (%2,%0,2) , %2 \n\t" 2243 "lea (%2,%0,2) , %2 \n\t"
2784 c->ac3_downmix = ac3_downmix_sse; 2828 c->ac3_downmix = ac3_downmix_sse;
2785 c->vector_fmul = vector_fmul_sse; 2829 c->vector_fmul = vector_fmul_sse;
2786 c->vector_fmul_reverse = vector_fmul_reverse_sse; 2830 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2787 c->vector_fmul_add_add = vector_fmul_add_add_sse; 2831 c->vector_fmul_add_add = vector_fmul_add_add_sse;
2788 c->vector_fmul_window = vector_fmul_window_sse; 2832 c->vector_fmul_window = vector_fmul_window_sse;
2833 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
2789 c->float_to_int16 = float_to_int16_sse; 2834 c->float_to_int16 = float_to_int16_sse;
2790 c->float_to_int16_interleave = float_to_int16_interleave_sse; 2835 c->float_to_int16_interleave = float_to_int16_interleave_sse;
2791 } 2836 }
2792 if(mm_flags & MM_3DNOW) 2837 if(mm_flags & MM_3DNOW)
2793 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse 2838 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
2794 if(mm_flags & MM_SSE2){ 2839 if(mm_flags & MM_SSE2){
2840 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
2795 c->float_to_int16 = float_to_int16_sse2; 2841 c->float_to_int16 = float_to_int16_sse2;
2796 c->float_to_int16_interleave = float_to_int16_interleave_sse2; 2842 c->float_to_int16_interleave = float_to_int16_interleave_sse2;
2797 c->add_int16 = add_int16_sse2; 2843 c->add_int16 = add_int16_sse2;
2798 c->sub_int16 = sub_int16_sse2; 2844 c->sub_int16 = sub_int16_sse2;
2799 c->scalarproduct_int16 = scalarproduct_int16_sse2; 2845 c->scalarproduct_int16 = scalarproduct_int16_sse2;