comparison x86/dsputil_mmx.c @ 10104:0fa3d21b317e libavcodec

SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
author vitor
date Thu, 27 Aug 2009 14:49:36 +0000
parents 3141f69e3905
children 7775f6627612
comparison
equal deleted inserted replaced
10103:2066cbe806ef 10104:0fa3d21b317e
2344 :"+r"(i) 2344 :"+r"(i)
2345 :"r"(dst+len), "r"(src+len), "m"(mul) 2345 :"r"(dst+len), "r"(src+len), "m"(mul)
2346 ); 2346 );
2347 } 2347 }
2348 2348
/**
 * Clip every element of src to the range [min, max] and store the result in
 * dst, using SSE.
 *
 * Each loop iteration handles 16 floats (64 bytes), starting with the last 16
 * elements and walking towards the start of the buffers until the byte offset
 * goes negative.
 *
 * @param dst destination buffer; written with movaps, so the accessed
 *            addresses must be 16-byte aligned
 * @param src source buffer; read with movaps, so the accessed addresses must
 *            be 16-byte aligned
 * @param min lower bound
 * @param max upper bound
 * @param len number of floats to process. The loop only covers the whole
 *            buffer when len is a multiple of 16, and the first iteration
 *            runs unconditionally, so len < 16 would access memory before
 *            the start of src/dst.
 *
 * NOTE(review): the asm writes xmm0-xmm5 but only "memory" is listed as a
 * clobber -- confirm whether the xmm registers should be declared as well.
 */
2349 static void vector_clipf_sse(float *dst, float *src, float min, float max,
2350 int len)
2351 {
2352 x86_reg i = (len-16)*4; // byte offset of the last group of 16 floats
2353 __asm__ volatile(
// Operands: %0 = i, %1 = dst, %2 = src, %3 = min, %4 = max.
// Load min/max and replicate each into all four lanes of xmm4/xmm5.
2354 "movss %3, %%xmm4 \n"
2355 "movss %4, %%xmm5 \n"
2356 "shufps $0, %%xmm4, %%xmm4 \n"
2357 "shufps $0, %%xmm5, %%xmm5 \n"
2358 "1: \n\t"
// Load 16 floats from src at the current byte offset.
2359 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2360 "movaps 16(%2,%0), %%xmm1 \n\t"
2361 "movaps 32(%2,%0), %%xmm2 \n\t"
2362 "movaps 48(%2,%0), %%xmm3 \n\t"
// Apply the lower bound: keep the larger of each value and min.
2363 "maxps %%xmm4, %%xmm0 \n\t"
2364 "maxps %%xmm4, %%xmm1 \n\t"
2365 "maxps %%xmm4, %%xmm2 \n\t"
2366 "maxps %%xmm4, %%xmm3 \n\t"
// Apply the upper bound: keep the smaller of each value and max.
2367 "minps %%xmm5, %%xmm0 \n\t"
2368 "minps %%xmm5, %%xmm1 \n\t"
2369 "minps %%xmm5, %%xmm2 \n\t"
2370 "minps %%xmm5, %%xmm3 \n\t"
// Store the 16 clipped floats to dst at the same byte offset.
2371 "movaps %%xmm0, (%1,%0) \n\t"
2372 "movaps %%xmm1, 16(%1,%0) \n\t"
2373 "movaps %%xmm2, 32(%1,%0) \n\t"
2374 "movaps %%xmm3, 48(%1,%0) \n\t"
// Step back 64 bytes; keep looping while the offset is still >= 0.
2375 "sub $64, %0 \n\t"
2376 "jge 1b \n\t"
2377 :"+r"(i)
2378 :"r"(dst), "r"(src), "m"(min), "m"(max)
2379 :"memory"
2380 );
2381 }
2382
2349 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ 2383 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
2350 x86_reg reglen = len; 2384 x86_reg reglen = len;
2351 // not bit-exact: pf2id uses different rounding than C and SSE 2385 // not bit-exact: pf2id uses different rounding than C and SSE
2352 __asm__ volatile( 2386 __asm__ volatile(
2353 "add %0 , %0 \n\t" 2387 "add %0 , %0 \n\t"
3044 c->vector_fmul = vector_fmul_sse; 3078 c->vector_fmul = vector_fmul_sse;
3045 c->vector_fmul_reverse = vector_fmul_reverse_sse; 3079 c->vector_fmul_reverse = vector_fmul_reverse_sse;
3046 c->vector_fmul_add_add = vector_fmul_add_add_sse; 3080 c->vector_fmul_add_add = vector_fmul_add_add_sse;
3047 c->vector_fmul_window = vector_fmul_window_sse; 3081 c->vector_fmul_window = vector_fmul_window_sse;
3048 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; 3082 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
3083 c->vector_clipf = vector_clipf_sse;
3049 c->float_to_int16 = float_to_int16_sse; 3084 c->float_to_int16 = float_to_int16_sse;
3050 c->float_to_int16_interleave = float_to_int16_interleave_sse; 3085 c->float_to_int16_interleave = float_to_int16_interleave_sse;
3051 } 3086 }
3052 if(mm_flags & FF_MM_3DNOW) 3087 if(mm_flags & FF_MM_3DNOW)
3053 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse 3088 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse