Mercurial > libavcodec.hg
comparison x86/dsputil_mmx.c @ 10104:0fa3d21b317e libavcodec
SSE-optimized vector_clipf(). 10% faster TwinVQ decoding.
author | vitor |
---|---|
date | Thu, 27 Aug 2009 14:49:36 +0000 |
parents | 3141f69e3905 |
children | 7775f6627612 |
comparison
legend: equal | deleted | inserted | replaced
10103:2066cbe806ef | 10104:0fa3d21b317e |
---|---|
2344 :"+r"(i) | 2344 :"+r"(i) |
2345 :"r"(dst+len), "r"(src+len), "m"(mul) | 2345 :"r"(dst+len), "r"(src+len), "m"(mul) |
2346 ); | 2346 ); |
2347 } | 2347 } |
2348 | 2348 |
2349 static void vector_clipf_sse(float *dst, float *src, float min, float max, | |
2350 int len) | |
2351 { | |
2352 x86_reg i = (len-16)*4; | |
2353 __asm__ volatile( | |
2354 "movss %3, %%xmm4 \n" | |
2355 "movss %4, %%xmm5 \n" | |
2356 "shufps $0, %%xmm4, %%xmm4 \n" | |
2357 "shufps $0, %%xmm5, %%xmm5 \n" | |
2358 "1: \n\t" | |
2359 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel | |
2360 "movaps 16(%2,%0), %%xmm1 \n\t" | |
2361 "movaps 32(%2,%0), %%xmm2 \n\t" | |
2362 "movaps 48(%2,%0), %%xmm3 \n\t" | |
2363 "maxps %%xmm4, %%xmm0 \n\t" | |
2364 "maxps %%xmm4, %%xmm1 \n\t" | |
2365 "maxps %%xmm4, %%xmm2 \n\t" | |
2366 "maxps %%xmm4, %%xmm3 \n\t" | |
2367 "minps %%xmm5, %%xmm0 \n\t" | |
2368 "minps %%xmm5, %%xmm1 \n\t" | |
2369 "minps %%xmm5, %%xmm2 \n\t" | |
2370 "minps %%xmm5, %%xmm3 \n\t" | |
2371 "movaps %%xmm0, (%1,%0) \n\t" | |
2372 "movaps %%xmm1, 16(%1,%0) \n\t" | |
2373 "movaps %%xmm2, 32(%1,%0) \n\t" | |
2374 "movaps %%xmm3, 48(%1,%0) \n\t" | |
2375 "sub $64, %0 \n\t" | |
2376 "jge 1b \n\t" | |
2377 :"+r"(i) | |
2378 :"r"(dst), "r"(src), "m"(min), "m"(max) | |
2379 :"memory" | |
2380 ); | |
2381 } | |
2382 | |
2349 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ | 2383 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ |
2350 x86_reg reglen = len; | 2384 x86_reg reglen = len; |
2351 // not bit-exact: pf2id uses different rounding than C and SSE | 2385 // not bit-exact: pf2id uses different rounding than C and SSE |
2352 __asm__ volatile( | 2386 __asm__ volatile( |
2353 "add %0 , %0 \n\t" | 2387 "add %0 , %0 \n\t" |
3044 c->vector_fmul = vector_fmul_sse; | 3078 c->vector_fmul = vector_fmul_sse; |
3045 c->vector_fmul_reverse = vector_fmul_reverse_sse; | 3079 c->vector_fmul_reverse = vector_fmul_reverse_sse; |
3046 c->vector_fmul_add_add = vector_fmul_add_add_sse; | 3080 c->vector_fmul_add_add = vector_fmul_add_add_sse; |
3047 c->vector_fmul_window = vector_fmul_window_sse; | 3081 c->vector_fmul_window = vector_fmul_window_sse; |
3048 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | 3082 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; |
3083 c->vector_clipf = vector_clipf_sse; | |
3049 c->float_to_int16 = float_to_int16_sse; | 3084 c->float_to_int16 = float_to_int16_sse; |
3050 c->float_to_int16_interleave = float_to_int16_interleave_sse; | 3085 c->float_to_int16_interleave = float_to_int16_interleave_sse; |
3051 } | 3086 } |
3052 if(mm_flags & FF_MM_3DNOW) | 3087 if(mm_flags & FF_MM_3DNOW) |
3053 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse | 3088 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse |