comparison: x86/dsputil_mmx.c @ 10644:5da7180afadf (libavcodec)
refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.)
9-123% faster ape decoding on G4.
author:   lorenm
date:     Sat, 05 Dec 2009 15:09:10 +0000
parents:  66242b8fbd32
children: 6f958f237d7d
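Before the diff itself, it helps to know what the two DSPContext hooks involved compute. The change drops the separate add_int16/sub_int16 hooks and adds a fused scalarproduct_and_madd_int16. The plain-C sketch below is modeled on libavcodec's scalar reference versions, not on the SIMD code in this diff; the _ref names are mine. The SIMD routines wired up below are drop-in implementations of the same contract.

#include <stdint.h>

/* Dot product of two int16 vectors, each product shifted right. */
static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                       int order, int shift)
{
    int32_t res = 0;
    while (order--)
        res += (*v1++ * *v2++) >> shift;
    return res;
}

/* Fused hook added by this commit: returns the dot product of v1 and v2
 * while simultaneously updating v1 in place as v1[i] += mul * v3[i].
 * Each product uses the pre-update v1[i]. With mul = +1/-1 this subsumes
 * the removed add_int16/sub_int16 hooks. */
static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3,
                                                int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

The fusion means the int16 history buffer is traversed once per filtered sample instead of twice, which is where most of the non-SSSE3 gain comes from.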
--- a/x86/dsputil_mmx.c  (parent 10643:7f6911429cdc)
+++ b/x86/dsputil_mmx.c  (this revision 10644:5da7180afadf)
@@ -2382,16 +2382,15 @@
 
 #if HAVE_YASM
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
-void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
-int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
-int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_mmx2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
@@ -2949,13 +2948,12 @@
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
         if(mm_flags & FF_MM_MMX2){
 #if HAVE_YASM
-            c->add_int16 = ff_add_int16_mmx2;
-            c->sub_int16 = ff_sub_int16_mmx2;
             c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
 #endif
         }
         if(mm_flags & FF_MM_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
             c->ac3_downmix = ac3_downmix_sse;
@@ -2973,15 +2971,16 @@
         if(mm_flags & FF_MM_SSE2){
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
 #if HAVE_YASM
-            c->add_int16 = ff_add_int16_sse2;
-            c->sub_int16 = ff_sub_int16_sse2;
             c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
 #endif
         }
+        if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     }
 
     if (CONFIG_ENCODERS)
         dsputilenc_init_mmx(c, avctx);
 
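Where the apply_filter speedup in the commit message comes from: the APE decoder's adaptive filter previously made one pass over the order-length history for the prediction dot product and a second pass (add_int16 or sub_int16) to adapt the coefficients; the fused hook folds both into one pass. The sketch below is a hedged illustration of the call-site shape, not the actual ape.c code: predict_before/predict_after, the coeffs/delay/adapt/sign names, and the sign convention are all mine. The hook signatures themselves come from the diff above.

#include <stdint.h>
#include "dsputil.h"   /* libavcodec's DSPContext; include path assumed */

/* Before this commit (sketch): dot product, then a second pass over
 * the same order-length buffers to adapt the coefficients. */
static int32_t predict_before(DSPContext *c, int16_t *coeffs, int16_t *delay,
                              int16_t *adapt, int order, int sign)
{
    int32_t sum = c->scalarproduct_int16(coeffs, delay, order, 0);
    if (sign > 0)                        /* sign convention illustrative */
        c->add_int16(coeffs, adapt, order);
    else if (sign < 0)
        c->sub_int16(coeffs, adapt, order);
    return sum;
}

/* After this commit (sketch): one fused pass. Passing mul = sign makes
 * the multiply-add reproduce the old add/sub, and the dot product is
 * taken on each coefficient before it is updated, so the result matches
 * the two-pass version. */
static int32_t predict_after(DSPContext *c, int16_t *coeffs, int16_t *delay,
                             int16_t *adapt, int order, int sign)
{
    return c->scalarproduct_and_madd_int16(coeffs, delay, adapt, order, sign);
}

On the dispatch side, note that the SSSE3 version is only installed when neither SSE4.2 nor 3DNow is reported, which in 2009 effectively singles out Core 2: presumably its SSSE3 handling of misaligned vectors is the "cachesplit avoidance" the commit message refers to, a win on Core 2's slow cache-line-split loads but not forced on AMD (3DNow) or newer Intel (SSE4.2) parts.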