comparison x86/dsputil_mmx.c @ 10644:5da7180afadf libavcodec

refactor and optimize scalarproduct

29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly
due to ssse3 cachesplit avoidance and I haven't written the full gamut
of other cachesplit modes.)
9-123% faster ape decoding on G4.
author lorenm
date Sat, 05 Dec 2009 15:09:10 +0000
parents 66242b8fbd32
children 6f958f237d7d
comparison
10643:7f6911429cdc → 10644:5da7180afadf
@@ -2382,16 +2382,15 @@
 
 #if HAVE_YASM
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
-void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
-int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
-int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_mmx2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
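The deleted ff_add_int16/ff_sub_int16 pair plus a separate scalarproduct call is what the new fused ff_scalarproduct_and_madd_int16 replaces. As a rough guide to the semantics implied by the prototypes above, here is a plain-C sketch modeled on the scalar reference in dsputil.c (reconstructed, not part of this diff):

#include <stdint.h>

/* dot product of v1 and v2, each product right-shifted before accumulation */
static int32_t scalarproduct_int16_c(int16_t *v1, int16_t *v2, int order, int shift)
{
    int32_t res = 0;
    while (order--)
        res += (*v1++ * *v2++) >> shift;
    return res;
}

/* fused kernel: returns the dot product of v1 and v2 while rewriting v1 in
 * place as v1[i] += mul * v3[i] -- one pass over the data instead of a
 * scalarproduct followed by a separate add_int16/sub_int16 pass */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}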
@@ -2949,13 +2948,12 @@
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
         if(mm_flags & FF_MM_MMX2){
 #if HAVE_YASM
-            c->add_int16 = ff_add_int16_mmx2;
-            c->sub_int16 = ff_sub_int16_mmx2;
             c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
 #endif
         }
         if(mm_flags & FF_MM_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
             c->ac3_downmix = ac3_downmix_sse;
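These assignments target function pointers in DSPContext: this revision drops the add_int16/sub_int16 members in favor of the fused one. A sketch of the relevant dsputil.h members, inferred from the prototypes above (not shown in this diff):

/* in DSPContext (sketch): */
int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2, int order, int shift);
int32_t (*scalarproduct_and_madd_int16)(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);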
@@ -2973,15 +2971,16 @@
         if(mm_flags & FF_MM_SSE2){
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
 #if HAVE_YASM
-            c->add_int16 = ff_add_int16_sse2;
-            c->sub_int16 = ff_sub_int16_sse2;
             c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
 #endif
         }
+        if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     }
 
     if (CONFIG_ENCODERS)
         dsputilenc_init_mmx(c, avctx);
 
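Note that the SSSE3 pointer is installed only when neither FF_MM_SSE42 nor FF_MM_3DNOW is set: per the // cachesplit comment and the commit message, the SSSE3 version exists to avoid loads that straddle a cache line, a penalty that hits Core 2 hard but presumably matters less on CPUs advertising SSE4.2 and on AMD parts (which set the 3DNow! flag), where the SSE2 version is kept instead. On the caller side, APE's apply_filter can fold its sign-dependent adaptation step into the dot product; a hedged sketch modeled on libavcodec/ape.c's do_apply_filter (identifiers illustrative, not part of this diff):

/* APESIGN(x) evaluates to -1, 0 or +1, so 'mul' makes the fused kernel
 * subtract, skip or add the adaptation vector while computing the dot
 * product that the old code needed a second add_int16/sub_int16 pass for */
res = c->dsp.scalarproduct_and_madd_int16(f->coeffs,
                                          f->delay - order,
                                          f->adaptcoeffs - order,
                                          order, APESIGN(res));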