libavcodec.hg: comparison of x86/dsputil_mmx.c @ changeset 10633:66242b8fbd32

port ape dsp functions from sse2 to mmx
now requires yasm

author:    lorenm
date:      Thu, 03 Dec 2009 18:53:12 +0000
parents:   546b7ebeaf07
children:  5da7180afadf
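This changeset replaces the three inline-asm APE filter helpers (add_int16, sub_int16, scalarproduct_int16) with external yasm implementations and adds MMX2 variants alongside the SSE2 ones. For orientation, here is a minimal plain-C sketch of what these helpers compute. The sketch is not part of the changeset; it mirrors the shape of libavcodec's generic C fallbacks, and the function names with the _ref suffix are illustrative:

    #include <stdint.h>

    /* Reference semantics for the ported helpers (illustrative sketch).
     * v1 is updated in place; order is the number of int16_t elements.
     * The SIMD versions additionally assume order is a multiple of
     * their vector width and that v1 is 16-byte aligned. */
    static void add_int16_ref(int16_t *v1, const int16_t *v2, int order)
    {
        while (order--)
            *v1++ += *v2++;
    }

    static void sub_int16_ref(int16_t *v1, const int16_t *v2, int order)
    {
        while (order--)
            *v1++ -= *v2++;
    }

    /* Dot product with each 32-bit product shifted right. The SSE2 code
     * below instead applies the shift to accumulated lane sums, which
     * agrees for the value ranges the APE decoder feeds it. */
    static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                           int order, int shift)
    {
        int32_t res = 0;
        while (order--)
            res += (*v1++ * *v2++) >> shift;
        return res;
    }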
--- x86/dsputil_mmx.c@10632:54982e4c4478
+++ x86/dsputil_mmx.c@10633:66242b8fbd32
@@ -2382,10 +2382,16 @@
 
 #if HAVE_YASM
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
+int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
@@ -2504,82 +2510,10 @@
 void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                    int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                   int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 
-
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                          \n\t"
-        "movdqu   (%1,%2),   %%xmm0  \n\t"
-        "movdqu 16(%1,%2),   %%xmm1  \n\t"
-        "paddw    (%0,%2),   %%xmm0  \n\t"
-        "paddw  16(%0,%2),   %%xmm1  \n\t"
-        "movdqa   %%xmm0,    (%0,%2) \n\t"
-        "movdqa   %%xmm1,  16(%0,%2) \n\t"
-        "add      $32,       %2      \n\t"
-        "js 1b                       \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                          \n\t"
-        "movdqa   (%0,%2),   %%xmm0  \n\t"
-        "movdqa 16(%0,%2),   %%xmm2  \n\t"
-        "movdqu   (%1,%2),   %%xmm1  \n\t"
-        "movdqu 16(%1,%2),   %%xmm3  \n\t"
-        "psubw    %%xmm1,    %%xmm0  \n\t"
-        "psubw    %%xmm3,    %%xmm2  \n\t"
-        "movdqa   %%xmm0,    (%0,%2) \n\t"
-        "movdqa   %%xmm2,  16(%0,%2) \n\t"
-        "add      $32,       %2      \n\t"
-        "js 1b                       \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
-{
-    int res = 0;
-    DECLARE_ALIGNED_16(xmm_reg, sh);
-    x86_reg o = -(order << 1);
-
-    v1 += order;
-    v2 += order;
-    sh.a = shift;
-    __asm__ volatile(
-        "pxor      %%xmm7,  %%xmm7        \n\t"
-        "1:                               \n\t"
-        "movdqu    (%0,%3), %%xmm0        \n\t"
-        "movdqu  16(%0,%3), %%xmm1        \n\t"
-        "pmaddwd   (%1,%3), %%xmm0        \n\t"
-        "pmaddwd 16(%1,%3), %%xmm1        \n\t"
-        "paddd     %%xmm0,  %%xmm7        \n\t"
-        "paddd     %%xmm1,  %%xmm7        \n\t"
-        "add       $32,     %3            \n\t"
-        "js 1b                            \n\t"
-        "movhlps   %%xmm7,  %%xmm2        \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "psrad     %4,      %%xmm7        \n\t"
-        "pshuflw   $0x4E,   %%xmm7,%%xmm2 \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "movd      %%xmm7,  %2            \n\t"
-        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
-        : "m"(sh)
-    );
-    return res;
-}
 
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
 
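A note on the scalarproduct_int16_sse2 code deleted above: the loop indexes backward from the end of both vectors with a negative byte offset (o), so a single add $32 followed by js serves as both pointer advance and loop test, while pmaddwd accumulates pairwise word products into four dword lanes of xmm7. The movhlps/psrad/pshuflw tail then reduces those lanes to one value. A hedged C sketch of that tail (the lane array s[] is illustrative, not a name from the source):

    #include <stdint.h>

    /* What the reduction tail computes, given the four 32-bit lane
     * sums s[0..3] left in xmm7 after the pmaddwd loop. */
    static int32_t reduce_lanes(const int32_t s[4], int shift)
    {
        int32_t lo = (s[0] + s[2]) >> shift;  /* movhlps + paddd, then psrad */
        int32_t hi = (s[1] + s[3]) >> shift;
        return lo + hi;                       /* pshuflw $0x4E + paddd, movd */
    }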
@@ -3013,10 +2947,17 @@
             c->vector_fmul_window = vector_fmul_window_3dnow2;
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
+        if(mm_flags & FF_MM_MMX2){
+#if HAVE_YASM
+            c->add_int16 = ff_add_int16_mmx2;
+            c->sub_int16 = ff_sub_int16_mmx2;
+            c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+#endif
+        }
         if(mm_flags & FF_MM_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
             c->ac3_downmix = ac3_downmix_sse;
             c->vector_fmul = vector_fmul_sse;
             c->vector_fmul_reverse = vector_fmul_reverse_sse;
@@ -3031,13 +2972,15 @@
             c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
         if(mm_flags & FF_MM_SSE2){
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
-            c->add_int16 = add_int16_sse2;
-            c->sub_int16 = sub_int16_sse2;
-            c->scalarproduct_int16 = scalarproduct_int16_sse2;
+#if HAVE_YASM
+            c->add_int16 = ff_add_int16_sse2;
+            c->sub_int16 = ff_sub_int16_sse2;
+            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+#endif
         }
     }
 
     if (CONFIG_ENCODERS)
         dsputilenc_init_mmx(c, avctx);
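On the registration order in the hunks above: dsputil_init_mmx assigns the DSPContext pointers from least to most capable instruction set, so on an SSE2 machine the FF_MM_MMX2 assignments are made first and then overwritten by the faster ff_*_sse2 versions. Callers never pick an implementation themselves; they go through the context. A hedged sketch of the calling pattern, loosely modeled on a sign-LMS coefficient update like the APE decoder's; the helper and buffer names are illustrative, not from the changeset:

    #include "dsputil.h"

    /* Illustrative only: how a decoder consumes the pointers set up above. */
    static void ape_filter_sketch(AVCodecContext *avctx,
                                  int16_t *coeffs, int16_t *delay,
                                  int order, int shift)
    {
        DSPContext dsp;
        dsputil_init(&dsp, avctx);  /* selects C, MMX2, or SSE2 per mm_flags */

        int32_t dot = dsp.scalarproduct_int16(coeffs, delay, order, shift);
        if (dot > 0)
            dsp.add_int16(coeffs, delay, order);  /* adapt towards delay */
        else if (dot < 0)
            dsp.sub_int16(coeffs, delay, order);  /* adapt away from it */
    }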