libavcodec: comparison of x86/dsputil_mmx.c @ 10633:66242b8fbd32

port ape dsp functions from sse2 to mmx; now requires yasm
author lorenm
date Thu, 03 Dec 2009 18:53:12 +0000
parents 546b7ebeaf07
children 5da7180afadf
comparing 10632:54982e4c4478 with 10633:66242b8fbd32
--- a/x86/dsputil_mmx.c    (10632:54982e4c4478)
+++ b/x86/dsputil_mmx.c    (10633:66242b8fbd32)
@@ -2382,10 +2382,16 @@
 
 #if HAVE_YASM
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
+int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
@@ -2504,82 +2510,10 @@
 void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                    int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                   int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 
-
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                           \n\t"
-        "movdqu   (%1,%2),   %%xmm0   \n\t"
-        "movdqu 16(%1,%2),   %%xmm1   \n\t"
-        "paddw    (%0,%2),   %%xmm0   \n\t"
-        "paddw  16(%0,%2),   %%xmm1   \n\t"
-        "movdqa   %%xmm0,    (%0,%2)  \n\t"
-        "movdqa   %%xmm1,  16(%0,%2)  \n\t"
-        "add      $32,       %2       \n\t"
-        "js 1b                        \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                           \n\t"
-        "movdqa   (%0,%2),   %%xmm0   \n\t"
-        "movdqa 16(%0,%2),   %%xmm2   \n\t"
-        "movdqu   (%1,%2),   %%xmm1   \n\t"
-        "movdqu 16(%1,%2),   %%xmm3   \n\t"
-        "psubw    %%xmm1,    %%xmm0   \n\t"
-        "psubw    %%xmm3,    %%xmm2   \n\t"
-        "movdqa   %%xmm0,    (%0,%2)  \n\t"
-        "movdqa   %%xmm2,  16(%0,%2)  \n\t"
-        "add      $32,       %2       \n\t"
-        "js 1b                        \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
-{
-    int res = 0;
-    DECLARE_ALIGNED_16(xmm_reg, sh);
-    x86_reg o = -(order << 1);
-
-    v1 += order;
-    v2 += order;
-    sh.a = shift;
-    __asm__ volatile(
-        "pxor      %%xmm7,  %%xmm7         \n\t"
-        "1:                                \n\t"
-        "movdqu   (%0,%3),  %%xmm0         \n\t"
-        "movdqu 16(%0,%3),  %%xmm1         \n\t"
-        "pmaddwd  (%1,%3),  %%xmm0         \n\t"
-        "pmaddwd 16(%1,%3), %%xmm1         \n\t"
-        "paddd     %%xmm0,  %%xmm7         \n\t"
-        "paddd     %%xmm1,  %%xmm7         \n\t"
-        "add       $32,     %3             \n\t"
-        "js 1b                             \n\t"
-        "movhlps   %%xmm7,  %%xmm2         \n\t"
-        "paddd     %%xmm2,  %%xmm7         \n\t"
-        "psrad     %4,      %%xmm7         \n\t"
-        "pshuflw   $0x4E,   %%xmm7,%%xmm2  \n\t"
-        "paddd     %%xmm2,  %%xmm7         \n\t"
-        "movd      %%xmm7,  %2             \n\t"
-        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
-        : "m"(sh)
-    );
-    return res;
-}
 
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
 
@@ -3013,10 +2947,17 @@
             c->vector_fmul_window = vector_fmul_window_3dnow2;
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
+        if(mm_flags & FF_MM_MMX2){
+#if HAVE_YASM
+            c->add_int16 = ff_add_int16_mmx2;
+            c->sub_int16 = ff_sub_int16_mmx2;
+            c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+#endif
+        }
         if(mm_flags & FF_MM_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
             c->ac3_downmix = ac3_downmix_sse;
             c->vector_fmul = vector_fmul_sse;
             c->vector_fmul_reverse = vector_fmul_reverse_sse;
@@ -3031,13 +2972,15 @@
             c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
         if(mm_flags & FF_MM_SSE2){
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
-            c->add_int16 = add_int16_sse2;
-            c->sub_int16 = sub_int16_sse2;
-            c->scalarproduct_int16 = scalarproduct_int16_sse2;
+#if HAVE_YASM
+            c->add_int16 = ff_add_int16_sse2;
+            c->sub_int16 = ff_sub_int16_sse2;
+            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+#endif
         }
     }
 
     if (CONFIG_ENCODERS)
         dsputilenc_init_mmx(c, avctx);
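
For orientation, here is a plain-C sketch of what the three ported primitives compute, inferred from the removed SSE2 inline-asm bodies above. It is not part of this changeset; the function names are illustrative, and the exact point at which the shift is applied differs slightly from the asm (see comments).

#include <stdint.h>

/* v1[i] += v2[i] for i in [0, order), with 16-bit wraparound like paddw.
 * The asm version additionally assumes order is a multiple of 16 and v1
 * is 16-byte aligned; this reference has no such restrictions. */
static void add_int16_ref(int16_t *v1, const int16_t *v2, int order)
{
    int i;
    for (i = 0; i < order; i++)
        v1[i] += v2[i];
}

/* v1[i] -= v2[i], the psubw counterpart of the above. */
static void sub_int16_ref(int16_t *v1, const int16_t *v2, int order)
{
    int i;
    for (i = 0; i < order; i++)
        v1[i] -= v2[i];
}

/* Dot product of v1 and v2 accumulated in 32 bits (pmaddwd/paddd), then
 * shifted right.  The asm shifts two partial sums before the final
 * horizontal add, so its result can differ from this by one unit of
 * rounding for negative sums. */
static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                       int order, int shift)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < order; i++)
        res += v1[i] * v2[i];
    return res >> shift;
}

Callers reach these through DSPContext, e.g. c->scalarproduct_int16(v1, v2, order, shift); after this changeset that pointer resolves to the yasm ff_*_mmx2 or ff_*_sse2 implementations when HAVE_YASM is set and the corresponding FF_MM_MMX2/FF_MM_SSE2 CPU flag is detected.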