Mercurial > libavcodec.hg
comparison: i386/dsputil_mmx.c @ 3568:945caa35ee9a

SSE and 3DNow implementations of float->int conversion and MDCT windowing.
15% faster Vorbis.

author:   lorenm
date:     Thu, 10 Aug 2006 19:06:25 +0000
parents:  97325fecd35a
children: c42c03f3b402
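For orientation, here is a scalar reference sketch (not part of the changeset) of what the new SIMD routines compute, inferred from the asm below and from the ff_vector_fmul_add_add_c fallback the patch calls; the function names and the use of lrintf are illustrative assumptions, not the real C fallbacks.

/* Scalar reference sketch -- NOT part of the changeset. Semantics inferred
 * from the asm below; names and lrintf are illustrative assumptions. */
#include <stdint.h>
#include <math.h>

/* vector_fmul: dst[i] *= src[i] */
static void vector_fmul_ref(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

/* vector_fmul_reverse: dst[i] = src0[i] * src1[len-1-i]
 * (reads the window backwards -- the "mdct windowing" of the summary) */
static void vector_fmul_reverse_ref(float *dst, const float *src0,
                                    const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[len-1-i];
}

/* vector_fmul_add_add: dst[i*step] = src0[i]*src1[i] + src2[i] + src3
 * (the asm fast paths below handle step==2/src3==0 and step==1/src3==0) */
static void vector_fmul_add_add_ref(float *dst, const float *src0,
                                    const float *src1, const float *src2,
                                    int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i]*src1[i] + src2[i] + src3;
}

/* float_to_int16: round to nearest (cvtps2pi honors the current rounding
 * mode) and saturate to the int16 range, as packssdw does */
static void float_to_int16_ref(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++){
        long v = lrintf(src[i]);
        if(v >  32767) v =  32767;
        if(v < -32768) v = -32768;
        dst[i] = (int16_t)v;
    }
}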
--- i386/dsputil_mmx.c  3567:1f8730f62765
+++ i386/dsputil_mmx.c  3568:945caa35ee9a
@@ -2770,10 +2770,202 @@
             ::"memory"
         );
     }
 }
 
+static void vector_fmul_3dnow(float *dst, const float *src, int len){
+    long i;
+    len >>= 1;
+    for(i=0; i<len; i++) {
+        asm volatile(
+            "movq  %0, %%mm0 \n\t"
+            "pfmul %1, %%mm0 \n\t"
+            "movq  %%mm0, %0 \n\t"
+            :"+m"(dst[i*2])
+            :"m"(src[i*2])
+            :"memory"
+        );
+    }
+    asm volatile("femms");
+}
+static void vector_fmul_sse(float *dst, const float *src, int len){
+    long i;
+    len >>= 2;
+    for(i=0; i<len; i++) {
+        asm volatile(
+            "movaps %0, %%xmm0 \n\t"
+            "mulps  %1, %%xmm0 \n\t"
+            "movaps %%xmm0, %0 \n\t"
+            :"+m"(dst[i*4])
+            :"m"(src[i*4])
+            :"memory"
+        );
+    }
+}
+
+static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
+    long i = len*4-16;
+    asm volatile(
+        "1: \n\t"
+        "pswapd  8(%1), %%mm0 \n\t"
+        "pswapd   (%1), %%mm1 \n\t"
+        "pfmul  (%3,%0), %%mm0 \n\t"
+        "pfmul 8(%3,%0), %%mm1 \n\t"
+        "movq  %%mm0,  (%2,%0) \n\t"
+        "movq  %%mm1, 8(%2,%0) \n\t"
+        "add   $16, %1 \n\t"
+        "sub   $16, %0 \n\t"
+        "jge   1b \n\t"
+        :"+r"(i), "+r"(src1)
+        :"r"(dst), "r"(src0)
+    );
+    asm volatile("femms");
+}
+static void vector_fmul_reverse_sse2(float *dst, const float *src0, const float *src1, int len){
+    long i = len*4-32;
+    asm volatile(
+        "1: \n\t"
+        "pshufd $0x1b, 16(%1), %%xmm0 \n\t"
+        "pshufd $0x1b,   (%1), %%xmm1 \n\t"
+        "mulps    (%3,%0), %%xmm0 \n\t"
+        "mulps  16(%3,%0), %%xmm1 \n\t"
+        "movaps %%xmm0,   (%2,%0) \n\t"
+        "movaps %%xmm1, 16(%2,%0) \n\t"
+        "add    $32, %1 \n\t"
+        "sub    $32, %0 \n\t"
+        "jge    1b \n\t"
+        :"+r"(i), "+r"(src1)
+        :"r"(dst), "r"(src0)
+    );
+}
+
+static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
+                                      const float *src2, int src3, int len, int step){
+    long i;
+    if(step == 2 && src3 == 0){
+        i = (len-4)*4;
+        dst += (len-4)*2;
+        asm volatile(
+            "1: \n\t"
+            "movq    (%2,%0), %%mm0 \n\t"
+            "movq   8(%2,%0), %%mm1 \n\t"
+            "pfmul   (%3,%0), %%mm0 \n\t"
+            "pfmul  8(%3,%0), %%mm1 \n\t"
+            "pfadd   (%4,%0), %%mm0 \n\t"
+            "pfadd  8(%4,%0), %%mm1 \n\t"
+            "movd  %%mm0,   (%1) \n\t"
+            "movd  %%mm1, 16(%1) \n\t"
+            "psrlq $32, %%mm0 \n\t"
+            "psrlq $32, %%mm1 \n\t"
+            "movd  %%mm0,  8(%1) \n\t"
+            "movd  %%mm1, 24(%1) \n\t"
+            "sub  $32, %1 \n\t"
+            "sub  $16, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i), "+r"(dst)
+            :"r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else if(step == 1 && src3 == 0){
+        for(i=0; i<len; i+=2){
+            asm volatile(
+                "movq  %1, %%mm0 \n\t"
+                "pfmul %2, %%mm0 \n\t"
+                "pfadd %3, %%mm0 \n\t"
+                "movq  %%mm0, %0 \n\t"
+                :"=m"(dst[i])
+                :"m"(src0[i]), "m"(src1[i]), "m"(src2[i])
+            );
+        }
+    }
+    else
+        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+    asm volatile("femms");
+}
+static void vector_fmul_add_add_sse2(float *dst, const float *src0, const float *src1,
+                                     const float *src2, float src3, int len, int step){
+    long i;
+    if(step == 2 && src3 == 0){
+        i = (len-8)*4;
+        dst += (len-8)*2;
+        asm volatile(
+            "1: \n\t"
+            "movaps   (%2,%0), %%xmm0 \n\t"
+            "movaps 16(%2,%0), %%xmm1 \n\t"
+            "mulps    (%3,%0), %%xmm0 \n\t"
+            "mulps  16(%3,%0), %%xmm1 \n\t"
+            "addps    (%4,%0), %%xmm0 \n\t"
+            "addps  16(%4,%0), %%xmm1 \n\t"
+            "movd  %%xmm0,   (%1) \n\t"
+            "movd  %%xmm1, 32(%1) \n\t"
+            "psrldq $4, %%xmm0 \n\t"
+            "psrldq $4, %%xmm1 \n\t"
+            "movd  %%xmm0,  8(%1) \n\t"
+            "movd  %%xmm1, 40(%1) \n\t"
+            "psrldq $4, %%xmm0 \n\t"
+            "psrldq $4, %%xmm1 \n\t"
+            "movd  %%xmm0, 16(%1) \n\t"
+            "movd  %%xmm1, 48(%1) \n\t"
+            "psrldq $4, %%xmm0 \n\t"
+            "psrldq $4, %%xmm1 \n\t"
+            "movd  %%xmm0, 24(%1) \n\t"
+            "movd  %%xmm1, 56(%1) \n\t"
+            "sub  $64, %1 \n\t"
+            "sub  $32, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i), "+r"(dst)
+            :"r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else if(step == 1 && src3 == 0){
+        for(i=0; i<len; i+=4){
+            asm volatile(
+                "movaps %1, %%xmm0 \n\t"
+                "mulps  %2, %%xmm0 \n\t"
+                "addps  %3, %%xmm0 \n\t"
+                "movaps %%xmm0, %0 \n\t"
+                :"=m"(dst[i])
+                :"m"(src0[i]), "m"(src1[i]), "m"(src2[i])
+            );
+        }
+    }
+    else
+        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+}
+
+void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
+    // not bit-exact: pf2id uses different rounding than C and SSE
+    int i;
+    for(i=0; i<len; i+=4) {
+        asm volatile(
+            "pf2id    %1, %%mm0 \n\t"
+            "pf2id    %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq     %%mm0, %0 \n\t"
+            :"=m"(dst[i])
+            :"m"(src[i]), "m"(src[i+2])
+        );
+    }
+    asm volatile("femms");
+}
+void float_to_int16_sse(int16_t *dst, const float *src, int len){
+    int i;
+    for(i=0; i<len; i+=4) {
+        asm volatile(
+            "cvtps2pi %1, %%mm0 \n\t"
+            "cvtps2pi %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq     %%mm0, %0 \n\t"
+            :"=m"(dst[i])
+            :"m"(src[i]), "m"(src[i+2])
+        );
+    }
+    asm volatile("emms");
+}
+
 #ifdef CONFIG_SNOW_ENCODER
 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
 extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
 extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
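A note on the "not bit-exact" comment in float_to_int16_3dnow above: pf2id converts with truncation toward zero, while cvtps2pi follows the MXCSR rounding mode (round-to-nearest-even by default), which is why the dispatch hunk below only installs the 3DNow version when CODEC_FLAG_BITEXACT is unset. A standalone illustration of the difference, not from the patch, assuming the default rounding mode:

/* Rounding difference referenced by the float_to_int16_3dnow comment.
 * Illustration only; assumes the default round-to-nearest MXCSR mode. */
#include <math.h>
#include <stdio.h>

int main(void){
    float x = 2.7f;
    printf("%d\n",  (int)x);     /* 2: truncation toward zero, as pf2id does */
    printf("%ld\n", lrintf(x));  /* 3: round to nearest, as cvtps2pi does    */
    return 0;
}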
@@ -3197,14 +3389,29 @@
         c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
         c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
         }
 #endif
 
-        if(mm_flags & MM_SSE)
-            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
-        else if(mm_flags & MM_3DNOW)
-            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+        if(mm_flags & MM_3DNOW){
+            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+            c->vector_fmul = vector_fmul_3dnow;
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT))
+                c->float_to_int16 = float_to_int16_3dnow;
+        }
+        if(mm_flags & MM_3DNOWEXT)
+            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
+        if(mm_flags & MM_SSE){
+            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
+            c->vector_fmul = vector_fmul_sse;
+            c->float_to_int16 = float_to_int16_sse;
+        }
+        if(mm_flags & MM_SSE2){
+            c->vector_fmul_reverse = vector_fmul_reverse_sse2;
+            c->vector_fmul_add_add = vector_fmul_add_add_sse2;
+        }
+        if(mm_flags & MM_3DNOW)
+            c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse2
     }
 
 #ifdef CONFIG_ENCODERS
     dsputil_init_pix_mmx(c, avctx);
 #endif //CONFIG_ENCODERS
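On the "// faster than sse2" line in the dispatch hunk: the capability checks are ordered so that a later assignment overrides an earlier one, letting the 3DNow vector_fmul_add_add replace the SSE2 one on CPUs that report both flags. A trimmed, runnable sketch of that last-write-wins pattern; the flag values, struct, and strings are illustrative, not the real dsputil types:

/* Illustrative sketch of capability-ordered dispatch; not dsputil code. */
#include <stdio.h>

#define MM_3DNOW 1
#define MM_SSE2  2

typedef struct { const char *vector_fmul_add_add; } DSPCtx;

static void init_float_dsp(DSPCtx *c, int mm_flags){
    c->vector_fmul_add_add = "c";          /* scalar fallback */
    if(mm_flags & MM_SSE2)
        c->vector_fmul_add_add = "sse2";
    if(mm_flags & MM_3DNOW)                /* later test wins: */
        c->vector_fmul_add_add = "3dnow";  /* measured faster than sse2 */
}

int main(void){
    DSPCtx c;
    init_float_dsp(&c, MM_3DNOW|MM_SSE2);
    printf("%s\n", c.vector_fmul_add_add); /* prints "3dnow" */
    return 0;
}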