comparison i386/dsputil_mmx.c @ 6030:fb99890ee609 libavcodec

move FLAC mmx dsp to its own file
author aurel
date Sun, 16 Dec 2007 22:20:47 +0000
parents ecfdc0bfb233
children 558c1fd0ee72
comparison
equal deleted inserted replaced
6029:fc51a6ffa64f 6030:fb99890ee609
2855 } 2855 }
/* CAVS qpel MC00 case (no sub-pel offset): a plain 16x16 average of src
 * into dst, delegated to the shared MMX averaging primitive. */
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}
2859 2859
2860 /* FLAC specific */
2861 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
2862 double *autoc);
2863
2860 /* VC1 specific */ 2864 /* VC1 specific */
2861 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx); 2865 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
2862 2866
2863 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { 2867 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2864 put_pixels8_mmx(dst, src, stride, 8); 2868 put_pixels8_mmx(dst, src, stride, 8);
2968 :"+m"(mag[i]), "+m"(ang[i]) 2972 :"+m"(mag[i]), "+m"(ang[i])
2969 ::"memory" 2973 ::"memory"
2970 ); 2974 );
2971 } 2975 }
2972 } 2976 }
2973
2974 #ifdef CONFIG_ENCODERS
/**
 * Apply a Welch window to len int32 samples, writing the result as doubles.
 * The window is w(x) = 1 - x^2 with x stepping by c = 2/(len-1) across the
 * sample range; the SSE2 loop fills both halves of the output at once,
 * walking from the two ends of the window toward the centre.
 *
 * NOTE(review): the loop asm stores through the pointer operands but
 * declares no "memory" clobber; presumably safe because the caller reads
 * w_data only after the asm statement — verify against compiler behavior.
 * NOTE(review): cvtpi2pd touches MMX state (normally requires emms before
 * subsequent x87 use) — confirm the surrounding code accounts for this.
 */
static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
{
    double c = 2.0 / (len-1.0);
    int n2 = len>>1;                /* samples per half-window */
    long i = -n2*sizeof(int32_t);   /* byte offset, first half, counts up to 0 */
    long j = n2*sizeof(int32_t);    /* byte offset, second half, counts down */
    /* Seed registers kept live into the WELCH loop below:
     *   xmm7 = initial abscissae {c-1, c-2}: movlhps duplicates c into both
     *          lanes, 2.0 is subtracted from both, 1.0 added to the low lane;
     *   xmm6 = {1.0, 1.0} (ff_pd_1), xmm5 = {2.0, 2.0} (per-iteration step).
     */
    asm volatile(
        "movsd %0, %%xmm7 \n\t"
        "movapd %1, %%xmm6 \n\t"
        "movapd %2, %%xmm5 \n\t"
        "movlhps %%xmm7, %%xmm7 \n\t"
        "subpd %%xmm5, %%xmm7 \n\t"
        "addsd %%xmm6, %%xmm7 \n\t"
        ::"m"(c), "m"(*ff_pd_1), "m"(*ff_pd_2)
    );
    /* Loop body, instantiated twice: per iteration, xmm1 = x^2 for the
     * low-side abscissa pair, xmm0 = 1 - x^2 (the window weights), and
     * pshufd swaps the pair for the mirrored high side.  Two int32 samples
     * are fetched from each end (cvtpi2pd converts int32 pair -> double
     * pair), scaled, and stored to the matching ends of w_data.  xmm7 then
     * steps down by 2c; the loop exits when offset i reaches 0.  MOVPD is
     * the mirrored-side store opcode: the high-side destination is 16-byte
     * aligned only for even len, so odd len must use movupd. */
#define WELCH(MOVPD)\
    asm volatile(\
        "1: \n\t"\
        "movapd %%xmm7, %%xmm1 \n\t"\
        "mulpd %%xmm1, %%xmm1 \n\t"\
        "movapd %%xmm6, %%xmm0 \n\t"\
        "subpd %%xmm1, %%xmm0 \n\t"\
        "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
        "cvtpi2pd (%4,%0), %%xmm2 \n\t"\
        "cvtpi2pd (%5,%1), %%xmm3 \n\t"\
        "mulpd %%xmm0, %%xmm2 \n\t"\
        "mulpd %%xmm1, %%xmm3 \n\t"\
        "movapd %%xmm2, (%2,%0,2) \n\t"\
        MOVPD" %%xmm3, (%3,%1,2) \n\t"\
        "subpd %%xmm5, %%xmm7 \n\t"\
        "sub $8, %1 \n\t"\
        "add $8, %0 \n\t"\
        "jl 1b \n\t"\
        :"+&r"(i), "+&r"(j)\
        :"r"(w_data+n2), "r"(w_data+len-2-n2),\
         "r"(data+n2), "r"(data+len-2-n2)\
    );
    if(len&1)
        WELCH("movupd")   /* odd len: mirrored store is unaligned */
    else
        WELCH("movapd")
#undef WELCH
}
3018
/**
 * Compute autocorrelation coefficients (lags 0..lag) of the Welch-windowed
 * input, for the FLAC encoder's LPC analysis.
 *
 * data:  len int32 input samples
 * lag:   maximum lag
 * autoc: output, receives lag+1 coefficients
 *
 * The windowed samples are staged in a stack buffer with `lag` zeros in
 * front and one zero behind, so the inner loops may read one element past
 * either end without bounds checks.
 */
static void flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                       double *autoc)
{
    double tmp[len + lag + 2];
    double *data1 = tmp + lag;
    int j;

    /* Bump by one double (8 bytes) if needed so data1 is 16-byte aligned
     * for the movapd loads below; the +2 in the VLA size leaves room for
     * both this bump and the trailing zero. */
    if((long)data1 & 15)
        data1++;

    apply_welch_window_sse2(data, len, data1);

    /* Zero the lead-in read by the lagged loads and one trailing element
     * read by the unaligned -8() load on the final vector pair. */
    for(j=0; j<lag; j++)
        data1[j-lag]= 0.0;
    data1[len] = 0.0;

    /* Two lags per pass; the final pass (j == lag-2, reachable only for
     * even lag) computes three so that autoc[0..lag] are all produced.
     * NOTE(review): the accumulators are seeded from ff_pd_1 (1.0), not
     * zero — presumably to match the scalar reference implementation's
     * bias; verify against the C flac_compute_autocorr. */
    for(j=0; j<lag; j+=2){
        long i = -len*sizeof(double);   /* byte offset, counts up to 0 */
        if(j == lag-2) {
            /* Three-lag tail: per iteration xmm3 holds a sample pair;
             * it is multiplied by the same pair shifted by j (aligned),
             * j+1 (unaligned -8) and j+2 (-16), accumulating into
             * xmm0/xmm1/xmm2.  After the loop each accumulator is
             * horizontally summed (movhlps+addsd) and stored. */
            asm volatile(
                "movsd %6, %%xmm0 \n\t"
                "movsd %6, %%xmm1 \n\t"
                "movsd %6, %%xmm2 \n\t"
                "1: \n\t"
                "movapd (%4,%0), %%xmm3 \n\t"
                "movupd -8(%5,%0), %%xmm4 \n\t"
                "movapd (%5,%0), %%xmm5 \n\t"
                "mulpd %%xmm3, %%xmm4 \n\t"
                "mulpd %%xmm3, %%xmm5 \n\t"
                "mulpd -16(%5,%0), %%xmm3 \n\t"
                "addpd %%xmm4, %%xmm1 \n\t"
                "addpd %%xmm5, %%xmm0 \n\t"
                "addpd %%xmm3, %%xmm2 \n\t"
                "add $16, %0 \n\t"
                "jl 1b \n\t"
                "movhlps %%xmm0, %%xmm3 \n\t"
                "movhlps %%xmm1, %%xmm4 \n\t"
                "movhlps %%xmm2, %%xmm5 \n\t"
                "addsd %%xmm3, %%xmm0 \n\t"
                "addsd %%xmm4, %%xmm1 \n\t"
                "addsd %%xmm5, %%xmm2 \n\t"
                "movsd %%xmm0, %1 \n\t"
                "movsd %%xmm1, %2 \n\t"
                "movsd %%xmm2, %3 \n\t"
                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2])
                :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
            );
        } else {
            /* Two-lag body: same scheme restricted to lags j (aligned
             * load) and j+1 (unaligned -8 load). */
            asm volatile(
                "movsd %5, %%xmm0 \n\t"
                "movsd %5, %%xmm1 \n\t"
                "1: \n\t"
                "movapd (%3,%0), %%xmm3 \n\t"
                "movupd -8(%4,%0), %%xmm4 \n\t"
                "mulpd %%xmm3, %%xmm4 \n\t"
                "mulpd (%4,%0), %%xmm3 \n\t"
                "addpd %%xmm4, %%xmm1 \n\t"
                "addpd %%xmm3, %%xmm0 \n\t"
                "add $16, %0 \n\t"
                "jl 1b \n\t"
                "movhlps %%xmm0, %%xmm3 \n\t"
                "movhlps %%xmm1, %%xmm4 \n\t"
                "addsd %%xmm3, %%xmm0 \n\t"
                "addsd %%xmm4, %%xmm1 \n\t"
                "movsd %%xmm0, %1 \n\t"
                "movsd %%xmm1, %2 \n\t"
                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
            );
        }
    }
}
3091 #endif // CONFIG_ENCODERS
3092 2977
3093 static void vector_fmul_3dnow(float *dst, const float *src, int len){ 2978 static void vector_fmul_3dnow(float *dst, const float *src, int len){
3094 long i = (len-4)*4; 2979 long i = (len-4)*4;
3095 asm volatile( 2980 asm volatile(
3096 "1: \n\t" 2981 "1: \n\t"
3735 #ifdef CONFIG_ENCODERS 3620 #ifdef CONFIG_ENCODERS
3736 if(mm_flags & MM_SSE2){ 3621 if(mm_flags & MM_SSE2){
3737 c->sum_abs_dctelem= sum_abs_dctelem_sse2; 3622 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
3738 c->hadamard8_diff[0]= hadamard8_diff16_sse2; 3623 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
3739 c->hadamard8_diff[1]= hadamard8_diff_sse2; 3624 c->hadamard8_diff[1]= hadamard8_diff_sse2;
3740 c->flac_compute_autocorr = flac_compute_autocorr_sse2; 3625 if (ENABLE_FLAC_ENCODER)
3626 c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
3741 } 3627 }
3742 3628
3743 #ifdef HAVE_SSSE3 3629 #ifdef HAVE_SSSE3
3744 if(mm_flags & MM_SSSE3){ 3630 if(mm_flags & MM_SSSE3){
3745 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 3631 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){