comparison: i386/dsputil_mmx.c @ 3568:945caa35ee9a (libavcodec)

SSE and 3DNow! implementations of float->int conversion and MDCT windowing; 15% faster Vorbis decoding.
author lorenm
date Thu, 10 Aug 2006 19:06:25 +0000
parents 97325fecd35a
children c42c03f3b402
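
The new code falls into two groups: the vector_fmul* family below covers the MDCT windowing, and the float_to_int16_* routines cover the float->int conversion; the hunk at the end registers them in the DSPContext according to the detected CPU feature flags.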
            ::"memory"
        );
    }
}

static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i;
    len >>= 1;
    for(i=0; i<len; i++) {
        asm volatile(
            "movq  %0, %%mm0 \n\t"
            "pfmul %1, %%mm0 \n\t"
            "movq  %%mm0, %0 \n\t"
            :"+m"(dst[i*2])
            :"m"(src[i*2])
            :"memory"
        );
    }
    asm volatile("femms");
}
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i;
    len >>= 2;
    for(i=0; i<len; i++) {
        asm volatile(
            "movaps %0, %%xmm0 \n\t"
            "mulps  %1, %%xmm0 \n\t"
            "movaps %%xmm0, %0 \n\t"
            :"+m"(dst[i*4])
            :"m"(src[i*4])
            :"memory"
        );
    }
}
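
Both routines compute the same thing: an element-wise in-place multiply, with the 3DNow! version handling 2 floats per iteration and the SSE version 4. A minimal plain-C sketch of that semantics (the name vector_fmul_ref is illustrative, not from this changeset); note that both assume len is a multiple of the vector width, and movaps additionally requires 16-byte-aligned operands:

/* Reference semantics for DSPContext.vector_fmul (reconstruction):
 * element-wise in-place multiply over len floats. */
static void vector_fmul_ref(float *dst, const float *src, int len){
    int i;
    for(i = 0; i < len; i++)
        dst[i] *= src[i];
}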

static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;
    asm volatile(
        "1: \n\t"
        "pswapd   8(%1), %%mm0 \n\t"
        "pswapd    (%1), %%mm1 \n\t"
        "pfmul  (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq  %%mm0,  (%2,%0) \n\t"
        "movq  %%mm1, 8(%2,%0) \n\t"
        "add   $16, %1 \n\t"
        "sub   $16, %0 \n\t"
        "jge   1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory" // dst is written through a register operand
    );
    asm volatile("femms");
}
static void vector_fmul_reverse_sse2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;
    asm volatile(
        "1: \n\t"
        "pshufd $0x1b, 16(%1), %%xmm0 \n\t"
        "pshufd $0x1b,   (%1), %%xmm1 \n\t"
        "mulps    (%3,%0), %%xmm0 \n\t"
        "mulps  16(%3,%0), %%xmm1 \n\t"
        "movaps %%xmm0,   (%2,%0) \n\t"
        "movaps %%xmm1, 16(%2,%0) \n\t"
        "add    $32, %1 \n\t"
        "sub    $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory" // same reasoning as the 3DNow!Ext version above
    );
}
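
Both reverse variants read src1 back-to-front while the byte offset in %0 counts down through dst and src0: pswapd (3DNow!Ext) swaps the two floats in an MMX register, and pshufd $0x1b (SSE2) reverses the four floats in an XMM register. A plain-C sketch of the intended result (illustrative name):

/* Reference semantics (reconstruction): multiply src0 by src1 read in
 * reverse order, e.g. for the falling half of an MDCT window. */
static void vector_fmul_reverse_ref(float *dst, const float *src0,
                                    const float *src1, int len){
    int i;
    for(i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}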

static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i;
    if(step == 2 && src3 == 0){
        i = (len-4)*4;
        dst += (len-4)*2;
        asm volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movd  %%mm0,   (%1) \n\t"
            "movd  %%mm1, 16(%1) \n\t"
            "psrlq $32, %%mm0 \n\t"
            "psrlq $32, %%mm1 \n\t"
            "movd  %%mm0,  8(%1) \n\t"
            "movd  %%mm1, 24(%1) \n\t"
            "sub   $32, %1 \n\t"
            "sub   $16, %0 \n\t"
            "jge   1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        for(i=0; i<len; i+=2){
            asm volatile(
                "movq  %1, %%mm0 \n\t"
                "pfmul %2, %%mm0 \n\t"
                "pfadd %3, %%mm0 \n\t"
                "movq  %%mm0, %0 \n\t"
                :"=m"(dst[i])
                :"m"(src0[i]), "m"(src1[i]), "m"(src2[i])
            );
        }
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    asm volatile("femms");
}
static void vector_fmul_add_add_sse2(float *dst, const float *src0, const float *src1,
                                     const float *src2, int src3, int len, int step){
    long i;
    if(step == 2 && src3 == 0){
        i = (len-8)*4;
        dst += (len-8)*2;
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movd  %%xmm0,   (%1) \n\t"
            "movd  %%xmm1, 32(%1) \n\t"
            "psrldq $4, %%xmm0 \n\t"
            "psrldq $4, %%xmm1 \n\t"
            "movd  %%xmm0,  8(%1) \n\t"
            "movd  %%xmm1, 40(%1) \n\t"
            "psrldq $4, %%xmm0 \n\t"
            "psrldq $4, %%xmm1 \n\t"
            "movd  %%xmm0, 16(%1) \n\t"
            "movd  %%xmm1, 48(%1) \n\t"
            "psrldq $4, %%xmm0 \n\t"
            "psrldq $4, %%xmm1 \n\t"
            "movd  %%xmm0, 24(%1) \n\t"
            "movd  %%xmm1, 56(%1) \n\t"
            "sub   $64, %1 \n\t"
            "sub   $32, %0 \n\t"
            "jge   1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        for(i=0; i<len; i+=4){
            asm volatile(
                "movaps %1, %%xmm0 \n\t"
                "mulps  %2, %%xmm0 \n\t"
                "addps  %3, %%xmm0 \n\t"
                "movaps %%xmm0, %0 \n\t"
                :"=m"(dst[i])
                :"m"(src0[i]), "m"(src1[i]), "m"(src2[i])
            );
        }
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
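
Both versions fast-path src3 == 0 with step == 2 (interleaved stereo output, the case MDCT windowing needs) or step == 1; every other combination falls back to ff_vector_fmul_add_add_c. A plain-C sketch of that fallback's contract, which the asm paths match for the cases they cover (reconstruction; illustrative name):

/* Reference semantics (reconstruction of ff_vector_fmul_add_add_c):
 * multiply-accumulate with a constant bias src3 and output stride step.
 * The asm fast paths above only cover src3 == 0 with step 1 or 2. */
static void vector_fmul_add_add_ref(float *dst, const float *src0,
                                    const float *src1, const float *src2,
                                    int src3, int len, int step){
    int i;
    for(i = 0; i < len; i++)
        dst[i*step] = src0[i]*src1[i] + src2[i] + src3;
}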

void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "pf2id    %1, %%mm0 \n\t"
            "pf2id    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0 \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("femms");
}
void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "cvtps2pi %1, %%mm0 \n\t"
            "cvtps2pi %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0 \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("emms");
}
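
Both converters turn four floats at a time into saturated int16 samples via packssdw; they differ only in the float->int32 step (pf2id vs cvtps2pi), which is the source of the bit-exactness caveat above. A scalar sketch of the same conversion (reconstruction; illustrative name):

#include <math.h>    /* lrintf */
#include <stdint.h>

/* Reference semantics (reconstruction): float -> int16 with saturation,
 * mirroring what packssdw does to the converted int32 values. lrintf
 * rounds per the current FP rounding mode, like cvtps2pi; pf2id may
 * round differently, hence the 3DNow! version is not bit-exact. */
static void float_to_int16_ref(int16_t *dst, const float *src, int len){
    int i;
    for(i = 0; i < len; i++){
        long v = lrintf(src[i]);
        if(v >  32767) v =  32767;
        if(v < -32768) v = -32768;
        dst[i] = (int16_t)v;
    }
}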

#ifdef CONFIG_SNOW_ENCODER
extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);

[...]

        c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
        c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
    }
#endif

-    if(mm_flags & MM_SSE)
+    if(mm_flags & MM_3DNOW){
+        c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+        c->vector_fmul = vector_fmul_3dnow;
+        if(!(avctx->flags & CODEC_FLAG_BITEXACT))
+            c->float_to_int16 = float_to_int16_3dnow;
+    }
+    if(mm_flags & MM_3DNOWEXT)
+        c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
+    if(mm_flags & MM_SSE){
         c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
-    else if(mm_flags & MM_3DNOW)
-        c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+        c->vector_fmul = vector_fmul_sse;
+        c->float_to_int16 = float_to_int16_sse;
+    }
+    if(mm_flags & MM_SSE2){
+        c->vector_fmul_reverse = vector_fmul_reverse_sse2;
+        c->vector_fmul_add_add = vector_fmul_add_add_sse2;
+    }
+    if(mm_flags & MM_3DNOW)
+        c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse2
 }
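
Two details of the registration hunk are worth noting. The MM_3DNOW check for vector_fmul_add_add deliberately comes after the MM_SSE2 one, so on CPUs reporting both flags the 3DNow! routine wins, per the "faster than sse2" comment. And float_to_int16_3dnow is only installed when CODEC_FLAG_BITEXACT is unset, since pf2id's rounding differs from the C and SSE conversions.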

#ifdef CONFIG_ENCODERS
    dsputil_init_pix_mmx(c, avctx);
#endif //CONFIG_ENCODERS