comparison i386/dsputil_mmx.c @ 3536:545a15c19c91 libavcodec

sse & sse2 implementations of vorbis channel coupling. 9% faster vorbis (on a K8).
author lorenm
date Thu, 03 Aug 2006 03:18:47 +0000
parents 419409926166
children 3fbddeb13686
comparison
equal deleted inserted replaced
3535:a14c98a0ca3d 3536:545a15c19c91
2709 ff_idct_xvid_mmx2 (block); 2709 ff_idct_xvid_mmx2 (block);
2710 add_pixels_clamped_mmx(block, dest, line_size); 2710 add_pixels_clamped_mmx(block, dest, line_size);
2711 } 2711 }
2712 #endif 2712 #endif
2713 2713
/**
 * Vorbis inverse channel coupling, SSE (single-precision) version.
 *
 * For every element, with m = mag[i] and a = ang[i] on entry:
 *   t      = (m >= 0.0) ? -a : a
 *   ang[i] = (a >= 0.0) ? m + t : m
 *   mag[i] = (a >= 0.0) ? m     : m - t
 *
 * Only SSE1 instructions on XMM registers are used, so this is safe on any
 * CPU that reports SSE. The earlier body used MMX registers with the 3DNow!
 * instructions pfcmpge/pfadd/pfsub, which are not part of SSE.
 *
 * Elements are processed four at a time with aligned 16-byte loads/stores
 * (movaps): mag and ang must be 16-byte aligned and hold a whole number of
 * 4-float groups covering blocksize. NOTE(review): these are the same
 * requirements vorbis_inverse_coupling_sse2 places on its buffers - confirm
 * that every caller allocates them that way.
 *
 * @param mag       magnitude vector, updated in place
 * @param ang       angle vector, updated in place
 * @param blocksize number of floats to process
 */
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    /* IEEE-754 single-precision sign bit in each of the four lanes. */
    static const unsigned int sign_mask[4] __attribute__((aligned(16))) = {
        0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u
    };
    int i;

    for (i = 0; i < blocksize; i += 4) {
        __asm__ volatile(
            "movaps  %0,     %%xmm0 \n\t" // xmm0 = m
            "movaps  %1,     %%xmm1 \n\t" // xmm1 = a
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // xmm2 = (0.0 <= m) ? ~0 : 0
            "cmpleps %%xmm1, %%xmm3 \n\t" // xmm3 = (0.0 <= a) ? ~0 : 0
            "andps   %2,     %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t" // t = (m >= 0) ? -a : a
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t" // xmm3 = (a >= 0) ? t : 0
            "andnps  %%xmm1, %%xmm4 \n\t" // xmm4 = (a <  0) ? t : 0
            "addps   %%xmm0, %%xmm3 \n\t" // new a = m + xmm3
            "subps   %%xmm4, %%xmm0 \n\t" // new m = m - xmm4
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            : "m"(sign_mask[0])
            : "memory" // each operand names one float; 16 bytes are touched
        );
    }
}
/**
 * Vorbis inverse channel coupling, SSE2 version.
 *
 * Walks both vectors four floats at a time. With m = mag[i] and a = ang[i]
 * on entry, every lane ends up as:
 *   t      = (m >= 0.0) ? -a : a
 *   ang[i] = (a >= 0.0) ? m + t : m
 *   mag[i] = (a >= 0.0) ? m     : m - t
 *
 * All loads and stores are movaps, so each 4-float group of mag and ang has
 * to sit on a 16-byte boundary, and a full group is read and written even
 * when fewer than four elements of blocksize remain.
 *
 * @param mag       magnitude vector, updated in place
 * @param ang       angle vector, updated in place
 * @param blocksize number of floats to process
 */
static void vorbis_inverse_coupling_sse2(float *mag, float *ang, int blocksize)
{
    float *m_ptr = mag;
    float *a_ptr = ang;
    int remaining;

    for (remaining = blocksize; remaining > 0; remaining -= 4) {
        __asm__ __volatile__(
            "movaps  %0,     %%xmm0 \n\t" /* load four magnitudes */
            "movaps  %1,     %%xmm1 \n\t" /* load four angles */
            "pxor    %%xmm2, %%xmm2 \n\t"
            "pxor    %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" /* lane mask: 0.0 <= m */
            "cmpleps %%xmm1, %%xmm3 \n\t" /* lane mask: 0.0 <= a */
            "pslld   $31,    %%xmm2 \n\t" /* reduce the m mask to bit 31 */
            "pxor    %%xmm2, %%xmm1 \n\t" /* t = a, negated where m >= 0 */
            "movaps  %%xmm3, %%xmm4 \n\t"
            "pand    %%xmm1, %%xmm3 \n\t" /* t where a >= 0, else 0 */
            "pandn   %%xmm1, %%xmm4 \n\t" /* t where a <  0, else 0 */
            "addps   %%xmm0, %%xmm3 \n\t" /* angle result = m + xmm3 */
            "subps   %%xmm4, %%xmm0 \n\t" /* magnitude result = m - xmm4 */
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            : "+m"(*m_ptr), "+m"(*a_ptr)
            :
            : "memory"
        );
        m_ptr += 4;
        a_ptr += 4;
    }
}
2766
2714 #ifdef CONFIG_SNOW_ENCODER 2767 #ifdef CONFIG_SNOW_ENCODER
2715 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width); 2768 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
2716 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width); 2769 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
2717 extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width); 2770 extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
2718 extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width); 2771 extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3135 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; 3188 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
3136 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; 3189 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
3137 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; 3190 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3138 } 3191 }
3139 #endif 3192 #endif
3193
3194 if(mm_flags & MM_SSE2)
3195 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse2;
3196 else if(mm_flags & MM_SSE)
3197 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3140 } 3198 }
3141 3199
3142 #ifdef CONFIG_ENCODERS 3200 #ifdef CONFIG_ENCODERS
3143 dsputil_init_pix_mmx(c, avctx); 3201 dsputil_init_pix_mmx(c, avctx);
3144 #endif //CONFIG_ENCODERS 3202 #endif //CONFIG_ENCODERS