comparison i386/dsputil_mmx.c @ 7261:032a49f033e8 libavcodec

simplify vorbis windowing
author lorenm
date Sun, 13 Jul 2008 14:56:01 +0000
parents 08cc6e202aa6
children fc843d00867c
comparison
equal deleted inserted replaced
7260:3ec34b551aae 7261:032a49f033e8
2020 } 2020 }
2021 else 2021 else
2022 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); 2022 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
2023 } 2023 }
2024 2024
/* SSE overlap-add windowing: processes the output from both ends at once,
   multiplying src0/src1 by the window and its 4-float reverse
   (shufps $0x1b) and writing two aligned 16-byte results per iteration.
   Falls back to the C version when add_bias != 0 or when six GPRs are not
   available for the asm operands (HAVE_6REGS). */
2025 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2026 const float *win, float add_bias, int len){
2027 #ifdef HAVE_6REGS
2028 if(add_bias == 0){
/* i and j are byte offsets relative to the midpoint pointers passed as
   operands below: i walks the first half upward from -len*2, j walks the
   second half downward from len*2-16; the loop ends when i reaches 0. */
2029 x86_reg i = -len*2;
2030 x86_reg j = len*2-16;
2031 asm volatile(
2032 "1: \n"
/* load the window from both ends (%5 = win+len/2) */
2033 "movaps (%5,%0), %%xmm0 \n"
2034 "movaps (%5,%1), %%xmm1 \n"
2035 "movaps %%xmm0, %%xmm2 \n"
2036 "movaps %%xmm1, %%xmm3 \n"
/* shufps $0x1b reverses the order of the 4 floats in the register */
2037 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2038 "shufps $0x1b, %%xmm3, %%xmm3 \n"
/* forward window times src1 (%4 = src1+len/2), reversed window times
   src0 (%3 = src0+len/2) */
2039 "mulps (%4,%0), %%xmm0 \n"
2040 "mulps (%4,%1), %%xmm1 \n"
2041 "mulps (%3,%0), %%xmm3 \n"
2042 "mulps (%3,%1), %%xmm2 \n"
2043 "addps %%xmm3, %%xmm0 \n"
2044 "addps %%xmm2, %%xmm1 \n"
/* store the summed products to both halves of dst (%2 = dst+len/2) */
2045 "movaps %%xmm0, (%2,%0) \n"
2046 "movaps %%xmm1, (%2,%1) \n"
2047 "sub $16, %1 \n"
2048 "add $16, %0 \n"
2049 "jl 1b \n"
2050 :"+r"(i), "+r"(j)
2051 :"r"(dst+len/2), "r"(src0+len/2), "r"(src1+len/2), "r"(win+len/2)
2052 );
2053 }else
2054 #endif
/* generic C fallback handles add_bias != 0 and non-HAVE_6REGS builds */
2055 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
2056 }
2057
2025 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ 2058 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
2026 // not bit-exact: pf2id uses different rounding than C and SSE 2059 // not bit-exact: pf2id uses different rounding than C and SSE
2027 asm volatile( 2060 asm volatile(
2028 "add %0 , %0 \n\t" 2061 "add %0 , %0 \n\t"
2029 "lea (%2,%0,2) , %2 \n\t" 2062 "lea (%2,%0,2) , %2 \n\t"
2080 "add $16 , %0 \n\t" 2113 "add $16 , %0 \n\t"
2081 " js 1b \n\t" 2114 " js 1b \n\t"
2082 :"+r"(len), "+r"(dst), "+r"(src) 2115 :"+r"(len), "+r"(dst), "+r"(src)
2083 ); 2116 );
2084 } 2117 }
2118
/* Builds two functions per CPU flavour:
   - float_to_int16_interleave2_<cpu>(): generic N-channel interleave.
     Converts the whole planar buffer with float_to_int16_<cpu>() into an
     aligned stack temp, then scatters it channel by channel into dst.
     NOTE(review): tmp is a VLA of len*channels int16s on the stack; a
     large len could overflow the stack — confirm callers bound it.
   - float_to_int16_interleave_<cpu>(): dispatcher.  Mono converts
     directly; more than 2 channels takes the generic path; stereo runs
     the caller-supplied asm `body`, whose operands are set up by the
     shl/add/lea/neg prologue: %0 = negative byte index rising to 0,
     %1 = end of dst, %2 = end of the channel-0 plane, %3 = end of the
     channel-1 plane (planar input: ch1 follows ch0 in src). */
2119 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
2120 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
2121 static av_noinline void float_to_int16_interleave2_##cpu(int16_t *dst, const float *src, long len, int channels){\
2122 DECLARE_ALIGNED_16(int16_t, tmp[len*channels]);\
2123 int i,j,c;\
2124 float_to_int16_##cpu(tmp, src, len*channels);\
2125 for(c=0; c<channels; c++){\
2126 int16_t *ptmp = tmp+c*len;\
2127 for(i=0, j=c; i<len; i++, j+=channels)\
2128 dst[j] = ptmp[i];\
2129 }\
2130 }\
2131 \
2132 static void float_to_int16_interleave_##cpu(int16_t *dst, const float *src, long len, int channels){\
2133 if(channels==1)\
2134 float_to_int16_##cpu(dst, src, len);\
2135 else if(channels>2)\
2136 float_to_int16_interleave2_##cpu(dst, src, len, channels);\
2137 else{\
2138 float *src1;\
2139 asm volatile(\
2140 "shl $2, %0 \n"\
2141 "add %0, %1 \n"\
2142 "add %0, %2 \n"\
2143 "lea (%2,%0), %3 \n"\
2144 "neg %0 \n"\
2145 body\
2146 :"+r"(len), "+r"(dst), "+r"(src), "=r"(src1)\
2147 );\
2148 }\
2149 }
2150
2151 FLOAT_TO_INT16_INTERLEAVE(3dnow,
2152 "1: \n"
2153 "pf2id (%2,%0), %%mm0 \n"
2154 "pf2id 8(%2,%0), %%mm1 \n"
2155 "pf2id (%3,%0), %%mm2 \n"
2156 "pf2id 8(%3,%0), %%mm3 \n"
2157 "packssdw %%mm1, %%mm0 \n"
2158 "packssdw %%mm3, %%mm2 \n"
2159 "movq %%mm0, %%mm1 \n"
2160 "punpcklwd %%mm2, %%mm0 \n"
2161 "punpckhwd %%mm2, %%mm1 \n"
2162 "movq %%mm0, (%1,%0)\n"
2163 "movq %%mm0, 8(%1,%0)\n"
2164 "add $16, %0 \n"
2165 "js 1b \n"
2166 "femms \n"
2167 )
2168
2169 FLOAT_TO_INT16_INTERLEAVE(sse,
2170 "1: \n"
2171 "cvtps2pi (%2,%0), %%mm0 \n"
2172 "cvtps2pi 8(%2,%0), %%mm1 \n"
2173 "cvtps2pi (%3,%0), %%mm2 \n"
2174 "cvtps2pi 8(%3,%0), %%mm3 \n"
2175 "packssdw %%mm1, %%mm0 \n"
2176 "packssdw %%mm3, %%mm2 \n"
2177 "movq %%mm0, %%mm1 \n"
2178 "punpcklwd %%mm2, %%mm0 \n"
2179 "punpckhwd %%mm2, %%mm1 \n"
2180 "movq %%mm0, (%1,%0)\n"
2181 "movq %%mm0, 8(%1,%0)\n"
2182 "add $16, %0 \n"
2183 "js 1b \n"
2184 "emms \n"
2185 )
2186
/* Stereo interleave body for SSE2: per iteration, convert 4 floats from
   each channel (cvtps2dq), pack both channels into one XMM register,
   interleave the L/R words and store 16 bytes of interleaved samples.
   Operands are set up by the FLOAT_TO_INT16_INTERLEAVE wrapper:
   %0 = negative byte index rising to 0, %1 = end of dst,
   %2 = end of the channel-0 plane, %3 = end of the channel-1 plane.
   Uses only XMM registers, so no emms is needed. */
2187 FLOAT_TO_INT16_INTERLEAVE(sse2,
2188 "1: \n"
2189 "cvtps2dq (%2,%0), %%xmm0 \n"
2190 "cvtps2dq (%3,%0), %%xmm1 \n"
/* xmm0 low 4 words = ch0 samples, high 4 words = ch1 samples */
2191 "packssdw %%xmm1, %%xmm0 \n"
/* bring the ch1 words down, then interleave ch0/ch1 word-by-word */
2192 "movhlps %%xmm0, %%xmm1 \n"
2193 "punpcklwd %%xmm1, %%xmm0 \n"
2194 "movdqa %%xmm0, (%1,%0) \n"
2195 "add $16, %0 \n"
2196 "js 1b \n"
2197 )
2198
2085 2199
2086 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width); 2200 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
2087 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width); 2201 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
2088 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); 2202 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
2089 extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); 2203 extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
2517 #endif 2631 #endif
2518 2632
2519 if(mm_flags & MM_3DNOW){ 2633 if(mm_flags & MM_3DNOW){
2520 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; 2634 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2521 c->vector_fmul = vector_fmul_3dnow; 2635 c->vector_fmul = vector_fmul_3dnow;
2522 if(!(avctx->flags & CODEC_FLAG_BITEXACT)) 2636 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2523 c->float_to_int16 = float_to_int16_3dnow; 2637 c->float_to_int16 = float_to_int16_3dnow;
2638 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
2639 }
2524 } 2640 }
2525 if(mm_flags & MM_3DNOWEXT) 2641 if(mm_flags & MM_3DNOWEXT)
2526 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; 2642 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2527 if(mm_flags & MM_SSE){ 2643 if(mm_flags & MM_SSE){
2528 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; 2644 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2529 c->vector_fmul = vector_fmul_sse; 2645 c->vector_fmul = vector_fmul_sse;
2530 c->float_to_int16 = float_to_int16_sse; 2646 c->float_to_int16 = float_to_int16_sse;
2647 c->float_to_int16_interleave = float_to_int16_interleave_sse;
2531 c->vector_fmul_reverse = vector_fmul_reverse_sse; 2648 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2532 c->vector_fmul_add_add = vector_fmul_add_add_sse; 2649 c->vector_fmul_add_add = vector_fmul_add_add_sse;
2650 c->vector_fmul_window = vector_fmul_window_sse;
2533 } 2651 }
2534 if(mm_flags & MM_SSE2){ 2652 if(mm_flags & MM_SSE2){
2535 c->float_to_int16 = float_to_int16_sse2; 2653 c->float_to_int16 = float_to_int16_sse2;
2654 c->float_to_int16_interleave = float_to_int16_interleave_sse2;
2536 } 2655 }
2537 if(mm_flags & MM_3DNOW) 2656 if(mm_flags & MM_3DNOW)
2538 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse 2657 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
2539 if(mm_flags & MM_SSE2){ 2658 if(mm_flags & MM_SSE2){
2540 c->add_int16 = add_int16_sse2; 2659 c->add_int16 = add_int16_sse2;