Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 7261:032a49f033e8 libavcodec
simplify vorbis windowing
author | lorenm |
---|---|
date | Sun, 13 Jul 2008 14:56:01 +0000 |
parents | 08cc6e202aa6 |
children | fc843d00867c |
comparison
equal
deleted
inserted
replaced
7260:3ec34b551aae | 7261:032a49f033e8 |
---|---|
2020 } | 2020 } |
2021 else | 2021 else |
2022 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); | 2022 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); |
2023 } | 2023 } |
2024 | 2024 |
/* SSE version of vector_fmul_window (vorbis/MDCT overlap-add windowing).
 * Every pointer is biased by len/2 so the loop can walk the two symmetric
 * halves of the buffer at once: i (byte offset) counts up through the lower
 * half while j counts down through the upper half, 16 bytes (4 floats) per
 * step on each side.  "shufps $0x1b" reverses the 4 floats of a register to
 * supply the mirrored operand for the opposite half.
 * Handles only the add_bias==0 case, and only when 6 GPRs are free for the
 * asm (HAVE_6REGS); everything else falls back to the C implementation. */
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
                                   const float *win, float add_bias, int len){
#ifdef HAVE_6REGS
    if(add_bias == 0){
        x86_reg i = -len*2;     /* -(len/2 floats * 4 bytes): lower half, ascending */
        x86_reg j = len*2-16;   /* last 16-byte group of the upper half, descending */
        asm volatile(
            "1: \n"
            "movaps (%5,%0), %%xmm0 \n"   /* window, lower half */
            "movaps (%5,%1), %%xmm1 \n"   /* window, upper half */
            "movaps %%xmm0, %%xmm2 \n"
            "movaps %%xmm1, %%xmm3 \n"
            "shufps $0x1b, %%xmm2, %%xmm2 \n"   /* reverse 4 floats */
            "shufps $0x1b, %%xmm3, %%xmm3 \n"
            "mulps (%4,%0), %%xmm0 \n"   /* * src1 (forward window) */
            "mulps (%4,%1), %%xmm1 \n"
            "mulps (%3,%0), %%xmm3 \n"   /* * src0 (reversed window) */
            "mulps (%3,%1), %%xmm2 \n"
            "addps %%xmm3, %%xmm0 \n"
            "addps %%xmm2, %%xmm1 \n"
            "movaps %%xmm0, (%2,%0) \n"
            "movaps %%xmm1, (%2,%1) \n"
            "sub $16, %1 \n"
            "add $16, %0 \n"
            "jl 1b \n"               /* loop while i < 0 (lower half not done) */
            :"+r"(i), "+r"(j)
            :"r"(dst+len/2), "r"(src0+len/2), "r"(src1+len/2), "r"(win+len/2)   /* %2..%5 */
        );
    }else
#endif
        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
}
2057 | |
2025 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ | 2058 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ |
2026 // not bit-exact: pf2id uses different rounding than C and SSE | 2059 // not bit-exact: pf2id uses different rounding than C and SSE |
2027 asm volatile( | 2060 asm volatile( |
2028 "add %0 , %0 \n\t" | 2061 "add %0 , %0 \n\t" |
2029 "lea (%2,%0,2) , %2 \n\t" | 2062 "lea (%2,%0,2) , %2 \n\t" |
2080 "add $16 , %0 \n\t" | 2113 "add $16 , %0 \n\t" |
2081 " js 1b \n\t" | 2114 " js 1b \n\t" |
2082 :"+r"(len), "+r"(dst), "+r"(src) | 2115 :"+r"(len), "+r"(dst), "+r"(src) |
2083 ); | 2116 ); |
2084 } | 2117 } |
2118 | |
/* Generates float_to_int16_interleave_<cpu>(dst, src, len, channels):
 *   channels == 1 : plain float_to_int16_<cpu>.
 *   channels == 2 : the inline-asm fast path supplied in `body`.  The prologue
 *                   below converts len to a byte count (shl $2), advances dst
 *                   and src to their ends, derives the second channel's end
 *                   pointer (src1 = src + len floats), then negates %0 so the
 *                   body can count a negative offset up to zero.
 *                   Operands seen by `body`: %0 = -len*4 offset, %1 = dst end,
 *                   %2 = channel-0 end, %3 = channel-1 end.
 *   channels  > 2 : convert everything into an aligned temp buffer, then
 *                   interleave with scalar C code. */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave2_##cpu(int16_t *dst, const float *src, long len, int channels){\
    DECLARE_ALIGNED_16(int16_t, tmp[len*channels]);\
    int i,j,c;\
    float_to_int16_##cpu(tmp, src, len*channels);\
    for(c=0; c<channels; c++){\
        int16_t *ptmp = tmp+c*len;\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = ptmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float *src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src, len);\
    else if(channels>2)\
        float_to_int16_interleave2_##cpu(dst, src, len, channels);\
    else{\
        float *src1;\
        asm volatile(\
            "shl $2, %0 \n" /* len -> bytes per channel */\
            "add %0, %1 \n" /* dst end (2ch * 2 bytes * len) */\
            "add %0, %2 \n" /* end of channel 0 */\
            "lea (%2,%0), %3 \n" /* end of channel 1 */\
            "neg %0 \n"\
            body\
            :"+r"(len), "+r"(dst), "+r"(src), "=r"(src1)\
        );\
    }\
}
2150 | |
2151 FLOAT_TO_INT16_INTERLEAVE(3dnow, | |
2152 "1: \n" | |
2153 "pf2id (%2,%0), %%mm0 \n" | |
2154 "pf2id 8(%2,%0), %%mm1 \n" | |
2155 "pf2id (%3,%0), %%mm2 \n" | |
2156 "pf2id 8(%3,%0), %%mm3 \n" | |
2157 "packssdw %%mm1, %%mm0 \n" | |
2158 "packssdw %%mm3, %%mm2 \n" | |
2159 "movq %%mm0, %%mm1 \n" | |
2160 "punpcklwd %%mm2, %%mm0 \n" | |
2161 "punpckhwd %%mm2, %%mm1 \n" | |
2162 "movq %%mm0, (%1,%0)\n" | |
2163 "movq %%mm0, 8(%1,%0)\n" | |
2164 "add $16, %0 \n" | |
2165 "js 1b \n" | |
2166 "femms \n" | |
2167 ) | |
2168 | |
2169 FLOAT_TO_INT16_INTERLEAVE(sse, | |
2170 "1: \n" | |
2171 "cvtps2pi (%2,%0), %%mm0 \n" | |
2172 "cvtps2pi 8(%2,%0), %%mm1 \n" | |
2173 "cvtps2pi (%3,%0), %%mm2 \n" | |
2174 "cvtps2pi 8(%3,%0), %%mm3 \n" | |
2175 "packssdw %%mm1, %%mm0 \n" | |
2176 "packssdw %%mm3, %%mm2 \n" | |
2177 "movq %%mm0, %%mm1 \n" | |
2178 "punpcklwd %%mm2, %%mm0 \n" | |
2179 "punpckhwd %%mm2, %%mm1 \n" | |
2180 "movq %%mm0, (%1,%0)\n" | |
2181 "movq %%mm0, 8(%1,%0)\n" | |
2182 "add $16, %0 \n" | |
2183 "js 1b \n" | |
2184 "emms \n" | |
2185 ) | |
2186 | |
/* Stereo-interleaving float->int16 using SSE2.  cvtps2dq converts 4 floats
 * per channel per iteration entirely in XMM registers (no MMX state, so no
 * emms is needed); packssdw leaves L0..L3 in the low and R0..R3 in the high
 * half of xmm0, which movhlps + punpcklwd turn into L0 R0 L1 R1 L2 R2 L3 R3
 * before one aligned 16-byte store. */
FLOAT_TO_INT16_INTERLEAVE(sse2,
    "1: \n"
    "cvtps2dq (%2,%0), %%xmm0 \n"
    "cvtps2dq (%3,%0), %%xmm1 \n"
    "packssdw %%xmm1, %%xmm0 \n"
    "movhlps %%xmm0, %%xmm1 \n"      /* xmm1 low qword = R0..R3 */
    "punpcklwd %%xmm1, %%xmm0 \n"    /* interleave L/R word pairs */
    "movdqa %%xmm0, (%1,%0) \n"
    "add $16, %0 \n"
    "js 1b \n"
)
2198 | |
2085 | 2199 |
2086 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width); | 2200 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width); |
2087 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width); | 2201 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width); |
2088 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); | 2202 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); |
2089 extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); | 2203 extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); |
2517 #endif | 2631 #endif |
2518 | 2632 |
2519 if(mm_flags & MM_3DNOW){ | 2633 if(mm_flags & MM_3DNOW){ |
2520 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; | 2634 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; |
2521 c->vector_fmul = vector_fmul_3dnow; | 2635 c->vector_fmul = vector_fmul_3dnow; |
2522 if(!(avctx->flags & CODEC_FLAG_BITEXACT)) | 2636 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
2523 c->float_to_int16 = float_to_int16_3dnow; | 2637 c->float_to_int16 = float_to_int16_3dnow; |
2638 c->float_to_int16_interleave = float_to_int16_interleave_3dnow; | |
2639 } | |
2524 } | 2640 } |
2525 if(mm_flags & MM_3DNOWEXT) | 2641 if(mm_flags & MM_3DNOWEXT) |
2526 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; | 2642 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; |
2527 if(mm_flags & MM_SSE){ | 2643 if(mm_flags & MM_SSE){ |
2528 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; | 2644 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; |
2529 c->vector_fmul = vector_fmul_sse; | 2645 c->vector_fmul = vector_fmul_sse; |
2530 c->float_to_int16 = float_to_int16_sse; | 2646 c->float_to_int16 = float_to_int16_sse; |
2647 c->float_to_int16_interleave = float_to_int16_interleave_sse; | |
2531 c->vector_fmul_reverse = vector_fmul_reverse_sse; | 2648 c->vector_fmul_reverse = vector_fmul_reverse_sse; |
2532 c->vector_fmul_add_add = vector_fmul_add_add_sse; | 2649 c->vector_fmul_add_add = vector_fmul_add_add_sse; |
2650 c->vector_fmul_window = vector_fmul_window_sse; | |
2533 } | 2651 } |
2534 if(mm_flags & MM_SSE2){ | 2652 if(mm_flags & MM_SSE2){ |
2535 c->float_to_int16 = float_to_int16_sse2; | 2653 c->float_to_int16 = float_to_int16_sse2; |
2654 c->float_to_int16_interleave = float_to_int16_interleave_sse2; | |
2536 } | 2655 } |
2537 if(mm_flags & MM_3DNOW) | 2656 if(mm_flags & MM_3DNOW) |
2538 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse | 2657 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse |
2539 if(mm_flags & MM_SSE2){ | 2658 if(mm_flags & MM_SSE2){ |
2540 c->add_int16 = add_int16_sse2; | 2659 c->add_int16 = add_int16_sse2; |