Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 7563:8390efaa0c03 libavcodec
SIMD downmix
13% faster AC-3 decoding when downmixing
author | lorenm |
---|---|
date | Wed, 13 Aug 2008 23:33:48 +0000 |
parents | 3d1b177a1b8c |
children | 7cf793954871 |
comparison
equal
deleted
inserted
replaced
7562:ef456ee01ea2 | 7563:8390efaa0c03 |
---|---|
1840 ::"memory" | 1840 ::"memory" |
1841 ); | 1841 ); |
1842 } | 1842 } |
1843 } | 1843 } |
1844 | 1844 |
/* Selector macros: IF1(x) expands to its argument, IF0(x) expands to nothing.
 * They are passed as arguments to MIX5 and MIX_MISC so that individual asm
 * lines can be kept or dropped when the macro is expanded. */
1845 #define IF1(x) x | |
1846 #define IF0(x) | |
1847 | |
/* MIX5(mono, stereo): SSE loop that mixes five input planes into one or two
 * output planes, in place. `mono` and `stereo` are each IF1 or IF0 and select
 * the asm lines specific to that output layout.
 *
 * Uses these names from the expansion site: i, samples, len, matrix.
 *   %0 = i               read-write byte offset; the loop adds 16 per pass and
 *                        repeats while the result is negative (jl), so i is
 *                        expected to start negative. The body always runs at
 *                        least once.
 *   %1 = samples[0]+len  base address that the offset is added to
 *   %2 = matrix          three scalars are read at byte offsets 0, 8 and 24
 *                        (matrix[0][0], matrix[1][0], matrix[3][0] for a
 *                        float[][2] with 4-byte floats) and broadcast to all
 *                        four lanes of xmm5, xmm6 and xmm7 with shufps $0.
 *
 * Per pass, four floats are loaded from each of five planes spaced 0x400 bytes
 * apart (in0..in4) and scaled: in0 and in2 by xmm5, in1 by xmm6, in3 and in4
 * by xmm7. Then:
 *   stereo: plane 0       <- in0*xmm5 + in1*xmm6 + in3*xmm7
 *           plane 0x400   <- in2*xmm5 + in1*xmm6 + in4*xmm7
 *   mono:   plane 0       <- sum of all five scaled inputs
 *
 * All vector accesses use movaps, which requires 16-byte-aligned addresses.
 * The asm declares only a "memory" clobber; the xmm registers it overwrites
 * are not listed. */
1848 #define MIX5(mono,stereo)\ | |
1849 asm volatile(\ | |
1850 "movss 0(%2), %%xmm5 \n"\ | |
1851 "movss 8(%2), %%xmm6 \n"\ | |
1852 "movss 24(%2), %%xmm7 \n"\ | |
1853 "shufps $0, %%xmm5, %%xmm5 \n"\ | |
1854 "shufps $0, %%xmm6, %%xmm6 \n"\ | |
1855 "shufps $0, %%xmm7, %%xmm7 \n"\ | |
1856 "1: \n"\ | |
1857 "movaps (%0,%1), %%xmm0 \n"\ | |
1858 "movaps 0x400(%0,%1), %%xmm1 \n"\ | |
1859 "movaps 0x800(%0,%1), %%xmm2 \n"\ | |
1860 "movaps 0xc00(%0,%1), %%xmm3 \n"\ | |
1861 "movaps 0x1000(%0,%1), %%xmm4 \n"\ | |
1862 "mulps %%xmm5, %%xmm0 \n"\ | |
1863 "mulps %%xmm6, %%xmm1 \n"\ | |
1864 "mulps %%xmm5, %%xmm2 \n"\ | |
1865 "mulps %%xmm7, %%xmm3 \n"\ | |
1866 "mulps %%xmm7, %%xmm4 \n"\ | |
1867 stereo("addps %%xmm1, %%xmm0 \n")\ | |
1868 "addps %%xmm1, %%xmm2 \n"\ | |
1869 "addps %%xmm3, %%xmm0 \n"\ | |
1870 "addps %%xmm4, %%xmm2 \n"\ | |
1871 mono("addps %%xmm2, %%xmm0 \n")\ | |
1872 "movaps %%xmm0, (%0,%1) \n"\ | |
1873 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\ | |
1874 "add $16, %0 \n"\ | |
1875 "jl 1b \n"\ | |
1876 :"+&r"(i)\ | |
1877 :"r"(samples[0]+len), "r"(matrix)\ | |
1878 :"memory"\ | |
1879 ); | |
1880 | |
/* MIX_MISC(stereo): generic SSE downmix loop for an arbitrary number of input
 * planes, in place. `stereo` is IF1 to also produce a second output plane, or
 * IF0 to produce only one.
 *
 * Uses these names from the expansion site: i, j, k, samples, len,
 * matrix_simd, in_ch.
 *   %0 = i                     read-write byte offset; +16 per outer pass,
 *                              repeats while negative (jl)
 *   %1 = j                     scratch: address of the input plane being added
 *   %2 = k                     scratch: negative byte offset into the
 *                              coefficient table, +32 per inner pass
 *   %3 = samples[0]+len        base address that %0 is added to
 *   %4 = matrix_simd+in_ch     address one past the last table entry
 *   %5 = -32*(in_ch-1)         initial value of %2; with 32-byte table entries
 *                              this makes the inner loop start at entry 1
 *
 * Outer loop: load four floats of the first plane and scale them by xmm6
 * (and, for stereo, a copy by xmm7). Inner loop: for each following plane
 * (1024 bytes apart) multiply by the table entry at (%4,%2) (and 16(%4,%2) for
 * stereo) and accumulate. The sums are stored to the first plane and, for
 * stereo, to the plane 1024 bytes after it.
 *
 * xmm6 and xmm7 are read but never loaded inside this asm statement; they must
 * already hold the broadcast coefficients for the first plane. In
 * ac3_downmix_sse the preceding coefficient-splat asm leaves them holding the
 * values of its final iteration.
 * NOTE(review): nothing tells the compiler that xmm6/xmm7 must survive between
 * the two asm statements -- confirm this is safe with the compilers in use.
 * NOTE(review): the inner loop body runs at least once, so this presumably
 * requires in_ch >= 2 -- verify against callers. */
1881 #define MIX_MISC(stereo)\ | |
1882 asm volatile(\ | |
1883 "1: \n"\ | |
1884 "movaps (%3,%0), %%xmm0 \n"\ | |
1885 stereo("movaps %%xmm0, %%xmm1 \n")\ | |
1886 "mulps %%xmm6, %%xmm0 \n"\ | |
1887 stereo("mulps %%xmm7, %%xmm1 \n")\ | |
1888 "lea 1024(%3,%0), %1 \n"\ | |
1889 "mov %5, %2 \n"\ | |
1890 "2: \n"\ | |
1891 "movaps (%1), %%xmm2 \n"\ | |
1892 stereo("movaps %%xmm2, %%xmm3 \n")\ | |
1893 "mulps (%4,%2), %%xmm2 \n"\ | |
1894 stereo("mulps 16(%4,%2), %%xmm3 \n")\ | |
1895 "addps %%xmm2, %%xmm0 \n"\ | |
1896 stereo("addps %%xmm3, %%xmm1 \n")\ | |
1897 "add $1024, %1 \n"\ | |
1898 "add $32, %2 \n"\ | |
1899 "jl 2b \n"\ | |
1900 "movaps %%xmm0, (%3,%0) \n"\ | |
1901 stereo("movaps %%xmm1, 1024(%3,%0) \n")\ | |
1902 "add $16, %0 \n"\ | |
1903 "jl 1b \n"\ | |
1904 :"+&r"(i), "=&r"(j), "=&r"(k)\ | |
1905 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\ | |
1906 :"memory"\ | |
1907 ); | |
1908 | |
/**
 * SSE downmix of in_ch input planes to out_ch output planes, in place.
 *
 * @param samples planes of 256 floats each; the results overwrite the first
 *                one (or two) planes
 * @param matrix  per-input-channel coefficient pairs, matrix[ch][0] and
 *                matrix[ch][1]
 * @param out_ch  number of output channels; 2 selects the stereo paths, any
 *                other value takes the single-output paths
 * @param in_ch   number of input channels
 * @param len     number of samples per plane to process. The asm loops step
 *                16 bytes at a time with aligned loads/stores, so len is
 *                presumably a multiple of 4 and the planes 16-byte aligned
 *                -- TODO confirm against callers.
 */
1909 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len) | |
1910 { | |
/* Integer view of the coefficients so they can be tested for all-zero bits
 * and for bitwise equality with |, ^ and ==. */
1911 int (*matrix_cmp)[2] = (int(*)[2])matrix; | |
1912 intptr_t i,j,k; | |
1913 | |
/* Negative byte offset: the asm loops index from samples[0]+len and count i
 * up towards zero. */
1914 i = -len*sizeof(float); | |
/* Fast path 5 -> 2: taken when matrix[0][1], matrix[2][0], matrix[3][1] and
 * matrix[4][0] are all-zero bits, matrix[1][0] == matrix[1][1] and
 * matrix[0][0] == matrix[2][1] (bitwise).
 * NOTE(review): MIX5 scales both input 3 and input 4 by matrix[3][0], but
 * this condition does not compare matrix[3][0] with matrix[4][1] -- confirm
 * callers guarantee they are equal. */
1915 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) { | |
1916 MIX5(IF0,IF1); | |
/* Fast path 5 -> 1: taken when inputs 0 and 2 share a coefficient and inputs
 * 3 and 4 share a coefficient (bitwise comparison). */
1917 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) { | |
1918 MIX5(IF1,IF0); | |
1919 } else { | |
/* Generic path: first expand every coefficient into a 4-lane vector.
 * matrix_simd[ch][0] holds matrix[ch][0] in all four lanes and
 * matrix_simd[ch][1] holds matrix[ch][1]. */
1920 DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]); | |
/* j starts at the byte size of matrix and steps down by 8 (one coefficient
 * pair) until it reaches 0. The store address scales j by 4, i.e. 32 bytes
 * per matrix_simd entry. The last pass (j == 0) leaves matrix[0][0] and
 * matrix[0][1] broadcast in xmm6 and xmm7, which MIX_MISC reads. */
1921 j = 2*in_ch*sizeof(float); | |
1922 asm volatile( | |
1923 "1: \n" | |
1924 "sub $8, %0 \n" | |
1925 "movss (%2,%0), %%xmm6 \n" | |
1926 "movss 4(%2,%0), %%xmm7 \n" | |
1927 "shufps $0, %%xmm6, %%xmm6 \n" | |
1928 "shufps $0, %%xmm7, %%xmm7 \n" | |
1929 "movaps %%xmm6, (%1,%0,4) \n" | |
1930 "movaps %%xmm7, 16(%1,%0,4) \n" | |
1931 "jg 1b \n" | |
1932 :"+&r"(j) | |
1933 :"r"(matrix_simd), "r"(matrix) | |
1934 :"memory" | |
1935 ); | |
/* Run the generic mixing loop, with the second output plane enabled only for
 * out_ch == 2. */
1936 if(out_ch == 2) { | |
1937 MIX_MISC(IF1); | |
1938 } else { | |
1939 MIX_MISC(IF0); | |
1940 } | |
1941 } | |
1942 } | |
1943 | |
1845 static void vector_fmul_3dnow(float *dst, const float *src, int len){ | 1944 static void vector_fmul_3dnow(float *dst, const float *src, int len){ |
1846 x86_reg i = (len-4)*4; | 1945 x86_reg i = (len-4)*4; |
1847 asm volatile( | 1946 asm volatile( |
1848 "1: \n\t" | 1947 "1: \n\t" |
1849 "movq (%1,%0), %%mm0 \n\t" | 1948 "movq (%1,%0), %%mm0 \n\t" |
2680 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; | 2779 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; |
2681 c->vector_fmul_window = vector_fmul_window_3dnow2; | 2780 c->vector_fmul_window = vector_fmul_window_3dnow2; |
2682 } | 2781 } |
2683 if(mm_flags & MM_SSE){ | 2782 if(mm_flags & MM_SSE){ |
2684 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; | 2783 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; |
2784 c->ac3_downmix = ac3_downmix_sse; | |
2685 c->vector_fmul = vector_fmul_sse; | 2785 c->vector_fmul = vector_fmul_sse; |
2686 c->vector_fmul_reverse = vector_fmul_reverse_sse; | 2786 c->vector_fmul_reverse = vector_fmul_reverse_sse; |
2687 c->vector_fmul_add_add = vector_fmul_add_add_sse; | 2787 c->vector_fmul_add_add = vector_fmul_add_add_sse; |
2688 c->vector_fmul_window = vector_fmul_window_sse; | 2788 c->vector_fmul_window = vector_fmul_window_sse; |
2689 c->float_to_int16 = float_to_int16_sse; | 2789 c->float_to_int16 = float_to_int16_sse; |