Mercurial > mplayer.hg
comparison postproc/swscale_template.c @ 3352:64121e8a43f5
print more info if -v
use new horizontal MMX scaler instead of the old x86 asm one if MMX2 can't be used (FAST_BILINEAR only)
fixed overflow in init function ... using double precision fp now :)
using C scaler for the last 1-2 lines if there is a chance to write over the end of the dst array
author | michael |
---|---|
date | Thu, 06 Dec 2001 19:07:25 +0000 |
parents | 20065c9b0f09 |
children | 33c560ffd3dc |
comparison
equal
deleted
inserted
replaced
3351:091cdd056ca4 | 3352:64121e8a43f5 |
---|---|
670 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize), | 670 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize), |
671 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW) | 671 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW) |
672 : "%eax", "%edx", "%esi" | 672 : "%eax", "%edx", "%esi" |
673 ); | 673 ); |
674 #else | 674 #else |
675 //FIXME Optimize (just quickly writen not opti..) | 675 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, |
676 int i; | 676 chrFilter, chrSrc, chrFilterSize, |
677 for(i=0; i<dstW; i++) | 677 dest, uDest, vDest, dstW); |
678 { | |
679 int val=0; | |
680 int j; | |
681 for(j=0; j<lumFilterSize; j++) | |
682 val += lumSrc[j][i] * lumFilter[j]; | |
683 | |
684 dest[i]= MIN(MAX(val>>19, 0), 255); | |
685 } | |
686 | |
687 if(uDest != NULL) | |
688 for(i=0; i<(dstW>>1); i++) | |
689 { | |
690 int u=0; | |
691 int v=0; | |
692 int j; | |
693 for(j=0; j<lumFilterSize; j++) | |
694 { | |
695 u += chrSrc[j][i] * chrFilter[j]; | |
696 v += chrSrc[j][i + 2048] * chrFilter[j]; | |
697 } | |
698 | |
699 uDest[i]= MIN(MAX(u>>19, 0), 255); | |
700 vDest[i]= MIN(MAX(v>>19, 0), 255); | |
701 } | |
702 #endif | 678 #endif |
703 } | 679 } |
704 | 680 |
705 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, | 681 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, |
706 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW) | 682 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW) |
834 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | 810 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) |
835 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | 811 : "%eax", "%ebx", "%ecx", "%edx", "%esi" |
836 ); | 812 ); |
837 } | 813 } |
838 #else | 814 #else |
839 if(dstbpp==32) | 815 yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize, |
840 { | 816 chrFilter, chrSrc, chrFilterSize, |
841 int i; | 817 dest, dstW, dstbpp); |
842 for(i=0; i<(dstW>>1); i++){ | 818 |
843 int j; | |
844 int Y1=0; | |
845 int Y2=0; | |
846 int U=0; | |
847 int V=0; | |
848 int Cb, Cr, Cg; | |
849 for(j=0; j<lumFilterSize; j++) | |
850 { | |
851 Y1 += lumSrc[j][2*i] * lumFilter[j]; | |
852 Y2 += lumSrc[j][2*i+1] * lumFilter[j]; | |
853 } | |
854 for(j=0; j<chrFilterSize; j++) | |
855 { | |
856 U += chrSrc[j][i] * chrFilter[j]; | |
857 V += chrSrc[j][i+2048] * chrFilter[j]; | |
858 } | |
859 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ]; | |
860 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; | |
861 U >>= 19; | |
862 V >>= 19; | |
863 | |
864 Cb= clip_yuvtab_40cf[U+ 256]; | |
865 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; | |
866 Cr= clip_yuvtab_3343[V+ 256]; | |
867 | |
868 dest[8*i+0]=clip_table[((Y1 + Cb) >>13)]; | |
869 dest[8*i+1]=clip_table[((Y1 + Cg) >>13)]; | |
870 dest[8*i+2]=clip_table[((Y1 + Cr) >>13)]; | |
871 | |
872 dest[8*i+4]=clip_table[((Y2 + Cb) >>13)]; | |
873 dest[8*i+5]=clip_table[((Y2 + Cg) >>13)]; | |
874 dest[8*i+6]=clip_table[((Y2 + Cr) >>13)]; | |
875 } | |
876 } | |
877 else if(dstbpp==24) | |
878 { | |
879 int i; | |
880 for(i=0; i<(dstW>>1); i++){ | |
881 int j; | |
882 int Y1=0; | |
883 int Y2=0; | |
884 int U=0; | |
885 int V=0; | |
886 int Cb, Cr, Cg; | |
887 for(j=0; j<lumFilterSize; j++) | |
888 { | |
889 Y1 += lumSrc[j][2*i] * lumFilter[j]; | |
890 Y2 += lumSrc[j][2*i+1] * lumFilter[j]; | |
891 } | |
892 for(j=0; j<chrFilterSize; j++) | |
893 { | |
894 U += chrSrc[j][i] * chrFilter[j]; | |
895 V += chrSrc[j][i+2048] * chrFilter[j]; | |
896 } | |
897 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ]; | |
898 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; | |
899 U >>= 19; | |
900 V >>= 19; | |
901 | |
902 Cb= clip_yuvtab_40cf[U+ 256]; | |
903 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; | |
904 Cr= clip_yuvtab_3343[V+ 256]; | |
905 | |
906 dest[0]=clip_table[((Y1 + Cb) >>13)]; | |
907 dest[1]=clip_table[((Y1 + Cg) >>13)]; | |
908 dest[2]=clip_table[((Y1 + Cr) >>13)]; | |
909 | |
910 dest[3]=clip_table[((Y2 + Cb) >>13)]; | |
911 dest[4]=clip_table[((Y2 + Cg) >>13)]; | |
912 dest[5]=clip_table[((Y2 + Cr) >>13)]; | |
913 dest+=6; | |
914 } | |
915 } | |
916 else if(dstbpp==16) | |
917 { | |
918 int i; | |
919 for(i=0; i<(dstW>>1); i++){ | |
920 int j; | |
921 int Y1=0; | |
922 int Y2=0; | |
923 int U=0; | |
924 int V=0; | |
925 int Cb, Cr, Cg; | |
926 for(j=0; j<lumFilterSize; j++) | |
927 { | |
928 Y1 += lumSrc[j][2*i] * lumFilter[j]; | |
929 Y2 += lumSrc[j][2*i+1] * lumFilter[j]; | |
930 } | |
931 for(j=0; j<chrFilterSize; j++) | |
932 { | |
933 U += chrSrc[j][i] * chrFilter[j]; | |
934 V += chrSrc[j][i+2048] * chrFilter[j]; | |
935 } | |
936 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ]; | |
937 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; | |
938 U >>= 19; | |
939 V >>= 19; | |
940 | |
941 Cb= clip_yuvtab_40cf[U+ 256]; | |
942 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; | |
943 Cr= clip_yuvtab_3343[V+ 256]; | |
944 | |
945 ((uint16_t*)dest)[2*i] = | |
946 clip_table16b[(Y1 + Cb) >>13] | | |
947 clip_table16g[(Y1 + Cg) >>13] | | |
948 clip_table16r[(Y1 + Cr) >>13]; | |
949 | |
950 ((uint16_t*)dest)[2*i+1] = | |
951 clip_table16b[(Y2 + Cb) >>13] | | |
952 clip_table16g[(Y2 + Cg) >>13] | | |
953 clip_table16r[(Y2 + Cr) >>13]; | |
954 } | |
955 } | |
956 else if(dstbpp==15) | |
957 { | |
958 int i; | |
959 for(i=0; i<(dstW>>1); i++){ | |
960 int j; | |
961 int Y1=0; | |
962 int Y2=0; | |
963 int U=0; | |
964 int V=0; | |
965 int Cb, Cr, Cg; | |
966 for(j=0; j<lumFilterSize; j++) | |
967 { | |
968 Y1 += lumSrc[j][2*i] * lumFilter[j]; | |
969 Y2 += lumSrc[j][2*i+1] * lumFilter[j]; | |
970 } | |
971 for(j=0; j<chrFilterSize; j++) | |
972 { | |
973 U += chrSrc[j][i] * chrFilter[j]; | |
974 V += chrSrc[j][i+2048] * chrFilter[j]; | |
975 } | |
976 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ]; | |
977 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; | |
978 U >>= 19; | |
979 V >>= 19; | |
980 | |
981 Cb= clip_yuvtab_40cf[U+ 256]; | |
982 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; | |
983 Cr= clip_yuvtab_3343[V+ 256]; | |
984 | |
985 ((uint16_t*)dest)[2*i] = | |
986 clip_table15b[(Y1 + Cb) >>13] | | |
987 clip_table15g[(Y1 + Cg) >>13] | | |
988 clip_table15r[(Y1 + Cr) >>13]; | |
989 | |
990 ((uint16_t*)dest)[2*i+1] = | |
991 clip_table15b[(Y2 + Cb) >>13] | | |
992 clip_table15g[(Y2 + Cg) >>13] | | |
993 clip_table15r[(Y2 + Cr) >>13]; | |
994 } | |
995 } | |
996 #endif | 819 #endif |
997 } //!FULL_UV_IPOL | 820 } //!FULL_UV_IPOL |
998 } | 821 } |
999 | 822 |
1000 | 823 |
1371 */ | 1194 */ |
1372 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, | 1195 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, |
1373 uint8_t *dest, int dstW, int uvalpha, int dstbpp) | 1196 uint8_t *dest, int dstW, int uvalpha, int dstbpp) |
1374 { | 1197 { |
1375 int uvalpha1=uvalpha^4095; | 1198 int uvalpha1=uvalpha^4095; |
1376 const int yalpha=0; | |
1377 const int yalpha1=0; | 1199 const int yalpha1=0; |
1378 | 1200 |
1379 if(fullUVIpol || allwaysIpol) | 1201 if(fullUVIpol || allwaysIpol) |
1380 { | 1202 { |
1381 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp); | 1203 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp); |
1634 "pmaddwd %%mm6, %%mm0 \n\t" | 1456 "pmaddwd %%mm6, %%mm0 \n\t" |
1635 "packssdw %%mm0, %%mm0 \n\t" | 1457 "packssdw %%mm0, %%mm0 \n\t" |
1636 "movd %%mm0, (%4, %%ebp) \n\t" | 1458 "movd %%mm0, (%4, %%ebp) \n\t" |
1637 "addl $4, %%ebp \n\t" | 1459 "addl $4, %%ebp \n\t" |
1638 " jnc 1b \n\t" | 1460 " jnc 1b \n\t" |
1639 | 1461 |
1640 "popl %%ebp \n\t" | 1462 "popl %%ebp \n\t" |
1641 : "+a" (counter) | 1463 : "+a" (counter) |
1642 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | 1464 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) |
1643 : "%ebx" | 1465 : "%ebx" |
1644 ); | 1466 ); |
1762 #endif | 1584 #endif |
1763 } | 1585 } |
1764 // *** horizontal scale Y line to temp buffer | 1586 // *** horizontal scale Y line to temp buffer |
1765 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc) | 1587 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc) |
1766 { | 1588 { |
1589 #ifdef HAVE_MMX | |
1590 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one) | |
1591 if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed)) | |
1592 #else | |
1767 if(sws_flags != SWS_FAST_BILINEAR) | 1593 if(sws_flags != SWS_FAST_BILINEAR) |
1594 #endif | |
1768 { | 1595 { |
1769 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | 1596 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); |
1770 } | 1597 } |
1771 else // Fast Bilinear upscale / crap downscale | 1598 else // Fast Bilinear upscale / crap downscale |
1772 { | 1599 { |
1883 } | 1710 } |
1884 | 1711 |
1885 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, | 1712 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, |
1886 uint8_t *src1, uint8_t *src2, int srcW, int xInc) | 1713 uint8_t *src1, uint8_t *src2, int srcW, int xInc) |
1887 { | 1714 { |
1715 #ifdef HAVE_MMX | |
1716 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one) | |
1717 if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed)) | |
1718 #else | |
1888 if(sws_flags != SWS_FAST_BILINEAR) | 1719 if(sws_flags != SWS_FAST_BILINEAR) |
1720 #endif | |
1889 { | 1721 { |
1890 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | 1722 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); |
1891 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | 1723 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); |
1892 } | 1724 } |
1893 else // Fast Bilinear upscale / crap downscale | 1725 else // Fast Bilinear upscale / crap downscale |
2024 } | 1856 } |
2025 #endif | 1857 #endif |
2026 } | 1858 } |
2027 } | 1859 } |
2028 | 1860 |
2029 static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *filterSize, int xInc, | 1861 static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc, |
2030 int srcW, int dstW, int filterAlign, int one) | 1862 int srcW, int dstW, int filterAlign, int one) |
2031 { | 1863 { |
2032 int i; | 1864 int i; |
1865 double filter[8000]; | |
2033 #ifdef HAVE_MMX | 1866 #ifdef HAVE_MMX |
2034 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions) | 1867 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions) |
2035 #endif | 1868 #endif |
2036 | 1869 |
2037 if(ABS(xInc - 0x10000) <10) // unscaled | 1870 if(ABS(xInc - 0x10000) <10) // unscaled |
2038 { | 1871 { |
2039 int i; | 1872 int i; |
2064 | 1897 |
2065 filterPos[i]= xx; | 1898 filterPos[i]= xx; |
2066 if(sws_flags == SWS_BICUBIC) | 1899 if(sws_flags == SWS_BICUBIC) |
2067 { | 1900 { |
2068 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16); | 1901 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16); |
2069 // int coeff; | 1902 double y1,y2,y3,y4; |
2070 int y1,y2,y3,y4; | |
2071 double A= -0.75; | 1903 double A= -0.75; |
2072 // Equation is from VirtualDub | 1904 // Equation is from VirtualDub |
2073 y1 = (int)floor(0.5 + ( + A*d - 2.0*A*d*d + A*d*d*d) * 16384.0); | 1905 y1 = ( + A*d - 2.0*A*d*d + A*d*d*d); |
2074 y2 = (int)floor(0.5 + (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d) * 16384.0); | 1906 y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d); |
2075 y3 = (int)floor(0.5 + ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d) * 16384.0); | 1907 y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d); |
2076 y4 = (int)floor(0.5 + ( + A*d*d - A*d*d*d) * 16384.0); | 1908 y4 = ( + A*d*d - A*d*d*d); |
2077 | 1909 |
2078 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); | 1910 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
2079 filter[i*(*filterSize) + 0]= y1; | 1911 filter[i*(*filterSize) + 0]= y1; |
2080 filter[i*(*filterSize) + 1]= y2; | 1912 filter[i*(*filterSize) + 1]= y2; |
2081 filter[i*(*filterSize) + 2]= y3; | 1913 filter[i*(*filterSize) + 2]= y3; |
2085 else | 1917 else |
2086 { | 1918 { |
2087 for(j=0; j<*filterSize; j++) | 1919 for(j=0; j<*filterSize; j++) |
2088 { | 1920 { |
2089 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16); | 1921 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16); |
2090 int coeff; | 1922 double coeff= 1.0 - d; |
2091 coeff= (int)(0.5 + (1.0 - d)*(1<<14)); | |
2092 if(coeff<0) coeff=0; | 1923 if(coeff<0) coeff=0; |
2093 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); | 1924 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
2094 filter[i*(*filterSize) + j]= coeff; | 1925 filter[i*(*filterSize) + j]= coeff; |
2095 xx++; | 1926 xx++; |
2096 } | 1927 } |
2114 | 1945 |
2115 filterPos[i]= xx; | 1946 filterPos[i]= xx; |
2116 for(j=0; j<*filterSize; j++) | 1947 for(j=0; j<*filterSize; j++) |
2117 { | 1948 { |
2118 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc; | 1949 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc; |
2119 int coeff; | 1950 double coeff; |
2120 if(sws_flags == SWS_BICUBIC) | 1951 if(sws_flags == SWS_BICUBIC) |
2121 { | 1952 { |
2122 double A= -0.75; | 1953 double A= -0.75; |
2123 // d*=2; | 1954 // d*=2; |
2124 // Equation is from VirtualDub | 1955 // Equation is from VirtualDub |
2125 if(d<1.0) | 1956 if(d<1.0) |
2126 coeff = (int)floor(0.5 + (1.0 - (A+3.0)*d*d | 1957 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d); |
2127 + (A+2.0)*d*d*d) * (1<<14)); | |
2128 else if(d<2.0) | 1958 else if(d<2.0) |
2129 coeff = (int)floor(0.5 + (-4.0*A + 8.0*A*d | 1959 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d); |
2130 - 5.0*A*d*d + A*d*d*d) * (1<<14)); | |
2131 else | 1960 else |
2132 coeff=0; | 1961 coeff=0.0; |
2133 } | 1962 } |
2134 else | 1963 else |
2135 { | 1964 { |
2136 coeff= (int)(0.5 + (1.0 - d)*(1<<14)); | 1965 coeff= 1.0 - d; |
2137 if(coeff<0) coeff=0; | 1966 if(coeff<0) coeff=0; |
2138 } | 1967 } |
2139 // if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc); | 1968 // if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
2140 filter[i*(*filterSize) + j]= coeff; | 1969 filter[i*(*filterSize) + j]= coeff; |
2141 xx++; | 1970 xx++; |
2158 filter[i*(*filterSize) + j]=0; | 1987 filter[i*(*filterSize) + j]=0; |
2159 } | 1988 } |
2160 filterPos[i]= 0; | 1989 filterPos[i]= 0; |
2161 } | 1990 } |
2162 | 1991 |
2163 if(filterPos[i] + *filterSize > srcW) | 1992 if(filterPos[i] + (*filterSize) > srcW) |
2164 { | 1993 { |
2165 int shift= filterPos[i] + *filterSize - srcW; | 1994 int shift= filterPos[i] + (*filterSize) - srcW; |
2166 // Move filter coeffs right to compensate for filterPos | 1995 // Move filter coeffs right to compensate for filterPos |
2167 for(j=*filterSize-2; j>=0; j--) | 1996 for(j=(*filterSize)-2; j>=0; j--) |
2168 { | 1997 { |
2169 int right= MIN(j + shift, *filterSize-1); | 1998 int right= MIN(j + shift, (*filterSize)-1); |
2170 filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j]; | 1999 filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j]; |
2171 filter[i*(*filterSize) +j]=0; | 2000 filter[i*(*filterSize) +j]=0; |
2172 } | 2001 } |
2173 filterPos[i]= srcW - *filterSize; | 2002 filterPos[i]= srcW - (*filterSize); |
2174 } | 2003 } |
2175 } | 2004 } |
2176 | 2005 |
2177 //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end | 2006 //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end |
2178 // and skip these than later | 2007 // and skip these than later |
2188 sum+= filter[i*(*filterSize) + j]; | 2017 sum+= filter[i*(*filterSize) + j]; |
2189 } | 2018 } |
2190 scale/= sum; | 2019 scale/= sum; |
2191 for(j=0; j<*filterSize; j++) | 2020 for(j=0; j<*filterSize; j++) |
2192 { | 2021 { |
2193 filter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale); | 2022 dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale); |
2194 } | 2023 } |
2195 } | 2024 } |
2196 } | 2025 } |
2197 | 2026 |
2198 #ifdef HAVE_MMX2 | 2027 #ifdef HAVE_MMX2 |
2337 static int lumBufIndex=0; | 2166 static int lumBufIndex=0; |
2338 static int chrBufIndex=0; | 2167 static int chrBufIndex=0; |
2339 | 2168 |
2340 static int firstTime=1; | 2169 static int firstTime=1; |
2341 | 2170 |
2342 int widthAlign= dstbpp==12 ? 16 : 8; | 2171 const int widthAlign= dstbpp==12 ? 16 : 8; |
2343 if(((dstW + widthAlign-1)&(~(widthAlign-1))) > dststride) | 2172 const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4) |
2173 const int over= dstbpp==12 ? (((dstW+15)&(~15))) - dststride | |
2174 : (((dstW+7)&(~7)))*bytespp - dststride; | |
2175 if(dststride%widthAlign !=0 ) | |
2344 { | 2176 { |
2345 dstW&= ~(widthAlign-1); | |
2346 if(firstTime) | 2177 if(firstTime) |
2347 fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n" | 2178 fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n" |
2348 "SwScaler: ->lowering width to compensate, new width=%d\n" | 2179 "SwScaler: ->cannot do aligned memory acesses anymore\n", |
2349 "SwScaler: ->cannot do aligned memory acesses anymore\n", | 2180 widthAlign); |
2350 widthAlign, dstW); | |
2351 } | 2181 } |
2182 | |
2183 if(over>0) | |
2184 { | |
2185 if(firstTime) | |
2186 fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n" | |
2187 "SwScaler: and dststride is not large enough to handle %d extra bytes\n" | |
2188 "SwScaler: ->using unoptimized C version for last line(s)\n", | |
2189 over); | |
2190 } | |
2191 | |
2192 | |
2352 | 2193 |
2353 //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH); | 2194 //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH); |
2354 //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH); | 2195 //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH); |
2355 | 2196 |
2356 #ifdef HAVE_MMX2 | 2197 #ifdef HAVE_MMX2 |
2357 canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0; | 2198 canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0; |
2358 if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR) | 2199 if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR) |
2359 { | 2200 { |
2360 if(firstTime) //FIXME only if verbose ? | 2201 if(firstTime) |
2361 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n"); | 2202 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n"); |
2362 } | 2203 } |
2204 #else | |
2205 canMMX2BeUsed=0; // should be 0 anyway but ... | |
2363 #endif | 2206 #endif |
2364 | 2207 |
2365 if(firstTime) | 2208 if(firstTime) |
2366 { | 2209 { |
2367 #if defined (DITHER1XBPP) && defined (HAVE_MMX) | 2210 #if defined (DITHER1XBPP) && defined (HAVE_MMX) |
2396 #elif defined (HAVE_3DNOW) | 2239 #elif defined (HAVE_3DNOW) |
2397 fprintf(stderr, "using 3DNOW\n"); | 2240 fprintf(stderr, "using 3DNOW\n"); |
2398 #elif defined (HAVE_MMX) | 2241 #elif defined (HAVE_MMX) |
2399 fprintf(stderr, "using MMX\n"); | 2242 fprintf(stderr, "using MMX\n"); |
2400 #elif defined (ARCH_X86) | 2243 #elif defined (ARCH_X86) |
2401 fprintf(stderr, "using X86 ASM2\n"); | 2244 fprintf(stderr, "using X86 ASM\n"); |
2402 #else | 2245 #else |
2403 fprintf(stderr, "using C\n"); | 2246 fprintf(stderr, "using C\n"); |
2404 #endif | 2247 #endif |
2405 } | 2248 } |
2406 | 2249 |
2411 // would be like the vertical one, but that would require some special code for the | 2254 // would be like the vertical one, but that would require some special code for the |
2412 // first and last pixel | 2255 // first and last pixel |
2413 if(sws_flags==SWS_FAST_BILINEAR) | 2256 if(sws_flags==SWS_FAST_BILINEAR) |
2414 { | 2257 { |
2415 if(canMMX2BeUsed) lumXInc+= 20; | 2258 if(canMMX2BeUsed) lumXInc+= 20; |
2259 #ifndef HAVE_MMX //we dont use the x86asm scaler if mmx is available | |
2416 else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20; | 2260 else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20; |
2261 #endif | |
2417 } | 2262 } |
2418 | 2263 |
2419 if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW; | 2264 if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW; |
2420 else chrXInc= lumXInc, chrDstW= dstW>>1; | 2265 else chrXInc= lumXInc, chrDstW= (dstW+1)>>1; |
2421 | 2266 |
2422 if(dstbpp==12) chrYInc= lumYInc, chrDstH= dstH>>1; | 2267 if(dstbpp==12) chrYInc= lumYInc, chrDstH= (dstH+1)>>1; |
2423 else chrYInc= lumYInc>>1, chrDstH= dstH; | 2268 else chrYInc= lumYInc>>1, chrDstH= dstH; |
2424 | 2269 |
2425 // force calculation of the horizontal interpolation of the first line | 2270 // force calculation of the horizontal interpolation of the first line |
2426 | 2271 |
2427 if(srcSliceY ==0){ | 2272 if(srcSliceY ==0){ |
2438 #else | 2283 #else |
2439 const int filterAlign=1; | 2284 const int filterAlign=1; |
2440 #endif | 2285 #endif |
2441 oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags; | 2286 oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags; |
2442 | 2287 |
2443 if(sws_flags != SWS_FAST_BILINEAR) | 2288 RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc, |
2444 { | 2289 srcW , dstW , filterAlign, 1<<14); |
2445 RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc, | 2290 RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc, |
2446 srcW , dstW , filterAlign, 1<<14); | 2291 (srcW+1)>>1, chrDstW, filterAlign, 1<<14); |
2447 RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc, | |
2448 srcW>>1, chrDstW, filterAlign, 1<<14); | |
2449 } | |
2450 | 2292 |
2451 #ifdef HAVE_MMX2 | 2293 #ifdef HAVE_MMX2 |
2452 // cant downscale !!! | 2294 // cant downscale !!! |
2453 if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR) | 2295 if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR) |
2454 { | 2296 { |
2468 for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]); | 2310 for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]); |
2469 | 2311 |
2470 RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc, | 2312 RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc, |
2471 srcH , dstH, 1, (1<<12)-4); | 2313 srcH , dstH, 1, (1<<12)-4); |
2472 RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc, | 2314 RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc, |
2473 srcH>>1, chrDstH, 1, (1<<12)-4); | 2315 (srcH+1)>>1, chrDstH, 1, (1<<12)-4); |
2474 | 2316 |
2475 // Calculate Buffer Sizes so that they wont run out while handling these damn slices | 2317 // Calculate Buffer Sizes so that they wont run out while handling these damn slices |
2476 vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize; | 2318 vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize; |
2477 for(i=0; i<dstH; i++) | 2319 for(i=0; i<dstH; i++) |
2478 { | 2320 { |
2505 | 2347 |
2506 for(i=0; i<vChrFilterSize*chrDstH; i++) | 2348 for(i=0; i<vChrFilterSize*chrDstH; i++) |
2507 chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]= | 2349 chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]= |
2508 vChrFilter[i]; | 2350 vChrFilter[i]; |
2509 #endif | 2351 #endif |
2352 } | |
2353 | |
2354 if(firstTime && verbose) | |
2355 { | |
2356 #ifdef HAVE_MMX2 | |
2357 int mmx2=1; | |
2358 #else | |
2359 int mmx2=0; | |
2360 #endif | |
2361 #ifdef HAVE_MMX | |
2362 int mmx=1; | |
2363 #else | |
2364 int mmx=0; | |
2365 #endif | |
2366 | |
2367 #ifdef HAVE_MMX | |
2368 if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR) | |
2369 printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n"); | |
2370 else | |
2371 { | |
2372 if(hLumFilterSize==4) | |
2373 printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n"); | |
2374 else if(hLumFilterSize==8) | |
2375 printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n"); | |
2376 else | |
2377 printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n"); | |
2378 | |
2379 if(hChrFilterSize==4) | |
2380 printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n"); | |
2381 else if(hChrFilterSize==8) | |
2382 printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n"); | |
2383 else | |
2384 printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n"); | |
2385 } | |
2386 #elif defined (ARCH_X86) | |
2387 printf("SwScaler: using X86-Asm scaler for horizontal scaling\n"); | |
2388 #else | |
2389 if(sws_flags==SWS_FAST_BILINEAR) | |
2390 printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n"); | |
2391 else | |
2392 printf("SwScaler: using C scaler for horizontal scaling\n"); | |
2393 #endif | |
2394 | |
2395 if(dstbpp==12) | |
2396 { | |
2397 if(vLumFilterSize==1) | |
2398 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C"); | |
2399 else | |
2400 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C"); | |
2401 } | |
2402 else | |
2403 { | |
2404 if(vLumFilterSize==1 && vChrFilterSize==2) | |
2405 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n" | |
2406 "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C"); | |
2407 else if(vLumFilterSize==2 && vChrFilterSize==2) | |
2408 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C"); | |
2409 else | |
2410 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C"); | |
2411 } | |
2412 | |
2413 if(dstbpp==24) | |
2414 printf("SwScaler: using %s YV12->BGR24 Converter\n", | |
2415 mmx2 ? "MMX2" : (mmx ? "MMX" : "C")); | |
2416 else | |
2417 printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp); | |
2418 | |
2419 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH); | |
2510 } | 2420 } |
2511 | 2421 |
2512 lastInLumBuf= -1; | 2422 lastInLumBuf= -1; |
2513 lastInChrBuf= -1; | 2423 lastInChrBuf= -1; |
2514 } // if(firstLine) | 2424 } // if(firstLine) |
2555 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2]; | 2465 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2]; |
2556 chrBufIndex++; | 2466 chrBufIndex++; |
2557 ASSERT(chrBufIndex < 2*vChrBufSize) | 2467 ASSERT(chrBufIndex < 2*vChrBufSize) |
2558 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) | 2468 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) |
2559 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) | 2469 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) |
2560 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc); | 2470 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc); |
2561 lastInChrBuf++; | 2471 lastInChrBuf++; |
2562 } | 2472 } |
2563 //wrap buf index around to stay inside the ring buffer | 2473 //wrap buf index around to stay inside the ring buffer |
2564 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | 2474 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; |
2565 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | 2475 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; |
2588 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2]; | 2498 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2]; |
2589 chrBufIndex++; | 2499 chrBufIndex++; |
2590 ASSERT(chrBufIndex < 2*vChrBufSize) | 2500 ASSERT(chrBufIndex < 2*vChrBufSize) |
2591 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) | 2501 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) |
2592 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) | 2502 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) |
2593 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc); | 2503 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc); |
2594 lastInChrBuf++; | 2504 lastInChrBuf++; |
2595 } | 2505 } |
2596 //wrap buf index around to stay inside the ring buffer | 2506 //wrap buf index around to stay inside the ring buffer |
2597 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | 2507 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; |
2598 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | 2508 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; |
2603 b5Dither= dither8[dstY&1]; | 2513 b5Dither= dither8[dstY&1]; |
2604 g6Dither= dither4[dstY&1]; | 2514 g6Dither= dither4[dstY&1]; |
2605 g5Dither= dither8[dstY&1]; | 2515 g5Dither= dither8[dstY&1]; |
2606 r5Dither= dither8[(dstY+1)&1]; | 2516 r5Dither= dither8[(dstY+1)&1]; |
2607 #endif | 2517 #endif |
2608 | 2518 if(dstY < dstH-2 || over<=0) |
2519 { | |
2609 if(dstbpp==12) //YV12 | 2520 if(dstbpp==12) //YV12 |
2610 { | 2521 { |
2611 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | 2522 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi |
2612 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 | 2523 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 |
2613 { | 2524 { |
2655 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | 2566 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, |
2656 dest, dstW, dstbpp, | 2567 dest, dstW, dstbpp, |
2657 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4); | 2568 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4); |
2658 } | 2569 } |
2659 } | 2570 } |
2571 } | |
2572 else // hmm looks like we cant use MMX here without overwriting this arrays tail | |
2573 { | |
2574 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2575 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2576 if(dstbpp==12) //YV12 | |
2577 { | |
2578 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
2579 yuv2yuvXinC( | |
2580 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2581 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2582 dest, uDest, vDest, dstW); | |
2583 } | |
2584 else | |
2585 { | |
2586 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2587 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
2588 yuv2rgbXinC( | |
2589 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
2590 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2591 dest, dstW, dstbpp); | |
2592 } | |
2593 } | |
2660 } | 2594 } |
2661 | 2595 |
2662 #ifdef HAVE_MMX | 2596 #ifdef HAVE_MMX |
2663 __asm __volatile(SFENCE:::"memory"); | 2597 __asm __volatile(SFENCE:::"memory"); |
2664 __asm __volatile(EMMS:::"memory"); | 2598 __asm __volatile(EMMS:::"memory"); |