comparison postproc/swscale_template.c @ 3352:64121e8a43f5

Print more info if -v. Use new horizontal MMX scaler instead of old x86asm if MMX2 can't be used (FAST_BILINEAR only). Fixed overflow in init function ... using double precision fp now :) Using C scaler for the last 1-2 lines if there is a chance to write over the end of the dst array.
author michael
date Thu, 06 Dec 2001 19:07:25 +0000
parents 20065c9b0f09
children 33c560ffd3dc
comparison
equal deleted inserted replaced
3351:091cdd056ca4 3352:64121e8a43f5
670 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize), 670 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
671 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW) 671 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
672 : "%eax", "%edx", "%esi" 672 : "%eax", "%edx", "%esi"
673 ); 673 );
674 #else 674 #else
675 //FIXME Optimize (just quickly writen not opti..) 675 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
676 int i; 676 chrFilter, chrSrc, chrFilterSize,
677 for(i=0; i<dstW; i++) 677 dest, uDest, vDest, dstW);
678 {
679 int val=0;
680 int j;
681 for(j=0; j<lumFilterSize; j++)
682 val += lumSrc[j][i] * lumFilter[j];
683
684 dest[i]= MIN(MAX(val>>19, 0), 255);
685 }
686
687 if(uDest != NULL)
688 for(i=0; i<(dstW>>1); i++)
689 {
690 int u=0;
691 int v=0;
692 int j;
693 for(j=0; j<lumFilterSize; j++)
694 {
695 u += chrSrc[j][i] * chrFilter[j];
696 v += chrSrc[j][i + 2048] * chrFilter[j];
697 }
698
699 uDest[i]= MIN(MAX(u>>19, 0), 255);
700 vDest[i]= MIN(MAX(v>>19, 0), 255);
701 }
702 #endif 678 #endif
703 } 679 }
704 680
705 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, 681 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
706 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW) 682 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
834 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) 810 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
835 : "%eax", "%ebx", "%ecx", "%edx", "%esi" 811 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
836 ); 812 );
837 } 813 }
838 #else 814 #else
839 if(dstbpp==32) 815 yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
840 { 816 chrFilter, chrSrc, chrFilterSize,
841 int i; 817 dest, dstW, dstbpp);
842 for(i=0; i<(dstW>>1); i++){ 818
843 int j;
844 int Y1=0;
845 int Y2=0;
846 int U=0;
847 int V=0;
848 int Cb, Cr, Cg;
849 for(j=0; j<lumFilterSize; j++)
850 {
851 Y1 += lumSrc[j][2*i] * lumFilter[j];
852 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
853 }
854 for(j=0; j<chrFilterSize; j++)
855 {
856 U += chrSrc[j][i] * chrFilter[j];
857 V += chrSrc[j][i+2048] * chrFilter[j];
858 }
859 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
860 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
861 U >>= 19;
862 V >>= 19;
863
864 Cb= clip_yuvtab_40cf[U+ 256];
865 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
866 Cr= clip_yuvtab_3343[V+ 256];
867
868 dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
869 dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
870 dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
871
872 dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
873 dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
874 dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
875 }
876 }
877 else if(dstbpp==24)
878 {
879 int i;
880 for(i=0; i<(dstW>>1); i++){
881 int j;
882 int Y1=0;
883 int Y2=0;
884 int U=0;
885 int V=0;
886 int Cb, Cr, Cg;
887 for(j=0; j<lumFilterSize; j++)
888 {
889 Y1 += lumSrc[j][2*i] * lumFilter[j];
890 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
891 }
892 for(j=0; j<chrFilterSize; j++)
893 {
894 U += chrSrc[j][i] * chrFilter[j];
895 V += chrSrc[j][i+2048] * chrFilter[j];
896 }
897 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
898 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
899 U >>= 19;
900 V >>= 19;
901
902 Cb= clip_yuvtab_40cf[U+ 256];
903 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
904 Cr= clip_yuvtab_3343[V+ 256];
905
906 dest[0]=clip_table[((Y1 + Cb) >>13)];
907 dest[1]=clip_table[((Y1 + Cg) >>13)];
908 dest[2]=clip_table[((Y1 + Cr) >>13)];
909
910 dest[3]=clip_table[((Y2 + Cb) >>13)];
911 dest[4]=clip_table[((Y2 + Cg) >>13)];
912 dest[5]=clip_table[((Y2 + Cr) >>13)];
913 dest+=6;
914 }
915 }
916 else if(dstbpp==16)
917 {
918 int i;
919 for(i=0; i<(dstW>>1); i++){
920 int j;
921 int Y1=0;
922 int Y2=0;
923 int U=0;
924 int V=0;
925 int Cb, Cr, Cg;
926 for(j=0; j<lumFilterSize; j++)
927 {
928 Y1 += lumSrc[j][2*i] * lumFilter[j];
929 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
930 }
931 for(j=0; j<chrFilterSize; j++)
932 {
933 U += chrSrc[j][i] * chrFilter[j];
934 V += chrSrc[j][i+2048] * chrFilter[j];
935 }
936 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
937 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
938 U >>= 19;
939 V >>= 19;
940
941 Cb= clip_yuvtab_40cf[U+ 256];
942 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
943 Cr= clip_yuvtab_3343[V+ 256];
944
945 ((uint16_t*)dest)[2*i] =
946 clip_table16b[(Y1 + Cb) >>13] |
947 clip_table16g[(Y1 + Cg) >>13] |
948 clip_table16r[(Y1 + Cr) >>13];
949
950 ((uint16_t*)dest)[2*i+1] =
951 clip_table16b[(Y2 + Cb) >>13] |
952 clip_table16g[(Y2 + Cg) >>13] |
953 clip_table16r[(Y2 + Cr) >>13];
954 }
955 }
956 else if(dstbpp==15)
957 {
958 int i;
959 for(i=0; i<(dstW>>1); i++){
960 int j;
961 int Y1=0;
962 int Y2=0;
963 int U=0;
964 int V=0;
965 int Cb, Cr, Cg;
966 for(j=0; j<lumFilterSize; j++)
967 {
968 Y1 += lumSrc[j][2*i] * lumFilter[j];
969 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
970 }
971 for(j=0; j<chrFilterSize; j++)
972 {
973 U += chrSrc[j][i] * chrFilter[j];
974 V += chrSrc[j][i+2048] * chrFilter[j];
975 }
976 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
977 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
978 U >>= 19;
979 V >>= 19;
980
981 Cb= clip_yuvtab_40cf[U+ 256];
982 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
983 Cr= clip_yuvtab_3343[V+ 256];
984
985 ((uint16_t*)dest)[2*i] =
986 clip_table15b[(Y1 + Cb) >>13] |
987 clip_table15g[(Y1 + Cg) >>13] |
988 clip_table15r[(Y1 + Cr) >>13];
989
990 ((uint16_t*)dest)[2*i+1] =
991 clip_table15b[(Y2 + Cb) >>13] |
992 clip_table15g[(Y2 + Cg) >>13] |
993 clip_table15r[(Y2 + Cr) >>13];
994 }
995 }
996 #endif 819 #endif
997 } //!FULL_UV_IPOL 820 } //!FULL_UV_IPOL
998 } 821 }
999 822
1000 823
1371 */ 1194 */
1372 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, 1195 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1373 uint8_t *dest, int dstW, int uvalpha, int dstbpp) 1196 uint8_t *dest, int dstW, int uvalpha, int dstbpp)
1374 { 1197 {
1375 int uvalpha1=uvalpha^4095; 1198 int uvalpha1=uvalpha^4095;
1376 const int yalpha=0;
1377 const int yalpha1=0; 1199 const int yalpha1=0;
1378 1200
1379 if(fullUVIpol || allwaysIpol) 1201 if(fullUVIpol || allwaysIpol)
1380 { 1202 {
1381 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp); 1203 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp);
1634 "pmaddwd %%mm6, %%mm0 \n\t" 1456 "pmaddwd %%mm6, %%mm0 \n\t"
1635 "packssdw %%mm0, %%mm0 \n\t" 1457 "packssdw %%mm0, %%mm0 \n\t"
1636 "movd %%mm0, (%4, %%ebp) \n\t" 1458 "movd %%mm0, (%4, %%ebp) \n\t"
1637 "addl $4, %%ebp \n\t" 1459 "addl $4, %%ebp \n\t"
1638 " jnc 1b \n\t" 1460 " jnc 1b \n\t"
1639 1461
1640 "popl %%ebp \n\t" 1462 "popl %%ebp \n\t"
1641 : "+a" (counter) 1463 : "+a" (counter)
1642 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) 1464 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1643 : "%ebx" 1465 : "%ebx"
1644 ); 1466 );
1762 #endif 1584 #endif
1763 } 1585 }
1764 // *** horizontal scale Y line to temp buffer 1586 // *** horizontal scale Y line to temp buffer
1765 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc) 1587 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc)
1766 { 1588 {
1589 #ifdef HAVE_MMX
1590 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
1591 if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
1592 #else
1767 if(sws_flags != SWS_FAST_BILINEAR) 1593 if(sws_flags != SWS_FAST_BILINEAR)
1594 #endif
1768 { 1595 {
1769 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); 1596 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1770 } 1597 }
1771 else // Fast Bilinear upscale / crap downscale 1598 else // Fast Bilinear upscale / crap downscale
1772 { 1599 {
1883 } 1710 }
1884 1711
1885 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, 1712 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth,
1886 uint8_t *src1, uint8_t *src2, int srcW, int xInc) 1713 uint8_t *src1, uint8_t *src2, int srcW, int xInc)
1887 { 1714 {
1715 #ifdef HAVE_MMX
1716 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
1717 if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
1718 #else
1888 if(sws_flags != SWS_FAST_BILINEAR) 1719 if(sws_flags != SWS_FAST_BILINEAR)
1720 #endif
1889 { 1721 {
1890 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); 1722 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1891 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); 1723 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1892 } 1724 }
1893 else // Fast Bilinear upscale / crap downscale 1725 else // Fast Bilinear upscale / crap downscale
2024 } 1856 }
2025 #endif 1857 #endif
2026 } 1858 }
2027 } 1859 }
2028 1860
2029 static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *filterSize, int xInc, 1861 static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
2030 int srcW, int dstW, int filterAlign, int one) 1862 int srcW, int dstW, int filterAlign, int one)
2031 { 1863 {
2032 int i; 1864 int i;
1865 double filter[8000];
2033 #ifdef HAVE_MMX 1866 #ifdef HAVE_MMX
2034 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions) 1867 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
2035 #endif 1868 #endif
2036 1869
2037 if(ABS(xInc - 0x10000) <10) // unscaled 1870 if(ABS(xInc - 0x10000) <10) // unscaled
2038 { 1871 {
2039 int i; 1872 int i;
2064 1897
2065 filterPos[i]= xx; 1898 filterPos[i]= xx;
2066 if(sws_flags == SWS_BICUBIC) 1899 if(sws_flags == SWS_BICUBIC)
2067 { 1900 {
2068 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16); 1901 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
2069 // int coeff; 1902 double y1,y2,y3,y4;
2070 int y1,y2,y3,y4;
2071 double A= -0.75; 1903 double A= -0.75;
2072 // Equation is from VirtualDub 1904 // Equation is from VirtualDub
2073 y1 = (int)floor(0.5 + ( + A*d - 2.0*A*d*d + A*d*d*d) * 16384.0); 1905 y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
2074 y2 = (int)floor(0.5 + (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d) * 16384.0); 1906 y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
2075 y3 = (int)floor(0.5 + ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d) * 16384.0); 1907 y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
2076 y4 = (int)floor(0.5 + ( + A*d*d - A*d*d*d) * 16384.0); 1908 y4 = ( + A*d*d - A*d*d*d);
2077 1909
2078 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); 1910 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
2079 filter[i*(*filterSize) + 0]= y1; 1911 filter[i*(*filterSize) + 0]= y1;
2080 filter[i*(*filterSize) + 1]= y2; 1912 filter[i*(*filterSize) + 1]= y2;
2081 filter[i*(*filterSize) + 2]= y3; 1913 filter[i*(*filterSize) + 2]= y3;
2085 else 1917 else
2086 { 1918 {
2087 for(j=0; j<*filterSize; j++) 1919 for(j=0; j<*filterSize; j++)
2088 { 1920 {
2089 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16); 1921 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
2090 int coeff; 1922 double coeff= 1.0 - d;
2091 coeff= (int)(0.5 + (1.0 - d)*(1<<14));
2092 if(coeff<0) coeff=0; 1923 if(coeff<0) coeff=0;
2093 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); 1924 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
2094 filter[i*(*filterSize) + j]= coeff; 1925 filter[i*(*filterSize) + j]= coeff;
2095 xx++; 1926 xx++;
2096 } 1927 }
2114 1945
2115 filterPos[i]= xx; 1946 filterPos[i]= xx;
2116 for(j=0; j<*filterSize; j++) 1947 for(j=0; j<*filterSize; j++)
2117 { 1948 {
2118 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc; 1949 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
2119 int coeff; 1950 double coeff;
2120 if(sws_flags == SWS_BICUBIC) 1951 if(sws_flags == SWS_BICUBIC)
2121 { 1952 {
2122 double A= -0.75; 1953 double A= -0.75;
2123 // d*=2; 1954 // d*=2;
2124 // Equation is from VirtualDub 1955 // Equation is from VirtualDub
2125 if(d<1.0) 1956 if(d<1.0)
2126 coeff = (int)floor(0.5 + (1.0 - (A+3.0)*d*d 1957 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
2127 + (A+2.0)*d*d*d) * (1<<14));
2128 else if(d<2.0) 1958 else if(d<2.0)
2129 coeff = (int)floor(0.5 + (-4.0*A + 8.0*A*d 1959 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
2130 - 5.0*A*d*d + A*d*d*d) * (1<<14));
2131 else 1960 else
2132 coeff=0; 1961 coeff=0.0;
2133 } 1962 }
2134 else 1963 else
2135 { 1964 {
2136 coeff= (int)(0.5 + (1.0 - d)*(1<<14)); 1965 coeff= 1.0 - d;
2137 if(coeff<0) coeff=0; 1966 if(coeff<0) coeff=0;
2138 } 1967 }
2139 // if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc); 1968 // if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
2140 filter[i*(*filterSize) + j]= coeff; 1969 filter[i*(*filterSize) + j]= coeff;
2141 xx++; 1970 xx++;
2158 filter[i*(*filterSize) + j]=0; 1987 filter[i*(*filterSize) + j]=0;
2159 } 1988 }
2160 filterPos[i]= 0; 1989 filterPos[i]= 0;
2161 } 1990 }
2162 1991
2163 if(filterPos[i] + *filterSize > srcW) 1992 if(filterPos[i] + (*filterSize) > srcW)
2164 { 1993 {
2165 int shift= filterPos[i] + *filterSize - srcW; 1994 int shift= filterPos[i] + (*filterSize) - srcW;
2166 // Move filter coeffs right to compensate for filterPos 1995 // Move filter coeffs right to compensate for filterPos
2167 for(j=*filterSize-2; j>=0; j--) 1996 for(j=(*filterSize)-2; j>=0; j--)
2168 { 1997 {
2169 int right= MIN(j + shift, *filterSize-1); 1998 int right= MIN(j + shift, (*filterSize)-1);
2170 filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j]; 1999 filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
2171 filter[i*(*filterSize) +j]=0; 2000 filter[i*(*filterSize) +j]=0;
2172 } 2001 }
2173 filterPos[i]= srcW - *filterSize; 2002 filterPos[i]= srcW - (*filterSize);
2174 } 2003 }
2175 } 2004 }
2176 2005
2177 //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end 2006 //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end
2178 // and skip these than later 2007 // and skip these than later
2188 sum+= filter[i*(*filterSize) + j]; 2017 sum+= filter[i*(*filterSize) + j];
2189 } 2018 }
2190 scale/= sum; 2019 scale/= sum;
2191 for(j=0; j<*filterSize; j++) 2020 for(j=0; j<*filterSize; j++)
2192 { 2021 {
2193 filter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale); 2022 dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
2194 } 2023 }
2195 } 2024 }
2196 } 2025 }
2197 2026
2198 #ifdef HAVE_MMX2 2027 #ifdef HAVE_MMX2
2337 static int lumBufIndex=0; 2166 static int lumBufIndex=0;
2338 static int chrBufIndex=0; 2167 static int chrBufIndex=0;
2339 2168
2340 static int firstTime=1; 2169 static int firstTime=1;
2341 2170
2342 int widthAlign= dstbpp==12 ? 16 : 8; 2171 const int widthAlign= dstbpp==12 ? 16 : 8;
2343 if(((dstW + widthAlign-1)&(~(widthAlign-1))) > dststride) 2172 const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
2173 const int over= dstbpp==12 ? (((dstW+15)&(~15))) - dststride
2174 : (((dstW+7)&(~7)))*bytespp - dststride;
2175 if(dststride%widthAlign !=0 )
2344 { 2176 {
2345 dstW&= ~(widthAlign-1);
2346 if(firstTime) 2177 if(firstTime)
2347 fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n" 2178 fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
2348 "SwScaler: ->lowering width to compensate, new width=%d\n" 2179 "SwScaler: ->cannot do aligned memory acesses anymore\n",
2349 "SwScaler: ->cannot do aligned memory acesses anymore\n", 2180 widthAlign);
2350 widthAlign, dstW);
2351 } 2181 }
2182
2183 if(over>0)
2184 {
2185 if(firstTime)
2186 fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n"
2187 "SwScaler: and dststride is not large enough to handle %d extra bytes\n"
2188 "SwScaler: ->using unoptimized C version for last line(s)\n",
2189 over);
2190 }
2191
2192
2352 2193
2353 //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH); 2194 //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH);
2354 //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH); 2195 //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH);
2355 2196
2356 #ifdef HAVE_MMX2 2197 #ifdef HAVE_MMX2
2357 canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0; 2198 canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2358 if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR) 2199 if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR)
2359 { 2200 {
2360 if(firstTime) //FIXME only if verbose ? 2201 if(firstTime)
2361 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n"); 2202 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
2362 } 2203 }
2204 #else
2205 canMMX2BeUsed=0; // should be 0 anyway but ...
2363 #endif 2206 #endif
2364 2207
2365 if(firstTime) 2208 if(firstTime)
2366 { 2209 {
2367 #if defined (DITHER1XBPP) && defined (HAVE_MMX) 2210 #if defined (DITHER1XBPP) && defined (HAVE_MMX)
2396 #elif defined (HAVE_3DNOW) 2239 #elif defined (HAVE_3DNOW)
2397 fprintf(stderr, "using 3DNOW\n"); 2240 fprintf(stderr, "using 3DNOW\n");
2398 #elif defined (HAVE_MMX) 2241 #elif defined (HAVE_MMX)
2399 fprintf(stderr, "using MMX\n"); 2242 fprintf(stderr, "using MMX\n");
2400 #elif defined (ARCH_X86) 2243 #elif defined (ARCH_X86)
2401 fprintf(stderr, "using X86 ASM2\n"); 2244 fprintf(stderr, "using X86 ASM\n");
2402 #else 2245 #else
2403 fprintf(stderr, "using C\n"); 2246 fprintf(stderr, "using C\n");
2404 #endif 2247 #endif
2405 } 2248 }
2406 2249
2411 // would be like the vertical one, but that would require some special code for the 2254 // would be like the vertical one, but that would require some special code for the
2412 // first and last pixel 2255 // first and last pixel
2413 if(sws_flags==SWS_FAST_BILINEAR) 2256 if(sws_flags==SWS_FAST_BILINEAR)
2414 { 2257 {
2415 if(canMMX2BeUsed) lumXInc+= 20; 2258 if(canMMX2BeUsed) lumXInc+= 20;
2259 #ifndef HAVE_MMX //we dont use the x86asm scaler if mmx is available
2416 else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20; 2260 else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
2261 #endif
2417 } 2262 }
2418 2263
2419 if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW; 2264 if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW;
2420 else chrXInc= lumXInc, chrDstW= dstW>>1; 2265 else chrXInc= lumXInc, chrDstW= (dstW+1)>>1;
2421 2266
2422 if(dstbpp==12) chrYInc= lumYInc, chrDstH= dstH>>1; 2267 if(dstbpp==12) chrYInc= lumYInc, chrDstH= (dstH+1)>>1;
2423 else chrYInc= lumYInc>>1, chrDstH= dstH; 2268 else chrYInc= lumYInc>>1, chrDstH= dstH;
2424 2269
2425 // force calculation of the horizontal interpolation of the first line 2270 // force calculation of the horizontal interpolation of the first line
2426 2271
2427 if(srcSliceY ==0){ 2272 if(srcSliceY ==0){
2438 #else 2283 #else
2439 const int filterAlign=1; 2284 const int filterAlign=1;
2440 #endif 2285 #endif
2441 oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags; 2286 oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags;
2442 2287
2443 if(sws_flags != SWS_FAST_BILINEAR) 2288 RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
2444 { 2289 srcW , dstW , filterAlign, 1<<14);
2445 RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc, 2290 RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
2446 srcW , dstW , filterAlign, 1<<14); 2291 (srcW+1)>>1, chrDstW, filterAlign, 1<<14);
2447 RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
2448 srcW>>1, chrDstW, filterAlign, 1<<14);
2449 }
2450 2292
2451 #ifdef HAVE_MMX2 2293 #ifdef HAVE_MMX2
2452 // cant downscale !!! 2294 // cant downscale !!!
2453 if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR) 2295 if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR)
2454 { 2296 {
2468 for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]); 2310 for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]);
2469 2311
2470 RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc, 2312 RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc,
2471 srcH , dstH, 1, (1<<12)-4); 2313 srcH , dstH, 1, (1<<12)-4);
2472 RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc, 2314 RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc,
2473 srcH>>1, chrDstH, 1, (1<<12)-4); 2315 (srcH+1)>>1, chrDstH, 1, (1<<12)-4);
2474 2316
2475 // Calculate Buffer Sizes so that they wont run out while handling these damn slices 2317 // Calculate Buffer Sizes so that they wont run out while handling these damn slices
2476 vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize; 2318 vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize;
2477 for(i=0; i<dstH; i++) 2319 for(i=0; i<dstH; i++)
2478 { 2320 {
2505 2347
2506 for(i=0; i<vChrFilterSize*chrDstH; i++) 2348 for(i=0; i<vChrFilterSize*chrDstH; i++)
2507 chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]= 2349 chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]=
2508 vChrFilter[i]; 2350 vChrFilter[i];
2509 #endif 2351 #endif
2352 }
2353
2354 if(firstTime && verbose)
2355 {
2356 #ifdef HAVE_MMX2
2357 int mmx2=1;
2358 #else
2359 int mmx2=0;
2360 #endif
2361 #ifdef HAVE_MMX
2362 int mmx=1;
2363 #else
2364 int mmx=0;
2365 #endif
2366
2367 #ifdef HAVE_MMX
2368 if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR)
2369 printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
2370 else
2371 {
2372 if(hLumFilterSize==4)
2373 printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
2374 else if(hLumFilterSize==8)
2375 printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
2376 else
2377 printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
2378
2379 if(hChrFilterSize==4)
2380 printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
2381 else if(hChrFilterSize==8)
2382 printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
2383 else
2384 printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
2385 }
2386 #elif defined (ARCH_X86)
2387 printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
2388 #else
2389 if(sws_flags==SWS_FAST_BILINEAR)
2390 printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
2391 else
2392 printf("SwScaler: using C scaler for horizontal scaling\n");
2393 #endif
2394
2395 if(dstbpp==12)
2396 {
2397 if(vLumFilterSize==1)
2398 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
2399 else
2400 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
2401 }
2402 else
2403 {
2404 if(vLumFilterSize==1 && vChrFilterSize==2)
2405 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
2406 "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C");
2407 else if(vLumFilterSize==2 && vChrFilterSize==2)
2408 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
2409 else
2410 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
2411 }
2412
2413 if(dstbpp==24)
2414 printf("SwScaler: using %s YV12->BGR24 Converter\n",
2415 mmx2 ? "MMX2" : (mmx ? "MMX" : "C"));
2416 else
2417 printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp);
2418
2419 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
2510 } 2420 }
2511 2421
2512 lastInLumBuf= -1; 2422 lastInLumBuf= -1;
2513 lastInChrBuf= -1; 2423 lastInChrBuf= -1;
2514 } // if(firstLine) 2424 } // if(firstLine)
2555 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2]; 2465 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
2556 chrBufIndex++; 2466 chrBufIndex++;
2557 ASSERT(chrBufIndex < 2*vChrBufSize) 2467 ASSERT(chrBufIndex < 2*vChrBufSize)
2558 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) 2468 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2559 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) 2469 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2560 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc); 2470 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
2561 lastInChrBuf++; 2471 lastInChrBuf++;
2562 } 2472 }
2563 //wrap buf index around to stay inside the ring buffer 2473 //wrap buf index around to stay inside the ring buffer
2564 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; 2474 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2565 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; 2475 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2588 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2]; 2498 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
2589 chrBufIndex++; 2499 chrBufIndex++;
2590 ASSERT(chrBufIndex < 2*vChrBufSize) 2500 ASSERT(chrBufIndex < 2*vChrBufSize)
2591 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) 2501 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2592 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) 2502 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2593 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc); 2503 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
2594 lastInChrBuf++; 2504 lastInChrBuf++;
2595 } 2505 }
2596 //wrap buf index around to stay inside the ring buffer 2506 //wrap buf index around to stay inside the ring buffer
2597 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; 2507 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2598 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; 2508 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2603 b5Dither= dither8[dstY&1]; 2513 b5Dither= dither8[dstY&1];
2604 g6Dither= dither4[dstY&1]; 2514 g6Dither= dither4[dstY&1];
2605 g5Dither= dither8[dstY&1]; 2515 g5Dither= dither8[dstY&1];
2606 r5Dither= dither8[(dstY+1)&1]; 2516 r5Dither= dither8[(dstY+1)&1];
2607 #endif 2517 #endif
2608 2518 if(dstY < dstH-2 || over<=0)
2519 {
2609 if(dstbpp==12) //YV12 2520 if(dstbpp==12) //YV12
2610 { 2521 {
2611 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi 2522 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2612 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 2523 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2613 { 2524 {
2655 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2566 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2656 dest, dstW, dstbpp, 2567 dest, dstW, dstbpp,
2657 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4); 2568 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
2658 } 2569 }
2659 } 2570 }
2571 }
2572 else // hmm looks like we cant use MMX here without overwriting this arrays tail
2573 {
2574 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2575 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2576 if(dstbpp==12) //YV12
2577 {
2578 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2579 yuv2yuvXinC(
2580 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2581 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2582 dest, uDest, vDest, dstW);
2583 }
2584 else
2585 {
2586 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2587 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2588 yuv2rgbXinC(
2589 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2590 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2591 dest, dstW, dstbpp);
2592 }
2593 }
2660 } 2594 }
2661 2595
2662 #ifdef HAVE_MMX 2596 #ifdef HAVE_MMX
2663 __asm __volatile(SFENCE:::"memory"); 2597 __asm __volatile(SFENCE:::"memory");
2664 __asm __volatile(EMMS:::"memory"); 2598 __asm __volatile(EMMS:::"memory");