# HG changeset patch # User michael # Date 1007665645 0 # Node ID 64121e8a43f529b8caa15cb99e552b6cefb28c02 # Parent 091cdd056ca453e699b774f057714c0b0e4609d1 print more info if -v use new horizontal mmx scaler instead of old x86asm if mmx2 cant be used (FAST_BILINEAR only) fixed overflow in init function ... using double precission fp now :) using C scaler for the last 1-2 lines if there is a chance to write over the end of the dst array diff -r 091cdd056ca4 -r 64121e8a43f5 postproc/swscale.c --- a/postproc/swscale.c Thu Dec 06 18:54:52 2001 +0000 +++ b/postproc/swscale.c Thu Dec 06 19:07:25 2001 +0000 @@ -31,14 +31,14 @@ //#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; } #define ASSERT(x) ; - +extern int verbose; // defined in mplayer.c /* NOTES known BUGS with known cause (no bugreports please!, but patches are welcome :) ) -horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11) +horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11) -Supported output formats BGR15 BGR16 BGR24 BGR32, YV12 +Supported output formats BGR15 BGR16 BGR24 BGR32 YV12 BGR15 & BGR16 MMX verions support dithering Special versions: fast Y 1:1 scaling (no interpolation in y direction) @@ -49,6 +49,7 @@ Move static / global vars into a struct so multiple scalers can be used write special vertical cubic upscale version Optimize C code (yv12 / minmax) +dstStride[3] */ #define ABS(a) ((a) > 0 ? (a) : (-(a))) @@ -183,6 +184,203 @@ } #endif +static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, + int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, + uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW) +{ + //FIXME Optimize (just quickly writen not opti..) + int i; + for(i=0; i>19, 0), 255); + } + + if(uDest != NULL) + for(i=0; i<(dstW>>1); i++) + { + int u=0; + int v=0; + int j; + for(j=0; j>19, 0), 255); + vDest[i]= MIN(MAX(v>>19, 0), 255); + } +} + +static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, + int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, + uint8_t *dest, int dstW, int dstbpp) +{ + if(dstbpp==32) + { + int i; + for(i=0; i<(dstW>>1); i++){ + int j; + int Y1=0; + int Y2=0; + int U=0; + int V=0; + int Cb, Cr, Cg; + for(j=0; j>19) + 256 ]; + Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; + U >>= 19; + V >>= 19; + + Cb= clip_yuvtab_40cf[U+ 256]; + Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; + Cr= clip_yuvtab_3343[V+ 256]; + + dest[8*i+0]=clip_table[((Y1 + Cb) >>13)]; + dest[8*i+1]=clip_table[((Y1 + Cg) >>13)]; + dest[8*i+2]=clip_table[((Y1 + Cr) >>13)]; + + dest[8*i+4]=clip_table[((Y2 + Cb) >>13)]; + dest[8*i+5]=clip_table[((Y2 + Cg) >>13)]; + dest[8*i+6]=clip_table[((Y2 + Cr) >>13)]; + } + } + else if(dstbpp==24) + { + int i; + for(i=0; i<(dstW>>1); i++){ + int j; + int Y1=0; + int Y2=0; + int U=0; + int V=0; + int Cb, Cr, Cg; + for(j=0; j>19) + 256 ]; + Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; + U >>= 19; + V >>= 19; + + Cb= clip_yuvtab_40cf[U+ 256]; + Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; + Cr= clip_yuvtab_3343[V+ 256]; + + dest[0]=clip_table[((Y1 + Cb) >>13)]; + dest[1]=clip_table[((Y1 + Cg) >>13)]; + dest[2]=clip_table[((Y1 + Cr) >>13)]; + + dest[3]=clip_table[((Y2 + Cb) >>13)]; + dest[4]=clip_table[((Y2 + Cg) >>13)]; + dest[5]=clip_table[((Y2 + Cr) >>13)]; + dest+=6; + } + } + else if(dstbpp==16) + { + int i; + for(i=0; i<(dstW>>1); i++){ + int j; + int Y1=0; + int Y2=0; + int U=0; + int V=0; + int Cb, Cr, Cg; + for(j=0; j>19) + 256 ]; + Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; + U >>= 19; + V >>= 19; + + Cb= clip_yuvtab_40cf[U+ 256]; + Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; + Cr= clip_yuvtab_3343[V+ 256]; + + ((uint16_t*)dest)[2*i] = + clip_table16b[(Y1 + Cb) >>13] | + clip_table16g[(Y1 + Cg) >>13] | + clip_table16r[(Y1 + Cr) >>13]; + + ((uint16_t*)dest)[2*i+1] = + clip_table16b[(Y2 + Cb) >>13] | + clip_table16g[(Y2 + Cg) >>13] | + clip_table16r[(Y2 + Cr) >>13]; + } + } + else if(dstbpp==15) + { + int i; + for(i=0; i<(dstW>>1); i++){ + int j; + int Y1=0; + int Y2=0; + int U=0; + int V=0; + int Cb, Cr, Cg; + for(j=0; j>19) + 256 ]; + Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; + U >>= 19; + V >>= 19; + + Cb= clip_yuvtab_40cf[U+ 256]; + Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; + Cr= clip_yuvtab_3343[V+ 256]; + + ((uint16_t*)dest)[2*i] = + clip_table15b[(Y1 + Cb) >>13] | + clip_table15g[(Y1 + Cg) >>13] | + clip_table15r[(Y1 + Cr) >>13]; + + ((uint16_t*)dest)[2*i+1] = + clip_table15b[(Y2 + Cb) >>13] | + clip_table15g[(Y2 + Cg) >>13] | + clip_table15r[(Y2 + Cr) >>13]; + } + } +} + + //Note: we have C, X86, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one //Plain C versions #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) @@ -270,7 +468,6 @@ // *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices: // *** Note: it's called multiple times while decoding a frame, first time y==0 -// *** Designed to upscale, but may work for downscale too. // switching the cpu type during a sliced drawing can have bad effects, like sig11 void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int srcSliceY , int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp, diff -r 091cdd056ca4 -r 64121e8a43f5 postproc/swscale_template.c --- a/postproc/swscale_template.c Thu Dec 06 18:54:52 2001 +0000 +++ b/postproc/swscale_template.c Thu Dec 06 19:07:25 2001 +0000 @@ -672,33 +672,9 @@ : "%eax", "%edx", "%esi" ); #else - //FIXME Optimize (just quickly writen not opti..) - int i; - for(i=0; i>19, 0), 255); - } - - if(uDest != NULL) - for(i=0; i<(dstW>>1); i++) - { - int u=0; - int v=0; - int j; - for(j=0; j>19, 0), 255); - vDest[i]= MIN(MAX(v>>19, 0), 255); - } +yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, + chrFilter, chrSrc, chrFilterSize, + dest, uDest, vDest, dstW); #endif } @@ -836,163 +812,10 @@ ); } #else - if(dstbpp==32) - { - int i; - for(i=0; i<(dstW>>1); i++){ - int j; - int Y1=0; - int Y2=0; - int U=0; - int V=0; - int Cb, Cr, Cg; - for(j=0; j>19) + 256 ]; - Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; - U >>= 19; - V >>= 19; - - Cb= clip_yuvtab_40cf[U+ 256]; - Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; - Cr= clip_yuvtab_3343[V+ 256]; - - dest[8*i+0]=clip_table[((Y1 + Cb) >>13)]; - dest[8*i+1]=clip_table[((Y1 + Cg) >>13)]; - dest[8*i+2]=clip_table[((Y1 + Cr) >>13)]; - - dest[8*i+4]=clip_table[((Y2 + Cb) >>13)]; - dest[8*i+5]=clip_table[((Y2 + Cg) >>13)]; - dest[8*i+6]=clip_table[((Y2 + Cr) >>13)]; - } - } - else if(dstbpp==24) - { - int i; - for(i=0; i<(dstW>>1); i++){ - int j; - int Y1=0; - int Y2=0; - int U=0; - int V=0; - int Cb, Cr, Cg; - for(j=0; j>19) + 256 ]; - Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; - U >>= 19; - V >>= 19; - - Cb= clip_yuvtab_40cf[U+ 256]; - Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; - Cr= clip_yuvtab_3343[V+ 256]; - - dest[0]=clip_table[((Y1 + Cb) >>13)]; - dest[1]=clip_table[((Y1 + Cg) >>13)]; - dest[2]=clip_table[((Y1 + Cr) >>13)]; +yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize, + chrFilter, chrSrc, chrFilterSize, + dest, dstW, dstbpp); - dest[3]=clip_table[((Y2 + Cb) >>13)]; - dest[4]=clip_table[((Y2 + Cg) >>13)]; - dest[5]=clip_table[((Y2 + Cr) >>13)]; - dest+=6; - } - } - else if(dstbpp==16) - { - int i; - for(i=0; i<(dstW>>1); i++){ - int j; - int Y1=0; - int Y2=0; - int U=0; - int V=0; - int Cb, Cr, Cg; - for(j=0; j>19) + 256 ]; - Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; - U >>= 19; - V >>= 19; - - Cb= clip_yuvtab_40cf[U+ 256]; - Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; - Cr= clip_yuvtab_3343[V+ 256]; - - ((uint16_t*)dest)[2*i] = - clip_table16b[(Y1 + Cb) >>13] | - clip_table16g[(Y1 + Cg) >>13] | - clip_table16r[(Y1 + Cr) >>13]; - - ((uint16_t*)dest)[2*i+1] = - clip_table16b[(Y2 + Cb) >>13] | - clip_table16g[(Y2 + Cg) >>13] | - clip_table16r[(Y2 + Cr) >>13]; - } - } - else if(dstbpp==15) - { - int i; - for(i=0; i<(dstW>>1); i++){ - int j; - int Y1=0; - int Y2=0; - int U=0; - int V=0; - int Cb, Cr, Cg; - for(j=0; j>19) + 256 ]; - Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ]; - U >>= 19; - V >>= 19; - - Cb= clip_yuvtab_40cf[U+ 256]; - Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256]; - Cr= clip_yuvtab_3343[V+ 256]; - - ((uint16_t*)dest)[2*i] = - clip_table15b[(Y1 + Cb) >>13] | - clip_table15g[(Y1 + Cg) >>13] | - clip_table15r[(Y1 + Cr) >>13]; - - ((uint16_t*)dest)[2*i+1] = - clip_table15b[(Y2 + Cb) >>13] | - clip_table15g[(Y2 + Cg) >>13] | - clip_table15r[(Y2 + Cr) >>13]; - } - } #endif } //!FULL_UV_IPOL } @@ -1373,7 +1196,6 @@ uint8_t *dest, int dstW, int uvalpha, int dstbpp) { int uvalpha1=uvalpha^4095; - const int yalpha=0; const int yalpha1=0; if(fullUVIpol || allwaysIpol) @@ -1636,7 +1458,7 @@ "movd %%mm0, (%4, %%ebp) \n\t" "addl $4, %%ebp \n\t" " jnc 1b \n\t" - + "popl %%ebp \n\t" : "+a" (counter) : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) @@ -1764,7 +1586,12 @@ // *** horizontal scale Y line to temp buffer static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc) { +#ifdef HAVE_MMX + // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one) + if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed)) +#else if(sws_flags != SWS_FAST_BILINEAR) +#endif { RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); } @@ -1885,7 +1712,12 @@ inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2, int srcW, int xInc) { +#ifdef HAVE_MMX + // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one) + if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed)) +#else if(sws_flags != SWS_FAST_BILINEAR) +#endif { RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); @@ -2026,12 +1858,13 @@ } } -static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *filterSize, int xInc, +static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc, int srcW, int dstW, int filterAlign, int one) { int i; + double filter[8000]; #ifdef HAVE_MMX - asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions) + asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions) #endif if(ABS(xInc - 0x10000) <10) // unscaled @@ -2066,14 +1899,13 @@ if(sws_flags == SWS_BICUBIC) { double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16); -// int coeff; - int y1,y2,y3,y4; + double y1,y2,y3,y4; double A= -0.75; // Equation is from VirtualDub - y1 = (int)floor(0.5 + ( + A*d - 2.0*A*d*d + A*d*d*d) * 16384.0); - y2 = (int)floor(0.5 + (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d) * 16384.0); - y3 = (int)floor(0.5 + ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d) * 16384.0); - y4 = (int)floor(0.5 + ( + A*d*d - A*d*d*d) * 16384.0); + y1 = ( + A*d - 2.0*A*d*d + A*d*d*d); + y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d); + y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d); + y4 = ( + A*d*d - A*d*d*d); // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); filter[i*(*filterSize) + 0]= y1; @@ -2087,8 +1919,7 @@ for(j=0; j<*filterSize; j++) { double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16); - int coeff; - coeff= (int)(0.5 + (1.0 - d)*(1<<14)); + double coeff= 1.0 - d; if(coeff<0) coeff=0; // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); filter[i*(*filterSize) + j]= coeff; @@ -2116,24 +1947,22 @@ for(j=0; j<*filterSize; j++) { double d= ABS((xx<<16) - xDstInSrc)/(double)xInc; - int coeff; + double coeff; if(sws_flags == SWS_BICUBIC) { double A= -0.75; // d*=2; // Equation is from VirtualDub if(d<1.0) - coeff = (int)floor(0.5 + (1.0 - (A+3.0)*d*d - + (A+2.0)*d*d*d) * (1<<14)); + coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d); else if(d<2.0) - coeff = (int)floor(0.5 + (-4.0*A + 8.0*A*d - - 5.0*A*d*d + A*d*d*d) * (1<<14)); + coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d); else - coeff=0; + coeff=0.0; } else { - coeff= (int)(0.5 + (1.0 - d)*(1<<14)); + coeff= 1.0 - d; if(coeff<0) coeff=0; } // if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc); @@ -2160,17 +1989,17 @@ filterPos[i]= 0; } - if(filterPos[i] + *filterSize > srcW) + if(filterPos[i] + (*filterSize) > srcW) { - int shift= filterPos[i] + *filterSize - srcW; + int shift= filterPos[i] + (*filterSize) - srcW; // Move filter coeffs right to compensate for filterPos - for(j=*filterSize-2; j>=0; j--) + for(j=(*filterSize)-2; j>=0; j--) { - int right= MIN(j + shift, *filterSize-1); + int right= MIN(j + shift, (*filterSize)-1); filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j]; filter[i*(*filterSize) +j]=0; } - filterPos[i]= srcW - *filterSize; + filterPos[i]= srcW - (*filterSize); } } @@ -2190,7 +2019,7 @@ scale/= sum; for(j=0; j<*filterSize; j++) { - filter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale); + dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale); } } } @@ -2339,17 +2168,29 @@ static int firstTime=1; -int widthAlign= dstbpp==12 ? 16 : 8; -if(((dstW + widthAlign-1)&(~(widthAlign-1))) > dststride) +const int widthAlign= dstbpp==12 ? 16 : 8; +const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4) +const int over= dstbpp==12 ? (((dstW+15)&(~15))) - dststride + : (((dstW+7)&(~7)))*bytespp - dststride; +if(dststride%widthAlign !=0 ) { - dstW&= ~(widthAlign-1); if(firstTime) fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n" - "SwScaler: ->lowering width to compensate, new width=%d\n" - "SwScaler: ->cannot do aligned memory acesses anymore\n", - widthAlign, dstW); + "SwScaler: ->cannot do aligned memory acesses anymore\n", + widthAlign); } +if(over>0) +{ + if(firstTime) + fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n" + "SwScaler: and dststride is not large enough to handle %d extra bytes\n" + "SwScaler: ->using unoptimized C version for last line(s)\n", + over); +} + + + //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH); //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH); @@ -2357,9 +2198,11 @@ canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0; if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR) { - if(firstTime) //FIXME only if verbose ? + if(firstTime) fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n"); } +#else +canMMX2BeUsed=0; // should be 0 anyway but ... #endif if(firstTime) @@ -2398,7 +2241,7 @@ #elif defined (HAVE_MMX) fprintf(stderr, "using MMX\n"); #elif defined (ARCH_X86) - fprintf(stderr, "using X86 ASM2\n"); + fprintf(stderr, "using X86 ASM\n"); #else fprintf(stderr, "using C\n"); #endif @@ -2413,13 +2256,15 @@ if(sws_flags==SWS_FAST_BILINEAR) { if(canMMX2BeUsed) lumXInc+= 20; +#ifndef HAVE_MMX //we dont use the x86asm scaler if mmx is available else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20; +#endif } if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW; -else chrXInc= lumXInc, chrDstW= dstW>>1; +else chrXInc= lumXInc, chrDstW= (dstW+1)>>1; -if(dstbpp==12) chrYInc= lumYInc, chrDstH= dstH>>1; +if(dstbpp==12) chrYInc= lumYInc, chrDstH= (dstH+1)>>1; else chrYInc= lumYInc>>1, chrDstH= dstH; // force calculation of the horizontal interpolation of the first line @@ -2440,13 +2285,10 @@ #endif oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags; - if(sws_flags != SWS_FAST_BILINEAR) - { - RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc, - srcW , dstW , filterAlign, 1<<14); - RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc, - srcW>>1, chrDstW, filterAlign, 1<<14); - } + RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc, + srcW , dstW , filterAlign, 1<<14); + RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc, + (srcW+1)>>1, chrDstW, filterAlign, 1<<14); #ifdef HAVE_MMX2 // cant downscale !!! @@ -2470,7 +2312,7 @@ RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc, srcH , dstH, 1, (1<<12)-4); RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc, - srcH>>1, chrDstH, 1, (1<<12)-4); + (srcH+1)>>1, chrDstH, 1, (1<<12)-4); // Calculate Buffer Sizes so that they wont run out while handling these damn slices vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize; @@ -2509,6 +2351,74 @@ #endif } + if(firstTime && verbose) + { +#ifdef HAVE_MMX2 + int mmx2=1; +#else + int mmx2=0; +#endif +#ifdef HAVE_MMX + int mmx=1; +#else + int mmx=0; +#endif + +#ifdef HAVE_MMX + if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR) + printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n"); + else + { + if(hLumFilterSize==4) + printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n"); + else if(hLumFilterSize==8) + printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n"); + else + printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n"); + + if(hChrFilterSize==4) + printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n"); + else if(hChrFilterSize==8) + printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n"); + else + printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n"); + } +#elif defined (ARCH_X86) + printf("SwScaler: using X86-Asm scaler for horizontal scaling\n"); +#else + if(sws_flags==SWS_FAST_BILINEAR) + printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n"); + else + printf("SwScaler: using C scaler for horizontal scaling\n"); +#endif + + if(dstbpp==12) + { + if(vLumFilterSize==1) + printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C"); + else + printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C"); + } + else + { + if(vLumFilterSize==1 && vChrFilterSize==2) + printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n" + "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C"); + else if(vLumFilterSize==2 && vChrFilterSize==2) + printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C"); + else + printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C"); + } + + if(dstbpp==24) + printf("SwScaler: using %s YV12->BGR24 Converter\n", + mmx2 ? "MMX2" : (mmx ? "MMX" : "C")); + else + printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp); + + printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH); + } + lastInLumBuf= -1; lastInChrBuf= -1; } // if(firstLine) @@ -2557,7 +2467,7 @@ ASSERT(chrBufIndex < 2*vChrBufSize) ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) - RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc); + RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc); lastInChrBuf++; } //wrap buf index around to stay inside the ring buffer @@ -2590,7 +2500,7 @@ ASSERT(chrBufIndex < 2*vChrBufSize) ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) - RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc); + RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc); lastInChrBuf++; } //wrap buf index around to stay inside the ring buffer @@ -2605,7 +2515,8 @@ g5Dither= dither8[dstY&1]; r5Dither= dither8[(dstY+1)&1]; #endif - + if(dstY < dstH-2 || over<=0) + { if(dstbpp==12) //YV12 { if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi @@ -2657,6 +2568,29 @@ lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4); } } + } + else // hmm looks like we cant use MMX here without overwriting this arrays tail + { + int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; + int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; + if(dstbpp==12) //YV12 + { + if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi + yuv2yuvXinC( + vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, + vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize, + dest, uDest, vDest, dstW); + } + else + { + ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); + ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); + yuv2rgbXinC( + vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, + vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + dest, dstW, dstbpp); + } + } } #ifdef HAVE_MMX