Mercurial > mplayer.hg
changeset 7723:11492d5b0896
mmx yuy2 output
| field | value |
|---|---|
| author | michael |
| date | Sun, 13 Oct 2002 17:23:02 +0000 |
| parents | a181875e0aa8 |
| children | 619fd5403fd0 |
| files | postproc/swscale.c postproc/swscale_template.c |
| diffstat | 2 files changed, 163 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/postproc/swscale.c Sun Oct 13 13:49:24 2002 +0000 +++ b/postproc/swscale.c Sun Oct 13 17:23:02 2002 +0000 @@ -421,7 +421,7 @@ } -#define YSCALE_YUV_2_X_C(type) \ +#define YSCALE_YUV_2_PACKEDX_C(type) \ for(i=0; i<(dstW>>1); i++){\ int j;\ int Y1=0;\ @@ -458,12 +458,12 @@ } #define YSCALE_YUV_2_RGBX_C(type) \ - YSCALE_YUV_2_X_C(type)\ + YSCALE_YUV_2_PACKEDX_C(type)\ r = c->table_rV[V];\ g = c->table_gU[U] + c->table_gV[V];\ b = c->table_bU[U];\ -#define YSCALE_YUV_2_2_C \ +#define YSCALE_YUV_2_PACKED2_C \ for(i=0; i<(dstW>>1); i++){\ const int i2= 2*i;\ int Y1= (buf0[i2 ]*yalpha1+buf1[i2 ]*yalpha)>>19;\ @@ -472,13 +472,13 @@ int V= (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;\ #define YSCALE_YUV_2_RGB2_C(type) \ - YSCALE_YUV_2_2_C\ + YSCALE_YUV_2_PACKED2_C\ type *r, *b, *g;\ r = c->table_rV[V];\ g = c->table_gU[U] + c->table_gV[V];\ b = c->table_bU[U];\ -#define YSCALE_YUV_2_1_C \ +#define YSCALE_YUV_2_PACKED1_C \ for(i=0; i<(dstW>>1); i++){\ const int i2= 2*i;\ int Y1= buf0[i2 ]>>7;\ @@ -487,13 +487,13 @@ int V= (uvbuf1[i+2048])>>7;\ #define YSCALE_YUV_2_RGB1_C(type) \ - YSCALE_YUV_2_1_C\ + YSCALE_YUV_2_PACKED1_C\ type *r, *b, *g;\ r = c->table_rV[V];\ g = c->table_gU[U] + c->table_gV[V];\ b = c->table_bU[U];\ -#define YSCALE_YUV_2_1B_C \ +#define YSCALE_YUV_2_PACKED1B_C \ for(i=0; i<(dstW>>1); i++){\ const int i2= 2*i;\ int Y1= buf0[i2 ]>>7;\ @@ -502,7 +502,7 @@ int V= (uvbuf0[i+2048] + uvbuf1[i+2048])>>8;\ #define YSCALE_YUV_2_RGB1B_C(type) \ - YSCALE_YUV_2_1B_C\ + YSCALE_YUV_2_PACKED1B_C\ type *r, *b, *g;\ r = c->table_rV[V];\ g = c->table_gU[U] + c->table_gV[V];\ @@ -668,7 +668,7 @@ }\ -static inline void yuv2rgbXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, +static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, uint8_t *dest, int dstW, int y) { @@ -791,7 +791,7 @@ } break; case IMGFMT_YUY2: - 
YSCALE_YUV_2_X_C(void) + YSCALE_YUV_2_PACKEDX_C(void) ((uint8_t*)dest)[2*i2+0]= Y1; ((uint8_t*)dest)[2*i2+1]= U; ((uint8_t*)dest)[2*i2+2]= Y2;
--- a/postproc/swscale_template.c Sun Oct 13 13:49:24 2002 +0000 +++ b/postproc/swscale_template.c Sun Oct 13 17:23:02 2002 +0000 @@ -107,7 +107,7 @@ "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) : "%eax", "%ebx", "%ecx", "%edx", "%esi" */ -#define YSCALEYUV2RGBX \ +#define YSCALEYUV2PACKEDX \ "xorl %%eax, %%eax \n\t"\ ".balign 16 \n\t"\ "1: \n\t"\ @@ -144,7 +144,10 @@ "paddw %%mm5, %%mm7 \n\t"\ "addl $1, %%edx \n\t"\ " jnz 2b \n\t"\ -\ + + +#define YSCALEYUV2RGBX \ + YSCALEYUV2PACKEDX\ "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\ "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\ "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ @@ -234,6 +237,46 @@ \ "packuswb %%mm1, %%mm1 \n\t" +#define YSCALEYUV2PACKED \ + "movd %6, %%mm6 \n\t" /*yalpha1*/\ + "punpcklwd %%mm6, %%mm6 \n\t"\ + "punpcklwd %%mm6, %%mm6 \n\t"\ + "psraw $3, %%mm6 \n\t"\ + "movq %%mm6, 3968(%2) \n\t"\ + "movd %7, %%mm5 \n\t" /*uvalpha1*/\ + "punpcklwd %%mm5, %%mm5 \n\t"\ + "punpcklwd %%mm5, %%mm5 \n\t"\ + "psraw $3, %%mm5 \n\t"\ + "movq %%mm5, 3976(%2) \n\t"\ + "xorl %%eax, %%eax \n\t"\ + ".balign 16 \n\t"\ + "1: \n\t"\ + "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ + "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ + "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ + "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ + "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ + "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ + "movq 3976(%2), %%mm0 \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ + "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ + "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ + "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ + "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ + "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ + "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ 
+ "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ + "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\ + "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\ + "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ + "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ + "pmulhw 3968(%2), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ + "pmulhw 3968(%2), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ + "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ + "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ + "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ + "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ + #define YSCALEYUV2RGB \ "movd %6, %%mm6 \n\t" /*yalpha1*/\ "punpcklwd %%mm6, %%mm6 \n\t"\ @@ -306,7 +349,20 @@ "packuswb %%mm6, %%mm5 \n\t"\ "packuswb %%mm3, %%mm4 \n\t"\ "pxor %%mm7, %%mm7 \n\t" - + +#define YSCALEYUV2PACKED1 \ + "xorl %%eax, %%eax \n\t"\ + ".balign 16 \n\t"\ + "1: \n\t"\ + "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\ + "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ + "psraw $7, %%mm3 \n\t" \ + "psraw $7, %%mm4 \n\t" \ + "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\ + "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\ + "psraw $7, %%mm1 \n\t" \ + "psraw $7, %%mm7 \n\t" \ + #define YSCALEYUV2RGB1 \ "xorl %%eax, %%eax \n\t"\ ".balign 16 \n\t"\ @@ -355,6 +411,23 @@ "packuswb %%mm3, %%mm4 \n\t"\ "pxor %%mm7, %%mm7 \n\t" +#define YSCALEYUV2PACKED1b \ + "xorl %%eax, %%eax \n\t"\ + ".balign 16 \n\t"\ + "1: \n\t"\ + "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ + "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ + "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ + "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ + "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ + "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ + "psrlw $8, %%mm3 \n\t" \ + "psrlw $8, %%mm4 \n\t" \ + "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\ + 
"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\ + "psraw $7, %%mm1 \n\t" \ + "psraw $7, %%mm7 \n\t" + // do vertical chrominance interpolation #define YSCALEYUV2RGB1b \ "xorl %%eax, %%eax \n\t"\ @@ -652,6 +725,23 @@ #define WRITEBGR24 WRITEBGR24MMX #endif +#define WRITEYUY2 \ + "packuswb %%mm3, %%mm3 \n\t"\ + "packuswb %%mm4, %%mm4 \n\t"\ + "packuswb %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm4, %%mm3 \n\t"\ + "movq %%mm1, %%mm7 \n\t"\ + "punpcklbw %%mm3, %%mm1 \n\t"\ + "punpckhbw %%mm3, %%mm7 \n\t"\ +\ + MOVNTQ(%%mm1, (%4, %%eax, 2))\ + MOVNTQ(%%mm7, 8(%4, %%eax, 2))\ +\ + "addl $8, %%eax \n\t"\ + "cmpl %5, %%eax \n\t"\ + " jb 1b \n\t" + + static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW, @@ -752,7 +842,7 @@ /** * vertical scale YV12 to RGB */ -static inline void RENAME(yuv2rgbX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, +static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, uint8_t *dest, int dstW, int16_t * lumMmxFilter, int16_t * chrMmxFilter, int dstY) { @@ -831,9 +921,29 @@ ); } break; + case IMGFMT_YUY2: + { + asm volatile( + YSCALEYUV2PACKEDX + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ + + "psraw $3, %%mm3 \n\t" + "psraw $3, %%mm4 \n\t" + "psraw $3, %%mm1 \n\t" + "psraw $3, %%mm7 \n\t" + WRITEYUY2 + + :: "m" (-lumFilterSize), "m" (-chrFilterSize), + "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), + "r" (dest), "m" (dstW), + "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) + : "%eax", "%ebx", "%ecx", "%edx", "%esi" + ); + } + break; #endif default: - yuv2rgbXinC(c, lumFilter, lumSrc, lumFilterSize, + yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY); break; @@ -843,7 +953,7 
@@ /** * vertical bilinear scale YV12 to RGB */ -static inline void RENAME(yuv2rgb2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, +static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) { int yalpha1=yalpha^4095; @@ -1124,16 +1234,26 @@ : "%eax" ); return; + case IMGFMT_YUY2: + asm volatile( + YSCALEYUV2PACKED + WRITEYUY2 + + :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), + "m" (yalpha1), "m" (uvalpha1) + : "%eax" + ); + return; default: break; } #endif //HAVE_MMX -YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_2_C) +YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C) } /** * YV12 to RGB without scaling or interpolating */ -static inline void RENAME(yuv2rgb1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, +static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) { int uvalpha1=uvalpha^4095; @@ -1145,7 +1265,7 @@ if(flags&SWS_FULL_CHR_H_INT) { - RENAME(yuv2rgb2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); + RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); return; } @@ -1204,6 +1324,15 @@ : "%eax" ); return; + case IMGFMT_YUY2: + asm volatile( + YSCALEYUV2PACKED1 + WRITEYUY2 + :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), + "m" (yalpha1), "m" (uvalpha1) + : "%eax" + ); + return; } } else @@ -1260,14 +1389,23 @@ : "%eax" ); return; + case IMGFMT_YUY2: + asm volatile( + YSCALEYUV2PACKED1b + WRITEYUY2 + :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), + "m" (yalpha1), "m" (uvalpha1) + : "%eax" + ); + return; } } #endif if( uvalpha < 2048 ) { - YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_1_C) + 
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C) }else{ - YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_1B_C) + YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C) } } @@ -2533,7 +2671,7 @@ { int chrAlpha= vChrFilter[2*dstY+1]; - RENAME(yuv2rgb1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), + RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), dest, dstW, chrAlpha, dstFormat, flags, dstY); } else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB @@ -2541,12 +2679,12 @@ int lumAlpha= vLumFilter[2*dstY+1]; int chrAlpha= vChrFilter[2*dstY+1]; - RENAME(yuv2rgb2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), + RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), dest, dstW, lumAlpha, chrAlpha, dstY); } else //General RGB { - RENAME(yuv2rgbX)(c, + RENAME(yuv2packedX)(c, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, dest, dstW, @@ -2571,7 +2709,7 @@ { ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); - yuv2rgbXinC(c, + yuv2packedXinC(c, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, dest, dstW, dstY);