Mercurial > mplayer.hg
changeset 27529:ffb573fae5e8
Rewrite bgr24->yuv mmx code, the new code is cleaner, more accurate,
and does not throw half the chroma away.
author | michael |
---|---|
date | Tue, 09 Sep 2008 23:30:06 +0000 |
parents | 0474738b5577 |
children | dbdc77f8b041 |
files | libswscale/swscale.c libswscale/swscale_template.c |
diffstat | 2 files changed, 147 insertions(+), 196 deletions(-) [+] |
line wrap: on
line diff
--- a/libswscale/swscale.c Tue Sep 09 21:07:26 2008 +0000 +++ b/libswscale/swscale.c Tue Sep 09 23:30:06 2008 +0000 @@ -237,6 +237,20 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL; DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL; + +DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toY1Coeff) = 0x0C88000040870C88ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toY2Coeff) = 0x20DE4087000020DEULL; +DECLARE_ALIGNED(8, const uint64_t, ff_rgb24toY1Coeff) = 0x20DE0000408720DEULL; +DECLARE_ALIGNED(8, const uint64_t, ff_rgb24toY2Coeff) = 0x0C88408700000C88ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toYOffset) = 0x0008400000084000ULL; + +DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toUV[2][4]) = { + {0x38380000DAC83838ULL, 0xECFFDAC80000ECFFULL, 0xF6E40000D0E3F6E4ULL, 0x3838D0E300003838ULL}, + {0xECFF0000DAC8ECFFULL, 0x3838DAC800003838ULL, 0x38380000D0E33838ULL, 0xF6E4D0E30000F6E4ULL}, +}; + +DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toUVOffset)= 0x0040400000404000ULL; + #endif /* defined(ARCH_X86) */ // clipping helper table for C implementations: @@ -2201,7 +2215,8 @@ if ((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP) && srcFormat!=PIX_FMT_RGB8 && srcFormat!=PIX_FMT_BGR8 && srcFormat!=PIX_FMT_RGB4 && srcFormat!=PIX_FMT_BGR4 - && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE) + && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE + && srcFormat!=PIX_FMT_BGR24 && srcFormat!=PIX_FMT_RGB24) c->chrSrcHSubSample=1; if (param){
--- a/libswscale/swscale_template.c Tue Sep 09 21:07:26 2008 +0000 +++ b/libswscale/swscale_template.c Tue Sep 09 23:30:06 2008 +0000 @@ -1875,78 +1875,121 @@ } } +#ifdef HAVE_MMX +static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat) +{ + + if(srcFormat == PIX_FMT_BGR24){ + asm volatile( + "movq "MANGLE(ff_bgr24toY1Coeff)", %mm5 \n\t" + "movq "MANGLE(ff_bgr24toY2Coeff)", %mm6 \n\t" + ); + }else{ + asm volatile( + "movq "MANGLE(ff_rgb24toY1Coeff)", %mm5 \n\t" + "movq "MANGLE(ff_rgb24toY2Coeff)", %mm6 \n\t" + ); + } + + asm volatile( + "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t" + "mov %2, %%"REG_a" \n\t" + "pxor %%mm7, %%mm7 \n\t" + "1: \n\t" + PREFETCH" 64(%0) \n\t" + "movd (%0), %%mm0 \n\t" + "movd 2(%0), %%mm1 \n\t" + "movd 6(%0), %%mm2 \n\t" + "movd 8(%0), %%mm3 \n\t" + "add $12, %0 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "pmaddwd %%mm5, %%mm0 \n\t" + "pmaddwd %%mm6, %%mm1 \n\t" + "pmaddwd %%mm5, %%mm2 \n\t" + "pmaddwd %%mm6, %%mm3 \n\t" + "paddd %%mm1, %%mm0 \n\t" + "paddd %%mm3, %%mm2 \n\t" + "paddd %%mm4, %%mm0 \n\t" + "paddd %%mm4, %%mm2 \n\t" + "psrad $15, %%mm0 \n\t" + "psrad $15, %%mm2 \n\t" + "packssdw %%mm2, %%mm0 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "movd %%mm0, (%1, %%"REG_a") \n\t" + "add $4, %%"REG_a" \n\t" + " js 1b \n\t" + : "+r" (src) + : "r" (dst+width), "g" (-width) + : "%"REG_a + ); +} + +static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat) +{ + asm volatile( + "movq 24+%4, %%mm6 \n\t" + "mov %3, %%"REG_a" \n\t" + "pxor %%mm7, %%mm7 \n\t" + "1: \n\t" + PREFETCH" 64(%0) \n\t" + "movd (%0), %%mm0 \n\t" + "movd 2(%0), %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "pmaddwd %4, %%mm0 \n\t" + "pmaddwd 8+%4, %%mm1 \n\t" + "pmaddwd 16+%4, %%mm2 \n\t" + "pmaddwd %%mm6, %%mm3 \n\t" + "paddd %%mm1, %%mm0 \n\t" + "paddd %%mm3, %%mm2 \n\t" + + "movd 6(%0), %%mm1 \n\t" + "movd 8(%0), %%mm3 \n\t" + "add $12, %0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "pmaddwd %4, %%mm1 \n\t" + "pmaddwd 8+%4, %%mm3 \n\t" + "pmaddwd 16+%4, %%mm4 \n\t" + "pmaddwd %%mm6, %%mm5 \n\t" + "paddd %%mm3, %%mm1 \n\t" + "paddd %%mm5, %%mm4 \n\t" + + "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t" + "paddd %%mm3, %%mm0 \n\t" + "paddd %%mm3, %%mm2 \n\t" + "paddd %%mm3, %%mm1 \n\t" + "paddd %%mm3, %%mm4 \n\t" + "psrad $15, %%mm0 \n\t" + "psrad $15, %%mm2 \n\t" + "psrad $15, %%mm1 \n\t" + "psrad $15, %%mm4 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "packssdw %%mm4, %%mm2 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm2, %%mm2 \n\t" + "movd %%mm0, (%1, %%"REG_a") \n\t" + "movd %%mm2, (%2, %%"REG_a") \n\t" + "add $4, %%"REG_a" \n\t" + " js 1b \n\t" + : "+r" (src) + : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0]) + : "%"REG_a + ); +} +#endif + static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width) { #ifdef HAVE_MMX - asm volatile( - "mov %2, %%"REG_a" \n\t" - "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" - "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 64(%0, %%"REG_d") \n\t" - "movd (%0, %%"REG_d"), %%mm0 \n\t" - "movd 3(%0, %%"REG_d"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movd 6(%0, %%"REG_d"), %%mm2 \n\t" - "movd 9(%0, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm0 \n\t" - "pmaddwd %%mm6, %%mm1 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "pmaddwd %%mm6, %%mm3 \n\t" -#ifndef FAST_BGR2YV12 - "psrad $8, %%mm0 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" -#endif - "packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "pmaddwd %%mm5, %%mm0 \n\t" - "pmaddwd %%mm5, %%mm2 \n\t" - "packssdw %%mm2, %%mm0 \n\t" - "psraw $7, %%mm0 \n\t" - - "movd 12(%0, %%"REG_d"), %%mm4 \n\t" - "movd 15(%0, %%"REG_d"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movd 18(%0, %%"REG_d"), %%mm2 \n\t" - "movd 21(%0, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm4 \n\t" - "pmaddwd %%mm6, %%mm1 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "pmaddwd %%mm6, %%mm3 \n\t" -#ifndef FAST_BGR2YV12 - "psrad $8, %%mm4 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" -#endif - "packssdw %%mm1, %%mm4 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "pmaddwd %%mm5, %%mm4 \n\t" - "pmaddwd %%mm5, %%mm2 \n\t" - "add $24, %%"REG_d" \n\t" - "packssdw %%mm2, %%mm4 \n\t" - "psraw $7, %%mm4 \n\t" - - "packuswb %%mm4, %%mm0 \n\t" - "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" - - "movq %%mm0, (%1, %%"REG_a") \n\t" - "add $8, %%"REG_a" \n\t" - " js 1b \n\t" - : : "r" (src+width*3), "r" (dst+width), "g" (-width) - : "%"REG_a, "%"REG_d - ); + bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24); #else int i; for (i=0; i<width; i++) @@ -1963,132 +2006,17 @@ static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) { #ifdef HAVE_MMX - asm volatile( - "mov %3, %%"REG_a" \n\t" - "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" - "add %%"REG_d", %%"REG_d" \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 64(%0, %%"REG_d") \n\t" -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) - "movq (%0, %%"REG_d"), %%mm0 \n\t" - "movq 6(%0, %%"REG_d"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlq $24, %%mm0 \n\t" - "psrlq $24, %%mm2 \n\t" - PAVGB(%%mm1, %%mm0) - PAVGB(%%mm3, %%mm2) - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" -#else - "movd (%0, %%"REG_d"), %%mm0 \n\t" - "movd 3(%0, %%"REG_d"), %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "movd 6(%0, %%"REG_d"), %%mm4 \n\t" - "movd 9(%0, %%"REG_d"), %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "paddw %%mm4, %%mm2 \n\t" - "psrlw $1, %%mm0 \n\t" - "psrlw $1, %%mm2 \n\t" -#endif - "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" - "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" - - "pmaddwd %%mm0, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm0 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" -#ifndef FAST_BGR2YV12 - "psrad $8, %%mm0 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" -#endif - "packssdw %%mm2, %%mm0 \n\t" - "packssdw %%mm3, %%mm1 \n\t" - "pmaddwd %%mm5, %%mm0 \n\t" - "pmaddwd %%mm5, %%mm1 \n\t" - "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 - "psraw $7, %%mm0 \n\t" - -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) - "movq 12(%0, %%"REG_d"), %%mm4 \n\t" - "movq 18(%0, %%"REG_d"), %%mm2 \n\t" - "movq %%mm4, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlq $24, %%mm4 \n\t" - "psrlq $24, %%mm2 \n\t" - PAVGB(%%mm1, %%mm4) - PAVGB(%%mm3, %%mm2) - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" -#else - "movd 12(%0, %%"REG_d"), %%mm4 \n\t" - "movd 15(%0, %%"REG_d"), %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "paddw %%mm2, %%mm4 \n\t" - "movd 18(%0, %%"REG_d"), %%mm5 \n\t" - "movd 21(%0, %%"REG_d"), %%mm2 \n\t" - "punpcklbw %%mm7, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "paddw %%mm5, %%mm2 \n\t" - "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm2 \n\t" -#endif - "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" - "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" - - "pmaddwd %%mm4, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm4 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" -#ifndef FAST_BGR2YV12 - "psrad $8, %%mm4 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" -#endif - "packssdw %%mm2, %%mm4 \n\t" - "packssdw %%mm3, %%mm1 \n\t" - "pmaddwd %%mm5, %%mm4 \n\t" - "pmaddwd %%mm5, %%mm1 \n\t" - "add $24, %%"REG_d" \n\t" - "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 - "psraw $7, %%mm4 \n\t" - - "movq %%mm0, %%mm1 \n\t" - "punpckldq %%mm4, %%mm0 \n\t" - "punpckhdq %%mm4, %%mm1 \n\t" - "packsswb %%mm1, %%mm0 \n\t" - "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" - - "movd %%mm0, (%1, %%"REG_a") \n\t" - "punpckhdq %%mm0, %%mm0 \n\t" - "movd %%mm0, (%2, %%"REG_a") \n\t" - "add $4, %%"REG_a" \n\t" - " js 1b \n\t" - : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width) - : "%"REG_a, "%"REG_d - ); + bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_BGR24); #else int i; for (i=0; i<width; i++) { - int b= src1[6*i + 0] + src1[6*i + 3]; - int g= src1[6*i + 1] + src1[6*i + 4]; - int r= src1[6*i + 2] + src1[6*i + 5]; + int b= src1[3*i + 0]; + int g= src1[3*i + 1]; + int r= src1[3*i + 2]; - dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); - dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); + dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; + dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; } #endif /* HAVE_MMX */ assert(src1 == src2); @@ -2201,6 +2129,9 @@ static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width) { +#ifdef HAVE_MMX + bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24); +#else int i; for (i=0; i<width; i++) { @@ -2210,21 +2141,26 @@ dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); } +#endif } static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) { int i; assert(src1==src2); +#ifdef HAVE_MMX + bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24); +#else for (i=0; i<width; i++) { - int r= src1[6*i + 0] + src1[6*i + 3]; - int g= src1[6*i + 1] + src1[6*i + 4]; - int b= src1[6*i + 2] + src1[6*i + 5]; + int r= src1[3*i + 0]; + int g= src1[3*i + 1]; + int b= src1[3*i + 2]; - dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1); - dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1); + dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; + dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; } +#endif } static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, long width)