changeset 2724:c08b7af26782
yuy2toyv12 fixed and speedup
| author | michael |
| --- | --- |
| date | Mon, 05 Nov 2001 18:50:58 +0000 |
| parents | 22aba8af94af |
| children | 5bba527c9a4c |
| files | postproc/rgb2rgb.c postproc/rgb2rgb.h postproc/rgb2rgb_template.c |
| diffstat | 3 files changed, 208 insertions(+), 113 deletions(-) |
--- a/postproc/rgb2rgb.c Mon Nov 05 18:26:49 2001 +0000
+++ b/postproc/rgb2rgb.c Mon Nov 05 18:50:58 2001 +0000
@@ -291,7 +291,8 @@
 }
 /**
  *
- * width must be a multiple of 16 for the MMX version
+ * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
+ * problem for anyone then tell me, and ill fix it)
  */
 void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
     int width, int height, int lumStride, int chromStride, int dstStride)
@@ -359,70 +360,116 @@
 #endif
 }
-void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, unsigned num_pixels)
+/**
+ *
+ * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
+ * problem for anyone then tell me, and ill fix it)
+ */
+void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+    int width, int height, int lumStride, int chromStride, int srcStride)
 {
+    int y;
+    const int chromWidth= width>>1;
+    for(y=0; y<height; y+=2)
+    {
 #ifdef HAVE_MMX
-    asm volatile(
-        "xorl %%eax, %%eax \n\t"
-        "pcmpeqw %%mm7, %%mm7 \n\t"
-        "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
-        "1: \n\t"
-        PREFETCH" 64(%0, %%eax, 4) \n\t"
-        "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
-        "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
-        "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
-        "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
-        "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
-        "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
-        "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
-        "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
-        "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
-        "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
+        asm volatile(
+            "xorl %%eax, %%eax \n\t"
+            "pcmpeqw %%mm7, %%mm7 \n\t"
+            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
+            "1: \n\t"
+            PREFETCH" 64(%0, %%eax, 4) \n\t"
+            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
+            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
+            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
+            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
+            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
+            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
+            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
+            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
+            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
+            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
+
+            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
-        MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
+            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
+            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
+            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
+            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
+            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
+            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
+            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
+            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
+            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
+            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
-        "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
-        "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
-        "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
-        "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
-        "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
-        "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
-        "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
-        "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
-        "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
-        "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
+            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
+
+            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
+            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
+            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
+            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
+            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
+            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
+            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
+            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
-        MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
+            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
+            MOVNTQ" %%mm2, (%2, %%eax) \n\t"
+
+            "addl $8, %%eax \n\t"
+            "cmpl %4, %%eax \n\t"
+            " jb 1b \n\t"
-        "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
-        "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
-        "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
-        "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
-        "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
-        "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
-        "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
-        "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
+            "1: \n\t"
+            PREFETCH" 64(%0, %%eax, 4) \n\t"
+            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
+            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
+            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
+            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
+            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
+            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
+            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
+            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
+            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
+            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
-        MOVNTQ" %%mm0, (%3, %%eax) \n\t"
-        MOVNTQ" %%mm2, (%2, %%eax) \n\t"
+            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
+            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
+
+            "addl $8, %%eax \n\t"
+            "cmpl %5, %%eax \n\t"
+            " jb 1b \n\t"
-        "addl $8, %%eax \n\t"
-        "cmpl %4, %%eax \n\t"
-        " jb 1b \n\t"
-        EMMS" \n\t"
-        SFENCE
-        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (num_pixels>>1)
-        : "memory", "%eax"
-        );
+            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth), "m"(width)
+            : "memory", "%eax"
+        );
 #else
-    int i;
-    num_pixels>>=1;
-    for(i=0; i<num_pixels; i++)
-    {
-        ydst[2*i+0] = src[4*i+0];
-        udst[i] = src[4*i+1];
-        ydst[2*i+1] = src[4*i+2];
-        vdst[i] = src[4*i+3];
+        int i;
+        for(i=0; i<chromWidth; i++)
+        {
+            ydst[2*i+0] = src[4*i+0];
+            udst[i] = src[4*i+1];
+            ydst[2*i+1] = src[4*i+2];
+            vdst[i] = src[4*i+3];
+        }
+        ydst += lumStride;
+        src += srcStride;
+
+        for(i=0; i<chromWidth; i++)
+        {
+            ydst[2*i+0] = src[4*i+0];
+            ydst[2*i+1] = src[4*i+2];
+        }
+#endif
+        udst += chromStride;
+        vdst += chromStride;
+        ydst += lumStride;
+        src += srcStride;
     }
+#ifdef HAVE_MMX
+asm( EMMS" \n\t"
+    SFENCE" \n\t"
+    :::"memory");
 #endif
 }
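The rewritten routine walks the frame two lines at a time: the even line contributes luma plus the subsampled U/V pair, the odd line contributes luma only, which is how packed 4:2:2 YUY2 collapses into planar 4:2:0 YV12. A minimal scalar sketch of those semantics, mirroring the patch's C fallback (the standalone function name `yuy2toyv12_ref` is illustrative, not part of the patch):

```c
#include <stdint.h>

/* Reference semantics of the new interface: strides are in bytes,
 * chroma is taken from even source lines only (dropped, not averaged). */
static void yuy2toyv12_ref(const uint8_t *src, uint8_t *ydst,
                           uint8_t *udst, uint8_t *vdst,
                           int width, int height,
                           int lumStride, int chromStride, int srcStride)
{
    const int chromWidth = width >> 1;
    for (int y = 0; y < height; y += 2) {
        for (int i = 0; i < chromWidth; i++) {   /* even line: Y + UV */
            ydst[2*i + 0] = src[4*i + 0];
            udst[i]       = src[4*i + 1];
            ydst[2*i + 1] = src[4*i + 2];
            vdst[i]       = src[4*i + 3];
        }
        ydst += lumStride;  src += srcStride;
        for (int i = 0; i < chromWidth; i++) {   /* odd line: Y only */
            ydst[2*i + 0] = src[4*i + 0];
            ydst[2*i + 1] = src[4*i + 2];
        }
        ydst += lumStride;  src += srcStride;
        udst += chromStride; vdst += chromStride;
    }
}
```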
--- a/postproc/rgb2rgb.h Mon Nov 05 18:26:49 2001 +0000
+++ b/postproc/rgb2rgb.h Mon Nov 05 18:50:58 2001 +0000
@@ -23,6 +23,7 @@
 extern void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
     int width, int height, int lumStride, int chromStride, int dstStride);
-extern void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, unsigned num_pixels);
+extern void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+    int width, int height, int lumStride, int chromStride, int srcStride);

 #endif
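Since the prototype now takes the frame geometry instead of a pre-shifted pixel count, callers pass width, height, and the three strides explicitly. A hypothetical call site, assuming tightly packed planes (the wrapper name and frame size are made up for illustration):

```c
#include <stdint.h>
#include "rgb2rgb.h"

/* Convert one 640x480 YUY2 frame into three packed YV12 planes.
 * Packed layout: lumStride == width, chromStride == width/2,
 * and each YUY2 source line is 2*width bytes. */
void convert_frame(const uint8_t *yuy2, uint8_t *y, uint8_t *u, uint8_t *v)
{
    const int w = 640, h = 480;
    yuy2toyv12(yuy2, y, u, v, w, h, w, w/2, 2*w);
}
```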
--- a/postproc/rgb2rgb_template.c Mon Nov 05 18:26:49 2001 +0000
+++ b/postproc/rgb2rgb_template.c Mon Nov 05 18:50:58 2001 +0000
@@ -291,7 +291,8 @@
 }
 /**
  *
- * width must be a multiple of 16 for the MMX version
+ * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
+ * problem for anyone then tell me, and ill fix it)
  */
 void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
     int width, int height, int lumStride, int chromStride, int dstStride)
@@ -359,70 +360,116 @@
 #endif
 }
-void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, unsigned num_pixels)
+/**
+ *
+ * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
+ * problem for anyone then tell me, and ill fix it)
+ */
+void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+    int width, int height, int lumStride, int chromStride, int srcStride)
 {
+    int y;
+    const int chromWidth= width>>1;
+    for(y=0; y<height; y+=2)
+    {
 #ifdef HAVE_MMX
-    asm volatile(
-        "xorl %%eax, %%eax \n\t"
-        "pcmpeqw %%mm7, %%mm7 \n\t"
-        "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
-        "1: \n\t"
-        PREFETCH" 64(%0, %%eax, 4) \n\t"
-        "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
-        "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
-        "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
-        "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
-        "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
-        "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
-        "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
-        "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
-        "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
-        "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
+        asm volatile(
+            "xorl %%eax, %%eax \n\t"
+            "pcmpeqw %%mm7, %%mm7 \n\t"
+            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
+            "1: \n\t"
+            PREFETCH" 64(%0, %%eax, 4) \n\t"
+            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
+            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
+            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
+            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
+            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
+            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
+            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
+            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
+            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
+            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
+
+            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
-        MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
+            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
+            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
+            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
+            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
+            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
+            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
+            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
+            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
+            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
+            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
-        "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
-        "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
-        "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
-        "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
-        "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
-        "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
-        "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
-        "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
-        "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
-        "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
+            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
+
+            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
+            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
+            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
+            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
+            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
+            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
+            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
+            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
-        MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
+            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
+            MOVNTQ" %%mm2, (%2, %%eax) \n\t"
+
+            "addl $8, %%eax \n\t"
+            "cmpl %4, %%eax \n\t"
+            " jb 1b \n\t"
-        "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
-        "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
-        "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
-        "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
-        "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
-        "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
-        "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
-        "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
+            "1: \n\t"
+            PREFETCH" 64(%0, %%eax, 4) \n\t"
+            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
+            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
+            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
+            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
+            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
+            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
+            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
+            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
+            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
+            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
-        MOVNTQ" %%mm0, (%3, %%eax) \n\t"
-        MOVNTQ" %%mm2, (%2, %%eax) \n\t"
+            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
+            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
+
+            "addl $8, %%eax \n\t"
+            "cmpl %5, %%eax \n\t"
+            " jb 1b \n\t"
-        "addl $8, %%eax \n\t"
-        "cmpl %4, %%eax \n\t"
-        " jb 1b \n\t"
-        EMMS" \n\t"
-        SFENCE
-        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (num_pixels>>1)
-        : "memory", "%eax"
-        );
+            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth), "m"(width)
+            : "memory", "%eax"
+        );
 #else
-    int i;
-    num_pixels>>=1;
-    for(i=0; i<num_pixels; i++)
-    {
-        ydst[2*i+0] = src[4*i+0];
-        udst[i] = src[4*i+1];
-        ydst[2*i+1] = src[4*i+2];
-        vdst[i] = src[4*i+3];
+        int i;
+        for(i=0; i<chromWidth; i++)
+        {
+            ydst[2*i+0] = src[4*i+0];
+            udst[i] = src[4*i+1];
+            ydst[2*i+1] = src[4*i+2];
+            vdst[i] = src[4*i+3];
+        }
+        ydst += lumStride;
+        src += srcStride;
+
+        for(i=0; i<chromWidth; i++)
+        {
+            ydst[2*i+0] = src[4*i+0];
+            ydst[2*i+1] = src[4*i+2];
+        }
+#endif
+        udst += chromStride;
+        vdst += chromStride;
+        ydst += lumStride;
+        src += srcStride;
    }
+#ifdef HAVE_MMX
+asm( EMMS" \n\t"
+    SFENCE" \n\t"
+    :::"memory");
 #endif
 }
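Part of the speedup is that EMMS and SFENCE moved out of the inner asm: EMMS (which makes the x87 register file usable again after MMX code) and SFENCE (which orders the non-temporal MOVNTQ stores) now execute once per call, after the whole frame, rather than inside every asm block. The shape of that pattern, as an illustrative sketch (the function name is invented; the real code spells these through its PREFETCH/MOVNTQ/SFENCE macros):

```c
/* Hoisted-EMMS pattern: do all per-line MMX work first, then clean up
 * the FPU state and order the streaming stores exactly once per call. */
static void process_frame(int height)
{
    for (int y = 0; y < height; y += 2) {
        /* ... per-line MMX conversion loops, as in yuy2toyv12() ... */
    }
#ifdef HAVE_MMX
    __asm__ volatile("emms\n\t" "sfence" ::: "memory");
#endif
}
```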