# HG changeset patch
# User michael
# Date 1017074124 0
# Node ID 0bd1c35aa42c020ff8419d6363e0b09a95550a2c
# Parent 3cc26c4e662b7fb09d99dc53d5cc2f88feb5dd37
byte interleaving for mga
untested (no g200 mga or whatever i would need ...)
experimental sse2 version (even less tested as no p4 either ...)
sse2 version would need 16-byte aligned src & dst else sig11
sse2 version is disabled by default

diff -r 3cc26c4e662b -r 0bd1c35aa42c postproc/rgb2rgb.c
--- a/postproc/rgb2rgb.c	Mon Mar 25 16:22:15 2002 +0000
+++ b/postproc/rgb2rgb.c	Mon Mar 25 16:35:24 2002 +0000
@@ -409,3 +409,33 @@
 		rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
 #endif
 }
+
+/**
+ * Byte interleaving entry point: hands all arguments unchanged to one
+ * interleaveBytes_* implementation. With CAN_COMPILE_X86_ASM the variant is
+ * chosen from gCpuCaps, fastest first (MMX2, 3DNow, MMX); in every other
+ * case the C version runs.
+ */
+void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
+		     int width, int height, int src1Stride, int src2Stride, int dstStride)
+{
+#ifdef CAN_COMPILE_X86_ASM
+	if(gCpuCaps.hasMMX2)
+	{
+		interleaveBytes_MMX2(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
+		return;
+	}
+	if(gCpuCaps.has3DNow)
+	{
+		interleaveBytes_3DNow(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
+		return;
+	}
+	if(gCpuCaps.hasMMX)
+	{
+		interleaveBytes_MMX(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
+		return;
+	}
+#endif
+	// reached when no accelerated variant was picked above or none is compiled in
+	interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
+}
diff -r 3cc26c4e662b -r 0bd1c35aa42c postproc/rgb2rgb.h
--- a/postproc/rgb2rgb.h	Mon Mar 25 16:22:15 2002 +0000
+++ b/postproc/rgb2rgb.h	Mon Mar 25 16:35:24 2002 +0000
@@ -34,6 +34,10 @@
 	unsigned int width, unsigned int height,
 	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride);
 
+extern void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
+			    int width, int height, int src1Stride, int src2Stride, int dstStride);
+
+
 #define MODE_RGB 0x1
 #define MODE_BGR 0x2
 
diff -r 3cc26c4e662b -r 0bd1c35aa42c 
postproc/rgb2rgb_template.c
--- a/postproc/rgb2rgb_template.c	Mon Mar 25 16:22:15 2002 +0000
+++ b/postproc/rgb2rgb_template.c	Mon Mar 25 16:35:24 2002 +0000
@@ -1197,3 +1197,100 @@
 		src += srcStride;
 	}
 }
+
+/**
+ * Interleaves two byte planes row by row: for every w in [0, width),
+ * dest[2*w] = src1[w] and dest[2*w+1] = src2[w]. After each of the height
+ * rows dest, src1 and src2 advance by dstStride, src1Stride and src2Stride.
+ *
+ * With HAVE_MMX, 16 source bytes per iteration are handled in asm and the
+ * remaining (width & 15) bytes of the row in C.
+ * The HAVE_SSE2 variant uses movdqa/movntdq, which fault on addresses that
+ * are not 16-byte aligned, so src1, src2, dest and all strides must keep
+ * every row aligned.
+ */
+void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
+			    int width, int height, int src1Stride, int src2Stride, int dstStride){
+	int h;
+
+	for(h=0; h < height; h++)
+	{
+		int w;
+
+#ifdef HAVE_MMX
+		// The asm loop checks its counter only after the first pass and
+		// compares it unsigned against width-15, so with width < 16 it
+		// would run past the row; such rows are left to the C loop below.
+		if(width >= 16)
+		{
+#ifdef HAVE_SSE2
+			asm(
+				"xorl %%eax, %%eax \n\t"
+				"1: \n\t"
+				PREFETCH" 64(%1, %%eax) \n\t"
+				PREFETCH" 64(%2, %%eax) \n\t"
+				"movdqa (%1, %%eax), %%xmm0 \n\t"
+				"movdqa (%1, %%eax), %%xmm1 \n\t"
+				"movdqa (%2, %%eax), %%xmm2 \n\t"
+				"punpcklbw %%xmm2, %%xmm0 \n\t"
+				"punpckhbw %%xmm2, %%xmm1 \n\t"
+				"movntdq %%xmm0, (%0, %%eax, 2) \n\t"
+				"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
+				"addl $16, %%eax \n\t"
+				"cmpl %3, %%eax \n\t"
+				" jb 1b \n\t"
+				::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
+				: "memory", "%eax"
+			);
+#else
+			asm(
+				"xorl %%eax, %%eax \n\t"
+				"1: \n\t"
+				PREFETCH" 64(%1, %%eax) \n\t"
+				PREFETCH" 64(%2, %%eax) \n\t"
+				"movq (%1, %%eax), %%mm0 \n\t"
+				"movq 8(%1, %%eax), %%mm2 \n\t"
+				"movq %%mm0, %%mm1 \n\t"
+				"movq %%mm2, %%mm3 \n\t"
+				"movq (%2, %%eax), %%mm4 \n\t"
+				"movq 8(%2, %%eax), %%mm5 \n\t"
+				"punpcklbw %%mm4, %%mm0 \n\t"
+				"punpckhbw %%mm4, %%mm1 \n\t"
+				"punpcklbw %%mm5, %%mm2 \n\t"
+				"punpckhbw %%mm5, %%mm3 \n\t"
+				MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
+				MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
+				MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
+				MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
+				"addl $16, %%eax \n\t"
+				"cmpl %3, %%eax \n\t"
+				" jb 1b \n\t"
+				::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
+				: "memory", "%eax"
+			);
+#endif
+		}
+		for(w= (width&(~15)); w < width; w++)
+		{
+			dest[2*w+0] = src1[w];
+			dest[2*w+1] = src2[w];
+		}
+#else
+		for(w=0; w < width; w++)
+		{
+			dest[2*w+0] = src1[w];
+			dest[2*w+1] = src2[w];
+		}
+#endif
+		dest += dstStride;
+		src1 += src1Stride;
+		src2 += src2Stride;
+	}
+#ifdef HAVE_MMX
+	asm(
+		EMMS" \n\t"
+		SFENCE" \n\t"
+		::: "memory"
+	);
+#endif
+}