changeset 27529:ffb573fae5e8

Rewrite bgr24->yuv mmx code, the new code is cleaner, more accurate, and does not throw half the chroma away.
author michael
date Tue, 09 Sep 2008 23:30:06 +0000
parents 0474738b5577
children dbdc77f8b041
files libswscale/swscale.c libswscale/swscale_template.c
diffstat 2 files changed, 147 insertions(+), 196 deletions(-) [+]
line wrap: on
line diff
--- a/libswscale/swscale.c	Tue Sep 09 21:07:26 2008 +0000
+++ b/libswscale/swscale.c	Tue Sep 09 23:30:06 2008 +0000
@@ -237,6 +237,20 @@
 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset)  = 0x1010101010101010ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
+// Packed pmaddwd multipliers/offsets for the 24-bit RGB/BGR -> YUV MMX routines; 16-bit words are listed lowest first below.
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toY1Coeff) = 0x0C88000040870C88ULL; // for bytes B0 G0 R0 B1: 0x0C88, 0x4087, 0, 0x0C88
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toY2Coeff) = 0x20DE4087000020DEULL; // for bytes R0 B1 G1 R1: 0x20DE, 0, 0x4087, 0x20DE
+DECLARE_ALIGNED(8, const uint64_t, ff_rgb24toY1Coeff) = 0x20DE0000408720DEULL; // for bytes R0 G0 B0 R1: 0x20DE, 0x4087, 0, 0x20DE
+DECLARE_ALIGNED(8, const uint64_t, ff_rgb24toY2Coeff) = 0x0C88408700000C88ULL; // for bytes B0 R1 G1 B1: 0x0C88, 0, 0x4087, 0x0C88
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toYOffset) = 0x0008400000084000ULL; // 33<<14 in each 32-bit lane, added before the >>15
+// ff_bgr24toUV[0] is used for BGR24 input, [1] for RGB24; per row: entries 0,1 feed dstU and entries 2,3 feed dstV.
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toUV[2][4]) = {
+    {0x38380000DAC83838ULL, 0xECFFDAC80000ECFFULL, 0xF6E40000D0E3F6E4ULL, 0x3838D0E300003838ULL},
+    {0xECFF0000DAC8ECFFULL, 0x3838DAC800003838ULL, 0x38380000D0E33838ULL, 0xF6E4D0E30000F6E4ULL},
+};
+// 257<<14 in each 32-bit lane (the 128 chroma bias plus one half at 15 fractional bits), added before the >>15.
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toUVOffset)= 0x0040400000404000ULL;
+
 #endif /* defined(ARCH_X86) */
 
 // clipping helper table for C implementations:
@@ -2201,7 +2215,8 @@
     if ((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP)
       && srcFormat!=PIX_FMT_RGB8      && srcFormat!=PIX_FMT_BGR8
       && srcFormat!=PIX_FMT_RGB4      && srcFormat!=PIX_FMT_BGR4
-      && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE)
+      && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE
+      && srcFormat!=PIX_FMT_BGR24     && srcFormat!=PIX_FMT_RGB24)
         c->chrSrcHSubSample=1;
 
     if (param){
--- a/libswscale/swscale_template.c	Tue Sep 09 21:07:26 2008 +0000
+++ b/libswscale/swscale_template.c	Tue Sep 09 23:30:06 2008 +0000
@@ -1875,78 +1875,121 @@
     }
 }
 
+#ifdef HAVE_MMX
+static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat)
+{ // Packed 24-bit BGR/RGB at src -> one luma byte per pixel at dst, 4 pixels (12 source bytes) per loop pass.
+    // srcFormat selects the multipliers: PIX_FMT_BGR24 -> ff_bgr24toY*Coeff, any other value -> ff_rgb24toY*Coeff.
+    if(srcFormat == PIX_FMT_BGR24){
+        asm volatile( // no operand list here, hence the single '%' on register names
+            "movq  "MANGLE(ff_bgr24toY1Coeff)", %mm5       \n\t" // mm5: multipliers for the loads at offsets 0 and 6
+            "movq  "MANGLE(ff_bgr24toY2Coeff)", %mm6       \n\t" // mm6: multipliers for the loads at offsets 2 and 8
+        );
+    }else{
+        asm volatile(
+            "movq  "MANGLE(ff_rgb24toY1Coeff)", %mm5       \n\t" // same roles as above, R/B positions swapped
+            "movq  "MANGLE(ff_rgb24toY2Coeff)", %mm6       \n\t"
+        );
+    }
+    // NOTE(review): mm5/mm6 set above are consumed by the asm below; no constraint tells the compiler to keep them live in between.
+    asm volatile(
+        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t" // mm4: offset/rounding term added before the final shift
+        "mov                        %2, %%"REG_a"   \n\t" // REG_a = -width, counts up towards 0
+        "pxor                    %%mm7, %%mm7       \n\t" // mm7 = 0, used to zero-extend bytes to words
+        "1:                                         \n\t"
+        PREFETCH"               64(%0)              \n\t"
+        "movd                     (%0), %%mm0       \n\t" // bytes 0..3: pixel 0 plus first byte of pixel 1
+        "movd                    2(%0), %%mm1       \n\t" // bytes 2..5: last byte of pixel 0 plus pixel 1
+        "movd                    6(%0), %%mm2       \n\t" // bytes 6..9: pixel 2 plus first byte of pixel 3
+        "movd                    8(%0), %%mm3       \n\t" // bytes 8..11: last byte of pixel 2 plus pixel 3
+        "add                       $12, %0          \n\t" // advance src by 4 pixels
+        "punpcklbw               %%mm7, %%mm0       \n\t" // widen each byte to a 16-bit word
+        "punpcklbw               %%mm7, %%mm1       \n\t"
+        "punpcklbw               %%mm7, %%mm2       \n\t"
+        "punpcklbw               %%mm7, %%mm3       \n\t"
+        "pmaddwd                 %%mm5, %%mm0       \n\t" // multiply words and add adjacent pairs -> two 32-bit partial sums
+        "pmaddwd                 %%mm6, %%mm1       \n\t"
+        "pmaddwd                 %%mm5, %%mm2       \n\t"
+        "pmaddwd                 %%mm6, %%mm3       \n\t"
+        "paddd                   %%mm1, %%mm0       \n\t" // mm0 = weighted sums for pixels 0 and 1
+        "paddd                   %%mm3, %%mm2       \n\t" // mm2 = weighted sums for pixels 2 and 3
+        "paddd                   %%mm4, %%mm0       \n\t" // add the offset term
+        "paddd                   %%mm4, %%mm2       \n\t"
+        "psrad                     $15, %%mm0       \n\t" // drop the 15 fractional bits
+        "psrad                     $15, %%mm2       \n\t"
+        "packssdw                %%mm2, %%mm0       \n\t" // four 16-bit results for pixels 0..3
+        "packuswb                %%mm0, %%mm0       \n\t" // saturate to unsigned bytes
+        "movd                %%mm0, (%1, %%"REG_a") \n\t" // store 4 luma bytes at (dst+width)+REG_a
+        "add                        $4, %%"REG_a"   \n\t"
+        " js                        1b              \n\t" // repeat while the index is still negative
+    : "+r" (src)
+    : "r" (dst+width), "g" (-width) // NOTE(review): body runs at least once, 4 pixels per pass; a width that is not a multiple of 4 reads/writes past it
+    : "%"REG_a // NOTE(review): no "memory" clobber although the asm stores to dst
+    );
+}
+
+static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
+{ // Packed 24-bit BGR/RGB at src -> one byte per pixel in each of dstU and dstV, 4 pixels (12 source bytes) per loop pass.
+    asm volatile( // %4 is row [srcFormat == PIX_FMT_RGB24] of ff_bgr24toUV; 8+%4, 16+%4, 24+%4 address entries 1..3 of that row
+        "movq                    24+%4, %%mm6       \n\t" // mm6 = entry 3 (second multiplier for the dstV sums)
+        "mov                        %3, %%"REG_a"   \n\t" // REG_a = -width, counts up towards 0
+        "pxor                    %%mm7, %%mm7       \n\t" // mm7 = 0, used to zero-extend bytes to words
+        "1:                                         \n\t"
+        PREFETCH"               64(%0)              \n\t"
+        "movd                     (%0), %%mm0       \n\t" // bytes 0..3: pixel 0 plus first byte of pixel 1
+        "movd                    2(%0), %%mm1       \n\t" // bytes 2..5: last byte of pixel 0 plus pixel 1
+        "punpcklbw               %%mm7, %%mm0       \n\t"
+        "punpcklbw               %%mm7, %%mm1       \n\t"
+        "movq                    %%mm0, %%mm2       \n\t" // keep copies: the same words are multiplied for both planes
+        "movq                    %%mm1, %%mm3       \n\t"
+        "pmaddwd                    %4, %%mm0       \n\t" // entries 0 and 1 -> sums stored to dstU
+        "pmaddwd                  8+%4, %%mm1       \n\t"
+        "pmaddwd                 16+%4, %%mm2       \n\t" // entries 2 and 3 -> sums stored to dstV
+        "pmaddwd                 %%mm6, %%mm3       \n\t"
+        "paddd                   %%mm1, %%mm0       \n\t" // mm0 = dstU sums for pixels 0 and 1
+        "paddd                   %%mm3, %%mm2       \n\t" // mm2 = dstV sums for pixels 0 and 1
+        // Same computation for pixels 2 and 3 (source bytes 6..11).
+        "movd                    6(%0), %%mm1       \n\t"
+        "movd                    8(%0), %%mm3       \n\t"
+        "add                       $12, %0          \n\t" // advance src by 4 pixels
+        "punpcklbw               %%mm7, %%mm1       \n\t"
+        "punpcklbw               %%mm7, %%mm3       \n\t"
+        "movq                    %%mm1, %%mm4       \n\t"
+        "movq                    %%mm3, %%mm5       \n\t"
+        "pmaddwd                    %4, %%mm1       \n\t"
+        "pmaddwd                  8+%4, %%mm3       \n\t"
+        "pmaddwd                 16+%4, %%mm4       \n\t"
+        "pmaddwd                 %%mm6, %%mm5       \n\t"
+        "paddd                   %%mm3, %%mm1       \n\t" // mm1 = dstU sums for pixels 2 and 3
+        "paddd                   %%mm5, %%mm4       \n\t" // mm4 = dstV sums for pixels 2 and 3
+        // Add the offset term, drop the 15 fractional bits, pack to bytes and store.
+        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t" // reloaded every pass because mm3 doubles as scratch above
+        "paddd                   %%mm3, %%mm0       \n\t"
+        "paddd                   %%mm3, %%mm2       \n\t"
+        "paddd                   %%mm3, %%mm1       \n\t"
+        "paddd                   %%mm3, %%mm4       \n\t"
+        "psrad                     $15, %%mm0       \n\t"
+        "psrad                     $15, %%mm2       \n\t"
+        "psrad                     $15, %%mm1       \n\t"
+        "psrad                     $15, %%mm4       \n\t"
+        "packssdw                %%mm1, %%mm0       \n\t" // four 16-bit dstU values for pixels 0..3
+        "packssdw                %%mm4, %%mm2       \n\t" // four 16-bit dstV values for pixels 0..3
+        "packuswb                %%mm0, %%mm0       \n\t" // saturate to unsigned bytes
+        "packuswb                %%mm2, %%mm2       \n\t"
+        "movd                %%mm0, (%1, %%"REG_a") \n\t" // 4 bytes to (dstU+width)+REG_a
+        "movd                %%mm2, (%2, %%"REG_a") \n\t" // 4 bytes to (dstV+width)+REG_a
+        "add                        $4, %%"REG_a"   \n\t"
+        " js                        1b              \n\t" // repeat while the index is still negative
+    : "+r" (src)
+    : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0]) // NOTE(review): body runs at least once, 4 pixels per pass
+    : "%"REG_a // NOTE(review): no "memory" clobber although the asm stores to dstU and dstV
+    );
+}
+#endif
+
 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
 {
 #ifdef HAVE_MMX
-    asm volatile(
-    "mov                        %2, %%"REG_a"   \n\t"
-    "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
-    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
-    "pxor                    %%mm7, %%mm7       \n\t"
-    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
-    ASMALIGN(4)
-    "1:                                         \n\t"
-    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
-    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
-    "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
-    "punpcklbw               %%mm7, %%mm0       \n\t"
-    "punpcklbw               %%mm7, %%mm1       \n\t"
-    "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
-    "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
-    "punpcklbw               %%mm7, %%mm2       \n\t"
-    "punpcklbw               %%mm7, %%mm3       \n\t"
-    "pmaddwd                 %%mm6, %%mm0       \n\t"
-    "pmaddwd                 %%mm6, %%mm1       \n\t"
-    "pmaddwd                 %%mm6, %%mm2       \n\t"
-    "pmaddwd                 %%mm6, %%mm3       \n\t"
-#ifndef FAST_BGR2YV12
-    "psrad                      $8, %%mm0       \n\t"
-    "psrad                      $8, %%mm1       \n\t"
-    "psrad                      $8, %%mm2       \n\t"
-    "psrad                      $8, %%mm3       \n\t"
-#endif
-    "packssdw                %%mm1, %%mm0       \n\t"
-    "packssdw                %%mm3, %%mm2       \n\t"
-    "pmaddwd                 %%mm5, %%mm0       \n\t"
-    "pmaddwd                 %%mm5, %%mm2       \n\t"
-    "packssdw                %%mm2, %%mm0       \n\t"
-    "psraw                      $7, %%mm0       \n\t"
-
-    "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
-    "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
-    "punpcklbw               %%mm7, %%mm4       \n\t"
-    "punpcklbw               %%mm7, %%mm1       \n\t"
-    "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
-    "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
-    "punpcklbw               %%mm7, %%mm2       \n\t"
-    "punpcklbw               %%mm7, %%mm3       \n\t"
-    "pmaddwd                 %%mm6, %%mm4       \n\t"
-    "pmaddwd                 %%mm6, %%mm1       \n\t"
-    "pmaddwd                 %%mm6, %%mm2       \n\t"
-    "pmaddwd                 %%mm6, %%mm3       \n\t"
-#ifndef FAST_BGR2YV12
-    "psrad                      $8, %%mm4       \n\t"
-    "psrad                      $8, %%mm1       \n\t"
-    "psrad                      $8, %%mm2       \n\t"
-    "psrad                      $8, %%mm3       \n\t"
-#endif
-    "packssdw                %%mm1, %%mm4       \n\t"
-    "packssdw                %%mm3, %%mm2       \n\t"
-    "pmaddwd                 %%mm5, %%mm4       \n\t"
-    "pmaddwd                 %%mm5, %%mm2       \n\t"
-    "add                       $24, %%"REG_d"   \n\t"
-    "packssdw                %%mm2, %%mm4       \n\t"
-    "psraw                      $7, %%mm4       \n\t"
-
-    "packuswb                %%mm4, %%mm0       \n\t"
-    "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
-
-    "movq                    %%mm0, (%1, %%"REG_a") \n\t"
-    "add                        $8, %%"REG_a"   \n\t"
-    " js                        1b              \n\t"
-    : : "r" (src+width*3), "r" (dst+width), "g" (-width)
-    : "%"REG_a, "%"REG_d
-    );
+    bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24);
 #else
     int i;
     for (i=0; i<width; i++)
@@ -1963,132 +2006,17 @@
 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
 {
 #ifdef HAVE_MMX
-    asm volatile(
-    "mov                        %3, %%"REG_a"   \n\t"
-    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
-    "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
-    "pxor                    %%mm7, %%mm7       \n\t"
-    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
-    "add                 %%"REG_d", %%"REG_d"   \n\t"
-    ASMALIGN(4)
-    "1:                                         \n\t"
-    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-    "movq          (%0, %%"REG_d"), %%mm0       \n\t"
-    "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
-    "movq                    %%mm0, %%mm1       \n\t"
-    "movq                    %%mm2, %%mm3       \n\t"
-    "psrlq                     $24, %%mm0       \n\t"
-    "psrlq                     $24, %%mm2       \n\t"
-    PAVGB(%%mm1, %%mm0)
-    PAVGB(%%mm3, %%mm2)
-    "punpcklbw               %%mm7, %%mm0       \n\t"
-    "punpcklbw               %%mm7, %%mm2       \n\t"
-#else
-    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
-    "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
-    "punpcklbw               %%mm7, %%mm0       \n\t"
-    "punpcklbw               %%mm7, %%mm2       \n\t"
-    "paddw                   %%mm2, %%mm0       \n\t"
-    "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
-    "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
-    "punpcklbw               %%mm7, %%mm4       \n\t"
-    "punpcklbw               %%mm7, %%mm2       \n\t"
-    "paddw                   %%mm4, %%mm2       \n\t"
-    "psrlw                      $1, %%mm0       \n\t"
-    "psrlw                      $1, %%mm2       \n\t"
-#endif
-    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
-    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
-
-    "pmaddwd                 %%mm0, %%mm1       \n\t"
-    "pmaddwd                 %%mm2, %%mm3       \n\t"
-    "pmaddwd                 %%mm6, %%mm0       \n\t"
-    "pmaddwd                 %%mm6, %%mm2       \n\t"
-#ifndef FAST_BGR2YV12
-    "psrad                      $8, %%mm0       \n\t"
-    "psrad                      $8, %%mm1       \n\t"
-    "psrad                      $8, %%mm2       \n\t"
-    "psrad                      $8, %%mm3       \n\t"
-#endif
-    "packssdw                %%mm2, %%mm0       \n\t"
-    "packssdw                %%mm3, %%mm1       \n\t"
-    "pmaddwd                 %%mm5, %%mm0       \n\t"
-    "pmaddwd                 %%mm5, %%mm1       \n\t"
-    "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
-    "psraw                      $7, %%mm0       \n\t"
-
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-    "movq       12(%0, %%"REG_d"), %%mm4       \n\t"
-    "movq       18(%0, %%"REG_d"), %%mm2       \n\t"
-    "movq                   %%mm4, %%mm1       \n\t"
-    "movq                   %%mm2, %%mm3       \n\t"
-    "psrlq                    $24, %%mm4       \n\t"
-    "psrlq                    $24, %%mm2       \n\t"
-    PAVGB(%%mm1, %%mm4)
-    PAVGB(%%mm3, %%mm2)
-    "punpcklbw              %%mm7, %%mm4       \n\t"
-    "punpcklbw              %%mm7, %%mm2       \n\t"
-#else
-    "movd       12(%0, %%"REG_d"), %%mm4       \n\t"
-    "movd       15(%0, %%"REG_d"), %%mm2       \n\t"
-    "punpcklbw              %%mm7, %%mm4       \n\t"
-    "punpcklbw              %%mm7, %%mm2       \n\t"
-    "paddw                  %%mm2, %%mm4       \n\t"
-    "movd       18(%0, %%"REG_d"), %%mm5       \n\t"
-    "movd       21(%0, %%"REG_d"), %%mm2       \n\t"
-    "punpcklbw              %%mm7, %%mm5       \n\t"
-    "punpcklbw              %%mm7, %%mm2       \n\t"
-    "paddw                  %%mm5, %%mm2       \n\t"
-    "movq      "MANGLE(ff_w1111)", %%mm5       \n\t"
-    "psrlw                     $2, %%mm4       \n\t"
-    "psrlw                     $2, %%mm2       \n\t"
-#endif
-    "movq "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
-    "movq "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
-
-    "pmaddwd                %%mm4, %%mm1       \n\t"
-    "pmaddwd                %%mm2, %%mm3       \n\t"
-    "pmaddwd                %%mm6, %%mm4       \n\t"
-    "pmaddwd                %%mm6, %%mm2       \n\t"
-#ifndef FAST_BGR2YV12
-    "psrad                     $8, %%mm4       \n\t"
-    "psrad                     $8, %%mm1       \n\t"
-    "psrad                     $8, %%mm2       \n\t"
-    "psrad                     $8, %%mm3       \n\t"
-#endif
-    "packssdw               %%mm2, %%mm4       \n\t"
-    "packssdw               %%mm3, %%mm1       \n\t"
-    "pmaddwd                %%mm5, %%mm4       \n\t"
-    "pmaddwd                %%mm5, %%mm1       \n\t"
-    "add                      $24, %%"REG_d"   \n\t"
-    "packssdw               %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
-    "psraw                     $7, %%mm4       \n\t"
-
-    "movq                   %%mm0, %%mm1       \n\t"
-    "punpckldq              %%mm4, %%mm0       \n\t"
-    "punpckhdq              %%mm4, %%mm1       \n\t"
-    "packsswb               %%mm1, %%mm0       \n\t"
-    "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0    \n\t"
-
-    "movd                   %%mm0, (%1, %%"REG_a")  \n\t"
-    "punpckhdq              %%mm0, %%mm0            \n\t"
-    "movd                   %%mm0, (%2, %%"REG_a")  \n\t"
-    "add                       $4, %%"REG_a"        \n\t"
-    " js                       1b                   \n\t"
-    : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
-    : "%"REG_a, "%"REG_d
-    );
+    bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_BGR24);
 #else
     int i;
     for (i=0; i<width; i++)
     {
-        int b= src1[6*i + 0] + src1[6*i + 3];
-        int g= src1[6*i + 1] + src1[6*i + 4];
-        int r= src1[6*i + 2] + src1[6*i + 5];
+        int b= src1[3*i + 0];
+        int g= src1[3*i + 1];
+        int r= src1[3*i + 2];
 
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
     }
 #endif /* HAVE_MMX */
     assert(src1 == src2);
@@ -2201,6 +2129,9 @@
 
 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width)
 {
+#ifdef HAVE_MMX
+    bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24);
+#else
     int i;
     for (i=0; i<width; i++)
     {
@@ -2210,21 +2141,26 @@
 
         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
     }
+#endif
 }
 
 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
 {
     int i;
     assert(src1==src2);
+#ifdef HAVE_MMX /* MMX build: shared 24-bit routine, RGB24 selects row [1] of ff_bgr24toUV */
+    bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24);
+#else /* C fallback; NOTE(review): 'i' above is only used in this branch, so it is unused in MMX builds */
     for (i=0; i<width; i++)
     {
-        int r= src1[6*i + 0] + src1[6*i + 3];
-        int g= src1[6*i + 1] + src1[6*i + 4];
-        int b= src1[6*i + 2] + src1[6*i + 5];
+        int r= src1[3*i + 0]; // RGB24 byte order R, G, B; one chroma sample per source pixel (3 bytes)
+        int g= src1[3*i + 1];
+        int b= src1[3*i + 2];
 
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1);
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1);
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; // 257<<(SHIFT-1) is 128.5 in fixed point: bias 128 plus one half
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
     }
+#endif /* HAVE_MMX */
 }
 
 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, long width)