Mercurial > libavcodec.hg

--- a/i386/snowdsp_mmx.c	Tue Aug 21 15:48:08 2007 +0000
+++ b/i386/snowdsp_mmx.c	Tue Aug 21 16:29:40 2007 +0000
@@ -111,8 +111,7 @@

         i = 0;
         asm volatile(
-            "pcmpeqd    %%xmm7, %%xmm7        \n\t"
-            "psrad         $29, %%xmm7        \n\t"
+            "pslld          $1, %%xmm7        \n\t"
         ::);
         for(; i<w_l-7; i+=8){
             asm volatile(
@@ -157,25 +156,21 @@
                 "movdqu 20(%1), %%xmm6        \n\t"
                 "paddd    (%1), %%xmm2        \n\t"
                 "paddd  16(%1), %%xmm6        \n\t"
-                "movdqa %%xmm2, %%xmm0        \n\t"
-                "movdqa %%xmm6, %%xmm4        \n\t"
-                "pslld      $2, %%xmm2        \n\t"
-                "pslld      $2, %%xmm6        \n\t"
-                "psubd  %%xmm2, %%xmm0        \n\t"
-                "psubd  %%xmm6, %%xmm4        \n\t"
-                "psrad      $1, %%xmm0        \n\t"
-                "psrad      $1, %%xmm4        \n\t"
-                "movdqu   (%0), %%xmm2        \n\t"
-                "movdqu 16(%0), %%xmm6        \n\t"
-                "psubd  %%xmm0, %%xmm2        \n\t"
-                "psubd  %%xmm4, %%xmm6        \n\t"
+                "movdqu   (%0), %%xmm0        \n\t"
+                "movdqu 16(%0), %%xmm4        \n\t"
+                "paddd  %%xmm2, %%xmm0        \n\t"
+                "paddd  %%xmm6, %%xmm4        \n\t"
+                "psrad      $1, %%xmm2        \n\t"
+                "psrad      $1, %%xmm6        \n\t"
+                "paddd  %%xmm0, %%xmm2        \n\t"
+                "paddd  %%xmm4, %%xmm6        \n\t"
                 "movdqa %%xmm2, (%2)          \n\t"
                 "movdqa %%xmm6, 16(%2)        \n\t"
                 :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                  : "memory"
                );
         }
-        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
     }

     {
@@ -291,10 +286,9 @@
         DWTELEM * const ref = b+w2 - 1;

         i = 1;
-        b[0] = b[0] + (((2 * ref[1] + W_BO-1) + 4 * b[0]) >> W_BS);
+        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
         asm volatile(
-            "pcmpeqd     %%mm7, %%mm7        \n\t"
-            "psrld         $29, %%mm7        \n\t"
+            "pslld          $1, %%mm7        \n\t"
            ::);
         for(; i<w_l-3; i+=4){
             asm volatile(
@@ -333,16 +327,12 @@
                 "movq   12(%1), %%mm6        \n\t"
                 "paddd    (%1), %%mm2        \n\t"
                 "paddd   8(%1), %%mm6        \n\t"
-                "pxor    %%mm0, %%mm0        \n\t" //note: the 2 xor could be avoided if we would flip the rounding direction
-                "pxor    %%mm4, %%mm4        \n\t"
-                "psubd   %%mm2, %%mm0        \n\t"
-                "psubd   %%mm6, %%mm4        \n\t"
-                "psrad      $1, %%mm0        \n\t"
-                "psrad      $1, %%mm4        \n\t"
-                "psubd   %%mm0, %%mm2        \n\t"
-                "psubd   %%mm4, %%mm6        \n\t"
                 "movq     (%0), %%mm0        \n\t"
                 "movq    8(%0), %%mm4        \n\t"
+                "paddd   %%mm2, %%mm0        \n\t"
+                "paddd   %%mm6, %%mm4        \n\t"
+                "psrad      $1, %%mm2        \n\t"
+                "psrad      $1, %%mm6        \n\t"
                 "paddd   %%mm0, %%mm2        \n\t"
                 "paddd   %%mm4, %%mm6        \n\t"
                 "movq    %%mm2, (%2)         \n\t"
@@ -351,7 +341,7 @@
                  : "memory"
                );
         }
-        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
     }

     {
--- a/snow.c	Tue Aug 21 15:48:08 2007 +0000
+++ b/snow.c	Tue Aug 21 16:29:40 2007 +0000
@@ -775,7 +775,7 @@
     int i;

     assert(shift == 4);
-#define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
+#define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): -((-16*4*(src) + 4*(ref) + add + 5 + (5<<27))/(5*16) - (1<<23)))
     if(mirror_left){
         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
         dst += dst_step;
@@ -1113,8 +1113,8 @@
     DWTELEM temp[width];
     const int w2= (width+1)>>1;

-    lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
-    liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
+    lift (temp+w2, b    +1, b      , 1, 2, 2, width,  W_AM, W_AO, W_AS, 1, 1);
+    liftS(temp   , b      , temp+w2, 1, 2, 1, width,  W_BM, W_BO, W_BS, 0, 0);
     lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
     lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
 }
@@ -1150,7 +1150,7 @@
 #ifdef liftS
         b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
 #else
-        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
+        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + W_BO*5 + (5<<27)) / (5*16) - (1<<23);
 #endif
     }
 }
@@ -1344,8 +1344,8 @@

     lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
     lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
-    liftS(b      , temp   , temp+w2, 2, 1, 1, width,  W_BM, W_BO-1, W_BS, 0, 1);
-    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
+    liftS(b      , temp   , temp+w2, 2, 1, 1, width,  W_BM, W_BO, W_BS, 0, 1);
+    lift (b+1    , temp+w2, b      , 2, 1, 2, width,  W_AM, W_AO, W_AS, 1, 0);
 }

 static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
--- a/snow.h	Tue Aug 21 15:48:08 2007 +0000
+++ b/snow.h	Tue Aug 21 16:29:40 2007 +0000
@@ -165,11 +165,11 @@

 static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w){
         for(; i<w; i++){
-            dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO-1 + 4 * src[i]) >> W_BS);
+            dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO + 4 * src[i]) >> W_BS);
         }

         if(width&1){
-            dst[w] = src[w] + ((2 * ref[w] + W_BO-1 + 4 * src[w]) >> W_BS);
+            dst[w] = src[w] + ((2 * ref[w] + W_BO + 4 * src[w]) >> W_BS);
         }
 }