changeset 1024:9cc1031e1864 libavcodec

More AltiVec MC functions, patch by Romain Dolbeau <dolbeau at irisa dot fr>
author michaelni
date Mon, 20 Jan 2003 22:50:14 +0000
parents e61be5796027
children 1f9afd8b9131
files ppc/dsputil_altivec.c ppc/dsputil_altivec.h ppc/dsputil_ppc.c ppc/dsputil_ppc.h
diffstat 4 files changed, 389 insertions(+), 97 deletions(-)
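For reference, the per-pixel operation implemented by the new motion-compensation routines in the diff below, as a scalar sketch (not part of the changeset; the width parameter and the no_rnd flag are illustrative generalizations of the fixed 8- and 16-pixel-wide variants). It averages the four pixels around the half-pel (x+1/2, y+1/2) position; the rounding variants bias by 2 before the shift and the no_rnd variants by 1, matching the 0x02020202UL and 0x01010101UL constants in the reference C code.

#include <stdint.h>

/* scalar sketch, hypothetical helper, not part of the patch */
static void put_pixels_xy2_scalar(uint8_t *block, const uint8_t *pixels,
                                  int line_size, int w, int h, int no_rnd)
{
    const int bias = no_rnd ? 1 : 2;   /* 2 = round to nearest, 1 = no_rnd */
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            block[x] = (pixels[x]             + pixels[x + 1] +
                        pixels[x + line_size] + pixels[x + line_size + 1] +
                        bias) >> 2;
        }
        block  += line_size;
        pixels += line_size;
    }
}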
--- a/ppc/dsputil_altivec.c	Mon Jan 20 22:41:48 2003 +0000
+++ b/ppc/dsputil_altivec.c	Mon Jan 20 22:50:14 2003 +0000
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2002 Brian Foley
  * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -628,86 +629,7 @@
 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 }
 
-extern UINT8 cropTbl[];
-void put_pixels_clamped_altivec(const DCTELEM *block, UINT8 *restrict pixels,
-                                int line_size)
-{
-POWERPC_TBL_DECLARE(altivec_put_pixels_clamped_num, 1);
-#ifdef ALTIVEC_USE_REFERENCE_C_CODE
-    int i;
-    UINT8 *cm = cropTbl + MAX_NEG_CROP;
-
-POWERPC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1);
-    
-    /* read the pixels */
-    for(i=0;i<8;i++) {
-        pixels[0] = cm[block[0]];
-        pixels[1] = cm[block[1]];
-        pixels[2] = cm[block[2]];
-        pixels[3] = cm[block[3]];
-        pixels[4] = cm[block[4]];
-        pixels[5] = cm[block[5]];
-        pixels[6] = cm[block[6]];
-        pixels[7] = cm[block[7]];
-
-        pixels += line_size;
-        block += 8;
-    }
-
-POWERPC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1);
-
-#else /* ALTIVEC_USE_REFERENCE_C_CODE */
-    register const vector short vczero = (const vector short)(0);
-    register vector short
-      blockv0, blockv1, blockv2, blockv3,
-      blockv4, blockv5, blockv6, blockv7;
-    register vector unsigned char
-      pixelsv0, pixelsv1, pixelsv2, pixelsv3, pixelsv4,
-      pixelsv0old, pixelsv4old;
-
-POWERPC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1);
-
-    blockv0 = vec_ld(0, block);
-    blockv1 = vec_ld(16, block);
-    blockv2 = vec_ld(32, block);
-    blockv3 = vec_ld(48, block);
-    blockv4 = vec_ld(64, block);
-    blockv5 = vec_ld(80, block);
-    blockv6 = vec_ld(96, block);
-    blockv7 = vec_ld(112, block);
-    if (((unsigned long)pixels) & 0x0000000F)
-    {
-      pixelsv0old = vec_ld(-8, pixels);
-      pixelsv4old = vec_ld(56, pixels);
-      pixelsv0 = vec_packsu(vczero, blockv0);
-      pixelsv1 = vec_packsu(blockv1, blockv2);
-      pixelsv2 = vec_packsu(blockv3, blockv4);
-      pixelsv3 = vec_packsu(blockv5, blockv6);
-      pixelsv4 = vec_packsu(blockv5, vczero);
-      pixelsv0 = vec_perm(pixelsv0old, pixelsv0, vcprm(0, 1, s2, s3));
-      pixelsv4 = vec_perm(pixelsv4, pixelsv4old, vcprm(0, 1, s2, s3));
-      vec_st(pixelsv0, -8, pixels);
-      vec_st(pixelsv1, 8, pixels);
-      vec_st(pixelsv2, 24, pixels);
-      vec_st(pixelsv3, 40, pixels);
-      vec_st(pixelsv4, 56, pixels);
-    }
-    else
-    {
-      pixelsv0 = vec_packsu(blockv0, blockv1);
-      pixelsv1 = vec_packsu(blockv2, blockv3);
-      pixelsv2 = vec_packsu(blockv4, blockv5);
-      pixelsv3 = vec_packsu(blockv6, blockv7);
-      vec_st(pixelsv0, 0, pixels);
-      vec_st(pixelsv1, 16, pixels);
-      vec_st(pixelsv2, 32, pixels);
-      vec_st(pixelsv3, 48, pixels);
-    }
-
-POWERPC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1);
-#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
-}
-
+/* next one assumes that ((line_size % 16) == 0) */
 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
 POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1);
@@ -729,6 +651,7 @@
 
 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
     register vector unsigned char pixelsv1, pixelsv2;
+    register vector unsigned char perm = vec_lvsl(0, pixels);
     int i;
 
 POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
@@ -736,7 +659,7 @@
     for(i=0; i<h; i++) {
       pixelsv1 = vec_ld(0, (unsigned char*)pixels);
       pixelsv2 = vec_ld(16, (unsigned char*)pixels);
-      vec_st(vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)),
+      vec_st(vec_perm(pixelsv1, pixelsv2, perm),
              0, (unsigned char*)block);
       pixels+=line_size;
       block +=line_size;
@@ -747,6 +670,7 @@
 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 }
 
+/* next one assumes that ((line_size % 16) == 0) */
 #define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
@@ -769,6 +693,7 @@
 
 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
     register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
+    register vector unsigned char perm = vec_lvsl(0, pixels);
     int i;
 
 POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
@@ -777,7 +702,7 @@
       pixelsv1 = vec_ld(0, (unsigned char*)pixels);
       pixelsv2 = vec_ld(16, (unsigned char*)pixels);
       blockv = vec_ld(0, block);
-      pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
+      pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
       blockv = vec_avg(blockv,pixelsv);
       vec_st(blockv, 0, (unsigned char*)block);
       pixels+=line_size;
@@ -789,8 +714,8 @@
 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 }
 
-void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels,
-                         int line_size, int h)
+/* next one assumes that ((line_size % 8) == 0) */
+void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 {
 POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1);
 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
@@ -855,6 +780,7 @@
 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 }
 
+/* next one assumes that ((line_size % 8) == 0) */
 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
 POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1);
@@ -969,6 +895,363 @@
 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 }
 
+/* next one assumes that ((line_size % 8) == 0) */
+void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
+    int j;
+POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
+    for (j = 0; j < 2; j++) {
+      int i;
+      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
+      const uint32_t b =
+        (((const struct unaligned_32 *) (pixels + 1))->l);
+      uint32_t l0 =
+        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
+      uint32_t h0 =
+        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
+      uint32_t l1, h1;
+      pixels += line_size;
+      for (i = 0; i < h; i += 2) {
+        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
+        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
+        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
+        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) =
+          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+        pixels += line_size;
+        block += line_size;
+        a = (((const struct unaligned_32 *) (pixels))->l);
+        b = (((const struct unaligned_32 *) (pixels + 1))->l);
+        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
+        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) =
+          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+        pixels += line_size;
+        block += line_size;
+      } pixels += 4 - line_size * (h + 1);
+      block += 4 - line_size * h;
+    }
+    
+POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
+
+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
+   register int i;
+   register vector unsigned char
+     pixelsv1, pixelsv2,
+     pixelsavg;
+   register vector unsigned char
+     blockv, temp1, temp2;
+   register vector unsigned short
+     pixelssum1, pixelssum2, temp3;
+   register const vector unsigned char vczero = (const vector unsigned char)(0);
+   register const vector unsigned short vcone = (const vector unsigned short)(1);
+   register const vector unsigned short vctwo = (const vector unsigned short)(2);
+   
+   temp1 = vec_ld(0, pixels);
+   temp2 = vec_ld(16, pixels);
+   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
+   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
+   {
+     pixelsv2 = temp2;
+   }
+   else
+   {
+     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
+   }
+   pixelsv1 = vec_mergeh(vczero, pixelsv1);
+   pixelsv2 = vec_mergeh(vczero, pixelsv2);
+   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                        (vector unsigned short)pixelsv2);
+   pixelssum1 = vec_add(pixelssum1, vcone);
+   
+POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); 
+   for (i = 0; i < h ; i++) {
+     int rightside = ((unsigned long)block & 0x0000000F);
+     blockv = vec_ld(0, block);
+
+     temp1 = vec_ld(line_size, pixels);
+     temp2 = vec_ld(line_size + 16, pixels);
+     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
+     {
+       pixelsv2 = temp2;
+     }
+     else
+     {
+       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+     }
+
+     pixelsv1 = vec_mergeh(vczero, pixelsv1);
+     pixelsv2 = vec_mergeh(vczero, pixelsv2);
+     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                          (vector unsigned short)pixelsv2);
+     temp3 = vec_add(pixelssum1, pixelssum2);
+     temp3 = vec_sra(temp3, vctwo);
+     pixelssum1 = vec_add(pixelssum2, vcone);
+     pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
+     
+     if (rightside)
+     {
+       blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
+     }
+     else
+     {
+       blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
+     }
+     
+     vec_st(blockv, 0, block);
+     
+     block += line_size;
+     pixels += line_size;
+   }
+   
+POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
+}
+
+/* next one assumes that ((line_size % 16) == 0) */
+void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
+{
+POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1);
+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
+    int j;
+POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);
+      for (j = 0; j < 4; j++) {
+      int i;
+      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
+      const uint32_t b =
+        (((const struct unaligned_32 *) (pixels + 1))->l);
+      uint32_t l0 =
+        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
+      uint32_t h0 =
+        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
+      uint32_t l1, h1;
+      pixels += line_size;
+      for (i = 0; i < h; i += 2) {
+        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
+        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
+        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
+        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) =
+          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+        pixels += line_size;
+        block += line_size;
+        a = (((const struct unaligned_32 *) (pixels))->l);
+        b = (((const struct unaligned_32 *) (pixels + 1))->l);
+        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
+        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) =
+          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+        pixels += line_size;
+        block += line_size;
+      } pixels += 4 - line_size * (h + 1);
+      block += 4 - line_size * h;
+    }
+
+POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
+
+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
+   register int i;
+   register vector unsigned char
+     pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+   register vector unsigned char
+     blockv, temp1, temp2;
+   register vector unsigned short
+     pixelssum1, pixelssum2, temp3,
+     pixelssum3, pixelssum4, temp4;
+   register const vector unsigned char vczero = (const vector unsigned char)(0);
+   register const vector unsigned short vctwo = (const vector unsigned short)(2);
+   
+   temp1 = vec_ld(0, pixels);
+   temp2 = vec_ld(16, pixels);
+   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
+   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
+   {
+     pixelsv2 = temp2;
+   }
+   else
+   {
+     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
+   }
+   pixelsv3 = vec_mergel(vczero, pixelsv1);
+   pixelsv4 = vec_mergel(vczero, pixelsv2);
+   pixelsv1 = vec_mergeh(vczero, pixelsv1);
+   pixelsv2 = vec_mergeh(vczero, pixelsv2);
+   pixelssum3 = vec_add((vector unsigned short)pixelsv3,
+                        (vector unsigned short)pixelsv4);
+   pixelssum3 = vec_add(pixelssum3, vctwo);
+   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                        (vector unsigned short)pixelsv2);
+   pixelssum1 = vec_add(pixelssum1, vctwo);
+   
+POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); 
+   for (i = 0; i < h ; i++) {
+     blockv = vec_ld(0, block);
+
+     temp1 = vec_ld(line_size, pixels);
+     temp2 = vec_ld(line_size + 16, pixels);
+     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
+     {
+       pixelsv2 = temp2;
+     }
+     else
+     {
+       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+     }
+
+     pixelsv3 = vec_mergel(vczero, pixelsv1);
+     pixelsv4 = vec_mergel(vczero, pixelsv2);
+     pixelsv1 = vec_mergeh(vczero, pixelsv1);
+     pixelsv2 = vec_mergeh(vczero, pixelsv2);
+     
+     pixelssum4 = vec_add((vector unsigned short)pixelsv3,
+                          (vector unsigned short)pixelsv4);
+     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                          (vector unsigned short)pixelsv2);
+     temp4 = vec_add(pixelssum3, pixelssum4);
+     temp4 = vec_sra(temp4, vctwo);
+     temp3 = vec_add(pixelssum1, pixelssum2);
+     temp3 = vec_sra(temp3, vctwo);
+
+     pixelssum3 = vec_add(pixelssum4, vctwo);
+     pixelssum1 = vec_add(pixelssum2, vctwo);
+
+     blockv = vec_packsu(temp3, temp4);
+     
+     vec_st(blockv, 0, block);
+     
+     block += line_size;
+     pixels += line_size;
+   }
+   
+POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
+}
+
+/* next one assumes that ((line_size % 16) == 0) */
+void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
+{
+POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
+    int j;
+POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
+      for (j = 0; j < 4; j++) {
+      int i;
+      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
+      const uint32_t b =
+        (((const struct unaligned_32 *) (pixels + 1))->l);
+      uint32_t l0 =
+        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
+      uint32_t h0 =
+        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
+      uint32_t l1, h1;
+      pixels += line_size;
+      for (i = 0; i < h; i += 2) {
+        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
+        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
+        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
+        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) =
+          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+        pixels += line_size;
+        block += line_size;
+        a = (((const struct unaligned_32 *) (pixels))->l);
+        b = (((const struct unaligned_32 *) (pixels + 1))->l);
+        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
+        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) =
+          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+        pixels += line_size;
+        block += line_size;
+      } pixels += 4 - line_size * (h + 1);
+      block += 4 - line_size * h;
+    }
+
+POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
+
+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
+   register int i;
+   register vector unsigned char
+     pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+   register vector unsigned char
+     blockv, temp1, temp2;
+   register vector unsigned short
+     pixelssum1, pixelssum2, temp3,
+     pixelssum3, pixelssum4, temp4;
+   register const vector unsigned char vczero = (const vector unsigned char)(0);
+   register const vector unsigned short vcone = (const vector unsigned short)(1);
+   register const vector unsigned short vctwo = (const vector unsigned short)(2);
+   
+   temp1 = vec_ld(0, pixels);
+   temp2 = vec_ld(16, pixels);
+   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
+   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
+   {
+     pixelsv2 = temp2;
+   }
+   else
+   {
+     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
+   }
+   pixelsv3 = vec_mergel(vczero, pixelsv1);
+   pixelsv4 = vec_mergel(vczero, pixelsv2);
+   pixelsv1 = vec_mergeh(vczero, pixelsv1);
+   pixelsv2 = vec_mergeh(vczero, pixelsv2);
+   pixelssum3 = vec_add((vector unsigned short)pixelsv3,
+                        (vector unsigned short)pixelsv4);
+   pixelssum3 = vec_add(pixelssum3, vcone);
+   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                        (vector unsigned short)pixelsv2);
+   pixelssum1 = vec_add(pixelssum1, vcone);
+   
+POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); 
+   for (i = 0; i < h ; i++) {
+     blockv = vec_ld(0, block);
+
+     temp1 = vec_ld(line_size, pixels);
+     temp2 = vec_ld(line_size + 16, pixels);
+     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
+     {
+       pixelsv2 = temp2;
+     }
+     else
+     {
+       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+     }
+
+     pixelsv3 = vec_mergel(vczero, pixelsv1);
+     pixelsv4 = vec_mergel(vczero, pixelsv2);
+     pixelsv1 = vec_mergeh(vczero, pixelsv1);
+     pixelsv2 = vec_mergeh(vczero, pixelsv2);
+     
+     pixelssum4 = vec_add((vector unsigned short)pixelsv3,
+                          (vector unsigned short)pixelsv4);
+     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                          (vector unsigned short)pixelsv2);
+     temp4 = vec_add(pixelssum3, pixelssum4);
+     temp4 = vec_sra(temp4, vctwo);
+     temp3 = vec_add(pixelssum1, pixelssum2);
+     temp3 = vec_sra(temp3, vctwo);
+
+     pixelssum3 = vec_add(pixelssum4, vcone);
+     pixelssum1 = vec_add(pixelssum2, vcone);
+
+     blockv = vec_packsu(temp3, temp4);
+     
+     vec_st(blockv, 0, block);
+     
+     block += line_size;
+     pixels += line_size;
+   }
+   
+POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
+}
+
 int has_altivec(void)
 {
 #if CONFIG_DARWIN
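
A hedged sketch of the realignment idiom the dsputil_altivec.c code above relies on (the helper name is hypothetical, not part of the patch): load the two quadwords covering 16 possibly unaligned source bytes and merge them with a vec_lvsl() permute mask. Since the mask depends only on the low four address bits, it is loop-invariant whenever (line_size % 16) == 0, which is exactly the assumption documented on the new functions and the reason this changeset hoists vec_lvsl() out of the loops of put_pixels16_altivec and avg_pixels16_altivec.

#include <altivec.h>
#include <stdint.h>

/* illustrative only: mirrors the body of put_pixels16_altivec */
static void copy16_rows_sketch(uint8_t *dst,           /* 16-byte aligned     */
                               const uint8_t *src,     /* may be unaligned    */
                               int line_size, int h)   /* line_size % 16 == 0 */
{
    register vector unsigned char perm = vec_lvsl(0, src);   /* hoisted mask */
    int i;

    for (i = 0; i < h; i++) {
        vector unsigned char v1 = vec_ld(0,  (unsigned char*)src);
        vector unsigned char v2 = vec_ld(16, (unsigned char*)src);
        vec_st(vec_perm(v1, v2, perm), 0, dst);   /* realigned 16-byte store */
        src += line_size;
        dst += line_size;
    }
}
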
--- a/ppc/dsputil_altivec.h	Mon Jan 20 22:41:48 2003 +0000
+++ b/ppc/dsputil_altivec.h	Mon Jan 20 22:50:14 2003 +0000
@@ -44,6 +44,9 @@
 extern void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
 extern void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
 extern void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+extern void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+extern void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
+extern void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
 
 extern void gmc1_altivec(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder);
 
--- a/ppc/dsputil_ppc.c	Mon Jan 20 22:41:48 2003 +0000
+++ b/ppc/dsputil_ppc.c	Mon Jan 20 22:50:14 2003 +0000
@@ -40,18 +40,20 @@
 
 #ifdef POWERPC_TBL_PERFORMANCE_REPORT
 unsigned long long perfdata[powerpc_perf_total][powerpc_data_total];
-/* list below must match enum in dsputil_altivec.h */
+/* list below must match enum in dsputil_ppc.h */
 static unsigned char* perfname[] = {
   "fft_calc_altivec",
   "gmc1_altivec",
   "dct_unquantize_h263_altivec",
   "idct_add_altivec",
   "idct_put_altivec",
-  "put_pixels_clamped_altivec",
   "put_pixels16_altivec",
   "avg_pixels16_altivec",
   "avg_pixels8_altivec",
   "put_pixels8_xy2_altivec",
+  "put_no_rnd_pixels8_xy2_altivec",
+  "put_pixels16_xy2_altivec",
+  "put_no_rnd_pixels16_xy2_altivec",
   "clear_blocks_dcbz32_ppc"
 };
 #ifdef POWERPC_PERF_USE_PMC
@@ -65,9 +67,9 @@
 {
   int i;
 #ifndef POWERPC_PERF_USE_PMC
-  fprintf(stderr, "AltiVec performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n");
+  fprintf(stderr, "PowerPC performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n");
 #else /* POWERPC_PERF_USE_PMC */
-  fprintf(stderr, "AltiVec performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
+  fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
 #endif /* POWERPC_PERF_USE_PMC */
   for(i = 0 ; i < powerpc_perf_total ; i++)
   {
@@ -199,21 +201,23 @@
         c->pix_sum = pix_sum_altivec;
         c->diff_pixels = diff_pixels_altivec;
         c->get_pixels = get_pixels_altivec;
-// next two disabled as they're untested.
+// next one disabled as it's untested.
 #if 0
         c->add_bytes= add_bytes_altivec;
-        c->put_pixels_clamped = put_pixels_clamped_altivec;
-#endif
+#endif /* 0 */
         c->put_pixels_tab[0][0] = put_pixels16_altivec;
         c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
 // next one disabled as it's untested.
 #if 0
         c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
-#endif
+#endif /* 0 */
         c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
+        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
+        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
+        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
         
 	c->gmc1 = gmc1_altivec;
-
+        
 #ifdef POWERPC_TBL_PERFORMANCE_REPORT
         {
           int i;
@@ -228,12 +232,12 @@
             perfdata_miss[i][powerpc_data_max] = 0x0000000000000000;
             perfdata_miss[i][powerpc_data_sum] = 0x0000000000000000;
             perfdata_miss[i][powerpc_data_num] = 0x0000000000000000;
-#endif
+#endif /* POWERPC_PERF_USE_PMC */
           }
         }
-#endif
+#endif /* POWERPC_TBL_PERFORMANCE_REPORT */
     } else
-#endif
+#endif /* HAVE_ALTIVEC */
     {
         // Non-AltiVec PPC optimisations
 
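For context, a hedged sketch of how the registrations above are consumed; the dxy indexing convention and all variable names here are assumptions drawn from libavcodec's dsputil and are not part of this patch. The first index selects the block size ([0] is 16x16, [1] is 8x8) and entry 3 is the half-pel-in-both-directions case, so the three new assignments route xy2 motion compensation to the AltiVec routines on AltiVec-capable CPUs.

    /* hypothetical caller, illustration only */
    int dxy = ((my & 1) << 1) | (mx & 1);   /* 3 means half-pel in x and y */
    void (*op)(uint8_t *, const uint8_t *, int, int) =
        rounding ? c->put_pixels_tab[is_8x8][dxy]
                 : c->put_no_rnd_pixels_tab[is_8x8][dxy];
    op(dst, src + (my >> 1) * line_size + (mx >> 1), line_size, is_8x8 ? 8 : 16);
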
--- a/ppc/dsputil_ppc.h	Mon Jan 20 22:41:48 2003 +0000
+++ b/ppc/dsputil_ppc.h	Mon Jan 20 22:50:14 2003 +0000
@@ -29,11 +29,13 @@
   altivec_dct_unquantize_h263_num,
   altivec_idct_add_num,
   altivec_idct_put_num,
-  altivec_put_pixels_clamped_num,
   altivec_put_pixels16_num,
   altivec_avg_pixels16_num,
   altivec_avg_pixels8_num,
   altivec_put_pixels8_xy2_num,
+  altivec_put_no_rnd_pixels8_xy2_num,
+  altivec_put_pixels16_xy2_num,
+  altivec_put_no_rnd_pixels16_xy2_num,
   powerpc_clear_blocks_dcbz32,
   powerpc_perf_total
 };
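
The enum above supplies the indices for the perfname[] table in dsputil_ppc.c, which is why the changeset updates both in the same order. A hedged, self-contained illustration of that pairing, reduced to the entries touched here (the example_* names are hypothetical):

enum example_perf {
    example_put_pixels8_xy2_num,          /* existing entry        */
    example_put_no_rnd_pixels8_xy2_num,   /* new in this changeset */
    example_put_pixels16_xy2_num,         /* new in this changeset */
    example_put_no_rnd_pixels16_xy2_num,  /* new in this changeset */
    example_perf_total
};

static const char *example_perfname[example_perf_total] = {
    "put_pixels8_xy2_altivec",            /* must stay index-aligned with */
    "put_no_rnd_pixels8_xy2_altivec",     /* the enum: the report loop    */
    "put_pixels16_xy2_altivec",           /* prints perfname[i] for every */
    "put_no_rnd_pixels16_xy2_altivec",    /* i below the _total sentinel  */
};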