changeset 89431:c1527f26d513

Make it work for the map file included in glibc.
author Kenichi Handa <handa@m17n.org>
date Tue, 20 May 2003 13:07:55 +0000
parents 28939d6dacea
children d1fc6a0e4772
files etc/charsets/gb18030-2.awk etc/charsets/gb18030-4.awk
diffstat 2 files changed, 84 insertions(+), 101 deletions(-) [+]
line wrap: on
line diff
--- a/etc/charsets/gb18030-2.awk	Tue May 20 13:06:09 2003 +0000
+++ b/etc/charsets/gb18030-2.awk	Tue May 20 13:07:55 2003 +0000
@@ -11,6 +11,10 @@
   tohex["d"] = 13;
   tohex["e"] = 14;
   tohex["f"] = 15;
+  from_gb = 0;
+  to_gb = -1;
+  to_unicode = 0;
+  from_unicode = 0;
 }
 
 function decode_hex(str) {
@@ -30,56 +34,47 @@
 function gb_to_index(gb) {
   b0 = int(gb / 256);
   b1 = gb % 256;
-  idx = (((b0 - 129)) * 190 + b1 - 64); 
-  if (b1 >= 128)
-    idx--;
+  idx = (((b0 - 129)) * 191 + b1 - 64); 
+#  if (b1 >= 128)
+#    idx--;
   return idx
 }
 
 function index_to_gb(idx) {
-  b0 = int(idx / 190) + 129;
-  b1 = (idx % 190) + 64;
-  if (b1 >= 127)
-    b1++;
+  b0 = int(idx / 191) + 129;
+  b1 = (idx % 191) + 64;
+#  if (b1 >= 127)
+#    b1++;
   return (b0 * 256 + b1);
 }
-function decode_gb(str) {
-  b0 = decode_hex(substr(str, 3, 2));
-  b1 = decode_hex(substr(str, 7, 2));
-  return (b0 * 256 + b1)
+
+/^\#/ {
+  print;
+  next;
 }
 
-/^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]>/ {
-  if ($2 ~ /^\\x[0-9A-F][0-9A-F]\\x[0-9A-F][0-9A-F]$/)
+{
+  gb = gb_to_index(decode_hex(substr($1, 3, 4)));
+  unicode = decode_hex(substr($2, 3, 4));
+  if ((gb == to_gb + 1) && (unicode == to_unicode + 1))
     {
-      unicode = decode_hex(substr($1, 3, 4));
-      gb = decode_gb($2);
-      idx = gb_to_index(gb);
-      gb_table[idx] = unicode;
+      to_gb++;
+      to_unicode++;
+    }
+  else
+    {
+      if (from_gb == to_gb)
+	printf "0x%04X 0x%04X\n", index_to_gb(from_gb), from_unicode;
+      else if (from_gb < to_gb)
+	printf "0x%04X-0x%04X 0x%04X\n",
+	  index_to_gb(from_gb), index_to_gb(to_gb), from_unicode;
+      from_gb = to_gb = gb;
+      from_unicode = to_unicode = unicode;
     }
 }
 
 END {
-  last_idx = gb_to_index(decode_hex("FEFE"));
-  from_idx = 0;
-  from_unicode = gb_table[0];
-  for (i = 1; i <= last_idx; i++)
-    {
-      gb = index_to_gb(i);
-      unicode = gb_table[i];
-      if (i - from_idx != unicode - from_unicode)
-	{
-	  if (i - 1 == from_idx)
-	    printf ("0x%04X 0x%04X\n",
-		    index_to_gb(from_idx), from_unicode);
-	  else
-	    printf ("0x%04X-0x%04X 0x%04X\n",
-		    index_to_gb(from_idx), index_to_gb(i - 1), from_unicode);
-	  from_idx = i;
-	  from_unicode=unicode;
-	}
-    }
-  if (i - from_idx != unicode - from_unicode)
-    printf ("0x%04X-0x%04X 0x%04X\n",
-	    index_to_gb(from_idx), index_to_gb(i - 1), from_unicode);
+  if (from_gb <= to_gb)
+    printf "0x%04X-0x%04X 0x%04X\n",
+      index_to_gb(from_gb), index_to_gb(to_gb), from_unicode;
 }
--- a/etc/charsets/gb18030-4.awk	Tue May 20 13:06:09 2003 +0000
+++ b/etc/charsets/gb18030-4.awk	Tue May 20 13:07:55 2003 +0000
@@ -27,88 +27,76 @@
   return n;
 }
 
-function gb_to_index(b0,b1,b2,b3) {
-  return ((((b0 - 129) * 10 + (b1 - 48)) * 126 + (b2 - 129)) * 10 + b3 - 48);
+function gb_to_index(gb) {
+  b0 = int(gb / 256);
+  b1 = gb % 256;
+  idx = (((b0 - 129)) * 191 + b1 - 64); 
+#  if (b1 >= 127)
+#    idx--;
+  return idx
 }
 
 function index_to_gb(idx) {
   b3 = (idx % 10) + 48;
-  idx /= 10;
+  idx = int(idx / 10);
   b2 = (idx % 126) + 129;
-  idx /= 126;
+  idx = int(idx / 126);
   b1 = (idx % 10) + 48;
-  b0 = (idx / 10) + 129;
+  b0 = int(idx / 10) + 129;
   return sprintf("%02X%02X%02X%02X", b0, b1, b2, b3);
 }
 
-function decode_gb(str) {
-  b0 = decode_hex(substr(str, 3, 2));
-  b1 = decode_hex(substr(str, 7, 2));
-  b2 = decode_hex(substr(str, 11, 2));
-  b3 = decode_hex(substr(str, 15, 2));
-  return gb_to_index(b0, b1, b2, b3);
+/^\#/ {
+  print;
+  next;
 }
 
-function printline(from, to) {
-  fromgb = index_to_gb(from);
-  fromuni = gbtable[from];
-  if (from == to)
-    printf ("0x%s		0x%04X\n", fromgb, fromuni);
-  else
-    printf ("0x%s-0x%s	0x%04X\n", fromgb, index_to_gb(to), fromuni);
+/0x....-0x..../ {
+  gb_from = gb_to_index(decode_hex(substr($1, 3, 4)));
+  gb_to = gb_to_index(decode_hex(substr($1, 10, 4)));
+  unicode = decode_hex(substr($2, 3, 4));
+  while (gb_from <= gb_to)
+    {
+      table[unicode++] = 1;
+      gb_from++;
+    }
+  next;
 }
 
-/^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]>/ {
-  unicode = decode_hex(substr($1, 3, 4));
-  if ($2 ~ /\\x8[1-4]\\x3[0-9]\\x[8-9A-F][0-9A-F]\\x3[0-9]/)
-    unitable[unicode] = decode_gb($2);
-  else
-    unitable[unicode] = -1;
+{
+  gb = decode_hex(substr($1, 3, 4));
+  unicode = decode_hex(substr($2, 3, 4));
+  table[unicode] = 1;
 }
 
 END {
-  lastgb = 0;
-  surrogate_min = decode_hex("D800");
-  surrogate_max = decode_hex("DFFF");
-  lastgb = unitable[128];
-  gbtable[lastgb] = 128;
-  for (i = 129; i < 65536; i++)
+  from_gb = -1;
+  to_gb = 0;
+  from_i = 0;
+  table[65536] = 1;
+  for (i = 128; i <= 65536; i++)
     {
-      if (unitable[i] == 0 && (i < surrogate_min || i > surrogate_max))
+      if (table[i] == 0)
 	{
-	  lastgb++;
-	  gbtable[lastgb] = i;
-	  unitable[i] = lastgb;
+	  if (i < 55296 || i >= 57344)
+	    {
+	      if (from_gb < 0)
+		{
+		  from_gb = to_gb;
+		  from_i = i;
+		}
+	      to_gb++;
+	    }
 	}
-      else if (unitable[i] > 0)
+      else if (from_gb >= 0)
 	{
-	  lastgb = unitable[i];
-	  gbtable[lastgb] = i;
+	  if (from_gb + 1 == to_gb)
+	    printf "0x%s\t\t0x%04X\n",
+	      index_to_gb(from_gb), from_i;
+	  else
+	    printf "0x%s-0x%s\t0x%04X\n",
+	      index_to_gb(from_gb), index_to_gb(to_gb - 1), from_i;
+	  from_gb = -1;
 	}
     }
-
-  fromgb = lastgb = unitable[128];
-  for (i = 129; i < 65536; i++)
-    {
-      if (unitable[i] > 0)
-	{
-	  if (lastgb + 1 == unitable[i])
-	    {
-	      lastgb++;
-	    }
-	  else
-	    {
-	      if (lastgb >= 0)
-		printline(fromgb, lastgb);
-	      fromgb = lastgb = unitable[i];
-	    }
-	}
-      else			# i.e. (unitable[i] < 0)
-	{
-	  if (lastgb >= 0)
-	    printline(fromgb, lastgb);
-	  lastgb = -1;
-	}
-    }
-  printline(fromgb, unitable[65535]);
 }