Mercurial > emacs
annotate etc/charsets/gb18030.awk @ 89184:88a9e962e183
(decode_coding_utf_8): Treat surrogates as invalid.
author | Dave Love <fx@gnu.org> |
---|---|
date | Wed, 09 Oct 2002 22:00:36 +0000 |
parents | ddb17e4c813c |
children |
rev | line source |
---|---|
# Populate tohex[], mapping each hexadecimal letter digit (either case)
# to its numeric value.  Decimal digits are handled arithmetically by
# decode_hex and therefore have no entry here.
BEGIN {
  tohex["A"] = tohex["a"] = 10;
  tohex["B"] = tohex["b"] = 11;
  tohex["C"] = tohex["c"] = 12;
  tohex["D"] = tohex["d"] = 13;
  tohex["E"] = tohex["e"] = 14;
  tohex["F"] = tohex["f"] = 15;
}
15 | |
# Return the numeric value of the hexadecimal digit string STR.
#
# Characters "0".."9" contribute their decimal value; any other
# character contributes tohex[c] (an unset entry counts as 0).
#
# n, len, i and c are declared as extra parameters so that they are
# local: AWK variables are global by default, and `i' in particular is
# also the loop index of the END block.
function decode_hex(str,    n, len, i, c) {
  n = 0;
  len = length(str);
  for (i = 1; i <= len; i++)
    {
      c = substr(str, i, 1);
      if (c >= "0" && c <= "9")
        n = n * 16 + (c - "0");
      else
        n = n * 16 + tohex[c];
    }
  return n;
}
29 | |
# Convert the four byte values B0..B3 of a four-byte code into a single
# linear index.  The bytes act as mixed-radix digits, offset by 129,
# 48, 129 and 48 respectively, with radices 10 (B1), 126 (B2) and
# 10 (B3); hence the place values 12600, 1260, 10 and 1.
function gb_to_index(b0,b1,b2,b3) {
  return (b0 - 129) * 12600 + (b1 - 48) * 1260 + (b2 - 129) * 10 + (b3 - 48);
}
33 | |
# Convert the linear index IDX back into its four byte values and
# return them as eight upper-case hexadecimal digits (inverse of
# gb_to_index).
#
# AWK division is floating point, so each quotient is truncated with
# int() before the next digit is extracted instead of relying on the
# %X conversion to discard the fraction.  b0..b3 are declared as extra
# parameters to keep them local; without that they would overwrite
# global variables of the same names.
function index_to_gb(idx,    b0, b1, b2, b3) {
  b3 = (idx % 10) + 48;
  idx = int(idx / 10);
  b2 = (idx % 126) + 129;
  idx = int(idx / 126);
  b1 = (idx % 10) + 48;
  b0 = int(idx / 10) + 129;
  return sprintf("%02X%02X%02X%02X", b0, b1, b2, b3);
}
43 | |
# Parse STR, a four-byte code written as four 4-character groups whose
# last two characters are hex digits (the main rule only passes fields
# of the form \xHH\xHH\xHH\xHH), and return its linear index.
#
# The hex pairs sit at character offsets 3, 7, 11 and 15.  The decoded
# bytes are passed straight to gb_to_index so that no b0..b3 globals
# are created or overwritten.
function decode_gb(str) {
  return gb_to_index(decode_hex(substr(str, 3, 2)),
                     decode_hex(substr(str, 7, 2)),
                     decode_hex(substr(str, 11, 2)),
                     decode_hex(substr(str, 15, 2)));
}
51 | |
# Print one mapping line for the run of linear indices FROM..TO.
#
# A single-element run is printed as "0x<gb> U+<uni>"; a longer run as
# "0x<gb>-0x<gb> U+<uni>-U+<uni>", where the last Unicode value is the
# first one plus the run length (the run is taken to be contiguous on
# both sides).  The first Unicode value is read from gbtable[FROM].
#
# gbhex and uni are declared as extra parameters so they are local.
# The END block keeps its run start in a global named `fromgb';
# assigning a variable of that name here would overwrite it with a hex
# string.
function printline(from, to,    gbhex, uni) {
  gbhex = index_to_gb(from);
  uni = gbtable[from];
  if (from == to)
    printf ("0x%s U+%04X\n", gbhex, uni);
  else
    printf ("0x%s-0x%s U+%04X-U+%04X\n", gbhex, index_to_gb(to),
            uni, uni + (to - from));
}
61 | |
# For every input line that starts with <UXXXX> (four upper-case hex
# digits), record in unitable[] what the second field maps to: the
# linear index when the field looks like a four-byte \xHH sequence
# accepted by the pattern below, or -1 otherwise.
/^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]>/ {
  unicode = decode_hex(substr($1, 3, 4));
  unitable[unicode] = ($2 ~ /\\x8[1-4]\\x3[0-9]\\x[8-9A-F][0-9A-F]\\x3[0-9]/) \
    ? decode_gb($2) : -1;
}
69 | |
# After all input has been read: complete the index <-> code point
# tables for code points 129..65535 and print the mapping as runs.
END {
  surrogate_min = decode_hex("D800");
  surrogate_max = decode_hex("DFFF");

  # Seed the tables from the entry recorded for code point 128.
  lastgb = unitable[128];
  gbtable[lastgb] = 128;

  # Pass 1: give every code point that has no entry (it compares equal
  # to 0) and lies outside D800..DFFF the index following the most
  # recently seen one.  A positive entry resynchronizes lastgb.
  # Entries of -1 and the D800..DFFF range are left untouched.
  for (i = 129; i < 65536; i++)
    {
      if (unitable[i] == 0 && (i < surrogate_min || i > surrogate_max))
        {
          lastgb++;
          gbtable[lastgb] = i;
          unitable[i] = lastgb;
        }
      else if (unitable[i] > 0)
        {
          lastgb = unitable[i];
          gbtable[lastgb] = i;
        }
    }

  # Pass 2: emit maximal runs of consecutive indices.  The start of the
  # current run is kept in run_start, a name no helper function
  # assigns.  lastgb is the last index of the open run, or -1 when no
  # run is open.
  run_start = lastgb = unitable[128];
  for (i = 129; i < 65536; i++)
    {
      if (unitable[i] > 0)
        {
          if (lastgb + 1 == unitable[i])
            lastgb++;
          else
            {
              if (lastgb >= 0)
                printline(run_start, lastgb);
              run_start = lastgb = unitable[i];
            }
        }
      else  # unitable[i] is -1, or 0 for the skipped D800..DFFF range
        {
          if (lastgb >= 0)
            printline(run_start, lastgb);
          lastgb = -1;
        }
    }

  # Flush the run that is still open, if any.  When the entry for 65535
  # is positive, lastgb equals it, so this prints the same final line
  # as passing unitable[65535]; when it is not, nothing bogus is
  # printed.
  if (lastgb >= 0)
    printline(run_start, lastgb);
}