Mercurial > emacs
comparison lisp/international/utf-8.el @ 56037:81dbb510a1db
(utf-translate-cjk-charsets): New
variable.
(utf-translate-cjk-unicode-range): New variable.
(utf-translate-cjk-load-tables): New function.
(utf-lookup-subst-table-for-decode): New function.
(utf-lookup-subst-table-for-encode): New function.
(utf-translate-cjk-mode): Init-value changed to t. Don't load
tables here. Update safe-charsets of utf-* coding systems.
(ccl-mule-utf-untrans): New CCL.
(ccl-decode-mule-utf-8): Call ccl-mule-utf-untrans. Use `repeat'
at end of each branch.
(ccl-mule-utf-8-encode-untrans): New CCL.
(ccl-encode-mule-utf-8): Call ccl-mule-utf-8-encode-untrans.
(ccl-untranslated-to-ucs): Handle 2-byte encoding. Set r1 to the
length of encoding. Don't return r0.
(utf-8-compose): New arg hash-table. Handle 2-byte encoding.
(utf-8-post-read-conversion): Narrow to region properly. If
utf-translate-cjk-mode is on, load tables if necessary. Call
utf-8-compose with hash-table arg if necessary. Call
XXX-compose-region instead of XXX-post-read-convesion.
(utf-8-pre-write-conversion): New function.
(mule-utf-8): Include CJK charsets in safe-charsets if
utf-translate-cjk-mode is on. Add pre-write-conversion.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Sat, 12 Jun 2004 02:10:37 +0000 |
parents | 6e677a935fe9 |
children | 4575a565f45d |
comparison
equal
deleted
inserted
replaced
56036:ff6f1f61fea4 | 56037:81dbb510a1db |
---|---|
188 (set-default s v)) | 188 (set-default s v)) |
189 :version "21.4" | 189 :version "21.4" |
190 :type 'boolean | 190 :type 'boolean |
191 :group 'mule) | 191 :group 'mule) |
192 | 192 |
193 (define-minor-mode utf-translate-cjk-mode | 193 |
194 "Whether the UTF based coding systems should decode/encode CJK characters. | 194 (defconst utf-translate-cjk-charsets '(chinese-gb2312 |
195 Enabling this loads tables which allow the coding systems mule-utf-8, | 195 chinese-big5-1 chinese-big5-2 |
196 mule-utf-16le and mule-utf-16be to encode characters in the charsets | 196 japanese-jisx0208 japanese-jisx0212 |
197 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', | 197 korean-ksc5601) |
198 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to | 198 "List of charsets supported by `utf-translate-cjk-mode'.") |
199 decode the corresponding unicodes into such characters. | 199 |
200 | 200 (defconst utf-translate-cjk-unicode-range |
201 Where the charsets overlap, the one preferred for decoding is chosen | 201 '((#x2e80 . #xd7a3) |
202 according to the language environment in effect when this option is | 202 (#xff00 . #xffef)) |
203 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for | 203 "List of Unicode code ranges supported by `utf-translate-cjk-mode'.") |
204 Chinese-Big5 and jisx for other environments. | 204 |
205 | 205 ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'. |
206 The tables are large (over 40000 entries), so this option is not the | 206 (defsubst utf-translate-cjk-substitutable-p (code-point) |
207 default. Also, installing them may be rather slow." | 207 (let ((tail utf-translate-cjk-unicode-range) |
208 :init-value nil | 208 elt) |
209 :version "21.4" | 209 (while tail |
210 :type 'boolean | 210 (setq elt (car tail) tail (cdr tail)) |
211 :set-after '(current-language-environment) | 211 (if (and (>= code-point (car elt)) (<= code-point (cdr elt))) |
212 :group 'mule | 212 (setq tail nil) |
213 :global t | 213 (setq elt nil))) |
214 (if utf-translate-cjk-mode | 214 elt)) |
215 ;; Fixme: Allow the use of the CJK charsets to be | 215 |
216 ;; customized by reordering and possible omission. | 216 (defvar utf-translate-cjk-lang-env nil |
217 (progn | 217 "Language environment in which tables for `utf-translate-cjk-mode' is loaded. |
218 The value nil means that the tables are not yet loaded.") | |
219 | |
220 (defun utf-translate-cjk-load-tables () | |
221 "Load tables for `utf-translate-cjk-mode'." | |
222 ;; Fixme: Allow the use of the CJK charsets to be | |
223 ;; customized by reordering and possible omission. | |
224 (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000))) | |
225 (if redefined | |
218 ;; Redefine them with realistic initial sizes and a | 226 ;; Redefine them with realistic initial sizes and a |
219 ;; smallish rehash size to avoid wasting significant | 227 ;; smallish rehash size to avoid wasting significant |
220 ;; space after they're built. | 228 ;; space after they're built. |
221 (setq ucs-mule-cjk-to-unicode | 229 (setq ucs-mule-cjk-to-unicode |
222 (make-hash-table :test 'eq :size 43000 :rehash-size 1000) | 230 (make-hash-table :test 'eq :size 43000 :rehash-size 1000) |
223 ucs-unicode-to-mule-cjk | 231 ucs-unicode-to-mule-cjk |
224 (make-hash-table :test 'eq :size 21500 :rehash-size 1000)) | 232 (make-hash-table :test 'eq :size 21500 :rehash-size 1000))) |
225 ;; Load the files explicitly, to avoid having to keep | 233 |
226 ;; around the large tables they contain (as well as the | 234 ;; Load the files explicitly, to avoid having to keep |
227 ;; ones which get built). | 235 ;; around the large tables they contain (as well as the |
228 (cond | 236 ;; ones which get built). |
229 ((string= "Korean" current-language-environment) | 237 (cond ((string= "Korean" current-language-environment) |
230 (load "subst-jis") | 238 (load "subst-jis") |
231 (load "subst-big5") | 239 (load "subst-big5") |
232 (load "subst-gb2312") | 240 (load "subst-gb2312") |
233 (load "subst-ksc")) | 241 (load "subst-ksc")) |
234 ((string= "Chinese-BIG5" current-language-environment) | 242 ((string= "Chinese-BIG5" current-language-environment) |
235 (load "subst-jis") | 243 (load "subst-jis") |
236 (load "subst-ksc") | 244 (load "subst-ksc") |
237 (load "subst-gb2312") | 245 (load "subst-gb2312") |
238 (load "subst-big5")) | 246 (load "subst-big5")) |
239 ((string= "Chinese-GB" current-language-environment) | 247 ((string= "Chinese-GB" current-language-environment) |
240 (load "subst-jis") | 248 (load "subst-jis") |
241 (load "subst-ksc") | 249 (load "subst-ksc") |
242 (load "subst-big5") | 250 (load "subst-big5") |
243 (load "subst-gb2312")) | 251 (load "subst-gb2312")) |
244 (t | 252 (t |
245 (load "subst-ksc") | 253 (load "subst-ksc") |
246 (load "subst-gb2312") | 254 (load "subst-gb2312") |
247 (load "subst-big5") | 255 (load "subst-big5") |
248 (load "subst-jis"))) ; jis covers as much as big5, gb2312 | 256 (load "subst-jis"))) ; jis covers as much as big5, gb2312 |
257 | |
258 (when redefined | |
259 (define-translation-hash-table 'utf-subst-table-for-decode | |
260 ucs-unicode-to-mule-cjk) | |
261 (define-translation-hash-table 'utf-subst-table-for-encode | |
262 ucs-mule-cjk-to-unicode) | |
263 (set-char-table-extra-slot (get 'utf-translation-table-for-encode | |
264 'translation-table) | |
265 1 ucs-mule-cjk-to-unicode)) | |
266 | |
267 (setq utf-translate-cjk-lang-env current-language-environment))) | |
268 | |
269 (defun utf-lookup-subst-table-for-decode (code-point) | |
270 (if (and utf-translate-cjk-mode | |
271 (not utf-translate-cjk-lang-env) | |
272 (utf-translate-cjk-substitutable-p code-point)) | |
273 (utf-translate-cjk-load-tables)) | |
274 (gethash code-point | |
275 (get 'utf-subst-table-for-decode 'translation-hash-table))) | |
276 | |
277 | |
278 (defun utf-lookup-subst-table-for-encode (char) | |
279 (if (and utf-translate-cjk-mode | |
280 (not utf-translate-cjk-lang-env) | |
281 (memq (char-charset char) utf-translate-cjk-charsets)) | |
282 (utf-translate-cjk-load-tables)) | |
283 (gethash char | |
284 (get 'utf-subst-table-for-encode 'translation-hash-table))) | |
285 | |
286 (define-minor-mode utf-translate-cjk-mode | |
287 "Whether the UTF based coding systems should decode/encode CJK characters. | |
288 Enabling this allows the coding systems mule-utf-8, | |
289 mule-utf-16le and mule-utf-16be to encode characters in the charsets | |
290 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', | |
291 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to | |
292 decode the corresponding unicodes into such characters. | |
293 | |
294 Where the charsets overlap, the one preferred for decoding is chosen | |
295 according to the language environment in effect when this option is | |
296 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for | |
297 Chinese-Big5 and jisx for other environments. | |
298 | |
299 This option is on by default. If you are not interested in CJK | |
300 characters and want to avoid some overhead on encoding/decoding | |
301 by the above coding systems, you can customize this option to nil." | |
302 :init-value t | |
303 :version "21.4" | |
304 :type 'boolean | |
305 :group 'mule | |
306 :global t | |
307 (if utf-translate-cjk-mode | |
308 (progn | |
249 (define-translation-hash-table 'utf-subst-table-for-decode | 309 (define-translation-hash-table 'utf-subst-table-for-decode |
250 ucs-unicode-to-mule-cjk) | 310 ucs-unicode-to-mule-cjk) |
251 (define-translation-hash-table 'utf-subst-table-for-encode | 311 (define-translation-hash-table 'utf-subst-table-for-encode |
252 ucs-mule-cjk-to-unicode) | 312 ucs-mule-cjk-to-unicode) |
253 (set-char-table-extra-slot (get 'utf-translation-table-for-encode | 313 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
257 (make-hash-table :test 'eq)) | 317 (make-hash-table :test 'eq)) |
258 (define-translation-hash-table 'utf-subst-table-for-encode | 318 (define-translation-hash-table 'utf-subst-table-for-encode |
259 (make-hash-table :test 'eq)) | 319 (make-hash-table :test 'eq)) |
260 (set-char-table-extra-slot (get 'utf-translation-table-for-encode | 320 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
261 'translation-table) | 321 'translation-table) |
262 1 nil))) | 322 1 nil)) |
323 | |
324 ;; Update safe-chars of mule-utf-* coding systems. | |
325 (dolist (elt (coding-system-list t)) | |
326 (if (string-match "^mule-utf" (symbol-name elt)) | |
327 (let ((safe-charsets (coding-system-get elt 'safe-charsets)) | |
328 (safe-chars (coding-system-get elt 'safe-chars)) | |
329 (need-update nil)) | |
330 (dolist (charset utf-translate-cjk-charsets) | |
331 (unless (eq utf-translate-cjk-mode (memq charset safe-charsets)) | |
332 (setq safe-charsets | |
333 (if utf-translate-cjk-mode | |
334 (cons charset safe-charsets) | |
335 (delq charset safe-charsets)) | |
336 need-update t) | |
337 (aset safe-chars (make-char charset) utf-translate-cjk-mode))) | |
338 (when need-update | |
339 (coding-system-put elt 'safe-charsets safe-charsets) | |
340 (define-coding-system-internal elt)))))) | |
341 | |
342 (define-ccl-program ccl-mule-utf-untrans | |
343 ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or | |
344 ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF). Write | |
345 ;; eight-bit-control/graphic sequence (2 to 4 chars) representing | |
346 ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified. | |
347 ;; | |
348 ;; This is a subrountine because we assume that this is called very | |
349 ;; rarely (so we don't have to worry about the overhead of the | |
350 ;; call). | |
351 `(0 | |
352 ((r5 = ,(charset-id 'eight-bit-control)) | |
353 (r6 = ,(charset-id 'eight-bit-graphic)) | |
354 (if (r0 < #x100) | |
355 ((r4 = ((r0 >> 6) | #xC0)) | |
356 (write-multibyte-character r6 r4)) | |
357 ((if (r0 < #x10000) | |
358 ((r4 = ((r0 >> 12) | #xE0)) | |
359 (write-multibyte-character r6 r4)) | |
360 ((r4 = ((r0 >> 18) | #xF0)) | |
361 (write-multibyte-character r6 r4) | |
362 (r4 = (((r0 >> 12) & #x3F) | #x80)) | |
363 (if (r4 < #xA0) | |
364 (write-multibyte-character r5 r4) | |
365 (write-multibyte-character r6 r4)))) | |
366 (r4 = (((r0 >> 6) & #x3F) | #x80)) | |
367 (if (r4 < #xA0) | |
368 (write-multibyte-character r5 r4) | |
369 (write-multibyte-character r6 r4)))) | |
370 (r4 = ((r0 & #x3F) | #x80)) | |
371 (if (r4 < #xA0) | |
372 (write-multibyte-character r5 r4) | |
373 (write-multibyte-character r6 r4))))) | |
263 | 374 |
264 (define-ccl-program ccl-decode-mule-utf-8 | 375 (define-ccl-program ccl-decode-mule-utf-8 |
265 ;; | 376 ;; |
266 ;; charset | bytes in utf-8 | bytes in emacs | 377 ;; charset | bytes in utf-8 | bytes in emacs |
267 ;; -----------------------+----------------+--------------- | 378 ;; -----------------------+----------------+--------------- |
276 ;; -----------------------+----------------+--------------- | 387 ;; -----------------------+----------------+--------------- |
277 ;; mule-unicode-0100-24ff | 3 | 4 | 388 ;; mule-unicode-0100-24ff | 3 | 4 |
278 ;; (>= 8000) | | | 389 ;; (>= 8000) | | |
279 ;; mule-unicode-2500-33ff | 3 | 4 | 390 ;; mule-unicode-2500-33ff | 3 | 4 |
280 ;; mule-unicode-e000-ffff | 3 | 4 | 391 ;; mule-unicode-e000-ffff | 3 | 4 |
392 ;; -----------------------+----------------+--------------- | |
393 ;; invalid byte | 1 | 2 | |
281 ;; | 394 ;; |
282 ;; Thus magnification factor is two. | 395 ;; Thus magnification factor is two. |
283 ;; | 396 ;; |
284 `(2 | 397 `(2 |
285 ((r5 = ,(charset-id 'eight-bit-control)) | 398 ((r0 = -1) |
286 (r6 = ,(charset-id 'eight-bit-graphic)) | |
287 (loop | 399 (loop |
400 (if (r0 < 0) | |
401 (read r0)) | |
402 (if (r0 < #x80) | |
403 ;; 1-byte encoding, i.e., ascii | |
404 ((write r0) | |
405 (r0 = -1) | |
406 (repeat))) | |
407 (if (r0 < #xc0) ; continuation byte (invalid here) | |
408 ((call ccl-mule-utf-untrans) | |
409 (r0 = -1) | |
410 (repeat))) | |
411 | |
412 ;; Read the 2nd byte. | |
413 (r1 = -1) | |
414 (read r1) | |
415 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte | |
416 ((call ccl-mule-utf-untrans) | |
417 ;; Handle it in the next loop. | |
418 (r0 = r1) | |
419 (repeat))) | |
420 | |
421 (if (r0 < #xe0) | |
422 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx | |
423 ((r2 = ((r0 & #x1F) << 6)) | |
424 (r2 |= (r1 & #x3F)) | |
425 ;; Now r2 holds scalar value | |
426 | |
427 (if (r2 < 128) ; `overlong sequence' | |
428 ((call ccl-mule-utf-untrans) | |
429 (r0 = r1) | |
430 (call ccl-mule-utf-untrans) | |
431 (r0 = -1) | |
432 (repeat))) | |
433 | |
434 (r1 = r2) | |
435 (if (r1 < 160) | |
436 ;; eight-bit-control | |
437 (r0 = ,(charset-id 'eight-bit-control)) | |
438 (if (r1 < 256) | |
439 ;; latin-iso8859-1 | |
440 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
441 (r1 -= 128)) | |
442 ;; mule-unicode-0100-24ff (< 0800) | |
443 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
444 (r1 -= #x0100) | |
445 (r2 = (((r1 / 96) + 32) << 7)) | |
446 (r1 %= 96) | |
447 (r1 += (r2 + 32)) | |
448 (translate-character | |
449 utf-translation-table-for-decode r0 r1)))) | |
450 (write-multibyte-character r0 r1) | |
451 (r0 = -1) | |
452 (repeat))) | |
453 | |
454 ;; Read the 3rd bytes. | |
455 (r2 = -1) | |
456 (read r2) | |
457 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte | |
458 ((call ccl-mule-utf-untrans) | |
459 (r0 = r1) | |
460 (call ccl-mule-utf-untrans) | |
461 ;; Handle it in the next loop. | |
462 (r0 = r2) | |
463 (repeat))) | |
464 | |
465 (if (r0 < #xF0) | |
466 ;; 3byte encoding | |
467 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
468 ((r3 = ((r0 & #xF) << 12)) | |
469 (r3 |= ((r1 & #x3F) << 6)) | |
470 (r3 |= (r2 & #x3F)) | |
471 | |
472 (if (r3 < #x800) ; `overlong sequence' | |
473 ((call ccl-mule-utf-untrans) | |
474 (r0 = r1) | |
475 (call ccl-mule-utf-untrans) | |
476 (r0 = r2) | |
477 (call ccl-mule-utf-untrans) | |
478 (r0 = -1) | |
479 (repeat))) | |
480 | |
481 (if (r3 < #x2500) | |
482 ;; mule-unicode-0100-24ff (>= 0800) | |
483 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
484 (r3 -= #x0100) | |
485 (r3 //= 96) | |
486 (r1 = (r7 + 32)) | |
487 (r1 += ((r3 + 32) << 7)) | |
488 (translate-character | |
489 utf-translation-table-for-decode r0 r1) | |
490 (write-multibyte-character r0 r1) | |
491 (r0 = -1) | |
492 (repeat))) | |
493 | |
494 (if (r3 < #x3400) | |
495 ;; mule-unicode-2500-33ff | |
496 ((r0 = r3) ; don't zap r3 | |
497 (lookup-integer utf-subst-table-for-decode r0 r1) | |
498 (if (r7 == 0) | |
499 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
500 (r3 -= #x2500) | |
501 (r3 //= 96) | |
502 (r1 = (r7 + 32)) | |
503 (r1 += ((r3 + 32) << 7)))) | |
504 (write-multibyte-character r0 r1) | |
505 (r0 = -1) | |
506 (repeat))) | |
507 | |
508 (if (r3 < #xE000) | |
509 ;; Try to convert to CJK chars, else | |
510 ;; keep them as eight-bit-{control|graphic}. | |
511 ((r0 = r3) | |
512 (lookup-integer utf-subst-table-for-decode r3 r1) | |
513 (if r7 | |
514 ;; got a translation | |
515 (write-multibyte-character r3 r1) | |
516 (call ccl-mule-utf-untrans)) | |
517 (r0 = -1) | |
518 (repeat))) | |
519 | |
520 ;; mule-unicode-e000-ffff | |
521 ;; Fixme: fffe and ffff are invalid. | |
522 (r0 = r3) ; don't zap r3 | |
523 (lookup-integer utf-subst-table-for-decode r0 r1) | |
524 (if (r7 == 0) | |
525 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
526 (r3 -= #xe000) | |
527 (r3 //= 96) | |
528 (r1 = (r7 + 32)) | |
529 (r1 += ((r3 + 32) << 7)))) | |
530 (write-multibyte-character r0 r1) | |
531 (r0 = -1) | |
532 (repeat))) | |
533 | |
534 ;; Read the 4th bytes. | |
535 (r3 = -1) | |
536 (read r3) | |
537 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte | |
538 ((call ccl-mule-utf-untrans) | |
539 (r0 = r1) | |
540 (call ccl-mule-utf-untrans) | |
541 ;; Handle it in the next loop. | |
542 (r0 = r3) | |
543 (repeat))) | |
544 | |
545 (if (r3 < #xF8) | |
546 ;; 4-byte encoding: | |
547 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx | |
548 ;; keep those bytes as eight-bit-{control|graphic} | |
549 ;; Fixme: allow lookup in utf-subst-table-for-decode. | |
550 ((r4 = ((r0 & #x7) << 18)) | |
551 (r4 |= ((r1 & #x3F) << 12)) | |
552 (r4 |= ((r2 & #x3F) << 6)) | |
553 (r4 |= (r3 & #x3F)) | |
554 | |
555 (if (r4 < #x10000) ; `overlong sequence' | |
556 ((call ccl-mule-utf-untrans) | |
557 (r0 = r1) | |
558 (call ccl-mule-utf-untrans) | |
559 (r0 = r2) | |
560 (call ccl-mule-utf-untrans) | |
561 (r0 = r3) | |
562 (call ccl-mule-utf-untrans)) | |
563 ((r0 = r4) | |
564 (call ccl-mule-utf-untrans))) | |
565 (r0 = -1) | |
566 (repeat))) | |
567 | |
568 ;; Unsupported sequence. | |
569 (call ccl-mule-utf-untrans) | |
570 (r0 = r1) | |
571 (call ccl-mule-utf-untrans) | |
572 (r0 = r2) | |
573 (call ccl-mule-utf-untrans) | |
574 (r0 = r3) | |
575 (call ccl-mule-utf-untrans) | |
288 (r0 = -1) | 576 (r0 = -1) |
289 (read r0) | |
290 | |
291 ;; 1byte encoding, i.e., ascii | |
292 (if (r0 < #x80) | |
293 ((write r0)) | |
294 (if (r0 < #xc0) ; continuation byte (invalid here) | |
295 ((if (r0 < #xa0) | |
296 (write-multibyte-character r5 r0) | |
297 (write-multibyte-character r6 r0))) | |
298 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx | |
299 (if (r0 < #xe0) | |
300 ((r1 = -1) | |
301 (read r1) | |
302 | |
303 (if ((r1 & #b11000000) != #b10000000) | |
304 ;; Invalid 2-byte sequence | |
305 ((if (r0 < #xa0) | |
306 (write-multibyte-character r5 r0) | |
307 (write-multibyte-character r6 r0)) | |
308 (if (r1 < #x80) | |
309 (write r1) | |
310 (if (r1 < #xa0) | |
311 (write-multibyte-character r5 r1) | |
312 (write-multibyte-character r6 r1)))) | |
313 | |
314 ((r3 = r0) ; save in case of overlong sequence | |
315 (r2 = r1) | |
316 (r0 &= #x1f) | |
317 (r0 <<= 6) | |
318 (r1 &= #x3f) | |
319 (r1 += r0) | |
320 ;; Now r1 holds scalar value | |
321 | |
322 (if (r1 < 128) ; `overlong sequence' | |
323 ((if (r3 < #xa0) | |
324 (write-multibyte-character r5 r3) | |
325 (write-multibyte-character r6 r3)) | |
326 (if (r2 < #x80) | |
327 (write r2) | |
328 (if (r2 < #xa0) | |
329 (write-multibyte-character r5 r2) | |
330 (write-multibyte-character r6 r2)))) | |
331 | |
332 ;; eight-bit-control | |
333 (if (r1 < 160) | |
334 ((write-multibyte-character r5 r1)) | |
335 | |
336 ;; latin-iso8859-1 | |
337 (if (r1 < 256) | |
338 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
339 (r1 -= 128) | |
340 (write-multibyte-character r0 r1)) | |
341 | |
342 ;; mule-unicode-0100-24ff (< 0800) | |
343 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
344 (r1 -= #x0100) | |
345 (r2 = (((r1 / 96) + 32) << 7)) | |
346 (r1 %= 96) | |
347 (r1 += (r2 + 32)) | |
348 (translate-character | |
349 utf-translation-table-for-decode r0 r1) | |
350 (write-multibyte-character r0 r1)))))))) | |
351 | |
352 ;; 3byte encoding | |
353 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
354 (if (r0 < #xf0) | |
355 ((r1 = -1) | |
356 (r2 = -1) | |
357 (read r1 r2) | |
358 | |
359 ;; This is set to 1 if the encoding is invalid. | |
360 (r4 = 0) | |
361 | |
362 (r3 = (r1 & #b11000000)) | |
363 (r3 |= ((r2 >> 2) & #b00110000)) | |
364 (if (r3 != #b10100000) | |
365 (r4 = 1) | |
366 ((r3 = ((r0 & #x0f) << 12)) | |
367 (r3 += ((r1 & #x3f) << 6)) | |
368 (r3 += (r2 & #x3f)) | |
369 (if (r3 < #x0800) | |
370 (r4 = 1)))) | |
371 | |
372 (if (r4 != 0) | |
373 ;; Invalid 3-byte sequence | |
374 ((if (r0 < #xa0) | |
375 (write-multibyte-character r5 r0) | |
376 (write-multibyte-character r6 r0)) | |
377 (if (r1 < #x80) | |
378 (write r1) | |
379 (if (r1 < #xa0) | |
380 (write-multibyte-character r5 r1) | |
381 (write-multibyte-character r6 r1))) | |
382 (if (r2 < #x80) | |
383 (write r2) | |
384 (if (r2 < #xa0) | |
385 (write-multibyte-character r5 r2) | |
386 (write-multibyte-character r6 r2)))) | |
387 | |
388 ;; mule-unicode-0100-24ff (>= 0800) | |
389 ((if (r3 < #x2500) | |
390 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
391 (r3 -= #x0100) | |
392 (r3 //= 96) | |
393 (r1 = (r7 + 32)) | |
394 (r1 += ((r3 + 32) << 7)) | |
395 (translate-character | |
396 utf-translation-table-for-decode r0 r1) | |
397 (write-multibyte-character r0 r1)) | |
398 | |
399 ;; mule-unicode-2500-33ff | |
400 (if (r3 < #x3400) | |
401 ((r4 = r3) ; don't zap r3 | |
402 (lookup-integer utf-subst-table-for-decode r4 r5) | |
403 (if r7 | |
404 ;; got a translation | |
405 ((write-multibyte-character r4 r5) | |
406 ;; Zapped through register starvation. | |
407 (r5 = ,(charset-id 'eight-bit-control))) | |
408 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
409 (r3 -= #x2500) | |
410 (r3 //= 96) | |
411 (r1 = (r7 + 32)) | |
412 (r1 += ((r3 + 32) << 7)) | |
413 (write-multibyte-character r0 r1)))) | |
414 | |
415 ;; U+3400 .. U+D7FF | |
416 ;; Try to convert to CJK chars, else keep | |
417 ;; them as eight-bit-{control|graphic}. | |
418 (if (r3 < #xd800) | |
419 ((r4 = r3) ; don't zap r3 | |
420 (lookup-integer utf-subst-table-for-decode r4 r5) | |
421 (if r7 | |
422 ;; got a translation | |
423 ((write-multibyte-character r4 r5) | |
424 ;; Zapped through register starvation. | |
425 (r5 = ,(charset-id 'eight-bit-control))) | |
426 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | |
427 ((r3 = r6) | |
428 (write-multibyte-character r3 r0) | |
429 (if (r1 < #xa0) | |
430 (r3 = r5)) | |
431 (write-multibyte-character r3 r1) | |
432 (if (r2 < #xa0) | |
433 (r3 = r5) | |
434 (r3 = r6)) | |
435 (write-multibyte-character r3 r2)))) | |
436 | |
437 ;; Surrogates, U+D800 .. U+DFFF | |
438 (if (r3 < #xe000) | |
439 ((r3 = r6) | |
440 (write-multibyte-character r3 r0) ; eight-bit-graphic | |
441 (if (r1 < #xa0) | |
442 (r3 = r5)) | |
443 (write-multibyte-character r3 r1) | |
444 (if (r2 < #xa0) | |
445 (r3 = r5) | |
446 (r3 = r6)) | |
447 (write-multibyte-character r3 r2)) | |
448 | |
449 ;; mule-unicode-e000-ffff | |
450 ;; Fixme: fffe and ffff are invalid. | |
451 ((r4 = r3) ; don't zap r3 | |
452 (lookup-integer utf-subst-table-for-decode r4 r5) | |
453 (if r7 | |
454 ;; got a translation | |
455 ((write-multibyte-character r4 r5) | |
456 ;; Zapped through register starvation. | |
457 (r5 = ,(charset-id 'eight-bit-control))) | |
458 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
459 (r3 -= #xe000) | |
460 (r3 //= 96) | |
461 (r1 = (r7 + 32)) | |
462 (r1 += ((r3 + 32) << 7)) | |
463 (write-multibyte-character r0 r1))))))))))) | |
464 | |
465 (if (r0 < #xfe) | |
466 ;; 4byte encoding | |
467 ;; keep those bytes as eight-bit-{control|graphic} | |
468 ;; Fixme: allow lookup in utf-subst-table-for-decode. | |
469 ((r1 = -1) | |
470 (r2 = -1) | |
471 (r3 = -1) | |
472 (read r1 r2 r3) | |
473 ;; r0 > #xf0, thus eight-bit-graphic | |
474 (write-multibyte-character r6 r0) | |
475 (if (r1 < #xa0) | |
476 (if (r1 < #x80) ; invalid byte | |
477 (write r1) | |
478 (write-multibyte-character r5 r1)) | |
479 (write-multibyte-character r6 r1)) | |
480 (if (r2 < #xa0) | |
481 (if (r2 < #x80) ; invalid byte | |
482 (write r2) | |
483 (write-multibyte-character r5 r2)) | |
484 (write-multibyte-character r6 r2)) | |
485 (if (r3 < #xa0) | |
486 (if (r3 < #x80) ; invalid byte | |
487 (write r3) | |
488 (write-multibyte-character r5 r3)) | |
489 (write-multibyte-character r6 r3)) | |
490 (if (r0 >= #xf8) ; 5- or 6-byte encoding | |
491 ((r0 = -1) | |
492 (read r0) | |
493 (if (r0 < #xa0) | |
494 (if (r0 < #x80) ; invalid byte | |
495 (write r0) | |
496 (write-multibyte-character r5 r0)) | |
497 (write-multibyte-character r6 r0)) | |
498 (if (r0 >= #xfc) ; 6-byte | |
499 ((r0 = -1) | |
500 (read r0) | |
501 (if (r0 < #xa0) | |
502 (if (r0 < #x80) ; invalid byte | |
503 (write r0) | |
504 (write-multibyte-character r5 r0)) | |
505 (write-multibyte-character r6 r0))))))) | |
506 ;; else invalid byte >= #xfe | |
507 (write-multibyte-character r6 r0)))))) | |
508 (repeat))) | 577 (repeat))) |
509 | 578 |
510 ;; At EOF... | 579 ;; At EOF... |
511 (if (r0 >= 0) | 580 (if (r0 >= 0) |
512 ((if (r0 < #x80) | 581 ;; r0 >= #x80 |
513 (write r0) | 582 ((call ccl-mule-utf-untrans) |
514 (if (r0 < #xa0) | |
515 (write-multibyte-character r5 r0) | |
516 ((write-multibyte-character r6 r0)))) | |
517 (if (r1 >= 0) | 583 (if (r1 >= 0) |
518 ((if (r1 < #x80) | 584 ((r0 = r1) |
519 (write r1) | 585 (call ccl-mule-utf-untrans) |
520 (if (r1 < #xa0) | |
521 (write-multibyte-character r5 r1) | |
522 ((write-multibyte-character r6 r1)))) | |
523 (if (r2 >= 0) | 586 (if (r2 >= 0) |
524 ((if (r2 < #x80) | 587 ((r0 = r2) |
525 (write r2) | 588 (call ccl-mule-utf-untrans) |
526 (if (r2 < #xa0) | |
527 (write-multibyte-character r5 r2) | |
528 ((write-multibyte-character r6 r2)))) | |
529 (if (r3 >= 0) | 589 (if (r3 >= 0) |
530 (if (r3 < #x80) | 590 ((r0 = r3) |
531 (write r3) | 591 (call ccl-mule-utf-untrans)))))))))) |
532 (if (r3 < #xa0) | |
533 (write-multibyte-character r5 r3) | |
534 ((write-multibyte-character r6 r3)))))))))))) | |
535 | 592 |
536 "CCL program to decode UTF-8. | 593 "CCL program to decode UTF-8. |
537 Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 594 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
538 mule-unicode-*, but see also `utf-fragmentation-table' and | 595 mule-unicode-*, but see also `utf-fragmentation-table' and |
539 `ucs-mule-cjk-to-unicode'. | 596 `ucs-mule-cjk-to-unicode'. |
540 Encodings of un-representable Unicode characters are decoded asis into | 597 Encodings of un-representable Unicode characters are decoded asis into |
541 eight-bit-control and eight-bit-graphic characters.") | 598 eight-bit-control and eight-bit-graphic characters.") |
542 | 599 |
600 (define-ccl-program ccl-mule-utf-8-encode-untrans | |
601 ;; UTF-8 decoder generates an UTF-8 sequence represented by a | |
602 ;; sequence eight-bit-control/graphic chars for an untranslatable | |
603 ;; character and an invalid byte. | |
604 ;; | |
605 ;; This CCL parses that sequence (the first byte is already in r1), | |
606 ;; writes out the original bytes of that sequence, and sets r5 to | |
607 ;; -1. | |
608 ;; | |
609 ;; If the eight-bit-control/graphic sequence is shorter than what r1 | |
610 ;; suggests, it sets r5 and r6 to the last character read that | |
611 ;; should be handled by the next loop of a caller. | |
612 ;; | |
613 ;; Note: For UTF-8 validation, we only check if a character is | |
614 ;; eight-bit-control/graphic or not. It may result in incorrect | |
615 ;; handling of random binary data, but such a data can't be encoded | |
616 ;; by UTF-8 anyway. At least, UTF-8 decoders doesn't generate such | |
617 ;; a sequence even if a source contains invalid byte-sequence. | |
618 `(0 | |
619 (;; Read the 2nd byte. | |
620 (read-multibyte-character r5 r6) | |
621 (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
622 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
623 ((write r1) ; invalid UTF-8 | |
624 (r1 = -1) | |
625 (end))) | |
626 | |
627 (if (r1 <= #xC3) | |
628 ;; 2-byte sequence for an originally invalid byte. | |
629 ((r6 &= #x3F) | |
630 (r6 |= ((r1 & #x1F) << 6)) | |
631 (write r6) | |
632 (r5 = -1) | |
633 (end))) | |
634 | |
635 (write r1 r6) | |
636 (r2 = r1) | |
637 (r1 = -1) | |
638 ;; Read the 3rd byte. | |
639 (read-multibyte-character r5 r6) | |
640 (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
641 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
642 (end)) ; invalid UTF-8 | |
643 (write r6) | |
644 (if (r2 < #xF0) | |
645 ;; 3-byte sequence for an untranslated character. | |
646 ((r5 = -1) | |
647 (end))) | |
648 ;; Read the 4th byte. | |
649 (read-multibyte-character r5 r6) | |
650 (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
651 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
652 (end)) ; invalid UTF-8 | |
653 ;; 4-byte sequence for an untranslated character. | |
654 (write r6) | |
655 (r5 = -1) | |
656 (end)) | |
657 | |
658 ;; At EOF... | |
659 ((r5 = -1) | |
660 (if (r1 >= 0) | |
661 (write r1))))) | |
662 | |
543 (define-ccl-program ccl-encode-mule-utf-8 | 663 (define-ccl-program ccl-encode-mule-utf-8 |
544 `(1 | 664 `(1 |
545 ((r5 = -1) | 665 ((r5 = -1) |
546 (loop | 666 (loop |
547 (if (r5 < 0) | 667 (if (r5 < 0) |
548 ((r1 = -1) | 668 (read-multibyte-character r0 r1) |
549 (read-multibyte-character r0 r1) | 669 ;; Pre-read character is in r5 (charset-ID) and r6 (code-point). |
550 (translate-character utf-translation-table-for-encode r0 r1)) | 670 ((r0 = r5) |
551 (;; We have already done read-multibyte-character. | |
552 (r0 = r5) | |
553 (r1 = r6) | 671 (r1 = r6) |
554 (r5 = -1))) | 672 (r5 = -1))) |
673 (translate-character utf-translation-table-for-encode r0 r1) | |
555 | 674 |
556 (if (r0 == ,(charset-id 'ascii)) | 675 (if (r0 == ,(charset-id 'ascii)) |
557 (write r1) | 676 (write-repeat r1)) |
558 | 677 |
559 (if (r0 == ,(charset-id 'latin-iso8859-1)) | 678 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
560 ;; r1 scalar utf-8 | 679 ;; r1 scalar utf-8 |
561 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | 680 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
562 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 | 681 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
563 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 | 682 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
564 ((r0 = (((r1 & #x40) >> 6) | #xc2)) | 683 ((r0 = (((r1 & #x40) >> 6) | #xc2)) |
565 (r1 &= #x3f) | 684 (r1 &= #x3f) |
685 (r1 |= #x80) | |
686 (write r0) | |
687 (write-repeat r1))) | |
688 | |
689 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | |
690 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
691 ;; #x3f80 == (0011 1111 1000 0000)b | |
692 (r1 &= #x7f) | |
693 (r1 += (r0 + 224)) ; 240 == -32 + #x0100 | |
694 ;; now r1 holds scalar value | |
695 (if (r1 < #x0800) | |
696 ;; 2byte encoding | |
697 ((write ((r1 >> 6) | #xC0)) | |
698 (r1 &= #x3F) | |
699 (r1 |= #x80) | |
700 (write-repeat r1)) | |
701 ;; 3byte encoding | |
702 ((write ((r1 >> 12) | #xE0)) | |
703 (write (((r1 & #x0FC0) >> 6) | #x80)) | |
704 (r1 &= #x3F) | |
705 (r1 |= #x80) | |
706 (write-repeat r1))))) | |
707 | |
708 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
709 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
710 (r1 &= #x7f) | |
711 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | |
712 ;; now r1 holds scalar value | |
713 (write ((r1 >> 12) | #xE0)) | |
714 (write (((r1 & #x0FC0) >> 6) | #x80)) | |
715 (r1 &= #x3F) | |
716 (r1 |= #x80) | |
717 (write-repeat r1))) | |
718 | |
719 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
720 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
721 (r1 &= #x7f) | |
722 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 | |
723 ;; now r1 holds scalar value | |
724 (write ((r1 >> 12) | #xE0)) | |
725 (write (((r1 & #x0FC0) >> 6) | #x80)) | |
726 (r1 &= #x3F) | |
727 (r1 |= #x80) | |
728 (write-repeat r1))) | |
729 | |
730 (if (r0 == ,(charset-id 'eight-bit-control)) | |
731 ;; r1 scalar utf-8 | |
732 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
733 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | |
734 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | |
735 ((write #xC2) | |
736 (write-repeat r1))) | |
737 | |
738 (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
739 ;; r1 scalar utf-8 | |
740 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
741 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | |
742 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | |
743 ((r0 = (r1 >= #xC0)) | |
744 (r0 &= (r1 <= #xC3)) | |
745 (r4 = (r1 >= #xE1)) | |
746 (r4 &= (r1 <= #xF7)) | |
747 (r0 |= r4) | |
748 (if r0 | |
749 ((call ccl-mule-utf-8-encode-untrans) | |
750 (repeat)) | |
751 (write-repeat r1)))) | |
752 | |
753 (lookup-character utf-subst-table-for-encode r0 r1) | |
754 (if r7 ; lookup succeeded | |
755 (if (r0 < #x800) | |
756 ;; 2byte encoding | |
757 ((write ((r0 >> 6) | #xC0)) | |
758 (r1 &= #x3F) | |
759 (r1 |= #x80) | |
760 (write-repeat r1)) | |
761 ;; 3byte encoding | |
762 ((write ((r0 >> 12) | #xE0)) | |
763 (write (((r0 & #x0FC0) >> 6) | #x80)) | |
764 (r1 &= #x3F) | |
566 (r1 |= #x80) | 765 (r1 |= #x80) |
567 (write r0 r1)) | 766 (write-repeat r1)))) |
568 | 767 |
569 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | 768 ;; Unsupported character. |
570 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | 769 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. |
571 ;; #x3f80 == (0011 1111 1000 0000)b | 770 (write #xef) |
572 (r1 &= #x7f) | 771 (write #xbf) |
573 (r1 += (r0 + 224)) ; 240 == -32 + #x0100 | 772 (write-repeat #xbd)))) |
574 ;; now r1 holds scalar value | |
575 (if (r1 < #x0800) | |
576 ;; 2byte encoding | |
577 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) | |
578 ;; #x07c0 == (0000 0111 1100 0000)b | |
579 (r1 &= #x3f) | |
580 (r1 |= #x80) | |
581 (write r0 r1)) | |
582 ;; 3byte encoding | |
583 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
584 (r2 = ((r1 & #x3f) | #x80)) | |
585 (r1 &= #x0fc0) | |
586 (r1 >>= 6) | |
587 (r1 |= #x80) | |
588 (write r0 r1 r2)))) | |
589 | |
590 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
591 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
592 (r1 &= #x7f) | |
593 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | |
594 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
595 (r2 = ((r1 & #x3f) | #x80)) | |
596 (r1 &= #x0fc0) | |
597 (r1 >>= 6) | |
598 (r1 |= #x80) | |
599 (write r0 r1 r2)) | |
600 | |
601 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
602 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
603 (r1 &= #x7f) | |
604 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 | |
605 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
606 (r2 = ((r1 & #x3f) | #x80)) | |
607 (r1 &= #x0fc0) | |
608 (r1 >>= 6) | |
609 (r1 |= #x80) | |
610 (write r0 r1 r2)) | |
611 | |
612 (if (r0 == ,(charset-id 'eight-bit-control)) | |
613 ;; r1 scalar utf-8 | |
614 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
615 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | |
616 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | |
617 ((write #xc2) | |
618 (write r1)) | |
619 | |
620 (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
621 ;; r1 scalar utf-8 | |
622 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
623 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | |
624 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | |
625 ((write r1) | |
626 (r1 = -1) | |
627 (read-multibyte-character r0 r1) | |
628 (if (r0 != ,(charset-id 'eight-bit-graphic)) | |
629 (if (r0 != ,(charset-id 'eight-bit-control)) | |
630 ((r5 = r0) | |
631 (r6 = r1)))) | |
632 (if (r5 < 0) | |
633 ((read-multibyte-character r0 r2) | |
634 (if (r0 != ,(charset-id 'eight-bit-graphic)) | |
635 (if (r0 != ,(charset-id 'eight-bit-control)) | |
636 ((r5 = r0) | |
637 (r6 = r2)))) | |
638 (if (r5 < 0) | |
639 (write r1 r2) | |
640 (if (r1 < #xa0) | |
641 (write r1) | |
642 ((write #xc2) | |
643 (write r1))))))) | |
644 | |
645 ((lookup-character utf-subst-table-for-encode r0 r1) | |
646 (if r7 ; lookup succeeded | |
647 ((r1 = (((r0 & #xf000) >> 12) | #xe0)) | |
648 (r2 = ((r0 & #x3f) | #x80)) | |
649 (r0 &= #x0fc0) | |
650 (r0 >>= 6) | |
651 (r0 |= #x80) | |
652 (write r1 r0 r2)) | |
653 ;; Unsupported character. | |
654 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
655 ((write #xef) | |
656 (write #xbf) | |
657 (write #xbd))))))))))) | |
658 (repeat))) | |
659 (if (r1 >= #xa0) | |
660 (write r1) | |
661 (if (r1 >= #x80) | |
662 ((write #xc2) | |
663 (write r1))))) | |
664 | |
665 "CCL program to encode into UTF-8.") | 773 "CCL program to encode into UTF-8.") |
666 | 774 |
667 | 775 |
668 (define-ccl-program ccl-untranslated-to-ucs | 776 (define-ccl-program ccl-untranslated-to-ucs |
669 `(0 | 777 `(0 |
670 (if (r0 < #xf0) ; 3-byte encoding, as above | 778 (if (r1 == 0) |
671 ((r4 = 0) | 779 nil |
672 (r3 = (r1 & #b11000000)) | 780 (if (r0 <= #xC3) ; 2-byte encoding |
673 (r3 |= ((r2 >> 2) & #b00110000)) | 781 ((r0 = ((r0 & #x3) << 6)) |
674 (if (r3 != #b10100000) | 782 (r0 |= (r1 & #x3F)) |
675 (r4 = 1) | 783 (r1 = 2)) |
676 ((r3 = ((r0 & #x0f) << 12)) | 784 (if (r2 == 0) |
677 (r3 += ((r1 & #x3f) << 6)) | 785 (r1 = 0) |
678 (r3 += (r2 & #x3f)) | 786 (if (r0 < #xF0) ; 3-byte encoding, as above |
679 (if (r3 < #x0800) | 787 ((r0 = ((r0 & #xF) << 12)) |
680 (r4 = 1)))) | 788 (r0 |= ((r1 & #x3F) << 6)) |
681 (if (r4 != 0) | 789 (r0 |= (r1 & #x3F)) |
682 (r0 = 0) | 790 (r1 = 3)) |
683 (r0 = r3))) | 791 (if (r3 == 0) |
684 (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe) | 792 (r1 = 0) |
685 ((r4 = (r1 >> 6)) | 793 ((r0 = ((r0 & #x7) << 18)) |
686 (if (r4 != #b10) | 794 (r0 |= ((r1 & #x3F) << 12)) |
687 (r0 = 0) | 795 (r0 |= ((r2 & #x3F) << 6)) |
688 ((r4 = (r2 >> 6)) | 796 (r0 |= (r3 & #x3F)) |
689 (if (r4 != #b10) | 797 (r1 = 4)))))))) |
690 (r0 = 0) | 798 "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. |
691 ((r4 = (r3 >> 6)) | 799 Set r1 to the byte length. r0 == 0 for invalid sequence.") |
692 (if (r4 != #b10) | |
693 (r0 = 0) | |
694 ((r1 = ((r1 & #x3F) << 12)) | |
695 (r2 = ((r2 & #x3F) << 6)) | |
696 (r3 &= #x3F) | |
697 (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3))))))))) | |
698 (r0 = 0)))) | |
699 "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. | |
700 r0 == 0 for invalid sequence.") | |
701 | 800 |
702 (defvar utf-8-ccl-regs (make-vector 8 0)) | 801 (defvar utf-8-ccl-regs (make-vector 8 0)) |
703 | 802 |
704 (defsubst utf-8-untranslated-to-ucs () | 803 (defsubst utf-8-untranslated-to-ucs () |
705 "Return the UCS code for an untranslated sequence of raw bytes t point. | 804 "Return the UCS code for an untranslated sequence of raw bytes t point. |
706 Only for 3- or 4-byte sequences." | 805 Only for 3- or 4-byte sequences." |
707 (aset utf-8-ccl-regs 0 (or (char-after) 0)) | 806 (aset utf-8-ccl-regs 0 (or (char-after) 0)) |
708 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) | 807 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) |
709 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) | 808 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) |
710 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) | 809 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) |
711 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs) | 810 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)) |
712 (aref utf-8-ccl-regs 0)) | |
713 | 811 |
714 (defun utf-8-help-echo (window object position) | 812 (defun utf-8-help-echo (window object position) |
715 (format "Untranslated Unicode U+%04X" | 813 (format "Untranslated Unicode U+%04X" |
716 (get-char-property position 'untranslated-utf-8 object))) | 814 (get-char-property position 'untranslated-utf-8 object))) |
717 | 815 |
718 ;; We compose the untranslatable sequences into a single character. | 816 ;; We compose the untranslatable sequences into a single character, |
817 ;; and move point to the next character. | |
719 ;; This is infelicitous for editing, because there's currently no | 818 ;; This is infelicitous for editing, because there's currently no |
720 ;; mechanism for treating compositions as atomic, but is OK for | 819 ;; mechanism for treating compositions as atomic, but is OK for |
721 ;; display. They are composed to U+FFFD with help-echo which | 820 ;; display. They are composed to U+FFFD with help-echo which |
722 ;; indicates the unicodes they represent. This function GCs too much. | 821 ;; indicates the unicodes they represent. This function GCs too much. |
723 (defsubst utf-8-compose () | 822 |
724 "Put a suitable composition on an untranslatable sequence. | 823 ;; If utf-translate-cjk-mode is non-nil, this function is called with |
725 Return the sequence's length." | 824 ;; HASH-TABLE which translates CJK characters into some of CJK |
726 (let* ((u (utf-8-untranslated-to-ucs)) | 825 ;; charsets. |
727 (l (unless (zerop u) | 826 |
728 (if (>= u #x10000) | 827 (defsubst utf-8-compose (hash-table) |
729 4 | 828 "Put a suitable composition on an untranslatable sequence at point. |
730 3)))) | 829 If HASH-TABLE is non-nil, try to translate CJK characters by it at first. |
731 (when l | 830 Move point to the end of the sequence." |
732 (put-text-property (point) (min (point-max) (+ l (point))) | 831 (utf-8-untranslated-to-ucs) |
733 'untranslated-utf-8 u) | 832 (let ((l (aref utf-8-ccl-regs 1)) |
734 (put-text-property (point) (min (point-max) (+ l (point))) | 833 ch) |
735 'help-echo 'utf-8-help-echo) | 834 (if (> l 0) |
736 (compose-region (point) (+ l (point)) ?$,3u=(B) | 835 (if (and hash-table |
737 l))) | 836 (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table))) |
837 (progn | |
838 (insert ch) | |
839 (delete-region (point) (min (point-max) (+ l (point))))) | |
840 (setq ch (aref utf-8-ccl-regs 0)) | |
841 (put-text-property (point) (min (point-max) (+ l (point))) | |
842 'untranslated-utf-8 ch) | |
843 (put-text-property (point) (min (point-max) (+ l (point))) | |
844 'help-echo 'utf-8-help-echo) | |
845 (if (= l 2) | |
846 (put-text-property (point) (min (point-max) (+ l (point))) | |
847 'display (format "\\%03o" ch)) | |
848 (compose-region (point) (+ l (point)) ?$,3u=(B)) | |
849 (forward-char l)) | |
850 (forward-char 1)))) | |
738 | 851 |
739 (defcustom utf-8-compose-scripts nil | 852 (defcustom utf-8-compose-scripts nil |
740 "*Non-nil means compose various scripts on decoding utf-8 text." | 853 "*Non-nil means compose various scripts on decoding utf-8 text." |
741 :group 'mule | 854 :group 'mule |
742 :version "21.4" | 855 :version "21.4" |
743 :type 'boolean) | 856 :type 'boolean) |
744 | 857 |
745 (defun utf-8-post-read-conversion (length) | 858 (defun utf-8-post-read-conversion (length) |
746 "Compose untranslated utf-8 sequences into single characters. | 859 "Compose untranslated utf-8 sequences into single characters. |
860 If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters. | |
747 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." | 861 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." |
748 (save-excursion | 862 (save-excursion |
749 ;; Can't do eval-when-compile to insert a multibyte constant | 863 (save-restriction |
750 ;; version of the string in the loop, since it's always loaded as | 864 (narrow-to-region (point) (+ (point) length)) |
751 ;; unibyte from a byte-compiled file. | 865 ;; Can't do eval-when-compile to insert a multibyte constant |
752 (let ((range (string-as-multibyte "^\xe1-\xf7"))) | 866 ;; version of the string in the loop, since it's always loaded as |
753 (while (and (skip-chars-forward range) | 867 ;; unibyte from a byte-compiled file. |
754 (not (eobp))) | 868 (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7")) |
755 (forward-char (utf-8-compose))))) | 869 hash-table ch) |
756 ;; Fixme: Takahashi-san implies it may not work this easily. I | 870 (when utf-translate-cjk-mode |
757 ;; asked why but didn't get a reply. -- fx | 871 (if (not utf-translate-cjk-lang-env) |
758 (when (and utf-8-compose-scripts (> length 1)) | 872 ;; Check these characters: |
759 ;; These currently have definitions which cover the relevant | 873 ;; "U+2e80-U+33ff", "U+ff00-U+ffef" |
760 ;; unicodes. We could avoid loading thai-util &c by checking | 874 ;; We may have to translate them to CJK charsets. |
761 ;; whether the region contains any characters with the appropriate | 875 (let ((range2 "$,29@(B-$,2G$,3r`(B-$,3u/(B")) |
762 ;; categories. There aren't yet Unicode-based rules for Tibetan. | 876 (skip-chars-forward (concat range range2)) |
763 (save-excursion (setq length (diacritic-post-read-conversion length))) | 877 (unless (eobp) |
764 (save-excursion (setq length (thai-post-read-conversion length))) | 878 (utf-translate-cjk-load-tables) |
765 (save-excursion (setq length (lao-post-read-conversion length))) | 879 (setq range (concat range range2))) |
766 (save-excursion (setq length (devanagari-post-read-conversion length))) | 880 (setq hash-table (get 'utf-subst-table-for-decode |
767 (save-excursion (setq length (malayalam-post-read-conversion length))) | 881 'translation-hash-table))))) |
768 (save-excursion (setq length (tamil-post-read-conversion length)))) | 882 (while (and (skip-chars-forward range) |
769 length) | 883 (not (eobp))) |
770 | 884 (setq ch (following-char)) |
771 ;; ucs-tables is preloaded | 885 (if (< ch 256) |
772 ;; (defun utf-8-pre-write-conversion (beg end) | 886 (utf-8-compose hash-table) |
773 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." | 887 (if (and hash-table |
774 ;; ;; Ensure translation-table is loaded. | 888 (setq ch (gethash (encode-char ch 'ucs) hash-table))) |
775 ;; (require 'ucs-tables) | 889 (progn |
776 ;; ;; Don't do this again. | 890 (insert ch) |
777 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) | 891 (delete-char 1)) |
778 ;; nil) | 892 (forward-char 1))))) |
893 | |
894 (when (and utf-8-compose-scripts (> length 1)) | |
895 ;; These currently have definitions which cover the relevant | |
896 ;; unicodes. We could avoid loading thai-util &c by checking | |
897 ;; whether the region contains any characters with the appropriate | |
898 ;; categories. There aren't yet Unicode-based rules for Tibetan. | |
899 (diacritic-compose-region (point-max) (point-min)) | |
900 (thai-compose-region (point-max) (point-min)) | |
901 (lao-compose-region (point-max) (point-min)) | |
902 (devanagari-compose-region (point-max) (point-min)) | |
903 (malayalam-compose-region (point-max) (point-min)) | |
904 (tamil-compose-region (point-max) (point-min))) | |
905 (- (point-max) (point-min))))) | |
906 | |
907 (defun utf-8-pre-write-conversion (beg end) | |
908 "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END. | |
909 This is used as a post-read-conversion of utf-8 coding system." | |
910 (if (and utf-translate-cjk-mode | |
911 (not utf-translate-cjk-lang-env) | |
912 (save-excursion | |
913 (goto-char beg) | |
914 (re-search-forward "\\cc\\|\\cj\\|\\ch" end t))) | |
915 (utf-translate-cjk-load-tables)) | |
916 nil) | |
779 | 917 |
780 (make-coding-system | 918 (make-coding-system |
781 'mule-utf-8 4 ?u | 919 'mule-utf-8 4 ?u |
782 "UTF-8 encoding for Emacs-supported Unicode characters. | 920 "UTF-8 encoding for Emacs-supported Unicode characters. |
783 It supports Unicode characters of these ranges: | 921 It supports Unicode characters of these ranges: |
795 On encoding (e.g. writing a file), Emacs characters not belonging to | 933 On encoding (e.g. writing a file), Emacs characters not belonging to |
796 any of the character sets listed above are encoded into the UTF-8 byte | 934 any of the character sets listed above are encoded into the UTF-8 byte |
797 sequence representing U+FFFD (REPLACEMENT CHARACTER)." | 935 sequence representing U+FFFD (REPLACEMENT CHARACTER)." |
798 | 936 |
799 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | 937 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) |
800 '((safe-charsets | 938 `((safe-charsets |
801 ascii | 939 ascii |
802 eight-bit-control | 940 eight-bit-control |
803 eight-bit-graphic | 941 eight-bit-graphic |
804 latin-iso8859-1 | 942 latin-iso8859-1 |
805 mule-unicode-0100-24ff | 943 mule-unicode-0100-24ff |
806 mule-unicode-2500-33ff | 944 mule-unicode-2500-33ff |
807 mule-unicode-e000-ffff) | 945 mule-unicode-e000-ffff |
946 ,@(if utf-translate-cjk-mode | |
947 utf-translate-cjk-charsets)) | |
808 (mime-charset . utf-8) | 948 (mime-charset . utf-8) |
809 (coding-category . coding-category-utf-8) | 949 (coding-category . coding-category-utf-8) |
810 (valid-codes (0 . 255)) | 950 (valid-codes (0 . 255)) |
811 ;; (pre-write-conversion . utf-8-pre-write-conversion) | 951 (pre-write-conversion . utf-8-pre-write-conversion) |
812 (post-read-conversion . utf-8-post-read-conversion) | 952 (post-read-conversion . utf-8-post-read-conversion) |
813 (translation-table-for-encode . utf-translation-table-for-encode) | 953 (translation-table-for-encode . utf-translation-table-for-encode) |
814 (dependency unify-8859-on-encoding-mode | 954 (dependency unify-8859-on-encoding-mode |
815 unify-8859-on-decoding-mode | 955 unify-8859-on-decoding-mode |
816 utf-fragment-on-decoding | 956 utf-fragment-on-decoding |