comparison lisp/international/utf-8.el @ 56037:81dbb510a1db

(utf-translate-cjk-charsets): New variable. (utf-translate-cjk-unicode-range): New variable. (utf-translate-cjk-load-tables): New function. (utf-lookup-subst-table-for-decode): New function. (utf-lookup-subst-table-for-encode): New function. (utf-translate-cjk-mode): Init-value changed to t. Don't load tables here. Update safe-charsets of utf-* coding systems. (ccl-mule-utf-untrans): New CCL. (ccl-decode-mule-utf-8): Call ccl-mule-utf-untrans. Use `repeat' at end of each branch. (ccl-mule-utf-8-encode-untrans): New CCL. (ccl-encode-mule-utf-8): Call ccl-mule-utf-8-encode-untrans. (ccl-untranslated-to-ucs): Handle 2-byte encoding. Set r1 to the length of encoding. Don't return r0. (utf-8-compose): New arg hash-table. Handle 2-byte encoding. (utf-8-post-read-conversion): Narrow to region properly. If utf-translate-cjk-mode is on, load tables if necessary. Call utf-8-compose with hash-table arg if necessary. Call XXX-compose-region instead of XXX-post-read-convesion. (utf-8-pre-write-conversion): New function. (mule-utf-8): Include CJK charsets in safe-charsets if utf-translate-cjk-mode is on. Add pre-write-conversion.
author Kenichi Handa <handa@m17n.org>
date Sat, 12 Jun 2004 02:10:37 +0000
parents 6e677a935fe9
children 4575a565f45d
comparison
equal deleted inserted replaced
56036:ff6f1f61fea4 56037:81dbb510a1db
188 (set-default s v)) 188 (set-default s v))
189 :version "21.4" 189 :version "21.4"
190 :type 'boolean 190 :type 'boolean
191 :group 'mule) 191 :group 'mule)
192 192
193 (define-minor-mode utf-translate-cjk-mode 193
194 "Whether the UTF based coding systems should decode/encode CJK characters. 194 (defconst utf-translate-cjk-charsets '(chinese-gb2312
195 Enabling this loads tables which allow the coding systems mule-utf-8, 195 chinese-big5-1 chinese-big5-2
196 mule-utf-16le and mule-utf-16be to encode characters in the charsets 196 japanese-jisx0208 japanese-jisx0212
197 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', 197 korean-ksc5601)
198 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to 198 "List of charsets supported by `utf-translate-cjk-mode'.")
199 decode the corresponding unicodes into such characters. 199
200 200 (defconst utf-translate-cjk-unicode-range
201 Where the charsets overlap, the one preferred for decoding is chosen 201 '((#x2e80 . #xd7a3)
202 according to the language environment in effect when this option is 202 (#xff00 . #xffef))
203 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for 203 "List of Unicode code ranges supported by `utf-translate-cjk-mode'.")
204 Chinese-Big5 and jisx for other environments. 204
205 205 ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
206 The tables are large (over 40000 entries), so this option is not the 206 (defsubst utf-translate-cjk-substitutable-p (code-point)
207 default. Also, installing them may be rather slow." 207 (let ((tail utf-translate-cjk-unicode-range)
208 :init-value nil 208 elt)
209 :version "21.4" 209 (while tail
210 :type 'boolean 210 (setq elt (car tail) tail (cdr tail))
211 :set-after '(current-language-environment) 211 (if (and (>= code-point (car elt)) (<= code-point (cdr elt)))
212 :group 'mule 212 (setq tail nil)
213 :global t 213 (setq elt nil)))
214 (if utf-translate-cjk-mode 214 elt))
215 ;; Fixme: Allow the use of the CJK charsets to be 215
216 ;; customized by reordering and possible omission. 216 (defvar utf-translate-cjk-lang-env nil
217 (progn 217 "Language environment in which tables for `utf-translate-cjk-mode' is loaded.
218 The value nil means that the tables are not yet loaded.")
219
220 (defun utf-translate-cjk-load-tables ()
221 "Load tables for `utf-translate-cjk-mode'."
222 ;; Fixme: Allow the use of the CJK charsets to be
223 ;; customized by reordering and possible omission.
224 (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
225 (if redefined
218 ;; Redefine them with realistic initial sizes and a 226 ;; Redefine them with realistic initial sizes and a
219 ;; smallish rehash size to avoid wasting significant 227 ;; smallish rehash size to avoid wasting significant
220 ;; space after they're built. 228 ;; space after they're built.
221 (setq ucs-mule-cjk-to-unicode 229 (setq ucs-mule-cjk-to-unicode
222 (make-hash-table :test 'eq :size 43000 :rehash-size 1000) 230 (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
223 ucs-unicode-to-mule-cjk 231 ucs-unicode-to-mule-cjk
224 (make-hash-table :test 'eq :size 21500 :rehash-size 1000)) 232 (make-hash-table :test 'eq :size 21500 :rehash-size 1000)))
225 ;; Load the files explicitly, to avoid having to keep 233
226 ;; around the large tables they contain (as well as the 234 ;; Load the files explicitly, to avoid having to keep
227 ;; ones which get built). 235 ;; around the large tables they contain (as well as the
228 (cond 236 ;; ones which get built).
229 ((string= "Korean" current-language-environment) 237 (cond ((string= "Korean" current-language-environment)
230 (load "subst-jis") 238 (load "subst-jis")
231 (load "subst-big5") 239 (load "subst-big5")
232 (load "subst-gb2312") 240 (load "subst-gb2312")
233 (load "subst-ksc")) 241 (load "subst-ksc"))
234 ((string= "Chinese-BIG5" current-language-environment) 242 ((string= "Chinese-BIG5" current-language-environment)
235 (load "subst-jis") 243 (load "subst-jis")
236 (load "subst-ksc") 244 (load "subst-ksc")
237 (load "subst-gb2312") 245 (load "subst-gb2312")
238 (load "subst-big5")) 246 (load "subst-big5"))
239 ((string= "Chinese-GB" current-language-environment) 247 ((string= "Chinese-GB" current-language-environment)
240 (load "subst-jis") 248 (load "subst-jis")
241 (load "subst-ksc") 249 (load "subst-ksc")
242 (load "subst-big5") 250 (load "subst-big5")
243 (load "subst-gb2312")) 251 (load "subst-gb2312"))
244 (t 252 (t
245 (load "subst-ksc") 253 (load "subst-ksc")
246 (load "subst-gb2312") 254 (load "subst-gb2312")
247 (load "subst-big5") 255 (load "subst-big5")
248 (load "subst-jis"))) ; jis covers as much as big5, gb2312 256 (load "subst-jis"))) ; jis covers as much as big5, gb2312
257
258 (when redefined
259 (define-translation-hash-table 'utf-subst-table-for-decode
260 ucs-unicode-to-mule-cjk)
261 (define-translation-hash-table 'utf-subst-table-for-encode
262 ucs-mule-cjk-to-unicode)
263 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
264 'translation-table)
265 1 ucs-mule-cjk-to-unicode))
266
267 (setq utf-translate-cjk-lang-env current-language-environment)))
268
269 (defun utf-lookup-subst-table-for-decode (code-point)
270 (if (and utf-translate-cjk-mode
271 (not utf-translate-cjk-lang-env)
272 (utf-translate-cjk-substitutable-p code-point))
273 (utf-translate-cjk-load-tables))
274 (gethash code-point
275 (get 'utf-subst-table-for-decode 'translation-hash-table)))
276
277
278 (defun utf-lookup-subst-table-for-encode (char)
279 (if (and utf-translate-cjk-mode
280 (not utf-translate-cjk-lang-env)
281 (memq (char-charset char) utf-translate-cjk-charsets))
282 (utf-translate-cjk-load-tables))
283 (gethash char
284 (get 'utf-subst-table-for-encode 'translation-hash-table)))
285
286 (define-minor-mode utf-translate-cjk-mode
287 "Whether the UTF based coding systems should decode/encode CJK characters.
288 Enabling this allows the coding systems mule-utf-8,
289 mule-utf-16le and mule-utf-16be to encode characters in the charsets
290 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
291 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
292 decode the corresponding unicodes into such characters.
293
294 Where the charsets overlap, the one preferred for decoding is chosen
295 according to the language environment in effect when this option is
296 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
297 Chinese-Big5 and jisx for other environments.
298
299 This option is on by default. If you are not interested in CJK
300 characters and want to avoid some overhead on encoding/decoding
301 by the above coding systems, you can customize this option to nil."
302 :init-value t
303 :version "21.4"
304 :type 'boolean
305 :group 'mule
306 :global t
307 (if utf-translate-cjk-mode
308 (progn
249 (define-translation-hash-table 'utf-subst-table-for-decode 309 (define-translation-hash-table 'utf-subst-table-for-decode
250 ucs-unicode-to-mule-cjk) 310 ucs-unicode-to-mule-cjk)
251 (define-translation-hash-table 'utf-subst-table-for-encode 311 (define-translation-hash-table 'utf-subst-table-for-encode
252 ucs-mule-cjk-to-unicode) 312 ucs-mule-cjk-to-unicode)
253 (set-char-table-extra-slot (get 'utf-translation-table-for-encode 313 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
257 (make-hash-table :test 'eq)) 317 (make-hash-table :test 'eq))
258 (define-translation-hash-table 'utf-subst-table-for-encode 318 (define-translation-hash-table 'utf-subst-table-for-encode
259 (make-hash-table :test 'eq)) 319 (make-hash-table :test 'eq))
260 (set-char-table-extra-slot (get 'utf-translation-table-for-encode 320 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
261 'translation-table) 321 'translation-table)
262 1 nil))) 322 1 nil))
323
324 ;; Update safe-chars of mule-utf-* coding systems.
325 (dolist (elt (coding-system-list t))
326 (if (string-match "^mule-utf" (symbol-name elt))
327 (let ((safe-charsets (coding-system-get elt 'safe-charsets))
328 (safe-chars (coding-system-get elt 'safe-chars))
329 (need-update nil))
330 (dolist (charset utf-translate-cjk-charsets)
331 (unless (eq utf-translate-cjk-mode (memq charset safe-charsets))
332 (setq safe-charsets
333 (if utf-translate-cjk-mode
334 (cons charset safe-charsets)
335 (delq charset safe-charsets))
336 need-update t)
337 (aset safe-chars (make-char charset) utf-translate-cjk-mode)))
338 (when need-update
339 (coding-system-put elt 'safe-charsets safe-charsets)
340 (define-coding-system-internal elt))))))
341
342 (define-ccl-program ccl-mule-utf-untrans
343 ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or
344 ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF). Write
345 ;; eight-bit-control/graphic sequence (2 to 4 chars) representing
346 ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified.
347 ;;
348 ;; This is a subrountine because we assume that this is called very
349 ;; rarely (so we don't have to worry about the overhead of the
350 ;; call).
351 `(0
352 ((r5 = ,(charset-id 'eight-bit-control))
353 (r6 = ,(charset-id 'eight-bit-graphic))
354 (if (r0 < #x100)
355 ((r4 = ((r0 >> 6) | #xC0))
356 (write-multibyte-character r6 r4))
357 ((if (r0 < #x10000)
358 ((r4 = ((r0 >> 12) | #xE0))
359 (write-multibyte-character r6 r4))
360 ((r4 = ((r0 >> 18) | #xF0))
361 (write-multibyte-character r6 r4)
362 (r4 = (((r0 >> 12) & #x3F) | #x80))
363 (if (r4 < #xA0)
364 (write-multibyte-character r5 r4)
365 (write-multibyte-character r6 r4))))
366 (r4 = (((r0 >> 6) & #x3F) | #x80))
367 (if (r4 < #xA0)
368 (write-multibyte-character r5 r4)
369 (write-multibyte-character r6 r4))))
370 (r4 = ((r0 & #x3F) | #x80))
371 (if (r4 < #xA0)
372 (write-multibyte-character r5 r4)
373 (write-multibyte-character r6 r4)))))
263 374
264 (define-ccl-program ccl-decode-mule-utf-8 375 (define-ccl-program ccl-decode-mule-utf-8
265 ;; 376 ;;
266 ;; charset | bytes in utf-8 | bytes in emacs 377 ;; charset | bytes in utf-8 | bytes in emacs
267 ;; -----------------------+----------------+--------------- 378 ;; -----------------------+----------------+---------------
276 ;; -----------------------+----------------+--------------- 387 ;; -----------------------+----------------+---------------
277 ;; mule-unicode-0100-24ff | 3 | 4 388 ;; mule-unicode-0100-24ff | 3 | 4
278 ;; (>= 8000) | | 389 ;; (>= 8000) | |
279 ;; mule-unicode-2500-33ff | 3 | 4 390 ;; mule-unicode-2500-33ff | 3 | 4
280 ;; mule-unicode-e000-ffff | 3 | 4 391 ;; mule-unicode-e000-ffff | 3 | 4
392 ;; -----------------------+----------------+---------------
393 ;; invalid byte | 1 | 2
281 ;; 394 ;;
282 ;; Thus magnification factor is two. 395 ;; Thus magnification factor is two.
283 ;; 396 ;;
284 `(2 397 `(2
285 ((r5 = ,(charset-id 'eight-bit-control)) 398 ((r0 = -1)
286 (r6 = ,(charset-id 'eight-bit-graphic))
287 (loop 399 (loop
400 (if (r0 < 0)
401 (read r0))
402 (if (r0 < #x80)
403 ;; 1-byte encoding, i.e., ascii
404 ((write r0)
405 (r0 = -1)
406 (repeat)))
407 (if (r0 < #xc0) ; continuation byte (invalid here)
408 ((call ccl-mule-utf-untrans)
409 (r0 = -1)
410 (repeat)))
411
412 ;; Read the 2nd byte.
413 (r1 = -1)
414 (read r1)
415 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
416 ((call ccl-mule-utf-untrans)
417 ;; Handle it in the next loop.
418 (r0 = r1)
419 (repeat)))
420
421 (if (r0 < #xe0)
422 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
423 ((r2 = ((r0 & #x1F) << 6))
424 (r2 |= (r1 & #x3F))
425 ;; Now r2 holds scalar value
426
427 (if (r2 < 128) ; `overlong sequence'
428 ((call ccl-mule-utf-untrans)
429 (r0 = r1)
430 (call ccl-mule-utf-untrans)
431 (r0 = -1)
432 (repeat)))
433
434 (r1 = r2)
435 (if (r1 < 160)
436 ;; eight-bit-control
437 (r0 = ,(charset-id 'eight-bit-control))
438 (if (r1 < 256)
439 ;; latin-iso8859-1
440 ((r0 = ,(charset-id 'latin-iso8859-1))
441 (r1 -= 128))
442 ;; mule-unicode-0100-24ff (< 0800)
443 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
444 (r1 -= #x0100)
445 (r2 = (((r1 / 96) + 32) << 7))
446 (r1 %= 96)
447 (r1 += (r2 + 32))
448 (translate-character
449 utf-translation-table-for-decode r0 r1))))
450 (write-multibyte-character r0 r1)
451 (r0 = -1)
452 (repeat)))
453
454 ;; Read the 3rd bytes.
455 (r2 = -1)
456 (read r2)
457 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
458 ((call ccl-mule-utf-untrans)
459 (r0 = r1)
460 (call ccl-mule-utf-untrans)
461 ;; Handle it in the next loop.
462 (r0 = r2)
463 (repeat)))
464
465 (if (r0 < #xF0)
466 ;; 3byte encoding
467 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
468 ((r3 = ((r0 & #xF) << 12))
469 (r3 |= ((r1 & #x3F) << 6))
470 (r3 |= (r2 & #x3F))
471
472 (if (r3 < #x800) ; `overlong sequence'
473 ((call ccl-mule-utf-untrans)
474 (r0 = r1)
475 (call ccl-mule-utf-untrans)
476 (r0 = r2)
477 (call ccl-mule-utf-untrans)
478 (r0 = -1)
479 (repeat)))
480
481 (if (r3 < #x2500)
482 ;; mule-unicode-0100-24ff (>= 0800)
483 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
484 (r3 -= #x0100)
485 (r3 //= 96)
486 (r1 = (r7 + 32))
487 (r1 += ((r3 + 32) << 7))
488 (translate-character
489 utf-translation-table-for-decode r0 r1)
490 (write-multibyte-character r0 r1)
491 (r0 = -1)
492 (repeat)))
493
494 (if (r3 < #x3400)
495 ;; mule-unicode-2500-33ff
496 ((r0 = r3) ; don't zap r3
497 (lookup-integer utf-subst-table-for-decode r0 r1)
498 (if (r7 == 0)
499 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
500 (r3 -= #x2500)
501 (r3 //= 96)
502 (r1 = (r7 + 32))
503 (r1 += ((r3 + 32) << 7))))
504 (write-multibyte-character r0 r1)
505 (r0 = -1)
506 (repeat)))
507
508 (if (r3 < #xE000)
509 ;; Try to convert to CJK chars, else
510 ;; keep them as eight-bit-{control|graphic}.
511 ((r0 = r3)
512 (lookup-integer utf-subst-table-for-decode r3 r1)
513 (if r7
514 ;; got a translation
515 (write-multibyte-character r3 r1)
516 (call ccl-mule-utf-untrans))
517 (r0 = -1)
518 (repeat)))
519
520 ;; mule-unicode-e000-ffff
521 ;; Fixme: fffe and ffff are invalid.
522 (r0 = r3) ; don't zap r3
523 (lookup-integer utf-subst-table-for-decode r0 r1)
524 (if (r7 == 0)
525 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
526 (r3 -= #xe000)
527 (r3 //= 96)
528 (r1 = (r7 + 32))
529 (r1 += ((r3 + 32) << 7))))
530 (write-multibyte-character r0 r1)
531 (r0 = -1)
532 (repeat)))
533
534 ;; Read the 4th bytes.
535 (r3 = -1)
536 (read r3)
537 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
538 ((call ccl-mule-utf-untrans)
539 (r0 = r1)
540 (call ccl-mule-utf-untrans)
541 ;; Handle it in the next loop.
542 (r0 = r3)
543 (repeat)))
544
545 (if (r3 < #xF8)
546 ;; 4-byte encoding:
547 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
548 ;; keep those bytes as eight-bit-{control|graphic}
549 ;; Fixme: allow lookup in utf-subst-table-for-decode.
550 ((r4 = ((r0 & #x7) << 18))
551 (r4 |= ((r1 & #x3F) << 12))
552 (r4 |= ((r2 & #x3F) << 6))
553 (r4 |= (r3 & #x3F))
554
555 (if (r4 < #x10000) ; `overlong sequence'
556 ((call ccl-mule-utf-untrans)
557 (r0 = r1)
558 (call ccl-mule-utf-untrans)
559 (r0 = r2)
560 (call ccl-mule-utf-untrans)
561 (r0 = r3)
562 (call ccl-mule-utf-untrans))
563 ((r0 = r4)
564 (call ccl-mule-utf-untrans)))
565 (r0 = -1)
566 (repeat)))
567
568 ;; Unsupported sequence.
569 (call ccl-mule-utf-untrans)
570 (r0 = r1)
571 (call ccl-mule-utf-untrans)
572 (r0 = r2)
573 (call ccl-mule-utf-untrans)
574 (r0 = r3)
575 (call ccl-mule-utf-untrans)
288 (r0 = -1) 576 (r0 = -1)
289 (read r0)
290
291 ;; 1byte encoding, i.e., ascii
292 (if (r0 < #x80)
293 ((write r0))
294 (if (r0 < #xc0) ; continuation byte (invalid here)
295 ((if (r0 < #xa0)
296 (write-multibyte-character r5 r0)
297 (write-multibyte-character r6 r0)))
298 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
299 (if (r0 < #xe0)
300 ((r1 = -1)
301 (read r1)
302
303 (if ((r1 & #b11000000) != #b10000000)
304 ;; Invalid 2-byte sequence
305 ((if (r0 < #xa0)
306 (write-multibyte-character r5 r0)
307 (write-multibyte-character r6 r0))
308 (if (r1 < #x80)
309 (write r1)
310 (if (r1 < #xa0)
311 (write-multibyte-character r5 r1)
312 (write-multibyte-character r6 r1))))
313
314 ((r3 = r0) ; save in case of overlong sequence
315 (r2 = r1)
316 (r0 &= #x1f)
317 (r0 <<= 6)
318 (r1 &= #x3f)
319 (r1 += r0)
320 ;; Now r1 holds scalar value
321
322 (if (r1 < 128) ; `overlong sequence'
323 ((if (r3 < #xa0)
324 (write-multibyte-character r5 r3)
325 (write-multibyte-character r6 r3))
326 (if (r2 < #x80)
327 (write r2)
328 (if (r2 < #xa0)
329 (write-multibyte-character r5 r2)
330 (write-multibyte-character r6 r2))))
331
332 ;; eight-bit-control
333 (if (r1 < 160)
334 ((write-multibyte-character r5 r1))
335
336 ;; latin-iso8859-1
337 (if (r1 < 256)
338 ((r0 = ,(charset-id 'latin-iso8859-1))
339 (r1 -= 128)
340 (write-multibyte-character r0 r1))
341
342 ;; mule-unicode-0100-24ff (< 0800)
343 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
344 (r1 -= #x0100)
345 (r2 = (((r1 / 96) + 32) << 7))
346 (r1 %= 96)
347 (r1 += (r2 + 32))
348 (translate-character
349 utf-translation-table-for-decode r0 r1)
350 (write-multibyte-character r0 r1))))))))
351
352 ;; 3byte encoding
353 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
354 (if (r0 < #xf0)
355 ((r1 = -1)
356 (r2 = -1)
357 (read r1 r2)
358
359 ;; This is set to 1 if the encoding is invalid.
360 (r4 = 0)
361
362 (r3 = (r1 & #b11000000))
363 (r3 |= ((r2 >> 2) & #b00110000))
364 (if (r3 != #b10100000)
365 (r4 = 1)
366 ((r3 = ((r0 & #x0f) << 12))
367 (r3 += ((r1 & #x3f) << 6))
368 (r3 += (r2 & #x3f))
369 (if (r3 < #x0800)
370 (r4 = 1))))
371
372 (if (r4 != 0)
373 ;; Invalid 3-byte sequence
374 ((if (r0 < #xa0)
375 (write-multibyte-character r5 r0)
376 (write-multibyte-character r6 r0))
377 (if (r1 < #x80)
378 (write r1)
379 (if (r1 < #xa0)
380 (write-multibyte-character r5 r1)
381 (write-multibyte-character r6 r1)))
382 (if (r2 < #x80)
383 (write r2)
384 (if (r2 < #xa0)
385 (write-multibyte-character r5 r2)
386 (write-multibyte-character r6 r2))))
387
388 ;; mule-unicode-0100-24ff (>= 0800)
389 ((if (r3 < #x2500)
390 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
391 (r3 -= #x0100)
392 (r3 //= 96)
393 (r1 = (r7 + 32))
394 (r1 += ((r3 + 32) << 7))
395 (translate-character
396 utf-translation-table-for-decode r0 r1)
397 (write-multibyte-character r0 r1))
398
399 ;; mule-unicode-2500-33ff
400 (if (r3 < #x3400)
401 ((r4 = r3) ; don't zap r3
402 (lookup-integer utf-subst-table-for-decode r4 r5)
403 (if r7
404 ;; got a translation
405 ((write-multibyte-character r4 r5)
406 ;; Zapped through register starvation.
407 (r5 = ,(charset-id 'eight-bit-control)))
408 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
409 (r3 -= #x2500)
410 (r3 //= 96)
411 (r1 = (r7 + 32))
412 (r1 += ((r3 + 32) << 7))
413 (write-multibyte-character r0 r1))))
414
415 ;; U+3400 .. U+D7FF
416 ;; Try to convert to CJK chars, else keep
417 ;; them as eight-bit-{control|graphic}.
418 (if (r3 < #xd800)
419 ((r4 = r3) ; don't zap r3
420 (lookup-integer utf-subst-table-for-decode r4 r5)
421 (if r7
422 ;; got a translation
423 ((write-multibyte-character r4 r5)
424 ;; Zapped through register starvation.
425 (r5 = ,(charset-id 'eight-bit-control)))
426 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
427 ((r3 = r6)
428 (write-multibyte-character r3 r0)
429 (if (r1 < #xa0)
430 (r3 = r5))
431 (write-multibyte-character r3 r1)
432 (if (r2 < #xa0)
433 (r3 = r5)
434 (r3 = r6))
435 (write-multibyte-character r3 r2))))
436
437 ;; Surrogates, U+D800 .. U+DFFF
438 (if (r3 < #xe000)
439 ((r3 = r6)
440 (write-multibyte-character r3 r0) ; eight-bit-graphic
441 (if (r1 < #xa0)
442 (r3 = r5))
443 (write-multibyte-character r3 r1)
444 (if (r2 < #xa0)
445 (r3 = r5)
446 (r3 = r6))
447 (write-multibyte-character r3 r2))
448
449 ;; mule-unicode-e000-ffff
450 ;; Fixme: fffe and ffff are invalid.
451 ((r4 = r3) ; don't zap r3
452 (lookup-integer utf-subst-table-for-decode r4 r5)
453 (if r7
454 ;; got a translation
455 ((write-multibyte-character r4 r5)
456 ;; Zapped through register starvation.
457 (r5 = ,(charset-id 'eight-bit-control)))
458 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
459 (r3 -= #xe000)
460 (r3 //= 96)
461 (r1 = (r7 + 32))
462 (r1 += ((r3 + 32) << 7))
463 (write-multibyte-character r0 r1)))))))))))
464
465 (if (r0 < #xfe)
466 ;; 4byte encoding
467 ;; keep those bytes as eight-bit-{control|graphic}
468 ;; Fixme: allow lookup in utf-subst-table-for-decode.
469 ((r1 = -1)
470 (r2 = -1)
471 (r3 = -1)
472 (read r1 r2 r3)
473 ;; r0 > #xf0, thus eight-bit-graphic
474 (write-multibyte-character r6 r0)
475 (if (r1 < #xa0)
476 (if (r1 < #x80) ; invalid byte
477 (write r1)
478 (write-multibyte-character r5 r1))
479 (write-multibyte-character r6 r1))
480 (if (r2 < #xa0)
481 (if (r2 < #x80) ; invalid byte
482 (write r2)
483 (write-multibyte-character r5 r2))
484 (write-multibyte-character r6 r2))
485 (if (r3 < #xa0)
486 (if (r3 < #x80) ; invalid byte
487 (write r3)
488 (write-multibyte-character r5 r3))
489 (write-multibyte-character r6 r3))
490 (if (r0 >= #xf8) ; 5- or 6-byte encoding
491 ((r0 = -1)
492 (read r0)
493 (if (r0 < #xa0)
494 (if (r0 < #x80) ; invalid byte
495 (write r0)
496 (write-multibyte-character r5 r0))
497 (write-multibyte-character r6 r0))
498 (if (r0 >= #xfc) ; 6-byte
499 ((r0 = -1)
500 (read r0)
501 (if (r0 < #xa0)
502 (if (r0 < #x80) ; invalid byte
503 (write r0)
504 (write-multibyte-character r5 r0))
505 (write-multibyte-character r6 r0)))))))
506 ;; else invalid byte >= #xfe
507 (write-multibyte-character r6 r0))))))
508 (repeat))) 577 (repeat)))
509 578
510 ;; At EOF... 579 ;; At EOF...
511 (if (r0 >= 0) 580 (if (r0 >= 0)
512 ((if (r0 < #x80) 581 ;; r0 >= #x80
513 (write r0) 582 ((call ccl-mule-utf-untrans)
514 (if (r0 < #xa0)
515 (write-multibyte-character r5 r0)
516 ((write-multibyte-character r6 r0))))
517 (if (r1 >= 0) 583 (if (r1 >= 0)
518 ((if (r1 < #x80) 584 ((r0 = r1)
519 (write r1) 585 (call ccl-mule-utf-untrans)
520 (if (r1 < #xa0)
521 (write-multibyte-character r5 r1)
522 ((write-multibyte-character r6 r1))))
523 (if (r2 >= 0) 586 (if (r2 >= 0)
524 ((if (r2 < #x80) 587 ((r0 = r2)
525 (write r2) 588 (call ccl-mule-utf-untrans)
526 (if (r2 < #xa0)
527 (write-multibyte-character r5 r2)
528 ((write-multibyte-character r6 r2))))
529 (if (r3 >= 0) 589 (if (r3 >= 0)
530 (if (r3 < #x80) 590 ((r0 = r3)
531 (write r3) 591 (call ccl-mule-utf-untrans))))))))))
532 (if (r3 < #xa0)
533 (write-multibyte-character r5 r3)
534 ((write-multibyte-character r6 r3))))))))))))
535 592
536 "CCL program to decode UTF-8. 593 "CCL program to decode UTF-8.
537 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 594 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
538 mule-unicode-*, but see also `utf-fragmentation-table' and 595 mule-unicode-*, but see also `utf-fragmentation-table' and
539 `ucs-mule-cjk-to-unicode'. 596 `ucs-mule-cjk-to-unicode'.
540 Encodings of un-representable Unicode characters are decoded asis into 597 Encodings of un-representable Unicode characters are decoded asis into
541 eight-bit-control and eight-bit-graphic characters.") 598 eight-bit-control and eight-bit-graphic characters.")
542 599
600 (define-ccl-program ccl-mule-utf-8-encode-untrans
601 ;; UTF-8 decoder generates an UTF-8 sequence represented by a
602 ;; sequence eight-bit-control/graphic chars for an untranslatable
603 ;; character and an invalid byte.
604 ;;
605 ;; This CCL parses that sequence (the first byte is already in r1),
606 ;; writes out the original bytes of that sequence, and sets r5 to
607 ;; -1.
608 ;;
609 ;; If the eight-bit-control/graphic sequence is shorter than what r1
610 ;; suggests, it sets r5 and r6 to the last character read that
611 ;; should be handled by the next loop of a caller.
612 ;;
613 ;; Note: For UTF-8 validation, we only check if a character is
614 ;; eight-bit-control/graphic or not. It may result in incorrect
615 ;; handling of random binary data, but such a data can't be encoded
616 ;; by UTF-8 anyway. At least, UTF-8 decoders doesn't generate such
617 ;; a sequence even if a source contains invalid byte-sequence.
618 `(0
619 (;; Read the 2nd byte.
620 (read-multibyte-character r5 r6)
621 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
622 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
623 ((write r1) ; invalid UTF-8
624 (r1 = -1)
625 (end)))
626
627 (if (r1 <= #xC3)
628 ;; 2-byte sequence for an originally invalid byte.
629 ((r6 &= #x3F)
630 (r6 |= ((r1 & #x1F) << 6))
631 (write r6)
632 (r5 = -1)
633 (end)))
634
635 (write r1 r6)
636 (r2 = r1)
637 (r1 = -1)
638 ;; Read the 3rd byte.
639 (read-multibyte-character r5 r6)
640 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
641 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
642 (end)) ; invalid UTF-8
643 (write r6)
644 (if (r2 < #xF0)
645 ;; 3-byte sequence for an untranslated character.
646 ((r5 = -1)
647 (end)))
648 ;; Read the 4th byte.
649 (read-multibyte-character r5 r6)
650 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
651 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
652 (end)) ; invalid UTF-8
653 ;; 4-byte sequence for an untranslated character.
654 (write r6)
655 (r5 = -1)
656 (end))
657
658 ;; At EOF...
659 ((r5 = -1)
660 (if (r1 >= 0)
661 (write r1)))))
662
543 (define-ccl-program ccl-encode-mule-utf-8 663 (define-ccl-program ccl-encode-mule-utf-8
544 `(1 664 `(1
545 ((r5 = -1) 665 ((r5 = -1)
546 (loop 666 (loop
547 (if (r5 < 0) 667 (if (r5 < 0)
548 ((r1 = -1) 668 (read-multibyte-character r0 r1)
549 (read-multibyte-character r0 r1) 669 ;; Pre-read character is in r5 (charset-ID) and r6 (code-point).
550 (translate-character utf-translation-table-for-encode r0 r1)) 670 ((r0 = r5)
551 (;; We have already done read-multibyte-character.
552 (r0 = r5)
553 (r1 = r6) 671 (r1 = r6)
554 (r5 = -1))) 672 (r5 = -1)))
673 (translate-character utf-translation-table-for-encode r0 r1)
555 674
556 (if (r0 == ,(charset-id 'ascii)) 675 (if (r0 == ,(charset-id 'ascii))
557 (write r1) 676 (write-repeat r1))
558 677
559 (if (r0 == ,(charset-id 'latin-iso8859-1)) 678 (if (r0 == ,(charset-id 'latin-iso8859-1))
560 ;; r1 scalar utf-8 679 ;; r1 scalar utf-8
561 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx 680 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
562 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 681 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
563 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 682 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
564 ((r0 = (((r1 & #x40) >> 6) | #xc2)) 683 ((r0 = (((r1 & #x40) >> 6) | #xc2))
565 (r1 &= #x3f) 684 (r1 &= #x3f)
685 (r1 |= #x80)
686 (write r0)
687 (write-repeat r1)))
688
689 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
690 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
691 ;; #x3f80 == (0011 1111 1000 0000)b
692 (r1 &= #x7f)
693 (r1 += (r0 + 224)) ; 240 == -32 + #x0100
694 ;; now r1 holds scalar value
695 (if (r1 < #x0800)
696 ;; 2byte encoding
697 ((write ((r1 >> 6) | #xC0))
698 (r1 &= #x3F)
699 (r1 |= #x80)
700 (write-repeat r1))
701 ;; 3byte encoding
702 ((write ((r1 >> 12) | #xE0))
703 (write (((r1 & #x0FC0) >> 6) | #x80))
704 (r1 &= #x3F)
705 (r1 |= #x80)
706 (write-repeat r1)))))
707
708 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
709 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
710 (r1 &= #x7f)
711 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
712 ;; now r1 holds scalar value
713 (write ((r1 >> 12) | #xE0))
714 (write (((r1 & #x0FC0) >> 6) | #x80))
715 (r1 &= #x3F)
716 (r1 |= #x80)
717 (write-repeat r1)))
718
719 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
720 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
721 (r1 &= #x7f)
722 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
723 ;; now r1 holds scalar value
724 (write ((r1 >> 12) | #xE0))
725 (write (((r1 & #x0FC0) >> 6) | #x80))
726 (r1 &= #x3F)
727 (r1 |= #x80)
728 (write-repeat r1)))
729
730 (if (r0 == ,(charset-id 'eight-bit-control))
731 ;; r1 scalar utf-8
732 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
733 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
734 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
735 ((write #xC2)
736 (write-repeat r1)))
737
738 (if (r0 == ,(charset-id 'eight-bit-graphic))
739 ;; r1 scalar utf-8
740 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
741 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
742 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
743 ((r0 = (r1 >= #xC0))
744 (r0 &= (r1 <= #xC3))
745 (r4 = (r1 >= #xE1))
746 (r4 &= (r1 <= #xF7))
747 (r0 |= r4)
748 (if r0
749 ((call ccl-mule-utf-8-encode-untrans)
750 (repeat))
751 (write-repeat r1))))
752
753 (lookup-character utf-subst-table-for-encode r0 r1)
754 (if r7 ; lookup succeeded
755 (if (r0 < #x800)
756 ;; 2byte encoding
757 ((write ((r0 >> 6) | #xC0))
758 (r1 &= #x3F)
759 (r1 |= #x80)
760 (write-repeat r1))
761 ;; 3byte encoding
762 ((write ((r0 >> 12) | #xE0))
763 (write (((r0 & #x0FC0) >> 6) | #x80))
764 (r1 &= #x3F)
566 (r1 |= #x80) 765 (r1 |= #x80)
567 (write r0 r1)) 766 (write-repeat r1))))
568 767
569 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) 768 ;; Unsupported character.
570 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) 769 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
571 ;; #x3f80 == (0011 1111 1000 0000)b 770 (write #xef)
572 (r1 &= #x7f) 771 (write #xbf)
573 (r1 += (r0 + 224)) ; 240 == -32 + #x0100 772 (write-repeat #xbd))))
574 ;; now r1 holds scalar value
575 (if (r1 < #x0800)
576 ;; 2byte encoding
577 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
578 ;; #x07c0 == (0000 0111 1100 0000)b
579 (r1 &= #x3f)
580 (r1 |= #x80)
581 (write r0 r1))
582 ;; 3byte encoding
583 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
584 (r2 = ((r1 & #x3f) | #x80))
585 (r1 &= #x0fc0)
586 (r1 >>= 6)
587 (r1 |= #x80)
588 (write r0 r1 r2))))
589
590 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
591 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
592 (r1 &= #x7f)
593 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
594 (r0 = (((r1 & #xf000) >> 12) | #xe0))
595 (r2 = ((r1 & #x3f) | #x80))
596 (r1 &= #x0fc0)
597 (r1 >>= 6)
598 (r1 |= #x80)
599 (write r0 r1 r2))
600
601 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
602 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
603 (r1 &= #x7f)
604 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
605 (r0 = (((r1 & #xf000) >> 12) | #xe0))
606 (r2 = ((r1 & #x3f) | #x80))
607 (r1 &= #x0fc0)
608 (r1 >>= 6)
609 (r1 |= #x80)
610 (write r0 r1 r2))
611
612 (if (r0 == ,(charset-id 'eight-bit-control))
613 ;; r1 scalar utf-8
614 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
615 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
616 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
617 ((write #xc2)
618 (write r1))
619
620 (if (r0 == ,(charset-id 'eight-bit-graphic))
621 ;; r1 scalar utf-8
622 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
623 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
624 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
625 ((write r1)
626 (r1 = -1)
627 (read-multibyte-character r0 r1)
628 (if (r0 != ,(charset-id 'eight-bit-graphic))
629 (if (r0 != ,(charset-id 'eight-bit-control))
630 ((r5 = r0)
631 (r6 = r1))))
632 (if (r5 < 0)
633 ((read-multibyte-character r0 r2)
634 (if (r0 != ,(charset-id 'eight-bit-graphic))
635 (if (r0 != ,(charset-id 'eight-bit-control))
636 ((r5 = r0)
637 (r6 = r2))))
638 (if (r5 < 0)
639 (write r1 r2)
640 (if (r1 < #xa0)
641 (write r1)
642 ((write #xc2)
643 (write r1)))))))
644
645 ((lookup-character utf-subst-table-for-encode r0 r1)
646 (if r7 ; lookup succeeded
647 ((r1 = (((r0 & #xf000) >> 12) | #xe0))
648 (r2 = ((r0 & #x3f) | #x80))
649 (r0 &= #x0fc0)
650 (r0 >>= 6)
651 (r0 |= #x80)
652 (write r1 r0 r2))
653 ;; Unsupported character.
654 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
655 ((write #xef)
656 (write #xbf)
657 (write #xbd)))))))))))
658 (repeat)))
659 (if (r1 >= #xa0)
660 (write r1)
661 (if (r1 >= #x80)
662 ((write #xc2)
663 (write r1)))))
664
665 "CCL program to encode into UTF-8.") 773 "CCL program to encode into UTF-8.")
666 774
667 775
668 (define-ccl-program ccl-untranslated-to-ucs 776 (define-ccl-program ccl-untranslated-to-ucs
669 `(0 777 `(0
670 (if (r0 < #xf0) ; 3-byte encoding, as above 778 (if (r1 == 0)
671 ((r4 = 0) 779 nil
672 (r3 = (r1 & #b11000000)) 780 (if (r0 <= #xC3) ; 2-byte encoding
673 (r3 |= ((r2 >> 2) & #b00110000)) 781 ((r0 = ((r0 & #x3) << 6))
674 (if (r3 != #b10100000) 782 (r0 |= (r1 & #x3F))
675 (r4 = 1) 783 (r1 = 2))
676 ((r3 = ((r0 & #x0f) << 12)) 784 (if (r2 == 0)
677 (r3 += ((r1 & #x3f) << 6)) 785 (r1 = 0)
678 (r3 += (r2 & #x3f)) 786 (if (r0 < #xF0) ; 3-byte encoding, as above
679 (if (r3 < #x0800) 787 ((r0 = ((r0 & #xF) << 12))
680 (r4 = 1)))) 788 (r0 |= ((r1 & #x3F) << 6))
681 (if (r4 != 0) 789 (r0 |= (r1 & #x3F))
682 (r0 = 0) 790 (r1 = 3))
683 (r0 = r3))) 791 (if (r3 == 0)
684 (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe) 792 (r1 = 0)
685 ((r4 = (r1 >> 6)) 793 ((r0 = ((r0 & #x7) << 18))
686 (if (r4 != #b10) 794 (r0 |= ((r1 & #x3F) << 12))
687 (r0 = 0) 795 (r0 |= ((r2 & #x3F) << 6))
688 ((r4 = (r2 >> 6)) 796 (r0 |= (r3 & #x3F))
689 (if (r4 != #b10) 797 (r1 = 4))))))))
690 (r0 = 0) 798 "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
691 ((r4 = (r3 >> 6)) 799 Set r1 to the byte length. r0 == 0 for invalid sequence.")
692 (if (r4 != #b10)
693 (r0 = 0)
694 ((r1 = ((r1 & #x3F) << 12))
695 (r2 = ((r2 & #x3F) << 6))
696 (r3 &= #x3F)
697 (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
698 (r0 = 0))))
699 "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
700 r0 == 0 for invalid sequence.")
701 800
702 (defvar utf-8-ccl-regs (make-vector 8 0)) 801 (defvar utf-8-ccl-regs (make-vector 8 0))
703 802
704 (defsubst utf-8-untranslated-to-ucs () 803 (defsubst utf-8-untranslated-to-ucs ()
705 "Return the UCS code for an untranslated sequence of raw bytes t point. 804 "Return the UCS code for an untranslated sequence of raw bytes t point.
706 Only for 3- or 4-byte sequences." 805 Only for 3- or 4-byte sequences."
707 (aset utf-8-ccl-regs 0 (or (char-after) 0)) 806 (aset utf-8-ccl-regs 0 (or (char-after) 0))
708 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) 807 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
709 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) 808 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
710 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) 809 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
711 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs) 810 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
712 (aref utf-8-ccl-regs 0))
713 811
714 (defun utf-8-help-echo (window object position) 812 (defun utf-8-help-echo (window object position)
715 (format "Untranslated Unicode U+%04X" 813 (format "Untranslated Unicode U+%04X"
716 (get-char-property position 'untranslated-utf-8 object))) 814 (get-char-property position 'untranslated-utf-8 object)))
717 815
718 ;; We compose the untranslatable sequences into a single character. 816 ;; We compose the untranslatable sequences into a single character,
817 ;; and move point to the next character.
719 ;; This is infelicitous for editing, because there's currently no 818 ;; This is infelicitous for editing, because there's currently no
720 ;; mechanism for treating compositions as atomic, but is OK for 819 ;; mechanism for treating compositions as atomic, but is OK for
721 ;; display. They are composed to U+FFFD with help-echo which 820 ;; display. They are composed to U+FFFD with help-echo which
722 ;; indicates the unicodes they represent. This function GCs too much. 821 ;; indicates the unicodes they represent. This function GCs too much.
723 (defsubst utf-8-compose () 822
724 "Put a suitable composition on an untranslatable sequence. 823 ;; If utf-translate-cjk-mode is non-nil, this function is called with
725 Return the sequence's length." 824 ;; HASH-TABLE which translates CJK characters into some of CJK
726 (let* ((u (utf-8-untranslated-to-ucs)) 825 ;; charsets.
727 (l (unless (zerop u) 826
728 (if (>= u #x10000) 827 (defsubst utf-8-compose (hash-table)
729 4 828 "Put a suitable composition on an untranslatable sequence at point.
730 3)))) 829 If HASH-TABLE is non-nil, try to translate CJK characters by it at first.
731 (when l 830 Move point to the end of the sequence."
732 (put-text-property (point) (min (point-max) (+ l (point))) 831 (utf-8-untranslated-to-ucs)
733 'untranslated-utf-8 u) 832 (let ((l (aref utf-8-ccl-regs 1))
734 (put-text-property (point) (min (point-max) (+ l (point))) 833 ch)
735 'help-echo 'utf-8-help-echo) 834 (if (> l 0)
736 (compose-region (point) (+ l (point)) ?$,3u=(B) 835 (if (and hash-table
737 l))) 836 (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table)))
837 (progn
838 (insert ch)
839 (delete-region (point) (min (point-max) (+ l (point)))))
840 (setq ch (aref utf-8-ccl-regs 0))
841 (put-text-property (point) (min (point-max) (+ l (point)))
842 'untranslated-utf-8 ch)
843 (put-text-property (point) (min (point-max) (+ l (point)))
844 'help-echo 'utf-8-help-echo)
845 (if (= l 2)
846 (put-text-property (point) (min (point-max) (+ l (point)))
847 'display (format "\\%03o" ch))
848 (compose-region (point) (+ l (point)) ?$,3u=(B))
849 (forward-char l))
850 (forward-char 1))))
738 851
739 (defcustom utf-8-compose-scripts nil 852 (defcustom utf-8-compose-scripts nil
740 "*Non-nil means compose various scripts on decoding utf-8 text." 853 "*Non-nil means compose various scripts on decoding utf-8 text."
741 :group 'mule 854 :group 'mule
742 :version "21.4" 855 :version "21.4"
743 :type 'boolean) 856 :type 'boolean)
744 857
745 (defun utf-8-post-read-conversion (length) 858 (defun utf-8-post-read-conversion (length)
746 "Compose untranslated utf-8 sequences into single characters. 859 "Compose untranslated utf-8 sequences into single characters.
860 If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
747 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." 861 Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
748 (save-excursion 862 (save-excursion
749 ;; Can't do eval-when-compile to insert a multibyte constant 863 (save-restriction
750 ;; version of the string in the loop, since it's always loaded as 864 (narrow-to-region (point) (+ (point) length))
751 ;; unibyte from a byte-compiled file. 865 ;; Can't do eval-when-compile to insert a multibyte constant
752 (let ((range (string-as-multibyte "^\xe1-\xf7"))) 866 ;; version of the string in the loop, since it's always loaded as
753 (while (and (skip-chars-forward range) 867 ;; unibyte from a byte-compiled file.
754 (not (eobp))) 868 (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
755 (forward-char (utf-8-compose))))) 869 hash-table ch)
756 ;; Fixme: Takahashi-san implies it may not work this easily. I 870 (when utf-translate-cjk-mode
757 ;; asked why but didn't get a reply. -- fx 871 (if (not utf-translate-cjk-lang-env)
758 (when (and utf-8-compose-scripts (> length 1)) 872 ;; Check these characters:
759 ;; These currently have definitions which cover the relevant 873 ;; "U+2e80-U+33ff", "U+ff00-U+ffef"
760 ;; unicodes. We could avoid loading thai-util &c by checking 874 ;; We may have to translate them to CJK charsets.
761 ;; whether the region contains any characters with the appropriate 875 (let ((range2 "$,29@(B-$,2G$,3r`(B-$,3u/(B"))
762 ;; categories. There aren't yet Unicode-based rules for Tibetan. 876 (skip-chars-forward (concat range range2))
763 (save-excursion (setq length (diacritic-post-read-conversion length))) 877 (unless (eobp)
764 (save-excursion (setq length (thai-post-read-conversion length))) 878 (utf-translate-cjk-load-tables)
765 (save-excursion (setq length (lao-post-read-conversion length))) 879 (setq range (concat range range2)))
766 (save-excursion (setq length (devanagari-post-read-conversion length))) 880 (setq hash-table (get 'utf-subst-table-for-decode
767 (save-excursion (setq length (malayalam-post-read-conversion length))) 881 'translation-hash-table)))))
768 (save-excursion (setq length (tamil-post-read-conversion length)))) 882 (while (and (skip-chars-forward range)
769 length) 883 (not (eobp)))
770 884 (setq ch (following-char))
771 ;; ucs-tables is preloaded 885 (if (< ch 256)
772 ;; (defun utf-8-pre-write-conversion (beg end) 886 (utf-8-compose hash-table)
773 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." 887 (if (and hash-table
774 ;; ;; Ensure translation-table is loaded. 888 (setq ch (gethash (encode-char ch 'ucs) hash-table)))
775 ;; (require 'ucs-tables) 889 (progn
776 ;; ;; Don't do this again. 890 (insert ch)
777 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) 891 (delete-char 1))
778 ;; nil) 892 (forward-char 1)))))
893
894 (when (and utf-8-compose-scripts (> length 1))
895 ;; These currently have definitions which cover the relevant
896 ;; unicodes. We could avoid loading thai-util &c by checking
897 ;; whether the region contains any characters with the appropriate
898 ;; categories. There aren't yet Unicode-based rules for Tibetan.
899 (diacritic-compose-region (point-max) (point-min))
900 (thai-compose-region (point-max) (point-min))
901 (lao-compose-region (point-max) (point-min))
902 (devanagari-compose-region (point-max) (point-min))
903 (malayalam-compose-region (point-max) (point-min))
904 (tamil-compose-region (point-max) (point-min)))
905 (- (point-max) (point-min)))))
906
907 (defun utf-8-pre-write-conversion (beg end)
908 "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
909 This is used as a post-read-conversion of utf-8 coding system."
910 (if (and utf-translate-cjk-mode
911 (not utf-translate-cjk-lang-env)
912 (save-excursion
913 (goto-char beg)
914 (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))
915 (utf-translate-cjk-load-tables))
916 nil)
779 917
780 (make-coding-system 918 (make-coding-system
781 'mule-utf-8 4 ?u 919 'mule-utf-8 4 ?u
782 "UTF-8 encoding for Emacs-supported Unicode characters. 920 "UTF-8 encoding for Emacs-supported Unicode characters.
783 It supports Unicode characters of these ranges: 921 It supports Unicode characters of these ranges:
795 On encoding (e.g. writing a file), Emacs characters not belonging to 933 On encoding (e.g. writing a file), Emacs characters not belonging to
796 any of the character sets listed above are encoded into the UTF-8 byte 934 any of the character sets listed above are encoded into the UTF-8 byte
797 sequence representing U+FFFD (REPLACEMENT CHARACTER)." 935 sequence representing U+FFFD (REPLACEMENT CHARACTER)."
798 936
799 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) 937 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
800 '((safe-charsets 938 `((safe-charsets
801 ascii 939 ascii
802 eight-bit-control 940 eight-bit-control
803 eight-bit-graphic 941 eight-bit-graphic
804 latin-iso8859-1 942 latin-iso8859-1
805 mule-unicode-0100-24ff 943 mule-unicode-0100-24ff
806 mule-unicode-2500-33ff 944 mule-unicode-2500-33ff
807 mule-unicode-e000-ffff) 945 mule-unicode-e000-ffff
946 ,@(if utf-translate-cjk-mode
947 utf-translate-cjk-charsets))
808 (mime-charset . utf-8) 948 (mime-charset . utf-8)
809 (coding-category . coding-category-utf-8) 949 (coding-category . coding-category-utf-8)
810 (valid-codes (0 . 255)) 950 (valid-codes (0 . 255))
811 ;; (pre-write-conversion . utf-8-pre-write-conversion) 951 (pre-write-conversion . utf-8-pre-write-conversion)
812 (post-read-conversion . utf-8-post-read-conversion) 952 (post-read-conversion . utf-8-post-read-conversion)
813 (translation-table-for-encode . utf-translation-table-for-encode) 953 (translation-table-for-encode . utf-translation-table-for-encode)
814 (dependency unify-8859-on-encoding-mode 954 (dependency unify-8859-on-encoding-mode
815 unify-8859-on-decoding-mode 955 unify-8859-on-decoding-mode
816 utf-fragment-on-decoding 956 utf-fragment-on-decoding