# HG changeset patch # User Kenichi Handa # Date 1281082279 -32400 # Node ID 62d9702eb2613e8ea181230195a66215261a4c64 # Parent 0a7e386737b178f4d74ca93e319332812e573a10 Improve the encoding by compound-text-with-extensions. diff -r 0a7e386737b1 -r 62d9702eb261 lisp/ChangeLog --- a/lisp/ChangeLog Fri Aug 06 12:54:13 2010 +0900 +++ b/lisp/ChangeLog Fri Aug 06 17:11:19 2010 +0900 @@ -1,3 +1,14 @@ +2010-08-06 Kenichi Handa + + * international/mule.el (define-charset): Store NAME as :base + property. + (ctext-non-standard-encodings-table): Pay attention to charset + aliases. + (ctext-pre-write-conversion): Sort ctext-standard-encodings by the + current priority. Force using the designation of the specific + charset by adding `charset' text property. Improve the whole + algorithm. + 2010-08-04 Kenichi Handa * language/cyrillic.el: Don't add "microsoft-cp1251" to diff -r 0a7e386737b1 -r 62d9702eb261 lisp/international/mule.el --- a/lisp/international/mule.el Fri Aug 06 12:54:13 2010 +0900 +++ b/lisp/international/mule.el Fri Aug 06 17:11:19 2010 +0900 @@ -282,6 +282,7 @@ (plist-put props :short-name (symbol-name name))) (or (plist-get props :long-name) (plist-put props :long-name (plist-get props :short-name))) + (plist-put props :base name) ;; We can probably get a worthwhile amount in purespace. (setq props (mapcar (lambda (elt) @@ -1535,11 +1536,13 @@ (let* ((slot (assoc elt ctext-non-standard-encodings-alist)) (charset (nth 3 slot))) (if (charsetp charset) - (setcar tail (cons charset slot)) + (setcar tail + (cons (plist-get (charset-plist charset) :base) slot)) (setcar tail (cons (car charset) slot)) (dolist (cs (cdr charset)) (setcdr tail - (cons (cons (car cs) slot) (cdr tail))) + (cons (cons (plist-get (charset-plist (car cs)) :base) slot) + (cdr tail))) (setq tail (cdr tail)))) (setq tail (cdr tail)))) table)) @@ -1559,74 +1562,56 @@ (setq from 1 to (point-max))) (save-restriction (narrow-to-region from to) + (goto-char from) (let ((encoding-table (ctext-non-standard-encodings-table)) - (charset-list ctext-standard-encodings) + (charset-list (sort-charsets + (copy-sequence ctext-standard-encodings))) + (end-pos (make-marker)) last-coding-system-used - last-pos last-encoding-info - encoding-info end-pos ch charset) + last-pos charset encoding-info) (dolist (elt encoding-table) (push (car elt) charset-list)) - (goto-char (setq last-pos from)) (setq end-pos (point-marker)) - (while (re-search-forward "[^\000-\177]+" nil t) + (while (re-search-forward "[^\0-\177]+" nil t) ;; Found a sequence of non-ASCII characters. - (setq last-pos (match-beginning 0) - ch (char-after last-pos) - charset (char-charset ch charset-list) - last-encoding-info - (if charset - (or (cdr (assq charset encoding-table)) - charset) - 'utf-8)) (set-marker end-pos (match-end 0)) - (goto-char (1+ last-pos)) - (while (marker-position end-pos) - (if (< (point) end-pos) - (progn - (setq charset (char-charset (following-char) charset-list) - encoding-info - (if charset - (or (cdr (assq charset encoding-table)) - charset) - 'utf-8)) - (forward-char 1)) - (setq encoding-info nil) - (set-marker end-pos nil)) - (unless (eq last-encoding-info encoding-info) - (cond ((consp last-encoding-info) - ;; Encode the previous range using an extended - ;; segment. - (let ((encoding-name (car last-encoding-info)) - (coding-system (nth 1 last-encoding-info)) - (noctets (nth 2 last-encoding-info)) - len) - (encode-coding-region last-pos (point) coding-system) - (setq len (+ (length encoding-name) 1 - (- (point) last-pos))) - ;; According to the spec of CTEXT, it is not - ;; necessary to produce this extra designation - ;; sequence, but some buggy application - ;; (e.g. crxvt-gb) requires it. - (insert "\e(B") - (save-excursion - (goto-char last-pos) - (insert (format "\e%%/%d" noctets)) - (insert-byte (+ (/ len 128) 128) 1) - (insert-byte (+ (% len 128) 128) 1) - (insert encoding-name) - (insert 2)))) - ((eq last-encoding-info 'utf-8) - ;; Encode the previous range using UTF-8 encoding - ;; extention. - (encode-coding-region last-pos (point) 'mule-utf-8) - (save-excursion - (goto-char last-pos) - (insert "\e%G")) - (insert "\e%@")) - (t - (put-text-property last-pos (point) 'charset charset))) - (setq last-pos (point) - last-encoding-info encoding-info)))) + (goto-char (match-beginning 0)) + (setq last-pos (point) + charset (char-charset (following-char) charset-list)) + (forward-char 1) + (while (and (< (point) end-pos) + (eq charset (char-charset (following-char) charset-list))) + (forward-char 1)) + (if charset + (if (setq encoding-info (cdr (assq charset encoding-table))) + ;; Encode this range using an extended segment. + (let ((encoding-name (car encoding-info)) + (coding-system (nth 1 encoding-info)) + (noctets (nth 2 encoding-info)) + len) + (encode-coding-region last-pos (point) coding-system) + (setq len (+ (length encoding-name) 1 + (- (point) last-pos))) + ;; According to the spec of CTEXT, it is not + ;; necessary to produce this extra designation + ;; sequence, but some buggy application + ;; (e.g. crxvt-gb) requires it. + (insert "\e(B") + (save-excursion + (goto-char last-pos) + (insert (format "\e%%/%d" noctets)) + (insert-byte (+ (/ len 128) 128) 1) + (insert-byte (+ (% len 128) 128) 1) + (insert encoding-name) + (insert 2))) + ;; Encode this range as characters in CHARSET. + (put-text-property last-pos (point) 'charset charset)) + ;; Encode this range using UTF-8 encoding extention. + (encode-coding-region last-pos (point) 'mule-utf-8) + (save-excursion + (goto-char last-pos) + (insert "\e%G")) + (insert "\e%@"))) (goto-char (point-min))))) ;; Must return nil, as build_annotations_2 expects that. nil) diff -r 0a7e386737b1 -r 62d9702eb261 src/ChangeLog --- a/src/ChangeLog Fri Aug 06 12:54:13 2010 +0900 +++ b/src/ChangeLog Fri Aug 06 17:11:19 2010 +0900 @@ -1,3 +1,14 @@ +2010-08-06 Kenichi Handa + + * charset.c: Include + (struct charset_sort_data): New struct. + (charset_compare): New function. + (Fsort_charsets): New funciton. + (syms_of_charset): Declare Fsort_charsets as a Lisp function. + + * coding.c (decode_coding_iso_2022): Fix checking of dimension + number in CTEXT extended segment. + 2010-08-01 Juanma Barranquero * w32fns.c (syms_of_w32fns) : Fix typo in docstring. diff -r 0a7e386737b1 -r 62d9702eb261 src/charset.c --- a/src/charset.c Fri Aug 06 12:54:13 2010 +0900 +++ b/src/charset.c Fri Aug 06 17:11:19 2010 +0900 @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -2139,23 +2140,22 @@ charset = CHAR_CHARSET (XINT (ch)); else { - Lisp_Object charset_list; - if (CONSP (restriction)) { - for (charset_list = Qnil; CONSP (restriction); - restriction = XCDR (restriction)) + int c = XFASTINT (ch); + + for (; CONSP (restriction); restriction = XCDR (restriction)) { - int id; + struct charset *charset; - CHECK_CHARSET_GET_ID (XCAR (restriction), id); - charset_list = Fcons (make_number (id), charset_list); + CHECK_CHARSET_GET_CHARSET (XCAR (restriction), charset); + if (ENCODE_CHAR (charset, c) != CHARSET_INVALID_CODE (charset)) + return XCAR (restriction); } - charset_list = Fnreverse (charset_list); + return Qnil; } - else - charset_list = coding_system_charset_list (restriction); - charset = char_charset (XINT (ch), charset_list, NULL); + restriction = coding_system_charset_list (restriction); + charset = char_charset (XINT (ch), restriction, NULL); if (! charset) return Qnil; } @@ -2312,6 +2312,69 @@ return make_number (id); } +struct charset_sort_data +{ + Lisp_Object charset; + int id; + int priority; +}; + +static int +charset_compare (const void *d1, const void *d2) +{ + const struct charset_sort_data *data1 = d1, *data2 = d2; + return (data1->priority - data2->priority); +} + +DEFUN ("sort-charsets", Fsort_charsets, Ssort_charsets, 1, 1, 0, + doc: /* Sort charset list CHARSETS by a priority of each charset. +Return the sorted list. CHARSETS is modified by side effects. +See also `charset-priority-list' and `set-charset-priority'. */) + (Lisp_Object charsets) +{ + Lisp_Object len = Flength (charsets); + int n = XFASTINT (len), i, j, done; + Lisp_Object tail, elt, attrs; + struct charset_sort_data *sort_data; + int id, min_id, max_id; + USE_SAFE_ALLOCA; + + if (n == 0) + return Qnil; + SAFE_ALLOCA (sort_data, struct charset_sort_data *, sizeof (*sort_data) * n); + for (tail = charsets, i = 0; CONSP (tail); tail = XCDR (tail), i++) + { + elt = XCAR (tail); + CHECK_CHARSET_GET_ATTR (elt, attrs); + sort_data[i].charset = elt; + sort_data[i].id = id = XINT (CHARSET_ATTR_ID (attrs)); + if (i == 0) + min_id = max_id = id; + else if (id < min_id) + min_id = id; + else if (id > max_id) + max_id = id; + } + for (done = 0, tail = Vcharset_ordered_list, i = 0; + done < n && CONSP (tail); tail = XCDR (tail), i++) + { + elt = XCAR (tail); + id = XFASTINT (elt); + if (id >= min_id && id <= max_id) + for (j = 0; j < n; j++) + if (sort_data[j].id == id) + { + sort_data[j].priority = i; + done++; + } + } + qsort (sort_data, n, sizeof *sort_data, charset_compare); + for (i = 0, tail = charsets; CONSP (tail); tail = XCDR (tail), i++) + XSETCAR (tail, sort_data[i].charset); + SAFE_FREE (); + return charsets; +} + void init_charset () @@ -2414,6 +2477,7 @@ defsubr (&Scharset_priority_list); defsubr (&Sset_charset_priority); defsubr (&Scharset_id_internal); + defsubr (&Ssort_charsets); DEFVAR_LISP ("charset-map-path", &Vcharset_map_path, doc: /* *List of directories to search for charset map files. */); diff -r 0a7e386737b1 -r 62d9702eb261 src/coding.c --- a/src/coding.c Fri Aug 06 12:54:13 2010 +0900 +++ b/src/coding.c Fri Aug 06 17:11:19 2010 +0900 @@ -3935,7 +3935,7 @@ int size; ONE_MORE_BYTE (dim); - if (dim < 0 || dim > 4) + if (dim < '0' || dim > '4') goto invalid_code; ONE_MORE_BYTE (M); if (M < 128)