# HG changeset patch # User Kenichi Handa # Date 1281092331 -32400 # Node ID 091f3ff9a59eb42411652577e446ebbd1c6c5a9c # Parent de86f640dbaa94da2a1183ea4abc77b49a4fced8# Parent b5fcc3bee61ec2383ff33079e6e420984569f492 Modify the coding system compound-text-with-extensions to conform to the spec of Compound Text. diff -r de86f640dbaa -r 091f3ff9a59e lisp/ChangeLog --- a/lisp/ChangeLog Thu Aug 05 23:31:03 2010 +0200 +++ b/lisp/ChangeLog Fri Aug 06 19:58:51 2010 +0900 @@ -1,9 +1,37 @@ +2010-08-06 Kenichi Handa + + * international/mule.el (define-charset): Store NAME as :base + property. + (ctext-non-standard-encodings-table): Pay attention to charset + aliases. + (ctext-pre-write-conversion): Sort ctext-standard-encodings by the + current priority. Force using the designation of the specific + charset by adding `charset' text property. Improve the whole + algorithm. + 2010-08-05 Juanma Barranquero * emulation/pc-select.el (pc-selection-mode-hook) (copy-region-as-kill-nomark, beginning-of-buffer-mark) (pc-selection-mode): Fix typos in docstrings. +2010-08-04 Kenichi Handa + + * language/cyrillic.el: Don't add "microsoft-cp1251" to + ctext-non-standard-encodings-alist here. + + * international/mule.el (ctext-non-standard-encodings-alist): Add + "koi8-r" and "microsoft-cp1251". + (ctext-standard-encodings): New variable. + (ctext-non-standard-encodings-table): List only elements for + non-standard encodings. + (ctext-pre-write-conversion): Adjusted for the above change. + Check ctext-standard-encodings. + + * international/mule-conf.el (compound-text): Doc fix. + (ctext-no-compositions): Doc fix. + (compound-text-with-extensions): Doc fix. + 2010-08-04 Stefan Monnier * simple.el (exchange-dot-and-mark): Mark obsolete, finally. diff -r de86f640dbaa -r 091f3ff9a59e lisp/international/mule-conf.el --- a/lisp/international/mule-conf.el Thu Aug 05 23:31:03 2010 +0200 +++ b/lisp/international/mule-conf.el Fri Aug 06 19:58:51 2010 +0900 @@ -1410,9 +1410,10 @@ :flags '(ascii-at-eol ascii-at-cntl designation single-shift composition)) (define-coding-system 'compound-text - "Compound text based generic encoding for decoding unknown messages. - -This coding system does not support extended segments of CTEXT." + "Compound text based generic encoding. +This coding system is an extension of X's \"Compound Text Encoding\". +It encodes many characters using the normal ISO-2022 designation sequences, +but it doesn't support extended segments of CTEXT." :coding-type 'iso-2022 :mnemonic ?x :charset-list 'iso-2022 @@ -1432,7 +1433,7 @@ ;; not have a mime-charset property, to prevent it from showing up ;; close to the beginning of coding systems ordered by priority. (define-coding-system 'ctext-no-compositions - "Compound text based generic encoding for decoding unknown messages. + "Compound text based generic encoding. Like `compound-text', but does not produce escape sequences for compositions." :coding-type 'iso-2022 @@ -1445,8 +1446,9 @@ (define-coding-system 'compound-text-with-extensions "Compound text encoding with ICCCM Extended Segment extensions. -See the variable `ctext-non-standard-encodings-alist' for the -detail about how extended segments are handled. +See the variables `ctext-standard-encodings' and +`ctext-non-standard-encodings-alist' for the detail about how +extended segments are handled. This coding system should be used only for X selections. It is inappropriate for decoding and encoding files, process I/O, etc." diff -r de86f640dbaa -r 091f3ff9a59e lisp/international/mule.el --- a/lisp/international/mule.el Thu Aug 05 23:31:03 2010 +0200 +++ b/lisp/international/mule.el Fri Aug 06 19:58:51 2010 +0900 @@ -282,6 +282,7 @@ (plist-put props :short-name (symbol-name name))) (or (plist-get props :long-name) (plist-put props :long-name (plist-get props :short-name))) + (plist-put props :base name) ;; We can probably get a worthwhile amount in purespace. (setq props (mapcar (lambda (elt) @@ -1408,7 +1409,9 @@ '(("big5-0" big5 2 big5) ("ISO8859-14" iso-8859-14 1 latin-iso8859-14) ("ISO8859-15" iso-8859-15 1 latin-iso8859-15) - ("gbk-0" gbk 2 chinese-gbk))) + ("gbk-0" gbk 2 chinese-gbk) + ("koi8-r" koi8-r 1 koi8-r) + ("microsoft-cp1251" windows-1251 1 windows-1251))) "Alist of non-standard encoding names vs the corresponding usages in CTEXT. It controls how extended segments of a compound text are handled @@ -1497,6 +1500,20 @@ (goto-char (point-min)) (- (point-max) (point))))) +(defvar ctext-standard-encodings + '(ascii latin-jisx0201 katakana-jisx0201 + latin-iso8859-1 latin-iso8859-2 latin-iso8859-3 latin-iso8859-4 + greek-iso8859-7 arabic-iso8859-6 hebrew-iso8859-8 cyrillic-iso8859-5 + latin-iso8859-9 + chinese-gb2312 japanese-jisx0208 korean-ksc5601) + "List of approved standard encodings (i.e. charsets) of X's Compound Text. +Coding-system `compound-text-with-extensions' encodes a character +belonging to any of those charsets using the normal ISO2022 +designation sequence unless the current language environment or +the variable `ctext-non-standard-encodings' decide to use an extended +segment of CTEXT for that character. See also the documentation +of `ctext-non-standard-encodings-alist'.") + ;; Return an alist of CHARSET vs CTEXT-USAGE-INFO generated from ;; `ctext-non-standard-encodings' and a list specified by the key ;; `ctext-non-standard-encodings' for the currrent language @@ -1508,115 +1525,94 @@ ;; is encoded using UTF-8 encoding extention. (defun ctext-non-standard-encodings-table () - (let (table) - ;; Setup charsets specified by the key - ;; `ctext-non-standard-encodings' for the current language - ;; environment and in `ctext-non-standard-encodings'. - (dolist (encoding (append - (get-language-info current-language-environment - 'ctext-non-standard-encodings) - ctext-non-standard-encodings)) - (let* ((slot (assoc encoding ctext-non-standard-encodings-alist)) + (let* ((table (append ctext-non-standard-encodings + (copy-sequence + (get-language-info current-language-environment + 'ctext-non-standard-encodings)))) + (tail table) + elt) + (while tail + (setq elt (car tail)) + (let* ((slot (assoc elt ctext-non-standard-encodings-alist)) (charset (nth 3 slot))) (if (charsetp charset) - (push (cons charset slot) table) - (dolist (cs charset) - (push (cons cs slot) table))))) - - ;; Next prepend charsets for ISO2022 designation sequence. - (dolist (charset charset-list) - (let ((final (plist-get (charset-plist charset) :iso-final-char))) - (if (and (integerp final) - (>= final #x40) (<= final #x7e) - ;; Exclude ascii and chinese-cns11643-X. - (not (eq charset 'ascii)) - (not (string-match "cns11643" (symbol-name charset)))) - (push (cons charset nil) table)))) - - ;; Returned reversed list so that the charsets specified by the - ;; key `ctext-non-standard-encodings' for the current language - ;; have the highest priority. - (nreverse table))) + (setcar tail + (cons (plist-get (charset-plist charset) :base) slot)) + (setcar tail (cons (car charset) slot)) + (dolist (cs (cdr charset)) + (setcdr tail + (cons (cons (plist-get (charset-plist (car cs)) :base) slot) + (cdr tail))) + (setq tail (cdr tail)))) + (setq tail (cdr tail)))) + table)) (defun ctext-pre-write-conversion (from to) "Encode characters between FROM and TO as Compound Text w/Extended Segments. -If FROM is a string, or if the current buffer is not the one set up for us -by `encode-coding-string', generate a new temp buffer, insert the text, -and convert it in the temporary buffer. Otherwise, convert in-place." +If FROM is a string, generate a new temp buffer, insert the text, +and convert it in the temporary buffer. Otherwise, convert +in-place." (save-match-data ;; Setup a working buffer if necessary. (when (stringp from) (set-buffer (generate-new-buffer " *temp")) (set-buffer-multibyte (multibyte-string-p from)) - (insert from)) - - ;; Now we can encode the whole buffer. - (let ((encoding-table (ctext-non-standard-encodings-table)) - last-coding-system-used - last-pos last-encoding-info - encoding-info end-pos ch) - (goto-char (setq last-pos (point-min))) - (setq end-pos (point-marker)) - (while (re-search-forward "[^\000-\177]+" nil t) - ;; Found a sequence of non-ASCII characters. - (setq last-pos (match-beginning 0) - ch (char-after last-pos) - last-encoding-info (catch 'tag - (dolist (elt encoding-table) - (if (encode-char ch (car elt)) - (throw 'tag (cdr elt)))) - 'utf-8)) - (set-marker end-pos (match-end 0)) - (goto-char (1+ last-pos)) - (catch 'tag - (while t - (setq encoding-info - (if (< (point) end-pos) - (catch 'tag - (setq ch (following-char)) - (dolist (elt encoding-table) - (if (encode-char ch (car elt)) - (throw 'tag (cdr elt)))) - 'utf-8))) - (unless (eq last-encoding-info encoding-info) - (cond ((consp last-encoding-info) - ;; Encode the previous range using an extended - ;; segment. - (let ((encoding-name (car last-encoding-info)) - (coding-system (nth 1 last-encoding-info)) - (noctets (nth 2 last-encoding-info)) - len) - (encode-coding-region last-pos (point) coding-system) - (setq len (+ (length encoding-name) 1 - (- (point) last-pos))) - ;; According to the spec of CTEXT, it is not - ;; necessary to produce this extra designation - ;; sequence, but some buggy application - ;; (e.g. crxvt-gb) requires it. - (insert "\e(B") - (save-excursion - (goto-char last-pos) - (insert (format "\e%%/%d" noctets)) - (insert-byte (+ (/ len 128) 128) 1) - (insert-byte (+ (% len 128) 128) 1) - (insert encoding-name) - (insert 2)))) - ((eq last-encoding-info 'utf-8) - ;; Encode the previous range using UTF-8 encoding - ;; extention. - (encode-coding-region last-pos (point) 'mule-utf-8) - (save-excursion - (goto-char last-pos) - (insert "\e%G")) - (insert "\e%@"))) - (setq last-pos (point) - last-encoding-info encoding-info)) - (if (< (point) end-pos) - (forward-char 1) - (throw 'tag nil))))) - (set-marker end-pos nil) - (goto-char (point-min)))) + (insert from) + (setq from 1 to (point-max))) + (save-restriction + (narrow-to-region from to) + (goto-char from) + (let ((encoding-table (ctext-non-standard-encodings-table)) + (charset-list (sort-charsets + (copy-sequence ctext-standard-encodings))) + (end-pos (make-marker)) + last-coding-system-used + last-pos charset encoding-info) + (dolist (elt encoding-table) + (push (car elt) charset-list)) + (setq end-pos (point-marker)) + (while (re-search-forward "[^\0-\177]+" nil t) + ;; Found a sequence of non-ASCII characters. + (set-marker end-pos (match-end 0)) + (goto-char (match-beginning 0)) + (setq last-pos (point) + charset (char-charset (following-char) charset-list)) + (forward-char 1) + (while (and (< (point) end-pos) + (eq charset (char-charset (following-char) charset-list))) + (forward-char 1)) + (if charset + (if (setq encoding-info (cdr (assq charset encoding-table))) + ;; Encode this range using an extended segment. + (let ((encoding-name (car encoding-info)) + (coding-system (nth 1 encoding-info)) + (noctets (nth 2 encoding-info)) + len) + (encode-coding-region last-pos (point) coding-system) + (setq len (+ (length encoding-name) 1 + (- (point) last-pos))) + ;; According to the spec of CTEXT, it is not + ;; necessary to produce this extra designation + ;; sequence, but some buggy application + ;; (e.g. crxvt-gb) requires it. + (insert "\e(B") + (save-excursion + (goto-char last-pos) + (insert (format "\e%%/%d" noctets)) + (insert-byte (+ (/ len 128) 128) 1) + (insert-byte (+ (% len 128) 128) 1) + (insert encoding-name) + (insert 2))) + ;; Encode this range as characters in CHARSET. + (put-text-property last-pos (point) 'charset charset)) + ;; Encode this range using UTF-8 encoding extention. + (encode-coding-region last-pos (point) 'mule-utf-8) + (save-excursion + (goto-char last-pos) + (insert "\e%G")) + (insert "\e%@"))) + (goto-char (point-min))))) ;; Must return nil, as build_annotations_2 expects that. nil) diff -r de86f640dbaa -r 091f3ff9a59e lisp/language/cyrillic.el --- a/lisp/language/cyrillic.el Thu Aug 05 23:31:03 2010 +0200 +++ b/lisp/language/cyrillic.el Fri Aug 06 19:58:51 2010 +0900 @@ -239,13 +239,6 @@ (documentation . "Support for Tajik using KOI8-T.")) '("Cyrillic")) -(let ((elt `("microsoft-cp1251" windows-1251 1 - ,(get 'encode-windows-1251 'translation-table))) - (slot (assoc "microsoft-cp1251" ctext-non-standard-encodings-alist))) - (if slot - (setcdr slot (cdr elt)) - (push elt ctext-non-standard-encodings-alist))) - (set-language-info-alist "Bulgarian" `((coding-system windows-1251) (coding-priority windows-1251) diff -r de86f640dbaa -r 091f3ff9a59e src/ChangeLog --- a/src/ChangeLog Thu Aug 05 23:31:03 2010 +0200 +++ b/src/ChangeLog Fri Aug 06 19:58:51 2010 +0900 @@ -1,3 +1,14 @@ +2010-08-06 Kenichi Handa + + * charset.c: Include + (struct charset_sort_data): New struct. + (charset_compare): New function. + (Fsort_charsets): New funciton. + (syms_of_charset): Declare Fsort_charsets as a Lisp function. + + * coding.c (decode_coding_iso_2022): Fix checking of dimension + number in CTEXT extended segment. + 2010-08-01 Juanma Barranquero * w32fns.c (syms_of_w32fns) : Fix typo in docstring. diff -r de86f640dbaa -r 091f3ff9a59e src/charset.c --- a/src/charset.c Thu Aug 05 23:31:03 2010 +0200 +++ b/src/charset.c Fri Aug 06 19:58:51 2010 +0900 @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -2139,23 +2140,22 @@ charset = CHAR_CHARSET (XINT (ch)); else { - Lisp_Object charset_list; - if (CONSP (restriction)) { - for (charset_list = Qnil; CONSP (restriction); - restriction = XCDR (restriction)) + int c = XFASTINT (ch); + + for (; CONSP (restriction); restriction = XCDR (restriction)) { - int id; + struct charset *charset; - CHECK_CHARSET_GET_ID (XCAR (restriction), id); - charset_list = Fcons (make_number (id), charset_list); + CHECK_CHARSET_GET_CHARSET (XCAR (restriction), charset); + if (ENCODE_CHAR (charset, c) != CHARSET_INVALID_CODE (charset)) + return XCAR (restriction); } - charset_list = Fnreverse (charset_list); + return Qnil; } - else - charset_list = coding_system_charset_list (restriction); - charset = char_charset (XINT (ch), charset_list, NULL); + restriction = coding_system_charset_list (restriction); + charset = char_charset (XINT (ch), restriction, NULL); if (! charset) return Qnil; } @@ -2312,6 +2312,69 @@ return make_number (id); } +struct charset_sort_data +{ + Lisp_Object charset; + int id; + int priority; +}; + +static int +charset_compare (const void *d1, const void *d2) +{ + const struct charset_sort_data *data1 = d1, *data2 = d2; + return (data1->priority - data2->priority); +} + +DEFUN ("sort-charsets", Fsort_charsets, Ssort_charsets, 1, 1, 0, + doc: /* Sort charset list CHARSETS by a priority of each charset. +Return the sorted list. CHARSETS is modified by side effects. +See also `charset-priority-list' and `set-charset-priority'. */) + (Lisp_Object charsets) +{ + Lisp_Object len = Flength (charsets); + int n = XFASTINT (len), i, j, done; + Lisp_Object tail, elt, attrs; + struct charset_sort_data *sort_data; + int id, min_id, max_id; + USE_SAFE_ALLOCA; + + if (n == 0) + return Qnil; + SAFE_ALLOCA (sort_data, struct charset_sort_data *, sizeof (*sort_data) * n); + for (tail = charsets, i = 0; CONSP (tail); tail = XCDR (tail), i++) + { + elt = XCAR (tail); + CHECK_CHARSET_GET_ATTR (elt, attrs); + sort_data[i].charset = elt; + sort_data[i].id = id = XINT (CHARSET_ATTR_ID (attrs)); + if (i == 0) + min_id = max_id = id; + else if (id < min_id) + min_id = id; + else if (id > max_id) + max_id = id; + } + for (done = 0, tail = Vcharset_ordered_list, i = 0; + done < n && CONSP (tail); tail = XCDR (tail), i++) + { + elt = XCAR (tail); + id = XFASTINT (elt); + if (id >= min_id && id <= max_id) + for (j = 0; j < n; j++) + if (sort_data[j].id == id) + { + sort_data[j].priority = i; + done++; + } + } + qsort (sort_data, n, sizeof *sort_data, charset_compare); + for (i = 0, tail = charsets; CONSP (tail); tail = XCDR (tail), i++) + XSETCAR (tail, sort_data[i].charset); + SAFE_FREE (); + return charsets; +} + void init_charset () @@ -2414,6 +2477,7 @@ defsubr (&Scharset_priority_list); defsubr (&Sset_charset_priority); defsubr (&Scharset_id_internal); + defsubr (&Ssort_charsets); DEFVAR_LISP ("charset-map-path", &Vcharset_map_path, doc: /* *List of directories to search for charset map files. */); diff -r de86f640dbaa -r 091f3ff9a59e src/coding.c --- a/src/coding.c Thu Aug 05 23:31:03 2010 +0200 +++ b/src/coding.c Fri Aug 06 19:58:51 2010 +0900 @@ -3935,7 +3935,7 @@ int size; ONE_MORE_BYTE (dim); - if (dim < 0 || dim > 4) + if (dim < '0' || dim > '4') goto invalid_code; ONE_MORE_BYTE (M); if (M < 128)