view lisp/language/china-util.el @ 105813:df4934f25eef

* textmodes/two-column.el (2C-split): * textmodes/texnfo-upd.el (texinfo-multi-file-included-list): * textmodes/tex-mode.el (tex-set-buffer-directory): * textmodes/spell.el (spell-region, spell-string): * textmodes/reftex.el (reftex-erase-buffer): (reftex-get-file-buffer-force, reftex-kill-temporary-buffers): * textmodes/reftex-toc.el (reftex-toc-promote-action): * textmodes/reftex-sel.el (reftex-get-offset, reftex-insert-docstruct) (reftex-select-item): * textmodes/reftex-ref.el (reftex-label-info-update) (reftex-offer-label-menu): * textmodes/reftex-index.el (reftex-index-change-entry) (reftex-index-phrases-info): * textmodes/reftex-global.el (reftex-create-tags-file) (reftex-save-all-document-buffers, reftex-ensure-write-access): * textmodes/reftex-dcr.el (reftex-echo-ref, reftex-echo-cite) (reftex-view-crossref-from-bibtex): * textmodes/reftex-cite.el (reftex-bibtex-selection-callback) (reftex-extract-bib-entries-from-thebibliography) (reftex-all-used-citation-keys, reftex-create-bibtex-file): * textmodes/refbib.el (r2b-capitalize-title): (r2b-convert-buffer, r2b-help): * textmodes/page-ext.el (pages-directory) (pages-directory-goto-with-mouse): * textmodes/bibtex.el (bibtex-validate-globally): * textmodes/bib-mode.el (bib-capitalize-title): * textmodes/artist.el (artist-clear-buffer, artist-system): * progmodes/xscheme.el (global-set-scheme-interaction-buffer): (local-set-scheme-interaction-buffer, xscheme-process-filter) (verify-xscheme-buffer, xscheme-enter-interaction-mode) (xscheme-enter-debugger-mode, xscheme-debugger-mode-p) (xscheme-send-control-g-interrupt, xscheme-start-process) (xscheme-process-sentinel, xscheme-cd): * progmodes/verilog-mode.el (verilog-read-always-signals) (verilog-set-define, verilog-getopt-file) (verilog-module-inside-filename-p): * progmodes/sh-script.el: * progmodes/python.el (python-pdbtrack-get-source-buffer) (python-pdbtrack-grub-for-buffer, python-execute-file): * progmodes/octave-inf.el (inferior-octave): * 
progmodes/idlwave.el (idlwave-scan-user-lib-files) (idlwave-shell-compile-helper-routines, idlwave-set-local) (idlwave-display-completion-list-xemacs, idlwave-list-abbrevs) (idlwave-display-completion-list-emacs, idlwave-list-load-path-shadows) (idlwave-completion-fontify-classes, idlwave-display-calling-sequence): * progmodes/idlw-shell.el (idlwave-shell-examine-display-clear) (idlwave-shell-filter, idlwave-shell-examine-highlight) (idlwave-shell-sentinel, idlwave-shell-filter-directory) (idlwave-shell-display-line, idlwave-shell-set-bp-in-module) (idlwave-shell-examine-display, idlwave-shell-run-region) (idlwave-shell-filter-bp, idlwave-shell-save-and-action) (idlwave-shell-sources-filter, idlwave-shell-goto-next-error): * progmodes/idlw-help.el (idlwave-help-get-special-help) (idlwave-help-get-help-buffer): * progmodes/gud.el (gud-basic-call, gud-find-class) (gud-tooltip-activate-mouse-motions-if-enabled): * progmodes/gdb-mi.el (gdb-mouse-toggle-breakpoint-fringe): * progmodes/ebrowse.el (ebrowse-member-table, ebrowse-save-tree-as) (ebrowse-view-exit-fn, ebrowse-tags-list-members-in-file) (ebrowse-tags-next-file): * progmodes/ebnf2ps.el (ebnf-generate-eps, ebnf-generate-eps) (ebnf-eps-production-list, ebnf-begin-file, ebnf-log) (ebnf-eps-finish-and-write): * progmodes/cpp.el (cpp-edit-save): * progmodes/cperl-mode.el (cperl-pod-to-manpage): * progmodes/cc-defs.el (c-emacs-features): * progmodes/antlr-mode.el (antlr-invalidate-context-cache) (antlr-directory-dependencies): * progmodes/ada-xref.el (ada-gnat-parse-gpr, ada-get-ali-file-name) (ada-run-application, ada-find-in-src-path, ada-goto-parent) (ada-find-any-references, ada-make-filename-from-adaname) (ada-make-body-gnatstub): * obsolete/rnews.el (news-list-news-groups): * obsolete/resume.el (resume-suspend-hook,resume-write-buffer-to-file): * obsolete/iso-acc.el (iso-acc-minibuf-setup): * net/rcirc.el (rcirc-debug): * net/newst-treeview.el (newsticker--treeview-list-add-item) 
(newsticker--treeview-list-clear, newsticker-treeview-browse-url) (newsticker--treeview-list-update-faces, newsticker-treeview-save) (newsticker--treeview-item-show-text, newsticker--treeview-item-show) (newsticker--treeview-tree-update-tag,newsticker--treeview-buffer-init) (newsticker-treeview-show-item, newsticker--treeview-unfold-node) (newsticker--treeview-list-clear-highlight) (newsticker--treeview-list-update-highlight) (newsticker--treeview-list-highlight-start) (newsticker--treeview-tree-update-highlight) (newsticker--treeview-get-selected-item) (newsticker-treeview-mark-list-items-old) (newsticker--treeview-set-current-node): * net/newst-plainview.el (newsticker--buffer-set-uptodate): * net/newst-backend.el (newsticker--get-news-by-funcall) (newsticker--get-news-by-wget, newsticker--image-get) (newsticker--image-sentinel): * net/mairix.el (mairix-rmail-fetch-field, mairix-gnus-fetch-field): * net/eudcb-ph.el (eudc-ph-do-request, eudc-ph-open-session): (eudc-ph-close-session): * net/eudc.el (eudc-save-options): * language/thai-word.el (thai-update-word-table): * language/japan-util.el (japanese-string-conversion): * international/titdic-cnv.el (tsang-quick-converter) (ziranma-converter, ctlau-converter): * international/mule-cmds.el (describe-language-environment): * international/ja-dic-cnv.el (skkdic-convert-okuri-ari) (skkdic-convert-postfix, skkdic-convert-prefix): (skkdic-convert-okuri-nasi, skkdic-convert): * emacs-lisp/re-builder.el (reb-update-overlays): * emacs-lisp/pp.el (pp-to-string, pp-display-expression): * emacs-lisp/gulp.el (gulp-send-requests): * emacs-lisp/find-gc.el (trace-call-tree): * emacs-lisp/eieio-opt.el (eieio-browse, eieio-describe-class) (eieio-describe-generic): * emacs-lisp/eieio-base.el (eieio-persistent-read): * emacs-lisp/edebug.el (edebug-outside-excursion): * emacs-lisp/debug.el (debugger-make-xrefs): * emacs-lisp/cust-print.el (custom-prin1-to-string): * emacs-lisp/chart.el (chart-new-buffer): * emacs-lisp/authors.el 
(authors-scan-el, authors-scan-change-log): Use with-current-buffer. * textmodes/artist.el (artist-system): Don't call copy-sequence on a fresh string. * progmodes/idlw-shell.el (easymenu setup): Use dolist.
author Stefan Monnier <monnier@iro.umontreal.ca>
date Sat, 31 Oct 2009 02:38:34 +0000
parents a9dc0e7c3f2b
children 1d1d5d9bd884
line wrap: on
line source

;;; china-util.el --- utilities for Chinese  -*- coding: iso-2022-7bit -*-

;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
;;   Free Software Foundation, Inc.
;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
;;   2005, 2006, 2007, 2008, 2009
;;   National Institute of Advanced Industrial Science and Technology (AIST)
;;   Registration Number H14PRO021
;; Copyright (C) 2003
;;   National Institute of Advanced Industrial Science and Technology (AIST)
;;   Registration Number H13PRO009

;; Keywords: mule, multilingual, Chinese

;; This file is part of GNU Emacs.

;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.

;;; Commentary:

;;; Code:

;; Hz/ZW/EUC-TW encoding stuff

;; HZ is an encoding method for the Chinese character set GB2312 that
;; is widely used on the Internet.  It is very similar to the 7-bit
;; environment of ISO-2022.  The difference is that HZ uses the
;; sequences "~{" and "~}" to designate GB2312 and ASCII respectively;
;; hence, it doesn't use the ESC (0x1B) code.

;; ZW is another encoding method for the Chinese character set GB2312.
;; It encodes Chinese characters line by line, starting each such line
;; with the sequence "zW".  Like HZ, it uses only 7-bit codes.

;; EUC-TW is similar to EUC-KS or EUC-JP.  Its main character set is
;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
;; a single shift escape followed by three bytes: the first gives the
;; plane, the second and third the character code.  Note that characters
;; of plane 1 are (redundantly) accessible with a single shift escape
;; also.

(defvar iso2022-gb-designation "\e$A"
  "ISO-2022 escape sequence that designates GB2312.")

(defvar hz-gb-designnation "~{"
  "HZ escape sequence that designates GB2312.")

(defvar iso2022-ascii-designation "\e(B"
  "ISO-2022 escape sequence that designates ASCII.")

(defvar hz-ascii-designnation "~}"
  "HZ escape sequence that designates ASCII.")

(defvar zw-start-gb "^zW"
  "Regexp matching the ZW sequence that starts GB2312 text.")

(defvar hz/zw-start-gb
  (mapconcat #'identity
             (list hz-gb-designnation zw-start-gb "[^\0-\177]")
             "\\|")
  "Regexp matching the start of GB2312 text in a mixture of HZ and ZW.
It matches the HZ introducer, the ZW introducer, or any single
character whose code is outside the range 0..127.")

(defvar decode-hz-line-continuation nil
  "Non-nil means take the HZ line continuation convention into account.
When this is non-nil, `decode-hz-region' looks for the sequence that
ends GB2312 text beyond the end of the current line.")

(defconst hz-set-msb-table
  (eval-when-compile
    ;; Build the table as a string indexed by character code 0..126.
    (apply #'string
           (mapcar (lambda (code)
                     (if (< code 33)
                         ;; Control characters and space are kept as is.
                         code
                       ;; Everything else gets its most significant
                       ;; bit set (code + 128) as an eight-bit char.
                       (decode-char 'eight-bit (+ code 128))))
                   (number-sequence 0 126))))
  "Translation table, as a string, that sets the MSB of graphic characters.
Characters 0 through 32 map to themselves; characters 33 through 126
map to the eight-bit character whose code is 128 higher.  It is used
with `translate-region' by `decode-hz-region'.")

;;;###autoload
(defun decode-hz-region (beg end)
  "Decode HZ/ZW encoded text in the current region.
BEG and END delimit the region.  The HZ/ZW text is first rewritten
in place into the `euc-china' byte representation and then decoded
with that coding system.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (let (pos ch)
        (narrow-to-region beg end)

        ;; We, at first, convert HZ/ZW to `euc-china',
        ;; then decode it.

        ;; "~\n" -> "\n", "~~" -> "~"
        (goto-char (point-min))
        (while (search-forward "~" nil t)
          (setq ch (following-char))
          (cond ((= ch ?~)
                 ;; Delete the SECOND tilde so that point stays after
                 ;; the one we keep.  Deleting the first one would
                 ;; leave point in front of the survivor, which would
                 ;; then be scanned again and wrongly paired with
                 ;; whatever follows it ("~~~~" would end up as "~").
                 (delete-char 1))
                ((= ch ?\n)
                 ;; Drop the tilde in front of the newline.
                 (delete-char -1))))
        ;; NOTE(review): an escaped "~~" directly followed by "{" still
        ;; leaves "~{" in the buffer, and the pass below takes it for a
        ;; GB2312 introducer.  Fixing that needs the two passes merged.

        ;; "^zW...\n" -> Chinese GB2312
        ;; "~{...~}"  -> Chinese GB2312
        (goto-char (point-min))
        ;; From here on BEG records the first position that needs
        ;; decoding (nil while nothing has been found yet).
        (setq beg nil)
        (while (re-search-forward hz/zw-start-gb nil t)
          (setq pos (match-beginning 0)
                ch (char-after pos))
          ;; Record the first position to start conversion.
          (or beg (setq beg pos))
          (end-of-line)
          (setq end (point))
          (if (>= ch 128)               ; 8bit GB2312
              ;; Already in 8-bit form: nothing to rewrite on this
              ;; line, it is only decoded at the very end.
              nil
            (goto-char pos)
            ;; Remove the two-character introducer ("~{" or "zW").
            (delete-char 2)
            (setq end (- end 2))
            (if (= ch ?z)               ; ZW -> euc-china
                (progn
                  ;; ZW text runs up to the end of the line.
                  (translate-region (point) end hz-set-msb-table)
                  (goto-char end))
              ;; HZ text runs up to "~}"; look for it on this line
              ;; only, unless line continuation is to be honored.
              (if (search-forward hz-ascii-designnation
                                  (if decode-hz-line-continuation nil end)
                                  t)
                  (delete-char -2))
              (setq end (point))
              (translate-region pos (point) hz-set-msb-table))))
        ;; Decode everything from the first converted position up to
        ;; the end of the last converted stretch.
        (if beg
            (decode-coding-region beg end 'euc-china)))
      (- (point-max) (point-min)))))

;;;###autoload
(defun decode-hz-buffer ()
  "Decode the HZ/ZW encoded text of the current buffer.
This applies `decode-hz-region' to everything between `point-min'
and `point-max' and returns whatever that function returns."
  (interactive)
  (let ((start (point-min))
        (finish (point-max)))
    (decode-hz-region start finish)))

;;;###autoload
(defun encode-hz-region (beg end)
  "Encode the text between BEG and END to HZ.
Every tilde is doubled; then, starting at the first Chinese
character, the text is encoded with `iso-2022-7bit' and the ISO-2022
designation sequences are replaced by their HZ counterparts.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (narrow-to-region beg end)

      ;; Escape literal tildes: "~" -> "~~"
      (goto-char (point-min))
      (while (search-forward "~" nil t)
        (insert ?~))

      ;; Chinese GB2312 -> "~{...~}"
      (goto-char (point-min))
      (when (re-search-forward "\\cc" nil t)
        (let ((start (match-beginning 0)))
          (goto-char start)
          (encode-coding-region start (point-max) 'iso-2022-7bit)
          ;; Swap the three-character ISO-2022 sequence that selects
          ;; GB2312 for the HZ one.
          (goto-char start)
          (while (search-forward iso2022-gb-designation nil t)
            (delete-region (- (point) 3) (point))
            (insert hz-gb-designnation))
          ;; Likewise for the sequence that switches back to ASCII.
          (goto-char start)
          (while (search-forward iso2022-ascii-designation nil t)
            (delete-region (- (point) 3) (point))
            (insert hz-ascii-designnation))))
      (- (point-max) (point-min)))))

;;;###autoload
(defun encode-hz-buffer ()
  "Encode the text of the current buffer to HZ.
This applies `encode-hz-region' to everything between `point-min'
and `point-max' and returns whatever that function returns."
  (interactive)
  (let ((start (point-min))
        (finish (point-max)))
    (encode-hz-region start finish)))

;;;###autoload
(defun post-read-decode-hz (len)
  "Decode as HZ/ZW the LEN characters that follow point.
The buffer's modified flag is put back to what it was before the
decoding.  Return the value of `decode-hz-region', i.e. the length
of the resulting text."
  (let ((start (point))
        (was-modified (buffer-modified-p))
        ;; Bound locally so that whatever the decoding stores in it
        ;; does not reach the caller.
        last-coding-system-used)
    (let ((result (decode-hz-region start (+ start len))))
      (set-buffer-modified-p was-modified)
      result)))

;;;###autoload
(defun pre-write-encode-hz (from to)
  "Put the HZ encoding of some text into a fresh buffer and select it.
If FROM is a string, that string is the text to encode.  Otherwise
the text is taken from positions FROM to TO of the buffer that is
current on entry.  The encoded text is left in a newly generated
buffer, which is still the current buffer when this function
returns.  Always return nil."
  (let ((origin (current-buffer))
        (scratch (generate-new-buffer " *temp*")))
    ;; Deliberately a plain `set-buffer': the new buffer must remain
    ;; current after we return.
    (set-buffer scratch)
    (cond ((stringp from)
           (insert from))
          (t
           (insert-buffer-substring origin from to)))
    ;; Bound locally so that whatever the encoding stores in it does
    ;; not reach the caller.
    (let (last-coding-system-used)
      (encode-hz-region 1 (point-max))))
  nil)
;; Announce the `china-util' feature to `require' and `featurep'.
(provide 'china-util)

;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836
;;; china-util.el ends here