view lisp/textmodes/po.el @ 44529:ce826e8a3a41

*** empty log message ***
author Stefan Monnier <monnier@iro.umontreal.ca>
date Fri, 12 Apr 2002 06:01:18 +0000
parents 2479ec7d435b
children 6082da91c94d
line wrap: on
line source

;;; po.el --- basic support of PO translation files -*- coding: latin-1; -*-

;; Copyright (C) 1995-1998, 2000-2002 Free Software Foundation, Inc.

;; Authors: François Pinard <pinard@iro.umontreal.ca>,
;;          Greg McGary <gkm@magilla.cichlid.com>,
;;          Bruno Haible <bruno@clisp.org>.
;; Keywords: i18n, files

;; This file is part of GNU Emacs.

;; GNU Emacs is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;;; Commentary:

;; This package makes sure visiting PO files decodes them correctly,
;; according to the charset= parameter of the Content-Type header in
;; the PO file.  For more support for editing PO files, see po-mode.el.

;;; Code:

(defconst po-content-type-charset-alist
  '(; Note: Emacs 21 doesn't support all encodings, thus the missing entries.
    ; Entries that are commented out with a `??' value are the missing ones.
    ("ASCII" . undecided)
    ("ANSI_X3.4-1968" . undecided)
    ("US-ASCII" . undecided)
    ("ISO-8859-1" . iso-8859-1)
    ("ISO_8859-1" . iso-8859-1)
    ("ISO-8859-2" . iso-8859-2)
    ("ISO_8859-2" . iso-8859-2)
    ("ISO-8859-3" . iso-8859-3)
    ("ISO_8859-3" . iso-8859-3)
    ("ISO-8859-4" . iso-8859-4)
    ("ISO_8859-4" . iso-8859-4)
    ("ISO-8859-5" . iso-8859-5)
    ("ISO_8859-5" . iso-8859-5)
    ;("ISO-8859-6" . ??)
    ;("ISO_8859-6" . ??)
    ("ISO-8859-7" . iso-8859-7)
    ("ISO_8859-7" . iso-8859-7)
    ("ISO-8859-8" . iso-8859-8)
    ("ISO_8859-8" . iso-8859-8)
    ("ISO-8859-9" . iso-8859-9)
    ("ISO_8859-9" . iso-8859-9)
    ;("ISO-8859-13" . ??)
    ;("ISO_8859-13" . ??)
    ("ISO-8859-15" . iso-8859-15) ; requires Emacs 21
    ("ISO_8859-15" . iso-8859-15) ; requires Emacs 21
    ("KOI8-R" . koi8-r)
    ;("KOI8-U" . ??)
    ("CP437" . cp437) ; requires Emacs 20
    ("CP775" . cp775) ; requires Emacs 20
    ("CP850" . cp850) ; requires Emacs 20
    ("CP852" . cp852) ; requires Emacs 20
    ("CP855" . cp855) ; requires Emacs 20
    ;("CP856" . ??)
    ("CP857" . cp857) ; requires Emacs 20
    ("CP861" . cp861) ; requires Emacs 20
    ("CP862" . cp862) ; requires Emacs 20
    ("CP864" . cp864) ; requires Emacs 20
    ("CP865" . cp865) ; requires Emacs 20
    ("CP866" . cp866) ; requires Emacs 21
    ("CP869" . cp869) ; requires Emacs 20
    ;("CP874" . ??)
    ;("CP922" . ??)
    ;("CP932" . ??)
    ;("CP943" . ??)
    ;("CP949" . ??)
    ;("CP950" . ??)
    ;("CP1046" . ??)
    ;("CP1124" . ??)
    ;("CP1129" . ??)
    ("CP1250" . cp1250) ; requires Emacs 20
    ("CP1251" . cp1251) ; requires Emacs 20
    ("CP1252" . iso-8859-1) ; approximation
    ("CP1253" . cp1253) ; requires Emacs 20
    ("CP1254" . iso-8859-9) ; approximation
    ("CP1255" . iso-8859-8) ; approximation
    ;("CP1256" . ??)
    ("CP1257" . cp1257) ; requires Emacs 20
    ("GB2312" . cn-gb-2312)  ; also named 'gb2312' in XEmacs 21 or Emacs 21
                           ; also named 'euc-cn' in Emacs 20 or Emacs 21
    ("EUC-JP" . euc-jp)
    ("EUC-KR" . euc-kr)
    ;("EUC-TW" . ??)
    ("BIG5" . big5)
    ;("BIG5-HKSCS" . ??)
    ;("GBK" . ??)
    ;("GB18030" . ??)
    ("SHIFT_JIS" . shift_jis)
    ;("JOHAB" . ??)
    ("TIS-620" . tis-620)    ; requires Emacs 20 or Emacs 21
    ("VISCII" . viscii)      ; requires Emacs 20 or Emacs 21
    ("UTF-8" . utf-8)        ; requires Mule-UCS in Emacs 20, or Emacs 21
    )
  "Alist mapping PO file charset names to Mule coding systems.
Each element has the form (NAME . CODING-SYSTEM).  NAME is a GNU
libc/libiconv canonical charset name, written in upper case, as seen in
the Content-Type header of a PO file.  CODING-SYSTEM is the symbol of
the Mule coding system to use for that charset.")

(defun po-find-charset (filename)
  "Return the charset declared in the header of PO file FILENAME, or nil.
The value is the text following \"charset=\" on the Content-Type line of
the header entry, exactly as written in the file.  Return nil if no such
line is found.

FILENAME is read literally, in chunks, into the current buffer after
whatever text is already there, so this should be called in an empty
scratch buffer.  Point may be moved by the searches."
  ;; This is deliberately not a command: it needs FILENAME, and its only
  ;; effect is to fill the current buffer with raw file contents.
  (let ((charset-regexp
         ;; Accept any amount of horizontal whitespace (even none) after
         ;; "Content-Type:" and after the semicolon, not just one space.
         "^\"Content-Type:[ \t]*text/plain;[ \t]*charset=\\(.*\\)\\\\n\"")
        (short-read nil))
    ;; Try the first 4096 bytes.  In case we cannot find the charset value
    ;; within the first 4096 bytes (the PO file might start with a long
    ;; comment) try the next 4096 bytes repeatedly until we know for sure
    ;; we've checked the empty header entry entirely.
    (while (not (or short-read (re-search-forward "^msgid" nil t)))
      (save-excursion
        (goto-char (point-max))
        ;; Buffer positions are 1-based and file offsets are 0-based,
        ;; hence the `1-' on both ends of the requested range.
        (let ((pair (insert-file-contents-literally filename nil
                                                    (1- (point))
                                                    (1- (+ (point) 4096)))))
          ;; Fewer bytes than requested means we have reached end of file.
          (setq short-read (< (nth 1 pair) 4096)))))
    (cond ((re-search-forward charset-regexp nil t) (match-string 1))
          ;; The whole file is loaded and there is no charset line.
          (short-read nil)
          ;; We've found the first msgid; maybe only a part of the msgstr
          ;; value was loaded.  Load the next 1024 bytes; if the charset
          ;; still isn't available, give up.
          (t (save-excursion
               (goto-char (point-max))
               (insert-file-contents-literally filename nil
                                               (1- (point))
                                               (1- (+ (point) 1024))))
             (if (re-search-forward charset-regexp nil t)
                 (match-string 1))))))

(defun po-find-file-coding-system-guts (operation filename)
  "Return a list holding the coding system to decode PO file FILENAME with.
The list has a single element, so it also reads as a
\(DECODING . ENCODING) pair whose ENCODING part is nil.
Return nil unless OPERATION is `insert-file-contents' and FILENAME exists.
Called through `file-coding-system-alist', before the file is visited
for real."
  (when (and (eq operation 'insert-file-contents)
             (file-exists-p filename))
    (with-temp-buffer
      (let* ((coding-system-for-read 'no-conversion)
             ;; A file that declares no charset is treated as "ascii".
             (declared (or (po-find-charset filename) "ascii"))
             (lowered (downcase declared))
             ;; Matches names such as "cp437" or "cp1250".
             (numbered-cp-re "\\`cp[1-9][0-9][0-9]?\\'")
             ;; Prefer the table entry; otherwise try an already-interned
             ;; symbol spelled like the lower-cased charset name.
             (guess (or (cdr (assoc (upcase declared)
                                    po-content-type-charset-alist))
                        (intern-soft lowered)))
             (guess-name (and guess (symbol-name guess))))
        (list
         (cond
          ;; GUESS is checked for non-nil first so that a nil guess is
          ;; never handed to `coding-system-p'.
          ((and guess (coding-system-p guess))
           guess)
          ;; GUESS names a supported codepage that is not set up yet.
          ((and guess-name
                (string-match numbered-cp-re guess-name)
                (assoc (substring guess-name 2) (cp-supported-codepages)))
           (codepage-setup (substring guess-name 2))
           guess)
          ;; Same, working from the raw lower-cased charset name.
          ((and (string-match numbered-cp-re lowered)
                (assoc (substring lowered 2) (cp-supported-codepages)))
           (codepage-setup (substring lowered 2))
           (intern lowered))
          (t
           'no-conversion)))))))

;;;###autoload
(defun po-find-file-coding-system (arg-list)
  "Return the coding system information for the PO file named in ARG-LIST.
ARG-LIST is a list whose first element is the I/O operation and whose
second element is the file name; any further elements are ignored.
The value is whatever `po-find-file-coding-system-guts' returns for them.
Called through `file-coding-system-alist', before the file is visited
for real."
  (let ((operation (nth 0 arg-list))
        (filename (nth 1 arg-list)))
    (po-find-file-coding-system-guts operation filename)))
;; This is for XEmacs.
;(defun po-find-file-coding-system (operation filename)
;  "\
;Return a Mule (DECODING . ENCODING) pair, according to PO file charset.
;Called through file-coding-system-alist, before the file is visited for real."
;  (po-find-file-coding-system-guts operation filename))