view lisp/language/indian.el @ 19860:c17fd465ea95 libc-970911 libc-970912 libc-970913 libc-970914 libc-970915 libc-970916 libc-970917 libc-970918 libc-970919 libc-970920 libc-970921 libc-970922 libc-970923 libc-970924 libc-970925 libc-970926 libc-970927 libc-970928 libc-970929 libc-970930 libc-971001 libc-971018 libc-971019 libc-971020 libc-971021 libc-971022 libc-971023 libc-971024 libc-971025 libc-971026 libc-971027 libc-971028 libc-971029 libc-971030 libc-971031 libc-971101 libc-971102 libc-971103 libc-971104 libc-971105 libc-971106 libc-971107 libc-971108 libc-971109 libc-971110 libc-971111 libc-971112 libc-971113 libc-971114 libc-971115 libc-971116 libc-971117 libc-971118 libc-971120 libc-971121 libc-971122 libc-971123 libc-971124 libc-971125 libc-971126 libc-971127 libc-971128 libc-971129 libc-971130 libc-971201 libc-971203 libc-971204 libc-971205 libc-971206 libc-971207 libc-971208 libc-971209 libc-971210 libc-971211 libc-971212 libc-971213 libc-971214 libc-971217 libc-971218 libc-971219 libc-971220 libc-971221 libc-971222 libc-971223 libc-971224 libc-971225 libc-971226 libc-971227 libc-971228 libc-971229 libc-971230 libc-971231 libc-980103 libc-980104 libc-980105 libc-980106 libc-980107 libc-980108 libc-980109 libc-980110 libc-980111 libc-980112 libc-980114 libc-980115 libc-980116 libc-980117 libc-980118 libc-980119 libc-980120 libc-980121 libc-980122 libc-980123 libc-980124 libc-980125 libc-980126 libc-980127 libc-980128

typos.
author Jeff Law <law@redhat.com>
date Wed, 10 Sep 1997 21:16:20 +0000
parents 9503ed5384c5
children 7918e0f37c7c
line wrap: on
line source

;;; indian.el --- Support for Indian Languages

;; Copyright (C) 1995 Free Software Foundation, Inc.

;; Author: KAWABATA, Taichi <kawabata@is.s.u-tokyo.ac.jp>

;; Keywords: multilingual, Indian

;; This file is part of GNU Emacs.

;; GNU Emacs is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;;; Commentary:

;; History:
;; 1996.10.18 written by KAWABATA, Taichi <kawabata@is.s.u-tokyo.ac.jp>

;; For Indian, the character set IS 13194 is supported.
;;
;; IS 13194 does not assign a specific glyph to each character.
;; The following code is not specific to any one Indian language.
;;
;; Eventually, this code will support generic information about
;; the following scripts.
;;
;;    Devanagari
;;    Bengali
;;    Gurmukhi
;;    Gujarati
;;    Oriya
;;    Tamil
;;    Telugu
;;    Kannada
;;    Malayalam
;;
;; In this file, charsets other than charset-ascii and charset-indian-is13194
;; should not be used except in the comment.

;;; Code:

;; Create the two "Indian" prefix commands first, then hang each one
;; under the [Indian] key, labelled "Indian", at the end of the
;; corresponding language-environment keymap.
(define-prefix-command 'describe-indian-environment-map)
(define-prefix-command 'setup-indian-environment-map)

(define-key-after describe-language-environment-map [Indian]
  (cons "Indian" 'describe-indian-environment-map)
  t)
(define-key-after setup-language-environment-map [Indian]
  (cons "Indian" 'setup-indian-environment-map)
  t)

;;  The following is what you see when you look at the Emacs
;;  representations of IS 13194 characters.  However, this is merely
;;  a tentative appearance, and you must convert them with an
;;  indian-to-xxxxxx (specific script) function to use them.
;;  Devanagari is no exception to this rule.

;;     0xa0 //(5!"#$%&'()*+,-./(B
;;     0xb0 (50123456789:;<=>?(B
;;     0xc0 (5@ABCDEFGHIJKLMNO(B
;;     0xd0 (5PQRSTUVWXYZ[\]^_(B
;;     0xe0 (5`abcdefghijklmno(B
;;     0xf0 (5pqrstuvwxyz{|}~(B//

;; Note - In IS 13194, several symbols are obtained by special
;; combination of several characters and Nukta sign.
;;
;;   Sanskrit Vowel R  -> (5*(B + (5i(B
;;   Sanskrit Vowel L  -> (5&(B + (5i(B
;;   Sanskrit Vowel LL -> (5'(B + (5i(B
;;   Sanskrit Avagrah  -> (5j(B + (5i(B
;;   OM                -> (5!(B + (5i(B
;;
;; Note - IS 13194 defines ATR(0xEF) and EXT(0xF0), but they are
;; not used in Emacs.
;;
;; Note - the above characters DO NOT represent any script.  For
;; example, if you want to obtain Devanagari character, you must do
;; something like the following.
;;
;;   (char-to-string (indian-to-devanagari ?(5$(B))
;;   "$(5!$(B"

;; Assign a syntax class and a category to the IS 13194 characters.
;; Each DEFLIST entry is (CHARS SYNTAX CATEGORY).  CHARS is a string
;; that lists characters one by one and/or as ranges written
;; FIRST-LAST; every character covered gets SYNTAX and CATEGORY.
(let ((deflist	
	'(;; chars	syntax	category
	  ("(5!"#(B"	"w"	?7) ; vowel-modifying diacritical mark
				    ; chandrabindu, anuswar, visarga
	  ("(5$(B-(52(B"	"w"	?1) ; base (independent) vowel
	  ("(53(B-(5X(B"	"w"	?0) ; consonant
	  ("(5Z(B-(5g(B"	"w"	?8) ; matra
	  ("(5q(B-(5z(B"	"w"	?6) ; digit
	  ))
      elm chars len syntax category to ch i)
  (while deflist
    (setq elm (car deflist))
    (setq chars (car elm)
	  len (length chars)
	  syntax (nth 1 elm)
	  category (nth 2 elm)
	  i 0)
    ;; I is an index into CHARS.  It is advanced at the end of each
    ;; iteration by `char-bytes' of the character just handled.
    (while (< i len)
      (if (= (aref chars i) ?-)
	  ;; A hyphen: the character after it is the upper end of a range.
	  ;; CH is not reset here; it still holds the successor of the
	  ;; range's first character, left there by the previous iteration.
	  (setq i (1+ i)
		to (sref chars i))
	;; A single character: treat it as a one-element range.
	(setq ch (sref chars i)
	      to ch))
      ;; Apply SYNTAX and CATEGORY to every character from CH up to TO.
      (while (<= ch to)
	(modify-syntax-entry ch syntax)
	(modify-category-entry ch category)
	(setq ch (1+ ch)))
      (setq i (+ i (char-bytes to))))
    (setq deflist (cdr deflist))))


;;; ITRANS
;;
;; ITRANS is one of the most popular methods of exchanging Indian scripts
;; electronically.  Here are the tables used to convert between ITRANS
;; code and IS 13194 code.

(defvar indian-itrans-consonant-alist
  '(
    ("k" . "(53(B")
    ("kh" . "(54(B")
    ("g" . "(55(B")
    ("gh" . "(56(B")
    ("N^" . "(57(B")
    ("ch" . "(58(B")
    ("chh" . "(59(B")
    ("j" . "(5:(B")
    ("jh" . "(5;(B")
    ("JN" . "(5<(B")
    ("T" . "(5=(B")
    ("Th" . "(5>(B")
    ("D" . "(5?(B")
    ("Dh" . "(5@(B")
    ("N" . "(5A(B")
    ("t" . "(5B(B")
    ("th" . "(5C(B")
    ("d" . "(5D(B")
    ("dh" . "(5E(B")
    ("n" . "(5F(B")
    ("nh" . "(5G(B")     ; For transcription of non-Devanagari Languages.
    ("p" . "(5H(B")
    ("ph" . "(5I(B")
    ("b" . "(5J(B")
    ("bh" . "(5K(B")
    ("m" . "(5L(B")
    ("y" . "(5M(B")
    ("yh" . "(5N(B")      ; For transcription of non-Devanagari Languages.
    ("r" . "(5O(B")
    ("rh" . "(5P(B")      ; For transcription of non-Devanagari Languages.
    ("l" . "(5Q(B")
    ("v" . "(5T(B")
    ("sh" . "(5U(B")
    ("shh" . "(5V(B")
    ("s" . "(5W(B")
    ("h" . "(5X(B")
    ("ld" . "(5R(B")
    ("L" . "(5R(B")
    ;; Conjuncts.  These are spelled as consonant + halant + consonant
    ;; in IS 13194 itself, because no other Indian charset may be used
    ;; in the code of this file.
    ("ksh" . "(53hV(B")   ; "k" + halant + "shh"
    ("GY" . "(5:h<(B")    ; "j" + halant + "JN"
    ;; special consonants (base consonant followed by the Nukta sign)
    ("q" . "(53i(B")
    ("K" . "(54i(B")
    ("G" . "(55i(B")
    ("z" . "(5:i(B")
    ("f" . "(5Ii(B")
    (".D" . "(5?i(B")
    (".Dh" . "(5@i(B")
  )
  "Alist of ITRANS consonant mnemonics and their IS 13194 strings.
Each element is (ITRANS . INDIAN).  The INDIAN strings are combined
with the entries of `indian-itrans-vowel-sign-alist' to build
`indian-itrans-alist'.")

;; Alist of (ITRANS . INDIAN) pairs for the vowel part that follows a
;; consonant mnemonic.  `indian-itrans-alist' concatenates every
;; consonant entry with every entry of this list.
(defvar indian-itrans-vowel-sign-alist
  '(
    ;; Special treatment unique to IS 13194 Transliteration
    ;; A consonant mnemonic with no vowel after it gets "(5h(B" appended
    ;; (the string labelled Halant in indian-itrans-other-letters-alist);
    ;; a following "a" appends nothing.
    ("" . "(5h(B")
    ("a" . "")
    ;; Matra (Vowel Sign)
    ("aa" . "(5Z(B")
    ("A" . "(5Z(B")
    ("i" . "(5[(B")
    ("ii" . "(5\(B")
    ("I" . "(5\(B")
    ("u" . "(5](B")
    ("uu" . "(5^(B")
    ("U" . "(5^(B")
    ("R^i" . "(5_(B")     ; These must be checked out later.
    ("R^I" . "(5_i(B")
    ("L^i" . "(5[i(B")
    ("L^I" . "(5\i(B")
    ("E" . "(5`(B")       ; For transcription of non-Devanagari Languages.
    ("e" . "(5a(B")
    ("ai" . "(5b(B") 
    ;; ("e.c" . "(5c(B")     ; Tentatively suppressed.
    ("O" . "(5d(B")       ; For transcription of non-Devanagari Languages.
    ("o" . "(5e(B")
    ("au" . "(5f(B")
    ;; ("o.c" . "(5g(B")     ; Tentatively suppressed.
    ))

;;
;; Independent vowels and other signs.
;;

(defvar indian-itrans-other-letters-alist
  '(
    ("a" . "(5$(B")
    ("aa" . "(5%(B")
    ("A" . "(5%(B")
    ("i" . "(5&(B")
    ("ii" . "(5'(B")
    ("I" . "(5'(B")
    ("u" . "(5((B")
    ("uu" . "(5)(B")
    ("U" . "(5)(B")
    ("R^i" . "(5*(B")
    ("R^I" . "(5*i(B")
    ("L^i" . "(5&i(B")
    ("L^I" . "(5'i(B")
    ("E" . "(5+(B")	; For transcription of non-Devanagari Languages.
    ("e" . "(5,(B")
    ("ai" . "(5-(B")
    ;; ("e.c" . "(5.(B")	; Candra E
    ("O" . "(5/(B")	; For transcription of non-Devanagari Languages.
    ("o" . "(50(B")
    ("au" . "(51(B")
    ;; ("o.c" . "(52(B")	; Candra O
    ;; "M" is the anuswar, the same sign as ".n" below.  It must not
    ;; share a string with the independent vowel "a", or encoding that
    ;; vowel back to ITRANS would yield "M".
    ("M" . "(5"(B")
    ("H" . "(5#(B")
    ("AUM" . "(5!i(B")
    ("OM" . "(5!i(B")
    (".r" . "(5Oh(B")
    (".n" . "(5"(B")
    (".N" . "(5!(B")
    (".h" . "(5h(B")        ; Halant
    (".." . "(5j(B")
    (".a" . "(5ji(B")      ; Avagrah
    ("0" . "(5q(B")
    ("1" . "(5r(B")
    ("2" . "(5s(B")
    ("3" . "(5t(B")
    ("4" . "(5u(B")
    ("5" . "(5v(B")
    ("6" . "(5w(B")
    ("7" . "(5x(B")
    ("8" . "(5y(B")
    ("9" . "(5z(B")
    )
  "Alist of ITRANS mnemonics that stand alone, and their IS 13194 strings.
Each element is (ITRANS . INDIAN).  It covers the independent vowels,
the vowel-modifying signs, a few special signs and the digits.
The entries are copied unchanged into `indian-itrans-alist'.")

;; Regular expression matching single Indian character represented
;; by ITRANS.

(defvar indian-itrans-regexp
  (let* ((open "\\(") (close "\\)") (alt "\\|")
	 (consonant-re "\\([cs]hh?\\)\\|[kgjTDnpbyr]h?\\|\\(N\\^?\\)\\|\\(jN\\)\\|[mvqKGzfs]\\|\\(ld?\\)\\|\\(ksh\\)\\|\\(GY\\)\\|\\(\\.Dh?\\)")
	 (vowel-re "\\(a[aiu]\\)\\|\\(ii\\)\\|\\(uu\\)\\|\\([RL]\\^[iI]\\)\\|[AIEOeoaiu]")
	 (misc-re "[MH0-9]\\|\\(AUM\\)\\|\\(OM\\)\\|\\(\\.[rnNh\\.a]\\)")
	 ;; The consonant alternatives wrapped in a single group, so that
	 ;; they can be followed by "?" or stand as a branch of their own.
	 (consonant-group (concat open consonant-re close)))
    ;; Three branches, tried in this order:
    ;;   1. a miscellaneous sign, digit, AUM or OM;
    ;;   2. an optional consonant followed by a vowel;
    ;;   3. a consonant on its own.
    (concat open misc-re close
	    alt open consonant-group "?" open vowel-re close close
	    alt consonant-group))
  "Regexp matching one ITRANS unit that stands for an Indian character.
The text it matches is looked up in `indian-itrans-alist'.")

;;
;; Regular expression matching single ITRANS unit for IS 13194 characters.
;;

(defvar itrans-indian-regexp
  (let* ((independent-vowel "[(5$(B-(52(B]")
	 (consonant-class "[(53(B-(5X(B]")
	 (matra-class "[(5Z(B-(5g(B]")
	 (digit-class "[(5q(B-(5z(B]")
	 ;; A consonant together with the matra that may follow it.
	 (syllable (concat "\\(" consonant-class matra-class "?" "\\)")))
    ;; Branches, in order: digit, consonant (+ matra), independent vowel.
    (concat digit-class "\\|" syllable "\\|" independent-vowel))
  "Regexp matching one unit of IS 13194 text to be turned into ITRANS.
The text it matches is looked up in `indian-itrans-alist'.")

;;
;; IS13194 - ITRANS conversion table for string matching above regexp.
;;

(defvar indian-itrans-alist
  (let (table)
    ;; Every consonant combined with every vowel sign, pushed in order.
    (mapcar
     (function
      (lambda (consonant)
	(mapcar
	 (function
	  (lambda (sign)
	    (setq table
		  (cons (cons (concat (car consonant) (car sign))
			      (concat (cdr consonant) (cdr sign)))
			table))))
	 indian-itrans-vowel-sign-alist)))
     indian-itrans-consonant-alist)
    ;; Then a fresh copy of every stand-alone letter.  Entries are pushed
    ;; on the front, so these end up ahead of the consonant combinations
    ;; and each group is in reverse of its source order.
    (mapcar
     (function
      (lambda (letter)
	(setq table (cons (cons (car letter) (cdr letter)) table))))
     indian-itrans-other-letters-alist)
    table)
  "Alist of (ITRANS . INDIAN) string pairs used for conversion.
It is built from `indian-itrans-consonant-alist',
`indian-itrans-vowel-sign-alist' and `indian-itrans-other-letters-alist'.")

;;
;; Utility program to convert from ITRANS to IS 13194 in specified region.
;;

(defun indian-decode-itrans-region (from to)
  "Convert `ITRANS' mnemonics of the current region to Indian characters.
When called from a program, expects two arguments,
positions (integers or markers) specifying the stretch of the region.

After the mnemonics are replaced, each halant that is followed by a
character outside the consonant category (category `0') is removed.
A halant followed by a consonant is kept."
  (interactive "r")
  (save-restriction
    (narrow-to-region from to)
    (goto-char (point-min))
    (while (re-search-forward indian-itrans-regexp nil t)
      (let* ((itrans (buffer-substring (match-beginning 0) (match-end 0)))
	     (ch (cdr (assoc itrans indian-itrans-alist))))
	;; Matched text with no entry in the table is left untouched.
	(if ch
	    (progn
	      (delete-region (match-beginning 0) (match-end 0))
	      (insert ch)))))
    (goto-char (point-min))
    ;; `\\C0' matches one character that is NOT in category 0.  Inside a
    ;; bracket expression a backslash is an ordinary character, so
    ;; "[^\\c0]" would only exclude the literal characters `\\', `c' and
    ;; `0', and the halant in front of a consonant would be deleted too.
    (while (re-search-forward "\\((5h(B\\)\\C0" nil t)
      (delete-region (match-beginning 1) (match-end 1)))))

;;
;; Utility program to convert from IS 13194 to ITRANS in specified region.
;;

(defun indian-encode-itrans-region (from to)
  "Replace Indian characters between FROM and TO with ITRANS mnemonics.
Each stretch of text matched by `itrans-indian-regexp' is replaced by
the mnemonic paired with it in `indian-itrans-alist'; text without
such a pair is left unchanged.  Point ends up at the start of the
converted text."
  (interactive "r")
  (save-restriction
    (narrow-to-region from to)
    (goto-char (point-min))
    (while (re-search-forward itrans-indian-regexp nil t)
      (let* ((beg (match-beginning 0))
	     (end (match-end 0))
	     (mnemonic (car (rassoc (buffer-substring beg end)
				    indian-itrans-alist))))
	(cond (mnemonic
	       (delete-region beg end)
	       (insert mnemonic)))))
    (goto-char (point-min))))
  
;;; indian.el ends here