comparison lisp/language/indian.el @ 17052:d0d7b244b1d0

Initial revision
author Karl Heuer <kwzh@gnu.org>
date Thu, 20 Feb 1997 07:02:49 +0000
parents
children 70194012fb3a
comparison
equal deleted inserted replaced
17051:fd0b17a79b07 17052:d0d7b244b1d0
1 ;;; indian.el --- Support for Indian Languages
2
3 ;; Copyright (C) 1995 Free Software Foundation, Inc.
4
5 ;; Author: KAWABATA, Taichi <kawabata@is.s.u-tokyo.ac.jp>
6
7 ;; Keywords: multilingual, Indian
8
9 ;; This file is part of GNU Emacs.
10
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by
13 ;; the Free Software Foundation; either version 2, or (at your option)
14 ;; any later version.
15
16 ;; GNU Emacs is distributed in the hope that it will be useful,
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;; GNU General Public License for more details.
20
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to
23 ;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
24
25 ;;; Commentary:
26
27 ;; History:
28 ;; 1996.10.18 written by KAWABATA, Taichi <kawabata@is.s.u-tokyo.ac.jp>
29
30 ;; For Indian, the character set IS 13194 is supported.
31 ;;
32 ;; IS 13194 does not specifically assign glyphs to each character.
33 ;; The following code is not specific to any one Indian language.
34 ;;
35 ;; Eventually, this code will support generic information about
36 ;; the following scripts.
37 ;;
38 ;; Devanagari
39 ;; Bengali
40 ;; Gurmukhi
41 ;; Gujarati
42 ;; Oriya
43 ;; Tamil
44 ;; Telugu
45 ;; Kannada
46 ;; Malayalam
47 ;;
48 ;; In this file, charsets other than charset-ascii and charset-indian-is13194
49 ;; should not be used except in the comment.
50
51 ;;; Code:
52
53 ;; The following is what you see when you refer to the Emacs
54 ;; representations of IS 13194 characters. However, this is merely a
55 ;; tentative appearance, and you must convert them with the
56 ;; indian-to-xxxxxx (specific script) function to use them.
57 ;; Devanagari is no exception to this rule.
58
59 ;; 0xa0 //(5!"#$%&'()*+,-./(B
60 ;; 0xb0 (50123456789:;<=>?(B
61 ;; 0xc0 (5@ABCDEFGHIJKLMNO(B
62 ;; 0xd0 (5PQRSTUVWXYZ[\]^_(B
63 ;; 0xe0 (5`abcdefghijklmno(B
64 ;; 0xf0 (5pqrstuvwxyz{|}~(B//
65
66 ;; Note - In IS 13194, several symbols are obtained by special
67 ;; combination of several characters and Nukta sign.
68 ;;
69 ;; Sanskrit Vowel R -> (5*(B + (5i(B
70 ;; Sanskrit Vowel L -> (5&(B + (5i(B
71 ;; Sanskrit Vowel LL -> (5'(B + (5i(B
72 ;; Sanskrit Avagrah -> (5j(B + (5i(B
73 ;; OM -> (5!(B + (5i(B
74 ;;
75 ;; Note - IS 13194 defines ATR(0xEF) and EXT(0xF0), but they are
76 ;; not used in Emacs.
77 ;;
78 ;; Note - the above characters DO NOT represent any script. For
79 ;; example, if you want to obtain Devanagari character, you must do
80 ;; something like the following.
81 ;;
82 ;; (char-to-string (indian-to-devanagari ?(5$(B))
83 ;; "$(5!$(B"
84
85 (let ((deflist ; table of (CHARS SYNTAX CATEGORY) entries applied by the loop below
86 '(;; chars syntax category
87 ("(5!"#(B" "w" ?7) ; vowel-modifying diacritical mark
88 ; chandrabindu, anuswar, visarga
89 ("(5$(B-(52(B" "w" ?5) ; independent vowel
90 ("(53(B-(5X(B" "w" ?0) ; consonant
91 ("(5Z(B-(5g(B" "w" ?8) ; matra
92 ("(5q(B-(5z(B" "w" ?6) ; digit
93 ))
94 elm chars len syntax category to ch i) ; loop temporaries
95 (while deflist ; one pass per table entry
96 (setq elm (car deflist))
97 (setq chars (car elm)
98 len (length chars)
99 syntax (nth 1 elm)
100 category (nth 2 elm)
101 i 0) ; I is the current index into CHARS
102 (while (< i len) ; walk CHARS one character (or "-X" range end) at a time
103 (if (= (aref chars i) ?-) ; "-" marks a range: the next character is its upper bound
104 (setq i (1+ i)
105 to (sref chars i)) ; CH is left alone here, so it continues from the previous character + 1
106 (setq ch (sref chars i) ; otherwise a single character: a range of one
107 to ch))
108 (while (<= ch to) ; apply syntax and category to every code from CH through TO
109 (modify-syntax-entry ch syntax)
110 (modify-category-entry ch category)
111 (setq ch (1+ ch)))
112 (setq i (+ i (char-bytes to)))) ; presumably I is a byte index and this skips the whole multibyte char -- TODO confirm
113 (setq deflist (cdr deflist))))
114
115
116 ;;; ITRANS
117 ;;
118 ;; ITRANS is one of the most popular methods to exchange Indian scripts
119 ;; electronically. Here is the table to convert between ITRANS code and
120 ;; IS 13194 code.
121
122 (defvar indian-itrans-consonant-alist ; each element is (ITRANS-MNEMONIC . IS-13194-STRING)
123 '(
124 ("k" . "(53(B")
125 ("kh" . "(54(B")
126 ("g" . "(55(B")
127 ("gh" . "(56(B")
128 ("N^" . "(57(B")
129 ("ch" . "(58(B")
130 ("chh" . "(59(B")
131 ("j" . "(5:(B")
132 ("jh" . "(5;(B")
133 ("JN" . "(5<(B")
134 ("T" . "(5=(B")
135 ("Th" . "(5>(B")
136 ("D" . "(5?(B")
137 ("Dh" . "(5@(B")
138 ("N" . "(5A(B")
139 ("t" . "(5B(B")
140 ("th" . "(5C(B")
141 ("d" . "(5D(B")
142 ("dh" . "(5E(B")
143 ("n" . "(5F(B")
144 ("nh" . "(5G(B") ; For transcription of non-Devanagari Languages.
145 ("p" . "(5H(B")
146 ("ph" . "(5I(B")
147 ("b" . "(5J(B")
148 ("bh" . "(5K(B")
149 ("m" . "(5L(B")
150 ("y" . "(5M(B")
151 ("yh" . "(5N(B") ; For transcription of non-Devanagari Languages.
152 ("r" . "(5O(B")
153 ("rh" . "(5P(B") ; For transcription of non-Devanagari Languages.
154 ("l" . "(5Q(B")
155 ("v" . "(5T(B")
156 ("sh" . "(5U(B")
157 ("shh" . "(5V(B")
158 ("s" . "(5W(B")
159 ("h" . "(5X(B")
160 ("ld" . "(5R(B")
161 ("L" . "(5R(B") ; same letter as "ld"
162 ("ksh" . "(53hV(B") ; conjunct spelled in IS 13194 only: k + halant + shh (no two-byte charset in code)
163 ("GY" . "(5:h<(B") ; conjunct: j + halant + JN
164 ;; special consonants
165 ("q" . "(53i(B") ; base consonant followed by the Nukta sign
166 ("K" . "(54i(B")
167 ("G" . "(55i(B")
168 ("z" . "(5:i(B")
169 ("f" . "(5Ii(B")
170 (".D" . "(5?i(B")
171 (".Dh" . "(5@i(B")
172 ) "Alist mapping ITRANS consonant mnemonics to IS 13194 strings (no vowel sign attached).")
173
174 (defvar indian-itrans-vowel-sign-alist ; each element is (ITRANS-VOWEL . IS-13194-SIGN-STRING)
175 '(
176 ;; Special treatment unique to IS 13194 Transliteration
177 ("" . "(5h(B") ; no vowel mnemonic: maps to the sign this file labels Halant
178 ("a" . "") ; "a" maps to the empty string, i.e. no sign is added
179 ;; Matra (Vowel Sign)
180 ("aa" . "(5Z(B")
181 ("A" . "(5Z(B") ; alternative spelling of "aa"
182 ("i" . "(5[(B")
183 ("ii" . "(5\(B")
184 ("I" . "(5\(B") ; alternative spelling of "ii"
185 ("u" . "(5](B")
186 ("uu" . "(5^(B")
187 ("U" . "(5^(B") ; alternative spelling of "uu"
188 ("R^i" . "(5_(B") ; These must be checked out later.
189 ("R^I" . "(5_i(B")
190 ("L^i" . "(5[i(B")
191 ("L^I" . "(5\i(B")
192 ("E" . "(5`(B") ; For transcription of non-Devanagari Languages.
193 ("e" . "(5a(B")
194 ("ai" . "(5b(B")
195 ;; ("e.c" . "(5c(B") ; Tentatively suppressed.
196 ("O" . "(5d(B") ; For transcription of non-Devanagari Languages.
197 ("o" . "(5e(B")
198 ("au" . "(5f(B")
199 ;; ("o.c" . "(5g(B") ; Tentatively suppressed.
200 ))
201
202 ;;
203 ;; Independent vowels and other signs.
204 ;;
205
206 (defvar indian-itrans-other-letters-alist ; each element is (ITRANS-MNEMONIC . IS-13194-STRING)
207 '(
208 ("a" . "(5$(B")
209 ("aa" . "(5%(B")
210 ("A" . "(5%(B")
211 ("i" . "(5&(B")
212 ("ii" . "(5'(B")
213 ("I" . "(5'(B")
214 ("u" . "(5((B")
215 ("uu" . "(5)(B")
216 ("U" . "(5)(B")
217 ("R^i" . "(5*(B")
218 ("R^I" . "(5*i(B")
219 ("L^i" . "(5&i(B")
220 ("L^I" . "(5'i(B")
221 ("E" . "(5+(B") ; For transcription of non-Devanagari Languages.
222 ("e" . "(5,(B")
223 ("ai" . "(5-(B")
224 ;; ("e.c" . "(5.(B") ; Candra E
225 ("O" . "(5/(B") ; For transcription of non-Devanagari Languages.
226 ("o" . "(50(B")
227 ("au" . "(51(B")
228 ;; ("o.c" . "(52(B") ; Candra O
229 ("M" . "(5"(B") ; anuswar, same sign as ".n" (was the independent vowel of "a" by mistake)
230 ("H" . "(5#(B") ; visarga
231 ("AUM" . "(5!i(B")
232 ("OM" . "(5!i(B")
233 (".r" . "(5Oh(B")
234 (".n" . "(5"(B") ; anuswar
235 (".N" . "(5!(B") ; chandrabindu
236 (".h" . "(5h(B") ; Halant
237 (".." . "(5j(B")
238 (".a" . "(5ji(B") ; Avagrah
239 ("0" . "(5q(B")
240 ("1" . "(5r(B")
241 ("2" . "(5s(B")
242 ("3" . "(5t(B")
243 ("4" . "(5u(B")
244 ("5" . "(5v(B")
245 ("6" . "(5w(B")
246 ("7" . "(5x(B")
247 ("8" . "(5y(B")
248 ("9" . "(5z(B")
249 ) "Alist mapping ITRANS mnemonics for independent vowels, signs and digits to IS 13194 strings.")
250
251 ;; Regular expression matching single Indian character represented
252 ;; by ITRANS.
253
254 (defvar indian-itrans-regexp
255 (let ((consonant "\\(ksh\\)\\|\\(GY\\)\\|\\(JN\\)\\|\\([cs]hh?\\)\\|\\(\\.Dh?\\)\\|\\(N\\^?\\)\\|\\(ld?\\)\\|[kgjTDtdnpbyr]h?\\|[mvqKGzfshL]") ; longer mnemonics come first so a bare "ksh"/"GY" is not cut short
256 (vowel "\\(a[aiu]\\)\\|\\(ii\\)\\|\\(uu\\)\\|\\([RL]\\^[iI]\\)\\|[AIUEOeoaiu]") ; two-letter vowels before single letters
257 (misc "[MH0-9]\\|\\(AUM\\)\\|\\(OM\\)\\|\\(\\.[rnNha.]\\)") ; signs, digits and dot-prefixed mnemonics
258 (lpre "\\(") (rpre "\\)") (orre "\\|"))
259 (concat lpre misc rpre orre ; alternative 1: a special mnemonic
260 lpre lpre consonant rpre "?" lpre vowel rpre rpre orre ; alternative 2: optional consonant + vowel
261 lpre consonant rpre)) "Regexp matching one ITRANS unit; must be searched with `case-fold-search' nil.")
262
263 ;;
264 ;; Regular expression matching single ITRANS unit for IS 13194 characters.
265 ;;
266
267 (defvar itrans-indian-regexp ; placeholder: the bindings below are not used yet and the value is nil
268 (let ((vowel "[(5$(B-(52(B]")
269 (consonant "[(53(B-(5X(B]")
270 (vowel-sign "[(5Z(B-(5g(B]")
271 (misc "[(5q(B-(5z(B]")
272 (lpre "\\(") (rpre "\\)") (orre "\\|"))
273 nil)) ; not yet prepared.
274
275
276 ;;
277 ;; IS13194 - ITRANS conversion table for string matching above regexp.
278 ;;
279
280 (defvar indian-itrans-alist
281 (let ((cl indian-itrans-consonant-alist)
282 (ml indian-itrans-other-letters-alist) rules)
283 (while cl ; every consonant ...
284 (let ((vl indian-itrans-vowel-sign-alist))
285 (while vl ; ... combined with every vowel sign (including the empty mnemonic)
286 (setq rules
287 (cons (cons (concat (car (car cl)) (car (car vl)))
288 (concat (cdr (car cl)) (cdr (car vl))))
289 rules))
290 (setq vl (cdr vl))))
291 (setq cl (cdr cl)))
292 (while ml ; then the stand-alone letters, copied one by one
293 (setq rules (cons (cons (car (car ml))
294 (cdr (car ml)))
295 rules))
296 (setq ml (cdr ml)))
297 rules) "Alist of (ITRANS-UNIT . IS-13194-STRING) for every unit `indian-itrans-regexp' is meant to match.")
298
299 ;;
300 ;; Utility program to convert from ITRANS to IS 13194 in specified region.
301 ;;
302
303 (defun indian-decode-itrans-region (from to)
304 "Convert `ITRANS' mnemonics of the current region to Indian characters.
305 When called from a program, expects two arguments,
306 positions (integers or markers) specifying the stretch of the region."
307 (interactive "r")
308 (let ((case-fold-search nil)) ; ITRANS mnemonics are case-sensitive ("t" vs "T", "m" vs "M")
309 (save-restriction (narrow-to-region from to)
310 (goto-char (point-min))
311 (while (re-search-forward indian-itrans-regexp nil t) ; pass 1: replace each ITRANS unit found in the table
312 (let* ((itrans (buffer-substring (match-beginning 0) (match-end 0)))
313 (ch (cdr (assoc itrans indian-itrans-alist))))
314 (if ch ; units without a table entry are left untouched
315 (progn
316 (delete-region (match-beginning 0) (match-end 0))
317 (insert ch)))))
318 (goto-char (point-min))
319 (while (re-search-forward "\\((5h(B\\)\\C0" nil t) ; pass 2: a halant followed by a character that is not in category 0 (consonant)
320 (delete-region (match-beginning 1) (match-end 1)))))) ; remove only the halant, keep the following character
321
322 ;;
323 ;; Utility program to convert from IS 13194 to ITRANS in specified region.
324 ;;
325
326 ;;;;;; not yet prepared.
327
328 ;;; indian.el ends here