comparison lisp/language/indian.el @ 106634:1f0768e4ad7e

(devanagari-composable-pattern): Fix to handle ZWNJ and ZWJ. Use it in composition-function-table for Devanagari. (malayalam-composable-pattern): Fix previous change.
author Kenichi Handa <handa@m17n.org>
date Fri, 25 Dec 2009 02:45:47 +0000
parents 3b9052789379
children 8d23ea9e4ab1
comparison
equal deleted inserted replaced
106633:c155113f5bd1 106634:1f0768e4ad7e
137 (dolist (elt table) 137 (dolist (elt table)
138 (setq regexp (replace-regexp-in-string (car elt) (cdr elt) regexp t t))) 138 (setq regexp (replace-regexp-in-string (car elt) (cdr elt) regexp t t)))
139 regexp)) 139 regexp))
140 140
141 (defconst devanagari-composable-pattern 141 (defconst devanagari-composable-pattern
142 (concat 142 (let ((table
143 "\\([अ-औॠॡ][ँं]?\\)\\|[ः।]" 143 '(("V" . "[\u0904-\u0914\u0960-\u0961\u0972]") ; independent vowel
144 "\\|\\(" 144 ("C" . "[\u0915-\u0939]") ; consonant
145 "\\(?:\\(?:[क-हक़-य़]्\\)?\\(?:[क-हक़-य़]्\\)?\\(?:[क-हक़-य़]्\\)?[क-हक़-य़]्\\)?" 145 ("R" . "\u0930") ; RA
146 "[क-हक़-य़]\\(?:्\\|[ा-्ॢॣ]?[ंँ]?\\)?" 146 ("n" . "\u093C") ; NUKTA
147 "\\)") 147 ("H" . "\u094D") ; HALANT
148 ("m" . "\u093F") ; vowel sign (pre)
149 ("u" . "[\u0945-\u0948\u0955]") ; vowel sign (above)
150 ("b" . "[\u0941-\u0944\u0962-\u0963]") ; vowel sign (below)
151 ("p" . "[\u093E\u0940\u0949-\u094C]") ; vowel sign (post)
152 ("A" . "[\u0900-\u0902\u0953-\u0954]") ; vowel modifier (above)
153 ("a" . "\u0903") ; vowel modifier (post)
154 ("S" . "\u0951") ; stress sign (above)
155 ("s" . "\u0952") ; stress sign (below)
156 ("J" . "\u200D") ; ZWJ
157 ("N" . "\u200C") ; ZWNJ
158 ("X" . "[\u0900-\u097F]")))) ; all coverage
159 (indian-compose-regexp
160 (concat
161 ;; syllables with an independent vowel, or
162 "\\(?:RH\\)?Vn?m?b?u?p?n?A?s?S?a?\\|"
163 ;; consonant-based syllables, or
164 "\\(?:Cn?J?HJ?\\)*Cn?\\(?:H[NJ]?\\|m?b?u?p?n?A?s?S?a?\\)\\|"
165 ;; special consonant form, or
166 "JHR\\|"
167 ;; any other singleton characters
168 "X")
169 table))
148 "Regexp matching a composable sequence of Devanagari characters.") 170 "Regexp matching a composable sequence of Devanagari characters.")
149 171
150 (defconst tamil-composable-pattern 172 (defconst tamil-composable-pattern
151 (concat 173 (concat
152 "\\([அ-ஔ]\\)\\|" 174 "\\([அ-ஔ]\\)\\|"
163 "[ಕ-ಹ]\\(?:್\\|[ಾ-್ೕೃ]?\\)?" 185 "[ಕ-ಹ]\\(?:್\\|[ಾ-್ೕೃ]?\\)?"
164 "\\)") 186 "\\)")
165 "Regexp matching a composable sequence of Kannada characters.") 187 "Regexp matching a composable sequence of Kannada characters.")
166 188
167 (defconst malayalam-composable-pattern 189 (defconst malayalam-composable-pattern
168 (let ((table '(("V" . "[\u0D05-\u0D14\u0D60-\u0D61]") ; independent vowel 190 (let ((table
169 ("C" . "[\u0D15-\u0D39]") ; consonant 191 '(("V" . "[\u0D05-\u0D14\u0D60-\u0D61]") ; independent vowel
170 ("m" . "[\u0D46-\u0D48\u0D4A-\u0D4C]") ; prebase matra 192 ("C" . "[\u0D15-\u0D39]") ; consonant
171 ("p" . "[\u0D3E-\u0D44\u0D57]") ; postname matra 193 ("m" . "[\u0D46-\u0D48\u0D4A-\u0D4C]") ; prebase matra
172 ("b" . "[\u0D62-\u0D63]") ; belowbase matra 194 ("p" . "[\u0D3E-\u0D44\u0D57]") ; postbase matra
173 ("a" . "[\u0D02-\u0D03]") ; abovebase sign 195 ("b" . "[\u0D62-\u0D63]") ; belowbase matra
174 ("H" . "്") ; virama sign 196 ("a" . "[\u0D02-\u0D03]") ; abovebase sign
175 ("N" . "\u200D") ; ZWJ 197 ("H" . "\u0D4D") ; virama sign
176 ("J" . "\u200C") ; ZWNJ 198 ("N" . "\u200D") ; ZWJ
177 ("X" . "[\u0D00-\u0D7F]")))) ; all coverage 199 ("J" . "\u200C") ; ZWNJ
200 ("X" . "[\u0D00-\u0D7F]")))) ; all coverage
178 (indian-compose-regexp 201 (indian-compose-regexp
179 (concat 202 (concat
180 ;; consonant-based syllables 203 ;; syllables with an independent vowel, or
181 "\\(CJ?HJ?\\)*C\\(H[NJ]?\\|m?b?p?a?\\)\\|" 204 "V\\(?:J?HC\\)?m?b?p?a?\\|"
182 ;; syllables with an independent vowel 205 ;; consonant-based syllables, or
183 "V\\(J?HC\\)?m?b?p?a?\\|" 206 "\\(?:CJ?HJ?\\)\\{0,4\\}C\\(?:H[NJ]?\\|m?b?p?a?\\)\\|"
184 ;; special consonant form 207 ;; special consonant form, or
185 "JHC\\|" 208 "JHC\\|"
186 ;; any other singleton characters 209 ;; any other singleton characters
187 "X") 210 "X")
188 table)) 211 table))
189 "Regexp matching a composable sequence of Malayalam characters.") 212 "Regexp matching a composable sequence of Malayalam characters.")
190 213
191 (let ((script-regexp-alist 214 (let ((script-regexp-alist
192 `((devanagari . "[\x900-\x97F\x200C\x200D]+") 215 `((devanagari . ,devanagari-composable-pattern)
193 (bengali . "[\x980-\x9FF\x200C\x200D]+") 216 (bengali . "[\x980-\x9FF\x200C\x200D]+")
194 (gurmukhi . "[\xA00-\xA7F\x200C\x200D]+") 217 (gurmukhi . "[\xA00-\xA7F\x200C\x200D]+")
195 (gujarati . "[\xA80-\xAFF\x200C\x200D]+") 218 (gujarati . "[\xA80-\xAFF\x200C\x200D]+")
196 (oriya . "[\xB00-\xB7F\x200C\x200D]+") 219 (oriya . "[\xB00-\xB7F\x200C\x200D]+")
197 (tamil . "[\xB80-\xBFF\x200C\x200D]+") 220 (tamil . "[\xB80-\xBFF\x200C\x200D]+")