Commit | Line | Data |
---|---|---|
56c8bc03 | 1 | ;;; indian.el --- Support for Indian Languages -*- coding: iso-2022-7bit; -*- |
4ed46869 KH |
2 | |
3 | ;; Copyright (C) 1995 Free Software Foundation, Inc. | |
4 | ||
5 | ;; Author: KAWABATA, Taichi <kawabata@is.s.u-tokyo.ac.jp> | |
6 | ||
7 | ;; Keywords: multilingual, Indian | |
8 | ||
9 | ;; This file is part of GNU Emacs. | |
10 | ||
11 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
12 | ;; it under the terms of the GNU General Public License as published by | |
13 | ;; the Free Software Foundation; either version 2, or (at your option) | |
14 | ;; any later version. | |
15 | ||
16 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
17 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 | ;; GNU General Public License for more details. | |
20 | ||
21 | ;; You should have received a copy of the GNU General Public License | |
e803d6bd KH |
22 | ;; along with GNU Emacs; see the file COPYING. If not, write to the |
23 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
24 | ;; Boston, MA 02111-1307, USA. | |
4ed46869 KH |
25 | |
26 | ;;; Commentary: | |
27 | ||
28 | ;; History: | |
29 | ;; 1996.10.18 written by KAWABATA, Taichi <kawabata@is.s.u-tokyo.ac.jp> | |
30 | ||
31 | ;; For Indian, the character set IS 13194 is supported. | |
32 | ;; | |
33 | ;; IS 13194 does not assign a specific glyph to each character. | |
34 | ;; The following code is not specific to any one Indian language. | |
35 | ;; | |
36 | ;; Eventually, this code will support generic information about | |
37 | ;; the following scripts. | |
38 | ;; | |
39 | ;; Devanagari | |
40 | ;; Bengali | |
41 | ;; Gurmukhi | |
42 | ;; Gujarati | |
43 | ;; Oriya | |
44 | ;; Tamil | |
45 | ;; Telugu | |
46 | ;; Kannada | |
47 | ;; Malayalam | |
48 | ;; | |
49 | ;; In this file, charsets other than charset-ascii and charset-indian-is13194 | |
50 | ;; should not be used except in the comment. | |
51 | ||
52 | ;;; Code: | |
53 | ||
54 | ;; The following is what you see when you refer to the Emacs | |
55 | ;; representations of IS 13194 characters. However, this is merely a | |
56 | ;; tentative appearance, and you must convert them with the | |
57 | ;; indian-to-xxxxxx (specific script) function to use them. | |
58 | ;; Devanagari is no exception to this rule. | |
59 | ||
60 | ;; 0xa0 //\e(5!"#$%&'()*+,-./\e(B | |
61 | ;; 0xb0 \e(50123456789:;<=>?\e(B | |
62 | ;; 0xc0 \e(5@ABCDEFGHIJKLMNO\e(B | |
63 | ;; 0xd0 \e(5PQRSTUVWXYZ[\]^_\e(B | |
64 | ;; 0xe0 \e(5`abcdefghijklmno\e(B | |
65 | ;; 0xf0 \e(5pqrstuvwxyz{|}~\e(B// | |
66 | ||
67 | ;; Note - In IS 13194, several symbols are obtained by special | |
68 | ;; combination of several characters and Nukta sign. | |
69 | ;; | |
70 | ;; Sanskrit Vowel R -> \e(5*\e(B + \e(5i\e(B | |
71 | ;; Sanskrit Vowel L -> \e(5&\e(B + \e(5i\e(B | |
72 | ;; Sanskrit Vowel LL -> \e(5'\e(B + \e(5i\e(B | |
73 | ;; Sanskrit Avagrah -> \e(5j\e(B + \e(5i\e(B | |
74 | ;; OM -> \e(5!\e(B + \e(5i\e(B | |
75 | ;; | |
76 | ;; Note - IS 13194 defines ATR(0xEF) and EXT(0xF0), but they are | |
77 | ;; not used in Emacs. | |
78 | ;; | |
79 | ;; Note - the above characters DO NOT represent any script. For | |
80 | ;; example, if you want to obtain Devanagari character, you must do | |
81 | ;; something like the following. | |
82 | ;; | |
83 | ;; (char-to-string (indian-to-devanagari ?\e(5$\e(B)) | |
84 | ;; "\e$(5!$\e(B" | |
85 | ||
4ed46869 KH |
86 | ;;; ITRANS |
87 | ;; | |
88 | ;; ITRANS is one of the most popular methods for exchanging Indian scripts | |
89 | ;; electronically. Here is the table to convert between ITRANS code and | |
90 | ;; IS 13194 code. | |
91 | ||
;; Alist of ITRANS consonant mnemonics.  Each element has the form
;; (ITRANS . IS13194): the car is the ITRANS spelling of a consonant and
;; the cdr is the string it is converted to.  `indian-itrans-alist'
;; combines every entry here with every entry of
;; `indian-itrans-vowel-sign-alist' by concatenating keys and values.
92 | (defvar indian-itrans-consonant-alist | |
93 | '( | |
94 | ("k" . "\e(53\e(B") | |
95 | ("kh" . "\e(54\e(B") | |
96 | ("g" . "\e(55\e(B") | |
97 | ("gh" . "\e(56\e(B") | |
98 | ("N^" . "\e(57\e(B") | |
99 | ("ch" . "\e(58\e(B") | |
100 | ("chh" . "\e(59\e(B") | |
101 | ("j" . "\e(5:\e(B") | |
102 | ("jh" . "\e(5;\e(B") | |
103 | ("JN" . "\e(5<\e(B") | |
104 | ("T" . "\e(5=\e(B") | |
105 | ("Th" . "\e(5>\e(B") | |
106 | ("D" . "\e(5?\e(B") | |
107 | ("Dh" . "\e(5@\e(B") | |
108 | ("N" . "\e(5A\e(B") | |
109 | ("t" . "\e(5B\e(B") | |
110 | ("th" . "\e(5C\e(B") | |
111 | ("d" . "\e(5D\e(B") | |
112 | ("dh" . "\e(5E\e(B") | |
113 | ("n" . "\e(5F\e(B") | |
114 | ("nh" . "\e(5G\e(B") ; For transcription of non-Devanagari Languages. | |
115 | ("p" . "\e(5H\e(B") | |
116 | ("ph" . "\e(5I\e(B") | |
117 | ("b" . "\e(5J\e(B") | |
118 | ("bh" . "\e(5K\e(B") | |
119 | ("m" . "\e(5L\e(B") | |
120 | ("y" . "\e(5M\e(B") | |
121 | ("yh" . "\e(5N\e(B") ; For transcription of non-Devanagari Languages. | |
122 | ("r" . "\e(5O\e(B") | |
123 | ("rh" . "\e(5P\e(B") ; For transcription of non-Devanagari Languages. | |
124 | ("l" . "\e(5Q\e(B") | |
125 | ("v" . "\e(5T\e(B") | |
126 | ("sh" . "\e(5U\e(B") | |
127 | ("shh" . "\e(5V\e(B") | |
128 | ("s" . "\e(5W\e(B") | |
129 | ("h" . "\e(5X\e(B") | |
;; "ld" and "L" are two spellings of the same consonant (same value).
130 | ("ld" . "\e(5R\e(B") | |
131 | ("L" . "\e(5R\e(B") | |
;; NOTE(review): unlike every other entry, the value of "ksh" starts with
;; the two-byte designation ESC $ ( 5 instead of ESC ( 5, i.e. it is
;; presumably not made of IS 13194 characters, although the file header
;; says only ASCII and IS 13194 should be used outside comments.  Confirm
;; before changing: the Halant clean-up pass in
;; `indian-decode-itrans-region' may be the reason for this spelling.
132 | ("ksh" . "\e$(5!3!h!V\e(B") | |
;; NOTE(review): the value of "GY" is a visible placeholder, not a
;; character sequence; decoding "GY" inserts this literal text.
133 | ("GY" . "***GY***") ; Must check out later. | |
134 | ;; special consonants | |
;; Each value below is a base consonant followed by the character
;; "\e(5i\e(B" (the Nukta sign, per the note on special combinations in
;; the commentary at the top of this file).
135 | ("q" . "\e(53i\e(B") | |
136 | ("K" . "\e(54i\e(B") | |
137 | ("G" . "\e(55i\e(B") | |
138 | ("z" . "\e(5:i\e(B") | |
139 | ("f" . "\e(5Ii\e(B") | |
140 | (".D" . "\e(5?i\e(B") | |
141 | (".Dh" . "\e(5@i\e(B") | |
142 | )) | |
143 | ||
;; Alist of ITRANS vowel spellings as they appear after a consonant.
;; Each element has the form (ITRANS . IS13194); the cdr is the string
;; appended to the consonant's value when `indian-itrans-alist' is built.
144 | (defvar indian-itrans-vowel-sign-alist | |
145 | '( | |
146 | ;; Special treatment unique to IS 13194 Transliteration | |
;; A consonant with no vowel written after it gets "\e(5h\e(B" (Halant)
;; appended, while an explicit "a" appends nothing.
147 | ("" . "\e(5h\e(B") | |
148 | ("a" . "") | |
149 | ;; Matra (Vowel Sign) | |
150 | ("aa" . "\e(5Z\e(B") | |
151 | ("A" . "\e(5Z\e(B") | |
152 | ("i" . "\e(5[\e(B") | |
153 | ("ii" . "\e(5\\e(B") | |
154 | ("I" . "\e(5\\e(B") | |
155 | ("u" . "\e(5]\e(B") | |
156 | ("uu" . "\e(5^\e(B") | |
157 | ("U" . "\e(5^\e(B") | |
158 | ("R^i" . "\e(5_\e(B") ; These must be checked out later. | |
159 | ("R^I" . "\e(5_i\e(B") | |
160 | ("L^i" . "\e(5[i\e(B") | |
161 | ("L^I" . "\e(5\i\e(B") | |
162 | ("E" . "\e(5`\e(B") ; For transcription of non-Devanagari Languages. | |
163 | ("e" . "\e(5a\e(B") | |
164 | ("ai" . "\e(5b\e(B") | |
165 | ;; ("e.c" . "\e(5c\e(B") ; Tentatively suppressed. | |
166 | ("O" . "\e(5d\e(B") ; For transcription of non-Devanagari Languages. | |
167 | ("o" . "\e(5e\e(B") | |
168 | ("au" . "\e(5f\e(B") | |
169 | ;; ("o.c" . "\e(5g\e(B") ; Tentatively suppressed. | |
170 | )) | |
171 | ||
172 | ;; | |
173 | ;; Independent vowels and other signs. | |
174 | ;; | |
175 | ||
(defvar indian-itrans-other-letters-alist
  '(
    ;; Independent vowels.
    ("a" . "\e(5$\e(B")
    ("aa" . "\e(5%\e(B")
    ("A" . "\e(5%\e(B")
    ("i" . "\e(5&\e(B")
    ("ii" . "\e(5'\e(B")
    ("I" . "\e(5'\e(B")
    ("u" . "\e(5(\e(B")
    ("uu" . "\e(5)\e(B")
    ("U" . "\e(5)\e(B")
    ("R^i" . "\e(5*\e(B")
    ("R^I" . "\e(5*i\e(B")
    ("L^i" . "\e(5&i\e(B")
    ("L^I" . "\e(5'i\e(B")
    ("E" . "\e(5+\e(B") ; For transcription of non-Devanagari Languages.
    ("e" . "\e(5,\e(B")
    ("ai" . "\e(5-\e(B")
    ;; ("e.c" . "\e(5.\e(B") ; Candra E
    ("O" . "\e(5/\e(B") ; For transcription of non-Devanagari Languages.
    ("o" . "\e(50\e(B")
    ("au" . "\e(51\e(B")
    ;; ("o.c" . "\e(52\e(B") ; Candra O
    ;; Signs.
    ;; "M" is the ITRANS spelling of the Anusvara, so it must map to the
    ;; same character as ".n" (0xA2).  It used to map to the character of
    ;; "a" (0xA4, independent vowel A), which made "M" decode to the wrong
    ;; letter and made the encoder, which looks values up with `rassoc',
    ;; turn every independent vowel A into "M".  The value is written
    ;; exactly like the one of ".n" below.
    ("M" . "\e(5"\e(B")
    ("H" . "\e(5#\e(B")
    ("AUM" . "\e(5!i\e(B")
    ("OM" . "\e(5!i\e(B")
    (".r" . "\e(5Oh\e(B")
    (".n" . "\e(5"\e(B")
    (".N" . "\e(5!\e(B")
    (".h" . "\e(5h\e(B") ; Halant
    (".." . "\e(5j\e(B")
    (".a" . "\e(5ji\e(B") ; Avagrah
    ;; Digits.
    ("0" . "\e(5q\e(B")
    ("1" . "\e(5r\e(B")
    ("2" . "\e(5s\e(B")
    ("3" . "\e(5t\e(B")
    ("4" . "\e(5u\e(B")
    ("5" . "\e(5v\e(B")
    ("6" . "\e(5w\e(B")
    ("7" . "\e(5x\e(B")
    ("8" . "\e(5y\e(B")
    ("9" . "\e(5z\e(B")
    )
  "Alist of ITRANS mnemonics that are not consonant-plus-vowel units.
It covers independent vowels, signs and digits.  Each element has the
form (ITRANS . IS13194).  The entries are added unchanged to
`indian-itrans-alist'.")
220 | ||
221 | ;; Regular expression matching single Indian character represented | |
222 | ;; by ITRANS. | |
223 | ||
(defvar indian-itrans-regexp
  ;; Every key of `indian-itrans-consonant-alist' must be matchable here
  ;; WITHOUT case folding, because ITRANS distinguishes letter case
  ;; ("t" vs "T", "n" vs "N", "m" vs "M", ...).  Compared with the
  ;; previous pattern, "t", "d", "h", "L" and "JN" are now spelled out;
  ;; they used to match only when `case-fold-search' happened to be t.
  (let ((consonant "\\([cs]hh?\\)\\|[kgjtdTDnpbyr]h?\\|\\(N\\^?\\)\\|\\(JN\\)\\|[mvqKGzfshL]\\|\\(ld?\\)\\|\\(ksh\\)\\|\\(GY\\)\\|\\(\\.Dh?\\)")
        (vowel "\\(a[aiu]\\)\\|\\(ii\\)\\|\\(uu\\)\\|\\([RL]\\^[iI]\\)\\|[AIEOeoaiu]")
        (misc "[MH0-9]\\|\\(AUM\\)\\|\\(OM\\)\\|\\(\\.[rnNh\\.a]\\)")
        (lpre "\\(") (rpre "\\)") (orre "\\|"))
    (concat lpre misc rpre orre
            lpre lpre consonant rpre "?" lpre vowel rpre rpre orre
            lpre consonant rpre))
  "Regexp matching one ITRANS unit.
A unit is a sign or digit, an optional consonant followed by a vowel,
or a consonant alone.  Only the whole match is meaningful; it is looked
up in `indian-itrans-alist'.  Search for this regexp with
`case-fold-search' bound to nil.")

;;
;; Regular expression matching a single stretch of IS 13194 characters
;; that corresponds to one ITRANS unit.
;;

(defvar itrans-indian-regexp
  (let ((vowel "[\e(5$\e(B-\e(52\e(B]")
        (consonant "[\e(53\e(B-\e(5X\e(B]")
        (matra "[\e(5Z\e(B-\e(5g\e(B]")
        (misc "[\e(5q\e(B-\e(5z\e(B]")
        (lpre "\\(") (rpre "\\)") (orre "\\|"))
    (concat misc orre
            lpre consonant matra "?" rpre orre
            vowel))
  "Regexp matching the IS 13194 text that is encoded as one ITRANS unit.
It matches a digit, a consonant optionally followed by a vowel sign, or
an independent vowel.")

;;
;; IS13194 - ITRANS conversion table for strings matching the regexps above.
;;

(defvar indian-itrans-alist
  (let ((cl indian-itrans-consonant-alist)
        (ml indian-itrans-other-letters-alist) rules)
    ;; Every consonant combined with every vowel sign (including the
    ;; empty vowel): keys and values are concatenated pairwise.
    (while cl
      (let ((vl indian-itrans-vowel-sign-alist))
        (while vl
          (setq rules
                (cons (cons (concat (car (car cl)) (car (car vl)))
                            (concat (cdr (car cl)) (cdr (car vl))))
                      rules))
          (setq vl (cdr vl))))
      (setq cl (cdr cl)))
    ;; Independent vowels, signs and digits are copied as they are.
    (while ml
      (setq rules (cons (cons (car (car ml))
                              (cdr (car ml)))
                        rules))
      (setq ml (cdr ml)))
    rules)
  "Alist of (ITRANS . IS13194) pairs used for both conversion directions.
It is built from `indian-itrans-consonant-alist',
`indian-itrans-vowel-sign-alist' and `indian-itrans-other-letters-alist'.
Elements are pushed one by one, so the list is in reverse order of the
source tables.  That order decides which key `rassoc' returns in
`indian-encode-itrans-region' when several keys share one value.")

;;
;; Utility program to convert from ITRANS to IS 13194 in specified region.
;;

(defun indian-decode-itrans-region (from to)
  "Convert `ITRANS' mnemonics of the current region to Indian characters.
When called from a program, expects two arguments,
positions (integers or markers) specifying the stretch of the region."
  (interactive "r")
  (save-restriction
    (narrow-to-region from to)
    (goto-char (point-min))
    ;; ITRANS is case sensitive.  With case folding (the default), the
    ;; sign pattern [MH0-9] also matched "m" and "h", so "ma" was decoded
    ;; as M + Halant followed by the independent vowel A.
    (let ((case-fold-search nil))
      (while (re-search-forward indian-itrans-regexp nil t)
        (let* ((itrans (buffer-substring (match-beginning 0) (match-end 0)))
               (ch (cdr (assoc itrans indian-itrans-alist))))
          ;; Units without a table entry are left as they are.
          (if ch
              (progn
                (delete-region (match-beginning 0) (match-end 0))
                (insert ch))))))
    ;; A consonant written without a vowel was given a Halant above.
    ;; Keep it only where a consonant follows (a conjunct); drop it
    ;; before anything else.  \C0 means "a character that is not in
    ;; category 0 (consonant)".  The previous pattern, [^\c0], was a
    ;; bracket set excluding only the characters backslash, c and 0, so
    ;; it removed the Halant inside conjuncts as well.
    (goto-char (point-min))
    (while (re-search-forward "\\(\e(5h\e(B\\)\\C0" nil t)
      (delete-region (match-beginning 1) (match-end 1)))))
292 | ||
293 | ;; | |
294 | ;; Utility program to convert from IS 13194 to ITRANS in specified region. | |
295 | ;; | |
296 | ||
37cdc7ad KH |
(defun indian-encode-itrans-region (from to)
  "Convert Indian text between FROM and TO to ITRANS mnemonics.
Interactively, operate on the region.  Each stretch of text matched by
`itrans-indian-regexp' is looked up among the values of
`indian-itrans-alist'.  If an entry is found, the stretch is replaced
by the ITRANS key of that entry; otherwise it is left unchanged.
Point ends up at the beginning of the converted text."
  (interactive "r")
  (save-restriction
    (narrow-to-region from to)
    (goto-char (point-min))
    (while (re-search-forward itrans-indian-regexp nil t)
      (let* ((start (match-beginning 0))
             (end (match-end 0))
             ;; First entry whose value equals the matched text, or nil.
             (entry (rassoc (buffer-substring start end)
                            indian-itrans-alist))
             (mnemonic (car entry)))
        (cond (mnemonic
               (delete-region start end)
               (insert mnemonic)))))
    (goto-char (point-min))))
311 | ||
4ed46869 | 312 | ;;; indian.el ends here |