Commit | Line | Data |
---|---|---|
4b725a70 | 1 | ;;; tibet-util.el --- utilities for Tibetan -*- coding: utf-8-emacs; -*- |
80d75b56 | 2 | |
ba318903 | 3 | ;; Copyright (C) 1997, 2001-2014 Free Software Foundation, Inc. |
7976eda0 | 4 | ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, |
5df4f04c | 5 | ;; 2005, 2006, 2007, 2008, 2009, 2010, 2011 |
eaa61218 KH |
6 | ;; National Institute of Advanced Industrial Science and Technology (AIST) |
7 | ;; Registration Number H14PRO021 | |
80d75b56 | 8 | |
4c120732 | 9 | ;; Author: Toru TOMABECHI <Toru.Tomabechi@orient.unil.ch> |
80d75b56 | 10 | ;; Keywords: multilingual, Tibetan |
4c120732 | 11 | ;; Created: Feb. 17. 1997 |
80d75b56 KH |
12 | |
13 | ;; This file is part of GNU Emacs. | |
14 | ||
4936186e | 15 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
80d75b56 | 16 | ;; it under the terms of the GNU General Public License as published by |
4936186e GM |
17 | ;; the Free Software Foundation, either version 3 of the License, or |
18 | ;; (at your option) any later version. | |
80d75b56 KH |
19 | |
20 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
21 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
22 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
23 | ;; GNU General Public License for more details. | |
24 | ||
25 | ;; You should have received a copy of the GNU General Public License | |
4936186e | 26 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
80d75b56 | 27 | |
e8af40ee | 28 | ;;; History: |
80d75b56 | 29 | ;; 1997.03.13 Modification in treatment of text properties; |
e4920bc9 | 30 | ;; Support for some special signs and punctuation. |
3bdf8898 | 31 | ;; 1999.10.25 Modification for a new composition way by K.Handa. |
80d75b56 | 32 | |
e8af40ee PJ |
33 | ;;; Commentary: |
34 | ||
80d75b56 KH |
35 | ;;; Code: |
36 | ||
93f7e657 | 37 | (defconst tibetan-obsolete-glyphs |
4b725a70 PE |
38 | `(("།" . "།") ; 2 col <-> 1 col |
39 | ("༏" . "༏") | |
40 | ("༐" . "༐") | |
41 | ("༑" . "༑") | |
42 | ("ཿ" . "ཿ") | |
43 | ("་" . "་") | |
44 | ("༔" . "༔") | |
93f7e657 | 45 | ;; Yes these are dirty. But ... |
4b725a70 PE |
46 | ("༎ ༎" . ,(compose-string "༎ ༎" 0 3 [?༎ (Br . Bl) ? (Br . Bl) ?༎])) |
47 | ("༄༅༅" . ,(compose-string | |
48 | "࿁࿂࿂࿂" 0 4 | |
49 | [?࿁ (Br . Bl) ?࿂ (Br . Bl) ?࿂ (Br . Bl) ?࿂])) | |
50 | ("༄༅" . ,(compose-string "࿁࿂࿂" 0 3 [?࿁ (Br . Bl) ?࿂ (Br . Bl) ?࿂])) | |
51 | ("༆" . ,(compose-string "࿁࿂༙" 0 3 [?࿁ (Br . Bl) ?࿂ (br . tr) ?༙])) | |
52 | ("༄" . ,(compose-string "࿁࿂" 0 2 [?࿁ (Br . Bl) ?࿂])))) | |
93f7e657 | 53 | |
80d75b56 | 54 | ;;;###autoload |
3bdf8898 KH |
55 | (defun tibetan-char-p (ch) |
56 | "Check if char CH is Tibetan character. | |
57 | Returns non-nil if CH is Tibetan. Otherwise, returns nil." | |
58 | (memq (char-charset ch) '(tibetan tibetan-1-column))) | |
80d75b56 | 59 | |
3bdf8898 | 60 | ;;; Functions for Tibetan <-> Tibetan-transcription. |
80d75b56 KH |
61 | |
62 | ;;;###autoload | |
3bdf8898 KH |
63 | (defun tibetan-tibetan-to-transcription (str) |
64 | "Transcribe Tibetan string STR and return the corresponding Roman string." | |
65 | (let (;; Accumulate transcriptions here in reverse order. | |
66 | (trans nil) | |
67 | (len (length str)) | |
68 | (i 0) | |
69 | ch this-trans) | |
70 | (while (< i len) | |
ff69c4ee | 71 | (let ((idx (string-match tibetan-precomposition-rule-regexp str i))) |
3bdf8898 KH |
72 | (if (eq idx i) |
73 | ;; Ith character and the followings matches precomposable | |
74 | ;; Tibetan sequence. | |
75 | (setq i (match-end 0) | |
76 | this-trans | |
77 | (car (rassoc | |
78 | (cdr (assoc (match-string 0 str) | |
79 | tibetan-precomposition-rule-alist)) | |
80 | tibetan-precomposed-transcription-alist))) | |
81 | (setq ch (substring str i (1+ i)) | |
82 | i (1+ i) | |
83 | this-trans | |
84 | (car (or (rassoc ch tibetan-consonant-transcription-alist) | |
85 | (rassoc ch tibetan-vowel-transcription-alist) | |
86 | (rassoc ch tibetan-subjoined-transcription-alist))))) | |
87 | (setq trans (cons this-trans trans)))) | |
88 | (apply 'concat (nreverse trans)))) | |
80d75b56 | 89 | |
3bdf8898 KH |
90 | ;;;###autoload |
91 | (defun tibetan-transcription-to-tibetan (str) | |
92 | "Convert Tibetan Roman string STR to Tibetan character string. | |
93 | The returned string has no composition information." | |
94 | (let (;; Case is significant. | |
95 | (case-fold-search nil) | |
96 | (idx 0) | |
97 | ;; Accumulate Tibetan strings here in reverse order. | |
98 | (t-str-list nil) | |
99 | i subtrans) | |
100 | (while (setq i (string-match tibetan-regexp str idx)) | |
101 | (if (< idx i) | |
102 | ;; STR contains a pattern that doesn't match Tibetan | |
103 | ;; transcription. Include the pattern as is. | |
104 | (setq t-str-list (cons (substring str idx i) t-str-list))) | |
105 | (setq subtrans (match-string 0 str) | |
106 | idx (match-end 0)) | |
107 | (let ((t-char (cdr (assoc subtrans | |
108 | tibetan-precomposed-transcription-alist)))) | |
109 | (if t-char | |
110 | ;; SUBTRANS corresponds to a transcription for | |
111 | ;; precomposable Tibetan sequence. | |
112 | (setq t-char (car (rassoc t-char | |
113 | tibetan-precomposition-rule-alist))) | |
114 | (setq t-char | |
115 | (cdr | |
116 | (or (assoc subtrans tibetan-consonant-transcription-alist) | |
117 | (assoc subtrans tibetan-vowel-transcription-alist) | |
118 | (assoc subtrans tibetan-modifier-transcription-alist) | |
119 | (assoc subtrans tibetan-subjoined-transcription-alist))))) | |
120 | (setq t-str-list (cons t-char t-str-list)))) | |
121 | (if (< idx (length str)) | |
122 | (setq t-str-list (cons (substring str idx) t-str-list))) | |
123 | (apply 'concat (nreverse t-str-list)))) | |
80d75b56 KH |
124 | |
125 | ;;; | |
3bdf8898 | 126 | ;;; Functions for composing/decomposing Tibetan sequence. |
80d75b56 KH |
127 | ;;; |
128 | ;;; A Tibetan syllable is typically structured as follows: | |
129 | ;;; | |
130 | ;;; [Prefix] C [C+] V [M] [Suffix [Post suffix]] | |
131 | ;;; | |
132 | ;;; where C's are all vertically stacked, V appears below or above | |
133 | ;;; consonant cluster and M is always put above the C[C+]V combination. | |
134 | ;;; (Sanskrit visarga, though it is a vowel modifier, is considered | |
135 | ;;; to be a punctuation.) | |
136 | ;;; | |
ff69c4ee | 137 | ;;; Here are examples of the words "bsgrubs" and "hfauM" |
80d75b56 | 138 | ;;; |
4b725a70 | 139 | ;;; བསྒྲུབས ཧཱུཾ |
80d75b56 KH |
140 | ;;; |
141 | ;;; M | |
142 | ;;; b s b s h | |
ff69c4ee | 143 | ;;; g fa |
80d75b56 KH |
144 | ;;; r u |
145 | ;;; u | |
146 | ;;; | |
4b725a70 | 147 | ;;; Consonants `'' (འ), `w' (ཝ), `y' (ཡ), `r' (ར) take special |
3bdf8898 KH |
148 | ;;; forms when they are used as subjoined consonant. Consonant `r' |
149 | ;;; takes another special form when used as superjoined in such a case | |
150 | ;;; as "rka", while it does not change its form when conjoined with | |
151 | ;;; subjoined `'', `w' or `y' as in "rwa", "rya". | |
152 | ||
153 | ;; Append a proper composition rule and glyph to COMPONENTS to compose | |
154 | ;; CHAR with a composition that has COMPONENTS. | |
155 | ||
156 | (defun tibetan-add-components (components char) | |
157 | (let ((last (last components)) | |
158 | (stack-upper '(tc . bc)) | |
159 | (stack-under '(bc . tc)) | |
7258f9ca | 160 | rule comp-vowel tmp) |
3bdf8898 KH |
161 | ;; Special treatment for 'a chung. |
162 | ;; If 'a follows a consonant, turn it into the subjoined form. | |
7258f9ca | 163 | ;; * Disabled by Tomabechi 2000/06/09 * |
4b725a70 PE |
164 | ;; Because in Unicode, འ may follow directly a consonant without |
165 | ;; any intervening vowel, as in མཁའ་=མ ཁ འ not མ ཁ འ | |
166 | ;;(if (and (= char ?འ) | |
7258f9ca | 167 | ;; (aref (char-category-set (car last)) ?0)) |
4b725a70 | 168 | ;; (setq char ?ཱ)) ;; modified for new font by Tomabechi 1999/12/10 |
7258f9ca KH |
169 | |
170 | ;; Composite vowel signs are decomposed before being added | |
171 | ;; Added by Tomabechi 2000/06/08 | |
4b725a70 | 172 | (if (memq char '(?ཱི ?ཱུ ?ྲྀ ?ཷ ?ླྀ ?ཹ ?ཱྀ)) |
7258f9ca | 173 | (setq comp-vowel |
ff69c4ee KH |
174 | (copy-sequence |
175 | (cddr (assoc (char-to-string char) | |
176 | tibetan-composite-vowel-alist))) | |
7258f9ca KH |
177 | char |
178 | (cadr (assoc (char-to-string char) | |
179 | tibetan-composite-vowel-alist)))) | |
3bdf8898 KH |
180 | (cond |
181 | ;; Compose upper vowel sign vertically over. | |
182 | ((aref (char-category-set char) ?2) | |
183 | (setq rule stack-upper)) | |
184 | ||
185 | ;; Compose lower vowel sign vertically under. | |
186 | ((aref (char-category-set char) ?3) | |
4b725a70 | 187 | (if (or (eq char ?) ;; `' and `' should not visible when composed. |
8f924df7 | 188 | (eq char #xF70)) |
6b12c749 KH |
189 | (setq rule nil) |
190 | (setq rule stack-under))) | |
3bdf8898 KH |
191 | ;; Transform ra-mgo (superscribed r) if followed by a subjoined |
192 | ;; consonant other than w, ', y, r. | |
4b725a70 PE |
193 | ((and (= (car last) ?ར) |
194 | (not (memq char '(?ྭ ?ཱ ?ྱ ?ྲ)))) | |
195 | (setcar last ?) ;; modified for newfont by Tomabechi 1999/12/10 | |
3bdf8898 | 196 | (setq rule stack-under)) |
3bdf8898 KH |
197 | ;; Transform initial base consonant if followed by a subjoined |
198 | ;; consonant but 'a. | |
199 | (t | |
200 | (let ((laststr (char-to-string (car last)))) | |
4b725a70 PE |
201 | (if (and (/= char ?ཱ) ;; modified for new font by Tomabechi |
202 | (string-match "[ཀ-ཛྷཞཟལ-ཀྵཪ]" laststr)) | |
3bdf8898 KH |
203 | (setcar last (string-to-char |
204 | (cdr (assoc (char-to-string (car last)) | |
205 | tibetan-base-to-subjoined-alist))))) | |
206 | (setq rule stack-under)))) | |
207 | ||
6b12c749 | 208 | (if rule |
7258f9ca KH |
209 | (setcdr last (list rule char))) |
210 | ;; Added by Tomabechi 2000/06/08 | |
211 | (if comp-vowel | |
212 | (nconc last comp-vowel)) | |
213 | )) | |
80d75b56 KH |
214 | |
215 | ;;;###autoload | |
216 | (defun tibetan-compose-string (str) | |
3bdf8898 KH |
217 | "Compose Tibetan string STR." |
218 | (let ((idx 0)) | |
4b725a70 | 219 | ;; `འ' is included in the pattern for subjoined consonants |
3bdf8898 | 220 | ;; because we treat it specially in tibetan-add-components. |
7258f9ca | 221 | ;; (This feature is removed by Tomabechi 2000/06/08) |
3bdf8898 KH |
222 | (while (setq idx (string-match tibetan-composable-pattern str idx)) |
223 | (let ((from idx) | |
224 | (to (match-end 0)) | |
225 | components) | |
226 | (if (eq (string-match tibetan-precomposition-rule-regexp str idx) idx) | |
227 | (setq idx (match-end 0) | |
228 | components | |
229 | (list (string-to-char | |
230 | (cdr | |
231 | (assoc (match-string 0 str) | |
232 | tibetan-precomposition-rule-alist))))) | |
233 | (setq components (list (aref str idx)) | |
234 | idx (1+ idx))) | |
235 | (while (< idx to) | |
236 | (tibetan-add-components components (aref str idx)) | |
237 | (setq idx (1+ idx))) | |
238 | (compose-string str from to components)))) | |
239 | str) | |
80d75b56 KH |
240 | |
241 | ;;;###autoload | |
3bdf8898 KH |
242 | (defun tibetan-compose-region (beg end) |
243 | "Compose Tibetan text the region BEG and END." | |
80d75b56 | 244 | (interactive "r") |
3bdf8898 | 245 | (let (str result chars) |
80d75b56 KH |
246 | (save-excursion |
247 | (save-restriction | |
248 | (narrow-to-region beg end) | |
249 | (goto-char (point-min)) | |
4b725a70 | 250 | ;; `འ' is included in the pattern for subjoined consonants |
3bdf8898 | 251 | ;; because we treat it specially in tibetan-add-components. |
7258f9ca | 252 | ;; (This feature is removed by Tomabechi 2000/06/08) |
3bdf8898 KH |
253 | (while (re-search-forward tibetan-composable-pattern nil t) |
254 | (let ((from (match-beginning 0)) | |
255 | (to (match-end 0)) | |
256 | components) | |
257 | (goto-char from) | |
258 | (if (looking-at tibetan-precomposition-rule-regexp) | |
259 | (progn | |
260 | (setq components | |
261 | (list (string-to-char | |
262 | (cdr | |
263 | (assoc (match-string 0) | |
264 | tibetan-precomposition-rule-alist))))) | |
265 | (goto-char (match-end 0))) | |
266 | (setq components (list (char-after from))) | |
267 | (forward-char 1)) | |
268 | (while (< (point) to) | |
269 | (tibetan-add-components components (following-char)) | |
270 | (forward-char 1)) | |
271 | (compose-region from to components))))))) | |
80d75b56 | 272 | |
6b12c749 KH |
273 | (defvar tibetan-decompose-precomposition-alist |
274 | (mapcar (function (lambda (x) (cons (string-to-char (cdr x)) (car x)))) | |
275 | tibetan-precomposition-rule-alist)) | |
276 | ||
80d75b56 | 277 | ;;;###autoload |
6b12c749 KH |
278 | (defun tibetan-decompose-region (from to) |
279 | "Decompose Tibetan text in the region FROM and TO. | |
280 | This is different from decompose-region because precomposed Tibetan characters | |
80dadb9e | 281 | are decomposed into normal Tibetan character sequences." |
6b12c749 KH |
282 | (interactive "r") |
283 | (save-restriction | |
284 | (narrow-to-region from to) | |
285 | (decompose-region from to) | |
286 | (goto-char from) | |
287 | (while (not (eobp)) | |
288 | (let* ((char (following-char)) | |
289 | (slot (assq char tibetan-decompose-precomposition-alist))) | |
290 | (if slot | |
291 | (progn | |
292 | (delete-char 1) | |
293 | (insert (cdr slot))) | |
294 | (forward-char 1)))))) | |
295 | ||
296 | ||
3bdf8898 | 297 | ;;;###autoload |
6b12c749 KH |
298 | (defun tibetan-decompose-string (str) |
299 | "Decompose Tibetan string STR. | |
300 | This is different from decompose-string because precomposed Tibetan characters | |
80dadb9e | 301 | are decomposed into normal Tibetan character sequences." |
6b12c749 KH |
302 | (let ((new "") |
303 | (len (length str)) | |
304 | (idx 0) | |
305 | char slot) | |
306 | (while (< idx len) | |
307 | (setq char (aref str idx) | |
308 | slot (assq (aref str idx) tibetan-decompose-precomposition-alist) | |
309 | new (concat new (if slot (cdr slot) (char-to-string char))) | |
310 | idx (1+ idx))) | |
311 | new)) | |
3bdf8898 | 312 | |
80d75b56 KH |
313 | ;;; |
314 | ;;; This variable is used to avoid repeated decomposition. | |
315 | ;;; | |
316 | (setq-default tibetan-decomposed nil) | |
317 | ||
318 | ;;;###autoload | |
319 | (defun tibetan-decompose-buffer () | |
320 | "Decomposes Tibetan characters in the buffer into their components. | |
3bdf8898 | 321 | See also the documentation of the function `tibetan-decompose-region'." |
80d75b56 KH |
322 | (interactive) |
323 | (make-local-variable 'tibetan-decomposed) | |
324 | (cond ((not tibetan-decomposed) | |
325 | (tibetan-decompose-region (point-min) (point-max)) | |
326 | (setq tibetan-decomposed t)))) | |
327 | ||
328 | ;;;###autoload | |
329 | (defun tibetan-compose-buffer () | |
330 | "Composes Tibetan character components in the buffer. | |
331 | See also docstring of the function tibetan-compose-region." | |
332 | (interactive) | |
333 | (make-local-variable 'tibetan-decomposed) | |
334 | (tibetan-compose-region (point-min) (point-max)) | |
335 | (setq tibetan-decomposed nil)) | |
336 | ||
337 | ;;;###autoload | |
338 | (defun tibetan-post-read-conversion (len) | |
339 | (save-excursion | |
340 | (save-restriction | |
341 | (let ((buffer-modified-p (buffer-modified-p))) | |
342 | (narrow-to-region (point) (+ (point) len)) | |
343 | (tibetan-compose-region (point-min) (point-max)) | |
344 | (set-buffer-modified-p buffer-modified-p) | |
73b4b264 KH |
345 | (make-local-variable 'tibetan-decomposed) |
346 | (setq tibetan-decomposed nil) | |
347 | (- (point-max) (point-min)))))) | |
80d75b56 KH |
348 | |
349 | ||
350 | ;;;###autoload | |
351 | (defun tibetan-pre-write-conversion (from to) | |
352 | (setq tibetan-decomposed-temp tibetan-decomposed) | |
7273226d | 353 | (let ((old-buf (current-buffer))) |
76d12415 KH |
354 | (set-buffer (generate-new-buffer " *temp*")) |
355 | (if (stringp from) | |
356 | (insert from) | |
357 | (insert-buffer-substring old-buf from to)) | |
358 | (if (not tibetan-decomposed-temp) | |
359 | (tibetan-decompose-region (point-min) (point-max))) | |
5d0ae729 KH |
360 | ;; Should return nil as annotations. |
361 | nil)) | |
80d75b56 | 362 | |
d633b538 KH |
363 | \f |
364 | ;;; | |
365 | ;;; Unicode-related definitions. | |
a1506d29 | 366 | ;;; |
d633b538 KH |
367 | |
368 | (defvar tibetan-canonicalize-for-unicode-alist | |
4b725a70 PE |
369 | '(("" . "") ;; remove vowel a |
370 | ("ཱི" . "ཱི") ;; decompose vowels whose use is ``discouraged'' in Unicode 3.0 | |
371 | ("ཱུ" . "ཱུ") | |
372 | ("ྲྀ" . "ྲྀ") | |
373 | ("ཷ" . "ྲཱྀ") | |
374 | ("ླྀ" . "ླྀ") | |
375 | ("ཹ" . "ླཱྀ") | |
376 | ("ཱྀ" . "ཱྀ")) | |
d633b538 KH |
377 | "Rules for canonicalizing Tibetan vowels for Unicode.") |
378 | ||
379 | (defvar tibetan-canonicalize-for-unicode-regexp | |
4b725a70 | 380 | "[ཱཱིུྲྀཷླྀཹཱྀ]" |
d633b538 KH |
381 | "Regexp for Tibetan vowels to be canonicalized in Unicode.") |
382 | ||
383 | (defun tibetan-canonicalize-for-unicode-region (from to) | |
384 | (save-restriction | |
385 | (narrow-to-region from to) | |
386 | (goto-char from) | |
387 | (while (re-search-forward tibetan-canonicalize-for-unicode-regexp nil t) | |
388 | (let ( | |
389 | ;;(from (match-beginning 0)) | |
390 | ;;(to (match-end 0)) | |
391 | (canonical-form | |
392 | (cdr (assoc (match-string 0) | |
393 | tibetan-canonicalize-for-unicode-alist)))) | |
394 | ;;(goto-char from) | |
395 | ;;(delete-region from to) | |
396 | ;;(insert canonical-form) | |
397 | (replace-match canonical-form) | |
398 | )))) | |
399 | ||
400 | (defvar tibetan-strict-unicode t | |
fb7ada5f | 401 | "Flag to control Tibetan canonicalizing for Unicode. |
d633b538 KH |
402 | |
403 | If non-nil, the vowel a is removed and composite vowels are decomposed | |
404 | before writing buffer in Unicode. See also | |
405 | `tibetan-canonicalize-for-unicode-regexp' and | |
406 | `tibetan-canonicalize-for-unicode-alist'.") | |
407 | ||
408 | ;;;###autoload | |
409 | (defun tibetan-pre-write-canonicalize-for-unicode (from to) | |
410 | (let ((old-buf (current-buffer)) | |
411 | (strict-unicode tibetan-strict-unicode)) | |
412 | (set-buffer (generate-new-buffer " *temp*")) | |
413 | (if (stringp from) | |
414 | (insert from) | |
415 | (insert-buffer-substring old-buf from to)) | |
416 | (if strict-unicode | |
417 | (tibetan-canonicalize-for-unicode-region (point-min) (point-max))) | |
418 | ;; Should return nil as annotations. | |
419 | nil)) | |
420 | ||
650e8505 | 421 | (provide 'tibet-util) |
80d75b56 | 422 | |
e8af40ee | 423 | ;;; tibet-util.el ends here |