Update copyright year to 2014 by running admin/update-copyright.
[bpt/emacs.git] / lisp / language / tibet-util.el
CommitLineData
4b725a70 1;;; tibet-util.el --- utilities for Tibetan -*- coding: utf-8-emacs; -*-
80d75b56 2
ba318903 3;; Copyright (C) 1997, 2001-2014 Free Software Foundation, Inc.
7976eda0 4;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 5;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
eaa61218
KH
6;; National Institute of Advanced Industrial Science and Technology (AIST)
7;; Registration Number H14PRO021
80d75b56 8
4c120732 9;; Author: Toru TOMABECHI <Toru.Tomabechi@orient.unil.ch>
80d75b56 10;; Keywords: multilingual, Tibetan
4c120732 11;; Created: Feb. 17. 1997
80d75b56
KH
12
13;; This file is part of GNU Emacs.
14
4936186e 15;; GNU Emacs is free software: you can redistribute it and/or modify
80d75b56 16;; it under the terms of the GNU General Public License as published by
4936186e
GM
17;; the Free Software Foundation, either version 3 of the License, or
18;; (at your option) any later version.
80d75b56
KH
19
20;; GNU Emacs is distributed in the hope that it will be useful,
21;; but WITHOUT ANY WARRANTY; without even the implied warranty of
22;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23;; GNU General Public License for more details.
24
25;; You should have received a copy of the GNU General Public License
4936186e 26;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
80d75b56 27
e8af40ee 28;;; History:
80d75b56 29;; 1997.03.13 Modification in treatment of text properties;
e4920bc9 30;; Support for some special signs and punctuation.
3bdf8898 31;; 1999.10.25 Modification for a new composition way by K.Handa.
80d75b56 32
e8af40ee
PJ
33;;; Commentary:
34
80d75b56
KH
35;;; Code:
36
93f7e657 37(defconst tibetan-obsolete-glyphs
4b725a70
PE
38 `(("།" . "།") ; 2 col <-> 1 col
39 ("༏" . "༏")
40 ("༐" . "༐")
41 ("༑" . "༑")
42 ("ཿ" . "ཿ")
43 ("་" . "་")
44 ("༔" . "༔")
93f7e657 45 ;; Yes these are dirty. But ...
4b725a70
PE
46 ("༎ ༎" . ,(compose-string "༎ ༎" 0 3 [?༎ (Br . Bl) ? (Br . Bl) ?༎]))
47 ("༄༅༅" . ,(compose-string
48 "࿁࿂࿂࿂" 0 4
49 [?࿁ (Br . Bl) ?࿂ (Br . Bl) ?࿂ (Br . Bl) ?࿂]))
50 ("༄༅" . ,(compose-string "࿁࿂࿂" 0 3 [?࿁ (Br . Bl) ?࿂ (Br . Bl) ?࿂]))
51 ("༆" . ,(compose-string "࿁࿂༙" 0 3 [?࿁ (Br . Bl) ?࿂ (br . tr) ?༙]))
52 ("༄" . ,(compose-string "࿁࿂" 0 2 [?࿁ (Br . Bl) ?࿂]))))
93f7e657 53
80d75b56 54;;;###autoload
3bdf8898
KH
55(defun tibetan-char-p (ch)
56 "Check if char CH is Tibetan character.
57Returns non-nil if CH is Tibetan. Otherwise, returns nil."
58 (memq (char-charset ch) '(tibetan tibetan-1-column)))
80d75b56 59
3bdf8898 60;;; Functions for Tibetan <-> Tibetan-transcription.
80d75b56
KH
61
62;;;###autoload
3bdf8898
KH
63(defun tibetan-tibetan-to-transcription (str)
64 "Transcribe Tibetan string STR and return the corresponding Roman string."
65 (let (;; Accumulate transcriptions here in reverse order.
66 (trans nil)
67 (len (length str))
68 (i 0)
69 ch this-trans)
70 (while (< i len)
ff69c4ee 71 (let ((idx (string-match tibetan-precomposition-rule-regexp str i)))
3bdf8898
KH
72 (if (eq idx i)
73 ;; Ith character and the followings matches precomposable
74 ;; Tibetan sequence.
75 (setq i (match-end 0)
76 this-trans
77 (car (rassoc
78 (cdr (assoc (match-string 0 str)
79 tibetan-precomposition-rule-alist))
80 tibetan-precomposed-transcription-alist)))
81 (setq ch (substring str i (1+ i))
82 i (1+ i)
83 this-trans
84 (car (or (rassoc ch tibetan-consonant-transcription-alist)
85 (rassoc ch tibetan-vowel-transcription-alist)
86 (rassoc ch tibetan-subjoined-transcription-alist)))))
87 (setq trans (cons this-trans trans))))
88 (apply 'concat (nreverse trans))))
80d75b56 89
3bdf8898
KH
90;;;###autoload
91(defun tibetan-transcription-to-tibetan (str)
92 "Convert Tibetan Roman string STR to Tibetan character string.
93The returned string has no composition information."
94 (let (;; Case is significant.
95 (case-fold-search nil)
96 (idx 0)
97 ;; Accumulate Tibetan strings here in reverse order.
98 (t-str-list nil)
99 i subtrans)
100 (while (setq i (string-match tibetan-regexp str idx))
101 (if (< idx i)
102 ;; STR contains a pattern that doesn't match Tibetan
103 ;; transcription. Include the pattern as is.
104 (setq t-str-list (cons (substring str idx i) t-str-list)))
105 (setq subtrans (match-string 0 str)
106 idx (match-end 0))
107 (let ((t-char (cdr (assoc subtrans
108 tibetan-precomposed-transcription-alist))))
109 (if t-char
110 ;; SUBTRANS corresponds to a transcription for
111 ;; precomposable Tibetan sequence.
112 (setq t-char (car (rassoc t-char
113 tibetan-precomposition-rule-alist)))
114 (setq t-char
115 (cdr
116 (or (assoc subtrans tibetan-consonant-transcription-alist)
117 (assoc subtrans tibetan-vowel-transcription-alist)
118 (assoc subtrans tibetan-modifier-transcription-alist)
119 (assoc subtrans tibetan-subjoined-transcription-alist)))))
120 (setq t-str-list (cons t-char t-str-list))))
121 (if (< idx (length str))
122 (setq t-str-list (cons (substring str idx) t-str-list)))
123 (apply 'concat (nreverse t-str-list))))
80d75b56
KH
124
125;;;
3bdf8898 126;;; Functions for composing/decomposing Tibetan sequence.
80d75b56
KH
127;;;
128;;; A Tibetan syllable is typically structured as follows:
129;;;
130;;; [Prefix] C [C+] V [M] [Suffix [Post suffix]]
131;;;
132;;; where C's are all vertically stacked, V appears below or above
133;;; consonant cluster and M is always put above the C[C+]V combination.
134;;; (Sanskrit visarga, though it is a vowel modifier, is considered
135;;; to be a punctuation.)
136;;;
ff69c4ee 137;;; Here are examples of the words "bsgrubs" and "hfauM"
80d75b56 138;;;
4b725a70 139;;; བསྒྲུབས ཧཱུཾ
80d75b56
KH
140;;;
141;;; M
142;;; b s b s h
ff69c4ee 143;;; g fa
80d75b56
KH
144;;; r u
145;;; u
146;;;
4b725a70 147;;; Consonants `'' (འ), `w' (ཝ), `y' (ཡ), `r' (ར) take special
3bdf8898
KH
148;;; forms when they are used as subjoined consonant. Consonant `r'
149;;; takes another special form when used as superjoined in such a case
150;;; as "rka", while it does not change its form when conjoined with
151;;; subjoined `'', `w' or `y' as in "rwa", "rya".
152
153;; Append a proper composition rule and glyph to COMPONENTS to compose
154;; CHAR with a composition that has COMPONENTS.
155
156(defun tibetan-add-components (components char)
157 (let ((last (last components))
158 (stack-upper '(tc . bc))
159 (stack-under '(bc . tc))
7258f9ca 160 rule comp-vowel tmp)
3bdf8898
KH
161 ;; Special treatment for 'a chung.
162 ;; If 'a follows a consonant, turn it into the subjoined form.
7258f9ca 163 ;; * Disabled by Tomabechi 2000/06/09 *
4b725a70
PE
164 ;; Because in Unicode, འ may follow directly a consonant without
165 ;; any intervening vowel, as in མཁའ་=མ ཁ འ not མ ཁ འ
166 ;;(if (and (= char ?འ)
7258f9ca 167 ;; (aref (char-category-set (car last)) ?0))
4b725a70 168 ;; (setq char ?ཱ)) ;; modified for new font by Tomabechi 1999/12/10
7258f9ca
KH
169
170 ;; Composite vowel signs are decomposed before being added
171 ;; Added by Tomabechi 2000/06/08
4b725a70 172 (if (memq char '(?ཱི ?ཱུ ?ྲྀ ?ཷ ?ླྀ ?ཹ ?ཱྀ))
7258f9ca 173 (setq comp-vowel
ff69c4ee
KH
174 (copy-sequence
175 (cddr (assoc (char-to-string char)
176 tibetan-composite-vowel-alist)))
7258f9ca
KH
177 char
178 (cadr (assoc (char-to-string char)
179 tibetan-composite-vowel-alist))))
3bdf8898
KH
180 (cond
181 ;; Compose upper vowel sign vertically over.
182 ((aref (char-category-set char) ?2)
183 (setq rule stack-upper))
184
185 ;; Compose lower vowel sign vertically under.
186 ((aref (char-category-set char) ?3)
4b725a70 187 (if (or (eq char ?) ;; `' and `཰' should not visible when composed.
8f924df7 188 (eq char #xF70))
6b12c749
KH
189 (setq rule nil)
190 (setq rule stack-under)))
3bdf8898
KH
191 ;; Transform ra-mgo (superscribed r) if followed by a subjoined
192 ;; consonant other than w, ', y, r.
4b725a70
PE
193 ((and (= (car last) ?ར)
194 (not (memq char '(?ྭ ?ཱ ?ྱ ?ྲ))))
195 (setcar last ?) ;; modified for newfont by Tomabechi 1999/12/10
3bdf8898 196 (setq rule stack-under))
3bdf8898
KH
197 ;; Transform initial base consonant if followed by a subjoined
198 ;; consonant but 'a.
199 (t
200 (let ((laststr (char-to-string (car last))))
4b725a70
PE
201 (if (and (/= char ?ཱ) ;; modified for new font by Tomabechi
202 (string-match "[ཀ-ཛྷཞཟལ-ཀྵཪ]" laststr))
3bdf8898
KH
203 (setcar last (string-to-char
204 (cdr (assoc (char-to-string (car last))
205 tibetan-base-to-subjoined-alist)))))
206 (setq rule stack-under))))
207
6b12c749 208 (if rule
7258f9ca
KH
209 (setcdr last (list rule char)))
210 ;; Added by Tomabechi 2000/06/08
211 (if comp-vowel
212 (nconc last comp-vowel))
213 ))
80d75b56
KH
214
215;;;###autoload
216(defun tibetan-compose-string (str)
3bdf8898
KH
217 "Compose Tibetan string STR."
218 (let ((idx 0))
4b725a70 219 ;; `འ' is included in the pattern for subjoined consonants
3bdf8898 220 ;; because we treat it specially in tibetan-add-components.
7258f9ca 221 ;; (This feature is removed by Tomabechi 2000/06/08)
3bdf8898
KH
222 (while (setq idx (string-match tibetan-composable-pattern str idx))
223 (let ((from idx)
224 (to (match-end 0))
225 components)
226 (if (eq (string-match tibetan-precomposition-rule-regexp str idx) idx)
227 (setq idx (match-end 0)
228 components
229 (list (string-to-char
230 (cdr
231 (assoc (match-string 0 str)
232 tibetan-precomposition-rule-alist)))))
233 (setq components (list (aref str idx))
234 idx (1+ idx)))
235 (while (< idx to)
236 (tibetan-add-components components (aref str idx))
237 (setq idx (1+ idx)))
238 (compose-string str from to components))))
239 str)
80d75b56
KH
240
241;;;###autoload
3bdf8898
KH
242(defun tibetan-compose-region (beg end)
243 "Compose Tibetan text the region BEG and END."
80d75b56 244 (interactive "r")
3bdf8898 245 (let (str result chars)
80d75b56
KH
246 (save-excursion
247 (save-restriction
248 (narrow-to-region beg end)
249 (goto-char (point-min))
4b725a70 250 ;; `འ' is included in the pattern for subjoined consonants
3bdf8898 251 ;; because we treat it specially in tibetan-add-components.
7258f9ca 252 ;; (This feature is removed by Tomabechi 2000/06/08)
3bdf8898
KH
253 (while (re-search-forward tibetan-composable-pattern nil t)
254 (let ((from (match-beginning 0))
255 (to (match-end 0))
256 components)
257 (goto-char from)
258 (if (looking-at tibetan-precomposition-rule-regexp)
259 (progn
260 (setq components
261 (list (string-to-char
262 (cdr
263 (assoc (match-string 0)
264 tibetan-precomposition-rule-alist)))))
265 (goto-char (match-end 0)))
266 (setq components (list (char-after from)))
267 (forward-char 1))
268 (while (< (point) to)
269 (tibetan-add-components components (following-char))
270 (forward-char 1))
271 (compose-region from to components)))))))
80d75b56 272
6b12c749
KH
273(defvar tibetan-decompose-precomposition-alist
274 (mapcar (function (lambda (x) (cons (string-to-char (cdr x)) (car x))))
275 tibetan-precomposition-rule-alist))
276
80d75b56 277;;;###autoload
6b12c749
KH
278(defun tibetan-decompose-region (from to)
279 "Decompose Tibetan text in the region FROM and TO.
280This is different from decompose-region because precomposed Tibetan characters
80dadb9e 281are decomposed into normal Tibetan character sequences."
6b12c749
KH
282 (interactive "r")
283 (save-restriction
284 (narrow-to-region from to)
285 (decompose-region from to)
286 (goto-char from)
287 (while (not (eobp))
288 (let* ((char (following-char))
289 (slot (assq char tibetan-decompose-precomposition-alist)))
290 (if slot
291 (progn
292 (delete-char 1)
293 (insert (cdr slot)))
294 (forward-char 1))))))
295
296
3bdf8898 297;;;###autoload
6b12c749
KH
298(defun tibetan-decompose-string (str)
299 "Decompose Tibetan string STR.
300This is different from decompose-string because precomposed Tibetan characters
80dadb9e 301are decomposed into normal Tibetan character sequences."
6b12c749
KH
302 (let ((new "")
303 (len (length str))
304 (idx 0)
305 char slot)
306 (while (< idx len)
307 (setq char (aref str idx)
308 slot (assq (aref str idx) tibetan-decompose-precomposition-alist)
309 new (concat new (if slot (cdr slot) (char-to-string char)))
310 idx (1+ idx)))
311 new))
3bdf8898 312
80d75b56
KH
313;;;
314;;; This variable is used to avoid repeated decomposition.
315;;;
316(setq-default tibetan-decomposed nil)
317
318;;;###autoload
319(defun tibetan-decompose-buffer ()
320 "Decomposes Tibetan characters in the buffer into their components.
3bdf8898 321See also the documentation of the function `tibetan-decompose-region'."
80d75b56
KH
322 (interactive)
323 (make-local-variable 'tibetan-decomposed)
324 (cond ((not tibetan-decomposed)
325 (tibetan-decompose-region (point-min) (point-max))
326 (setq tibetan-decomposed t))))
327
328;;;###autoload
329(defun tibetan-compose-buffer ()
330 "Composes Tibetan character components in the buffer.
331See also docstring of the function tibetan-compose-region."
332 (interactive)
333 (make-local-variable 'tibetan-decomposed)
334 (tibetan-compose-region (point-min) (point-max))
335 (setq tibetan-decomposed nil))
336
337;;;###autoload
338(defun tibetan-post-read-conversion (len)
339 (save-excursion
340 (save-restriction
341 (let ((buffer-modified-p (buffer-modified-p)))
342 (narrow-to-region (point) (+ (point) len))
343 (tibetan-compose-region (point-min) (point-max))
344 (set-buffer-modified-p buffer-modified-p)
73b4b264
KH
345 (make-local-variable 'tibetan-decomposed)
346 (setq tibetan-decomposed nil)
347 (- (point-max) (point-min))))))
80d75b56
KH
348
349
350;;;###autoload
351(defun tibetan-pre-write-conversion (from to)
352 (setq tibetan-decomposed-temp tibetan-decomposed)
7273226d 353 (let ((old-buf (current-buffer)))
76d12415
KH
354 (set-buffer (generate-new-buffer " *temp*"))
355 (if (stringp from)
356 (insert from)
357 (insert-buffer-substring old-buf from to))
358 (if (not tibetan-decomposed-temp)
359 (tibetan-decompose-region (point-min) (point-max)))
5d0ae729
KH
360 ;; Should return nil as annotations.
361 nil))
80d75b56 362
d633b538
KH
363\f
364;;;
365;;; Unicode-related definitions.
a1506d29 366;;;
d633b538
KH
367
368(defvar tibetan-canonicalize-for-unicode-alist
4b725a70
PE
369 '(("" . "") ;; remove vowel a
370 ("ཱི" . "ཱི") ;; decompose vowels whose use is ``discouraged'' in Unicode 3.0
371 ("ཱུ" . "ཱུ")
372 ("ྲྀ" . "ྲྀ")
373 ("ཷ" . "ྲཱྀ")
374 ("ླྀ" . "ླྀ")
375 ("ཹ" . "ླཱྀ")
376 ("ཱྀ" . "ཱྀ"))
d633b538
KH
377 "Rules for canonicalizing Tibetan vowels for Unicode.")
378
379(defvar tibetan-canonicalize-for-unicode-regexp
4b725a70 380 "[ཱཱིུྲྀཷླྀཹཱྀ]"
d633b538
KH
381 "Regexp for Tibetan vowels to be canonicalized in Unicode.")
382
383(defun tibetan-canonicalize-for-unicode-region (from to)
384 (save-restriction
385 (narrow-to-region from to)
386 (goto-char from)
387 (while (re-search-forward tibetan-canonicalize-for-unicode-regexp nil t)
388 (let (
389 ;;(from (match-beginning 0))
390 ;;(to (match-end 0))
391 (canonical-form
392 (cdr (assoc (match-string 0)
393 tibetan-canonicalize-for-unicode-alist))))
394 ;;(goto-char from)
395 ;;(delete-region from to)
396 ;;(insert canonical-form)
397 (replace-match canonical-form)
398 ))))
399
400(defvar tibetan-strict-unicode t
fb7ada5f 401 "Flag to control Tibetan canonicalizing for Unicode.
d633b538
KH
402
403If non-nil, the vowel a is removed and composite vowels are decomposed
404before writing buffer in Unicode. See also
405`tibetan-canonicalize-for-unicode-regexp' and
406`tibetan-canonicalize-for-unicode-alist'.")
407
408;;;###autoload
409(defun tibetan-pre-write-canonicalize-for-unicode (from to)
410 (let ((old-buf (current-buffer))
411 (strict-unicode tibetan-strict-unicode))
412 (set-buffer (generate-new-buffer " *temp*"))
413 (if (stringp from)
414 (insert from)
415 (insert-buffer-substring old-buf from to))
416 (if strict-unicode
417 (tibetan-canonicalize-for-unicode-region (point-min) (point-max)))
418 ;; Should return nil as annotations.
419 nil))
420
650e8505 421(provide 'tibet-util)
80d75b56 422
e8af40ee 423;;; tibet-util.el ends here