Fix copyrights.
[bpt/emacs.git] / lisp / language / tibet-util.el
CommitLineData
348f797a 1;;; tibet-util.el --- utilities for Tibetan -*- coding: iso-2022-7bit; -*-
80d75b56 2
eaa61218
KH
3;; Copyright (C) 1997, 2002 Free Software Foundation, Inc.
4;; Copyright (C) 1995, 1997, 1998, 2000
5;; National Institute of Advanced Industrial Science and Technology (AIST)
6;; Registration Number H14PRO021
80d75b56
KH
7
8;; Keywords: multilingual, Tibetan
9
10;; This file is part of GNU Emacs.
11
12;; GNU Emacs is free software; you can redistribute it and/or modify
13;; it under the terms of the GNU General Public License as published by
14;; the Free Software Foundation; either version 2, or (at your option)
15;; any later version.
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
23;; along with GNU Emacs; see the file COPYING. If not, write to the
24;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25;; Boston, MA 02111-1307, USA.
26
27;; Author: Toru TOMABECHI, <Toru.Tomabechi@orient.unil.ch>
28
29;; Created: Feb. 17. 1997
30
e8af40ee 31;;; History:
80d75b56
KH
32;; 1997.03.13 Modification in treatment of text properties;
33;; Support for some special signs and punctuations.
3bdf8898 34;; 1999.10.25 Modification for a new composition way by K.Handa.
80d75b56 35
e8af40ee
PJ
36;;; Commentary:
37
80d75b56
KH
38;;; Code:
39
93f7e657
RS
40(defconst tibetan-obsolete-glyphs
41 `(("\e$(7!=\e(B" . "\e$(8!=\e(B") ; 2 col <-> 1 col
42 ("\e$(7!?\e(B" . "\e$(8!?\e(B")
43 ("\e$(7!@\e(B" . "\e$(8!@\e(B")
44 ("\e$(7!A\e(B" . "\e$(8!A\e(B")
45 ("\e$(7"`\e(B" . "\e$(8"`\e(B")
46 ("\e$(7!;\e(B" . "\e$(8!;\e(B")
47 ("\e$(7!D\e(B" . "\e$(8!D\e(B")
48 ;; Yes these are dirty. But ...
49 ("\e$(7!>\e(B \e$(7!>\e(B" . ,(compose-string "\e$(7!>\e(B \e$(7!>\e(B" 0 3 [?\e$(7!>\e(B (Br . Bl) ? (Br . Bl) ?\e$(7!>\e(B]))
50 ("\e$(7!4!5!5\e(B" . ,(compose-string
51 "\e$(7#R#S#S#S\e(B" 0 4
52 [?\e$(7#R\e(B (Br . Bl) ?\e$(7#S\e(B (Br . Bl) ?\e$(7#S\e(B (Br . Bl) ?\e$(7#S\e(B]))
53 ("\e$(7!4!5\e(B" . ,(compose-string "\e$(7#R#S#S\e(B" 0 3 [?\e$(7#R\e(B (Br . Bl) ?\e$(7#S\e(B (Br . Bl) ?\e$(7#S\e(B]))
54 ("\e$(7!6\e(B" . ,(compose-string "\e$(7#R#S!I\e(B" 0 3 [?\e$(7#R\e(B (Br . Bl) ?\e$(7#S\e(B (br . tr) ?\e$(7!I\e(B]))
55 ("\e$(7!4\e(B" . ,(compose-string "\e$(7#R#S\e(B" 0 2 [?\e$(7#R\e(B (Br . Bl) ?\e$(7#S\e(B]))))
56
80d75b56 57;;;###autoload
3bdf8898
KH
58(defun tibetan-char-p (ch)
59 "Check if char CH is Tibetan character.
60Returns non-nil if CH is Tibetan. Otherwise, returns nil."
61 (memq (char-charset ch) '(tibetan tibetan-1-column)))
80d75b56 62
3bdf8898 63;;; Functions for Tibetan <-> Tibetan-transcription.
80d75b56
KH
64
65;;;###autoload
3bdf8898
KH
66(defun tibetan-tibetan-to-transcription (str)
67 "Transcribe Tibetan string STR and return the corresponding Roman string."
68 (let (;; Accumulate transcriptions here in reverse order.
69 (trans nil)
70 (len (length str))
71 (i 0)
72 ch this-trans)
73 (while (< i len)
ff69c4ee 74 (let ((idx (string-match tibetan-precomposition-rule-regexp str i)))
3bdf8898
KH
75 (if (eq idx i)
76 ;; Ith character and the followings matches precomposable
77 ;; Tibetan sequence.
78 (setq i (match-end 0)
79 this-trans
80 (car (rassoc
81 (cdr (assoc (match-string 0 str)
82 tibetan-precomposition-rule-alist))
83 tibetan-precomposed-transcription-alist)))
84 (setq ch (substring str i (1+ i))
85 i (1+ i)
86 this-trans
87 (car (or (rassoc ch tibetan-consonant-transcription-alist)
88 (rassoc ch tibetan-vowel-transcription-alist)
89 (rassoc ch tibetan-subjoined-transcription-alist)))))
90 (setq trans (cons this-trans trans))))
91 (apply 'concat (nreverse trans))))
80d75b56 92
3bdf8898
KH
93;;;###autoload
94(defun tibetan-transcription-to-tibetan (str)
95 "Convert Tibetan Roman string STR to Tibetan character string.
96The returned string has no composition information."
97 (let (;; Case is significant.
98 (case-fold-search nil)
99 (idx 0)
100 ;; Accumulate Tibetan strings here in reverse order.
101 (t-str-list nil)
102 i subtrans)
103 (while (setq i (string-match tibetan-regexp str idx))
104 (if (< idx i)
105 ;; STR contains a pattern that doesn't match Tibetan
106 ;; transcription. Include the pattern as is.
107 (setq t-str-list (cons (substring str idx i) t-str-list)))
108 (setq subtrans (match-string 0 str)
109 idx (match-end 0))
110 (let ((t-char (cdr (assoc subtrans
111 tibetan-precomposed-transcription-alist))))
112 (if t-char
113 ;; SUBTRANS corresponds to a transcription for
114 ;; precomposable Tibetan sequence.
115 (setq t-char (car (rassoc t-char
116 tibetan-precomposition-rule-alist)))
117 (setq t-char
118 (cdr
119 (or (assoc subtrans tibetan-consonant-transcription-alist)
120 (assoc subtrans tibetan-vowel-transcription-alist)
121 (assoc subtrans tibetan-modifier-transcription-alist)
122 (assoc subtrans tibetan-subjoined-transcription-alist)))))
123 (setq t-str-list (cons t-char t-str-list))))
124 (if (< idx (length str))
125 (setq t-str-list (cons (substring str idx) t-str-list)))
126 (apply 'concat (nreverse t-str-list))))
80d75b56
KH
127
128;;;
3bdf8898 129;;; Functions for composing/decomposing Tibetan sequence.
80d75b56
KH
130;;;
131;;; A Tibetan syllable is typically structured as follows:
132;;;
133;;; [Prefix] C [C+] V [M] [Suffix [Post suffix]]
134;;;
135;;; where C's are all vertically stacked, V appears below or above
136;;; consonant cluster and M is always put above the C[C+]V combination.
137;;; (Sanskrit visarga, though it is a vowel modifier, is considered
138;;; to be a punctuation.)
139;;;
ff69c4ee 140;;; Here are examples of the words "bsgrubs" and "hfauM"
80d75b56 141;;;
ff69c4ee 142;;; \e4\e$(7"7\e0"7\e1\e4%qx!"U\e0"G###C"U\e1\e4"7\e0"7\e1\e4"G\e0"G\e1\e(B \e4\e$(7"Hx!"Rx!"Ur'"_\e0"H"R"U"_\e1\e(B
80d75b56
KH
143;;;
144;;; M
145;;; b s b s h
ff69c4ee 146;;; g fa
80d75b56
KH
147;;; r u
148;;; u
149;;;
3bdf8898
KH
150;;; Consonants `'' (\e$(7"A\e(B), `w' (\e$(7">\e(B), `y' (\e$(7"B\e(B), `r' (\e$(7"C\e(B) take special
151;;; forms when they are used as subjoined consonant. Consonant `r'
152;;; takes another special form when used as superjoined in such a case
153;;; as "rka", while it does not change its form when conjoined with
154;;; subjoined `'', `w' or `y' as in "rwa", "rya".
155
156;; Append a proper composition rule and glyph to COMPONENTS to compose
157;; CHAR with a composition that has COMPONENTS.
158
159(defun tibetan-add-components (components char)
160 (let ((last (last components))
161 (stack-upper '(tc . bc))
162 (stack-under '(bc . tc))
7258f9ca 163 rule comp-vowel tmp)
3bdf8898
KH
164 ;; Special treatment for 'a chung.
165 ;; If 'a follows a consonant, turn it into the subjoined form.
7258f9ca
KH
166 ;; * Disabled by Tomabechi 2000/06/09 *
167 ;; Because in Unicode, \e$(7"A\e(B may follow directly a consonant without
a1506d29 168 ;; any intervening vowel, as in \e4\e$(7"9\e0"9\e1\e4""\e0"""Q\e1\e4"A\e0"A\e1!;\e(B=\e4\e$(7"9\e0"9\e1\e(B \e4\e$(7""\e0""\e1\e(B \e4\e$(7"A\e0"A\e1\e(B not \e4\e$(7"9\e0"9\e1\e(B \e4\e$(7""\e0""\e1\e(B \e$(7"Q\e(B \e4\e$(7"A\e0"A\e1\e(B
7258f9ca
KH
169 ;;(if (and (= char ?\e$(7"A\e(B)
170 ;; (aref (char-category-set (car last)) ?0))
171 ;; (setq char ?\e$(7"R\e(B)) ;; modified for new font by Tomabechi 1999/12/10
172
173 ;; Composite vowel signs are decomposed before being added
174 ;; Added by Tomabechi 2000/06/08
175 (if (memq char '(?\e$(7"T\e(B ?\e$(7"V\e(B ?\e$(7"W\e(B ?\e$(7"X\e(B ?\e$(7"Y\e(B ?\e$(7"Z\e(B ?\e$(7"b\e(B))
176 (setq comp-vowel
ff69c4ee
KH
177 (copy-sequence
178 (cddr (assoc (char-to-string char)
179 tibetan-composite-vowel-alist)))
7258f9ca
KH
180 char
181 (cadr (assoc (char-to-string char)
182 tibetan-composite-vowel-alist))))
3bdf8898
KH
183 (cond
184 ;; Compose upper vowel sign vertically over.
185 ((aref (char-category-set char) ?2)
186 (setq rule stack-upper))
187
188 ;; Compose lower vowel sign vertically under.
189 ((aref (char-category-set char) ?3)
6b12c749
KH
190 (if (eq char ?\e$(7"Q\e(B) ;; `\e$(7"Q\e(B' should not visible when composed.
191 (setq rule nil)
192 (setq rule stack-under)))
3bdf8898
KH
193 ;; Transform ra-mgo (superscribed r) if followed by a subjoined
194 ;; consonant other than w, ', y, r.
195 ((and (= (car last) ?\e$(7"C\e(B)
6b12c749
KH
196 (not (memq char '(?\e$(7#>\e(B ?\e$(7"R\e(B ?\e$(7#B\e(B ?\e$(7#C\e(B))))
197 (setcar last ?\e$(7!"\e(B) ;; modified for newfont by Tomabechi 1999/12/10
3bdf8898 198 (setq rule stack-under))
3bdf8898
KH
199 ;; Transform initial base consonant if followed by a subjoined
200 ;; consonant but 'a.
201 (t
202 (let ((laststr (char-to-string (car last))))
6b12c749
KH
203 (if (and (/= char ?\e$(7"R\e(B) ;; modified for new font by Tomabechi
204 (string-match "[\e$(7"!\e(B-\e$(7"="?"@"D\e(B-\e$(7"J"K\e(B]" laststr))
3bdf8898
KH
205 (setcar last (string-to-char
206 (cdr (assoc (char-to-string (car last))
207 tibetan-base-to-subjoined-alist)))))
208 (setq rule stack-under))))
209
6b12c749 210 (if rule
7258f9ca
KH
211 (setcdr last (list rule char)))
212 ;; Added by Tomabechi 2000/06/08
213 (if comp-vowel
214 (nconc last comp-vowel))
215 ))
80d75b56
KH
216
217;;;###autoload
218(defun tibetan-compose-string (str)
3bdf8898
KH
219 "Compose Tibetan string STR."
220 (let ((idx 0))
221 ;; `\e$(7"A\e(B' is included in the pattern for subjoined consonants
222 ;; because we treat it specially in tibetan-add-components.
7258f9ca 223 ;; (This feature is removed by Tomabechi 2000/06/08)
3bdf8898
KH
224 (while (setq idx (string-match tibetan-composable-pattern str idx))
225 (let ((from idx)
226 (to (match-end 0))
227 components)
228 (if (eq (string-match tibetan-precomposition-rule-regexp str idx) idx)
229 (setq idx (match-end 0)
230 components
231 (list (string-to-char
232 (cdr
233 (assoc (match-string 0 str)
234 tibetan-precomposition-rule-alist)))))
235 (setq components (list (aref str idx))
236 idx (1+ idx)))
237 (while (< idx to)
238 (tibetan-add-components components (aref str idx))
239 (setq idx (1+ idx)))
240 (compose-string str from to components))))
241 str)
80d75b56
KH
242
243;;;###autoload
3bdf8898
KH
244(defun tibetan-compose-region (beg end)
245 "Compose Tibetan text the region BEG and END."
80d75b56 246 (interactive "r")
3bdf8898 247 (let (str result chars)
80d75b56
KH
248 (save-excursion
249 (save-restriction
250 (narrow-to-region beg end)
251 (goto-char (point-min))
3bdf8898
KH
252 ;; `\e$(7"A\e(B' is included in the pattern for subjoined consonants
253 ;; because we treat it specially in tibetan-add-components.
7258f9ca 254 ;; (This feature is removed by Tomabechi 2000/06/08)
3bdf8898
KH
255 (while (re-search-forward tibetan-composable-pattern nil t)
256 (let ((from (match-beginning 0))
257 (to (match-end 0))
258 components)
259 (goto-char from)
260 (if (looking-at tibetan-precomposition-rule-regexp)
261 (progn
262 (setq components
263 (list (string-to-char
264 (cdr
265 (assoc (match-string 0)
266 tibetan-precomposition-rule-alist)))))
267 (goto-char (match-end 0)))
268 (setq components (list (char-after from)))
269 (forward-char 1))
270 (while (< (point) to)
271 (tibetan-add-components components (following-char))
272 (forward-char 1))
273 (compose-region from to components)))))))
80d75b56 274
6b12c749
KH
275(defvar tibetan-decompose-precomposition-alist
276 (mapcar (function (lambda (x) (cons (string-to-char (cdr x)) (car x))))
277 tibetan-precomposition-rule-alist))
278
80d75b56 279;;;###autoload
6b12c749
KH
280(defun tibetan-decompose-region (from to)
281 "Decompose Tibetan text in the region FROM and TO.
282This is different from decompose-region because precomposed Tibetan characters
80dadb9e 283are decomposed into normal Tibetan character sequences."
6b12c749
KH
284 (interactive "r")
285 (save-restriction
286 (narrow-to-region from to)
287 (decompose-region from to)
288 (goto-char from)
289 (while (not (eobp))
290 (let* ((char (following-char))
291 (slot (assq char tibetan-decompose-precomposition-alist)))
292 (if slot
293 (progn
294 (delete-char 1)
295 (insert (cdr slot)))
296 (forward-char 1))))))
297
298
3bdf8898 299;;;###autoload
6b12c749
KH
300(defun tibetan-decompose-string (str)
301 "Decompose Tibetan string STR.
302This is different from decompose-string because precomposed Tibetan characters
80dadb9e 303are decomposed into normal Tibetan character sequences."
6b12c749
KH
304 (let ((new "")
305 (len (length str))
306 (idx 0)
307 char slot)
308 (while (< idx len)
309 (setq char (aref str idx)
310 slot (assq (aref str idx) tibetan-decompose-precomposition-alist)
311 new (concat new (if slot (cdr slot) (char-to-string char)))
312 idx (1+ idx)))
313 new))
3bdf8898
KH
314
315;;;###autoload
316(defun tibetan-composition-function (from to pattern &optional string)
317 (if string
318 (tibetan-compose-string string)
319 (tibetan-compose-region from to))
320 (- to from))
80d75b56
KH
321
322;;;
323;;; This variable is used to avoid repeated decomposition.
324;;;
325(setq-default tibetan-decomposed nil)
326
327;;;###autoload
328(defun tibetan-decompose-buffer ()
329 "Decomposes Tibetan characters in the buffer into their components.
3bdf8898 330See also the documentation of the function `tibetan-decompose-region'."
80d75b56
KH
331 (interactive)
332 (make-local-variable 'tibetan-decomposed)
333 (cond ((not tibetan-decomposed)
334 (tibetan-decompose-region (point-min) (point-max))
335 (setq tibetan-decomposed t))))
336
337;;;###autoload
338(defun tibetan-compose-buffer ()
339 "Composes Tibetan character components in the buffer.
340See also docstring of the function tibetan-compose-region."
341 (interactive)
342 (make-local-variable 'tibetan-decomposed)
343 (tibetan-compose-region (point-min) (point-max))
344 (setq tibetan-decomposed nil))
345
346;;;###autoload
347(defun tibetan-post-read-conversion (len)
348 (save-excursion
349 (save-restriction
350 (let ((buffer-modified-p (buffer-modified-p)))
351 (narrow-to-region (point) (+ (point) len))
352 (tibetan-compose-region (point-min) (point-max))
353 (set-buffer-modified-p buffer-modified-p)
73b4b264
KH
354 (make-local-variable 'tibetan-decomposed)
355 (setq tibetan-decomposed nil)
356 (- (point-max) (point-min))))))
80d75b56
KH
357
358
359;;;###autoload
360(defun tibetan-pre-write-conversion (from to)
361 (setq tibetan-decomposed-temp tibetan-decomposed)
7273226d 362 (let ((old-buf (current-buffer)))
76d12415
KH
363 (set-buffer (generate-new-buffer " *temp*"))
364 (if (stringp from)
365 (insert from)
366 (insert-buffer-substring old-buf from to))
367 (if (not tibetan-decomposed-temp)
368 (tibetan-decompose-region (point-min) (point-max)))
5d0ae729
KH
369 ;; Should return nil as annotations.
370 nil))
80d75b56 371
d633b538
KH
372\f
373;;;
374;;; Unicode-related definitions.
a1506d29 375;;;
d633b538
KH
376
377(defvar tibetan-canonicalize-for-unicode-alist
378 '(("\e$(7"Q\e(B" . "") ;; remove vowel a
379 ("\e$(7"T\e(B" . "\e$(7"R"S\e(B") ;; decompose vowels whose use is ``discouraged'' in Unicode 3.0
380 ("\e$(7"V\e(B" . "\e$(7"R"U\e(B")
381 ("\e$(7"W\e(B" . "\e$(7#C"a\e(B")
382 ("\e$(7"X\e(B" . "\e$(7#C"R"a\e(B")
383 ("\e$(7"Y\e(B" . "\e$(7#D"a\e(B")
384 ("\e$(7"Z\e(B" . "\e$(7#D"R"a\e(B")
385 ("\e$(7"b\e(B" . "\e$(7"R"a\e(B"))
386 "Rules for canonicalizing Tibetan vowels for Unicode.")
387
388(defvar tibetan-canonicalize-for-unicode-regexp
389 "[\e$(7"Q"T"V"W"X"Y"Z"b\e(B]"
390 "Regexp for Tibetan vowels to be canonicalized in Unicode.")
391
392(defun tibetan-canonicalize-for-unicode-region (from to)
393 (save-restriction
394 (narrow-to-region from to)
395 (goto-char from)
396 (while (re-search-forward tibetan-canonicalize-for-unicode-regexp nil t)
397 (let (
398 ;;(from (match-beginning 0))
399 ;;(to (match-end 0))
400 (canonical-form
401 (cdr (assoc (match-string 0)
402 tibetan-canonicalize-for-unicode-alist))))
403 ;;(goto-char from)
404 ;;(delete-region from to)
405 ;;(insert canonical-form)
406 (replace-match canonical-form)
407 ))))
408
409(defvar tibetan-strict-unicode t
410 "*Flag to control Tibetan canonicalizing for Unicode.
411
412If non-nil, the vowel a is removed and composite vowels are decomposed
413before writing buffer in Unicode. See also
414`tibetan-canonicalize-for-unicode-regexp' and
415`tibetan-canonicalize-for-unicode-alist'.")
416
417;;;###autoload
418(defun tibetan-pre-write-canonicalize-for-unicode (from to)
419 (let ((old-buf (current-buffer))
420 (strict-unicode tibetan-strict-unicode))
421 (set-buffer (generate-new-buffer " *temp*"))
422 (if (stringp from)
423 (insert from)
424 (insert-buffer-substring old-buf from to))
425 (if strict-unicode
426 (tibetan-canonicalize-for-unicode-region (point-min) (point-max)))
427 ;; Should return nil as annotations.
428 nil))
429
650e8505 430(provide 'tibet-util)
80d75b56 431
ab5796a9 432;;; arch-tag: 7a7333e8-1584-446c-b39c-a02b9def265d
e8af40ee 433;;; tibet-util.el ends here