Add arch taglines
[bpt/emacs.git] / lisp / language / tibet-util.el
CommitLineData
348f797a 1;;; tibet-util.el --- utilities for Tibetan -*- coding: iso-2022-7bit; -*-
80d75b56 2
80d75b56 3;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
fa526c4a 4;; Licensed to the Free Software Foundation.
80d75b56
KH
5
6;; Keywords: multilingual, Tibetan
7
8;; This file is part of GNU Emacs.
9
10;; GNU Emacs is free software; you can redistribute it and/or modify
11;; it under the terms of the GNU General Public License as published by
12;; the Free Software Foundation; either version 2, or (at your option)
13;; any later version.
14
15;; GNU Emacs is distributed in the hope that it will be useful,
16;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18;; GNU General Public License for more details.
19
20;; You should have received a copy of the GNU General Public License
21;; along with GNU Emacs; see the file COPYING. If not, write to the
22;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23;; Boston, MA 02111-1307, USA.
24
25;; Author: Toru TOMABECHI, <Toru.Tomabechi@orient.unil.ch>
26
27;; Created: Feb. 17. 1997
28
e8af40ee 29;;; History:
80d75b56
KH
30;; 1997.03.13 Modification in treatment of text properties;
31;; Support for some special signs and punctuations.
3bdf8898 32;; 1999.10.25 Modification for a new composition way by K.Handa.
80d75b56 33
e8af40ee
PJ
34;;; Commentary:
35
80d75b56
KH
36;;; Code:
37
93f7e657
RS
38(defconst tibetan-obsolete-glyphs
39 `(("\e$(7!=\e(B" . "\e$(8!=\e(B") ; 2 col <-> 1 col
40 ("\e$(7!?\e(B" . "\e$(8!?\e(B")
41 ("\e$(7!@\e(B" . "\e$(8!@\e(B")
42 ("\e$(7!A\e(B" . "\e$(8!A\e(B")
43 ("\e$(7"`\e(B" . "\e$(8"`\e(B")
44 ("\e$(7!;\e(B" . "\e$(8!;\e(B")
45 ("\e$(7!D\e(B" . "\e$(8!D\e(B")
46 ;; Yes these are dirty. But ...
47 ("\e$(7!>\e(B \e$(7!>\e(B" . ,(compose-string "\e$(7!>\e(B \e$(7!>\e(B" 0 3 [?\e$(7!>\e(B (Br . Bl) ? (Br . Bl) ?\e$(7!>\e(B]))
48 ("\e$(7!4!5!5\e(B" . ,(compose-string
49 "\e$(7#R#S#S#S\e(B" 0 4
50 [?\e$(7#R\e(B (Br . Bl) ?\e$(7#S\e(B (Br . Bl) ?\e$(7#S\e(B (Br . Bl) ?\e$(7#S\e(B]))
51 ("\e$(7!4!5\e(B" . ,(compose-string "\e$(7#R#S#S\e(B" 0 3 [?\e$(7#R\e(B (Br . Bl) ?\e$(7#S\e(B (Br . Bl) ?\e$(7#S\e(B]))
52 ("\e$(7!6\e(B" . ,(compose-string "\e$(7#R#S!I\e(B" 0 3 [?\e$(7#R\e(B (Br . Bl) ?\e$(7#S\e(B (br . tr) ?\e$(7!I\e(B]))
53 ("\e$(7!4\e(B" . ,(compose-string "\e$(7#R#S\e(B" 0 2 [?\e$(7#R\e(B (Br . Bl) ?\e$(7#S\e(B]))))
54
80d75b56 55;;;###autoload
3bdf8898
KH
56(defun tibetan-char-p (ch)
57 "Check if char CH is Tibetan character.
58Returns non-nil if CH is Tibetan. Otherwise, returns nil."
59 (memq (char-charset ch) '(tibetan tibetan-1-column)))
80d75b56 60
3bdf8898 61;;; Functions for Tibetan <-> Tibetan-transcription.
80d75b56
KH
62
63;;;###autoload
3bdf8898
KH
64(defun tibetan-tibetan-to-transcription (str)
65 "Transcribe Tibetan string STR and return the corresponding Roman string."
66 (let (;; Accumulate transcriptions here in reverse order.
67 (trans nil)
68 (len (length str))
69 (i 0)
70 ch this-trans)
71 (while (< i len)
ff69c4ee 72 (let ((idx (string-match tibetan-precomposition-rule-regexp str i)))
3bdf8898
KH
73 (if (eq idx i)
74 ;; Ith character and the followings matches precomposable
75 ;; Tibetan sequence.
76 (setq i (match-end 0)
77 this-trans
78 (car (rassoc
79 (cdr (assoc (match-string 0 str)
80 tibetan-precomposition-rule-alist))
81 tibetan-precomposed-transcription-alist)))
82 (setq ch (substring str i (1+ i))
83 i (1+ i)
84 this-trans
85 (car (or (rassoc ch tibetan-consonant-transcription-alist)
86 (rassoc ch tibetan-vowel-transcription-alist)
87 (rassoc ch tibetan-subjoined-transcription-alist)))))
88 (setq trans (cons this-trans trans))))
89 (apply 'concat (nreverse trans))))
80d75b56 90
3bdf8898
KH
91;;;###autoload
92(defun tibetan-transcription-to-tibetan (str)
93 "Convert Tibetan Roman string STR to Tibetan character string.
94The returned string has no composition information."
95 (let (;; Case is significant.
96 (case-fold-search nil)
97 (idx 0)
98 ;; Accumulate Tibetan strings here in reverse order.
99 (t-str-list nil)
100 i subtrans)
101 (while (setq i (string-match tibetan-regexp str idx))
102 (if (< idx i)
103 ;; STR contains a pattern that doesn't match Tibetan
104 ;; transcription. Include the pattern as is.
105 (setq t-str-list (cons (substring str idx i) t-str-list)))
106 (setq subtrans (match-string 0 str)
107 idx (match-end 0))
108 (let ((t-char (cdr (assoc subtrans
109 tibetan-precomposed-transcription-alist))))
110 (if t-char
111 ;; SUBTRANS corresponds to a transcription for
112 ;; precomposable Tibetan sequence.
113 (setq t-char (car (rassoc t-char
114 tibetan-precomposition-rule-alist)))
115 (setq t-char
116 (cdr
117 (or (assoc subtrans tibetan-consonant-transcription-alist)
118 (assoc subtrans tibetan-vowel-transcription-alist)
119 (assoc subtrans tibetan-modifier-transcription-alist)
120 (assoc subtrans tibetan-subjoined-transcription-alist)))))
121 (setq t-str-list (cons t-char t-str-list))))
122 (if (< idx (length str))
123 (setq t-str-list (cons (substring str idx) t-str-list)))
124 (apply 'concat (nreverse t-str-list))))
80d75b56
KH
125
126;;;
3bdf8898 127;;; Functions for composing/decomposing Tibetan sequence.
80d75b56
KH
128;;;
129;;; A Tibetan syllable is typically structured as follows:
130;;;
131;;; [Prefix] C [C+] V [M] [Suffix [Post suffix]]
132;;;
133;;; where C's are all vertically stacked, V appears below or above
134;;; consonant cluster and M is always put above the C[C+]V combination.
135;;; (Sanskrit visarga, though it is a vowel modifier, is considered
136;;; to be a punctuation.)
137;;;
ff69c4ee 138;;; Here are examples of the words "bsgrubs" and "hfauM"
80d75b56 139;;;
ff69c4ee 140;;; \e4\e$(7"7\e0"7\e1\e4%qx!"U\e0"G###C"U\e1\e4"7\e0"7\e1\e4"G\e0"G\e1\e(B \e4\e$(7"Hx!"Rx!"Ur'"_\e0"H"R"U"_\e1\e(B
80d75b56
KH
141;;;
142;;; M
143;;; b s b s h
ff69c4ee 144;;; g fa
80d75b56
KH
145;;; r u
146;;; u
147;;;
3bdf8898
KH
148;;; Consonants `'' (\e$(7"A\e(B), `w' (\e$(7">\e(B), `y' (\e$(7"B\e(B), `r' (\e$(7"C\e(B) take special
149;;; forms when they are used as subjoined consonant. Consonant `r'
150;;; takes another special form when used as superjoined in such a case
151;;; as "rka", while it does not change its form when conjoined with
152;;; subjoined `'', `w' or `y' as in "rwa", "rya".
153
154;; Append a proper composition rule and glyph to COMPONENTS to compose
155;; CHAR with a composition that has COMPONENTS.
156
157(defun tibetan-add-components (components char)
158 (let ((last (last components))
159 (stack-upper '(tc . bc))
160 (stack-under '(bc . tc))
7258f9ca 161 rule comp-vowel tmp)
3bdf8898
KH
162 ;; Special treatment for 'a chung.
163 ;; If 'a follows a consonant, turn it into the subjoined form.
7258f9ca
KH
164 ;; * Disabled by Tomabechi 2000/06/09 *
165 ;; Because in Unicode, \e$(7"A\e(B may follow directly a consonant without
a1506d29 166 ;; any intervening vowel, as in \e4\e$(7"9\e0"9\e1\e4""\e0"""Q\e1\e4"A\e0"A\e1!;\e(B=\e4\e$(7"9\e0"9\e1\e(B \e4\e$(7""\e0""\e1\e(B \e4\e$(7"A\e0"A\e1\e(B not \e4\e$(7"9\e0"9\e1\e(B \e4\e$(7""\e0""\e1\e(B \e$(7"Q\e(B \e4\e$(7"A\e0"A\e1\e(B
7258f9ca
KH
167 ;;(if (and (= char ?\e$(7"A\e(B)
168 ;; (aref (char-category-set (car last)) ?0))
169 ;; (setq char ?\e$(7"R\e(B)) ;; modified for new font by Tomabechi 1999/12/10
170
171 ;; Composite vowel signs are decomposed before being added
172 ;; Added by Tomabechi 2000/06/08
173 (if (memq char '(?\e$(7"T\e(B ?\e$(7"V\e(B ?\e$(7"W\e(B ?\e$(7"X\e(B ?\e$(7"Y\e(B ?\e$(7"Z\e(B ?\e$(7"b\e(B))
174 (setq comp-vowel
ff69c4ee
KH
175 (copy-sequence
176 (cddr (assoc (char-to-string char)
177 tibetan-composite-vowel-alist)))
7258f9ca
KH
178 char
179 (cadr (assoc (char-to-string char)
180 tibetan-composite-vowel-alist))))
3bdf8898
KH
181 (cond
182 ;; Compose upper vowel sign vertically over.
183 ((aref (char-category-set char) ?2)
184 (setq rule stack-upper))
185
186 ;; Compose lower vowel sign vertically under.
187 ((aref (char-category-set char) ?3)
6b12c749
KH
188 (if (eq char ?\e$(7"Q\e(B) ;; `\e$(7"Q\e(B' should not visible when composed.
189 (setq rule nil)
190 (setq rule stack-under)))
3bdf8898
KH
191 ;; Transform ra-mgo (superscribed r) if followed by a subjoined
192 ;; consonant other than w, ', y, r.
193 ((and (= (car last) ?\e$(7"C\e(B)
6b12c749
KH
194 (not (memq char '(?\e$(7#>\e(B ?\e$(7"R\e(B ?\e$(7#B\e(B ?\e$(7#C\e(B))))
195 (setcar last ?\e$(7!"\e(B) ;; modified for newfont by Tomabechi 1999/12/10
3bdf8898 196 (setq rule stack-under))
3bdf8898
KH
197 ;; Transform initial base consonant if followed by a subjoined
198 ;; consonant but 'a.
199 (t
200 (let ((laststr (char-to-string (car last))))
6b12c749
KH
201 (if (and (/= char ?\e$(7"R\e(B) ;; modified for new font by Tomabechi
202 (string-match "[\e$(7"!\e(B-\e$(7"="?"@"D\e(B-\e$(7"J"K\e(B]" laststr))
3bdf8898
KH
203 (setcar last (string-to-char
204 (cdr (assoc (char-to-string (car last))
205 tibetan-base-to-subjoined-alist)))))
206 (setq rule stack-under))))
207
6b12c749 208 (if rule
7258f9ca
KH
209 (setcdr last (list rule char)))
210 ;; Added by Tomabechi 2000/06/08
211 (if comp-vowel
212 (nconc last comp-vowel))
213 ))
80d75b56
KH
214
215;;;###autoload
216(defun tibetan-compose-string (str)
3bdf8898
KH
217 "Compose Tibetan string STR."
218 (let ((idx 0))
219 ;; `\e$(7"A\e(B' is included in the pattern for subjoined consonants
220 ;; because we treat it specially in tibetan-add-components.
7258f9ca 221 ;; (This feature is removed by Tomabechi 2000/06/08)
3bdf8898
KH
222 (while (setq idx (string-match tibetan-composable-pattern str idx))
223 (let ((from idx)
224 (to (match-end 0))
225 components)
226 (if (eq (string-match tibetan-precomposition-rule-regexp str idx) idx)
227 (setq idx (match-end 0)
228 components
229 (list (string-to-char
230 (cdr
231 (assoc (match-string 0 str)
232 tibetan-precomposition-rule-alist)))))
233 (setq components (list (aref str idx))
234 idx (1+ idx)))
235 (while (< idx to)
236 (tibetan-add-components components (aref str idx))
237 (setq idx (1+ idx)))
238 (compose-string str from to components))))
239 str)
80d75b56
KH
240
241;;;###autoload
3bdf8898
KH
242(defun tibetan-compose-region (beg end)
243 "Compose Tibetan text the region BEG and END."
80d75b56 244 (interactive "r")
3bdf8898 245 (let (str result chars)
80d75b56
KH
246 (save-excursion
247 (save-restriction
248 (narrow-to-region beg end)
249 (goto-char (point-min))
3bdf8898
KH
250 ;; `\e$(7"A\e(B' is included in the pattern for subjoined consonants
251 ;; because we treat it specially in tibetan-add-components.
7258f9ca 252 ;; (This feature is removed by Tomabechi 2000/06/08)
3bdf8898
KH
253 (while (re-search-forward tibetan-composable-pattern nil t)
254 (let ((from (match-beginning 0))
255 (to (match-end 0))
256 components)
257 (goto-char from)
258 (if (looking-at tibetan-precomposition-rule-regexp)
259 (progn
260 (setq components
261 (list (string-to-char
262 (cdr
263 (assoc (match-string 0)
264 tibetan-precomposition-rule-alist)))))
265 (goto-char (match-end 0)))
266 (setq components (list (char-after from)))
267 (forward-char 1))
268 (while (< (point) to)
269 (tibetan-add-components components (following-char))
270 (forward-char 1))
271 (compose-region from to components)))))))
80d75b56 272
6b12c749
KH
273(defvar tibetan-decompose-precomposition-alist
274 (mapcar (function (lambda (x) (cons (string-to-char (cdr x)) (car x))))
275 tibetan-precomposition-rule-alist))
276
80d75b56 277;;;###autoload
6b12c749
KH
278(defun tibetan-decompose-region (from to)
279 "Decompose Tibetan text in the region FROM and TO.
280This is different from decompose-region because precomposed Tibetan characters
80dadb9e 281are decomposed into normal Tibetan character sequences."
6b12c749
KH
282 (interactive "r")
283 (save-restriction
284 (narrow-to-region from to)
285 (decompose-region from to)
286 (goto-char from)
287 (while (not (eobp))
288 (let* ((char (following-char))
289 (slot (assq char tibetan-decompose-precomposition-alist)))
290 (if slot
291 (progn
292 (delete-char 1)
293 (insert (cdr slot)))
294 (forward-char 1))))))
295
296
3bdf8898 297;;;###autoload
6b12c749
KH
298(defun tibetan-decompose-string (str)
299 "Decompose Tibetan string STR.
300This is different from decompose-string because precomposed Tibetan characters
80dadb9e 301are decomposed into normal Tibetan character sequences."
6b12c749
KH
302 (let ((new "")
303 (len (length str))
304 (idx 0)
305 char slot)
306 (while (< idx len)
307 (setq char (aref str idx)
308 slot (assq (aref str idx) tibetan-decompose-precomposition-alist)
309 new (concat new (if slot (cdr slot) (char-to-string char)))
310 idx (1+ idx)))
311 new))
3bdf8898
KH
312
313;;;###autoload
314(defun tibetan-composition-function (from to pattern &optional string)
315 (if string
316 (tibetan-compose-string string)
317 (tibetan-compose-region from to))
318 (- to from))
80d75b56
KH
319
320;;;
321;;; This variable is used to avoid repeated decomposition.
322;;;
323(setq-default tibetan-decomposed nil)
324
325;;;###autoload
326(defun tibetan-decompose-buffer ()
327 "Decomposes Tibetan characters in the buffer into their components.
3bdf8898 328See also the documentation of the function `tibetan-decompose-region'."
80d75b56
KH
329 (interactive)
330 (make-local-variable 'tibetan-decomposed)
331 (cond ((not tibetan-decomposed)
332 (tibetan-decompose-region (point-min) (point-max))
333 (setq tibetan-decomposed t))))
334
335;;;###autoload
336(defun tibetan-compose-buffer ()
337 "Composes Tibetan character components in the buffer.
338See also docstring of the function tibetan-compose-region."
339 (interactive)
340 (make-local-variable 'tibetan-decomposed)
341 (tibetan-compose-region (point-min) (point-max))
342 (setq tibetan-decomposed nil))
343
344;;;###autoload
345(defun tibetan-post-read-conversion (len)
346 (save-excursion
347 (save-restriction
348 (let ((buffer-modified-p (buffer-modified-p)))
349 (narrow-to-region (point) (+ (point) len))
350 (tibetan-compose-region (point-min) (point-max))
351 (set-buffer-modified-p buffer-modified-p)
73b4b264
KH
352 (make-local-variable 'tibetan-decomposed)
353 (setq tibetan-decomposed nil)
354 (- (point-max) (point-min))))))
80d75b56
KH
355
356
357;;;###autoload
358(defun tibetan-pre-write-conversion (from to)
359 (setq tibetan-decomposed-temp tibetan-decomposed)
7273226d 360 (let ((old-buf (current-buffer)))
76d12415
KH
361 (set-buffer (generate-new-buffer " *temp*"))
362 (if (stringp from)
363 (insert from)
364 (insert-buffer-substring old-buf from to))
365 (if (not tibetan-decomposed-temp)
366 (tibetan-decompose-region (point-min) (point-max)))
5d0ae729
KH
367 ;; Should return nil as annotations.
368 nil))
80d75b56 369
d633b538
KH
370\f
371;;;
372;;; Unicode-related definitions.
a1506d29 373;;;
d633b538
KH
374
375(defvar tibetan-canonicalize-for-unicode-alist
376 '(("\e$(7"Q\e(B" . "") ;; remove vowel a
377 ("\e$(7"T\e(B" . "\e$(7"R"S\e(B") ;; decompose vowels whose use is ``discouraged'' in Unicode 3.0
378 ("\e$(7"V\e(B" . "\e$(7"R"U\e(B")
379 ("\e$(7"W\e(B" . "\e$(7#C"a\e(B")
380 ("\e$(7"X\e(B" . "\e$(7#C"R"a\e(B")
381 ("\e$(7"Y\e(B" . "\e$(7#D"a\e(B")
382 ("\e$(7"Z\e(B" . "\e$(7#D"R"a\e(B")
383 ("\e$(7"b\e(B" . "\e$(7"R"a\e(B"))
384 "Rules for canonicalizing Tibetan vowels for Unicode.")
385
386(defvar tibetan-canonicalize-for-unicode-regexp
387 "[\e$(7"Q"T"V"W"X"Y"Z"b\e(B]"
388 "Regexp for Tibetan vowels to be canonicalized in Unicode.")
389
390(defun tibetan-canonicalize-for-unicode-region (from to)
391 (save-restriction
392 (narrow-to-region from to)
393 (goto-char from)
394 (while (re-search-forward tibetan-canonicalize-for-unicode-regexp nil t)
395 (let (
396 ;;(from (match-beginning 0))
397 ;;(to (match-end 0))
398 (canonical-form
399 (cdr (assoc (match-string 0)
400 tibetan-canonicalize-for-unicode-alist))))
401 ;;(goto-char from)
402 ;;(delete-region from to)
403 ;;(insert canonical-form)
404 (replace-match canonical-form)
405 ))))
406
407(defvar tibetan-strict-unicode t
408 "*Flag to control Tibetan canonicalizing for Unicode.
409
410If non-nil, the vowel a is removed and composite vowels are decomposed
411before writing buffer in Unicode. See also
412`tibetan-canonicalize-for-unicode-regexp' and
413`tibetan-canonicalize-for-unicode-alist'.")
414
415;;;###autoload
416(defun tibetan-pre-write-canonicalize-for-unicode (from to)
417 (let ((old-buf (current-buffer))
418 (strict-unicode tibetan-strict-unicode))
419 (set-buffer (generate-new-buffer " *temp*"))
420 (if (stringp from)
421 (insert from)
422 (insert-buffer-substring old-buf from to))
423 (if strict-unicode
424 (tibetan-canonicalize-for-unicode-region (point-min) (point-max)))
425 ;; Should return nil as annotations.
426 nil))
427
650e8505 428(provide 'tibet-util)
80d75b56 429
ab5796a9 430;;; arch-tag: 7a7333e8-1584-446c-b39c-a02b9def265d
e8af40ee 431;;; tibet-util.el ends here