Set the category of Coptic characters to 'g' (Greek).
[bpt/emacs.git] / lisp / international / characters.el
CommitLineData
4ed46869
KH
1;;; characters.el --- set syntax and category for multibyte characters
2
ba318903 3;; Copyright (C) 1997, 2000-2014 Free Software Foundation, Inc.
7976eda0 4;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 5;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
2fd125a3
KH
6;; National Institute of Advanced Industrial Science and Technology (AIST)
7;; Registration Number H14PRO021
8f924df7 8;; Copyright (C) 2003
55bd52ea
KH
9;; National Institute of Advanced Industrial Science and Technology (AIST)
10;; Registration Number H13PRO009
4ed46869
KH
11
12;; Keywords: multibyte character, character set, syntax, category
13
14;; This file is part of GNU Emacs.
15
4936186e 16;; GNU Emacs is free software: you can redistribute it and/or modify
4ed46869 17;; it under the terms of the GNU General Public License as published by
4936186e
GM
18;; the Free Software Foundation, either version 3 of the License, or
19;; (at your option) any later version.
4ed46869
KH
20
21;; GNU Emacs is distributed in the hope that it will be useful,
22;; but WITHOUT ANY WARRANTY; without even the implied warranty of
23;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24;; GNU General Public License for more details.
25
26;; You should have received a copy of the GNU General Public License
4936186e 27;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
4ed46869
KH
28
29;;; Commentary:
30
60370d40
PJ
31;;; Code:
32
4ed46869
KH
33;;; Predefined categories.
34
35;; For each character set.
36
46bf60bc
KH
37(define-category ?a "ASCII
38ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
4ed46869
KH
39(define-category ?l "Latin")
40(define-category ?t "Thai")
41(define-category ?g "Greek")
42(define-category ?b "Arabic")
43(define-category ?w "Hebrew")
44(define-category ?y "Cyrillic")
46bf60bc
KH
45(define-category ?k "Katakana
46Japanese katakana")
47(define-category ?r "Roman
48Japanese roman")
4ed46869
KH
49(define-category ?c "Chinese")
50(define-category ?j "Japanese")
51(define-category ?h "Korean")
46bf60bc
KH
52(define-category ?e "Ethiopic
53Ethiopic (Ge'ez)")
54(define-category ?v "Viet
55Vietnamese")
4ed46869 56(define-category ?i "Indian")
6eba8645 57(define-category ?o "Lao")
9395eb7c 58(define-category ?q "Tibetan")
4ed46869
KH
59
60;; For each group (row) of 2-byte character sets.
61
46bf60bc
KH
62(define-category ?A "2-byte alnum
63Alpha-numeric characters of 2-byte character sets")
64(define-category ?C "2-byte han
65Chinese (Han) characters of 2-byte character sets")
66(define-category ?G "2-byte Greek
67Greek characters of 2-byte character sets")
68(define-category ?H "2-byte Hiragana
69Japanese Hiragana characters of 2-byte character sets")
70(define-category ?K "2-byte Katakana
71Japanese Katakana characters of 2-byte character sets")
72(define-category ?N "2-byte Korean
73Korean Hangul characters of 2-byte character sets")
91c491e0 74(define-category ?Y "2-byte Cyrillic
46bf60bc 75Cyrillic characters of 2-byte character sets")
4ed46869
KH
76(define-category ?I "Indian Glyphs")
77
78;; For phonetic classifications.
79
80(define-category ?0 "consonant")
46bf60bc 81(define-category ?1 "base vowel
4eb97232 82Base (independent) vowel")
46bf60bc 83(define-category ?2 "upper diacritic
4eb97232 84Upper diacritical mark (including upper vowel)")
46bf60bc 85(define-category ?3 "lower diacritic
4eb97232 86Lower diacritical mark (including lower vowel)")
46bf60bc 87(define-category ?4 "combining tone
4eb97232 88Combining tone mark")
9765a2ba 89(define-category ?5 "symbol")
4ed46869 90(define-category ?6 "digit")
91c491e0 91(define-category ?7 "vowel diacritic
4eb97232 92Vowel-modifying diacritical mark")
6eba8645
KH
93(define-category ?8 "vowel-signs")
94(define-category ?9 "semivowel lower")
4ed46869
KH
95
96;; For filling.
46bf60bc
KH
97(define-category ?| "line breakable
98While filling, we can break a line at this character.")
4ed46869 99
504af7b2 100;; For indentation calculation.
70ea295a 101(define-category ?\s
46bf60bc
KH
102 "space for indent
103This character counts as a space for indentation purposes.")
504af7b2 104
94487c4e 105;; Keep the following for `kinsoku' processing. See comments in
4ed46869 106;; kinsoku.el.
46bf60bc
KH
107(define-category ?> "Not at bol
108A character which can't be placed at beginning of line.")
109(define-category ?< "Not at eol
110A character which can't be placed at end of line.")
4ed46869 111
8ea6fa80
KH
112;; Base and Combining
113(define-category ?. "Base
114Base characters (Unicode General Category L,N,P,S,Zs)")
46bf60bc 115(define-category ?^ "Combining
4eb97232 116Combining diacritic or mark (Unicode General Category M)")
f635daa1
CY
117
118;; bidi types
119(define-category ?R "Right-to-left (strong)
120Characters with \"strong\" right-to-left directionality, i.e.
121with R, AL, RLE, or RLO Unicode bidi character type.")
122
123(define-category ?L "Left-to-right (strong)
124Characters with \"strong\" left-to-right directionality, i.e.
125with L, LRE, or LRO Unicode bidi character type.")
126
4ed46869
KH
127\f
128;;; Setting syntax and category.
129
130;; ASCII
131
e2cc40b7
KH
132;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
133(modify-category-entry '(32 . 127) ?a)
134(modify-category-entry '(32 . 127) ?l)
4ed46869 135
c94ae9eb
DL
136;; Deal with the CJK charsets first. Since the syntax of blocks is
137;; defined per charset, and the charsets may contain e.g. Latin
138;; characters, we end up with the wrong syntax definitions if we're
139;; not careful.
4ed46869 140
66bff5ed 141;; Chinese characters (Unicode)
a5bb49e1
KH
142(modify-category-entry '(#x2E80 . #x312F) ?|)
143(modify-category-entry '(#x3190 . #x33FF) ?|)
66a85e76
KH
144(modify-category-entry '(#x3400 . #x4DBF) ?C)
145(modify-category-entry '(#x4E00 . #x9FAF) ?C)
66bff5ed
KH
146(modify-category-entry '(#x3400 . #x9FAF) ?c)
147(modify-category-entry '(#x3400 . #x9FAF) ?|)
148(modify-category-entry '(#xF900 . #xFAFF) ?C)
149(modify-category-entry '(#xF900 . #xFAFF) ?c)
150(modify-category-entry '(#xF900 . #xFAFF) ?|)
796f8b2f
KH
151(modify-category-entry '(#x20000 . #x2FFFF) ?|)
152(modify-category-entry '(#x20000 . #x2FFFF) ?C)
153(modify-category-entry '(#x20000 . #x2FFFF) ?c)
8e4cd685 154
4ed46869
KH
155
156;; Chinese character set (GB2312)
157
66bff5ed
KH
158(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E)
159(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E)
160(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E)
4ed46869 161
87a39edb 162(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
9ad4b491
KH
163(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339)
164(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A)
165(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A)
66bff5ed
KH
166(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?H #x2421 #x247E)
167(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?K #x2521 #x257E)
168(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?G #x2621 #x267E)
169(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?Y #x2721 #x277E)
170(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?C #x3021 #x7E7E)
4ed46869
KH
171
172;; Chinese character set (BIG5)
173
e7259832 174(map-charset-chars #'modify-category-entry 'big5 ?c)
66a85e76 175(map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA261)
9ad4b491 176(map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E)
66a85e76 177(map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DC)
4ed46869
KH
178
179;; Chinese character set (CNS11643)
180
87a39edb
DL
;; Every CNS 11643 plane is put in the Chinese category (`c').
(dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                 chinese-cns11643-7))
  (map-charset-chars #'modify-category-entry plane ?c)
  ;; Plane 1 gets the 2-byte han category (`C') only for code points
  ;; #x4421..#x7E7E; every other plane gets it for the whole charset
  ;; (no FROM/TO arguments are passed in that case).
  (apply #'map-charset-chars #'modify-category-entry plane ?C
         (and (eq plane 'chinese-cns11643-1) '(#x4421 #x7E7E))))
4ed46869 188
8f924df7 189;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
4ed46869 190
66bff5ed 191(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
4ed46869 192
66bff5ed 193(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)
4ed46869 194
8f924df7 195(dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
761f6427
KH
196 japanese-jisx0213-1 japanese-jisx0213-2
197 cp932-2-byte))
8e4cd685 198 (map-charset-chars #'modify-category-entry l ?j))
4ed46869 199
c4186f9c
KH
200;; Fullwidth characters
201(modify-category-entry '(#xff01 . #xff60) ?\|)
202
269a5dd0 203;; Unicode equivalents of JISX0201-kana
66bff5ed
KH
204(let ((range '(#xff61 . #xff9f)))
205 (modify-category-entry range ?k)
206 (modify-category-entry range ?j)
207 (modify-category-entry range ?\|))
269a5dd0
DL
208
209;; Katakana block
796f8b2f
KH
210(modify-category-entry '(#x3099 . #x309C) ?K)
211(modify-category-entry '(#x30A0 . #x30FF) ?K)
6f3ac1e1 212(modify-category-entry '(#x31F0 . #x31FF) ?K)
b11c2874 213(modify-category-entry '(#x30A0 . #x30FA) ?\|)
796f8b2f 214(modify-category-entry #x30FF ?\|)
269a5dd0
DL
215
216;; Hiragana block
796f8b2f
KH
217(modify-category-entry '(#x3040 . #x309F) ?H)
218(modify-category-entry '(#x3040 . #x3096) ?\|)
219(modify-category-entry #x309F ?\|)
220(modify-category-entry #x30A0 ?H)
221(modify-category-entry #x30FC ?H)
222
269a5dd0 223
4ed46869 224;; JISX0208
66bff5ed
KH
225(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
226(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
227(let ((chars '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇)))
69c2c6ea 228 (dolist (elt chars)
2b89bca4 229 (modify-syntax-entry elt "w")))
66bff5ed
KH
230
231(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E)
232(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E)
233(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?K #x2521 #x257E)
234(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E)
235(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E)
236(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E)
;; These marks are given the 2-byte han category (`C') as well.
(dolist (mark '(?仝 ?々 ?〆 ?〇))
  (modify-category-entry mark ?C))
241
242;; JISX0212
4ed46869 243
66bff5ed 244(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
4ed46869
KH
245
246;; JISX0201-Kana
87a39edb 247
;; Kana full stop, comma and middle dot get punctuation syntax.
(dolist (punct '(?。 ?、 ?・))
  (modify-syntax-entry punct "."))
252
e6d10035
KH
;; Corner brackets form a delimiter pair: the opening bracket has
;; open-parenthesis syntax matching the closing bracket, and the
;; closing bracket must have close-parenthesis syntax (class `)', not
;; `(') matching the opening one.
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
226e4119 255
4ed46869
KH
256;; Korean character set (KSC5601)
257
87a39edb 258(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)
66bff5ed
KH
259
260(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E)
439f7264
DL
261(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E)
262(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E)
263(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E)
9ad4b491
KH
264(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339)
265(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A)
266(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A)
66bff5ed
KH
267(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E)
268(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E)
269(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E)
270(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E)
4ed46869 271
c94ae9eb 272;; These are in more than one charset.
8f924df7
KH
273(let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
274 "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
275 "()[]{}"))
276 open close)
277 (dotimes (i (/ (length parens) 2))
278 (setq open (aref parens (* i 2))
279 close (aref parens (1+ (* i 2))))
280 (modify-syntax-entry open (format "(%c" close))
281 (modify-syntax-entry close (format ")%c" open))))
d05cfa1f 282
c94ae9eb 283;; Arabic character set
6eba8645 284
c94ae9eb
DL
;; All characters of the Arabic charsets are put in category `b'.
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
;; The same category is applied to these Unicode code point ranges.
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))
6eba8645 295
c94ae9eb
DL
296;; Cyrillic character set (ISO-8859-5)
297
298(modify-syntax-entry ?№ ".")
299
300;; Ethiopic character set
301
4c81b0f6
KH
;; Category `e' for the two Unicode ranges below and for every
;; character of the `ethiopic' charset.
(dolist (range '((#x1200 . #x1399)
                 (#x2d80 . #x2dde)))
  (modify-category-entry range ?e))
(map-charset-chars #'modify-category-entry 'ethiopic ?e)
;; These eight marks get punctuation syntax.
(dolist (punct '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))
  (modify-syntax-entry punct "."))
309
310;; Hebrew character set (ISO-8859-8)
311
312(modify-syntax-entry #x5be ".") ; MAQAF
313(modify-syntax-entry #x5c0 ".") ; PASEQ
314(modify-syntax-entry #x5c3 ".") ; SOF PASUQ
315(modify-syntax-entry #x5f3 ".") ; GERESH
316(modify-syntax-entry #x5f4 ".") ; GERSHAYIM
317
318;; Indian character set (IS 13194 and other Emacs original Indian charsets)
319
320(modify-category-entry '(#x901 . #x970) ?i)
321(map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
322(map-charset-chars #'modify-category-entry 'indian-2-column ?i)
d05cfa1f 323
6eba8645
KH
324;; Lao character set
325
abdaa411
DL
326(modify-category-entry '(#xe80 . #xeff) ?o)
327(map-charset-chars #'modify-category-entry 'lao ?o)
6eba8645 328
abdaa411 329(let ((deflist '(("ກ-ຮ" "w" ?0) ; consonant
e6d10035
KH
330 ("ະາຳຽເ-ໄ" "w" ?1) ; vowel base
331 ("ັິ-ືົໍ" "w" ?2) ; vowel upper
332 ("ຸູ" "w" ?3) ; vowel lower
8f924df7 333 ("່-໋" "w" ?4) ; tone mark
e6d10035
KH
334 ("ຼຽ" "w" ?9) ; semivowel lower
335 ("໐-໙" "w" ?6) ; digit
336 ("ຯໆ" "_" ?5) ; symbol
6eba8645
KH
337 ))
338 elm chars len syntax category to ch i)
339 (while deflist
340 (setq elm (car deflist))
341 (setq chars (car elm)
342 len (length chars)
343 syntax (nth 1 elm)
344 category (nth 2 elm)
345 i 0)
346 (while (< i len)
347 (if (= (aref chars i) ?-)
348 (setq i (1+ i)
4a027a0d
KH
349 to (aref chars i))
350 (setq ch (aref chars i)
6eba8645
KH
351 to ch))
352 (while (<= ch to)
269a5dd0
DL
353 (unless (string-equal syntax "w")
354 (modify-syntax-entry ch syntax))
6eba8645
KH
355 (modify-category-entry ch category)
356 (setq ch (1+ ch)))
4a027a0d 357 (setq i (1+ i)))
6eba8645
KH
358 (setq deflist (cdr deflist))))
359
4ed46869
KH
360;; Thai character set (TIS620)
361
abdaa411
DL
362(modify-category-entry '(#xe00 . #xe7f) ?t)
363(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
4ed46869
KH
364
365(let ((deflist '(;; chars syntax category
e6d10035
KH
366 ("ก-รลว-ฮ" "w" ?0) ; consonant
367 ("ฤฦะาำเ-ๅ" "w" ?1) ; vowel base
368 ("ัิ-ื็๎" "w" ?2) ; vowel upper
369 ("ุ-ฺ" "w" ?3) ; vowel lower
8f924df7 370 ("่-ํ" "w" ?4) ; tone mark
e6d10035
KH
371 ("๐-๙" "w" ?6) ; digit
372 ("ฯๆ฿๏๚๛" "_" ?5) ; symbol
4ed46869
KH
373 ))
374 elm chars len syntax category to ch i)
9395eb7c
KH
375 (while deflist
376 (setq elm (car deflist))
377 (setq chars (car elm)
378 len (length chars)
379 syntax (nth 1 elm)
380 category (nth 2 elm)
381 i 0)
382 (while (< i len)
383 (if (= (aref chars i) ?-)
384 (setq i (1+ i)
4a027a0d
KH
385 to (aref chars i))
386 (setq ch (aref chars i)
9395eb7c
KH
387 to ch))
388 (while (<= ch to)
269a5dd0
DL
389 (unless (string-equal syntax "w")
390 (modify-syntax-entry ch syntax))
9395eb7c
KH
391 (modify-category-entry ch category)
392 (setq ch (1+ ch)))
4a027a0d 393 (setq i (1+ i)))
9395eb7c
KH
394 (setq deflist (cdr deflist))))
395
396;; Tibetan character set
397
abdaa411
DL
398(modify-category-entry '(#xf00 . #xfff) ?q)
399(map-charset-chars #'modify-category-entry 'tibetan ?q)
400(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
9395eb7c
KH
401
402(let ((deflist '(;; chars syntax category
725d7c92 403 ("ཀ-ཀྵཪ" "w" ?0) ; consonant
55a3ed16 404 ("ྐ-ྐྵྺྻྼ" "w" ?0) ;
725d7c92
DL
405 ("ིེཻོཽྀ" "w" ?2) ; upper vowel
406 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier
53964682 407 ("྄ཱུ༙༵༷" "w" ?3) ; lower vowel/modifier
8f924df7 408 ("཰" "w" ?3) ; invisible vowel a
725d7c92
DL
409 ("༠-༩༪-༳" "w" ?6) ; digit
410 ("་།-༒༔ཿ" "." ?|) ; line-break char
411 ("་།༏༐༑༔ཿ" "." ?|) ;
412 ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition
413 ("་།༏༐༑༔ཿ" "." ?>) ;
414 ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition
415 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
9395eb7c
KH
416 ))
417 elm chars len syntax category to ch i)
4ed46869
KH
418 (while deflist
419 (setq elm (car deflist))
420 (setq chars (car elm)
421 len (length chars)
422 syntax (nth 1 elm)
423 category (nth 2 elm)
424 i 0)
425 (while (< i len)
426 (if (= (aref chars i) ?-)
427 (setq i (1+ i)
4a027a0d
KH
428 to (aref chars i))
429 (setq ch (aref chars i)
4ed46869
KH
430 to ch))
431 (while (<= ch to)
269a5dd0
DL
432 (unless (string-equal syntax "w")
433 (modify-syntax-entry ch syntax))
4ed46869
KH
434 (modify-category-entry ch category)
435 (setq ch (1+ ch)))
4a027a0d 436 (setq i (1+ i)))
4ed46869
KH
437 (setq deflist (cdr deflist))))
438
439;; Vietnamese character set
440
abdaa411
DL
441;; To make a word with Latin characters
442(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
443(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
444
445(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
446(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
4ed46869 447
e5dd1155
KH
;; For each code 32..127, pair the character at that code in
;; `vietnamese-viscii-upper' with the one at the same code in
;; `vietnamese-viscii-lower' in the standard case table, and put the
;; `ucs' encoding of each (when there is one) in category `v'.
(let ((case-table (standard-case-table)))
  (dotimes (offset 96)
    (let* ((code (+ 32 offset))
           (upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower case-table)
      (when upper-ucs
        (modify-category-entry upper-ucs ?v))
      (when lower-ucs
        (modify-category-entry lower-ucs ?v)))))
460
d807d0c7
KH
461;; Tai Viet
462(let ((deflist '(;; chars syntax category
da6062e6 463 ((?ꪀ. ?ꪯ) "w" ?0) ; consonant
d807d0c7
KH
464 ("ꪱꪵꪶ" "w" ?1) ; vowel base
465 ((?ꪹ . ?ꪽ) "w" ?1) ; vowel base
466 ("ꪰꪲꪳꪷꪸꪾ" "w" ?2) ; vowel upper
467 ("ꪴ" "w" ?3) ; vowel lower
468 ("ꫀꫂ" "w" ?1) ; non-combining tone-mark
469 ("꪿꫁" "w" ?4) ; combining tone-mark
470 ((?ꫛ . ?꫟) "_" ?5) ; symbol
471 )))
472 (dolist (elm deflist)
473 (let ((chars (car elm))
474 (syntax (nth 1 elm))
475 (category (nth 2 elm)))
476 (if (consp chars)
477 (progn
478 (modify-syntax-entry chars syntax)
479 (modify-category-entry chars category))
480 (mapc #'(lambda (x)
481 (modify-syntax-entry x syntax)
482 (modify-category-entry x category))
483 chars)))))
c94ae9eb 484
f635daa1
CY
485;; Bidi categories
486
20372d0c
GM
;; The bidi-class table may be nil (e.g. when bootstrapping without
;; the generated uni-*.el files); in that case nothing is done.
(let ((bidi-classes (unicode-property-table-internal 'bidi-class)))
  (if bidi-classes
      (map-char-table
       (lambda (chars class)
         ;; Strong right-to-left classes map to category `R', strong
         ;; left-to-right classes to `L'; every other class is skipped.
         (let ((category (cond ((memq class '(R AL RLO RLE)) ?R)
                               ((memq class '(L LRE LRO)) ?L))))
           (if category
               (modify-category-entry chars category))))
       bidi-classes)))
f635daa1 497
b7cf27ed
EZ
498;; Load uni-mirrored.el if available, so that it gets dumped into
499;; Emacs. This allows starting Emacs with force-load-messages in
500;; ~/.emacs, and avoid infinite recursion in bidi_initialize, which
501;; needs to load uni-mirrored.el in order to display the "Loading"
502;; messages.
503(unicode-property-table-internal 'mirroring)
bbab1c4f 504
c94ae9eb
DL
505;; Latin
506
507(modify-category-entry '(#x80 . #x024F) ?l)
d05cfa1f 508
85ef8ece
KH
509(let ((tbl (standard-case-table)) c)
510
4fb82d62
DL
511 ;; Latin-1
512
513 ;; Fixme: Some of the non-word syntaxes here perhaps should be
514 ;; reviewed. (Note that the following all implicitly have word
515 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of
516 ;; relating Unicode categories to Emacs syntax codes.
db92e81e
KH
517
518 ;; NBSP isn't semantically interchangeable with other whitespace chars,
e1dbe924 519 ;; so it's more like punctuation.
db92e81e 520 (set-case-syntax ?  "." tbl)
4fb82d62
DL
521 (set-case-syntax ?¡ "." tbl)
522 (set-case-syntax ?¦ "_" tbl)
523 (set-case-syntax ?§ "." tbl)
524 (set-case-syntax ?© "_" tbl)
db3b7db5
SM
525 ;; French wants
526 ;; (set-case-syntax-delims ?« ?» tbl)
527 ;; And German wants
528 ;; (set-case-syntax-delims ?» ?« tbl)
529 ;; So let's stay neutral and let users set these up if/when they want to.
530 (set-case-syntax ?« "." tbl)
531 (set-case-syntax ?» "." tbl)
4fb82d62
DL
532 (set-case-syntax ?¬ "_" tbl)
533 (set-case-syntax ?­ "_" tbl)
534 (set-case-syntax ?® "_" tbl)
535 (set-case-syntax ?° "_" tbl)
536 (set-case-syntax ?± "_" tbl)
537 (set-case-syntax ?µ "_" tbl)
538 (set-case-syntax ?· "_" tbl)
539 (set-case-syntax ?¼ "_" tbl)
540 (set-case-syntax ?½ "_" tbl)
541 (set-case-syntax ?¾ "_" tbl)
542 (set-case-syntax ?¿ "." tbl)
543 (let ((c 192))
544 (while (<= c 222)
545 (set-case-syntax-pair c (+ c 32) tbl)
546 (setq c (1+ c))))
547 (set-case-syntax ?× "_" tbl)
548 (set-case-syntax ?ß "w" tbl)
549 (set-case-syntax ?÷ "_" tbl)
550 ;; See below for ÿ.
85ef8ece 551
85ef8ece
KH
552 ;; Latin Extended-A, Latin Extended-B
553 (setq c #x0100)
e5e381c8
KH
554 (while (<= c #x02B8)
555 (modify-category-entry c ?l)
d05cfa1f 556 (setq c (1+ c)))
2bb915b8 557
e5e381c8
KH
558 (let ((pair-ranges '((#x0100 . #x012F)
559 (#x0132 . #x0137)
560 (#x0139 . #x0148)
561 (#x014a . #x0177)
562 (#x0179 . #x017E)
563 (#x0182 . #x0185)
796f8b2f
KH
564 (#x0187 . #x0188)
565 (#x018B . #x018C)
e5e381c8
KH
566 (#x0191 . #x0192)
567 (#x0198 . #x0199)
568 (#x01A0 . #x01A5)
569 (#x01A7 . #x01A8)
570 (#x01AC . #x01AD)
571 (#x01AF . #x01B0)
572 (#x01B3 . #x01B6)
d0203d61 573 (#x01B8 . #x01B9)
e5e381c8
KH
574 (#x01BC . #x01BD)
575 (#x01CD . #x01DC)
576 (#x01DE . #x01EF)
577 (#x01F4 . #x01F5)
578 (#x01F8 . #x021F)
579 (#x0222 . #x0233)
580 (#x023B . #x023C)
581 (#x0241 . #x0242)
582 (#x0246 . #x024F))))
583 (dolist (elt pair-ranges)
584 (let ((from (car elt)) (to (cdr elt)))
585 (while (< from to)
586 (set-case-syntax-pair from (1+ from) tbl)
587 (setq from (+ from 2))))))
2bb915b8 588
d0203d61 589 (set-case-syntax-pair ?Ÿ ?ÿ tbl)
796f8b2f 590
2bb915b8
KH
591 ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
592 ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
593 ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
594 ;; SMALL LETTER I.
595
596 ;; We used to set up half of those correspondences unconditionally,
597 ;; but that makes searches slow. So now we don't set up either half
598 ;; of these correspondences by default.
599
600 ;; (set-downcase-syntax ?İ ?i tbl)
601 ;; (set-upcase-syntax ?I ?ı tbl)
602
0d93216c
AS
603 (set-case-syntax-pair ?Ɓ ?ɓ tbl)
604 (set-case-syntax-pair ?Ɔ ?ɔ tbl)
d0203d61
AS
605 (set-case-syntax-pair ?Ɖ ?ɖ tbl)
606 (set-case-syntax-pair ?Ɗ ?ɗ tbl)
0d93216c
AS
607 (set-case-syntax-pair ?Ǝ ?ǝ tbl)
608 (set-case-syntax-pair ?Ə ?ə tbl)
609 (set-case-syntax-pair ?Ɛ ?ɛ tbl)
610 (set-case-syntax-pair ?Ɠ ?ɠ tbl)
611 (set-case-syntax-pair ?Ɣ ?ɣ tbl)
612 (set-case-syntax-pair ?Ɩ ?ɩ tbl)
613 (set-case-syntax-pair ?Ɨ ?ɨ tbl)
614 (set-case-syntax-pair ?Ɯ ?ɯ tbl)
615 (set-case-syntax-pair ?Ɲ ?ɲ tbl)
616 (set-case-syntax-pair ?Ɵ ?ɵ tbl)
617 (set-case-syntax-pair ?Ʀ ?ʀ tbl)
618 (set-case-syntax-pair ?Ʃ ?ʃ tbl)
619 (set-case-syntax-pair ?Ʈ ?ʈ tbl)
620 (set-case-syntax-pair ?Ʊ ?ʊ tbl)
621 (set-case-syntax-pair ?Ʋ ?ʋ tbl)
622 (set-case-syntax-pair ?Ʒ ?ʒ tbl)
e6d10035
KH
623 (set-case-syntax-pair ?DŽ ?dž tbl)
624 (set-case-syntax-pair ?Dž ?dž tbl)
625 (set-case-syntax-pair ?LJ ?lj tbl)
626 (set-case-syntax-pair ?Lj ?lj tbl)
627 (set-case-syntax-pair ?NJ ?nj tbl)
628 (set-case-syntax-pair ?Nj ?nj tbl)
e5e381c8 629
269a5dd0 630 ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
e6d10035
KH
631 (set-case-syntax-pair ?DZ ?dz tbl)
632 (set-case-syntax-pair ?Dz ?dz tbl)
e6d10035
KH
633 (set-case-syntax-pair ?Ƕ ?ƕ tbl)
634 (set-case-syntax-pair ?Ƿ ?ƿ tbl)
cb80bcd1
EZ
635 (set-case-syntax-pair ?Ⱥ ?ⱥ tbl)
636 (set-case-syntax-pair ?Ƚ ?ƚ tbl)
637 (set-case-syntax-pair ?Ⱦ ?ⱦ tbl)
638 (set-case-syntax-pair ?Ƀ ?ƀ tbl)
639 (set-case-syntax-pair ?Ʉ ?ʉ tbl)
640 (set-case-syntax-pair ?Ʌ ?ʌ tbl)
269a5dd0 641
85ef8ece 642 ;; Latin Extended Additional
abdaa411 643 (modify-category-entry '(#x1e00 . #x1ef9) ?l)
85ef8ece 644 (setq c #x1e00)
d05cfa1f 645 (while (<= c #x1ef9)
d05cfa1f
KH
646 (and (zerop (% c 2))
647 (or (<= c #x1e94) (>= c #x1ea0))
abdaa411 648 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f
KH
649 (setq c (1+ c)))
650
85ef8ece 651 ;; Greek
abdaa411 652 (modify-category-entry '(#x0370 . #x03ff) ?g)
85ef8ece 653 (setq c #x0370)
d05cfa1f 654 (while (<= c #x03ff)
d05cfa1f
KH
655 (if (or (and (>= c #x0391) (<= c #x03a1))
656 (and (>= c #x03a3) (<= c #x03ab)))
abdaa411 657 (set-case-syntax-pair c (+ c 32) tbl))
d05cfa1f
KH
658 (and (>= c #x03da)
659 (<= c #x03ee)
660 (zerop (% c 2))
abdaa411 661 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f 662 (setq c (1+ c)))
e6d10035
KH
663 (set-case-syntax-pair ?Ά ?ά tbl)
664 (set-case-syntax-pair ?Έ ?έ tbl)
665 (set-case-syntax-pair ?Ή ?ή tbl)
666 (set-case-syntax-pair ?Ί ?ί tbl)
667 (set-case-syntax-pair ?Ό ?ό tbl)
668 (set-case-syntax-pair ?Ύ ?ύ tbl)
669 (set-case-syntax-pair ?Ώ ?ώ tbl)
d05cfa1f 670
269a5dd0
DL
671 ;; Armenian
672 (setq c #x531)
673 (while (<= c #x556)
abdaa411 674 (set-case-syntax-pair c (+ c #x30) tbl)
269a5dd0
DL
675 (setq c (1+ c)))
676
85ef8ece 677 ;; Greek Extended
abdaa411 678 (modify-category-entry '(#x1f00 . #x1fff) ?g)
85ef8ece 679 (setq c #x1f00)
d05cfa1f 680 (while (<= c #x1fff)
d05cfa1f
KH
681 (and (<= (logand c #x000f) 7)
682 (<= c #x1fa7)
796f8b2f
KH
683 (not (memq c '(#x1f16 #x1f17 #x1f56 #x1f57
684 #x1f50 #x1f52 #x1f54 #x1f56)))
685 (/= (logand c #x00f0) #x70)
abdaa411 686 (set-case-syntax-pair (+ c 8) c tbl))
d05cfa1f 687 (setq c (1+ c)))
e6d10035
KH
688 (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
689 (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
690 (set-case-syntax-pair ?Ὰ ?ὰ tbl)
691 (set-case-syntax-pair ?Ά ?ά tbl)
692 (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
693 (set-case-syntax-pair ?Ὲ ?ὲ tbl)
694 (set-case-syntax-pair ?Έ ?έ tbl)
695 (set-case-syntax-pair ?Ὴ ?ὴ tbl)
696 (set-case-syntax-pair ?Ή ?ή tbl)
697 (set-case-syntax-pair ?ῌ ?ῃ tbl)
698 (set-case-syntax-pair ?Ῐ ?ῐ tbl)
699 (set-case-syntax-pair ?Ῑ ?ῑ tbl)
700 (set-case-syntax-pair ?Ὶ ?ὶ tbl)
701 (set-case-syntax-pair ?Ί ?ί tbl)
702 (set-case-syntax-pair ?Ῠ ?ῠ tbl)
703 (set-case-syntax-pair ?Ῡ ?ῡ tbl)
704 (set-case-syntax-pair ?Ὺ ?ὺ tbl)
705 (set-case-syntax-pair ?Ύ ?ύ tbl)
706 (set-case-syntax-pair ?Ῥ ?ῥ tbl)
707 (set-case-syntax-pair ?Ὸ ?ὸ tbl)
708 (set-case-syntax-pair ?Ό ?ό tbl)
709 (set-case-syntax-pair ?Ὼ ?ὼ tbl)
710 (set-case-syntax-pair ?Ώ ?ώ tbl)
711 (set-case-syntax-pair ?ῼ ?ῳ tbl)
d05cfa1f 712
85ef8ece 713 ;; cyrillic
abdaa411 714 (modify-category-entry '(#x0400 . #x04FF) ?y)
85ef8ece 715 (setq c #x0400)
d05cfa1f 716 (while (<= c #x04ff)
d05cfa1f
KH
717 (and (>= c #x0400)
718 (<= c #x040f)
abdaa411 719 (set-case-syntax-pair c (+ c 80) tbl))
d05cfa1f
KH
720 (and (>= c #x0410)
721 (<= c #x042f)
abdaa411 722 (set-case-syntax-pair c (+ c 32) tbl))
d05cfa1f
KH
723 (and (zerop (% c 2))
724 (or (and (>= c #x0460) (<= c #x0480))
725 (and (>= c #x048c) (<= c #x04be))
726 (and (>= c #x04d0) (<= c #x04f4)))
8f924df7 727 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f 728 (setq c (1+ c)))
e6d10035
KH
729 (set-case-syntax-pair ?Ӂ ?ӂ tbl)
730 (set-case-syntax-pair ?Ӄ ?ӄ tbl)
731 (set-case-syntax-pair ?Ӈ ?ӈ tbl)
732 (set-case-syntax-pair ?Ӌ ?ӌ tbl)
733 (set-case-syntax-pair ?Ӹ ?ӹ tbl)
d05cfa1f 734
85ef8ece
KH
735 ;; general punctuation
736 (setq c #x2000)
d05cfa1f
KH
737 (while (<= c #x200b)
738 (set-case-syntax c " " tbl)
739 (setq c (1+ c)))
b427c97e
DL
740 (while (<= c #x200F)
741 (set-case-syntax c "." tbl)
742 (setq c (1+ c)))
743 ;; Fixme: These aren't all right:
6b61353c
KH
744 (setq c #x2010)
745 (while (<= c #x2016)
746 (set-case-syntax c "_" tbl)
747 (setq c (1+ c)))
748 ;; Punctuation syntax for quotation marks (like `)
749 (while (<= c #x201f)
750 (set-case-syntax c "." tbl)
751 (setq c (1+ c)))
752 ;; Fixme: These aren't all right:
d05cfa1f
KH
753 (while (<= c #x2027)
754 (set-case-syntax c "_" tbl)
755 (setq c (1+ c)))
b427c97e
DL
756 (while (<= c #x206F)
757 (set-case-syntax c "." tbl)
758 (setq c (1+ c)))
d05cfa1f 759
269a5dd0
DL
760 ;; Roman numerals
761 (setq c #x2160)
762 (while (<= c #x216f)
abdaa411 763 (set-case-syntax-pair c (+ c #x10) tbl)
269a5dd0
DL
764 (setq c (1+ c)))
765
4fb82d62
DL
766 ;; Fixme: The following blocks might be better as symbol rather than
767 ;; punctuation.
b427c97e
DL
768 ;; Arrows
769 (setq c #x2190)
6ca54a3a
DL
770 (while (<= c #x21FF)
771 (set-case-syntax c "." tbl)
b427c97e
DL
772 (setq c (1+ c)))
773 ;; Mathematical Operators
774 (while (<= c #x22FF)
6ca54a3a 775 (set-case-syntax c "." tbl)
b427c97e
DL
776 (setq c (1+ c)))
777 ;; Miscellaneous Technical
778 (while (<= c #x23FF)
6ca54a3a 779 (set-case-syntax c "." tbl)
b427c97e
DL
780 (setq c (1+ c)))
781 ;; Control Pictures
782 (while (<= c #x243F)
6ca54a3a 783 (set-case-syntax c "_" tbl)
269a5dd0
DL
784 (setq c (1+ c)))
785
786 ;; Circled Latin
787 (setq c #x24b6)
788 (while (<= c #x24cf)
abdaa411
DL
789 (set-case-syntax-pair c (+ c 26) tbl)
790 (modify-category-entry c ?l)
791 (modify-category-entry (+ c 26) ?l)
269a5dd0
DL
792 (setq c (1+ c)))
793
ac387dd1
EZ
794 ;; Coptic
795 (let ((pair-ranges '((#x2C80 . #x2CE2)
796 (#x2CEB . #x2CF2))))
797 (dolist (elt pair-ranges)
798 (let ((from (car elt)) (to (cdr elt)))
799 (while (< from to)
800 (set-case-syntax-pair from (1+ from) tbl)
ac387dd1 801 (setq from (+ from 2))))))
204db02a
EZ
802 ;; There's no Coptic category. However, Coptic letters that are
803 ;; part of the Greek block above get the Greek category, and those
804 ;; in this block are derived from Greek letters, so let's be
805 ;; consistent about their category.
806 (modify-category-entry '(#x2C80 . #x2CFF) ?g)
ac387dd1 807
269a5dd0
DL
808 ;; Fullwidth Latin
809 (setq c #xff21)
810 (while (<= c #xff3a)
abdaa411
DL
811 (set-case-syntax-pair c (+ c #x20) tbl)
812 (modify-category-entry c ?l)
813 (modify-category-entry (+ c #x20) ?l)
269a5dd0
DL
814 (setq c (1+ c)))
815
269a5dd0 816 ;; Combining diacritics
abdaa411 817 (modify-category-entry '(#x300 . #x362) ?^)
269a5dd0 818 ;; Combining marks
0ca754d0 819 (modify-category-entry '(#x20d0 . #x20ff) ?^)
269a5dd0
DL
820
821 ;; Fixme: syntax for symbols &c
822 )
6b61353c
KH
823
;; Paired delimiters.  Each opener gets open-parenthesis syntax whose
;; matching character is its closer, and each closer gets
;; close-parenthesis syntax whose matching character is its opener.
;; The pairs are given as (OPEN . CLOSE) code points rather than as
;; literal characters, so that look-alike or compatibility-equivalent
;; characters (for example U+2329 and U+3008, or U+FF08 and ASCII `(')
;; cannot be substituted for one another.
(let ((pairs
       '((#x2045 . #x2046) (#x207D . #x207E) (#x208D . #x208E)
         (#x2329 . #x232A) (#x23B4 . #x23B5)
         (#x2768 . #x2769) (#x276A . #x276B) (#x276C . #x276D)
         (#x2770 . #x2771) (#x2772 . #x2773) (#x2774 . #x2775)
         (#x27E6 . #x27E7) (#x27E8 . #x27E9) (#x27EA . #x27EB)
         (#x2983 . #x2984) (#x2985 . #x2986) (#x2987 . #x2988)
         (#x2989 . #x298A) (#x298B . #x298C) (#x298D . #x298E)
         (#x298F . #x2990) (#x2991 . #x2992) (#x2993 . #x2994)
         (#x2995 . #x2996) (#x2997 . #x2998) (#x29FC . #x29FD)
         (#x3008 . #x3009) (#x300A . #x300B) (#x300C . #x300D)
         (#x300E . #x300F) (#x3010 . #x3011) (#x3014 . #x3015)
         (#x3016 . #x3017) (#x3018 . #x3019) (#x301A . #x301B)
         (#xFD3E . #xFD3F)
         (#xFE35 . #xFE36) (#xFE37 . #xFE38) (#xFE39 . #xFE3A)
         (#xFE3B . #xFE3C) (#xFE3D . #xFE3E) (#xFE3F . #xFE40)
         (#xFE41 . #xFE42) (#xFE43 . #xFE44)
         (#xFE59 . #xFE5A) (#xFE5B . #xFE5C) (#xFE5D . #xFE5E)
         (#xFF08 . #xFF09) (#xFF3B . #xFF3D) (#xFF5B . #xFF5D)
         (#xFF5F . #xFF60) (#xFF62 . #xFF63))))
  (dolist (pair pairs)
    (let ((open (car pair))
          (close (cdr pair)))
      (modify-syntax-entry open (string ?\( close))
      (modify-syntax-entry close (string ?\) open)))))
881
4ed46869 882\f
777cfce6 883;; For each character set, put the information of the most proper
aaa9f206 884;; coding system to encode it by `preferred-coding-system' property.
777cfce6 885
abdaa411 886;; Fixme: should this be junked?
777cfce6
KH
;; Record, for every charset listed below, the coding system that
;; suits it best in its `preferred-coding-system' charset property.
;; Each element is (CHARSET . CODING-SYSTEM).
(dolist (entry '((latin-iso8859-1 . iso-latin-1)
                 (latin-iso8859-2 . iso-latin-2)
                 (latin-iso8859-3 . iso-latin-3)
                 (latin-iso8859-4 . iso-latin-4)
                 (thai-tis620 . thai-tis620)
                 (greek-iso8859-7 . greek-iso-8bit)
                 (arabic-iso8859-6 . iso-2022-7bit)
                 (hebrew-iso8859-8 . hebrew-iso-8bit)
                 (katakana-jisx0201 . japanese-shift-jis)
                 (latin-jisx0201 . japanese-shift-jis)
                 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
                 (latin-iso8859-9 . iso-latin-5)
                 (japanese-jisx0208-1978 . iso-2022-jp)
                 (chinese-gb2312 . chinese-iso-8bit)
                 (chinese-gbk . chinese-gbk)
                 (gb18030-2-byte . chinese-gb18030)
                 (gb18030-4-byte-bmp . chinese-gb18030)
                 (gb18030-4-byte-smp . chinese-gb18030)
                 (gb18030-4-byte-ext-1 . chinese-gb18030)
                 (gb18030-4-byte-ext-2 . chinese-gb18030)
                 (japanese-jisx0208 . iso-2022-jp)
                 (korean-ksc5601 . iso-2022-kr)
                 (japanese-jisx0212 . iso-2022-jp)
                 (chinese-big5-1 . chinese-big5)
                 (chinese-big5-2 . chinese-big5)
                 (chinese-sisheng . iso-2022-7bit)
                 (ipa . iso-2022-7bit)
                 (vietnamese-viscii-lower . vietnamese-viscii)
                 (vietnamese-viscii-upper . vietnamese-viscii)
                 (arabic-digit . iso-2022-7bit)
                 (arabic-1-column . iso-2022-7bit)
                 (lao . lao)
                 (arabic-2-column . iso-2022-7bit)
                 (indian-is13194 . devanagari)
                 (indian-glyph . devanagari)
                 (tibetan-1-column . tibetan)
                 (ethiopic . iso-2022-7bit)
                 (chinese-cns11643-1 . iso-2022-cn)
                 (chinese-cns11643-2 . iso-2022-cn)
                 (chinese-cns11643-3 . iso-2022-cn)
                 (chinese-cns11643-4 . iso-2022-cn)
                 (chinese-cns11643-5 . iso-2022-cn)
                 (chinese-cns11643-6 . iso-2022-cn)
                 (chinese-cns11643-7 . iso-2022-cn)
                 (indian-2-column . devanagari)
                 (tibetan . tibetan)
                 (latin-iso8859-14 . iso-latin-8)
                 (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car entry) 'preferred-coding-system (cdr entry)))
df0415c5
KH
938
939\f
98a663f1 940;; Setup auto-fill-chars for charsets that should invoke auto-filling.
7760ba82 941;; SPACE and NEWLINE are already set.
df21429c
KH
942
;; Mark each of these code-point ranges in `auto-fill-chars'.
(dolist (range '((#x3041 . #x30FF)
                 (#x3400 . #x4DB5)
                 (#x4e00 . #x9fbb)
                 (#xF900 . #xFAFF)
                 (#xFF00 . #xFF9F)
                 (#x20000 . #x2FFFF)))
  (set-char-table-range auto-fill-chars range t))
949
55bd52ea 950\f
7760ba82
KH
951;;; Setting char-width-table. The default is 1.
952
953;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
954;; and final characters.
;; Every (FROM . TO) range below is given width 0 in `char-width-table'.
(mapc
 (lambda (zero-width-range)
   (set-char-table-range char-width-table zero-width-range 0))
 '((#x0300 . #x036F) (#x0483 . #x0489) (#x0591 . #x05BD)
   (#x05BF . #x05BF) (#x05C1 . #x05C2) (#x05C4 . #x05C5)
   (#x05C7 . #x05C7) (#x0600 . #x0603) (#x0610 . #x0615)
   (#x064B . #x065E) (#x0670 . #x0670) (#x06D6 . #x06E4)
   (#x06E7 . #x06E8) (#x06EA . #x06ED) (#x070F . #x070F)
   (#x0711 . #x0711) (#x0730 . #x074A) (#x07A6 . #x07B0)
   (#x07EB . #x07F3) (#x0901 . #x0902) (#x093C . #x093C)
   (#x0941 . #x0948) (#x094D . #x094D) (#x0951 . #x0954)
   (#x0962 . #x0963) (#x0981 . #x0981) (#x09BC . #x09BC)
   (#x09C1 . #x09C4) (#x09CD . #x09CD) (#x09E2 . #x09E3)
   (#x0A01 . #x0A02) (#x0A3C . #x0A3C) (#x0A41 . #x0A4D)
   (#x0A70 . #x0A71) (#x0A81 . #x0A82) (#x0ABC . #x0ABC)
   (#x0AC1 . #x0AC8) (#x0ACD . #x0ACD) (#x0AE2 . #x0AE3)
   (#x0B01 . #x0B01) (#x0B3C . #x0B3C) (#x0B3F . #x0B3F)
   (#x0B41 . #x0B43) (#x0B4D . #x0B56) (#x0B82 . #x0B82)
   (#x0BC0 . #x0BC0) (#x0BCD . #x0BCD) (#x0C3E . #x0C40)
   (#x0C46 . #x0C56) (#x0CBC . #x0CBC) (#x0CBF . #x0CBF)
   (#x0CC6 . #x0CC6) (#x0CCC . #x0CCD) (#x0CE2 . #x0CE3)
   (#x0D41 . #x0D43) (#x0D4D . #x0D4D) (#x0DCA . #x0DCA)
   (#x0DD2 . #x0DD6) (#x0E31 . #x0E31) (#x0E34 . #x0E3A)
   (#x0E47 . #x0E4E) (#x0EB1 . #x0EB1) (#x0EB4 . #x0EBC)
   (#x0EC8 . #x0ECD) (#x0F18 . #x0F19) (#x0F35 . #x0F35)
   (#x0F37 . #x0F37) (#x0F39 . #x0F39) (#x0F71 . #x0F7E)
   (#x0F80 . #x0F84) (#x0F86 . #x0F87) (#x0F90 . #x0FBC)
   (#x0FC6 . #x0FC6) (#x102D . #x1030) (#x1032 . #x1037)
   (#x1039 . #x1039) (#x1058 . #x1059) (#x1160 . #x11FF)
   (#x135F . #x135F) (#x1712 . #x1714) (#x1732 . #x1734)
   (#x1752 . #x1753) (#x1772 . #x1773) (#x17B4 . #x17B5)
   (#x17B7 . #x17BD) (#x17C6 . #x17C6) (#x17C9 . #x17D3)
   (#x17DD . #x17DD) (#x180B . #x180D) (#x18A9 . #x18A9)
   (#x1920 . #x1922) (#x1927 . #x1928) (#x1932 . #x1932)
   (#x1939 . #x193B) (#x1A17 . #x1A18) (#x1B00 . #x1B03)
   (#x1B34 . #x1B34) (#x1B36 . #x1B3A) (#x1B3C . #x1B3C)
   (#x1B42 . #x1B42) (#x1B6B . #x1B73) (#x1DC0 . #x1DFF)
   (#x200B . #x200F) (#x202A . #x202E) (#x2060 . #x206F)
   (#x20D0 . #x20EF) (#x302A . #x302F) (#x3099 . #x309A)
   (#xA806 . #xA806) (#xA80B . #xA80B) (#xA825 . #xA826)
   (#xFB1E . #xFB1E) (#xFE00 . #xFE0F) (#xFE20 . #xFE23)
   (#xFEFF . #xFEFF) (#xFFF9 . #xFFFB) (#x10A01 . #x10A0F)
   (#x10A38 . #x10A3F) (#x1D167 . #x1D169) (#x1D173 . #x1D182)
   (#x1D185 . #x1D18B) (#x1D1AA . #x1D1AD) (#x1D242 . #x1D244)
   (#xE0001 . #xE01EF)))
1081
1082;; 2: East Asian Wide and Full-width characters.
;; Every (FROM . TO) range below is given width 2 in `char-width-table'.
(mapc
 (lambda (wide-range)
   (set-char-table-range char-width-table wide-range 2))
 '((#x1100 . #x115F)
   (#x2329 . #x232A)
   (#x2E80 . #x303E)
   (#x3040 . #xA4CF)
   (#xAC00 . #xD7A3)
   (#xF900 . #xFAFF)
   (#xFE30 . #xFE6F)
   (#xFF01 . #xFF60)
   (#xFFE0 . #xFFE6)
   (#x20000 . #x2FFFF)
   (#x30000 . #x3FFFF)))
173f18ce
DL
1096
1097;; Other double width
7760ba82
KH
1098;;(map-charset-chars
1099;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
1100;; 'ethiopic)
1101;; (map-charset-chars
1102;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
1103;; 'tibetan)
;; Characters of these two charsets get width 2 in `char-width-table'.
(dolist (charset '(indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (range _ignore)
     (set-char-table-range char-width-table range 2))
   charset))
777cfce6 1110
dbff07a2
KH
(defvar cjk-char-width-table-list
  '((ja_JP nil (japanese-jisx0208 (#x2121 . #x287E))
	   (cp932-2-byte (#x8140 . #x879F)))
    (zh_CN nil (chinese-gb2312 (#x2121 . #x297E)))
    (zh_HK nil (big5-hkscs (#xA140 . #xA3FE) (#xC6A0 . #xC8FE)))
    (zh_TW nil (big5 (#xA140 . #xA3FE))
	   (chinese-cns11643-1 (#x2121 . #x427E)))
    (ko_KR nil (korean-ksc5601 (#x2121 . #x2C7E))))
  "Internal use only.
Alist of locale symbols vs charsets.  In a language environment
corresponding to the locale, the width of characters in the charsets
is set to 2.  Each element has the form:
  LOCALE TABLE (CHARSET (FROM-CODE . TO-CODE) ...) ...
where
  LOCALE is a locale symbol;
  TABLE is the char-table used for `char-width-table', initially nil
    and filled in by `use-cjk-char-width-table' on first use;
  CHARSET is a character set;
  FROM-CODE and TO-CODE delimit a range of code points in CHARSET.")
1129
(defun use-cjk-char-width-table (locale-name)
  "Internal use only.
Setup `char-width-table' appropriate for a language environment
corresponding to LOCALE-NAME (symbol).

LOCALE-NAME must be a key of `cjk-char-width-table-list'; otherwise
an error is signaled.  The locale's table is built on first use and
cached in that list, with the default table as its parent."
  ;; Walk up to the parentless table first, so a previously installed
  ;; locale-specific table never becomes the parent of the new one.
  (while (char-table-parent char-width-table)
    (setq char-width-table (char-table-parent char-width-table)))
  (let ((slot (assq locale-name cjk-char-width-table-list)))
    (or slot (error "Unknown locale for CJK language environment: %s"
		    locale-name))
    ;; (nth 1 slot) is the cached TABLE; nil means it is not built yet.
    (unless (nth 1 slot)
      (let ((table (make-char-table nil)))
	(dolist (charset-info (nthcdr 2 slot))
	  (let ((charset (car charset-info)))
	    (dolist (code-range (cdr charset-info))
	      (map-charset-chars (lambda (range _arg)
				   (set-char-table-range table range 2))
				 charset nil
				 (car code-range) (cdr code-range)))))
	(optimize-char-table table)
	(set-char-table-parent table char-width-table)
	;; Cache the table in the TABLE position of the alist entry.
	(setcar (cdr slot) table)))
    (setq char-width-table (nth 1 slot))))
55a3ed16
KH
1153
(defun use-default-char-width-table ()
  "Internal use only.
Make `char-width-table' suitable for a non-CJK language environment
by replacing it with its topmost (parentless) ancestor table."
  (let (parent)
    (while (setq parent (char-table-parent char-width-table))
      (setq char-width-table parent))))
55a3ed16 1159
;; Optimize the standard case table, then the standard syntax table.
(dolist (table (list (standard-case-table) (standard-syntax-table)))
  (optimize-char-table table))
1162
55a3ed16
KH
1163\f
1164;; Setting char-script-table.
1165
57939ff4
EZ
1166;; The data is compiled from Blocks.txt and Scripts.txt in the
1167;; "Unicode Character Database", simplified to lump together all the
1168;; blocks belonging to the same language. E.g., "Basic Latin",
1169;; "Latin-1 Supplement", "Latin Extended-A", etc. are all lumped
1170;; together under "latin".
1171;;
b427c97e
DL
1172;; The Unicode blocks actually extend past some of these ranges with
1173;; undefined codepoints.
9ce5de1c
KH
;; Each element is (FROM TO SCRIPT): code points FROM..TO (inclusive)
;; are assigned SCRIPT in `char-script-table'.  The distinct script
;; symbols, in order of first appearance, are then stored in extra
;; slot 0 of that table.
(let ((script-list nil))
  (dolist
      (elt
       '((#x0000 #x007F latin)
	 (#x00A0 #x024F latin)
	 (#x0250 #x02AF phonetic)
	 (#x02B0 #x036F latin)
	 (#x0370 #x03E1 greek)
	 (#x03E2 #x03EF coptic)
	 (#x03F0 #x03F3 greek)
	 (#x0400 #x052F cyrillic)
	 (#x0530 #x058F armenian)
	 (#x0590 #x05FF hebrew)
	 (#x0600 #x06FF arabic)
	 (#x0700 #x074F syriac)
	 (#x0750 #x077F arabic)
	 (#x0780 #x07BF thaana)
	 (#x07C0 #x07FF nko)
	 (#x0800 #x083F samaritan)
	 (#x0840 #x085F mandaic)
	 (#x08A0 #x08FF arabic)
	 (#x0900 #x097F devanagari)
	 (#x0980 #x09FF bengali)
	 (#x0A00 #x0A7F gurmukhi)
	 (#x0A80 #x0AFF gujarati)
	 (#x0B00 #x0B7F oriya)
	 (#x0B80 #x0BFF tamil)
	 (#x0C00 #x0C7F telugu)
	 (#x0C80 #x0CFF kannada)
	 (#x0D00 #x0D7F malayalam)
	 (#x0D80 #x0DFF sinhala)
	 (#x0E00 #x0E7F thai)
	 (#x0E80 #x0EFF lao)
	 (#x0F00 #x0FFF tibetan)
	 (#x1000 #x109F burmese) ; according to Unicode 6.1, should be "myanmar"
	 (#x10A0 #x10FF georgian)
	 (#x1100 #x11FF hangul)
	 (#x1200 #x139F ethiopic)
	 (#x13A0 #x13FF cherokee)
	 (#x1400 #x167F canadian-aboriginal)
	 (#x1680 #x169F ogham)
	 (#x16A0 #x16FF runic)
	 (#x1700 #x171F tagalog)
	 (#x1720 #x173F hanunoo)
	 (#x1740 #x175F buhid)
	 (#x1760 #x177F tagbanwa)
	 (#x1780 #x17FF khmer)
	 (#x1800 #x18AF mongolian)
	 (#x18B0 #x18FF canadian-aboriginal)
	 (#x1900 #x194F limbu)
	 (#x1950 #x197F tai-le)
	 (#x1980 #x19DF tai-lue)
	 (#x19E0 #x19FF khmer)
	 ;; The Buginese block is U+1A00..U+1A1F, not the single
	 ;; code point U+1A00.
	 (#x1A00 #x1A1F buginese)
	 (#x1A20 #x1AAF tai-tham)
	 (#x1B00 #x1B7F balinese)
	 (#x1B80 #x1BBF sundanese)
	 (#x1BC0 #x1BFF batak)
	 (#x1C00 #x1C4F lepcha)
	 (#x1C50 #x1C7F ol-chiki)
	 (#x1CC0 #x1CCF sundanese)
	 (#x1CD0 #x1CFF vedic)
	 (#x1D00 #x1DBF phonetic)
	 (#x1DC0 #x1EFF latin)
	 (#x1F00 #x1FFF greek)
	 (#x2000 #x27FF symbol)
	 (#x2800 #x28FF braille)
	 (#x2900 #x2BFF symbol)
	 (#x2C00 #x2C5F glagolitic)
	 (#x2C60 #x2C7F latin)
	 (#x2C80 #x2CFF coptic)
	 (#x2D00 #x2D2F georgian)
	 (#x2D30 #x2D7F tifinagh)
	 (#x2D80 #x2DDF ethiopic)
	 (#x2DE0 #x2DFF cyrillic)
	 (#x2E00 #x2E7F symbol)
	 (#x2E80 #x2FDF han)
	 (#x2FF0 #x2FFF ideographic-description)
	 (#x3000 #x303F cjk-misc)
	 (#x3040 #x30FF kana)
	 (#x3100 #x312F bopomofo)
	 (#x3130 #x318F hangul)
	 (#x3190 #x319F kanbun)
	 (#x31A0 #x31BF bopomofo)
	 (#x31C0 #x31EF cjk-misc)
	 (#x31F0 #x31FF kana)
	 (#x3200 #x9FAF han)
	 (#xA000 #xA4CF yi)
	 (#xA4D0 #xA4FF lisu)
	 (#xA500 #xA63F vai)
	 (#xA640 #xA69F cyrillic)
	 (#xA6A0 #xA6FF bamum)
	 (#xA700 #xA7FF latin)
	 (#xA800 #xA82F syloti-nagri)
	 (#xA830 #xA83F north-indic-number)
	 (#xA840 #xA87F phags-pa)
	 (#xA880 #xA8DF saurashtra)
	 (#xA8E0 #xA8FF devanagari)
	 (#xA900 #xA92F kayah-li)
	 (#xA930 #xA95F rejang)
	 (#xA960 #xA97F hangul)
	 (#xA980 #xA9DF javanese)
	 (#xAA00 #xAA5F cham)
	 (#xAA60 #xAA7B burmese)  ; Unicode 6.1: "myanmar"
	 (#xAA80 #xAADF tai-viet)
	 (#xAAE0 #xAAFF meetei-mayek)
	 (#xAB00 #xAB2F ethiopic)
	 (#xABC0 #xABFF meetei-mayek)
	 (#xAC00 #xD7FF hangul)
	 (#xF900 #xFAFF han)
	 (#xFB1D #xFB4F hebrew)
	 (#xFB50 #xFDFF arabic)
	 (#xFE30 #xFE4F han)
	 (#xFE70 #xFEFF arabic)
	 (#xFF00 #xFF5F cjk-misc)
	 (#xFF61 #xFF9F kana)
	 (#xFFE0 #xFFE6 cjk-misc)
	 (#x10000 #x100FF linear-b)
	 (#x10100 #x1013F aegean-number)
	 (#x10140 #x1018F ancient-greek-number)
	 (#x10190 #x101CF ancient-symbol)
	 (#x101D0 #x101FF phaistos-disc)
	 (#x10280 #x1029F lycian)
	 (#x102A0 #x102DF carian)
	 ;; NOTE(review): `olt-italic' looks like a typo for
	 ;; `old-italic'.  Left unchanged here because other files may
	 ;; refer to this script symbol -- confirm and rename together.
	 (#x10300 #x1032F olt-italic)
	 (#x10330 #x1034F gothic)
	 (#x10380 #x1039F ugaritic)
	 (#x103A0 #x103DF old-persian)
	 (#x10400 #x1044F deseret)
	 (#x10450 #x1047F shavian)
	 (#x10480 #x104AF osmanya)
	 (#x10800 #x1083F cypriot-syllabary)
	 (#x10840 #x1085F aramaic)
	 (#x10900 #x1091F phoenician)
	 (#x10920 #x1093F lydian)
	 (#x10980 #x109FF meroitic)
	 (#x10A00 #x10A5F kharoshthi)
	 (#x10A60 #x10A7F old-south-arabian)
	 (#x10B00 #x10B3F avestan)
	 (#x10B40 #x10B5F inscriptional-parthian)
	 (#x10B60 #x10B7F inscriptional-pahlavi)
	 (#x10C00 #x10C4F old-turkic)
	 (#x10E60 #x10E7F rumi-number)
	 (#x11000 #x1107F brahmi)
	 (#x11080 #x110CF kaithi)
	 (#x110D0 #x110FF sora-sompeng)
	 (#x11100 #x1114F chakma)
	 (#x11180 #x111DF sharada)
	 (#x11680 #x116CF takri)
	 (#x12000 #x123FF cuneiform)
	 (#x12400 #x1247F cuneiform-numbers-and-punctuation)
	 (#x13000 #x1342F egyptian)
	 (#x16800 #x16A3F bamum)
	 (#x16F00 #x16F9F miao)
	 (#x1B000 #x1B0FF kana)
	 (#x1D000 #x1D0FF byzantine-musical-symbol)
	 (#x1D100 #x1D1FF musical-symbol)
	 (#x1D200 #x1D24F ancient-greek-musical-notation)
	 (#x1D300 #x1D35F tai-xuan-jing-symbol)
	 (#x1D360 #x1D37F counting-rod-numeral)
	 (#x1D400 #x1D7FF mathematical)
	 (#x1EE00 #x1EEFF arabic)
	 (#x1F000 #x1F02F mahjong-tile)
	 (#x1F030 #x1F09F domino-tile)
	 (#x1F0A0 #x1F0FF playing-cards)
	 (#x1F100 #x1F1FF symbol)
	 (#x1F200 #x1F2FF han)
	 (#x1F300 #x1F64F symbol)
	 (#x1F680 #x1F77F symbol)
	 (#x20000 #x2B81F han)
	 (#x2F800 #x2FFFF han)))
    (set-char-table-range char-script-table
			  (cons (car elt) (nth 1 elt)) (nth 2 elt))
    ;; Collect each script symbol once, in order of first appearance
    ;; (the list is built reversed and flipped below).
    (or (memq (nth 2 elt) script-list)
	(setq script-list (cons (nth 2 elt) script-list))))
  (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))
1350
;; Assign the `tibetan' script to every character of the `tibetan'
;; charset.
(map-charset-chars
 (lambda (tibetan-range _unused)
   (set-char-table-range char-script-table tibetan-range 'tibetan))
 'tibetan)
1355
e7259832 1356\f
59db3a5c
KH
1357;;; Setting unicode-category-table.
1358
20372d0c
GM
;; Load the general-category property table.  When it is available,
;; put category `.' on every character that has a general category
;; whose name does not start with `M' or `C'.
(when (setq unicode-category-table
            (unicode-property-table-internal 'general-category))
  (map-char-table
   (lambda (key val)
     (when val
       (let ((major-class (aref (symbol-name val) 0)))
         (when (or (not (memq major-class '(?M ?C)))
                   (eq val 'Zs))
           (modify-category-entry key ?.)))))
   unicode-category-table))

(optimize-char-table (standard-category-table))
59db3a5c
KH
1370
1371\f
b2cca856
KH
1372;; Display of glyphless characters.
1373
(defvar char-acronym-table
  (make-char-table 'char-acronym-table nil)
  "Char table of acronyms for non-graphic characters.")

;; C0 controls, U+0000..U+001F.  The two nil entries are TAB (U+0009)
;; and newline (U+000A), which get no acronym.
(let ((code #x0000))
  (dolist (acronym '("NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK" "BEL"
                     "BS"   nil   nil  "VT"  "FF"  "CR"  "SO"  "SI"
                     "DLE" "DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB"
                     "CAN" "EM"  "SUB" "ESC" "FS"  "GS"  "RS"  "US"))
    (aset char-acronym-table code acronym)
    (setq code (1+ code))))

;; C1 controls, U+0080..U+009F.  "XXX" is kept for U+0080, U+0081
;; and U+0099 exactly as before.
(let ((code #x0080))
  (dolist (acronym '("XXX" "XXX" "BPH" "NBH" "IND" "NEL" "SSA" "ESA"
                     "HTS" "HTJ" "VTS" "PLD" "PLU" "RI"  "SS2" "SS3"
                     "DCS" "PU1" "PU2" "STS" "CCH" "MW"  "SPA" "EPA"
                     "SOS" "XXX" "SCI" "CSI" "ST"  "OSC" "PM"  "APC"))
    (aset char-acronym-table code acronym)
    (setq code (1+ code))))

;; Individual format and other non-graphic characters.
(dolist (entry
         '((#x17B4 . "KIVAQ")   ; KHMER VOWEL INHERENT AQ
           (#x17B5 . "KIVAA")   ; KHMER VOWEL INHERENT AA
           (#x200B . "ZWSP")    ; ZERO WIDTH SPACE
           (#x200C . "ZWNJ")    ; ZERO WIDTH NON-JOINER
           (#x200D . "ZWJ")     ; ZERO WIDTH JOINER
           (#x200E . "LRM")     ; LEFT-TO-RIGHT MARK
           (#x200F . "RLM")     ; RIGHT-TO-LEFT MARK
           (#x202A . "LRE")     ; LEFT-TO-RIGHT EMBEDDING
           (#x202B . "RLE")     ; RIGHT-TO-LEFT EMBEDDING
           (#x202C . "PDF")     ; POP DIRECTIONAL FORMATTING
           (#x202D . "LRO")     ; LEFT-TO-RIGHT OVERRIDE
           (#x202E . "RLO")     ; RIGHT-TO-LEFT OVERRIDE
           (#x2060 . "WJ")      ; WORD JOINER
           (#x206A . "ISS")     ; INHIBIT SYMMETRIC SWAPPING
           (#x206B . "ASS")     ; ACTIVATE SYMMETRIC SWAPPING
           (#x206C . "IAFS")    ; INHIBIT ARABIC FORM SHAPING
           (#x206D . "AAFS")    ; ACTIVATE ARABIC FORM SHAPING
           (#x206E . "NADS")    ; NATIONAL DIGIT SHAPES
           (#x206F . "NODS")    ; NOMINAL DIGIT SHAPES
           (#xFEFF . "ZWNBSP")  ; ZERO WIDTH NO-BREAK SPACE
           (#xFFF9 . "IAA")     ; INTERLINEAR ANNOTATION ANCHOR
           (#xFFFA . "IAS")     ; INTERLINEAR ANNOTATION SEPARATOR
           (#xFFFB . "IAT")     ; INTERLINEAR ANNOTATION TERMINATOR
           (#x1D173 . "BEGBM")  ; MUSICAL SYMBOL BEGIN BEAM
           (#x1D174 . "ENDBM")  ; MUSICAL SYMBOL END BEAM
           (#x1D175 . "BEGTIE") ; MUSICAL SYMBOL BEGIN TIE
           (#x1D176 . "ENDTIE") ; MUSICAL SYMBOL END TIE
           (#x1D177 . "BEGSLR") ; MUSICAL SYMBOL BEGIN SLUR
           (#x1D178 . "ENDSLR") ; MUSICAL SYMBOL END SLUR
           (#x1D179 . "BEGPHR") ; MUSICAL SYMBOL BEGIN PHRASE
           (#x1D17A . "ENDPHR") ; MUSICAL SYMBOL END PHRASE
           (#xE0001 . "|->TAG") ; LANGUAGE TAG
           (#xE0020 . "SP TAG") ; TAG SPACE
           (#xE007F . "->|TAG"))) ; CANCEL TAG
  (aset char-acronym-table (car entry) (cdr entry)))

;; U+E0021..U+E007E: the acronym shows the ASCII character whose code
;; is the tag character's code minus #xE0000.
(dotimes (i 94)
  (aset char-acronym-table (+ #xE0021 i) (format " %c TAG" (+ 33 i))))
1430
(defun update-glyphless-char-display (&optional variable value)
  "Make the setting of `glyphless-char-display-control' take effect.
This function updates the char-table `glyphless-char-display'.

It is written to serve as a custom `:set' function: when VARIABLE
is non-nil, its default value is set to VALUE, including when
VALUE is nil.  VALUE is a list of (GROUP . METHOD) directives as
described in `glyphless-char-display-control'; each directive is
applied in order.  Signal an error for an unknown GROUP or METHOD."
  ;; Test VARIABLE, not VALUE: a nil VALUE must still be stored, and
  ;; a call without VARIABLE must not try to set the symbol nil.
  (when variable
    (set-default variable value))
  (dolist (elt value)
    (let ((target (car elt))
	  (method (cdr elt)))
      (or (memq method '(zero-width thin-space empty-box acronym hex-code))
	  (error "Invalid glyphless character display method: %s" method))
      (cond ((eq target 'c0-control)
	     (glyphless-set-char-table-range glyphless-char-display
					     #x00 #x1F method)
	     ;; Users will not expect their newlines and TABs be
	     ;; displayed as anything but themselves, so exempt those
	     ;; two characters from c0-control.
	     (set-char-table-range glyphless-char-display #x9 nil)
	     (set-char-table-range glyphless-char-display #xa nil))
	    ((eq target 'c1-control)
	     (glyphless-set-char-table-range glyphless-char-display
					     #x80 #x9F method))
	    ((eq target 'format-control)
	     (when unicode-category-table
	       (map-char-table
		(lambda (char category)
		  (if (eq category 'Cf)
		      (let ((this-method method)
			    from to)
			;; CHAR is either one character or a
			;; (FROM . TO) range.
			(if (consp char)
			    (setq from (car char) to (cdr char))
			  (setq from char to char))
			(while (<= from to)
			  ;; U+00AD is deliberately left alone.
			  (when (/= from #xAD)
			    ;; For `acronym', store the character's own
			    ;; entry from `char-acronym-table'.
			    (if (eq method 'acronym)
				(setq this-method
				      (aref char-acronym-table from)))
			    (set-char-table-range glyphless-char-display
						  from this-method))
			  (setq from (1+ from))))))
		unicode-category-table)))
	    ((eq target 'no-font)
	     ;; The method for `no-font' lives in extra slot 0.
	     (set-char-table-extra-slot glyphless-char-display 0 method))
	    (t
	     (error "Invalid glyphless character group: %s" target))))))
bd3921f0
PS
1475
(defun glyphless-set-char-table-range (chartable from to method)
  "Set the entries of CHARTABLE for characters FROM through TO.
When METHOD is the symbol `acronym', each character gets its own
entry from `char-acronym-table'; otherwise the whole range is set
to METHOD."
  (cond ((not (eq method 'acronym))
         (set-char-table-range chartable (cons from to) method))
        (t
         (let ((code from))
           (while (<= code to)
             (set-char-table-range chartable code
                                   (aref char-acronym-table code))
             (setq code (1+ code)))))))
b2cca856 1483
0e7c0582
EZ
1484;;; Control of displaying glyphless characters.
(defcustom glyphless-char-display-control
  '((format-control . thin-space)
    (no-font . hex-code))
  "List of directives to control display of glyphless characters.

Each element has the form (GROUP . METHOD), where GROUP is a
symbol specifying the character group, and METHOD is a symbol
specifying the method of displaying characters belonging to that
group.

GROUP must be one of these symbols:
  `c0-control':     U+0000..U+001F, but excluding newline and TAB.
  `c1-control':     U+0080..U+009F.
  `format-control': Characters of Unicode General Category `Cf',
                    such as U+200C (ZWNJ), U+200E (LRM), but
                    excluding characters that have graphic images,
                    such as U+00AD (SHY).
  `no-font':        characters for which no suitable font is found.
                    For character terminals, characters that cannot
                    be encoded by `terminal-coding-system'.

METHOD must be one of these symbols:
  `zero-width': don't display.
  `thin-space': display a thin (1-pixel width) space.  On character
                terminals, display as 1-character space.
  `empty-box':  display an empty box.
  `acronym':    display an acronym of the character in a box.  The
                acronym is taken from `char-acronym-table', which see.
  `hex-code':   display the hexadecimal character code in a box.

Do not set its value directly from Lisp; the value takes effect
only via a custom `:set'
function (`update-glyphless-char-display'), which updates
`glyphless-char-display'."
  :version "24.1"
  :type '(alist :key-type (symbol :tag "Character Group")
		:value-type (symbol :tag "Display Method"))
  ;; Every group offers the same choice of display methods, so build
  ;; the (GROUP CHOICE) option entries from a single choice spec.
  :options (let ((method-choice
                  '(choice (const :tag "Don't display" zero-width)
                           (const :tag "Display as thin space" thin-space)
                           (const :tag "Display as empty box" empty-box)
                           (const :tag "Display acronym" acronym)
                           (const :tag "Display hex code in a box" hex-code))))
             (mapcar (lambda (group) (list group method-choice))
                     '(c0-control c1-control format-control no-font)))
  :set 'update-glyphless-char-display
  :group 'display)
1548
b2cca856 1549\f
e7259832
KH
1550;;; Setting word boundary.
1551
;; Pairs of categories that combine into one word, and pairs that
;; separate words.
(setq word-combining-categories '((nil . ?^)
                                  (?^ . nil)
                                  (?C . ?H)
                                  (?C . ?K))
      ;; (2-byte character sets)
      word-separating-categories '((?H . ?K))) ; Hiragana - Katakana
1561
1cbfaab9 1562;; Local Variables:
985773c9 1563;; coding: utf-8
1cbfaab9 1564;; End:
777cfce6 1565
60370d40 1566;;; characters.el ends here