* calendar/todo-mode.el (todo-set-top-priorities): Fix logic to
[bpt/emacs.git] / lisp / international / characters.el
CommitLineData
4ed46869
KH
1;;; characters.el --- set syntax and category for multibyte characters
2
ba318903 3;; Copyright (C) 1997, 2000-2014 Free Software Foundation, Inc.
7976eda0 4;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 5;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
2fd125a3
KH
6;; National Institute of Advanced Industrial Science and Technology (AIST)
7;; Registration Number H14PRO021
8f924df7 8;; Copyright (C) 2003
55bd52ea
KH
9;; National Institute of Advanced Industrial Science and Technology (AIST)
10;; Registration Number H13PRO009
4ed46869
KH
11
12;; Keywords: multibyte character, character set, syntax, category
13
14;; This file is part of GNU Emacs.
15
4936186e 16;; GNU Emacs is free software: you can redistribute it and/or modify
4ed46869 17;; it under the terms of the GNU General Public License as published by
4936186e
GM
18;; the Free Software Foundation, either version 3 of the License, or
19;; (at your option) any later version.
4ed46869
KH
20
21;; GNU Emacs is distributed in the hope that it will be useful,
22;; but WITHOUT ANY WARRANTY; without even the implied warranty of
23;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24;; GNU General Public License for more details.
25
26;; You should have received a copy of the GNU General Public License
4936186e 27;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
4ed46869
KH
28
29;;; Commentary:
30
60370d40
PJ
31;;; Code:
32
4ed46869
KH
33;;; Predefined categories.
34
35;; For each character set.
36
46bf60bc
KH
37(define-category ?a "ASCII
38ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
4ed46869
KH
39(define-category ?l "Latin")
40(define-category ?t "Thai")
41(define-category ?g "Greek")
42(define-category ?b "Arabic")
43(define-category ?w "Hebrew")
44(define-category ?y "Cyrillic")
46bf60bc
KH
45(define-category ?k "Katakana
46Japanese katakana")
47(define-category ?r "Roman
48Japanese roman")
4ed46869
KH
49(define-category ?c "Chinese")
50(define-category ?j "Japanese")
51(define-category ?h "Korean")
46bf60bc
KH
52(define-category ?e "Ethiopic
53Ethiopic (Ge'ez)")
54(define-category ?v "Viet
55Vietnamese")
4ed46869 56(define-category ?i "Indian")
6eba8645 57(define-category ?o "Lao")
9395eb7c 58(define-category ?q "Tibetan")
4ed46869
KH
59
60;; For each group (row) of 2-byte character sets.
61
46bf60bc
KH
62(define-category ?A "2-byte alnum
63Alpha-numeric characters of 2-byte character sets")
64(define-category ?C "2-byte han
65Chinese (Han) characters of 2-byte character sets")
66(define-category ?G "2-byte Greek
67Greek characters of 2-byte character sets")
68(define-category ?H "2-byte Hiragana
69Japanese Hiragana characters of 2-byte character sets")
70(define-category ?K "2-byte Katakana
71Japanese Katakana characters of 2-byte character sets")
72(define-category ?N "2-byte Korean
73Korean Hangul characters of 2-byte character sets")
91c491e0 74(define-category ?Y "2-byte Cyrillic
46bf60bc 75Cyrillic characters of 2-byte character sets")
4ed46869
KH
76(define-category ?I "Indian Glyphs")
77
78;; For phonetic classifications.
79
80(define-category ?0 "consonant")
46bf60bc 81(define-category ?1 "base vowel
4eb97232 82Base (independent) vowel")
46bf60bc 83(define-category ?2 "upper diacritic
4eb97232 84Upper diacritical mark (including upper vowel)")
46bf60bc 85(define-category ?3 "lower diacritic
4eb97232 86Lower diacritical mark (including lower vowel)")
46bf60bc 87(define-category ?4 "combining tone
4eb97232 88Combining tone mark")
9765a2ba 89(define-category ?5 "symbol")
4ed46869 90(define-category ?6 "digit")
91c491e0 91(define-category ?7 "vowel diacritic
4eb97232 92Vowel-modifying diacritical mark")
6eba8645
KH
93(define-category ?8 "vowel-signs")
94(define-category ?9 "semivowel lower")
4ed46869
KH
95
96;; For filling.
46bf60bc
KH
97(define-category ?| "line breakable
98While filling, we can break a line at this character.")
4ed46869 99
504af7b2 100;; For indentation calculation.
70ea295a 101(define-category ?\s
46bf60bc
KH
102 "space for indent
103This character counts as a space for indentation purposes.")
504af7b2 104
94487c4e 105;; Keep the following for `kinsoku' processing. See comments in
4ed46869 106;; kinsoku.el.
46bf60bc
KH
107(define-category ?> "Not at bol
108A character which can't be placed at beginning of line.")
109(define-category ?< "Not at eol
110A character which can't be placed at end of line.")
4ed46869 111
8ea6fa80
KH
112;; Base and Combining
113(define-category ?. "Base
114Base characters (Unicode General Category L,N,P,S,Zs)")
46bf60bc 115(define-category ?^ "Combining
4eb97232 116Combining diacritic or mark (Unicode General Category M)")
f635daa1
CY
117
118;; bidi types
119(define-category ?R "Right-to-left (strong)
120Characters with \"strong\" right-to-left directionality, i.e.
121with R, AL, RLE, or RLO Unicode bidi character type.")
122
123(define-category ?L "Left-to-right (strong)
124Characters with \"strong\" left-to-right directionality, i.e.
125with L, LRE, or LRO Unicode bidi character type.")
126
4ed46869
KH
127\f
128;;; Setting syntax and category.
129
130;; ASCII
131
e2cc40b7
KH
132;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
;; Characters 32 through 127 belong to both the `a' (ASCII) and the
;; `l' (Latin) category.
(dolist (category '(?a ?l))
  (modify-category-entry '(32 . 127) category))
4ed46869 135
c94ae9eb
DL
136;; Deal with the CJK charsets first. Since the syntax of blocks is
137;; defined per charset, and the charsets may contain e.g. Latin
138;; characters, we end up with the wrong syntax definitions if we're
139;; not careful.
4ed46869 140
66bff5ed 141;; Chinese characters (Unicode)
a5bb49e1
KH
;; Chinese characters (Unicode).  Each entry is (RANGE CATEGORY...):
;; every listed category is added to every character in RANGE.
;; Category additions are cumulative, so overlapping ranges are fine.
(dolist (entry '(((#x2E80 . #x312F) ?|)
                 ((#x3190 . #x33FF) ?|)
                 ((#x3400 . #x4DBF) ?C)
                 ((#x4E00 . #x9FAF) ?C)
                 ((#x3400 . #x9FAF) ?c ?|)
                 ((#xF900 . #xFAFF) ?C ?c ?|)
                 ((#x20000 . #x2FFFF) ?| ?C ?c)))
  (dolist (category (cdr entry))
    (modify-category-entry (car entry) category)))
8e4cd685 154
4ed46869
KH
155
156;; Chinese character set (GB2312)
157
66bff5ed
KH
158(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E)
159(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E)
160(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E)
4ed46869 161
87a39edb 162(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
9ad4b491
KH
163(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339)
164(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A)
165(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A)
66bff5ed
KH
166(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?H #x2421 #x247E)
167(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?K #x2521 #x257E)
168(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?G #x2621 #x267E)
169(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?Y #x2721 #x277E)
170(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?C #x3021 #x7E7E)
4ed46869
KH
171
172;; Chinese character set (BIG5)
173
e7259832 174(map-charset-chars #'modify-category-entry 'big5 ?c)
66a85e76 175(map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA261)
9ad4b491 176(map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E)
66a85e76 177(map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DC)
4ed46869
KH
178
179;; Chinese character set (CNS11643)
180
87a39edb
DL
;; Every listed CNS11643 charset gets category `c' for all of its
;; characters.  Category `C' is limited to code points #x4421..#x7E7E
;; in chinese-cns11643-1 and covers the whole charset for the others.
(dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                 chinese-cns11643-7))
  (map-charset-chars #'modify-category-entry plane ?c)
  (cond ((eq plane 'chinese-cns11643-1)
         (map-charset-chars #'modify-category-entry plane ?C #x4421 #x7E7E))
        (t
         (map-charset-chars #'modify-category-entry plane ?C))))
4ed46869 188
8f924df7 189;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
4ed46869 190
66bff5ed 191(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
4ed46869 192
66bff5ed 193(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)
4ed46869 194
8f924df7 195(dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
761f6427
KH
196 japanese-jisx0213-1 japanese-jisx0213-2
197 cp932-2-byte))
8e4cd685 198 (map-charset-chars #'modify-category-entry l ?j))
4ed46869 199
c4186f9c
KH
200;; Fullwidth characters
;; Characters in #xff01..#xff60 are line-breakable.
(modify-category-entry '(#xff01 . #xff60) ?|)

;; Unicode equivalents of JISX0201-kana
(dolist (category '(?k ?j ?|))
  (modify-category-entry '(#xff61 . #xff9f) category))

;; Katakana block
(dolist (chars '((#x3099 . #x309C) (#x30A0 . #x30FF) (#x31F0 . #x31FF)))
  (modify-category-entry chars ?K))
(dolist (chars '((#x30A0 . #x30FA) #x30FF))
  (modify-category-entry chars ?|))

;; Hiragana block
(dolist (chars '((#x3040 . #x309F) #x30A0 #x30FC))
  (modify-category-entry chars ?H))
(dolist (chars '((#x3040 . #x3096) #x309F))
  (modify-category-entry chars ?|))
222
269a5dd0 223
4ed46869 224;; JISX0208
66bff5ed
KH
225(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
226(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
227(let ((chars '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇)))
69c2c6ea 228 (dolist (elt chars)
2b89bca4 229 (modify-syntax-entry elt "w")))
66bff5ed
KH
230
231(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E)
232(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E)
233(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?K #x2521 #x257E)
234(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E)
235(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E)
236(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E)
66a85e76 237(let ((chars '(?仝 ?々 ?〆 ?〇)))
4ed46869
KH
238 (while chars
239 (modify-category-entry (car chars) ?C)
240 (setq chars (cdr chars))))
241
242;; JISX0212
4ed46869 243
66bff5ed 244(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
4ed46869
KH
245
246;; JISX0201-Kana
87a39edb 247
abdaa411 248(let ((chars '(?。 ?、 ?・)))
4ed46869
KH
249 (while chars
250 (modify-syntax-entry (car chars) ".")
251 (setq chars (cdr chars))))
252
e6d10035
KH
;; The two corner brackets form a matched pair: the left one is an
;; open delimiter and the right one a close delimiter, each naming the
;; other as its match.  (The right bracket used to be given the open
;; class "(" as well, which made it unusable as a closing delimiter.)
;; NOTE(review): this sits under the JISX0201-Kana heading, so these
;; are presumably meant to be the halfwidth forms -- confirm that the
;; literals have not been width-folded.
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
226e4119 255
4ed46869
KH
256;; Korean character set (KSC5601)
257
87a39edb 258(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)
66bff5ed
KH
259
260(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E)
439f7264
DL
261(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E)
262(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E)
263(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E)
9ad4b491
KH
264(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339)
265(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A)
266(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A)
66bff5ed
KH
267(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E)
268(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E)
269(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E)
270(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E)
4ed46869 271
c94ae9eb 272;; These are in more than one charset.
8f924df7
KH
273(let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
274 "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
275 "()[]{}"))
276 open close)
277 (dotimes (i (/ (length parens) 2))
278 (setq open (aref parens (* i 2))
279 close (aref parens (1+ (* i 2))))
280 (modify-syntax-entry open (format "(%c" close))
281 (modify-syntax-entry close (format ")%c" open))))
d05cfa1f 282
c94ae9eb 283;; Arabic character set
6eba8645 284
c94ae9eb
DL
;; Give category `b' to every character of the listed Arabic charsets
;; and to the listed Unicode code point ranges.
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))
6eba8645 295
c94ae9eb
DL
296;; Cyrillic character set (ISO-8859-5)
297
298(modify-syntax-entry ?№ ".")
299
300;; Ethiopic character set
301
4c81b0f6
KH
302(modify-category-entry '(#x1200 . #x1399) ?e)
303(modify-category-entry '(#x2d80 . #x2dde) ?e)
55a3ed16 304(let ((chars '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨)))
c94ae9eb
DL
305 (while chars
306 (modify-syntax-entry (car chars) ".")
307 (setq chars (cdr chars))))
308(map-charset-chars #'modify-category-entry 'ethiopic ?e)
309
310;; Hebrew character set (ISO-8859-8)
311
312(modify-syntax-entry #x5be ".") ; MAQAF
313(modify-syntax-entry #x5c0 ".") ; PASEQ
314(modify-syntax-entry #x5c3 ".") ; SOF PASUQ
315(modify-syntax-entry #x5f3 ".") ; GERESH
316(modify-syntax-entry #x5f4 ".") ; GERSHAYIM
317
318;; Indian character set (IS 13194 and other Emacs original Indian charsets)
319
320(modify-category-entry '(#x901 . #x970) ?i)
321(map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
322(map-charset-chars #'modify-category-entry 'indian-2-column ?i)
d05cfa1f 323
6eba8645
KH
324;; Lao character set
325
abdaa411
DL
326(modify-category-entry '(#xe80 . #xeff) ?o)
327(map-charset-chars #'modify-category-entry 'lao ?o)
6eba8645 328
abdaa411 329(let ((deflist '(("ກ-ຮ" "w" ?0) ; consonant
e6d10035
KH
330 ("ະາຳຽເ-ໄ" "w" ?1) ; vowel base
331 ("ັິ-ືົໍ" "w" ?2) ; vowel upper
332 ("ຸູ" "w" ?3) ; vowel lower
8f924df7 333 ("່-໋" "w" ?4) ; tone mark
e6d10035
KH
334 ("ຼຽ" "w" ?9) ; semivowel lower
335 ("໐-໙" "w" ?6) ; digit
336 ("ຯໆ" "_" ?5) ; symbol
6eba8645
KH
337 ))
338 elm chars len syntax category to ch i)
339 (while deflist
340 (setq elm (car deflist))
341 (setq chars (car elm)
342 len (length chars)
343 syntax (nth 1 elm)
344 category (nth 2 elm)
345 i 0)
346 (while (< i len)
347 (if (= (aref chars i) ?-)
348 (setq i (1+ i)
4a027a0d
KH
349 to (aref chars i))
350 (setq ch (aref chars i)
6eba8645
KH
351 to ch))
352 (while (<= ch to)
269a5dd0
DL
353 (unless (string-equal syntax "w")
354 (modify-syntax-entry ch syntax))
6eba8645
KH
355 (modify-category-entry ch category)
356 (setq ch (1+ ch)))
4a027a0d 357 (setq i (1+ i)))
6eba8645
KH
358 (setq deflist (cdr deflist))))
359
4ed46869
KH
360;; Thai character set (TIS620)
361
abdaa411
DL
362(modify-category-entry '(#xe00 . #xe7f) ?t)
363(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
4ed46869
KH
364
365(let ((deflist '(;; chars syntax category
e6d10035
KH
366 ("ก-รลว-ฮ" "w" ?0) ; consonant
367 ("ฤฦะาำเ-ๅ" "w" ?1) ; vowel base
368 ("ัิ-ื็๎" "w" ?2) ; vowel upper
369 ("ุ-ฺ" "w" ?3) ; vowel lower
8f924df7 370 ("่-ํ" "w" ?4) ; tone mark
e6d10035
KH
371 ("๐-๙" "w" ?6) ; digit
372 ("ฯๆ฿๏๚๛" "_" ?5) ; symbol
4ed46869
KH
373 ))
374 elm chars len syntax category to ch i)
9395eb7c
KH
375 (while deflist
376 (setq elm (car deflist))
377 (setq chars (car elm)
378 len (length chars)
379 syntax (nth 1 elm)
380 category (nth 2 elm)
381 i 0)
382 (while (< i len)
383 (if (= (aref chars i) ?-)
384 (setq i (1+ i)
4a027a0d
KH
385 to (aref chars i))
386 (setq ch (aref chars i)
9395eb7c
KH
387 to ch))
388 (while (<= ch to)
269a5dd0
DL
389 (unless (string-equal syntax "w")
390 (modify-syntax-entry ch syntax))
9395eb7c
KH
391 (modify-category-entry ch category)
392 (setq ch (1+ ch)))
4a027a0d 393 (setq i (1+ i)))
9395eb7c
KH
394 (setq deflist (cdr deflist))))
395
396;; Tibetan character set
397
abdaa411
DL
398(modify-category-entry '(#xf00 . #xfff) ?q)
399(map-charset-chars #'modify-category-entry 'tibetan ?q)
400(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
9395eb7c
KH
401
402(let ((deflist '(;; chars syntax category
725d7c92 403 ("ཀ-ཀྵཪ" "w" ?0) ; consonant
55a3ed16 404 ("ྐ-ྐྵྺྻྼ" "w" ?0) ;
725d7c92
DL
405 ("ིེཻོཽྀ" "w" ?2) ; upper vowel
406 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier
53964682 407 ("྄ཱུ༙༵༷" "w" ?3) ; lower vowel/modifier
8f924df7 408 ("཰" "w" ?3) ; invisible vowel a
725d7c92
DL
409 ("༠-༩༪-༳" "w" ?6) ; digit
410 ("་།-༒༔ཿ" "." ?|) ; line-break char
411 ("་།༏༐༑༔ཿ" "." ?|) ;
412 ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition
413 ("་།༏༐༑༔ཿ" "." ?>) ;
414 ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition
415 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
9395eb7c
KH
416 ))
417 elm chars len syntax category to ch i)
4ed46869
KH
418 (while deflist
419 (setq elm (car deflist))
420 (setq chars (car elm)
421 len (length chars)
422 syntax (nth 1 elm)
423 category (nth 2 elm)
424 i 0)
425 (while (< i len)
426 (if (= (aref chars i) ?-)
427 (setq i (1+ i)
4a027a0d
KH
428 to (aref chars i))
429 (setq ch (aref chars i)
4ed46869
KH
430 to ch))
431 (while (<= ch to)
269a5dd0
DL
432 (unless (string-equal syntax "w")
433 (modify-syntax-entry ch syntax))
4ed46869
KH
434 (modify-category-entry ch category)
435 (setq ch (1+ ch)))
4a027a0d 436 (setq i (1+ i)))
4ed46869
KH
437 (setq deflist (cdr deflist))))
438
439;; Vietnamese character set
440
abdaa411
DL
441;; To make a word with Latin characters
442(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
443(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
444
445(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
446(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
4ed46869 447
e5dd1155
KH
;; For each code point 32..127, make the vietnamese-viscii-upper and
;; vietnamese-viscii-lower characters at that code point a case pair
;; in the standard case table, and add category `v' to whichever of
;; the two has a `ucs' code.
(let ((case-table (standard-case-table)))
  (dotimes (offset (- 128 32))
    (let* ((code (+ 32 offset))
           (upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower case-table)
      (when upper-ucs (modify-category-entry upper-ucs ?v))
      (when lower-ucs (modify-category-entry lower-ucs ?v)))))
460
d807d0c7
KH
461;; Tai Viet
462(let ((deflist '(;; chars syntax category
da6062e6 463 ((?ꪀ. ?ꪯ) "w" ?0) ; consonant
d807d0c7
KH
464 ("ꪱꪵꪶ" "w" ?1) ; vowel base
465 ((?ꪹ . ?ꪽ) "w" ?1) ; vowel base
466 ("ꪰꪲꪳꪷꪸꪾ" "w" ?2) ; vowel upper
467 ("ꪴ" "w" ?3) ; vowel lower
468 ("ꫀꫂ" "w" ?1) ; non-combining tone-mark
469 ("꪿꫁" "w" ?4) ; combining tone-mark
470 ((?ꫛ . ?꫟) "_" ?5) ; symbol
471 )))
472 (dolist (elm deflist)
473 (let ((chars (car elm))
474 (syntax (nth 1 elm))
475 (category (nth 2 elm)))
476 (if (consp chars)
477 (progn
478 (modify-syntax-entry chars syntax)
479 (modify-category-entry chars category))
480 (mapc #'(lambda (x)
481 (modify-syntax-entry x syntax)
482 (modify-category-entry x category))
483 chars)))))
c94ae9eb 484
f635daa1
CY
485;; Bidi categories
486
20372d0c
GM
487;; If bootstrapping without generated uni-*.el files, table not defined.
;; Translate the `bidi-class' Unicode property into categories:
;; classes R, AL, RLO and RLE get `R'; classes L, LRE and LRO get `L';
;; every other class is left alone.  Nothing happens when the
;; property table is unavailable.
(let ((bidi-table (unicode-property-table-internal 'bidi-class)))
  (if bidi-table
      (map-char-table
       (lambda (chars class)
         (let ((category (cond ((memq class '(R AL RLO RLE)) ?R)
                               ((memq class '(L LRE LRO)) ?L))))
           (if category
               (modify-category-entry chars category))))
       bidi-table)))
f635daa1 497
b7cf27ed
EZ
498;; Load uni-mirrored.el if available, so that it gets dumped into
499;; Emacs. This allows starting Emacs with force-load-messages in
500;; ~/.emacs, and avoid infinite recursion in bidi_initialize, which
501;; needs to load uni-mirrored.el in order to display the "Loading"
502;; messages.
503(unicode-property-table-internal 'mirroring)
bbab1c4f 504
c94ae9eb
DL
505;; Latin
506
507(modify-category-entry '(#x80 . #x024F) ?l)
d05cfa1f 508
85ef8ece
KH
509(let ((tbl (standard-case-table)) c)
510
4fb82d62
DL
511 ;; Latin-1
512
513 ;; Fixme: Some of the non-word syntaxes here perhaps should be
514 ;; reviewed. (Note that the following all implicitly have word
515 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of
516 ;; relating Unicode categories to Emacs syntax codes.
db92e81e
KH
517
518 ;; NBSP isn't semantically interchangeable with other whitespace chars,
e1dbe924 519 ;; so it's more like punctuation.
db92e81e 520 (set-case-syntax ?  "." tbl)
4fb82d62
DL
521 (set-case-syntax ?¡ "." tbl)
522 (set-case-syntax ?¦ "_" tbl)
523 (set-case-syntax ?§ "." tbl)
524 (set-case-syntax ?© "_" tbl)
db3b7db5
SM
525 ;; French wants
526 ;; (set-case-syntax-delims ?« ?» tbl)
527 ;; And German wants
528 ;; (set-case-syntax-delims ?» ?« tbl)
529 ;; So let's stay neutral and let users set these up if/when they want to.
530 (set-case-syntax ?« "." tbl)
531 (set-case-syntax ?» "." tbl)
4fb82d62
DL
532 (set-case-syntax ?¬ "_" tbl)
533 (set-case-syntax ?­ "_" tbl)
534 (set-case-syntax ?® "_" tbl)
535 (set-case-syntax ?° "_" tbl)
536 (set-case-syntax ?± "_" tbl)
537 (set-case-syntax ?µ "_" tbl)
538 (set-case-syntax ?· "_" tbl)
539 (set-case-syntax ?¼ "_" tbl)
540 (set-case-syntax ?½ "_" tbl)
541 (set-case-syntax ?¾ "_" tbl)
542 (set-case-syntax ?¿ "." tbl)
543 (let ((c 192))
544 (while (<= c 222)
545 (set-case-syntax-pair c (+ c 32) tbl)
546 (setq c (1+ c))))
547 (set-case-syntax ?× "_" tbl)
548 (set-case-syntax ?ß "w" tbl)
549 (set-case-syntax ?÷ "_" tbl)
550 ;; See below for ÿ.
85ef8ece 551
85ef8ece
KH
552 ;; Latin Extended-A, Latin Extended-B
553 (setq c #x0100)
e5e381c8
KH
554 (while (<= c #x02B8)
555 (modify-category-entry c ?l)
d05cfa1f 556 (setq c (1+ c)))
2bb915b8 557
e5e381c8
KH
558 (let ((pair-ranges '((#x0100 . #x012F)
559 (#x0132 . #x0137)
560 (#x0139 . #x0148)
561 (#x014a . #x0177)
562 (#x0179 . #x017E)
563 (#x0182 . #x0185)
796f8b2f
KH
564 (#x0187 . #x0188)
565 (#x018B . #x018C)
e5e381c8
KH
566 (#x0191 . #x0192)
567 (#x0198 . #x0199)
568 (#x01A0 . #x01A5)
569 (#x01A7 . #x01A8)
570 (#x01AC . #x01AD)
571 (#x01AF . #x01B0)
572 (#x01B3 . #x01B6)
d0203d61 573 (#x01B8 . #x01B9)
e5e381c8
KH
574 (#x01BC . #x01BD)
575 (#x01CD . #x01DC)
576 (#x01DE . #x01EF)
577 (#x01F4 . #x01F5)
578 (#x01F8 . #x021F)
579 (#x0222 . #x0233)
580 (#x023B . #x023C)
581 (#x0241 . #x0242)
582 (#x0246 . #x024F))))
583 (dolist (elt pair-ranges)
584 (let ((from (car elt)) (to (cdr elt)))
585 (while (< from to)
586 (set-case-syntax-pair from (1+ from) tbl)
587 (setq from (+ from 2))))))
2bb915b8 588
d0203d61 589 (set-case-syntax-pair ?Ÿ ?ÿ tbl)
796f8b2f 590
2bb915b8
KH
591 ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
592 ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
593 ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
594 ;; SMALL LETTER I.
595
596 ;; We used to set up half of those correspondences unconditionally,
597 ;; but that makes searches slow. So now we don't set up either half
598 ;; of these correspondences by default.
599
600 ;; (set-downcase-syntax ?İ ?i tbl)
601 ;; (set-upcase-syntax ?I ?ı tbl)
602
0d93216c
AS
603 (set-case-syntax-pair ?Ɓ ?ɓ tbl)
604 (set-case-syntax-pair ?Ɔ ?ɔ tbl)
d0203d61
AS
605 (set-case-syntax-pair ?Ɖ ?ɖ tbl)
606 (set-case-syntax-pair ?Ɗ ?ɗ tbl)
0d93216c
AS
607 (set-case-syntax-pair ?Ǝ ?ǝ tbl)
608 (set-case-syntax-pair ?Ə ?ə tbl)
609 (set-case-syntax-pair ?Ɛ ?ɛ tbl)
610 (set-case-syntax-pair ?Ɠ ?ɠ tbl)
611 (set-case-syntax-pair ?Ɣ ?ɣ tbl)
612 (set-case-syntax-pair ?Ɩ ?ɩ tbl)
613 (set-case-syntax-pair ?Ɨ ?ɨ tbl)
614 (set-case-syntax-pair ?Ɯ ?ɯ tbl)
615 (set-case-syntax-pair ?Ɲ ?ɲ tbl)
616 (set-case-syntax-pair ?Ɵ ?ɵ tbl)
617 (set-case-syntax-pair ?Ʀ ?ʀ tbl)
618 (set-case-syntax-pair ?Ʃ ?ʃ tbl)
619 (set-case-syntax-pair ?Ʈ ?ʈ tbl)
620 (set-case-syntax-pair ?Ʊ ?ʊ tbl)
621 (set-case-syntax-pair ?Ʋ ?ʋ tbl)
622 (set-case-syntax-pair ?Ʒ ?ʒ tbl)
e6d10035
KH
;; Latin digraphs with three case forms (uppercase, titlecase,
;; lowercase).  The uppercase and lowercase letters are registered as
;; a full case pair.  The titlecase letter is registered with
;; `set-downcase-syntax' only: it downcases to the lowercase letter,
;; but it must not replace the uppercase letter as the result of
;; upcasing the lowercase one (a second `set-case-syntax-pair' call
;; would do exactly that).  Code points are spelled out because the
;; digraph glyphs are easy to confuse with two-letter sequences.
  (set-case-syntax-pair #x01C4 #x01C6 tbl) ; DZ WITH CARON: upper, lower
  (set-downcase-syntax  #x01C5 #x01C6 tbl) ; Dz WITH CARON: title -> lower
  (set-case-syntax-pair #x01C7 #x01C9 tbl) ; LJ: upper, lower
  (set-downcase-syntax  #x01C8 #x01C9 tbl) ; Lj: title -> lower
  (set-case-syntax-pair #x01CA #x01CC tbl) ; NJ: upper, lower
  (set-downcase-syntax  #x01CB #x01CC tbl) ; Nj: title -> lower

  ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
  (set-case-syntax-pair #x01F1 #x01F3 tbl) ; DZ: upper, lower
  (set-downcase-syntax  #x01F2 #x01F3 tbl) ; Dz: title -> lower
e6d10035
KH
633 (set-case-syntax-pair ?Ƕ ?ƕ tbl)
634 (set-case-syntax-pair ?Ƿ ?ƿ tbl)
cb80bcd1
EZ
635 (set-case-syntax-pair ?Ⱥ ?ⱥ tbl)
636 (set-case-syntax-pair ?Ƚ ?ƚ tbl)
637 (set-case-syntax-pair ?Ⱦ ?ⱦ tbl)
638 (set-case-syntax-pair ?Ƀ ?ƀ tbl)
639 (set-case-syntax-pair ?Ʉ ?ʉ tbl)
640 (set-case-syntax-pair ?Ʌ ?ʌ tbl)
269a5dd0 641
85ef8ece 642 ;; Latin Extended Additional
abdaa411 643 (modify-category-entry '(#x1e00 . #x1ef9) ?l)
85ef8ece 644 (setq c #x1e00)
d05cfa1f 645 (while (<= c #x1ef9)
d05cfa1f
KH
646 (and (zerop (% c 2))
647 (or (<= c #x1e94) (>= c #x1ea0))
abdaa411 648 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f
KH
649 (setq c (1+ c)))
650
85ef8ece 651 ;; Greek
abdaa411 652 (modify-category-entry '(#x0370 . #x03ff) ?g)
85ef8ece 653 (setq c #x0370)
d05cfa1f 654 (while (<= c #x03ff)
d05cfa1f
KH
655 (if (or (and (>= c #x0391) (<= c #x03a1))
656 (and (>= c #x03a3) (<= c #x03ab)))
abdaa411 657 (set-case-syntax-pair c (+ c 32) tbl))
d05cfa1f
KH
658 (and (>= c #x03da)
659 (<= c #x03ee)
660 (zerop (% c 2))
abdaa411 661 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f 662 (setq c (1+ c)))
e6d10035
KH
663 (set-case-syntax-pair ?Ά ?ά tbl)
664 (set-case-syntax-pair ?Έ ?έ tbl)
665 (set-case-syntax-pair ?Ή ?ή tbl)
666 (set-case-syntax-pair ?Ί ?ί tbl)
667 (set-case-syntax-pair ?Ό ?ό tbl)
668 (set-case-syntax-pair ?Ύ ?ύ tbl)
669 (set-case-syntax-pair ?Ώ ?ώ tbl)
d05cfa1f 670
269a5dd0
DL
671 ;; Armenian
672 (setq c #x531)
673 (while (<= c #x556)
abdaa411 674 (set-case-syntax-pair c (+ c #x30) tbl)
269a5dd0
DL
675 (setq c (1+ c)))
676
85ef8ece 677 ;; Greek Extended
abdaa411 678 (modify-category-entry '(#x1f00 . #x1fff) ?g)
;; Pair each code point C in #x1f00..#x1fa7 whose low nibble is 0-7
;; with C+8 as its capital.  Not paired:
;;  - the #x1f70 row (excluded by the test on the high nibble);
;;  - #x1f16, #x1f17, #x1f46, #x1f47, which are unassigned, as are
;;    the code points eight above them;
;;  - #x1f50, #x1f52, #x1f54, #x1f56, whose would-be capitals
;;    #x1f58, #x1f5a, #x1f5c, #x1f5e are unassigned.
;; #x1f57 is deliberately not in the list: its capital is #x1f5f.
  (setq c #x1f00)
  (while (<= c #x1fff)
    (and (<= (logand c #x000f) 7)
         (<= c #x1fa7)
         (not (memq c '(#x1f16 #x1f17 #x1f46 #x1f47
                        #x1f50 #x1f52 #x1f54 #x1f56)))
         (/= (logand c #x00f0) #x70)
         (set-case-syntax-pair (+ c 8) c tbl))
    (setq c (1+ c)))
e6d10035
KH
688 (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
689 (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
690 (set-case-syntax-pair ?Ὰ ?ὰ tbl)
691 (set-case-syntax-pair ?Ά ?ά tbl)
692 (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
693 (set-case-syntax-pair ?Ὲ ?ὲ tbl)
694 (set-case-syntax-pair ?Έ ?έ tbl)
695 (set-case-syntax-pair ?Ὴ ?ὴ tbl)
696 (set-case-syntax-pair ?Ή ?ή tbl)
697 (set-case-syntax-pair ?ῌ ?ῃ tbl)
698 (set-case-syntax-pair ?Ῐ ?ῐ tbl)
699 (set-case-syntax-pair ?Ῑ ?ῑ tbl)
700 (set-case-syntax-pair ?Ὶ ?ὶ tbl)
701 (set-case-syntax-pair ?Ί ?ί tbl)
702 (set-case-syntax-pair ?Ῠ ?ῠ tbl)
703 (set-case-syntax-pair ?Ῡ ?ῡ tbl)
704 (set-case-syntax-pair ?Ὺ ?ὺ tbl)
705 (set-case-syntax-pair ?Ύ ?ύ tbl)
706 (set-case-syntax-pair ?Ῥ ?ῥ tbl)
707 (set-case-syntax-pair ?Ὸ ?ὸ tbl)
708 (set-case-syntax-pair ?Ό ?ό tbl)
709 (set-case-syntax-pair ?Ὼ ?ὼ tbl)
710 (set-case-syntax-pair ?Ώ ?ώ tbl)
711 (set-case-syntax-pair ?ῼ ?ῳ tbl)
d05cfa1f 712
85ef8ece 713 ;; cyrillic
abdaa411 714 (modify-category-entry '(#x0400 . #x04FF) ?y)
85ef8ece 715 (setq c #x0400)
d05cfa1f 716 (while (<= c #x04ff)
d05cfa1f
KH
717 (and (>= c #x0400)
718 (<= c #x040f)
abdaa411 719 (set-case-syntax-pair c (+ c 80) tbl))
d05cfa1f
KH
720 (and (>= c #x0410)
721 (<= c #x042f)
abdaa411 722 (set-case-syntax-pair c (+ c 32) tbl))
d05cfa1f
KH
723 (and (zerop (% c 2))
724 (or (and (>= c #x0460) (<= c #x0480))
725 (and (>= c #x048c) (<= c #x04be))
726 (and (>= c #x04d0) (<= c #x04f4)))
8f924df7 727 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f 728 (setq c (1+ c)))
e6d10035
KH
729 (set-case-syntax-pair ?Ӂ ?ӂ tbl)
730 (set-case-syntax-pair ?Ӄ ?ӄ tbl)
731 (set-case-syntax-pair ?Ӈ ?ӈ tbl)
732 (set-case-syntax-pair ?Ӌ ?ӌ tbl)
733 (set-case-syntax-pair ?Ӹ ?ӹ tbl)
d05cfa1f 734
85ef8ece
KH
735 ;; general punctuation
;; Walk `c' upward through consecutive runs of code points.  Each run
;; extends up to and including its listed end point and receives the
;; listed syntax; the next run starts where the previous one stopped.
  (setq c #x2000)
  (dolist (run '((#x200b . " ")
                 (#x200F . ".")))
    (while (<= c (car run))
      (set-case-syntax c (cdr run) tbl)
      (setq c (1+ c))))
  ;; Fixme: These aren't all right:
  (setq c #x2010)
  (dolist (run '((#x2016 . "_")
                 ;; Punctuation syntax for quotation marks (like `)
                 (#x201f . ".")
                 ;; Fixme: These aren't all right:
                 (#x2027 . "_")
                 (#x206F . ".")))
    (while (<= c (car run))
      (set-case-syntax c (cdr run) tbl)
      (setq c (1+ c))))
d05cfa1f 759
269a5dd0
DL
760 ;; Roman numerals
761 (setq c #x2160)
762 (while (<= c #x216f)
abdaa411 763 (set-case-syntax-pair c (+ c #x10) tbl)
269a5dd0
DL
764 (setq c (1+ c)))
765
4fb82d62
DL
766 ;; Fixme: The following blocks might be better as symbol rather than
767 ;; punctuation.
b427c97e
DL
768 ;; Arrows
;; Starting at #x2190, give each successive run of code points (up to
;; and including the listed end point) the listed syntax.
  (setq c #x2190)
  (dolist (run '((#x21FF . ".")         ; Arrows
                 (#x22FF . ".")         ; Mathematical Operators
                 (#x23FF . ".")         ; Miscellaneous Technical
                 (#x243F . "_")))       ; Control Pictures
    (while (<= c (car run))
      (set-case-syntax c (cdr run) tbl)
      (setq c (1+ c))))
785
786 ;; Circled Latin
787 (setq c #x24b6)
788 (while (<= c #x24cf)
abdaa411
DL
789 (set-case-syntax-pair c (+ c 26) tbl)
790 (modify-category-entry c ?l)
791 (modify-category-entry (+ c 26) ?l)
269a5dd0
DL
792 (setq c (1+ c)))
793
ac387dd1
EZ
794 ;; Coptic
;; Within each range, every second code point counting from the start
;; of the range is a capital, and the code point right after it is
;; its small letter.
  (dolist (range '((#x2C80 . #x2CE2)
                   (#x2CEB . #x2CF2)))
    (dolist (capital (number-sequence (car range) (1- (cdr range)) 2))
      (set-case-syntax-pair capital (1+ capital) tbl)))
  ;; There's no Coptic category.  However, Coptic letters that are
  ;; part of the Greek block above get the Greek category, and those
  ;; in this block are derived from Greek letters, so let's be
  ;; consistent about their category.
  (modify-category-entry '(#x2C80 . #x2CFF) ?g)
ac387dd1 807
269a5dd0
DL
808 ;; Fullwidth Latin
809 (setq c #xff21)
810 (while (<= c #xff3a)
abdaa411
DL
811 (set-case-syntax-pair c (+ c #x20) tbl)
812 (modify-category-entry c ?l)
813 (modify-category-entry (+ c #x20) ?l)
269a5dd0
DL
814 (setq c (1+ c)))
815
269a5dd0 816 ;; Combining diacritics
abdaa411 817 (modify-category-entry '(#x300 . #x362) ?^)
269a5dd0 818 ;; Combining marks
0ca754d0 819 (modify-category-entry '(#x20d0 . #x20ff) ?^)
269a5dd0
DL
820
821 ;; Fixme: syntax for symbols &c
822 )
6b61353c
KH
823
;; Give paired bracket characters open/close parenthesis syntax, each
;; one matching its partner.
;;
;; Every element is (OPEN . CLOSE).  The code points are spelled out
;; numerically rather than as literal characters so that the table
;; cannot be silently altered by text normalization or re-encoding of
;; this file: several of these characters (U+2329/U+232A and the
;; fullwidth/halfwidth forms in U+FFxx) have compatibility mappings to
;; other brackets in this same list, or to ASCII.
(let ((pairs
       '((#x2045 . #x2046)
         (#x207D . #x207E)
         (#x208D . #x208E)
         (#x2329 . #x232A)
         (#x23B4 . #x23B5)
         (#x2768 . #x2769)
         (#x276A . #x276B)
         (#x276C . #x276D)
         (#x2770 . #x2771)
         (#x2772 . #x2773)
         (#x2774 . #x2775)
         (#x27E6 . #x27E7)
         (#x27E8 . #x27E9)
         (#x27EA . #x27EB)
         (#x2983 . #x2984)
         (#x2985 . #x2986)
         (#x2987 . #x2988)
         (#x2989 . #x298A)
         (#x298B . #x298C)
         (#x298D . #x298E)
         (#x298F . #x2990)
         (#x2991 . #x2992)
         (#x2993 . #x2994)
         (#x2995 . #x2996)
         (#x2997 . #x2998)
         (#x29FC . #x29FD)
         (#x3008 . #x3009)
         (#x300A . #x300B)
         (#x300C . #x300D)
         (#x300E . #x300F)
         (#x3010 . #x3011)
         (#x3014 . #x3015)
         (#x3016 . #x3017)
         (#x3018 . #x3019)
         (#x301A . #x301B)
         (#xFD3E . #xFD3F)
         (#xFE35 . #xFE36)
         (#xFE37 . #xFE38)
         (#xFE39 . #xFE3A)
         (#xFE3B . #xFE3C)
         (#xFE3D . #xFE3E)
         (#xFE3F . #xFE40)
         (#xFE41 . #xFE42)
         (#xFE43 . #xFE44)
         (#xFE59 . #xFE5A)
         (#xFE5B . #xFE5C)
         (#xFE5D . #xFE5E)
         (#xFF08 . #xFF09)
         (#xFF3B . #xFF3D)
         (#xFF5B . #xFF5D)
         (#xFF5F . #xFF60)
         (#xFF62 . #xFF63))))
  (dolist (elt pairs)
    ;; "(" syntax for the opener with the closer as its match, and
    ;; ")" syntax for the closer with the opener as its match.
    (modify-syntax-entry (car elt) (string ?\( (cdr elt)))
    (modify-syntax-entry (cdr elt) (string ?\) (car elt)))))
881
4ed46869 882\f
777cfce6 883;; For each character set, put the information of the most proper
aaa9f206 884;; coding system to encode it by `preferred-coding-system' property.
777cfce6 885
abdaa411 886;; Fixme: should this be junked?
777cfce6
KH
;; Record, on each listed charset, the coding system that should be
;; preferred for encoding it, as the `preferred-coding-system'
;; charset property.  Each element below is (CHARSET . CODING-SYSTEM).
(dolist (assoc-pair
         '((latin-iso8859-1 . iso-latin-1)
           (latin-iso8859-2 . iso-latin-2)
           (latin-iso8859-3 . iso-latin-3)
           (latin-iso8859-4 . iso-latin-4)
           (thai-tis620 . thai-tis620)
           (greek-iso8859-7 . greek-iso-8bit)
           (arabic-iso8859-6 . iso-2022-7bit)
           (hebrew-iso8859-8 . hebrew-iso-8bit)
           (katakana-jisx0201 . japanese-shift-jis)
           (latin-jisx0201 . japanese-shift-jis)
           (cyrillic-iso8859-5 . cyrillic-iso-8bit)
           (latin-iso8859-9 . iso-latin-5)
           (japanese-jisx0208-1978 . iso-2022-jp)
           (chinese-gb2312 . chinese-iso-8bit)
           (chinese-gbk . chinese-gbk)
           (gb18030-2-byte . chinese-gb18030)
           (gb18030-4-byte-bmp . chinese-gb18030)
           (gb18030-4-byte-smp . chinese-gb18030)
           (gb18030-4-byte-ext-1 . chinese-gb18030)
           (gb18030-4-byte-ext-2 . chinese-gb18030)
           (japanese-jisx0208 . iso-2022-jp)
           (korean-ksc5601 . iso-2022-kr)
           (japanese-jisx0212 . iso-2022-jp)
           (chinese-big5-1 . chinese-big5)
           (chinese-big5-2 . chinese-big5)
           (chinese-sisheng . iso-2022-7bit)
           (ipa . iso-2022-7bit)
           (vietnamese-viscii-lower . vietnamese-viscii)
           (vietnamese-viscii-upper . vietnamese-viscii)
           (arabic-digit . iso-2022-7bit)
           (arabic-1-column . iso-2022-7bit)
           (lao . lao)
           (arabic-2-column . iso-2022-7bit)
           (indian-is13194 . devanagari)
           (indian-glyph . devanagari)
           (tibetan-1-column . tibetan)
           (ethiopic . iso-2022-7bit)
           (chinese-cns11643-1 . iso-2022-cn)
           (chinese-cns11643-2 . iso-2022-cn)
           (chinese-cns11643-3 . iso-2022-cn)
           (chinese-cns11643-4 . iso-2022-cn)
           (chinese-cns11643-5 . iso-2022-cn)
           (chinese-cns11643-6 . iso-2022-cn)
           (chinese-cns11643-7 . iso-2022-cn)
           (indian-2-column . devanagari)
           (tibetan . tibetan)
           (latin-iso8859-14 . iso-latin-8)
           (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car assoc-pair)
                        'preferred-coding-system
                        (cdr assoc-pair)))
df0415c5
KH
938
939\f
98a663f1 940;; Setup auto-fill-chars for charsets that should invoke auto-filling.
7760ba82 941;; SPACE and NEWLINE are already set.
df21429c
KH
942
;; Mark these code-point ranges in `auto-fill-chars' with t.
(dolist (range '((#x3041 . #x30FF)
                 (#x3400 . #x4DB5)
                 (#x4e00 . #x9fbb)
                 (#xF900 . #xFAFF)
                 (#xFF00 . #xFF9F)
                 (#x20000 . #x2FFFF)))
  (set-char-table-range auto-fill-chars range t))
949
55bd52ea 950\f
7760ba82
KH
951;;; Setting char-width-table. The default is 1.
952
953;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
954;; and final characters.
;; Every range listed here gets display width 0 in `char-width-table'.
;; Elements are (FROM . TO), inclusive, in ascending order.
(mapc
 (lambda (range)
   (set-char-table-range char-width-table range 0))
 '((#x0300 . #x036F) (#x0483 . #x0489) (#x0591 . #x05BD) (#x05BF . #x05BF)
   (#x05C1 . #x05C2) (#x05C4 . #x05C5) (#x05C7 . #x05C7) (#x0600 . #x0603)
   (#x0610 . #x0615) (#x064B . #x065E) (#x0670 . #x0670) (#x06D6 . #x06E4)
   (#x06E7 . #x06E8) (#x06EA . #x06ED) (#x070F . #x070F) (#x0711 . #x0711)
   (#x0730 . #x074A) (#x07A6 . #x07B0) (#x07EB . #x07F3) (#x0901 . #x0902)
   (#x093C . #x093C) (#x0941 . #x0948) (#x094D . #x094D) (#x0951 . #x0954)
   (#x0962 . #x0963) (#x0981 . #x0981) (#x09BC . #x09BC) (#x09C1 . #x09C4)
   (#x09CD . #x09CD) (#x09E2 . #x09E3) (#x0A01 . #x0A02) (#x0A3C . #x0A3C)
   (#x0A41 . #x0A4D) (#x0A70 . #x0A71) (#x0A81 . #x0A82) (#x0ABC . #x0ABC)
   (#x0AC1 . #x0AC8) (#x0ACD . #x0ACD) (#x0AE2 . #x0AE3) (#x0B01 . #x0B01)
   (#x0B3C . #x0B3C) (#x0B3F . #x0B3F) (#x0B41 . #x0B43) (#x0B4D . #x0B56)
   (#x0B82 . #x0B82) (#x0BC0 . #x0BC0) (#x0BCD . #x0BCD) (#x0C3E . #x0C40)
   (#x0C46 . #x0C56) (#x0CBC . #x0CBC) (#x0CBF . #x0CBF) (#x0CC6 . #x0CC6)
   (#x0CCC . #x0CCD) (#x0CE2 . #x0CE3) (#x0D41 . #x0D43) (#x0D4D . #x0D4D)
   (#x0DCA . #x0DCA) (#x0DD2 . #x0DD6) (#x0E31 . #x0E31) (#x0E34 . #x0E3A)
   (#x0E47 . #x0E4E) (#x0EB1 . #x0EB1) (#x0EB4 . #x0EBC) (#x0EC8 . #x0ECD)
   (#x0F18 . #x0F19) (#x0F35 . #x0F35) (#x0F37 . #x0F37) (#x0F39 . #x0F39)
   (#x0F71 . #x0F7E) (#x0F80 . #x0F84) (#x0F86 . #x0F87) (#x0F90 . #x0FBC)
   (#x0FC6 . #x0FC6) (#x102D . #x1030) (#x1032 . #x1037) (#x1039 . #x1039)
   (#x1058 . #x1059) (#x1160 . #x11FF) (#x135F . #x135F) (#x1712 . #x1714)
   (#x1732 . #x1734) (#x1752 . #x1753) (#x1772 . #x1773) (#x17B4 . #x17B5)
   (#x17B7 . #x17BD) (#x17C6 . #x17C6) (#x17C9 . #x17D3) (#x17DD . #x17DD)
   (#x180B . #x180D) (#x18A9 . #x18A9) (#x1920 . #x1922) (#x1927 . #x1928)
   (#x1932 . #x1932) (#x1939 . #x193B) (#x1A17 . #x1A18) (#x1B00 . #x1B03)
   (#x1B34 . #x1B34) (#x1B36 . #x1B3A) (#x1B3C . #x1B3C) (#x1B42 . #x1B42)
   (#x1B6B . #x1B73) (#x1DC0 . #x1DFF) (#x200B . #x200F) (#x202A . #x202E)
   (#x2060 . #x206F) (#x20D0 . #x20EF) (#x302A . #x302F) (#x3099 . #x309A)
   (#xA806 . #xA806) (#xA80B . #xA80B) (#xA825 . #xA826) (#xFB1E . #xFB1E)
   (#xFE00 . #xFE0F) (#xFE20 . #xFE23) (#xFEFF . #xFEFF) (#xFFF9 . #xFFFB)
   (#x10A01 . #x10A0F) (#x10A38 . #x10A3F) (#x1D167 . #x1D169)
   (#x1D173 . #x1D182) (#x1D185 . #x1D18B) (#x1D1AA . #x1D1AD)
   (#x1D242 . #x1D244) (#xE0001 . #xE01EF)))
1081
1082;; 2: East Asian Wide and Full-width characters.
;; Every range listed here gets display width 2 in `char-width-table'.
(mapc
 (lambda (range)
   (set-char-table-range char-width-table range 2))
 '((#x1100 . #x115F)
   (#x2329 . #x232A)
   (#x2E80 . #x303E)
   (#x3040 . #xA4CF)
   (#xAC00 . #xD7A3)
   (#xF900 . #xFAFF)
   (#xFE30 . #xFE6F)
   (#xFF01 . #xFF60)
   (#xFFE0 . #xFFE6)
   (#x20000 . #x2FFFF)
   (#x30000 . #x3FFFF)))
173f18ce
DL
1096
1097;; Other double width
7760ba82
KH
;; Give width 2 to every character of the charsets listed below.
;; The entries for `ethiopic' and `tibetan' are disabled (commented
;; out), so those charsets are not touched here.
(dolist (charset '(;; ethiopic
                   ;; tibetan
                   indian-2-column
                   arabic-2-column))
  (map-charset-chars
   (lambda (range _ignore)
     (set-char-table-range char-width-table range 2))
   charset))
777cfce6 1110
dbff07a2
KH
(defvar cjk-char-width-table-list
  '((ja_JP nil
           (japanese-jisx0208 (#x2121 . #x287E))
           (cp932-2-byte (#x8140 . #x879F)))
    (zh_CN nil
           (chinese-gb2312 (#x2121 . #x297E)))
    (zh_HK nil
           (big5-hkscs (#xA140 . #xA3FE) (#xC6A0 . #xC8FE)))
    (zh_TW nil
           (big5 (#xA140 . #xA3FE))
           (chinese-cns11643-1 (#x2121 . #x427E)))
    (ko_KR nil
           (korean-ksc5601 (#x2121 . #x2C7E))))
  "Internal use only.
Alist of locale symbol vs charsets.  In a language environment
corresponding to the locale, width of characters in the charsets is
set to 2.  Each element has the form:
  (LOCALE TABLE (CHARSET (FROM-CODE . TO-CODE) ...) ...)
LOCALE: locale symbol
TABLE: char-table used for `char-width-table', initially nil.
CHARSET: character set
FROM-CODE, TO-CODE: range of code-points in CHARSET")
1129
(defun use-cjk-char-width-table (locale-name)
  "Internal use only.
Setup `char-width-table' appropriate for a language environment
corresponding to LOCALE-NAME (symbol).

LOCALE-NAME must have an entry in `cjk-char-width-table-list';
otherwise an error is signaled.  The char-table for a locale is
built the first time it is requested and then stored in that
entry, so later calls reuse it."
  ;; Climb back to the parentless table before installing anything.
  (while (char-table-parent char-width-table)
    (setq char-width-table (char-table-parent char-width-table)))
  (let ((entry (assq locale-name cjk-char-width-table-list)))
    (unless entry
      (error "Unknown locale for CJK language environment: %s"
             locale-name))
    (when (null (nth 1 entry))
      ;; No cached table yet: build one that assigns width 2 to the
      ;; listed code ranges and inherits everything else from the
      ;; table reached above.
      (let ((widths (make-char-table nil)))
        (dolist (charset-info (nthcdr 2 entry))
          (dolist (code-range (cdr charset-info))
            (map-charset-chars #'(lambda (range _arg)
                                   (set-char-table-range widths range 2))
                               (car charset-info) nil
                               (car code-range) (cdr code-range))))
        (optimize-char-table widths)
        (set-char-table-parent widths char-width-table)
        (setcar (cdr entry) widths)))
    (setq char-width-table (nth 1 entry))))
55a3ed16
KH
1153
(defun use-default-char-width-table ()
  "Internal use only.
Setup char-width-table appropriate for non-CJK language environment."
  ;; Replace `char-width-table' by its parent until a table without a
  ;; parent is reached.
  (let ((parent nil))
    (while (setq parent (char-table-parent char-width-table))
      (setq char-width-table parent))))
55a3ed16 1159
;; Optimize the standard case table, then the standard syntax table.
(mapc #'optimize-char-table
      (list (standard-case-table)
            (standard-syntax-table)))
1162
55a3ed16
KH
1163\f
1164;; Setting char-script-table.
1165
57939ff4
EZ
1166;; The data is compiled from Blocks.txt and Scripts.txt in the
1167;; "Unicode Character Database", simplified to lump together all the
1168;; blocks belonging to the same language. E.g., "Basic Latin",
1169;; "Latin-1 Supplement", "Latin Extended-A", etc. are all lumped
1170;; together under "latin".
1171;;
b427c97e
DL
1172;; The Unicode blocks actually extend past some of these ranges with
1173;; undefined codepoints.
9ce5de1c
KH
;; Fill `char-script-table' from the (FROM TO SCRIPT) triples below,
;; and store the list of distinct script symbols, in order of first
;; appearance, in extra slot 0 of the table.
(let ((script-list nil))
  (dolist
      (elt
       '((#x0000 #x007F latin)
         (#x00A0 #x024F latin)
         (#x0250 #x02AF phonetic)       ; IPA Extensions
         (#x02B0 #x036F latin)          ; Spacing Modifiers and Diacriticals
         (#x0370 #x03E1 greek)
         (#x03E2 #x03EF coptic)
         (#x03F0 #x03F3 greek)
         (#x0400 #x052F cyrillic)
         (#x0530 #x058F armenian)
         (#x0590 #x05FF hebrew)
         (#x0600 #x06FF arabic)
         (#x0700 #x074F syriac)
         (#x0750 #x077F arabic)         ; Arabic Supplement
         (#x0780 #x07BF thaana)
         (#x07C0 #x07FF nko)
         (#x0800 #x083F samaritan)
         (#x0840 #x085F mandaic)
         (#x08A0 #x08FF arabic)         ; Arabic Extended-A
         (#x0900 #x097F devanagari)
         (#x0980 #x09FF bengali)
         (#x0A00 #x0A7F gurmukhi)
         (#x0A80 #x0AFF gujarati)
         (#x0B00 #x0B7F oriya)
         (#x0B80 #x0BFF tamil)
         (#x0C00 #x0C7F telugu)
         (#x0C80 #x0CFF kannada)
         (#x0D00 #x0D7F malayalam)
         (#x0D80 #x0DFF sinhala)
         (#x0E00 #x0E7F thai)
         (#x0E80 #x0EFF lao)
         (#x0F00 #x0FFF tibetan)
         (#x1000 #x109F burmese)        ; Myanmar
         (#x10A0 #x10FF georgian)
         (#x1100 #x11FF hangul)
         (#x1200 #x139F ethiopic)       ; Ethiopic and Ethiopic Supplement
         (#x13A0 #x13FF cherokee)
         (#x1400 #x167F canadian-aboriginal)
         (#x1680 #x169F ogham)
         (#x16A0 #x16FF runic)
         (#x1700 #x171F tagalog)
         (#x1720 #x173F hanunoo)
         (#x1740 #x175F buhid)
         (#x1760 #x177F tagbanwa)
         (#x1780 #x17FF khmer)
         (#x1800 #x18AF mongolian)
         (#x18B0 #x18FF canadian-aboriginal) ; Canadian Aboriginal Syllabics Extended
         (#x1900 #x194F limbu)
         (#x1950 #x197F tai-le)
         (#x1980 #x19DF tai-lue)        ; New Tai Lue
         (#x19E0 #x19FF khmer)          ; Khmer Symbols
         ;; The Buginese block is U+1A00..U+1A1F; the end of this
         ;; range used to repeat the start, covering one character.
         (#x1A00 #x1A1F buginese)
         (#x1A20 #x1AAF tai-tham)
         (#x1AB0 #x1AFF latin)          ; Combining Diacritical Marks Extended
         (#x1B00 #x1B7F balinese)
         (#x1B80 #x1BBF sundanese)
         (#x1BC0 #x1BFF batak)
         (#x1C00 #x1C4F lepcha)
         (#x1C50 #x1C7F ol-chiki)
         (#x1CC0 #x1CCF sundanese)
         (#x1CD0 #x1CFF vedic)
         (#x1D00 #x1DBF phonetic)       ; Phonetic Extensions & Supplement
         (#x1DC0 #x1EFF latin)          ; Latin Extended Additional
         (#x1F00 #x1FFF greek)          ; Greek Extended
         (#x2000 #x27FF symbol)
         (#x2800 #x28FF braille)
         (#x2900 #x2BFF symbol)
         (#x2C00 #x2C5F glagolitic)
         (#x2C60 #x2C7F latin)          ; Latin Extended-C
         (#x2C80 #x2CFF coptic)
         (#x2D00 #x2D2F georgian)       ; Georgian Supplement
         (#x2D30 #x2D7F tifinagh)
         (#x2D80 #x2DDF ethiopic)       ; Ethiopic Extended
         (#x2DE0 #x2DFF cyrillic)       ; Cyrillic Extended-A
         (#x2E00 #x2E7F symbol)
         (#x2E80 #x2FDF han)
         (#x2FF0 #x2FFF ideographic-description)
         (#x3000 #x303F cjk-misc)
         (#x3040 #x30FF kana)           ; Hiragana and Katakana
         (#x3100 #x312F bopomofo)
         (#x3130 #x318F hangul)         ; Hangul Compatibility Jamo
         (#x3190 #x319F kanbun)
         (#x31A0 #x31BF bopomofo)       ; Bopomofo Extended
         (#x31C0 #x31EF cjk-misc)       ; CJK Strokes
         (#x31F0 #x31FF kana)           ; Katakana Phonetic Extensions
         (#x3200 #x9FAF han)
         (#xA000 #xA4CF yi)
         (#xA4D0 #xA4FF lisu)
         (#xA500 #xA63F vai)
         (#xA640 #xA69F cyrillic)       ; Cyrillic Extended-B
         (#xA6A0 #xA6FF bamum)
         (#xA700 #xA7FF latin)
         (#xA800 #xA82F syloti-nagri)
         (#xA830 #xA83F north-indic-number)
         (#xA840 #xA87F phags-pa)
         (#xA880 #xA8DF saurashtra)
         (#xA8E0 #xA8FF devanagari)     ; Devanagari Extended
         (#xA900 #xA92F kayah-li)
         (#xA930 #xA95F rejang)
         (#xA960 #xA97F hangul)         ; Hangul Jamo Extended
         (#xA980 #xA9DF javanese)
         (#xA9E0 #xA9FF burmese)        ; Myanmar Extended-B
         (#xAA00 #xAA5F cham)
         (#xAA60 #xAA7F burmese)        ; Myanmar Extended-A
         (#xAA80 #xAADF tai-viet)
         (#xAAE0 #xAAFF meetei-mayek)   ; Meetei Mayek Extensions
         (#xAB00 #xAB2F ethiopic)       ; Ethiopic Extended-A
         (#xAB30 #xAB6F latin)          ; Latin Extended-E
         (#xABC0 #xABFF meetei-mayek)
         (#xAC00 #xD7FF hangul)
         (#xF900 #xFAFF han)
         (#xFB00 #xFB06 latin)          ; Latin ligatures
         (#xFB13 #xFB17 armenian)       ; Armenian ligatures
         (#xFB1D #xFB4F hebrew)         ; Alphabetic Presentation Forms
         (#xFB50 #xFDFF arabic)         ; Arabic Presentation Forms-A
         (#xFE20 #xFE2F latin)          ; Combining Half Marks
         (#xFE30 #xFE4F han)
         (#xFE70 #xFEFF arabic)         ; Arabic Presentation Forms-B
         (#xFF00 #xFF5F cjk-misc)
         (#xFF61 #xFF9F kana)
         (#xFFE0 #xFFE6 cjk-misc)
         (#x10000 #x100FF linear-b)
         (#x10100 #x1013F aegean-number)
         (#x10140 #x1018F ancient-greek-number)
         (#x10190 #x101CF ancient-symbol)
         (#x101D0 #x101FF phaistos-disc)
         (#x10280 #x1029F lycian)
         (#x102A0 #x102DF carian)
         (#x102E0 #x102FF coptic)       ; Coptic Epact Numbers
         ;; This script symbol used to be misspelled `olt-italic'.
         (#x10300 #x1032F old-italic)
         (#x10330 #x1034F gothic)
         (#x10350 #x1037F old-permic)
         (#x10380 #x1039F ugaritic)
         (#x103A0 #x103DF old-persian)
         (#x10400 #x1044F deseret)
         (#x10450 #x1047F shavian)
         (#x10480 #x104AF osmanya)
         (#x10500 #x1052F elbasan)
         (#x10530 #x1056F caucasian-albanian)
         (#x10600 #x106BF linear-a)
         (#x10800 #x1083F cypriot-syllabary)
         (#x10840 #x1085F aramaic)
         (#x10860 #x1087F palmyrene)
         (#x10880 #x108AF nabataean)
         (#x10900 #x1091F phoenician)
         (#x10920 #x1093F lydian)
         (#x10980 #x109FF meroitic)
         (#x10A00 #x10A5F kharoshthi)
         (#x10A60 #x10A7F old-south-arabian)
         (#x10A80 #x10A9F old-north-arabian)
         (#x10AC0 #x10AFF manichaean)
         (#x10B00 #x10B3F avestan)
         (#x10B40 #x10B5F inscriptional-parthian)
         (#x10B60 #x10B7F inscriptional-pahlavi)
         (#x10B80 #x10BAF psalter-pahlavi)
         (#x10C00 #x10C4F old-turkic)
         (#x10E60 #x10E7F rumi-number)
         (#x11000 #x1107F brahmi)
         (#x11080 #x110CF kaithi)
         (#x110D0 #x110FF sora-sompeng)
         (#x11100 #x1114F chakma)
         (#x11150 #x1117F mahajani)
         (#x11180 #x111DF sharada)
         (#x111E0 #x111FF sinhala-archaic-number)
         (#x11200 #x1124F khojki)
         (#x112B0 #x112FF khudawadi)
         (#x11300 #x1137F grantha)
         (#x11480 #x114DF tirhuta)
         (#x11580 #x115FF siddham)
         (#x11600 #x1165F modi)
         (#x11680 #x116CF takri)
         (#x118A0 #x118FF warang-citi)
         (#x11AC0 #x11AFF pau-cin-hau)
         (#x12000 #x123FF cuneiform)
         (#x12400 #x1247F cuneiform-numbers-and-punctuation)
         (#x13000 #x1342F egyptian)
         (#x16800 #x16A3F bamum)
         (#x16A40 #x16A6F mro)
         (#x16AD0 #x16AFF bassa-vah)
         (#x16B00 #x16B8F pahawh-hmong)
         (#x16F00 #x16F9F miao)
         (#x1B000 #x1B0FF kana)         ; Kana Supplement
         (#x1BC00 #x1BCAF duployan-shorthand)
         (#x1D000 #x1D0FF byzantine-musical-symbol)
         (#x1D100 #x1D1FF musical-symbol)
         (#x1D200 #x1D24F ancient-greek-musical-notation)
         (#x1D300 #x1D35F tai-xuan-jing-symbol)
         (#x1D360 #x1D37F counting-rod-numeral)
         (#x1D400 #x1D7FF mathematical)
         (#x1E800 #x1E8DF mende-kikakui)
         (#x1EE00 #x1EEFF arabic)       ; Arabic Mathematical Alphabetic Symbols
         (#x1F000 #x1F02F mahjong-tile)
         (#x1F030 #x1F09F domino-tile)
         (#x1F0A0 #x1F0FF playing-cards)
         (#x1F100 #x1F1FF symbol)       ; Enclosed Alphanumeric Supplement
         (#x1F200 #x1F2FF han)          ; Enclosed Ideographic Supplement
         (#x1F300 #x1F8FF symbol)
         (#x20000 #x2B81F han)
         (#x2F800 #x2FFFF han)))
    (set-char-table-range char-script-table
                          (cons (car elt) (nth 1 elt)) (nth 2 elt))
    ;; Collect each script symbol once; the list is built in reverse
    ;; and put back in order below.
    (or (memq (nth 2 elt) script-list)
        (setq script-list (cons (nth 2 elt) script-list))))
  (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))
1380
;; Assign the `tibetan' script to every character of the `tibetan'
;; charset.
(map-charset-chars
 (lambda (char-range _ignore)
   (set-char-table-range char-script-table char-range 'tibetan))
 'tibetan)
1385
e7259832 1386\f
59db3a5c
KH
1387;;; Setting unicode-category-table.
1388
20372d0c
GM
;; Load the general-category property table.  If it is available, add
;; category `.' to every character whose general category does not
;; begin with `M' or `C' (the explicit `Zs' test is kept from the
;; original condition).
(when (setq unicode-category-table
            (unicode-property-table-internal 'general-category))
  (map-char-table
   #'(lambda (key val)
       (when val
         (let ((major (aref (symbol-name val) 0)))
           (unless (and (memq major '(?M ?C))
                        (not (eq val 'Zs)))
             (modify-category-entry key ?.)))))
   unicode-category-table))

(optimize-char-table (standard-category-table))
59db3a5c
KH
1400
1401\f
b2cca856
KH
1402;; Display of glyphless characters.
1403
(defvar char-acronym-table
  (make-char-table 'char-acronym-table nil)
  "Char table of acronyms for non-graphic characters.")

;; C0 controls, U+0000..U+001F, in code order.  The two nil entries
;; are U+0009 and U+000A, which get no acronym.  U+001C is "FS"
;; (it was previously entered as "FC").
(let ((c0-acronyms '("NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK" "BEL"
                     "BS"   nil   nil  "VT"  "FF"  "CR"  "SO"  "SI"
                     "DLE" "DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB"
                     "CAN" "EM"  "SUB" "ESC" "FS"  "GS"  "RS"  "US")))
  (dotimes (i 32)
    (aset char-acronym-table i (car c0-acronyms))
    (setq c0-acronyms (cdr c0-acronyms))))

;; C1 controls, U+0080..U+009F, in code order.  "XXX" is kept for the
;; positions the original table left without a name.  U+008D, U+008F
;; and U+009A are "RI", "SS3" and "SCI" (previously entered as "R1",
;; "SS1" and "SC1").
(let ((c1-acronyms '("XXX" "XXX" "BPH" "NBH" "IND" "NEL" "SSA" "ESA"
                     "HTS" "HTJ" "VTS" "PLD" "PLU" "RI"  "SS2" "SS3"
                     "DCS" "PU1" "PU2" "STS" "CCH" "MW"  "SPA" "EPA"
                     "SOS" "XXX" "SCI" "CSI" "ST"  "OSC" "PM"  "APC")))
  (dotimes (i 32)
    (aset char-acronym-table (+ #x0080 i) (car c1-acronyms))
    (setq c1-acronyms (cdr c1-acronyms))))

(aset char-acronym-table #x17B4 "KIVAQ")   ; KHMER VOWEL INHERENT AQ
(aset char-acronym-table #x17B5 "KIVAA")   ; KHMER VOWEL INHERENT AA
(aset char-acronym-table #x200B "ZWSP")    ; ZERO WIDTH SPACE
(aset char-acronym-table #x200C "ZWNJ")    ; ZERO WIDTH NON-JOINER
(aset char-acronym-table #x200D "ZWJ")     ; ZERO WIDTH JOINER
(aset char-acronym-table #x200E "LRM")     ; LEFT-TO-RIGHT MARK
(aset char-acronym-table #x200F "RLM")     ; RIGHT-TO-LEFT MARK
(aset char-acronym-table #x202A "LRE")     ; LEFT-TO-RIGHT EMBEDDING
(aset char-acronym-table #x202B "RLE")     ; RIGHT-TO-LEFT EMBEDDING
(aset char-acronym-table #x202C "PDF")     ; POP DIRECTIONAL FORMATTING
(aset char-acronym-table #x202D "LRO")     ; LEFT-TO-RIGHT OVERRIDE
(aset char-acronym-table #x202E "RLO")     ; RIGHT-TO-LEFT OVERRIDE
(aset char-acronym-table #x2060 "WJ")      ; WORD JOINER
(aset char-acronym-table #x206A "ISS")     ; INHIBIT SYMMETRIC SWAPPING
(aset char-acronym-table #x206B "ASS")     ; ACTIVATE SYMMETRIC SWAPPING
(aset char-acronym-table #x206C "IAFS")    ; INHIBIT ARABIC FORM SHAPING
(aset char-acronym-table #x206D "AAFS")    ; ACTIVATE ARABIC FORM SHAPING
(aset char-acronym-table #x206E "NADS")    ; NATIONAL DIGIT SHAPES
(aset char-acronym-table #x206F "NODS")    ; NOMINAL DIGIT SHAPES
(aset char-acronym-table #xFEFF "ZWNBSP")  ; ZERO WIDTH NO-BREAK SPACE
(aset char-acronym-table #xFFF9 "IAA")     ; INTERLINEAR ANNOTATION ANCHOR
(aset char-acronym-table #xFFFA "IAS")     ; INTERLINEAR ANNOTATION SEPARATOR
(aset char-acronym-table #xFFFB "IAT")     ; INTERLINEAR ANNOTATION TERMINATOR
(aset char-acronym-table #x1D173 "BEGBM")  ; MUSICAL SYMBOL BEGIN BEAM
(aset char-acronym-table #x1D174 "ENDBM")  ; MUSICAL SYMBOL END BEAM
(aset char-acronym-table #x1D175 "BEGTIE") ; MUSICAL SYMBOL BEGIN TIE
;; Was "END", which dropped the "TIE" that its counterpart "BEGTIE"
;; and the character name both carry.
(aset char-acronym-table #x1D176 "ENDTIE") ; MUSICAL SYMBOL END TIE
(aset char-acronym-table #x1D177 "BEGSLR") ; MUSICAL SYMBOL BEGIN SLUR
(aset char-acronym-table #x1D178 "ENDSLR") ; MUSICAL SYMBOL END SLUR
(aset char-acronym-table #x1D179 "BEGPHR") ; MUSICAL SYMBOL BEGIN PHRASE
(aset char-acronym-table #x1D17A "ENDPHR") ; MUSICAL SYMBOL END PHRASE
(aset char-acronym-table #xE0001 "|->TAG") ; LANGUAGE TAG
(aset char-acronym-table #xE0020 "SP TAG") ; TAG SPACE
;; U+E0021..U+E007E: " X TAG", where X is the character with code
;; 33 + offset, i.e. `!' through `~'.
(dotimes (i 94)
  (aset char-acronym-table (+ #xE0021 i) (format " %c TAG" (+ 33 i))))
(aset char-acronym-table #xE007F "->|TAG") ; CANCEL TAG
1460
(defun update-glyphless-char-display (&optional variable value)
  "Make the setting of `glyphless-char-display-control' take effect.
This function updates the char-table `glyphless-char-display'.

It is used as the `:set' function of `glyphless-char-display-control'.
If VARIABLE is non-nil, its default value is set to VALUE first.
VALUE is a list of (GROUP . METHOD) directives; each one is applied
to `glyphless-char-display' in order.  An error is signaled for a
METHOD or GROUP that is not recognized."
  ;; The test must be on VARIABLE, not VALUE: a nil VALUE (an empty
  ;; directive list) still has to be stored, and when no VARIABLE is
  ;; given there is nothing to store into.
  (when variable
    (set-default variable value))
  (dolist (elt value)
    (let ((target (car elt))
          (method (cdr elt)))
      (or (memq method '(zero-width thin-space empty-box acronym hex-code))
          (error "Invalid glyphless character display method: %s" method))
      (cond ((eq target 'c0-control)
             (glyphless-set-char-table-range glyphless-char-display
                                             #x00 #x1F method)
             ;; Users will not expect their newlines and TABs be
             ;; displayed as anything but themselves, so exempt those
             ;; two characters from c0-control.
             (set-char-table-range glyphless-char-display #x9 nil)
             (set-char-table-range glyphless-char-display #xa nil))
            ((eq target 'c1-control)
             (glyphless-set-char-table-range glyphless-char-display
                                             #x80 #x9F method))
            ((eq target 'format-control)
             (when unicode-category-table
               (map-char-table
                #'(lambda (char category)
                    (if (eq category 'Cf)
                        (let ((this-method method)
                              from to)
                          ;; CHAR is either one character or a
                          ;; (FROM . TO) range.
                          (if (consp char)
                              (setq from (car char) to (cdr char))
                            (setq from char to char))
                          (while (<= from to)
                            ;; U+00AD is deliberately left alone.
                            (when (/= from #xAD)
                              (if (eq method 'acronym)
                                  (setq this-method
                                        (aref char-acronym-table from)))
                              (set-char-table-range glyphless-char-display
                                                    from this-method))
                            (setq from (1+ from))))))
                unicode-category-table)))
            ((eq target 'no-font)
             (set-char-table-extra-slot glyphless-char-display 0 method))
            (t
             (error "Invalid glyphless character group: %s" target))))))
bd3921f0
PS
1505
(defun glyphless-set-char-table-range (chartable from to method)
  "Set the entries of CHARTABLE for characters FROM through TO per METHOD.
If METHOD is the symbol `acronym', each character gets its own
entry from `char-acronym-table'.  For any other METHOD, the whole
range FROM..TO is set to METHOD itself."
  (cond
   ((not (eq method 'acronym))
    (set-char-table-range chartable (cons from to) method))
   (t
    ;; Acronyms differ per character, so fill the range one code
    ;; point at a time.
    (let ((code from))
      (while (<= code to)
        (set-char-table-range chartable code
                              (aref char-acronym-table code))
        (setq code (1+ code)))))))
b2cca856 1513
0e7c0582
EZ
1514;;; Control of displaying glyphless characters.
(defcustom glyphless-char-display-control
  '((format-control . thin-space)
    (no-font . hex-code))
  "List of directives to control display of glyphless characters.

Each element has the form (GROUP . METHOD), where GROUP is a
symbol specifying the character group, and METHOD is a symbol
specifying the method of displaying characters belonging to that
group.

GROUP must be one of these symbols:
  `c0-control':     U+0000..U+001F, but excluding newline and TAB.
  `c1-control':     U+0080..U+009F.
  `format-control': Characters of Unicode General Category `Cf',
                    such as U+200C (ZWNJ), U+200E (LRM), but
                    excluding characters that have graphic images,
                    such as U+00AD (SHY).
  `no-font':        characters for which no suitable font is found.
                    For character terminals, characters that cannot
                    be encoded by `terminal-coding-system'.

METHOD must be one of these symbols:
  `zero-width': don't display.
  `thin-space': display a thin (1-pixel width) space.  On character
                terminals, display as 1-character space.
  `empty-box':  display an empty box.
  `acronym':    display an acronym of the character in a box.  The
                acronym is taken from `char-acronym-table', which see.
  `hex-code':   display the hexadecimal character code in a box.

Do not set its value directly from Lisp; the value takes effect
only via a custom `:set'
function (`update-glyphless-char-display'), which updates
`glyphless-char-display'."
  :version "24.1"
  :type '(alist :key-type (symbol :tag "Character Group")
                :value-type (symbol :tag "Display Method"))
  ;; Every group offers the same set of display methods, so the
  ;; choice widget is written once and attached to each group.  Each
  ;; group gets its own copy so that no list structure is shared.
  :options (let ((method-choice
                  '(choice (const :tag "Don't display" zero-width)
                           (const :tag "Display as thin space" thin-space)
                           (const :tag "Display as empty box" empty-box)
                           (const :tag "Display acronym" acronym)
                           (const :tag "Display hex code in a box" hex-code))))
             (mapcar (lambda (group)
                       (list group (copy-tree method-choice)))
                     '(c0-control c1-control format-control no-font)))
  :set #'update-glyphless-char-display
  :group 'display)
1578
b2cca856 1579\f
e7259832
KH
1580;;; Setting word boundary.
1581
;; Category pairs used for word-boundary decisions, set in one form.
(setq word-combining-categories
      '((nil . ?^) (?^ . nil) (?C . ?H) (?C . ?K))
      word-separating-categories        ; (2-byte character sets)
      '((?H . ?K)))                     ; Hiragana - Katakana
1591
1cbfaab9 1592;; Local Variables:
985773c9 1593;; coding: utf-8
1cbfaab9 1594;; End:
777cfce6 1595
60370d40 1596;;; characters.el ends here