Spelling fixes.
[bpt/emacs.git] / lisp / international / characters.el
CommitLineData
4ed46869
KH
1;;; characters.el --- set syntax and category for multibyte characters
2
ba318903 3;; Copyright (C) 1997, 2000-2014 Free Software Foundation, Inc.
7976eda0 4;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 5;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
2fd125a3
KH
6;; National Institute of Advanced Industrial Science and Technology (AIST)
7;; Registration Number H14PRO021
8f924df7 8;; Copyright (C) 2003
55bd52ea
KH
9;; National Institute of Advanced Industrial Science and Technology (AIST)
10;; Registration Number H13PRO009
4ed46869
KH
11
12;; Keywords: multibyte character, character set, syntax, category
13
14;; This file is part of GNU Emacs.
15
4936186e 16;; GNU Emacs is free software: you can redistribute it and/or modify
4ed46869 17;; it under the terms of the GNU General Public License as published by
4936186e
GM
18;; the Free Software Foundation, either version 3 of the License, or
19;; (at your option) any later version.
4ed46869
KH
20
21;; GNU Emacs is distributed in the hope that it will be useful,
22;; but WITHOUT ANY WARRANTY; without even the implied warranty of
23;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24;; GNU General Public License for more details.
25
26;; You should have received a copy of the GNU General Public License
4936186e 27;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
4ed46869
KH
28
29;;; Commentary:
30
60370d40
PJ
31;;; Code:
32
4ed46869
KH
33;;; Predefined categories.
34
35;; For each character set.
36
46bf60bc
KH
37(define-category ?a "ASCII
38ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
4ed46869
KH
39(define-category ?l "Latin")
40(define-category ?t "Thai")
41(define-category ?g "Greek")
42(define-category ?b "Arabic")
43(define-category ?w "Hebrew")
44(define-category ?y "Cyrillic")
46bf60bc
KH
45(define-category ?k "Katakana
46Japanese katakana")
47(define-category ?r "Roman
48Japanese roman")
4ed46869
KH
49(define-category ?c "Chinese")
50(define-category ?j "Japanese")
51(define-category ?h "Korean")
46bf60bc
KH
52(define-category ?e "Ethiopic
53Ethiopic (Ge'ez)")
54(define-category ?v "Viet
55Vietnamese")
4ed46869 56(define-category ?i "Indian")
6eba8645 57(define-category ?o "Lao")
9395eb7c 58(define-category ?q "Tibetan")
4ed46869
KH
59
60;; For each group (row) of 2-byte character sets.
61
46bf60bc
KH
62(define-category ?A "2-byte alnum
63Alpha-numeric characters of 2-byte character sets")
64(define-category ?C "2-byte han
65Chinese (Han) characters of 2-byte character sets")
66(define-category ?G "2-byte Greek
67Greek characters of 2-byte character sets")
68(define-category ?H "2-byte Hiragana
69Japanese Hiragana characters of 2-byte character sets")
70(define-category ?K "2-byte Katakana
71Japanese Katakana characters of 2-byte character sets")
72(define-category ?N "2-byte Korean
73Korean Hangul characters of 2-byte character sets")
91c491e0 74(define-category ?Y "2-byte Cyrillic
46bf60bc 75Cyrillic characters of 2-byte character sets")
4ed46869
KH
76(define-category ?I "Indian Glyphs")
77
78;; For phonetic classifications.
79
80(define-category ?0 "consonant")
46bf60bc 81(define-category ?1 "base vowel
4eb97232 82Base (independent) vowel")
46bf60bc 83(define-category ?2 "upper diacritic
4eb97232 84Upper diacritical mark (including upper vowel)")
46bf60bc 85(define-category ?3 "lower diacritic
4eb97232 86Lower diacritical mark (including lower vowel)")
46bf60bc 87(define-category ?4 "combining tone
4eb97232 88Combining tone mark")
9765a2ba 89(define-category ?5 "symbol")
4ed46869 90(define-category ?6 "digit")
91c491e0 91(define-category ?7 "vowel diacritic
4eb97232 92Vowel-modifying diacritical mark")
6eba8645
KH
93(define-category ?8 "vowel-signs")
94(define-category ?9 "semivowel lower")
4ed46869
KH
95
96;; For filling.
46bf60bc
KH
97(define-category ?| "line breakable
98While filling, we can break a line at this character.")
4ed46869 99
504af7b2 100;; For indentation calculation.
70ea295a 101(define-category ?\s
46bf60bc
KH
102 "space for indent
103This character counts as a space for indentation purposes.")
504af7b2 104
94487c4e 105;; Keep the following for `kinsoku' processing. See comments in
4ed46869 106;; kinsoku.el.
46bf60bc
KH
107(define-category ?> "Not at bol
108A character which can't be placed at beginning of line.")
109(define-category ?< "Not at eol
110A character which can't be placed at end of line.")
4ed46869 111
8ea6fa80
KH
112;; Base and Combining
113(define-category ?. "Base
114Base characters (Unicode General Category L,N,P,S,Zs)")
46bf60bc 115(define-category ?^ "Combining
4eb97232 116Combining diacritic or mark (Unicode General Category M)")
f635daa1
CY
117
118;; bidi types
119(define-category ?R "Right-to-left (strong)
120Characters with \"strong\" right-to-left directionality, i.e.
121with R, AL, RLE, or RLO Unicode bidi character type.")
122
123(define-category ?L "Left-to-right (strong)
124Characters with \"strong\" left-to-right directionality, i.e.
125with L, LRE, or LRO Unicode bidi character type.")
126
4ed46869
KH
127\f
128;;; Setting syntax and category.
129
130;; ASCII
131
e2cc40b7
KH
132;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
133(modify-category-entry '(32 . 127) ?a)
134(modify-category-entry '(32 . 127) ?l)
4ed46869 135
c94ae9eb
DL
136;; Deal with the CJK charsets first. Since the syntax of blocks is
137;; defined per charset, and the charsets may contain e.g. Latin
138;; characters, we end up with the wrong syntax definitions if we're
139;; not careful.
4ed46869 140
66bff5ed 141;; Chinese characters (Unicode)
a5bb49e1
KH
142(modify-category-entry '(#x2E80 . #x312F) ?|)
143(modify-category-entry '(#x3190 . #x33FF) ?|)
66a85e76
KH
144(modify-category-entry '(#x3400 . #x4DBF) ?C)
145(modify-category-entry '(#x4E00 . #x9FAF) ?C)
66bff5ed
KH
146(modify-category-entry '(#x3400 . #x9FAF) ?c)
147(modify-category-entry '(#x3400 . #x9FAF) ?|)
148(modify-category-entry '(#xF900 . #xFAFF) ?C)
149(modify-category-entry '(#xF900 . #xFAFF) ?c)
150(modify-category-entry '(#xF900 . #xFAFF) ?|)
796f8b2f
KH
151(modify-category-entry '(#x20000 . #x2FFFF) ?|)
152(modify-category-entry '(#x20000 . #x2FFFF) ?C)
153(modify-category-entry '(#x20000 . #x2FFFF) ?c)
8e4cd685 154
4ed46869
KH
155
156;; Chinese character set (GB2312)
157
66bff5ed
KH
158(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E)
159(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E)
160(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E)
4ed46869 161
87a39edb 162(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
9ad4b491
KH
163(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339)
164(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A)
165(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A)
66bff5ed
KH
166(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?H #x2421 #x247E)
167(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?K #x2521 #x257E)
168(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?G #x2621 #x267E)
169(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?Y #x2721 #x277E)
170(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?C #x3021 #x7E7E)
4ed46869
KH
171
172;; Chinese character set (BIG5)
173
e7259832 174(map-charset-chars #'modify-category-entry 'big5 ?c)
66a85e76 175(map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA261)
9ad4b491 176(map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E)
66a85e76 177(map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DC)
4ed46869
KH
178
179;; Chinese character set (CNS11643)
180
87a39edb
DL
;; Every CNS 11643 plane gets category `c' (Chinese).  Category `C'
;; (2-byte han) is given to code points #x4421..#x7E7E of plane 1 and
;; to every character of the other planes.
(dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                 chinese-cns11643-7))
  (map-charset-chars #'modify-category-entry plane ?c)
  (cond ((eq plane 'chinese-cns11643-1)
         (map-charset-chars #'modify-category-entry plane ?C #x4421 #x7E7E))
        (t
         (map-charset-chars #'modify-category-entry plane ?C))))
4ed46869 188
8f924df7 189;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
4ed46869 190
66bff5ed 191(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
4ed46869 192
66bff5ed 193(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)
4ed46869 194
8f924df7 195(dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
761f6427
KH
196 japanese-jisx0213-1 japanese-jisx0213-2
197 cp932-2-byte))
8e4cd685 198 (map-charset-chars #'modify-category-entry l ?j))
4ed46869 199
c4186f9c
KH
200;; Fullwidth characters
201(modify-category-entry '(#xff01 . #xff60) ?\|)
202
269a5dd0 203;; Unicode equivalents of JISX0201-kana
66bff5ed
KH
;; U+FF61..U+FF9F get three categories: `k' (Katakana), `j' (Japanese)
;; and `|' (line breakable).
(dolist (category '(?k ?j ?\|))
  (modify-category-entry '(#xff61 . #xff9f) category))
269a5dd0
DL
208
209;; Katakana block
796f8b2f
KH
210(modify-category-entry '(#x3099 . #x309C) ?K)
211(modify-category-entry '(#x30A0 . #x30FF) ?K)
6f3ac1e1 212(modify-category-entry '(#x31F0 . #x31FF) ?K)
b11c2874 213(modify-category-entry '(#x30A0 . #x30FA) ?\|)
796f8b2f 214(modify-category-entry #x30FF ?\|)
269a5dd0
DL
215
216;; Hiragana block
796f8b2f
KH
217(modify-category-entry '(#x3040 . #x309F) ?H)
218(modify-category-entry '(#x3040 . #x3096) ?\|)
219(modify-category-entry #x309F ?\|)
220(modify-category-entry #x30A0 ?H)
221(modify-category-entry #x30FC ?H)
222
269a5dd0 223
4ed46869 224;; JISX0208
66bff5ed
KH
225(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
226(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
227(let ((chars '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇)))
69c2c6ea 228 (dolist (elt chars)
2b89bca4 229 (modify-syntax-entry elt "w")))
66bff5ed
KH
230
231(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E)
232(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E)
233(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?K #x2521 #x257E)
234(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E)
235(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E)
236(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E)
;; These four characters also get category `C' (2-byte han).
(dolist (ch '(?仝 ?々 ?〆 ?〇))
  (modify-category-entry ch ?C))
241
242;; JISX0212
4ed46869 243
66bff5ed 244(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
4ed46869
KH
245
246;; JISX0201-Kana
87a39edb 247
;; Give each of these three marks punctuation syntax.
(mapc (lambda (ch) (modify-syntax-entry ch "."))
      '(?。 ?、 ?・))
252
e6d10035
KH
;; The corner brackets are a matched pair.  The left bracket gets
;; open-parenthesis syntax ("(") and names the right bracket as its
;; partner; the right bracket gets close-parenthesis syntax (")") and
;; names the left bracket as its partner.
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
226e4119 255
4ed46869
KH
256;; Korean character set (KSC5601)
257
87a39edb 258(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)
66bff5ed
KH
259
260(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E)
439f7264
DL
261(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E)
262(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E)
263(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E)
9ad4b491
KH
264(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339)
265(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A)
266(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A)
66bff5ed
KH
267(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E)
268(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E)
269(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E)
270(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E)
4ed46869 271
c94ae9eb 272;; These are in more than one charset.
8f924df7
KH
;; BRACKETS is read two characters at a time: an opener followed by
;; its closer.  The opener gets open-parenthesis syntax matching the
;; closer, and the closer gets close-parenthesis syntax matching the
;; opener.  A trailing unpaired character, if any, is ignored.
(let* ((brackets (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
                         "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
                         "()[]{}"))
       (limit (* 2 (/ (length brackets) 2)))
       (pos 0))
  (while (< pos limit)
    (let ((opener (aref brackets pos))
          (closer (aref brackets (1+ pos))))
      (modify-syntax-entry opener (string ?\( closer))
      (modify-syntax-entry closer (string ?\) opener)))
    (setq pos (+ pos 2))))
d05cfa1f 282
c94ae9eb 283;; Arabic character set
6eba8645 284
c94ae9eb
DL
;; Category `b' (Arabic) goes first to every character of the Arabic
;; charsets, then to the three code point ranges listed below.
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))
6eba8645 295
c94ae9eb
DL
296;; Cyrillic character set (ISO-8859-5)
297
298(modify-syntax-entry ?№ ".")
299
300;; Ethiopic character set
301
4c81b0f6
KH
302(modify-category-entry '(#x1200 . #x1399) ?e)
303(modify-category-entry '(#x2d80 . #x2dde) ?e)
;; Give each of these Ethiopic marks punctuation syntax.
(dolist (mark '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))
  (modify-syntax-entry mark "."))
308(map-charset-chars #'modify-category-entry 'ethiopic ?e)
309
310;; Hebrew character set (ISO-8859-8)
311
312(modify-syntax-entry #x5be ".") ; MAQAF
313(modify-syntax-entry #x5c0 ".") ; PASEQ
314(modify-syntax-entry #x5c3 ".") ; SOF PASUQ
315(modify-syntax-entry #x5f3 ".") ; GERESH
316(modify-syntax-entry #x5f4 ".") ; GERSHAYIM
317
318;; Indian character set (IS 13194 and other Emacs original Indian charsets)
319
320(modify-category-entry '(#x901 . #x970) ?i)
321(map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
322(map-charset-chars #'modify-category-entry 'indian-2-column ?i)
d05cfa1f 323
6eba8645
KH
324;; Lao character set
325
abdaa411
DL
326(modify-category-entry '(#xe80 . #xeff) ?o)
327(map-charset-chars #'modify-category-entry 'lao ?o)
6eba8645 328
abdaa411 329(let ((deflist '(("ກ-ຮ" "w" ?0) ; consonant
e6d10035
KH
330 ("ະາຳຽເ-ໄ" "w" ?1) ; vowel base
331 ("ັິ-ືົໍ" "w" ?2) ; vowel upper
332 ("ຸູ" "w" ?3) ; vowel lower
8f924df7 333 ("່-໋" "w" ?4) ; tone mark
e6d10035
KH
334 ("ຼຽ" "w" ?9) ; semivowel lower
335 ("໐-໙" "w" ?6) ; digit
336 ("ຯໆ" "_" ?5) ; symbol
6eba8645
KH
337 ))
338 elm chars len syntax category to ch i)
339 (while deflist
340 (setq elm (car deflist))
341 (setq chars (car elm)
342 len (length chars)
343 syntax (nth 1 elm)
344 category (nth 2 elm)
345 i 0)
346 (while (< i len)
347 (if (= (aref chars i) ?-)
348 (setq i (1+ i)
4a027a0d
KH
349 to (aref chars i))
350 (setq ch (aref chars i)
6eba8645
KH
351 to ch))
352 (while (<= ch to)
269a5dd0
DL
353 (unless (string-equal syntax "w")
354 (modify-syntax-entry ch syntax))
6eba8645
KH
355 (modify-category-entry ch category)
356 (setq ch (1+ ch)))
4a027a0d 357 (setq i (1+ i)))
6eba8645
KH
358 (setq deflist (cdr deflist))))
359
4ed46869
KH
360;; Thai character set (TIS620)
361
abdaa411
DL
362(modify-category-entry '(#xe00 . #xe7f) ?t)
363(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
4ed46869
KH
364
365(let ((deflist '(;; chars syntax category
e6d10035
KH
366 ("ก-รลว-ฮ" "w" ?0) ; consonant
367 ("ฤฦะาำเ-ๅ" "w" ?1) ; vowel base
368 ("ัิ-ื็๎" "w" ?2) ; vowel upper
369 ("ุ-ฺ" "w" ?3) ; vowel lower
8f924df7 370 ("่-ํ" "w" ?4) ; tone mark
e6d10035
KH
371 ("๐-๙" "w" ?6) ; digit
372 ("ฯๆ฿๏๚๛" "_" ?5) ; symbol
4ed46869
KH
373 ))
374 elm chars len syntax category to ch i)
9395eb7c
KH
375 (while deflist
376 (setq elm (car deflist))
377 (setq chars (car elm)
378 len (length chars)
379 syntax (nth 1 elm)
380 category (nth 2 elm)
381 i 0)
382 (while (< i len)
383 (if (= (aref chars i) ?-)
384 (setq i (1+ i)
4a027a0d
KH
385 to (aref chars i))
386 (setq ch (aref chars i)
9395eb7c
KH
387 to ch))
388 (while (<= ch to)
269a5dd0
DL
389 (unless (string-equal syntax "w")
390 (modify-syntax-entry ch syntax))
9395eb7c
KH
391 (modify-category-entry ch category)
392 (setq ch (1+ ch)))
4a027a0d 393 (setq i (1+ i)))
9395eb7c
KH
394 (setq deflist (cdr deflist))))
395
396;; Tibetan character set
397
abdaa411
DL
398(modify-category-entry '(#xf00 . #xfff) ?q)
399(map-charset-chars #'modify-category-entry 'tibetan ?q)
400(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
9395eb7c
KH
401
402(let ((deflist '(;; chars syntax category
725d7c92 403 ("ཀ-ཀྵཪ" "w" ?0) ; consonant
55a3ed16 404 ("ྐ-ྐྵྺྻྼ" "w" ?0) ;
725d7c92
DL
405 ("ིེཻོཽྀ" "w" ?2) ; upper vowel
406 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier
53964682 407 ("྄ཱུ༙༵༷" "w" ?3) ; lower vowel/modifier
8f924df7 408 ("཰" "w" ?3) ; invisible vowel a
725d7c92
DL
409 ("༠-༩༪-༳" "w" ?6) ; digit
410 ("་།-༒༔ཿ" "." ?|) ; line-break char
411 ("་།༏༐༑༔ཿ" "." ?|) ;
412 ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition
413 ("་།༏༐༑༔ཿ" "." ?>) ;
414 ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition
415 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
9395eb7c
KH
416 ))
417 elm chars len syntax category to ch i)
4ed46869
KH
418 (while deflist
419 (setq elm (car deflist))
420 (setq chars (car elm)
421 len (length chars)
422 syntax (nth 1 elm)
423 category (nth 2 elm)
424 i 0)
425 (while (< i len)
426 (if (= (aref chars i) ?-)
427 (setq i (1+ i)
4a027a0d
KH
428 to (aref chars i))
429 (setq ch (aref chars i)
4ed46869
KH
430 to ch))
431 (while (<= ch to)
269a5dd0
DL
432 (unless (string-equal syntax "w")
433 (modify-syntax-entry ch syntax))
4ed46869
KH
434 (modify-category-entry ch category)
435 (setq ch (1+ ch)))
4a027a0d 436 (setq i (1+ i)))
4ed46869
KH
437 (setq deflist (cdr deflist))))
438
439;; Vietnamese character set
440
abdaa411
DL
441;; To make a word with Latin characters
442(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
443(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
444
445(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
446(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
4ed46869 447
e5dd1155
KH
;; For every code 32..127, decode the character at that code in the
;; upper-case and the lower-case VISCII charsets, make the two a case
;; pair in the standard case table, and give category `v' to each
;; one's `ucs' encoding when `encode-char' returns one.
(let ((viscii-case-table (standard-case-table))
      (code 32))
  (while (<= code 127)
    (let* ((upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower viscii-case-table)
      (and upper-ucs (modify-category-entry upper-ucs ?v))
      (and lower-ucs (modify-category-entry lower-ucs ?v)))
    (setq code (1+ code))))
460
d807d0c7
KH
461;; Tai Viet
462(let ((deflist '(;; chars syntax category
da6062e6 463 ((?ꪀ. ?ꪯ) "w" ?0) ; consonant
d807d0c7
KH
464 ("ꪱꪵꪶ" "w" ?1) ; vowel base
465 ((?ꪹ . ?ꪽ) "w" ?1) ; vowel base
466 ("ꪰꪲꪳꪷꪸꪾ" "w" ?2) ; vowel upper
467 ("ꪴ" "w" ?3) ; vowel lower
468 ("ꫀꫂ" "w" ?1) ; non-combining tone-mark
469 ("꪿꫁" "w" ?4) ; combining tone-mark
470 ((?ꫛ . ?꫟) "_" ?5) ; symbol
471 )))
472 (dolist (elm deflist)
473 (let ((chars (car elm))
474 (syntax (nth 1 elm))
475 (category (nth 2 elm)))
476 (if (consp chars)
477 (progn
478 (modify-syntax-entry chars syntax)
479 (modify-category-entry chars category))
480 (mapc #'(lambda (x)
481 (modify-syntax-entry x syntax)
482 (modify-category-entry x category))
483 chars)))))
c94ae9eb 484
f635daa1
CY
485;; Bidi categories
486
20372d0c
GM
487;; If bootstrapping without generated uni-*.el files, table not defined.
;; Map bidi classes onto categories: R, AL, RLO and RLE give `R';
;; L, LRE and LRO give `L'; every other class is left alone.  Nothing
;; is done when the bidi-class property table is unavailable.
(let ((bidi-classes (unicode-property-table-internal 'bidi-class)))
  (if bidi-classes
      (map-char-table
       (lambda (chars class)
         (let ((category (cond ((memq class '(R AL RLO RLE)) ?R)
                               ((memq class '(L LRE LRO)) ?L))))
           (if category
               (modify-category-entry chars category))))
       bidi-classes)))
f635daa1 497
c94ae9eb
DL
498;; Latin
499
500(modify-category-entry '(#x80 . #x024F) ?l)
d05cfa1f 501
85ef8ece
KH
502(let ((tbl (standard-case-table)) c)
503
4fb82d62
DL
504 ;; Latin-1
505
506 ;; Fixme: Some of the non-word syntaxes here perhaps should be
507 ;; reviewed. (Note that the following all implicitly have word
508 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of
509 ;; relating Unicode categories to Emacs syntax codes.
db92e81e
KH
510
511 ;; NBSP isn't semantically interchangeable with other whitespace chars,
e1dbe924 512 ;; so it's more like punctuation.
db92e81e 513 (set-case-syntax ?  "." tbl)
4fb82d62
DL
514 (set-case-syntax ?¡ "." tbl)
515 (set-case-syntax ?¦ "_" tbl)
516 (set-case-syntax ?§ "." tbl)
517 (set-case-syntax ?© "_" tbl)
db3b7db5
SM
518 ;; French wants
519 ;; (set-case-syntax-delims ?« ?» tbl)
520 ;; And German wants
521 ;; (set-case-syntax-delims ?» ?« tbl)
522 ;; So let's stay neutral and let users set these up if/when they want to.
523 (set-case-syntax ?« "." tbl)
524 (set-case-syntax ?» "." tbl)
4fb82d62
DL
525 (set-case-syntax ?¬ "_" tbl)
526 (set-case-syntax ?­ "_" tbl)
527 (set-case-syntax ?® "_" tbl)
528 (set-case-syntax ?° "_" tbl)
529 (set-case-syntax ?± "_" tbl)
530 (set-case-syntax ?µ "_" tbl)
531 (set-case-syntax ?· "_" tbl)
532 (set-case-syntax ?¼ "_" tbl)
533 (set-case-syntax ?½ "_" tbl)
534 (set-case-syntax ?¾ "_" tbl)
535 (set-case-syntax ?¿ "." tbl)
536 (let ((c 192))
537 (while (<= c 222)
538 (set-case-syntax-pair c (+ c 32) tbl)
539 (setq c (1+ c))))
540 (set-case-syntax ?× "_" tbl)
541 (set-case-syntax ?ß "w" tbl)
542 (set-case-syntax ?÷ "_" tbl)
543 ;; See below for ÿ.
85ef8ece 544
85ef8ece
KH
545 ;; Latin Extended-A, Latin Extended-B
546 (setq c #x0100)
e5e381c8
KH
547 (while (<= c #x02B8)
548 (modify-category-entry c ?l)
d05cfa1f 549 (setq c (1+ c)))
2bb915b8 550
e5e381c8
KH
551 (let ((pair-ranges '((#x0100 . #x012F)
552 (#x0132 . #x0137)
553 (#x0139 . #x0148)
554 (#x014a . #x0177)
555 (#x0179 . #x017E)
556 (#x0182 . #x0185)
796f8b2f
KH
557 (#x0187 . #x0188)
558 (#x018B . #x018C)
e5e381c8
KH
559 (#x0191 . #x0192)
560 (#x0198 . #x0199)
561 (#x01A0 . #x01A5)
562 (#x01A7 . #x01A8)
563 (#x01AC . #x01AD)
564 (#x01AF . #x01B0)
565 (#x01B3 . #x01B6)
d0203d61 566 (#x01B8 . #x01B9)
e5e381c8
KH
567 (#x01BC . #x01BD)
568 (#x01CD . #x01DC)
569 (#x01DE . #x01EF)
570 (#x01F4 . #x01F5)
571 (#x01F8 . #x021F)
572 (#x0222 . #x0233)
573 (#x023B . #x023C)
574 (#x0241 . #x0242)
575 (#x0246 . #x024F))))
576 (dolist (elt pair-ranges)
577 (let ((from (car elt)) (to (cdr elt)))
578 (while (< from to)
579 (set-case-syntax-pair from (1+ from) tbl)
580 (setq from (+ from 2))))))
2bb915b8 581
d0203d61 582 (set-case-syntax-pair ?Ÿ ?ÿ tbl)
796f8b2f 583
2bb915b8
KH
584 ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
585 ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
586 ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
587 ;; SMALL LETTER I.
588
589 ;; We used to set up half of those correspondences unconditionally,
590 ;; but that makes searches slow. So now we don't set up either half
591 ;; of these correspondences by default.
592
593 ;; (set-downcase-syntax ?İ ?i tbl)
594 ;; (set-upcase-syntax ?I ?ı tbl)
595
0d93216c
AS
596 (set-case-syntax-pair ?Ɓ ?ɓ tbl)
597 (set-case-syntax-pair ?Ɔ ?ɔ tbl)
d0203d61
AS
598 (set-case-syntax-pair ?Ɖ ?ɖ tbl)
599 (set-case-syntax-pair ?Ɗ ?ɗ tbl)
0d93216c
AS
600 (set-case-syntax-pair ?Ǝ ?ǝ tbl)
601 (set-case-syntax-pair ?Ə ?ə tbl)
602 (set-case-syntax-pair ?Ɛ ?ɛ tbl)
603 (set-case-syntax-pair ?Ɠ ?ɠ tbl)
604 (set-case-syntax-pair ?Ɣ ?ɣ tbl)
605 (set-case-syntax-pair ?Ɩ ?ɩ tbl)
606 (set-case-syntax-pair ?Ɨ ?ɨ tbl)
607 (set-case-syntax-pair ?Ɯ ?ɯ tbl)
608 (set-case-syntax-pair ?Ɲ ?ɲ tbl)
609 (set-case-syntax-pair ?Ɵ ?ɵ tbl)
610 (set-case-syntax-pair ?Ʀ ?ʀ tbl)
611 (set-case-syntax-pair ?Ʃ ?ʃ tbl)
612 (set-case-syntax-pair ?Ʈ ?ʈ tbl)
613 (set-case-syntax-pair ?Ʊ ?ʊ tbl)
614 (set-case-syntax-pair ?Ʋ ?ʋ tbl)
615 (set-case-syntax-pair ?Ʒ ?ʒ tbl)
e6d10035
KH
616 (set-case-syntax-pair ?DŽ ?dž tbl)
617 (set-case-syntax-pair ?Dž ?dž tbl)
618 (set-case-syntax-pair ?LJ ?lj tbl)
619 (set-case-syntax-pair ?Lj ?lj tbl)
620 (set-case-syntax-pair ?NJ ?nj tbl)
621 (set-case-syntax-pair ?Nj ?nj tbl)
e5e381c8 622
269a5dd0 623 ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
e6d10035
KH
624 (set-case-syntax-pair ?DZ ?dz tbl)
625 (set-case-syntax-pair ?Dz ?dz tbl)
e6d10035
KH
626 (set-case-syntax-pair ?Ƕ ?ƕ tbl)
627 (set-case-syntax-pair ?Ƿ ?ƿ tbl)
cb80bcd1
EZ
628 (set-case-syntax-pair ?Ⱥ ?ⱥ tbl)
629 (set-case-syntax-pair ?Ƚ ?ƚ tbl)
630 (set-case-syntax-pair ?Ⱦ ?ⱦ tbl)
631 (set-case-syntax-pair ?Ƀ ?ƀ tbl)
632 (set-case-syntax-pair ?Ʉ ?ʉ tbl)
633 (set-case-syntax-pair ?Ʌ ?ʌ tbl)
269a5dd0 634
85ef8ece 635 ;; Latin Extended Additional
abdaa411 636 (modify-category-entry '(#x1e00 . #x1ef9) ?l)
85ef8ece 637 (setq c #x1e00)
d05cfa1f 638 (while (<= c #x1ef9)
d05cfa1f
KH
639 (and (zerop (% c 2))
640 (or (<= c #x1e94) (>= c #x1ea0))
abdaa411 641 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f
KH
642 (setq c (1+ c)))
643
85ef8ece 644 ;; Greek
abdaa411 645 (modify-category-entry '(#x0370 . #x03ff) ?g)
85ef8ece 646 (setq c #x0370)
d05cfa1f 647 (while (<= c #x03ff)
d05cfa1f
KH
648 (if (or (and (>= c #x0391) (<= c #x03a1))
649 (and (>= c #x03a3) (<= c #x03ab)))
abdaa411 650 (set-case-syntax-pair c (+ c 32) tbl))
d05cfa1f
KH
651 (and (>= c #x03da)
652 (<= c #x03ee)
653 (zerop (% c 2))
abdaa411 654 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f 655 (setq c (1+ c)))
e6d10035
KH
656 (set-case-syntax-pair ?Ά ?ά tbl)
657 (set-case-syntax-pair ?Έ ?έ tbl)
658 (set-case-syntax-pair ?Ή ?ή tbl)
659 (set-case-syntax-pair ?Ί ?ί tbl)
660 (set-case-syntax-pair ?Ό ?ό tbl)
661 (set-case-syntax-pair ?Ύ ?ύ tbl)
662 (set-case-syntax-pair ?Ώ ?ώ tbl)
d05cfa1f 663
269a5dd0
DL
664 ;; Armenian
665 (setq c #x531)
666 (while (<= c #x556)
abdaa411 667 (set-case-syntax-pair c (+ c #x30) tbl)
269a5dd0
DL
668 (setq c (1+ c)))
669
85ef8ece 670 ;; Greek Extended
abdaa411 671 (modify-category-entry '(#x1f00 . #x1fff) ?g)
;; Pair each small letter with the capital eight code points above it.
;; Candidates have a low nibble of 0..7, lie at or below U+1FA7, and
;; are outside the U+1F70 row.  Code points with no such capital are
;; skipped: the unassigned U+1F16, U+1F17, U+1F46 and U+1F47, and
;; U+1F50, U+1F52, U+1F54 and U+1F56, whose would-be capitals U+1F58,
;; U+1F5A, U+1F5C and U+1F5E are unassigned.  U+1F57 must not be
;; skipped: its capital is U+1F5F.
(setq c #x1f00)
(while (<= c #x1fff)
  (and (<= (logand c #x000f) 7)
       (<= c #x1fa7)
       (not (memq c '(#x1f16 #x1f17 #x1f46 #x1f47
                      #x1f50 #x1f52 #x1f54 #x1f56)))
       (/= (logand c #x00f0) #x70)
       (set-case-syntax-pair (+ c 8) c tbl))
  (setq c (1+ c)))
e6d10035
KH
681 (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
682 (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
683 (set-case-syntax-pair ?Ὰ ?ὰ tbl)
684 (set-case-syntax-pair ?Ά ?ά tbl)
685 (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
686 (set-case-syntax-pair ?Ὲ ?ὲ tbl)
687 (set-case-syntax-pair ?Έ ?έ tbl)
688 (set-case-syntax-pair ?Ὴ ?ὴ tbl)
689 (set-case-syntax-pair ?Ή ?ή tbl)
690 (set-case-syntax-pair ?ῌ ?ῃ tbl)
691 (set-case-syntax-pair ?Ῐ ?ῐ tbl)
692 (set-case-syntax-pair ?Ῑ ?ῑ tbl)
693 (set-case-syntax-pair ?Ὶ ?ὶ tbl)
694 (set-case-syntax-pair ?Ί ?ί tbl)
695 (set-case-syntax-pair ?Ῠ ?ῠ tbl)
696 (set-case-syntax-pair ?Ῡ ?ῡ tbl)
697 (set-case-syntax-pair ?Ὺ ?ὺ tbl)
698 (set-case-syntax-pair ?Ύ ?ύ tbl)
699 (set-case-syntax-pair ?Ῥ ?ῥ tbl)
700 (set-case-syntax-pair ?Ὸ ?ὸ tbl)
701 (set-case-syntax-pair ?Ό ?ό tbl)
702 (set-case-syntax-pair ?Ὼ ?ὼ tbl)
703 (set-case-syntax-pair ?Ώ ?ώ tbl)
704 (set-case-syntax-pair ?ῼ ?ῳ tbl)
d05cfa1f 705
85ef8ece 706 ;; Cyrillic
abdaa411 707 (modify-category-entry '(#x0400 . #x04FF) ?y)
85ef8ece 708 (setq c #x0400)
d05cfa1f 709 (while (<= c #x04ff)
d05cfa1f
KH
710 (and (>= c #x0400)
711 (<= c #x040f)
abdaa411 712 (set-case-syntax-pair c (+ c 80) tbl))
d05cfa1f
KH
713 (and (>= c #x0410)
714 (<= c #x042f)
abdaa411 715 (set-case-syntax-pair c (+ c 32) tbl))
d05cfa1f
KH
716 (and (zerop (% c 2))
717 (or (and (>= c #x0460) (<= c #x0480))
718 (and (>= c #x048c) (<= c #x04be))
719 (and (>= c #x04d0) (<= c #x04f4)))
8f924df7 720 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f 721 (setq c (1+ c)))
e6d10035
KH
722 (set-case-syntax-pair ?Ӂ ?ӂ tbl)
723 (set-case-syntax-pair ?Ӄ ?ӄ tbl)
724 (set-case-syntax-pair ?Ӈ ?ӈ tbl)
725 (set-case-syntax-pair ?Ӌ ?ӌ tbl)
726 (set-case-syntax-pair ?Ӹ ?ӹ tbl)
d05cfa1f 727
85ef8ece
KH
728 ;; General Punctuation
;; U+2000..U+206F, handled as consecutive runs that share one syntax
;; class.  Each entry is (LAST-CHAR SYNTAX); a run begins where the
;; previous one ended, so `c' simply keeps counting upwards.
(setq c #x2000)
(dolist (run '((#x200b " ")
               (#x200f ".")
               ;; Fixme: These aren't all right:
               (#x2016 "_")
               ;; Punctuation syntax for quotation marks (like `)
               (#x201f ".")
               ;; Fixme: These aren't all right:
               (#x2027 "_")
               (#x206f ".")))
  (while (<= c (car run))
    (set-case-syntax c (cadr run) tbl)
    (setq c (1+ c))))
d05cfa1f 752
269a5dd0
DL
753 ;; Roman numerals
754 (setq c #x2160)
755 (while (<= c #x216f)
abdaa411 756 (set-case-syntax-pair c (+ c #x10) tbl)
269a5dd0
DL
757 (setq c (1+ c)))
758
4fb82d62
DL
759 ;; Fixme: The following blocks might be better as symbol rather than
760 ;; punctuation.
b427c97e
DL
761 ;; Arrows
762 (setq c #x2190)
6ca54a3a
DL
763 (while (<= c #x21FF)
764 (set-case-syntax c "." tbl)
b427c97e
DL
765 (setq c (1+ c)))
766 ;; Mathematical Operators
767 (while (<= c #x22FF)
6ca54a3a 768 (set-case-syntax c "." tbl)
b427c97e
DL
769 (setq c (1+ c)))
770 ;; Miscellaneous Technical
771 (while (<= c #x23FF)
6ca54a3a 772 (set-case-syntax c "." tbl)
b427c97e
DL
773 (setq c (1+ c)))
774 ;; Control Pictures
775 (while (<= c #x243F)
6ca54a3a 776 (set-case-syntax c "_" tbl)
269a5dd0
DL
777 (setq c (1+ c)))
778
779 ;; Circled Latin
780 (setq c #x24b6)
781 (while (<= c #x24cf)
abdaa411
DL
782 (set-case-syntax-pair c (+ c 26) tbl)
783 (modify-category-entry c ?l)
784 (modify-category-entry (+ c 26) ?l)
269a5dd0
DL
785 (setq c (1+ c)))
786
787 ;; Fullwidth Latin
788 (setq c #xff21)
789 (while (<= c #xff3a)
abdaa411
DL
790 (set-case-syntax-pair c (+ c #x20) tbl)
791 (modify-category-entry c ?l)
792 (modify-category-entry (+ c #x20) ?l)
269a5dd0
DL
793 (setq c (1+ c)))
794
;; Combining diacritics: the whole Combining Diacritical Marks block,
;; U+0300..U+036F.
(modify-category-entry '(#x300 . #x36f) ?^)
;; Combining marks
(modify-category-entry '(#x20d0 . #x20ff) ?^)
269a5dd0
DL
799
800 ;; Fixme: syntax for symbols &c
801 )
6b61353c
KH
802
;; Matched bracket pairs.  Each entry is (OPEN . CLOSE), written as
;; code points so that the table does not depend on how the text of
;; this file is encoded or normalized.  OPEN gets open-parenthesis
;; syntax with CLOSE as its partner, and CLOSE gets close-parenthesis
;; syntax with OPEN as its partner.
(dolist (pair '((#x2045 . #x2046)
                (#x207D . #x207E)
                (#x208D . #x208E)
                (#x2329 . #x232A)
                (#x23B4 . #x23B5)
                (#x2768 . #x2769)
                (#x276A . #x276B)
                (#x276C . #x276D)
                (#x2770 . #x2771)
                (#x2772 . #x2773)
                (#x2774 . #x2775)
                (#x27E6 . #x27E7)
                (#x27E8 . #x27E9)
                (#x27EA . #x27EB)
                (#x2983 . #x2984)
                (#x2985 . #x2986)
                (#x2987 . #x2988)
                (#x2989 . #x298A)
                (#x298B . #x298C)
                (#x298D . #x298E)
                (#x298F . #x2990)
                (#x2991 . #x2992)
                (#x2993 . #x2994)
                (#x2995 . #x2996)
                (#x2997 . #x2998)
                (#x29FC . #x29FD)
                (#x3008 . #x3009)
                (#x300A . #x300B)
                (#x300C . #x300D)
                (#x300E . #x300F)
                (#x3010 . #x3011)
                (#x3014 . #x3015)
                (#x3016 . #x3017)
                (#x3018 . #x3019)
                (#x301A . #x301B)
                (#xFD3E . #xFD3F)
                (#xFE35 . #xFE36)
                (#xFE37 . #xFE38)
                (#xFE39 . #xFE3A)
                (#xFE3B . #xFE3C)
                (#xFE3D . #xFE3E)
                (#xFE3F . #xFE40)
                (#xFE41 . #xFE42)
                (#xFE43 . #xFE44)
                (#xFE59 . #xFE5A)
                (#xFE5B . #xFE5C)
                (#xFE5D . #xFE5E)
                (#xFF08 . #xFF09)
                (#xFF3B . #xFF3D)
                (#xFF5B . #xFF5D)
                (#xFF5F . #xFF60)
                (#xFF62 . #xFF63)))
  (let ((open (car pair))
        (close (cdr pair)))
    (modify-syntax-entry open (string ?\( close))
    (modify-syntax-entry close (string ?\) open))))
860
4ed46869 861\f
777cfce6 862;; For each character set, record the most suitable coding system
aaa9f206 863;; for encoding it in its `preferred-coding-system' property.
777cfce6 864
abdaa411 865;; Fixme: should this be junked?
777cfce6
KH
;; Each entry is (CHARSET . CODING-SYSTEM); CODING-SYSTEM is stored as
;; CHARSET's `preferred-coding-system' property.
(dolist (entry '((latin-iso8859-1 . iso-latin-1)
                 (latin-iso8859-2 . iso-latin-2)
                 (latin-iso8859-3 . iso-latin-3)
                 (latin-iso8859-4 . iso-latin-4)
                 (thai-tis620 . thai-tis620)
                 (greek-iso8859-7 . greek-iso-8bit)
                 (arabic-iso8859-6 . iso-2022-7bit)
                 (hebrew-iso8859-8 . hebrew-iso-8bit)
                 (katakana-jisx0201 . japanese-shift-jis)
                 (latin-jisx0201 . japanese-shift-jis)
                 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
                 (latin-iso8859-9 . iso-latin-5)
                 (japanese-jisx0208-1978 . iso-2022-jp)
                 (chinese-gb2312 . chinese-iso-8bit)
                 (chinese-gbk . chinese-gbk)
                 (gb18030-2-byte . chinese-gb18030)
                 (gb18030-4-byte-bmp . chinese-gb18030)
                 (gb18030-4-byte-smp . chinese-gb18030)
                 (gb18030-4-byte-ext-1 . chinese-gb18030)
                 (gb18030-4-byte-ext-2 . chinese-gb18030)
                 (japanese-jisx0208 . iso-2022-jp)
                 (korean-ksc5601 . iso-2022-kr)
                 (japanese-jisx0212 . iso-2022-jp)
                 (chinese-big5-1 . chinese-big5)
                 (chinese-big5-2 . chinese-big5)
                 (chinese-sisheng . iso-2022-7bit)
                 (ipa . iso-2022-7bit)
                 (vietnamese-viscii-lower . vietnamese-viscii)
                 (vietnamese-viscii-upper . vietnamese-viscii)
                 (arabic-digit . iso-2022-7bit)
                 (arabic-1-column . iso-2022-7bit)
                 (lao . lao)
                 (arabic-2-column . iso-2022-7bit)
                 (indian-is13194 . devanagari)
                 (indian-glyph . devanagari)
                 (tibetan-1-column . tibetan)
                 (ethiopic . iso-2022-7bit)
                 (chinese-cns11643-1 . iso-2022-cn)
                 (chinese-cns11643-2 . iso-2022-cn)
                 (chinese-cns11643-3 . iso-2022-cn)
                 (chinese-cns11643-4 . iso-2022-cn)
                 (chinese-cns11643-5 . iso-2022-cn)
                 (chinese-cns11643-6 . iso-2022-cn)
                 (chinese-cns11643-7 . iso-2022-cn)
                 (indian-2-column . devanagari)
                 (tibetan . tibetan)
                 (latin-iso8859-14 . iso-latin-8)
                 (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car entry) 'preferred-coding-system (cdr entry)))
df0415c5
KH
917
918\f
;; Mark the character ranges that should trigger auto-filling in
;; `auto-fill-chars'.  SPACE and NEWLINE are already set.

(dolist (range '((#x3041 . #x30FF)
		 (#x3400 . #x4DB5)
		 (#x4e00 . #x9fbb)
		 (#xF900 . #xFAFF)
		 (#xFF00 . #xFF9F)
		 (#x20000 . #x2FFFF)))
  (set-char-table-range auto-fill-chars range t))
928
55bd52ea 929\f
7760ba82
KH
;;; Setting char-width-table.  The default is 1.

;; Width 0: non-spacing, enclosing combining, formatting, Hangul Jamo
;; medial and final characters.  Each element is a (FROM . TO) range.
(dolist (range '((#x0300 . #x036F)
		 (#x0483 . #x0489)
		 (#x0591 . #x05BD)
		 (#x05BF . #x05BF)
		 (#x05C1 . #x05C2)
		 (#x05C4 . #x05C5)
		 (#x05C7 . #x05C7)
		 (#x0600 . #x0603)
		 (#x0610 . #x0615)
		 (#x064B . #x065E)
		 (#x0670 . #x0670)
		 (#x06D6 . #x06E4)
		 (#x06E7 . #x06E8)
		 (#x06EA . #x06ED)
		 (#x070F . #x070F)
		 (#x0711 . #x0711)
		 (#x0730 . #x074A)
		 (#x07A6 . #x07B0)
		 (#x07EB . #x07F3)
		 (#x0901 . #x0902)
		 (#x093C . #x093C)
		 (#x0941 . #x0948)
		 (#x094D . #x094D)
		 (#x0951 . #x0954)
		 (#x0962 . #x0963)
		 (#x0981 . #x0981)
		 (#x09BC . #x09BC)
		 (#x09C1 . #x09C4)
		 (#x09CD . #x09CD)
		 (#x09E2 . #x09E3)
		 (#x0A01 . #x0A02)
		 (#x0A3C . #x0A3C)
		 (#x0A41 . #x0A4D)
		 (#x0A70 . #x0A71)
		 (#x0A81 . #x0A82)
		 (#x0ABC . #x0ABC)
		 (#x0AC1 . #x0AC8)
		 (#x0ACD . #x0ACD)
		 (#x0AE2 . #x0AE3)
		 (#x0B01 . #x0B01)
		 (#x0B3C . #x0B3C)
		 (#x0B3F . #x0B3F)
		 (#x0B41 . #x0B43)
		 (#x0B4D . #x0B56)
		 (#x0B82 . #x0B82)
		 (#x0BC0 . #x0BC0)
		 (#x0BCD . #x0BCD)
		 (#x0C3E . #x0C40)
		 (#x0C46 . #x0C56)
		 (#x0CBC . #x0CBC)
		 (#x0CBF . #x0CBF)
		 (#x0CC6 . #x0CC6)
		 (#x0CCC . #x0CCD)
		 (#x0CE2 . #x0CE3)
		 (#x0D41 . #x0D43)
		 (#x0D4D . #x0D4D)
		 (#x0DCA . #x0DCA)
		 (#x0DD2 . #x0DD6)
		 (#x0E31 . #x0E31)
		 (#x0E34 . #x0E3A)
		 (#x0E47 . #x0E4E)
		 (#x0EB1 . #x0EB1)
		 (#x0EB4 . #x0EBC)
		 (#x0EC8 . #x0ECD)
		 (#x0F18 . #x0F19)
		 (#x0F35 . #x0F35)
		 (#x0F37 . #x0F37)
		 (#x0F39 . #x0F39)
		 (#x0F71 . #x0F7E)
		 (#x0F80 . #x0F84)
		 (#x0F86 . #x0F87)
		 (#x0F90 . #x0FBC)
		 (#x0FC6 . #x0FC6)
		 (#x102D . #x1030)
		 (#x1032 . #x1037)
		 (#x1039 . #x1039)
		 (#x1058 . #x1059)
		 (#x1160 . #x11FF)
		 (#x135F . #x135F)
		 (#x1712 . #x1714)
		 (#x1732 . #x1734)
		 (#x1752 . #x1753)
		 (#x1772 . #x1773)
		 (#x17B4 . #x17B5)
		 (#x17B7 . #x17BD)
		 (#x17C6 . #x17C6)
		 (#x17C9 . #x17D3)
		 (#x17DD . #x17DD)
		 (#x180B . #x180D)
		 (#x18A9 . #x18A9)
		 (#x1920 . #x1922)
		 (#x1927 . #x1928)
		 (#x1932 . #x1932)
		 (#x1939 . #x193B)
		 (#x1A17 . #x1A18)
		 (#x1B00 . #x1B03)
		 (#x1B34 . #x1B34)
		 (#x1B36 . #x1B3A)
		 (#x1B3C . #x1B3C)
		 (#x1B42 . #x1B42)
		 (#x1B6B . #x1B73)
		 (#x1DC0 . #x1DFF)
		 (#x200B . #x200F)
		 (#x202A . #x202E)
		 (#x2060 . #x206F)
		 (#x20D0 . #x20EF)
		 (#x302A . #x302F)
		 (#x3099 . #x309A)
		 (#xA806 . #xA806)
		 (#xA80B . #xA80B)
		 (#xA825 . #xA826)
		 (#xFB1E . #xFB1E)
		 (#xFE00 . #xFE0F)
		 (#xFE20 . #xFE23)
		 (#xFEFF . #xFEFF)
		 (#xFFF9 . #xFFFB)
		 (#x10A01 . #x10A0F)
		 (#x10A38 . #x10A3F)
		 (#x1D167 . #x1D169)
		 (#x1D173 . #x1D182)
		 (#x1D185 . #x1D18B)
		 (#x1D1AA . #x1D1AD)
		 (#x1D242 . #x1D244)
		 (#xE0001 . #xE01EF)))
  (set-char-table-range char-width-table range 0))
1060
;; Width 2: East Asian Wide and Full-width characters.
(dolist (range '((#x1100 . #x115F)
		 (#x2329 . #x232A)
		 (#x2E80 . #x303E)
		 (#x3040 . #xA4CF)
		 (#xAC00 . #xD7A3)
		 (#xF900 . #xFAFF)
		 (#xFE30 . #xFE6F)
		 (#xFF01 . #xFF60)
		 (#xFFE0 . #xFFE6)
		 (#x20000 . #x2FFFF)
		 (#x30000 . #x3FFFF)))
  (set-char-table-range char-width-table range 2))
173f18ce
DL
1075
;; Other double width
;;(map-charset-chars
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'ethiopic)
;; (map-charset-chars
;;  (lambda (range ignore) (set-char-table-range char-width-table range 2))
;;  'tibetan)
;; Every character of the charsets below is given width 2.
(dolist (charset '(indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (range _ignore)
     (set-char-table-range char-width-table range 2))
   charset))
777cfce6 1089
dbff07a2
KH
(defvar cjk-char-width-table-list
  '((ja_JP nil
	   (japanese-jisx0208 (#x2121 . #x287E))
	   (cp932-2-byte (#x8140 . #x879F)))
    (zh_CN nil
	   (chinese-gb2312 (#x2121 . #x297E)))
    (zh_HK nil
	   (big5-hkscs (#xA140 . #xA3FE) (#xC6A0 . #xC8FE)))
    (zh_TW nil
	   (big5 (#xA140 . #xA3FE))
	   (chinese-cns11643-1 (#x2121 . #x427E)))
    (ko_KR nil
	   (korean-ksc5601 (#x2121 . #x2C7E))))
  "Internal use only.
Alist of locale symbol vs charsets.  In a language environment
corresponding to the locale, width of characters in the charsets is
set to 2.  Each element has the form:
  (LOCALE TABLE (CHARSET (FROM-CODE . TO-CODE) ...) ...)
LOCALE: locale symbol
TABLE: char-table used for char-width-table, initially nil.
CHARSET: character set
FROM-CODE, TO-CODE: range of code-points in CHARSET")
1108
(defun use-cjk-char-width-table (locale-name)
  "Internal use only.
Set up `char-width-table' for the language environment that
corresponds to LOCALE-NAME (a symbol).

LOCALE-NAME is looked up in `cjk-char-width-table-list'; an unknown
locale signals an error.  The locale's char-table is built on first
use, stored back into its `cjk-char-width-table-list' entry, and
installed as `char-width-table'."
  ;; Strip any previously installed child table so that we start from
  ;; the parentless table.
  (let (parent)
    (while (setq parent (char-table-parent char-width-table))
      (setq char-width-table parent)))
  (let ((entry (assq locale-name cjk-char-width-table-list)))
    (unless entry
      (error "Unknown locale for CJK language environment: %s"
	     locale-name))
    (when (null (cadr entry))
      ;; First use for this locale: build the table and cache it.
      (let ((table (make-char-table nil)))
	(dolist (spec (cddr entry))
	  ;; SPEC is (CHARSET (FROM-CODE . TO-CODE) ...).
	  (dolist (codes (cdr spec))
	    (map-charset-chars (lambda (range _arg)
				 (set-char-table-range table range 2))
			       (car spec) nil
			       (car codes) (cdr codes))))
	(optimize-char-table table)
	(set-char-table-parent table char-width-table)
	(setcar (cdr entry) table)))
    (setq char-width-table (cadr entry))))
55a3ed16
KH
1132
(defun use-default-char-width-table ()
  "Internal use only.
Set up `char-width-table' for a non-CJK language environment.
`char-width-table' is replaced by its parent repeatedly until it
names a char-table that has no parent."
  (let (parent)
    (while (setq parent (char-table-parent char-width-table))
      (setq char-width-table parent))))
55a3ed16 1138
;; Optimize the standard case and syntax tables.
(dolist (table (list (standard-case-table) (standard-syntax-table)))
  (optimize-char-table table))
1141
55a3ed16
KH
1142\f
;; Setting char-script-table.

;; The data is compiled from Blocks.txt and Scripts.txt in the
;; "Unicode Character Database", simplified to lump together all the
;; blocks belonging to the same language.  E.g., "Basic Latin",
;; "Latin-1 Supplement", "Latin Extended-A", etc. are all lumped
;; together under "latin".
;;
;; The Unicode blocks actually extend past some of these ranges with
;; undefined codepoints.
;;
;; Each element is (FROM TO SCRIPT).  Besides filling
;; `char-script-table', the loop collects every distinct SCRIPT, in
;; order of first appearance, into extra slot 0 of the table.
(let ((script-list nil))
  (dolist
      (elt
       '((#x0000 #x007F latin)
	 (#x00A0 #x024F latin)
	 (#x0250 #x02AF phonetic)
	 (#x02B0 #x036F latin)
	 (#x0370 #x03E1 greek)
	 (#x03E2 #x03EF coptic)
	 (#x03F0 #x03F3 greek)
	 (#x0400 #x052F cyrillic)
	 (#x0530 #x058F armenian)
	 (#x0590 #x05FF hebrew)
	 (#x0600 #x06FF arabic)
	 (#x0700 #x074F syriac)
	 (#x0750 #x077F arabic)
	 (#x0780 #x07BF thaana)
	 (#x07C0 #x07FF nko)
	 (#x0800 #x083F samaritan)
	 (#x0840 #x085F mandaic)
	 (#x08A0 #x08FF arabic)
	 (#x0900 #x097F devanagari)
	 (#x0980 #x09FF bengali)
	 (#x0A00 #x0A7F gurmukhi)
	 (#x0A80 #x0AFF gujarati)
	 (#x0B00 #x0B7F oriya)
	 (#x0B80 #x0BFF tamil)
	 (#x0C00 #x0C7F telugu)
	 (#x0C80 #x0CFF kannada)
	 (#x0D00 #x0D7F malayalam)
	 (#x0D80 #x0DFF sinhala)
	 (#x0E00 #x0E7F thai)
	 (#x0E80 #x0EFF lao)
	 (#x0F00 #x0FFF tibetan)
	 (#x1000 #x109F burmese) ; according to Unicode 6.1, should be "myanmar"
	 (#x10A0 #x10FF georgian)
	 (#x1100 #x11FF hangul)
	 (#x1200 #x139F ethiopic)
	 (#x13A0 #x13FF cherokee)
	 (#x1400 #x167F canadian-aboriginal)
	 (#x1680 #x169F ogham)
	 (#x16A0 #x16FF runic)
	 (#x1700 #x171F tagalog)
	 (#x1720 #x173F hanunoo)
	 (#x1740 #x175F buhid)
	 (#x1760 #x177F tagbanwa)
	 (#x1780 #x17FF khmer)
	 (#x1800 #x18AF mongolian)
	 (#x18B0 #x18FF canadian-aboriginal)
	 (#x1900 #x194F limbu)
	 (#x1950 #x197F tai-le)
	 (#x1980 #x19DF tai-lue)
	 (#x19E0 #x19FF khmer)
	 ;; The Buginese block is U+1A00..U+1A1F (Blocks.txt), not the
	 ;; single character U+1A00.
	 (#x1A00 #x1A1F buginese)
	 (#x1A20 #x1AAF tai-tham)
	 (#x1B00 #x1B7F balinese)
	 (#x1B80 #x1BBF sundanese)
	 (#x1BC0 #x1BFF batak)
	 (#x1C00 #x1C4F lepcha)
	 (#x1C50 #x1C7F ol-chiki)
	 (#x1CC0 #x1CCF sundanese)
	 (#x1CD0 #x1CFF vedic)
	 (#x1D00 #x1DBF phonetic)
	 (#x1DC0 #x1EFF latin)
	 (#x1F00 #x1FFF greek)
	 (#x2000 #x27FF symbol)
	 (#x2800 #x28FF braille)
	 (#x2900 #x2BFF symbol)
	 (#x2C00 #x2C5F glagolitic)
	 (#x2C60 #x2C7F latin)
	 (#x2C80 #x2CFF coptic)
	 (#x2D00 #x2D2F georgian)
	 (#x2D30 #x2D7F tifinagh)
	 (#x2D80 #x2DDF ethiopic)
	 (#x2DE0 #x2DFF cyrillic)
	 (#x2E00 #x2E7F symbol)
	 (#x2E80 #x2FDF han)
	 (#x2FF0 #x2FFF ideographic-description)
	 (#x3000 #x303F cjk-misc)
	 (#x3040 #x30FF kana)
	 (#x3100 #x312F bopomofo)
	 (#x3130 #x318F hangul)
	 (#x3190 #x319F kanbun)
	 (#x31A0 #x31BF bopomofo)
	 (#x31C0 #x31EF cjk-misc)
	 (#x31F0 #x31FF kana)
	 (#x3200 #x9FAF han)
	 (#xA000 #xA4CF yi)
	 (#xA4D0 #xA4FF lisu)
	 (#xA500 #xA63F vai)
	 (#xA640 #xA69F cyrillic)
	 (#xA6A0 #xA6FF bamum)
	 (#xA700 #xA7FF latin)
	 (#xA800 #xA82F syloti-nagri)
	 (#xA830 #xA83F north-indic-number)
	 (#xA840 #xA87F phags-pa)
	 (#xA880 #xA8DF saurashtra)
	 (#xA8E0 #xA8FF devanagari)
	 (#xA900 #xA92F kayah-li)
	 (#xA930 #xA95F rejang)
	 (#xA960 #xA97F hangul)
	 (#xA980 #xA9DF javanese)
	 (#xAA00 #xAA5F cham)
	 (#xAA60 #xAA7B burmese)	; Unicode 6.1: "myanmar"
	 (#xAA80 #xAADF tai-viet)
	 (#xAAE0 #xAAFF meetei-mayek)
	 (#xAB00 #xAB2F ethiopic)
	 (#xABC0 #xABFF meetei-mayek)
	 (#xAC00 #xD7FF hangul)
	 (#xF900 #xFAFF han)
	 (#xFB1D #xFB4F hebrew)
	 (#xFB50 #xFDFF arabic)
	 (#xFE30 #xFE4F han)
	 (#xFE70 #xFEFF arabic)
	 (#xFF00 #xFF5F cjk-misc)
	 (#xFF61 #xFF9F kana)
	 (#xFFE0 #xFFE6 cjk-misc)
	 (#x10000 #x100FF linear-b)
	 (#x10100 #x1013F aegean-number)
	 (#x10140 #x1018F ancient-greek-number)
	 (#x10190 #x101CF ancient-symbol)
	 (#x101D0 #x101FF phaistos-disc)
	 (#x10280 #x1029F lycian)
	 (#x102A0 #x102DF carian)
	 ;; NOTE(review): `olt-italic' looks like a typo for `old-italic'
	 ;; (Unicode block "Old Italic").  Left unchanged because other
	 ;; files may refer to the script by this exact symbol; confirm
	 ;; before renaming.
	 (#x10300 #x1032F olt-italic)
	 (#x10330 #x1034F gothic)
	 (#x10380 #x1039F ugaritic)
	 (#x103A0 #x103DF old-persian)
	 (#x10400 #x1044F deseret)
	 (#x10450 #x1047F shavian)
	 (#x10480 #x104AF osmanya)
	 (#x10800 #x1083F cypriot-syllabary)
	 (#x10840 #x1085F aramaic)
	 (#x10900 #x1091F phoenician)
	 (#x10920 #x1093F lydian)
	 (#x10980 #x109FF meroitic)
	 (#x10A00 #x10A5F kharoshthi)
	 (#x10A60 #x10A7F old-south-arabian)
	 (#x10B00 #x10B3F avestan)
	 (#x10B40 #x10B5F inscriptional-parthian)
	 (#x10B60 #x10B7F inscriptional-pahlavi)
	 (#x10C00 #x10C4F old-turkic)
	 (#x10E60 #x10E7F rumi-number)
	 (#x11000 #x1107F brahmi)
	 (#x11080 #x110CF kaithi)
	 (#x110D0 #x110FF sora-sompeng)
	 (#x11100 #x1114F chakma)
	 (#x11180 #x111DF sharada)
	 (#x11680 #x116CF takri)
	 (#x12000 #x123FF cuneiform)
	 (#x12400 #x1247F cuneiform-numbers-and-punctuation)
	 (#x13000 #x1342F egyptian)
	 (#x16800 #x16A3F bamum)
	 (#x16F00 #x16F9F miao)
	 (#x1B000 #x1B0FF kana)
	 (#x1D000 #x1D0FF byzantine-musical-symbol)
	 (#x1D100 #x1D1FF musical-symbol)
	 (#x1D200 #x1D24F ancient-greek-musical-notation)
	 (#x1D300 #x1D35F tai-xuan-jing-symbol)
	 (#x1D360 #x1D37F counting-rod-numeral)
	 (#x1D400 #x1D7FF mathematical)
	 (#x1EE00 #x1EEFF arabic)
	 (#x1F000 #x1F02F mahjong-tile)
	 (#x1F030 #x1F09F domino-tile)
	 (#x1F0A0 #x1F0FF playing-cards)
	 (#x1F100 #x1F1FF symbol)
	 (#x1F200 #x1F2FF han)
	 (#x1F300 #x1F64F symbol)
	 (#x1F680 #x1F77F symbol)
	 (#x20000 #x2B81F han)
	 (#x2F800 #x2FFFF han)))
    (set-char-table-range char-script-table
			  (cons (car elt) (nth 1 elt)) (nth 2 elt))
    (or (memq (nth 2 elt) script-list)
	(setq script-list (cons (nth 2 elt) script-list))))
  (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))

;; Characters of the `tibetan' charset are all given script `tibetan'.
(map-charset-chars
 #'(lambda (range _ignore)
     (set-char-table-range char-script-table range 'tibetan))
 'tibetan)
1334
e7259832 1335\f
59db3a5c
KH
;;; Setting unicode-category-table.

;; Every character that has a general category whose name does not
;; start with `M' or `C' is put into category `.'.
(when (setq unicode-category-table
	    (unicode-property-table-internal 'general-category))
  (map-char-table
   (lambda (key val)
     (when val
       (let ((major-class (aref (symbol-name val) 0)))
	 (unless (memq major-class '(?M ?C))
	   (modify-category-entry key ?.)))))
   unicode-category-table))

(optimize-char-table (standard-category-table))
59db3a5c
KH
1349
1350\f
b2cca856
KH
;; Display of glyphless characters.

(defvar char-acronym-table
  (make-char-table 'char-acronym-table nil)
  "Char table of acronyms for non-graphic characters.
Each entry is a string, or nil for a character that has no acronym.")

;; C0 controls U+0000..U+001F.  TAB (U+0009) and LF (U+000A) are
;; deliberately left without an acronym.
(let ((c0-acronyms '("NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK" "BEL"
		     "BS"   nil   nil  "VT"  "FF"  "CR"  "SO"  "SI"
		     "DLE" "DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB"
		     "CAN" "EM"  "SUB" "ESC" "FS"  "GS"  "RS"  "US")))
  (dotimes (i 32)
    (aset char-acronym-table i (car c0-acronyms))
    (setq c0-acronyms (cdr c0-acronyms))))

;; C1 controls U+0080..U+009F.  "XXX" marks code points for which no
;; acronym is given here.
(let ((c1-acronyms '("XXX" "XXX" "BPH" "NBH" "IND" "NEL" "SSA" "ESA"
		     "HTS" "HTJ" "VTS" "PLD" "PLU" "RI"  "SS2" "SS3"
		     "DCS" "PU1" "PU2" "STS" "CCH" "MW"  "SPA" "EPA"
		     "SOS" "XXX" "SCI" "CSI" "ST"  "OSC" "PM"  "APC")))
  (dotimes (i 32)
    (aset char-acronym-table (+ #x0080 i) (car c1-acronyms))
    (setq c1-acronyms (cdr c1-acronyms))))

(aset char-acronym-table #x17B4 "KIVAQ")   ; KHMER VOWEL INHERENT AQ
(aset char-acronym-table #x17B5 "KIVAA")   ; KHMER VOWEL INHERENT AA
(aset char-acronym-table #x200B "ZWSP")    ; ZERO WIDTH SPACE
(aset char-acronym-table #x200C "ZWNJ")    ; ZERO WIDTH NON-JOINER
(aset char-acronym-table #x200D "ZWJ")	   ; ZERO WIDTH JOINER
(aset char-acronym-table #x200E "LRM")	   ; LEFT-TO-RIGHT MARK
(aset char-acronym-table #x200F "RLM")	   ; RIGHT-TO-LEFT MARK
(aset char-acronym-table #x202A "LRE")	   ; LEFT-TO-RIGHT EMBEDDING
(aset char-acronym-table #x202B "RLE")	   ; RIGHT-TO-LEFT EMBEDDING
(aset char-acronym-table #x202C "PDF")	   ; POP DIRECTIONAL FORMATTING
(aset char-acronym-table #x202D "LRO")	   ; LEFT-TO-RIGHT OVERRIDE
(aset char-acronym-table #x202E "RLO")	   ; RIGHT-TO-LEFT OVERRIDE
(aset char-acronym-table #x2060 "WJ")	   ; WORD JOINER
(aset char-acronym-table #x206A "ISS")	   ; INHIBIT SYMMETRIC SWAPPING
(aset char-acronym-table #x206B "ASS")	   ; ACTIVATE SYMMETRIC SWAPPING
(aset char-acronym-table #x206C "IAFS")    ; INHIBIT ARABIC FORM SHAPING
(aset char-acronym-table #x206D "AAFS")    ; ACTIVATE ARABIC FORM SHAPING
(aset char-acronym-table #x206E "NADS")    ; NATIONAL DIGIT SHAPES
(aset char-acronym-table #x206F "NODS")    ; NOMINAL DIGIT SHAPES
(aset char-acronym-table #xFEFF "ZWNBSP")  ; ZERO WIDTH NO-BREAK SPACE
(aset char-acronym-table #xFFF9 "IAA")	   ; INTERLINEAR ANNOTATION ANCHOR
(aset char-acronym-table #xFFFA "IAS")     ; INTERLINEAR ANNOTATION SEPARATOR
(aset char-acronym-table #xFFFB "IAT")     ; INTERLINEAR ANNOTATION TERMINATOR
(aset char-acronym-table #x1D173 "BEGBM")  ; MUSICAL SYMBOL BEGIN BEAM
(aset char-acronym-table #x1D174 "ENDBM")  ; MUSICAL SYMBOL END BEAM
(aset char-acronym-table #x1D175 "BEGTIE") ; MUSICAL SYMBOL BEGIN TIE
(aset char-acronym-table #x1D176 "ENDTIE") ; MUSICAL SYMBOL END TIE
(aset char-acronym-table #x1D177 "BEGSLR") ; MUSICAL SYMBOL BEGIN SLUR
(aset char-acronym-table #x1D178 "ENDSLR") ; MUSICAL SYMBOL END SLUR
(aset char-acronym-table #x1D179 "BEGPHR") ; MUSICAL SYMBOL BEGIN PHRASE
(aset char-acronym-table #x1D17A "ENDPHR") ; MUSICAL SYMBOL END PHRASE
(aset char-acronym-table #xE0001 "|->TAG") ; LANGUAGE TAG
(aset char-acronym-table #xE0020 "SP TAG") ; TAG SPACE
;; TAG characters U+E0021..U+E007E show the ASCII character at the
;; same offset from U+0021.
(dotimes (i 94)
  (aset char-acronym-table (+ #xE0021 i) (format " %c TAG" (+ 33 i))))
(aset char-acronym-table #xE007F "->|TAG") ; CANCEL TAG
1409
(defun update-glyphless-char-display (&optional variable value)
  "Make the setting of `glyphless-char-display-control' take effect.
This function updates the char-table `glyphless-char-display'.

It is meant to serve as the `:set' function of a user option: if
VARIABLE is non-nil, its default value is first set to VALUE (even
when VALUE is nil).  VALUE is an alist of (GROUP . METHOD)
directives as described in `glyphless-char-display-control'; each
directive is then applied to `glyphless-char-display'.

Signal an error if a METHOD or a GROUP is not one of the known
symbols."
  ;; Test VARIABLE, not VALUE: an empty VALUE must still be stored, and
  ;; a call without VARIABLE must not try to set the symbol nil.
  (when variable
    (set-default variable value))
  (dolist (elt value)
    (let ((target (car elt))
	  (method (cdr elt)))
      (or (memq method '(zero-width thin-space empty-box acronym hex-code))
	  (error "Invalid glyphless character display method: %s" method))
      (cond ((eq target 'c0-control)
	     (glyphless-set-char-table-range glyphless-char-display
					     #x00 #x1F method)
	     ;; Users will not expect their newlines and TABs be
	     ;; displayed as anything but themselves, so exempt those
	     ;; two characters from c0-control.
	     (set-char-table-range glyphless-char-display #x9 nil)
	     (set-char-table-range glyphless-char-display #xa nil))
	    ((eq target 'c1-control)
	     (glyphless-set-char-table-range glyphless-char-display
					     #x80 #x9F method))
	    ((eq target 'format-control)
	     (when unicode-category-table
	       (map-char-table
		#'(lambda (char category)
		    (if (eq category 'Cf)
			(let ((this-method method)
			      from to)
			  ;; CHAR is either a single character or a
			  ;; (FROM . TO) range.
			  (if (consp char)
			      (setq from (car char) to (cdr char))
			    (setq from char to char))
			  (while (<= from to)
			    ;; U+00AD is skipped; it is never made
			    ;; glyphless by this group.
			    (when (/= from #xAD)
			      (if (eq method 'acronym)
				  (setq this-method
					(aref char-acronym-table from)))
			      (set-char-table-range glyphless-char-display
						    from this-method))
			    (setq from (1+ from))))))
		unicode-category-table)))
	    ((eq target 'no-font)
	     (set-char-table-extra-slot glyphless-char-display 0 method))
	    (t
	     (error "Invalid glyphless character group: %s" target))))))
bd3921f0
PS
1454
(defun glyphless-set-char-table-range (chartable from to method)
  "Set the entries of CHARTABLE for characters FROM through TO from METHOD.
If METHOD is the symbol `acronym', each character gets its own entry
from `char-acronym-table'; otherwise the whole range is set to
METHOD."
  (cond
   ((eq method 'acronym)
    ;; Acronyms differ per character, so fill the range one by one.
    (let ((char from))
      (while (<= char to)
	(set-char-table-range chartable char
			      (aref char-acronym-table char))
	(setq char (1+ char)))))
   (t
    (set-char-table-range chartable (cons from to) method))))
b2cca856 1462
0e7c0582
EZ
;;; Control of displaying glyphless characters.
(defcustom glyphless-char-display-control
  '((format-control . thin-space)
    (no-font . hex-code))
  "List of directives to control display of glyphless characters.

Each element has the form (GROUP . METHOD), where GROUP is a
symbol specifying the character group, and METHOD is a symbol
specifying the method of displaying characters belonging to that
group.

GROUP must be one of these symbols:
  `c0-control':     U+0000..U+001F, but excluding newline and TAB.
  `c1-control':     U+0080..U+009F.
  `format-control': Characters of Unicode General Category `Cf',
                    such as U+200C (ZWNJ), U+200E (LRM), but
                    excluding characters that have graphic images,
                    such as U+00AD (SHY).
  `no-font':        characters for which no suitable font is found.
                    For character terminals, characters that cannot
                    be encoded by `terminal-coding-system'.

METHOD must be one of these symbols:
  `zero-width': don't display.
  `thin-space': display a thin (1-pixel width) space.  On character
                terminals, display as 1-character space.
  `empty-box':  display an empty box.
  `acronym':    display an acronym of the character in a box.  The
                acronym is taken from `char-acronym-table', which see.
  `hex-code':   display the hexadecimal character code in a box.

Do not set its value directly from Lisp; the value takes effect
only via a custom `:set'
function (`update-glyphless-char-display'), which updates
`glyphless-char-display'."
  :version "24.1"
  :group 'display
  :type '(alist :key-type (symbol :tag "Character Group")
		:value-type (symbol :tag "Display Method"))
  ;; Every group offers the same choice of display methods, so the
  ;; choice widget is written once and paired with each group symbol.
  :options (let ((methods
		  '(choice (const :tag "Don't display" zero-width)
			   (const :tag "Display as thin space" thin-space)
			   (const :tag "Display as empty box" empty-box)
			   (const :tag "Display acronym" acronym)
			   (const :tag "Display hex code in a box" hex-code))))
	     (mapcar (lambda (group) (list group methods))
		     '(c0-control c1-control format-control no-font)))
  :set #'update-glyphless-char-display)
1527
b2cca856 1528\f
e7259832
KH
;;; Setting word boundary.

(setq word-combining-categories
      '((nil . ?^)
	(?^ . nil)
	(?C . ?H)
	(?C . ?K))

      word-separating-categories	; (2-byte character sets)
      '((?H . ?K)			; Hiragana - Katakana
	))
1540
1cbfaab9 1541;; Local Variables:
985773c9 1542;; coding: utf-8
1cbfaab9 1543;; End:
777cfce6 1544
60370d40 1545;;; characters.el ends here