Add C interface for Unicode character property table.
[bpt/emacs.git] / lisp / international / characters.el
CommitLineData
4ed46869
KH
1;;; characters.el --- set syntax and category for multibyte characters
2
95df8112 3;; Copyright (C) 1997, 2000-2011 Free Software Foundation, Inc.
7976eda0 4;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 5;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
2fd125a3
KH
6;; National Institute of Advanced Industrial Science and Technology (AIST)
7;; Registration Number H14PRO021
8f924df7 8;; Copyright (C) 2003
55bd52ea
KH
9;; National Institute of Advanced Industrial Science and Technology (AIST)
10;; Registration Number H13PRO009
4ed46869
KH
11
12;; Keywords: multibyte character, character set, syntax, category
13
14;; This file is part of GNU Emacs.
15
4936186e 16;; GNU Emacs is free software: you can redistribute it and/or modify
4ed46869 17;; it under the terms of the GNU General Public License as published by
4936186e
GM
18;; the Free Software Foundation, either version 3 of the License, or
19;; (at your option) any later version.
4ed46869
KH
20
21;; GNU Emacs is distributed in the hope that it will be useful,
22;; but WITHOUT ANY WARRANTY; without even the implied warranty of
23;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24;; GNU General Public License for more details.
25
26;; You should have received a copy of the GNU General Public License
4936186e 27;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
4ed46869
KH
28
29;;; Commentary:
30
60370d40
PJ
31;;; Code:
32
4ed46869
KH
33;;; Predefined categories.
34
35;; For each character set.
36
46bf60bc
KH
37(define-category ?a "ASCII
38ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
4ed46869
KH
39(define-category ?l "Latin")
40(define-category ?t "Thai")
41(define-category ?g "Greek")
42(define-category ?b "Arabic")
43(define-category ?w "Hebrew")
44(define-category ?y "Cyrillic")
46bf60bc
KH
45(define-category ?k "Katakana
46Japanese katakana")
47(define-category ?r "Roman
48Japanese roman")
4ed46869
KH
49(define-category ?c "Chinese")
50(define-category ?j "Japanese")
51(define-category ?h "Korean")
46bf60bc
KH
52(define-category ?e "Ethiopic
53Ethiopic (Ge'ez)")
54(define-category ?v "Viet
55Vietnamese")
4ed46869 56(define-category ?i "Indian")
6eba8645 57(define-category ?o "Lao")
9395eb7c 58(define-category ?q "Tibetan")
4ed46869
KH
59
60;; For each group (row) of 2-byte character sets.
61
46bf60bc
KH
62(define-category ?A "2-byte alnum
63Alpha-numeric characters of 2-byte character sets")
64(define-category ?C "2-byte han
65Chinese (Han) characters of 2-byte character sets")
66(define-category ?G "2-byte Greek
67Greek characters of 2-byte character sets")
68(define-category ?H "2-byte Hiragana
69Japanese Hiragana characters of 2-byte character sets")
70(define-category ?K "2-byte Katakana
71Japanese Katakana characters of 2-byte character sets")
72(define-category ?N "2-byte Korean
73Korean Hangul characters of 2-byte character sets")
91c491e0 74(define-category ?Y "2-byte Cyrillic
46bf60bc 75Cyrillic characters of 2-byte character sets")
4ed46869
KH
76(define-category ?I "Indian Glyphs")
77
78;; For phonetic classifications.
79
80(define-category ?0 "consonant")
46bf60bc 81(define-category ?1 "base vowel
4eb97232 82Base (independent) vowel")
46bf60bc 83(define-category ?2 "upper diacritic
4eb97232 84Upper diacritical mark (including upper vowel)")
46bf60bc 85(define-category ?3 "lower diacritic
4eb97232 86Lower diacritical mark (including lower vowel)")
46bf60bc 87(define-category ?4 "combining tone
4eb97232 88Combining tone mark")
9765a2ba 89(define-category ?5 "symbol")
4ed46869 90(define-category ?6 "digit")
91c491e0 91(define-category ?7 "vowel diacritic
4eb97232 92Vowel-modifying diacritical mark")
6eba8645
KH
93(define-category ?8 "vowel-signs")
94(define-category ?9 "semivowel lower")
4ed46869
KH
95
96;; For filling.
46bf60bc
KH
97(define-category ?| "line breakable
98While filling, we can break a line at this character.")
4ed46869 99
504af7b2 100;; For indentation calculation.
70ea295a 101(define-category ?\s
46bf60bc
KH
102 "space for indent
103This character counts as a space for indentation purposes.")
504af7b2 104
94487c4e 105;; Keep the following for `kinsoku' processing. See comments in
4ed46869 106;; kinsoku.el.
46bf60bc
KH
107(define-category ?> "Not at bol
108A character which can't be placed at beginning of line.")
109(define-category ?< "Not at eol
110A character which can't be placed at end of line.")
4ed46869 111
8ea6fa80
KH
112;; Base and Combining
113(define-category ?. "Base
114Base characters (Unicode General Category L,N,P,S,Zs)")
46bf60bc 115(define-category ?^ "Combining
4eb97232 116Combining diacritic or mark (Unicode General Category M)")
4ed46869
KH
117\f
118;;; Setting syntax and category.
119
120;; ASCII
121
e2cc40b7
KH
122;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
123(modify-category-entry '(32 . 127) ?a)
124(modify-category-entry '(32 . 127) ?l)
4ed46869 125
c94ae9eb
DL
126;; Deal with the CJK charsets first. Since the syntax of blocks is
127;; defined per charset, and the charsets may contain e.g. Latin
128;; characters, we end up with the wrong syntax definitions if we're
129;; not careful.
4ed46869 130
66bff5ed 131;; Chinese characters (Unicode)
a5bb49e1
KH
132(modify-category-entry '(#x2E80 . #x312F) ?|)
133(modify-category-entry '(#x3190 . #x33FF) ?|)
66a85e76
KH
134(modify-category-entry '(#x3400 . #x4DBF) ?C)
135(modify-category-entry '(#x4E00 . #x9FAF) ?C)
66bff5ed
KH
136(modify-category-entry '(#x3400 . #x9FAF) ?c)
137(modify-category-entry '(#x3400 . #x9FAF) ?|)
138(modify-category-entry '(#xF900 . #xFAFF) ?C)
139(modify-category-entry '(#xF900 . #xFAFF) ?c)
140(modify-category-entry '(#xF900 . #xFAFF) ?|)
796f8b2f
KH
141(modify-category-entry '(#x20000 . #x2FFFF) ?|)
142(modify-category-entry '(#x20000 . #x2FFFF) ?C)
143(modify-category-entry '(#x20000 . #x2FFFF) ?c)
8e4cd685 144
4ed46869
KH
145
146;; Chinese character set (GB2312)
147
66bff5ed
KH
148(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E)
149(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E)
150(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E)
4ed46869 151
87a39edb 152(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
9ad4b491
KH
153(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339)
154(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A)
155(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A)
66bff5ed
KH
156(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?H #x2421 #x247E)
157(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?K #x2521 #x257E)
158(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?G #x2621 #x267E)
159(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?Y #x2721 #x277E)
160(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?C #x3021 #x7E7E)
4ed46869
KH
161
162;; Chinese character set (BIG5)
163
e7259832 164(map-charset-chars #'modify-category-entry 'big5 ?c)
66a85e76 165(map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA261)
9ad4b491 166(map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E)
66a85e76 167(map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DC)
4ed46869
KH
168
169;; Chinese character set (CNS11643)
170
87a39edb
DL
171(dolist (c '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
172 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
173 chinese-cns11643-7))
174 (map-charset-chars #'modify-category-entry c ?c)
9ad4b491
KH
175 (if (eq c 'chinese-cns11643-1)
176 (map-charset-chars #'modify-category-entry c ?C #x4421 #x7E7E)
8e4cd685 177 (map-charset-chars #'modify-category-entry c ?C)))
4ed46869 178
8f924df7 179;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
4ed46869 180
66bff5ed 181(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
4ed46869 182
66bff5ed 183(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)
4ed46869 184
8f924df7 185(dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
761f6427
KH
186 japanese-jisx0213-1 japanese-jisx0213-2
187 cp932-2-byte))
8e4cd685 188 (map-charset-chars #'modify-category-entry l ?j))
4ed46869 189
c4186f9c
KH
190;; Fullwidth characters
191(modify-category-entry '(#xff01 . #xff60) ?\|)
192
269a5dd0 193;; Unicode equivalents of JISX0201-kana
66bff5ed
KH
194(let ((range '(#xff61 . #xff9f)))
195 (modify-category-entry range ?k)
196 (modify-category-entry range ?j)
197 (modify-category-entry range ?\|))
269a5dd0
DL
198
199;; Katakana block
796f8b2f
KH
200(modify-category-entry '(#x3099 . #x309C) ?K)
201(modify-category-entry '(#x30A0 . #x30FF) ?K)
6f3ac1e1 202(modify-category-entry '(#x31F0 . #x31FF) ?K)
b11c2874 203(modify-category-entry '(#x30A0 . #x30FA) ?\|)
796f8b2f 204(modify-category-entry #x30FF ?\|)
269a5dd0
DL
205
206;; Hiragana block
796f8b2f
KH
207(modify-category-entry '(#x3040 . #x309F) ?H)
208(modify-category-entry '(#x3040 . #x3096) ?\|)
209(modify-category-entry #x309F ?\|)
210(modify-category-entry #x30A0 ?H)
211(modify-category-entry #x30FC ?H)
212
269a5dd0 213
4ed46869 214;; JISX0208
66bff5ed
KH
215(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
216(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
217(let ((chars '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇)))
69c2c6ea 218 (dolist (elt chars)
abdaa411 219 (modify-syntax-entry (car chars) "w")))
66bff5ed
KH
220
221(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E)
222(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E)
223(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?K #x2521 #x257E)
224(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E)
225(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E)
226(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E)
227(modify-category-entry ?ー ?K)
228(let ((chars '(?゛ ?゜)))
4ed46869
KH
229 (while chars
230 (modify-category-entry (car chars) ?K)
231 (modify-category-entry (car chars) ?H)
232 (setq chars (cdr chars))))
66a85e76 233(let ((chars '(?仝 ?々 ?〆 ?〇)))
4ed46869
KH
234 (while chars
235 (modify-category-entry (car chars) ?C)
236 (setq chars (cdr chars))))
237
238;; JISX0212
4ed46869 239
66bff5ed 240(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
4ed46869
KH
241
242;; JISX0201-Kana
87a39edb 243
abdaa411 244(let ((chars '(?。 ?、 ?・)))
4ed46869
KH
245 (while chars
246 (modify-syntax-entry (car chars) ".")
247 (setq chars (cdr chars))))
248
e6d10035
KH
249(modify-syntax-entry ?\「 "(」")
250(modify-syntax-entry ?\」 "(「")
226e4119 251
4ed46869
KH
252;; Korean character set (KSC5601)
253
87a39edb 254(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)
66bff5ed
KH
255
256(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E)
439f7264
DL
257(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E)
258(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E)
259(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E)
9ad4b491
KH
260(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339)
261(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A)
262(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A)
66bff5ed
KH
263(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E)
264(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E)
265(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E)
266(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E)
4ed46869 267
c94ae9eb 268;; These are in more than one charset.
8f924df7
KH
269(let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
270 "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
271 "()[]{}"))
272 open close)
273 (dotimes (i (/ (length parens) 2))
274 (setq open (aref parens (* i 2))
275 close (aref parens (1+ (* i 2))))
276 (modify-syntax-entry open (format "(%c" close))
277 (modify-syntax-entry close (format ")%c" open))))
d05cfa1f 278
c94ae9eb 279;; Arabic character set
6eba8645 280
c94ae9eb
DL
281(let ((charsets '(arabic-iso8859-6
282 arabic-digit
283 arabic-1-column
284 arabic-2-column)))
285 (while charsets
286 (map-charset-chars #'modify-category-entry (car charsets) ?b)
287 (setq charsets (cdr charsets))))
288(modify-category-entry '(#x600 . #x6ff) ?b)
289(modify-category-entry '(#xfb50 . #xfdff) ?b)
290(modify-category-entry '(#xfe70 . #xfefe) ?b)
6eba8645 291
c94ae9eb
DL
292;; Cyrillic character set (ISO-8859-5)
293
294(modify-syntax-entry ?№ ".")
295
296;; Ethiopic character set
297
4c81b0f6
KH
298(modify-category-entry '(#x1200 . #x1399) ?e)
299(modify-category-entry '(#x2d80 . #x2dde) ?e)
55a3ed16 300(let ((chars '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨)))
c94ae9eb
DL
301 (while chars
302 (modify-syntax-entry (car chars) ".")
303 (setq chars (cdr chars))))
304(map-charset-chars #'modify-category-entry 'ethiopic ?e)
305
306;; Hebrew character set (ISO-8859-8)
307
308(modify-syntax-entry #x5be ".") ; MAQAF
309(modify-syntax-entry #x5c0 ".") ; PASEQ
310(modify-syntax-entry #x5c3 ".") ; SOF PASUQ
311(modify-syntax-entry #x5f3 ".") ; GERESH
312(modify-syntax-entry #x5f4 ".") ; GERSHAYIM
313
314;; Indian character set (IS 13194 and other Emacs original Indian charsets)
315
316(modify-category-entry '(#x901 . #x970) ?i)
317(map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
318(map-charset-chars #'modify-category-entry 'indian-2-column ?i)
d05cfa1f 319
6eba8645
KH
320;; Lao character set
321
abdaa411
DL
322(modify-category-entry '(#xe80 . #xeff) ?o)
323(map-charset-chars #'modify-category-entry 'lao ?o)
6eba8645 324
abdaa411 325(let ((deflist '(("ກ-ຮ" "w" ?0) ; consonant
e6d10035
KH
326 ("ະາຳຽເ-ໄ" "w" ?1) ; vowel base
327 ("ັິ-ືົໍ" "w" ?2) ; vowel upper
328 ("ຸູ" "w" ?3) ; vowel lower
8f924df7 329 ("່-໋" "w" ?4) ; tone mark
e6d10035
KH
330 ("ຼຽ" "w" ?9) ; semivowel lower
331 ("໐-໙" "w" ?6) ; digit
332 ("ຯໆ" "_" ?5) ; symbol
6eba8645
KH
333 ))
334 elm chars len syntax category to ch i)
335 (while deflist
336 (setq elm (car deflist))
337 (setq chars (car elm)
338 len (length chars)
339 syntax (nth 1 elm)
340 category (nth 2 elm)
341 i 0)
342 (while (< i len)
343 (if (= (aref chars i) ?-)
344 (setq i (1+ i)
4a027a0d
KH
345 to (aref chars i))
346 (setq ch (aref chars i)
6eba8645
KH
347 to ch))
348 (while (<= ch to)
269a5dd0
DL
349 (unless (string-equal syntax "w")
350 (modify-syntax-entry ch syntax))
6eba8645
KH
351 (modify-category-entry ch category)
352 (setq ch (1+ ch)))
4a027a0d 353 (setq i (1+ i)))
6eba8645
KH
354 (setq deflist (cdr deflist))))
355
4ed46869
KH
356;; Thai character set (TIS620)
357
abdaa411
DL
358(modify-category-entry '(#xe00 . #xe7f) ?t)
359(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
4ed46869
KH
360
361(let ((deflist '(;; chars syntax category
e6d10035
KH
362 ("ก-รลว-ฮ" "w" ?0) ; consonant
363 ("ฤฦะาำเ-ๅ" "w" ?1) ; vowel base
364 ("ัิ-ื็๎" "w" ?2) ; vowel upper
365 ("ุ-ฺ" "w" ?3) ; vowel lower
8f924df7 366 ("่-ํ" "w" ?4) ; tone mark
e6d10035
KH
367 ("๐-๙" "w" ?6) ; digit
368 ("ฯๆ฿๏๚๛" "_" ?5) ; symbol
4ed46869
KH
369 ))
370 elm chars len syntax category to ch i)
9395eb7c
KH
371 (while deflist
372 (setq elm (car deflist))
373 (setq chars (car elm)
374 len (length chars)
375 syntax (nth 1 elm)
376 category (nth 2 elm)
377 i 0)
378 (while (< i len)
379 (if (= (aref chars i) ?-)
380 (setq i (1+ i)
4a027a0d
KH
381 to (aref chars i))
382 (setq ch (aref chars i)
9395eb7c
KH
383 to ch))
384 (while (<= ch to)
269a5dd0
DL
385 (unless (string-equal syntax "w")
386 (modify-syntax-entry ch syntax))
9395eb7c
KH
387 (modify-category-entry ch category)
388 (setq ch (1+ ch)))
4a027a0d 389 (setq i (1+ i)))
9395eb7c
KH
390 (setq deflist (cdr deflist))))
391
392;; Tibetan character set
393
abdaa411
DL
394(modify-category-entry '(#xf00 . #xfff) ?q)
395(map-charset-chars #'modify-category-entry 'tibetan ?q)
396(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
9395eb7c
KH
397
398(let ((deflist '(;; chars syntax category
725d7c92 399 ("ཀ-ཀྵཪ" "w" ?0) ; consonant
55a3ed16 400 ("ྐ-ྐྵྺྻྼ" "w" ?0) ;
725d7c92
DL
401 ("ིེཻོཽྀ" "w" ?2) ; upper vowel
402 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier
55a3ed16 403 ("྄ཱུ༙༵༷" "w" ?3) ; lowel vowel/modifier
8f924df7 404 ("཰" "w" ?3) ; invisible vowel a
725d7c92
DL
405 ("༠-༩༪-༳" "w" ?6) ; digit
406 ("་།-༒༔ཿ" "." ?|) ; line-break char
407 ("་།༏༐༑༔ཿ" "." ?|) ;
408 ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition
409 ("་།༏༐༑༔ཿ" "." ?>) ;
410 ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition
411 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
9395eb7c
KH
412 ))
413 elm chars len syntax category to ch i)
4ed46869
KH
414 (while deflist
415 (setq elm (car deflist))
416 (setq chars (car elm)
417 len (length chars)
418 syntax (nth 1 elm)
419 category (nth 2 elm)
420 i 0)
421 (while (< i len)
422 (if (= (aref chars i) ?-)
423 (setq i (1+ i)
4a027a0d
KH
424 to (aref chars i))
425 (setq ch (aref chars i)
4ed46869
KH
426 to ch))
427 (while (<= ch to)
269a5dd0
DL
428 (unless (string-equal syntax "w")
429 (modify-syntax-entry ch syntax))
4ed46869
KH
430 (modify-category-entry ch category)
431 (setq ch (1+ ch)))
4a027a0d 432 (setq i (1+ i)))
4ed46869
KH
433 (setq deflist (cdr deflist))))
434
435;; Vietnamese character set
436
abdaa411
DL
437;; To make a word with Latin characters
438(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
439(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
440
441(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
442(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
4ed46869 443
e5dd1155
KH
444(let ((tbl (standard-case-table))
445 (i 32))
446 (while (< i 128)
725d7c92
DL
447 (let* ((char (decode-char 'vietnamese-viscii-upper i))
448 (charl (decode-char 'vietnamese-viscii-lower i))
449 (uc (encode-char char 'ucs))
450 (lc (encode-char charl 'ucs)))
451 (set-case-syntax-pair char (decode-char 'vietnamese-viscii-lower i)
4eb97232 452 tbl)
725d7c92
DL
453 (if uc (modify-category-entry uc ?v))
454 (if lc (modify-category-entry lc ?v)))
e5dd1155
KH
455 (setq i (1+ i))))
456
d807d0c7
KH
457;; Tai Viet
458(let ((deflist '(;; chars syntax category
459 ((?ꪀ. ?ꪯ) "w" ?0) ; cosonant
460 ("ꪱꪵꪶ" "w" ?1) ; vowel base
461 ((?ꪹ . ?ꪽ) "w" ?1) ; vowel base
462 ("ꪰꪲꪳꪷꪸꪾ" "w" ?2) ; vowel upper
463 ("ꪴ" "w" ?3) ; vowel lower
464 ("ꫀꫂ" "w" ?1) ; non-combining tone-mark
465 ("꪿꫁" "w" ?4) ; combining tone-mark
466 ((?ꫛ . ?꫟) "_" ?5) ; symbol
467 )))
468 (dolist (elm deflist)
469 (let ((chars (car elm))
470 (syntax (nth 1 elm))
471 (category (nth 2 elm)))
472 (if (consp chars)
473 (progn
474 (modify-syntax-entry chars syntax)
475 (modify-category-entry chars category))
476 (mapc #'(lambda (x)
477 (modify-syntax-entry x syntax)
478 (modify-category-entry x category))
479 chars)))))
c94ae9eb
DL
480
481;; Latin
482
483(modify-category-entry '(#x80 . #x024F) ?l)
d05cfa1f 484
85ef8ece
KH
485(let ((tbl (standard-case-table)) c)
486
4fb82d62
DL
487 ;; Latin-1
488
489 ;; Fixme: Some of the non-word syntaxes here perhaps should be
490 ;; reviewed. (Note that the following all implicitly have word
491 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of
492 ;; relating Unicode categories to Emacs syntax codes.
db92e81e
KH
493
494 ;; NBSP isn't semantically interchangeable with other whitespace chars,
495 ;; so it's more like punctation.
496 (set-case-syntax ?  "." tbl)
4fb82d62
DL
497 (set-case-syntax ?¡ "." tbl)
498 (set-case-syntax ?¦ "_" tbl)
499 (set-case-syntax ?§ "." tbl)
500 (set-case-syntax ?© "_" tbl)
501 (set-case-syntax-delims 171 187 tbl) ; « »
502 (set-case-syntax ?¬ "_" tbl)
503 (set-case-syntax ?­ "_" tbl)
504 (set-case-syntax ?® "_" tbl)
505 (set-case-syntax ?° "_" tbl)
506 (set-case-syntax ?± "_" tbl)
507 (set-case-syntax ?µ "_" tbl)
508 (set-case-syntax ?· "_" tbl)
509 (set-case-syntax ?¼ "_" tbl)
510 (set-case-syntax ?½ "_" tbl)
511 (set-case-syntax ?¾ "_" tbl)
512 (set-case-syntax ?¿ "." tbl)
513 (let ((c 192))
514 (while (<= c 222)
515 (set-case-syntax-pair c (+ c 32) tbl)
516 (setq c (1+ c))))
517 (set-case-syntax ?× "_" tbl)
518 (set-case-syntax ?ß "w" tbl)
519 (set-case-syntax ?÷ "_" tbl)
520 ;; See below for ÿ.
85ef8ece 521
85ef8ece
KH
522 ;; Latin Extended-A, Latin Extended-B
523 (setq c #x0100)
e5e381c8
KH
524 (while (<= c #x02B8)
525 (modify-category-entry c ?l)
d05cfa1f 526 (setq c (1+ c)))
2bb915b8 527
e5e381c8
KH
528 (let ((pair-ranges '((#x0100 . #x012F)
529 (#x0132 . #x0137)
530 (#x0139 . #x0148)
531 (#x014a . #x0177)
532 (#x0179 . #x017E)
533 (#x0182 . #x0185)
796f8b2f
KH
534 (#x0187 . #x0188)
535 (#x018B . #x018C)
e5e381c8
KH
536 (#x0191 . #x0192)
537 (#x0198 . #x0199)
538 (#x01A0 . #x01A5)
539 (#x01A7 . #x01A8)
540 (#x01AC . #x01AD)
541 (#x01AF . #x01B0)
542 (#x01B3 . #x01B6)
543 (#x01BC . #x01BD)
544 (#x01CD . #x01DC)
545 (#x01DE . #x01EF)
546 (#x01F4 . #x01F5)
547 (#x01F8 . #x021F)
548 (#x0222 . #x0233)
549 (#x023B . #x023C)
550 (#x0241 . #x0242)
551 (#x0246 . #x024F))))
552 (dolist (elt pair-ranges)
553 (let ((from (car elt)) (to (cdr elt)))
554 (while (< from to)
555 (set-case-syntax-pair from (1+ from) tbl)
556 (setq from (+ from 2))))))
2bb915b8 557
796f8b2f
KH
558 (set-case-syntax-pair #x189 #x256 tbl)
559 (set-case-syntax-pair #x18A #x257 tbl)
560
2bb915b8
KH
561 ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
562 ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
563 ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
564 ;; SMALL LETTER I.
565
566 ;; We used to set up half of those correspondence unconditionally,
567 ;; but that makes searches slow. So now we don't set up either half
568 ;; of these correspondences by default.
569
570 ;; (set-downcase-syntax ?İ ?i tbl)
571 ;; (set-upcase-syntax ?I ?ı tbl)
572
e6d10035
KH
573 (set-case-syntax-pair ?DŽ ?dž tbl)
574 (set-case-syntax-pair ?Dž ?dž tbl)
575 (set-case-syntax-pair ?LJ ?lj tbl)
576 (set-case-syntax-pair ?Lj ?lj tbl)
577 (set-case-syntax-pair ?NJ ?nj tbl)
578 (set-case-syntax-pair ?Nj ?nj tbl)
e5e381c8 579
269a5dd0 580 ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
e6d10035
KH
581 (set-case-syntax-pair ?DZ ?dz tbl)
582 (set-case-syntax-pair ?Dz ?dz tbl)
e6d10035
KH
583 (set-case-syntax-pair ?Ƕ ?ƕ tbl)
584 (set-case-syntax-pair ?Ƿ ?ƿ tbl)
269a5dd0 585
85ef8ece 586 ;; Latin Extended Additional
abdaa411 587 (modify-category-entry '(#x1e00 . #x1ef9) ?l)
85ef8ece 588 (setq c #x1e00)
d05cfa1f 589 (while (<= c #x1ef9)
d05cfa1f
KH
590 (and (zerop (% c 2))
591 (or (<= c #x1e94) (>= c #x1ea0))
abdaa411 592 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f
KH
593 (setq c (1+ c)))
594
85ef8ece 595 ;; Greek
abdaa411 596 (modify-category-entry '(#x0370 . #x03ff) ?g)
85ef8ece 597 (setq c #x0370)
d05cfa1f 598 (while (<= c #x03ff)
d05cfa1f
KH
599 (if (or (and (>= c #x0391) (<= c #x03a1))
600 (and (>= c #x03a3) (<= c #x03ab)))
abdaa411 601 (set-case-syntax-pair c (+ c 32) tbl))
d05cfa1f
KH
602 (and (>= c #x03da)
603 (<= c #x03ee)
604 (zerop (% c 2))
abdaa411 605 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f 606 (setq c (1+ c)))
e6d10035
KH
607 (set-case-syntax-pair ?Ά ?ά tbl)
608 (set-case-syntax-pair ?Έ ?έ tbl)
609 (set-case-syntax-pair ?Ή ?ή tbl)
610 (set-case-syntax-pair ?Ί ?ί tbl)
611 (set-case-syntax-pair ?Ό ?ό tbl)
612 (set-case-syntax-pair ?Ύ ?ύ tbl)
613 (set-case-syntax-pair ?Ώ ?ώ tbl)
d05cfa1f 614
269a5dd0
DL
615 ;; Armenian
616 (setq c #x531)
617 (while (<= c #x556)
abdaa411 618 (set-case-syntax-pair c (+ c #x30) tbl)
269a5dd0
DL
619 (setq c (1+ c)))
620
85ef8ece 621 ;; Greek Extended
abdaa411 622 (modify-category-entry '(#x1f00 . #x1fff) ?g)
85ef8ece 623 (setq c #x1f00)
d05cfa1f 624 (while (<= c #x1fff)
d05cfa1f
KH
625 (and (<= (logand c #x000f) 7)
626 (<= c #x1fa7)
796f8b2f
KH
627 (not (memq c '(#x1f16 #x1f17 #x1f56 #x1f57
628 #x1f50 #x1f52 #x1f54 #x1f56)))
629 (/= (logand c #x00f0) #x70)
abdaa411 630 (set-case-syntax-pair (+ c 8) c tbl))
d05cfa1f 631 (setq c (1+ c)))
e6d10035
KH
632 (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
633 (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
634 (set-case-syntax-pair ?Ὰ ?ὰ tbl)
635 (set-case-syntax-pair ?Ά ?ά tbl)
636 (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
637 (set-case-syntax-pair ?Ὲ ?ὲ tbl)
638 (set-case-syntax-pair ?Έ ?έ tbl)
639 (set-case-syntax-pair ?Ὴ ?ὴ tbl)
640 (set-case-syntax-pair ?Ή ?ή tbl)
641 (set-case-syntax-pair ?ῌ ?ῃ tbl)
642 (set-case-syntax-pair ?Ῐ ?ῐ tbl)
643 (set-case-syntax-pair ?Ῑ ?ῑ tbl)
644 (set-case-syntax-pair ?Ὶ ?ὶ tbl)
645 (set-case-syntax-pair ?Ί ?ί tbl)
646 (set-case-syntax-pair ?Ῠ ?ῠ tbl)
647 (set-case-syntax-pair ?Ῡ ?ῡ tbl)
648 (set-case-syntax-pair ?Ὺ ?ὺ tbl)
649 (set-case-syntax-pair ?Ύ ?ύ tbl)
650 (set-case-syntax-pair ?Ῥ ?ῥ tbl)
651 (set-case-syntax-pair ?Ὸ ?ὸ tbl)
652 (set-case-syntax-pair ?Ό ?ό tbl)
653 (set-case-syntax-pair ?Ὼ ?ὼ tbl)
654 (set-case-syntax-pair ?Ώ ?ώ tbl)
655 (set-case-syntax-pair ?ῼ ?ῳ tbl)
d05cfa1f 656
85ef8ece 657 ;; cyrillic
abdaa411 658 (modify-category-entry '(#x0400 . #x04FF) ?y)
85ef8ece 659 (setq c #x0400)
d05cfa1f 660 (while (<= c #x04ff)
d05cfa1f
KH
661 (and (>= c #x0400)
662 (<= c #x040f)
abdaa411 663 (set-case-syntax-pair c (+ c 80) tbl))
d05cfa1f
KH
664 (and (>= c #x0410)
665 (<= c #x042f)
abdaa411 666 (set-case-syntax-pair c (+ c 32) tbl))
d05cfa1f
KH
667 (and (zerop (% c 2))
668 (or (and (>= c #x0460) (<= c #x0480))
669 (and (>= c #x048c) (<= c #x04be))
670 (and (>= c #x04d0) (<= c #x04f4)))
8f924df7 671 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f 672 (setq c (1+ c)))
e6d10035
KH
673 (set-case-syntax-pair ?Ӂ ?ӂ tbl)
674 (set-case-syntax-pair ?Ӄ ?ӄ tbl)
675 (set-case-syntax-pair ?Ӈ ?ӈ tbl)
676 (set-case-syntax-pair ?Ӌ ?ӌ tbl)
677 (set-case-syntax-pair ?Ӹ ?ӹ tbl)
d05cfa1f 678
85ef8ece
KH
679 ;; general punctuation
680 (setq c #x2000)
d05cfa1f
KH
681 (while (<= c #x200b)
682 (set-case-syntax c " " tbl)
683 (setq c (1+ c)))
b427c97e
DL
684 (while (<= c #x200F)
685 (set-case-syntax c "." tbl)
686 (setq c (1+ c)))
687 ;; Fixme: These aren't all right:
6b61353c
KH
688 (setq c #x2010)
689 (while (<= c #x2016)
690 (set-case-syntax c "_" tbl)
691 (setq c (1+ c)))
692 ;; Punctuation syntax for quotation marks (like `)
693 (while (<= c #x201f)
694 (set-case-syntax c "." tbl)
695 (setq c (1+ c)))
696 ;; Fixme: These aren't all right:
d05cfa1f
KH
697 (while (<= c #x2027)
698 (set-case-syntax c "_" tbl)
699 (setq c (1+ c)))
b427c97e
DL
700 (while (<= c #x206F)
701 (set-case-syntax c "." tbl)
702 (setq c (1+ c)))
d05cfa1f 703
269a5dd0
DL
704 ;; Roman numerals
705 (setq c #x2160)
706 (while (<= c #x216f)
abdaa411 707 (set-case-syntax-pair c (+ c #x10) tbl)
269a5dd0
DL
708 (setq c (1+ c)))
709
4fb82d62
DL
710 ;; Fixme: The following blocks might be better as symbol rather than
711 ;; punctuation.
b427c97e
DL
712 ;; Arrows
713 (setq c #x2190)
6ca54a3a
DL
714 (while (<= c #x21FF)
715 (set-case-syntax c "." tbl)
b427c97e
DL
716 (setq c (1+ c)))
717 ;; Mathematical Operators
718 (while (<= c #x22FF)
6ca54a3a 719 (set-case-syntax c "." tbl)
b427c97e
DL
720 (setq c (1+ c)))
721 ;; Miscellaneous Technical
722 (while (<= c #x23FF)
6ca54a3a 723 (set-case-syntax c "." tbl)
b427c97e
DL
724 (setq c (1+ c)))
725 ;; Control Pictures
726 (while (<= c #x243F)
6ca54a3a 727 (set-case-syntax c "_" tbl)
269a5dd0
DL
728 (setq c (1+ c)))
729
730 ;; Circled Latin
731 (setq c #x24b6)
732 (while (<= c #x24cf)
abdaa411
DL
733 (set-case-syntax-pair c (+ c 26) tbl)
734 (modify-category-entry c ?l)
735 (modify-category-entry (+ c 26) ?l)
269a5dd0
DL
736 (setq c (1+ c)))
737
738 ;; Fullwidth Latin
739 (setq c #xff21)
740 (while (<= c #xff3a)
abdaa411
DL
741 (set-case-syntax-pair c (+ c #x20) tbl)
742 (modify-category-entry c ?l)
743 (modify-category-entry (+ c #x20) ?l)
269a5dd0
DL
744 (setq c (1+ c)))
745
269a5dd0 746 ;; Combining diacritics
abdaa411 747 (modify-category-entry '(#x300 . #x362) ?^)
269a5dd0 748 ;; Combining marks
abdaa411 749 (modify-category-entry '(#x20d0 . #x20e3) ?^)
269a5dd0
DL
750
751 ;; Fixme: syntax for symbols &c
752 )
6b61353c
KH
753
754(let ((pairs
e55a4d4e
KH
755 '("⁅⁆" ; U+2045 U+2046
756 "⁽⁾" ; U+207D U+207E
757 "₍₎" ; U+208D U+208E
758 "〈〉" ; U+2329 U+232A
759 "⎴⎵" ; U+23B4 U+23B5
760 "❨❩" ; U+2768 U+2769
761 "❪❫" ; U+276A U+276B
762 "❬❭" ; U+276C U+276D
763 "❰❱" ; U+2770 U+2771
764 "❲❳" ; U+2772 U+2773
765 "❴❵" ; U+2774 U+2775
766 "⟦⟧" ; U+27E6 U+27E7
767 "⟨⟩" ; U+27E8 U+27E9
768 "⟪⟫" ; U+27EA U+27EB
769 "⦃⦄" ; U+2983 U+2984
770 "⦅⦆" ; U+2985 U+2986
771 "⦇⦈" ; U+2987 U+2988
772 "⦉⦊" ; U+2989 U+298A
773 "⦋⦌" ; U+298B U+298C
774 "⦍⦎" ; U+298D U+298E
775 "⦏⦐" ; U+298F U+2990
776 "⦑⦒" ; U+2991 U+2992
777 "⦓⦔" ; U+2993 U+2994
778 "⦕⦖" ; U+2995 U+2996
779 "⦗⦘" ; U+2997 U+2998
780 "⧼⧽" ; U+29FC U+29FD
781 "〈〉" ; U+3008 U+3009
782 "《》" ; U+300A U+300B
783 "「」" ; U+300C U+300D
784 "『』" ; U+300E U+300F
785 "【】" ; U+3010 U+3011
786 "〔〕" ; U+3014 U+3015
787 "〖〗" ; U+3016 U+3017
788 "〘〙" ; U+3018 U+3019
789 "〚〛" ; U+301A U+301B
790 "﴾﴿" ; U+FD3E U+FD3F
791 "︵︶" ; U+FE35 U+FE36
792 "︷︸" ; U+FE37 U+FE38
793 "︹︺" ; U+FE39 U+FE3A
794 "︻︼" ; U+FE3B U+FE3C
795 "︽︾" ; U+FE3D U+FE3E
796 "︿﹀" ; U+FE3F U+FE40
797 "﹁﹂" ; U+FE41 U+FE42
798 "﹃﹄" ; U+FE43 U+FE44
799 "﹙﹚" ; U+FE59 U+FE5A
800 "﹛﹜" ; U+FE5B U+FE5C
801 "﹝﹞" ; U+FE5D U+FE5E
802 "()" ; U+FF08 U+FF09
803 "[]" ; U+FF3B U+FF3D
804 "{}" ; U+FF5B U+FF5D
805 "⦅⦆" ; U+FF5F U+FF60
806 "「」" ; U+FF62 U+FF63
6b61353c
KH
807 )))
808 (dolist (elt pairs)
809 (modify-syntax-entry (aref elt 0) (string ?\( (aref elt 1)))
810 (modify-syntax-entry (aref elt 1) (string ?\) (aref elt 0)))))
811
4ed46869 812\f
777cfce6 813;; For each character set, put the information of the most proper
aaa9f206 814;; coding system to encode it by `preferred-coding-system' property.
777cfce6 815
abdaa411 816;; Fixme: should this be junked?
777cfce6
KH
817(let ((l '((latin-iso8859-1 . iso-latin-1)
818 (latin-iso8859-2 . iso-latin-2)
819 (latin-iso8859-3 . iso-latin-3)
820 (latin-iso8859-4 . iso-latin-4)
821 (thai-tis620 . thai-tis620)
822 (greek-iso8859-7 . greek-iso-8bit)
823 (arabic-iso8859-6 . iso-2022-7bit)
824 (hebrew-iso8859-8 . hebrew-iso-8bit)
825 (katakana-jisx0201 . japanese-shift-jis)
826 (latin-jisx0201 . japanese-shift-jis)
827 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
828 (latin-iso8859-9 . iso-latin-5)
829 (japanese-jisx0208-1978 . iso-2022-jp)
7870bdd9
KH
830 (chinese-gb2312 . chinese-iso-8bit)
831 (chinese-gbk . chinese-gbk)
832 (gb18030-2-byte . chinese-gb18030)
833 (gb18030-4-byte-bmp . chinese-gb18030)
834 (gb18030-4-byte-smp . chinese-gb18030)
835 (gb18030-4-byte-ext-1 . chinese-gb18030)
836 (gb18030-4-byte-ext-2 . chinese-gb18030)
777cfce6
KH
837 (japanese-jisx0208 . iso-2022-jp)
838 (korean-ksc5601 . iso-2022-kr)
839 (japanese-jisx0212 . iso-2022-jp)
777cfce6
KH
840 (chinese-big5-1 . chinese-big5)
841 (chinese-big5-2 . chinese-big5)
842 (chinese-sisheng . iso-2022-7bit)
843 (ipa . iso-2022-7bit)
844 (vietnamese-viscii-lower . vietnamese-viscii)
845 (vietnamese-viscii-upper . vietnamese-viscii)
846 (arabic-digit . iso-2022-7bit)
847 (arabic-1-column . iso-2022-7bit)
777cfce6
KH
848 (lao . lao)
849 (arabic-2-column . iso-2022-7bit)
850 (indian-is13194 . devanagari)
69e138b2 851 (indian-glyph . devanagari)
777cfce6 852 (tibetan-1-column . tibetan)
58cd41a3 853 (ethiopic . iso-2022-7bit)
7870bdd9
KH
854 (chinese-cns11643-1 . iso-2022-cn)
855 (chinese-cns11643-2 . iso-2022-cn)
777cfce6
KH
856 (chinese-cns11643-3 . iso-2022-cn)
857 (chinese-cns11643-4 . iso-2022-cn)
858 (chinese-cns11643-5 . iso-2022-cn)
859 (chinese-cns11643-6 . iso-2022-cn)
860 (chinese-cns11643-7 . iso-2022-cn)
861 (indian-2-column . devanagari)
7a860cf2
DL
862 (tibetan . tibetan)
863 (latin-iso8859-14 . iso-latin-8)
864 (latin-iso8859-15 . iso-latin-9))))
777cfce6 865 (while l
aaa9f206 866 (put-charset-property (car (car l)) 'preferred-coding-system (cdr (car l)))
777cfce6 867 (setq l (cdr l))))
df0415c5
KH
868
869\f
98a663f1 870;; Setup auto-fill-chars for charsets that should invoke auto-filling.
7760ba82 871;; SPACE and NEWLINE are already set.
df21429c
KH
872
873(set-char-table-range auto-fill-chars '(#x3041 . #x30FF) t)
874(set-char-table-range auto-fill-chars '(#x3400 . #x4DB5) t)
875(set-char-table-range auto-fill-chars '(#x4e00 . #x9fbb) t)
876(set-char-table-range auto-fill-chars '(#xF900 . #xFAFF) t)
877(set-char-table-range auto-fill-chars '(#xFF00 . #xFF9F) t)
878(set-char-table-range auto-fill-chars '(#x20000 . #x2FFFF) t)
879
55bd52ea 880\f
7760ba82
KH
881;;; Setting char-width-table. The default is 1.
882
883;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
884;; and final characters.
a2a22302 885(let ((l '((#x0300 . #x036F)
7760ba82
KH
886 (#x0483 . #x0489)
887 (#x0591 . #x05BD)
888 (#x05BF . #x05BF)
889 (#x05C1 . #x05C2)
890 (#x05C4 . #x05C5)
891 (#x05C7 . #x05C7)
892 (#x0600 . #x0603)
893 (#x0610 . #x0615)
894 (#x064B . #x065E)
895 (#x0670 . #x0670)
896 (#x06D6 . #x06E4)
897 (#x06E7 . #x06E8)
898 (#x06EA . #x06ED)
899 (#x070F . #x070F)
900 (#x0711 . #x0711)
901 (#x0730 . #x074A)
902 (#x07A6 . #x07B0)
903 (#x07EB . #x07F3)
904 (#x0901 . #x0902)
905 (#x093C . #x093C)
906 (#x0941 . #x0948)
907 (#x094D . #x094D)
908 (#x0951 . #x0954)
909 (#x0962 . #x0963)
910 (#x0981 . #x0981)
911 (#x09BC . #x09BC)
912 (#x09C1 . #x09C4)
913 (#x09CD . #x09CD)
914 (#x09E2 . #x09E3)
915 (#x0A01 . #x0A02)
916 (#x0A3C . #x0A3C)
917 (#x0A41 . #x0A4D)
918 (#x0A70 . #x0A71)
919 (#x0A81 . #x0A82)
920 (#x0ABC . #x0ABC)
921 (#x0AC1 . #x0AC8)
922 (#x0ACD . #x0ACD)
923 (#x0AE2 . #x0AE3)
924 (#x0B01 . #x0B01)
925 (#x0B3C . #x0B3C)
926 (#x0B3F . #x0B3F)
927 (#x0B41 . #x0B43)
928 (#x0B4D . #x0B56)
929 (#x0B82 . #x0B82)
930 (#x0BC0 . #x0BC0)
931 (#x0BCD . #x0BCD)
932 (#x0C3E . #x0C40)
933 (#x0C46 . #x0C56)
934 (#x0CBC . #x0CBC)
935 (#x0CBF . #x0CBF)
936 (#x0CC6 . #x0CC6)
937 (#x0CCC . #x0CCD)
938 (#x0CE2 . #x0CE3)
939 (#x0D41 . #x0D43)
940 (#x0D4D . #x0D4D)
941 (#x0DCA . #x0DCA)
942 (#x0DD2 . #x0DD6)
943 (#x0E31 . #x0E31)
944 (#x0E34 . #x0E3A)
945 (#x0E47 . #x0E4E)
946 (#x0EB1 . #x0EB1)
947 (#x0EB4 . #x0EBC)
948 (#x0EC8 . #x0ECD)
949 (#x0F18 . #x0F19)
950 (#x0F35 . #x0F35)
951 (#x0F37 . #x0F37)
952 (#x0F39 . #x0F39)
953 (#x0F71 . #x0F7E)
954 (#x0F80 . #x0F84)
955 (#x0F86 . #x0F87)
956 (#x0F90 . #x0FBC)
957 (#x0FC6 . #x0FC6)
958 (#x102D . #x1030)
959 (#x1032 . #x1037)
960 (#x1039 . #x1039)
961 (#x1058 . #x1059)
962 (#x1160 . #x11FF)
963 (#x135F . #x135F)
964 (#x1712 . #x1714)
965 (#x1732 . #x1734)
966 (#x1752 . #x1753)
967 (#x1772 . #x1773)
968 (#x17B4 . #x17B5)
969 (#x17B7 . #x17BD)
970 (#x17C6 . #x17C6)
971 (#x17C9 . #x17D3)
972 (#x17DD . #x17DD)
973 (#x180B . #x180D)
974 (#x18A9 . #x18A9)
975 (#x1920 . #x1922)
976 (#x1927 . #x1928)
977 (#x1932 . #x1932)
978 (#x1939 . #x193B)
979 (#x1A17 . #x1A18)
980 (#x1B00 . #x1B03)
981 (#x1B34 . #x1B34)
982 (#x1B36 . #x1B3A)
983 (#x1B3C . #x1B3C)
984 (#x1B42 . #x1B42)
985 (#x1B6B . #x1B73)
986 (#x1DC0 . #x1DFF)
987 (#x200B . #x200F)
988 (#x202A . #x202E)
989 (#x2060 . #x206F)
990 (#x20D0 . #x20EF)
991 (#x302A . #x302F)
992 (#x3099 . #x309A)
993 (#xA806 . #xA806)
994 (#xA80B . #xA80B)
995 (#xA825 . #xA826)
996 (#xFB1E . #xFB1E)
997 (#xFE00 . #xFE0F)
998 (#xFE20 . #xFE23)
999 (#xFEFF . #xFEFF)
1000 (#xFFF9 . #xFFFB)
1001 (#x10A01 . #x10A0F)
1002 (#x10A38 . #x10A3F)
1003 (#x1D167 . #x1D169)
1004 (#x1D173 . #x1D182)
1005 (#x1D185 . #x1D18B)
1006 (#x1D1AA . #x1D1AD)
1007 (#x1D242 . #x1D244)
1008 (#xE0001 . #xE01EF))))
1009 (dolist (elt l)
1010 (set-char-table-range char-width-table elt 0)))
1011
1012;; 2: East Asian Wide and Full-width characters.
1013(let ((l '((#x1100 . #x115F)
1014 (#x2329 . #x232A)
1015 (#x2E80 . #x303E)
1016 (#x3040 . #xA4CF)
1017 (#xAC00 . #xD7A3)
ed0cb465 1018 (#xF900 . #xFAFF)
7760ba82 1019 (#xFE30 . #xFE6F)
bb5c62cf 1020 (#xFF01 . #xFF60)
7760ba82
KH
1021 (#xFFE0 . #xFFE6)
1022 (#x20000 . #x2FFFF)
1023 (#x30000 . #x3FFFF))))
ed0cb465 1024 (dolist (elt l)
7760ba82 1025 (set-char-table-range char-width-table elt 2)))
173f18ce
DL
1026
1027;; Other double width
7760ba82
KH
1028;;(map-charset-chars
1029;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
1030;; 'ethiopic)
1031;; (map-charset-chars
1032;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
1033;; 'tibetan)
173f18ce
DL
1034(map-charset-chars
1035 (lambda (range ignore) (set-char-table-range char-width-table range 2))
1036 'indian-2-column)
1037(map-charset-chars
1038 (lambda (range ignore) (set-char-table-range char-width-table range 2))
1039 'arabic-2-column)
777cfce6 1040
dbff07a2
KH
1041;; Internal use only.
1042;; Alist of locale symbol vs charsets. In a language environment
1043;; corresponding to the locale, width of characters in the charsets is
1044;; set to 2. Each element has the form:
1045;; (LOCALE TABLE (CHARSET (FROM-CODE . TO-CODE) ...) ...)
1046;; LOCALE: locale symbol
1047;; TABLE: char-table used for char-width-table, initially nil.
1048;; CAHRSET: character set
1049;; FROM-CODE, TO-CODE: range of code-points in CHARSET
1050
1051(defvar cjk-char-width-table-list
1052 '((ja_JP nil (japanese-jisx0208 (#x2121 . #x287E))
1053 (cp932-2-byte (#x8140 . #x879F)))
1054 (zh_CN nil (chinese-gb2312 (#x2121 . #x297E)))
1055 (zh_HK nil (big5-hkscs (#xA140 . #xA3FE) (#xC6A0 . #xC8FE)))
1056 (zh_TW nil (big5 (#xA140 . #xA3FE))
1057 (chinese-cns11643-1 (#x2121 . #x427E)))
1058 (ko_KR nil (korean-ksc5601 (#x2121 . #x2C7E)))))
1059
1060;; Internal use only.
1061;; Setup char-width-table appropriate for a language environment
1062;; corresponding to LOCALE-NAME (symbol).
1063
1064(defun use-cjk-char-width-table (locale-name)
1065 (while (char-table-parent char-width-table)
1066 (setq char-width-table (char-table-parent char-width-table)))
1067 (let ((slot (assq locale-name cjk-char-width-table-list))
1068 table)
1069 (or slot (error "Unknown locale for CJK language environment: %s"
1070 locale-name))
1071 (unless (nth 1 slot)
1072 (let ((table (make-char-table nil)))
1073 (dolist (charset-info (nthcdr 2 slot))
1074 (let ((charset (car charset-info)))
1075 (dolist (code-range (cdr charset-info))
1076 (map-charset-chars #'(lambda (range arg)
1077 (set-char-table-range table range 2))
1078 charset nil
1079 (car code-range) (cdr code-range)))))
1080 (optimize-char-table table)
1081 (set-char-table-parent table char-width-table)
1082 (setcar (cdr slot) table)))
1083 (setq char-width-table (nth 1 slot))))
55a3ed16
KH
1084
1085(defun use-default-char-width-table ()
1086 "Internal use only.
9f336de0 1087Setup char-width-table appropriate for non-CJK language environment."
dbff07a2
KH
1088 (while (char-table-parent char-width-table)
1089 (setq char-width-table (char-table-parent char-width-table))))
55a3ed16 1090
87a39edb 1091(optimize-char-table (standard-case-table))
87a39edb
DL
1092(optimize-char-table (standard-syntax-table))
1093
55a3ed16
KH
1094\f
1095;; Setting char-script-table.
1096
b427c97e
DL
1097;; The Unicode blocks actually extend past some of these ranges with
1098;; undefined codepoints.
9ce5de1c
KH
1099(let ((script-list nil))
1100 (dolist
1101 (elt
b982c760 1102 '((#x0000 #x007F latin)
6c52dd78
JR
1103 (#x00A0 #x024F latin)
1104 (#x0250 #x02AF phonetic)
1105 (#x02B0 #x036F latin)
9ce5de1c
KH
1106 (#x0370 #x03E1 greek)
1107 (#x03E2 #x03EF coptic)
1108 (#x03F0 #x03F3 greek)
1109 (#x0400 #x04FF cyrillic)
1110 (#x0530 #x058F armenian)
1111 (#x0590 #x05FF hebrew)
1112 (#x0600 #x06FF arabic)
1113 (#x0700 #x074F syriac)
e7da2f38 1114 (#x07C0 #x07FA nko)
9ce5de1c
KH
1115 (#x0780 #x07BF thaana)
1116 (#x0900 #x097F devanagari)
1117 (#x0980 #x09FF bengali)
1118 (#x0A00 #x0A7F gurmukhi)
1119 (#x0A80 #x0AFF gujarati)
1120 (#x0B00 #x0B7F oriya)
1121 (#x0B80 #x0BFF tamil)
1122 (#x0C00 #x0C7F telugu)
1123 (#x0C80 #x0CFF kannada)
1124 (#x0D00 #x0D7F malayalam)
1125 (#x0D80 #x0DFF sinhala)
1126 (#x0E00 #x0E5F thai)
1127 (#x0E80 #x0EDF lao)
1128 (#x0F00 #x0FFF tibetan)
d99ea08e 1129 (#x1000 #x109F burmese)
9ce5de1c
KH
1130 (#x10A0 #x10FF georgian)
1131 (#x1100 #x11FF hangul)
4c81b0f6 1132 (#x1200 #x139F ethiopic)
9ce5de1c
KH
1133 (#x13A0 #x13FF cherokee)
1134 (#x1400 #x167F canadian-aboriginal)
1135 (#x1680 #x169F ogham)
1136 (#x16A0 #x16FF runic)
1137 (#x1780 #x17FF khmer)
1138 (#x1800 #x18AF mongolian)
6c52dd78 1139 (#x1D00 #x1DFF phonetic)
9ce5de1c
KH
1140 (#x1E00 #x1EFF latin)
1141 (#x1F00 #x1FFF greek)
f041d33e 1142 (#x2000 #x27FF symbol)
9ce5de1c 1143 (#x2800 #x28FF braille)
4c81b0f6 1144 (#x2D80 #x2DDF ethiopic)
9ce5de1c
KH
1145 (#x2E80 #x2FDF han)
1146 (#x2FF0 #x2FFF ideographic-description)
1147 (#x3000 #x303F cjk-misc)
1148 (#x3040 #x30FF kana)
1149 (#x3100 #x312F bopomofo)
1150 (#x3130 #x318F hangul)
1151 (#x3190 #x319F kanbun)
1152 (#x31A0 #x31BF bopomofo)
1153 (#x3400 #x9FAF han)
1154 (#xA000 #xA4CF yi)
1ffae953 1155 (#xAA00 #xAA5F cham)
d99ea08e 1156 (#xAA60 #xAA7B burmese)
d807d0c7 1157 (#xAA80 #xAADF tai-viet)
9ce5de1c 1158 (#xAC00 #xD7AF hangul)
95ac45fa 1159 (#xF900 #xFAFF han)
9ce5de1c
KH
1160 (#xFB1D #xFB4F hebrew)
1161 (#xFB50 #xFDFF arabic)
1162 (#xFE70 #xFEFC arabic)
1163 (#xFF00 #xFF5F cjk-misc)
1164 (#xFF61 #xFF9F kana)
1165 (#xFFE0 #xFFE6 cjk-misc)
458888ab
KH
1166 (#x10000 #x100FF linear-b)
1167 (#x10100 #x1013F aegean-number)
1168 (#x10140 #x1018A ancient-greek-number)
1169 (#x10190 #x1019B ancient-symbol)
1170 (#x101D0 #x101FF phaistos-disc)
1171 (#x10280 #x1029F lycian)
1172 (#x102A0 #x102DF carian)
1173 (#x10300 #x1032F olt-italic)
1174 (#x10380 #x1039F ugaritic)
1175 (#x103A0 #x103DF old-persian)
1176 (#x10400 #x1044F deseret)
1177 (#x10450 #x1047F shavian)
1178 (#x10480 #x104AF osmanya)
1179 (#x10800 #x1083F cypriot-syllabary)
1180 (#x10900 #x1091F phoenician)
1181 (#x10920 #x1093F lydian)
1182 (#x10A00 #x10A5F kharoshthi)
1183 (#x12000 #x123FF cuneiform)
1184 (#x12400 #x1247F cuneiform-numbers-and-punctuation)
e7da2f38
KH
1185 (#x1D000 #x1D0FF byzantine-musical-symbol)
1186 (#x1D100 #x1D1FF musical-symbol)
458888ab
KH
1187 (#x1D200 #x1D24F ancient-greek-musical-notation)
1188 (#x1D300 #x1D35F tai-xuan-jing-symbol)
1189 (#x1D360 #x1D37F counting-rod-numeral)
e7da2f38 1190 (#x1D400 #x1D7FF mathematical)
458888ab
KH
1191 (#x1F000 #x1F02F mahjong-tile)
1192 (#x1F030 #x1F09F domino-tile)
e7259832 1193 (#x20000 #x2AFFF han)
9ce5de1c
KH
1194 (#x2F800 #x2FFFF han)))
1195 (set-char-table-range char-script-table
1196 (cons (car elt) (nth 1 elt)) (nth 2 elt))
1197 (or (memq (nth 2 elt) script-list)
1198 (setq script-list (cons (nth 2 elt) script-list))))
1199 (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))
1200
8f924df7 1201(map-charset-chars
cdfc5141
KH
1202 #'(lambda (range ignore)
1203 (set-char-table-range char-script-table range 'tibetan))
1204 'tibetan)
1205
e7259832 1206\f
59db3a5c
KH
1207;;; Setting unicode-category-table.
1208
c805dec0
KH
1209(setq unicode-category-table
1210 (unicode-property-table-internal 'general-category))
8ea6fa80
KH
1211(map-char-table #'(lambda (key val)
1212 (if (and val
1213 (or (and (/= (aref (symbol-name val) 0) ?M)
1214 (/= (aref (symbol-name val) 0) ?C))
1215 (eq val 'Zs)))
1216 (modify-category-entry key ?.)))
1217 unicode-category-table)
1218
1219(optimize-char-table (standard-category-table))
59db3a5c
KH
1220
1221\f
b2cca856
KH
1222;; Display of glyphless characters.
1223
1224(defvar char-acronym-table
1225 (make-char-table 'char-acronym-table nil)
1226 "Char table of acronyms for non-graphic characters.")
1227
1228(let ((c0-acronyms '("NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK" "BEL"
1229 "BS" nil nil "VT" "FF" "CR" "SO" "SI"
1230 "DLE" "DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB"
1231 "CAN" "EM" "SUB" "ESC" "FC" "GS" "RS" "US")))
1232 (dotimes (i 32)
1233 (aset char-acronym-table i (car c0-acronyms))
1234 (setq c0-acronyms (cdr c0-acronyms))))
1235
1236(let ((c1-acronyms '("XXX" "XXX" "BPH" "NBH" "IND" "NEL" "SSA" "ESA"
1237 "HTS" "HTJ" "VTS" "PLD" "PLU" "R1" "SS2" "SS1"
1238 "DCS" "PU1" "PU2" "STS" "CCH" "MW" "SPA" "EPA"
1239 "SOS" "XXX" "SC1" "CSI" "ST" "OSC" "PM" "APC")))
1240 (dotimes (i 32)
1241 (aset char-acronym-table (+ #x0080 i) (car c1-acronyms))
1242 (setq c1-acronyms (cdr c1-acronyms))))
1243
1244(aset char-acronym-table #x17B4 "KIVAQ") ; KHMER VOWEL INHERENT AQ
1245(aset char-acronym-table #x17B5 "KIVAA") ; KHMER VOWEL INHERENT AA
1246(aset char-acronym-table #x200B "ZWSP") ; ZERO WIDTH SPACE
1247(aset char-acronym-table #x200C "ZWNJ") ; ZERO WIDTH NON-JOINER
1248(aset char-acronym-table #x200D "ZWJ") ; ZERO WIDTH JOINER
1249(aset char-acronym-table #x200E "LRM") ; LEFT-TO-RIGHT MARK
1250(aset char-acronym-table #x200F "RLM") ; RIGHT-TO-LEFT MARK
1251(aset char-acronym-table #x202A "LRE") ; LEFT-TO-RIGHT EMBEDDING
1252(aset char-acronym-table #x202B "RLE") ; RIGHT-TO-LEFT EMBEDDING
1253(aset char-acronym-table #x202C "PDF") ; POP DIRECTIONAL FORMATTING
1254(aset char-acronym-table #x202D "LRO") ; LEFT-TO-RIGHT OVERRIDE
1255(aset char-acronym-table #x202E "RLO") ; RIGHT-TO-LEFT OVERRIDE
1256(aset char-acronym-table #x2060 "WJ") ; WORD JOINER
1257(aset char-acronym-table #x206A "ISS") ; INHIBIT SYMMETRIC SWAPPING
1258(aset char-acronym-table #x206B "ASS") ; ACTIVATE SYMMETRIC SWAPPING
1259(aset char-acronym-table #x206C "IAFS") ; INHIBIT ARABIC FORM SHAPING
1260(aset char-acronym-table #x206D "AAFS") ; ACTIVATE ARABIC FORM SHAPING
1261(aset char-acronym-table #x206E "NADS") ; NATIONAL DIGIT SHAPES
1262(aset char-acronym-table #x206F "NODS") ; NOMINAL DIGIT SHAPES
1263(aset char-acronym-table #xFEFF "ZWNBSP") ; ZERO WIDTH NO-BREAK SPACE
1264(aset char-acronym-table #xFFF9 "IAA") ; INTERLINEAR ANNOTATION ANCHOR
1265(aset char-acronym-table #xFFFA "IAS") ; INTERLINEAR ANNOTATION SEPARATOR
1266(aset char-acronym-table #xFFFB "IAT") ; INTERLINEAR ANNOTATION TERMINATOR
1267(aset char-acronym-table #x1D173 "BEGBM") ; MUSICAL SYMBOL BEGIN BEAM
1268(aset char-acronym-table #x1D174 "ENDBM") ; MUSICAL SYMBOL END BEAM
1269(aset char-acronym-table #x1D175 "BEGTIE") ; MUSICAL SYMBOL BEGIN TIE
1270(aset char-acronym-table #x1D176 "END") ; MUSICAL SYMBOL END TIE
1271(aset char-acronym-table #x1D177 "BEGSLR") ; MUSICAL SYMBOL BEGIN SLUR
1272(aset char-acronym-table #x1D178 "ENDSLR") ; MUSICAL SYMBOL END SLUR
1273(aset char-acronym-table #x1D179 "BEGPHR") ; MUSICAL SYMBOL BEGIN PHRASE
1274(aset char-acronym-table #x1D17A "ENDPHR") ; MUSICAL SYMBOL END PHRASE
1275(aset char-acronym-table #xE0001 "|->TAG") ; LANGUAGE TAG
1276(aset char-acronym-table #xE0020 "SP TAG") ; TAG SPACE
1277(dotimes (i 94)
1278 (aset char-acronym-table (+ #xE0021 i) (format " %c TAG" (+ 33 i))))
1279(aset char-acronym-table #xE007F "->|TAG") ; CANCEL TAG
1280
0e7c0582 1281(defun update-glyphless-char-display (&optional variable value)
0eb025fb 1282 "Make the setting of `glyphless-char-display-control' take effect.
b2cca856 1283This function updates the char-table `glyphless-char-display'."
0e7c0582
EZ
1284 (when value
1285 (set-default variable value))
1286 (dolist (elt value)
b2cca856
KH
1287 (let ((target (car elt))
1288 (method (cdr elt)))
0eb025fb
EZ
1289 (or (memq method '(zero-width thin-space empty-box acronym hex-code))
1290 (error "Invalid glyphless character display method: %s" method))
b2cca856
KH
1291 (cond ((eq target 'c0-control)
1292 (set-char-table-range glyphless-char-display '(#x00 . #x1F)
96107967
EZ
1293 method)
1294 ;; Users will not expect their newlines and TABs be
1295 ;; displayed as anything but themselves, so exempt those
1296 ;; two characters from c0-control.
1297 (set-char-table-range glyphless-char-display #x9 nil)
1298 (set-char-table-range glyphless-char-display #xa nil))
b2cca856
KH
1299 ((eq target 'c1-control)
1300 (set-char-table-range glyphless-char-display '(#x80 . #x9F)
1301 method))
1302 ((eq target 'format-control)
1303 (map-char-table
1304 #'(lambda (char category)
1305 (if (eq category 'Cf)
1306 (let ((this-method method)
1307 from to)
1308 (if (consp char)
1309 (setq from (car char) to (cdr char))
1310 (setq from char to char))
1311 (while (<= from to)
1312 (when (/= from #xAD)
1313 (if (eq method 'acronym)
0eb025fb 1314 (setq this-method
b2cca856
KH
1315 (aref char-acronym-table from)))
1316 (set-char-table-range glyphless-char-display
1317 from this-method))
1318 (setq from (1+ from))))))
1319 unicode-category-table))
1320 ((eq target 'no-font)
1321 (set-char-table-extra-slot glyphless-char-display 0 method))
1322 (t
0eb025fb 1323 (error "Invalid glyphless character group: %s" target))))))
b2cca856 1324
0e7c0582
EZ
1325;;; Control of displaying glyphless characters.
1326(defcustom glyphless-char-display-control
1327 '((format-control . thin-space)
1328 (no-font . hex-code))
1329 "List of directives to control display of glyphless characters.
1330
1331Each element has the form (GROUP . METHOD), where GROUP is a
1332symbol specifying the character group, and METHOD is a symbol
1333specifying the method of displaying characters belonging to that
1334group.
1335
1336GROUP must be one of these symbols:
96107967 1337 `c0-control': U+0000..U+001F, but excluding newline and TAB.
0e7c0582
EZ
1338 `c1-control': U+0080..U+009F.
1339 `format-control': Characters of Unicode General Category `Cf',
1340 such as U+200C (ZWNJ), U+200E (LRM), but
1341 excluding characters that have graphic images,
1342 such as U+00AD (SHY).
1343 `no-font': characters for which no suitable font is found.
1344 For character terminals, characters that cannot
1345 be encoded by `terminal-coding-system'.
1346
1347METHOD must be one of these symbols:
1348 `zero-width': don't display.
1349 `thin-space': display a thin (1-pixel width) space. On character
1350 terminals, display as 1-character space.
1351 `empty-box': display an empty box.
1352 `acronym': display an acronym of the character in a box. The
1353 acronym is taken from `char-acronym-table', which see.
1354 `hex-code': display the hexadecimal character code in a box."
1355
1356 :type '(alist :key-type (symbol :tag "Character Group")
1357 :value-type (symbol :tag "Display Method"))
1358 :options '((c0-control
1359 (choice (const :tag "Don't display" zero-width)
1360 (const :tag "Display as thin space" thin-space)
1361 (const :tag "Display as empty box" empty-box)
1362 (const :tag "Display acronym" acronym)
1363 (const :tag "Display hex code in a box" hex-code)))
1364 (c1-control
1365 (choice (const :tag "Don't display" zero-width)
1366 (const :tag "Display as thin space" thin-space)
1367 (const :tag "Display as empty box" empty-box)
1368 (const :tag "Display acronym" acronym)
1369 (const :tag "Display hex code in a box" hex-code)))
1370 (format-control
1371 (choice (const :tag "Don't display" zero-width)
1372 (const :tag "Display as thin space" thin-space)
1373 (const :tag "Display as empty box" empty-box)
1374 (const :tag "Display acronym" acronym)
1375 (const :tag "Display hex code in a box" hex-code)))
1376 (no-font
1377 (choice (const :tag "Don't display" zero-width)
1378 (const :tag "Display as thin space" thin-space)
1379 (const :tag "Display as empty box" empty-box)
1380 (const :tag "Display acronym" acronym)
1381 (const :tag "Display hex code in a box" hex-code))))
1382 :set 'update-glyphless-char-display
1383 :group 'display)
1384
b2cca856 1385\f
e7259832
KH
1386;;; Setting word boundary.
1387
e7259832 1388(setq word-combining-categories
4626499f
KH
1389 '((nil . ?^)
1390 (?^ . nil)
7ffefb08
MB
1391 (?C . ?H)
1392 (?C . ?K)))
e7259832
KH
1393
1394(setq word-separating-categories ; (2-byte character sets)
4626499f 1395 '((?H . ?K) ; Hiragana - Katakana
e7259832
KH
1396 ))
1397
1cbfaab9 1398;; Local Variables:
985773c9 1399;; coding: utf-8
1cbfaab9 1400;; End:
777cfce6 1401
60370d40 1402;;; characters.el ends here