Add 2010 to copyright years.
[bpt/emacs.git] / lisp / international / characters.el
CommitLineData
4ed46869
KH
1;;; characters.el --- set syntax and category for multibyte characters
2
114f9c96 3;; Copyright (C) 1997, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
2fd125a3 4;; Free Software Foundation, Inc.
7976eda0 5;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
114f9c96 6;; 2005, 2006, 2007, 2008, 2009, 2010
2fd125a3
KH
7;; National Institute of Advanced Industrial Science and Technology (AIST)
8;; Registration Number H14PRO021
8f924df7 9;; Copyright (C) 2003
55bd52ea
KH
10;; National Institute of Advanced Industrial Science and Technology (AIST)
11;; Registration Number H13PRO009
4ed46869
KH
12
13;; Keywords: multibyte character, character set, syntax, category
14
15;; This file is part of GNU Emacs.
16
4936186e 17;; GNU Emacs is free software: you can redistribute it and/or modify
4ed46869 18;; it under the terms of the GNU General Public License as published by
4936186e
GM
19;; the Free Software Foundation, either version 3 of the License, or
20;; (at your option) any later version.
4ed46869
KH
21
22;; GNU Emacs is distributed in the hope that it will be useful,
23;; but WITHOUT ANY WARRANTY; without even the implied warranty of
24;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25;; GNU General Public License for more details.
26
27;; You should have received a copy of the GNU General Public License
4936186e 28;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
4ed46869
KH
29
30;;; Commentary:
31
60370d40
PJ
32;;; Code:
33
4ed46869
KH
34;;; Predefined categories.
35
36;; For each character set.
37
46bf60bc
KH
38(define-category ?a "ASCII
39ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
4ed46869
KH
40(define-category ?l "Latin")
41(define-category ?t "Thai")
42(define-category ?g "Greek")
43(define-category ?b "Arabic")
44(define-category ?w "Hebrew")
45(define-category ?y "Cyrillic")
46bf60bc
KH
46(define-category ?k "Katakana
47Japanese katakana")
48(define-category ?r "Roman
49Japanese roman")
4ed46869
KH
50(define-category ?c "Chinese")
51(define-category ?j "Japanese")
52(define-category ?h "Korean")
46bf60bc
KH
53(define-category ?e "Ethiopic
54Ethiopic (Ge'ez)")
55(define-category ?v "Viet
56Vietnamese")
4ed46869 57(define-category ?i "Indian")
6eba8645 58(define-category ?o "Lao")
9395eb7c 59(define-category ?q "Tibetan")
4ed46869
KH
60
61;; For each group (row) of 2-byte character sets.
62
46bf60bc
KH
63(define-category ?A "2-byte alnum
64Alpha-numeric characters of 2-byte character sets")
65(define-category ?C "2-byte han
66Chinese (Han) characters of 2-byte character sets")
67(define-category ?G "2-byte Greek
68Greek characters of 2-byte character sets")
69(define-category ?H "2-byte Hiragana
70Japanese Hiragana characters of 2-byte character sets")
71(define-category ?K "2-byte Katakana
72Japanese Katakana characters of 2-byte character sets")
73(define-category ?N "2-byte Korean
74Korean Hangul characters of 2-byte character sets")
91c491e0 75(define-category ?Y "2-byte Cyrillic
46bf60bc 76Cyrillic characters of 2-byte character sets")
4ed46869
KH
77(define-category ?I "Indian Glyphs")
78
79;; For phonetic classifications.
80
81(define-category ?0 "consonant")
46bf60bc 82(define-category ?1 "base vowel
4eb97232 83Base (independent) vowel")
46bf60bc 84(define-category ?2 "upper diacritic
4eb97232 85Upper diacritical mark (including upper vowel)")
46bf60bc 86(define-category ?3 "lower diacritic
4eb97232 87Lower diacritical mark (including lower vowel)")
46bf60bc 88(define-category ?4 "combining tone
4eb97232 89Combining tone mark")
9765a2ba 90(define-category ?5 "symbol")
4ed46869 91(define-category ?6 "digit")
91c491e0 92(define-category ?7 "vowel diacritic
4eb97232 93Vowel-modifying diacritical mark")
6eba8645
KH
94(define-category ?8 "vowel-signs")
95(define-category ?9 "semivowel lower")
4ed46869
KH
96
97;; For filling.
46bf60bc
KH
98(define-category ?| "line breakable
99While filling, we can break a line at this character.")
4ed46869 100
504af7b2 101;; For indentation calculation.
70ea295a 102(define-category ?\s
46bf60bc
KH
103 "space for indent
104This character counts as a space for indentation purposes.")
504af7b2 105
94487c4e 106;; Keep the following for `kinsoku' processing. See comments in
4ed46869 107;; kinsoku.el.
46bf60bc
KH
108(define-category ?> "Not at bol
109A character which can't be placed at beginning of line.")
110(define-category ?< "Not at eol
111A character which can't be placed at end of line.")
4ed46869 112
8ea6fa80
KH
113;; Base and Combining
114(define-category ?. "Base
115Base characters (Unicode General Category L,N,P,S,Zs)")
46bf60bc 116(define-category ?^ "Combining
4eb97232 117Combining diacritic or mark (Unicode General Category M)")
4ed46869
KH
118\f
119;;; Setting syntax and category.
120
121;; ASCII
122
e2cc40b7
KH
123;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
;; Characters 32..127 belong to both the ASCII (`a') and Latin (`l')
;; categories.
(dolist (category '(?a ?l))
  (modify-category-entry '(32 . 127) category))
4ed46869 126
c94ae9eb
DL
127;; Deal with the CJK charsets first. Since the syntax of blocks is
128;; defined per charset, and the charsets may contain e.g. Latin
129;; characters, we end up with the wrong syntax definitions if we're
130;; not careful.
4ed46869 131
66bff5ed 132;; Chinese characters (Unicode)
a5bb49e1
KH
;; Each entry is (RANGE CATEGORY...): every listed category is added to
;; every character in RANGE.  `C' is 2-byte han, `c' is Chinese and `|'
;; marks characters at which a line may be broken while filling.
(dolist (spec '(((#x2E80 . #x312F) ?|)
                ((#x3190 . #x33FF) ?|)
                ((#x3400 . #x4DBF) ?C)
                ((#x4E00 . #x9FAF) ?C)
                ((#x3400 . #x9FAF) ?c ?|)
                ((#xF900 . #xFAFF) ?C ?c ?|)
                ((#x20000 . #x2FFFF) ?| ?C ?c)))
  (let ((range (car spec)))
    (dolist (category (cdr spec))
      (modify-category-entry range category))))
8e4cd685 145
4ed46869
KH
146
147;; Chinese character set (GB2312)
148
66bff5ed
KH
;; Three code ranges of GB2312 get symbol syntax.
(dolist (range '((#x2121 . #x217E) (#x2221 . #x227E) (#x2921 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_"
                     (car range) (cdr range)))

;; The whole charset is Chinese (`c').  The sub-ranges listed below are
;; additionally put in a 2-byte category; each entry is
;; (CATEGORY FROM-CODE TO-CODE).
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (apply #'map-charset-chars #'modify-category-entry 'chinese-gb2312 spec))
4ed46869
KH
162
163;; Chinese character set (BIG5)
164
;; All of Big5 is Chinese (`c'); three code ranges are also 2-byte han
;; (`C').
(map-charset-chars #'modify-category-entry 'big5 ?c)
(dolist (range '((#xA259 . #xA261) (#xA440 . #xC67E) (#xC940 . #xF9DC)))
  (map-charset-chars #'modify-category-entry 'big5 ?C
                     (car range) (cdr range)))
4ed46869
KH
169
170;; Chinese character set (CNS11643)
171
87a39edb
DL
;; Every CNS11643 plane is Chinese (`c').  Plane 1 gets the 2-byte han
;; category (`C') only for codes #x4421..#x7E7E; all other planes get it
;; for every character.
(let ((planes '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                chinese-cns11643-7)))
  (while planes
    (let ((plane (car planes)))
      (map-charset-chars #'modify-category-entry plane ?c)
      (cond ((eq plane 'chinese-cns11643-1)
             (map-charset-chars #'modify-category-entry plane ?C
                                #x4421 #x7E7E))
            (t
             (map-charset-chars #'modify-category-entry plane ?C))))
    (setq planes (cdr planes))))
4ed46869 179
8f924df7 180;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
4ed46869 181
;; JISX0201 katakana is Katakana (`k') and JISX0201 roman is Roman (`r').
(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)

;; All of these charsets are Japanese (`j').
(mapc (lambda (charset)
        (map-charset-chars #'modify-category-entry charset ?j))
      '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
        japanese-jisx0213-1 japanese-jisx0213-2
        cp932-2-byte))
4ed46869 190
269a5dd0 191;; Unicode equivalents of JISX0201-kana
66bff5ed
KH
;; U+FF61..U+FF9F: Katakana (`k'), Japanese (`j') and line breakable (`|').
(dolist (category '(?k ?j ?\|))
  (modify-category-entry '(#xff61 . #xff9f) category))
269a5dd0
DL
196
197;; Katakana block
796f8b2f
KH
;; Ranges put in the 2-byte Katakana category (`K').
(dolist (range '((#x3099 . #x309C) (#x30A0 . #x30FF) (#x31F0 . #x31FF)))
  (modify-category-entry range ?K))
;; Characters at which a line may be broken while filling (`|').
(dolist (target '((#x30A0 . #x30FA) #x30FF))
  (modify-category-entry target ?\|))
269a5dd0
DL
203
204;; Hiragana block
796f8b2f
KH
;; Characters put in the 2-byte Hiragana category (`H'); note that
;; U+30A0 and U+30FC are included although they lie outside
;; U+3040..U+309F.
(dolist (target '((#x3040 . #x309F) #x30A0 #x30FC))
  (modify-category-entry target ?H))
;; Characters at which a line may be broken while filling (`|').
(dolist (target '((#x3040 . #x3096) #x309F))
  (modify-category-entry target ?\|))
210
269a5dd0 211
4ed46869 212;; JISX0208
66bff5ed
KH
213(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
214(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
;; Give every one of these marks word syntax.  The entry has to be
;; applied to the element currently being visited: referring to the
;; head of the list instead would set the syntax of the first character
;; over and over and leave all the others untouched.
(dolist (mark '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-syntax-entry mark "w"))
66bff5ed
KH
218
;; 2-byte categories for code ranges of JISX0208; each entry is
;; (CATEGORY FROM-CODE TO-CODE).
(dolist (spec '((?A #x2321 #x237E)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (apply #'map-charset-chars #'modify-category-entry 'japanese-jisx0208 spec))
;; This single character is added to the 2-byte Katakana category as well.
(modify-category-entry ?ー ?K)
;; These two marks are put in both the 2-byte Katakana (`K') and the
;; 2-byte Hiragana (`H') categories.
(dolist (mark '(?゛ ?゜))
  (modify-category-entry mark ?K)
  (modify-category-entry mark ?H))
;; These four characters are put in the 2-byte han category (`C').
(dolist (mark '(?仝 ?々 ?〆 ?〇))
  (modify-category-entry mark ?C))
235
236;; JISX0212
4ed46869 237
66bff5ed 238(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
4ed46869
KH
239
240;; JISX0201-Kana
87a39edb 241
;; These three characters get punctuation syntax.
(mapc (lambda (ch) (modify-syntax-entry ch "."))
      '(?。 ?、 ?・))
246
e6d10035
KH
;; Corner brackets form a parenthesis pair: the opening bracket is an
;; open-parenthesis character whose match is the closing bracket, and
;; the closing bracket is a close-parenthesis character whose match is
;; the opening one.  (The closing bracket must use the `)' class;
;; giving it the `(' class would make both brackets openers.)
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
226e4119 249
4ed46869
KH
250;; Korean character set (KSC5601)
251
;; The whole charset is Korean (`h').
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)

;; Four code ranges get symbol syntax.
(dolist (range '((#x2121 . #x227E)
                 (#x2621 . #x277E)
                 (#x2830 . #x287E)
                 (#x2930 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_"
                     (car range) (cdr range)))

;; 2-byte categories for code ranges; each entry is
;; (CATEGORY FROM-CODE TO-CODE).
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?G #x2521 #x257E)
                (?H #x2A21 #x2A7E)
                (?K #x2B21 #x2B7E)
                (?Y #x2C21 #x2C7E)))
  (apply #'map-charset-chars #'modify-category-entry 'korean-ksc5601 spec))
4ed46869 265
c94ae9eb 266;; These are in more than one charset.
8f924df7
KH
267(let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
268 "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
269 "()[]{}"))
270 open close)
271 (dotimes (i (/ (length parens) 2))
272 (setq open (aref parens (* i 2))
273 close (aref parens (1+ (* i 2))))
274 (modify-syntax-entry open (format "(%c" close))
275 (modify-syntax-entry close (format ")%c" open))))
d05cfa1f 276
c94ae9eb 277;; Arabic character set
6eba8645 278
c94ae9eb
DL
;; Every character of these charsets is Arabic (`b') ...
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
;; ... and so is every character in these Unicode ranges.
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))
6eba8645 289
c94ae9eb
DL
290;; Cyrillic character set (ISO-8859-5)
291
292(modify-syntax-entry ?№ ".")
293
294;; Ethiopic character set
295
4c81b0f6
KH
;; Two Unicode ranges are Ethiopic (`e').
(dolist (range '((#x1200 . #x1399) (#x2d80 . #x2dde)))
  (modify-category-entry range ?e))
;; These eight characters get punctuation syntax.
(dolist (mark '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))
  (modify-syntax-entry mark "."))
;; Every character of the `ethiopic' charset is Ethiopic as well.
(map-charset-chars #'modify-category-entry 'ethiopic ?e)
303
304;; Hebrew character set (ISO-8859-8)
305
;; These Hebrew marks get punctuation syntax.
(dolist (code '(#x5be                   ; MAQAF
                #x5c0                   ; PASEQ
                #x5c3                   ; SOF PASUQ
                #x5f3                   ; GERESH
                #x5f4))                 ; GERSHAYIM
  (modify-syntax-entry code "."))
311
312;; Indian character set (IS 13194 and other Emacs original Indian charsets)
313
314(modify-category-entry '(#x901 . #x970) ?i)
315(map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
316(map-charset-chars #'modify-category-entry 'indian-2-column ?i)
d05cfa1f 317
6eba8645
KH
318;; Lao character set
319
abdaa411
DL
320(modify-category-entry '(#xe80 . #xeff) ?o)
321(map-charset-chars #'modify-category-entry 'lao ?o)
6eba8645 322
;; Each entry is (CHARS SYNTAX CATEGORY).  Within CHARS, "X-Y" stands
;; for every character from X through Y.  CATEGORY is added to each
;; character; SYNTAX is applied only when it is not "w".
(let ((deflist '(("ກ-ຮ" "w" ?0)         ; consonant
                 ("ະາຳຽເ-ໄ" "w" ?1)    ; vowel base
                 ("ັິ-ືົໍ" "w" ?2)       ; vowel upper
                 ("ຸູ" "w" ?3)          ; vowel lower
                 ("່-໋" "w" ?4)         ; tone mark
                 ("ຼຽ" "w" ?9)          ; semivowel lower
                 ("໐-໙" "w" ?6)         ; digit
                 ("ຯໆ" "_" ?5)))        ; symbol
      code last)
  (dolist (entry deflist)
    (let ((spec (nth 0 entry))
          (syntax (nth 1 entry))
          (category (nth 2 entry))
          (pos 0))
      (while (< pos (length spec))
        (if (= (aref spec pos) ?-)
            ;; A dash continues the current run: CODE already points one
            ;; past the character before the dash, so only the upper
            ;; bound changes.
            (setq pos (1+ pos)
                  last (aref spec pos))
          (setq code (aref spec pos)
                last code))
        (while (<= code last)
          (unless (string-equal syntax "w")
            (modify-syntax-entry code syntax))
          (modify-category-entry code category)
          (setq code (1+ code)))
        (setq pos (1+ pos))))))
353
4ed46869
KH
354;; Thai character set (TIS620)
355
abdaa411
DL
356(modify-category-entry '(#xe00 . #xe7f) ?t)
357(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
4ed46869
KH
358
;; Each entry is (CHARS SYNTAX CATEGORY).  Within CHARS, "X-Y" stands
;; for every character from X through Y.  CATEGORY is added to each
;; character; SYNTAX is applied only when it is not "w".
(let ((deflist '(;; chars       syntax  category
                 ("ก-รลว-ฮ" "w" ?0)     ; consonant
                 ("ฤฦะาำเ-ๅ" "w" ?1)    ; vowel base
                 ("ัิ-ื็๎" "w" ?2)       ; vowel upper
                 ("ุ-ฺ" "w" ?3)         ; vowel lower
                 ("่-ํ" "w" ?4)         ; tone mark
                 ("๐-๙" "w" ?6)         ; digit
                 ("ฯๆ฿๏๚๛" "_" ?5)))    ; symbol
      code last)
  (dolist (entry deflist)
    (let ((spec (nth 0 entry))
          (syntax (nth 1 entry))
          (category (nth 2 entry))
          (pos 0))
      (while (< pos (length spec))
        (if (= (aref spec pos) ?-)
            ;; A dash continues the current run: CODE already points one
            ;; past the character before the dash, so only the upper
            ;; bound changes.
            (setq pos (1+ pos)
                  last (aref spec pos))
          (setq code (aref spec pos)
                last code))
        (while (<= code last)
          (unless (string-equal syntax "w")
            (modify-syntax-entry code syntax))
          (modify-category-entry code category)
          (setq code (1+ code)))
        (setq pos (1+ pos))))))
389
390;; Tibetan character set
391
abdaa411
DL
392(modify-category-entry '(#xf00 . #xfff) ?q)
393(map-charset-chars #'modify-category-entry 'tibetan ?q)
394(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
9395eb7c
KH
395
396(let ((deflist '(;; chars syntax category
725d7c92 397 ("ཀ-ཀྵཪ" "w" ?0) ; consonant
55a3ed16 398 ("ྐ-ྐྵྺྻྼ" "w" ?0) ;
725d7c92
DL
399 ("ིེཻོཽྀ" "w" ?2) ; upper vowel
400 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier
55a3ed16 401 ("྄ཱུ༙༵༷" "w" ?3) ; lowel vowel/modifier
8f924df7 402 ("཰" "w" ?3) ; invisible vowel a
725d7c92
DL
403 ("༠-༩༪-༳" "w" ?6) ; digit
404 ("་།-༒༔ཿ" "." ?|) ; line-break char
405 ("་།༏༐༑༔ཿ" "." ?|) ;
406 ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition
407 ("་།༏༐༑༔ཿ" "." ?>) ;
408 ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition
409 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
9395eb7c
KH
410 ))
411 elm chars len syntax category to ch i)
4ed46869
KH
412 (while deflist
413 (setq elm (car deflist))
414 (setq chars (car elm)
415 len (length chars)
416 syntax (nth 1 elm)
417 category (nth 2 elm)
418 i 0)
419 (while (< i len)
420 (if (= (aref chars i) ?-)
421 (setq i (1+ i)
4a027a0d
KH
422 to (aref chars i))
423 (setq ch (aref chars i)
4ed46869
KH
424 to ch))
425 (while (<= ch to)
269a5dd0
DL
426 (unless (string-equal syntax "w")
427 (modify-syntax-entry ch syntax))
4ed46869
KH
428 (modify-category-entry ch category)
429 (setq ch (1+ ch)))
4a027a0d 430 (setq i (1+ i)))
4ed46869
KH
431 (setq deflist (cdr deflist))))
432
433;; Vietnamese character set
434
abdaa411
DL
435;; To make a word with Latin characters
436(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
437(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
438
439(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
440(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
4ed46869 441
e5dd1155
KH
;; For each code 32..127, pair the character that code has in
;; `vietnamese-viscii-upper' with the one it has in
;; `vietnamese-viscii-lower' in the standard case table, and put the
;; `ucs' encoding of each (when there is one) in the Viet category (`v').
(let ((table (standard-case-table)))
  (dotimes (offset 96)
    (let* ((code (+ 32 offset))
           (upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower table)
      (when upper-ucs
        (modify-category-entry upper-ucs ?v))
      (when lower-ucs
        (modify-category-entry lower-ucs ?v)))))
454
d807d0c7
KH
455;; Tai Viet
456(let ((deflist '(;; chars syntax category
457 ((?ꪀ. ?ꪯ) "w" ?0) ; cosonant
458 ("ꪱꪵꪶ" "w" ?1) ; vowel base
459 ((?ꪹ . ?ꪽ) "w" ?1) ; vowel base
460 ("ꪰꪲꪳꪷꪸꪾ" "w" ?2) ; vowel upper
461 ("ꪴ" "w" ?3) ; vowel lower
462 ("ꫀꫂ" "w" ?1) ; non-combining tone-mark
463 ("꪿꫁" "w" ?4) ; combining tone-mark
464 ((?ꫛ . ?꫟) "_" ?5) ; symbol
465 )))
466 (dolist (elm deflist)
467 (let ((chars (car elm))
468 (syntax (nth 1 elm))
469 (category (nth 2 elm)))
470 (if (consp chars)
471 (progn
472 (modify-syntax-entry chars syntax)
473 (modify-category-entry chars category))
474 (mapc #'(lambda (x)
475 (modify-syntax-entry x syntax)
476 (modify-category-entry x category))
477 chars)))))
c94ae9eb
DL
478
479;; Latin
480
481(modify-category-entry '(#x80 . #x024F) ?l)
d05cfa1f 482
85ef8ece
KH
483(let ((tbl (standard-case-table)) c)
484
4fb82d62
DL
485 ;; Latin-1
486
487 ;; Fixme: Some of the non-word syntaxes here perhaps should be
488 ;; reviewed. (Note that the following all implicitly have word
489 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of
490 ;; relating Unicode categories to Emacs syntax codes.
db92e81e
KH
491
492 ;; NBSP isn't semantically interchangeable with other whitespace chars,
493 ;; so it's more like punctuation.
494 (set-case-syntax ?  "." tbl)
4fb82d62
DL
495 (set-case-syntax ?¡ "." tbl)
496 (set-case-syntax ?¦ "_" tbl)
497 (set-case-syntax ?§ "." tbl)
498 (set-case-syntax ?© "_" tbl)
499 (set-case-syntax-delims 171 187 tbl) ; « »
500 (set-case-syntax ?¬ "_" tbl)
501 (set-case-syntax ?­ "_" tbl)
502 (set-case-syntax ?® "_" tbl)
503 (set-case-syntax ?° "_" tbl)
504 (set-case-syntax ?± "_" tbl)
505 (set-case-syntax ?µ "_" tbl)
506 (set-case-syntax ?· "_" tbl)
507 (set-case-syntax ?¼ "_" tbl)
508 (set-case-syntax ?½ "_" tbl)
509 (set-case-syntax ?¾ "_" tbl)
510 (set-case-syntax ?¿ "." tbl)
511 (let ((c 192))
512 (while (<= c 222)
513 (set-case-syntax-pair c (+ c 32) tbl)
514 (setq c (1+ c))))
515 (set-case-syntax ?× "_" tbl)
516 (set-case-syntax ?ß "w" tbl)
517 (set-case-syntax ?÷ "_" tbl)
518 ;; See below for ÿ.
85ef8ece 519
85ef8ece
KH
520 ;; Latin Extended-A, Latin Extended-B
521 (setq c #x0100)
e5e381c8
KH
522 (while (<= c #x02B8)
523 (modify-category-entry c ?l)
d05cfa1f 524 (setq c (1+ c)))
2bb915b8 525
e5e381c8
KH
526 (let ((pair-ranges '((#x0100 . #x012F)
527 (#x0132 . #x0137)
528 (#x0139 . #x0148)
529 (#x014a . #x0177)
530 (#x0179 . #x017E)
531 (#x0182 . #x0185)
796f8b2f
KH
532 (#x0187 . #x0188)
533 (#x018B . #x018C)
e5e381c8
KH
534 (#x0191 . #x0192)
535 (#x0198 . #x0199)
536 (#x01A0 . #x01A5)
537 (#x01A7 . #x01A8)
538 (#x01AC . #x01AD)
539 (#x01AF . #x01B0)
540 (#x01B3 . #x01B6)
541 (#x01BC . #x01BD)
542 (#x01CD . #x01DC)
543 (#x01DE . #x01EF)
544 (#x01F4 . #x01F5)
545 (#x01F8 . #x021F)
546 (#x0222 . #x0233)
547 (#x023B . #x023C)
548 (#x0241 . #x0242)
549 (#x0246 . #x024F))))
550 (dolist (elt pair-ranges)
551 (let ((from (car elt)) (to (cdr elt)))
552 (while (< from to)
553 (set-case-syntax-pair from (1+ from) tbl)
554 (setq from (+ from 2))))))
2bb915b8 555
796f8b2f
KH
556 (set-case-syntax-pair #x189 #x256 tbl)
557 (set-case-syntax-pair #x18A #x257 tbl)
558
2bb915b8
KH
559 ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
560 ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
561 ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
562 ;; SMALL LETTER I.
563
564 ;; We used to set up half of those correspondences unconditionally,
565 ;; but that makes searches slow. So now we don't set up either half
566 ;; of these correspondences by default.
567
568 ;; (set-downcase-syntax ?İ ?i tbl)
569 ;; (set-upcase-syntax ?I ?ı tbl)
570
e6d10035
KH
571 (set-case-syntax-pair ?DŽ ?dž tbl)
572 (set-case-syntax-pair ?Dž ?dž tbl)
573 (set-case-syntax-pair ?LJ ?lj tbl)
574 (set-case-syntax-pair ?Lj ?lj tbl)
575 (set-case-syntax-pair ?NJ ?nj tbl)
576 (set-case-syntax-pair ?Nj ?nj tbl)
e5e381c8 577
269a5dd0 578 ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
e6d10035
KH
579 (set-case-syntax-pair ?DZ ?dz tbl)
580 (set-case-syntax-pair ?Dz ?dz tbl)
e6d10035
KH
581 (set-case-syntax-pair ?Ƕ ?ƕ tbl)
582 (set-case-syntax-pair ?Ƿ ?ƿ tbl)
269a5dd0 583
85ef8ece 584 ;; Latin Extended Additional
abdaa411 585 (modify-category-entry '(#x1e00 . #x1ef9) ?l)
85ef8ece 586 (setq c #x1e00)
d05cfa1f 587 (while (<= c #x1ef9)
d05cfa1f
KH
588 (and (zerop (% c 2))
589 (or (<= c #x1e94) (>= c #x1ea0))
abdaa411 590 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f
KH
591 (setq c (1+ c)))
592
85ef8ece 593 ;; Greek
abdaa411 594 (modify-category-entry '(#x0370 . #x03ff) ?g)
85ef8ece 595 (setq c #x0370)
d05cfa1f 596 (while (<= c #x03ff)
d05cfa1f
KH
597 (if (or (and (>= c #x0391) (<= c #x03a1))
598 (and (>= c #x03a3) (<= c #x03ab)))
abdaa411 599 (set-case-syntax-pair c (+ c 32) tbl))
d05cfa1f
KH
600 (and (>= c #x03da)
601 (<= c #x03ee)
602 (zerop (% c 2))
abdaa411 603 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f 604 (setq c (1+ c)))
e6d10035
KH
605 (set-case-syntax-pair ?Ά ?ά tbl)
606 (set-case-syntax-pair ?Έ ?έ tbl)
607 (set-case-syntax-pair ?Ή ?ή tbl)
608 (set-case-syntax-pair ?Ί ?ί tbl)
609 (set-case-syntax-pair ?Ό ?ό tbl)
610 (set-case-syntax-pair ?Ύ ?ύ tbl)
611 (set-case-syntax-pair ?Ώ ?ώ tbl)
d05cfa1f 612
269a5dd0
DL
613 ;; Armenian
614 (setq c #x531)
615 (while (<= c #x556)
abdaa411 616 (set-case-syntax-pair c (+ c #x30) tbl)
269a5dd0
DL
617 (setq c (1+ c)))
618
85ef8ece 619 ;; Greek Extended
(modify-category-entry '(#x1f00 . #x1fff) ?g)
;; Below U+1FA8 (the U+1F7x row excepted) a lower-case letter at offset
;; 0..7 of a 16-code row has its capital eight code points higher.
;; The slots skipped are those for which that rule yields no pair:
;; U+1F16, U+1F17, U+1F46 and U+1F47 are unassigned, and U+1F50,
;; U+1F52, U+1F54 and U+1F56 have no single-character capital.
;; U+1F57 must NOT be skipped: its capital is U+1F5F.
(setq c #x1f00)
(while (<= c #x1fff)
  (and (<= (logand c #x000f) 7)
       (<= c #x1fa7)
       (not (memq c '(#x1f16 #x1f17 #x1f46 #x1f47
                      #x1f50 #x1f52 #x1f54 #x1f56)))
       (/= (logand c #x00f0) #x70)
       (set-case-syntax-pair (+ c 8) c tbl))
  (setq c (1+ c)))
e6d10035
KH
630 (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
631 (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
632 (set-case-syntax-pair ?Ὰ ?ὰ tbl)
633 (set-case-syntax-pair ?Ά ?ά tbl)
634 (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
635 (set-case-syntax-pair ?Ὲ ?ὲ tbl)
636 (set-case-syntax-pair ?Έ ?έ tbl)
637 (set-case-syntax-pair ?Ὴ ?ὴ tbl)
638 (set-case-syntax-pair ?Ή ?ή tbl)
639 (set-case-syntax-pair ?ῌ ?ῃ tbl)
640 (set-case-syntax-pair ?Ῐ ?ῐ tbl)
641 (set-case-syntax-pair ?Ῑ ?ῑ tbl)
642 (set-case-syntax-pair ?Ὶ ?ὶ tbl)
643 (set-case-syntax-pair ?Ί ?ί tbl)
644 (set-case-syntax-pair ?Ῠ ?ῠ tbl)
645 (set-case-syntax-pair ?Ῡ ?ῡ tbl)
646 (set-case-syntax-pair ?Ὺ ?ὺ tbl)
647 (set-case-syntax-pair ?Ύ ?ύ tbl)
648 (set-case-syntax-pair ?Ῥ ?ῥ tbl)
649 (set-case-syntax-pair ?Ὸ ?ὸ tbl)
650 (set-case-syntax-pair ?Ό ?ό tbl)
651 (set-case-syntax-pair ?Ὼ ?ὼ tbl)
652 (set-case-syntax-pair ?Ώ ?ώ tbl)
653 (set-case-syntax-pair ?ῼ ?ῳ tbl)
d05cfa1f 654
85ef8ece 655 ;; cyrillic
abdaa411 656 (modify-category-entry '(#x0400 . #x04FF) ?y)
85ef8ece 657 (setq c #x0400)
d05cfa1f 658 (while (<= c #x04ff)
d05cfa1f
KH
659 (and (>= c #x0400)
660 (<= c #x040f)
abdaa411 661 (set-case-syntax-pair c (+ c 80) tbl))
d05cfa1f
KH
662 (and (>= c #x0410)
663 (<= c #x042f)
abdaa411 664 (set-case-syntax-pair c (+ c 32) tbl))
d05cfa1f
KH
665 (and (zerop (% c 2))
666 (or (and (>= c #x0460) (<= c #x0480))
667 (and (>= c #x048c) (<= c #x04be))
668 (and (>= c #x04d0) (<= c #x04f4)))
8f924df7 669 (set-case-syntax-pair c (1+ c) tbl))
d05cfa1f 670 (setq c (1+ c)))
e6d10035
KH
671 (set-case-syntax-pair ?Ӂ ?ӂ tbl)
672 (set-case-syntax-pair ?Ӄ ?ӄ tbl)
673 (set-case-syntax-pair ?Ӈ ?ӈ tbl)
674 (set-case-syntax-pair ?Ӌ ?ӌ tbl)
675 (set-case-syntax-pair ?Ӹ ?ӹ tbl)
d05cfa1f 676
85ef8ece
KH
677 ;; general punctuation
678 (setq c #x2000)
d05cfa1f
KH
679 (while (<= c #x200b)
680 (set-case-syntax c " " tbl)
681 (setq c (1+ c)))
b427c97e
DL
682 (while (<= c #x200F)
683 (set-case-syntax c "." tbl)
684 (setq c (1+ c)))
685 ;; Fixme: These aren't all right:
6b61353c
KH
686 (setq c #x2010)
687 (while (<= c #x2016)
688 (set-case-syntax c "_" tbl)
689 (setq c (1+ c)))
690 ;; Punctuation syntax for quotation marks (like `)
691 (while (<= c #x201f)
692 (set-case-syntax c "." tbl)
693 (setq c (1+ c)))
694 ;; Fixme: These aren't all right:
d05cfa1f
KH
695 (while (<= c #x2027)
696 (set-case-syntax c "_" tbl)
697 (setq c (1+ c)))
b427c97e
DL
698 (while (<= c #x206F)
699 (set-case-syntax c "." tbl)
700 (setq c (1+ c)))
d05cfa1f 701
269a5dd0
DL
702 ;; Roman numerals
703 (setq c #x2160)
704 (while (<= c #x216f)
abdaa411 705 (set-case-syntax-pair c (+ c #x10) tbl)
269a5dd0
DL
706 (setq c (1+ c)))
707
4fb82d62
DL
708 ;; Fixme: The following blocks might be better as symbol rather than
709 ;; punctuation.
b427c97e
DL
710 ;; Arrows
711 (setq c #x2190)
6ca54a3a
DL
712 (while (<= c #x21FF)
713 (set-case-syntax c "." tbl)
b427c97e
DL
714 (setq c (1+ c)))
715 ;; Mathematical Operators
716 (while (<= c #x22FF)
6ca54a3a 717 (set-case-syntax c "." tbl)
b427c97e
DL
718 (setq c (1+ c)))
719 ;; Miscellaneous Technical
720 (while (<= c #x23FF)
6ca54a3a 721 (set-case-syntax c "." tbl)
b427c97e
DL
722 (setq c (1+ c)))
723 ;; Control Pictures
724 (while (<= c #x243F)
6ca54a3a 725 (set-case-syntax c "_" tbl)
269a5dd0
DL
726 (setq c (1+ c)))
727
728 ;; Circled Latin
729 (setq c #x24b6)
730 (while (<= c #x24cf)
abdaa411
DL
731 (set-case-syntax-pair c (+ c 26) tbl)
732 (modify-category-entry c ?l)
733 (modify-category-entry (+ c 26) ?l)
269a5dd0
DL
734 (setq c (1+ c)))
735
736 ;; Fullwidth Latin
737 (setq c #xff21)
738 (while (<= c #xff3a)
abdaa411
DL
739 (set-case-syntax-pair c (+ c #x20) tbl)
740 (modify-category-entry c ?l)
741 (modify-category-entry (+ c #x20) ?l)
269a5dd0
DL
742 (setq c (1+ c)))
743
269a5dd0 744 ;; Combining diacritics
abdaa411 745 (modify-category-entry '(#x300 . #x362) ?^)
269a5dd0 746 ;; Combining marks
abdaa411 747 (modify-category-entry '(#x20d0 . #x20e3) ?^)
269a5dd0
DL
748
749 ;; Fixme: syntax for symbols &c
750 )
6b61353c
KH
751
752(let ((pairs
e55a4d4e
KH
753 '("⁅⁆" ; U+2045 U+2046
754 "⁽⁾" ; U+207D U+207E
755 "₍₎" ; U+208D U+208E
756 "〈〉" ; U+2329 U+232A
757 "⎴⎵" ; U+23B4 U+23B5
758 "❨❩" ; U+2768 U+2769
759 "❪❫" ; U+276A U+276B
760 "❬❭" ; U+276C U+276D
761 "❰❱" ; U+2770 U+2771
762 "❲❳" ; U+2772 U+2773
763 "❴❵" ; U+2774 U+2775
764 "⟦⟧" ; U+27E6 U+27E7
765 "⟨⟩" ; U+27E8 U+27E9
766 "⟪⟫" ; U+27EA U+27EB
767 "⦃⦄" ; U+2983 U+2984
768 "⦅⦆" ; U+2985 U+2986
769 "⦇⦈" ; U+2987 U+2988
770 "⦉⦊" ; U+2989 U+298A
771 "⦋⦌" ; U+298B U+298C
772 "⦍⦎" ; U+298D U+298E
773 "⦏⦐" ; U+298F U+2990
774 "⦑⦒" ; U+2991 U+2992
775 "⦓⦔" ; U+2993 U+2994
776 "⦕⦖" ; U+2995 U+2996
777 "⦗⦘" ; U+2997 U+2998
778 "⧼⧽" ; U+29FC U+29FD
779 "〈〉" ; U+3008 U+3009
780 "《》" ; U+300A U+300B
781 "「」" ; U+300C U+300D
782 "『』" ; U+300E U+300F
783 "【】" ; U+3010 U+3011
784 "〔〕" ; U+3014 U+3015
785 "〖〗" ; U+3016 U+3017
786 "〘〙" ; U+3018 U+3019
787 "〚〛" ; U+301A U+301B
788 "﴾﴿" ; U+FD3E U+FD3F
789 "︵︶" ; U+FE35 U+FE36
790 "︷︸" ; U+FE37 U+FE38
791 "︹︺" ; U+FE39 U+FE3A
792 "︻︼" ; U+FE3B U+FE3C
793 "︽︾" ; U+FE3D U+FE3E
794 "︿﹀" ; U+FE3F U+FE40
795 "﹁﹂" ; U+FE41 U+FE42
796 "﹃﹄" ; U+FE43 U+FE44
797 "﹙﹚" ; U+FE59 U+FE5A
798 "﹛﹜" ; U+FE5B U+FE5C
799 "﹝﹞" ; U+FE5D U+FE5E
800 "()" ; U+FF08 U+FF09
801 "[]" ; U+FF3B U+FF3D
802 "{}" ; U+FF5B U+FF5D
803 "⦅⦆" ; U+FF5F U+FF60
804 "「」" ; U+FF62 U+FF63
6b61353c
KH
805 )))
806 (dolist (elt pairs)
807 (modify-syntax-entry (aref elt 0) (string ?\( (aref elt 1)))
808 (modify-syntax-entry (aref elt 1) (string ?\) (aref elt 0)))))
809
4ed46869 810\f
777cfce6 811;; For each character set, record the coding system best suited to
aaa9f206 812;; encoding it in the charset's `preferred-coding-system' property.
777cfce6 813
abdaa411 814;; Fixme: should this be junked?
777cfce6
KH
;; Each entry is (CHARSET . CODING-SYSTEM); CODING-SYSTEM is stored as
;; the `preferred-coding-system' property of CHARSET.
(dolist (entry '((latin-iso8859-1 . iso-latin-1)
                 (latin-iso8859-2 . iso-latin-2)
                 (latin-iso8859-3 . iso-latin-3)
                 (latin-iso8859-4 . iso-latin-4)
                 (thai-tis620 . thai-tis620)
                 (greek-iso8859-7 . greek-iso-8bit)
                 (arabic-iso8859-6 . iso-2022-7bit)
                 (hebrew-iso8859-8 . hebrew-iso-8bit)
                 (katakana-jisx0201 . japanese-shift-jis)
                 (latin-jisx0201 . japanese-shift-jis)
                 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
                 (latin-iso8859-9 . iso-latin-5)
                 (japanese-jisx0208-1978 . iso-2022-jp)
                 (chinese-gb2312 . chinese-iso-8bit)
                 (chinese-gbk . chinese-gbk)
                 (gb18030-2-byte . chinese-gb18030)
                 (gb18030-4-byte-bmp . chinese-gb18030)
                 (gb18030-4-byte-smp . chinese-gb18030)
                 (gb18030-4-byte-ext-1 . chinese-gb18030)
                 (gb18030-4-byte-ext-2 . chinese-gb18030)
                 (japanese-jisx0208 . iso-2022-jp)
                 (korean-ksc5601 . iso-2022-kr)
                 (japanese-jisx0212 . iso-2022-jp)
                 (chinese-big5-1 . chinese-big5)
                 (chinese-big5-2 . chinese-big5)
                 (chinese-sisheng . iso-2022-7bit)
                 (ipa . iso-2022-7bit)
                 (vietnamese-viscii-lower . vietnamese-viscii)
                 (vietnamese-viscii-upper . vietnamese-viscii)
                 (arabic-digit . iso-2022-7bit)
                 (arabic-1-column . iso-2022-7bit)
                 (lao . lao)
                 (arabic-2-column . iso-2022-7bit)
                 (indian-is13194 . devanagari)
                 (indian-glyph . devanagari)
                 (tibetan-1-column . tibetan)
                 (ethiopic . iso-2022-7bit)
                 (chinese-cns11643-1 . iso-2022-cn)
                 (chinese-cns11643-2 . iso-2022-cn)
                 (chinese-cns11643-3 . iso-2022-cn)
                 (chinese-cns11643-4 . iso-2022-cn)
                 (chinese-cns11643-5 . iso-2022-cn)
                 (chinese-cns11643-6 . iso-2022-cn)
                 (chinese-cns11643-7 . iso-2022-cn)
                 (indian-2-column . devanagari)
                 (tibetan . tibetan)
                 (latin-iso8859-14 . iso-latin-8)
                 (latin-iso8859-15 . iso-latin-9)))
  (let ((charset (car entry))
        (coding-system (cdr entry)))
    (put-charset-property charset 'preferred-coding-system coding-system)))
df0415c5
KH
866
867\f
98a663f1 868;; Setup auto-fill-chars for charsets that should invoke auto-filling.
7760ba82 869;; SPACE and NEWLINE are already set.
df21429c
KH
870
(dolist (fill-range '((#x3041 . #x30FF)
                      (#x3400 . #x4DB5)
                      (#x4e00 . #x9fbb)
                      (#xF900 . #xFAFF)
                      (#xFF00 . #xFF9F)
                      (#x20000 . #x2FFFF)))
  ;; Every character in FILL-RANGE gets a non-nil `auto-fill-chars' entry.
  (set-char-table-range auto-fill-chars fill-range t))
877
55bd52ea 878\f
7760ba82
KH
879;;; Setting char-width-table. The default is 1.
880
881;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
882;; and final characters.
;; Each element is an inclusive code-point range (FROM . TO) whose
;; characters are given width 0 in `char-width-table'.
(dolist (zero-width-range
         '((#x0300 . #x036F) (#x0483 . #x0489) (#x0591 . #x05BD)
           (#x05BF . #x05BF) (#x05C1 . #x05C2) (#x05C4 . #x05C5)
           (#x05C7 . #x05C7) (#x0600 . #x0603) (#x0610 . #x0615)
           (#x064B . #x065E) (#x0670 . #x0670) (#x06D6 . #x06E4)
           (#x06E7 . #x06E8) (#x06EA . #x06ED) (#x070F . #x070F)
           (#x0711 . #x0711) (#x0730 . #x074A) (#x07A6 . #x07B0)
           (#x07EB . #x07F3) (#x0901 . #x0902) (#x093C . #x093C)
           (#x0941 . #x0948) (#x094D . #x094D) (#x0951 . #x0954)
           (#x0962 . #x0963) (#x0981 . #x0981) (#x09BC . #x09BC)
           (#x09C1 . #x09C4) (#x09CD . #x09CD) (#x09E2 . #x09E3)
           (#x0A01 . #x0A02) (#x0A3C . #x0A3C) (#x0A41 . #x0A4D)
           (#x0A70 . #x0A71) (#x0A81 . #x0A82) (#x0ABC . #x0ABC)
           (#x0AC1 . #x0AC8) (#x0ACD . #x0ACD) (#x0AE2 . #x0AE3)
           (#x0B01 . #x0B01) (#x0B3C . #x0B3C) (#x0B3F . #x0B3F)
           (#x0B41 . #x0B43) (#x0B4D . #x0B56) (#x0B82 . #x0B82)
           (#x0BC0 . #x0BC0) (#x0BCD . #x0BCD) (#x0C3E . #x0C40)
           (#x0C46 . #x0C56) (#x0CBC . #x0CBC) (#x0CBF . #x0CBF)
           (#x0CC6 . #x0CC6) (#x0CCC . #x0CCD) (#x0CE2 . #x0CE3)
           (#x0D41 . #x0D43) (#x0D4D . #x0D4D) (#x0DCA . #x0DCA)
           (#x0DD2 . #x0DD6) (#x0E31 . #x0E31) (#x0E34 . #x0E3A)
           (#x0E47 . #x0E4E) (#x0EB1 . #x0EB1) (#x0EB4 . #x0EBC)
           (#x0EC8 . #x0ECD) (#x0F18 . #x0F19) (#x0F35 . #x0F35)
           (#x0F37 . #x0F37) (#x0F39 . #x0F39) (#x0F71 . #x0F7E)
           (#x0F80 . #x0F84) (#x0F86 . #x0F87) (#x0F90 . #x0FBC)
           (#x0FC6 . #x0FC6) (#x102D . #x1030) (#x1032 . #x1037)
           (#x1039 . #x1039) (#x1058 . #x1059) (#x1160 . #x11FF)
           (#x135F . #x135F) (#x1712 . #x1714) (#x1732 . #x1734)
           (#x1752 . #x1753) (#x1772 . #x1773) (#x17B4 . #x17B5)
           (#x17B7 . #x17BD) (#x17C6 . #x17C6) (#x17C9 . #x17D3)
           (#x17DD . #x17DD) (#x180B . #x180D) (#x18A9 . #x18A9)
           (#x1920 . #x1922) (#x1927 . #x1928) (#x1932 . #x1932)
           (#x1939 . #x193B) (#x1A17 . #x1A18) (#x1B00 . #x1B03)
           (#x1B34 . #x1B34) (#x1B36 . #x1B3A) (#x1B3C . #x1B3C)
           (#x1B42 . #x1B42) (#x1B6B . #x1B73) (#x1DC0 . #x1DFF)
           (#x200B . #x200F) (#x202A . #x202E) (#x2060 . #x206F)
           (#x20D0 . #x20EF) (#x302A . #x302F) (#x3099 . #x309A)
           (#xA806 . #xA806) (#xA80B . #xA80B) (#xA825 . #xA826)
           (#xFB1E . #xFB1E) (#xFE00 . #xFE0F) (#xFE20 . #xFE23)
           (#xFEFF . #xFEFF) (#xFFF9 . #xFFFB) (#x10A01 . #x10A0F)
           (#x10A38 . #x10A3F) (#x1D167 . #x1D169) (#x1D173 . #x1D182)
           (#x1D185 . #x1D18B) (#x1D1AA . #x1D1AD) (#x1D242 . #x1D244)
           (#xE0001 . #xE01EF)))
  (set-char-table-range char-width-table zero-width-range 0))
1009
1010;; 2: East Asian Wide and Full-width characters.
;; Each element is an inclusive code-point range (FROM . TO) whose
;; characters are given width 2 in `char-width-table'.
(dolist (wide-range '((#x1100 . #x115F)
                      (#x2329 . #x232A)
                      (#x2E80 . #x303E)
                      (#x3040 . #xA4CF)
                      (#xAC00 . #xD7A3)
                      (#xF900 . #xFAFF)
                      (#xFE30 . #xFE6F)
                      (#xFF01 . #xFF60)
                      (#xFFE0 . #xFFE6)
                      (#x20000 . #x2FFFF)
                      (#x30000 . #x3FFFF)))
  (set-char-table-range char-width-table wide-range 2))
173f18ce
DL
1024
1025;; Other double width
7760ba82
KH
1026;;(map-charset-chars
1027;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
1028;; 'ethiopic)
1029;; (map-charset-chars
1030;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
1031;; 'tibetan)
173f18ce
DL
;; Give width 2 to every character range that `map-charset-chars'
;; reports for these two charsets.
(dolist (two-column-charset '(indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (range _ignore)
     (set-char-table-range char-width-table range 2))
   two-column-charset))
777cfce6 1038
dbff07a2
KH
(defvar cjk-char-width-table-list
  '((ja_JP nil (japanese-jisx0208 (#x2121 . #x287E))
           (cp932-2-byte (#x8140 . #x879F)))
    (zh_CN nil (chinese-gb2312 (#x2121 . #x297E)))
    (zh_HK nil (big5-hkscs (#xA140 . #xA3FE) (#xC6A0 . #xC8FE)))
    (zh_TW nil (big5 (#xA140 . #xA3FE))
           (chinese-cns11643-1 (#x2121 . #x427E)))
    (ko_KR nil (korean-ksc5601 (#x2121 . #x2C7E))))
  "Alist of locale symbol vs charsets.  Internal use only.
In a language environment corresponding to the locale, width of
characters in the charsets is set to 2.  Each element has the form:
  (LOCALE TABLE (CHARSET (FROM-CODE . TO-CODE) ...) ...)
LOCALE: locale symbol
TABLE: char-table used for `char-width-table', initially nil.
CHARSET: character set
FROM-CODE, TO-CODE: range of code-points in CHARSET")
1057
(defun use-cjk-char-width-table (locale-name)
  "Internal use only.
Set up `char-width-table' appropriate for a language environment
corresponding to LOCALE-NAME (symbol).
LOCALE-NAME is looked up in `cjk-char-width-table-list'; an error is
signaled if it is not found there.  The table for a locale is built
on first use and stored back into that list, so later calls reuse it."
  ;; Start from the root table: drop any table installed by an earlier call.
  (while (char-table-parent char-width-table)
    (setq char-width-table (char-table-parent char-width-table)))
  (let ((slot (assq locale-name cjk-char-width-table-list)))
    (or slot (error "Unknown locale for CJK language environment: %s"
                    locale-name))
    (unless (nth 1 slot)
      ;; No cached table yet: build one that marks the listed code
      ;; ranges as width 2 and inherits everything else from the root.
      (let ((table (make-char-table nil)))
        (dolist (charset-info (nthcdr 2 slot))
          (let ((charset (car charset-info)))
            (dolist (code-range (cdr charset-info))
              (map-charset-chars #'(lambda (range _arg)
                                     (set-char-table-range table range 2))
                                 charset nil
                                 (car code-range) (cdr code-range)))))
        (optimize-char-table table)
        (set-char-table-parent table char-width-table)
        ;; Cache the table in the TABLE position of SLOT.
        (setcar (cdr slot) table)))
    (setq char-width-table (nth 1 slot))))
55a3ed16
KH
1082
(defun use-default-char-width-table ()
  "Internal use only.
Reset `char-width-table' to the table at the top of its parent chain."
  (let (parent)
    ;; Climb until the current table has no parent.
    (while (setq parent (char-table-parent char-width-table))
      (setq char-width-table parent))))
55a3ed16 1088
;; Run `optimize-char-table' on the standard case and syntax tables.
(mapc #'optimize-char-table
      (list (standard-case-table)
            (standard-syntax-table)))
1091
55a3ed16
KH
1092\f
1093;; Setting char-script-table.
1094
b427c97e
DL
1095;; The Unicode blocks actually extend past some of these ranges with
1096;; undefined codepoints.
9ce5de1c
KH
;; Each element below has the form (FROM TO SCRIPT): characters FROM
;; through TO (inclusive) are assigned SCRIPT in `char-script-table'.
;; Extra slot 0 of the table receives the list of all script symbols,
;; each listed once, in order of first appearance.
(let ((script-list nil))
  (dolist
      (elt
       '((#x0000 #x007F latin)
         (#x00A0 #x024F latin)
         (#x0250 #x02AF phonetic)
         (#x02B0 #x036F latin)
         (#x0370 #x03E1 greek)
         (#x03E2 #x03EF coptic)
         (#x03F0 #x03F3 greek)
         (#x0400 #x04FF cyrillic)
         (#x0530 #x058F armenian)
         (#x0590 #x05FF hebrew)
         (#x0600 #x06FF arabic)
         (#x0700 #x074F syriac)
         (#x07C0 #x07FA nko)
         (#x0780 #x07BF thaana)
         (#x0900 #x097F devanagari)
         (#x0980 #x09FF bengali)
         (#x0A00 #x0A7F gurmukhi)
         (#x0A80 #x0AFF gujarati)
         (#x0B00 #x0B7F oriya)
         (#x0B80 #x0BFF tamil)
         (#x0C00 #x0C7F telugu)
         (#x0C80 #x0CFF kannada)
         (#x0D00 #x0D7F malayalam)
         (#x0D80 #x0DFF sinhala)
         (#x0E00 #x0E5F thai)
         (#x0E80 #x0EDF lao)
         (#x0F00 #x0FFF tibetan)
         (#x1000 #x105F myanmar)
         (#x10A0 #x10FF georgian)
         (#x1100 #x11FF hangul)
         (#x1200 #x139F ethiopic)
         (#x13A0 #x13FF cherokee)
         (#x1400 #x167F canadian-aboriginal)
         (#x1680 #x169F ogham)
         (#x16A0 #x16FF runic)
         (#x1780 #x17FF khmer)
         (#x1800 #x18AF mongolian)
         (#x1D00 #x1DFF phonetic)
         (#x1E00 #x1EFF latin)
         (#x1F00 #x1FFF greek)
         (#x2000 #x27FF symbol)
         (#x2800 #x28FF braille)
         (#x2D80 #x2DDF ethiopic)
         (#x2E80 #x2FDF han)
         (#x2FF0 #x2FFF ideographic-description)
         (#x3000 #x303F cjk-misc)
         (#x3040 #x30FF kana)
         (#x3100 #x312F bopomofo)
         (#x3130 #x318F hangul)
         (#x3190 #x319F kanbun)
         (#x31A0 #x31BF bopomofo)
         (#x3400 #x9FAF han)
         (#xA000 #xA4CF yi)
         (#xAA00 #xAA5F cham)
         (#xAA80 #xAADF tai-viet)
         (#xAC00 #xD7AF hangul)
         (#xF900 #xFAFF han)
         (#xFB1D #xFB4F hebrew)
         (#xFB50 #xFDFF arabic)
         (#xFE70 #xFEFC arabic)
         (#xFF00 #xFF5F cjk-misc)
         (#xFF61 #xFF9F kana)
         (#xFFE0 #xFFE6 cjk-misc)
         (#x10000 #x100FF linear-b)
         (#x10100 #x1013F aegean-number)
         (#x10140 #x1018A ancient-greek-number)
         (#x10190 #x1019B ancient-symbol)
         (#x101D0 #x101FF phaistos-disc)
         (#x10280 #x1029F lycian)
         (#x102A0 #x102DF carian)
         ;; Was misspelled `olt-italic'.
         (#x10300 #x1032F old-italic)
         (#x10380 #x1039F ugaritic)
         (#x103A0 #x103DF old-persian)
         (#x10400 #x1044F deseret)
         (#x10450 #x1047F shavian)
         (#x10480 #x104AF osmanya)
         (#x10800 #x1083F cypriot-syllabary)
         (#x10900 #x1091F phoenician)
         (#x10920 #x1093F lydian)
         (#x10A00 #x10A5F kharoshthi)
         (#x12000 #x123FF cuneiform)
         (#x12400 #x1247F cuneiform-numbers-and-punctuation)
         (#x1D000 #x1D0FF byzantine-musical-symbol)
         (#x1D100 #x1D1FF musical-symbol)
         (#x1D200 #x1D24F ancient-greek-musical-notation)
         (#x1D300 #x1D35F tai-xuan-jing-symbol)
         (#x1D360 #x1D37F counting-rod-numeral)
         (#x1D400 #x1D7FF mathematical)
         (#x1F000 #x1F02F mahjong-tile)
         (#x1F030 #x1F09F domino-tile)
         (#x20000 #x2AFFF han)
         (#x2F800 #x2FFFF han)))
    (set-char-table-range char-script-table
                          (cons (car elt) (nth 1 elt)) (nth 2 elt))
    ;; Collect each script symbol only once.
    (or (memq (nth 2 elt) script-list)
        (setq script-list (cons (nth 2 elt) script-list))))
  ;; SCRIPT-LIST was built by consing, so reverse it to restore table order.
  (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))
1197
;; Assign the `tibetan' script to every character range that
;; `map-charset-chars' reports for the `tibetan' charset.
(let ((mark-as-tibetan
       (lambda (range _ignore)
         (set-char-table-range char-script-table range 'tibetan))))
  (map-charset-chars mark-as-tibetan 'tibetan))
1202
e7259832 1203\f
59db3a5c
KH
1204;;; Setting unicode-category-table.
1205
1206;; This macro is to build unicode-category-table at compile time so
1207;; that C code can access the table efficiently.
(defmacro build-unicode-category-table ()
  "Build a `unicode-category-table' char-table while expanding this macro.
The expansion is the finished char-table itself.  Code points below
#xD800, from #xF900 up to but excluding #x30000, and from #xE0000 up to
but excluding #xE0200 get their `general-category' char-code property;
the three private-use ranges get the symbol `Co'."
  (let ((category-table (make-char-table 'unicode-category-table nil)))
    (dotimes (code #x110000)
      (when (or (< code #xD800)
                (and (<= #xF900 code) (> #x30000 code))
                (and (<= #xE0000 code) (> #xE0200 code)))
        (aset category-table code
              (get-char-code-property code 'general-category))))
    (dolist (private-use-range '((#xE000 . #xF8FF)
                                 (#xF0000 . #xFFFFD)
                                 (#x100000 . #x10FFFD)))
      (set-char-table-range category-table private-use-range 'Co))
    (optimize-char-table category-table 'eq)
    category-table))
1220
(setq unicode-category-table (build-unicode-category-table))

;; Put a character into category `?.' when its Unicode category symbol
;; is non-nil and either does not start with `M' or `C', or is `Zs'.
(map-char-table
 (lambda (key val)
   (when val
     (let ((major-class (aref (symbol-name val) 0)))
       (when (or (not (or (= major-class ?M)
                          (= major-class ?C)))
                 (eq val 'Zs))
         (modify-category-entry key ?.)))))
 unicode-category-table)

(optimize-char-table (standard-category-table))
59db3a5c
KH
1231
1232\f
e7259832
KH
1233;;; Setting word boundary.
1234
(setq word-combining-categories
      '((nil . ?^)
        (?^ . nil)
        (?C . ?H)
        (?C . ?K))

      word-separating-categories        ; (2-byte character sets)
      '((?H . ?K)))                     ; Hiragana - Katakana
1244
1cbfaab9 1245;; Local Variables:
985773c9 1246;; coding: utf-8
1cbfaab9 1247;; End:
777cfce6 1248
1cbfaab9 1249;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
60370d40 1250;;; characters.el ends here