Add 2010 to copyright years.
[bpt/emacs.git] / lisp / international / characters.el
1 ;;; characters.el --- set syntax and category for multibyte characters
2
3 ;; Copyright (C) 1997, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 ;; 2005, 2006, 2007, 2008, 2009, 2010
7 ;; National Institute of Advanced Industrial Science and Technology (AIST)
8 ;; Registration Number H14PRO021
9 ;; Copyright (C) 2003
10 ;; National Institute of Advanced Industrial Science and Technology (AIST)
11 ;; Registration Number H13PRO009
12
13 ;; Keywords: multibyte character, character set, syntax, category
14
15 ;; This file is part of GNU Emacs.
16
17 ;; GNU Emacs is free software: you can redistribute it and/or modify
18 ;; it under the terms of the GNU General Public License as published by
19 ;; the Free Software Foundation, either version 3 of the License, or
20 ;; (at your option) any later version.
21
22 ;; GNU Emacs is distributed in the hope that it will be useful,
23 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 ;; GNU General Public License for more details.
26
27 ;; You should have received a copy of the GNU General Public License
28 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
29
30 ;;; Commentary:
31
32 ;;; Code:
33
34 ;;; Predefined categories.
35
36 ;; For each character set.
37
38 (define-category ?a "ASCII
39 ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
40 (define-category ?l "Latin")
41 (define-category ?t "Thai")
42 (define-category ?g "Greek")
43 (define-category ?b "Arabic")
44 (define-category ?w "Hebrew")
45 (define-category ?y "Cyrillic")
46 (define-category ?k "Katakana
47 Japanese katakana")
48 (define-category ?r "Roman
49 Japanese roman")
50 (define-category ?c "Chinese")
51 (define-category ?j "Japanese")
52 (define-category ?h "Korean")
53 (define-category ?e "Ethiopic
54 Ethiopic (Ge'ez)")
55 (define-category ?v "Viet
56 Vietnamese")
57 (define-category ?i "Indian")
58 (define-category ?o "Lao")
59 (define-category ?q "Tibetan")
60
61 ;; For each group (row) of 2-byte character sets.
62
63 (define-category ?A "2-byte alnum
64 Alpha-numeric characters of 2-byte character sets")
65 (define-category ?C "2-byte han
66 Chinese (Han) characters of 2-byte character sets")
67 (define-category ?G "2-byte Greek
68 Greek characters of 2-byte character sets")
69 (define-category ?H "2-byte Hiragana
70 Japanese Hiragana characters of 2-byte character sets")
71 (define-category ?K "2-byte Katakana
72 Japanese Katakana characters of 2-byte character sets")
73 (define-category ?N "2-byte Korean
74 Korean Hangul characters of 2-byte character sets")
75 (define-category ?Y "2-byte Cyrillic
76 Cyrillic characters of 2-byte character sets")
77 (define-category ?I "Indian Glyphs")
78
79 ;; For phonetic classifications.
80
81 (define-category ?0 "consonant")
82 (define-category ?1 "base vowel
83 Base (independent) vowel")
84 (define-category ?2 "upper diacritic
85 Upper diacritical mark (including upper vowel)")
86 (define-category ?3 "lower diacritic
87 Lower diacritical mark (including lower vowel)")
88 (define-category ?4 "combining tone
89 Combining tone mark")
90 (define-category ?5 "symbol")
91 (define-category ?6 "digit")
92 (define-category ?7 "vowel diacritic
93 Vowel-modifying diacritical mark")
94 (define-category ?8 "vowel-signs")
95 (define-category ?9 "semivowel lower")
96
97 ;; For filling.
98 (define-category ?| "line breakable
99 While filling, we can break a line at this character.")
100
101 ;; For indentation calculation.
102 (define-category ?\s
103 "space for indent
104 This character counts as a space for indentation purposes.")
105
106 ;; Keep the following for `kinsoku' processing. See comments in
107 ;; kinsoku.el.
108 (define-category ?> "Not at bol
109 A character which can't be placed at beginning of line.")
110 (define-category ?< "Not at eol
111 A character which can't be placed at end of line.")
112
113 ;; Base and Combining
114 (define-category ?. "Base
115 Base characters (Unicode General Category L,N,P,S,Zs)")
116 (define-category ?^ "Combining
117 Combining diacritic or mark (Unicode General Category M)")
118 \f
119 ;;; Setting syntax and category.
120
121 ;; ASCII
122
123 ;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
124 (modify-category-entry '(32 . 127) ?a)
125 (modify-category-entry '(32 . 127) ?l)
126
127 ;; Deal with the CJK charsets first. Since the syntax of blocks is
128 ;; defined per charset, and the charsets may contain e.g. Latin
129 ;; characters, we end up with the wrong syntax definitions if we're
130 ;; not careful.
131
132 ;; Chinese characters (Unicode)
133 (modify-category-entry '(#x2E80 . #x312F) ?|)
134 (modify-category-entry '(#x3190 . #x33FF) ?|)
135 (modify-category-entry '(#x3400 . #x4DBF) ?C)
136 (modify-category-entry '(#x4E00 . #x9FAF) ?C)
137 (modify-category-entry '(#x3400 . #x9FAF) ?c)
138 (modify-category-entry '(#x3400 . #x9FAF) ?|)
139 (modify-category-entry '(#xF900 . #xFAFF) ?C)
140 (modify-category-entry '(#xF900 . #xFAFF) ?c)
141 (modify-category-entry '(#xF900 . #xFAFF) ?|)
142 (modify-category-entry '(#x20000 . #x2FFFF) ?|)
143 (modify-category-entry '(#x20000 . #x2FFFF) ?C)
144 (modify-category-entry '(#x20000 . #x2FFFF) ?c)
145
146
147 ;; Chinese character set (GB2312)
148
149 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E)
150 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E)
151 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E)
152
153 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
154 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339)
155 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A)
156 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A)
157 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?H #x2421 #x247E)
158 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?K #x2521 #x257E)
159 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?G #x2621 #x267E)
160 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?Y #x2721 #x277E)
161 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?C #x3021 #x7E7E)
162
163 ;; Chinese character set (BIG5)
164
165 (map-charset-chars #'modify-category-entry 'big5 ?c)
166 (map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA261)
167 (map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E)
168 (map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DC)
169
170 ;; Chinese character set (CNS11643)
171
172 (dolist (c '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
173 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
174 chinese-cns11643-7))
175 (map-charset-chars #'modify-category-entry c ?c)
176 (if (eq c 'chinese-cns11643-1)
177 (map-charset-chars #'modify-category-entry c ?C #x4421 #x7E7E)
178 (map-charset-chars #'modify-category-entry c ?C)))
179
180 ;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
181
182 (map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
183
184 (map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)
185
186 (dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
187 japanese-jisx0213-1 japanese-jisx0213-2
188 cp932-2-byte))
189 (map-charset-chars #'modify-category-entry l ?j))
190
191 ;; Unicode equivalents of JISX0201-kana
192 (let ((range '(#xff61 . #xff9f)))
193 (modify-category-entry range ?k)
194 (modify-category-entry range ?j)
195 (modify-category-entry range ?\|))
196
197 ;; Katakana block
198 (modify-category-entry '(#x3099 . #x309C) ?K)
199 (modify-category-entry '(#x30A0 . #x30FF) ?K)
200 (modify-category-entry '(#x31F0 . #x31FF) ?K)
201 (modify-category-entry '(#x30A0 . #x30FA) ?\|)
202 (modify-category-entry #x30FF ?\|)
203
204 ;; Hiragana block
205 (modify-category-entry '(#x3040 . #x309F) ?H)
206 (modify-category-entry '(#x3040 . #x3096) ?\|)
207 (modify-category-entry #x309F ?\|)
208 (modify-category-entry #x30A0 ?H)
209 (modify-category-entry #x30FC ?H)
210
211
212 ;; JISX0208
213 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
214 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
215 (let ((chars '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇)))
216 (dolist (elt chars)
217 (modify-syntax-entry (car chars) "w")))
218
219 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E)
220 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E)
221 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?K #x2521 #x257E)
222 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E)
223 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E)
224 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E)
225 (modify-category-entry ?ー ?K)
226 (let ((chars '(?゛ ?゜)))
227 (while chars
228 (modify-category-entry (car chars) ?K)
229 (modify-category-entry (car chars) ?H)
230 (setq chars (cdr chars))))
231 (let ((chars '(?仝 ?々 ?〆 ?〇)))
232 (while chars
233 (modify-category-entry (car chars) ?C)
234 (setq chars (cdr chars))))
235
236 ;; JISX0212
237
238 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
239
240 ;; JISX0201-Kana
241
242 (let ((chars '(?。 ?、 ?・)))
243 (while chars
244 (modify-syntax-entry (car chars) ".")
245 (setq chars (cdr chars))))
246
247 (modify-syntax-entry ?\「 "(」")
248 (modify-syntax-entry ?\」 "(「")
249
250 ;; Korean character set (KSC5601)
251
252 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)
253
254 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E)
255 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E)
256 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E)
257 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E)
258 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339)
259 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A)
260 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A)
261 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E)
262 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E)
263 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E)
264 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E)
265
266 ;; These are in more than one charset.
267 (let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
268 "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
269 "()[]{}"))
270 open close)
271 (dotimes (i (/ (length parens) 2))
272 (setq open (aref parens (* i 2))
273 close (aref parens (1+ (* i 2))))
274 (modify-syntax-entry open (format "(%c" close))
275 (modify-syntax-entry close (format ")%c" open))))
276
277 ;; Arabic character set
278
279 (let ((charsets '(arabic-iso8859-6
280 arabic-digit
281 arabic-1-column
282 arabic-2-column)))
283 (while charsets
284 (map-charset-chars #'modify-category-entry (car charsets) ?b)
285 (setq charsets (cdr charsets))))
286 (modify-category-entry '(#x600 . #x6ff) ?b)
287 (modify-category-entry '(#xfb50 . #xfdff) ?b)
288 (modify-category-entry '(#xfe70 . #xfefe) ?b)
289
290 ;; Cyrillic character set (ISO-8859-5)
291
292 (modify-syntax-entry ?№ ".")
293
294 ;; Ethiopic character set
295
296 (modify-category-entry '(#x1200 . #x1399) ?e)
297 (modify-category-entry '(#x2d80 . #x2dde) ?e)
298 (let ((chars '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨)))
299 (while chars
300 (modify-syntax-entry (car chars) ".")
301 (setq chars (cdr chars))))
302 (map-charset-chars #'modify-category-entry 'ethiopic ?e)
303
304 ;; Hebrew character set (ISO-8859-8)
305
306 (modify-syntax-entry #x5be ".") ; MAQAF
307 (modify-syntax-entry #x5c0 ".") ; PASEQ
308 (modify-syntax-entry #x5c3 ".") ; SOF PASUQ
309 (modify-syntax-entry #x5f3 ".") ; GERESH
310 (modify-syntax-entry #x5f4 ".") ; GERSHAYIM
311
312 ;; Indian character set (IS 13194 and other Emacs original Indian charsets)
313
314 (modify-category-entry '(#x901 . #x970) ?i)
315 (map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
316 (map-charset-chars #'modify-category-entry 'indian-2-column ?i)
317
318 ;; Lao character set
319
320 (modify-category-entry '(#xe80 . #xeff) ?o)
321 (map-charset-chars #'modify-category-entry 'lao ?o)
322
323 (let ((deflist '(("ກ-ຮ" "w" ?0) ; consonant
324 ("ະາຳຽເ-ໄ" "w" ?1) ; vowel base
325 ("ັິ-ືົໍ" "w" ?2) ; vowel upper
326 ("ຸູ" "w" ?3) ; vowel lower
327 ("່-໋" "w" ?4) ; tone mark
328 ("ຼຽ" "w" ?9) ; semivowel lower
329 ("໐-໙" "w" ?6) ; digit
330 ("ຯໆ" "_" ?5) ; symbol
331 ))
332 elm chars len syntax category to ch i)
333 (while deflist
334 (setq elm (car deflist))
335 (setq chars (car elm)
336 len (length chars)
337 syntax (nth 1 elm)
338 category (nth 2 elm)
339 i 0)
340 (while (< i len)
341 (if (= (aref chars i) ?-)
342 (setq i (1+ i)
343 to (aref chars i))
344 (setq ch (aref chars i)
345 to ch))
346 (while (<= ch to)
347 (unless (string-equal syntax "w")
348 (modify-syntax-entry ch syntax))
349 (modify-category-entry ch category)
350 (setq ch (1+ ch)))
351 (setq i (1+ i)))
352 (setq deflist (cdr deflist))))
353
354 ;; Thai character set (TIS620)
355
356 (modify-category-entry '(#xe00 . #xe7f) ?t)
357 (map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
358
359 (let ((deflist '(;; chars syntax category
360 ("ก-รลว-ฮ" "w" ?0) ; consonant
361 ("ฤฦะาำเ-ๅ" "w" ?1) ; vowel base
362 ("ัิ-ื็๎" "w" ?2) ; vowel upper
363 ("ุ-ฺ" "w" ?3) ; vowel lower
364 ("่-ํ" "w" ?4) ; tone mark
365 ("๐-๙" "w" ?6) ; digit
366 ("ฯๆ฿๏๚๛" "_" ?5) ; symbol
367 ))
368 elm chars len syntax category to ch i)
369 (while deflist
370 (setq elm (car deflist))
371 (setq chars (car elm)
372 len (length chars)
373 syntax (nth 1 elm)
374 category (nth 2 elm)
375 i 0)
376 (while (< i len)
377 (if (= (aref chars i) ?-)
378 (setq i (1+ i)
379 to (aref chars i))
380 (setq ch (aref chars i)
381 to ch))
382 (while (<= ch to)
383 (unless (string-equal syntax "w")
384 (modify-syntax-entry ch syntax))
385 (modify-category-entry ch category)
386 (setq ch (1+ ch)))
387 (setq i (1+ i)))
388 (setq deflist (cdr deflist))))
389
390 ;; Tibetan character set
391
392 (modify-category-entry '(#xf00 . #xfff) ?q)
393 (map-charset-chars #'modify-category-entry 'tibetan ?q)
394 (map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
395
396 (let ((deflist '(;; chars syntax category
397 ("ཀ-ཀྵཪ" "w" ?0) ; consonant
398 ("ྐ-ྐྵྺྻྼ" "w" ?0) ;
399 ("ིེཻོཽྀ" "w" ?2) ; upper vowel
400 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier
401 ("྄ཱུ༙༵༷" "w" ?3) ; lowel vowel/modifier
402 ("཰" "w" ?3) ; invisible vowel a
403 ("༠-༩༪-༳" "w" ?6) ; digit
404 ("་།-༒༔ཿ" "." ?|) ; line-break char
405 ("་།༏༐༑༔ཿ" "." ?|) ;
406 ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition
407 ("་།༏༐༑༔ཿ" "." ?>) ;
408 ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition
409 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
410 ))
411 elm chars len syntax category to ch i)
412 (while deflist
413 (setq elm (car deflist))
414 (setq chars (car elm)
415 len (length chars)
416 syntax (nth 1 elm)
417 category (nth 2 elm)
418 i 0)
419 (while (< i len)
420 (if (= (aref chars i) ?-)
421 (setq i (1+ i)
422 to (aref chars i))
423 (setq ch (aref chars i)
424 to ch))
425 (while (<= ch to)
426 (unless (string-equal syntax "w")
427 (modify-syntax-entry ch syntax))
428 (modify-category-entry ch category)
429 (setq ch (1+ ch)))
430 (setq i (1+ i)))
431 (setq deflist (cdr deflist))))
432
433 ;; Vietnamese character set
434
435 ;; To make a word with Latin characters
436 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
437 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
438
439 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
440 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
441
442 (let ((tbl (standard-case-table))
443 (i 32))
444 (while (< i 128)
445 (let* ((char (decode-char 'vietnamese-viscii-upper i))
446 (charl (decode-char 'vietnamese-viscii-lower i))
447 (uc (encode-char char 'ucs))
448 (lc (encode-char charl 'ucs)))
449 (set-case-syntax-pair char (decode-char 'vietnamese-viscii-lower i)
450 tbl)
451 (if uc (modify-category-entry uc ?v))
452 (if lc (modify-category-entry lc ?v)))
453 (setq i (1+ i))))
454
455 ;; Tai Viet
456 (let ((deflist '(;; chars syntax category
457 ((?ꪀ. ?ꪯ) "w" ?0) ; cosonant
458 ("ꪱꪵꪶ" "w" ?1) ; vowel base
459 ((?ꪹ . ?ꪽ) "w" ?1) ; vowel base
460 ("ꪰꪲꪳꪷꪸꪾ" "w" ?2) ; vowel upper
461 ("ꪴ" "w" ?3) ; vowel lower
462 ("ꫀꫂ" "w" ?1) ; non-combining tone-mark
463 ("꪿꫁" "w" ?4) ; combining tone-mark
464 ((?ꫛ . ?꫟) "_" ?5) ; symbol
465 )))
466 (dolist (elm deflist)
467 (let ((chars (car elm))
468 (syntax (nth 1 elm))
469 (category (nth 2 elm)))
470 (if (consp chars)
471 (progn
472 (modify-syntax-entry chars syntax)
473 (modify-category-entry chars category))
474 (mapc #'(lambda (x)
475 (modify-syntax-entry x syntax)
476 (modify-category-entry x category))
477 chars)))))
478
479 ;; Latin
480
481 (modify-category-entry '(#x80 . #x024F) ?l)
482
483 (let ((tbl (standard-case-table)) c)
484
485 ;; Latin-1
486
487 ;; Fixme: Some of the non-word syntaxes here perhaps should be
488 ;; reviewed. (Note that the following all implicitly have word
489 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of
490 ;; relating Unicode categories to Emacs syntax codes.
491
492 ;; NBSP isn't semantically interchangeable with other whitespace chars,
493 ;; so it's more like punctation.
494 (set-case-syntax"." tbl)
495 (set-case-syntax"." tbl)
496 (set-case-syntax"_" tbl)
497 (set-case-syntax"." tbl)
498 (set-case-syntax"_" tbl)
499 (set-case-syntax-delims 171 187 tbl) ; « »
500 (set-case-syntax"_" tbl)
501 (set-case-syntax"_" tbl)
502 (set-case-syntax"_" tbl)
503 (set-case-syntax"_" tbl)
504 (set-case-syntax"_" tbl)
505 (set-case-syntax"_" tbl)
506 (set-case-syntax"_" tbl)
507 (set-case-syntax"_" tbl)
508 (set-case-syntax"_" tbl)
509 (set-case-syntax"_" tbl)
510 (set-case-syntax ?¿ "." tbl)
511 (let ((c 192))
512 (while (<= c 222)
513 (set-case-syntax-pair c (+ c 32) tbl)
514 (setq c (1+ c))))
515 (set-case-syntax"_" tbl)
516 (set-case-syntax"w" tbl)
517 (set-case-syntax"_" tbl)
518 ;; See below for ÿ.
519
520 ;; Latin Extended-A, Latin Extended-B
521 (setq c #x0100)
522 (while (<= c #x02B8)
523 (modify-category-entry c ?l)
524 (setq c (1+ c)))
525
526 (let ((pair-ranges '((#x0100 . #x012F)
527 (#x0132 . #x0137)
528 (#x0139 . #x0148)
529 (#x014a . #x0177)
530 (#x0179 . #x017E)
531 (#x0182 . #x0185)
532 (#x0187 . #x0188)
533 (#x018B . #x018C)
534 (#x0191 . #x0192)
535 (#x0198 . #x0199)
536 (#x01A0 . #x01A5)
537 (#x01A7 . #x01A8)
538 (#x01AC . #x01AD)
539 (#x01AF . #x01B0)
540 (#x01B3 . #x01B6)
541 (#x01BC . #x01BD)
542 (#x01CD . #x01DC)
543 (#x01DE . #x01EF)
544 (#x01F4 . #x01F5)
545 (#x01F8 . #x021F)
546 (#x0222 . #x0233)
547 (#x023B . #x023C)
548 (#x0241 . #x0242)
549 (#x0246 . #x024F))))
550 (dolist (elt pair-ranges)
551 (let ((from (car elt)) (to (cdr elt)))
552 (while (< from to)
553 (set-case-syntax-pair from (1+ from) tbl)
554 (setq from (+ from 2))))))
555
556 (set-case-syntax-pair #x189 #x256 tbl)
557 (set-case-syntax-pair #x18A #x257 tbl)
558
559 ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
560 ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
561 ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
562 ;; SMALL LETTER I.
563
564 ;; We used to set up half of those correspondence unconditionally,
565 ;; but that makes searches slow. So now we don't set up either half
566 ;; of these correspondences by default.
567
568 ;; (set-downcase-syntax ?İ ?i tbl)
569 ;; (set-upcase-syntax ?I ?ı tbl)
570
571 (set-case-syntax-pair ?DŽ ?dž tbl)
572 (set-case-syntax-pair ?Dž ?dž tbl)
573 (set-case-syntax-pair ?LJ ?lj tbl)
574 (set-case-syntax-pair ?Lj ?lj tbl)
575 (set-case-syntax-pair ?NJ ?nj tbl)
576 (set-case-syntax-pair ?Nj ?nj tbl)
577
578 ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
579 (set-case-syntax-pair ?DZ ?dz tbl)
580 (set-case-syntax-pair ?Dz ?dz tbl)
581 (set-case-syntax-pair ?Ƕ ?ƕ tbl)
582 (set-case-syntax-pair ?Ƿ ?ƿ tbl)
583
584 ;; Latin Extended Additional
585 (modify-category-entry '(#x1e00 . #x1ef9) ?l)
586 (setq c #x1e00)
587 (while (<= c #x1ef9)
588 (and (zerop (% c 2))
589 (or (<= c #x1e94) (>= c #x1ea0))
590 (set-case-syntax-pair c (1+ c) tbl))
591 (setq c (1+ c)))
592
593 ;; Greek
594 (modify-category-entry '(#x0370 . #x03ff) ?g)
595 (setq c #x0370)
596 (while (<= c #x03ff)
597 (if (or (and (>= c #x0391) (<= c #x03a1))
598 (and (>= c #x03a3) (<= c #x03ab)))
599 (set-case-syntax-pair c (+ c 32) tbl))
600 (and (>= c #x03da)
601 (<= c #x03ee)
602 (zerop (% c 2))
603 (set-case-syntax-pair c (1+ c) tbl))
604 (setq c (1+ c)))
605 (set-case-syntax-pair ?Ά ?ά tbl)
606 (set-case-syntax-pair ?Έ ?έ tbl)
607 (set-case-syntax-pair ?Ή ?ή tbl)
608 (set-case-syntax-pair ?Ί ?ί tbl)
609 (set-case-syntax-pair ?Ό ?ό tbl)
610 (set-case-syntax-pair ?Ύ ?ύ tbl)
611 (set-case-syntax-pair ?Ώ ?ώ tbl)
612
613 ;; Armenian
614 (setq c #x531)
615 (while (<= c #x556)
616 (set-case-syntax-pair c (+ c #x30) tbl)
617 (setq c (1+ c)))
618
619 ;; Greek Extended
620 (modify-category-entry '(#x1f00 . #x1fff) ?g)
621 (setq c #x1f00)
622 (while (<= c #x1fff)
623 (and (<= (logand c #x000f) 7)
624 (<= c #x1fa7)
625 (not (memq c '(#x1f16 #x1f17 #x1f56 #x1f57
626 #x1f50 #x1f52 #x1f54 #x1f56)))
627 (/= (logand c #x00f0) #x70)
628 (set-case-syntax-pair (+ c 8) c tbl))
629 (setq c (1+ c)))
630 (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
631 (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
632 (set-case-syntax-pair ?Ὰ ?ὰ tbl)
633 (set-case-syntax-pair ?Ά ?ά tbl)
634 (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
635 (set-case-syntax-pair ?Ὲ ?ὲ tbl)
636 (set-case-syntax-pair ?Έ ?έ tbl)
637 (set-case-syntax-pair ?Ὴ ?ὴ tbl)
638 (set-case-syntax-pair ?Ή ?ή tbl)
639 (set-case-syntax-pair ?ῌ ?ῃ tbl)
640 (set-case-syntax-pair ?Ῐ ?ῐ tbl)
641 (set-case-syntax-pair ?Ῑ ?ῑ tbl)
642 (set-case-syntax-pair ?Ὶ ?ὶ tbl)
643 (set-case-syntax-pair ?Ί ?ί tbl)
644 (set-case-syntax-pair ?Ῠ ?ῠ tbl)
645 (set-case-syntax-pair ?Ῡ ?ῡ tbl)
646 (set-case-syntax-pair ?Ὺ ?ὺ tbl)
647 (set-case-syntax-pair ?Ύ ?ύ tbl)
648 (set-case-syntax-pair ?Ῥ ?ῥ tbl)
649 (set-case-syntax-pair ?Ὸ ?ὸ tbl)
650 (set-case-syntax-pair ?Ό ?ό tbl)
651 (set-case-syntax-pair ?Ὼ ?ὼ tbl)
652 (set-case-syntax-pair ?Ώ ?ώ tbl)
653 (set-case-syntax-pair ?ῼ ?ῳ tbl)
654
655 ;; cyrillic
656 (modify-category-entry '(#x0400 . #x04FF) ?y)
657 (setq c #x0400)
658 (while (<= c #x04ff)
659 (and (>= c #x0400)
660 (<= c #x040f)
661 (set-case-syntax-pair c (+ c 80) tbl))
662 (and (>= c #x0410)
663 (<= c #x042f)
664 (set-case-syntax-pair c (+ c 32) tbl))
665 (and (zerop (% c 2))
666 (or (and (>= c #x0460) (<= c #x0480))
667 (and (>= c #x048c) (<= c #x04be))
668 (and (>= c #x04d0) (<= c #x04f4)))
669 (set-case-syntax-pair c (1+ c) tbl))
670 (setq c (1+ c)))
671 (set-case-syntax-pair ?Ӂ ?ӂ tbl)
672 (set-case-syntax-pair ?Ӄ ?ӄ tbl)
673 (set-case-syntax-pair ?Ӈ ?ӈ tbl)
674 (set-case-syntax-pair ?Ӌ ?ӌ tbl)
675 (set-case-syntax-pair ?Ӹ ?ӹ tbl)
676
677 ;; general punctuation
678 (setq c #x2000)
679 (while (<= c #x200b)
680 (set-case-syntax c " " tbl)
681 (setq c (1+ c)))
682 (while (<= c #x200F)
683 (set-case-syntax c "." tbl)
684 (setq c (1+ c)))
685 ;; Fixme: These aren't all right:
686 (setq c #x2010)
687 (while (<= c #x2016)
688 (set-case-syntax c "_" tbl)
689 (setq c (1+ c)))
690 ;; Punctuation syntax for quotation marks (like `)
691 (while (<= c #x201f)
692 (set-case-syntax c "." tbl)
693 (setq c (1+ c)))
694 ;; Fixme: These aren't all right:
695 (while (<= c #x2027)
696 (set-case-syntax c "_" tbl)
697 (setq c (1+ c)))
698 (while (<= c #x206F)
699 (set-case-syntax c "." tbl)
700 (setq c (1+ c)))
701
702 ;; Roman numerals
703 (setq c #x2160)
704 (while (<= c #x216f)
705 (set-case-syntax-pair c (+ c #x10) tbl)
706 (setq c (1+ c)))
707
708 ;; Fixme: The following blocks might be better as symbol rather than
709 ;; punctuation.
710 ;; Arrows
711 (setq c #x2190)
712 (while (<= c #x21FF)
713 (set-case-syntax c "." tbl)
714 (setq c (1+ c)))
715 ;; Mathematical Operators
716 (while (<= c #x22FF)
717 (set-case-syntax c "." tbl)
718 (setq c (1+ c)))
719 ;; Miscellaneous Technical
720 (while (<= c #x23FF)
721 (set-case-syntax c "." tbl)
722 (setq c (1+ c)))
723 ;; Control Pictures
724 (while (<= c #x243F)
725 (set-case-syntax c "_" tbl)
726 (setq c (1+ c)))
727
728 ;; Circled Latin
729 (setq c #x24b6)
730 (while (<= c #x24cf)
731 (set-case-syntax-pair c (+ c 26) tbl)
732 (modify-category-entry c ?l)
733 (modify-category-entry (+ c 26) ?l)
734 (setq c (1+ c)))
735
736 ;; Fullwidth Latin
737 (setq c #xff21)
738 (while (<= c #xff3a)
739 (set-case-syntax-pair c (+ c #x20) tbl)
740 (modify-category-entry c ?l)
741 (modify-category-entry (+ c #x20) ?l)
742 (setq c (1+ c)))
743
744 ;; Combining diacritics
745 (modify-category-entry '(#x300 . #x362) ?^)
746 ;; Combining marks
747 (modify-category-entry '(#x20d0 . #x20e3) ?^)
748
749 ;; Fixme: syntax for symbols &c
750 )
751
752 (let ((pairs
753 '("⁅⁆" ; U+2045 U+2046
754 "⁽⁾" ; U+207D U+207E
755 "₍₎" ; U+208D U+208E
756 "〈〉" ; U+2329 U+232A
757 "⎴⎵" ; U+23B4 U+23B5
758 "❨❩" ; U+2768 U+2769
759 "❪❫" ; U+276A U+276B
760 "❬❭" ; U+276C U+276D
761 "❰❱" ; U+2770 U+2771
762 "❲❳" ; U+2772 U+2773
763 "❴❵" ; U+2774 U+2775
764 "⟦⟧" ; U+27E6 U+27E7
765 "⟨⟩" ; U+27E8 U+27E9
766 "⟪⟫" ; U+27EA U+27EB
767 "⦃⦄" ; U+2983 U+2984
768 "⦅⦆" ; U+2985 U+2986
769 "⦇⦈" ; U+2987 U+2988
770 "⦉⦊" ; U+2989 U+298A
771 "⦋⦌" ; U+298B U+298C
772 "⦍⦎" ; U+298D U+298E
773 "⦏⦐" ; U+298F U+2990
774 "⦑⦒" ; U+2991 U+2992
775 "⦓⦔" ; U+2993 U+2994
776 "⦕⦖" ; U+2995 U+2996
777 "⦗⦘" ; U+2997 U+2998
778 "⧼⧽" ; U+29FC U+29FD
779 "〈〉" ; U+3008 U+3009
780 "《》" ; U+300A U+300B
781 "「」" ; U+300C U+300D
782 "『』" ; U+300E U+300F
783 "【】" ; U+3010 U+3011
784 "〔〕" ; U+3014 U+3015
785 "〖〗" ; U+3016 U+3017
786 "〘〙" ; U+3018 U+3019
787 "〚〛" ; U+301A U+301B
788 "﴾﴿" ; U+FD3E U+FD3F
789 "︵︶" ; U+FE35 U+FE36
790 "︷︸" ; U+FE37 U+FE38
791 "︹︺" ; U+FE39 U+FE3A
792 "︻︼" ; U+FE3B U+FE3C
793 "︽︾" ; U+FE3D U+FE3E
794 "︿﹀" ; U+FE3F U+FE40
795 "﹁﹂" ; U+FE41 U+FE42
796 "﹃﹄" ; U+FE43 U+FE44
797 "﹙﹚" ; U+FE59 U+FE5A
798 "﹛﹜" ; U+FE5B U+FE5C
799 "﹝﹞" ; U+FE5D U+FE5E
800 "()" ; U+FF08 U+FF09
801 "[]" ; U+FF3B U+FF3D
802 "{}" ; U+FF5B U+FF5D
803 "⦅⦆" ; U+FF5F U+FF60
804 "「」" ; U+FF62 U+FF63
805 )))
806 (dolist (elt pairs)
807 (modify-syntax-entry (aref elt 0) (string ?\( (aref elt 1)))
808 (modify-syntax-entry (aref elt 1) (string ?\) (aref elt 0)))))
809
810 \f
811 ;; For each character set, put the information of the most proper
812 ;; coding system to encode it by `preferred-coding-system' property.
813
814 ;; Fixme: should this be junked?
815 (let ((l '((latin-iso8859-1 . iso-latin-1)
816 (latin-iso8859-2 . iso-latin-2)
817 (latin-iso8859-3 . iso-latin-3)
818 (latin-iso8859-4 . iso-latin-4)
819 (thai-tis620 . thai-tis620)
820 (greek-iso8859-7 . greek-iso-8bit)
821 (arabic-iso8859-6 . iso-2022-7bit)
822 (hebrew-iso8859-8 . hebrew-iso-8bit)
823 (katakana-jisx0201 . japanese-shift-jis)
824 (latin-jisx0201 . japanese-shift-jis)
825 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
826 (latin-iso8859-9 . iso-latin-5)
827 (japanese-jisx0208-1978 . iso-2022-jp)
828 (chinese-gb2312 . chinese-iso-8bit)
829 (chinese-gbk . chinese-gbk)
830 (gb18030-2-byte . chinese-gb18030)
831 (gb18030-4-byte-bmp . chinese-gb18030)
832 (gb18030-4-byte-smp . chinese-gb18030)
833 (gb18030-4-byte-ext-1 . chinese-gb18030)
834 (gb18030-4-byte-ext-2 . chinese-gb18030)
835 (japanese-jisx0208 . iso-2022-jp)
836 (korean-ksc5601 . iso-2022-kr)
837 (japanese-jisx0212 . iso-2022-jp)
838 (chinese-big5-1 . chinese-big5)
839 (chinese-big5-2 . chinese-big5)
840 (chinese-sisheng . iso-2022-7bit)
841 (ipa . iso-2022-7bit)
842 (vietnamese-viscii-lower . vietnamese-viscii)
843 (vietnamese-viscii-upper . vietnamese-viscii)
844 (arabic-digit . iso-2022-7bit)
845 (arabic-1-column . iso-2022-7bit)
846 (lao . lao)
847 (arabic-2-column . iso-2022-7bit)
848 (indian-is13194 . devanagari)
849 (indian-glyph . devanagari)
850 (tibetan-1-column . tibetan)
851 (ethiopic . iso-2022-7bit)
852 (chinese-cns11643-1 . iso-2022-cn)
853 (chinese-cns11643-2 . iso-2022-cn)
854 (chinese-cns11643-3 . iso-2022-cn)
855 (chinese-cns11643-4 . iso-2022-cn)
856 (chinese-cns11643-5 . iso-2022-cn)
857 (chinese-cns11643-6 . iso-2022-cn)
858 (chinese-cns11643-7 . iso-2022-cn)
859 (indian-2-column . devanagari)
860 (tibetan . tibetan)
861 (latin-iso8859-14 . iso-latin-8)
862 (latin-iso8859-15 . iso-latin-9))))
863 (while l
864 (put-charset-property (car (car l)) 'preferred-coding-system (cdr (car l)))
865 (setq l (cdr l))))
866
867 \f
868 ;; Setup auto-fill-chars for charsets that should invoke auto-filling.
869 ;; SPACE and NEWLINE are already set.
870
871 (set-char-table-range auto-fill-chars '(#x3041 . #x30FF) t)
872 (set-char-table-range auto-fill-chars '(#x3400 . #x4DB5) t)
873 (set-char-table-range auto-fill-chars '(#x4e00 . #x9fbb) t)
874 (set-char-table-range auto-fill-chars '(#xF900 . #xFAFF) t)
875 (set-char-table-range auto-fill-chars '(#xFF00 . #xFF9F) t)
876 (set-char-table-range auto-fill-chars '(#x20000 . #x2FFFF) t)
877
878 \f
879 ;;; Setting char-width-table. The default is 1.
880
881 ;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
882 ;; and final characters.
883 (let ((l '((#x0300 . #x036F)
884 (#x0483 . #x0489)
885 (#x0591 . #x05BD)
886 (#x05BF . #x05BF)
887 (#x05C1 . #x05C2)
888 (#x05C4 . #x05C5)
889 (#x05C7 . #x05C7)
890 (#x0600 . #x0603)
891 (#x0610 . #x0615)
892 (#x064B . #x065E)
893 (#x0670 . #x0670)
894 (#x06D6 . #x06E4)
895 (#x06E7 . #x06E8)
896 (#x06EA . #x06ED)
897 (#x070F . #x070F)
898 (#x0711 . #x0711)
899 (#x0730 . #x074A)
900 (#x07A6 . #x07B0)
901 (#x07EB . #x07F3)
902 (#x0901 . #x0902)
903 (#x093C . #x093C)
904 (#x0941 . #x0948)
905 (#x094D . #x094D)
906 (#x0951 . #x0954)
907 (#x0962 . #x0963)
908 (#x0981 . #x0981)
909 (#x09BC . #x09BC)
910 (#x09C1 . #x09C4)
911 (#x09CD . #x09CD)
912 (#x09E2 . #x09E3)
913 (#x0A01 . #x0A02)
914 (#x0A3C . #x0A3C)
915 (#x0A41 . #x0A4D)
916 (#x0A70 . #x0A71)
917 (#x0A81 . #x0A82)
918 (#x0ABC . #x0ABC)
919 (#x0AC1 . #x0AC8)
920 (#x0ACD . #x0ACD)
921 (#x0AE2 . #x0AE3)
922 (#x0B01 . #x0B01)
923 (#x0B3C . #x0B3C)
924 (#x0B3F . #x0B3F)
925 (#x0B41 . #x0B43)
926 (#x0B4D . #x0B56)
927 (#x0B82 . #x0B82)
928 (#x0BC0 . #x0BC0)
929 (#x0BCD . #x0BCD)
930 (#x0C3E . #x0C40)
931 (#x0C46 . #x0C56)
932 (#x0CBC . #x0CBC)
933 (#x0CBF . #x0CBF)
934 (#x0CC6 . #x0CC6)
935 (#x0CCC . #x0CCD)
936 (#x0CE2 . #x0CE3)
937 (#x0D41 . #x0D43)
938 (#x0D4D . #x0D4D)
939 (#x0DCA . #x0DCA)
940 (#x0DD2 . #x0DD6)
941 (#x0E31 . #x0E31)
942 (#x0E34 . #x0E3A)
943 (#x0E47 . #x0E4E)
944 (#x0EB1 . #x0EB1)
945 (#x0EB4 . #x0EBC)
946 (#x0EC8 . #x0ECD)
947 (#x0F18 . #x0F19)
948 (#x0F35 . #x0F35)
949 (#x0F37 . #x0F37)
950 (#x0F39 . #x0F39)
951 (#x0F71 . #x0F7E)
952 (#x0F80 . #x0F84)
953 (#x0F86 . #x0F87)
954 (#x0F90 . #x0FBC)
955 (#x0FC6 . #x0FC6)
956 (#x102D . #x1030)
957 (#x1032 . #x1037)
958 (#x1039 . #x1039)
959 (#x1058 . #x1059)
960 (#x1160 . #x11FF)
961 (#x135F . #x135F)
962 (#x1712 . #x1714)
963 (#x1732 . #x1734)
964 (#x1752 . #x1753)
965 (#x1772 . #x1773)
966 (#x17B4 . #x17B5)
967 (#x17B7 . #x17BD)
968 (#x17C6 . #x17C6)
969 (#x17C9 . #x17D3)
970 (#x17DD . #x17DD)
971 (#x180B . #x180D)
972 (#x18A9 . #x18A9)
973 (#x1920 . #x1922)
974 (#x1927 . #x1928)
975 (#x1932 . #x1932)
976 (#x1939 . #x193B)
977 (#x1A17 . #x1A18)
978 (#x1B00 . #x1B03)
979 (#x1B34 . #x1B34)
980 (#x1B36 . #x1B3A)
981 (#x1B3C . #x1B3C)
982 (#x1B42 . #x1B42)
983 (#x1B6B . #x1B73)
984 (#x1DC0 . #x1DFF)
985 (#x200B . #x200F)
986 (#x202A . #x202E)
987 (#x2060 . #x206F)
988 (#x20D0 . #x20EF)
989 (#x302A . #x302F)
990 (#x3099 . #x309A)
991 (#xA806 . #xA806)
992 (#xA80B . #xA80B)
993 (#xA825 . #xA826)
994 (#xFB1E . #xFB1E)
995 (#xFE00 . #xFE0F)
996 (#xFE20 . #xFE23)
997 (#xFEFF . #xFEFF)
998 (#xFFF9 . #xFFFB)
999 (#x10A01 . #x10A0F)
1000 (#x10A38 . #x10A3F)
1001 (#x1D167 . #x1D169)
1002 (#x1D173 . #x1D182)
1003 (#x1D185 . #x1D18B)
1004 (#x1D1AA . #x1D1AD)
1005 (#x1D242 . #x1D244)
1006 (#xE0001 . #xE01EF))))
1007 (dolist (elt l)
1008 (set-char-table-range char-width-table elt 0)))
1009
1010 ;; 2: East Asian Wide and Full-width characters.
1011 (let ((l '((#x1100 . #x115F)
1012 (#x2329 . #x232A)
1013 (#x2E80 . #x303E)
1014 (#x3040 . #xA4CF)
1015 (#xAC00 . #xD7A3)
1016 (#xF900 . #xFAFF)
1017 (#xFE30 . #xFE6F)
1018 (#xFF01 . #xFF60)
1019 (#xFFE0 . #xFFE6)
1020 (#x20000 . #x2FFFF)
1021 (#x30000 . #x3FFFF))))
1022 (dolist (elt l)
1023 (set-char-table-range char-width-table elt 2)))
1024
1025 ;; Other double width
1026 ;;(map-charset-chars
1027 ;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
1028 ;; 'ethiopic)
1029 ;; (map-charset-chars
1030 ;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
1031 ;; 'tibetan)
1032 (map-charset-chars
1033 (lambda (range ignore) (set-char-table-range char-width-table range 2))
1034 'indian-2-column)
1035 (map-charset-chars
1036 (lambda (range ignore) (set-char-table-range char-width-table range 2))
1037 'arabic-2-column)
1038
1039 ;; Internal use only.
1040 ;; Alist of locale symbol vs charsets. In a language environment
1041 ;; corresponding to the locale, width of characters in the charsets is
1042 ;; set to 2. Each element has the form:
1043 ;; (LOCALE TABLE (CHARSET (FROM-CODE . TO-CODE) ...) ...)
1044 ;; LOCALE: locale symbol
1045 ;; TABLE: char-table used for char-width-table, initially nil.
1046 ;; CAHRSET: character set
1047 ;; FROM-CODE, TO-CODE: range of code-points in CHARSET
1048
1049 (defvar cjk-char-width-table-list
1050 '((ja_JP nil (japanese-jisx0208 (#x2121 . #x287E))
1051 (cp932-2-byte (#x8140 . #x879F)))
1052 (zh_CN nil (chinese-gb2312 (#x2121 . #x297E)))
1053 (zh_HK nil (big5-hkscs (#xA140 . #xA3FE) (#xC6A0 . #xC8FE)))
1054 (zh_TW nil (big5 (#xA140 . #xA3FE))
1055 (chinese-cns11643-1 (#x2121 . #x427E)))
1056 (ko_KR nil (korean-ksc5601 (#x2121 . #x2C7E)))))
1057
1058 ;; Internal use only.
1059 ;; Setup char-width-table appropriate for a language environment
1060 ;; corresponding to LOCALE-NAME (symbol).
1061
1062 (defun use-cjk-char-width-table (locale-name)
1063 (while (char-table-parent char-width-table)
1064 (setq char-width-table (char-table-parent char-width-table)))
1065 (let ((slot (assq locale-name cjk-char-width-table-list))
1066 table)
1067 (or slot (error "Unknown locale for CJK language environment: %s"
1068 locale-name))
1069 (unless (nth 1 slot)
1070 (let ((table (make-char-table nil)))
1071 (dolist (charset-info (nthcdr 2 slot))
1072 (let ((charset (car charset-info)))
1073 (dolist (code-range (cdr charset-info))
1074 (map-charset-chars #'(lambda (range arg)
1075 (set-char-table-range table range 2))
1076 charset nil
1077 (car code-range) (cdr code-range)))))
1078 (optimize-char-table table)
1079 (set-char-table-parent table char-width-table)
1080 (setcar (cdr slot) table)))
1081 (setq char-width-table (nth 1 slot))))
1082
1083 (defun use-default-char-width-table ()
1084 "Internal use only.
1085 Setup char-width-table appropriate for non-CJK language environment."
1086 (while (char-table-parent char-width-table)
1087 (setq char-width-table (char-table-parent char-width-table))))
1088
1089 (optimize-char-table (standard-case-table))
1090 (optimize-char-table (standard-syntax-table))
1091
1092 \f
1093 ;; Setting char-script-table.
1094
1095 ;; The Unicode blocks actually extend past some of these ranges with
1096 ;; undefined codepoints.
1097 (let ((script-list nil))
1098 (dolist
1099 (elt
1100 '((#x0000 #x007F latin)
1101 (#x00A0 #x024F latin)
1102 (#x0250 #x02AF phonetic)
1103 (#x02B0 #x036F latin)
1104 (#x0370 #x03E1 greek)
1105 (#x03E2 #x03EF coptic)
1106 (#x03F0 #x03F3 greek)
1107 (#x0400 #x04FF cyrillic)
1108 (#x0530 #x058F armenian)
1109 (#x0590 #x05FF hebrew)
1110 (#x0600 #x06FF arabic)
1111 (#x0700 #x074F syriac)
1112 (#x07C0 #x07FA nko)
1113 (#x0780 #x07BF thaana)
1114 (#x0900 #x097F devanagari)
1115 (#x0980 #x09FF bengali)
1116 (#x0A00 #x0A7F gurmukhi)
1117 (#x0A80 #x0AFF gujarati)
1118 (#x0B00 #x0B7F oriya)
1119 (#x0B80 #x0BFF tamil)
1120 (#x0C00 #x0C7F telugu)
1121 (#x0C80 #x0CFF kannada)
1122 (#x0D00 #x0D7F malayalam)
1123 (#x0D80 #x0DFF sinhala)
1124 (#x0E00 #x0E5F thai)
1125 (#x0E80 #x0EDF lao)
1126 (#x0F00 #x0FFF tibetan)
1127 (#x1000 #x105F myanmar)
1128 (#x10A0 #x10FF georgian)
1129 (#x1100 #x11FF hangul)
1130 (#x1200 #x139F ethiopic)
1131 (#x13A0 #x13FF cherokee)
1132 (#x1400 #x167F canadian-aboriginal)
1133 (#x1680 #x169F ogham)
1134 (#x16A0 #x16FF runic)
1135 (#x1780 #x17FF khmer)
1136 (#x1800 #x18AF mongolian)
1137 (#x1D00 #x1DFF phonetic)
1138 (#x1E00 #x1EFF latin)
1139 (#x1F00 #x1FFF greek)
1140 (#x2000 #x27FF symbol)
1141 (#x2800 #x28FF braille)
1142 (#x2D80 #x2DDF ethiopic)
1143 (#x2E80 #x2FDF han)
1144 (#x2FF0 #x2FFF ideographic-description)
1145 (#x3000 #x303F cjk-misc)
1146 (#x3040 #x30FF kana)
1147 (#x3100 #x312F bopomofo)
1148 (#x3130 #x318F hangul)
1149 (#x3190 #x319F kanbun)
1150 (#x31A0 #x31BF bopomofo)
1151 (#x3400 #x9FAF han)
1152 (#xA000 #xA4CF yi)
1153 (#xAA00 #xAA5F cham)
1154 (#xAA80 #xAADF tai-viet)
1155 (#xAC00 #xD7AF hangul)
1156 (#xF900 #xFAFF han)
1157 (#xFB1D #xFB4F hebrew)
1158 (#xFB50 #xFDFF arabic)
1159 (#xFE70 #xFEFC arabic)
1160 (#xFF00 #xFF5F cjk-misc)
1161 (#xFF61 #xFF9F kana)
1162 (#xFFE0 #xFFE6 cjk-misc)
1163 (#x10000 #x100FF linear-b)
1164 (#x10100 #x1013F aegean-number)
1165 (#x10140 #x1018A ancient-greek-number)
1166 (#x10190 #x1019B ancient-symbol)
1167 (#x101D0 #x101FF phaistos-disc)
1168 (#x10280 #x1029F lycian)
1169 (#x102A0 #x102DF carian)
1170 (#x10300 #x1032F olt-italic)
1171 (#x10380 #x1039F ugaritic)
1172 (#x103A0 #x103DF old-persian)
1173 (#x10400 #x1044F deseret)
1174 (#x10450 #x1047F shavian)
1175 (#x10480 #x104AF osmanya)
1176 (#x10800 #x1083F cypriot-syllabary)
1177 (#x10900 #x1091F phoenician)
1178 (#x10920 #x1093F lydian)
1179 (#x10A00 #x10A5F kharoshthi)
1180 (#x12000 #x123FF cuneiform)
1181 (#x12400 #x1247F cuneiform-numbers-and-punctuation)
1182 (#x1D000 #x1D0FF byzantine-musical-symbol)
1183 (#x1D100 #x1D1FF musical-symbol)
1184 (#x1D200 #x1D24F ancient-greek-musical-notation)
1185 (#x1D300 #x1D35F tai-xuan-jing-symbol)
1186 (#x1D360 #x1D37F counting-rod-numeral)
1187 (#x1D400 #x1D7FF mathematical)
1188 (#x1F000 #x1F02F mahjong-tile)
1189 (#x1F030 #x1F09F domino-tile)
1190 (#x20000 #x2AFFF han)
1191 (#x2F800 #x2FFFF han)))
1192 (set-char-table-range char-script-table
1193 (cons (car elt) (nth 1 elt)) (nth 2 elt))
1194 (or (memq (nth 2 elt) script-list)
1195 (setq script-list (cons (nth 2 elt) script-list))))
1196 (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))
1197
1198 (map-charset-chars
1199 #'(lambda (range ignore)
1200 (set-char-table-range char-script-table range 'tibetan))
1201 'tibetan)
1202
1203 \f
1204 ;;; Setting unicode-category-table.
1205
1206 ;; This macro is to build unicode-category-table at compile time so
1207 ;; that C code can access the table efficiently.
1208 (defmacro build-unicode-category-table ()
1209 (let ((table (make-char-table 'unicode-category-table nil)))
1210 (dotimes (i #x110000)
1211 (if (or (< i #xD800)
1212 (and (>= i #xF900) (< i #x30000))
1213 (and (>= i #xE0000) (< i #xE0200)))
1214 (aset table i (get-char-code-property i 'general-category))))
1215 (set-char-table-range table '(#xE000 . #xF8FF) 'Co)
1216 (set-char-table-range table '(#xF0000 . #xFFFFD) 'Co)
1217 (set-char-table-range table '(#x100000 . #x10FFFD) 'Co)
1218 (optimize-char-table table 'eq)
1219 table))
1220
1221 (setq unicode-category-table (build-unicode-category-table))
1222 (map-char-table #'(lambda (key val)
1223 (if (and val
1224 (or (and (/= (aref (symbol-name val) 0) ?M)
1225 (/= (aref (symbol-name val) 0) ?C))
1226 (eq val 'Zs)))
1227 (modify-category-entry key ?.)))
1228 unicode-category-table)
1229
1230 (optimize-char-table (standard-category-table))
1231
1232 \f
1233 ;;; Setting word boundary.
1234
1235 (setq word-combining-categories
1236 '((nil . ?^)
1237 (?^ . nil)
1238 (?C . ?H)
1239 (?C . ?K)))
1240
1241 (setq word-separating-categories ; (2-byte character sets)
1242 '((?H . ?K) ; Hiragana - Katakana
1243 ))
1244
1245 ;; Local Variables:
1246 ;; coding: utf-8
1247 ;; End:
1248
1249 ;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
1250 ;;; characters.el ends here