Sync to HEAD
[bpt/emacs.git] / lisp / international / characters.el
CommitLineData
4ed46869
KH
1;;; characters.el --- set syntax and category for multibyte characters
2
28636af6 3;; Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
8f924df7 4;; Licensed to the Free Software Foundation.
cf6af551 5;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
8f924df7 6;; Copyright (C) 2003
55bd52ea
KH
7;; National Institute of Advanced Industrial Science and Technology (AIST)
8;; Registration Number H13PRO009
4ed46869
KH
9
10;; Keywords: multibyte character, character set, syntax, category
11
12;; This file is part of GNU Emacs.
13
14;; GNU Emacs is free software; you can redistribute it and/or modify
15;; it under the terms of the GNU General Public License as published by
16;; the Free Software Foundation; either version 2, or (at your option)
17;; any later version.
18
19;; GNU Emacs is distributed in the hope that it will be useful,
20;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22;; GNU General Public License for more details.
23
24;; You should have received a copy of the GNU General Public License
369314dc
KH
25;; along with GNU Emacs; see the file COPYING. If not, write to the
26;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27;; Boston, MA 02111-1307, USA.
4ed46869
KH
28
29;;; Commentary:
30
60370d40
PJ
31;;; Code:
32
4ed46869
KH
33;;; Predefined categories.
34
35;; For each character set.
36
;; Define every predefined category.  Each entry is (MNEMONIC . DOCSTRING);
;; the entries are registered in the order listed.
(dolist (entry
         '(;; For each character set.
           (?a . "ASCII")
           (?l . "Latin")
           (?t . "Thai")
           (?g . "Greek")
           (?b . "Arabic")
           (?w . "Hebrew")
           (?y . "Cyrillic")
           (?k . "Japanese katakana")
           (?r . "Japanese roman")
           (?c . "Chinese")
           (?j . "Japanese")
           (?h . "Korean")
           (?e . "Ethiopic (Ge'ez)")
           (?v . "Vietnamese")
           (?i . "Indian")
           (?o . "Lao")
           (?q . "Tibetan")

           ;; For each group (row) of 2-byte character sets.
           (?A . "Alpha-numeric characters of 2-byte character sets")
           (?C . "Chinese (Han) characters of 2-byte character sets")
           (?G . "Greek characters of 2-byte character sets")
           (?H . "Japanese Hiragana characters of 2-byte character sets")
           (?K . "Japanese Katakana characters of 2-byte character sets")
           (?N . "Korean Hangul characters of 2-byte character sets")
           (?Y . "Cyrillic characters of 2-byte character sets")
           (?I . "Indian Glyphs")

           ;; For phonetic classifications.
           (?0 . "consonant")
           (?1 . "base (independent) vowel")
           (?2 . "upper diacritical mark (including upper vowel)")
           (?3 . "lower diacritical mark (including lower vowel)")
           (?4 . "tone mark")
           (?5 . "symbol")
           (?6 . "digit")
           (?7 . "vowel-modifying diacritical mark")
           (?8 . "vowel-signs")
           (?9 . "semivowel lower")

           ;; For filling.
           (?| . "While filling, we can break a line at this character.")

           ;; For indentation calculation.
           (?\s . "This character counts as a space for indentation purposes.")

           ;; Kept for `kinsoku' processing.  See comments in kinsoku.el.
           (?> . "A character which can't be placed at beginning of line.")
           (?< . "A character which can't be placed at end of line.")

           ;; Combining
           (?^ . "Combining diacritic or mark")))
  (define-category (car entry) (cdr entry)))
4ed46869
KH
93\f
94;;; Setting syntax and category.
95
96;; ASCII
97
e2cc40b7
KH
98;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
;; Put the code range 32..127 into both the `a' and the `l' category.
(dolist (category '(?a ?l))
  (modify-category-entry '(32 . 127) category))
4ed46869 101
c94ae9eb
DL
102;; Deal with the CJK charsets first. Since the syntax of blocks is
103;; defined per charset, and the charsets may contain e.g. Latin
104;; characters, we end up with the wrong syntax definitions if we're
105;; not careful.
4ed46869 106
66bff5ed
KH
107;; Chinese characters (Unicode)
;; Each entry is (RANGE CATEGORY...): every listed category is added to
;; every character of RANGE.
(dolist (spec '(((#x3400 . #x9FAF) ?C ?c ?|)
                ((#xF900 . #xFAFF) ?C ?c ?|)
                ((#x20000 . #x2AFFF) ?|)
                ((#x2F800 . #x2FFFF) ?|)))
  (dolist (category (cdr spec))
    (modify-category-entry (car spec) category)))
116
4ed46869
KH
117
118;; Chinese character set (GB2312)
119
66bff5ed
KH
;; Symbol syntax for three GB2312 code ranges, given as (FROM . TO).
(dolist (codes '((#x2121 . #x217E)
                 (#x2221 . #x227E)
                 (#x2921 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_"
                     (car codes) (cdr codes)))

;; The whole charset belongs to category `c'.
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)

;; Per-range categories, given as (CATEGORY FROM TO).
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'chinese-gb2312
                     (nth 0 spec) (nth 1 spec) (nth 2 spec)))
4ed46869
KH
133
134;; Chinese character set (BIG5)
135
;; The whole charset belongs to category `c'.
(map-charset-chars #'modify-category-entry 'big5 ?c)

;; Code ranges (FROM . TO) that additionally get category `C'.
(dolist (codes '((#xA259 . #xA25F)
                 (#xA440 . #xC67E)
                 (#xC940 . #xF9DF)))
  (map-charset-chars #'modify-category-entry 'big5 ?C
                     (car codes) (cdr codes)))
4ed46869
KH
140
141;; Chinese character set (CNS11643)
142
87a39edb
DL
;; Every CNS11643 plane gets category `c'.  Category `C' is applied to
;; codes #x4421..#x7E7E of plane 1 and to all characters of the other
;; planes (nil bounds mean "the whole charset").
(dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                 chinese-cns11643-7))
  (let* ((first-plane (eq plane 'chinese-cns11643-1))
         (han-from (and first-plane #x4421))
         (han-to (and first-plane #x7E7E)))
    (map-charset-chars #'modify-category-entry plane ?c)
    (map-charset-chars #'modify-category-entry plane ?C han-from han-to)))
4ed46869 150
8f924df7 151;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
4ed46869 152
;; Whole-charset category assignments, given as (CHARSET . CATEGORY) and
;; applied in order.
(dolist (pair '((katakana-jisx0201 . ?k)
                (latin-jisx0201 . ?r)
                (katakana-jisx0201 . ?j)
                (japanese-jisx0208 . ?j)
                (japanese-jisx0212 . ?j)
                (japanese-jisx0213-1 . ?j)
                (japanese-jisx0213-2 . ?j)))
  (map-charset-chars #'modify-category-entry (car pair) (cdr pair)))
4ed46869 160
269a5dd0 161;; Unicode equivalents of JISX0201-kana
66bff5ed
KH
;; Category assignments for three kana ranges, given as
;; (RANGE CATEGORY...).
(dolist (spec '(;; Range #xff61..#xff9f.
                ((#xff61 . #xff9f) ?k ?j ?\|)
                ;; Katakana block.  ?K is double width, ?k isn't specified.
                ((#x30a0 . #x30ff) ?K ?\|)
                ;; Hiragana block.  ?H is actually defined to be double
                ;; width, so the ?H assignment is left disabled here.
                ((#x3040 . #x309d) ?\|)))
  (dolist (category (cdr spec))
    (modify-category-entry (car spec) category)))
269a5dd0 179
4ed46869 180;; JISX0208
66bff5ed
KH
;; Give JISX0208 codes #x2121..#x227E and #x2821..#x287E symbol syntax.
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
;; Then give word syntax to each of the characters below.  The loop
;; variable itself must be passed to `modify-syntax-entry'; passing the
;; head of the list would only ever touch the first character.
(dolist (char '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-syntax-entry char "w"))
66bff5ed
KH
186
;; Per-range JISX0208 categories, given as (CATEGORY FROM TO).
(dolist (spec '((?A #x2321 #x237E)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'japanese-jisx0208
                     (nth 0 spec) (nth 1 spec) (nth 2 spec)))

;; Individual characters: ?ー is katakana only; the two sound marks are
;; both katakana and hiragana.
(modify-category-entry ?ー ?K)
(dolist (mark '(?゛ ?゜))
  (modify-category-entry mark ?K)
  (modify-category-entry mark ?H))

;; These characters get category `C'.
(dolist (char '(?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-category-entry char ?C))
203
204;; JISX0212
4ed46869 205
66bff5ed 206(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
4ed46869
KH
207
208;; JISX0201-Kana
87a39edb 209
;; Punctuation syntax for the kana full stop, comma and middle dot.
(dolist (char '(?。 ?、 ?・))
  (modify-syntax-entry char "."))

;; Corner brackets: the opening bracket gets open-paren syntax matched
;; by the closing one, and the closing bracket gets close-paren syntax
;; matched by the opening one.
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
226e4119 217
4ed46869
KH
218;; Korean character set (KSC5601)
219
;; The whole charset belongs to category `h'.
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)

;; Code ranges (FROM . TO) that get symbol syntax.
(dolist (codes '((#x2121 . #x227E)
                 (#x2621 . #x277E)
                 (#x2830 . #x287E)
                 (#x2930 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_"
                     (car codes) (cdr codes)))

;; Per-range categories, given as (CATEGORY FROM TO).
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?G #x2521 #x257E)
                (?H #x2A21 #x2A7E)
                (?K #x2B21 #x2B7E)
                (?Y #x2C21 #x2C7E)))
  (map-charset-chars #'modify-category-entry 'korean-ksc5601
                     (nth 0 spec) (nth 1 spec) (nth 2 spec)))
4ed46869 233
c94ae9eb 234;; These are in more than one charset.
8f924df7
KH
;; PARENS is a flat string of bracket pairs: the character at each even
;; index opens, the one right after it closes.  Walk it two characters
;; at a time and make each member of a pair the matching delimiter of
;; the other.
(let* ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
                       "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
                       "()[]{}"))
       (limit (* 2 (/ (length parens) 2)))
       (index 0))
  (while (< index limit)
    (let ((opener (aref parens index))
          (closer (aref parens (1+ index))))
      (modify-syntax-entry opener (format "(%c" closer))
      (modify-syntax-entry closer (format ")%c" opener)))
    (setq index (+ index 2))))
d05cfa1f 244
c94ae9eb 245;; Arabic character set
6eba8645 246
c94ae9eb
DL
;; Every character of these charsets belongs to category `b'.
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))

;; So do these code point ranges.
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))
6eba8645 257
c94ae9eb
DL
258;; Cyrillic character set (ISO-8859-5)
259
260(modify-syntax-entry ?№ ".")
261
262;; Ethiopic character set
263
c6d251f0 264(modify-category-entry '(#x1200 . #x137c) ?e)
c94ae9eb
DL
265(let ((chars '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨ ? ? ? ? ? ?)))
266 (while chars
267 (modify-syntax-entry (car chars) ".")
268 (setq chars (cdr chars))))
269(map-charset-chars #'modify-category-entry 'ethiopic ?e)
270
271;; Hebrew character set (ISO-8859-8)
272
;; Punctuation syntax for the Hebrew marks below.
(dolist (char '(#x5be                   ; MAQAF
                #x5c0                   ; PASEQ
                #x5c3                   ; SOF PASUQ
                #x5f3                   ; GERESH
                #x5f4))                 ; GERSHAYIM
  (modify-syntax-entry char "."))
278
279;; Indian character set (IS 13194 and other Emacs original Indian charsets)
280
281(modify-category-entry '(#x901 . #x970) ?i)
282(map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
283(map-charset-chars #'modify-category-entry 'indian-2-column ?i)
d05cfa1f 284
6eba8645
KH
285;; Lao character set
286
abdaa411
DL
287(modify-category-entry '(#xe80 . #xeff) ?o)
288(map-charset-chars #'modify-category-entry 'lao ?o)
6eba8645 289
abdaa411 290(let ((deflist '(("ກ-ຮ" "w" ?0) ; consonant
e6d10035
KH
291 ("ະາຳຽເ-ໄ" "w" ?1) ; vowel base
292 ("ັິ-ືົໍ" "w" ?2) ; vowel upper
293 ("ຸູ" "w" ?3) ; vowel lower
8f924df7 294 ("່-໋" "w" ?4) ; tone mark
e6d10035
KH
295 ("ຼຽ" "w" ?9) ; semivowel lower
296 ("໐-໙" "w" ?6) ; digit
297 ("ຯໆ" "_" ?5) ; symbol
6eba8645
KH
298 ))
299 elm chars len syntax category to ch i)
300 (while deflist
301 (setq elm (car deflist))
302 (setq chars (car elm)
303 len (length chars)
304 syntax (nth 1 elm)
305 category (nth 2 elm)
306 i 0)
307 (while (< i len)
308 (if (= (aref chars i) ?-)
309 (setq i (1+ i)
4a027a0d
KH
310 to (aref chars i))
311 (setq ch (aref chars i)
6eba8645
KH
312 to ch))
313 (while (<= ch to)
269a5dd0
DL
314 (unless (string-equal syntax "w")
315 (modify-syntax-entry ch syntax))
6eba8645
KH
316 (modify-category-entry ch category)
317 (setq ch (1+ ch)))
4a027a0d 318 (setq i (1+ i)))
6eba8645
KH
319 (setq deflist (cdr deflist))))
320
4ed46869
KH
321;; Thai character set (TIS620)
322
abdaa411
DL
323(modify-category-entry '(#xe00 . #xe7f) ?t)
324(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
4ed46869
KH
325
326(let ((deflist '(;; chars syntax category
e6d10035
KH
327 ("ก-รลว-ฮ" "w" ?0) ; consonant
328 ("ฤฦะาำเ-ๅ" "w" ?1) ; vowel base
329 ("ัิ-ื็๎" "w" ?2) ; vowel upper
330 ("ุ-ฺ" "w" ?3) ; vowel lower
8f924df7 331 ("่-ํ" "w" ?4) ; tone mark
e6d10035
KH
332 ("๐-๙" "w" ?6) ; digit
333 ("ฯๆ฿๏๚๛" "_" ?5) ; symbol
4ed46869
KH
334 ))
335 elm chars len syntax category to ch i)
9395eb7c
KH
336 (while deflist
337 (setq elm (car deflist))
338 (setq chars (car elm)
339 len (length chars)
340 syntax (nth 1 elm)
341 category (nth 2 elm)
342 i 0)
343 (while (< i len)
344 (if (= (aref chars i) ?-)
345 (setq i (1+ i)
4a027a0d
KH
346 to (aref chars i))
347 (setq ch (aref chars i)
9395eb7c
KH
348 to ch))
349 (while (<= ch to)
269a5dd0
DL
350 (unless (string-equal syntax "w")
351 (modify-syntax-entry ch syntax))
9395eb7c
KH
352 (modify-category-entry ch category)
353 (setq ch (1+ ch)))
4a027a0d 354 (setq i (1+ i)))
9395eb7c
KH
355 (setq deflist (cdr deflist))))
356
357;; Tibetan character set
358
abdaa411
DL
359(modify-category-entry '(#xf00 . #xfff) ?q)
360(map-charset-chars #'modify-category-entry 'tibetan ?q)
361(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
9395eb7c
KH
362
363(let ((deflist '(;; chars syntax category
725d7c92
DL
364 ("ཀ-ཀྵཪ" "w" ?0) ; consonant
365 ("ྐ-ྐྵྺྻྼ" "w" ?0) ;
e6d10035
KH
366 ("-" "w" ?0) ;
367 ("-" "w" ?0) ;
725d7c92
DL
368 ("ིེཻོཽྀ" "w" ?2) ; upper vowel
369 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier
370 ("྄ཱུ༙༵༷" "w" ?3) ; lowel vowel/modifier
8f924df7 371 ("཰" "w" ?3) ; invisible vowel a
725d7c92
DL
372 ("༠-༩༪-༳" "w" ?6) ; digit
373 ("་།-༒༔ཿ" "." ?|) ; line-break char
374 ("་།༏༐༑༔ཿ" "." ?|) ;
375 ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition
376 ("་།༏༐༑༔ཿ" "." ?>) ;
377 ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition
378 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
9395eb7c
KH
379 ))
380 elm chars len syntax category to ch i)
4ed46869
KH
381 (while deflist
382 (setq elm (car deflist))
383 (setq chars (car elm)
384 len (length chars)
385 syntax (nth 1 elm)
386 category (nth 2 elm)
387 i 0)
388 (while (< i len)
389 (if (= (aref chars i) ?-)
390 (setq i (1+ i)
4a027a0d
KH
391 to (aref chars i))
392 (setq ch (aref chars i)
4ed46869
KH
393 to ch))
394 (while (<= ch to)
269a5dd0
DL
395 (unless (string-equal syntax "w")
396 (modify-syntax-entry ch syntax))
4ed46869
KH
397 (modify-category-entry ch category)
398 (setq ch (1+ ch)))
4a027a0d 399 (setq i (1+ i)))
4ed46869
KH
400 (setq deflist (cdr deflist))))
401
402;; Vietnamese character set
403
abdaa411
DL
404;; To make a word with Latin characters
405(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
406(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
407
408(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
409(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
4ed46869 410
e5dd1155
KH
;; For every code 32..127, the character with that code in
;; `vietnamese-viscii-upper' and the one with the same code in
;; `vietnamese-viscii-lower' are registered as a case pair in the
;; standard case table.  Whenever `encode-char' yields a `ucs' code for
;; either of them, that code point is also put in category `v'.
(let ((case-table (standard-case-table)))
  (dotimes (offset 96)
    (let* ((code (+ 32 offset))
           (upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower case-table)
      (when upper-ucs
        (modify-category-entry upper-ucs ?v))
      (when lower-ucs
        (modify-category-entry lower-ucs ?v)))))
423
c94ae9eb
DL
424
425;; Latin
426
427(modify-category-entry '(#x80 . #x024F) ?l)
d05cfa1f 428
85ef8ece
KH
429(let ((tbl (standard-case-table)) c)
430
431;; In some languages, U+0049 LATIN CAPITAL LETTER I and U+0131 LATIN
432;; SMALL LETTER DOTLESS I make a case pair, and so do U+0130 LATIN
433;; CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN SMALL LETTER I.
4c858c59 434;; See the Turkish language environment.
85ef8ece 435
4fb82d62
DL
436 ;; Latin-1
437
438 ;; Fixme: Some of the non-word syntaxes here perhaps should be
439 ;; reviewed. (Note that the following all implicitly have word
440 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of
441 ;; relating Unicode categories to Emacs syntax codes.
442 (set-case-syntax ?  " " tbl) ; dubious
443 (set-case-syntax ?¡ "." tbl)
444 (set-case-syntax ?¦ "_" tbl)
445 (set-case-syntax ?§ "." tbl)
446 (set-case-syntax ?© "_" tbl)
447 (set-case-syntax-delims 171 187 tbl) ; « »
448 (set-case-syntax ?¬ "_" tbl)
449 (set-case-syntax ?­ "_" tbl)
450 (set-case-syntax ?® "_" tbl)
451 (set-case-syntax ?° "_" tbl)
452 (set-case-syntax ?± "_" tbl)
453 (set-case-syntax ?µ "_" tbl)
454 (set-case-syntax ?· "_" tbl)
455 (set-case-syntax ?¼ "_" tbl)
456 (set-case-syntax ?½ "_" tbl)
457 (set-case-syntax ?¾ "_" tbl)
458 (set-case-syntax ?¿ "." tbl)
459 (let ((c 192))
460 (while (<= c 222)
461 (set-case-syntax-pair c (+ c 32) tbl)
462 (setq c (1+ c))))
463 (set-case-syntax ?× "_" tbl)
464 (set-case-syntax ?ß "w" tbl)
465 (set-case-syntax ?÷ "_" tbl)
466 ;; See below for ÿ.
85ef8ece 467
85ef8ece
KH
468 ;; Latin Extended-A, Latin Extended-B
469 (setq c #x0100)
470 (while (<= c #x0233)
85ef8ece
KH
471 (and (or (<= c #x012e)
472 (and (>= c #x014a) (<= c #x0177)))
d05cfa1f 473 (zerop (% c 2))
abdaa411 474 (set-case-syntax-pair c (1+ c) tbl))
85ef8ece
KH
475 (and (>= c #x013a)
476 (<= c #x0148)
477 (zerop (% c 2))
abdaa411 478 (set-case-syntax-pair (1- c) c tbl))
d05cfa1f 479 (setq c (1+ c)))
e6d10035
KH
480 (set-case-syntax-pair ?IJ ?ij tbl)
481 (set-case-syntax-pair ?Ĵ ?ĵ tbl)
482 (set-case-syntax-pair ?Ķ ?ķ tbl)
ff25b4fc 483 (set-case-syntax-pair ?Ÿ ?ÿ tbl)
e6d10035
KH
484 (set-case-syntax-pair ?Ź ?ź tbl)
485 (set-case-syntax-pair ?Ż ?ż tbl)
486 (set-case-syntax-pair ?Ž ?ž tbl)
d05cfa1f 487
269a5dd0 488 ;; Latin Extended-B
e6d10035
KH
489 (set-case-syntax-pair ?Ɓ ?ɓ tbl)
490 (set-case-syntax-pair ?Ƃ ?ƃ tbl)
491 (set-case-syntax-pair ?Ƅ ?ƅ tbl)
492 (set-case-syntax-pair ?Ɔ ?ɔ tbl)
493 (set-case-syntax-pair ?Ƈ ?ƈ tbl)
494 (set-case-syntax-pair ?Ɖ ?ɖ tbl)
495 (set-case-syntax-pair ?Ɗ ?ɗ tbl)
496 (set-case-syntax-pair ?Ƌ ?ƌ tbl)
497 (set-case-syntax-pair ?Ǝ ?ǝ tbl)
498 (set-case-syntax-pair ?Ə ?ə tbl)
499 (set-case-syntax-pair ?Ɛ ?ɛ tbl)
500 (set-case-syntax-pair ?Ƒ ?ƒ tbl)
501 (set-case-syntax-pair ?Ɠ ?ɠ tbl)
502 (set-case-syntax-pair ?Ɣ ?ɣ tbl)
503 (set-case-syntax-pair ?Ɩ ?ɩ tbl)
504 (set-case-syntax-pair ?Ɨ ?ɨ tbl)
505 (set-case-syntax-pair ?Ƙ ?ƙ tbl)
506 (set-case-syntax-pair ?Ɯ ?ɯ tbl)
507 (set-case-syntax-pair ?Ɲ ?ɲ tbl)
508 (set-case-syntax-pair ?Ɵ ?ɵ tbl)
509 (set-case-syntax-pair ?Ơ ?ơ tbl)
510 (set-case-syntax-pair ?Ƣ ?ƣ tbl)
511 (set-case-syntax-pair ?Ƥ ?ƥ tbl)
512 (set-case-syntax-pair ?Ʀ ?ʀ tbl)
513 (set-case-syntax-pair ?Ƨ ?ƨ tbl)
514 (set-case-syntax-pair ?Ʃ ?ʃ tbl)
515 (set-case-syntax-pair ?Ƭ ?ƭ tbl)
516 (set-case-syntax-pair ?Ʈ ?ʈ tbl)
517 (set-case-syntax-pair ?Ư ?ư tbl)
518 (set-case-syntax-pair ?Ʊ ?ʊ tbl)
519 (set-case-syntax-pair ?Ʋ ?ʋ tbl)
520 (set-case-syntax-pair ?Ƴ ?ƴ tbl)
521 (set-case-syntax-pair ?Ƶ ?ƶ tbl)
522 (set-case-syntax-pair ?Ʒ ?ʒ tbl)
523 (set-case-syntax-pair ?Ƹ ?ƹ tbl)
524 (set-case-syntax-pair ?Ƽ ?ƽ tbl)
525 (set-case-syntax-pair ?DŽ ?dž tbl)
526 (set-case-syntax-pair ?Dž ?dž tbl)
527 (set-case-syntax-pair ?LJ ?lj tbl)
528 (set-case-syntax-pair ?Lj ?lj tbl)
529 (set-case-syntax-pair ?NJ ?nj tbl)
530 (set-case-syntax-pair ?Nj ?nj tbl)
531 (set-case-syntax-pair ?Ǎ ?ǎ tbl)
532 (set-case-syntax-pair ?Ǐ ?ǐ tbl)
533 (set-case-syntax-pair ?Ǒ ?ǒ tbl)
534 (set-case-syntax-pair ?Ǔ ?ǔ tbl)
535 (set-case-syntax-pair ?Ǖ ?ǖ tbl)
536 (set-case-syntax-pair ?Ǘ ?ǘ tbl)
537 (set-case-syntax-pair ?Ǚ ?ǚ tbl)
538 (set-case-syntax-pair ?Ǜ ?ǜ tbl)
539 (set-case-syntax-pair ?Ǟ ?ǟ tbl)
540 (set-case-syntax-pair ?Ǡ ?ǡ tbl)
541 (set-case-syntax-pair ?Ǣ ?ǣ tbl)
542 (set-case-syntax-pair ?Ǥ ?ǥ tbl)
543 (set-case-syntax-pair ?Ǧ ?ǧ tbl)
544 (set-case-syntax-pair ?Ǩ ?ǩ tbl)
545 (set-case-syntax-pair ?Ǫ ?ǫ tbl)
546 (set-case-syntax-pair ?Ǭ ?ǭ tbl)
547 (set-case-syntax-pair ?Ǯ ?ǯ tbl)
269a5dd0 548 ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
e6d10035
KH
549 (set-case-syntax-pair ?DZ ?dz tbl)
550 (set-case-syntax-pair ?Dz ?dz tbl)
551 (set-case-syntax-pair ?Ǵ ?ǵ tbl)
552 (set-case-syntax-pair ?Ƕ ?ƕ tbl)
553 (set-case-syntax-pair ?Ƿ ?ƿ tbl)
554 (set-case-syntax-pair ?Ǹ ?ǹ tbl)
555 (set-case-syntax-pair ?Ǻ ?ǻ tbl)
556 (set-case-syntax-pair ?Ǽ ?ǽ tbl)
557 (set-case-syntax-pair ?Ǿ ?ǿ tbl)
558 (set-case-syntax-pair ?Ȁ ?ȁ tbl)
559 (set-case-syntax-pair ?Ȃ ?ȃ tbl)
560 (set-case-syntax-pair ?Ȅ ?ȅ tbl)
561 (set-case-syntax-pair ?Ȇ ?ȇ tbl)
562 (set-case-syntax-pair ?Ȉ ?ȉ tbl)
563 (set-case-syntax-pair ?Ȋ ?ȋ tbl)
564 (set-case-syntax-pair ?Ȍ ?ȍ tbl)
565 (set-case-syntax-pair ?Ȏ ?ȏ tbl)
566 (set-case-syntax-pair ?Ȑ ?ȑ tbl)
567 (set-case-syntax-pair ?Ȓ ?ȓ tbl)
568 (set-case-syntax-pair ?Ȕ ?ȕ tbl)
569 (set-case-syntax-pair ?Ȗ ?ȗ tbl)
570 (set-case-syntax-pair ?Ș ?ș tbl)
571 (set-case-syntax-pair ?Ț ?ț tbl)
572 (set-case-syntax-pair ?Ȝ ?ȝ tbl)
573 (set-case-syntax-pair ?Ȟ ?ȟ tbl)
574 (set-case-syntax-pair ?Ȣ ?ȣ tbl)
575 (set-case-syntax-pair ?Ȥ ?ȥ tbl)
576 (set-case-syntax-pair ?Ȧ ?ȧ tbl)
577 (set-case-syntax-pair ?Ȩ ?ȩ tbl)
578 (set-case-syntax-pair ?Ȫ ?ȫ tbl)
579 (set-case-syntax-pair ?Ȭ ?ȭ tbl)
580 (set-case-syntax-pair ?Ȯ ?ȯ tbl)
581 (set-case-syntax-pair ?Ȱ ?ȱ tbl)
582 (set-case-syntax-pair ?Ȳ ?ȳ tbl)
269a5dd0 583
85ef8ece 584 ;; Latin Extended Additional
;; `tbl' and `c' are bound by the enclosing `let'.
;; The whole range is Latin.  Case pairs are (even, even + 1), except
;; for the even code points strictly between #x1e94 and #x1ea0.
(modify-category-entry '(#x1e00 . #x1ef9) ?l)
(setq c #x1e00)
(while (<= c #x1ef9)
  ;; C is always even here because the loop advances by two.
  (unless (and (> c #x1e94) (< c #x1ea0))
    (set-case-syntax-pair c (1+ c) tbl))
  (setq c (+ c 2)))
592
85ef8ece 593 ;; Greek
;; `tbl' and `c' are bound by the enclosing `let'.
(modify-category-entry '(#x0370 . #x03ff) ?g)
(setq c #x0370)
(while (<= c #x03ff)
  (cond
   ;; #x0391..#x03ab except #x03a2: the partner is 32 code points above.
   ((and (<= #x0391 c #x03ab) (/= c #x03a2))
    (set-case-syntax-pair c (+ c 32) tbl))
   ;; Even code points in #x03da..#x03ee: the partner is the next one.
   ((and (<= #x03da c #x03ee) (zerop (logand c 1)))
    (set-case-syntax-pair c (1+ c) tbl)))
  (setq c (1+ c)))
e6d10035
KH
605 (set-case-syntax-pair ?Ά ?ά tbl)
606 (set-case-syntax-pair ?Έ ?έ tbl)
607 (set-case-syntax-pair ?Ή ?ή tbl)
608 (set-case-syntax-pair ?Ί ?ί tbl)
609 (set-case-syntax-pair ?Ό ?ό tbl)
610 (set-case-syntax-pair ?Ύ ?ύ tbl)
611 (set-case-syntax-pair ?Ώ ?ώ tbl)
d05cfa1f 612
269a5dd0
DL
613 ;; Armenian
614 (setq c #x531)
615 (while (<= c #x556)
abdaa411 616 (set-case-syntax-pair c (+ c #x30) tbl)
269a5dd0
DL
617 (setq c (1+ c)))
618
85ef8ece 619 ;; Greek Extended
;; `tbl' and `c' are bound by the enclosing `let'.
;; The whole block is Greek.  For code points up to #x1fa7 whose low
;; nibble is 0..7, the capital letter sits eight code points above the
;; small one.
(modify-category-entry '(#x1f00 . #x1fff) ?g)
(setq c #x1f00)
(while (<= c #x1fff)
  (and (<= (logand c #x000f) 7)
       (<= c #x1fa7)
       (not (memq c '(#x1f50 #x1f52 #x1f54 #x1f56)))
       ;; Skip #x1f70..#x1f77: the code points eight above them are not
       ;; their capitals.  The mask keeps the high nibble in place, so
       ;; it has to be compared with #x70, not 7.
       (/= (logand c #x00f0) #x70)
       (set-case-syntax-pair (+ c 8) c tbl))
  (setq c (1+ c)))
e6d10035
KH
629 (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
630 (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
631 (set-case-syntax-pair ?Ὰ ?ὰ tbl)
632 (set-case-syntax-pair ?Ά ?ά tbl)
633 (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
634 (set-case-syntax-pair ?Ὲ ?ὲ tbl)
635 (set-case-syntax-pair ?Έ ?έ tbl)
636 (set-case-syntax-pair ?Ὴ ?ὴ tbl)
637 (set-case-syntax-pair ?Ή ?ή tbl)
638 (set-case-syntax-pair ?ῌ ?ῃ tbl)
639 (set-case-syntax-pair ?Ῐ ?ῐ tbl)
640 (set-case-syntax-pair ?Ῑ ?ῑ tbl)
641 (set-case-syntax-pair ?Ὶ ?ὶ tbl)
642 (set-case-syntax-pair ?Ί ?ί tbl)
643 (set-case-syntax-pair ?Ῠ ?ῠ tbl)
644 (set-case-syntax-pair ?Ῡ ?ῡ tbl)
645 (set-case-syntax-pair ?Ὺ ?ὺ tbl)
646 (set-case-syntax-pair ?Ύ ?ύ tbl)
647 (set-case-syntax-pair ?Ῥ ?ῥ tbl)
648 (set-case-syntax-pair ?Ὸ ?ὸ tbl)
649 (set-case-syntax-pair ?Ό ?ό tbl)
650 (set-case-syntax-pair ?Ὼ ?ὼ tbl)
651 (set-case-syntax-pair ?Ώ ?ώ tbl)
652 (set-case-syntax-pair ?ῼ ?ῳ tbl)
d05cfa1f 653
85ef8ece 654 ;; cyrillic
;; `tbl' and `c' are bound by the enclosing `let'.
(modify-category-entry '(#x0400 . #x04FF) ?y)
(setq c #x0400)
(while (<= c #x04ff)
  (cond
   ;; #x0400..#x040f: the partner is 80 code points above.
   ((<= #x0400 c #x040f)
    (set-case-syntax-pair c (+ c 80) tbl))
   ;; #x0410..#x042f: the partner is 32 code points above.
   ((<= #x0410 c #x042f)
    (set-case-syntax-pair c (+ c 32) tbl))
   ;; Even code points in three further ranges: the partner is the
   ;; next code point.
   ((and (zerop (logand c 1))
         (or (<= #x0460 c #x0480)
             (<= #x048c c #x04be)
             (<= #x04d0 c #x04f4)))
    (set-case-syntax-pair c (1+ c) tbl)))
  (setq c (1+ c)))
e6d10035
KH
670 (set-case-syntax-pair ?Ӂ ?ӂ tbl)
671 (set-case-syntax-pair ?Ӄ ?ӄ tbl)
672 (set-case-syntax-pair ?Ӈ ?ӈ tbl)
673 (set-case-syntax-pair ?Ӌ ?ӌ tbl)
674 (set-case-syntax-pair ?Ӹ ?ӹ tbl)
d05cfa1f 675
85ef8ece
KH
676 ;; general punctuation
;; `tbl' and `c' are bound by the enclosing `let'.
;; Walk #x2000..#x206f once.  Each segment is (LAST . SYNTAX): every
;; code point from the current position through LAST gets SYNTAX, and
;; the next segment continues right after LAST.
(setq c #x2000)
(dolist (segment '((#x200b . " ")
                   (#x200F . ".")
                   ;; Fixme: These aren't all right:
                   (#x2016 . "_")
                   ;; Punctuation syntax for quotation marks (like `)
                   (#x201f . ".")
                   ;; Fixme: These aren't all right:
                   (#x2027 . "_")
                   (#x206F . ".")))
  (while (<= c (car segment))
    (set-case-syntax c (cdr segment) tbl)
    (setq c (1+ c))))
d05cfa1f 700
269a5dd0
DL
701 ;; Roman numerals
702 (setq c #x2160)
703 (while (<= c #x216f)
abdaa411 704 (set-case-syntax-pair c (+ c #x10) tbl)
269a5dd0
DL
705 (setq c (1+ c)))
706
4fb82d62
DL
707 ;; Fixme: The following blocks might be better as symbol rather than
708 ;; punctuation.
b427c97e
DL
709 ;; Arrows
;; `tbl' and `c' are bound by the enclosing `let'.
;; Walk #x2190..#x243f once.  Each segment is (LAST . SYNTAX) and starts
;; where the previous one stopped.
(setq c #x2190)
(dolist (segment '((#x21FF . ".")       ; Arrows
                   (#x22FF . ".")       ; Mathematical Operators
                   (#x23FF . ".")       ; Miscellaneous Technical
                   (#x243F . "_")))     ; Control Pictures
  (while (<= c (car segment))
    (set-case-syntax c (cdr segment) tbl)
    (setq c (1+ c))))
726
727 ;; Circled Latin
;; `tbl' and `c' are bound by the enclosing `let'.
;; Each entry is (FIRST LAST OFFSET): for every code point C in
;; FIRST..LAST, C and C + OFFSET become a case pair and both are put in
;; the Latin category.
(dolist (span '((#x24b6 #x24cf 26)      ; Circled Latin
                (#xff21 #xff3a #x20)))  ; Fullwidth Latin
  (setq c (nth 0 span))
  (while (<= c (nth 1 span))
    (let ((partner (+ c (nth 2 span))))
      (set-case-syntax-pair c partner tbl)
      (modify-category-entry c ?l)
      (modify-category-entry partner ?l))
    (setq c (1+ c))))
742
269a5dd0 743 ;; Combining diacritics
abdaa411 744 (modify-category-entry '(#x300 . #x362) ?^)
269a5dd0 745 ;; Combining marks
abdaa411 746 (modify-category-entry '(#x20d0 . #x20e3) ?^)
269a5dd0
DL
747
748 ;; Fixme: syntax for symbols &c
749 )
6b61353c
KH
750
;; Matching bracket pairs, written as explicit (OPEN . CLOSE) code
;; points so that the table does not depend on how this file's bytes
;; are decoded.  Each OPEN gets open-paren syntax matched by CLOSE, and
;; each CLOSE gets close-paren syntax matched by OPEN.
(let ((pairs
       '((#x2045 . #x2046)
         (#x207D . #x207E)
         (#x208D . #x208E)
         (#x2329 . #x232A)
         (#x23B4 . #x23B5)
         (#x2768 . #x2769)
         (#x276A . #x276B)
         (#x276C . #x276D)
         (#x2770 . #x2771)
         (#x2772 . #x2773)
         (#x2774 . #x2775)
         (#x27E6 . #x27E7)
         (#x27E8 . #x27E9)
         (#x27EA . #x27EB)
         (#x2983 . #x2984)
         (#x2985 . #x2986)
         (#x2987 . #x2988)
         (#x2989 . #x298A)
         (#x298B . #x298C)
         (#x298D . #x298E)
         (#x298F . #x2990)
         (#x2991 . #x2992)
         (#x2993 . #x2994)
         (#x2995 . #x2996)
         (#x2997 . #x2998)
         (#x29FC . #x29FD)
         (#x3008 . #x3009)
         (#x300A . #x300B)
         (#x300C . #x300D)
         (#x300E . #x300F)
         (#x3010 . #x3011)
         (#x3014 . #x3015)
         (#x3016 . #x3017)
         (#x3018 . #x3019)
         (#x301A . #x301B)
         (#xFD3E . #xFD3F)
         (#xFE35 . #xFE36)
         (#xFE37 . #xFE38)
         (#xFE39 . #xFE3A)
         (#xFE3B . #xFE3C)
         (#xFE3D . #xFE3E)
         (#xFE3F . #xFE40)
         (#xFE41 . #xFE42)
         (#xFE43 . #xFE44)
         (#xFE59 . #xFE5A)
         (#xFE5B . #xFE5C)
         (#xFE5D . #xFE5E)
         (#xFF08 . #xFF09)
         (#xFF3B . #xFF3D)
         (#xFF5B . #xFF5D)
         (#xFF5F . #xFF60)
         (#xFF62 . #xFF63))))
  (dolist (elt pairs)
    (modify-syntax-entry (car elt) (string ?\( (cdr elt)))
    (modify-syntax-entry (cdr elt) (string ?\) (car elt)))))
808
4ed46869 809\f
777cfce6 810;; For each character set, put the information of the most proper
aaa9f206 811;; coding system to encode it by `preferred-coding-system' property.
777cfce6 812
abdaa411 813;; Fixme: should this be junked?
777cfce6
KH
;; Each entry is (CHARSET . CODING-SYSTEM); CODING-SYSTEM is stored as
;; the `preferred-coding-system' property of CHARSET.
(dolist (entry '((latin-iso8859-1 . iso-latin-1)
                 (latin-iso8859-2 . iso-latin-2)
                 (latin-iso8859-3 . iso-latin-3)
                 (latin-iso8859-4 . iso-latin-4)
                 (thai-tis620 . thai-tis620)
                 (greek-iso8859-7 . greek-iso-8bit)
                 (arabic-iso8859-6 . iso-2022-7bit)
                 (hebrew-iso8859-8 . hebrew-iso-8bit)
                 (katakana-jisx0201 . japanese-shift-jis)
                 (latin-jisx0201 . japanese-shift-jis)
                 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
                 (latin-iso8859-9 . iso-latin-5)
                 (japanese-jisx0208-1978 . iso-2022-jp)
                 (chinese-gb2312 . cn-gb-2312)
                 (japanese-jisx0208 . iso-2022-jp)
                 (korean-ksc5601 . iso-2022-kr)
                 (japanese-jisx0212 . iso-2022-jp)
                 (chinese-cns11643-1 . iso-2022-cn)
                 (chinese-cns11643-2 . iso-2022-cn)
                 (chinese-big5-1 . chinese-big5)
                 (chinese-big5-2 . chinese-big5)
                 (chinese-sisheng . iso-2022-7bit)
                 (ipa . iso-2022-7bit)
                 (vietnamese-viscii-lower . vietnamese-viscii)
                 (vietnamese-viscii-upper . vietnamese-viscii)
                 (arabic-digit . iso-2022-7bit)
                 (arabic-1-column . iso-2022-7bit)
                 (lao . lao)
                 (arabic-2-column . iso-2022-7bit)
                 (indian-is13194 . devanagari)
                 (indian-glyph . devanagari)
                 (tibetan-1-column . tibetan)
                 (ethiopic . iso-2022-7bit)
                 (chinese-cns11643-3 . iso-2022-cn)
                 (chinese-cns11643-4 . iso-2022-cn)
                 (chinese-cns11643-5 . iso-2022-cn)
                 (chinese-cns11643-6 . iso-2022-cn)
                 (chinese-cns11643-7 . iso-2022-cn)
                 (indian-2-column . devanagari)
                 (tibetan . tibetan)
                 (latin-iso8859-14 . iso-latin-8)
                 (latin-iso8859-15 . iso-latin-9)))
  (let ((charset (car entry))
        (coding-system (cdr entry)))
    (put-charset-property charset 'preferred-coding-system coding-system)))
df0415c5
KH
859
860\f
;; Charsets that should invoke auto-filling.  The `auto-fill-chars'
;; update below is currently commented out, so the only effect is to
;; put the `nospace-between-words' property on each charset.
(dolist (charset '(katakana-jisx0201
                   japanese-jisx0208 japanese-jisx0212
                   chinese-gb2312 chinese-big5-1 chinese-big5-2))
  ;;(aset auto-fill-chars (make-char charset) t)
  (put-charset-property charset 'nospace-between-words t))
777cfce6 871
55bd52ea 872\f
ed0cb465
KH
;; CJK double width characters: give every code point in the ranges
;; below a width of 2 in `char-width-table'.
(dolist (range '((#x1100 . #x11FF)
                 (#x2E80 . #x9FAF)
                 (#xAC00 . #xD7AF)
                 (#xF900 . #xFAFF)
                 (#xFE30 . #xFE4F)
                 (#xFF00 . #xFF5F)
                 (#xFFE0 . #xFFEF)
                 (#x20000 . #x2AFFF)
                 (#x2F800 . #x2FFFF)))
  ;; RANGE is already a (FROM . TO) cons, the form
  ;; `set-char-table-range' takes for a range of characters.
  (set-char-table-range char-width-table range 2))
439f7264
DL
887;; Fixme: Doing this affects non-CJK characters through unification,
888;; but presumably CJK users expect those characters to be
889;; double-width when using these charsets.
890;; (map-charset-chars
891;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
892;; 'japanese-jisx0208)
893;; (map-charset-chars
894;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
895;; 'japanese-jisx0212)
896;; (map-charset-chars
897;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
898;; 'japanese-jisx0213-1)
899;; (map-charset-chars
900;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
901;; 'japanese-jisx0213-2)
902;; (map-charset-chars
903;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
904;; 'korean-ksc5601)
173f18ce
DL
905
;; Other double width: every character of the charsets below gets a
;; width of 2 in `char-width-table'.
(dolist (charset '(ethiopic tibetan indian-2-column arabic-2-column))
  (map-charset-chars
   ;; The second argument passed by `map-charset-chars' is unused.
   (lambda (range _arg)
     (set-char-table-range char-width-table range 2))
   charset))
777cfce6 919
87a39edb
DL
;; Optimize the standard case, width, category and syntax tables, in
;; that order, now that the entries above have been made.
(dolist (table (list (standard-case-table)
                     char-width-table
                     (standard-category-table)
                     (standard-syntax-table)))
  (optimize-char-table table))
924
b427c97e
DL
;; Assign a script symbol to each range below in `char-script-table',
;; and store the list of distinct scripts, in order of first
;; appearance, in extra slot 0 of that table.
;; The Unicode blocks actually extend past some of these ranges with
;; undefined codepoints.
(let ((scripts nil))
  (dolist (entry '((#x0000 #x007F latin)
                   (#x00A0 #x036F latin)
                   (#x0370 #x03E1 greek)
                   (#x03E2 #x03EF coptic)
                   (#x03F0 #x03F3 greek)
                   (#x0400 #x04FF cyrillic)
                   (#x0530 #x058F armenian)
                   (#x0590 #x05FF hebrew)
                   (#x0600 #x06FF arabic)
                   (#x0700 #x074F syriac)
                   (#x0780 #x07BF thaana)
                   (#x0900 #x097F devanagari)
                   (#x0980 #x09FF bengali)
                   (#x0A00 #x0A7F gurmukhi)
                   (#x0A80 #x0AFF gujarati)
                   (#x0B00 #x0B7F oriya)
                   (#x0B80 #x0BFF tamil)
                   (#x0C00 #x0C7F telugu)
                   (#x0C80 #x0CFF kannada)
                   (#x0D00 #x0D7F malayalam)
                   (#x0D80 #x0DFF sinhala)
                   (#x0E00 #x0E5F thai)
                   (#x0E80 #x0EDF lao)
                   (#x0F00 #x0FFF tibetan)
                   (#x1000 #x105F myanmar)
                   (#x10A0 #x10FF georgian)
                   (#x1100 #x11FF hangul)
                   (#x1200 #x137F ethiopic)
                   (#x13A0 #x13FF cherokee)
                   (#x1400 #x167F canadian-aboriginal)
                   (#x1680 #x169F ogham)
                   (#x16A0 #x16FF runic)
                   (#x1780 #x17FF khmer)
                   (#x1800 #x18AF mongolian)
                   (#x1E00 #x1EFF latin)
                   (#x1F00 #x1FFF greek)
                   (#x2000 #x27FF symbol)
                   (#x2800 #x28FF braille)
                   (#x2E80 #x2FDF han)
                   (#x2FF0 #x2FFF ideographic-description)
                   (#x3000 #x303F cjk-misc)
                   (#x3040 #x30FF kana)
                   (#x3100 #x312F bopomofo)
                   (#x3130 #x318F hangul)
                   (#x3190 #x319F kanbun)
                   (#x31A0 #x31BF bopomofo)
                   (#x3400 #x9FAF han)
                   (#xA000 #xA4CF yi)
                   (#xAC00 #xD7AF hangul)
                   (#xF900 #xFAFF han)
                   (#xFB1D #xFB4F hebrew)
                   (#xFB50 #xFDFF arabic)
                   (#xFE70 #xFEFC arabic)
                   (#xFF00 #xFF5F cjk-misc)
                   (#xFF61 #xFF9F kana)
                   (#xFFE0 #xFFE6 cjk-misc)
                   (#x20000 #x2AFFF han)
                   (#x2F800 #x2FFFF han)))
    ;; Each ENTRY is (FROM TO SCRIPT).
    (let ((from (car entry))
          (to (nth 1 entry))
          (script (nth 2 entry)))
      (set-char-table-range char-script-table (cons from to) script)
      ;; Collect each script once; the list is built in reverse.
      (unless (memq script scripts)
        (setq scripts (cons script scripts)))))
  (set-char-table-extra-slot char-script-table 0 (nreverse scripts)))
993
;; Mark every character of the `tibetan' charset as script `tibetan'
;; in `char-script-table'.
(map-charset-chars
 ;; The second argument passed by `map-charset-chars' is unused.
 (lambda (chars _arg)
   (set-char-table-range char-script-table chars 'tibetan))
 'tibetan)
998
e7259832
KH
999\f
1000;;; Setting word boundary.
1001
(defun next-word-boundary-han (pos limit)
  "Return the word boundary found from POS for a run of Han characters.
If POS is not greater than LIMIT, scan forward from POS over a run of
category `C' characters and then over a run of category `H'
characters (each run is optional), and return the position reached.
LIMIT is not consulted during this forward scan.
Otherwise scan backward: while POS is greater than LIMIT and the
character before POS has script `han' in `char-script-table', step
back one character.  Return the resulting position."
  (if (<= pos limit)
      (save-excursion
        (goto-char pos)
        ;; Move only when the match succeeds.  After a failed
        ;; `looking-at' the match data is stale (or absent), so
        ;; `match-end' must not be used.
        (if (looking-at "\\cC+")
            (goto-char (match-end 0)))
        (if (looking-at "\\cH+")
            (goto-char (match-end 0)))
        (point))
    (while (and (> pos limit)
                (eq (aref char-script-table (char-after (1- pos))) 'han))
      (setq pos (1- pos)))
    pos))
1015
(defun next-word-boundary-kana (pos limit)
  "Return the word boundary found from POS for a kana word.
If POS is not greater than LIMIT, scan forward from POS over, in
order, a run of category `K' characters, a run of category `H'
characters and a run of category `k' characters (each run is
optional), and return the position reached.  LIMIT is not consulted
during this forward scan.

Otherwise scan backward, never moving before LIMIT.  If the character
at POS has category `K' or `k', step back over preceding characters of
those categories.  Otherwise step back over preceding category `H'
characters; then, if the character before the resulting position has
category `C', `K' or `A' (checked in that order), step back over it
and over any further preceding characters of that same category.
Return the resulting position."
  (if (<= pos limit)
      (save-excursion
        (goto-char pos)
        (if (looking-at "\\cK+")
            (goto-char (match-end 0)))
        (if (looking-at "\\cH+")
            (goto-char (match-end 0)))
        (if (looking-at "\\ck+")
            (goto-char (match-end 0)))
        (point))
    (let ((category-set (char-category-set (char-after pos)))
          category)
      (if (or (aref category-set ?K) (aref category-set ?k))
          (while (and (> pos limit)
                      (setq category-set
                            (char-category-set (char-after (1- pos))))
                      (or (aref category-set ?K) (aref category-set ?k)))
            (setq pos (1- pos)))
        (while (and (> pos limit)
                    (aref (setq category-set
                                (char-category-set (char-after (1- pos))))
                          ?H))
          (setq pos (1- pos)))
        ;; If the scan above stopped because POS reached LIMIT,
        ;; CATEGORY-SET does not describe the character before POS and
        ;; stepping back would cross LIMIT.  Continue only while there
        ;; is still room; in that case CATEGORY-SET is the category set
        ;; of the character just before POS.
        (when (> pos limit)
          (setq category (cond ((aref category-set ?C) ?C)
                               ((aref category-set ?K) ?K)
                               ((aref category-set ?A) ?A)))
          (when category
            (setq pos (1- pos))
            (while (and (> pos limit)
                        (aref (char-category-set (char-after (1- pos)))
                              category))
              (setq pos (1- pos))))))
      pos)))
1049
;; Install the word-boundary functions: every `char-script-table' entry
;; whose script is `han' or `kana' gets the corresponding function in
;; `find-word-boundary-function-table'.  Other scripts are left alone.
(map-char-table
 (lambda (key script)
   (let ((finder (cond ((eq script 'han) #'next-word-boundary-han)
                       ((eq script 'kana) #'next-word-boundary-kana))))
     (if finder
         (set-char-table-range find-word-boundary-function-table
                               key finder))))
 char-script-table)
1059
;; Category pairs controlling word boundaries.  Each element is a cons
;; of two category mnemonics.
(setq word-combining-categories
      '((?l . ?l))

      word-separating-categories        ; (2-byte character sets)
      '((?A . ?K)                       ; Alpha numeric - Katakana
        (?A . ?C)                       ; Alpha numeric - Chinese
        (?H . ?A)                       ; Hiragana - Alpha numeric
        (?H . ?K)                       ; Hiragana - Katakana
        (?H . ?C)                       ; Hiragana - Chinese
        (?K . ?A)                       ; Katakana - Alpha numeric
        (?K . ?C)                       ; Katakana - Chinese
        (?C . ?A)                       ; Chinese - Alpha numeric
        (?C . ?K)                       ; Chinese - Katakana
        ))
1074
777cfce6 1075;;; Local Variables:
e6d10035 1076;;; coding: utf-8-emacs
777cfce6
KH
1077;;; End:
1078
6b61353c 1079;;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
60370d40 1080;;; characters.el ends here