Add some non-word syntax cases.
[bpt/emacs.git] / lisp / international / characters.el
1 ;;; characters.el --- set syntax and category for multibyte characters
2
3 ;; Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
6 ;; Copyright (C) 2001, 2002
7 ;; National Institute of Advanced Industrial Science and Technology (AIST)
8 ;; Registration Number H13PRO009
9
10 ;; Keywords: multibyte character, character set, syntax, category
11
12 ;; This file is part of GNU Emacs.
13
14 ;; GNU Emacs is free software; you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by
16 ;; the Free Software Foundation; either version 2, or (at your option)
17 ;; any later version.
18
19 ;; GNU Emacs is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;; GNU General Public License for more details.
23
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with GNU Emacs; see the file COPYING. If not, write to the
26 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 ;; Boston, MA 02111-1307, USA.
28
29 ;;; Commentary:
30
31 ;; This file contains multibyte characters. Save this file always in
32 ;; the coding system `utf-8-emacs' (see the Local Variables at the end).
33
34 ;; This file does not define the syntax for Latin-N character sets;
35 ;; those are defined by the files latin-N.el.
36
37 ;;; Code:
38
39 ;;; Predefined categories.
40
41 ;; For each character set.
42
;; Each entry is (MNEMONIC . DOCSTRING); the categories are registered
;; in the order listed.
(dolist (spec '((?a . "ASCII")
                (?l . "Latin")
                (?t . "Thai")
                (?g . "Greek")
                (?b . "Arabic")
                (?w . "Hebrew")
                (?y . "Cyrillic")
                (?k . "Japanese katakana")
                (?r . "Japanese roman")
                (?c . "Chinese")
                (?j . "Japanese")
                (?h . "Korean")
                (?e . "Ethiopic (Ge'ez)")
                (?v . "Vietnamese")
                (?i . "Indian")
                (?o . "Lao")
                (?q . "Tibetan")))
  (define-category (car spec) (cdr spec)))
60
61 ;; For each group (row) of 2-byte character sets.
62
;; Each entry is (MNEMONIC . DOCSTRING), registered in the order listed.
(dolist (spec '((?A . "Alpha-numeric characters of 2-byte character sets")
                (?C . "Chinese (Han) characters of 2-byte character sets")
                (?G . "Greek characters of 2-byte character sets")
                (?H . "Japanese Hiragana characters of 2-byte character sets")
                (?K . "Japanese Katakana characters of 2-byte character sets")
                (?N . "Korean Hangul characters of 2-byte character sets")
                (?Y . "Cyrillic characters of 2-byte character sets")
                (?I . "Indian Glyphs")))
  (define-category (car spec) (cdr spec)))
71
72 ;; For phonetic classifications.
73
;; The digits ?0 .. ?9 serve as mnemonics; each entry is
;; (MNEMONIC . DOCSTRING), registered in the order listed.
(dolist (spec '((?0 . "consonant")
                (?1 . "base (independent) vowel")
                (?2 . "upper diacritical mark (including upper vowel)")
                (?3 . "lower diacritical mark (including lower vowel)")
                (?4 . "tone mark")
                (?5 . "symbol")
                (?6 . "digit")
                (?7 . "vowel-modifying diacritical mark")
                (?8 . "vowel-signs")
                (?9 . "semivowel lower")))
  (define-category (car spec) (cdr spec)))
84
;; For filling.
(define-category ?| "While filling, we can break a line at this character.")

;; For indentation calculation.
;; The mnemonic is the space character.  It is written with an explicit
;; backslash escape so that it does not depend on trailing whitespace
;; surviving at the end of the line.
(define-category ?\  ; i.e. the space character (code 32)
  "This character counts as a space for indentation purposes.")

;; Keep the following for `kinsoku' processing. See comments in
;; kinsoku.el.
(define-category ?> "A character which can't be placed at beginning of line.")
(define-category ?< "A character which can't be placed at end of line.")

;; Combining
(define-category ?^ "Combining diacritic or mark")
99 \f
100 ;;; Setting syntax and category.
101
102 ;; ASCII
103
104 ;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
;; Codes 32..127 get both mnemonics, `a' first and then `l'.
(dolist (cat '(?a ?l))
  (modify-category-entry '(32 . 127) cat))
107
108 ;; Arabic character set
109
;; Every character of the Arabic charsets, and the listed Unicode code
;; point ranges, are put in category `b'.
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))
120
121 ;; Chinese characters (Unicode)
;; Both ranges get, in this order, the categories `C', `c' and `|'
;; (a line may be broken at these characters while filling).
(dolist (range '((#x3400 . #x9FAF)
                 (#xF900 . #xFAFF)))
  (dolist (cat '(?C ?c ?|))
    (modify-category-entry range cat)))
128
129 ;; Chinese character set (GB2312)
130
;; Code ranges #x2121-#x217E, #x2221-#x227E and #x2921-#x297E get
;; symbol syntax.
(dolist (row '((#x2121 . #x217E)
               (#x2221 . #x227E)
               (#x2921 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_"
                     (car row) (cdr row)))

;; Paired brackets, as (OPEN . CLOSE).  All the openers are set up
;; first, then all the closers, each naming its partner.
(let ((brackets '((?〔 . ?〕)
                  (?〈 . ?〉)
                  (?《 . ?》)
                  (?「 . ?」)
                  (?『 . ?』)
                  (?〖 . ?〗)
                  (?【 . ?】))))
  (dolist (pair brackets)
    (modify-syntax-entry (car pair) (string ?\( (cdr pair))))
  (dolist (pair brackets)
    (modify-syntax-entry (cdr pair) (string ?\) (car pair)))))

;; Categories for the whole charset ...
(dolist (cat '(?c ?|))
  (map-charset-chars #'modify-category-entry 'chinese-gb2312 cat))
;; ... and for code ranges of it, as (CATEGORY FROM TO).
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'chinese-gb2312
                     (car spec) (nth 1 spec) (nth 2 spec)))
159
160 ;; Chinese character set (BIG5)
161
;; Whole charset: `c'.  The listed code ranges: `C'.  Whole charset
;; again: `|'.
(map-charset-chars #'modify-category-entry 'big5 ?c)
(dolist (span '((#xA259 . #xA25F)
                (#xA440 . #xC67E)
                (#xC940 . #xF9DF)))
  (map-charset-chars #'modify-category-entry 'big5 ?C (car span) (cdr span)))
(map-charset-chars #'modify-category-entry 'big5 ?|)
167
168
169 ;; Chinese character set (CNS11643)
170
;; Every plane gets `c', `C' and `|'.  For plane 1 the `C' category is
;; restricted to the code range #x4421-#x7E7E; the other planes get it
;; for the whole charset.
(dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                 chinese-cns11643-7))
  (map-charset-chars #'modify-category-entry plane ?c)
  (apply #'map-charset-chars #'modify-category-entry plane ?C
         (and (eq plane 'chinese-cns11643-1)
              '(#x4421 #x7E7E)))
  (map-charset-chars #'modify-category-entry plane ?|))
179
180 ;; Cyrillic character set (ISO-8859-5)
181
182 (modify-syntax-entry ?№ ".")
183
184 ;; Ethiopic character set
185
186 (modify-category-entry '(#x1200 . #x137b) ?e)
187 (let ((chars '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨ ? ? ? ? ? ?)))
188 (while chars
189 (modify-syntax-entry (car chars) ".")
190 (setq chars (cdr chars))))
191 (map-charset-chars #'modify-category-entry 'ethiopic ?e)
192
193 ;; Hebrew character set (ISO-8859-8)
194
;; Each listed code point is given punctuation syntax.
(dolist (code '(#x5be                   ; MAQAF
                #x5c0                   ; PASEQ
                #x5c3                   ; SOF PASUQ
                #x5f3                   ; GERESH
                #x5f4))                 ; GERSHAYIM
  (modify-syntax-entry code "."))
200
201 ;; Indian character set (IS 13194 and other Emacs original Indian charsets)
202
203 (modify-category-entry '(#x901 . #x970) ?i)
204 (map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
205 (map-charset-chars #'modify-category-entry 'indian-2-column ?i)
206
207
208 ;; Japanese character set (JISX0201-kana, JISX0201-roman, JISX0208, JISX0212)
209
(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)

(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)

;; Each of these charsets gets `j', then `|'.
(dolist (charset '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212))
  (dolist (cat '(?j ?\|))
    (map-charset-chars #'modify-category-entry charset cat)))

;; Unicode equivalents of JISX0201-kana
(dolist (cat '(?k ?j ?\|))
  (modify-category-entry '(#xff61 . #xff9f) cat))

;; Katakana block
;; ?K is double width, ?k isn't specified
(dolist (cat '(?K ?\|))
  (modify-category-entry '(#x30a0 . #x309f) cat))

;; Hiragana block (#x3040 . #x309f): nothing is set at present.
;; ?H is actually defined to be double width
;;(modify-category-entry range ?H)
;;(modify-category-entry range ?\|)
236
237 ;; JISX0208
;; Code ranges #x2121-#x227E and #x2821-#x287E get symbol syntax.
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
;; The characters listed here are given word syntax.  The loop
;; variable itself must be passed on, otherwise only the first
;; character of the list would ever be changed.
(dolist (ch '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-syntax-entry ch "w"))
;; Paired delimiters: openers first, then closers, each naming its
;; partner.
;; NOTE(review): the first three pairs appear here as ASCII
;; characters; presumably the full-width forms are meant -- confirm
;; against the original file.
(modify-syntax-entry ?\( "()")
(modify-syntax-entry ?\[ "(]")
(modify-syntax-entry ?\{ "(}")
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\『 "(』")
(modify-syntax-entry ?\) ")(")
(modify-syntax-entry ?\] ")[")
(modify-syntax-entry ?\} "){")
(modify-syntax-entry ?\」 ")「")
(modify-syntax-entry ?\』 ")『")
253
;; Categories by code range, as (CATEGORY FROM TO).
(dolist (spec '((?A #x2321 #x237E)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'japanese-jisx0208
                     (car spec) (nth 1 spec) (nth 2 spec)))
(modify-category-entry ?ー ?K)
;; These two get both `K' and `H'.
(dolist (ch '(?゛ ?゜))
  (modify-category-entry ch ?K)
  (modify-category-entry ch ?H))
;; These get `C'.
(dolist (ch '(?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-category-entry ch ?C))
270
271 ;; JISX0212
272
273 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
274
275 ;; JISX0201-Kana
276
;; The three characters listed get punctuation syntax.
(dolist (ch '(?。 ?、 ?・))
  (modify-syntax-entry ch "."))

;; Corner brackets: the opener gets open-parenthesis syntax and the
;; closer gets close-parenthesis syntax, each naming its partner.
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
284
285 ;; Korean character set (KSC5601)
286
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)

;; Code ranges that get symbol syntax, as (FROM . TO).
(dolist (row '((#x2121 . #x227E)
               (#x2621 . #x277E)
               (#x2830 . #x287E)
               (#x2930 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_"
                     (car row) (cdr row)))
;; Categories by code range, as (CATEGORY FROM TO).
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?G #x2521 #x257E)
                (?H #x2A21 #x2A7E)
                (?K #x2B21 #x2B7E)
                (?Y #x2C21 #x2C7E)))
  (map-charset-chars #'modify-category-entry 'korean-ksc5601
                     (car spec) (nth 1 spec) (nth 2 spec)))
300
301 ;; Latin
302
303 (modify-category-entry '(#x80 . #x024F) ?l)
304
305 ;; Lao character set
306
307 (modify-category-entry '(#xe80 . #xeff) ?o)
308 (map-charset-chars #'modify-category-entry 'lao ?o)
309
310 (let ((deflist '(("ກ-ຮ" "w" ?0) ; consonant
311 ("ະາຳຽເ-ໄ" "w" ?1) ; vowel base
312 ("ັິ-ືົໍ" "w" ?2) ; vowel upper
313 ("ຸູ" "w" ?3) ; vowel lower
314 ("່-໋" "w" ?4) ; tone mark
315 ("ຼຽ" "w" ?9) ; semivowel lower
316 ("໐-໙" "w" ?6) ; digit
317 ("ຯໆ" "_" ?5) ; symbol
318 ))
319 elm chars len syntax category to ch i)
320 (while deflist
321 (setq elm (car deflist))
322 (setq chars (car elm)
323 len (length chars)
324 syntax (nth 1 elm)
325 category (nth 2 elm)
326 i 0)
327 (while (< i len)
328 (if (= (aref chars i) ?-)
329 (setq i (1+ i)
330 to (aref chars i))
331 (setq ch (aref chars i)
332 to ch))
333 (while (<= ch to)
334 (unless (string-equal syntax "w")
335 (modify-syntax-entry ch syntax))
336 (modify-category-entry ch category)
337 (setq ch (1+ ch)))
338 (setq i (1+ i)))
339 (setq deflist (cdr deflist))))
340
341 ;; Thai character set (TIS620)
342
343 (modify-category-entry '(#xe00 . #xe7f) ?t)
344 (map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
345
346 (let ((deflist '(;; chars syntax category
347 ("ก-รลว-ฮ" "w" ?0) ; consonant
348 ("ฤฦะาำเ-ๅ" "w" ?1) ; vowel base
349 ("ัิ-ื็๎" "w" ?2) ; vowel upper
350 ("ุ-ฺ" "w" ?3) ; vowel lower
351 ("่-ํ" "w" ?4) ; tone mark
352 ("๐-๙" "w" ?6) ; digit
353 ("ฯๆ฿๏๚๛" "_" ?5) ; symbol
354 ))
355 elm chars len syntax category to ch i)
356 (while deflist
357 (setq elm (car deflist))
358 (setq chars (car elm)
359 len (length chars)
360 syntax (nth 1 elm)
361 category (nth 2 elm)
362 i 0)
363 (while (< i len)
364 (if (= (aref chars i) ?-)
365 (setq i (1+ i)
366 to (aref chars i))
367 (setq ch (aref chars i)
368 to ch))
369 (while (<= ch to)
370 (unless (string-equal syntax "w")
371 (modify-syntax-entry ch syntax))
372 (modify-category-entry ch category)
373 (setq ch (1+ ch)))
374 (setq i (1+ i)))
375 (setq deflist (cdr deflist))))
376
377 ;; Tibetan character set
378
379 (modify-category-entry '(#xf00 . #xfff) ?q)
380 (map-charset-chars #'modify-category-entry 'tibetan ?q)
381 (map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
382
383 (let ((deflist '(;; chars syntax category
384 ("ཀ-ཀྵཪ" "w" ?0) ; consonant
385 ("ྐ-ྐྵྺྻྼ" "w" ?0) ;
386 ("-" "w" ?0) ;
387 ("-" "w" ?0) ;
388 ("ིེཻོཽྀ" "w" ?2) ; upper vowel
389 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier
390 ("྄ཱུ༙༵༷" "w" ?3) ; lowel vowel/modifier
391 ("༠-༩༪-༳" "w" ?6) ; digit
392 ("་།-༒༔ཿ" "." ?|) ; line-break char
393 ("་།༏༐༑༔ཿ" "." ?|) ;
394 ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition
395 ("་།༏༐༑༔ཿ" "." ?>) ;
396 ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition
397 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
398 ))
399 elm chars len syntax category to ch i)
400 (while deflist
401 (setq elm (car deflist))
402 (setq chars (car elm)
403 len (length chars)
404 syntax (nth 1 elm)
405 category (nth 2 elm)
406 i 0)
407 (while (< i len)
408 (if (= (aref chars i) ?-)
409 (setq i (1+ i)
410 to (aref chars i))
411 (setq ch (aref chars i)
412 to ch))
413 (while (<= ch to)
414 (unless (string-equal syntax "w")
415 (modify-syntax-entry ch syntax))
416 (modify-category-entry ch category)
417 (setq ch (1+ ch)))
418 (setq i (1+ i)))
419 (setq deflist (cdr deflist))))
420
421 ;; Vietnamese character set
422
;; To make a word with Latin characters, both VISCII charsets get the
;; Latin category `l' in addition to `v'.
(dolist (charset '(vietnamese-viscii-lower vietnamese-viscii-upper))
  (dolist (cat '(?l ?v))
    (map-charset-chars #'modify-category-entry charset cat)))

;; For every code 32..127, pair the character decoded from the upper
;; charset with the one decoded from the lower charset in the standard
;; case table, and put the `ucs' encodings of both (when there are
;; any) in category `v'.
(let ((case-tbl (standard-case-table))
      (code 32))
  (while (< code 128)
    (let* ((upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower case-tbl)
      (when upper-ucs
        (modify-category-entry upper-ucs ?v))
      (when lower-ucs
        (modify-category-entry lower-ucs ?v)))
    (setq code (1+ code))))
442
443 (let ((tbl (standard-case-table)) c)
444
445 ;; In some languages, U+0049 LATIN CAPITAL LETTER I and U+0131 LATIN
446 ;; SMALL LETTER DOTLESS I make a case pair, and so do U+0130 LATIN
447 ;; CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN SMALL LETTER I.
448 ;; Thus we have to check language-environment to handle casing
449 ;; correctly. Currently only I<->i is available.
450
451 ;; Latin Extended-A, Latin Extended-B
452 (setq c #x0100)
453 (while (<= c #x0233)
454 (and (or (<= c #x012e)
455 (and (>= c #x014a) (<= c #x0177)))
456 (zerop (% c 2))
457 (set-case-syntax-pair c (1+ c) tbl))
458 (and (>= c #x013a)
459 (<= c #x0148)
460 (zerop (% c 2))
461 (set-case-syntax-pair (1- c) c tbl))
462 (setq c (1+ c)))
463 (set-case-syntax-pair ?IJ ?ij tbl)
464 (set-case-syntax-pair ?Ĵ ?ĵ tbl)
465 (set-case-syntax-pair ?Ķ ?ķ tbl)
466 (set-case-syntax-pair ?Ÿ ?ÿ tbl)
467 (set-case-syntax-pair ?Ź ?ź tbl)
468 (set-case-syntax-pair ?Ż ?ż tbl)
469 (set-case-syntax-pair ?Ž ?ž tbl)
470
471 ;; Latin Extended-B
472 (set-case-syntax-pair ?Ɓ ?ɓ tbl)
473 (set-case-syntax-pair ?Ƃ ?ƃ tbl)
474 (set-case-syntax-pair ?Ƅ ?ƅ tbl)
475 (set-case-syntax-pair ?Ɔ ?ɔ tbl)
476 (set-case-syntax-pair ?Ƈ ?ƈ tbl)
477 (set-case-syntax-pair ?Ɖ ?ɖ tbl)
478 (set-case-syntax-pair ?Ɗ ?ɗ tbl)
479 (set-case-syntax-pair ?Ƌ ?ƌ tbl)
480 (set-case-syntax-pair ?Ǝ ?ǝ tbl)
481 (set-case-syntax-pair ?Ə ?ə tbl)
482 (set-case-syntax-pair ?Ɛ ?ɛ tbl)
483 (set-case-syntax-pair ?Ƒ ?ƒ tbl)
484 (set-case-syntax-pair ?Ɠ ?ɠ tbl)
485 (set-case-syntax-pair ?Ɣ ?ɣ tbl)
486 (set-case-syntax-pair ?Ɩ ?ɩ tbl)
487 (set-case-syntax-pair ?Ɨ ?ɨ tbl)
488 (set-case-syntax-pair ?Ƙ ?ƙ tbl)
489 (set-case-syntax-pair ?Ɯ ?ɯ tbl)
490 (set-case-syntax-pair ?Ɲ ?ɲ tbl)
491 (set-case-syntax-pair ?Ɵ ?ɵ tbl)
492 (set-case-syntax-pair ?Ơ ?ơ tbl)
493 (set-case-syntax-pair ?Ƣ ?ƣ tbl)
494 (set-case-syntax-pair ?Ƥ ?ƥ tbl)
495 (set-case-syntax-pair ?Ʀ ?ʀ tbl)
496 (set-case-syntax-pair ?Ƨ ?ƨ tbl)
497 (set-case-syntax-pair ?Ʃ ?ʃ tbl)
498 (set-case-syntax-pair ?Ƭ ?ƭ tbl)
499 (set-case-syntax-pair ?Ʈ ?ʈ tbl)
500 (set-case-syntax-pair ?Ư ?ư tbl)
501 (set-case-syntax-pair ?Ʊ ?ʊ tbl)
502 (set-case-syntax-pair ?Ʋ ?ʋ tbl)
503 (set-case-syntax-pair ?Ƴ ?ƴ tbl)
504 (set-case-syntax-pair ?Ƶ ?ƶ tbl)
505 (set-case-syntax-pair ?Ʒ ?ʒ tbl)
506 (set-case-syntax-pair ?Ƹ ?ƹ tbl)
507 (set-case-syntax-pair ?Ƽ ?ƽ tbl)
508 (set-case-syntax-pair ?DŽ ?dž tbl)
509 (set-case-syntax-pair ?Dž ?dž tbl)
510 (set-case-syntax-pair ?LJ ?lj tbl)
511 (set-case-syntax-pair ?Lj ?lj tbl)
512 (set-case-syntax-pair ?NJ ?nj tbl)
513 (set-case-syntax-pair ?Nj ?nj tbl)
514 (set-case-syntax-pair ?Ǎ ?ǎ tbl)
515 (set-case-syntax-pair ?Ǐ ?ǐ tbl)
516 (set-case-syntax-pair ?Ǒ ?ǒ tbl)
517 (set-case-syntax-pair ?Ǔ ?ǔ tbl)
518 (set-case-syntax-pair ?Ǖ ?ǖ tbl)
519 (set-case-syntax-pair ?Ǘ ?ǘ tbl)
520 (set-case-syntax-pair ?Ǚ ?ǚ tbl)
521 (set-case-syntax-pair ?Ǜ ?ǜ tbl)
522 (set-case-syntax-pair ?Ǟ ?ǟ tbl)
523 (set-case-syntax-pair ?Ǡ ?ǡ tbl)
524 (set-case-syntax-pair ?Ǣ ?ǣ tbl)
525 (set-case-syntax-pair ?Ǥ ?ǥ tbl)
526 (set-case-syntax-pair ?Ǧ ?ǧ tbl)
527 (set-case-syntax-pair ?Ǩ ?ǩ tbl)
528 (set-case-syntax-pair ?Ǫ ?ǫ tbl)
529 (set-case-syntax-pair ?Ǭ ?ǭ tbl)
530 (set-case-syntax-pair ?Ǯ ?ǯ tbl)
531 ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
532 (set-case-syntax-pair ?DZ ?dz tbl)
533 (set-case-syntax-pair ?Dz ?dz tbl)
534 (set-case-syntax-pair ?Ǵ ?ǵ tbl)
535 (set-case-syntax-pair ?Ƕ ?ƕ tbl)
536 (set-case-syntax-pair ?Ƿ ?ƿ tbl)
537 (set-case-syntax-pair ?Ǹ ?ǹ tbl)
538 (set-case-syntax-pair ?Ǻ ?ǻ tbl)
539 (set-case-syntax-pair ?Ǽ ?ǽ tbl)
540 (set-case-syntax-pair ?Ǿ ?ǿ tbl)
541 (set-case-syntax-pair ?Ȁ ?ȁ tbl)
542 (set-case-syntax-pair ?Ȃ ?ȃ tbl)
543 (set-case-syntax-pair ?Ȅ ?ȅ tbl)
544 (set-case-syntax-pair ?Ȇ ?ȇ tbl)
545 (set-case-syntax-pair ?Ȉ ?ȉ tbl)
546 (set-case-syntax-pair ?Ȋ ?ȋ tbl)
547 (set-case-syntax-pair ?Ȍ ?ȍ tbl)
548 (set-case-syntax-pair ?Ȏ ?ȏ tbl)
549 (set-case-syntax-pair ?Ȑ ?ȑ tbl)
550 (set-case-syntax-pair ?Ȓ ?ȓ tbl)
551 (set-case-syntax-pair ?Ȕ ?ȕ tbl)
552 (set-case-syntax-pair ?Ȗ ?ȗ tbl)
553 (set-case-syntax-pair ?Ș ?ș tbl)
554 (set-case-syntax-pair ?Ț ?ț tbl)
555 (set-case-syntax-pair ?Ȝ ?ȝ tbl)
556 (set-case-syntax-pair ?Ȟ ?ȟ tbl)
557 (set-case-syntax-pair ?Ȣ ?ȣ tbl)
558 (set-case-syntax-pair ?Ȥ ?ȥ tbl)
559 (set-case-syntax-pair ?Ȧ ?ȧ tbl)
560 (set-case-syntax-pair ?Ȩ ?ȩ tbl)
561 (set-case-syntax-pair ?Ȫ ?ȫ tbl)
562 (set-case-syntax-pair ?Ȭ ?ȭ tbl)
563 (set-case-syntax-pair ?Ȯ ?ȯ tbl)
564 (set-case-syntax-pair ?Ȱ ?ȱ tbl)
565 (set-case-syntax-pair ?Ȳ ?ȳ tbl)
566
567 ;; Latin Extended Additional
(modify-category-entry '(#x1e00 . #x1ef9) ?l)
;; Only even code points start a pair, so walk the range two at a
;; time.  Even code points between #x1e94 and #x1ea0 are skipped.
(setq c #x1e00)
(while (<= c #x1ef9)
  (when (or (<= c #x1e94) (>= c #x1ea0))
    (set-case-syntax-pair c (1+ c) tbl))
  (setq c (+ c 2)))
575
576 ;; Greek
577 (modify-category-entry '(#x0370 . #x03ff) ?g)
578 (setq c #x0370)
579 (while (<= c #x03ff)
580 (if (or (and (>= c #x0391) (<= c #x03a1))
581 (and (>= c #x03a3) (<= c #x03ab)))
582 (set-case-syntax-pair c (+ c 32) tbl))
583 (and (>= c #x03da)
584 (<= c #x03ee)
585 (zerop (% c 2))
586 (set-case-syntax-pair c (1+ c) tbl))
587 (setq c (1+ c)))
588 (set-case-syntax-pair ?Ά ?ά tbl)
589 (set-case-syntax-pair ?Έ ?έ tbl)
590 (set-case-syntax-pair ?Ή ?ή tbl)
591 (set-case-syntax-pair ?Ί ?ί tbl)
592 (set-case-syntax-pair ?Ό ?ό tbl)
593 (set-case-syntax-pair ?Ύ ?ύ tbl)
594 (set-case-syntax-pair ?Ώ ?ώ tbl)
595
596 ;; Armenian
;; #x531..#x556 (38 code points) are each paired with the code point
;; #x30 above them.
(setq c #x531)
(dotimes (_k (1+ (- #x556 #x531)))
  (set-case-syntax-pair c (+ c #x30) tbl)
  (setq c (1+ c)))
601
602 ;; Greek Extended
(modify-category-entry '(#x1f00 . #x1fff) ?g)
;; Up to #x1fa7, a code point whose low nibble is 0..7 is treated as
;; the lower-case partner of the code point 8 above it.  Excluded are
;; #x1f50/52/54/56 and the whole #x1f70 row.  The row test has to
;; compare against #x70: (logand c #x00f0) is always a multiple of 16,
;; so comparing it with 7 would never exclude anything.
(setq c #x1f00)
(while (<= c #x1fff)
  (and (<= (logand c #x000f) 7)
       (<= c #x1fa7)
       (not (memq c '(#x1f50 #x1f52 #x1f54 #x1f56)))
       (/= (logand c #x00f0) #x70)
       (set-case-syntax-pair (+ c 8) c tbl))
  (setq c (1+ c)))
612 (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
613 (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
614 (set-case-syntax-pair ?Ὰ ?ὰ tbl)
615 (set-case-syntax-pair ?Ά ?ά tbl)
616 (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
617 (set-case-syntax-pair ?Ὲ ?ὲ tbl)
618 (set-case-syntax-pair ?Έ ?έ tbl)
619 (set-case-syntax-pair ?Ὴ ?ὴ tbl)
620 (set-case-syntax-pair ?Ή ?ή tbl)
621 (set-case-syntax-pair ?ῌ ?ῃ tbl)
622 (set-case-syntax-pair ?Ῐ ?ῐ tbl)
623 (set-case-syntax-pair ?Ῑ ?ῑ tbl)
624 (set-case-syntax-pair ?Ὶ ?ὶ tbl)
625 (set-case-syntax-pair ?Ί ?ί tbl)
626 (set-case-syntax-pair ?Ῠ ?ῠ tbl)
627 (set-case-syntax-pair ?Ῡ ?ῡ tbl)
628 (set-case-syntax-pair ?Ὺ ?ὺ tbl)
629 (set-case-syntax-pair ?Ύ ?ύ tbl)
630 (set-case-syntax-pair ?Ῥ ?ῥ tbl)
631 (set-case-syntax-pair ?Ὸ ?ὸ tbl)
632 (set-case-syntax-pair ?Ό ?ό tbl)
633 (set-case-syntax-pair ?Ὼ ?ὼ tbl)
634 (set-case-syntax-pair ?Ώ ?ώ tbl)
635 (set-case-syntax-pair ?ῼ ?ῳ tbl)
636
637 ;; cyrillic
638 (modify-category-entry '(#x0400 . #x04FF) ?y)
639 (setq c #x0400)
640 (while (<= c #x04ff)
641 (and (>= c #x0400)
642 (<= c #x040f)
643 (set-case-syntax-pair c (+ c 80) tbl))
644 (and (>= c #x0410)
645 (<= c #x042f)
646 (set-case-syntax-pair c (+ c 32) tbl))
647 (and (zerop (% c 2))
648 (or (and (>= c #x0460) (<= c #x0480))
649 (and (>= c #x048c) (<= c #x04be))
650 (and (>= c #x04d0) (<= c #x04f4)))
651 (set-case-syntax-pair c (1+ c) tbl))
652 (setq c (1+ c)))
653 (set-case-syntax-pair ?Ӂ ?ӂ tbl)
654 (set-case-syntax-pair ?Ӄ ?ӄ tbl)
655 (set-case-syntax-pair ?Ӈ ?ӈ tbl)
656 (set-case-syntax-pair ?Ӌ ?ӌ tbl)
657 (set-case-syntax-pair ?Ӹ ?ӹ tbl)
658
659 ;; general punctuation
660 (setq c #x2000)
661 (while (<= c #x200b)
662 (set-case-syntax c " " tbl)
663 (setq c (1+ c)))
664 (while (<= c #x200F)
665 (set-case-syntax c "." tbl)
666 (setq c (1+ c)))
667 ;; Fixme: These aren't all right:
668 (while (<= c #x2027)
669 (set-case-syntax c "_" tbl)
670 (setq c (1+ c)))
671 (while (<= c #x206F)
672 (set-case-syntax c "." tbl)
673 (setq c (1+ c)))
674
675 ;; Roman numerals
676 (setq c #x2160)
677 (while (<= c #x216f)
678 (set-case-syntax-pair c (+ c #x10) tbl)
679 (setq c (1+ c)))
680
681 ;; Arrows
;; The four loops below share the counter C: each one continues from
;; where the previous one stopped, so the ranges are contiguous.
;; These characters get a plain syntax class via `set-case-syntax'
;; (which takes a syntax string); they do not form case pairs.
(setq c #x2190)
(while (<= c #x21FF)
  (set-case-syntax c "." tbl)
  (setq c (1+ c)))
;; Mathematical Operators
(while (<= c #x22FF)
  (set-case-syntax c "." tbl)
  (setq c (1+ c)))
;; Miscellaneous Technical
(while (<= c #x23FF)
  (set-case-syntax c "." tbl)
  (setq c (1+ c)))
;; Control Pictures
(while (<= c #x243F)
  (set-case-syntax c "_" tbl)
  (setq c (1+ c)))
698
699 ;; Circled Latin
;; Each entry is (FIRST LAST OFFSET): Circled Latin, then Fullwidth
;; Latin.  Every code point from FIRST to LAST is paired with the one
;; OFFSET above it, and both get the Latin category `l'.
(dolist (span '((#x24b6 #x24cf 26)
                (#xff21 #xff3a #x20)))
  (let ((last (nth 1 span))
        (offset (nth 2 span)))
    (setq c (car span))
    (while (<= c last)
      (set-case-syntax-pair c (+ c offset) tbl)
      (modify-category-entry c ?l)
      (modify-category-entry (+ c offset) ?l)
      (setq c (1+ c)))))
714
715 ;; Combining diacritics
716 (modify-category-entry '(#x300 . #x362) ?^)
717 ;; Combining marks
718 (modify-category-entry '(#x20d0 . #x20e3) ?^)
719
720 ;; Fixme: syntax for symbols &c
721 )
722 \f
723 ;; For each character set, put the information of the most proper
724 ;; coding system to encode it by `preferred-coding-system' property.
725
726 ;; Fixme: should this be junked?
;; Each entry is (CHARSET . CODING-SYSTEM); the coding system is stored
;; as the charset's `preferred-coding-system' property.
(dolist (entry '((latin-iso8859-1 . iso-latin-1)
                 (latin-iso8859-2 . iso-latin-2)
                 (latin-iso8859-3 . iso-latin-3)
                 (latin-iso8859-4 . iso-latin-4)
                 (thai-tis620 . thai-tis620)
                 (greek-iso8859-7 . greek-iso-8bit)
                 (arabic-iso8859-6 . iso-2022-7bit)
                 (hebrew-iso8859-8 . hebrew-iso-8bit)
                 (katakana-jisx0201 . japanese-shift-jis)
                 (latin-jisx0201 . japanese-shift-jis)
                 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
                 (latin-iso8859-9 . iso-latin-5)
                 (japanese-jisx0208-1978 . iso-2022-jp)
                 (chinese-gb2312 . cn-gb-2312)
                 (japanese-jisx0208 . iso-2022-jp)
                 (korean-ksc5601 . iso-2022-kr)
                 (japanese-jisx0212 . iso-2022-jp)
                 (chinese-cns11643-1 . iso-2022-cn)
                 (chinese-cns11643-2 . iso-2022-cn)
                 (chinese-big5-1 . chinese-big5)
                 (chinese-big5-2 . chinese-big5)
                 (chinese-sisheng . iso-2022-7bit)
                 (ipa . iso-2022-7bit)
                 (vietnamese-viscii-lower . vietnamese-viscii)
                 (vietnamese-viscii-upper . vietnamese-viscii)
                 (arabic-digit . iso-2022-7bit)
                 (arabic-1-column . iso-2022-7bit)
                 (lao . lao)
                 (arabic-2-column . iso-2022-7bit)
                 (indian-is13194 . devanagari)
                 (indian-glyph . devanagari)
                 (tibetan-1-column . tibetan)
                 (ethiopic . iso-2022-7bit)
                 (chinese-cns11643-3 . iso-2022-cn)
                 (chinese-cns11643-4 . iso-2022-cn)
                 (chinese-cns11643-5 . iso-2022-cn)
                 (chinese-cns11643-6 . iso-2022-cn)
                 (chinese-cns11643-7 . iso-2022-cn)
                 (indian-2-column . devanagari)
                 (tibetan . tibetan)
                 (latin-iso8859-14 . iso-latin-8)
                 (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car entry) 'preferred-coding-system (cdr entry)))
772
773 \f
774 ;; Setup auto-fill-chars for charsets that should invoke auto-filling.
775 ;; SPACE and NEWLINE are already set. Also put `nospace-between-words'
776 ;; property on the charsets.
(dolist (charset '(katakana-jisx0201
                   japanese-jisx0208 japanese-jisx0212
                   chinese-gb2312 chinese-big5-1 chinese-big5-2))
  ;; Setting `auto-fill-chars' is disabled at present:
  ;;(aset auto-fill-chars (make-char charset) t)
  (put-charset-property charset 'nospace-between-words t))
784
785 \f
786 ;; CJK double width characters.
;; Each listed (FROM . TO) range is given width 2 in `char-width-table'.
;; A fresh cons is handed to `set-char-table-range' for every range.
(mapc (lambda (range)
        (set-char-table-range char-width-table
                              (cons (car range) (cdr range))
                              2))
      '((#x1100 . #x11FF)
        (#x2E80 . #x9FAF)
        (#xAC00 . #xD7AF)
        (#xF900 . #xFAFF)
        (#xFE30 . #xFE4F)
        (#xFF00 . #xFF5F)
        (#xFFE0 . #xFFEF)))
798 ;; Fixme: Doing this affects non-CJK characters through unification,
799 ;; but presumably CJK users expect those characters to be
800 ;; double-width when using these charsets.
801 ;; (map-charset-chars
802 ;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
803 ;; 'japanese-jisx0208)
804 ;; (map-charset-chars
805 ;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
806 ;; 'japanese-jisx0212)
807 ;; (map-charset-chars
808 ;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
809 ;; 'japanese-jisx0213-1)
810 ;; (map-charset-chars
811 ;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
812 ;; 'japanese-jisx0213-2)
813 ;; (map-charset-chars
814 ;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
815 ;; 'korean-ksc5601)
816
817 ;; Other double width
;; Every character range of these charsets is given width 2.
(dolist (charset '(ethiopic tibetan indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (range ignore)
     (set-char-table-range char-width-table range 2))
   charset))
830
;; Optimize, in this order, the standard case table, the width table,
;; the standard category table and the standard syntax table.
(mapc #'optimize-char-table
      (list (standard-case-table)
            char-width-table
            (standard-category-table)
            (standard-syntax-table)))
835
836 ;; The Unicode blocks actually extend past some of these ranges with
837 ;; undefined codepoints.
;; Each entry is (FROM TO SCRIPT).  The range FROM..TO is mapped to
;; SCRIPT in `char-script-table', and the distinct script names, in
;; order of first appearance, are stored in extra slot 0 of the table.
(let ((seen nil))
  (dolist (entry
           '((#x0000 #x007F latin)
             (#x00A0 #x036F latin)
             (#x0370 #x03E1 greek)
             (#x03E2 #x03EF coptic)
             (#x03F0 #x03F3 greek)
             (#x0400 #x04FF cyrillic)
             (#x0530 #x058F armenian)
             (#x0590 #x05FF hebrew)
             (#x0600 #x06FF arabic)
             (#x0700 #x074F syriac)
             (#x0780 #x07BF thaana)
             (#x0900 #x097F devanagari)
             (#x0980 #x09FF bengali)
             (#x0A00 #x0A7F gurmukhi)
             (#x0A80 #x0AFF gujarati)
             (#x0B00 #x0B7F oriya)
             (#x0B80 #x0BFF tamil)
             (#x0C00 #x0C7F telugu)
             (#x0C80 #x0CFF kannada)
             (#x0D00 #x0D7F malayalam)
             (#x0D80 #x0DFF sinhala)
             (#x0E00 #x0E5F thai)
             (#x0E80 #x0EDF lao)
             (#x0F00 #x0FFF tibetan)
             (#x1000 #x105F myanmar)
             (#x10A0 #x10FF georgian)
             (#x1100 #x11FF hangul)
             (#x1200 #x137F ethiopic)
             (#x13A0 #x13FF cherokee)
             (#x1400 #x167F canadian-aboriginal)
             (#x1680 #x169F ogham)
             (#x16A0 #x16FF runic)
             (#x1780 #x17FF khmer)
             (#x1800 #x18AF mongolian)
             (#x1E00 #x1EFF latin)
             (#x1F00 #x1FFF greek)
             (#x20A0 #x20AF currency)
             (#x2800 #x28FF braille)
             (#x2E80 #x2FDF han)
             (#x2FF0 #x2FFF ideographic-description)
             (#x3000 #x303F cjk-misc)
             (#x3040 #x30FF kana)
             (#x3100 #x312F bopomofo)
             (#x3130 #x318F hangul)
             (#x3190 #x319F kanbun)
             (#x31A0 #x31BF bopomofo)
             (#x3400 #x9FAF han)
             (#xA000 #xA4CF yi)
             (#xAC00 #xD7AF hangul)
             (#xF900 #xFA5F han)
             (#xFB1D #xFB4F hebrew)
             (#xFB50 #xFDFF arabic)
             (#xFE70 #xFEFC arabic)
             (#xFF00 #xFF5F cjk-misc)
             (#xFF61 #xFF9F kana)
             (#xFFE0 #xFFE6 cjk-misc)
             (#x20000 #x2AFFF han)
             (#x2F800 #x2FFFF han)))
    (let ((from (car entry))
          (to (nth 1 entry))
          (script (nth 2 entry)))
      (set-char-table-range char-script-table (cons from to) script)
      (unless (memq script seen)
        (setq seen (cons script seen)))))
  ;; SEEN was built newest-first; reverse it into first-seen order.
  (set-char-table-extra-slot char-script-table 0 (nreverse seen)))
904
905 \f
906 ;;; Setting word boundary.
907
(defun next-word-boundary-han (pos limit)
  "Return a word boundary found by scanning from POS towards LIMIT.
If POS is not greater than LIMIT, scan forward: skip the run of
category `C' characters starting at POS, if any, then the run of
category `H' characters directly following, if any, and return the
position reached.  When neither run is present the result is POS.
Otherwise scan backward: step back while the character before the
current position has script `han' in `char-script-table', without
going past LIMIT, and return the position reached."
  (if (<= pos limit)
      (save-excursion
        (goto-char pos)
        ;; Move only when the regexp really matched here.  After a
        ;; failed `looking-at', `match-end' would report whatever an
        ;; unrelated earlier search left in the match data.
        (if (looking-at "\\cC+")
            (goto-char (match-end 0)))
        (if (looking-at "\\cH+")
            (goto-char (match-end 0)))
        (point))
    (while (and (> pos limit)
                (eq (aref char-script-table (char-after (1- pos))) 'han))
      (setq pos (1- pos)))
    pos))
921
(defun next-word-boundary-kana (pos limit)
  "Return a word boundary found by scanning from POS towards LIMIT.
If POS is not greater than LIMIT, scan forward: skip a run of
category `K' characters at POS, if any, then a run of category `H'
characters, if any, and return the position reached.
Otherwise scan backward, never going past LIMIT.  When the
character at POS has category `K', step back over preceding `K'
characters.  Otherwise step back over preceding `H' characters;
then, if the last character examined has category `C', `K' or `A'
\(checked in that order), step back over it and over any further
preceding characters of that same category."
  (if (<= pos limit)
      ;; Forward scan.
      (save-excursion
        (goto-char pos)
        (if (looking-at "\\cK+")
            (goto-char (match-end 0)))
        (if (looking-at "\\cH+")
            (goto-char (match-end 0)))
        (point))
    ;; Backward scan.
    ;; NOTE(review): the initial category set is read from the
    ;; character AT pos, while the loops below look at the character
    ;; BEFORE pos -- presumably callers guarantee a character at POS;
    ;; confirm against the caller.
    (let ((category-set (char-category-set (char-after pos)))
          category)
      (if (aref category-set ?K)
          (while (and (> pos limit)
                      (aref (char-category-set (char-after (1- pos))) ?K))
            (setq pos (1- pos)))
        ;; The loop test also records in `category-set' the categories
        ;; of the last character it examined.
        (while (and (> pos limit)
                    (aref (setq category-set
                                (char-category-set (char-after (1- pos)))) ?H))
          (setq pos (1- pos)))
        ;; Pick the first of `C', `K', `A' that this character has.
        (setq category (cond ((aref category-set ?C) ?C)
                             ((aref category-set ?K) ?K)
                             ((aref category-set ?A) ?A)))
        (when category
          ;; Step over that character, then over the rest of its run.
          (setq pos (1- pos))
          (while (and (> pos limit)
                      (aref (char-category-set (char-after (1- pos)))
                            category))
            (setq pos (1- pos)))))
      pos)))
951
;; For every entry of `char-script-table' whose script is `han' or
;; `kana', register the matching boundary function under the same key
;; in `next-word-boundary-function-table'.  Other scripts are left
;; alone.
(map-char-table
 (lambda (key script)
   (let ((boundary-fn (cond ((eq script 'han) #'next-word-boundary-han)
                            ((eq script 'kana) #'next-word-boundary-kana))))
     (when boundary-fn
       (set-char-table-range next-word-boundary-function-table
                             key boundary-fn))))
 char-script-table)
961
(setq word-combining-categories
      (list (cons ?l ?l))

      ;; Pairs of categories of 2-byte character sets.
      word-separating-categories
      (list (cons ?A ?K)                ; Alpha numeric - Katakana
            (cons ?A ?C)                ; Alpha numeric - Chinese
            (cons ?H ?A)                ; Hiragana - Alpha numeric
            (cons ?H ?K)                ; Hiragana - Katakana
            (cons ?H ?C)                ; Hiragana - Chinese
            (cons ?K ?A)                ; Katakana - Alpha numeric
            (cons ?K ?C)                ; Katakana - Chinese
            (cons ?C ?A)                ; Chinese - Alpha numeric
            (cons ?C ?K)))              ; Chinese - Katakana
976
977 ;;; Local Variables:
978 ;;; coding: utf-8-emacs
979 ;;; End:
980
981 ;;; characters.el ends here