Commit | Line | Data |
---|---|---|
4ed46869 KH |
1 | ;;; characters.el --- set syntax and category for multibyte characters |
2 | ||
73b0cd50 | 3 | ;; Copyright (C) 1997, 2000-2011 |
2fd125a3 | 4 | ;; Free Software Foundation, Inc. |
7976eda0 | 5 | ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, |
5df4f04c | 6 | ;; 2005, 2006, 2007, 2008, 2009, 2010, 2011 |
2fd125a3 KH |
7 | ;; National Institute of Advanced Industrial Science and Technology (AIST) |
8 | ;; Registration Number H14PRO021 | |
8f924df7 | 9 | ;; Copyright (C) 2003 |
55bd52ea KH |
10 | ;; National Institute of Advanced Industrial Science and Technology (AIST) |
11 | ;; Registration Number H13PRO009 | |
4ed46869 KH |
12 | |
13 | ;; Keywords: multibyte character, character set, syntax, category | |
14 | ||
15 | ;; This file is part of GNU Emacs. | |
16 | ||
4936186e | 17 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
4ed46869 | 18 | ;; it under the terms of the GNU General Public License as published by |
4936186e GM |
19 | ;; the Free Software Foundation, either version 3 of the License, or |
20 | ;; (at your option) any later version. | |
4ed46869 KH |
21 | |
22 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
23 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
24 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
25 | ;; GNU General Public License for more details. | |
26 | ||
27 | ;; You should have received a copy of the GNU General Public License | |
4936186e | 28 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
4ed46869 KH |
29 | |
30 | ;;; Commentary: | |
31 | ||
60370d40 PJ |
32 | ;;; Code: |
33 | ||
4ed46869 KH |
34 | ;;; Predefined categories. |
35 | ||
36 | ;; For each character set. | |
37 | ||
46bf60bc KH |
38 | (define-category ?a "ASCII |
39 | ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])") | |
4ed46869 KH |
40 | (define-category ?l "Latin") |
41 | (define-category ?t "Thai") | |
42 | (define-category ?g "Greek") | |
43 | (define-category ?b "Arabic") | |
44 | (define-category ?w "Hebrew") | |
45 | (define-category ?y "Cyrillic") | |
46bf60bc KH |
46 | (define-category ?k "Katakana |
47 | Japanese katakana") | |
48 | (define-category ?r "Roman | |
49 | Japanese roman") | |
4ed46869 KH |
50 | (define-category ?c "Chinese") |
51 | (define-category ?j "Japanese") | |
52 | (define-category ?h "Korean") | |
46bf60bc KH |
53 | (define-category ?e "Ethiopic |
54 | Ethiopic (Ge'ez)") | |
55 | (define-category ?v "Viet | |
56 | Vietnamese") | |
4ed46869 | 57 | (define-category ?i "Indian") |
6eba8645 | 58 | (define-category ?o "Lao") |
9395eb7c | 59 | (define-category ?q "Tibetan") |
4ed46869 KH |
60 | |
61 | ;; For each group (row) of 2-byte character sets. | |
62 | ||
46bf60bc KH |
63 | (define-category ?A "2-byte alnum |
64 | Alpha-numeric characters of 2-byte character sets") | |
65 | (define-category ?C "2-byte han | |
66 | Chinese (Han) characters of 2-byte character sets") | |
67 | (define-category ?G "2-byte Greek | |
68 | Greek characters of 2-byte character sets") | |
69 | (define-category ?H "2-byte Hiragana | |
70 | Japanese Hiragana characters of 2-byte character sets") | |
71 | (define-category ?K "2-byte Katakana | |
72 | Japanese Katakana characters of 2-byte character sets") | |
73 | (define-category ?N "2-byte Korean | |
74 | Korean Hangul characters of 2-byte character sets") | |
91c491e0 | 75 | (define-category ?Y "2-byte Cyrillic |
46bf60bc | 76 | Cyrillic characters of 2-byte character sets") |
4ed46869 KH |
77 | (define-category ?I "Indian Glyphs") |
78 | ||
79 | ;; For phonetic classifications. | |
80 | ||
81 | (define-category ?0 "consonant") | |
46bf60bc | 82 | (define-category ?1 "base vowel |
4eb97232 | 83 | Base (independent) vowel") |
46bf60bc | 84 | (define-category ?2 "upper diacritic |
4eb97232 | 85 | Upper diacritical mark (including upper vowel)") |
46bf60bc | 86 | (define-category ?3 "lower diacritic |
4eb97232 | 87 | Lower diacritical mark (including lower vowel)") |
46bf60bc | 88 | (define-category ?4 "combining tone |
4eb97232 | 89 | Combining tone mark") |
9765a2ba | 90 | (define-category ?5 "symbol") |
4ed46869 | 91 | (define-category ?6 "digit") |
91c491e0 | 92 | (define-category ?7 "vowel diacritic |
4eb97232 | 93 | Vowel-modifying diacritical mark") |
6eba8645 KH |
94 | (define-category ?8 "vowel-signs") |
95 | (define-category ?9 "semivowel lower") | |
4ed46869 KH |
96 | |
97 | ;; For filling. | |
46bf60bc KH |
98 | (define-category ?| "line breakable |
99 | While filling, we can break a line at this character.") | |
4ed46869 | 100 | |
504af7b2 | 101 | ;; For indentation calculation. |
70ea295a | 102 | (define-category ?\s |
46bf60bc KH |
103 | "space for indent |
104 | This character counts as a space for indentation purposes.") | |
504af7b2 | 105 | |
94487c4e | 106 | ;; Keep the following for `kinsoku' processing. See comments in |
4ed46869 | 107 | ;; kinsoku.el. |
46bf60bc KH |
108 | (define-category ?> "Not at bol |
109 | A character which can't be placed at beginning of line.") | |
110 | (define-category ?< "Not at eol | |
111 | A character which can't be placed at end of line.") | |
4ed46869 | 112 | |
8ea6fa80 KH |
113 | ;; Base and Combining |
114 | (define-category ?. "Base | |
115 | Base characters (Unicode General Category L,N,P,S,Zs)") | |
46bf60bc | 116 | (define-category ?^ "Combining |
4eb97232 | 117 | Combining diacritic or mark (Unicode General Category M)") |
4ed46869 KH |
118 | \f |
119 | ;;; Setting syntax and category. | |
120 | ||
121 | ;; ASCII | |
122 | ||
e2cc40b7 KH |
123 | ;; All ASCII characters have the category `a' (ASCII) and `l' (Latin). |
124 | (modify-category-entry '(32 . 127) ?a) | |
125 | (modify-category-entry '(32 . 127) ?l) | |
4ed46869 | 126 | |
c94ae9eb DL |
127 | ;; Deal with the CJK charsets first. Since the syntax of blocks is |
128 | ;; defined per charset, and the charsets may contain e.g. Latin | |
129 | ;; characters, we end up with the wrong syntax definitions if we're | |
130 | ;; not careful. | |
4ed46869 | 131 | |
66bff5ed | 132 | ;; Chinese characters (Unicode) |
a5bb49e1 KH |
133 | (modify-category-entry '(#x2E80 . #x312F) ?|) |
134 | (modify-category-entry '(#x3190 . #x33FF) ?|) | |
66a85e76 KH |
135 | (modify-category-entry '(#x3400 . #x4DBF) ?C) |
136 | (modify-category-entry '(#x4E00 . #x9FAF) ?C) | |
66bff5ed KH |
137 | (modify-category-entry '(#x3400 . #x9FAF) ?c) |
138 | (modify-category-entry '(#x3400 . #x9FAF) ?|) | |
139 | (modify-category-entry '(#xF900 . #xFAFF) ?C) | |
140 | (modify-category-entry '(#xF900 . #xFAFF) ?c) | |
141 | (modify-category-entry '(#xF900 . #xFAFF) ?|) | |
796f8b2f KH |
142 | (modify-category-entry '(#x20000 . #x2FFFF) ?|) |
143 | (modify-category-entry '(#x20000 . #x2FFFF) ?C) | |
144 | (modify-category-entry '(#x20000 . #x2FFFF) ?c) | |
8e4cd685 | 145 | |
4ed46869 KH |
146 | |
147 | ;; Chinese character set (GB2312) | |
148 | ||
66bff5ed KH |
149 | (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E) |
150 | (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E) | |
151 | (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E) | |
4ed46869 | 152 | |
87a39edb | 153 | (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c) |
9ad4b491 KH |
154 | (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339) |
155 | (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A) | |
156 | (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A) | |
66bff5ed KH |
157 | (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?H #x2421 #x247E) |
158 | (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?K #x2521 #x257E) | |
159 | (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?G #x2621 #x267E) | |
160 | (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?Y #x2721 #x277E) | |
161 | (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?C #x3021 #x7E7E) | |
4ed46869 KH |
162 | |
163 | ;; Chinese character set (BIG5) | |
164 | ||
e7259832 | 165 | (map-charset-chars #'modify-category-entry 'big5 ?c) |
66a85e76 | 166 | (map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA261) |
9ad4b491 | 167 | (map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E) |
66a85e76 | 168 | (map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DC) |
4ed46869 KH |
169 | |
170 | ;; Chinese character set (CNS11643) | |
171 | ||
87a39edb DL |
172 | (dolist (c '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3 |
173 | chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6 | |
174 | chinese-cns11643-7)) | |
175 | (map-charset-chars #'modify-category-entry c ?c) | |
9ad4b491 KH |
176 | (if (eq c 'chinese-cns11643-1) |
177 | (map-charset-chars #'modify-category-entry c ?C #x4421 #x7E7E) | |
8e4cd685 | 178 | (map-charset-chars #'modify-category-entry c ?C))) |
4ed46869 | 179 | |
8f924df7 | 180 | ;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213) |
4ed46869 | 181 | |
66bff5ed | 182 | (map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k) |
4ed46869 | 183 | |
66bff5ed | 184 | (map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r) |
4ed46869 | 185 | |
8f924df7 | 186 | (dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212 |
761f6427 KH |
187 | japanese-jisx0213-1 japanese-jisx0213-2 |
188 | cp932-2-byte)) | |
8e4cd685 | 189 | (map-charset-chars #'modify-category-entry l ?j)) |
4ed46869 | 190 | |
c4186f9c KH |
191 | ;; Fullwidth characters |
192 | (modify-category-entry '(#xff01 . #xff60) ?\|) | |
193 | ||
269a5dd0 | 194 | ;; Unicode equivalents of JISX0201-kana |
66bff5ed KH |
195 | (let ((range '(#xff61 . #xff9f))) |
196 | (modify-category-entry range ?k) | |
197 | (modify-category-entry range ?j) | |
198 | (modify-category-entry range ?\|)) | |
269a5dd0 DL |
199 | |
200 | ;; Katakana block | |
796f8b2f KH |
201 | (modify-category-entry '(#x3099 . #x309C) ?K) |
202 | (modify-category-entry '(#x30A0 . #x30FF) ?K) | |
6f3ac1e1 | 203 | (modify-category-entry '(#x31F0 . #x31FF) ?K) |
b11c2874 | 204 | (modify-category-entry '(#x30A0 . #x30FA) ?\|) |
796f8b2f | 205 | (modify-category-entry #x30FF ?\|) |
269a5dd0 DL |
206 | |
207 | ;; Hiragana block | |
796f8b2f KH |
208 | (modify-category-entry '(#x3040 . #x309F) ?H) |
209 | (modify-category-entry '(#x3040 . #x3096) ?\|) | |
210 | (modify-category-entry #x309F ?\|) | |
211 | (modify-category-entry #x30A0 ?H) | |
212 | (modify-category-entry #x30FC ?H) | |
213 | ||
269a5dd0 | 214 | |
4ed46869 | 215 | ;; JISX0208 |
66bff5ed KH |
216 | (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E) |
217 | (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E) | |
218 | (let ((chars '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))) |
69c2c6ea | 219 | (dolist (elt chars) ; give every listed character word syntax |
abdaa411 | 220 | (modify-syntax-entry elt "w"))) |
66bff5ed KH |
221 | |
222 | (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E) | |
223 | (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E) | |
224 | (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?K #x2521 #x257E) | |
225 | (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E) | |
226 | (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E) | |
227 | (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E) | |
228 | (modify-category-entry ?ー ?K) | |
229 | (let ((chars '(?゛ ?゜))) | |
4ed46869 KH |
230 | (while chars |
231 | (modify-category-entry (car chars) ?K) | |
232 | (modify-category-entry (car chars) ?H) | |
233 | (setq chars (cdr chars)))) | |
66a85e76 | 234 | (let ((chars '(?仝 ?々 ?〆 ?〇))) |
4ed46869 KH |
235 | (while chars |
236 | (modify-category-entry (car chars) ?C) | |
237 | (setq chars (cdr chars)))) | |
238 | ||
239 | ;; JISX0212 | |
4ed46869 | 240 | |
66bff5ed | 241 | (map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E) |
4ed46869 KH |
242 | |
243 | ;; JISX0201-Kana | |
87a39edb | 244 | |
abdaa411 | 245 | (let ((chars '(?。 ?、 ?・))) |
4ed46869 KH |
246 | (while chars |
247 | (modify-syntax-entry (car chars) ".") | |
248 | (setq chars (cdr chars)))) | |
249 | ||
e6d10035 KH |
250 | (modify-syntax-entry ?\「 "(」") ; open delimiter, matched by 」 |
251 | (modify-syntax-entry ?\」 ")「") ; close delimiter, matched by 「 | |
226e4119 | 252 | |
4ed46869 KH |
253 | ;; Korean character set (KSC5601) |
254 | ||
87a39edb | 255 | (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h) |
66bff5ed KH |
256 | |
257 | (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E) | |
439f7264 DL |
258 | (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E) |
259 | (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E) | |
260 | (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E) | |
9ad4b491 KH |
261 | (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339) |
262 | (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A) | |
263 | (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A) | |
66bff5ed KH |
264 | (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E) |
265 | (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E) | |
266 | (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E) | |
267 | (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E) | |
4ed46869 | 268 | |
c94ae9eb | 269 | ;; These are in more than one charset. |
8f924df7 KH |
270 | (let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛" |
271 | "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄" | |
272 | "()[]{}")) | |
273 | open close) | |
274 | (dotimes (i (/ (length parens) 2)) | |
275 | (setq open (aref parens (* i 2)) | |
276 | close (aref parens (1+ (* i 2)))) | |
277 | (modify-syntax-entry open (format "(%c" close)) | |
278 | (modify-syntax-entry close (format ")%c" open)))) | |
d05cfa1f | 279 | |
c94ae9eb | 280 | ;; Arabic character set |
6eba8645 | 281 | |
c94ae9eb DL |
282 | (let ((charsets '(arabic-iso8859-6 |
283 | arabic-digit | |
284 | arabic-1-column | |
285 | arabic-2-column))) | |
286 | (while charsets | |
287 | (map-charset-chars #'modify-category-entry (car charsets) ?b) | |
288 | (setq charsets (cdr charsets)))) | |
289 | (modify-category-entry '(#x600 . #x6ff) ?b) | |
290 | (modify-category-entry '(#xfb50 . #xfdff) ?b) | |
291 | (modify-category-entry '(#xfe70 . #xfefe) ?b) | |
6eba8645 | 292 | |
c94ae9eb DL |
293 | ;; Cyrillic character set (ISO-8859-5) |
294 | ||
295 | (modify-syntax-entry ?№ ".") | |
296 | ||
297 | ;; Ethiopic character set | |
298 | ||
4c81b0f6 KH |
299 | (modify-category-entry '(#x1200 . #x1399) ?e) |
300 | (modify-category-entry '(#x2d80 . #x2dde) ?e) | |
55a3ed16 | 301 | (let ((chars '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))) |
c94ae9eb DL |
302 | (while chars |
303 | (modify-syntax-entry (car chars) ".") | |
304 | (setq chars (cdr chars)))) | |
305 | (map-charset-chars #'modify-category-entry 'ethiopic ?e) | |
306 | ||
307 | ;; Hebrew character set (ISO-8859-8) | |
308 | ||
309 | (modify-syntax-entry #x5be ".") ; MAQAF | |
310 | (modify-syntax-entry #x5c0 ".") ; PASEQ | |
311 | (modify-syntax-entry #x5c3 ".") ; SOF PASUQ | |
312 | (modify-syntax-entry #x5f3 ".") ; GERESH | |
313 | (modify-syntax-entry #x5f4 ".") ; GERSHAYIM | |
314 | ||
315 | ;; Indian character set (IS 13194 and other Emacs original Indian charsets) | |
316 | ||
317 | (modify-category-entry '(#x901 . #x970) ?i) | |
318 | (map-charset-chars #'modify-category-entry 'indian-is13194 ?i) | |
319 | (map-charset-chars #'modify-category-entry 'indian-2-column ?i) | |
d05cfa1f | 320 | |
6eba8645 KH |
321 | ;; Lao character set |
322 | ||
abdaa411 DL |
323 | (modify-category-entry '(#xe80 . #xeff) ?o) |
324 | (map-charset-chars #'modify-category-entry 'lao ?o) | |
6eba8645 | 325 | |
abdaa411 | 326 | (let ((deflist '(("ກ-ຮ" "w" ?0) ; consonant |
e6d10035 KH |
327 | ("ະາຳຽເ-ໄ" "w" ?1) ; vowel base |
328 | ("ັິ-ືົໍ" "w" ?2) ; vowel upper | |
329 | ("ຸູ" "w" ?3) ; vowel lower | |
8f924df7 | 330 | ("່-໋" "w" ?4) ; tone mark |
e6d10035 KH |
331 | ("ຼຽ" "w" ?9) ; semivowel lower |
332 | ("໐-໙" "w" ?6) ; digit | |
333 | ("ຯໆ" "_" ?5) ; symbol | |
6eba8645 KH |
334 | )) |
335 | elm chars len syntax category to ch i) | |
336 | (while deflist | |
337 | (setq elm (car deflist)) | |
338 | (setq chars (car elm) | |
339 | len (length chars) | |
340 | syntax (nth 1 elm) | |
341 | category (nth 2 elm) | |
342 | i 0) | |
343 | (while (< i len) | |
344 | (if (= (aref chars i) ?-) | |
345 | (setq i (1+ i) | |
4a027a0d KH |
346 | to (aref chars i)) |
347 | (setq ch (aref chars i) | |
6eba8645 KH |
348 | to ch)) |
349 | (while (<= ch to) | |
269a5dd0 DL |
350 | (unless (string-equal syntax "w") |
351 | (modify-syntax-entry ch syntax)) | |
6eba8645 KH |
352 | (modify-category-entry ch category) |
353 | (setq ch (1+ ch))) | |
4a027a0d | 354 | (setq i (1+ i))) |
6eba8645 KH |
355 | (setq deflist (cdr deflist)))) |
356 | ||
4ed46869 KH |
357 | ;; Thai character set (TIS620) |
358 | ||
abdaa411 DL |
359 | (modify-category-entry '(#xe00 . #xe7f) ?t) |
360 | (map-charset-chars #'modify-category-entry 'thai-tis620 ?t) | |
4ed46869 KH |
361 | |
362 | (let ((deflist '(;; chars syntax category | |
e6d10035 KH |
363 | ("ก-รลว-ฮ" "w" ?0) ; consonant |
364 | ("ฤฦะาำเ-ๅ" "w" ?1) ; vowel base | |
365 | ("ัิ-ื็๎" "w" ?2) ; vowel upper | |
366 | ("ุ-ฺ" "w" ?3) ; vowel lower | |
8f924df7 | 367 | ("่-ํ" "w" ?4) ; tone mark |
e6d10035 KH |
368 | ("๐-๙" "w" ?6) ; digit |
369 | ("ฯๆ฿๏๚๛" "_" ?5) ; symbol | |
4ed46869 KH |
370 | )) |
371 | elm chars len syntax category to ch i) | |
9395eb7c KH |
372 | (while deflist |
373 | (setq elm (car deflist)) | |
374 | (setq chars (car elm) | |
375 | len (length chars) | |
376 | syntax (nth 1 elm) | |
377 | category (nth 2 elm) | |
378 | i 0) | |
379 | (while (< i len) | |
380 | (if (= (aref chars i) ?-) | |
381 | (setq i (1+ i) | |
4a027a0d KH |
382 | to (aref chars i)) |
383 | (setq ch (aref chars i) | |
9395eb7c KH |
384 | to ch)) |
385 | (while (<= ch to) | |
269a5dd0 DL |
386 | (unless (string-equal syntax "w") |
387 | (modify-syntax-entry ch syntax)) | |
9395eb7c KH |
388 | (modify-category-entry ch category) |
389 | (setq ch (1+ ch))) | |
4a027a0d | 390 | (setq i (1+ i))) |
9395eb7c KH |
391 | (setq deflist (cdr deflist)))) |
392 | ||
393 | ;; Tibetan character set | |
394 | ||
abdaa411 DL |
395 | (modify-category-entry '(#xf00 . #xfff) ?q) |
396 | (map-charset-chars #'modify-category-entry 'tibetan ?q) | |
397 | (map-charset-chars #'modify-category-entry 'tibetan-1-column ?q) | |
9395eb7c KH |
398 | |
399 | (let ((deflist '(;; chars syntax category | |
725d7c92 | 400 | ("ཀ-ཀྵཪ" "w" ?0) ; consonant |
55a3ed16 | 401 | ("ྐ-ྐྵྺྻྼ" "w" ?0) ; |
725d7c92 DL |
402 | ("ིེཻོཽྀ" "w" ?2) ; upper vowel |
403 | ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier | |
55a3ed16 | 404 | ("྄ཱུ༙༵༷" "w" ?3) ; lowel vowel/modifier |
8f924df7 | 405 | ("" "w" ?3) ; invisible vowel a |
725d7c92 DL |
406 | ("༠-༩༪-༳" "w" ?6) ; digit |
407 | ("་།-༒༔ཿ" "." ?|) ; line-break char | |
408 | ("་།༏༐༑༔ཿ" "." ?|) ; | |
409 | ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition | |
410 | ("་།༏༐༑༔ཿ" "." ?>) ; | |
411 | ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition | |
412 | ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others | |
9395eb7c KH |
413 | )) |
414 | elm chars len syntax category to ch i) | |
4ed46869 KH |
415 | (while deflist |
416 | (setq elm (car deflist)) | |
417 | (setq chars (car elm) | |
418 | len (length chars) | |
419 | syntax (nth 1 elm) | |
420 | category (nth 2 elm) | |
421 | i 0) | |
422 | (while (< i len) | |
423 | (if (= (aref chars i) ?-) | |
424 | (setq i (1+ i) | |
4a027a0d KH |
425 | to (aref chars i)) |
426 | (setq ch (aref chars i) | |
4ed46869 KH |
427 | to ch)) |
428 | (while (<= ch to) | |
269a5dd0 DL |
429 | (unless (string-equal syntax "w") |
430 | (modify-syntax-entry ch syntax)) | |
4ed46869 KH |
431 | (modify-category-entry ch category) |
432 | (setq ch (1+ ch))) | |
4a027a0d | 433 | (setq i (1+ i))) |
4ed46869 KH |
434 | (setq deflist (cdr deflist)))) |
435 | ||
436 | ;; Vietnamese character set | |
437 | ||
abdaa411 DL |
438 | ;; To make a word with Latin characters |
439 | (map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l) | |
440 | (map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v) | |
441 | ||
442 | (map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l) | |
443 | (map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v) | |
4ed46869 | 444 | |
e5dd1155 KH |
445 | (let ((tbl (standard-case-table)) |
446 | (i 32)) | |
447 | (while (< i 128) | |
725d7c92 DL |
448 | (let* ((char (decode-char 'vietnamese-viscii-upper i)) |
449 | (charl (decode-char 'vietnamese-viscii-lower i)) | |
450 | (uc (encode-char char 'ucs)) | |
451 | (lc (encode-char charl 'ucs))) | |
452 | (set-case-syntax-pair char (decode-char 'vietnamese-viscii-lower i) | |
4eb97232 | 453 | tbl) |
725d7c92 DL |
454 | (if uc (modify-category-entry uc ?v)) |
455 | (if lc (modify-category-entry lc ?v))) | |
e5dd1155 KH |
456 | (setq i (1+ i)))) |
457 | ||
d807d0c7 KH |
458 | ;; Tai Viet |
459 | (let ((deflist '(;; chars syntax category | |
460 | ((?ꪀ. ?ꪯ) "w" ?0) ; consonant | |
461 | ("ꪱꪵꪶ" "w" ?1) ; vowel base | |
462 | ((?ꪹ . ?ꪽ) "w" ?1) ; vowel base | |
463 | ("ꪰꪲꪳꪷꪸꪾ" "w" ?2) ; vowel upper | |
464 | ("ꪴ" "w" ?3) ; vowel lower | |
465 | ("ꫀꫂ" "w" ?1) ; non-combining tone-mark | |
466 | ("꪿꫁" "w" ?4) ; combining tone-mark | |
467 | ((?ꫛ . ?꫟) "_" ?5) ; symbol | |
468 | ))) | |
469 | (dolist (elm deflist) | |
470 | (let ((chars (car elm)) | |
471 | (syntax (nth 1 elm)) | |
472 | (category (nth 2 elm))) | |
473 | (if (consp chars) | |
474 | (progn | |
475 | (modify-syntax-entry chars syntax) | |
476 | (modify-category-entry chars category)) | |
477 | (mapc #'(lambda (x) | |
478 | (modify-syntax-entry x syntax) | |
479 | (modify-category-entry x category)) | |
480 | chars))))) | |
c94ae9eb DL |
481 | |
482 | ;; Latin | |
483 | ||
484 | (modify-category-entry '(#x80 . #x024F) ?l) | |
d05cfa1f | 485 | |
85ef8ece KH |
486 | (let ((tbl (standard-case-table)) c) |
487 | ||
4fb82d62 DL |
488 | ;; Latin-1 |
489 | ||
490 | ;; Fixme: Some of the non-word syntaxes here perhaps should be | |
491 | ;; reviewed. (Note that the following all implicitly have word | |
492 | ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of | |
493 | ;; relating Unicode categories to Emacs syntax codes. | |
db92e81e KH |
494 | |
495 | ;; NBSP isn't semantically interchangeable with other whitespace chars, | |
496 | ;; so it's more like punctuation. | |
497 | (set-case-syntax ? "." tbl) | |
4fb82d62 DL |
498 | (set-case-syntax ?¡ "." tbl) |
499 | (set-case-syntax ?¦ "_" tbl) | |
500 | (set-case-syntax ?§ "." tbl) | |
501 | (set-case-syntax ?© "_" tbl) | |
502 | (set-case-syntax-delims 171 187 tbl) ; « » | |
503 | (set-case-syntax ?¬ "_" tbl) | |
504 | (set-case-syntax ? "_" tbl) | |
505 | (set-case-syntax ?® "_" tbl) | |
506 | (set-case-syntax ?° "_" tbl) | |
507 | (set-case-syntax ?± "_" tbl) | |
508 | (set-case-syntax ?µ "_" tbl) | |
509 | (set-case-syntax ?· "_" tbl) | |
510 | (set-case-syntax ?¼ "_" tbl) | |
511 | (set-case-syntax ?½ "_" tbl) | |
512 | (set-case-syntax ?¾ "_" tbl) | |
513 | (set-case-syntax ?¿ "." tbl) | |
514 | (let ((c 192)) | |
515 | (while (<= c 222) | |
516 | (set-case-syntax-pair c (+ c 32) tbl) | |
517 | (setq c (1+ c)))) | |
518 | (set-case-syntax ?× "_" tbl) | |
519 | (set-case-syntax ?ß "w" tbl) | |
520 | (set-case-syntax ?÷ "_" tbl) | |
521 | ;; See below for ÿ. | |
85ef8ece | 522 | |
85ef8ece KH |
523 | ;; Latin Extended-A, Latin Extended-B |
524 | (setq c #x0100) | |
e5e381c8 KH |
525 | (while (<= c #x02B8) |
526 | (modify-category-entry c ?l) | |
d05cfa1f | 527 | (setq c (1+ c))) |
2bb915b8 | 528 | |
e5e381c8 KH |
529 | (let ((pair-ranges '((#x0100 . #x012F) |
530 | (#x0132 . #x0137) | |
531 | (#x0139 . #x0148) | |
532 | (#x014a . #x0177) | |
533 | (#x0179 . #x017E) | |
534 | (#x0182 . #x0185) | |
796f8b2f KH |
535 | (#x0187 . #x0188) |
536 | (#x018B . #x018C) | |
e5e381c8 KH |
537 | (#x0191 . #x0192) |
538 | (#x0198 . #x0199) | |
539 | (#x01A0 . #x01A5) | |
540 | (#x01A7 . #x01A8) | |
541 | (#x01AC . #x01AD) | |
542 | (#x01AF . #x01B0) | |
543 | (#x01B3 . #x01B6) | |
544 | (#x01BC . #x01BD) | |
545 | (#x01CD . #x01DC) | |
546 | (#x01DE . #x01EF) | |
547 | (#x01F4 . #x01F5) | |
548 | (#x01F8 . #x021F) | |
549 | (#x0222 . #x0233) | |
550 | (#x023B . #x023C) | |
551 | (#x0241 . #x0242) | |
552 | (#x0246 . #x024F)))) | |
553 | (dolist (elt pair-ranges) | |
554 | (let ((from (car elt)) (to (cdr elt))) | |
555 | (while (< from to) | |
556 | (set-case-syntax-pair from (1+ from) tbl) | |
557 | (setq from (+ from 2)))))) | |
2bb915b8 | 558 | |
796f8b2f KH |
559 | (set-case-syntax-pair #x189 #x256 tbl) |
560 | (set-case-syntax-pair #x18A #x257 tbl) | |
561 | ||
2bb915b8 KH |
562 | ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I |
563 | ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so | |
564 | ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN | |
565 | ;; SMALL LETTER I. | |
566 | ||
567 | ;; We used to set up half of those correspondences unconditionally, | |
568 | ;; but that makes searches slow. So now we don't set up either half | |
569 | ;; of these correspondences by default. | |
570 | ||
571 | ;; (set-downcase-syntax ?İ ?i tbl) | |
572 | ;; (set-upcase-syntax ?I ?ı tbl) | |
573 | ||
e6d10035 KH |
574 | (set-case-syntax-pair ?DŽ ?dž tbl) |
575 | (set-case-syntax-pair ?Dž ?dž tbl) | |
576 | (set-case-syntax-pair ?LJ ?lj tbl) | |
577 | (set-case-syntax-pair ?Lj ?lj tbl) | |
578 | (set-case-syntax-pair ?NJ ?nj tbl) | |
579 | (set-case-syntax-pair ?Nj ?nj tbl) | |
e5e381c8 | 580 | |
269a5dd0 | 581 | ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON |
e6d10035 KH |
582 | (set-case-syntax-pair ?DZ ?dz tbl) |
583 | (set-case-syntax-pair ?Dz ?dz tbl) | |
e6d10035 KH |
584 | (set-case-syntax-pair ?Ƕ ?ƕ tbl) |
585 | (set-case-syntax-pair ?Ƿ ?ƿ tbl) | |
269a5dd0 | 586 | |
85ef8ece | 587 | ;; Latin Extended Additional |
abdaa411 | 588 | (modify-category-entry '(#x1e00 . #x1ef9) ?l) |
85ef8ece | 589 | (setq c #x1e00) |
d05cfa1f | 590 | (while (<= c #x1ef9) |
d05cfa1f KH |
591 | (and (zerop (% c 2)) |
592 | (or (<= c #x1e94) (>= c #x1ea0)) | |
abdaa411 | 593 | (set-case-syntax-pair c (1+ c) tbl)) |
d05cfa1f KH |
594 | (setq c (1+ c))) |
595 | ||
85ef8ece | 596 | ;; Greek |
abdaa411 | 597 | (modify-category-entry '(#x0370 . #x03ff) ?g) |
85ef8ece | 598 | (setq c #x0370) |
d05cfa1f | 599 | (while (<= c #x03ff) |
d05cfa1f KH |
600 | (if (or (and (>= c #x0391) (<= c #x03a1)) |
601 | (and (>= c #x03a3) (<= c #x03ab))) | |
abdaa411 | 602 | (set-case-syntax-pair c (+ c 32) tbl)) |
d05cfa1f KH |
603 | (and (>= c #x03da) |
604 | (<= c #x03ee) | |
605 | (zerop (% c 2)) | |
abdaa411 | 606 | (set-case-syntax-pair c (1+ c) tbl)) |
d05cfa1f | 607 | (setq c (1+ c))) |
e6d10035 KH |
608 | (set-case-syntax-pair ?Ά ?ά tbl) |
609 | (set-case-syntax-pair ?Έ ?έ tbl) | |
610 | (set-case-syntax-pair ?Ή ?ή tbl) | |
611 | (set-case-syntax-pair ?Ί ?ί tbl) | |
612 | (set-case-syntax-pair ?Ό ?ό tbl) | |
613 | (set-case-syntax-pair ?Ύ ?ύ tbl) | |
614 | (set-case-syntax-pair ?Ώ ?ώ tbl) | |
d05cfa1f | 615 | |
269a5dd0 DL |
616 | ;; Armenian |
617 | (setq c #x531) | |
618 | (while (<= c #x556) | |
abdaa411 | 619 | (set-case-syntax-pair c (+ c #x30) tbl) |
269a5dd0 DL |
620 | (setq c (1+ c))) |
621 | ||
85ef8ece | 622 | ;; Greek Extended |
abdaa411 | 623 | (modify-category-entry '(#x1f00 . #x1fff) ?g) |
85ef8ece | 624 | (setq c #x1f00) |
d05cfa1f | 625 | (while (<= c #x1fff) |
d05cfa1f KH |
626 | (and (<= (logand c #x000f) 7) |
627 | (<= c #x1fa7) | |
796f8b2f KH |
628 | (not (memq c '(#x1f16 #x1f17 #x1f56 #x1f57 |
629 | #x1f50 #x1f52 #x1f54 #x1f56))) | |
630 | (/= (logand c #x00f0) #x70) | |
abdaa411 | 631 | (set-case-syntax-pair (+ c 8) c tbl)) |
d05cfa1f | 632 | (setq c (1+ c))) |
e6d10035 KH |
633 | (set-case-syntax-pair ?Ᾰ ?ᾰ tbl) |
634 | (set-case-syntax-pair ?Ᾱ ?ᾱ tbl) | |
635 | (set-case-syntax-pair ?Ὰ ?ὰ tbl) | |
636 | (set-case-syntax-pair ?Ά ?ά tbl) | |
637 | (set-case-syntax-pair ?ᾼ ?ᾳ tbl) | |
638 | (set-case-syntax-pair ?Ὲ ?ὲ tbl) | |
639 | (set-case-syntax-pair ?Έ ?έ tbl) | |
640 | (set-case-syntax-pair ?Ὴ ?ὴ tbl) | |
641 | (set-case-syntax-pair ?Ή ?ή tbl) | |
642 | (set-case-syntax-pair ?ῌ ?ῃ tbl) | |
643 | (set-case-syntax-pair ?Ῐ ?ῐ tbl) | |
644 | (set-case-syntax-pair ?Ῑ ?ῑ tbl) | |
645 | (set-case-syntax-pair ?Ὶ ?ὶ tbl) | |
646 | (set-case-syntax-pair ?Ί ?ί tbl) | |
647 | (set-case-syntax-pair ?Ῠ ?ῠ tbl) | |
648 | (set-case-syntax-pair ?Ῡ ?ῡ tbl) | |
649 | (set-case-syntax-pair ?Ὺ ?ὺ tbl) | |
650 | (set-case-syntax-pair ?Ύ ?ύ tbl) | |
651 | (set-case-syntax-pair ?Ῥ ?ῥ tbl) | |
652 | (set-case-syntax-pair ?Ὸ ?ὸ tbl) | |
653 | (set-case-syntax-pair ?Ό ?ό tbl) | |
654 | (set-case-syntax-pair ?Ὼ ?ὼ tbl) | |
655 | (set-case-syntax-pair ?Ώ ?ώ tbl) | |
656 | (set-case-syntax-pair ?ῼ ?ῳ tbl) | |
d05cfa1f | 657 | |
85ef8ece | 658 | ;; cyrillic |
abdaa411 | 659 | (modify-category-entry '(#x0400 . #x04FF) ?y) |
85ef8ece | 660 | (setq c #x0400) |
d05cfa1f | 661 | (while (<= c #x04ff) |
d05cfa1f KH |
662 | (and (>= c #x0400) |
663 | (<= c #x040f) | |
abdaa411 | 664 | (set-case-syntax-pair c (+ c 80) tbl)) |
d05cfa1f KH |
665 | (and (>= c #x0410) |
666 | (<= c #x042f) | |
abdaa411 | 667 | (set-case-syntax-pair c (+ c 32) tbl)) |
d05cfa1f KH |
668 | (and (zerop (% c 2)) |
669 | (or (and (>= c #x0460) (<= c #x0480)) | |
670 | (and (>= c #x048c) (<= c #x04be)) | |
671 | (and (>= c #x04d0) (<= c #x04f4))) | |
8f924df7 | 672 | (set-case-syntax-pair c (1+ c) tbl)) |
d05cfa1f | 673 | (setq c (1+ c))) |
e6d10035 KH |
674 | (set-case-syntax-pair ?Ӂ ?ӂ tbl) |
675 | (set-case-syntax-pair ?Ӄ ?ӄ tbl) | |
676 | (set-case-syntax-pair ?Ӈ ?ӈ tbl) | |
677 | (set-case-syntax-pair ?Ӌ ?ӌ tbl) | |
678 | (set-case-syntax-pair ?Ӹ ?ӹ tbl) | |
d05cfa1f | 679 | |
85ef8ece KH |
680 | ;; general punctuation |
681 | (setq c #x2000) | |
d05cfa1f KH |
682 | (while (<= c #x200b) |
683 | (set-case-syntax c " " tbl) | |
684 | (setq c (1+ c))) | |
b427c97e DL |
685 | (while (<= c #x200F) |
686 | (set-case-syntax c "." tbl) | |
687 | (setq c (1+ c))) | |
688 | ;; Fixme: These aren't all right: | |
6b61353c KH |
689 | (setq c #x2010) |
690 | (while (<= c #x2016) | |
691 | (set-case-syntax c "_" tbl) | |
692 | (setq c (1+ c))) | |
693 | ;; Punctuation syntax for quotation marks (like `) | |
694 | (while (<= c #x201f) | |
695 | (set-case-syntax c "." tbl) | |
696 | (setq c (1+ c))) | |
697 | ;; Fixme: These aren't all right: | |
d05cfa1f KH |
698 | (while (<= c #x2027) |
699 | (set-case-syntax c "_" tbl) | |
700 | (setq c (1+ c))) | |
b427c97e DL |
701 | (while (<= c #x206F) |
702 | (set-case-syntax c "." tbl) | |
703 | (setq c (1+ c))) | |
d05cfa1f | 704 | |
269a5dd0 DL |
705 | ;; Roman numerals |
706 | (setq c #x2160) | |
707 | (while (<= c #x216f) | |
abdaa411 | 708 | (set-case-syntax-pair c (+ c #x10) tbl) |
269a5dd0 DL |
709 | (setq c (1+ c))) |
710 | ||
4fb82d62 DL |
711 | ;; Fixme: The following blocks might be better as symbol rather than |
712 | ;; punctuation. | |
b427c97e DL |
713 | ;; Arrows |
714 | (setq c #x2190) | |
6ca54a3a DL |
715 | (while (<= c #x21FF) |
716 | (set-case-syntax c "." tbl) | |
b427c97e DL |
717 | (setq c (1+ c))) |
718 | ;; Mathematical Operators | |
719 | (while (<= c #x22FF) | |
6ca54a3a | 720 | (set-case-syntax c "." tbl) |
b427c97e DL |
721 | (setq c (1+ c))) |
722 | ;; Miscellaneous Technical | |
723 | (while (<= c #x23FF) | |
6ca54a3a | 724 | (set-case-syntax c "." tbl) |
b427c97e DL |
725 | (setq c (1+ c))) |
726 | ;; Control Pictures | |
727 | (while (<= c #x243F) | |
6ca54a3a | 728 | (set-case-syntax c "_" tbl) |
269a5dd0 DL |
729 | (setq c (1+ c))) |
730 | ||
731 | ;; Circled Latin | |
732 | (setq c #x24b6) | |
733 | (while (<= c #x24cf) | |
abdaa411 DL |
734 | (set-case-syntax-pair c (+ c 26) tbl) |
735 | (modify-category-entry c ?l) | |
736 | (modify-category-entry (+ c 26) ?l) | |
269a5dd0 DL |
737 | (setq c (1+ c))) |
738 | ||
739 | ;; Fullwidth Latin | |
740 | (setq c #xff21) | |
741 | (while (<= c #xff3a) | |
abdaa411 DL |
742 | (set-case-syntax-pair c (+ c #x20) tbl) |
743 | (modify-category-entry c ?l) | |
744 | (modify-category-entry (+ c #x20) ?l) | |
269a5dd0 DL |
745 | (setq c (1+ c))) |
746 | ||
269a5dd0 | 747 | ;; Combining diacritics |
abdaa411 | 748 | (modify-category-entry '(#x300 . #x362) ?^) |
269a5dd0 | 749 | ;; Combining marks |
abdaa411 | 750 | (modify-category-entry '(#x20d0 . #x20e3) ?^) |
269a5dd0 DL |
751 | |
752 | ;; Fixme: syntax for symbols &c | |
753 | ) | |
6b61353c KH |
754 | |
;; Give bracket-like characters paired open/close parenthesis syntax
;; in the current syntax table (the default when `modify-syntax-entry'
;; is called without a table argument).
;;
;; Each element is (OPEN . CLOSE), written as code points rather than
;; as literal characters.  Compatibility forms such as U+FF08 or
;; U+2329 are easily folded into look-alike characters (ASCII
;; parentheses, U+3008) when a file is normalized or re-encoded; a
;; literal would then silently stop naming the intended character.
(let ((pairs
       '((#x2045 . #x2046) (#x207D . #x207E) (#x208D . #x208E)
         (#x2329 . #x232A) (#x23B4 . #x23B5)
         (#x2768 . #x2769) (#x276A . #x276B) (#x276C . #x276D)
         (#x2770 . #x2771) (#x2772 . #x2773) (#x2774 . #x2775)
         (#x27E6 . #x27E7) (#x27E8 . #x27E9) (#x27EA . #x27EB)
         (#x2983 . #x2984) (#x2985 . #x2986) (#x2987 . #x2988)
         (#x2989 . #x298A) (#x298B . #x298C) (#x298D . #x298E)
         (#x298F . #x2990) (#x2991 . #x2992) (#x2993 . #x2994)
         (#x2995 . #x2996) (#x2997 . #x2998) (#x29FC . #x29FD)
         (#x3008 . #x3009) (#x300A . #x300B) (#x300C . #x300D)
         (#x300E . #x300F) (#x3010 . #x3011) (#x3014 . #x3015)
         (#x3016 . #x3017) (#x3018 . #x3019) (#x301A . #x301B)
         (#xFD3E . #xFD3F)
         (#xFE35 . #xFE36) (#xFE37 . #xFE38) (#xFE39 . #xFE3A)
         (#xFE3B . #xFE3C) (#xFE3D . #xFE3E) (#xFE3F . #xFE40)
         (#xFE41 . #xFE42) (#xFE43 . #xFE44)
         (#xFE59 . #xFE5A) (#xFE5B . #xFE5C) (#xFE5D . #xFE5E)
         (#xFF08 . #xFF09) (#xFF3B . #xFF3D) (#xFF5B . #xFF5D)
         (#xFF5F . #xFF60) (#xFF62 . #xFF63))))
  (dolist (pair pairs)
    (let ((open (car pair))
          (close (cdr pair)))
      ;; A syntax descriptor is the class character followed by the
      ;; matching character: "(CLOSE" for OPEN, ")OPEN" for CLOSE.
      (modify-syntax-entry open (string ?\( close))
      (modify-syntax-entry close (string ?\) open)))))
812 | ||
4ed46869 | 813 | \f |
777cfce6 | 814 | ;; For each character set, put the information of the most proper |
aaa9f206 | 815 | ;; coding system to encode it by `preferred-coding-system' property. |
777cfce6 | 816 | |
abdaa411 | 817 | ;; Fixme: should this be junked? |
777cfce6 KH |
;; Record, for each charset named below, the coding system stored
;; under its `preferred-coding-system' property.  Each element is
;; (CHARSET . CODING-SYSTEM).
(dolist (entry
         '((latin-iso8859-1 . iso-latin-1)
           (latin-iso8859-2 . iso-latin-2)
           (latin-iso8859-3 . iso-latin-3)
           (latin-iso8859-4 . iso-latin-4)
           (thai-tis620 . thai-tis620)
           (greek-iso8859-7 . greek-iso-8bit)
           (arabic-iso8859-6 . iso-2022-7bit)
           (hebrew-iso8859-8 . hebrew-iso-8bit)
           (katakana-jisx0201 . japanese-shift-jis)
           (latin-jisx0201 . japanese-shift-jis)
           (cyrillic-iso8859-5 . cyrillic-iso-8bit)
           (latin-iso8859-9 . iso-latin-5)
           (japanese-jisx0208-1978 . iso-2022-jp)
           (chinese-gb2312 . chinese-iso-8bit)
           (chinese-gbk . chinese-gbk)
           (gb18030-2-byte . chinese-gb18030)
           (gb18030-4-byte-bmp . chinese-gb18030)
           (gb18030-4-byte-smp . chinese-gb18030)
           (gb18030-4-byte-ext-1 . chinese-gb18030)
           (gb18030-4-byte-ext-2 . chinese-gb18030)
           (japanese-jisx0208 . iso-2022-jp)
           (korean-ksc5601 . iso-2022-kr)
           (japanese-jisx0212 . iso-2022-jp)
           (chinese-big5-1 . chinese-big5)
           (chinese-big5-2 . chinese-big5)
           (chinese-sisheng . iso-2022-7bit)
           (ipa . iso-2022-7bit)
           (vietnamese-viscii-lower . vietnamese-viscii)
           (vietnamese-viscii-upper . vietnamese-viscii)
           (arabic-digit . iso-2022-7bit)
           (arabic-1-column . iso-2022-7bit)
           (lao . lao)
           (arabic-2-column . iso-2022-7bit)
           (indian-is13194 . devanagari)
           (indian-glyph . devanagari)
           (tibetan-1-column . tibetan)
           (ethiopic . iso-2022-7bit)
           (chinese-cns11643-1 . iso-2022-cn)
           (chinese-cns11643-2 . iso-2022-cn)
           (chinese-cns11643-3 . iso-2022-cn)
           (chinese-cns11643-4 . iso-2022-cn)
           (chinese-cns11643-5 . iso-2022-cn)
           (chinese-cns11643-6 . iso-2022-cn)
           (chinese-cns11643-7 . iso-2022-cn)
           (indian-2-column . devanagari)
           (tibetan . tibetan)
           (latin-iso8859-14 . iso-latin-8)
           (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car entry) 'preferred-coding-system (cdr entry)))
df0415c5 KH |
869 | |
870 | \f | |
98a663f1 | 871 | ;; Setup auto-fill-chars for charsets that should invoke auto-filling. |
7760ba82 | 872 | ;; SPACE and NEWLINE are already set. |
df21429c KH |
873 | |
;; Mark each of these code-point ranges in `auto-fill-chars'.
(dolist (range '((#x3041 . #x30FF)
                 (#x3400 . #x4DB5)
                 (#x4e00 . #x9fbb)
                 (#xF900 . #xFAFF)
                 (#xFF00 . #xFF9F)
                 (#x20000 . #x2FFFF)))
  (set-char-table-range auto-fill-chars range t))
880 | ||
55bd52ea | 881 | \f |
7760ba82 KH |
882 | ;;; Setting char-width-table. The default is 1. |
883 | ||
884 | ;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial | |
885 | ;; and final characters. | |
;; Store width 0 in `char-width-table' for every (FROM . TO) range
;; listed here.
(mapc (lambda (range)
        (set-char-table-range char-width-table range 0))
      '((#x0300 . #x036F)
        (#x0483 . #x0489)
        (#x0591 . #x05BD)
        (#x05BF . #x05BF)
        (#x05C1 . #x05C2)
        (#x05C4 . #x05C5)
        (#x05C7 . #x05C7)
        (#x0600 . #x0603)
        (#x0610 . #x0615)
        (#x064B . #x065E)
        (#x0670 . #x0670)
        (#x06D6 . #x06E4)
        (#x06E7 . #x06E8)
        (#x06EA . #x06ED)
        (#x070F . #x070F)
        (#x0711 . #x0711)
        (#x0730 . #x074A)
        (#x07A6 . #x07B0)
        (#x07EB . #x07F3)
        (#x0901 . #x0902)
        (#x093C . #x093C)
        (#x0941 . #x0948)
        (#x094D . #x094D)
        (#x0951 . #x0954)
        (#x0962 . #x0963)
        (#x0981 . #x0981)
        (#x09BC . #x09BC)
        (#x09C1 . #x09C4)
        (#x09CD . #x09CD)
        (#x09E2 . #x09E3)
        (#x0A01 . #x0A02)
        (#x0A3C . #x0A3C)
        (#x0A41 . #x0A4D)
        (#x0A70 . #x0A71)
        (#x0A81 . #x0A82)
        (#x0ABC . #x0ABC)
        (#x0AC1 . #x0AC8)
        (#x0ACD . #x0ACD)
        (#x0AE2 . #x0AE3)
        (#x0B01 . #x0B01)
        (#x0B3C . #x0B3C)
        (#x0B3F . #x0B3F)
        (#x0B41 . #x0B43)
        (#x0B4D . #x0B56)
        (#x0B82 . #x0B82)
        (#x0BC0 . #x0BC0)
        (#x0BCD . #x0BCD)
        (#x0C3E . #x0C40)
        (#x0C46 . #x0C56)
        (#x0CBC . #x0CBC)
        (#x0CBF . #x0CBF)
        (#x0CC6 . #x0CC6)
        (#x0CCC . #x0CCD)
        (#x0CE2 . #x0CE3)
        (#x0D41 . #x0D43)
        (#x0D4D . #x0D4D)
        (#x0DCA . #x0DCA)
        (#x0DD2 . #x0DD6)
        (#x0E31 . #x0E31)
        (#x0E34 . #x0E3A)
        (#x0E47 . #x0E4E)
        (#x0EB1 . #x0EB1)
        (#x0EB4 . #x0EBC)
        (#x0EC8 . #x0ECD)
        (#x0F18 . #x0F19)
        (#x0F35 . #x0F35)
        (#x0F37 . #x0F37)
        (#x0F39 . #x0F39)
        (#x0F71 . #x0F7E)
        (#x0F80 . #x0F84)
        (#x0F86 . #x0F87)
        (#x0F90 . #x0FBC)
        (#x0FC6 . #x0FC6)
        (#x102D . #x1030)
        (#x1032 . #x1037)
        (#x1039 . #x1039)
        (#x1058 . #x1059)
        (#x1160 . #x11FF)
        (#x135F . #x135F)
        (#x1712 . #x1714)
        (#x1732 . #x1734)
        (#x1752 . #x1753)
        (#x1772 . #x1773)
        (#x17B4 . #x17B5)
        (#x17B7 . #x17BD)
        (#x17C6 . #x17C6)
        (#x17C9 . #x17D3)
        (#x17DD . #x17DD)
        (#x180B . #x180D)
        (#x18A9 . #x18A9)
        (#x1920 . #x1922)
        (#x1927 . #x1928)
        (#x1932 . #x1932)
        (#x1939 . #x193B)
        (#x1A17 . #x1A18)
        (#x1B00 . #x1B03)
        (#x1B34 . #x1B34)
        (#x1B36 . #x1B3A)
        (#x1B3C . #x1B3C)
        (#x1B42 . #x1B42)
        (#x1B6B . #x1B73)
        (#x1DC0 . #x1DFF)
        (#x200B . #x200F)
        (#x202A . #x202E)
        (#x2060 . #x206F)
        (#x20D0 . #x20EF)
        (#x302A . #x302F)
        (#x3099 . #x309A)
        (#xA806 . #xA806)
        (#xA80B . #xA80B)
        (#xA825 . #xA826)
        (#xFB1E . #xFB1E)
        (#xFE00 . #xFE0F)
        (#xFE20 . #xFE23)
        (#xFEFF . #xFEFF)
        (#xFFF9 . #xFFFB)
        (#x10A01 . #x10A0F)
        (#x10A38 . #x10A3F)
        (#x1D167 . #x1D169)
        (#x1D173 . #x1D182)
        (#x1D185 . #x1D18B)
        (#x1D1AA . #x1D1AD)
        (#x1D242 . #x1D244)
        (#xE0001 . #xE01EF)))
1012 | ||
1013 | ;; 2: East Asian Wide and Full-width characters. | |
;; Store width 2 in `char-width-table' for every (FROM . TO) range
;; listed here.
(mapc (lambda (range)
        (set-char-table-range char-width-table range 2))
      '((#x1100 . #x115F)
        (#x2329 . #x232A)
        (#x2E80 . #x303E)
        (#x3040 . #xA4CF)
        (#xAC00 . #xD7A3)
        (#xF900 . #xFAFF)
        (#xFE30 . #xFE6F)
        (#xFF01 . #xFF60)
        (#xFFE0 . #xFFE6)
        (#x20000 . #x2FFFF)
        (#x30000 . #x3FFFF)))

;; Other double width
;;(map-charset-chars
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'ethiopic)
;; (map-charset-chars
;;  (lambda (range ignore) (set-char-table-range char-width-table range 2))
;;  'tibetan)
;; Every character of these two charsets also gets width 2.
(dolist (charset '(indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (range ignore)
     (set-char-table-range char-width-table range 2))
   charset))
777cfce6 | 1041 | |
dbff07a2 KH |
;; Internal use only.
;; Alist of locale symbol vs charsets.  In a language environment
;; corresponding to the locale, width of characters in the charsets is
;; set to 2.  Each element has the form:
;;   (LOCALE TABLE (CHARSET (FROM-CODE . TO-CODE) ...) ...)
;; LOCALE: locale symbol
;; TABLE: char-table used for char-width-table, initially nil.
;;   `use-cjk-char-width-table' builds it on first use and stores it
;;   back into this slot, so the list is modified in place.
;; CHARSET: character set
;; FROM-CODE, TO-CODE: range of code-points in CHARSET

(defvar cjk-char-width-table-list
  '((ja_JP nil (japanese-jisx0208 (#x2121 . #x287E))
	   (cp932-2-byte (#x8140 . #x879F)))
    (zh_CN nil (chinese-gb2312 (#x2121 . #x297E)))
    (zh_HK nil (big5-hkscs (#xA140 . #xA3FE) (#xC6A0 . #xC8FE)))
    (zh_TW nil (big5 (#xA140 . #xA3FE))
	   (chinese-cns11643-1 (#x2121 . #x427E)))
    (ko_KR nil (korean-ksc5601 (#x2121 . #x2C7E)))))
1060 | ||
1061 | ;; Internal use only. | |
1062 | ;; Setup char-width-table appropriate for a language environment | |
1063 | ;; corresponding to LOCALE-NAME (symbol). | |
1064 | ||
(defun use-cjk-char-width-table (locale-name)
  "Internal use only.
Make `char-width-table' the table for the CJK locale LOCALE-NAME (a symbol).
LOCALE-NAME must be a key of `cjk-char-width-table-list'; otherwise
signal an error.  The locale's table is built the first time it is
requested and cached in `cjk-char-width-table-list'."
  ;; Climb to the parentless table first, so a previously installed
  ;; locale table is dropped.
  (let (parent)
    (while (setq parent (char-table-parent char-width-table))
      (setq char-width-table parent)))
  (let ((entry (assq locale-name cjk-char-width-table-list)))
    (unless entry
      (error "Unknown locale for CJK language environment: %s"
             locale-name))
    (when (null (nth 1 entry))
      ;; No cached table yet: create one whose listed code ranges have
      ;; width 2 and which inherits everything else from the current
      ;; `char-width-table'.
      (let ((table (make-char-table nil)))
        (dolist (charset-spec (nthcdr 2 entry))
          (dolist (codes (cdr charset-spec))
            (map-charset-chars
             (lambda (range arg)
               (set-char-table-range table range 2))
             (car charset-spec) nil
             (car codes) (cdr codes))))
        (optimize-char-table table)
        (set-char-table-parent table char-width-table)
        (setcar (cdr entry) table)))
    (setq char-width-table (nth 1 entry))))
55a3ed16 KH |
1085 | |
(defun use-default-char-width-table ()
  "Internal use only.
Setup char-width-table appropriate for non-CJK language environment.
This replaces `char-width-table' by its parent repeatedly until the
table has no parent."
  (let (parent)
    (while (setq parent (char-table-parent char-width-table))
      (setq char-width-table parent))))
55a3ed16 | 1091 | |
;; Optimize the standard case table, then the standard syntax table.
(mapc #'optimize-char-table
      (list (standard-case-table) (standard-syntax-table)))
1094 | ||
55a3ed16 KH |
1095 | \f |
1096 | ;; Setting char-script-table. | |
1097 | ||
b427c97e DL |
1098 | ;; The Unicode blocks actually extend past some of these ranges with |
1099 | ;; undefined codepoints. | |
9ce5de1c KH |
;; Fill `char-script-table' from a list of (FROM TO SCRIPT) entries
;; and record every distinct SCRIPT, in order of first appearance, in
;; extra slot 0 of the table.
(let ((script-list nil))
  (dolist
      (elt
       '((#x0000 #x007F latin)
	 (#x00A0 #x024F latin)
	 (#x0250 #x02AF phonetic)
	 (#x02B0 #x036F latin)
	 (#x0370 #x03E1 greek)
	 (#x03E2 #x03EF coptic)
	 (#x03F0 #x03F3 greek)
	 (#x0400 #x04FF cyrillic)
	 (#x0530 #x058F armenian)
	 (#x0590 #x05FF hebrew)
	 (#x0600 #x06FF arabic)
	 (#x0700 #x074F syriac)
	 (#x07C0 #x07FA nko)
	 (#x0780 #x07BF thaana)
	 (#x0900 #x097F devanagari)
	 (#x0980 #x09FF bengali)
	 (#x0A00 #x0A7F gurmukhi)
	 (#x0A80 #x0AFF gujarati)
	 (#x0B00 #x0B7F oriya)
	 (#x0B80 #x0BFF tamil)
	 (#x0C00 #x0C7F telugu)
	 (#x0C80 #x0CFF kannada)
	 (#x0D00 #x0D7F malayalam)
	 (#x0D80 #x0DFF sinhala)
	 (#x0E00 #x0E5F thai)
	 (#x0E80 #x0EDF lao)
	 (#x0F00 #x0FFF tibetan)
	 (#x1000 #x109F burmese)
	 (#x10A0 #x10FF georgian)
	 (#x1100 #x11FF hangul)
	 (#x1200 #x139F ethiopic)
	 (#x13A0 #x13FF cherokee)
	 (#x1400 #x167F canadian-aboriginal)
	 (#x1680 #x169F ogham)
	 (#x16A0 #x16FF runic)
	 (#x1780 #x17FF khmer)
	 (#x1800 #x18AF mongolian)
	 (#x1D00 #x1DFF phonetic)
	 (#x1E00 #x1EFF latin)
	 (#x1F00 #x1FFF greek)
	 (#x2000 #x27FF symbol)
	 (#x2800 #x28FF braille)
	 (#x2D80 #x2DDF ethiopic)
	 (#x2E80 #x2FDF han)
	 (#x2FF0 #x2FFF ideographic-description)
	 (#x3000 #x303F cjk-misc)
	 (#x3040 #x30FF kana)
	 (#x3100 #x312F bopomofo)
	 (#x3130 #x318F hangul)
	 (#x3190 #x319F kanbun)
	 (#x31A0 #x31BF bopomofo)
	 (#x3400 #x9FAF han)
	 (#xA000 #xA4CF yi)
	 (#xAA00 #xAA5F cham)
	 (#xAA60 #xAA7B burmese)
	 (#xAA80 #xAADF tai-viet)
	 (#xAC00 #xD7AF hangul)
	 (#xF900 #xFAFF han)
	 (#xFB1D #xFB4F hebrew)
	 (#xFB50 #xFDFF arabic)
	 (#xFE70 #xFEFC arabic)
	 (#xFF00 #xFF5F cjk-misc)
	 (#xFF61 #xFF9F kana)
	 (#xFFE0 #xFFE6 cjk-misc)
	 (#x10000 #x100FF linear-b)
	 (#x10100 #x1013F aegean-number)
	 (#x10140 #x1018A ancient-greek-number)
	 (#x10190 #x1019B ancient-symbol)
	 (#x101D0 #x101FF phaistos-disc)
	 (#x10280 #x1029F lycian)
	 (#x102A0 #x102DF carian)
	 ;; Was misspelled `olt-italic'.
	 (#x10300 #x1032F old-italic)
	 (#x10380 #x1039F ugaritic)
	 (#x103A0 #x103DF old-persian)
	 (#x10400 #x1044F deseret)
	 (#x10450 #x1047F shavian)
	 (#x10480 #x104AF osmanya)
	 (#x10800 #x1083F cypriot-syllabary)
	 (#x10900 #x1091F phoenician)
	 (#x10920 #x1093F lydian)
	 (#x10A00 #x10A5F kharoshthi)
	 (#x12000 #x123FF cuneiform)
	 (#x12400 #x1247F cuneiform-numbers-and-punctuation)
	 (#x1D000 #x1D0FF byzantine-musical-symbol)
	 (#x1D100 #x1D1FF musical-symbol)
	 (#x1D200 #x1D24F ancient-greek-musical-notation)
	 (#x1D300 #x1D35F tai-xuan-jing-symbol)
	 (#x1D360 #x1D37F counting-rod-numeral)
	 (#x1D400 #x1D7FF mathematical)
	 (#x1F000 #x1F02F mahjong-tile)
	 (#x1F030 #x1F09F domino-tile)
	 (#x20000 #x2AFFF han)
	 (#x2F800 #x2FFFF han)))
    (set-char-table-range char-script-table
			  (cons (car elt) (nth 1 elt)) (nth 2 elt))
    ;; SCRIPT-LIST is accumulated in reverse and without duplicates.
    (or (memq (nth 2 elt) script-list)
	(setq script-list (cons (nth 2 elt) script-list))))
  (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))

;; Every character of the `tibetan' charset gets script `tibetan',
;; whatever the range table above assigned to it.
(map-charset-chars
 #'(lambda (range ignore)
     (set-char-table-range char-script-table range 'tibetan))
 'tibetan)
1206 | ||
e7259832 | 1207 | \f |
59db3a5c KH |
1208 | ;;; Setting unicode-category-table. |
1209 | ||
1210 | ;; This macro is to build unicode-category-table at compile time so | |
1211 | ;; that C code can access the table efficiently. | |
(defmacro build-unicode-category-table ()
  "Expand to a char-table of Unicode general categories.
The table is computed while the macro is expanded.  Code points in
0..#xD7FF, #xF900..#x2FFFF and #xE0000..#xE01FF get the value of their
`general-category' char-code property; #xE000..#xF8FF,
#xF0000..#xFFFFD and #x100000..#x10FFFD get `Co'.  All other code
points are left nil."
  (let ((table (make-char-table 'unicode-category-table nil)))
    (dolist (span '((#x0000 . #xD7FF)
                    (#xF900 . #x2FFFF)
                    (#xE0000 . #xE01FF)))
      (let ((code (car span))
            (end (cdr span)))
        (while (<= code end)
          (aset table code (get-char-code-property code 'general-category))
          (setq code (1+ code)))))
    (dolist (private-use '((#xE000 . #xF8FF)
                           (#xF0000 . #xFFFFD)
                           (#x100000 . #x10FFFD)))
      (set-char-table-range table private-use 'Co))
    (optimize-char-table table 'eq)
    table))
1224 | ||
(setq unicode-category-table (build-unicode-category-table))

;; Put category `.' on every character whose general category is `Zs'
;; or does not start with `M' or `C'; characters with no recorded
;; general category are left alone.
(map-char-table
 (lambda (chars general-category)
   (when general-category
     (let ((major (aref (symbol-name general-category) 0)))
       (if (or (eq general-category 'Zs)
               (not (or (= major ?M) (= major ?C))))
           (modify-category-entry chars ?.)))))
 unicode-category-table)

(optimize-char-table (standard-category-table))
59db3a5c KH |
1235 | |
1236 | \f | |
b2cca856 KH |
1237 | ;; Display of glyphless characters. |
1238 | ||
(defvar char-acronym-table
  (make-char-table 'char-acronym-table nil)
  "Char table of acronyms for non-graphic characters.")

;; U+0000..U+001F, in code order.  The entries for U+0009 (TAB) and
;; U+000A (newline) are nil.
(let ((c0-acronyms '("NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK" "BEL"
		     "BS"   nil   nil  "VT"  "FF"  "CR"  "SO"  "SI"
		     "DLE" "DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB"
		     "CAN" "EM"  "SUB" "ESC" "FS"  "GS"  "RS"  "US")))
  (dotimes (i 32)
    (aset char-acronym-table i (car c0-acronyms))
    (setq c0-acronyms (cdr c0-acronyms))))

;; U+0080..U+009F, in code order.
(let ((c1-acronyms '("XXX" "XXX" "BPH" "NBH" "IND" "NEL" "SSA" "ESA"
		     "HTS" "HTJ" "VTS" "PLD" "PLU" "RI"  "SS2" "SS3"
		     "DCS" "PU1" "PU2" "STS" "CCH" "MW"  "SPA" "EPA"
		     "SOS" "XXX" "SCI" "CSI" "ST"  "OSC" "PM"  "APC")))
  (dotimes (i 32)
    (aset char-acronym-table (+ #x0080 i) (car c1-acronyms))
    (setq c1-acronyms (cdr c1-acronyms))))

;; Individually named characters, as (CODE . ACRONYM).
(dolist (entry
         '((#x17B4 . "KIVAQ")     ; KHMER VOWEL INHERENT AQ
           (#x17B5 . "KIVAA")     ; KHMER VOWEL INHERENT AA
           (#x200B . "ZWSP")      ; ZERO WIDTH SPACE
           (#x200C . "ZWNJ")      ; ZERO WIDTH NON-JOINER
           (#x200D . "ZWJ")       ; ZERO WIDTH JOINER
           (#x200E . "LRM")       ; LEFT-TO-RIGHT MARK
           (#x200F . "RLM")       ; RIGHT-TO-LEFT MARK
           (#x202A . "LRE")       ; LEFT-TO-RIGHT EMBEDDING
           (#x202B . "RLE")       ; RIGHT-TO-LEFT EMBEDDING
           (#x202C . "PDF")       ; POP DIRECTIONAL FORMATTING
           (#x202D . "LRO")       ; LEFT-TO-RIGHT OVERRIDE
           (#x202E . "RLO")       ; RIGHT-TO-LEFT OVERRIDE
           (#x2060 . "WJ")        ; WORD JOINER
           (#x206A . "ISS")       ; INHIBIT SYMMETRIC SWAPPING
           (#x206B . "ASS")       ; ACTIVATE SYMMETRIC SWAPPING
           (#x206C . "IAFS")      ; INHIBIT ARABIC FORM SHAPING
           (#x206D . "AAFS")      ; ACTIVATE ARABIC FORM SHAPING
           (#x206E . "NADS")      ; NATIONAL DIGIT SHAPES
           (#x206F . "NODS")      ; NOMINAL DIGIT SHAPES
           (#xFEFF . "ZWNBSP")    ; ZERO WIDTH NO-BREAK SPACE
           (#xFFF9 . "IAA")       ; INTERLINEAR ANNOTATION ANCHOR
           (#xFFFA . "IAS")       ; INTERLINEAR ANNOTATION SEPARATOR
           (#xFFFB . "IAT")       ; INTERLINEAR ANNOTATION TERMINATOR
           (#x1D173 . "BEGBM")    ; MUSICAL SYMBOL BEGIN BEAM
           (#x1D174 . "ENDBM")    ; MUSICAL SYMBOL END BEAM
           (#x1D175 . "BEGTIE")   ; MUSICAL SYMBOL BEGIN TIE
           (#x1D176 . "ENDTIE")   ; MUSICAL SYMBOL END TIE
           (#x1D177 . "BEGSLR")   ; MUSICAL SYMBOL BEGIN SLUR
           (#x1D178 . "ENDSLR")   ; MUSICAL SYMBOL END SLUR
           (#x1D179 . "BEGPHR")   ; MUSICAL SYMBOL BEGIN PHRASE
           (#x1D17A . "ENDPHR")   ; MUSICAL SYMBOL END PHRASE
           (#xE0001 . "|->TAG")   ; LANGUAGE TAG
           (#xE0020 . "SP TAG")   ; TAG SPACE
           (#xE007F . "->|TAG"))) ; CANCEL TAG
  (aset char-acronym-table (car entry) (cdr entry)))

;; U+E0021..U+E007E: the acronym shows the character with code 33..126
;; at the same offset.
(dotimes (i 94)
  (aset char-acronym-table (+ #xE0021 i) (format " %c TAG" (+ 33 i))))
1295 | ||
(defun update-glyphless-char-display (&optional variable value)
  "Make the setting of `glyphless-char-display-control' take effect.
This function updates the char-table `glyphless-char-display'.

If VARIABLE is non-nil, first set its default value to VALUE, so
that this function can be used as the `:set' function of a user
option.  VALUE is a list of elements (GROUP . METHOD); see
`glyphless-char-display-control' for their meaning.  Signal an
error if a GROUP or METHOD is not one of the known symbols."
  ;; Test VARIABLE rather than VALUE: an empty list is a legitimate
  ;; setting and must still be stored, and a call without VARIABLE
  ;; must not try to set the symbol nil.
  (when variable
    (set-default variable value))
  (dolist (elt value)
    (let ((target (car elt))
	  (method (cdr elt)))
      (or (memq method '(zero-width thin-space empty-box acronym hex-code))
	  (error "Invalid glyphless character display method: %s" method))
      (cond ((eq target 'c0-control)
	     (set-char-table-range glyphless-char-display '(#x00 . #x1F)
				   method)
	     ;; Users will not expect their newlines and TABs be
	     ;; displayed as anything but themselves, so exempt those
	     ;; two characters from c0-control.
	     (set-char-table-range glyphless-char-display #x9 nil)
	     (set-char-table-range glyphless-char-display #xa nil))
	    ((eq target 'c1-control)
	     (set-char-table-range glyphless-char-display '(#x80 . #x9F)
				   method))
	    ((eq target 'format-control)
	     ;; Visit every entry of `unicode-category-table' and act
	     ;; on those whose category is `Cf'.  An entry's key is
	     ;; either one character or a (FROM . TO) range.
	     (map-char-table
	      #'(lambda (char category)
		  (if (eq category 'Cf)
		      (let ((this-method method)
			    from to)
			(if (consp char)
			    (setq from (car char) to (cdr char))
			  (setq from char to char))
			(while (<= from to)
			  ;; U+00AD is skipped.
			  (when (/= from #xAD)
			    ;; For `acronym', store the character's own
			    ;; entry from `char-acronym-table' instead
			    ;; of the symbol.
			    (if (eq method 'acronym)
				(setq this-method
				      (aref char-acronym-table from)))
			    (set-char-table-range glyphless-char-display
						  from this-method))
			  (setq from (1+ from))))))
	      unicode-category-table))
	    ((eq target 'no-font)
	     ;; The `no-font' method lives in extra slot 0 of the table.
	     (set-char-table-extra-slot glyphless-char-display 0 method))
	    (t
	     (error "Invalid glyphless character group: %s" target))))))
b2cca856 | 1339 | |
0e7c0582 EZ |
1340 | ;;; Control of displaying glyphless characters. |
(defcustom glyphless-char-display-control
  '((format-control . thin-space)
    (no-font . hex-code))
  "List of directives to control display of glyphless characters.

Each element has the form (GROUP . METHOD), where GROUP is a
symbol specifying the character group, and METHOD is a symbol
specifying the method of displaying characters belonging to that
group.

GROUP must be one of these symbols:
`c0-control': U+0000..U+001F, but excluding newline and TAB.
`c1-control': U+0080..U+009F.
`format-control': Characters of Unicode General Category `Cf',
such as U+200C (ZWNJ), U+200E (LRM), but
excluding characters that have graphic images,
such as U+00AD (SHY).
`no-font': characters for which no suitable font is found.
For character terminals, characters that cannot
be encoded by `terminal-coding-system'.

METHOD must be one of these symbols:
`zero-width': don't display.
`thin-space': display a thin (1-pixel width) space. On character
terminals, display as 1-character space.
`empty-box': display an empty box.
`acronym': display an acronym of the character in a box. The
acronym is taken from `char-acronym-table', which see.
`hex-code': display the hexadecimal character code in a box."

  :type '(alist :key-type (symbol :tag "Character Group")
		:value-type (symbol :tag "Display Method"))
  ;; The same five display methods are offered for each of the four
  ;; character groups.
  :options '((c0-control
	      (choice (const :tag "Don't display" zero-width)
		      (const :tag "Display as thin space" thin-space)
		      (const :tag "Display as empty box" empty-box)
		      (const :tag "Display acronym" acronym)
		      (const :tag "Display hex code in a box" hex-code)))
	     (c1-control
	      (choice (const :tag "Don't display" zero-width)
		      (const :tag "Display as thin space" thin-space)
		      (const :tag "Display as empty box" empty-box)
		      (const :tag "Display acronym" acronym)
		      (const :tag "Display hex code in a box" hex-code)))
	     (format-control
	      (choice (const :tag "Don't display" zero-width)
		      (const :tag "Display as thin space" thin-space)
		      (const :tag "Display as empty box" empty-box)
		      (const :tag "Display acronym" acronym)
		      (const :tag "Display hex code in a box" hex-code)))
	     (no-font
	      (choice (const :tag "Don't display" zero-width)
		      (const :tag "Display as thin space" thin-space)
		      (const :tag "Display as empty box" empty-box)
		      (const :tag "Display acronym" acronym)
		      (const :tag "Display hex code in a box" hex-code))))
  ;; Setting the option goes through `update-glyphless-char-display',
  ;; which is called with the option's symbol and the new value.
  :set 'update-glyphless-char-display
  :group 'display)
1399 | ||
b2cca856 | 1400 | \f |
e7259832 KH |
1401 | ;;; Setting word boundary. |
1402 | ||
;; Category pairs for word boundaries; each element is a pair of
;; categories, where nil stands in place of a category.
(setq word-combining-categories
      '((nil . ?^) (?^ . nil) (?C . ?H) (?C . ?K))
      ;; (2-byte character sets)
      word-separating-categories
      '((?H . ?K)))			; Hiragana - Katakana
1412 | ||
1cbfaab9 | 1413 | ;; Local Variables: |
985773c9 | 1414 | ;; coding: utf-8 |
1cbfaab9 | 1415 | ;; End: |
777cfce6 | 1416 | |
60370d40 | 1417 | ;;; characters.el ends here |