lisp/international/characters.el

   1 ;;; characters.el --- set syntax and category for multibyte characters
   2
   3 ;; Copyright (C) 1997, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   4 ;;   Free Software Foundation, Inc.
   5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   6 ;;   2005, 2006, 2007, 2008, 2009, 2010
   7 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
   8 ;;   Registration Number H14PRO021
   9 ;; Copyright (C) 2003
  10 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
  11 ;;   Registration Number H13PRO009
  12
  13 ;; Keywords: multibyte character, character set, syntax, category
  14
  15 ;; This file is part of GNU Emacs.
  16
  17 ;; GNU Emacs is free software: you can redistribute it and/or modify
  18 ;; it under the terms of the GNU General Public License as published by
  19 ;; the Free Software Foundation, either version 3 of the License, or
  20 ;; (at your option) any later version.
  21
  22 ;; GNU Emacs is distributed in the hope that it will be useful,
  23 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  25 ;; GNU General Public License for more details.
  26
  27 ;; You should have received a copy of the GNU General Public License
  28 ;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
  29
  30 ;;; Commentary:
  31
  32 ;;; Code:
  33
  34 ;;; Predefined categories.
  35
  36 ;; For each character set.
  37
  38 (define-category ?a "ASCII
  39 ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
  40 (define-category ?l "Latin")
  41 (define-category ?t "Thai")
  42 (define-category ?g "Greek")
  43 (define-category ?b "Arabic")
  44 (define-category ?w "Hebrew")
  45 (define-category ?y "Cyrillic")
  46 (define-category ?k "Katakana
  47 Japanese katakana")
  48 (define-category ?r "Roman
  49 Japanese roman")
  50 (define-category ?c "Chinese")
  51 (define-category ?j "Japanese")
  52 (define-category ?h "Korean")
  53 (define-category ?e "Ethiopic
  54 Ethiopic (Ge'ez)")
  55 (define-category ?v "Viet
  56 Vietnamese")
  57 (define-category ?i "Indian")
  58 (define-category ?o "Lao")
  59 (define-category ?q "Tibetan")
  60
  61 ;; For each group (row) of 2-byte character sets.
  62
  63 (define-category ?A "2-byte alnum
  64 Alpha-numeric characters of 2-byte character sets")
  65 (define-category ?C "2-byte han
  66 Chinese (Han) characters of 2-byte character sets")
  67 (define-category ?G "2-byte Greek
  68 Greek characters of 2-byte character sets")
  69 (define-category ?H "2-byte Hiragana
  70 Japanese Hiragana characters of 2-byte character sets")
  71 (define-category ?K "2-byte Katakana
  72 Japanese Katakana characters of 2-byte character sets")
  73 (define-category ?N "2-byte Korean
  74 Korean Hangul characters of 2-byte character sets")
  75 (define-category ?Y "2-byte Cyrillic
  76 Cyrillic characters of 2-byte character sets")
  77 (define-category ?I "Indian Glyphs")
  78
  79 ;; For phonetic classifications.
  80
  81 (define-category ?0 "consonant")
  82 (define-category ?1 "base vowel
  83 Base (independent) vowel")
  84 (define-category ?2 "upper diacritic
  85 Upper diacritical mark (including upper vowel)")
  86 (define-category ?3 "lower diacritic
  87 Lower diacritical mark (including lower vowel)")
  88 (define-category ?4 "combining tone
  89 Combining tone mark")
  90 (define-category ?5 "symbol")
  91 (define-category ?6 "digit")
  92 (define-category ?7 "vowel diacritic
  93 Vowel-modifying diacritical mark")
  94 (define-category ?8 "vowel-signs")
  95 (define-category ?9 "semivowel lower")
  96
  97 ;; For filling.
  98 (define-category ?| "line breakable
  99 While filling, we can break a line at this character.")
 100
 101 ;; For indentation calculation.
 102 (define-category ?\s
 103   "space for indent
 104 This character counts as a space for indentation purposes.")
 105
 106 ;; Keep the following for `kinsoku' processing.  See comments in
 107 ;; kinsoku.el.
 108 (define-category ?> "Not at bol
 109 A character which can't be placed at beginning of line.")
 110 (define-category ?< "Not at eol
 111 A character which can't be placed at end of line.")
 112
 113 ;; Base and Combining
 114 (define-category ?. "Base
 115 Base characters (Unicode General Category L,N,P,S,Zs)")
 116 (define-category ?^ "Combining
 117 Combining diacritic or mark (Unicode General Category M)")
 118 \f
 119 ;;; Setting syntax and category.
 120
 121 ;; ASCII
 122
 123 ;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
 124 (modify-category-entry '(32 . 127) ?a)
 125 (modify-category-entry '(32 . 127) ?l)
 126
 127 ;; Deal with the CJK charsets first.  Since the syntax of blocks is
 128 ;; defined per charset, and the charsets may contain e.g. Latin
 129 ;; characters, we end up with the wrong syntax definitions if we're
 130 ;; not careful.
 131
 132 ;; Chinese characters (Unicode)
 133 (modify-category-entry '(#x2E80 . #x312F) ?|)
 134 (modify-category-entry '(#x3190 . #x33FF) ?|)
 135 (modify-category-entry '(#x3400 . #x4DBF) ?C)
 136 (modify-category-entry '(#x4E00 . #x9FAF) ?C)
 137 (modify-category-entry '(#x3400 . #x9FAF) ?c)
 138 (modify-category-entry '(#x3400 . #x9FAF) ?|)
 139 (modify-category-entry '(#xF900 . #xFAFF) ?C)
 140 (modify-category-entry '(#xF900 . #xFAFF) ?c)
 141 (modify-category-entry '(#xF900 . #xFAFF) ?|)
 142 (modify-category-entry '(#x20000 . #x2FFFF) ?|)
 143 (modify-category-entry '(#x20000 . #x2FFFF) ?C)
 144 (modify-category-entry '(#x20000 . #x2FFFF) ?c)
 145
 146
 147 ;; Chinese character set (GB2312)
 148
 149 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E)
 150 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E)
 151 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E)
 152
 153 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
 154 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339)
 155 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A)
 156 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A)
 157 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?H #x2421 #x247E)
 158 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?K #x2521 #x257E)
 159 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?G #x2621 #x267E)
 160 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?Y #x2721 #x277E)
 161 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?C #x3021 #x7E7E)
 162
 163 ;; Chinese character set (BIG5)
 164
 165 (map-charset-chars #'modify-category-entry 'big5 ?c)
 166 (map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA261)
 167 (map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E)
 168 (map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DC)
 169
 170 ;; Chinese character set (CNS11643)
 171
 172 (dolist (c '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
 173              chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
 174              chinese-cns11643-7))
 175   (map-charset-chars #'modify-category-entry c ?c)
 176   (if (eq c 'chinese-cns11643-1)
 177       (map-charset-chars #'modify-category-entry c ?C #x4421 #x7E7E)
 178     (map-charset-chars #'modify-category-entry c ?C)))
 179
 180 ;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
 181
 182 (map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
 183
 184 (map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)
 185
 186 (dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
 187                                japanese-jisx0213-1 japanese-jisx0213-2
 188                                cp932-2-byte))
 189   (map-charset-chars #'modify-category-entry l ?j))
 190
 191 ;; Unicode equivalents of JISX0201-kana
 192 (let ((range '(#xff61 . #xff9f)))
 193   (modify-category-entry range  ?k)
 194   (modify-category-entry range ?j)
 195   (modify-category-entry range ?\|))
 196
 197 ;; Katakana block
 198 (modify-category-entry '(#x3099 . #x309C) ?K)
 199 (modify-category-entry '(#x30A0 . #x30FF) ?K)
 200 (modify-category-entry '(#x31F0 . #x31FF) ?K)
 201 (modify-category-entry '(#x30A0 . #x30FA) ?\|)
 202 (modify-category-entry #x30FF ?\|)
 203
 204 ;; Hiragana block
 205 (modify-category-entry '(#x3040 . #x309F) ?H)
 206 (modify-category-entry '(#x3040 . #x3096) ?\|)
 207 (modify-category-entry #x309F ?\|)
 208 (modify-category-entry #x30A0 ?H)
 209 (modify-category-entry #x30FC ?H)
 210
 211
 212 ;; JISX0208
 213 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
 214 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
 215 (let ((chars '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇)))
 216   (dolist (elt chars)
 217     (modify-syntax-entry (car chars) "w")))
 218
 219 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E)
 220 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E)
 221 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?K #x2521 #x257E)
 222 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E)
 223 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E)
 224 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E)
 225 (modify-category-entry ?ー ?K)
 226 (let ((chars '(?゛ ?゜)))
 227   (while chars
 228     (modify-category-entry (car chars) ?K)
 229     (modify-category-entry (car chars) ?H)
 230     (setq chars (cdr chars))))
 231 (let ((chars '(?仝 ?々 ?〆 ?〇)))
 232   (while chars
 233     (modify-category-entry (car chars) ?C)
 234     (setq chars (cdr chars))))
 235
 236 ;; JISX0212
 237
 238 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
 239
 240 ;; JISX0201-Kana
 241
 242 (let ((chars '(?｡ ?､ ?･)))
 243   (while chars
 244     (modify-syntax-entry (car chars) ".")
 245     (setq chars (cdr chars))))
 246
 247 (modify-syntax-entry ?\｢ "(｣")
 248 (modify-syntax-entry ?\｣ "(｢")
 249
 250 ;; Korean character set (KSC5601)
 251
 252 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)
 253
 254 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E)
 255 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E)
 256 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E)
 257 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E)
 258 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339)
 259 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A)
 260 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A)
 261 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E)
 262 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E)
 263 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E)
 264 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E)
 265
 266 ;; These are in more than one charset.
 267 (let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
 268                       "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
 269                       "（）［］｛｝"))
 270       open close)
 271   (dotimes (i (/ (length parens) 2))
 272     (setq open (aref parens (* i 2))
 273           close (aref parens (1+ (* i 2))))
 274     (modify-syntax-entry open (format "(%c" close))
 275     (modify-syntax-entry close (format ")%c" open))))
 276
 277 ;; Arabic character set
 278
 279 (let ((charsets '(arabic-iso8859-6
 280                   arabic-digit
 281                   arabic-1-column
 282                   arabic-2-column)))
 283   (while charsets
 284     (map-charset-chars #'modify-category-entry (car charsets) ?b)
 285     (setq charsets (cdr charsets))))
 286 (modify-category-entry '(#x600 . #x6ff) ?b)
 287 (modify-category-entry '(#xfb50 . #xfdff) ?b)
 288 (modify-category-entry '(#xfe70 . #xfefe) ?b)
 289
 290 ;; Cyrillic character set (ISO-8859-5)
 291
 292 (modify-syntax-entry ?№ ".")
 293
 294 ;; Ethiopic character set
 295
 296 (modify-category-entry '(#x1200 . #x1399) ?e)
 297 (modify-category-entry '(#x2d80 . #x2dde) ?e)
 298 (let ((chars '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨)))
 299   (while chars
 300     (modify-syntax-entry (car chars) ".")
 301     (setq chars (cdr chars))))
 302 (map-charset-chars #'modify-category-entry 'ethiopic ?e)
 303
 304 ;; Hebrew character set (ISO-8859-8)
 305
 306 (modify-syntax-entry #x5be ".") ; MAQAF
 307 (modify-syntax-entry #x5c0 ".") ; PASEQ
 308 (modify-syntax-entry #x5c3 ".") ; SOF PASUQ
 309 (modify-syntax-entry #x5f3 ".") ; GERESH
 310 (modify-syntax-entry #x5f4 ".") ; GERSHAYIM
 311
 312 ;; Indian character set (IS 13194 and other Emacs original Indian charsets)
 313
 314 (modify-category-entry '(#x901 . #x970) ?i)
 315 (map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
 316 (map-charset-chars #'modify-category-entry 'indian-2-column ?i)
 317
 318 ;; Lao character set
 319
 320 (modify-category-entry '(#xe80 . #xeff) ?o)
 321 (map-charset-chars #'modify-category-entry 'lao ?o)
 322
 323 (let ((deflist  '(("ກ-ຮ"    "w"     ?0) ; consonant
 324                   ("ະາຳຽເ-ໄ"        "w"     ?1) ; vowel base
 325                   ("ັິ-ືົໍ"   "w"     ?2) ; vowel upper
 326                   ("ຸູ"     "w"     ?3) ; vowel lower
 327                   ("່-໋"    "w"     ?4) ; tone mark
 328                   ("ຼຽ"     "w"     ?9) ; semivowel lower
 329                   ("໐-໙"    "w"     ?6) ; digit
 330                   ("ຯໆ"     "_"     ?5) ; symbol
 331                   ))
 332       elm chars len syntax category to ch i)
 333   (while deflist
 334     (setq elm (car deflist))
 335     (setq chars (car elm)
 336           len (length chars)
 337           syntax (nth 1 elm)
 338           category (nth 2 elm)
 339           i 0)
 340     (while (< i len)
 341       (if (= (aref chars i) ?-)
 342           (setq i (1+ i)
 343                 to (aref chars i))
 344         (setq ch (aref chars i)
 345               to ch))
 346       (while (<= ch to)
 347         (unless (string-equal syntax "w")
 348           (modify-syntax-entry ch syntax))
 349         (modify-category-entry ch category)
 350         (setq ch (1+ ch)))
 351       (setq i (1+ i)))
 352     (setq deflist (cdr deflist))))
 353
 354 ;; Thai character set (TIS620)
 355
 356 (modify-category-entry '(#xe00 . #xe7f) ?t)
 357 (map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
 358
 359 (let ((deflist  '(;; chars      syntax  category
 360                   ("ก-รลว-ฮ"  "w"     ?0) ; consonant
 361                   ("ฤฦะาำเ-ๅ"     "w"     ?1) ; vowel base
 362                   ("ัิ-ื็๎"   "w"     ?2) ; vowel upper
 363                   ("ุ-ฺ"    "w"     ?3) ; vowel lower
 364                   ("่-ํ"    "w"     ?4) ; tone mark
 365                   ("๐-๙"    "w"     ?6) ; digit
 366                   ("ฯๆ฿๏๚๛" "_"     ?5) ; symbol
 367                   ))
 368       elm chars len syntax category to ch i)
 369   (while deflist
 370     (setq elm (car deflist))
 371     (setq chars (car elm)
 372           len (length chars)
 373           syntax (nth 1 elm)
 374           category (nth 2 elm)
 375           i 0)
 376     (while (< i len)
 377       (if (= (aref chars i) ?-)
 378           (setq i (1+ i)
 379                 to (aref chars i))
 380         (setq ch (aref chars i)
 381               to ch))
 382       (while (<= ch to)
 383         (unless (string-equal syntax "w")
 384           (modify-syntax-entry ch syntax))
 385         (modify-category-entry ch category)
 386         (setq ch (1+ ch)))
 387       (setq i (1+ i)))
 388     (setq deflist (cdr deflist))))
 389
 390 ;; Tibetan character set
 391
 392 (modify-category-entry '(#xf00 . #xfff) ?q)
 393 (map-charset-chars #'modify-category-entry 'tibetan ?q)
 394 (map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
 395
 396 (let ((deflist  '(;; chars             syntax category
 397                   ("ཀ-ཀྵཪ"         "w"     ?0) ; consonant
 398                   ("ྐ-ྐྵྺྻྼ"       "w"     ?0) ;
 399                   ("ིེཻོཽྀ"       "w"       ?2) ; upper vowel
 400                   ("ཾྂྃ྆྇ྈྉྊྋ" "w"    ?2) ; upper modifier
 401                   ("྄ཱུ༙༵༷"       "w"       ?3) ; lowel vowel/modifier
 402                   ("཰"                "w" ?3)             ; invisible vowel a
 403                   ("༠-༩༪-༳"             "w"     ?6) ; digit
 404                   ("་།-༒༔ཿ"        "."     ?|) ; line-break char
 405                   ("་།༏༐༑༔ཿ"            "."     ?|) ;
 406                   ("༈་།-༒༔ཿ༽༴"  "."     ?>) ; prohibition
 407                   ("་།༏༐༑༔ཿ"            "."     ?>) ;
 408                   ("ༀ-༊༼࿁࿂྅"      "."     ?<) ; prohibition
 409                   ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
 410                   ))
 411       elm chars len syntax category to ch i)
 412   (while deflist
 413     (setq elm (car deflist))
 414     (setq chars (car elm)
 415           len (length chars)
 416           syntax (nth 1 elm)
 417           category (nth 2 elm)
 418           i 0)
 419     (while (< i len)
 420       (if (= (aref chars i) ?-)
 421           (setq i (1+ i)
 422                 to (aref chars i))
 423         (setq ch (aref chars i)
 424               to ch))
 425       (while (<= ch to)
 426         (unless (string-equal syntax "w")
 427           (modify-syntax-entry ch syntax))
 428         (modify-category-entry ch category)
 429         (setq ch (1+ ch)))
 430       (setq i (1+ i)))
 431     (setq deflist (cdr deflist))))
 432
 433 ;; Vietnamese character set
 434
 435 ;; To make a word with Latin characters
 436 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
 437 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
 438
 439 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
 440 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
 441
 442 (let ((tbl (standard-case-table))
 443       (i 32))
 444   (while (< i 128)
 445     (let* ((char (decode-char 'vietnamese-viscii-upper i))
 446            (charl (decode-char 'vietnamese-viscii-lower i))
 447            (uc (encode-char char 'ucs))
 448            (lc (encode-char charl 'ucs)))
 449       (set-case-syntax-pair char (decode-char 'vietnamese-viscii-lower i)
 450                             tbl)
 451       (if uc (modify-category-entry uc ?v))
 452       (if lc (modify-category-entry lc ?v)))
 453     (setq i (1+ i))))
 454
 455 ;; Tai Viet
 456 (let ((deflist '(;; chars       syntax  category
 457                  ((?ꪀ.  ?ꪯ) "w"     ?0) ; cosonant
 458                  ("ꪱꪵꪶ"           "w"     ?1) ; vowel base
 459                  ((?ꪹ . ?ꪽ) "w"     ?1) ; vowel base
 460                  ("ꪰꪲꪳꪷꪸꪾ"  "w"     ?2) ; vowel upper
 461                  ("ꪴ"         "w"     ?3) ; vowel lower
 462                  ("ꫀꫂ"              "w"     ?1) ; non-combining tone-mark
 463                  ("꪿꫁"              "w"     ?4) ; combining tone-mark
 464                  ((?ꫛ . ?꫟) "_"     ?5) ; symbol
 465                  )))
 466   (dolist (elm deflist)
 467     (let ((chars (car elm))
 468           (syntax (nth 1 elm))
 469           (category (nth 2 elm)))
 470       (if (consp chars)
 471           (progn
 472             (modify-syntax-entry chars syntax)
 473             (modify-category-entry chars category))
 474         (mapc #'(lambda (x)
 475                   (modify-syntax-entry x syntax)
 476                   (modify-category-entry x category))
 477               chars)))))
 478
 479 ;; Latin
 480
 481 (modify-category-entry '(#x80 . #x024F) ?l)
 482
 483 (let ((tbl (standard-case-table)) c)
 484
 485   ;; Latin-1
 486
 487   ;; Fixme: Some of the non-word syntaxes here perhaps should be
 488   ;; reviewed.  (Note that the following all implicitly have word
 489   ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.)  There should be a well-defined way of
 490   ;; relating Unicode categories to Emacs syntax codes.
 491
 492   ;; NBSP isn't semantically interchangeable with other whitespace chars,
 493   ;; so it's more like punctation.
 494   (set-case-syntax ?  "." tbl)
 495   (set-case-syntax ?¡ "." tbl)
 496   (set-case-syntax ?¦ "_" tbl)
 497   (set-case-syntax ?§ "." tbl)
 498   (set-case-syntax ?© "_" tbl)
 499   (set-case-syntax-delims 171 187 tbl)  ; « »
 500   (set-case-syntax ?¬ "_" tbl)
 501   (set-case-syntax ? "_" tbl)
 502   (set-case-syntax ?® "_" tbl)
 503   (set-case-syntax ?° "_" tbl)
 504   (set-case-syntax ?± "_" tbl)
 505   (set-case-syntax ?µ "_" tbl)
 506   (set-case-syntax ?· "_" tbl)
 507   (set-case-syntax ?¼ "_" tbl)
 508   (set-case-syntax ?½ "_" tbl)
 509   (set-case-syntax ?¾ "_" tbl)
 510   (set-case-syntax ?¿ "." tbl)
 511   (let ((c 192))
 512     (while (<= c 222)
 513       (set-case-syntax-pair c (+ c 32) tbl)
 514       (setq c (1+ c))))
 515   (set-case-syntax ?× "_" tbl)
 516   (set-case-syntax ?ß "w" tbl)
 517   (set-case-syntax ?÷ "_" tbl)
 518   ;; See below for ÿ.
 519
 520   ;; Latin Extended-A, Latin Extended-B
 521   (setq c #x0100)
 522   (while (<= c #x02B8)
 523     (modify-category-entry c ?l)
 524     (setq c (1+ c)))
 525
 526   (let ((pair-ranges '((#x0100 . #x012F)
 527                        (#x0132 . #x0137)
 528                        (#x0139 . #x0148)
 529                        (#x014a . #x0177)
 530                        (#x0179 . #x017E)
 531                        (#x0182 . #x0185)
 532                        (#x0187 . #x0188)
 533                        (#x018B . #x018C)
 534                        (#x0191 . #x0192)
 535                        (#x0198 . #x0199)
 536                        (#x01A0 . #x01A5)
 537                        (#x01A7 . #x01A8)
 538                        (#x01AC . #x01AD)
 539                        (#x01AF . #x01B0)
 540                        (#x01B3 . #x01B6)
 541                        (#x01BC . #x01BD)
 542                        (#x01CD . #x01DC)
 543                        (#x01DE . #x01EF)
 544                        (#x01F4 . #x01F5)
 545                        (#x01F8 . #x021F)
 546                        (#x0222 . #x0233)
 547                        (#x023B . #x023C)
 548                        (#x0241 . #x0242)
 549                        (#x0246 . #x024F))))
 550     (dolist (elt pair-ranges)
 551       (let ((from (car elt)) (to (cdr elt)))
 552         (while (< from to)
 553           (set-case-syntax-pair from (1+ from) tbl)
 554           (setq from (+ from 2))))))
 555
 556   (set-case-syntax-pair #x189 #x256 tbl)
 557   (set-case-syntax-pair #x18A #x257 tbl)
 558
 559   ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
 560   ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
 561   ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
 562   ;; SMALL LETTER I.
 563
 564   ;; We used to set up half of those correspondence unconditionally,
 565   ;; but that makes searches slow.  So now we don't set up either half
 566   ;; of these correspondences by default.
 567
 568   ;; (set-downcase-syntax  ?İ ?i tbl)
 569   ;; (set-upcase-syntax    ?I ?ı tbl)
 570
 571   (set-case-syntax-pair ?Ǆ ?ǆ tbl)
 572   (set-case-syntax-pair ?ǅ ?ǆ tbl)
 573   (set-case-syntax-pair ?Ǉ ?ǉ tbl)
 574   (set-case-syntax-pair ?ǈ ?ǉ tbl)
 575   (set-case-syntax-pair ?Ǌ ?ǌ tbl)
 576   (set-case-syntax-pair ?ǋ ?ǌ tbl)
 577
 578   ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
 579   (set-case-syntax-pair ?Ǳ ?ǳ tbl)
 580   (set-case-syntax-pair ?ǲ ?ǳ tbl)
 581   (set-case-syntax-pair ?Ƕ ?ƕ tbl)
 582   (set-case-syntax-pair ?Ƿ ?ƿ tbl)
 583
 584   ;; Latin Extended Additional
 585   (modify-category-entry '(#x1e00 . #x1ef9) ?l)
 586   (setq c #x1e00)
 587   (while (<= c #x1ef9)
 588     (and (zerop (% c 2))
 589          (or (<= c #x1e94) (>= c #x1ea0))
 590          (set-case-syntax-pair c (1+ c) tbl))
 591     (setq c (1+ c)))
 592
 593   ;; Greek
 594   (modify-category-entry '(#x0370 . #x03ff) ?g)
 595   (setq c #x0370)
 596   (while (<= c #x03ff)
 597     (if (or (and (>= c #x0391) (<= c #x03a1))
 598             (and (>= c #x03a3) (<= c #x03ab)))
 599         (set-case-syntax-pair c (+ c 32) tbl))
 600     (and (>= c #x03da)
 601          (<= c #x03ee)
 602          (zerop (% c 2))
 603          (set-case-syntax-pair c (1+ c) tbl))
 604     (setq c (1+ c)))
 605   (set-case-syntax-pair ?Ά ?ά tbl)
 606   (set-case-syntax-pair ?Έ ?έ tbl)
 607   (set-case-syntax-pair ?Ή ?ή tbl)
 608   (set-case-syntax-pair ?Ί ?ί tbl)
 609   (set-case-syntax-pair ?Ό ?ό tbl)
 610   (set-case-syntax-pair ?Ύ ?ύ tbl)
 611   (set-case-syntax-pair ?Ώ ?ώ tbl)
 612
 613   ;; Armenian
 614   (setq c #x531)
 615   (while (<= c #x556)
 616     (set-case-syntax-pair c (+ c #x30) tbl)
 617     (setq c (1+ c)))
 618
 619   ;; Greek Extended
 620   (modify-category-entry '(#x1f00 . #x1fff) ?g)
 621   (setq c #x1f00)
 622   (while (<= c #x1fff)
 623     (and (<= (logand c #x000f) 7)
 624          (<= c #x1fa7)
 625          (not (memq c '(#x1f16 #x1f17 #x1f56 #x1f57
 626                                #x1f50 #x1f52 #x1f54 #x1f56)))
 627          (/= (logand c #x00f0) #x70)
 628          (set-case-syntax-pair (+ c 8) c tbl))
 629     (setq c (1+ c)))
 630   (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
 631   (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
 632   (set-case-syntax-pair ?Ὰ ?ὰ tbl)
 633   (set-case-syntax-pair ?Ά ?ά tbl)
 634   (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
 635   (set-case-syntax-pair ?Ὲ ?ὲ tbl)
 636   (set-case-syntax-pair ?Έ ?έ tbl)
 637   (set-case-syntax-pair ?Ὴ ?ὴ tbl)
 638   (set-case-syntax-pair ?Ή ?ή tbl)
 639   (set-case-syntax-pair ?ῌ ?ῃ tbl)
 640   (set-case-syntax-pair ?Ῐ ?ῐ tbl)
 641   (set-case-syntax-pair ?Ῑ ?ῑ tbl)
 642   (set-case-syntax-pair ?Ὶ ?ὶ tbl)
 643   (set-case-syntax-pair ?Ί ?ί tbl)
 644   (set-case-syntax-pair ?Ῠ ?ῠ tbl)
 645   (set-case-syntax-pair ?Ῡ ?ῡ tbl)
 646   (set-case-syntax-pair ?Ὺ ?ὺ tbl)
 647   (set-case-syntax-pair ?Ύ ?ύ tbl)
 648   (set-case-syntax-pair ?Ῥ ?ῥ tbl)
 649   (set-case-syntax-pair ?Ὸ ?ὸ tbl)
 650   (set-case-syntax-pair ?Ό ?ό tbl)
 651   (set-case-syntax-pair ?Ὼ ?ὼ tbl)
 652   (set-case-syntax-pair ?Ώ ?ώ tbl)
 653   (set-case-syntax-pair ?ῼ ?ῳ tbl)
 654
 655   ;; cyrillic
 656   (modify-category-entry '(#x0400 . #x04FF) ?y)
 657   (setq c #x0400)
 658   (while (<= c #x04ff)
 659     (and (>= c #x0400)
 660          (<= c #x040f)
 661          (set-case-syntax-pair c (+ c 80) tbl))
 662     (and (>= c #x0410)
 663          (<= c #x042f)
 664          (set-case-syntax-pair c (+ c 32) tbl))
 665     (and (zerop (% c 2))
 666          (or (and (>= c #x0460) (<= c #x0480))
 667              (and (>= c #x048c) (<= c #x04be))
 668              (and (>= c #x04d0) (<= c #x04f4)))
 669          (set-case-syntax-pair c (1+ c) tbl))
 670     (setq c (1+ c)))
 671   (set-case-syntax-pair ?Ӂ ?ӂ tbl)
 672   (set-case-syntax-pair ?Ӄ ?ӄ tbl)
 673   (set-case-syntax-pair ?Ӈ ?ӈ tbl)
 674   (set-case-syntax-pair ?Ӌ ?ӌ tbl)
 675   (set-case-syntax-pair ?Ӹ ?ӹ tbl)
 676
 677   ;; general punctuation
 678   (setq c #x2000)
 679   (while (<= c #x200b)
 680     (set-case-syntax c " " tbl)
 681     (setq c (1+ c)))
 682   (while (<= c #x200F)
 683     (set-case-syntax c "." tbl)
 684     (setq c (1+ c)))
 685   ;; Fixme: These aren't all right:
 686   (setq c #x2010)
 687   (while (<= c #x2016)
 688     (set-case-syntax c "_" tbl)
 689     (setq c (1+ c)))
 690   ;; Punctuation syntax for quotation marks (like `)
 691   (while (<= c #x201f)
 692     (set-case-syntax  c "." tbl)
 693     (setq c (1+ c)))
 694   ;; Fixme: These aren't all right:
 695   (while (<= c #x2027)
 696     (set-case-syntax c "_" tbl)
 697     (setq c (1+ c)))
 698   (while (<= c #x206F)
 699     (set-case-syntax c "." tbl)
 700     (setq c (1+ c)))
 701
 702   ;; Roman numerals
 703   (setq c #x2160)
 704   (while (<= c #x216f)
 705     (set-case-syntax-pair c (+ c #x10) tbl)
 706     (setq c (1+ c)))
 707
 708   ;; Fixme: The following blocks might be better as symbol rather than
 709   ;; punctuation.
 710   ;; Arrows
 711   (setq c #x2190)
 712   (while (<= c #x21FF)
 713     (set-case-syntax c "." tbl)
 714     (setq c (1+ c)))
 715   ;; Mathematical Operators
 716   (while (<= c #x22FF)
 717     (set-case-syntax c "." tbl)
 718     (setq c (1+ c)))
 719   ;; Miscellaneous Technical
 720   (while (<= c #x23FF)
 721     (set-case-syntax c "." tbl)
 722     (setq c (1+ c)))
 723   ;; Control Pictures
 724   (while (<= c #x243F)
 725     (set-case-syntax c "_" tbl)
 726     (setq c (1+ c)))
 727
 728   ;; Circled Latin
 729   (setq c #x24b6)
 730   (while (<= c #x24cf)
 731     (set-case-syntax-pair c (+ c 26) tbl)
 732     (modify-category-entry c ?l)
 733     (modify-category-entry (+ c 26) ?l)
 734     (setq c (1+ c)))
 735
 736   ;; Fullwidth Latin
 737   (setq c #xff21)
 738   (while (<= c #xff3a)
 739     (set-case-syntax-pair c (+ c #x20) tbl)
 740     (modify-category-entry c ?l)
 741     (modify-category-entry (+ c #x20) ?l)
 742     (setq c (1+ c)))
 743
 744   ;; Combining diacritics
 745   (modify-category-entry '(#x300 . #x362) ?^)
 746   ;; Combining marks
 747   (modify-category-entry '(#x20d0 . #x20e3) ?^)
 748
 749   ;; Fixme: syntax for symbols &c
 750   )
 751
 752 (let ((pairs
 753        '("⁅⁆"                               ; U+2045 U+2046
 754          "⁽⁾"                               ; U+207D U+207E
 755          "₍₎"                               ; U+208D U+208E
 756          "〈〉"                               ; U+2329 U+232A
 757          "⎴⎵"                               ; U+23B4 U+23B5
 758          "❨❩"                               ; U+2768 U+2769
 759          "❪❫"                               ; U+276A U+276B
 760          "❬❭"                               ; U+276C U+276D
 761          "❰❱"                               ; U+2770 U+2771
 762          "❲❳"                               ; U+2772 U+2773
 763          "❴❵"                               ; U+2774 U+2775
 764          "⟦⟧"                               ; U+27E6 U+27E7
 765          "⟨⟩"                               ; U+27E8 U+27E9
 766          "⟪⟫"                               ; U+27EA U+27EB
 767          "⦃⦄"                               ; U+2983 U+2984
 768          "⦅⦆"                               ; U+2985 U+2986
 769          "⦇⦈"                               ; U+2987 U+2988
 770          "⦉⦊"                               ; U+2989 U+298A
 771          "⦋⦌"                               ; U+298B U+298C
 772          "⦍⦎"                               ; U+298D U+298E
 773          "⦏⦐"                               ; U+298F U+2990
 774          "⦑⦒"                               ; U+2991 U+2992
 775          "⦓⦔"                               ; U+2993 U+2994
 776          "⦕⦖"                               ; U+2995 U+2996
 777          "⦗⦘"                               ; U+2997 U+2998
 778          "⧼⧽"                               ; U+29FC U+29FD
 779          "〈〉"                               ; U+3008 U+3009
 780          "《》"                               ; U+300A U+300B
 781          "「」"                               ; U+300C U+300D
 782          "『』"                               ; U+300E U+300F
 783          "【】"                               ; U+3010 U+3011
 784          "〔〕"                               ; U+3014 U+3015
 785          "〖〗"                               ; U+3016 U+3017
 786          "〘〙"                               ; U+3018 U+3019
 787          "〚〛"                               ; U+301A U+301B
 788          "﴾﴿"                               ; U+FD3E U+FD3F
 789          "︵︶"                               ; U+FE35 U+FE36
 790          "︷︸"                               ; U+FE37 U+FE38
 791          "︹︺"                               ; U+FE39 U+FE3A
 792          "︻︼"                               ; U+FE3B U+FE3C
 793          "︽︾"                               ; U+FE3D U+FE3E
 794          "︿﹀"                               ; U+FE3F U+FE40
 795          "﹁﹂"                               ; U+FE41 U+FE42
 796          "﹃﹄"                               ; U+FE43 U+FE44
 797          "﹙﹚"                               ; U+FE59 U+FE5A
 798          "﹛﹜"                               ; U+FE5B U+FE5C
 799          "﹝﹞"                               ; U+FE5D U+FE5E
 800          "（）"                               ; U+FF08 U+FF09
 801          "［］"                               ; U+FF3B U+FF3D
 802          "｛｝"                               ; U+FF5B U+FF5D
 803          "｟｠"                               ; U+FF5F U+FF60
 804          "｢｣"                               ; U+FF62 U+FF63
 805          )))
 806   (dolist (elt pairs)
 807     (modify-syntax-entry (aref elt 0) (string ?\( (aref elt 1)))
 808     (modify-syntax-entry (aref elt 1) (string ?\) (aref elt 0)))))
 809
 810 \f
 811 ;; For each character set, put the information of the most proper
 812 ;; coding system to encode it by `preferred-coding-system' property.
 813
 814 ;; Fixme: should this be junked?
 815 (let ((l '((latin-iso8859-1     . iso-latin-1)
 816            (latin-iso8859-2     . iso-latin-2)
 817            (latin-iso8859-3     . iso-latin-3)
 818            (latin-iso8859-4     . iso-latin-4)
 819            (thai-tis620         . thai-tis620)
 820            (greek-iso8859-7     . greek-iso-8bit)
 821            (arabic-iso8859-6    . iso-2022-7bit)
 822            (hebrew-iso8859-8    . hebrew-iso-8bit)
 823            (katakana-jisx0201   . japanese-shift-jis)
 824            (latin-jisx0201      . japanese-shift-jis)
 825            (cyrillic-iso8859-5  . cyrillic-iso-8bit)
 826            (latin-iso8859-9     . iso-latin-5)
 827            (japanese-jisx0208-1978 . iso-2022-jp)
 828            (chinese-gb2312      . chinese-iso-8bit)
 829            (chinese-gbk         . chinese-gbk)
 830            (gb18030-2-byte      . chinese-gb18030)
 831            (gb18030-4-byte-bmp  . chinese-gb18030)
 832            (gb18030-4-byte-smp  . chinese-gb18030)
 833            (gb18030-4-byte-ext-1 . chinese-gb18030)
 834            (gb18030-4-byte-ext-2 . chinese-gb18030)
 835            (japanese-jisx0208   . iso-2022-jp)
 836            (korean-ksc5601      . iso-2022-kr)
 837            (japanese-jisx0212   . iso-2022-jp)
 838            (chinese-big5-1      . chinese-big5)
 839            (chinese-big5-2      . chinese-big5)
 840            (chinese-sisheng     . iso-2022-7bit)
 841            (ipa                 . iso-2022-7bit)
 842            (vietnamese-viscii-lower . vietnamese-viscii)
 843            (vietnamese-viscii-upper . vietnamese-viscii)
 844            (arabic-digit        . iso-2022-7bit)
 845            (arabic-1-column     . iso-2022-7bit)
 846            (lao                 . lao)
 847            (arabic-2-column     . iso-2022-7bit)
 848            (indian-is13194      . devanagari)
 849            (indian-glyph        . devanagari)
 850            (tibetan-1-column    . tibetan)
 851            (ethiopic            . iso-2022-7bit)
 852            (chinese-cns11643-1  . iso-2022-cn)
 853            (chinese-cns11643-2  . iso-2022-cn)
 854            (chinese-cns11643-3  . iso-2022-cn)
 855            (chinese-cns11643-4  . iso-2022-cn)
 856            (chinese-cns11643-5  . iso-2022-cn)
 857            (chinese-cns11643-6  . iso-2022-cn)
 858            (chinese-cns11643-7  . iso-2022-cn)
 859            (indian-2-column     . devanagari)
 860            (tibetan             . tibetan)
 861            (latin-iso8859-14    . iso-latin-8)
 862            (latin-iso8859-15    . iso-latin-9))))
 863   (while l
 864     (put-charset-property (car (car l)) 'preferred-coding-system (cdr (car l)))
 865     (setq l (cdr l))))
 866
 867 \f
 868 ;; Setup auto-fill-chars for charsets that should invoke auto-filling.
 869 ;; SPACE and NEWLINE are already set.
 870
 871 (set-char-table-range auto-fill-chars '(#x3041 . #x30FF) t)
 872 (set-char-table-range auto-fill-chars '(#x3400 . #x4DB5) t)
 873 (set-char-table-range auto-fill-chars '(#x4e00 . #x9fbb) t)
 874 (set-char-table-range auto-fill-chars '(#xF900 . #xFAFF) t)
 875 (set-char-table-range auto-fill-chars '(#xFF00 . #xFF9F) t)
 876 (set-char-table-range auto-fill-chars '(#x20000 . #x2FFFF) t)
 877
 878 \f
 879 ;;; Setting char-width-table.  The default is 1.
 880
 881 ;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
 882 ;;    and final characters.
 883 (let ((l '((#x0300 . #x036F)
 884            (#x0483 . #x0489)
 885            (#x0591 . #x05BD)
 886            (#x05BF . #x05BF)
 887            (#x05C1 . #x05C2)
 888            (#x05C4 . #x05C5)
 889            (#x05C7 . #x05C7)
 890            (#x0600 . #x0603)
 891            (#x0610 . #x0615)
 892            (#x064B . #x065E)
 893            (#x0670 . #x0670)
 894            (#x06D6 . #x06E4)
 895            (#x06E7 . #x06E8)
 896            (#x06EA . #x06ED)
 897            (#x070F . #x070F)
 898            (#x0711 . #x0711)
 899            (#x0730 . #x074A)
 900            (#x07A6 . #x07B0)
 901            (#x07EB . #x07F3)
 902            (#x0901 . #x0902)
 903            (#x093C . #x093C)
 904            (#x0941 . #x0948)
 905            (#x094D . #x094D)
 906            (#x0951 . #x0954)
 907            (#x0962 . #x0963)
 908            (#x0981 . #x0981)
 909            (#x09BC . #x09BC)
 910            (#x09C1 . #x09C4)
 911            (#x09CD . #x09CD)
 912            (#x09E2 . #x09E3)
 913            (#x0A01 . #x0A02)
 914            (#x0A3C . #x0A3C)
 915            (#x0A41 . #x0A4D)
 916            (#x0A70 . #x0A71)
 917            (#x0A81 . #x0A82)
 918            (#x0ABC . #x0ABC)
 919            (#x0AC1 . #x0AC8)
 920            (#x0ACD . #x0ACD)
 921            (#x0AE2 . #x0AE3)
 922            (#x0B01 . #x0B01)
 923            (#x0B3C . #x0B3C)
 924            (#x0B3F . #x0B3F)
 925            (#x0B41 . #x0B43)
 926            (#x0B4D . #x0B56)
 927            (#x0B82 . #x0B82)
 928            (#x0BC0 . #x0BC0)
 929            (#x0BCD . #x0BCD)
 930            (#x0C3E . #x0C40)
 931            (#x0C46 . #x0C56)
 932            (#x0CBC . #x0CBC)
 933            (#x0CBF . #x0CBF)
 934            (#x0CC6 . #x0CC6)
 935            (#x0CCC . #x0CCD)
 936            (#x0CE2 . #x0CE3)
 937            (#x0D41 . #x0D43)
 938            (#x0D4D . #x0D4D)
 939            (#x0DCA . #x0DCA)
 940            (#x0DD2 . #x0DD6)
 941            (#x0E31 . #x0E31)
 942            (#x0E34 . #x0E3A)
 943            (#x0E47 . #x0E4E)
 944            (#x0EB1 . #x0EB1)
 945            (#x0EB4 . #x0EBC)
 946            (#x0EC8 . #x0ECD)
 947            (#x0F18 . #x0F19)
 948            (#x0F35 . #x0F35)
 949            (#x0F37 . #x0F37)
 950            (#x0F39 . #x0F39)
 951            (#x0F71 . #x0F7E)
 952            (#x0F80 . #x0F84)
 953            (#x0F86 . #x0F87)
 954            (#x0F90 . #x0FBC)
 955            (#x0FC6 . #x0FC6)
 956            (#x102D . #x1030)
 957            (#x1032 . #x1037)
 958            (#x1039 . #x1039)
 959            (#x1058 . #x1059)
 960            (#x1160 . #x11FF)
 961            (#x135F . #x135F)
 962            (#x1712 . #x1714)
 963            (#x1732 . #x1734)
 964            (#x1752 . #x1753)
 965            (#x1772 . #x1773)
 966            (#x17B4 . #x17B5)
 967            (#x17B7 . #x17BD)
 968            (#x17C6 . #x17C6)
 969            (#x17C9 . #x17D3)
 970            (#x17DD . #x17DD)
 971            (#x180B . #x180D)
 972            (#x18A9 . #x18A9)
 973            (#x1920 . #x1922)
 974            (#x1927 . #x1928)
 975            (#x1932 . #x1932)
 976            (#x1939 . #x193B)
 977            (#x1A17 . #x1A18)
 978            (#x1B00 . #x1B03)
 979            (#x1B34 . #x1B34)
 980            (#x1B36 . #x1B3A)
 981            (#x1B3C . #x1B3C)
 982            (#x1B42 . #x1B42)
 983            (#x1B6B . #x1B73)
 984            (#x1DC0 . #x1DFF)
 985            (#x200B . #x200F)
 986            (#x202A . #x202E)
 987            (#x2060 . #x206F)
 988            (#x20D0 . #x20EF)
 989            (#x302A . #x302F)
 990            (#x3099 . #x309A)
 991            (#xA806 . #xA806)
 992            (#xA80B . #xA80B)
 993            (#xA825 . #xA826)
 994            (#xFB1E . #xFB1E)
 995            (#xFE00 . #xFE0F)
 996            (#xFE20 . #xFE23)
 997            (#xFEFF . #xFEFF)
 998            (#xFFF9 . #xFFFB)
 999            (#x10A01 . #x10A0F)
1000            (#x10A38 . #x10A3F)
1001            (#x1D167 . #x1D169)
1002            (#x1D173 . #x1D182)
1003            (#x1D185 . #x1D18B)
1004            (#x1D1AA . #x1D1AD)
1005            (#x1D242 . #x1D244)
1006            (#xE0001 . #xE01EF))))
1007   (dolist (elt l)
1008     (set-char-table-range char-width-table elt 0)))
1009
1010 ;; 2: East Asian Wide and Full-width characters.
1011 (let ((l '((#x1100 . #x115F)
1012            (#x2329 . #x232A)
1013            (#x2E80 . #x303E)
1014            (#x3040 . #xA4CF)
1015            (#xAC00 . #xD7A3)
1016            (#xF900 . #xFAFF)
1017            (#xFE30 . #xFE6F)
1018            (#xFF01 . #xFF60)
1019            (#xFFE0 . #xFFE6)
1020            (#x20000 . #x2FFFF)
1021            (#x30000 . #x3FFFF))))
1022   (dolist (elt l)
1023     (set-char-table-range char-width-table elt 2)))
1024
1025 ;; Other double width
1026 ;;(map-charset-chars
1027 ;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
1028 ;; 'ethiopic)
1029 ;; (map-charset-chars
1030 ;;  (lambda (range ignore) (set-char-table-range char-width-table range 2))
1031 ;; 'tibetan)
1032 (map-charset-chars
1033  (lambda (range ignore) (set-char-table-range char-width-table range 2))
1034  'indian-2-column)
1035 (map-charset-chars
1036  (lambda (range ignore) (set-char-table-range char-width-table range 2))
1037  'arabic-2-column)
1038
1039 ;; Internal use only.
1040 ;; Alist of locale symbol vs charsets.  In a language environment
1041 ;; corresponding to the locale, width of characters in the charsets is
1042 ;; set to 2.  Each element has the form:
1043 ;;   (LOCALE TABLE (CHARSET (FROM-CODE . TO-CODE) ...) ...)
1044 ;; LOCALE: locale symbol
1045 ;; TABLE: char-table used for char-width-table, initially nil.
1046 ;; CAHRSET: character set
1047 ;; FROM-CODE, TO-CODE: range of code-points in CHARSET
1048
1049 (defvar cjk-char-width-table-list
1050   '((ja_JP nil (japanese-jisx0208 (#x2121 . #x287E))
1051                (cp932-2-byte (#x8140 . #x879F)))
1052     (zh_CN nil (chinese-gb2312 (#x2121 . #x297E)))
1053     (zh_HK nil (big5-hkscs (#xA140 . #xA3FE) (#xC6A0 . #xC8FE)))
1054     (zh_TW nil (big5 (#xA140 . #xA3FE))
1055                (chinese-cns11643-1 (#x2121 . #x427E)))
1056     (ko_KR nil (korean-ksc5601 (#x2121 . #x2C7E)))))
1057
1058 ;; Internal use only.
1059 ;; Setup char-width-table appropriate for a language environment
1060 ;; corresponding to LOCALE-NAME (symbol).
1061
1062 (defun use-cjk-char-width-table (locale-name)
1063   (while (char-table-parent char-width-table)
1064     (setq char-width-table (char-table-parent char-width-table)))
1065   (let ((slot (assq locale-name cjk-char-width-table-list))
1066         table)
1067     (or slot (error "Unknown locale for CJK language environment: %s"
1068                     locale-name))
1069     (unless (nth 1 slot)
1070       (let ((table (make-char-table nil)))
1071         (dolist (charset-info (nthcdr 2 slot))
1072           (let ((charset (car charset-info)))
1073             (dolist (code-range (cdr charset-info))
1074               (map-charset-chars #'(lambda (range arg)
1075                                      (set-char-table-range table range 2))
1076                                  charset nil
1077                                  (car code-range) (cdr code-range)))))
1078         (optimize-char-table table)
1079         (set-char-table-parent table char-width-table)
1080         (setcar (cdr slot) table)))
1081     (setq char-width-table (nth 1 slot))))
1082
1083 (defun use-default-char-width-table ()
1084   "Internal use only.
1085 Setup char-width-table appropriate for non-CJK language environment."
1086   (while (char-table-parent char-width-table)
1087     (setq char-width-table (char-table-parent char-width-table))))
1088
1089 (optimize-char-table (standard-case-table))
1090 (optimize-char-table (standard-syntax-table))
1091
1092 \f
1093 ;; Setting char-script-table.
1094
1095 ;; The Unicode blocks actually extend past some of these ranges with
1096 ;; undefined codepoints.
1097 (let ((script-list nil))
1098   (dolist
1099       (elt
1100        '((#x0000 #x007F latin)
1101          (#x00A0 #x024F latin)
1102          (#x0250 #x02AF phonetic)
1103          (#x02B0 #x036F latin)
1104          (#x0370 #x03E1 greek)
1105          (#x03E2 #x03EF coptic)
1106          (#x03F0 #x03F3 greek)
1107          (#x0400 #x04FF cyrillic)
1108          (#x0530 #x058F armenian)
1109          (#x0590 #x05FF hebrew)
1110          (#x0600 #x06FF arabic)
1111          (#x0700 #x074F syriac)
1112          (#x07C0 #x07FA nko)
1113          (#x0780 #x07BF thaana)
1114          (#x0900 #x097F devanagari)
1115          (#x0980 #x09FF bengali)
1116          (#x0A00 #x0A7F gurmukhi)
1117          (#x0A80 #x0AFF gujarati)
1118          (#x0B00 #x0B7F oriya)
1119          (#x0B80 #x0BFF tamil)
1120          (#x0C00 #x0C7F telugu)
1121          (#x0C80 #x0CFF kannada)
1122          (#x0D00 #x0D7F malayalam)
1123          (#x0D80 #x0DFF sinhala)
1124          (#x0E00 #x0E5F thai)
1125          (#x0E80 #x0EDF lao)
1126          (#x0F00 #x0FFF tibetan)
1127          (#x1000 #x105F myanmar)
1128          (#x10A0 #x10FF georgian)
1129          (#x1100 #x11FF hangul)
1130          (#x1200 #x139F ethiopic)
1131          (#x13A0 #x13FF cherokee)
1132          (#x1400 #x167F canadian-aboriginal)
1133          (#x1680 #x169F ogham)
1134          (#x16A0 #x16FF runic)
1135          (#x1780 #x17FF khmer)
1136          (#x1800 #x18AF mongolian)
1137          (#x1D00 #x1DFF phonetic)
1138          (#x1E00 #x1EFF latin)
1139          (#x1F00 #x1FFF greek)
1140          (#x2000 #x27FF symbol)
1141          (#x2800 #x28FF braille)
1142          (#x2D80 #x2DDF ethiopic)
1143          (#x2E80 #x2FDF han)
1144          (#x2FF0 #x2FFF ideographic-description)
1145          (#x3000 #x303F cjk-misc)
1146          (#x3040 #x30FF kana)
1147          (#x3100 #x312F bopomofo)
1148          (#x3130 #x318F hangul)
1149          (#x3190 #x319F kanbun)
1150          (#x31A0 #x31BF bopomofo)
1151          (#x3400 #x9FAF han)
1152          (#xA000 #xA4CF yi)
1153          (#xAA00 #xAA5F cham)
1154          (#xAA80 #xAADF tai-viet)
1155          (#xAC00 #xD7AF hangul)
1156          (#xF900 #xFAFF han)
1157          (#xFB1D #xFB4F hebrew)
1158          (#xFB50 #xFDFF arabic)
1159          (#xFE70 #xFEFC arabic)
1160          (#xFF00 #xFF5F cjk-misc)
1161          (#xFF61 #xFF9F kana)
1162          (#xFFE0 #xFFE6 cjk-misc)
1163          (#x10000 #x100FF linear-b)
1164          (#x10100 #x1013F aegean-number)
1165          (#x10140 #x1018A ancient-greek-number)
1166          (#x10190 #x1019B ancient-symbol)
1167          (#x101D0 #x101FF phaistos-disc)
1168          (#x10280 #x1029F lycian)
1169          (#x102A0 #x102DF carian)
1170          (#x10300 #x1032F olt-italic)
1171          (#x10380 #x1039F ugaritic)
1172          (#x103A0 #x103DF old-persian)
1173          (#x10400 #x1044F deseret)
1174          (#x10450 #x1047F shavian)
1175          (#x10480 #x104AF osmanya)
1176          (#x10800 #x1083F cypriot-syllabary)
1177          (#x10900 #x1091F phoenician)
1178          (#x10920 #x1093F lydian)
1179          (#x10A00 #x10A5F kharoshthi)
1180          (#x12000 #x123FF cuneiform)
1181          (#x12400 #x1247F cuneiform-numbers-and-punctuation)
1182          (#x1D000 #x1D0FF byzantine-musical-symbol)
1183          (#x1D100 #x1D1FF musical-symbol)
1184          (#x1D200 #x1D24F ancient-greek-musical-notation)
1185          (#x1D300 #x1D35F tai-xuan-jing-symbol)
1186          (#x1D360 #x1D37F counting-rod-numeral)
1187          (#x1D400 #x1D7FF mathematical)
1188          (#x1F000 #x1F02F mahjong-tile)
1189          (#x1F030 #x1F09F domino-tile)
1190          (#x20000 #x2AFFF han)
1191          (#x2F800 #x2FFFF han)))
1192     (set-char-table-range char-script-table
1193                           (cons (car elt) (nth 1 elt)) (nth 2 elt))
1194     (or (memq (nth 2 elt) script-list)
1195         (setq script-list (cons (nth 2 elt) script-list))))
1196   (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))
1197
1198 (map-charset-chars
1199  #'(lambda (range ignore)
1200      (set-char-table-range char-script-table range 'tibetan))
1201  'tibetan)
1202
1203 \f
1204 ;;; Setting unicode-category-table.
1205
1206 ;; This macro is to build unicode-category-table at compile time so
1207 ;; that C code can access the table efficiently.
1208 (defmacro build-unicode-category-table ()
1209   (let ((table (make-char-table 'unicode-category-table nil)))
1210     (dotimes (i #x110000)
1211       (if (or (< i #xD800)
1212               (and (>= i #xF900) (< i #x30000))
1213               (and (>= i #xE0000) (< i #xE0200)))
1214           (aset table i (get-char-code-property i 'general-category))))
1215     (set-char-table-range table '(#xE000 . #xF8FF) 'Co)
1216     (set-char-table-range table '(#xF0000 . #xFFFFD) 'Co)
1217     (set-char-table-range table '(#x100000 . #x10FFFD) 'Co)
1218     (optimize-char-table table 'eq)
1219     table))
1220
1221 (setq unicode-category-table (build-unicode-category-table))
1222 (map-char-table #'(lambda (key val)
1223                     (if (and val
1224                              (or (and (/= (aref (symbol-name val) 0) ?M)
1225                                       (/= (aref (symbol-name val) 0) ?C))
1226                                  (eq val 'Zs)))
1227                         (modify-category-entry key ?.)))
1228                 unicode-category-table)
1229
1230 (optimize-char-table (standard-category-table))
1231
1232 \f
1233 ;;; Setting word boundary.
1234
1235 (setq word-combining-categories
1236       '((nil . ?^)
1237         (?^ . nil)
1238         (?C . ?H)
1239         (?C . ?K)))
1240
1241 (setq word-separating-categories        ;  (2-byte character sets)
1242       '((?H . ?K)                       ; Hiragana - Katakana
1243         ))
1244
1245 ;; Local Variables:
1246 ;; coding: utf-8
1247 ;; End:
1248
1249 ;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
1250 ;;; characters.el ends here