;;; characters.el --- set syntax and category for multibyte characters
-;; Copyright (C) 1997, 2000-2012 Free Software Foundation, Inc.
+;; Copyright (C) 1997, 2000-2014 Free Software Foundation, Inc.
;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
;; National Institute of Advanced Industrial Science and Technology (AIST)
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
(let ((chars '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇)))
(dolist (elt chars)
- (modify-syntax-entry (car chars) "w")))
+ (modify-syntax-entry elt "w")))
(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E)
(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E)
(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E)
(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E)
(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E)
-(modify-category-entry ?ー ?K)
-(let ((chars '(?゛ ?゜)))
- (while chars
- (modify-category-entry (car chars) ?K)
- (modify-category-entry (car chars) ?H)
- (setq chars (cdr chars))))
(let ((chars '(?仝 ?々 ?〆 ?〇)))
(while chars
(modify-category-entry (car chars) ?C)
;; Bidi categories
-(map-char-table (lambda (key val)
- (cond
- ((memq val '(R AL RLO RLE))
- (modify-category-entry key ?R))
- ((memq val '(L LRE LRO))
- (modify-category-entry key ?L))))
- (unicode-property-table-internal 'bidi-class))
+;; If bootstrapping without generated uni-*.el files, table not defined.
+;; When the table is available, walk it and give each character a
+;; category derived from its bidi-class value: category `R' for the
+;; classes R, AL, RLO and RLE, category `L' for the classes L, LRE and
+;; LRO.  Characters with any other class are left untouched.
+(let ((table (unicode-property-table-internal 'bidi-class)))
+ (when table
+ (map-char-table (lambda (key val)
+ (cond
+ ((memq val '(R AL RLO RLE))
+ (modify-category-entry key ?R))
+ ((memq val '(L LRE LRO))
+ (modify-category-entry key ?L))))
+ table)))
;; Latin
(set-case-syntax ?¦ "_" tbl)
(set-case-syntax ?§ "." tbl)
(set-case-syntax ?© "_" tbl)
- (set-case-syntax-delims 171 187 tbl) ; « »
+ ;; French wants
+ ;; (set-case-syntax-delims ?« ?» tbl)
+ ;; And German wants
+ ;; (set-case-syntax-delims ?» ?« tbl)
+ ;; So let's stay neutral and let users set these up if/when they want to.
+ (set-case-syntax ?« "." tbl)
+ (set-case-syntax ?» "." tbl)
(set-case-syntax ?¬ "_" tbl)
(set-case-syntax ? "_" tbl)
(set-case-syntax ?® "_" tbl)
(#x01AC . #x01AD)
(#x01AF . #x01B0)
(#x01B3 . #x01B6)
+ (#x01B8 . #x01B9)
(#x01BC . #x01BD)
(#x01CD . #x01DC)
(#x01DE . #x01EF)
(set-case-syntax-pair from (1+ from) tbl)
(setq from (+ from 2))))))
- (set-case-syntax-pair #x189 #x256 tbl)
- (set-case-syntax-pair #x18A #x257 tbl)
+ (set-case-syntax-pair ?Ÿ ?ÿ tbl)
;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
;; (set-downcase-syntax ?İ ?i tbl)
;; (set-upcase-syntax ?I ?ı tbl)
+ (set-case-syntax-pair ?Ɓ ?ɓ tbl)
+ (set-case-syntax-pair ?Ɔ ?ɔ tbl)
+ (set-case-syntax-pair ?Ɖ ?ɖ tbl)
+ (set-case-syntax-pair ?Ɗ ?ɗ tbl)
+ (set-case-syntax-pair ?Ǝ ?ǝ tbl)
+ (set-case-syntax-pair ?Ə ?ə tbl)
+ (set-case-syntax-pair ?Ɛ ?ɛ tbl)
+ (set-case-syntax-pair ?Ɠ ?ɠ tbl)
+ (set-case-syntax-pair ?Ɣ ?ɣ tbl)
+ (set-case-syntax-pair ?Ɩ ?ɩ tbl)
+ (set-case-syntax-pair ?Ɨ ?ɨ tbl)
+ (set-case-syntax-pair ?Ɯ ?ɯ tbl)
+ (set-case-syntax-pair ?Ɲ ?ɲ tbl)
+ (set-case-syntax-pair ?Ɵ ?ɵ tbl)
+ (set-case-syntax-pair ?Ʀ ?ʀ tbl)
+ (set-case-syntax-pair ?Ʃ ?ʃ tbl)
+ (set-case-syntax-pair ?Ʈ ?ʈ tbl)
+ (set-case-syntax-pair ?Ʊ ?ʊ tbl)
+ (set-case-syntax-pair ?Ʋ ?ʋ tbl)
+ (set-case-syntax-pair ?Ʒ ?ʒ tbl)
;; Digraph letters.  Every operand is one precomposed character, not a
;; two-letter sequence (which is not valid `?' character syntax):
;; U+01C4 and U+01C5 pair with U+01C6, U+01C7 pairs with U+01C9,
;; and U+01F2 pairs with U+01F3.
(set-case-syntax-pair ?Ǆ ?ǆ tbl)
(set-case-syntax-pair ?ǅ ?ǆ tbl)
(set-case-syntax-pair ?Ǉ ?ǉ tbl)
(set-case-syntax-pair ?ǲ ?ǳ tbl)
(set-case-syntax-pair ?Ƕ ?ƕ tbl)
(set-case-syntax-pair ?Ƿ ?ƿ tbl)
+ (set-case-syntax-pair ?Ⱥ ?ⱥ tbl)
+ (set-case-syntax-pair ?Ƚ ?ƚ tbl)
+ (set-case-syntax-pair ?Ⱦ ?ⱦ tbl)
+ (set-case-syntax-pair ?Ƀ ?ƀ tbl)
+ (set-case-syntax-pair ?Ʉ ?ʉ tbl)
+ (set-case-syntax-pair ?Ʌ ?ʌ tbl)
;; Latin Extended Additional
(modify-category-entry '(#x1e00 . #x1ef9) ?l)
;; Combining diacritics
(modify-category-entry '(#x300 . #x362) ?^)
;; Combining marks
- (modify-category-entry '(#x20d0 . #x20e3) ?^)
+ (modify-category-entry '(#x20d0 . #x20ff) ?^)
;; Fixme: syntax for symbols &c
)
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'tibetan)
(map-charset-chars
- (lambda (range ignore) (set-char-table-range char-width-table range 2))
+ (lambda (range _ignore) (set-char-table-range char-width-table range 2))
'indian-2-column)
(map-charset-chars
- (lambda (range ignore) (set-char-table-range char-width-table range 2))
+ (lambda (range _ignore) (set-char-table-range char-width-table range 2))
'arabic-2-column)
;; Internal use only.
;; (LOCALE TABLE (CHARSET (FROM-CODE . TO-CODE) ...) ...)
;; LOCALE: locale symbol
;; TABLE: char-table used for char-width-table, initially nil.
-;; CAHRSET: character set
+;; CHARSET: character set
;; FROM-CODE, TO-CODE: range of code-points in CHARSET
(defvar cjk-char-width-table-list
(defun use-cjk-char-width-table (locale-name)
(while (char-table-parent char-width-table)
(setq char-width-table (char-table-parent char-width-table)))
- (let ((slot (assq locale-name cjk-char-width-table-list))
- table)
+ (let ((slot (assq locale-name cjk-char-width-table-list)))
(or slot (error "Unknown locale for CJK language environment: %s"
locale-name))
(unless (nth 1 slot)
(dolist (charset-info (nthcdr 2 slot))
(let ((charset (car charset-info)))
(dolist (code-range (cdr charset-info))
- (map-charset-chars #'(lambda (range arg)
+ (map-charset-chars #'(lambda (range _arg)
(set-char-table-range table range 2))
charset nil
(car code-range) (cdr code-range)))))
\f
;; Setting char-script-table.
+;; The data is compiled from Blocks.txt and Scripts.txt in the
+;; "Unicode Character Database", simplified to lump together all the
+;; blocks belonging to the same language. E.g., "Basic Latin",
+;; "Latin-1 Supplement", "Latin Extended-A", etc. are all lumped
+;; together under "latin".
+;;
;; The Unicode blocks actually extend past some of these ranges with
;; undefined codepoints.
(let ((script-list nil))
(#x0370 #x03E1 greek)
(#x03E2 #x03EF coptic)
(#x03F0 #x03F3 greek)
- (#x0400 #x04FF cyrillic)
+ (#x0400 #x052F cyrillic)
(#x0530 #x058F armenian)
(#x0590 #x05FF hebrew)
(#x0600 #x06FF arabic)
(#x0700 #x074F syriac)
- (#x07C0 #x07FA nko)
+ (#x0750 #x077F arabic)
(#x0780 #x07BF thaana)
+ (#x07C0 #x07FF nko)
+ (#x0800 #x083F samaritan)
+ (#x0840 #x085F mandaic)
+ (#x08A0 #x08FF arabic)
(#x0900 #x097F devanagari)
(#x0980 #x09FF bengali)
(#x0A00 #x0A7F gurmukhi)
(#x0C80 #x0CFF kannada)
(#x0D00 #x0D7F malayalam)
(#x0D80 #x0DFF sinhala)
- (#x0E00 #x0E5F thai)
- (#x0E80 #x0EDF lao)
+ (#x0E00 #x0E7F thai)
+ (#x0E80 #x0EFF lao)
(#x0F00 #x0FFF tibetan)
- (#x1000 #x109F burmese)
+ (#x1000 #x109F burmese) ; according to Unicode 6.1, should be "myanmar"
(#x10A0 #x10FF georgian)
(#x1100 #x11FF hangul)
(#x1200 #x139F ethiopic)
(#x1400 #x167F canadian-aboriginal)
(#x1680 #x169F ogham)
(#x16A0 #x16FF runic)
+ (#x1700 #x171F tagalog)
+ (#x1720 #x173F hanunoo)
+ (#x1740 #x175F buhid)
+ (#x1760 #x177F tagbanwa)
(#x1780 #x17FF khmer)
(#x1800 #x18AF mongolian)
- (#x1D00 #x1DFF phonetic)
- (#x1E00 #x1EFF latin)
+ (#x18B0 #x18FF canadian-aboriginal)
+ (#x1900 #x194F limbu)
+ (#x1950 #x197F tai-le)
+ (#x1980 #x19DF tai-lue)
+ (#x19E0 #x19FF khmer)
+ (#x1A00 #x1A1F buginese) ; whole Buginese block, not just its first code point
+ (#x1A20 #x1AAF tai-tham)
+ (#x1B00 #x1B7F balinese)
+ (#x1B80 #x1BBF sundanese)
+ (#x1BC0 #x1BFF batak)
+ (#x1C00 #x1C4F lepcha)
+ (#x1C50 #x1C7F ol-chiki)
+ (#x1CC0 #x1CCF sundanese)
+ (#x1CD0 #x1CFF vedic)
+ (#x1D00 #x1DBF phonetic)
+ (#x1DC0 #x1EFF latin)
(#x1F00 #x1FFF greek)
(#x2000 #x27FF symbol)
(#x2800 #x28FF braille)
+ (#x2900 #x2BFF symbol)
+ (#x2C00 #x2C5F glagolitic)
+ (#x2C60 #x2C7F latin)
+ (#x2C80 #x2CFF coptic)
+ (#x2D00 #x2D2F georgian)
+ (#x2D30 #x2D7F tifinagh)
(#x2D80 #x2DDF ethiopic)
+ (#x2DE0 #x2DFF cyrillic)
+ (#x2E00 #x2E7F symbol)
(#x2E80 #x2FDF han)
(#x2FF0 #x2FFF ideographic-description)
(#x3000 #x303F cjk-misc)
(#x3130 #x318F hangul)
(#x3190 #x319F kanbun)
(#x31A0 #x31BF bopomofo)
- (#x3400 #x9FAF han)
+ (#x31C0 #x31EF cjk-misc)
+ (#x31F0 #x31FF kana)
+ (#x3200 #x9FAF han)
(#xA000 #xA4CF yi)
+ (#xA4D0 #xA4FF lisu)
+ (#xA500 #xA63F vai)
+ (#xA640 #xA69F cyrillic)
+ (#xA6A0 #xA6FF bamum)
+ (#xA700 #xA7FF latin)
+ (#xA800 #xA82F syloti-nagri)
+ (#xA830 #xA83F north-indic-number)
+ (#xA840 #xA87F phags-pa)
+ (#xA880 #xA8DF saurashtra)
+ (#xA8E0 #xA8FF devanagari)
+ (#xA900 #xA92F kayah-li)
+ (#xA930 #xA95F rejang)
+ (#xA960 #xA97F hangul)
+ (#xA980 #xA9DF javanese)
(#xAA00 #xAA5F cham)
- (#xAA60 #xAA7B burmese)
+ (#xAA60 #xAA7B burmese) ; Unicode 6.1: "myanmar"
(#xAA80 #xAADF tai-viet)
- (#xAC00 #xD7AF hangul)
+ (#xAAE0 #xAAFF meetei-mayek)
+ (#xAB00 #xAB2F ethiopic)
+ (#xABC0 #xABFF meetei-mayek)
+ (#xAC00 #xD7FF hangul)
(#xF900 #xFAFF han)
(#xFB1D #xFB4F hebrew)
(#xFB50 #xFDFF arabic)
- (#xFE70 #xFEFC arabic)
+ (#xFE30 #xFE4F han)
+ (#xFE70 #xFEFF arabic)
(#xFF00 #xFF5F cjk-misc)
(#xFF61 #xFF9F kana)
(#xFFE0 #xFFE6 cjk-misc)
(#x10000 #x100FF linear-b)
(#x10100 #x1013F aegean-number)
- (#x10140 #x1018A ancient-greek-number)
- (#x10190 #x1019B ancient-symbol)
+ (#x10140 #x1018F ancient-greek-number)
+ (#x10190 #x101CF ancient-symbol)
(#x101D0 #x101FF phaistos-disc)
(#x10280 #x1029F lycian)
(#x102A0 #x102DF carian)
(#x10300 #x1032F old-italic)
+ (#x10330 #x1034F gothic)
(#x10380 #x1039F ugaritic)
(#x103A0 #x103DF old-persian)
(#x10400 #x1044F deseret)
(#x10450 #x1047F shavian)
(#x10480 #x104AF osmanya)
(#x10800 #x1083F cypriot-syllabary)
+ (#x10840 #x1085F aramaic)
(#x10900 #x1091F phoenician)
(#x10920 #x1093F lydian)
+ (#x10980 #x109FF meroitic)
(#x10A00 #x10A5F kharoshthi)
+ (#x10A60 #x10A7F old-south-arabian)
+ (#x10B00 #x10B3F avestan)
+ (#x10B40 #x10B5F inscriptional-parthian)
+ (#x10B60 #x10B7F inscriptional-pahlavi)
+ (#x10C00 #x10C4F old-turkic)
+ (#x10E60 #x10E7F rumi-number)
+ (#x11000 #x1107F brahmi)
+ (#x11080 #x110CF kaithi)
+ (#x110D0 #x110FF sora-sompeng)
+ (#x11100 #x1114F chakma)
+ (#x11180 #x111DF sharada)
+ (#x11680 #x116CF takri)
(#x12000 #x123FF cuneiform)
(#x12400 #x1247F cuneiform-numbers-and-punctuation)
+ (#x13000 #x1342F egyptian)
+ (#x16800 #x16A3F bamum)
+ (#x16F00 #x16F9F miao)
+ (#x1B000 #x1B0FF kana)
(#x1D000 #x1D0FF byzantine-musical-symbol)
(#x1D100 #x1D1FF musical-symbol)
(#x1D200 #x1D24F ancient-greek-musical-notation)
(#x1D300 #x1D35F tai-xuan-jing-symbol)
(#x1D360 #x1D37F counting-rod-numeral)
(#x1D400 #x1D7FF mathematical)
+ (#x1EE00 #x1EEFF arabic)
(#x1F000 #x1F02F mahjong-tile)
(#x1F030 #x1F09F domino-tile)
- (#x20000 #x2AFFF han)
+ (#x1F0A0 #x1F0FF playing-cards)
+ (#x1F100 #x1F1FF symbol)
+ (#x1F200 #x1F2FF han)
+ (#x1F300 #x1F64F symbol)
+ (#x1F680 #x1F77F symbol)
+ (#x20000 #x2B81F han)
(#x2F800 #x2FFFF han)))
(set-char-table-range char-script-table
(cons (car elt) (nth 1 elt)) (nth 2 elt))
(set-char-table-extra-slot char-script-table 0 (nreverse script-list)))
(map-charset-chars
- #'(lambda (range ignore)
+ #'(lambda (range _ignore)
(set-char-table-range char-script-table range 'tibetan))
'tibetan)
\f
;;; Setting unicode-category-table.
-(setq unicode-category-table
- (unicode-property-table-internal 'general-category))
-(map-char-table #'(lambda (key val)
- (if (and val
- (or (and (/= (aref (symbol-name val) 0) ?M)
- (/= (aref (symbol-name val) 0) ?C))
- (eq val 'Zs)))
- (modify-category-entry key ?.)))
- unicode-category-table)
+;; Store the general-category property table in
+;; `unicode-category-table'.  The walk below runs only when that
+;; table is non-nil.
+(when (setq unicode-category-table
+ (unicode-property-table-internal 'general-category))
+ ;; Give category `.' to every character that has a general-category
+ ;; value whose name starts with neither `M' nor `C'.
+ ;; NOTE(review): the explicit `Zs' test looks redundant, because the
+ ;; name `Zs' already passes the first-letter test -- confirm.
+ (map-char-table #'(lambda (key val)
+ (if (and val
+ (or (and (/= (aref (symbol-name val) 0) ?M)
+ (/= (aref (symbol-name val) 0) ?C))
+ (eq val 'Zs)))
+ (modify-category-entry key ?.)))
+ unicode-category-table))
(optimize-char-table (standard-category-table))
(or (memq method '(zero-width thin-space empty-box acronym hex-code))
(error "Invalid glyphless character display method: %s" method))
(cond ((eq target 'c0-control)
- (set-char-table-range glyphless-char-display '(#x00 . #x1F)
- method)
+ (glyphless-set-char-table-range glyphless-char-display
+ #x00 #x1F method)
;; Users will not expect their newlines and TABs be
;; displayed as anything but themselves, so exempt those
;; two characters from c0-control.
(set-char-table-range glyphless-char-display #x9 nil)
(set-char-table-range glyphless-char-display #xa nil))
((eq target 'c1-control)
- (set-char-table-range glyphless-char-display '(#x80 . #x9F)
- method))
+ (glyphless-set-char-table-range glyphless-char-display
+ #x80 #x9F method))
((eq target 'format-control)
- (map-char-table
- #'(lambda (char category)
- (if (eq category 'Cf)
- (let ((this-method method)
- from to)
- (if (consp char)
- (setq from (car char) to (cdr char))
- (setq from char to char))
- (while (<= from to)
- (when (/= from #xAD)
- (if (eq method 'acronym)
- (setq this-method
- (aref char-acronym-table from)))
- (set-char-table-range glyphless-char-display
- from this-method))
- (setq from (1+ from))))))
- unicode-category-table))
+ (when unicode-category-table
+ (map-char-table
+ #'(lambda (char category)
+ (if (eq category 'Cf)
+ (let ((this-method method)
+ from to)
+ (if (consp char)
+ (setq from (car char) to (cdr char))
+ (setq from char to char))
+ (while (<= from to)
+ (when (/= from #xAD)
+ (if (eq method 'acronym)
+ (setq this-method
+ (aref char-acronym-table from)))
+ (set-char-table-range glyphless-char-display
+ from this-method))
+ (setq from (1+ from))))))
+ unicode-category-table)))
((eq target 'no-font)
(set-char-table-extra-slot glyphless-char-display 0 method))
(t
(error "Invalid glyphless character group: %s" target))))))
+(defun glyphless-set-char-table-range (chartable from to method)
+  "Set the entries of CHARTABLE for characters FROM through TO using METHOD.
+If METHOD is the symbol `acronym', every character in the range gets
+its own entry, taken from `char-acronym-table'.  For any other METHOD
+the whole range is set to METHOD with a single call."
+  (if (not (eq method 'acronym))
+      (set-char-table-range chartable (cons from to) method)
+    ;; Acronyms differ per character, so fill the range one code at a time.
+    (dotimes (offset (1+ (- to from)))
+      (let ((char (+ from offset)))
+        (set-char-table-range chartable char
+                              (aref char-acronym-table char))))))
+
;;; Control of displaying glyphless characters.
(defcustom glyphless-char-display-control
'((format-control . thin-space)
`empty-box': display an empty box.
`acronym': display an acronym of the character in a box. The
acronym is taken from `char-acronym-table', which see.
- `hex-code': display the hexadecimal character code in a box."
+ `hex-code': display the hexadecimal character code in a box.
+Do not set its value directly from Lisp; the value takes effect
+only via a custom `:set'
+function (`update-glyphless-char-display'), which updates
+`glyphless-char-display'."
+ :version "24.1"
:type '(alist :key-type (symbol :tag "Character Group")
:value-type (symbol :tag "Display Method"))
:options '((c0-control