;; unidata-gen.el -- Create files containing character property data.
+
+;; Copyright (C) 2008-2014 Free Software Foundation, Inc.
+
;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
;; National Institute of Advanced Industrial Science and Technology (AIST)
;; Registration Number H13PRO009
;; SPECIAL NOTICE
;;
;; This file must be byte-compilable/loadable by `temacs' and also
-;; the entry function `unidata-gen-files' must be runnable by
-;; `temacs'.
+;; the entry function `unidata-gen-files' must be runnable by `temacs'.
;; FILES TO BE GENERATED
;;
;; The entry function `unidata-gen-files' generates these files in
-;; the current directory.
+;; in directory specified by its dest-dir argument.
;;
;; charprop.el
;; It contains a series of forms of this format:
(defvar unidata-list nil)
-;; Name of the directory containing files of Unicode Character
-;; Database.
+;; Name of the directory containing files of Unicode Character Database.
+;; Dynamically bound in unidata-gen-files.
(defvar unidata-dir nil)
(defun unidata-setup-list (unidata-text-file)
(setq unidata-list (cdr table))))
;; Alist of this form:
-;; (PROP INDEX GENERATOR FILENAME DOCSTRING DESCRIBER VAL-LIST)
+;; (PROP INDEX GENERATOR FILENAME DOCSTRING DESCRIBER DEFAULT VAL-LIST)
;; PROP: character property
;; INDEX: index to each element of unidata-list for PROP.
;; It may be a function that generates an alist of character codes
;; FILENAME: filename to store the char-table
;; DOCSTRING: docstring for the property
;; DESCRIBER: function to call to get a description string of property value
-;; DEFAULT: the default value of the property
+;; DEFAULT: the default value of the property. It may have the form
+;; (VAL0 (FROM1 TO1 VAL1) ...) which indicates that the default
+;; value is VAL0 except for characters in the ranges specified by
+;; FROMn and TOn (inclusive). The default value of characters
+;; between FROMn and TOn is VALn.
;; VAL-LIST: list of specially ordered property values
(defconst unidata-prop-alist
'((name
1 unidata-gen-table-name "uni-name.el"
"Unicode character name.
-Property value is a string.")
+Property value is a string or nil.
+The value nil stands for the default value \"null string\")."
+ nil
+ nil)
(general-category
2 unidata-gen-table-symbol "uni-category.el"
"Unicode general category.
Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn"
unidata-describe-general-category
- nil
+ Cn
;; The order of elements must be in sync with unicode_category_t
;; in src/character.h.
(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po
3 unidata-gen-table-integer "uni-combining.el"
"Unicode canonical combining class.
Property value is an integer."
- unidata-describe-canonical-combining-class)
+ unidata-describe-canonical-combining-class
+ 0)
(bidi-class
4 unidata-gen-table-symbol "uni-bidi.el"
"Unicode bidi class.
Property value is one of the following symbols:
- L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET,
- AN, CS, NSM, BN, B, S, WS, ON"
+ L, LRE, LRO, LRI, R, AL, RLE, RLO, RLI, FSI, PDF, PDI,
+ EN, ES, ET, AN, CS, NSM, BN, B, S, WS, ON"
unidata-describe-bidi-class
- L
+ ;; The assignment of default values to blocks of code points
+ ;; follows the file DerivedBidiClass.txt from the Unicode
+ ;; Character Database (UCD).
+ (L (#x0600 #x06FF AL) (#xFB50 #xFDFF AL) (#xFE70 #xFEFF AL)
+ (#x0590 #x05FF R) (#x07C0 #x08FF R)
+ (#xFB1D #xFB4F R) (#x10800 #x10FFF R) (#x1E800 #x1EFFF R))
;; The order of elements must be in sync with bidi_type_t in
;; src/dispextern.h.
- (L R EN AN BN B AL LRE LRO RLE RLO PDF ES ET CS NSM S WS ON))
+ (L R EN AN BN B AL LRE LRO RLE RLO PDF LRI RLI FSI PDI
+ ES ET CS NSM S WS ON))
(decomposition
5 unidata-gen-table-decomposition "uni-decomposition.el"
"Unicode decomposition mapping.
(decimal-digit-value
6 unidata-gen-table-integer "uni-decimal.el"
"Unicode numeric value (decimal digit).
-Property value is an integer.")
+Property value is an integer 0..9, or nil.
+The value nil stands for NaN \"Numeric_Value\".")
(digit-value
7 unidata-gen-table-integer "uni-digit.el"
"Unicode numeric value (digit).
-Property value is an integer.")
+Property value is an integer 0..9, or nil.
+The value nil stands for NaN \"Numeric_Value\".")
(numeric-value
8 unidata-gen-table-numeric "uni-numeric.el"
"Unicode numeric value (numeric).
-Property value is an integer or a floating point.")
+Property value is an integer, a floating point, or nil.
+The value nil stands for NaN \"Numeric_Value\".")
(mirrored
9 unidata-gen-table-symbol "uni-mirrored.el"
"Unicode bidi mirrored flag.
-Property value is a symbol `Y' or `N'. See also the property `mirroring'.")
+Property value is a symbol `Y' or `N'. See also the property `mirroring'."
+ nil
+ N)
(old-name
10 unidata-gen-table-name "uni-old-name.el"
"Unicode old names as published in Unicode 1.0.
-Property value is a string.")
+Property value is a string or nil.
+The value nil stands for the default value \"null string\").")
(iso-10646-comment
11 unidata-gen-table-name "uni-comment.el"
"Unicode ISO 10646 comment.
(uppercase
12 unidata-gen-table-character "uni-uppercase.el"
"Unicode simple uppercase mapping.
-Property value is a character."
+Property value is a character or nil.
+The value nil means that the actual property value of a character
+is the character itself."
string)
(lowercase
13 unidata-gen-table-character "uni-lowercase.el"
"Unicode simple lowercase mapping.
-Property value is a character."
+Property value is a character or nil.
+The value nil means that the actual property value of a character
+is the character itself."
string)
(titlecase
14 unidata-gen-table-character "uni-titlecase.el"
"Unicode simple titlecase mapping.
-Property value is a character."
+Property value is a character or nil.
+The value nil means that the actual property value of a character
+is the character itself."
string)
(mirroring
unidata-gen-mirroring-list unidata-gen-table-character "uni-mirrored.el"
"Unicode bidi-mirroring characters.
-Property value is a character that has the corresponding mirroring image,
-or nil for non-mirrored character.")))
+Property value is a character that has the corresponding mirroring image or nil.
+The value nil means that the actual property value of a character
+is the character itself.")))
;; Functions to access the above data.
(defsubst unidata-prop-index (prop) (nth 1 (assq prop unidata-prop-alist)))
;; If VAL is one of VALn, just return n.
;; Otherwise, VAL-LIST is modified to this:
;; ((nil . 0) (VAL1 . 1) (VAL2 . 2) ... (VAL . n+1))
+;;
+;; WARN is an optional warning to display when the value list is
+;; extended, for property values that need to be in sync with other
+;; parts of Emacs; currently only used for bidi-class.
-(defun unidata-encode-val (val-list val)
+(defun unidata-encode-val (val-list val &optional warn)
(let ((slot (assoc val val-list))
val-code)
(if slot
(cdr slot)
+ (if warn (message warn val))
(setq val-code (length val-list))
(nconc val-list (list (cons val val-code)))
val-code)))
(let ((table (make-char-table 'char-code-property-table))
(prop-idx (unidata-prop-index prop))
(vec (make-vector 128 0))
+ ;; When this warning is printed, there's a need to make the
+ ;; following changes:
+ ;; (1) update unidata-prop-alist with the new bidi-class values;
+ ;; (2) extend bidi_type_t enumeration on src/dispextern.h to
+ ;; include the new classes;
+ ;; (3) possibly update the assertion in bidi.c:bidi_check_type; and
+ ;; (4) possibly update the switch cases in
+ ;; bidi.c:bidi_get_type and bidi.c:bidi_get_category.
+ (bidi-warning "\
+** Found new bidi-class '%s', please update bidi.c and dispextern.h")
tail elt range val val-code idx slot
prev-range-data)
(setq val-list (cons nil (copy-sequence val-list)))
(while tail
(setcar tail (cons (car tail) val-code))
(setq tail (cdr tail) val-code (1+ val-code)))
- (setq default-value (unidata-encode-val val-list default-value))
- (set-char-table-range table t default-value)
- (set-char-table-range table nil default-value)
+ (if (consp default-value)
+ (setq default-value (copy-sequence default-value))
+ (setq default-value (list default-value)))
+ (setcar default-value
+ (unidata-encode-val val-list (car default-value)))
+ (set-char-table-range table t (car default-value))
+ (set-char-table-range table nil (car default-value))
+ (dolist (elm (cdr default-value))
+ (setcar (nthcdr 2 elm)
+ (unidata-encode-val val-list (nth 2 elm)))
+ (set-char-table-range table (cons (car elm) (nth 1 elm)) (nth 2 elm)))
+
(setq tail unidata-list)
(while tail
(setq elt (car tail) tail (cdr tail))
(setq range (car elt)
val (funcall val-func (nth prop-idx elt)))
- (setq val-code (if val (unidata-encode-val val-list val)))
+ (setq val-code (if val (unidata-encode-val val-list val
+ (and (eq prop 'bidi-class)
+ bidi-warning))))
(if (consp range)
(when val-code
(set-char-table-range table range val-code)
(setq prev-range-data (cons (cons from to) val-code)))))
(let* ((start (lsh (lsh range -7) 7))
(limit (+ start 127))
- str count new-val)
- (fillarray vec 0)
- ;; See the comment above.
- (when (and prev-range-data
- (>= (cdr (car prev-range-data)) start))
- (let ((from (car (car prev-range-data)))
- (to (cdr (car prev-range-data)))
- (vcode (cdr prev-range-data)))
+ str count new-val from to vcode)
+ (fillarray vec (car default-value))
+ (dolist (elm (cdr default-value))
+ (setq from (car elm) to (nth 1 elm))
+ (when (and (<= from limit)
+ (or (>= from start) (>= to start)))
+ (setq from (max from start)
+ to (min to limit)
+ vcode (nth 2 elm))
(while (<= from to)
(aset vec (- from start) vcode)
(setq from (1+ from)))))
+ ;; See the comment above.
+ (when (and prev-range-data
+ (>= (cdr (car prev-range-data)) start))
+ (setq from (car (car prev-range-data))
+ to (cdr (car prev-range-data))
+ vcode (cdr prev-range-data))
+ (while (<= from to)
+ (aset vec (- from start) vcode)
+ (setq from (1+ from))))
(setq prev-range-data nil)
(if val-code
(aset vec (- range start) val-code))
(setq new-val (funcall val-func (nth prop-idx elt)))
(if (not (eq val new-val))
(setq val new-val
- val-code (if val (unidata-encode-val val-list val))))
+ val-code (if val (unidata-encode-val
+ val-list val (and (eq prop 'bidi-class)
+ bidi-warning)))))
(if val-code
(aset vec (- range start) val-code))
(setq tail (cdr tail)))
;; Return a name of CHAR. VAL is the current value of (aref TABLE
;; CHAR).
-(defun unidata-get-name (char val table)
+(fset 'unidata-get-name '(lambda (char val table)
(cond
((stringp val)
(if (> (aref val 0) 0)
((eq sym 'CJK\ COMPATIBILITY\ IDEOGRAPH)
(format "%s-%04X" sym char))
((eq sym 'VARIATION\ SELECTOR)
- (format "%s-%d" sym (+ (- char #xe0100) 17))))))))
+ (format "%s-%d" sym (+ (- char #xe0100) 17)))))))))
;; Store VAL as the name of CHAR in TABLE.
-(defun unidata-put-name (char val table)
+(fset 'unidata-put-name '(lambda (char val table)
(let ((current-val (aref table char)))
(if (and (stringp current-val) (= (aref current-val 0) 0))
(funcall (char-table-extra-slot table 1) char current-val table))
- (aset table char val)))
+ (aset table char val))))
(defun unidata-get-decomposition (char val table)
(cond
+ ((not val)
+ (list char))
+
((consp val)
val)
(aset vec idx (nconc word-list tail-list)))
(dotimes (i 128)
(aset table (+ first-char i) (aref vec i)))
- (aref vec (- char first-char)))))
+ (setq val (aref vec (- char first-char)))
+ (or val (list char)))))
;; Hangul syllable
((and (eq val 0) (>= char #xAC00) (<= char #xD7A3))
(l nil)
(idx 0)
c)
- (if (= len 0)
+ (if (or (= len 0)
+ ;; Unicode Standard, paragraph 4.8: "For all other
+ ;; Unicode code points of all other types (Control,
+ ;; Private-Use, Surrogate, Noncharacter, and Reserved),
+ ;; the value of the Name property is the null string."
+ ;; We already handle elsewhere all the characters except
+ ;; Cc, Control characters, which are handled here.
+ (string= str "<control>"))
nil
(dotimes (i len)
(setq c (aref str i))
(defun unidata-gen-table-name (prop &rest ignore)
(let* ((table (unidata-gen-table-word-list prop 'unidata-split-name))
(word-tables (char-table-extra-slot table 4)))
- (byte-compile 'unidata-get-name)
- (byte-compile 'unidata-put-name)
(set-char-table-extra-slot table 1 (symbol-function 'unidata-get-name))
(set-char-table-extra-slot table 2 (symbol-function 'unidata-put-name))
(defun unidata-gen-table-decomposition (prop &rest ignore)
(let* ((table (unidata-gen-table-word-list prop 'unidata-split-decomposition))
(word-tables (char-table-extra-slot table 4)))
- (byte-compile 'unidata-get-decomposition)
- (byte-compile 'unidata-put-decomposition)
(set-char-table-extra-slot table 1
(symbol-function 'unidata-get-decomposition))
(set-char-table-extra-slot table 2
\f
-(defun unidata-describe-general-category (val)
+(fset
+ 'unidata-describe-general-category '(lambda (val)
(cdr (assq val
'((nil . "Uknown")
(Lu . "Letter, Uppercase")
(Cf . "Other, Format")
(Cs . "Other, Surrogate")
(Co . "Other, Private Use")
- (Cn . "Other, Not Assigned")))))
+ (Cn . "Other, Not Assigned"))))))
-(defun unidata-describe-canonical-combining-class (val)
+(fset 'unidata-describe-canonical-combining-class '(lambda (val)
(cdr (assq val
'((0 . "Spacing, split, enclosing, reordrant, and Tibetan subjoined")
(1 . "Overlays and interior")
(232 . "Above right")
(233 . "Double below")
(234 . "Double above")
- (240 . "Below (iota subscript)")))))
+ (240 . "Below (iota subscript)"))))))
-(defun unidata-describe-bidi-class (val)
+(fset 'unidata-describe-bidi-class '(lambda (val)
(cdr (assq val
'((L . "Left-to-Right")
(LRE . "Left-to-Right Embedding")
(RLE . "Right-to-Left Embedding")
(RLO . "Right-to-Left Override")
(PDF . "Pop Directional Format")
+ (LRI . "Left-to-Right Isolate")
+ (RLI . "Right-to-Left Isolate")
+ (FSI . "First Strong Isolate")
+ (PDI . "Pop Directional Isolate")
(EN . "European Number")
(ES . "European Number Separator")
(ET . "European Number Terminator")
(B . "Paragraph Separator")
(S . "Segment Separator")
(WS . "Whitespace")
- (ON . "Other Neutrals")))))
+ (ON . "Other Neutrals"))))))
(defun unidata-describe-decomposition (val)
(mapconcat
;; The entry function. It generates files described in the header
;; comment of this file.
-(defun unidata-gen-files (&optional data-dir unidata-text-file)
+;; Write files (charprop.el, uni-*.el) to dest-dir (default PWD),
+;; using as input files from data-dir, and
+;; unidata-text-file (default "unidata.txt" in PWD).
+(defun unidata-gen-files (&optional data-dir dest-dir unidata-text-file)
(or data-dir
- (setq data-dir (car command-line-args-left)
- command-line-args-left (cdr command-line-args-left)
- unidata-text-file (car command-line-args-left)
- command-line-args-left (cdr command-line-args-left)))
+ (setq data-dir (pop command-line-args-left)
+ dest-dir (or (pop command-line-args-left) default-directory)
+ unidata-text-file (or (pop command-line-args-left)
+ (expand-file-name "unidata.txt"))))
(let ((coding-system-for-write 'utf-8-unix)
- (charprop-file "charprop.el")
+ (charprop-file (expand-file-name "charprop.el" dest-dir))
(unidata-dir data-dir))
(dolist (elt unidata-prop-alist)
(let* ((prop (car elt))
- (file (unidata-prop-file prop)))
+ (file (expand-file-name (unidata-prop-file prop) dest-dir)))
(if (file-exists-p file)
(delete-file file))))
(unidata-setup-list unidata-text-file)
(dolist (elt unidata-prop-alist)
(let* ((prop (car elt))
(generator (unidata-prop-generator prop))
- (file (unidata-prop-file prop))
+ (file (expand-file-name (unidata-prop-file prop) dest-dir))
+ (basename (file-name-nondirectory file))
(docstring (unidata-prop-docstring prop))
(describer (unidata-prop-describer prop))
(default-value (unidata-prop-default prop))
table)
;; Filename in this comment line is extracted by sed in
;; Makefile.
- (insert (format ";; FILE: %s\n" file))
+ (insert (format ";; FILE: %s\n" basename))
(insert (format "(define-char-code-property '%S %S\n %S)\n"
- prop file docstring))
+ prop basename docstring))
(with-temp-buffer
+ (message "Using load-path %s" load-path)
(message "Generating %s..." file)
(when (file-exists-p file)
(insert-file-contents file)
(setq table (funcall generator prop default-value val-list))
(when describer
(unless (subrp (symbol-function describer))
- (byte-compile describer)
(setq describer (symbol-function describer)))
(set-char-table-extra-slot table 3 describer))
(if (bobp)
- (insert ";; Copyright (C) 1991-2009 Unicode, Inc.
+ (insert ";; Copyright (C) 1991-2013 Unicode, Inc.
;; This file was generated from the Unicode data files at
;; http://www.unicode.org/Public/UNIDATA/.
;; See lisp/international/README for the copyright and permission notice.\n"))
- (insert (format "(define-char-code-property '%S %S %S)\n"
+ (insert (format "(define-char-code-property '%S\n %S\n %S)\n"
prop table docstring))
(if (eobp)
(insert ";; Local Variables:\n"
";; coding: utf-8\n"
+ ";; version-control: never\n"
";; no-byte-compile: t\n"
+ ";; no-update-autoloads: t\n"
";; End:\n\n"
- (format ";; %s ends here\n" file)))
+ (format ";; %s ends here\n" basename)))
(write-file file)
(message "Generating %s...done" file))))
(message "Writing %s..." charprop-file)
(insert ";; Local Variables:\n"
";; coding: utf-8\n"
+ ";; version-control: never\n"
";; no-byte-compile: t\n"
+ ";; no-update-autoloads: t\n"
";; End:\n\n"
- (format ";; %s ends here\n" charprop-file)))))
+ (format ";; %s ends here\n"
+ (file-name-nondirectory charprop-file))))))
\f