X-Git-Url: http://git.hcoop.net/bpt/emacs.git/blobdiff_plain/1692ae2dd5ff8f6f1fc6f6f62b9a44ab7e615615..dc5d130eed60dbb3e55fb3cd31b3d4bd2333ab85:/admin/unidata/unidata-gen.el diff --git a/admin/unidata/unidata-gen.el b/admin/unidata/unidata-gen.el index ab1dcd134a..8b24b11622 100644 --- a/admin/unidata/unidata-gen.el +++ b/admin/unidata/unidata-gen.el @@ -1,4 +1,7 @@ ;; unidata-gen.el -- Create files containing character property data. + +;; Copyright (C) 2008-2014 Free Software Foundation, Inc. + ;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 ;; National Institute of Advanced Industrial Science and Technology (AIST) ;; Registration Number H13PRO009 @@ -23,13 +26,12 @@ ;; SPECIAL NOTICE ;; ;; This file must be byte-compilable/loadable by `temacs' and also -;; the entry function `unidata-gen-files' must be runnable by -;; `temacs'. +;; the entry function `unidata-gen-files' must be runnable by `temacs'. ;; FILES TO BE GENERATED ;; ;; The entry function `unidata-gen-files' generates these files in -;; the current directory. +;; in directory specified by its dest-dir argument. ;; ;; charprop.el ;; It contains a series of forms of this format: @@ -88,9 +90,9 @@ (defvar unidata-list nil) -;; Name of the directory containing files of Unicode Character -;; Database. +;; Name of the directory containing files of Unicode Character Database. +;; Dynamically bound in unidata-gen-files. (defvar unidata-dir nil) (defun unidata-setup-list (unidata-text-file) @@ -146,7 +148,7 @@ (setq unidata-list (cdr table)))) ;; Alist of this form: -;; (PROP INDEX GENERATOR FILENAME DOCSTRING DESCRIBER VAL-LIST) +;; (PROP INDEX GENERATOR FILENAME DOCSTRING DESCRIBER DEFAULT VAL-LIST) ;; PROP: character property ;; INDEX: index to each element of unidata-list for PROP. ;; It may be a function that generates an alist of character codes @@ -155,14 +157,21 @@ ;; FILENAME: filename to store the char-table ;; DOCSTRING: docstring for the property ;; DESCRIBER: function to call to get a description string of property value -;; DEFAULT: the default value of the property +;; DEFAULT: the default value of the property. It may have the form +;; (VAL0 (FROM1 TO1 VAL1) ...) which indicates that the default +;; value is VAL0 except for characters in the ranges specified by +;; FROMn and TOn (inclusive). The default value of characters +;; between FROMn and TOn is VALn. ;; VAL-LIST: list of specially ordered property values (defconst unidata-prop-alist '((name 1 unidata-gen-table-name "uni-name.el" "Unicode character name. -Property value is a string.") +Property value is a string or nil. +The value nil stands for the default value \"null string\")." + nil + nil) (general-category 2 unidata-gen-table-symbol "uni-category.el" "Unicode general category. @@ -170,7 +179,7 @@ Property value is one of the following symbols: Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po, Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn" unidata-describe-general-category - nil + Cn ;; The order of elements must be in sync with unicode_category_t ;; in src/character.h. (Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po @@ -179,18 +188,25 @@ Property value is one of the following symbols: 3 unidata-gen-table-integer "uni-combining.el" "Unicode canonical combining class. Property value is an integer." - unidata-describe-canonical-combining-class) + unidata-describe-canonical-combining-class + 0) (bidi-class 4 unidata-gen-table-symbol "uni-bidi.el" "Unicode bidi class. Property value is one of the following symbols: - L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET, - AN, CS, NSM, BN, B, S, WS, ON" + L, LRE, LRO, LRI, R, AL, RLE, RLO, RLI, FSI, PDF, PDI, + EN, ES, ET, AN, CS, NSM, BN, B, S, WS, ON" unidata-describe-bidi-class - L + ;; The assignment of default values to blocks of code points + ;; follows the file DerivedBidiClass.txt from the Unicode + ;; Character Database (UCD). + (L (#x0600 #x06FF AL) (#xFB50 #xFDFF AL) (#xFE70 #xFEFF AL) + (#x0590 #x05FF R) (#x07C0 #x08FF R) + (#xFB1D #xFB4F R) (#x10800 #x10FFF R) (#x1E800 #x1EFFF R)) ;; The order of elements must be in sync with bidi_type_t in ;; src/dispextern.h. - (L R EN AN BN B AL LRE LRO RLE RLO PDF ES ET CS NSM S WS ON)) + (L R EN AN BN B AL LRE LRO RLE RLO PDF LRI RLI FSI PDI + ES ET CS NSM S WS ON)) (decomposition 5 unidata-gen-table-decomposition "uni-decomposition.el" "Unicode decomposition mapping. @@ -202,23 +218,29 @@ one of these symbols representing compatibility formatting tag: (decimal-digit-value 6 unidata-gen-table-integer "uni-decimal.el" "Unicode numeric value (decimal digit). -Property value is an integer.") +Property value is an integer 0..9, or nil. +The value nil stands for NaN \"Numeric_Value\".") (digit-value 7 unidata-gen-table-integer "uni-digit.el" "Unicode numeric value (digit). -Property value is an integer.") +Property value is an integer 0..9, or nil. +The value nil stands for NaN \"Numeric_Value\".") (numeric-value 8 unidata-gen-table-numeric "uni-numeric.el" "Unicode numeric value (numeric). -Property value is an integer or a floating point.") +Property value is an integer, a floating point, or nil. +The value nil stands for NaN \"Numeric_Value\".") (mirrored 9 unidata-gen-table-symbol "uni-mirrored.el" "Unicode bidi mirrored flag. -Property value is a symbol `Y' or `N'. See also the property `mirroring'.") +Property value is a symbol `Y' or `N'. See also the property `mirroring'." + nil + N) (old-name 10 unidata-gen-table-name "uni-old-name.el" "Unicode old names as published in Unicode 1.0. -Property value is a string.") +Property value is a string or nil. +The value nil stands for the default value \"null string\").") (iso-10646-comment 11 unidata-gen-table-name "uni-comment.el" "Unicode ISO 10646 comment. @@ -226,23 +248,30 @@ Property value is a string.") (uppercase 12 unidata-gen-table-character "uni-uppercase.el" "Unicode simple uppercase mapping. -Property value is a character." +Property value is a character or nil. +The value nil means that the actual property value of a character +is the character itself." string) (lowercase 13 unidata-gen-table-character "uni-lowercase.el" "Unicode simple lowercase mapping. -Property value is a character." +Property value is a character or nil. +The value nil means that the actual property value of a character +is the character itself." string) (titlecase 14 unidata-gen-table-character "uni-titlecase.el" "Unicode simple titlecase mapping. -Property value is a character." +Property value is a character or nil. +The value nil means that the actual property value of a character +is the character itself." string) (mirroring unidata-gen-mirroring-list unidata-gen-table-character "uni-mirrored.el" "Unicode bidi-mirroring characters. -Property value is a character that has the corresponding mirroring image, -or nil for non-mirrored character."))) +Property value is a character that has the corresponding mirroring image or nil. +The value nil means that the actual property value of a character +is the character itself."))) ;; Functions to access the above data. (defsubst unidata-prop-index (prop) (nth 1 (assq prop unidata-prop-alist))) @@ -369,12 +398,17 @@ or nil for non-mirrored character."))) ;; If VAL is one of VALn, just return n. ;; Otherwise, VAL-LIST is modified to this: ;; ((nil . 0) (VAL1 . 1) (VAL2 . 2) ... (VAL . n+1)) +;; +;; WARN is an optional warning to display when the value list is +;; extended, for property values that need to be in sync with other +;; parts of Emacs; currently only used for bidi-class. -(defun unidata-encode-val (val-list val) +(defun unidata-encode-val (val-list val &optional warn) (let ((slot (assoc val val-list)) val-code) (if slot (cdr slot) + (if warn (message warn val)) (setq val-code (length val-list)) (nconc val-list (list (cons val val-code))) val-code))) @@ -385,6 +419,16 @@ or nil for non-mirrored character."))) (let ((table (make-char-table 'char-code-property-table)) (prop-idx (unidata-prop-index prop)) (vec (make-vector 128 0)) + ;; When this warning is printed, there's a need to make the + ;; following changes: + ;; (1) update unidata-prop-alist with the new bidi-class values; + ;; (2) extend bidi_type_t enumeration on src/dispextern.h to + ;; include the new classes; + ;; (3) possibly update the assertion in bidi.c:bidi_check_type; and + ;; (4) possibly update the switch cases in + ;; bidi.c:bidi_get_type and bidi.c:bidi_get_category. + (bidi-warning "\ +** Found new bidi-class '%s', please update bidi.c and dispextern.h") tail elt range val val-code idx slot prev-range-data) (setq val-list (cons nil (copy-sequence val-list))) @@ -393,15 +437,26 @@ or nil for non-mirrored character."))) (while tail (setcar tail (cons (car tail) val-code)) (setq tail (cdr tail) val-code (1+ val-code))) - (setq default-value (unidata-encode-val val-list default-value)) - (set-char-table-range table t default-value) - (set-char-table-range table nil default-value) + (if (consp default-value) + (setq default-value (copy-sequence default-value)) + (setq default-value (list default-value))) + (setcar default-value + (unidata-encode-val val-list (car default-value))) + (set-char-table-range table t (car default-value)) + (set-char-table-range table nil (car default-value)) + (dolist (elm (cdr default-value)) + (setcar (nthcdr 2 elm) + (unidata-encode-val val-list (nth 2 elm))) + (set-char-table-range table (cons (car elm) (nth 1 elm)) (nth 2 elm))) + (setq tail unidata-list) (while tail (setq elt (car tail) tail (cdr tail)) (setq range (car elt) val (funcall val-func (nth prop-idx elt))) - (setq val-code (if val (unidata-encode-val val-list val))) + (setq val-code (if val (unidata-encode-val val-list val + (and (eq prop 'bidi-class) + bidi-warning)))) (if (consp range) (when val-code (set-char-table-range table range val-code) @@ -419,17 +474,27 @@ or nil for non-mirrored character."))) (setq prev-range-data (cons (cons from to) val-code))))) (let* ((start (lsh (lsh range -7) 7)) (limit (+ start 127)) - str count new-val) - (fillarray vec 0) - ;; See the comment above. - (when (and prev-range-data - (>= (cdr (car prev-range-data)) start)) - (let ((from (car (car prev-range-data))) - (to (cdr (car prev-range-data))) - (vcode (cdr prev-range-data))) + str count new-val from to vcode) + (fillarray vec (car default-value)) + (dolist (elm (cdr default-value)) + (setq from (car elm) to (nth 1 elm)) + (when (and (<= from limit) + (or (>= from start) (>= to start))) + (setq from (max from start) + to (min to limit) + vcode (nth 2 elm)) (while (<= from to) (aset vec (- from start) vcode) (setq from (1+ from))))) + ;; See the comment above. + (when (and prev-range-data + (>= (cdr (car prev-range-data)) start)) + (setq from (car (car prev-range-data)) + to (cdr (car prev-range-data)) + vcode (cdr prev-range-data)) + (while (<= from to) + (aset vec (- from start) vcode) + (setq from (1+ from)))) (setq prev-range-data nil) (if val-code (aset vec (- range start) val-code)) @@ -439,7 +504,9 @@ or nil for non-mirrored character."))) (setq new-val (funcall val-func (nth prop-idx elt))) (if (not (eq val new-val)) (setq val new-val - val-code (if val (unidata-encode-val val-list val)))) + val-code (if val (unidata-encode-val + val-list val (and (eq prop 'bidi-class) + bidi-warning))))) (if val-code (aset vec (- range start) val-code)) (setq tail (cdr tail))) @@ -613,7 +680,7 @@ or nil for non-mirrored character."))) ;; Return a name of CHAR. VAL is the current value of (aref TABLE ;; CHAR). -(defun unidata-get-name (char val table) +(fset 'unidata-get-name '(lambda (char val table) (cond ((stringp val) (if (> (aref val 0) 0) @@ -695,18 +762,21 @@ or nil for non-mirrored character."))) ((eq sym 'CJK\ COMPATIBILITY\ IDEOGRAPH) (format "%s-%04X" sym char)) ((eq sym 'VARIATION\ SELECTOR) - (format "%s-%d" sym (+ (- char #xe0100) 17)))))))) + (format "%s-%d" sym (+ (- char #xe0100) 17))))))))) ;; Store VAL as the name of CHAR in TABLE. -(defun unidata-put-name (char val table) +(fset 'unidata-put-name '(lambda (char val table) (let ((current-val (aref table char))) (if (and (stringp current-val) (= (aref current-val 0) 0)) (funcall (char-table-extra-slot table 1) char current-val table)) - (aset table char val))) + (aset table char val)))) (defun unidata-get-decomposition (char val table) (cond + ((not val) + (list char)) + ((consp val) val) @@ -747,7 +817,8 @@ or nil for non-mirrored character."))) (aset vec idx (nconc word-list tail-list))) (dotimes (i 128) (aset table (+ first-char i) (aref vec i))) - (aref vec (- char first-char))))) + (setq val (aref vec (- char first-char))) + (or val (list char))))) ;; Hangul syllable ((and (eq val 0) (>= char #xAC00) (<= char #xD7A3)) @@ -913,7 +984,14 @@ or nil for non-mirrored character."))) (l nil) (idx 0) c) - (if (= len 0) + (if (or (= len 0) + ;; Unicode Standard, paragraph 4.8: "For all other + ;; Unicode code points of all other types (Control, + ;; Private-Use, Surrogate, Noncharacter, and Reserved), + ;; the value of the Name property is the null string." + ;; We already handle elsewhere all the characters except + ;; Cc, Control characters, which are handled here. + (string= str "")) nil (dotimes (i len) (setq c (aref str i)) @@ -929,8 +1007,6 @@ or nil for non-mirrored character."))) (defun unidata-gen-table-name (prop &rest ignore) (let* ((table (unidata-gen-table-word-list prop 'unidata-split-name)) (word-tables (char-table-extra-slot table 4))) - (byte-compile 'unidata-get-name) - (byte-compile 'unidata-put-name) (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-name)) (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-name)) @@ -968,8 +1044,6 @@ or nil for non-mirrored character."))) (defun unidata-gen-table-decomposition (prop &rest ignore) (let* ((table (unidata-gen-table-word-list prop 'unidata-split-decomposition)) (word-tables (char-table-extra-slot table 4))) - (byte-compile 'unidata-get-decomposition) - (byte-compile 'unidata-put-decomposition) (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-decomposition)) (set-char-table-extra-slot table 2 @@ -979,7 +1053,8 @@ or nil for non-mirrored character."))) -(defun unidata-describe-general-category (val) +(fset + 'unidata-describe-general-category '(lambda (val) (cdr (assq val '((nil . "Uknown") (Lu . "Letter, Uppercase") @@ -1011,9 +1086,9 @@ or nil for non-mirrored character."))) (Cf . "Other, Format") (Cs . "Other, Surrogate") (Co . "Other, Private Use") - (Cn . "Other, Not Assigned"))))) + (Cn . "Other, Not Assigned")))))) -(defun unidata-describe-canonical-combining-class (val) +(fset 'unidata-describe-canonical-combining-class '(lambda (val) (cdr (assq val '((0 . "Spacing, split, enclosing, reordrant, and Tibetan subjoined") (1 . "Overlays and interior") @@ -1040,9 +1115,9 @@ or nil for non-mirrored character."))) (232 . "Above right") (233 . "Double below") (234 . "Double above") - (240 . "Below (iota subscript)"))))) + (240 . "Below (iota subscript)")))))) -(defun unidata-describe-bidi-class (val) +(fset 'unidata-describe-bidi-class '(lambda (val) (cdr (assq val '((L . "Left-to-Right") (LRE . "Left-to-Right Embedding") @@ -1052,6 +1127,10 @@ or nil for non-mirrored character."))) (RLE . "Right-to-Left Embedding") (RLO . "Right-to-Left Override") (PDF . "Pop Directional Format") + (LRI . "Left-to-Right Isolate") + (RLI . "Right-to-Left Isolate") + (FSI . "First Strong Isolate") + (PDI . "Pop Directional Isolate") (EN . "European Number") (ES . "European Number Separator") (ET . "European Number Terminator") @@ -1062,7 +1141,7 @@ or nil for non-mirrored character."))) (B . "Paragraph Separator") (S . "Segment Separator") (WS . "Whitespace") - (ON . "Other Neutrals"))))) + (ON . "Other Neutrals")))))) (defun unidata-describe-decomposition (val) (mapconcat @@ -1127,18 +1206,21 @@ or nil for non-mirrored character."))) ;; The entry function. It generates files described in the header ;; comment of this file. -(defun unidata-gen-files (&optional data-dir unidata-text-file) +;; Write files (charprop.el, uni-*.el) to dest-dir (default PWD), +;; using as input files from data-dir, and +;; unidata-text-file (default "unidata.txt" in PWD). +(defun unidata-gen-files (&optional data-dir dest-dir unidata-text-file) (or data-dir - (setq data-dir (car command-line-args-left) - command-line-args-left (cdr command-line-args-left) - unidata-text-file (car command-line-args-left) - command-line-args-left (cdr command-line-args-left))) + (setq data-dir (pop command-line-args-left) + dest-dir (or (pop command-line-args-left) default-directory) + unidata-text-file (or (pop command-line-args-left) + (expand-file-name "unidata.txt")))) (let ((coding-system-for-write 'utf-8-unix) - (charprop-file "charprop.el") + (charprop-file (expand-file-name "charprop.el" dest-dir)) (unidata-dir data-dir)) (dolist (elt unidata-prop-alist) (let* ((prop (car elt)) - (file (unidata-prop-file prop))) + (file (expand-file-name (unidata-prop-file prop) dest-dir))) (if (file-exists-p file) (delete-file file)))) (unidata-setup-list unidata-text-file) @@ -1147,7 +1229,8 @@ or nil for non-mirrored character."))) (dolist (elt unidata-prop-alist) (let* ((prop (car elt)) (generator (unidata-prop-generator prop)) - (file (unidata-prop-file prop)) + (file (expand-file-name (unidata-prop-file prop) dest-dir)) + (basename (file-name-nondirectory file)) (docstring (unidata-prop-docstring prop)) (describer (unidata-prop-describer prop)) (default-value (unidata-prop-default prop)) @@ -1155,10 +1238,11 @@ or nil for non-mirrored character."))) table) ;; Filename in this comment line is extracted by sed in ;; Makefile. - (insert (format ";; FILE: %s\n" file)) + (insert (format ";; FILE: %s\n" basename)) (insert (format "(define-char-code-property '%S %S\n %S)\n" - prop file docstring)) + prop basename docstring)) (with-temp-buffer + (message "Using load-path %s" load-path) (message "Generating %s..." file) (when (file-exists-p file) (insert-file-contents file) @@ -1167,30 +1251,34 @@ or nil for non-mirrored character."))) (setq table (funcall generator prop default-value val-list)) (when describer (unless (subrp (symbol-function describer)) - (byte-compile describer) (setq describer (symbol-function describer))) (set-char-table-extra-slot table 3 describer)) (if (bobp) - (insert ";; Copyright (C) 1991-2009 Unicode, Inc. + (insert ";; Copyright (C) 1991-2013 Unicode, Inc. ;; This file was generated from the Unicode data files at ;; http://www.unicode.org/Public/UNIDATA/. ;; See lisp/international/README for the copyright and permission notice.\n")) - (insert (format "(define-char-code-property '%S %S %S)\n" + (insert (format "(define-char-code-property '%S\n %S\n %S)\n" prop table docstring)) (if (eobp) (insert ";; Local Variables:\n" ";; coding: utf-8\n" + ";; version-control: never\n" ";; no-byte-compile: t\n" + ";; no-update-autoloads: t\n" ";; End:\n\n" - (format ";; %s ends here\n" file))) + (format ";; %s ends here\n" basename))) (write-file file) (message "Generating %s...done" file)))) (message "Writing %s..." charprop-file) (insert ";; Local Variables:\n" ";; coding: utf-8\n" + ";; version-control: never\n" ";; no-byte-compile: t\n" + ";; no-update-autoloads: t\n" ";; End:\n\n" - (format ";; %s ends here\n" charprop-file))))) + (format ";; %s ends here\n" + (file-name-nondirectory charprop-file))))))