X-Git-Url: http://git.hcoop.net/bpt/emacs.git/blobdiff_plain/f67e15be8d94718b2e2ea7da68eb0b2dc94ce016..41b7f8bc6c4511ab6fcf7f6b1f9e7fdcd8592821:/admin/unidata/unidata-gen.el diff --git a/admin/unidata/unidata-gen.el b/admin/unidata/unidata-gen.el index 8f8a9cb02a..d9277217f0 100644 --- a/admin/unidata/unidata-gen.el +++ b/admin/unidata/unidata-gen.el @@ -1,14 +1,14 @@ ;; unidata-gen.el -- Create files containing character property data. -;; Copyright (C) 2005, 2006, 2007, 2008 +;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 ;; National Institute of Advanced Industrial Science and Technology (AIST) ;; Registration Number H13PRO009 ;; This file is part of GNU Emacs. -;; GNU Emacs is free software; you can redistribute it and/or modify +;; GNU Emacs is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by -;; the Free Software Foundation; either version 3, or (at your option) -;; any later version. +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. ;; GNU Emacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -16,9 +16,7 @@ ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License -;; along with GNU Emacs; see the file COPYING. If not, write to the -;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, -;; Boston, MA 02110-1301, USA. +;; along with GNU Emacs. If not, see . ;;; Commentary: @@ -35,24 +33,25 @@ ;; ;; charprop.el ;; It contains a series of forms of this format: -;; (char-code-property-register PROP FILE) +;; (define-char-code-property PROP FILE) ;; where PROP is a symbol representing a character property -;; (name, generic-category, etc), and FILE is a name of one of +;; (name, general-category, etc), and FILE is a name of one of ;; the following files. 
;; ;; uni-name.el, uni-category.el, uni-combining.el, uni-bidi.el, ;; uni-decomposition.el, uni-decimal.el, uni-digit.el, uni-numeric.el, ;; uni-mirrored.el, uni-old-name.el, uni-comment.el, uni-uppercase.el, ;; uni-lowercase.el, uni-titlecase.el -;; They each contain a single form of this format: -;; (char-code-property-register PROP CHAR-TABLE) +;; They contain one or more forms of this format: +;; (define-char-code-property PROP CHAR-TABLE) ;; where PROP is the same as above, and CHAR-TABLE is a ;; char-table containing property values in a compressed format. ;; ;; When they are installed in .../lisp/international/, the file ;; "charprop.el" is preloaded in loadup.el. The other files are -;; automatically loaded when the functions `get-char-code-property' -;; and `put-char-code-property' are called. +;; automatically loaded when the Lisp functions +;; `get-char-code-property' and `put-char-code-property', and C +;; function uniprop_table are called. ;; ;; FORMAT OF A CHAR TABLE ;; @@ -64,17 +63,22 @@ ;; data in a char-table as below. ;; ;; If succeeding 128*N characters have the same property value, we -;; store that value for them. Otherwise, compress values for -;; succeeding 128 characters into a single string and store it as a -;; value for those characters. The way of compression depends on a -;; property. See the section "SIMPLE TABLE", "RUN-LENGTH TABLE", -;; and "WORD-LIST TABLE". - -;; The char table has four extra slots: +;; store that value (or the encoded one) for them. Otherwise, +;; compress values (or the encoded ones) for succeeding 128 +;; characters into a single string and store it for those +;; characters. The way of compression depends on a property. See +;; the section "SIMPLE TABLE", "RUN-LENGTH TABLE", and "WORD-LIST +;; TABLE". 
+ +;; The char table has five extra slots: ;; 1st: property symbol -;; 2nd: function to call to get a property value -;; 3nd: function to call to put a property value -;; 4th: function to call to get a description of a property value +;; 2nd: function to call to get a property value, +;; or an index number of C function to decode the value, +;; or nil if the value can be directly got from the table. +;; 3rd: function to call to put a property value, +;; or an index number of C function to encode the value, +;; or nil if the value can be directly stored in the table. +;; 4th: function to call to get a description of a property value, or nil ;; 5th: data referred by the above functions ;; List of elements of this form: @@ -84,6 +88,11 @@ (defvar unidata-list nil) +;; Name of the directory containing files of Unicode Character +;; Database. + +(defvar unidata-dir nil) + (defun unidata-setup-list (unidata-text-file) (let* ((table (list nil)) (tail table) @@ -92,9 +101,14 @@ ("^<.*Surrogate" . nil) ("^<.*Private Use" . PRIVATE\ USE))) val char name) + (setq unidata-text-file (expand-file-name unidata-text-file unidata-dir)) (or (file-readable-p unidata-text-file) (error "File not readable: %s" unidata-text-file)) (with-temp-buffer + ;; Insert a file of this format: + ;; (CHAR NAME CATEGORY ...) + ;; where CHAR is a character code, the following elements are strings + ;; representing character properties. (insert-file-contents unidata-text-file) (goto-char (point-min)) (condition-case nil @@ -105,7 +119,7 @@ ;; Check this kind of block. 
;; 4E00;;Lo;0;L;;;;;N;;;;; - ;; 9FA5;;Lo;0;L;;;;;N;;;;; + ;; 9FCB;;Lo;0;L;;;;;N;;;;; (if (and (= (aref name 0) ?<) (string-match ", First>$" name)) (let ((first char) @@ -132,66 +146,98 @@ (setq unidata-list (cdr table)))) ;; Alist of this form: -;; (PROP INDEX GENERATOR FILENAME) +;; (PROP INDEX GENERATOR FILENAME DOCSTRING DESCRIBER DEFAULT VAL-LIST) ;; PROP: character property -;; INDEX: index to each element of unidata-list for PROP +;; INDEX: index to each element of unidata-list for PROP. +;; It may be a function that generates an alist of character codes +;; vs. the corresponding property values. ;; GENERATOR: function to generate a char-table ;; FILENAME: filename to store the char-table +;; DOCSTRING: docstring for the property ;; DESCRIBER: function to call to get a description string of property value +;; DEFAULT: the default value of the property. It may have the form +;; (VAL0 (FROM1 TO1 VAL1) ...) which indicates that the default +;; value is VAL0 except for characters in the ranges specified by +;; FROMn and TOn (inclusive). The default value of characters +;; between FROMn and TOn is VALn. +;; VAL-LIST: list of specially ordered property values (defconst unidata-prop-alist '((name 1 unidata-gen-table-name "uni-name.el" "Unicode character name. -Property value is a string.") +Property value is a string or nil. +The value nil stands for the default value \"null string\")." + nil + nil) (general-category 2 unidata-gen-table-symbol "uni-category.el" "Unicode general category. Property value is one of the following symbols: Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po, Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn" - unidata-describe-general-category) + unidata-describe-general-category + Cn + ;; The order of elements must be in sync with unicode_category_t + ;; in src/character.h. 
+ (Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po + Sm Sc Sk So Zs Zl Zp Cc Cf Cs Co Cn)) (canonical-combining-class 3 unidata-gen-table-integer "uni-combining.el" "Unicode canonical combining class. Property value is an integer." - unidata-describe-canonical-combining-class) + unidata-describe-canonical-combining-class + 0) (bidi-class 4 unidata-gen-table-symbol "uni-bidi.el" "Unicode bidi class. Property value is one of the following symbols: L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET, AN, CS, NSM, BN, B, S, WS, ON" - unidata-describe-bidi-class) + unidata-describe-bidi-class + ;; The assignment of default values to blocks of code points + ;; follows the file DerivedBidiClass.txt from the Unicode + ;; Character Database (UCD). + (L (#x0600 #x06FF AL) (#xFB50 #xFDFF AL) (#xFE70 #xFEFF AL) + (#x0590 #x05FF R) (#x07C0 #x08FF R) + (#xFB1D #xFB4F R) (#x10800 #x10FFF R) (#x1E800 #x1EFFF R)) + ;; The order of elements must be in sync with bidi_type_t in + ;; src/dispextern.h. + (L R EN AN BN B AL LRE LRO RLE RLO PDF ES ET CS NSM S WS ON)) (decomposition 5 unidata-gen-table-decomposition "uni-decomposition.el" "Unicode decomposition mapping. Property value is a list of characters. The first element may be one of these symbols representing compatibility formatting tag: - , , , , , , , - , , , , , , , , - " + font, noBreak, initial, medial, final, isolated, circle, super, + sub, vertical, wide, narrow, small, square, fraction, compat" unidata-describe-decomposition) (decimal-digit-value 6 unidata-gen-table-integer "uni-decimal.el" "Unicode numeric value (decimal digit). -Property value is an integer.") +Property value is an integer 0..9, or nil. +The value nil stands for NaN \"Numeric_Value\".") (digit-value 7 unidata-gen-table-integer "uni-digit.el" "Unicode numeric value (digit). -Property value is an integer.") +Property value is an integer 0..9, or nil. 
+The value nil stands for NaN \"Numeric_Value\".") (numeric-value - 8 unidata-gen-table-symbol "uni-numeric.el" + 8 unidata-gen-table-numeric "uni-numeric.el" "Unicode numeric value (numeric). -Property value is a symbol.") +Property value is an integer, a floating point, or nil. +The value nil stands for NaN \"Numeric_Value\".") (mirrored 9 unidata-gen-table-symbol "uni-mirrored.el" "Unicode bidi mirrored flag. -Property value is a symbol `Y' or `N'.") +Property value is a symbol `Y' or `N'. See also the property `mirroring'." + nil + N) (old-name 10 unidata-gen-table-name "uni-old-name.el" "Unicode old names as published in Unicode 1.0. -Property value is a string.") +Property value is a string or nil. +The value nil stands for the default value \"null string\").") (iso-10646-comment 11 unidata-gen-table-name "uni-comment.el" "Unicode ISO 10646 comment. @@ -199,18 +245,30 @@ Property value is a string.") (uppercase 12 unidata-gen-table-character "uni-uppercase.el" "Unicode simple uppercase mapping. -Property value is a character." +Property value is a character or nil. +The value nil means that the actual property value of a character +is the character itself." string) (lowercase 13 unidata-gen-table-character "uni-lowercase.el" "Unicode simple lowercase mapping. -Property value is a character." +Property value is a character or nil. +The value nil means that the actual property value of a character +is the character itself." string) (titlecase 14 unidata-gen-table-character "uni-titlecase.el" "Unicode simple titlecase mapping. -Property value is a character." - string))) +Property value is a character or nil. +The value nil means that the actual property value of a character +is the character itself." + string) + (mirroring + unidata-gen-mirroring-list unidata-gen-table-character "uni-mirrored.el" + "Unicode bidi-mirroring characters. +Property value is a character that has the corresponding mirroring image or nil. 
+The value nil means that the actual property value of a character +is the character itself."))) ;; Functions to access the above data. (defsubst unidata-prop-index (prop) (nth 1 (assq prop unidata-prop-alist))) @@ -218,6 +276,8 @@ Property value is a character." (defsubst unidata-prop-file (prop) (nth 3 (assq prop unidata-prop-alist))) (defsubst unidata-prop-docstring (prop) (nth 4 (assq prop unidata-prop-alist))) (defsubst unidata-prop-describer (prop) (nth 5 (assq prop unidata-prop-alist))) +(defsubst unidata-prop-default (prop) (nth 6 (assq prop unidata-prop-alist))) +(defsubst unidata-prop-val-list (prop) (nth 7 (assq prop unidata-prop-alist))) ;; SIMPLE TABLE @@ -226,52 +286,34 @@ Property value is a character." ;; values of succeeding character codes are usually different, we use ;; a char-table described here to store such values. ;; -;; If succeeding 128 characters has no property, a char-table has the -;; symbol t is for them. Otherwise a char-table has a string of the -;; following format for them. +;; A char-table divides character code space (#x0..#x3FFFFF) into +;; #x8000 blocks (each block contains 128 characters). + +;; If all characters of a block have no property, a char-table has the +;; symbol nil for that block. Otherwise a char-table has a string of +;; the following format for it. ;; -;; The first character of the string is FIRST-INDEX. -;; The Nth (N > 0) character of the string is a property value of the -;; character (BLOCK-HEAD + FIRST-INDEX + N - 1), where BLOCK-HEAD is -;; the first of the characters in the block. +;; The first character of the string is ?\001. +;; The second character of the string is FIRST-INDEX. +;; The Nth (N > 1) character of the string is a property value of the +;; character (BLOCK-HEAD + FIRST-INDEX + N - 2), where BLOCK-HEAD is +;; the first character of the block. ;; -;; The 4th extra slot of a char-table is nil. 
- -(defun unidata-get-character (char val table) - (cond - ((characterp val) - val) +;; This kind of char-table has these extra slots: +;; 1st: the property symbol +;; 2nd: nil +;; 3rd: 0 (corresponding to uniprop_encode_character in chartab.c) +;; 4th to 5th: nil - ((stringp val) - (let* ((len (length val)) - (block-head (lsh (lsh char -7) 7)) - (vec (make-vector 128 nil)) - (first-index (aref val 0))) - (dotimes (i (1- len)) - (let ((elt (aref val (1+ i)))) - (if (> elt 0) - (aset vec (+ first-index i) elt)))) - (dotimes (i 128) - (aset table (+ block-head i) (aref vec i))) - (aref vec (- char block-head)))))) - -(defun unidata-put-character (char val table) - (or (characterp val) - (not val) - (error "Not a character nor nil: %S" val)) - (let ((current-val (aref table char))) - (unless (eq current-val val) - (if (stringp current-val) - (funcall (char-table-extra-slot table 1) char current-val table)) - (aset table char val)))) - -(defun unidata-gen-table-character (prop) +(defun unidata-gen-table-character (prop &rest ignore) (let ((table (make-char-table 'char-code-property-table)) (prop-idx (unidata-prop-index prop)) (vec (make-vector 128 0)) (tail unidata-list) elt range val idx slot) - (set-char-table-range table (cons 0 (max-char)) t) + (if (functionp prop-idx) + (setq tail (funcall prop-idx) + prop-idx 1)) (while tail (setq elt (car tail) tail (cdr tail)) (setq range (car elt) @@ -300,7 +342,7 @@ Property value is a character." (setq first-index last-index))) (setq tail (cdr tail))) (when first-index - (let ((str (string first-index)) + (let ((str (string 1 first-index)) c) (while (<= first-index last-index) (setq str (format "%s%c" str (or (aref vec first-index) 0)) @@ -308,142 +350,87 @@ Property value is a character." 
(set-char-table-range table (cons start limit) str)))))) (set-char-table-extra-slot table 0 prop) - (byte-compile 'unidata-get-character) - (byte-compile 'unidata-put-character) - (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-character)) - (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-character)) - + (set-char-table-extra-slot table 2 0) table)) ;; RUN-LENGTH TABLE ;; -;; If the type of character property value is symbol, integer, -;; boolean, or character, we use a char-table described here to store -;; the values. +;; If many characters of successive character codes have the same +;; property value, we use a char-table described here to store the +;; values. ;; -;; The 4th extra slot is a vector of property values (VAL-TABLE), and -;; values for succeeding 128 characters are encoded into this -;; character sequence: +;; At first, instead of a value itself, we store an index number to +;; the VAL-TABLE (5th extra slot) in the table. We call that index +;; number VAL-CODE hereafter. +;; +;; A char-table divides character code space (#x0..#x3FFFFF) into +;; #x8000 blocks (each block contains 128 characters). +;; +;; If all characters of a block have the same value, a char-table has +;; VAL-CODE for that block. Otherwise a char-table has a string of +;; the following format for that block. +;; +;; The first character of the string is ?\002. +;; The following characters have this form: ;; ( VAL-CODE RUN-LENGTH ? ) + ;; where: -;; VAL-CODE (0..127): -;; (VAL-CODE - 1) is an index into VAL-TABLE. -;; The value 0 means no-value. +;; VAL-CODE (0..127): index into VAL-TABLE. ;; RUN-LENGTH (130..255): ;; (RUN-LENGTH - 128) specifies how many characters have the same ;; value. If omitted, it means 1. - - -;; Return a symbol-type character property value of CHAR. VAL is the -;; current value of (aref TABLE CHAR). 
- -(defun unidata-get-symbol (char val table) - (let ((val-table (char-table-extra-slot table 4))) - (cond ((symbolp val) - val) - ((stringp val) - (let ((first-char (lsh (lsh char -7) 7)) - (str val) - (len (length val)) - (idx 0) - this-val count) - (set-char-table-range table (cons first-char (+ first-char 127)) - nil) - (while (< idx len) - (setq val (aref str idx) idx (1+ idx) - count (if (< idx len) (aref str idx) 1)) - (setq val (and (> val 0) (aref val-table (1- val))) - count (if (< count 128) - 1 - (prog1 (- count 128) (setq idx (1+ idx))))) - (dotimes (i count) - (if val - (aset table first-char val)) - (if (= first-char char) - (setq this-val val)) - (setq first-char (1+ first-char)))) - this-val)) - ((> val 0) - (aref val-table (1- val)))))) - -;; Return a integer-type character property value of CHAR. VAL is the -;; current value of (aref TABLE CHAR). - -(defun unidata-get-integer (char val table) - (let ((val-table (char-table-extra-slot table 4))) - (cond ((integerp val) - val) - ((stringp val) - (let ((first-char (lsh (lsh char -7) 7)) - (str val) - (len (length val)) - (idx 0) - this-val count) - (while (< idx len) - (setq val (aref str idx) idx (1+ idx) - count (if (< idx len) (aref str idx) 1)) - (setq val (and (> val 0) (aref val-table (1- val))) - count (if (< count 128) - 1 - (prog1 (- count 128) (setq idx (1+ idx))))) - (dotimes (i count) - (aset table first-char val) - (if (= first-char char) - (setq this-val val)) - (setq first-char (1+ first-char)))) - this-val))))) - -;; Store VAL (symbol) as a character property value of CHAR in TABLE. - -(defun unidata-put-symbol (char val table) - (or (symbolp val) - (error "Not a symbol: %S" val)) - (let ((current-val (aref table char))) - (unless (eq current-val val) - (if (stringp current-val) - (funcall (char-table-extra-slot table 1) char current-val table)) - (aset table char val)))) - -;; Store VAL (integer) as a character property value of CHAR in TABLE. 
- -(defun unidata-put-integer (char val table) - (or (integerp val) - (not val) - (error "Not an integer nor nil: %S" val)) - (let ((current-val (aref table char))) - (unless (eq current-val val) - (if (stringp current-val) - (funcall (char-table-extra-slot table 1) char current-val table)) - (aset table char val)))) +;; +;; This kind of char-table has these extra slots: +;; 1st: the property symbol +;; 2nd: 0 (corresponding to uniprop_decode_value in chartab.c) +;; 3rd: 1..3 (corresponding to uniprop_encode_xxx in chartab.c) +;; 4th: function or nil +;; 5th: VAL-TABLE ;; Encode the character property value VAL into an integer value by ;; VAL-LIST. By side effect, VAL-LIST is modified. ;; VAL-LIST has this form: -;; (t (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...) -;; If VAL is one of VALn, just return VAL-CODEn. Otherwise, -;; VAL-LIST is modified to this: -;; (t (VAL . (1+ VAL-CODE1)) (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...) +;; ((nil . 0) (VAL1 . 1) (VAL2 . 2) ...) +;; If VAL is one of VALn, just return n. +;; Otherwise, VAL-LIST is modified to this: +;; ((nil . 0) (VAL1 . 1) (VAL2 . 2) ... (VAL . n+1)) (defun unidata-encode-val (val-list val) - (let ((slot (assq val val-list)) + (let ((slot (assoc val val-list)) val-code) (if slot (cdr slot) - (setq val-code (if (cdr val-list) (1+ (cdr (nth 1 val-list))) 1)) - (setcdr val-list (cons (cons val val-code) (cdr val-list))) + (setq val-code (length val-list)) + (nconc val-list (list (cons val val-code))) val-code))) ;; Generate a char-table for the character property PROP. 
-(defun unidata-gen-table (prop val-func default-value) +(defun unidata-gen-table (prop val-func default-value val-list) (let ((table (make-char-table 'char-code-property-table)) (prop-idx (unidata-prop-index prop)) - (val-list (list t)) (vec (make-vector 128 0)) - tail elt range val val-code idx slot) - (set-char-table-range table (cons 0 (max-char)) default-value) + tail elt range val val-code idx slot + prev-range-data) + (setq val-list (cons nil (copy-sequence val-list))) + (setq tail val-list val-code 0) + ;; Convert (nil A B ...) to ((nil . 0) (A . 1) (B . 2) ...) + (while tail + (setcar tail (cons (car tail) val-code)) + (setq tail (cdr tail) val-code (1+ val-code))) + (if (consp default-value) + (setq default-value (copy-sequence default-value)) + (setq default-value (list default-value))) + (setcar default-value + (unidata-encode-val val-list (car default-value))) + (set-char-table-range table t (car default-value)) + (set-char-table-range table nil (car default-value)) + (dolist (elm (cdr default-value)) + (setcar (nthcdr 2 elm) + (unidata-encode-val val-list (nth 2 elm))) + (set-char-table-range table (cons (car elm) (nth 1 elm)) (nth 2 elm))) + (setq tail unidata-list) (while tail (setq elt (car tail) tail (cdr tail)) @@ -451,12 +438,44 @@ Property value is a character." val (funcall val-func (nth prop-idx elt))) (setq val-code (if val (unidata-encode-val val-list val))) (if (consp range) - (if val-code - (set-char-table-range table range val)) + (when val-code + (set-char-table-range table range val-code) + (let ((from (car range)) (to (cdr range))) + ;; If RANGE doesn't end at the char-table boundary (each + ;; 128 characters), we may have to carry over the data + ;; for the last several characters (at most 127 chars) + ;; to the next loop. In that case, set PREV-RANGE-DATA + ;; to ((FROM . TO) . VAL-CODE) where (FROM . TO) + ;; specifies the range of characters handled in the next + ;; loop. 
+ (when (< (logand to #x7F) #x7F) + (if (< from (logand to #x1FFF80)) + (setq from (logand to #x1FFF80))) + (setq prev-range-data (cons (cons from to) val-code))))) (let* ((start (lsh (lsh range -7) 7)) (limit (+ start 127)) - str count new-val) - (fillarray vec 0) + str count new-val from to vcode) + (fillarray vec (car default-value)) + (dolist (elm (cdr default-value)) + (setq from (car elm) to (nth 1 elm)) + (when (and (<= from limit) + (or (>= from start) (>= to start))) + (setq from (max from start) + to (min to limit) + vcode (nth 2 elm)) + (while (<= from to) + (aset vec (- from start) vcode) + (setq from (1+ from))))) + ;; See the comment above. + (when (and prev-range-data + (>= (cdr (car prev-range-data)) start)) + (setq from (car (car prev-range-data)) + to (cdr (car prev-range-data)) + vcode (cdr prev-range-data)) + (while (<= from to) + (aset vec (- from start) vcode) + (setq from (1+ from)))) + (setq prev-range-data nil) (if val-code (aset vec (- range start) val-code)) (while (and (setq elt (car tail) range (car elt)) @@ -469,7 +488,7 @@ Property value is a character." (if val-code (aset vec (- range start) val-code)) (setq tail (cdr tail))) - (setq str "" val-code -1 count 0) + (setq str "\002" val-code -1 count 0) (mapc #'(lambda (x) (if (= val-code x) (setq count (1+ count)) @@ -484,7 +503,7 @@ Property value is a character." vec) (if (= count 128) (if val - (set-char-table-range table (cons start limit) val)) + (set-char-table-range table (cons start limit) val-code)) (if (= val-code 0) (set-char-table-range table (cons start limit) str) (if (> count 2) @@ -494,31 +513,40 @@ Property value is a character." 
(setq str (concat str (string val-code))))) (set-char-table-range table (cons start limit) str)))))) - (setq val-list (nreverse (cdr val-list))) (set-char-table-extra-slot table 0 prop) (set-char-table-extra-slot table 4 (vconcat (mapcar 'car val-list))) table)) -(defun unidata-gen-table-symbol (prop) +(defun unidata-gen-table-symbol (prop default-value val-list) (let ((table (unidata-gen-table prop #'(lambda (x) (and (> (length x) 0) (intern x))) - 0))) - (byte-compile 'unidata-get-symbol) - (byte-compile 'unidata-put-symbol) - (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-symbol)) - (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-symbol)) + default-value val-list))) + (set-char-table-extra-slot table 1 0) + (set-char-table-extra-slot table 2 1) table)) -(defun unidata-gen-table-integer (prop) +(defun unidata-gen-table-integer (prop default-value val-list) (let ((table (unidata-gen-table prop #'(lambda (x) (and (> (length x) 0) (string-to-number x))) - t))) - (byte-compile 'unidata-get-integer) - (byte-compile 'unidata-put-integer) - (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-integer)) - (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-integer)) + default-value val-list))) + (set-char-table-extra-slot table 1 0) + (set-char-table-extra-slot table 2 1) + table)) + +(defun unidata-gen-table-numeric (prop default-value val-list) + (let ((table (unidata-gen-table prop + #'(lambda (x) + (if (string-match "/" x) + (/ (float (string-to-number x)) + (string-to-number + (substring x (match-end 0)))) + (if (> (length x) 0) + (string-to-number x)))) + default-value val-list))) + (set-char-table-extra-slot table 1 0) + (set-char-table-extra-slot table 2 2) table)) @@ -724,6 +752,9 @@ Property value is a character." (defun unidata-get-decomposition (char val table) (cond + ((not val) + (list char)) + ((consp val) val) @@ -764,7 +795,8 @@ Property value is a character." 
(aset vec idx (nconc word-list tail-list))) (dotimes (i 128) (aset table (+ first-char i) (aref vec i))) - (aref vec (- char first-char))))) + (setq val (aref vec (- char first-char))) + (or val (list char))))) ;; Hangul syllable ((and (eq val 0) (>= char #xAC00) (<= char #xD7A3)) @@ -774,11 +806,13 @@ Property value is a character." (L (+ #x1100 (/ char 588))) ;; V = VBase + (SIndex % NCount) * TCount (V (+ #x1161 (/ (% char 588) 28))) + ;; LV = SBase + (SIndex / TCount) * TCount + (LV (+ #xAC00 (* (/ char 28) 28))) ;; T = TBase + SIndex % TCount (T (+ #x11A7 (% char 28)))) (if (= T #x11A7) (list L V) - (list L V T)))) + (list LV T)))) )) @@ -809,7 +843,6 @@ Property value is a character." word-table block-list block-word-table block-end tail elt range val idx slot) - (set-char-table-range table (cons 0 (max-char)) 0) (setq tail unidata-list) (setq block-end -1) (while tail @@ -942,7 +975,7 @@ Property value is a character." idx (1+ i))))) (nreverse (cons (intern (substring str idx)) l)))))) -(defun unidata-gen-table-name (prop) +(defun unidata-gen-table-name (prop &rest ignore) (let* ((table (unidata-gen-table-word-list prop 'unidata-split-name)) (word-tables (char-table-extra-slot table 4))) (byte-compile 'unidata-get-name) @@ -972,16 +1005,16 @@ Property value is a character." 
(setq c (aref str i)) (if (= c 32) (setq l (if (= (aref str idx) ?<) - (cons (intern (substring str idx i)) l) + (cons (intern (substring str (1+ idx) (1- i))) l) (cons (string-to-number (substring str idx i) 16) l)) idx (1+ i)))) (if (= (aref str idx) ?<) - (setq l (cons (intern (substring str idx len)) l)) + (setq l (cons (intern (substring str (1+ idx) (1- len))) l)) (setq l (cons (string-to-number (substring str idx len) 16) l))) (nreverse l))))) -(defun unidata-gen-table-decomposition (prop) +(defun unidata-gen-table-decomposition (prop &rest ignore) (let* ((table (unidata-gen-table-word-list prop 'unidata-split-decomposition)) (word-tables (char-table-extra-slot table 4))) (byte-compile 'unidata-get-decomposition) @@ -997,7 +1030,8 @@ Property value is a character." (defun unidata-describe-general-category (val) (cdr (assq val - '((Lu . "Letter, Uppercase") + '((nil . "Uknown") + (Lu . "Letter, Uppercase") (Ll . "Letter, Lowercase") (Lt . "Letter, Titlecase") (Lm . "Letter, Modifier") @@ -1080,8 +1114,26 @@ Property value is a character." (ON . "Other Neutrals"))))) (defun unidata-describe-decomposition (val) - (mapconcat #'(lambda (x) (if (symbolp x) (symbol-name x) (string ?' x ?'))) - val " ")) + (mapconcat + #'(lambda (x) + (if (symbolp x) (symbol-name x) + (concat (string ?') + (compose-string (string x) 0 1 (string ?\t x ?\t)) + (string ?')))) + val " ")) + +(defun unidata-gen-mirroring-list () + (let ((head (list nil)) + tail) + (with-temp-buffer + (insert-file-contents (expand-file-name "BidiMirroring.txt" unidata-dir)) + (goto-char (point-min)) + (setq tail head) + (while (re-search-forward "^\\([0-9A-F]+\\);\\s +\\([0-9A-F]+\\)" nil t) + (let ((char (string-to-number (match-string 1) 16)) + (mirror (match-string 2))) + (setq tail (setcdr tail (list (list char mirror))))))) + (cdr head))) ;; Verify if we can retrieve correct values from the generated ;; char-tables. @@ -1124,13 +1176,21 @@ Property value is a character." ;; The entry function. 
It generates files described in the header ;; comment of this file. -(defun unidata-gen-files (&optional unidata-text-file) - (or unidata-text-file - (setq unidata-text-file (car command-line-args-left) +(defun unidata-gen-files (&optional data-dir unidata-text-file) + (or data-dir + (setq data-dir (car command-line-args-left) + command-line-args-left (cdr command-line-args-left) + unidata-text-file (car command-line-args-left) command-line-args-left (cdr command-line-args-left))) - (unidata-setup-list unidata-text-file) (let ((coding-system-for-write 'utf-8-unix) - (charprop-file "charprop.el")) + (charprop-file "charprop.el") + (unidata-dir data-dir)) + (dolist (elt unidata-prop-alist) + (let* ((prop (car elt)) + (file (unidata-prop-file prop))) + (if (file-exists-p file) + (delete-file file)))) + (unidata-setup-list unidata-text-file) (with-temp-file charprop-file (insert ";; Automatically generated by unidata-gen.el.\n") (dolist (elt unidata-prop-alist) @@ -1139,28 +1199,41 @@ Property value is a character." (file (unidata-prop-file prop)) (docstring (unidata-prop-docstring prop)) (describer (unidata-prop-describer prop)) + (default-value (unidata-prop-default prop)) + (val-list (unidata-prop-val-list prop)) table) ;; Filename in this comment line is extracted by sed in ;; Makefile. (insert (format ";; FILE: %s\n" file)) (insert (format "(define-char-code-property '%S %S\n %S)\n" prop file docstring)) - (with-temp-file file + (with-temp-buffer (message "Generating %s..." 
file) - (setq table (funcall generator prop)) + (when (file-exists-p file) + (insert-file-contents file) + (goto-char (point-max)) + (search-backward ";; Local Variables:")) + (setq table (funcall generator prop default-value val-list)) (when describer (unless (subrp (symbol-function describer)) (byte-compile describer) (setq describer (symbol-function describer))) (set-char-table-extra-slot table 3 describer)) - (insert ";; Automatically generated from UnicodeData.txt.\n" - (format "(define-char-code-property '%S %S %S)\n" - prop table docstring) - ";; Local Variables:\n" - ";; coding: utf-8\n" - ";; no-byte-compile: t\n" - ";; End:\n\n" - (format ";; %s ends here\n" file))))) + (if (bobp) + (insert ";; Copyright (C) 1991-2009 Unicode, Inc. +;; This file was generated from the Unicode data files at +;; http://www.unicode.org/Public/UNIDATA/. +;; See lisp/international/README for the copyright and permission notice.\n")) + (insert (format "(define-char-code-property '%S %S %S)\n" + prop table docstring)) + (if (eobp) + (insert ";; Local Variables:\n" + ";; coding: utf-8\n" + ";; no-byte-compile: t\n" + ";; End:\n\n" + (format ";; %s ends here\n" file))) + (write-file file) + (message "Generating %s...done" file)))) (message "Writing %s..." charprop-file) (insert ";; Local Variables:\n" ";; coding: utf-8\n" @@ -1170,5 +1243,4 @@ Property value is a character." -;; arch-tag: 961c862e-b821-447e-9b8a-bfbab9c2d525 ;;; unidata-gen.el ends here