;; unidata-gen.el -- Create files containing character property data.
-;; Copyright (C) 2005, 2006, 2007, 2008
+;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
;; National Institute of Advanced Industrial Science and Technology (AIST)
;; Registration Number H13PRO009
;; This file is part of GNU Emacs.
-;; GNU Emacs is free software; you can redistribute it and/or modify
+;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
-;; the Free Software Foundation; either version 3, or (at your option)
-;; any later version.
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
-;; along with GNU Emacs; see the file COPYING. If not, write to the
-;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
-;; Boston, MA 02110-1301, USA.
+;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;;
;; charprop.el
;; It contains a series of forms of this format:
-;; (char-code-property-register PROP FILE)
+;; (define-char-code-property PROP FILE)
;; where PROP is a symbol representing a character property
-;; (name, generic-category, etc), and FILE is a name of one of
+;; (name, general-category, etc), and FILE is a name of one of
;; the following files.
;;
-;; uni-name.el, uni-cat.el, uni-comb.el, uni-bidi.el
-;; It contains a single form of this format:
-;; (char-code-property-register PROP CHAR-TABLE)
+;; uni-name.el, uni-category.el, uni-combining.el, uni-bidi.el,
+;; uni-decomposition.el, uni-decimal.el, uni-digit.el, uni-numeric.el,
+;; uni-mirrored.el, uni-old-name.el, uni-comment.el, uni-uppercase.el,
+;; uni-lowercase.el, uni-titlecase.el
+;; They contain one or more forms of this format:
+;; (define-char-code-property PROP CHAR-TABLE)
;; where PROP is the same as above, and CHAR-TABLE is a
;; char-table containing property values in a compressed format.
;;
;; When they are installed in .../lisp/international/, the file
;; "charprop.el" is preloaded in loadup.el. The other files are
-;; automatically loaded when the functions `get-char-code-property'
-;; and `put-char-code-property' are called.
+;; automatically loaded when the Lisp functions
+;; `get-char-code-property' and `put-char-code-property', or the C
+;; function uniprop_table, are called.
;;
;; FORMAT OF A CHAR TABLE
;;
;; data in a char-table as below.
;;
;; If succeeding 128*N characters have the same property value, we
-;; store that value for them. Otherwise, compress values for
-;; succeeding 128 characters into a single string and store it as a
-;; value for those characters. The way of compression depends on a
-;; property. See the section "SIMPLE TABLE", "RUN-LENGTH TABLE",
-;; and "WORD-LIST TABLE".
-
-;; The char table has four extra slots:
+;; store that value (or the encoded one) for them. Otherwise,
+;; compress values (or the encoded ones) for succeeding 128
+;; characters into a single string and store it for those
+;; characters. The way of compression depends on a property. See
+;; the section "SIMPLE TABLE", "RUN-LENGTH TABLE", and "WORD-LIST
+;; TABLE".
+
+;; The char table has five extra slots:
;; 1st: property symbol
-;; 2nd: function to call to get a property value
-;; 3nd: function to call to put a property value
-;; 4th: function to call to get a description of a property value
+;; 2nd: function to call to get a property value,
+;; or an index number of C function to decode the value,
+;; or nil if the value can be obtained directly from the table.
+;; 3rd: function to call to put a property value,
+;; or an index number of C function to encode the value,
+;; or nil if the value can be directly stored in the table.
+;; 4th: function to call to get a description of a property value, or nil
;; 5th: data referred by the above functions
;; List of elements of this form:
(defvar unidata-list nil)
+;; Name of the directory containing the files of the Unicode
+;; Character Database.
+
+(defvar unidata-dir nil)
+
(defun unidata-setup-list (unidata-text-file)
(let* ((table (list nil))
(tail table)
("^<.*Surrogate" . nil)
("^<.*Private Use" . PRIVATE\ USE)))
val char name)
+ (setq unidata-text-file (expand-file-name unidata-text-file unidata-dir))
(or (file-readable-p unidata-text-file)
(error "File not readable: %s" unidata-text-file))
(with-temp-buffer
+ ;; Insert a file of this format:
+ ;; (CHAR NAME CATEGORY ...)
+ ;; where CHAR is a character code, the following elements are strings
+ ;; representing character properties.
(insert-file-contents unidata-text-file)
(goto-char (point-min))
(condition-case nil
;; Check this kind of block.
;; 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
- ;; 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
+ ;; 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
(if (and (= (aref name 0) ?<)
(string-match ", First>$" name))
(let ((first char)
(setq unidata-list (cdr table))))
;; Alist of this form:
-;; (PROP INDEX GENERATOR FILENAME)
+;; (PROP INDEX GENERATOR FILENAME DOCSTRING DESCRIBER DEFAULT VAL-LIST)
;; PROP: character property
-;; INDEX: index to each element of unidata-list for PROP
+;; INDEX: index to each element of unidata-list for PROP.
+;; It may be a function that generates an alist of character codes
+;; vs. the corresponding property values.
;; GENERATOR: function to generate a char-table
;; FILENAME: filename to store the char-table
+;; DOCSTRING: docstring for the property
;; DESCRIBER: function to call to get a description string of property value
+;; DEFAULT: the default value of the property. It may have the form
+;; (VAL0 (FROM1 TO1 VAL1) ...) which indicates that the default
+;; value is VAL0 except for characters in the ranges specified by
+;; FROMn and TOn (inclusive). The default value of characters
+;; between FROMn and TOn is VALn.
+;; VAL-LIST: list of specially ordered property values
(defconst unidata-prop-alist
'((name
1 unidata-gen-table-name "uni-name.el"
"Unicode character name.
-Property value is a string.")
+Property value is a string or nil.
+The value nil stands for the default value \"null string\")."
+ nil
+ nil)
(general-category
2 unidata-gen-table-symbol "uni-category.el"
"Unicode general category.
Property value is one of the following symbols:
Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn"
- unidata-describe-general-category)
+ unidata-describe-general-category
+ Cn
+ ;; The order of elements must be in sync with unicode_category_t
+ ;; in src/character.h.
+ (Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po
+ Sm Sc Sk So Zs Zl Zp Cc Cf Cs Co Cn))
(canonical-combining-class
3 unidata-gen-table-integer "uni-combining.el"
"Unicode canonical combining class.
Property value is an integer."
- unidata-describe-canonical-combining-class)
+ unidata-describe-canonical-combining-class
+ 0)
(bidi-class
4 unidata-gen-table-symbol "uni-bidi.el"
"Unicode bidi class.
Property value is one of the following symbols:
L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET,
AN, CS, NSM, BN, B, S, WS, ON"
- unidata-describe-bidi-class)
+ unidata-describe-bidi-class
+ ;; The assignment of default values to blocks of code points
+ ;; follows the file DerivedBidiClass.txt from the Unicode
+ ;; Character Database (UCD).
+ (L (#x0600 #x06FF AL) (#xFB50 #xFDFF AL) (#xFE70 #xFEFF AL)
+ (#x0590 #x05FF R) (#x07C0 #x08FF R)
+ (#xFB1D #xFB4F R) (#x10800 #x10FFF R) (#x1E800 #x1EFFF R))
+ ;; The order of elements must be in sync with bidi_type_t in
+ ;; src/dispextern.h.
+ (L R EN AN BN B AL LRE LRO RLE RLO PDF ES ET CS NSM S WS ON))
(decomposition
5 unidata-gen-table-decomposition "uni-decomposition.el"
"Unicode decomposition mapping.
Property value is a list of characters. The first element may be
one of these symbols representing compatibility formatting tag:
- <font>, <noBreak>, <initial>, <medial>, <final>, <isolated>, <circle>,
- <super>, <sub>, <vertical>, <wide>, <narrow>, <small>, <square>, <fraction>,
- <compat>"
+ font, noBreak, initial, medial, final, isolated, circle, super,
+ sub, vertical, wide, narrow, small, square, fraction, compat"
unidata-describe-decomposition)
(decimal-digit-value
6 unidata-gen-table-integer "uni-decimal.el"
"Unicode numeric value (decimal digit).
-Property value is an integer.")
+Property value is an integer 0..9, or nil.
+The value nil stands for NaN \"Numeric_Value\".")
(digit-value
7 unidata-gen-table-integer "uni-digit.el"
"Unicode numeric value (digit).
-Property value is an integer.")
+Property value is an integer 0..9, or nil.
+The value nil stands for NaN \"Numeric_Value\".")
(numeric-value
- 8 unidata-gen-table-symbol "uni-numeric.el"
+ 8 unidata-gen-table-numeric "uni-numeric.el"
"Unicode numeric value (numeric).
-Property value is an symbol.")
+Property value is an integer, a floating point, or nil.
+The value nil stands for NaN \"Numeric_Value\".")
(mirrored
9 unidata-gen-table-symbol "uni-mirrored.el"
"Unicode bidi mirrored flag.
-Property value is a symbol `Y' or `N'.")
+Property value is a symbol `Y' or `N'. See also the property `mirroring'."
+ nil
+ N)
(old-name
10 unidata-gen-table-name "uni-old-name.el"
"Unicode old names as published in Unicode 1.0.
-Property value is a string.")
+Property value is a string or nil.
+The value nil stands for the default value \"null string\").")
(iso-10646-comment
11 unidata-gen-table-name "uni-comment.el"
"Unicode ISO 10646 comment.
(uppercase
12 unidata-gen-table-character "uni-uppercase.el"
"Unicode simple uppercase mapping.
-Property value is a character."
+Property value is a character or nil.
+The value nil means that the actual property value of a character
+is the character itself."
string)
(lowercase
13 unidata-gen-table-character "uni-lowercase.el"
"Unicode simple lowercase mapping.
-Property value is a character."
+Property value is a character or nil.
+The value nil means that the actual property value of a character
+is the character itself."
string)
(titlecase
14 unidata-gen-table-character "uni-titlecase.el"
"Unicode simple titlecase mapping.
-Property value is a character."
- string)))
+Property value is a character or nil.
+The value nil means that the actual property value of a character
+is the character itself."
+ string)
+ (mirroring
+ unidata-gen-mirroring-list unidata-gen-table-character "uni-mirrored.el"
+ "Unicode bidi-mirroring characters.
+Property value is a character that has the corresponding mirroring image or nil.
+The value nil means that the actual property value of a character
+is the character itself.")))
;; Functions to access the above data.
(defsubst unidata-prop-index (prop) (nth 1 (assq prop unidata-prop-alist)))
(defsubst unidata-prop-file (prop) (nth 3 (assq prop unidata-prop-alist)))
(defsubst unidata-prop-docstring (prop) (nth 4 (assq prop unidata-prop-alist)))
(defsubst unidata-prop-describer (prop) (nth 5 (assq prop unidata-prop-alist)))
+(defsubst unidata-prop-default (prop) (nth 6 (assq prop unidata-prop-alist))) ; DEFAULT field (index 6) of PROP's entry in `unidata-prop-alist'
+(defsubst unidata-prop-val-list (prop) (nth 7 (assq prop unidata-prop-alist))) ; VAL-LIST field (index 7) of PROP's entry in `unidata-prop-alist'
\f
;; SIMPLE TABLE
;; values of succeeding character codes are usually different, we use
;; a char-table described here to store such values.
;;
-;; If succeeding 128 characters has no property, a char-table has the
-;; symbol t is for them. Otherwise a char-table has a string of the
-;; following format for them.
+;; A char-table divides character code space (#x0..#x3FFFFF) into
+;; #x8000 blocks (each block contains 128 characters).
+
+;; If all characters of a block have no property, a char-table has the
+;; symbol nil for that block. Otherwise a char-table has a string of
+;; the following format for it.
;;
-;; The first character of the string is FIRST-INDEX.
-;; The Nth (N > 0) character of the string is a property value of the
-;; character (BLOCK-HEAD + FIRST-INDEX + N - 1), where BLOCK-HEAD is
-;; the first of the characters in the block.
+;; The first character of the string is ?\001.
+;; The second character of the string is FIRST-INDEX.
+;; The Nth (N > 1) character of the string is a property value of the
+;; character (BLOCK-HEAD + FIRST-INDEX + N - 2), where BLOCK-HEAD is
+;; the first character of the block.
;;
-;; The 4th extra slot of a char-table is nil.
-
-(defun unidata-get-character (char val table)
- (cond
- ((characterp val)
- val)
+;; This kind of char-table has these extra slots:
+;; 1st: the property symbol
+;; 2nd: nil
+;; 3rd: 0 (corresponding to uniprop_encode_character in chartab.c)
+;; 4th to 5th: nil
- ((stringp val)
- (let* ((len (length val))
- (block-head (lsh (lsh char -7) 7))
- (vec (make-vector 128 nil))
- (first-index (aref val 0)))
- (dotimes (i (1- len))
- (let ((elt (aref val (1+ i))))
- (if (> elt 0)
- (aset vec (+ first-index i) elt))))
- (dotimes (i 128)
- (aset table (+ block-head i) (aref vec i)))
- (aref vec (- char block-head))))))
-
-(defun unidata-put-character (char val table)
- (or (characterp val)
- (not val)
- (error "Not an character nor nil: %S" val))
- (let ((current-val (aref table char)))
- (unless (eq current-val val)
- (if (stringp current-val)
- (funcall (char-table-extra-slot table 1) char current-val table))
- (aset table char val))))
-
-(defun unidata-gen-table-character (prop)
+(defun unidata-gen-table-character (prop &rest ignore)
(let ((table (make-char-table 'char-code-property-table))
(prop-idx (unidata-prop-index prop))
(vec (make-vector 128 0))
(tail unidata-list)
elt range val idx slot)
- (set-char-table-range table (cons 0 (max-char)) t)
+ (if (functionp prop-idx)
+ (setq tail (funcall prop-idx)
+ prop-idx 1))
(while tail
(setq elt (car tail) tail (cdr tail))
(setq range (car elt)
(setq first-index last-index)))
(setq tail (cdr tail)))
(when first-index
- (let ((str (string first-index))
+ (let ((str (string 1 first-index))
c)
(while (<= first-index last-index)
(setq str (format "%s%c" str (or (aref vec first-index) 0))
(set-char-table-range table (cons start limit) str))))))
(set-char-table-extra-slot table 0 prop)
- (byte-compile 'unidata-get-character)
- (byte-compile 'unidata-put-character)
- (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-character))
- (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-character))
-
+ (set-char-table-extra-slot table 2 0)
table))
\f
;; RUN-LENGTH TABLE
;;
-;; If the type of character property value is symbol, integer,
-;; boolean, or character, we use a char-table described here to store
-;; the values.
+;; If many characters of successive character codes have the same
+;; property value, we use a char-table described here to store the
+;; values.
+;;
+;; First, instead of a value itself, we store in the table an index
+;; number into the VAL-TABLE (5th extra slot). We call that index
+;; number VAL-CODE hereafter.
;;
-;; The 4th extra slot is a vector of property values (VAL-TABLE), and
-;; values for succeeding 128 characters are encoded into this
-;; character sequence:
+;; A char-table divides character code space (#x0..#x3FFFFF) into
+;; #x8000 blocks (each block contains 128 characters).
+;;
+;; If all characters of a block have the same value, a char-table has
+;; VAL-CODE for that block. Otherwise a char-table has a string of
+;; the following format for that block.
+;;
+;; The first character of the string is ?\002.
+;; The following characters have this form:
;; ( VAL-CODE RUN-LENGTH ? ) +
;; where:
-;; VAL-CODE (0..127):
-;; (VAL-CODE - 1) is an index into VAL-TABLE.
-;; The value 0 means no-value.
+;; VAL-CODE (0..127): index into VAL-TABLE.
;; RUN-LENGTH (130..255):
;; (RUN-LENGTH - 128) specifies how many characters have the same
;; value. If omitted, it means 1.
-
-
-;; Return a symbol-type character property value of CHAR. VAL is the
-;; current value of (aref TABLE CHAR).
-
-(defun unidata-get-symbol (char val table)
- (let ((val-table (char-table-extra-slot table 4)))
- (cond ((symbolp val)
- val)
- ((stringp val)
- (let ((first-char (lsh (lsh char -7) 7))
- (str val)
- (len (length val))
- (idx 0)
- this-val count)
- (set-char-table-range table (cons first-char (+ first-char 127))
- nil)
- (while (< idx len)
- (setq val (aref str idx) idx (1+ idx)
- count (if (< idx len) (aref str idx) 1))
- (setq val (and (> val 0) (aref val-table (1- val)))
- count (if (< count 128)
- 1
- (prog1 (- count 128) (setq idx (1+ idx)))))
- (dotimes (i count)
- (if val
- (aset table first-char val))
- (if (= first-char char)
- (setq this-val val))
- (setq first-char (1+ first-char))))
- this-val))
- ((> val 0)
- (aref val-table (1- val))))))
-
-;; Return a integer-type character property value of CHAR. VAL is the
-;; current value of (aref TABLE CHAR).
-
-(defun unidata-get-integer (char val table)
- (let ((val-table (char-table-extra-slot table 4)))
- (cond ((integerp val)
- val)
- ((stringp val)
- (let ((first-char (lsh (lsh char -7) 7))
- (str val)
- (len (length val))
- (idx 0)
- this-val count)
- (while (< idx len)
- (setq val (aref str idx) idx (1+ idx)
- count (if (< idx len) (aref str idx) 1))
- (setq val (and (> val 0) (aref val-table (1- val)))
- count (if (< count 128)
- 1
- (prog1 (- count 128) (setq idx (1+ idx)))))
- (dotimes (i count)
- (aset table first-char val)
- (if (= first-char char)
- (setq this-val val))
- (setq first-char (1+ first-char))))
- this-val)))))
-
-;; Store VAL (symbol) as a character property value of CHAR in TABLE.
-
-(defun unidata-put-symbol (char val table)
- (or (symbolp val)
- (error "Not a symbol: %S" val))
- (let ((current-val (aref table char)))
- (unless (eq current-val val)
- (if (stringp current-val)
- (funcall (char-table-extra-slot table 1) char current-val table))
- (aset table char val))))
-
-;; Store VAL (integer) as a character property value of CHAR in TABLE.
-
-(defun unidata-put-integer (char val table)
- (or (integerp val)
- (not val)
- (error "Not an integer nor nil: %S" val))
- (let ((current-val (aref table char)))
- (unless (eq current-val val)
- (if (stringp current-val)
- (funcall (char-table-extra-slot table 1) char current-val table))
- (aset table char val))))
+;;
+;; This kind of char-table has these extra slots:
+;; 1st: the property symbol
+;; 2nd: 0 (corresponding to uniprop_decode_value in chartab.c)
+;; 3rd: 1..3 (corresponding to uniprop_encode_xxx in chartab.c)
+;; 4th: function or nil
+;; 5th: VAL-TABLE
;; Encode the character property value VAL into an integer value by
;; VAL-LIST. By side effect, VAL-LIST is modified.
;; VAL-LIST has this form:
-;; (t (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...)
-;; If VAL is one of VALn, just return VAL-CODEn. Otherwise,
-;; VAL-LIST is modified to this:
-;; (t (VAL . (1+ VAL-CODE1)) (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...)
+;; ((nil . 0) (VAL1 . 1) (VAL2 . 2) ...)
+;; If VAL is one of VALn, just return n.
+;; Otherwise, VAL-LIST is modified to this:
+;; ((nil . 0) (VAL1 . 1) (VAL2 . 2) ... (VAL . n+1))
(defun unidata-encode-val (val-list val)
- (let ((slot (assq val val-list))
+ (let ((slot (assoc val val-list)) ; look VAL up in VAL-LIST; `assoc' compares keys with `equal'
val-code)
(if slot
(cdr slot)
- (setq val-code (if (cdr val-list) (1+ (cdr (nth 1 val-list))) 1))
- (setcdr val-list (cons (cons val val-code) (cdr val-list)))
+ (setq val-code (length val-list)) ; VAL is new: its code is the current number of entries
+ (nconc val-list (list (cons val val-code))) ; destructively append (VAL . VAL-CODE) to the end of VAL-LIST
val-code)))
;; Generate a char-table for the character property PROP.
-(defun unidata-gen-table (prop val-func default-value)
+(defun unidata-gen-table (prop val-func default-value val-list)
(let ((table (make-char-table 'char-code-property-table))
(prop-idx (unidata-prop-index prop))
- (val-list (list t))
(vec (make-vector 128 0))
- tail elt range val val-code idx slot)
- (set-char-table-range table (cons 0 (max-char)) default-value)
+ tail elt range val val-code idx slot
+ prev-range-data)
+ (setq val-list (cons nil (copy-sequence val-list)))
+ (setq tail val-list val-code 0)
+ ;; Convert (nil A B ...) to ((nil . 0) (A . 1) (B . 2) ...)
+ (while tail
+ (setcar tail (cons (car tail) val-code))
+ (setq tail (cdr tail) val-code (1+ val-code)))
+ (if (consp default-value)
+ (setq default-value (copy-sequence default-value))
+ (setq default-value (list default-value)))
+ (setcar default-value
+ (unidata-encode-val val-list (car default-value)))
+ (set-char-table-range table t (car default-value))
+ (set-char-table-range table nil (car default-value))
+ (dolist (elm (cdr default-value))
+ (setcar (nthcdr 2 elm)
+ (unidata-encode-val val-list (nth 2 elm)))
+ (set-char-table-range table (cons (car elm) (nth 1 elm)) (nth 2 elm)))
+
(setq tail unidata-list)
(while tail
(setq elt (car tail) tail (cdr tail))
val (funcall val-func (nth prop-idx elt)))
(setq val-code (if val (unidata-encode-val val-list val)))
(if (consp range)
- (if val-code
- (set-char-table-range table range val))
+ (when val-code
+ (set-char-table-range table range val-code)
+ (let ((from (car range)) (to (cdr range)))
+ ;; If RANGE doesn't end at the char-table boundary (each
+ ;; 128 characters), we may have to carry over the data
+ ;; for the last several characters (at most 127 chars)
+ ;; to the next loop. In that case, set PREV-RANGE-DATA
+ ;; to ((FROM . TO) . VAL-CODE) where (FROM . TO)
+ ;; specifies the range of characters handled in the next
+ ;; loop.
+ (when (< (logand to #x7F) #x7F)
+ (if (< from (logand to #x1FFF80))
+ (setq from (logand to #x1FFF80)))
+ (setq prev-range-data (cons (cons from to) val-code)))))
(let* ((start (lsh (lsh range -7) 7))
(limit (+ start 127))
- str count new-val)
- (fillarray vec 0)
+ str count new-val from to vcode)
+ (fillarray vec (car default-value))
+ (dolist (elm (cdr default-value))
+ (setq from (car elm) to (nth 1 elm))
+ (when (and (<= from limit)
+ (or (>= from start) (>= to start)))
+ (setq from (max from start)
+ to (min to limit)
+ vcode (nth 2 elm))
+ (while (<= from to)
+ (aset vec (- from start) vcode)
+ (setq from (1+ from)))))
+ ;; See the comment above.
+ (when (and prev-range-data
+ (>= (cdr (car prev-range-data)) start))
+ (setq from (car (car prev-range-data))
+ to (cdr (car prev-range-data))
+ vcode (cdr prev-range-data))
+ (while (<= from to)
+ (aset vec (- from start) vcode)
+ (setq from (1+ from))))
+ (setq prev-range-data nil)
(if val-code
(aset vec (- range start) val-code))
(while (and (setq elt (car tail) range (car elt))
(if val-code
(aset vec (- range start) val-code))
(setq tail (cdr tail)))
- (setq str "" val-code -1 count 0)
+ (setq str "\002" val-code -1 count 0)
(mapc #'(lambda (x)
(if (= val-code x)
(setq count (1+ count))
vec)
(if (= count 128)
(if val
- (set-char-table-range table (cons start limit) val))
+ (set-char-table-range table (cons start limit) val-code))
(if (= val-code 0)
(set-char-table-range table (cons start limit) str)
(if (> count 2)
(setq str (concat str (string val-code)))))
(set-char-table-range table (cons start limit) str))))))
- (setq val-list (nreverse (cdr val-list)))
- (set-char-table-extra-slot table 0 prop)
+ (set-char-table-extra-slot table 0 prop)
(set-char-table-extra-slot table 4 (vconcat (mapcar 'car val-list)))
table))
-(defun unidata-gen-table-symbol (prop)
+(defun unidata-gen-table-symbol (prop default-value val-list) ; build PROP's table via `unidata-gen-table', interning each non-empty string
(let ((table (unidata-gen-table prop
#'(lambda (x) (and (> (length x) 0)
(intern x)))
- 0)))
- (byte-compile 'unidata-get-symbol)
- (byte-compile 'unidata-put-symbol)
- (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-symbol))
- (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-symbol))
+ default-value val-list))) ; DEFAULT-VALUE and VAL-LIST are passed through unchanged
+ (set-char-table-extra-slot table 1 0) ; 2nd extra slot: decoder index 0 (see the RUN-LENGTH TABLE notes)
+ (set-char-table-extra-slot table 2 1) ; 3rd extra slot: encoder index 1
table))
-(defun unidata-gen-table-integer (prop)
+(defun unidata-gen-table-integer (prop default-value val-list) ; build PROP's table via `unidata-gen-table', converting each non-empty string with `string-to-number'
(let ((table (unidata-gen-table prop
#'(lambda (x) (and (> (length x) 0)
(string-to-number x)))
- t)))
- (byte-compile 'unidata-get-integer)
- (byte-compile 'unidata-put-integer)
- (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-integer))
- (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-integer))
+ default-value val-list))) ; DEFAULT-VALUE and VAL-LIST are passed through unchanged
+ (set-char-table-extra-slot table 1 0) ; 2nd extra slot: decoder index 0 (see the RUN-LENGTH TABLE notes)
+ (set-char-table-extra-slot table 2 1) ; 3rd extra slot: encoder index 1
+ table))
+
+(defun unidata-gen-table-numeric (prop default-value val-list) ; build PROP's table via `unidata-gen-table' with integer or float values
+ (let ((table (unidata-gen-table prop
+ #'(lambda (x) ; X is the raw property string of one character
+ (if (string-match "/" x) ; a "/" marks a fraction NUM/DEN
+ (/ (float (string-to-number x)) ; numerator, forced to float so the division is not integral
+ (string-to-number
+ (substring x (match-end 0)))) ; denominator: the text after the "/"
+ (if (> (length x) 0) ; no "/": non-empty string is a plain number, empty string yields nil
+ (string-to-number x))))
+ default-value val-list))) ; DEFAULT-VALUE and VAL-LIST are passed through unchanged
+ (set-char-table-extra-slot table 1 0) ; 2nd extra slot: decoder index 0 (see the RUN-LENGTH TABLE notes)
+ (set-char-table-extra-slot table 2 2) ; 3rd extra slot: encoder index 2
table))
\f
(while (and l1 (eq (car l1) (car l2)))
(setq beg (1+ beg)
l1 (cdr l1) len1 (1- len1) l2 (cdr l2) len2 (1- len2)))
- (while (and (< end len1) (< end len2)
+ (while (and (< end len1) (< end len2)
(eq (nth (- len1 end 1) l1) (nth (- len2 end 1) l2)))
(setq end (1+ end))))
(if (= (+ beg end) 0)
;; CHAR).
(defun unidata-get-name (char val table)
- (cond
+ (cond
((stringp val)
(if (> (aref val 0) 0)
val
(setq word-list (nconc word-list (list (car l)))
l (cdr l))))))
(setq word-list
- (nconc word-list
+ (nconc word-list
(list (symbol-name
(unidata-decode-word c word-table))))
i (1+ i))))
(V (/ (% char 588) 28))
;; TIndex = SIndex % TCount
(T (% char 28)))
- (format "HANGUL SYLLABLE %s%s%s"
+ (format "HANGUL SYLLABLE %s%s%s"
;; U+110B is nil in this table.
(or (aref (aref jamo-name-table 0) L) "")
(aref (aref jamo-name-table 1) V)
(defun unidata-get-decomposition (char val table)
(cond
+ ((not val)
+ (list char))
+
((consp val)
val)
(setq word-list (nconc word-list (list (car l)))
l (cdr l))))))
(setq word-list
- (nconc word-list
+ (nconc word-list
(list (or (unidata-decode-word c word-table) c)))
i (1+ i))))
(if (or word-list tail-list)
(aset vec idx (nconc word-list tail-list)))
(dotimes (i 128)
(aset table (+ first-char i) (aref vec i)))
- (aref vec (- char first-char)))))
+ (setq val (aref vec (- char first-char)))
+ (or val (list char)))))
;; Hangul syllable
((and (eq val 0) (>= char #xAC00) (<= char #xD7A3))
(L (+ #x1100 (/ char 588)))
;; V = VBase + (SIndex % NCount) * TCount
(V (+ #x1161 (/ (% char 588) 28)))
+ ;; LV = SBase + (SIndex / TCount) * TCount
+ (LV (+ #xAC00 (* (/ char 28) 28)))
;; T = TBase + SIndex % TCount
(T (+ #x11A7 (% char 28))))
(if (= T #x11A7)
(list L V)
- (list L V T))))
+ (list LV T))))
))
word-table
block-list block-word-table block-end
tail elt range val idx slot)
- (set-char-table-range table (cons 0 (max-char)) 0)
(setq tail unidata-list)
(setq block-end -1)
(while tail
(if (= c 32)
(setq l (cons (intern (substring str idx i)) l)
idx (1+ i))
- (if (and (= c ?-) (< idx i)
+ (if (and (= c ?-) (< idx i)
(< (1+ i) len) (/= (aref str (1+ i)) 32))
(setq l (cons '- (cons (intern (substring str idx i)) l))
idx (1+ i)))))
(nreverse (cons (intern (substring str idx)) l))))))
-(defun unidata-gen-table-name (prop)
+(defun unidata-gen-table-name (prop &rest ignore)
(let* ((table (unidata-gen-table-word-list prop 'unidata-split-name))
(word-tables (char-table-extra-slot table 4)))
(byte-compile 'unidata-get-name)
(setq c (aref str i))
(if (= c 32)
(setq l (if (= (aref str idx) ?<)
- (cons (intern (substring str idx i)) l)
+ (cons (intern (substring str (1+ idx) (1- i))) l)
(cons (string-to-number (substring str idx i) 16) l))
idx (1+ i))))
(if (= (aref str idx) ?<)
- (setq l (cons (intern (substring str idx len)) l))
+ (setq l (cons (intern (substring str (1+ idx) (1- len))) l))
(setq l (cons (string-to-number (substring str idx len) 16) l)))
(nreverse l)))))
-(defun unidata-gen-table-decomposition (prop)
+(defun unidata-gen-table-decomposition (prop &rest ignore)
(let* ((table (unidata-gen-table-word-list prop 'unidata-split-decomposition))
(word-tables (char-table-extra-slot table 4)))
(byte-compile 'unidata-get-decomposition)
(byte-compile 'unidata-put-decomposition)
(set-char-table-extra-slot table 1
(symbol-function 'unidata-get-decomposition))
- (set-char-table-extra-slot table 2
+ (set-char-table-extra-slot table 2
(symbol-function 'unidata-put-decomposition))
(set-char-table-extra-slot table 4 (car word-tables))
table))
\f
(defun unidata-describe-general-category (val)
(cdr (assq val
- '((Lu . "Letter, Uppercase")
+ '((nil . "Uknown")
+ (Lu . "Letter, Uppercase")
(Ll . "Letter, Lowercase")
(Lt . "Letter, Titlecase")
(Lm . "Letter, Modifier")
(ON . "Other Neutrals")))))
(defun unidata-describe-decomposition (val)
- (mapconcat #'(lambda (x) (if (symbolp x) (symbol-name x) (string ?' x ?')))
- val " "))
+ (mapconcat ; describe each element of VAL, joining the pieces with a space
+ #'(lambda (x)
+ (if (symbolp x) (symbol-name x) ; a symbol element is shown by its name
+ (concat (string ?') ; a character element is shown between single quotes
+ (compose-string (string x) 0 1 (string ?\t x ?\t)) ; composed with TAB on each side -- presumably for padding; see `compose-string'
+ (string ?'))))
+ val " "))
+
+(defun unidata-gen-mirroring-list () ; return a list of (CHAR MIRROR) entries read from BidiMirroring.txt in `unidata-dir'
+ (let ((head (list nil)) ; dummy first cell; the real result is (cdr head)
+ tail) ; last cell of the list built so far
+ (with-temp-buffer
+ (insert-file-contents (expand-file-name "BidiMirroring.txt" unidata-dir)) ; file name is resolved relative to `unidata-dir'
+ (goto-char (point-min))
+ (setq tail head)
+ (while (re-search-forward "^\\([0-9A-F]+\\);\\s +\\([0-9A-F]+\\)" nil t) ; lines of the form "HEX; HEX"; other lines never match
+ (let ((char (string-to-number (match-string 1) 16)) ; first field parsed as a hexadecimal number
+ (mirror (match-string 2))) ; second field is kept as the matched string, not converted
+ (setq tail (setcdr tail (list (list char mirror))))))) ; append (CHAR MIRROR) and advance TAIL to the new last cell
+ (cdr head))) ; drop the dummy first cell
;; Verify if we can retrieve correct values from the generated
;; char-tables.
(let* ((prop (car elt))
(index (unidata-prop-index prop))
(generator (unidata-prop-generator prop))
- (table (progn
+ (table (progn
(message "Generating %S table..." prop)
(funcall generator prop)))
(decoder (char-table-extra-slot table 1))
(message "%S %04X" prop check)
(setq check (+ check #x400)))
(or (equal val1 val2)
- (insert (format "> %04X %S\n< %04X %S\n"
+ (insert (format "> %04X %S\n< %04X %S\n"
char val1 char val2)))
(sit-for 0)))))))
;; The entry function. It generates files described in the header
;; comment of this file.
-(defun unidata-gen-files (&optional unidata-text-file)
- (or unidata-text-file
- (setq unidata-text-file (car command-line-args-left)
+(defun unidata-gen-files (&optional data-dir unidata-text-file)
+ (or data-dir
+ (setq data-dir (car command-line-args-left)
+ command-line-args-left (cdr command-line-args-left)
+ unidata-text-file (car command-line-args-left)
command-line-args-left (cdr command-line-args-left)))
- (unidata-setup-list unidata-text-file)
(let ((coding-system-for-write 'utf-8-unix)
- (charprop-file "charprop.el"))
+ (charprop-file "charprop.el")
+ (unidata-dir data-dir))
+ (dolist (elt unidata-prop-alist)
+ (let* ((prop (car elt))
+ (file (unidata-prop-file prop)))
+ (if (file-exists-p file)
+ (delete-file file))))
+ (unidata-setup-list unidata-text-file)
(with-temp-file charprop-file
(insert ";; Automatically generated by unidata-gen.el.\n")
(dolist (elt unidata-prop-alist)
(file (unidata-prop-file prop))
(docstring (unidata-prop-docstring prop))
(describer (unidata-prop-describer prop))
+ (default-value (unidata-prop-default prop))
+ (val-list (unidata-prop-val-list prop))
table)
;; Filename in this comment line is extracted by sed in
;; Makefile.
(insert (format ";; FILE: %s\n" file))
(insert (format "(define-char-code-property '%S %S\n %S)\n"
prop file docstring))
- (with-temp-file file
+ (with-temp-buffer
(message "Generating %s..." file)
- (setq table (funcall generator prop))
+ (when (file-exists-p file)
+ (insert-file-contents file)
+ (goto-char (point-max))
+ (search-backward ";; Local Variables:"))
+ (setq table (funcall generator prop default-value val-list))
(when describer
(unless (subrp (symbol-function describer))
(byte-compile describer)
(setq describer (symbol-function describer)))
(set-char-table-extra-slot table 3 describer))
- (insert ";; Automatically generated from UnicodeData.txt.\n"
- (format "(define-char-code-property '%S %S %S)\n"
- prop table docstring)
- ";; Local Variables:\n"
- ";; coding: utf-8\n"
- ";; no-byte-compile: t\n"
- ";; End:\n\n"
- (format ";; %s ends here\n" file)))))
+ (if (bobp)
+ (insert ";; Copyright (C) 1991-2009 Unicode, Inc.
+;; This file was generated from the Unicode data files at
+;; http://www.unicode.org/Public/UNIDATA/.
+;; See lisp/international/README for the copyright and permission notice.\n"))
+ (insert (format "(define-char-code-property '%S %S %S)\n"
+ prop table docstring))
+ (if (eobp)
+ (insert ";; Local Variables:\n"
+ ";; coding: utf-8\n"
+ ";; no-byte-compile: t\n"
+ ";; End:\n\n"
+ (format ";; %s ends here\n" file)))
+ (write-file file)
+ (message "Generating %s...done" file))))
(message "Writing %s..." charprop-file)
(insert ";; Local Variables:\n"
";; coding: utf-8\n"
\f
-;; arch-tag: 961c862e-b821-447e-9b8a-bfbab9c2d525
;;; unidata-gen.el ends here