;; unidata-gen.el -- Create files containing character property data.
-;; Copyright (C) 2005
+;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010
;; National Institute of Advanced Industrial Science and Technology (AIST)
;; Registration Number H13PRO009
;; This file is part of GNU Emacs.
-;; GNU Emacs is free software; you can redistribute it and/or modify
+;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
-;; the Free Software Foundation; either version 2, or (at your option)
-;; any later version.
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
-;; along with GNU Emacs; see the file COPYING. If not, write to the
-;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-;; Boston, MA 02111-1307, USA.
+;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;; (name, generic-category, etc), and FILE is a name of one of
;; the following files.
;;
-;; uni-name.el, uni-cat.el, uni-comb.el, uni-bidi.el
-;; It contains a single form of this format:
+;; uni-name.el, uni-category.el, uni-combining.el, uni-bidi.el,
+;; uni-decomposition.el, uni-decimal.el, uni-digit.el, uni-numeric.el,
+;; uni-mirrored.el, uni-old-name.el, uni-comment.el, uni-uppercase.el,
+;; uni-lowercase.el, uni-titlecase.el
+;; They each contain a single form of this format:
;; (char-code-property-register PROP CHAR-TABLE)
;; where PROP is the same as above, and CHAR-TABLE is a
;; char-table containing property values in a compressed format.
(or (file-readable-p unidata-text-file)
(error "File not readable: %s" unidata-text-file))
(with-temp-buffer
+ ;; Insert a file of this format:
+ ;; (CHAR NAME CATEGORY ...)
+ ;; where CHAR is a character code, the following elements are strings
+ ;; representing character properties.
(insert-file-contents unidata-text-file)
(goto-char (point-min))
(condition-case nil
;; Check this kind of block.
;; 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
- ;; 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
+ ;; 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
(if (and (= (aref name 0) ?<)
(string-match ", First>$" name))
(let ((first char)
"Unicode decomposition mapping.
Property value is a list of characters. The first element may be
one of these symbols representing compatibility formatting tag:
- <font>, <noBreak>, <initial>, <medial>, <final>, <isolated>, <circle>,
- <super>, <sub>, <vertical>, <wide>, <narrow>, <small>, <square>, <fraction>,
- <compat>"
+ font, noBreak, initial, medial, final, isolated, circle, super,
+ sub, vertical, wide, narrow, small, square, fraction, compat"
unidata-describe-decomposition)
(decimal-digit-value
6 unidata-gen-table-integer "uni-decimal.el"
"Unicode numeric value (digit).
Property value is an integer.")
(numeric-value
- 8 unidata-gen-table-symbol "uni-numeric.el"
+ 8 unidata-gen-table-numeric "uni-numeric.el"
"Unicode numeric value (numeric).
-Property value is an symbol.")
+Property value is an integer or a floating point.")
(mirrored
9 unidata-gen-table-symbol "uni-mirrored.el"
"Unicode bidi mirrored flag.
;; a char-table described here to store such values.
;;
;; If succeeding 128 characters has no property, a char-table has the
-;; symbol t is for them. Otherwise a char-table has a string of the
+;; symbol t for them. Otherwise a char-table has a string of the
;; following format for them.
;;
;; The first character of the string is FIRST-INDEX.
(defun unidata-put-character (char val table)
(or (characterp val)
(not val)
- (error "Not an character nor nil: %S" val))
+ (error "Not a character nor nil: %S" val))
(let ((current-val (aref table char)))
(unless (eq current-val val)
(if (stringp current-val)
this-val))
((> val 0)
(aref val-table (1- val))))))
-
+
;; Return a integer-type character property value of CHAR. VAL is the
;; current value of (aref TABLE CHAR).
(setq first-char (1+ first-char))))
this-val)))))
+;; Return a numeric-type (integer or float) character property value
+;; of CHAR. VAL is the current value of (aref TABLE CHAR).
+
+(defun unidata-get-numeric (char val table)
+ (cond
+ ((numberp val)
+ val)
+ ((stringp val)
+ (let ((val-table (char-table-extra-slot table 4))
+ (first-char (lsh (lsh char -7) 7))
+ (str val)
+ (len (length val))
+ (idx 0)
+ this-val count)
+ (while (< idx len)
+ (setq val (aref str idx) idx (1+ idx)
+ count (if (< idx len) (aref str idx) 1))
+ (setq val (and (> val 0) (aref val-table (1- val)))
+ count (if (< count 128)
+ 1
+ (prog1 (- count 128) (setq idx (1+ idx)))))
+ (dotimes (i count)
+ (aset table first-char val)
+ (if (= first-char char)
+ (setq this-val val))
+ (setq first-char (1+ first-char))))
+ this-val))))
+
;; Store VAL (symbol) as a character property value of CHAR in TABLE.
(defun unidata-put-symbol (char val table)
(funcall (char-table-extra-slot table 1) char current-val table))
(aset table char val))))
+;; Store VAL (integer or float) as a character property value of CHAR
+;; in TABLE.
+
+(defun unidata-put-numeric (char val table)
+ (or (numberp val)
+ (not val)
+ (error "Not a number nor nil: %S" val))
+ (let ((current-val (aref table char)))
+ (unless (equal current-val val)
+ (if (stringp current-val)
+ (funcall (char-table-extra-slot table 1) char current-val table))
+ (aset table char val))))
+
;; Encode the character property value VAL into an integer value by
;; VAL-LIST. By side effect, VAL-LIST is modified.
;; VAL-LIST has this form:
;; (t (VAL . (1+ VAL-CODE1)) (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...)
(defun unidata-encode-val (val-list val)
- (let ((slot (assq val val-list))
+ (let ((slot (assoc val val-list))
val-code)
(if slot
(cdr slot)
(prop-idx (unidata-prop-index prop))
(val-list (list t))
(vec (make-vector 128 0))
- tail elt range val val-code idx slot)
+ tail elt range val val-code idx slot
+ prev-range-data)
(set-char-table-range table (cons 0 (max-char)) default-value)
(setq tail unidata-list)
(while tail
val (funcall val-func (nth prop-idx elt)))
(setq val-code (if val (unidata-encode-val val-list val)))
(if (consp range)
- (if val-code
- (set-char-table-range table range val))
+ (when val-code
+ (set-char-table-range table range val)
+ (let ((from (car range)) (to (cdr range)))
+ ;; If RANGE doesn't end at the char-table boundary (each
+ ;; 128 characters), we may have to carry over the data
+ ;; for the last several characters (at most 127 chars)
+ ;; to the next loop. In that case, set PREV-RANGE-DATA
+ ;; to ((FROM . TO) . VAL-CODE) where (FROM . TO)
+ ;; specifies the range of characters handled in the next
+ ;; loop.
+ (when (< (logand to #x7F) #x7F)
+ (if (< from (logand to #x1FFF80))
+ (setq from (logand to #x1FFF80)))
+ (setq prev-range-data (cons (cons from to) val-code)))))
(let* ((start (lsh (lsh range -7) 7))
(limit (+ start 127))
str count new-val)
(fillarray vec 0)
+ ;; See the comment above.
+ (when (and prev-range-data
+ (>= (cdr (car prev-range-data)) start))
+ (let ((from (car (car prev-range-data)))
+ (to (cdr (car prev-range-data)))
+ (vcode (cdr prev-range-data)))
+ (while (<= from to)
+ (aset vec (- from start) vcode)
+ (setq from (1+ from)))))
+ (setq prev-range-data nil)
(if val-code
(aset vec (- range start) val-code))
(while (and (setq elt (car tail) range (car elt))
(set-char-table-range table (cons start limit) str))))))
(setq val-list (nreverse (cdr val-list)))
- (set-char-table-extra-slot table 0 prop)
+ (set-char-table-extra-slot table 0 prop)
(set-char-table-extra-slot table 4 (vconcat (mapcar 'car val-list)))
table))
(set-char-table-extra-slot table 2 (symbol-function 'unidata-put-integer))
table))
+(defun unidata-gen-table-numeric (prop)
+ (let ((table (unidata-gen-table prop
+ #'(lambda (x)
+ (if (string-match "/" x)
+ (/ (float (string-to-number x))
+ (string-to-number
+ (substring x (match-end 0))))
+ (if (> (length x) 0)
+ (string-to-number x))))
+ t)))
+ (byte-compile 'unidata-get-numeric)
+ (byte-compile 'unidata-put-numeric)
+ (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-numeric))
+ (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-numeric))
+ table))
+
\f
;; WORD-LIST TABLE
(while (and l1 (eq (car l1) (car l2)))
(setq beg (1+ beg)
l1 (cdr l1) len1 (1- len1) l2 (cdr l2) len2 (1- len2)))
- (while (and (< end len1) (< end len2)
+ (while (and (< end len1) (< end len2)
(eq (nth (- len1 end 1) l1) (nth (- len2 end 1) l2)))
(setq end (1+ end))))
(if (= (+ beg end) 0)
;; CHAR).
(defun unidata-get-name (char val table)
- (cond
+ (cond
((stringp val)
(if (> (aref val 0) 0)
val
(setq word-list (nconc word-list (list (car l)))
l (cdr l))))))
(setq word-list
- (nconc word-list
+ (nconc word-list
(list (symbol-name
(unidata-decode-word c word-table))))
i (1+ i))))
(V (/ (% char 588) 28))
;; TIndex = SIndex % TCount
(T (% char 28)))
- (format "HANGUL SYLLABLE %s%s%s"
+ (format "HANGUL SYLLABLE %s%s%s"
;; U+110B is nil in this table.
(or (aref (aref jamo-name-table 0) L) "")
(aref (aref jamo-name-table 1) V)
(setq word-list (nconc word-list (list (car l)))
l (cdr l))))))
(setq word-list
- (nconc word-list
+ (nconc word-list
(list (or (unidata-decode-word c word-table) c)))
i (1+ i))))
(if (or word-list tail-list)
(L (+ #x1100 (/ char 588)))
;; V = VBase + (SIndex % NCount) * TCount
(V (+ #x1161 (/ (% char 588) 28)))
+ ;; LV = SBase + (SIndex / TCount) * TCount
+ (LV (+ #xAC00 (* (/ char 28) 28)))
;; T = TBase + SIndex % TCount
(T (+ #x11A7 (% char 28))))
(if (= T #x11A7)
(list L V)
- (list L V T))))
+ (list LV T))))
))
(if (= c 32)
(setq l (cons (intern (substring str idx i)) l)
idx (1+ i))
- (if (and (= c ?-) (< idx i)
+ (if (and (= c ?-) (< idx i)
(< (1+ i) len) (/= (aref str (1+ i)) 32))
(setq l (cons '- (cons (intern (substring str idx i)) l))
idx (1+ i)))))
(setq c (aref str i))
(if (= c 32)
(setq l (if (= (aref str idx) ?<)
- (cons (intern (substring str idx i)) l)
+ (cons (intern (substring str (1+ idx) (1- i))) l)
(cons (string-to-number (substring str idx i) 16) l))
idx (1+ i))))
(if (= (aref str idx) ?<)
- (setq l (cons (intern (substring str idx len)) l))
+ (setq l (cons (intern (substring str (1+ idx) (1- len))) l))
(setq l (cons (string-to-number (substring str idx len) 16) l)))
(nreverse l)))))
(byte-compile 'unidata-put-decomposition)
(set-char-table-extra-slot table 1
(symbol-function 'unidata-get-decomposition))
- (set-char-table-extra-slot table 2
+ (set-char-table-extra-slot table 2
(symbol-function 'unidata-put-decomposition))
(set-char-table-extra-slot table 4 (car word-tables))
table))
(ON . "Other Neutrals")))))
(defun unidata-describe-decomposition (val)
- (mapconcat #'(lambda (x) (if (symbolp x) (symbol-name x) (string ?' x ?')))
- val " "))
+ (mapconcat
+ #'(lambda (x)
+ (if (symbolp x) (symbol-name x)
+ (concat (string ?')
+ (compose-string (string x) 0 1 (string ?\t x ?\t))
+ (string ?'))))
+ val " "))
;; Verify if we can retrieve correct values from the generated
;; char-tables.
(let* ((prop (car elt))
(index (unidata-prop-index prop))
(generator (unidata-prop-generator prop))
- (table (progn
+ (table (progn
(message "Generating %S table..." prop)
(funcall generator prop)))
(decoder (char-table-extra-slot table 1))
(message "%S %04X" prop check)
(setq check (+ check #x400)))
(or (equal val1 val2)
- (insert (format "> %04X %S\n< %04X %S\n"
+ (insert (format "> %04X %S\n< %04X %S\n"
char val1 char val2)))
(sit-for 0)))))))
(setq unidata-text-file (car command-line-args-left)
command-line-args-left (cdr command-line-args-left)))
(unidata-setup-list unidata-text-file)
- (let ((coding-system-for-write 'utf-8)
+ (let ((coding-system-for-write 'utf-8-unix)
(charprop-file "charprop.el"))
(with-temp-file charprop-file
(insert ";; Automatically generated by unidata-gen.el.\n")
(byte-compile describer)
(setq describer (symbol-function describer)))
(set-char-table-extra-slot table 3 describer))
- (insert ";; Automatically generated from UnicodeData.txt.\n"
+ (insert ";; Copyright (C) 1991-2009, 2010 Unicode, Inc.
+;; This file was generated from the Unicode data file at
+;; http://www.unicode.org/Public/UNIDATA/UnicodeData.txt.
+;; See lisp/international/README for the copyright and permission notice.\n"
(format "(define-char-code-property '%S %S %S)\n"
prop table docstring)
";; Local Variables:\n"