X-Git-Url: http://git.hcoop.net/bpt/emacs.git/blobdiff_plain/dd559368b0db67654f643320b1d84afdabe60e97..3b59c3511cb74d944730b8156bbfd3bd7d8aa69f:/admin/unidata/unidata-gen.el?ds=sidebyside diff --git a/admin/unidata/unidata-gen.el b/admin/unidata/unidata-gen.el index 75d7e2d98f..211c6f0a53 100644 --- a/admin/unidata/unidata-gen.el +++ b/admin/unidata/unidata-gen.el @@ -1,14 +1,14 @@ ;; unidata-gen.el -- Create files containing character property data. -;; Copyright (C) 2005 +;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 ;; National Institute of Advanced Industrial Science and Technology (AIST) ;; Registration Number H13PRO009 ;; This file is part of GNU Emacs. -;; GNU Emacs is free software; you can redistribute it and/or modify +;; GNU Emacs is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by -;; the Free Software Foundation; either version 2, or (at your option) -;; any later version. +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. ;; GNU Emacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -16,9 +16,7 @@ ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License -;; along with GNU Emacs; see the file COPYING. If not, write to the -;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, -;; Boston, MA 02111-1307, USA. +;; along with GNU Emacs. If not, see . ;;; Commentary: @@ -40,8 +38,11 @@ ;; (name, generic-category, etc), and FILE is a name of one of ;; the following files. ;; -;; uni-name.el, uni-cat.el, uni-comb.el, uni-bidi.el -;; It contains a single form of this format: +;; uni-name.el, uni-category.el, uni-combining.el, uni-bidi.el, +;; uni-decomposition.el, uni-decimal.el, uni-digit.el, uni-numeric.el, +;; uni-mirrored.el, uni-old-name.el, uni-comment.el, uni-uppercase.el, +;; uni-lowercase.el, uni-titlecase.el +;; They each contain a single form of this format: ;; (char-code-property-register PROP CHAR-TABLE) ;; where PROP is the same as above, and CHAR-TABLE is a ;; char-table containing property values in a compressed format. @@ -92,6 +93,10 @@ (or (file-readable-p unidata-text-file) (error "File not readable: %s" unidata-text-file)) (with-temp-buffer + ;; Insert a file of this format: + ;; (CHAR NAME CATEGORY ...) + ;; where CHAR is a character code, the following elements are strings + ;; representing character properties. (insert-file-contents unidata-text-file) (goto-char (point-min)) (condition-case nil @@ -102,7 +107,7 @@ ;; Check this kind of block. ;; 4E00;;Lo;0;L;;;;;N;;;;; - ;; 9FA5;;Lo;0;L;;;;;N;;;;; + ;; 9FCB;;Lo;0;L;;;;;N;;;;; (if (and (= (aref name 0) ?<) (string-match ", First>$" name)) (let ((first char) @@ -165,9 +170,8 @@ Property value is one of the following symbols: "Unicode decomposition mapping. Property value is a list of characters. The first element may be one of these symbols representing compatibility formatting tag: - , , , , , , , - , , , , , , , , - " + font, noBreak, initial, medial, final, isolated, circle, super, + sub, vertical, wide, narrow, small, square, fraction, compat" unidata-describe-decomposition) (decimal-digit-value 6 unidata-gen-table-integer "uni-decimal.el" @@ -178,9 +182,9 @@ Property value is an integer.") "Unicode numeric value (digit). Property value is an integer.") (numeric-value - 8 unidata-gen-table-symbol "uni-numeric.el" + 8 unidata-gen-table-numeric "uni-numeric.el" "Unicode numeric value (numeric). -Property value is an symbol.") +Property value is an integer or a floating point.") (mirrored 9 unidata-gen-table-symbol "uni-mirrored.el" "Unicode bidi mirrored flag. @@ -224,7 +228,7 @@ Property value is a character." ;; a char-table described here to store such values. ;; ;; If succeeding 128 characters has no property, a char-table has the -;; symbol t is for them. Otherwise a char-table has a string of the +;; symbol t for them. Otherwise a char-table has a string of the ;; following format for them. ;; ;; The first character of the string is FIRST-INDEX. @@ -255,7 +259,7 @@ Property value is a character." (defun unidata-put-character (char val table) (or (characterp val) (not val) - (error "Not an character nor nil: %S" val)) + (error "Not a character nor nil: %S" val)) (let ((current-val (aref table char))) (unless (eq current-val val) (if (stringp current-val) @@ -364,7 +368,7 @@ Property value is a character." this-val)) ((> val 0) (aref val-table (1- val)))))) - + ;; Return a integer-type character property value of CHAR. VAL is the ;; current value of (aref TABLE CHAR). @@ -392,6 +396,34 @@ Property value is a character." (setq first-char (1+ first-char)))) this-val))))) +;; Return a numeric-type (integer or float) character property value +;; of CHAR. VAL is the current value of (aref TABLE CHAR). + +(defun unidata-get-numeric (char val table) + (cond + ((numberp val) + val) + ((stringp val) + (let ((val-table (char-table-extra-slot table 4)) + (first-char (lsh (lsh char -7) 7)) + (str val) + (len (length val)) + (idx 0) + this-val count) + (while (< idx len) + (setq val (aref str idx) idx (1+ idx) + count (if (< idx len) (aref str idx) 1)) + (setq val (and (> val 0) (aref val-table (1- val))) + count (if (< count 128) + 1 + (prog1 (- count 128) (setq idx (1+ idx))))) + (dotimes (i count) + (aset table first-char val) + (if (= first-char char) + (setq this-val val)) + (setq first-char (1+ first-char)))) + this-val)))) + ;; Store VAL (symbol) as a character property value of CHAR in TABLE. (defun unidata-put-symbol (char val table) @@ -415,6 +447,19 @@ Property value is a character." (funcall (char-table-extra-slot table 1) char current-val table)) (aset table char val)))) +;; Store VAL (integer or float) as a character property value of CHAR +;; in TABLE. + +(defun unidata-put-numeric (char val table) + (or (numberp val) + (not val) + (error "Not a number nor nil: %S" val)) + (let ((current-val (aref table char))) + (unless (equal current-val val) + (if (stringp current-val) + (funcall (char-table-extra-slot table 1) char current-val table)) + (aset table char val)))) + ;; Encode the character property value VAL into an integer value by ;; VAL-LIST. By side effect, VAL-LIST is modified. ;; VAL-LIST has this form: @@ -424,7 +469,7 @@ Property value is a character." ;; (t (VAL . (1+ VAL-CODE1)) (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...) (defun unidata-encode-val (val-list val) - (let ((slot (assq val val-list)) + (let ((slot (assoc val val-list)) val-code) (if slot (cdr slot) @@ -439,7 +484,8 @@ Property value is a character." (prop-idx (unidata-prop-index prop)) (val-list (list t)) (vec (make-vector 128 0)) - tail elt range val val-code idx slot) + tail elt range val val-code idx slot + prev-range-data) (set-char-table-range table (cons 0 (max-char)) default-value) (setq tail unidata-list) (while tail @@ -448,12 +494,34 @@ Property value is a character." val (funcall val-func (nth prop-idx elt))) (setq val-code (if val (unidata-encode-val val-list val))) (if (consp range) - (if val-code - (set-char-table-range table range val)) + (when val-code + (set-char-table-range table range val) + (let ((from (car range)) (to (cdr range))) + ;; If RANGE doesn't end at the char-table boundary (each + ;; 128 characters), we may have to carry over the data + ;; for the last several characters (at most 127 chars) + ;; to the next loop. In that case, set PREV-RANGE-DATA + ;; to ((FROM . TO) . VAL-CODE) where (FROM . TO) + ;; specifies the range of characters handled in the next + ;; loop. + (when (< (logand to #x7F) #x7F) + (if (< from (logand to #x1FFF80)) + (setq from (logand to #x1FFF80))) + (setq prev-range-data (cons (cons from to) val-code))))) (let* ((start (lsh (lsh range -7) 7)) (limit (+ start 127)) str count new-val) (fillarray vec 0) + ;; See the comment above. + (when (and prev-range-data + (>= (cdr (car prev-range-data)) start)) + (let ((from (car (car prev-range-data))) + (to (cdr (car prev-range-data))) + (vcode (cdr prev-range-data))) + (while (<= from to) + (aset vec (- from start) vcode) + (setq from (1+ from))))) + (setq prev-range-data nil) (if val-code (aset vec (- range start) val-code)) (while (and (setq elt (car tail) range (car elt)) @@ -492,7 +560,7 @@ Property value is a character." (set-char-table-range table (cons start limit) str)))))) (setq val-list (nreverse (cdr val-list))) - (set-char-table-extra-slot table 0 prop) + (set-char-table-extra-slot table 0 prop) (set-char-table-extra-slot table 4 (vconcat (mapcar 'car val-list))) table)) @@ -518,6 +586,22 @@ Property value is a character." (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-integer)) table)) +(defun unidata-gen-table-numeric (prop) + (let ((table (unidata-gen-table prop + #'(lambda (x) + (if (string-match "/" x) + (/ (float (string-to-number x)) + (string-to-number + (substring x (match-end 0)))) + (if (> (length x) 0) + (string-to-number x)))) + t))) + (byte-compile 'unidata-get-numeric) + (byte-compile 'unidata-put-numeric) + (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-numeric)) + (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-numeric)) + table)) + ;; WORD-LIST TABLE @@ -556,7 +640,7 @@ Property value is a character." (while (and l1 (eq (car l1) (car l2))) (setq beg (1+ beg) l1 (cdr l1) len1 (1- len1) l2 (cdr l2) len2 (1- len2))) - (while (and (< end len1) (< end len2) + (while (and (< end len1) (< end len2) (eq (nth (- len1 end 1) l1) (nth (- len2 end 1) l2))) (setq end (1+ end)))) (if (= (+ beg end) 0) @@ -628,7 +712,7 @@ Property value is a character." ;; CHAR). (defun unidata-get-name (char val table) - (cond + (cond ((stringp val) (if (> (aref val 0) 0) val @@ -659,7 +743,7 @@ Property value is a character." (setq word-list (nconc word-list (list (car l))) l (cdr l)))))) (setq word-list - (nconc word-list + (nconc word-list (list (symbol-name (unidata-decode-word c word-table)))) i (1+ i)))) @@ -698,7 +782,7 @@ Property value is a character." (V (/ (% char 588) 28)) ;; TIndex = SIndex % TCount (T (% char 28))) - (format "HANGUL SYLLABLE %s%s%s" + (format "HANGUL SYLLABLE %s%s%s" ;; U+110B is nil in this table. (or (aref (aref jamo-name-table 0) L) "") (aref (aref jamo-name-table 1) V) @@ -754,7 +838,7 @@ Property value is a character." (setq word-list (nconc word-list (list (car l))) l (cdr l)))))) (setq word-list - (nconc word-list + (nconc word-list (list (or (unidata-decode-word c word-table) c))) i (1+ i)))) (if (or word-list tail-list) @@ -771,11 +855,13 @@ Property value is a character." (L (+ #x1100 (/ char 588))) ;; V = VBase + (SIndex % NCount) * TCount (V (+ #x1161 (/ (% char 588) 28))) + ;; LV = SBase + (SIndex / TCount) * TCount + (LV (+ #xAC00 (* (/ char 28) 28))) ;; T = TBase + SIndex % TCount (T (+ #x11A7 (% char 28)))) (if (= T #x11A7) (list L V) - (list L V T)))) + (list LV T)))) )) @@ -933,7 +1019,7 @@ Property value is a character." (if (= c 32) (setq l (cons (intern (substring str idx i)) l) idx (1+ i)) - (if (and (= c ?-) (< idx i) + (if (and (= c ?-) (< idx i) (< (1+ i) len) (/= (aref str (1+ i)) 32)) (setq l (cons '- (cons (intern (substring str idx i)) l)) idx (1+ i))))) @@ -969,11 +1055,11 @@ Property value is a character." (setq c (aref str i)) (if (= c 32) (setq l (if (= (aref str idx) ?<) - (cons (intern (substring str idx i)) l) + (cons (intern (substring str (1+ idx) (1- i))) l) (cons (string-to-number (substring str idx i) 16) l)) idx (1+ i)))) (if (= (aref str idx) ?<) - (setq l (cons (intern (substring str idx len)) l)) + (setq l (cons (intern (substring str (1+ idx) (1- len))) l)) (setq l (cons (string-to-number (substring str idx len) 16) l))) (nreverse l))))) @@ -985,7 +1071,7 @@ Property value is a character." (byte-compile 'unidata-put-decomposition) (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-decomposition)) - (set-char-table-extra-slot table 2 + (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-decomposition)) (set-char-table-extra-slot table 4 (car word-tables)) table)) @@ -1077,8 +1163,13 @@ Property value is a character." (ON . "Other Neutrals"))))) (defun unidata-describe-decomposition (val) - (mapconcat #'(lambda (x) (if (symbolp x) (symbol-name x) (string ?' x ?'))) - val " ")) + (mapconcat + #'(lambda (x) + (if (symbolp x) (symbol-name x) + (concat (string ?') + (compose-string (string x) 0 1 (string ?\t x ?\t)) + (string ?')))) + val " ")) ;; Verify if we can retrieve correct values from the generated ;; char-tables. @@ -1088,7 +1179,7 @@ Property value is a character." (let* ((prop (car elt)) (index (unidata-prop-index prop)) (generator (unidata-prop-generator prop)) - (table (progn + (table (progn (message "Generating %S table..." prop) (funcall generator prop))) (decoder (char-table-extra-slot table 1)) @@ -1114,7 +1205,7 @@ Property value is a character." (message "%S %04X" prop check) (setq check (+ check #x400))) (or (equal val1 val2) - (insert (format "> %04X %S\n< %04X %S\n" + (insert (format "> %04X %S\n< %04X %S\n" char val1 char val2))) (sit-for 0))))))) @@ -1126,7 +1217,7 @@ Property value is a character." (setq unidata-text-file (car command-line-args-left) command-line-args-left (cdr command-line-args-left))) (unidata-setup-list unidata-text-file) - (let ((coding-system-for-write 'utf-8) + (let ((coding-system-for-write 'utf-8-unix) (charprop-file "charprop.el")) (with-temp-file charprop-file (insert ";; Automatically generated by unidata-gen.el.\n") @@ -1150,7 +1241,10 @@ Property value is a character." (byte-compile describer) (setq describer (symbol-function describer))) (set-char-table-extra-slot table 3 describer)) - (insert ";; Automatically generated from UnicodeData.txt.\n" + (insert ";; Copyright (C) 1991-2009, 2010 Unicode, Inc. +;; This file was generated from the Unicode data file at +;; http://www.unicode.org/Public/UNIDATA/UnicodeData.txt. +;; See lisp/international/README for the copyright and permission notice.\n" (format "(define-char-code-property '%S %S %S)\n" prop table docstring) ";; Local Variables:\n"