;;
;; Characters from other character sets can be encoded with mule-utf-8
;; by populating the translation table
-;; `utf-translation-table-for-encode' and registering the translation
-;; with `register-char-codings'. Hash tables
+;; `utf-translation-table-for-encode'. Hash tables
;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
;; used to support encoding and decoding of about a quarter of the CJK
;; space between U+3400 and U+DFFF.
(define-translation-table 'utf-translation-table-for-decode)
-(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq :size 43000
- :rehash-size 1000)
+(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
"Hash table mapping Emacs CJK character sets to Unicode code points.
If `utf-translate-cjk' is non-nil, this table populates the
translation-hash-table named `utf-subst-table-for-encode'.")
-(define-translation-hash-table 'utf-subst-table-for-encode
- (make-hash-table :test 'eq :size 43000 :rehash-size 1000))
+(define-translation-hash-table 'utf-subst-table-for-encode
+ ucs-mule-cjk-to-unicode)
-(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq :size 43000
- :rehash-size 1000)
+(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
"Hash table mapping Unicode code points to Emacs CJK character sets.
If `utf-translate-cjk' is non-nil, this table populates the
translation-hash-table named `utf-subst-table-for-decode'.")
(define-translation-hash-table 'utf-subst-table-for-decode
- (make-hash-table :test 'eq :size 21500 :rehash-size 200))
+ ucs-unicode-to-mule-cjk)
(mapc
(lambda (pair)
'translation-table)
ucs-mule-to-mule-unicode)
(define-translation-table 'utf-translation-table-for-encode
- utf-defragmentation-table)
- (dolist (coding '(mule-utf-8 mule-utf-16-be mule-utf-16-le))
- (register-char-codings coding utf-defragmentation-table))))
+ utf-defragmentation-table)))
(define-translation-table 'utf-translation-table-for-decode)
;; When unify-8859-on-encoding-mode is off, be sure to make
;; mule-utf-* disabled for characters in
(unless (eq (get 'utf-translation-table-for-encode
'translation-table)
ucs-mule-to-mule-unicode)
- (define-translation-table 'utf-translation-table-for-encode)
- (map-char-table
- (lambda (key val)
- (if (and (>= key 128) val)
- (aset char-coding-system-table key
- (delq 'mule-utf-8
- (delq 'mule-utf-16-le
- (delq 'mule-utf-16-be
- (aref char-coding-system-table
- key)))))))
- utf-defragmentation-table)))
+ (define-translation-table 'utf-translation-table-for-encode)))
(set-default s v))
:version "21.4"
:type 'boolean
:group 'mule)
-(defcustom utf-translate-cjk nil
+(define-minor-mode utf-translate-cjk-mode
"Whether the UTF based coding systems should decode/encode CJK characters.
Enabling this loads tables which allow the coding systems mule-utf-8,
mule-utf-16-le and mule-utf-16-be to encode characters in the charsets
The tables are large (over 40000 entries), so this option is not the
default. Also, installing them may be rather slow."
- :set (lambda (s v)
- (if v
- (progn
- ;; Load the files explicitly, to avoid having to keep
- ;; around the large tables they contain (as well as the
- ;; ones which get built).
- (cond
- ((string= "Korean" current-language-environment)
- (load "subst-jis")
- (load "subst-big5")
- (load "subst-gb2312")
- (load "subst-ksc"))
- ((string= "Chinese-BIG5" current-language-environment)
- (load "subst-jis")
- (load "subst-ksc")
- (load "subst-gb2312")
- (load "subst-big5"))
- ((string= "Chinese-GB" current-language-environment)
- (load "subst-jis")
- (load "subst-ksc")
- (load "subst-big5")
- (load "subst-gb2312"))
- (t
- (load "subst-ksc")
- (load "subst-gb2312")
- (load "subst-big5")
- (load "subst-jis"))) ; jis covers as much as big5, gb2312
- (let ((table (make-char-table 'translation-table)))
- (maphash (lambda (k v)
- (aset table k t))
- ucs-mule-cjk-to-unicode)
- (register-char-codings 'mule-utf-8 table)
- (register-char-codings 'mule-utf-16-le table)
- (register-char-codings 'mule-utf-16-be table))
- (define-translation-hash-table 'utf-subst-table-for-decode
- ucs-unicode-to-mule-cjk)
- (define-translation-hash-table 'utf-subst-table-for-encode
- ucs-mule-cjk-to-unicode))
- (map-char-table
- (lambda (k v)
- (if (gethash k ucs-mule-cjk-to-unicode)
- (aset char-coding-system-table k
- (delq 'mule-utf-8
- (delq 'mule-utf-16-le
- (delq 'mule-utf-16-be v))))))
- char-coding-system-table)
- (define-translation-hash-table 'utf-subst-table-for-decode
- (make-hash-table :test 'eq))
- (define-translation-hash-table 'utf-subst-table-for-encode
- (make-hash-table :test 'eq)))
- (set-default s v))
+ :init-value nil
:version "21.4"
:type 'boolean
:set-after '(current-language-environment)
- :group 'mule)
+ :group 'mule
+ :global t
+ (if utf-translate-cjk-mode
+ ;; Fixme: Allow the use of the CJK charsets to be
+ ;; customized by reordering and possible omission.
+ (progn
+ ;; Redefine them with realistic initial sizes and a
+ ;; smallish rehash size to avoid wasting significant
+ ;; space after they're built.
+ (setq ucs-mule-cjk-to-unicode
+ (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
+ ucs-unicode-to-mule-cjk
+ (make-hash-table :test 'eq :size 43000 :rehash-size 1000))
+ ;; Load the files explicitly, to avoid having to keep
+ ;; around the large tables they contain (as well as the
+ ;; ones which get built).
+ (cond
+ ((string= "Korean" current-language-environment)
+ (load "subst-jis")
+ (load "subst-big5")
+ (load "subst-gb2312")
+ (load "subst-ksc"))
+ ((string= "Chinese-BIG5" current-language-environment)
+ (load "subst-jis")
+ (load "subst-ksc")
+ (load "subst-gb2312")
+ (load "subst-big5"))
+ ((string= "Chinese-GB" current-language-environment)
+ (load "subst-jis")
+ (load "subst-ksc")
+ (load "subst-big5")
+ (load "subst-gb2312"))
+ (t
+ (load "subst-ksc")
+ (load "subst-gb2312")
+ (load "subst-big5")
+ (load "subst-jis"))) ; jis covers as much as big5, gb2312
+ (let ((table (make-char-table 'translation-table)))
+ (maphash (lambda (k v)
+ (aset table k t))
+ ucs-mule-cjk-to-unicode)
+ (define-translation-hash-table 'utf-subst-table-for-decode
+ ucs-unicode-to-mule-cjk)
+ (define-translation-hash-table 'utf-subst-table-for-encode
+ ucs-mule-cjk-to-unicode))
+ (define-translation-hash-table 'utf-subst-table-for-decode
+ (make-hash-table :test 'eq))
+ (define-translation-hash-table 'utf-subst-table-for-encode
+ (make-hash-table :test 'eq)))))
(define-ccl-program ccl-decode-mule-utf-8
;;
((r5 = ,(charset-id 'eight-bit-control))
(r6 = ,(charset-id 'eight-bit-graphic))
(loop
+ (r0 = -1)
(read r0)
;; 1byte encoding, i.e., ascii
(if (r0 < #x80)
- (write r0)
+ ((write r0))
(if (r0 < #xc0) ; continuation byte (invalid here)
- (if (r0 < #xa0)
- (write-multibyte-character r5 r0)
- (write-multibyte-character r6 r0))
+ ((if (r0 < #xa0)
+ (write-multibyte-character r5 r0)
+ (write-multibyte-character r6 r0)))
;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
(if (r0 < #xe0)
- ((read r1)
+ ((r1 = -1)
+ (read r1)
(if ((r1 & #b11000000) != #b10000000)
;; Invalid 2-byte sequence
;; 3byte encoding
;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
(if (r0 < #xf0)
- ((read r1 r2)
+ ((r1 = -1)
+ (r2 = -1)
+ (read r1 r2)
;; This is set to 1 if the encoding is invalid.
(r4 = 0)
(if (r2 < #xa0)
(write-multibyte-character r5 r2)
(write-multibyte-character r6 r2))))
-
+
;; mule-unicode-0100-24ff (>= 0800)
((if (r3 < #x2500)
((r0 = ,(charset-id 'mule-unicode-0100-24ff))
(translate-character
utf-translation-table-for-decode r0 r1)
(write-multibyte-character r0 r1))
-
+
;; mule-unicode-2500-33ff
(if (r3 < #x3400)
((r4 = r3) ; don't zap r3
(r3 = r5)
(r3 = r6))
(write-multibyte-character r3 r2))
-
+
;; mule-unicode-e000-ffff
;; Fixme: fffe and ffff are invalid.
((r0 = ,(charset-id 'mule-unicode-e000-ffff))
;; 4byte encoding
;; keep those bytes as eight-bit-{control|graphic}
;; Fixme: allow lookup in utf-subst-table-for-decode.
- ((read r1 r2 r3)
+ ((r1 = -1)
+ (r2 = -1)
+ (r3 = -1)
+ (read r1 r2 r3)
;; r0 > #xf0, thus eight-bit-graphic
(write-multibyte-character r6 r0)
(if (r1 < #xa0)
(write-multibyte-character r5 r3))
(write-multibyte-character r6 r3))
(if (r0 >= #xf8) ; 5- or 6-byte encoding
- ((read r1)
- (if (r1 < #xa0)
- (if (r1 < #x80) ; invalid byte
- (write r1)
- (write-multibyte-character r5 r1))
- (write-multibyte-character r6 r1))
+ ((r0 = -1)
+ (read r0)
+ (if (r0 < #xa0)
+ (if (r0 < #x80) ; invalid byte
+ (write r0)
+ (write-multibyte-character r5 r0))
+ (write-multibyte-character r6 r0))
(if (r0 >= #xfc) ; 6-byte
- ((read r1)
- (if (r1 < #xa0)
- (if (r1 < #x80) ; invalid byte
- (write r1)
- (write-multibyte-character r5 r1))
- (write-multibyte-character r6 r1)))))))
+ ((r0 = -1)
+ (read r0)
+ (if (r0 < #xa0)
+ (if (r0 < #x80) ; invalid byte
+ (write r0)
+ (write-multibyte-character r5 r0))
+ (write-multibyte-character r6 r0)))))))
;; else invalid byte >= #xfe
(write-multibyte-character r6 r0))))))
- (repeat))))
+ (repeat)))
+
+ ;; At EOF...
+ (if (r0 >= 0)
+ ((if (r0 < #x80)
+ (write r0)
+ (if (r0 < #xa0)
+ (write-multibyte-character r5 r0)
+ ((write-multibyte-character r6 r0))))
+ (if (r1 >= 0)
+ ((if (r1 < #x80)
+ (write r1)
+ (if (r1 < #xa0)
+ (write-multibyte-character r5 r1)
+ ((write-multibyte-character r6 r1))))
+ (if (r2 >= 0)
+ ((if (r2 < #x80)
+ (write r2)
+ (if (r2 < #xa0)
+ (write-multibyte-character r5 r2)
+ ((write-multibyte-character r6 r2))))
+ (if (r3 >= 0)
+ (if (r3 < #x80)
+ (write r3)
+ (if (r3 < #xa0)
+ (write-multibyte-character r5 r3)
+ ((write-multibyte-character r6 r3))))))))))))
"CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and