| 1 | ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
| 2 | |
| 3 | ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 |
| 4 | ;; Free Software Foundation, Inc. |
| 5 | ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 |
| 6 | ;; National Institute of Advanced Industrial Science and Technology (AIST) |
| 7 | ;; Registration Number H14PRO021 |
| 8 | |
| 9 | ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
| 10 | ;; Maintainer: FSF |
| 11 | ;; Keywords: multilingual, Unicode, UTF-8, i18n |
| 12 | |
| 13 | ;; This file is part of GNU Emacs. |
| 14 | |
| 15 | ;; GNU Emacs is free software; you can redistribute it and/or modify |
| 16 | ;; it under the terms of the GNU General Public License as published by |
| 17 | ;; the Free Software Foundation; either version 3, or (at your option) |
| 18 | ;; any later version. |
| 19 | |
| 20 | ;; GNU Emacs is distributed in the hope that it will be useful, |
| 21 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 22 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 23 | ;; GNU General Public License for more details. |
| 24 | |
| 25 | ;; You should have received a copy of the GNU General Public License |
| 26 | ;; along with GNU Emacs; see the file COPYING. If not, write to the |
| 27 | ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
| 28 | ;; Boston, MA 02110-1301, USA. |
| 29 | |
| 30 | ;;; Commentary: |
| 31 | |
| 32 | ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
| 33 | ;; of the following character sets to and from UTF-8: |
| 34 | ;; |
| 35 | ;; ascii |
| 36 | ;; eight-bit-control |
| 37 | ;; latin-iso8859-1 |
| 38 | ;; mule-unicode-0100-24ff |
| 39 | ;; mule-unicode-2500-33ff |
| 40 | ;; mule-unicode-e000-ffff |
| 41 | ;; |
| 42 | ;; On decoding, Unicode characters that do not fit into the above |
| 43 | ;; character sets are handled as `eight-bit-control' or |
| 44 | ;; `eight-bit-graphic' characters to retain the information about the |
| 45 | ;; original byte sequence and text properties record the corresponding |
| 46 | ;; unicode. |
| 47 | ;; |
| 48 | ;; Fixme: note that reading and writing invalid utf-8 may not be |
| 49 | ;; idempotent -- to represent the bytes to fix that needs a new charset. |
| 50 | ;; |
| 51 | ;; Characters from other character sets can be encoded with mule-utf-8 |
| 52 | ;; by populating the translation table |
| 53 | ;; `utf-translation-table-for-encode'. Hash tables |
| 54 | ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
| 55 | ;; used to support encoding and decoding of about a quarter of the CJK |
| 56 | ;; space between U+3400 and U+DFFF. |
| 57 | |
| 58 | ;; UTF-8 is defined in RFC 3629. A sketch of the encoding is: |
| 59 | |
| 60 | ;; scalar | utf-8 |
| 61 | ;; value | 1st byte | 2nd byte | 3rd byte |
| 62 | ;; --------------------+-----------+-----------+---------- |
| 63 | ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | |
| 64 | ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | |
| 65 | ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx |
| 66 | |
| 67 | ;;; Code: |
| 68 | |
| 69 | (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil) |
| 70 | "Char table mapping characters to latin-iso8859-1 or mule-unicode-*. |
| 71 | |
| 72 | If `unify-8859-on-encoding-mode' is non-nil, this table populates the |
| 73 | translation-table named `utf-translation-table-for-encode'.") |
| 74 | |
| 75 | (define-translation-table 'utf-translation-table-for-encode) |
| 76 | |
| 77 | |
| 78 | ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the |
| 79 | ;; space of mule-unicode. For Latin scripts this isn't very |
| 80 | ;; important. Hebrew and Arabic might go here too when there's proper |
| 81 | ;; support for them. |
| 82 | |
| 83 | (defvar utf-fragmentation-table (make-char-table 'translation-table nil) |
| 84 | "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*. |
| 85 | |
| 86 | If `utf-fragment-on-decoding' is non-nil, this table populates the |
| 87 | translation-table named `utf-translation-table-for-decode'") |
| 88 | |
| 89 | (defvar utf-defragmentation-table (make-char-table 'translation-table nil) |
| 90 | "Char-table for reverse mapping of `utf-fragmentation-table'. |
| 91 | |
| 92 | If `utf-fragment-on-decoding' is non-nil and |
| 93 | `unify-8859-on-encoding-mode' is nil, this table populates the |
| 94 | translation-table named `utf-translation-table-for-encode'") |
| 95 | |
| 96 | (define-translation-table 'utf-translation-table-for-decode) |
| 97 | |
| 98 | |
| 99 | (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
| 100 | "Hash table mapping Emacs CJK character sets to Unicode code points. |
| 101 | |
| 102 | If `utf-translate-cjk-mode' is non-nil, this table populates the |
| 103 | translation-hash-table named `utf-subst-table-for-encode'.") |
| 104 | |
| 105 | (define-translation-hash-table 'utf-subst-table-for-encode |
| 106 | ucs-mule-cjk-to-unicode) |
| 107 | |
| 108 | (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) |
| 109 | "Hash table mapping Unicode code points to Emacs CJK character sets. |
| 110 | |
| 111 | If `utf-translate-cjk-mode' is non-nil, this table populates the |
| 112 | translation-hash-table named `utf-subst-table-for-decode'.") |
| 113 | |
| 114 | (define-translation-hash-table 'utf-subst-table-for-decode |
| 115 | ucs-unicode-to-mule-cjk) |
| 116 | |
| 117 | (mapc |
| 118 | (lambda (pair) |
| 119 | (aset utf-fragmentation-table (car pair) (cdr pair)) |
| 120 | (aset utf-defragmentation-table (cdr pair) (car pair))) |
| 121 | '((?\e$,1&d\e(B . ?\e,F4\e(B) (?\e$,1&e\e(B . ?\e,F5\e(B) (?\e$,1&f\e(B . ?\e,F6\e(B) (?\e$,1&h\e(B . ?\e,F8\e(B) (?\e$,1&i\e(B . ?\e,F9\e(B) |
| 122 | (?\e$,1&j\e(B . ?\e,F:\e(B) (?\e$,1&l\e(B . ?\e,F<\e(B) (?\e$,1&n\e(B . ?\e,F>\e(B) (?\e$,1&o\e(B . ?\e,F?\e(B) (?\e$,1&p\e(B . ?\e,F@\e(B) |
| 123 | (?\e$,1&q\e(B . ?\e,FA\e(B) (?\e$,1&r\e(B . ?\e,FB\e(B) (?\e$,1&s\e(B . ?\e,FC\e(B) (?\e$,1&t\e(B . ?\e,FD\e(B) (?\e$,1&u\e(B . ?\e,FE\e(B) |
| 124 | (?\e$,1&v\e(B . ?\e,FF\e(B) (?\e$,1&w\e(B . ?\e,FG\e(B) (?\e$,1&x\e(B . ?\e,FH\e(B) (?\e$,1&y\e(B . ?\e,FI\e(B) (?\e$,1&z\e(B . ?\e,FJ\e(B) |
| 125 | (?\e$,1&{\e(B . ?\e,FK\e(B) (?\e$,1&|\e(B . ?\e,FL\e(B) (?\e$,1&}\e(B . ?\e,FM\e(B) (?\e$,1&~\e(B . ?\e,FN\e(B) (?\e$,1&\7f\e(B . ?\e,FO\e(B) |
| 126 | (?\e$,1' \e(B . ?\e,FP\e(B) (?\e$,1'!\e(B . ?\e,FQ\e(B) (?\e$,1'#\e(B . ?\e,FS\e(B) (?\e$,1'$\e(B . ?\e,FT\e(B) (?\e$,1'%\e(B . ?\e,FU\e(B) |
| 127 | (?\e$,1'&\e(B . ?\e,FV\e(B) (?\e$,1''\e(B . ?\e,FW\e(B) (?\e$,1'(\e(B . ?\e,FX\e(B) (?\e$,1')\e(B . ?\e,FY\e(B) (?\e$,1'*\e(B . ?\e,FZ\e(B) |
| 128 | (?\e$,1'+\e(B . ?\e,F[\e(B) (?\e$,1',\e(B . ?\e,F\\e(B) (?\e$,1'-\e(B . ?\e,F]\e(B) (?\e$,1'.\e(B . ?\e,F^\e(B) (?\e$,1'/\e(B . ?\e,F_\e(B) |
| 129 | (?\e$,1'0\e(B . ?\e,F`\e(B) (?\e$,1'1\e(B . ?\e,Fa\e(B) (?\e$,1'2\e(B . ?\e,Fb\e(B) (?\e$,1'3\e(B . ?\e,Fc\e(B) (?\e$,1'4\e(B . ?\e,Fd\e(B) |
| 130 | (?\e$,1'5\e(B . ?\e,Fe\e(B) (?\e$,1'6\e(B . ?\e,Ff\e(B) (?\e$,1'7\e(B . ?\e,Fg\e(B) (?\e$,1'8\e(B . ?\e,Fh\e(B) (?\e$,1'9\e(B . ?\e,Fi\e(B) |
| 131 | (?\e$,1':\e(B . ?\e,Fj\e(B) (?\e$,1';\e(B . ?\e,Fk\e(B) (?\e$,1'<\e(B . ?\e,Fl\e(B) (?\e$,1'=\e(B . ?\e,Fm\e(B) (?\e$,1'>\e(B . ?\e,Fn\e(B) |
| 132 | (?\e$,1'?\e(B . ?\e,Fo\e(B) (?\e$,1'@\e(B . ?\e,Fp\e(B) (?\e$,1'A\e(B . ?\e,Fq\e(B) (?\e$,1'B\e(B . ?\e,Fr\e(B) (?\e$,1'C\e(B . ?\e,Fs\e(B) |
| 133 | (?\e$,1'D\e(B . ?\e,Ft\e(B) (?\e$,1'E\e(B . ?\e,Fu\e(B) (?\e$,1'F\e(B . ?\e,Fv\e(B) (?\e$,1'G\e(B . ?\e,Fw\e(B) (?\e$,1'H\e(B . ?\e,Fx\e(B) |
| 134 | (?\e$,1'I\e(B . ?\e,Fy\e(B) (?\e$,1'J\e(B . ?\e,Fz\e(B) (?\e$,1'K\e(B . ?\e,F{\e(B) (?\e$,1'L\e(B . ?\e,F|\e(B) (?\e$,1'M\e(B . ?\e,F}\e(B) |
| 135 | (?\e$,1'N\e(B . ?\e,F~\e(B) |
| 136 | |
| 137 | (?\e$,1(!\e(B . ?\e,L!\e(B) (?\e$,1("\e(B . ?\e,L"\e(B) (?\e$,1(#\e(B . ?\e,L#\e(B) (?\e$,1($\e(B . ?\e,L$\e(B) |
| 138 | (?\e$,1(%\e(B . ?\e,L%\e(B) (?\e$,1(&\e(B . ?\e,L&\e(B) (?\e$,1('\e(B . ?\e,L'\e(B) (?\e$,1((\e(B . ?\e,L(\e(B) (?\e$,1()\e(B . ?\e,L)\e(B) |
| 139 | (?\e$,1(*\e(B . ?\e,L*\e(B) (?\e$,1(+\e(B . ?\e,L+\e(B) (?\e$,1(,\e(B . ?\e,L,\e(B) (?\e$,1(.\e(B . ?\e,L.\e(B) (?\e$,1(/\e(B . ?\e,L/\e(B) |
| 140 | (?\e$,1(0\e(B . ?\e,L0\e(B) (?\e$,1(1\e(B . ?\e,L1\e(B) (?\e$,1(2\e(B . ?\e,L2\e(B) (?\e$,1(3\e(B . ?\e,L3\e(B) (?\e$,1(4\e(B . ?\e,L4\e(B) |
| 141 | (?\e$,1(5\e(B . ?\e,L5\e(B) (?\e$,1(6\e(B . ?\e,L6\e(B) (?\e$,1(7\e(B . ?\e,L7\e(B) (?\e$,1(8\e(B . ?\e,L8\e(B) (?\e$,1(9\e(B . ?\e,L9\e(B) |
| 142 | (?\e$,1(:\e(B . ?\e,L:\e(B) (?\e$,1(;\e(B . ?\e,L;\e(B) (?\e$,1(<\e(B . ?\e,L<\e(B) (?\e$,1(=\e(B . ?\e,L=\e(B) (?\e$,1(>\e(B . ?\e,L>\e(B) |
| 143 | (?\e$,1(?\e(B . ?\e,L?\e(B) (?\e$,1(@\e(B . ?\e,L@\e(B) (?\e$,1(A\e(B . ?\e,LA\e(B) (?\e$,1(B\e(B . ?\e,LB\e(B) (?\e$,1(C\e(B . ?\e,LC\e(B) |
| 144 | (?\e$,1(D\e(B . ?\e,LD\e(B) (?\e$,1(E\e(B . ?\e,LE\e(B) (?\e$,1(F\e(B . ?\e,LF\e(B) (?\e$,1(G\e(B . ?\e,LG\e(B) (?\e$,1(H\e(B . ?\e,LH\e(B) |
| 145 | (?\e$,1(I\e(B . ?\e,LI\e(B) (?\e$,1(J\e(B . ?\e,LJ\e(B) (?\e$,1(K\e(B . ?\e,LK\e(B) (?\e$,1(L\e(B . ?\e,LL\e(B) (?\e$,1(M\e(B . ?\e,LM\e(B) |
| 146 | (?\e$,1(N\e(B . ?\e,LN\e(B) (?\e$,1(O\e(B . ?\e,LO\e(B) (?\e$,1(P\e(B . ?\e,LP\e(B) (?\e$,1(Q\e(B . ?\e,LQ\e(B) (?\e$,1(R\e(B . ?\e,LR\e(B) |
| 147 | (?\e$,1(S\e(B . ?\e,LS\e(B) (?\e$,1(T\e(B . ?\e,LT\e(B) (?\e$,1(U\e(B . ?\e,LU\e(B) (?\e$,1(V\e(B . ?\e,LV\e(B) (?\e$,1(W\e(B . ?\e,LW\e(B) |
| 148 | (?\e$,1(X\e(B . ?\e,LX\e(B) (?\e$,1(Y\e(B . ?\e,LY\e(B) (?\e$,1(Z\e(B . ?\e,LZ\e(B) (?\e$,1([\e(B . ?\e,L[\e(B) (?\e$,1(\\e(B . ?\e,L\\e(B) |
| 149 | (?\e$,1(]\e(B . ?\e,L]\e(B) (?\e$,1(^\e(B . ?\e,L^\e(B) (?\e$,1(_\e(B . ?\e,L_\e(B) (?\e$,1(`\e(B . ?\e,L`\e(B) (?\e$,1(a\e(B . ?\e,La\e(B) |
| 150 | (?\e$,1(b\e(B . ?\e,Lb\e(B) (?\e$,1(c\e(B . ?\e,Lc\e(B) (?\e$,1(d\e(B . ?\e,Ld\e(B) (?\e$,1(e\e(B . ?\e,Le\e(B) (?\e$,1(f\e(B . ?\e,Lf\e(B) |
| 151 | (?\e$,1(g\e(B . ?\e,Lg\e(B) (?\e$,1(h\e(B . ?\e,Lh\e(B) (?\e$,1(i\e(B . ?\e,Li\e(B) (?\e$,1(j\e(B . ?\e,Lj\e(B) (?\e$,1(k\e(B . ?\e,Lk\e(B) |
| 152 | (?\e$,1(l\e(B . ?\e,Ll\e(B) (?\e$,1(m\e(B . ?\e,Lm\e(B) (?\e$,1(n\e(B . ?\e,Ln\e(B) (?\e$,1(o\e(B . ?\e,Lo\e(B) (?\e$,1(q\e(B . ?\e,Lq\e(B) |
| 153 | (?\e$,1(r\e(B . ?\e,Lr\e(B) (?\e$,1(s\e(B . ?\e,Ls\e(B) (?\e$,1(t\e(B . ?\e,Lt\e(B) (?\e$,1(u\e(B . ?\e,Lu\e(B) (?\e$,1(v\e(B . ?\e,Lv\e(B) |
| 154 | (?\e$,1(w\e(B . ?\e,Lw\e(B) (?\e$,1(x\e(B . ?\e,Lx\e(B) (?\e$,1(y\e(B . ?\e,Ly\e(B) (?\e$,1(z\e(B . ?\e,Lz\e(B) (?\e$,1({\e(B . ?\e,L{\e(B) |
| 155 | (?\e$,1(|\e(B . ?\e,L|\e(B) (?\e$,1(~\e(B . ?\e,L~\e(B) (?\e$,1(\7f\e(B . ?\e,L\7f\e(B))) |
| 156 | |
| 157 | |
| 158 | (defcustom utf-fragment-on-decoding nil |
| 159 | "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets. |
| 160 | Setting this means that the relevant Cyrillic and Greek characters are |
| 161 | decoded into the iso8859 charsets rather than into |
| 162 | mule-unicode-0100-24ff. The iso8859 charsets take half as much space |
| 163 | in the buffer, but using them may affect how the buffer can be re-encoded |
| 164 | and may require a different input method to search for them, for instance. |
| 165 | See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' |
| 166 | for mechanisms to make this largely transparent. |
| 167 | |
| 168 | Setting this variable outside customize has no effect." |
| 169 | :set (lambda (s v) |
| 170 | (if v |
| 171 | (progn |
| 172 | (define-translation-table 'utf-translation-table-for-decode |
| 173 | utf-fragmentation-table) |
| 174 | ;; Even if unify-8859-on-encoding-mode is off, make |
| 175 | ;; mule-utf-* encode characters in |
| 176 | ;; utf-fragmentation-table. |
| 177 | (unless (eq (get 'utf-translation-table-for-encode |
| 178 | 'translation-table) |
| 179 | ucs-mule-to-mule-unicode) |
| 180 | (define-translation-table 'utf-translation-table-for-encode |
| 181 | utf-defragmentation-table))) |
| 182 | (define-translation-table 'utf-translation-table-for-decode) |
| 183 | ;; When unify-8859-on-encoding-mode is off, be sure to make |
| 184 | ;; mule-utf-* disabled for characters in |
| 185 | ;; utf-fragmentation-table. |
| 186 | (unless (eq (get 'utf-translation-table-for-encode |
| 187 | 'translation-table) |
| 188 | ucs-mule-to-mule-unicode) |
| 189 | (define-translation-table 'utf-translation-table-for-encode))) |
| 190 | (set-default s v)) |
| 191 | :version "22.1" |
| 192 | :type 'boolean |
| 193 | :group 'mule) |
| 194 | |
| 195 | |
| 196 | (defconst utf-translate-cjk-charsets '(chinese-gb2312 |
| 197 | chinese-big5-1 chinese-big5-2 |
| 198 | japanese-jisx0208 japanese-jisx0212 |
| 199 | katakana-jisx0201 |
| 200 | korean-ksc5601) |
| 201 | "List of charsets supported by `utf-translate-cjk-mode'.") |
| 202 | |
| 203 | (defvar utf-translate-cjk-lang-env nil |
| 204 | "Language environment in which tables for `utf-translate-cjk-mode' is loaded. |
| 205 | The value nil means that the tables are not yet loaded.") |
| 206 | |
| 207 | (defvar utf-translate-cjk-unicode-range) |
| 208 | |
| 209 | ;; String generated from utf-translate-cjk-unicode-range. It is |
| 210 | ;; suitable for an argument to skip-chars-forward. |
| 211 | (defvar utf-translate-cjk-unicode-range-string nil) |
| 212 | |
| 213 | (defun utf-translate-cjk-set-unicode-range (range) |
| 214 | (setq utf-translate-cjk-unicode-range range) |
| 215 | (setq utf-translate-cjk-unicode-range-string |
| 216 | (let ((decode-char-no-trans |
| 217 | #'(lambda (x) |
| 218 | (cond ((< x #x100) (make-char 'latin-iso8859-1 x)) |
| 219 | ((< x #x2500) |
| 220 | (setq x (- x #x100)) |
| 221 | (make-char 'mule-unicode-0100-24ff |
| 222 | (+ (/ x 96) 32) (+ (% x 96) 32))) |
| 223 | ((< x #x3400) |
| 224 | (setq x (- x #x2500)) |
| 225 | (make-char 'mule-unicode-2500-33ff |
| 226 | (+ (/ x 96) 32) (+ (% x 96) 32))) |
| 227 | (t |
| 228 | (setq x (- x #xe000)) |
| 229 | (make-char 'mule-unicode-e000-ffff |
| 230 | (+ (/ x 96) 32) (+ (% x 96) 32)))))) |
| 231 | ranges from to) |
| 232 | (dolist (elt range) |
| 233 | (setq from (max #xA0 (car elt)) to (min #xffff (cdr elt))) |
| 234 | (if (and (>= to #x3400) (< to #xE000)) |
| 235 | (setq to #x33FF)) |
| 236 | (cond ((< from #x100) |
| 237 | (if (>= to #xE000) |
| 238 | (setq ranges (cons (cons #xE000 to) ranges) |
| 239 | to #x33FF)) |
| 240 | (if (>= to #x2500) |
| 241 | (setq ranges (cons (cons #x2500 to) ranges) |
| 242 | to #x24FF)) |
| 243 | (if (>= to #x100) |
| 244 | (setq ranges (cons (cons #x100 to) ranges) |
| 245 | to #xFF))) |
| 246 | ((< from #x2500) |
| 247 | (if (>= to #xE000) |
| 248 | (setq ranges (cons (cons #xE000 to) ranges) |
| 249 | to #x33FF)) |
| 250 | (if (>= to #x2500) |
| 251 | (setq ranges (cons (cons #x2500 to) ranges) |
| 252 | to #x24FF))) |
| 253 | ((< from #x3400) |
| 254 | (if (>= to #xE000) |
| 255 | (setq ranges (cons (cons #xE000 to) ranges) |
| 256 | to #x33FF)))) |
| 257 | (if (<= from to) |
| 258 | (setq ranges (cons (cons from to) ranges)))) |
| 259 | (mapconcat #'(lambda (x) |
| 260 | (format "%c-%c" |
| 261 | (funcall decode-char-no-trans (car x)) |
| 262 | (funcall decode-char-no-trans (cdr x)))) |
| 263 | ranges ""))) |
| 264 | ;; These forces loading and settting tables for |
| 265 | ;; utf-translate-cjk-mode. |
| 266 | (setq utf-translate-cjk-lang-env nil |
| 267 | ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
| 268 | ucs-unicode-to-mule-cjk (make-hash-table :test 'eq))) |
| 269 | |
| 270 | (defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3) |
| 271 | (#xff00 . #xffef)) |
| 272 | "List of Unicode code ranges supported by `utf-translate-cjk-mode'. |
| 273 | Setting this variable directly does not take effect; |
| 274 | use either \\[customize] or the function |
| 275 | `utf-translate-cjk-set-unicode-range'." |
| 276 | :version "22.1" |
| 277 | :type '(repeat (cons integer integer)) |
| 278 | :set (lambda (symbol value) |
| 279 | (utf-translate-cjk-set-unicode-range value)) |
| 280 | :group 'mule) |
| 281 | |
| 282 | ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'. |
| 283 | (defsubst utf-translate-cjk-substitutable-p (code-point) |
| 284 | (let ((tail utf-translate-cjk-unicode-range) |
| 285 | elt) |
| 286 | (while tail |
| 287 | (setq elt (car tail) tail (cdr tail)) |
| 288 | (if (and (>= code-point (car elt)) (<= code-point (cdr elt))) |
| 289 | (setq tail nil) |
| 290 | (setq elt nil))) |
| 291 | elt)) |
| 292 | |
| 293 | (defun utf-translate-cjk-load-tables () |
| 294 | "Load tables for `utf-translate-cjk-mode'." |
| 295 | ;; Fixme: Allow the use of the CJK charsets to be |
| 296 | ;; customized by reordering and possible omission. |
| 297 | (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000))) |
| 298 | (if redefined |
| 299 | ;; Redefine them with realistic initial sizes and a |
| 300 | ;; smallish rehash size to avoid wasting significant |
| 301 | ;; space after they're built. |
| 302 | (setq ucs-mule-cjk-to-unicode |
| 303 | (make-hash-table :test 'eq :size 43000 :rehash-size 1000) |
| 304 | ucs-unicode-to-mule-cjk |
| 305 | (make-hash-table :test 'eq :size 21500 :rehash-size 1000))) |
| 306 | |
| 307 | ;; Load the files explicitly, to avoid having to keep |
| 308 | ;; around the large tables they contain (as well as the |
| 309 | ;; ones which get built). |
| 310 | ;; Here we bind coding-system-for-read to nil so that coding tags |
| 311 | ;; in the files are respected even if the files are not yet |
| 312 | ;; byte-compiled |
| 313 | (let ((coding-system-for-read nil) |
| 314 | ;; We must avoid clobbering this variable, in case the load |
| 315 | ;; files below use different coding systems. |
| 316 | (last-coding-system-used last-coding-system-used)) |
| 317 | (cond ((string= "Korean" current-language-environment) |
| 318 | (load "subst-jis") |
| 319 | (load "subst-big5") |
| 320 | (load "subst-gb2312") |
| 321 | (load "subst-ksc")) |
| 322 | ((string= "Chinese-BIG5" current-language-environment) |
| 323 | (load "subst-jis") |
| 324 | (load "subst-ksc") |
| 325 | (load "subst-gb2312") |
| 326 | (load "subst-big5")) |
| 327 | ((string= "Chinese-GB" current-language-environment) |
| 328 | (load "subst-jis") |
| 329 | (load "subst-ksc") |
| 330 | (load "subst-big5") |
| 331 | (load "subst-gb2312")) |
| 332 | (t |
| 333 | (load "subst-ksc") |
| 334 | (load "subst-gb2312") |
| 335 | (load "subst-big5") |
| 336 | (load "subst-jis")))) ; jis covers as much as big5, gb2312 |
| 337 | |
| 338 | (when redefined |
| 339 | (define-translation-hash-table 'utf-subst-table-for-decode |
| 340 | ucs-unicode-to-mule-cjk) |
| 341 | (define-translation-hash-table 'utf-subst-table-for-encode |
| 342 | ucs-mule-cjk-to-unicode) |
| 343 | (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
| 344 | 'translation-table) |
| 345 | 1 ucs-mule-cjk-to-unicode)) |
| 346 | |
| 347 | (setq utf-translate-cjk-lang-env current-language-environment))) |
| 348 | |
| 349 | (defun utf-lookup-subst-table-for-decode (code-point) |
| 350 | (if (and utf-translate-cjk-mode |
| 351 | (not utf-translate-cjk-lang-env) |
| 352 | (utf-translate-cjk-substitutable-p code-point)) |
| 353 | (utf-translate-cjk-load-tables)) |
| 354 | (gethash code-point |
| 355 | (get 'utf-subst-table-for-decode 'translation-hash-table))) |
| 356 | |
| 357 | |
| 358 | (defun utf-lookup-subst-table-for-encode (char) |
| 359 | (if (and utf-translate-cjk-mode |
| 360 | (not utf-translate-cjk-lang-env) |
| 361 | (memq (char-charset char) utf-translate-cjk-charsets)) |
| 362 | (utf-translate-cjk-load-tables)) |
| 363 | (gethash char |
| 364 | (get 'utf-subst-table-for-encode 'translation-hash-table))) |
| 365 | |
| 366 | (define-minor-mode utf-translate-cjk-mode |
| 367 | "Toggle whether UTF based coding systems de/encode CJK characters. |
| 368 | If ARG is an integer, enable if ARG is positive and disable if |
| 369 | zero or negative. This is a minor mode. |
| 370 | Enabling this allows the coding systems mule-utf-8, |
| 371 | mule-utf-16le and mule-utf-16be to encode characters in the charsets |
| 372 | `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', |
| 373 | `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to |
| 374 | decode the corresponding unicodes into such characters. |
| 375 | |
| 376 | Where the charsets overlap, the one preferred for decoding is chosen |
| 377 | according to the language environment in effect when this option is |
| 378 | turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for |
| 379 | Chinese-Big5 and jisx for other environments. |
| 380 | |
| 381 | This mode is on by default. If you are not interested in CJK |
| 382 | characters and want to avoid some overhead on encoding/decoding |
| 383 | by the above coding systems, you can customize the user option |
| 384 | `utf-translate-cjk-mode' to nil." |
| 385 | :init-value t |
| 386 | :version "22.1" |
| 387 | :type 'boolean |
| 388 | :group 'mule |
| 389 | :global t |
| 390 | (if utf-translate-cjk-mode |
| 391 | (progn |
| 392 | (define-translation-hash-table 'utf-subst-table-for-decode |
| 393 | ucs-unicode-to-mule-cjk) |
| 394 | (define-translation-hash-table 'utf-subst-table-for-encode |
| 395 | ucs-mule-cjk-to-unicode) |
| 396 | (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
| 397 | 'translation-table) |
| 398 | 1 ucs-mule-cjk-to-unicode)) |
| 399 | (define-translation-hash-table 'utf-subst-table-for-decode |
| 400 | (make-hash-table :test 'eq)) |
| 401 | (define-translation-hash-table 'utf-subst-table-for-encode |
| 402 | (make-hash-table :test 'eq)) |
| 403 | (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
| 404 | 'translation-table) |
| 405 | 1 nil)) |
| 406 | |
| 407 | ;; Update safe-chars of mule-utf-* coding systems. |
| 408 | (dolist (elt (coding-system-list t)) |
| 409 | (if (string-match "^mule-utf" (symbol-name elt)) |
| 410 | (let ((safe-charsets (coding-system-get elt 'safe-charsets)) |
| 411 | (safe-chars (coding-system-get elt 'safe-chars)) |
| 412 | (need-update nil)) |
| 413 | (dolist (charset utf-translate-cjk-charsets) |
| 414 | (unless (eq utf-translate-cjk-mode (memq charset safe-charsets)) |
| 415 | (setq safe-charsets |
| 416 | (if utf-translate-cjk-mode |
| 417 | (cons charset safe-charsets) |
| 418 | (delq charset safe-charsets)) |
| 419 | need-update t) |
| 420 | (aset safe-chars (make-char charset) utf-translate-cjk-mode))) |
| 421 | (when need-update |
| 422 | (coding-system-put elt 'safe-charsets safe-charsets) |
| 423 | (define-coding-system-internal elt)))))) |
| 424 | |
| 425 | (define-ccl-program ccl-mule-utf-untrans |
| 426 | ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or |
| 427 | ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF). Write |
| 428 | ;; eight-bit-control/graphic sequence (2 to 4 chars) representing |
| 429 | ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified. |
| 430 | ;; |
| 431 | ;; This is a subrountine because we assume that this is called very |
| 432 | ;; rarely (so we don't have to worry about the overhead of the |
| 433 | ;; call). |
| 434 | `(0 |
| 435 | ((r5 = ,(charset-id 'eight-bit-control)) |
| 436 | (r6 = ,(charset-id 'eight-bit-graphic)) |
| 437 | (if (r0 < #x100) |
| 438 | ((r4 = ((r0 >> 6) | #xC0)) |
| 439 | (write-multibyte-character r6 r4)) |
| 440 | ((if (r0 < #x10000) |
| 441 | ((r4 = ((r0 >> 12) | #xE0)) |
| 442 | (write-multibyte-character r6 r4)) |
| 443 | ((r4 = ((r0 >> 18) | #xF0)) |
| 444 | (write-multibyte-character r6 r4) |
| 445 | (r4 = (((r0 >> 12) & #x3F) | #x80)) |
| 446 | (if (r4 < #xA0) |
| 447 | (write-multibyte-character r5 r4) |
| 448 | (write-multibyte-character r6 r4)))) |
| 449 | (r4 = (((r0 >> 6) & #x3F) | #x80)) |
| 450 | (if (r4 < #xA0) |
| 451 | (write-multibyte-character r5 r4) |
| 452 | (write-multibyte-character r6 r4)))) |
| 453 | (r4 = ((r0 & #x3F) | #x80)) |
| 454 | (if (r4 < #xA0) |
| 455 | (write-multibyte-character r5 r4) |
| 456 | (write-multibyte-character r6 r4))))) |
| 457 | |
| 458 | (define-ccl-program ccl-decode-mule-utf-8 |
| 459 | ;; |
| 460 | ;; charset | bytes in utf-8 | bytes in emacs |
| 461 | ;; -----------------------+----------------+--------------- |
| 462 | ;; ascii | 1 | 1 |
| 463 | ;; -----------------------+----------------+--------------- |
| 464 | ;; eight-bit-control | 2 | 2 |
| 465 | ;; eight-bit-graphic | 2 | 1 |
| 466 | ;; latin-iso8859-1 | 2 | 2 |
| 467 | ;; -----------------------+----------------+--------------- |
| 468 | ;; mule-unicode-0100-24ff | 2 | 4 |
| 469 | ;; (< 0800) | | |
| 470 | ;; -----------------------+----------------+--------------- |
| 471 | ;; mule-unicode-0100-24ff | 3 | 4 |
| 472 | ;; (>= 8000) | | |
| 473 | ;; mule-unicode-2500-33ff | 3 | 4 |
| 474 | ;; mule-unicode-e000-ffff | 3 | 4 |
| 475 | ;; -----------------------+----------------+--------------- |
| 476 | ;; invalid byte | 1 | 2 |
| 477 | ;; |
| 478 | ;; Thus magnification factor is two. |
| 479 | ;; |
| 480 | `(2 |
| 481 | ((r6 = ,(charset-id 'latin-iso8859-1)) |
| 482 | (read r0) |
| 483 | (loop |
| 484 | (if (r0 < #x80) |
| 485 | ;; 1-byte encoding, i.e., ascii |
| 486 | (write-read-repeat r0)) |
| 487 | (if (r0 < #xc2) |
| 488 | ;; continuation byte (invalid here) or 1st byte of overlong |
| 489 | ;; 2-byte sequence. |
| 490 | ((call ccl-mule-utf-untrans) |
| 491 | (r6 = ,(charset-id 'latin-iso8859-1)) |
| 492 | (read r0) |
| 493 | (repeat))) |
| 494 | |
| 495 | ;; Read the 2nd byte. |
| 496 | (read r1) |
| 497 | (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte |
| 498 | ((call ccl-mule-utf-untrans) |
| 499 | (r6 = ,(charset-id 'latin-iso8859-1)) |
| 500 | ;; Handle it in the next loop. |
| 501 | (r0 = r1) |
| 502 | (repeat))) |
| 503 | |
| 504 | (if (r0 < #xe0) |
| 505 | ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
| 506 | ((r1 &= #x3F) |
| 507 | (r1 |= ((r0 & #x1F) << 6)) |
| 508 | ;; Now r1 holds scalar value. We don't have to check |
| 509 | ;; `overlong sequence' because r0 >= 0xC2. |
| 510 | |
| 511 | (if (r1 >= 256) |
| 512 | ;; mule-unicode-0100-24ff (< 0800) |
| 513 | ((r0 = r1) |
| 514 | (lookup-integer utf-subst-table-for-decode r0 r1) |
| 515 | (if (r7 == 0) |
| 516 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
| 517 | (r1 -= #x0100) |
| 518 | (r2 = (((r1 / 96) + 32) << 7)) |
| 519 | (r1 %= 96) |
| 520 | (r1 += (r2 + 32)) |
| 521 | (translate-character |
| 522 | utf-translation-table-for-decode r0 r1))) |
| 523 | (write-multibyte-character r0 r1) |
| 524 | (read r0) |
| 525 | (repeat)) |
| 526 | (if (r1 >= 160) |
| 527 | ;; latin-iso8859-1 |
| 528 | ((r0 = r1) |
| 529 | (lookup-integer utf-subst-table-for-decode r0 r1) |
| 530 | (if (r7 == 0) |
| 531 | ((r1 -= 128) |
| 532 | (write-multibyte-character r6 r1)) |
| 533 | ((write-multibyte-character r0 r1))) |
| 534 | (read r0) |
| 535 | (repeat)) |
| 536 | ;; eight-bit-control |
| 537 | ((r0 = ,(charset-id 'eight-bit-control)) |
| 538 | (write-multibyte-character r0 r1) |
| 539 | (read r0) |
| 540 | (repeat)))))) |
| 541 | |
| 542 | ;; Read the 3rd bytes. |
| 543 | (read r2) |
| 544 | (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte |
| 545 | ((call ccl-mule-utf-untrans) |
| 546 | (r0 = r1) |
| 547 | (call ccl-mule-utf-untrans) |
| 548 | (r6 = ,(charset-id 'latin-iso8859-1)) |
| 549 | ;; Handle it in the next loop. |
| 550 | (r0 = r2) |
| 551 | (repeat))) |
| 552 | |
| 553 | (if (r0 < #xF0) |
| 554 | ;; 3byte encoding |
| 555 | ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx |
| 556 | ((r3 = ((r0 & #xF) << 12)) |
| 557 | (r3 |= ((r1 & #x3F) << 6)) |
| 558 | (r3 |= (r2 & #x3F)) |
| 559 | |
| 560 | (if (r3 < #x800) ; `overlong sequence' |
| 561 | ((call ccl-mule-utf-untrans) |
| 562 | (r0 = r1) |
| 563 | (call ccl-mule-utf-untrans) |
| 564 | (r0 = r2) |
| 565 | (call ccl-mule-utf-untrans) |
| 566 | (r6 = ,(charset-id 'latin-iso8859-1)) |
| 567 | (read r0) |
| 568 | (repeat))) |
| 569 | |
| 570 | (if (r3 < #x2500) |
| 571 | ;; mule-unicode-0100-24ff (>= 0800) |
| 572 | ((r0 = r3) |
| 573 | (lookup-integer utf-subst-table-for-decode r0 r1) |
| 574 | (if (r7 == 0) |
| 575 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
| 576 | (r3 -= #x0100) |
| 577 | (r3 //= 96) |
| 578 | (r1 = (r7 + 32)) |
| 579 | (r1 += ((r3 + 32) << 7)) |
| 580 | (translate-character |
| 581 | utf-translation-table-for-decode r0 r1))) |
| 582 | (write-multibyte-character r0 r1) |
| 583 | (read r0) |
| 584 | (repeat))) |
| 585 | |
| 586 | (if (r3 < #x3400) |
| 587 | ;; mule-unicode-2500-33ff |
| 588 | ((r0 = r3) ; don't zap r3 |
| 589 | (lookup-integer utf-subst-table-for-decode r0 r1) |
| 590 | (if (r7 == 0) |
| 591 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
| 592 | (r3 -= #x2500) |
| 593 | (r3 //= 96) |
| 594 | (r1 = (r7 + 32)) |
| 595 | (r1 += ((r3 + 32) << 7)))) |
| 596 | (write-multibyte-character r0 r1) |
| 597 | (read r0) |
| 598 | (repeat))) |
| 599 | |
| 600 | (if (r3 < #xE000) |
| 601 | ;; Try to convert to CJK chars, else |
| 602 | ;; keep them as eight-bit-{control|graphic}. |
| 603 | ((r0 = r3) |
| 604 | (lookup-integer utf-subst-table-for-decode r3 r1) |
| 605 | (if r7 |
| 606 | ;; got a translation |
| 607 | ((write-multibyte-character r3 r1) |
| 608 | (read r0) |
| 609 | (repeat)) |
| 610 | ((call ccl-mule-utf-untrans) |
| 611 | (r6 = ,(charset-id 'latin-iso8859-1)) |
| 612 | (read r0) |
| 613 | (repeat))))) |
| 614 | |
| 615 | ;; mule-unicode-e000-ffff |
| 616 | ;; Fixme: fffe and ffff are invalid. |
| 617 | (r0 = r3) ; don't zap r3 |
| 618 | (lookup-integer utf-subst-table-for-decode r0 r1) |
| 619 | (if (r7 == 0) |
| 620 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
| 621 | (r3 -= #xe000) |
| 622 | (r3 //= 96) |
| 623 | (r1 = (r7 + 32)) |
| 624 | (r1 += ((r3 + 32) << 7)))) |
| 625 | (write-multibyte-character r0 r1) |
| 626 | (read r0) |
| 627 | (repeat))) |
| 628 | |
| 629 | ;; Read the 4th bytes. |
| 630 | (read r3) |
| 631 | (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte |
| 632 | ((call ccl-mule-utf-untrans) |
| 633 | (r0 = r1) |
| 634 | (call ccl-mule-utf-untrans) |
| 635 | (r0 = r2) |
| 636 | (call ccl-mule-utf-untrans) |
| 637 | (r6 = ,(charset-id 'latin-iso8859-1)) |
| 638 | ;; Handle it in the next loop. |
| 639 | (r0 = r3) |
| 640 | (repeat))) |
| 641 | |
| 642 | (if (r0 < #xF8) |
| 643 | ;; 4-byte encoding: |
| 644 | ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx |
| 645 | ;; keep those bytes as eight-bit-{control|graphic} |
| 646 | ;; Fixme: allow lookup in utf-subst-table-for-decode. |
| 647 | ((r4 = ((r0 & #x7) << 18)) |
| 648 | (r4 |= ((r1 & #x3F) << 12)) |
| 649 | (r4 |= ((r2 & #x3F) << 6)) |
| 650 | (r4 |= (r3 & #x3F)) |
| 651 | |
| 652 | (if (r4 < #x10000) ; `overlong sequence' |
| 653 | ((call ccl-mule-utf-untrans) |
| 654 | (r0 = r1) |
| 655 | (call ccl-mule-utf-untrans) |
| 656 | (r0 = r2) |
| 657 | (call ccl-mule-utf-untrans) |
| 658 | (r0 = r3) |
| 659 | (call ccl-mule-utf-untrans)) |
| 660 | ((r0 = r4) |
| 661 | (call ccl-mule-utf-untrans)))) |
| 662 | |
| 663 | ;; Unsupported sequence. |
| 664 | ((call ccl-mule-utf-untrans) |
| 665 | (r0 = r1) |
| 666 | (call ccl-mule-utf-untrans) |
| 667 | (r0 = r2) |
| 668 | (call ccl-mule-utf-untrans) |
| 669 | (r0 = r3) |
| 670 | (call ccl-mule-utf-untrans))) |
| 671 | (r6 = ,(charset-id 'latin-iso8859-1)) |
| 672 | (read r0) |
| 673 | (repeat))) |
| 674 | |
| 675 | |
| 676 | ;; At EOF... |
| 677 | (if (r0 >= 0) |
| 678 | ;; r0 >= #x80 |
| 679 | ((call ccl-mule-utf-untrans) |
| 680 | (if (r1 >= 0) |
| 681 | ((r0 = r1) |
| 682 | (call ccl-mule-utf-untrans) |
| 683 | (if (r2 >= 0) |
| 684 | ((r0 = r2) |
| 685 | (call ccl-mule-utf-untrans) |
| 686 | (if (r3 >= 0) |
| 687 | ((r0 = r3) |
| 688 | (call ccl-mule-utf-untrans)))))))))) |
| 689 | |
| 690 | "CCL program to decode UTF-8. |
| 691 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
| 692 | mule-unicode-*, but see also `utf-fragmentation-table' and |
| 693 | `ucs-mule-cjk-to-unicode'. |
| 694 | Encodings of un-representable Unicode characters are decoded asis into |
| 695 | eight-bit-control and eight-bit-graphic characters.") |
| 696 | |
| 697 | (define-ccl-program ccl-mule-utf-8-encode-untrans |
| 698 | ;; UTF-8 decoder generates an UTF-8 sequence represented by a |
| 699 | ;; sequence eight-bit-control/graphic chars for an untranslatable |
| 700 | ;; character and an invalid byte. |
| 701 | ;; |
| 702 | ;; This CCL parses that sequence (the first byte is already in r1), |
| 703 | ;; writes out the original bytes of that sequence, and sets r5 to |
| 704 | ;; -1. |
| 705 | ;; |
| 706 | ;; If the eight-bit-control/graphic sequence is shorter than what r1 |
| 707 | ;; suggests, it sets r5 and r6 to the last character read that |
| 708 | ;; should be handled by the next loop of a caller. |
| 709 | ;; |
| 710 | ;; Note: For UTF-8 validation, we only check if a character is |
| 711 | ;; eight-bit-control/graphic or not. It may result in incorrect |
| 712 | ;; handling of random binary data, but such a data can't be encoded |
| 713 | ;; by UTF-8 anyway. At least, UTF-8 decoders doesn't generate such |
| 714 | ;; a sequence even if a source contains invalid byte-sequence. |
| 715 | `(0 |
| 716 | (;; Read the 2nd byte. |
| 717 | (read-multibyte-character r5 r6) |
| 718 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
| 719 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
| 720 | ((write r1) ; invalid UTF-8 |
| 721 | (r1 = -1) |
| 722 | (end))) |
| 723 | |
| 724 | (if (r1 <= #xC3) |
| 725 | ;; 2-byte sequence for an originally invalid byte. |
| 726 | ((r6 &= #x3F) |
| 727 | (r6 |= ((r1 & #x1F) << 6)) |
| 728 | (write r6) |
| 729 | (r5 = -1) |
| 730 | (end))) |
| 731 | |
| 732 | (write r1 r6) |
| 733 | (r2 = r1) |
| 734 | (r1 = -1) |
| 735 | ;; Read the 3rd byte. |
| 736 | (read-multibyte-character r5 r6) |
| 737 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
| 738 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
| 739 | (end)) ; invalid UTF-8 |
| 740 | (write r6) |
| 741 | (if (r2 < #xF0) |
| 742 | ;; 3-byte sequence for an untranslated character. |
| 743 | ((r5 = -1) |
| 744 | (end))) |
| 745 | ;; Read the 4th byte. |
| 746 | (read-multibyte-character r5 r6) |
| 747 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
| 748 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
| 749 | (end)) ; invalid UTF-8 |
| 750 | ;; 4-byte sequence for an untranslated character. |
| 751 | (write r6) |
| 752 | (r5 = -1) |
| 753 | (end)) |
| 754 | |
| 755 | ;; At EOF... |
| 756 | ((r5 = -1) |
| 757 | (if (r1 >= 0) |
| 758 | (write r1))))) |
| 759 | |
| 760 | (define-ccl-program ccl-encode-mule-utf-8 |
| 761 | `(1 |
| 762 | ((r5 = -1) |
| 763 | (loop |
| 764 | (if (r5 < 0) |
| 765 | (read-multibyte-character r0 r1) |
| 766 | ;; Pre-read character is in r5 (charset-ID) and r6 (code-point). |
| 767 | ((r0 = r5) |
| 768 | (r1 = r6) |
| 769 | (r5 = -1))) |
| 770 | (translate-character utf-translation-table-for-encode r0 r1) |
| 771 | |
| 772 | (if (r0 == ,(charset-id 'ascii)) |
| 773 | (write-repeat r1)) |
| 774 | |
| 775 | (if (r0 == ,(charset-id 'latin-iso8859-1)) |
| 776 | ;; r1 scalar utf-8 |
| 777 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
| 778 | ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
| 779 | ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
| 780 | ((write ((r1 >> 6) | #xc2)) |
| 781 | (r1 &= #x3f) |
| 782 | (r1 |= #x80) |
| 783 | (write-repeat r1))) |
| 784 | |
| 785 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
| 786 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
| 787 | ;; #x3f80 == (0011 1111 1000 0000)b |
| 788 | (r1 &= #x7f) |
| 789 | (r1 += (r0 + 224)) ; 240 == -32 + #x0100 |
| 790 | ;; now r1 holds scalar value |
| 791 | (if (r1 < #x0800) |
| 792 | ;; 2byte encoding |
| 793 | ((write ((r1 >> 6) | #xC0)) |
| 794 | (r1 &= #x3F) |
| 795 | (r1 |= #x80) |
| 796 | (write-repeat r1)) |
| 797 | ;; 3byte encoding |
| 798 | ((write ((r1 >> 12) | #xE0)) |
| 799 | (write (((r1 & #x0FC0) >> 6) | #x80)) |
| 800 | (r1 &= #x3F) |
| 801 | (r1 |= #x80) |
| 802 | (write-repeat r1))))) |
| 803 | |
| 804 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
| 805 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
| 806 | (r1 &= #x7f) |
| 807 | (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 |
| 808 | ;; now r1 holds scalar value |
| 809 | (write ((r1 >> 12) | #xE0)) |
| 810 | (write (((r1 & #x0FC0) >> 6) | #x80)) |
| 811 | (r1 &= #x3F) |
| 812 | (r1 |= #x80) |
| 813 | (write-repeat r1))) |
| 814 | |
| 815 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
| 816 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
| 817 | (r1 &= #x7f) |
| 818 | (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
| 819 | ;; now r1 holds scalar value |
| 820 | (write ((r1 >> 12) | #xE0)) |
| 821 | (write (((r1 & #x0FC0) >> 6) | #x80)) |
| 822 | (r1 &= #x3F) |
| 823 | (r1 |= #x80) |
| 824 | (write-repeat r1))) |
| 825 | |
| 826 | (if (r0 == ,(charset-id 'eight-bit-control)) |
| 827 | ;; r1 scalar utf-8 |
| 828 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
| 829 | ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
| 830 | ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
| 831 | ((write #xC2) |
| 832 | (write-repeat r1))) |
| 833 | |
| 834 | (if (r0 == ,(charset-id 'eight-bit-graphic)) |
| 835 | ;; r1 scalar utf-8 |
| 836 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
| 837 | ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
| 838 | ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 |
| 839 | ((r0 = (r1 >= #xC0)) |
| 840 | (r0 &= (r1 <= #xC3)) |
| 841 | (r4 = (r1 >= #xE1)) |
| 842 | (r4 &= (r1 <= #xF7)) |
| 843 | (r0 |= r4) |
| 844 | (if r0 |
| 845 | ((call ccl-mule-utf-8-encode-untrans) |
| 846 | (repeat)) |
| 847 | (write-repeat r1)))) |
| 848 | |
| 849 | (lookup-character utf-subst-table-for-encode r0 r1) |
| 850 | (if r7 ; lookup succeeded |
| 851 | (if (r0 < #x800) |
| 852 | ;; 2byte encoding |
| 853 | ((write ((r0 >> 6) | #xC0)) |
| 854 | (r0 = ((r0 & #x3F) | #x80)) |
| 855 | (write-repeat r0)) |
| 856 | ;; 3byte encoding |
| 857 | ((write ((r0 >> 12) | #xE0)) |
| 858 | (write (((r0 & #x0FC0) >> 6) | #x80)) |
| 859 | (r0 = ((r0 & #x3F) | #x80)) |
| 860 | (write-repeat r0)))) |
| 861 | |
| 862 | ;; Unsupported character. |
| 863 | ;; Output U+FFFD, which is `ef bf bd' in UTF-8. |
| 864 | (write #xef) |
| 865 | (write #xbf) |
| 866 | (write-repeat #xbd)))) |
| 867 | "CCL program to encode into UTF-8.") |
| 868 | |
| 869 | |
| 870 | (define-ccl-program ccl-untranslated-to-ucs |
| 871 | `(0 |
| 872 | (if (r1 == 0) |
| 873 | nil |
| 874 | (if (r0 <= #xC3) ; 2-byte encoding |
| 875 | ((r0 = ((r0 & #x3) << 6)) |
| 876 | (r0 |= (r1 & #x3F)) |
| 877 | (r1 = 2)) |
| 878 | (if (r2 == 0) |
| 879 | (r1 = 0) |
| 880 | (if (r0 < #xF0) ; 3-byte encoding, as above |
| 881 | ((r0 = ((r0 & #xF) << 12)) |
| 882 | (r0 |= ((r1 & #x3F) << 6)) |
| 883 | (r0 |= (r2 & #x3F)) |
| 884 | (r1 = 3)) |
| 885 | (if (r3 == 0) |
| 886 | (r1 = 0) |
| 887 | ((r0 = ((r0 & #x7) << 18)) |
| 888 | (r0 |= ((r1 & #x3F) << 12)) |
| 889 | (r0 |= ((r2 & #x3F) << 6)) |
| 890 | (r0 |= (r3 & #x3F)) |
| 891 | (r1 = 4)))))))) |
| 892 | "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. |
| 893 | Set r1 to the byte length. r0 == 0 for invalid sequence.") |
| 894 | |
| 895 | (defvar utf-8-ccl-regs (make-vector 8 0)) |
| 896 | |
| 897 | (defsubst utf-8-untranslated-to-ucs () |
| 898 | "Return the UCS code for an untranslated sequence of raw bytes t point. |
| 899 | Only for 3- or 4-byte sequences." |
| 900 | (aset utf-8-ccl-regs 0 (or (char-after) 0)) |
| 901 | (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) |
| 902 | (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) |
| 903 | (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) |
| 904 | (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)) |
| 905 | |
| 906 | (defun utf-8-help-echo (window object position) |
| 907 | (format "Untranslated Unicode U+%04X" |
| 908 | (get-char-property position 'untranslated-utf-8 object))) |
| 909 | |
| 910 | ;; We compose the untranslatable sequences into a single character, |
| 911 | ;; and move point to the next character. |
| 912 | ;; This is infelicitous for editing, because there's currently no |
| 913 | ;; mechanism for treating compositions as atomic, but is OK for |
| 914 | ;; display. They are composed to U+FFFD with help-echo which |
| 915 | ;; indicates the unicodes they represent. This function GCs too much. |
| 916 | |
| 917 | ;; If utf-translate-cjk-mode is non-nil, this function is called with |
| 918 | ;; HASH-TABLE which translates CJK characters into some of CJK |
| 919 | ;; charsets. |
| 920 | |
| 921 | (defsubst utf-8-compose (hash-table) |
| 922 | "Put a suitable composition on an untranslatable sequence at point. |
| 923 | If HASH-TABLE is non-nil, try to translate CJK characters by it at first. |
| 924 | Move point to the end of the sequence." |
| 925 | (utf-8-untranslated-to-ucs) |
| 926 | (let ((l (aref utf-8-ccl-regs 1)) |
| 927 | ch) |
| 928 | (if (> l 0) |
| 929 | (if (and hash-table |
| 930 | (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table))) |
| 931 | (progn |
| 932 | (insert ch) |
| 933 | (delete-region (point) (min (point-max) (+ l (point))))) |
| 934 | (setq ch (aref utf-8-ccl-regs 0)) |
| 935 | (put-text-property (point) (min (point-max) (+ l (point))) |
| 936 | 'untranslated-utf-8 ch) |
| 937 | (put-text-property (point) (min (point-max) (+ l (point))) |
| 938 | 'help-echo 'utf-8-help-echo) |
| 939 | (if (= l 2) |
| 940 | (put-text-property (point) (min (point-max) (+ l (point))) |
| 941 | 'display (propertize (format "\\%03o" ch) |
| 942 | 'face 'escape-glyph)) |
| 943 | (compose-region (point) (+ l (point)) ?\e$,3u=\e(B)) |
| 944 | (forward-char l)) |
| 945 | (forward-char 1)))) |
| 946 | |
| 947 | (defcustom utf-8-compose-scripts nil |
| 948 | "*Non-nil means compose various scripts on decoding utf-8 text." |
| 949 | :group 'mule |
| 950 | :version "22.1" |
| 951 | :type 'boolean) |
| 952 | |
| 953 | (defun utf-8-post-read-conversion (length) |
| 954 | "Compose untranslated utf-8 sequences into single characters. |
| 955 | If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters. |
| 956 | Also compose particular scripts if `utf-8-compose-scripts' is non-nil." |
| 957 | (save-excursion |
| 958 | (save-restriction |
| 959 | (narrow-to-region (point) (+ (point) length)) |
| 960 | ;; Can't do eval-when-compile to insert a multibyte constant |
| 961 | ;; version of the string in the loop, since it's always loaded as |
| 962 | ;; unibyte from a byte-compiled file. |
| 963 | (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7")) |
| 964 | (buffer-multibyte enable-multibyte-characters) |
| 965 | hash-table ch) |
| 966 | (set-buffer-multibyte t) |
| 967 | (when utf-translate-cjk-mode |
| 968 | (unless utf-translate-cjk-lang-env |
| 969 | ;; Check these characters in utf-translate-cjk-range. |
| 970 | ;; We may have to translate them to CJK charsets. |
| 971 | (skip-chars-forward |
| 972 | (concat range utf-translate-cjk-unicode-range-string)) |
| 973 | (unless (eobp) |
| 974 | (utf-translate-cjk-load-tables) |
| 975 | (setq range |
| 976 | (concat range utf-translate-cjk-unicode-range-string))) |
| 977 | (setq hash-table (get 'utf-subst-table-for-decode |
| 978 | 'translation-hash-table)))) |
| 979 | (while (and (skip-chars-forward range) |
| 980 | (not (eobp))) |
| 981 | (setq ch (following-char)) |
| 982 | (if (< ch 256) |
| 983 | (utf-8-compose hash-table) |
| 984 | (if (and hash-table |
| 985 | (setq ch (gethash (encode-char ch 'ucs) hash-table))) |
| 986 | (progn |
| 987 | (insert ch) |
| 988 | (delete-char 1)) |
| 989 | (forward-char 1)))) |
| 990 | (or buffer-multibyte |
| 991 | (set-buffer-multibyte nil))) |
| 992 | |
| 993 | (when (and utf-8-compose-scripts (> length 1)) |
| 994 | ;; This let-binding avoids recursive auto-loading. And, we |
| 995 | ;; anyway don't have to run the following code while |
| 996 | ;; auto-loading. |
| 997 | (let ((utf-8-compose-scripts nil)) |
| 998 | ;; These currently have definitions which cover the relevant |
| 999 | ;; unicodes. We could avoid loading thai-util &c by checking |
| 1000 | ;; whether the region contains any characters with the appropriate |
| 1001 | ;; categories. There aren't yet Unicode-based rules for Tibetan. |
| 1002 | (diacritic-compose-region (point-max) (point-min)) |
| 1003 | (thai-compose-region (point-max) (point-min)) |
| 1004 | (lao-compose-region (point-max) (point-min)) |
| 1005 | (devanagari-compose-region (point-max) (point-min)) |
| 1006 | (malayalam-compose-region (point-max) (point-min)) |
| 1007 | (tamil-compose-region (point-max) (point-min)))) |
| 1008 | (- (point-max) (point-min))))) |
| 1009 | |
| 1010 | (defun utf-8-pre-write-conversion (beg end) |
| 1011 | "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END. |
| 1012 | This is used as a post-read-conversion of utf-8 coding system." |
| 1013 | (if (and utf-translate-cjk-mode |
| 1014 | (not utf-translate-cjk-lang-env) |
| 1015 | (if (stringp beg) |
| 1016 | (string-match "\\cc\\|\\cj\\|\\ch" beg) |
| 1017 | (save-excursion |
| 1018 | (goto-char beg) |
| 1019 | (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))) |
| 1020 | (utf-translate-cjk-load-tables)) |
| 1021 | nil) |
| 1022 | |
| 1023 | (make-coding-system |
| 1024 | 'mule-utf-8 4 ?u |
| 1025 | "UTF-8 encoding for Emacs-supported Unicode characters. |
| 1026 | It supports Unicode characters of these ranges: |
| 1027 | U+0000..U+33FF, U+E000..U+FFFF. |
| 1028 | They correspond to these Emacs character sets: |
| 1029 | ascii, latin-iso8859-1, mule-unicode-0100-24ff, |
| 1030 | mule-unicode-2500-33ff, mule-unicode-e000-ffff |
| 1031 | |
| 1032 | On decoding (e.g. reading a file), Unicode characters not in the above |
| 1033 | ranges are decoded into sequences of eight-bit-control and |
| 1034 | eight-bit-graphic characters to preserve their byte sequences. The |
| 1035 | byte sequence is preserved on i/o for valid utf-8, but not necessarily |
| 1036 | for invalid utf-8. |
| 1037 | |
| 1038 | On encoding (e.g. writing a file), Emacs characters not belonging to |
| 1039 | any of the character sets listed above are encoded into the UTF-8 byte |
| 1040 | sequence representing U+FFFD (REPLACEMENT CHARACTER)." |
| 1041 | |
| 1042 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) |
| 1043 | `((safe-charsets |
| 1044 | ascii |
| 1045 | eight-bit-control |
| 1046 | eight-bit-graphic |
| 1047 | latin-iso8859-1 |
| 1048 | mule-unicode-0100-24ff |
| 1049 | mule-unicode-2500-33ff |
| 1050 | mule-unicode-e000-ffff |
| 1051 | ,@(if utf-translate-cjk-mode |
| 1052 | utf-translate-cjk-charsets)) |
| 1053 | (mime-charset . utf-8) |
| 1054 | (coding-category . coding-category-utf-8) |
| 1055 | (valid-codes (0 . 255)) |
| 1056 | (pre-write-conversion . utf-8-pre-write-conversion) |
| 1057 | (post-read-conversion . utf-8-post-read-conversion) |
| 1058 | (translation-table-for-encode . utf-translation-table-for-encode) |
| 1059 | (dependency unify-8859-on-encoding-mode |
| 1060 | unify-8859-on-decoding-mode |
| 1061 | utf-fragment-on-decoding |
| 1062 | utf-translate-cjk-mode))) |
| 1063 | |
| 1064 | (define-coding-system-alias 'utf-8 'mule-utf-8) |
| 1065 | |
| 1066 | ;; I think this needs special private charsets defined for the |
| 1067 | ;; untranslated sequences, if it's going to work well. |
| 1068 | |
| 1069 | ;;; (defun utf-8-compose-function (pos to pattern &optional string) |
| 1070 | ;;; (let* ((prop (get-char-property pos 'composition string)) |
| 1071 | ;;; (l (and prop (- (cadr prop) (car prop))))) |
| 1072 | ;;; (cond ((and l (> l (- to pos))) |
| 1073 | ;;; (delete-region pos to)) |
| 1074 | ;;; ((and (> (char-after pos) 224) |
| 1075 | ;;; (< (char-after pos) 256) |
| 1076 | ;;; (save-restriction |
| 1077 | ;;; (narrow-to-region pos to) |
| 1078 | ;;; (utf-8-compose))) |
| 1079 | ;;; t)))) |
| 1080 | |
| 1081 | ;;; (dotimes (i 96) |
| 1082 | ;;; (aset composition-function-table |
| 1083 | ;;; (+ 128 i) |
| 1084 | ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") |
| 1085 | ;;; . utf-8-compose-function)))) |
| 1086 | |
| 1087 | ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5 |
| 1088 | ;;; utf-8.el ends here |