Commit | Line | Data |
---|---|---|
9ca2ac2d | 1 | ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
5ba7a870 | 2 | |
d7a0267c | 3 | ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 |
d4877ac1 | 4 | ;; Free Software Foundation, Inc. |
d7a0267c | 5 | ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 |
2fd125a3 KH |
6 | ;; National Institute of Advanced Industrial Science and Technology (AIST) |
7 | ;; Registration Number H14PRO021 | |
5ba7a870 | 8 | |
aa15b3e5 | 9 | ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
9ca2ac2d | 10 | ;; Maintainer: FSF |
c49b8288 | 11 | ;; Keywords: multilingual, Unicode, UTF-8, i18n |
5ba7a870 KH |
12 | |
13 | ;; This file is part of GNU Emacs. | |
14 | ||
15 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
16 | ;; it under the terms of the GNU General Public License as published by | |
d7142f3e | 17 | ;; the Free Software Foundation; either version 3, or (at your option) |
5ba7a870 KH |
18 | ;; any later version. |
19 | ||
20 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
21 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
22 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
23 | ;; GNU General Public License for more details. | |
24 | ||
25 | ;; You should have received a copy of the GNU General Public License | |
26 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
3a35cf56 LK |
27 | ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
28 | ;; Boston, MA 02110-1301, USA. | |
5ba7a870 KH |
29 | |
30 | ;;; Commentary: | |
31 | ||
aa2e3f49 DL |
32 | ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
33 | ;; of the following character sets to and from UTF-8: | |
5ba7a870 KH |
34 | ;; |
35 | ;; ascii | |
36 | ;; eight-bit-control | |
37 | ;; latin-iso8859-1 | |
38 | ;; mule-unicode-0100-24ff | |
39 | ;; mule-unicode-2500-33ff | |
40 | ;; mule-unicode-e000-ffff | |
41 | ;; | |
c49b8288 DL |
42 | ;; On decoding, Unicode characters that do not fit into the above |
43 | ;; character sets are handled as `eight-bit-control' or | |
44 | ;; `eight-bit-graphic' characters to retain the information about the | |
9ca2ac2d DL |
45 | ;; original byte sequence and text properties record the corresponding |
46 | ;; unicode. | |
47 | ;; | |
48 | ;; Fixme: note that reading and writing invalid utf-8 may not be | |
49 | ;; idempotent -- to represent the bytes to fix that needs a new charset. | |
aa2e3f49 | 50 | ;; |
ad88f5c5 | 51 | ;; Characters from other character sets can be encoded with mule-utf-8 |
ccdd5c61 | 52 | ;; by populating the translation table |
535665b8 | 53 | ;; `utf-translation-table-for-encode'. Hash tables |
ad88f5c5 KH |
54 | ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
55 | ;; used to support encoding and decoding of about a quarter of the CJK | |
56 | ;; space between U+3400 and U+DFFF. | |
c49b8288 | 57 | |
81639ac3 | 58 | ;; UTF-8 is defined in RFC 3629. A sketch of the encoding is: |
5ba7a870 KH |
59 | |
60 | ;; scalar | utf-8 | |
61 | ;; value | 1st byte | 2nd byte | 3rd byte | |
62 | ;; --------------------+-----------+-----------+---------- | |
63 | ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
64 | ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
65 | ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
66 | ||
67 | ;;; Code: | |
68 | ||
ad88f5c5 KH |
69 | (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil) |
70 | "Char table mapping characters to latin-iso8859-1 or mule-unicode-*. | |
9ca2ac2d | 71 | |
ad88f5c5 KH |
72 | If `unify-8859-on-encoding-mode' is non-nil, this table populates the |
73 | translation-table named `utf-translation-table-for-encode'.") | |
74 | ||
75 | (define-translation-table 'utf-translation-table-for-encode) | |
9ca2ac2d | 76 | |
9ca2ac2d DL |
77 | |
78 | ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | |
79 | ;; space of mule-unicode. For Latin scripts this isn't very | |
80 | ;; important. Hebrew and Arabic might go here too when there's proper | |
81 | ;; support for them. | |
ad88f5c5 KH |
82 | |
83 | (defvar utf-fragmentation-table (make-char-table 'translation-table nil) | |
84 | "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*. | |
85 | ||
86 | If `utf-fragment-on-decoding' is non-nil, this table populates the | |
87 | translation-table named `utf-translation-table-for-decode'") | |
88 | ||
89 | (defvar utf-defragmentation-table (make-char-table 'translation-table nil) | |
90 | "Char-table for reverse mapping of `utf-fragmentation-table'. | |
91 | ||
92 | If `utf-fragment-on-decoding' is non-nil and | |
93 | `unify-8859-on-encoding-mode' is nil, this table populates the | |
94 | translation-table named `utf-translation-table-for-encode'") | |
95 | ||
96 | (define-translation-table 'utf-translation-table-for-decode) | |
97 | ||
98 | ||
0c8410d5 | 99 | (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
ad88f5c5 KH |
100 | "Hash table mapping Emacs CJK character sets to Unicode code points. |
101 | ||
9e24a165 | 102 | If `utf-translate-cjk-mode' is non-nil, this table populates the |
ad88f5c5 KH |
103 | translation-hash-table named `utf-subst-table-for-encode'.") |
104 | ||
0c8410d5 DL |
105 | (define-translation-hash-table 'utf-subst-table-for-encode |
106 | ucs-mule-cjk-to-unicode) | |
ad88f5c5 | 107 | |
0c8410d5 | 108 | (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) |
ad88f5c5 KH |
109 | "Hash table mapping Unicode code points to Emacs CJK character sets. |
110 | ||
9e24a165 | 111 | If `utf-translate-cjk-mode' is non-nil, this table populates the |
ad88f5c5 KH |
112 | translation-hash-table named `utf-subst-table-for-decode'.") |
113 | ||
114 | (define-translation-hash-table 'utf-subst-table-for-decode | |
0c8410d5 | 115 | ucs-unicode-to-mule-cjk) |
ad88f5c5 | 116 | |
9ca2ac2d DL |
117 | (mapc |
118 | (lambda (pair) | |
ad88f5c5 KH |
119 | (aset utf-fragmentation-table (car pair) (cdr pair)) |
120 | (aset utf-defragmentation-table (cdr pair) (car pair))) | |
9ca2ac2d DL |
121 | '((?\e$,1&d\e(B . ?\e,F4\e(B) (?\e$,1&e\e(B . ?\e,F5\e(B) (?\e$,1&f\e(B . ?\e,F6\e(B) (?\e$,1&h\e(B . ?\e,F8\e(B) (?\e$,1&i\e(B . ?\e,F9\e(B) |
122 | (?\e$,1&j\e(B . ?\e,F:\e(B) (?\e$,1&l\e(B . ?\e,F<\e(B) (?\e$,1&n\e(B . ?\e,F>\e(B) (?\e$,1&o\e(B . ?\e,F?\e(B) (?\e$,1&p\e(B . ?\e,F@\e(B) | |
123 | (?\e$,1&q\e(B . ?\e,FA\e(B) (?\e$,1&r\e(B . ?\e,FB\e(B) (?\e$,1&s\e(B . ?\e,FC\e(B) (?\e$,1&t\e(B . ?\e,FD\e(B) (?\e$,1&u\e(B . ?\e,FE\e(B) | |
124 | (?\e$,1&v\e(B . ?\e,FF\e(B) (?\e$,1&w\e(B . ?\e,FG\e(B) (?\e$,1&x\e(B . ?\e,FH\e(B) (?\e$,1&y\e(B . ?\e,FI\e(B) (?\e$,1&z\e(B . ?\e,FJ\e(B) | |
125 | (?\e$,1&{\e(B . ?\e,FK\e(B) (?\e$,1&|\e(B . ?\e,FL\e(B) (?\e$,1&}\e(B . ?\e,FM\e(B) (?\e$,1&~\e(B . ?\e,FN\e(B) (?\e$,1&\7f\e(B . ?\e,FO\e(B) | |
126 | (?\e$,1' \e(B . ?\e,FP\e(B) (?\e$,1'!\e(B . ?\e,FQ\e(B) (?\e$,1'#\e(B . ?\e,FS\e(B) (?\e$,1'$\e(B . ?\e,FT\e(B) (?\e$,1'%\e(B . ?\e,FU\e(B) | |
127 | (?\e$,1'&\e(B . ?\e,FV\e(B) (?\e$,1''\e(B . ?\e,FW\e(B) (?\e$,1'(\e(B . ?\e,FX\e(B) (?\e$,1')\e(B . ?\e,FY\e(B) (?\e$,1'*\e(B . ?\e,FZ\e(B) | |
128 | (?\e$,1'+\e(B . ?\e,F[\e(B) (?\e$,1',\e(B . ?\e,F\\e(B) (?\e$,1'-\e(B . ?\e,F]\e(B) (?\e$,1'.\e(B . ?\e,F^\e(B) (?\e$,1'/\e(B . ?\e,F_\e(B) | |
129 | (?\e$,1'0\e(B . ?\e,F`\e(B) (?\e$,1'1\e(B . ?\e,Fa\e(B) (?\e$,1'2\e(B . ?\e,Fb\e(B) (?\e$,1'3\e(B . ?\e,Fc\e(B) (?\e$,1'4\e(B . ?\e,Fd\e(B) | |
130 | (?\e$,1'5\e(B . ?\e,Fe\e(B) (?\e$,1'6\e(B . ?\e,Ff\e(B) (?\e$,1'7\e(B . ?\e,Fg\e(B) (?\e$,1'8\e(B . ?\e,Fh\e(B) (?\e$,1'9\e(B . ?\e,Fi\e(B) | |
131 | (?\e$,1':\e(B . ?\e,Fj\e(B) (?\e$,1';\e(B . ?\e,Fk\e(B) (?\e$,1'<\e(B . ?\e,Fl\e(B) (?\e$,1'=\e(B . ?\e,Fm\e(B) (?\e$,1'>\e(B . ?\e,Fn\e(B) | |
132 | (?\e$,1'?\e(B . ?\e,Fo\e(B) (?\e$,1'@\e(B . ?\e,Fp\e(B) (?\e$,1'A\e(B . ?\e,Fq\e(B) (?\e$,1'B\e(B . ?\e,Fr\e(B) (?\e$,1'C\e(B . ?\e,Fs\e(B) | |
133 | (?\e$,1'D\e(B . ?\e,Ft\e(B) (?\e$,1'E\e(B . ?\e,Fu\e(B) (?\e$,1'F\e(B . ?\e,Fv\e(B) (?\e$,1'G\e(B . ?\e,Fw\e(B) (?\e$,1'H\e(B . ?\e,Fx\e(B) | |
134 | (?\e$,1'I\e(B . ?\e,Fy\e(B) (?\e$,1'J\e(B . ?\e,Fz\e(B) (?\e$,1'K\e(B . ?\e,F{\e(B) (?\e$,1'L\e(B . ?\e,F|\e(B) (?\e$,1'M\e(B . ?\e,F}\e(B) | |
135 | (?\e$,1'N\e(B . ?\e,F~\e(B) | |
136 | ||
137 | (?\e$,1(!\e(B . ?\e,L!\e(B) (?\e$,1("\e(B . ?\e,L"\e(B) (?\e$,1(#\e(B . ?\e,L#\e(B) (?\e$,1($\e(B . ?\e,L$\e(B) | |
138 | (?\e$,1(%\e(B . ?\e,L%\e(B) (?\e$,1(&\e(B . ?\e,L&\e(B) (?\e$,1('\e(B . ?\e,L'\e(B) (?\e$,1((\e(B . ?\e,L(\e(B) (?\e$,1()\e(B . ?\e,L)\e(B) | |
139 | (?\e$,1(*\e(B . ?\e,L*\e(B) (?\e$,1(+\e(B . ?\e,L+\e(B) (?\e$,1(,\e(B . ?\e,L,\e(B) (?\e$,1(.\e(B . ?\e,L.\e(B) (?\e$,1(/\e(B . ?\e,L/\e(B) | |
140 | (?\e$,1(0\e(B . ?\e,L0\e(B) (?\e$,1(1\e(B . ?\e,L1\e(B) (?\e$,1(2\e(B . ?\e,L2\e(B) (?\e$,1(3\e(B . ?\e,L3\e(B) (?\e$,1(4\e(B . ?\e,L4\e(B) | |
141 | (?\e$,1(5\e(B . ?\e,L5\e(B) (?\e$,1(6\e(B . ?\e,L6\e(B) (?\e$,1(7\e(B . ?\e,L7\e(B) (?\e$,1(8\e(B . ?\e,L8\e(B) (?\e$,1(9\e(B . ?\e,L9\e(B) | |
142 | (?\e$,1(:\e(B . ?\e,L:\e(B) (?\e$,1(;\e(B . ?\e,L;\e(B) (?\e$,1(<\e(B . ?\e,L<\e(B) (?\e$,1(=\e(B . ?\e,L=\e(B) (?\e$,1(>\e(B . ?\e,L>\e(B) | |
143 | (?\e$,1(?\e(B . ?\e,L?\e(B) (?\e$,1(@\e(B . ?\e,L@\e(B) (?\e$,1(A\e(B . ?\e,LA\e(B) (?\e$,1(B\e(B . ?\e,LB\e(B) (?\e$,1(C\e(B . ?\e,LC\e(B) | |
144 | (?\e$,1(D\e(B . ?\e,LD\e(B) (?\e$,1(E\e(B . ?\e,LE\e(B) (?\e$,1(F\e(B . ?\e,LF\e(B) (?\e$,1(G\e(B . ?\e,LG\e(B) (?\e$,1(H\e(B . ?\e,LH\e(B) | |
145 | (?\e$,1(I\e(B . ?\e,LI\e(B) (?\e$,1(J\e(B . ?\e,LJ\e(B) (?\e$,1(K\e(B . ?\e,LK\e(B) (?\e$,1(L\e(B . ?\e,LL\e(B) (?\e$,1(M\e(B . ?\e,LM\e(B) | |
146 | (?\e$,1(N\e(B . ?\e,LN\e(B) (?\e$,1(O\e(B . ?\e,LO\e(B) (?\e$,1(P\e(B . ?\e,LP\e(B) (?\e$,1(Q\e(B . ?\e,LQ\e(B) (?\e$,1(R\e(B . ?\e,LR\e(B) | |
147 | (?\e$,1(S\e(B . ?\e,LS\e(B) (?\e$,1(T\e(B . ?\e,LT\e(B) (?\e$,1(U\e(B . ?\e,LU\e(B) (?\e$,1(V\e(B . ?\e,LV\e(B) (?\e$,1(W\e(B . ?\e,LW\e(B) | |
148 | (?\e$,1(X\e(B . ?\e,LX\e(B) (?\e$,1(Y\e(B . ?\e,LY\e(B) (?\e$,1(Z\e(B . ?\e,LZ\e(B) (?\e$,1([\e(B . ?\e,L[\e(B) (?\e$,1(\\e(B . ?\e,L\\e(B) | |
149 | (?\e$,1(]\e(B . ?\e,L]\e(B) (?\e$,1(^\e(B . ?\e,L^\e(B) (?\e$,1(_\e(B . ?\e,L_\e(B) (?\e$,1(`\e(B . ?\e,L`\e(B) (?\e$,1(a\e(B . ?\e,La\e(B) | |
150 | (?\e$,1(b\e(B . ?\e,Lb\e(B) (?\e$,1(c\e(B . ?\e,Lc\e(B) (?\e$,1(d\e(B . ?\e,Ld\e(B) (?\e$,1(e\e(B . ?\e,Le\e(B) (?\e$,1(f\e(B . ?\e,Lf\e(B) | |
151 | (?\e$,1(g\e(B . ?\e,Lg\e(B) (?\e$,1(h\e(B . ?\e,Lh\e(B) (?\e$,1(i\e(B . ?\e,Li\e(B) (?\e$,1(j\e(B . ?\e,Lj\e(B) (?\e$,1(k\e(B . ?\e,Lk\e(B) | |
152 | (?\e$,1(l\e(B . ?\e,Ll\e(B) (?\e$,1(m\e(B . ?\e,Lm\e(B) (?\e$,1(n\e(B . ?\e,Ln\e(B) (?\e$,1(o\e(B . ?\e,Lo\e(B) (?\e$,1(q\e(B . ?\e,Lq\e(B) | |
153 | (?\e$,1(r\e(B . ?\e,Lr\e(B) (?\e$,1(s\e(B . ?\e,Ls\e(B) (?\e$,1(t\e(B . ?\e,Lt\e(B) (?\e$,1(u\e(B . ?\e,Lu\e(B) (?\e$,1(v\e(B . ?\e,Lv\e(B) | |
154 | (?\e$,1(w\e(B . ?\e,Lw\e(B) (?\e$,1(x\e(B . ?\e,Lx\e(B) (?\e$,1(y\e(B . ?\e,Ly\e(B) (?\e$,1(z\e(B . ?\e,Lz\e(B) (?\e$,1({\e(B . ?\e,L{\e(B) | |
155 | (?\e$,1(|\e(B . ?\e,L|\e(B) (?\e$,1(~\e(B . ?\e,L~\e(B) (?\e$,1(\7f\e(B . ?\e,L\7f\e(B))) | |
156 | ||
ad88f5c5 KH |
157 | |
158 | (defcustom utf-fragment-on-decoding nil | |
159 | "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets. | |
9ca2ac2d DL |
160 | Setting this means that the relevant Cyrillic and Greek characters are |
161 | decoded into the iso8859 charsets rather than into | |
3873f5a5 | 162 | mule-unicode-0100-24ff. The iso8859 charsets take half as much space |
9ca2ac2d DL |
163 | in the buffer, but using them may affect how the buffer can be re-encoded |
164 | and may require a different input method to search for them, for instance. | |
165 | See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' | |
3873f5a5 KH |
166 | for mechanisms to make this largely transparent. |
167 | ||
168 | Setting this variable outside customize has no effect." | |
9ca2ac2d | 169 | :set (lambda (s v) |
ad88f5c5 KH |
170 | (if v |
171 | (progn | |
172 | (define-translation-table 'utf-translation-table-for-decode | |
173 | utf-fragmentation-table) | |
174 | ;; Even if unify-8859-on-encoding-mode is off, make | |
175 | ;; mule-utf-* encode characters in | |
176 | ;; utf-fragmentation-table. | |
177 | (unless (eq (get 'utf-translation-table-for-encode | |
178 | 'translation-table) | |
179 | ucs-mule-to-mule-unicode) | |
180 | (define-translation-table 'utf-translation-table-for-encode | |
535665b8 | 181 | utf-defragmentation-table))) |
ad88f5c5 KH |
182 | (define-translation-table 'utf-translation-table-for-decode) |
183 | ;; When unify-8859-on-encoding-mode is off, be sure to make | |
184 | ;; mule-utf-* disabled for characters in | |
185 | ;; utf-fragmentation-table. | |
186 | (unless (eq (get 'utf-translation-table-for-encode | |
187 | 'translation-table) | |
188 | ucs-mule-to-mule-unicode) | |
535665b8 | 189 | (define-translation-table 'utf-translation-table-for-encode))) |
9ca2ac2d | 190 | (set-default s v)) |
bf247b6e | 191 | :version "22.1" |
9ca2ac2d DL |
192 | :type 'boolean |
193 | :group 'mule) | |
194 | ||
c71c26e9 KH |
195 | |
196 | (defconst utf-translate-cjk-charsets '(chinese-gb2312 | |
197 | chinese-big5-1 chinese-big5-2 | |
198 | japanese-jisx0208 japanese-jisx0212 | |
7d9d5480 | 199 | katakana-jisx0201 |
c71c26e9 KH |
200 | korean-ksc5601) |
201 | "List of charsets supported by `utf-translate-cjk-mode'.") | |
202 | ||
fce59e40 KH |
203 | (defvar utf-translate-cjk-lang-env nil |
204 | "Language environment in which tables for `utf-translate-cjk-mode' is loaded. | |
205 | The value nil means that the tables are not yet loaded.") | |
206 | ||
207 | (defvar utf-translate-cjk-unicode-range) | |
208 | ||
209 | ;; String generated from utf-translate-cjk-unicode-range. It is | |
210 | ;; suitable for an argument to skip-chars-forward. | |
211 | (defvar utf-translate-cjk-unicode-range-string nil) | |
212 | ||
213 | (defun utf-translate-cjk-set-unicode-range (range) | |
214 | (setq utf-translate-cjk-unicode-range range) | |
215 | (setq utf-translate-cjk-unicode-range-string | |
216 | (let ((decode-char-no-trans | |
217 | #'(lambda (x) | |
218 | (cond ((< x #x100) (make-char 'latin-iso8859-1 x)) | |
219 | ((< x #x2500) | |
220 | (setq x (- x #x100)) | |
221 | (make-char 'mule-unicode-0100-24ff | |
222 | (+ (/ x 96) 32) (+ (% x 96) 32))) | |
bf247b6e | 223 | ((< x #x3400) |
fce59e40 KH |
224 | (setq x (- x #x2500)) |
225 | (make-char 'mule-unicode-2500-33ff | |
226 | (+ (/ x 96) 32) (+ (% x 96) 32))) | |
227 | (t | |
228 | (setq x (- x #xe000)) | |
229 | (make-char 'mule-unicode-e000-ffff | |
230 | (+ (/ x 96) 32) (+ (% x 96) 32)))))) | |
231 | ranges from to) | |
232 | (dolist (elt range) | |
233 | (setq from (max #xA0 (car elt)) to (min #xffff (cdr elt))) | |
234 | (if (and (>= to #x3400) (< to #xE000)) | |
235 | (setq to #x33FF)) | |
236 | (cond ((< from #x100) | |
237 | (if (>= to #xE000) | |
238 | (setq ranges (cons (cons #xE000 to) ranges) | |
239 | to #x33FF)) | |
240 | (if (>= to #x2500) | |
241 | (setq ranges (cons (cons #x2500 to) ranges) | |
242 | to #x24FF)) | |
243 | (if (>= to #x100) | |
244 | (setq ranges (cons (cons #x100 to) ranges) | |
245 | to #xFF))) | |
246 | ((< from #x2500) | |
247 | (if (>= to #xE000) | |
248 | (setq ranges (cons (cons #xE000 to) ranges) | |
249 | to #x33FF)) | |
250 | (if (>= to #x2500) | |
251 | (setq ranges (cons (cons #x2500 to) ranges) | |
252 | to #x24FF))) | |
253 | ((< from #x3400) | |
254 | (if (>= to #xE000) | |
255 | (setq ranges (cons (cons #xE000 to) ranges) | |
256 | to #x33FF)))) | |
257 | (if (<= from to) | |
258 | (setq ranges (cons (cons from to) ranges)))) | |
bf247b6e KS |
259 | (mapconcat #'(lambda (x) |
260 | (format "%c-%c" | |
fce59e40 KH |
261 | (funcall decode-char-no-trans (car x)) |
262 | (funcall decode-char-no-trans (cdr x)))) | |
263 | ranges ""))) | |
3ccf95cb KH |
264 | ;; These forces loading and settting tables for |
265 | ;; utf-translate-cjk-mode. | |
266 | (setq utf-translate-cjk-lang-env nil | |
267 | ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) | |
268 | ucs-unicode-to-mule-cjk (make-hash-table :test 'eq))) | |
fce59e40 KH |
269 | |
270 | (defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3) | |
271 | (#xff00 . #xffef)) | |
272 | "List of Unicode code ranges supported by `utf-translate-cjk-mode'. | |
273 | Setting this variable directly does not take effect; | |
274 | use either \\[customize] or the function | |
275 | `utf-translate-cjk-set-unicode-range'." | |
bf247b6e | 276 | :version "22.1" |
fce59e40 KH |
277 | :type '(repeat (cons integer integer)) |
278 | :set (lambda (symbol value) | |
279 | (utf-translate-cjk-set-unicode-range value)) | |
280 | :group 'mule) | |
c71c26e9 KH |
281 | |
282 | ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'. | |
283 | (defsubst utf-translate-cjk-substitutable-p (code-point) | |
284 | (let ((tail utf-translate-cjk-unicode-range) | |
285 | elt) | |
286 | (while tail | |
287 | (setq elt (car tail) tail (cdr tail)) | |
288 | (if (and (>= code-point (car elt)) (<= code-point (cdr elt))) | |
289 | (setq tail nil) | |
290 | (setq elt nil))) | |
291 | elt)) | |
292 | ||
c71c26e9 KH |
293 | (defun utf-translate-cjk-load-tables () |
294 | "Load tables for `utf-translate-cjk-mode'." | |
295 | ;; Fixme: Allow the use of the CJK charsets to be | |
296 | ;; customized by reordering and possible omission. | |
297 | (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000))) | |
298 | (if redefined | |
299 | ;; Redefine them with realistic initial sizes and a | |
300 | ;; smallish rehash size to avoid wasting significant | |
301 | ;; space after they're built. | |
302 | (setq ucs-mule-cjk-to-unicode | |
303 | (make-hash-table :test 'eq :size 43000 :rehash-size 1000) | |
304 | ucs-unicode-to-mule-cjk | |
305 | (make-hash-table :test 'eq :size 21500 :rehash-size 1000))) | |
306 | ||
307 | ;; Load the files explicitly, to avoid having to keep | |
308 | ;; around the large tables they contain (as well as the | |
309 | ;; ones which get built). | |
e314a6e4 KH |
310 | ;; Here we bind coding-system-for-read to nil so that coding tags |
311 | ;; in the files are respected even if the files are not yet | |
312 | ;; byte-compiled | |
d81608e0 CY |
313 | (let ((coding-system-for-read nil) |
314 | ;; We must avoid clobbering this variable, in case the load | |
315 | ;; files below use different coding systems. | |
316 | (last-coding-system-used last-coding-system-used)) | |
e314a6e4 KH |
317 | (cond ((string= "Korean" current-language-environment) |
318 | (load "subst-jis") | |
319 | (load "subst-big5") | |
320 | (load "subst-gb2312") | |
321 | (load "subst-ksc")) | |
322 | ((string= "Chinese-BIG5" current-language-environment) | |
323 | (load "subst-jis") | |
324 | (load "subst-ksc") | |
325 | (load "subst-gb2312") | |
326 | (load "subst-big5")) | |
327 | ((string= "Chinese-GB" current-language-environment) | |
328 | (load "subst-jis") | |
329 | (load "subst-ksc") | |
330 | (load "subst-big5") | |
331 | (load "subst-gb2312")) | |
332 | (t | |
333 | (load "subst-ksc") | |
334 | (load "subst-gb2312") | |
335 | (load "subst-big5") | |
336 | (load "subst-jis")))) ; jis covers as much as big5, gb2312 | |
c71c26e9 KH |
337 | |
338 | (when redefined | |
339 | (define-translation-hash-table 'utf-subst-table-for-decode | |
340 | ucs-unicode-to-mule-cjk) | |
341 | (define-translation-hash-table 'utf-subst-table-for-encode | |
342 | ucs-mule-cjk-to-unicode) | |
343 | (set-char-table-extra-slot (get 'utf-translation-table-for-encode | |
344 | 'translation-table) | |
345 | 1 ucs-mule-cjk-to-unicode)) | |
346 | ||
347 | (setq utf-translate-cjk-lang-env current-language-environment))) | |
348 | ||
349 | (defun utf-lookup-subst-table-for-decode (code-point) | |
350 | (if (and utf-translate-cjk-mode | |
351 | (not utf-translate-cjk-lang-env) | |
352 | (utf-translate-cjk-substitutable-p code-point)) | |
353 | (utf-translate-cjk-load-tables)) | |
354 | (gethash code-point | |
355 | (get 'utf-subst-table-for-decode 'translation-hash-table))) | |
4bcce19c | 356 | |
c71c26e9 KH |
357 | |
358 | (defun utf-lookup-subst-table-for-encode (char) | |
359 | (if (and utf-translate-cjk-mode | |
360 | (not utf-translate-cjk-lang-env) | |
361 | (memq (char-charset char) utf-translate-cjk-charsets)) | |
362 | (utf-translate-cjk-load-tables)) | |
363 | (gethash char | |
364 | (get 'utf-subst-table-for-encode 'translation-hash-table))) | |
4bcce19c | 365 | |
fcfdeaf6 | 366 | (define-minor-mode utf-translate-cjk-mode |
4bcce19c LT |
367 | "Toggle whether UTF based coding systems de/encode CJK characters. |
368 | If ARG is an integer, enable if ARG is positive and disable if | |
369 | zero or negative. This is a minor mode. | |
c71c26e9 | 370 | Enabling this allows the coding systems mule-utf-8, |
72b338a2 | 371 | mule-utf-16le and mule-utf-16be to encode characters in the charsets |
ccdd5c61 DL |
372 | `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', |
373 | `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to | |
374 | decode the corresponding unicodes into such characters. | |
375 | ||
376 | Where the charsets overlap, the one preferred for decoding is chosen | |
377 | according to the language environment in effect when this option is | |
378 | turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for | |
379 | Chinese-Big5 and jisx for other environments. | |
380 | ||
4bcce19c | 381 | This mode is on by default. If you are not interested in CJK |
c71c26e9 | 382 | characters and want to avoid some overhead on encoding/decoding |
4bcce19c LT |
383 | by the above coding systems, you can customize the user option |
384 | `utf-translate-cjk-mode' to nil." | |
c71c26e9 | 385 | :init-value t |
bf247b6e | 386 | :version "22.1" |
9ca2ac2d | 387 | :type 'boolean |
fcfdeaf6 KG |
388 | :group 'mule |
389 | :global t | |
390 | (if utf-translate-cjk-mode | |
fcfdeaf6 | 391 | (progn |
2bf07f07 KH |
392 | (define-translation-hash-table 'utf-subst-table-for-decode |
393 | ucs-unicode-to-mule-cjk) | |
394 | (define-translation-hash-table 'utf-subst-table-for-encode | |
395 | ucs-mule-cjk-to-unicode) | |
396 | (set-char-table-extra-slot (get 'utf-translation-table-for-encode | |
397 | 'translation-table) | |
398 | 1 ucs-mule-cjk-to-unicode)) | |
aea797fc KH |
399 | (define-translation-hash-table 'utf-subst-table-for-decode |
400 | (make-hash-table :test 'eq)) | |
401 | (define-translation-hash-table 'utf-subst-table-for-encode | |
2bf07f07 KH |
402 | (make-hash-table :test 'eq)) |
403 | (set-char-table-extra-slot (get 'utf-translation-table-for-encode | |
404 | 'translation-table) | |
c71c26e9 KH |
405 | 1 nil)) |
406 | ||
407 | ;; Update safe-chars of mule-utf-* coding systems. | |
408 | (dolist (elt (coding-system-list t)) | |
409 | (if (string-match "^mule-utf" (symbol-name elt)) | |
410 | (let ((safe-charsets (coding-system-get elt 'safe-charsets)) | |
411 | (safe-chars (coding-system-get elt 'safe-chars)) | |
412 | (need-update nil)) | |
413 | (dolist (charset utf-translate-cjk-charsets) | |
414 | (unless (eq utf-translate-cjk-mode (memq charset safe-charsets)) | |
415 | (setq safe-charsets | |
416 | (if utf-translate-cjk-mode | |
417 | (cons charset safe-charsets) | |
418 | (delq charset safe-charsets)) | |
419 | need-update t) | |
420 | (aset safe-chars (make-char charset) utf-translate-cjk-mode))) | |
421 | (when need-update | |
422 | (coding-system-put elt 'safe-charsets safe-charsets) | |
423 | (define-coding-system-internal elt)))))) | |
424 | ||
425 | (define-ccl-program ccl-mule-utf-untrans | |
426 | ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or | |
427 | ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF). Write | |
428 | ;; eight-bit-control/graphic sequence (2 to 4 chars) representing | |
429 | ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified. | |
430 | ;; | |
431 | ;; This is a subrountine because we assume that this is called very | |
432 | ;; rarely (so we don't have to worry about the overhead of the | |
433 | ;; call). | |
434 | `(0 | |
435 | ((r5 = ,(charset-id 'eight-bit-control)) | |
436 | (r6 = ,(charset-id 'eight-bit-graphic)) | |
437 | (if (r0 < #x100) | |
438 | ((r4 = ((r0 >> 6) | #xC0)) | |
439 | (write-multibyte-character r6 r4)) | |
440 | ((if (r0 < #x10000) | |
441 | ((r4 = ((r0 >> 12) | #xE0)) | |
442 | (write-multibyte-character r6 r4)) | |
443 | ((r4 = ((r0 >> 18) | #xF0)) | |
444 | (write-multibyte-character r6 r4) | |
445 | (r4 = (((r0 >> 12) & #x3F) | #x80)) | |
446 | (if (r4 < #xA0) | |
447 | (write-multibyte-character r5 r4) | |
448 | (write-multibyte-character r6 r4)))) | |
449 | (r4 = (((r0 >> 6) & #x3F) | #x80)) | |
450 | (if (r4 < #xA0) | |
451 | (write-multibyte-character r5 r4) | |
452 | (write-multibyte-character r6 r4)))) | |
453 | (r4 = ((r0 & #x3F) | #x80)) | |
454 | (if (r4 < #xA0) | |
455 | (write-multibyte-character r5 r4) | |
456 | (write-multibyte-character r6 r4))))) | |
9ca2ac2d | 457 | |
5ba7a870 KH |
458 | (define-ccl-program ccl-decode-mule-utf-8 |
459 | ;; | |
460 | ;; charset | bytes in utf-8 | bytes in emacs | |
461 | ;; -----------------------+----------------+--------------- | |
462 | ;; ascii | 1 | 1 | |
463 | ;; -----------------------+----------------+--------------- | |
464 | ;; eight-bit-control | 2 | 2 | |
aa2e3f49 | 465 | ;; eight-bit-graphic | 2 | 1 |
5ba7a870 KH |
466 | ;; latin-iso8859-1 | 2 | 2 |
467 | ;; -----------------------+----------------+--------------- | |
468 | ;; mule-unicode-0100-24ff | 2 | 4 | |
469 | ;; (< 0800) | | | |
470 | ;; -----------------------+----------------+--------------- | |
471 | ;; mule-unicode-0100-24ff | 3 | 4 | |
472 | ;; (>= 8000) | | | |
473 | ;; mule-unicode-2500-33ff | 3 | 4 | |
474 | ;; mule-unicode-e000-ffff | 3 | 4 | |
c71c26e9 KH |
475 | ;; -----------------------+----------------+--------------- |
476 | ;; invalid byte | 1 | 2 | |
5ba7a870 KH |
477 | ;; |
478 | ;; Thus magnification factor is two. | |
479 | ;; | |
480 | `(2 | |
064cff0b KH |
481 | ((r6 = ,(charset-id 'latin-iso8859-1)) |
482 | (read r0) | |
3d0e328b | 483 | (loop |
5ba7a870 | 484 | (if (r0 < #x80) |
c71c26e9 | 485 | ;; 1-byte encoding, i.e., ascii |
064cff0b KH |
486 | (write-read-repeat r0)) |
487 | (if (r0 < #xc2) | |
488 | ;; continuation byte (invalid here) or 1st byte of overlong | |
489 | ;; 2-byte sequence. | |
c71c26e9 | 490 | ((call ccl-mule-utf-untrans) |
064cff0b KH |
491 | (r6 = ,(charset-id 'latin-iso8859-1)) |
492 | (read r0) | |
c71c26e9 KH |
493 | (repeat))) |
494 | ||
495 | ;; Read the 2nd byte. | |
c71c26e9 KH |
496 | (read r1) |
497 | (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte | |
498 | ((call ccl-mule-utf-untrans) | |
064cff0b | 499 | (r6 = ,(charset-id 'latin-iso8859-1)) |
c71c26e9 KH |
500 | ;; Handle it in the next loop. |
501 | (r0 = r1) | |
502 | (repeat))) | |
503 | ||
504 | (if (r0 < #xe0) | |
505 | ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx | |
064cff0b KH |
506 | ((r1 &= #x3F) |
507 | (r1 |= ((r0 & #x1F) << 6)) | |
3ccf95cb | 508 | ;; Now r1 holds scalar value. We don't have to check |
064cff0b | 509 | ;; `overlong sequence' because r0 >= 0xC2. |
c71c26e9 | 510 | |
064cff0b | 511 | (if (r1 >= 256) |
c71c26e9 | 512 | ;; mule-unicode-0100-24ff (< 0800) |
3ccf95cb KH |
513 | ((r0 = r1) |
514 | (lookup-integer utf-subst-table-for-decode r0 r1) | |
515 | (if (r7 == 0) | |
516 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
517 | (r1 -= #x0100) | |
518 | (r2 = (((r1 / 96) + 32) << 7)) | |
519 | (r1 %= 96) | |
520 | (r1 += (r2 + 32)) | |
521 | (translate-character | |
522 | utf-translation-table-for-decode r0 r1))) | |
064cff0b KH |
523 | (write-multibyte-character r0 r1) |
524 | (read r0) | |
525 | (repeat)) | |
526 | (if (r1 >= 160) | |
527 | ;; latin-iso8859-1 | |
3ccf95cb KH |
528 | ((r0 = r1) |
529 | (lookup-integer utf-subst-table-for-decode r0 r1) | |
530 | (if (r7 == 0) | |
531 | ((r1 -= 128) | |
532 | (write-multibyte-character r6 r1)) | |
533 | ((write-multibyte-character r0 r1))) | |
064cff0b KH |
534 | (read r0) |
535 | (repeat)) | |
536 | ;; eight-bit-control | |
537 | ((r0 = ,(charset-id 'eight-bit-control)) | |
538 | (write-multibyte-character r0 r1) | |
539 | (read r0) | |
540 | (repeat)))))) | |
c71c26e9 KH |
541 | |
542 | ;; Read the 3rd bytes. | |
c71c26e9 KH |
543 | (read r2) |
544 | (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte | |
545 | ((call ccl-mule-utf-untrans) | |
546 | (r0 = r1) | |
547 | (call ccl-mule-utf-untrans) | |
064cff0b | 548 | (r6 = ,(charset-id 'latin-iso8859-1)) |
c71c26e9 KH |
549 | ;; Handle it in the next loop. |
550 | (r0 = r2) | |
551 | (repeat))) | |
552 | ||
553 | (if (r0 < #xF0) | |
554 | ;; 3byte encoding | |
555 | ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
556 | ((r3 = ((r0 & #xF) << 12)) | |
557 | (r3 |= ((r1 & #x3F) << 6)) | |
558 | (r3 |= (r2 & #x3F)) | |
559 | ||
560 | (if (r3 < #x800) ; `overlong sequence' | |
561 | ((call ccl-mule-utf-untrans) | |
562 | (r0 = r1) | |
563 | (call ccl-mule-utf-untrans) | |
564 | (r0 = r2) | |
565 | (call ccl-mule-utf-untrans) | |
064cff0b KH |
566 | (r6 = ,(charset-id 'latin-iso8859-1)) |
567 | (read r0) | |
c71c26e9 KH |
568 | (repeat))) |
569 | ||
570 | (if (r3 < #x2500) | |
571 | ;; mule-unicode-0100-24ff (>= 0800) | |
3ccf95cb KH |
572 | ((r0 = r3) |
573 | (lookup-integer utf-subst-table-for-decode r0 r1) | |
574 | (if (r7 == 0) | |
575 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
576 | (r3 -= #x0100) | |
577 | (r3 //= 96) | |
578 | (r1 = (r7 + 32)) | |
579 | (r1 += ((r3 + 32) << 7)) | |
580 | (translate-character | |
581 | utf-translation-table-for-decode r0 r1))) | |
c71c26e9 | 582 | (write-multibyte-character r0 r1) |
064cff0b | 583 | (read r0) |
c71c26e9 KH |
584 | (repeat))) |
585 | ||
586 | (if (r3 < #x3400) | |
587 | ;; mule-unicode-2500-33ff | |
588 | ((r0 = r3) ; don't zap r3 | |
589 | (lookup-integer utf-subst-table-for-decode r0 r1) | |
590 | (if (r7 == 0) | |
591 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
592 | (r3 -= #x2500) | |
593 | (r3 //= 96) | |
594 | (r1 = (r7 + 32)) | |
595 | (r1 += ((r3 + 32) << 7)))) | |
596 | (write-multibyte-character r0 r1) | |
064cff0b | 597 | (read r0) |
c71c26e9 KH |
598 | (repeat))) |
599 | ||
600 | (if (r3 < #xE000) | |
601 | ;; Try to convert to CJK chars, else | |
602 | ;; keep them as eight-bit-{control|graphic}. | |
603 | ((r0 = r3) | |
604 | (lookup-integer utf-subst-table-for-decode r3 r1) | |
605 | (if r7 | |
606 | ;; got a translation | |
064cff0b KH |
607 | ((write-multibyte-character r3 r1) |
608 | (read r0) | |
609 | (repeat)) | |
610 | ((call ccl-mule-utf-untrans) | |
611 | (r6 = ,(charset-id 'latin-iso8859-1)) | |
612 | (read r0) | |
613 | (repeat))))) | |
c71c26e9 KH |
614 | |
615 | ;; mule-unicode-e000-ffff | |
616 | ;; Fixme: fffe and ffff are invalid. | |
617 | (r0 = r3) ; don't zap r3 | |
618 | (lookup-integer utf-subst-table-for-decode r0 r1) | |
619 | (if (r7 == 0) | |
620 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
621 | (r3 -= #xe000) | |
622 | (r3 //= 96) | |
623 | (r1 = (r7 + 32)) | |
624 | (r1 += ((r3 + 32) << 7)))) | |
625 | (write-multibyte-character r0 r1) | |
064cff0b | 626 | (read r0) |
c71c26e9 KH |
627 | (repeat))) |
628 | ||
629 | ;; Read the 4th bytes. | |
c71c26e9 KH |
630 | (read r3) |
631 | (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte | |
632 | ((call ccl-mule-utf-untrans) | |
633 | (r0 = r1) | |
634 | (call ccl-mule-utf-untrans) | |
064cff0b KH |
635 | (r0 = r2) |
636 | (call ccl-mule-utf-untrans) | |
637 | (r6 = ,(charset-id 'latin-iso8859-1)) | |
c71c26e9 KH |
638 | ;; Handle it in the next loop. |
639 | (r0 = r3) | |
640 | (repeat))) | |
641 | ||
064cff0b | 642 | (if (r0 < #xF8) |
c71c26e9 KH |
643 | ;; 4-byte encoding: |
644 | ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx | |
645 | ;; keep those bytes as eight-bit-{control|graphic} | |
646 | ;; Fixme: allow lookup in utf-subst-table-for-decode. | |
647 | ((r4 = ((r0 & #x7) << 18)) | |
648 | (r4 |= ((r1 & #x3F) << 12)) | |
649 | (r4 |= ((r2 & #x3F) << 6)) | |
650 | (r4 |= (r3 & #x3F)) | |
651 | ||
652 | (if (r4 < #x10000) ; `overlong sequence' | |
653 | ((call ccl-mule-utf-untrans) | |
654 | (r0 = r1) | |
655 | (call ccl-mule-utf-untrans) | |
656 | (r0 = r2) | |
657 | (call ccl-mule-utf-untrans) | |
658 | (r0 = r3) | |
659 | (call ccl-mule-utf-untrans)) | |
660 | ((r0 = r4) | |
064cff0b | 661 | (call ccl-mule-utf-untrans)))) |
c71c26e9 | 662 | |
064cff0b KH |
663 | ;; Unsupported sequence. |
664 | ((call ccl-mule-utf-untrans) | |
665 | (r0 = r1) | |
666 | (call ccl-mule-utf-untrans) | |
667 | (r0 = r2) | |
668 | (call ccl-mule-utf-untrans) | |
669 | (r0 = r3) | |
670 | (call ccl-mule-utf-untrans))) | |
671 | (r6 = ,(charset-id 'latin-iso8859-1)) | |
672 | (read r0) | |
67ff2216 KH |
673 | (repeat))) |
674 | ||
064cff0b | 675 | |
67ff2216 KH |
676 | ;; At EOF... |
677 | (if (r0 >= 0) | |
c71c26e9 KH |
678 | ;; r0 >= #x80 |
679 | ((call ccl-mule-utf-untrans) | |
67ff2216 | 680 | (if (r1 >= 0) |
c71c26e9 KH |
681 | ((r0 = r1) |
682 | (call ccl-mule-utf-untrans) | |
67ff2216 | 683 | (if (r2 >= 0) |
c71c26e9 KH |
684 | ((r0 = r2) |
685 | (call ccl-mule-utf-untrans) | |
67ff2216 | 686 | (if (r3 >= 0) |
c71c26e9 KH |
687 | ((r0 = r3) |
688 | (call ccl-mule-utf-untrans)))))))))) | |
5ba7a870 | 689 | |
c49b8288 | 690 | "CCL program to decode UTF-8. |
74ace46a | 691 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
ad88f5c5 KH |
692 | mule-unicode-*, but see also `utf-fragmentation-table' and |
693 | `ucs-mule-cjk-to-unicode'. | |
9ca2ac2d DL |
694 | Encodings of un-representable Unicode characters are decoded asis into |
695 | eight-bit-control and eight-bit-graphic characters.") | |
5ba7a870 | 696 | |
c71c26e9 KH |
697 | (define-ccl-program ccl-mule-utf-8-encode-untrans |
698 | ;; UTF-8 decoder generates an UTF-8 sequence represented by a | |
699 | ;; sequence eight-bit-control/graphic chars for an untranslatable | |
700 | ;; character and an invalid byte. | |
4bcce19c | 701 | ;; |
c71c26e9 KH |
702 | ;; This CCL parses that sequence (the first byte is already in r1), |
703 | ;; writes out the original bytes of that sequence, and sets r5 to | |
704 | ;; -1. | |
705 | ;; | |
706 | ;; If the eight-bit-control/graphic sequence is shorter than what r1 | |
707 | ;; suggests, it sets r5 and r6 to the last character read that | |
708 | ;; should be handled by the next loop of a caller. | |
709 | ;; | |
710 | ;; Note: For UTF-8 validation, we only check if a character is | |
711 | ;; eight-bit-control/graphic or not. It may result in incorrect | |
712 | ;; handling of random binary data, but such a data can't be encoded | |
713 | ;; by UTF-8 anyway. At least, UTF-8 decoders doesn't generate such | |
714 | ;; a sequence even if a source contains invalid byte-sequence. | |
715 | `(0 | |
716 | (;; Read the 2nd byte. | |
717 | (read-multibyte-character r5 r6) | |
718 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
719 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
4bcce19c | 720 | ((write r1) ; invalid UTF-8 |
c71c26e9 KH |
721 | (r1 = -1) |
722 | (end))) | |
723 | ||
724 | (if (r1 <= #xC3) | |
725 | ;; 2-byte sequence for an originally invalid byte. | |
726 | ((r6 &= #x3F) | |
727 | (r6 |= ((r1 & #x1F) << 6)) | |
728 | (write r6) | |
729 | (r5 = -1) | |
730 | (end))) | |
731 | ||
732 | (write r1 r6) | |
733 | (r2 = r1) | |
734 | (r1 = -1) | |
735 | ;; Read the 3rd byte. | |
736 | (read-multibyte-character r5 r6) | |
4bcce19c | 737 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
c71c26e9 KH |
738 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
739 | (end)) ; invalid UTF-8 | |
740 | (write r6) | |
741 | (if (r2 < #xF0) | |
742 | ;; 3-byte sequence for an untranslated character. | |
743 | ((r5 = -1) | |
744 | (end))) | |
745 | ;; Read the 4th byte. | |
746 | (read-multibyte-character r5 r6) | |
4bcce19c | 747 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
c71c26e9 KH |
748 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
749 | (end)) ; invalid UTF-8 | |
750 | ;; 4-byte sequence for an untranslated character. | |
751 | (write r6) | |
752 | (r5 = -1) | |
753 | (end)) | |
754 | ||
755 | ;; At EOF... | |
756 | ((r5 = -1) | |
757 | (if (r1 >= 0) | |
758 | (write r1))))) | |
759 | ||
5ba7a870 KH |
760 | (define-ccl-program ccl-encode-mule-utf-8 |
761 | `(1 | |
aa15b3e5 KH |
762 | ((r5 = -1) |
763 | (loop | |
764 | (if (r5 < 0) | |
c71c26e9 KH |
765 | (read-multibyte-character r0 r1) |
766 | ;; Pre-read character is in r5 (charset-ID) and r6 (code-point). | |
767 | ((r0 = r5) | |
aa15b3e5 KH |
768 | (r1 = r6) |
769 | (r5 = -1))) | |
c71c26e9 | 770 | (translate-character utf-translation-table-for-encode r0 r1) |
aa15b3e5 KH |
771 | |
772 | (if (r0 == ,(charset-id 'ascii)) | |
c71c26e9 KH |
773 | (write-repeat r1)) |
774 | ||
775 | (if (r0 == ,(charset-id 'latin-iso8859-1)) | |
776 | ;; r1 scalar utf-8 | |
777 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
778 | ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 | |
779 | ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 | |
c1136bda | 780 | ((write ((r1 >> 6) | #xc2)) |
c71c26e9 KH |
781 | (r1 &= #x3f) |
782 | (r1 |= #x80) | |
c71c26e9 KH |
783 | (write-repeat r1))) |
784 | ||
785 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | |
786 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
787 | ;; #x3f80 == (0011 1111 1000 0000)b | |
788 | (r1 &= #x7f) | |
789 | (r1 += (r0 + 224)) ; 240 == -32 + #x0100 | |
790 | ;; now r1 holds scalar value | |
791 | (if (r1 < #x0800) | |
792 | ;; 2byte encoding | |
793 | ((write ((r1 >> 6) | #xC0)) | |
794 | (r1 &= #x3F) | |
795 | (r1 |= #x80) | |
796 | (write-repeat r1)) | |
797 | ;; 3byte encoding | |
798 | ((write ((r1 >> 12) | #xE0)) | |
799 | (write (((r1 & #x0FC0) >> 6) | #x80)) | |
800 | (r1 &= #x3F) | |
801 | (r1 |= #x80) | |
802 | (write-repeat r1))))) | |
803 | ||
804 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
805 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
806 | (r1 &= #x7f) | |
807 | (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | |
808 | ;; now r1 holds scalar value | |
809 | (write ((r1 >> 12) | #xE0)) | |
810 | (write (((r1 & #x0FC0) >> 6) | #x80)) | |
811 | (r1 &= #x3F) | |
812 | (r1 |= #x80) | |
813 | (write-repeat r1))) | |
814 | ||
815 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
816 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
817 | (r1 &= #x7f) | |
818 | (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 | |
819 | ;; now r1 holds scalar value | |
820 | (write ((r1 >> 12) | #xE0)) | |
821 | (write (((r1 & #x0FC0) >> 6) | #x80)) | |
822 | (r1 &= #x3F) | |
823 | (r1 |= #x80) | |
824 | (write-repeat r1))) | |
825 | ||
826 | (if (r0 == ,(charset-id 'eight-bit-control)) | |
827 | ;; r1 scalar utf-8 | |
828 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
829 | ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | |
830 | ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | |
831 | ((write #xC2) | |
832 | (write-repeat r1))) | |
833 | ||
834 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
835 | ;; r1 scalar utf-8 | |
836 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
837 | ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | |
838 | ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | |
839 | ((r0 = (r1 >= #xC0)) | |
840 | (r0 &= (r1 <= #xC3)) | |
841 | (r4 = (r1 >= #xE1)) | |
842 | (r4 &= (r1 <= #xF7)) | |
843 | (r0 |= r4) | |
844 | (if r0 | |
845 | ((call ccl-mule-utf-8-encode-untrans) | |
846 | (repeat)) | |
847 | (write-repeat r1)))) | |
848 | ||
849 | (lookup-character utf-subst-table-for-encode r0 r1) | |
850 | (if r7 ; lookup succeeded | |
851 | (if (r0 < #x800) | |
852 | ;; 2byte encoding | |
853 | ((write ((r0 >> 6) | #xC0)) | |
c1136bda KH |
854 | (r0 = ((r0 & #x3F) | #x80)) |
855 | (write-repeat r0)) | |
c71c26e9 KH |
856 | ;; 3byte encoding |
857 | ((write ((r0 >> 12) | #xE0)) | |
858 | (write (((r0 & #x0FC0) >> 6) | #x80)) | |
c1136bda KH |
859 | (r0 = ((r0 & #x3F) | #x80)) |
860 | (write-repeat r0)))) | |
5ba7a870 | 861 | |
c71c26e9 KH |
862 | ;; Unsupported character. |
863 | ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
864 | (write #xef) | |
865 | (write #xbf) | |
866 | (write-repeat #xbd)))) | |
9ca2ac2d | 867 | "CCL program to encode into UTF-8.") |
5ba7a870 | 868 | |
aa2e3f49 | 869 | |
9ca2ac2d DL |
870 | (define-ccl-program ccl-untranslated-to-ucs |
871 | `(0 | |
c71c26e9 KH |
872 | (if (r1 == 0) |
873 | nil | |
874 | (if (r0 <= #xC3) ; 2-byte encoding | |
875 | ((r0 = ((r0 & #x3) << 6)) | |
876 | (r0 |= (r1 & #x3F)) | |
877 | (r1 = 2)) | |
878 | (if (r2 == 0) | |
879 | (r1 = 0) | |
880 | (if (r0 < #xF0) ; 3-byte encoding, as above | |
881 | ((r0 = ((r0 & #xF) << 12)) | |
882 | (r0 |= ((r1 & #x3F) << 6)) | |
064cff0b | 883 | (r0 |= (r2 & #x3F)) |
c71c26e9 KH |
884 | (r1 = 3)) |
885 | (if (r3 == 0) | |
886 | (r1 = 0) | |
887 | ((r0 = ((r0 & #x7) << 18)) | |
888 | (r0 |= ((r1 & #x3F) << 12)) | |
889 | (r0 |= ((r2 & #x3F) << 6)) | |
890 | (r0 |= (r3 & #x3F)) | |
891 | (r1 = 4)))))))) | |
892 | "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. | |
893 | Set r1 to the byte length. r0 == 0 for invalid sequence.") | |
9ca2ac2d DL |
894 | |
895 | (defvar utf-8-ccl-regs (make-vector 8 0)) | |
896 | ||
aa2e3f49 | 897 | (defsubst utf-8-untranslated-to-ucs () |
9ca2ac2d DL |
898 | "Return the UCS code for an untranslated sequence of raw bytes t point. |
899 | Only for 3- or 4-byte sequences." | |
900 | (aset utf-8-ccl-regs 0 (or (char-after) 0)) | |
901 | (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) | |
902 | (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) | |
903 | (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) | |
c71c26e9 | 904 | (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)) |
aa2e3f49 DL |
905 | |
906 | (defun utf-8-help-echo (window object position) | |
907 | (format "Untranslated Unicode U+%04X" | |
908 | (get-char-property position 'untranslated-utf-8 object))) | |
909 | ||
c71c26e9 KH |
910 | ;; We compose the untranslatable sequences into a single character, |
911 | ;; and move point to the next character. | |
aa2e3f49 DL |
912 | ;; This is infelicitous for editing, because there's currently no |
913 | ;; mechanism for treating compositions as atomic, but is OK for | |
9ca2ac2d DL |
914 | ;; display. They are composed to U+FFFD with help-echo which |
915 | ;; indicates the unicodes they represent. This function GCs too much. | |
c71c26e9 KH |
916 | |
917 | ;; If utf-translate-cjk-mode is non-nil, this function is called with | |
918 | ;; HASH-TABLE which translates CJK characters into some of CJK | |
919 | ;; charsets. | |
920 | ||
921 | (defsubst utf-8-compose (hash-table) | |
922 | "Put a suitable composition on an untranslatable sequence at point. | |
923 | If HASH-TABLE is non-nil, try to translate CJK characters by it at first. | |
924 | Move point to the end of the sequence." | |
925 | (utf-8-untranslated-to-ucs) | |
926 | (let ((l (aref utf-8-ccl-regs 1)) | |
927 | ch) | |
928 | (if (> l 0) | |
929 | (if (and hash-table | |
930 | (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table))) | |
931 | (progn | |
932 | (insert ch) | |
933 | (delete-region (point) (min (point-max) (+ l (point))))) | |
934 | (setq ch (aref utf-8-ccl-regs 0)) | |
935 | (put-text-property (point) (min (point-max) (+ l (point))) | |
936 | 'untranslated-utf-8 ch) | |
937 | (put-text-property (point) (min (point-max) (+ l (point))) | |
938 | 'help-echo 'utf-8-help-echo) | |
939 | (if (= l 2) | |
940 | (put-text-property (point) (min (point-max) (+ l (point))) | |
6e3b8c5d KH |
941 | 'display (propertize (format "\\%03o" ch) |
942 | 'face 'escape-glyph)) | |
c71c26e9 KH |
943 | (compose-region (point) (+ l (point)) ?\e$,3u=\e(B)) |
944 | (forward-char l)) | |
945 | (forward-char 1)))) | |
aa2e3f49 DL |
946 | |
947 | (defcustom utf-8-compose-scripts nil | |
9ca2ac2d | 948 | "*Non-nil means compose various scripts on decoding utf-8 text." |
aa2e3f49 | 949 | :group 'mule |
bf247b6e | 950 | :version "22.1" |
9ca2ac2d | 951 | :type 'boolean) |
aa2e3f49 DL |
952 | |
953 | (defun utf-8-post-read-conversion (length) | |
954 | "Compose untranslated utf-8 sequences into single characters. | |
c71c26e9 | 955 | If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters. |
aa2e3f49 DL |
956 | Also compose particular scripts if `utf-8-compose-scripts' is non-nil." |
957 | (save-excursion | |
c71c26e9 KH |
958 | (save-restriction |
959 | (narrow-to-region (point) (+ (point) length)) | |
960 | ;; Can't do eval-when-compile to insert a multibyte constant | |
961 | ;; version of the string in the loop, since it's always loaded as | |
962 | ;; unibyte from a byte-compiled file. | |
963 | (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7")) | |
11d2e01b | 964 | (buffer-multibyte enable-multibyte-characters) |
c71c26e9 | 965 | hash-table ch) |
11d2e01b | 966 | (set-buffer-multibyte t) |
c71c26e9 | 967 | (when utf-translate-cjk-mode |
fce59e40 KH |
968 | (unless utf-translate-cjk-lang-env |
969 | ;; Check these characters in utf-translate-cjk-range. | |
970 | ;; We may have to translate them to CJK charsets. | |
971 | (skip-chars-forward | |
972 | (concat range utf-translate-cjk-unicode-range-string)) | |
973 | (unless (eobp) | |
974 | (utf-translate-cjk-load-tables) | |
975 | (setq range | |
3ccf95cb KH |
976 | (concat range utf-translate-cjk-unicode-range-string))) |
977 | (setq hash-table (get 'utf-subst-table-for-decode | |
978 | 'translation-hash-table)))) | |
c71c26e9 KH |
979 | (while (and (skip-chars-forward range) |
980 | (not (eobp))) | |
981 | (setq ch (following-char)) | |
982 | (if (< ch 256) | |
983 | (utf-8-compose hash-table) | |
984 | (if (and hash-table | |
985 | (setq ch (gethash (encode-char ch 'ucs) hash-table))) | |
986 | (progn | |
987 | (insert ch) | |
988 | (delete-char 1)) | |
11d2e01b KH |
989 | (forward-char 1)))) |
990 | (or buffer-multibyte | |
991 | (set-buffer-multibyte nil))) | |
c71c26e9 KH |
992 | |
993 | (when (and utf-8-compose-scripts (> length 1)) | |
7c095357 KH |
994 | ;; This let-binding avoids recursive auto-loading. And, we |
995 | ;; anyway don't have to run the following code while | |
996 | ;; auto-loading. | |
997 | (let ((utf-8-compose-scripts nil)) | |
998 | ;; These currently have definitions which cover the relevant | |
999 | ;; unicodes. We could avoid loading thai-util &c by checking | |
1000 | ;; whether the region contains any characters with the appropriate | |
1001 | ;; categories. There aren't yet Unicode-based rules for Tibetan. | |
1002 | (diacritic-compose-region (point-max) (point-min)) | |
1003 | (thai-compose-region (point-max) (point-min)) | |
1004 | (lao-compose-region (point-max) (point-min)) | |
1005 | (devanagari-compose-region (point-max) (point-min)) | |
1006 | (malayalam-compose-region (point-max) (point-min)) | |
1007 | (tamil-compose-region (point-max) (point-min)))) | |
c71c26e9 KH |
1008 | (- (point-max) (point-min))))) |
1009 | ||
1010 | (defun utf-8-pre-write-conversion (beg end) | |
1011 | "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END. | |
1012 | This is used as a post-read-conversion of utf-8 coding system." | |
1013 | (if (and utf-translate-cjk-mode | |
1014 | (not utf-translate-cjk-lang-env) | |
bfd14269 KH |
1015 | (if (stringp beg) |
1016 | (string-match "\\cc\\|\\cj\\|\\ch" beg) | |
1017 | (save-excursion | |
1018 | (goto-char beg) | |
1019 | (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))) | |
c71c26e9 KH |
1020 | (utf-translate-cjk-load-tables)) |
1021 | nil) | |
aa2e3f49 | 1022 | |
5ba7a870 KH |
1023 | (make-coding-system |
1024 | 'mule-utf-8 4 ?u | |
1025 | "UTF-8 encoding for Emacs-supported Unicode characters. | |
ad88f5c5 KH |
1026 | It supports Unicode characters of these ranges: |
1027 | U+0000..U+33FF, U+E000..U+FFFF. | |
1028 | They correspond to these Emacs character sets: | |
1029 | ascii, latin-iso8859-1, mule-unicode-0100-24ff, | |
1030 | mule-unicode-2500-33ff, mule-unicode-e000-ffff | |
1031 | ||
1032 | On decoding (e.g. reading a file), Unicode characters not in the above | |
1033 | ranges are decoded into sequences of eight-bit-control and | |
1034 | eight-bit-graphic characters to preserve their byte sequences. The | |
1035 | byte sequence is preserved on i/o for valid utf-8, but not necessarily | |
1036 | for invalid utf-8. | |
1037 | ||
1038 | On encoding (e.g. writing a file), Emacs characters not belonging to | |
1039 | any of the character sets listed above are encoded into the UTF-8 byte | |
1040 | sequence representing U+FFFD (REPLACEMENT CHARACTER)." | |
5ba7a870 KH |
1041 | |
1042 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
c71c26e9 | 1043 | `((safe-charsets |
5ba7a870 KH |
1044 | ascii |
1045 | eight-bit-control | |
1046 | eight-bit-graphic | |
1047 | latin-iso8859-1 | |
1048 | mule-unicode-0100-24ff | |
1049 | mule-unicode-2500-33ff | |
c71c26e9 KH |
1050 | mule-unicode-e000-ffff |
1051 | ,@(if utf-translate-cjk-mode | |
1052 | utf-translate-cjk-charsets)) | |
87ae7973 | 1053 | (mime-charset . utf-8) |
75f6d723 | 1054 | (coding-category . coding-category-utf-8) |
aa2e3f49 | 1055 | (valid-codes (0 . 255)) |
c71c26e9 | 1056 | (pre-write-conversion . utf-8-pre-write-conversion) |
ad88f5c5 | 1057 | (post-read-conversion . utf-8-post-read-conversion) |
2bf07f07 | 1058 | (translation-table-for-encode . utf-translation-table-for-encode) |
ad88f5c5 KH |
1059 | (dependency unify-8859-on-encoding-mode |
1060 | unify-8859-on-decoding-mode | |
1061 | utf-fragment-on-decoding | |
9e24a165 | 1062 | utf-translate-cjk-mode))) |
5ba7a870 KH |
1063 | |
1064 | (define-coding-system-alias 'utf-8 'mule-utf-8) | |
e8af40ee | 1065 | |
aa2e3f49 DL |
1066 | ;; I think this needs special private charsets defined for the |
1067 | ;; untranslated sequences, if it's going to work well. | |
1068 | ||
1069 | ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
1070 | ;;; (let* ((prop (get-char-property pos 'composition string)) | |
1071 | ;;; (l (and prop (- (cadr prop) (car prop))))) | |
1072 | ;;; (cond ((and l (> l (- to pos))) | |
1073 | ;;; (delete-region pos to)) | |
1074 | ;;; ((and (> (char-after pos) 224) | |
1075 | ;;; (< (char-after pos) 256) | |
1076 | ;;; (save-restriction | |
1077 | ;;; (narrow-to-region pos to) | |
1078 | ;;; (utf-8-compose))) | |
1079 | ;;; t)))) | |
1080 | ||
1081 | ;;; (dotimes (i 96) | |
1082 | ;;; (aset composition-function-table | |
1083 | ;;; (+ 128 i) | |
1084 | ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
1085 | ;;; . utf-8-compose-function)))) | |
1086 | ||
ab5796a9 | 1087 | ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5 |
e8af40ee | 1088 | ;;; utf-8.el ends here |