(ogonek-jak, ogonek-how): Don't use beginning-of-buffer.
[bpt/emacs.git] / lisp / international / utf-8.el
CommitLineData
9ca2ac2d 1;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
5ba7a870 2
2fd125a3
KH
3;; Copyright (C) 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
4;; Copyright (C) 2001, 2002, 2003, 2004
5;; National Institute of Advanced Industrial Science and Technology (AIST)
6;; Registration Number H14PRO021
5ba7a870 7
aa15b3e5 8;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
9ca2ac2d 9;; Maintainer: FSF
c49b8288 10;; Keywords: multilingual, Unicode, UTF-8, i18n
5ba7a870
KH
11
12;; This file is part of GNU Emacs.
13
14;; GNU Emacs is free software; you can redistribute it and/or modify
15;; it under the terms of the GNU General Public License as published by
16;; the Free Software Foundation; either version 2, or (at your option)
17;; any later version.
18
19;; GNU Emacs is distributed in the hope that it will be useful,
20;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22;; GNU General Public License for more details.
23
24;; You should have received a copy of the GNU General Public License
25;; along with GNU Emacs; see the file COPYING. If not, write to the
26;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27;; Boston, MA 02111-1307, USA.
28
29;;; Commentary:
30
aa2e3f49
DL
31;; The coding-system `mule-utf-8' basically supports encoding/decoding
32;; of the following character sets to and from UTF-8:
5ba7a870
KH
33;;
34;; ascii
35;; eight-bit-control
36;; latin-iso8859-1
37;; mule-unicode-0100-24ff
38;; mule-unicode-2500-33ff
39;; mule-unicode-e000-ffff
40;;
c49b8288
DL
41;; On decoding, Unicode characters that do not fit into the above
42;; character sets are handled as `eight-bit-control' or
43;; `eight-bit-graphic' characters to retain the information about the
9ca2ac2d
DL
44;; original byte sequence and text properties record the corresponding
45;; unicode.
46;;
47;; Fixme: note that reading and writing invalid utf-8 may not be
48;; idempotent -- to represent the bytes to fix that needs a new charset.
aa2e3f49 49;;
ad88f5c5 50;; Characters from other character sets can be encoded with mule-utf-8
ccdd5c61 51;; by populating the translation table
535665b8 52;; `utf-translation-table-for-encode'. Hash tables
ad88f5c5
KH
53;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
54;; used to support encoding and decoding of about a quarter of the CJK
55;; space between U+3400 and U+DFFF.
c49b8288 56
81639ac3 57;; UTF-8 is defined in RFC 3629. A sketch of the encoding is:
5ba7a870
KH
58
59;; scalar | utf-8
60;; value | 1st byte | 2nd byte | 3rd byte
61;; --------------------+-----------+-----------+----------
62;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
63;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
64;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
65
66;;; Code:
67
ad88f5c5
KH
68(defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
69 "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
9ca2ac2d 70
ad88f5c5
KH
71If `unify-8859-on-encoding-mode' is non-nil, this table populates the
72translation-table named `utf-translation-table-for-encode'.")
73
74(define-translation-table 'utf-translation-table-for-encode)
9ca2ac2d 75
9ca2ac2d
DL
76
77;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
78;; space of mule-unicode. For Latin scripts this isn't very
79;; important. Hebrew and Arabic might go here too when there's proper
80;; support for them.
ad88f5c5
KH
81
82(defvar utf-fragmentation-table (make-char-table 'translation-table nil)
83 "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
84
85If `utf-fragment-on-decoding' is non-nil, this table populates the
86translation-table named `utf-translation-table-for-decode'")
87
88(defvar utf-defragmentation-table (make-char-table 'translation-table nil)
89 "Char-table for reverse mapping of `utf-fragmentation-table'.
90
91If `utf-fragment-on-decoding' is non-nil and
92`unify-8859-on-encoding-mode' is nil, this table populates the
93translation-table named `utf-translation-table-for-encode'")
94
95(define-translation-table 'utf-translation-table-for-decode)
96
97
0c8410d5 98(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
ad88f5c5
KH
99 "Hash table mapping Emacs CJK character sets to Unicode code points.
100
9e24a165 101If `utf-translate-cjk-mode' is non-nil, this table populates the
ad88f5c5
KH
102translation-hash-table named `utf-subst-table-for-encode'.")
103
0c8410d5
DL
104(define-translation-hash-table 'utf-subst-table-for-encode
105 ucs-mule-cjk-to-unicode)
ad88f5c5 106
0c8410d5 107(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
ad88f5c5
KH
108 "Hash table mapping Unicode code points to Emacs CJK character sets.
109
9e24a165 110If `utf-translate-cjk-mode' is non-nil, this table populates the
ad88f5c5
KH
111translation-hash-table named `utf-subst-table-for-decode'.")
112
113(define-translation-hash-table 'utf-subst-table-for-decode
0c8410d5 114 ucs-unicode-to-mule-cjk)
ad88f5c5 115
9ca2ac2d
DL
116(mapc
117 (lambda (pair)
ad88f5c5
KH
118 (aset utf-fragmentation-table (car pair) (cdr pair))
119 (aset utf-defragmentation-table (cdr pair) (car pair)))
9ca2ac2d
DL
120 '((?\e$,1&d\e(B . ?\e,F4\e(B) (?\e$,1&e\e(B . ?\e,F5\e(B) (?\e$,1&f\e(B . ?\e,F6\e(B) (?\e$,1&h\e(B . ?\e,F8\e(B) (?\e$,1&i\e(B . ?\e,F9\e(B)
121 (?\e$,1&j\e(B . ?\e,F:\e(B) (?\e$,1&l\e(B . ?\e,F<\e(B) (?\e$,1&n\e(B . ?\e,F>\e(B) (?\e$,1&o\e(B . ?\e,F?\e(B) (?\e$,1&p\e(B . ?\e,F@\e(B)
122 (?\e$,1&q\e(B . ?\e,FA\e(B) (?\e$,1&r\e(B . ?\e,FB\e(B) (?\e$,1&s\e(B . ?\e,FC\e(B) (?\e$,1&t\e(B . ?\e,FD\e(B) (?\e$,1&u\e(B . ?\e,FE\e(B)
123 (?\e$,1&v\e(B . ?\e,FF\e(B) (?\e$,1&w\e(B . ?\e,FG\e(B) (?\e$,1&x\e(B . ?\e,FH\e(B) (?\e$,1&y\e(B . ?\e,FI\e(B) (?\e$,1&z\e(B . ?\e,FJ\e(B)
124 (?\e$,1&{\e(B . ?\e,FK\e(B) (?\e$,1&|\e(B . ?\e,FL\e(B) (?\e$,1&}\e(B . ?\e,FM\e(B) (?\e$,1&~\e(B . ?\e,FN\e(B) (?\e$,1&\7f\e(B . ?\e,FO\e(B)
125 (?\e$,1' \e(B . ?\e,FP\e(B) (?\e$,1'!\e(B . ?\e,FQ\e(B) (?\e$,1'#\e(B . ?\e,FS\e(B) (?\e$,1'$\e(B . ?\e,FT\e(B) (?\e$,1'%\e(B . ?\e,FU\e(B)
126 (?\e$,1'&\e(B . ?\e,FV\e(B) (?\e$,1''\e(B . ?\e,FW\e(B) (?\e$,1'(\e(B . ?\e,FX\e(B) (?\e$,1')\e(B . ?\e,FY\e(B) (?\e$,1'*\e(B . ?\e,FZ\e(B)
127 (?\e$,1'+\e(B . ?\e,F[\e(B) (?\e$,1',\e(B . ?\e,F\\e(B) (?\e$,1'-\e(B . ?\e,F]\e(B) (?\e$,1'.\e(B . ?\e,F^\e(B) (?\e$,1'/\e(B . ?\e,F_\e(B)
128 (?\e$,1'0\e(B . ?\e,F`\e(B) (?\e$,1'1\e(B . ?\e,Fa\e(B) (?\e$,1'2\e(B . ?\e,Fb\e(B) (?\e$,1'3\e(B . ?\e,Fc\e(B) (?\e$,1'4\e(B . ?\e,Fd\e(B)
129 (?\e$,1'5\e(B . ?\e,Fe\e(B) (?\e$,1'6\e(B . ?\e,Ff\e(B) (?\e$,1'7\e(B . ?\e,Fg\e(B) (?\e$,1'8\e(B . ?\e,Fh\e(B) (?\e$,1'9\e(B . ?\e,Fi\e(B)
130 (?\e$,1':\e(B . ?\e,Fj\e(B) (?\e$,1';\e(B . ?\e,Fk\e(B) (?\e$,1'<\e(B . ?\e,Fl\e(B) (?\e$,1'=\e(B . ?\e,Fm\e(B) (?\e$,1'>\e(B . ?\e,Fn\e(B)
131 (?\e$,1'?\e(B . ?\e,Fo\e(B) (?\e$,1'@\e(B . ?\e,Fp\e(B) (?\e$,1'A\e(B . ?\e,Fq\e(B) (?\e$,1'B\e(B . ?\e,Fr\e(B) (?\e$,1'C\e(B . ?\e,Fs\e(B)
132 (?\e$,1'D\e(B . ?\e,Ft\e(B) (?\e$,1'E\e(B . ?\e,Fu\e(B) (?\e$,1'F\e(B . ?\e,Fv\e(B) (?\e$,1'G\e(B . ?\e,Fw\e(B) (?\e$,1'H\e(B . ?\e,Fx\e(B)
133 (?\e$,1'I\e(B . ?\e,Fy\e(B) (?\e$,1'J\e(B . ?\e,Fz\e(B) (?\e$,1'K\e(B . ?\e,F{\e(B) (?\e$,1'L\e(B . ?\e,F|\e(B) (?\e$,1'M\e(B . ?\e,F}\e(B)
134 (?\e$,1'N\e(B . ?\e,F~\e(B)
135
136 (?\e$,1(!\e(B . ?\e,L!\e(B) (?\e$,1("\e(B . ?\e,L"\e(B) (?\e$,1(#\e(B . ?\e,L#\e(B) (?\e$,1($\e(B . ?\e,L$\e(B)
137 (?\e$,1(%\e(B . ?\e,L%\e(B) (?\e$,1(&\e(B . ?\e,L&\e(B) (?\e$,1('\e(B . ?\e,L'\e(B) (?\e$,1((\e(B . ?\e,L(\e(B) (?\e$,1()\e(B . ?\e,L)\e(B)
138 (?\e$,1(*\e(B . ?\e,L*\e(B) (?\e$,1(+\e(B . ?\e,L+\e(B) (?\e$,1(,\e(B . ?\e,L,\e(B) (?\e$,1(.\e(B . ?\e,L.\e(B) (?\e$,1(/\e(B . ?\e,L/\e(B)
139 (?\e$,1(0\e(B . ?\e,L0\e(B) (?\e$,1(1\e(B . ?\e,L1\e(B) (?\e$,1(2\e(B . ?\e,L2\e(B) (?\e$,1(3\e(B . ?\e,L3\e(B) (?\e$,1(4\e(B . ?\e,L4\e(B)
140 (?\e$,1(5\e(B . ?\e,L5\e(B) (?\e$,1(6\e(B . ?\e,L6\e(B) (?\e$,1(7\e(B . ?\e,L7\e(B) (?\e$,1(8\e(B . ?\e,L8\e(B) (?\e$,1(9\e(B . ?\e,L9\e(B)
141 (?\e$,1(:\e(B . ?\e,L:\e(B) (?\e$,1(;\e(B . ?\e,L;\e(B) (?\e$,1(<\e(B . ?\e,L<\e(B) (?\e$,1(=\e(B . ?\e,L=\e(B) (?\e$,1(>\e(B . ?\e,L>\e(B)
142 (?\e$,1(?\e(B . ?\e,L?\e(B) (?\e$,1(@\e(B . ?\e,L@\e(B) (?\e$,1(A\e(B . ?\e,LA\e(B) (?\e$,1(B\e(B . ?\e,LB\e(B) (?\e$,1(C\e(B . ?\e,LC\e(B)
143 (?\e$,1(D\e(B . ?\e,LD\e(B) (?\e$,1(E\e(B . ?\e,LE\e(B) (?\e$,1(F\e(B . ?\e,LF\e(B) (?\e$,1(G\e(B . ?\e,LG\e(B) (?\e$,1(H\e(B . ?\e,LH\e(B)
144 (?\e$,1(I\e(B . ?\e,LI\e(B) (?\e$,1(J\e(B . ?\e,LJ\e(B) (?\e$,1(K\e(B . ?\e,LK\e(B) (?\e$,1(L\e(B . ?\e,LL\e(B) (?\e$,1(M\e(B . ?\e,LM\e(B)
145 (?\e$,1(N\e(B . ?\e,LN\e(B) (?\e$,1(O\e(B . ?\e,LO\e(B) (?\e$,1(P\e(B . ?\e,LP\e(B) (?\e$,1(Q\e(B . ?\e,LQ\e(B) (?\e$,1(R\e(B . ?\e,LR\e(B)
146 (?\e$,1(S\e(B . ?\e,LS\e(B) (?\e$,1(T\e(B . ?\e,LT\e(B) (?\e$,1(U\e(B . ?\e,LU\e(B) (?\e$,1(V\e(B . ?\e,LV\e(B) (?\e$,1(W\e(B . ?\e,LW\e(B)
147 (?\e$,1(X\e(B . ?\e,LX\e(B) (?\e$,1(Y\e(B . ?\e,LY\e(B) (?\e$,1(Z\e(B . ?\e,LZ\e(B) (?\e$,1([\e(B . ?\e,L[\e(B) (?\e$,1(\\e(B . ?\e,L\\e(B)
148 (?\e$,1(]\e(B . ?\e,L]\e(B) (?\e$,1(^\e(B . ?\e,L^\e(B) (?\e$,1(_\e(B . ?\e,L_\e(B) (?\e$,1(`\e(B . ?\e,L`\e(B) (?\e$,1(a\e(B . ?\e,La\e(B)
149 (?\e$,1(b\e(B . ?\e,Lb\e(B) (?\e$,1(c\e(B . ?\e,Lc\e(B) (?\e$,1(d\e(B . ?\e,Ld\e(B) (?\e$,1(e\e(B . ?\e,Le\e(B) (?\e$,1(f\e(B . ?\e,Lf\e(B)
150 (?\e$,1(g\e(B . ?\e,Lg\e(B) (?\e$,1(h\e(B . ?\e,Lh\e(B) (?\e$,1(i\e(B . ?\e,Li\e(B) (?\e$,1(j\e(B . ?\e,Lj\e(B) (?\e$,1(k\e(B . ?\e,Lk\e(B)
151 (?\e$,1(l\e(B . ?\e,Ll\e(B) (?\e$,1(m\e(B . ?\e,Lm\e(B) (?\e$,1(n\e(B . ?\e,Ln\e(B) (?\e$,1(o\e(B . ?\e,Lo\e(B) (?\e$,1(q\e(B . ?\e,Lq\e(B)
152 (?\e$,1(r\e(B . ?\e,Lr\e(B) (?\e$,1(s\e(B . ?\e,Ls\e(B) (?\e$,1(t\e(B . ?\e,Lt\e(B) (?\e$,1(u\e(B . ?\e,Lu\e(B) (?\e$,1(v\e(B . ?\e,Lv\e(B)
153 (?\e$,1(w\e(B . ?\e,Lw\e(B) (?\e$,1(x\e(B . ?\e,Lx\e(B) (?\e$,1(y\e(B . ?\e,Ly\e(B) (?\e$,1(z\e(B . ?\e,Lz\e(B) (?\e$,1({\e(B . ?\e,L{\e(B)
154 (?\e$,1(|\e(B . ?\e,L|\e(B) (?\e$,1(~\e(B . ?\e,L~\e(B) (?\e$,1(\7f\e(B . ?\e,L\7f\e(B)))
155
ad88f5c5
KH
156
157(defcustom utf-fragment-on-decoding nil
158 "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
9ca2ac2d
DL
159Setting this means that the relevant Cyrillic and Greek characters are
160decoded into the iso8859 charsets rather than into
3873f5a5 161mule-unicode-0100-24ff. The iso8859 charsets take half as much space
9ca2ac2d
DL
162in the buffer, but using them may affect how the buffer can be re-encoded
163and may require a different input method to search for them, for instance.
164See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
3873f5a5
KH
165for mechanisms to make this largely transparent.
166
167Setting this variable outside customize has no effect."
9ca2ac2d 168 :set (lambda (s v)
ad88f5c5
KH
169 (if v
170 (progn
171 (define-translation-table 'utf-translation-table-for-decode
172 utf-fragmentation-table)
173 ;; Even if unify-8859-on-encoding-mode is off, make
174 ;; mule-utf-* encode characters in
175 ;; utf-fragmentation-table.
176 (unless (eq (get 'utf-translation-table-for-encode
177 'translation-table)
178 ucs-mule-to-mule-unicode)
179 (define-translation-table 'utf-translation-table-for-encode
535665b8 180 utf-defragmentation-table)))
ad88f5c5
KH
181 (define-translation-table 'utf-translation-table-for-decode)
182 ;; When unify-8859-on-encoding-mode is off, be sure to make
183 ;; mule-utf-* disabled for characters in
184 ;; utf-fragmentation-table.
185 (unless (eq (get 'utf-translation-table-for-encode
186 'translation-table)
187 ucs-mule-to-mule-unicode)
535665b8 188 (define-translation-table 'utf-translation-table-for-encode)))
9ca2ac2d 189 (set-default s v))
bf247b6e 190 :version "22.1"
9ca2ac2d
DL
191 :type 'boolean
192 :group 'mule)
193
c71c26e9
KH
194
195(defconst utf-translate-cjk-charsets '(chinese-gb2312
196 chinese-big5-1 chinese-big5-2
197 japanese-jisx0208 japanese-jisx0212
7d9d5480 198 katakana-jisx0201
c71c26e9
KH
199 korean-ksc5601)
200 "List of charsets supported by `utf-translate-cjk-mode'.")
201
fce59e40
KH
202(defvar utf-translate-cjk-lang-env nil
203 "Language environment in which tables for `utf-translate-cjk-mode' is loaded.
204The value nil means that the tables are not yet loaded.")
205
206(defvar utf-translate-cjk-unicode-range)
207
208;; String generated from utf-translate-cjk-unicode-range. It is
209;; suitable for an argument to skip-chars-forward.
210(defvar utf-translate-cjk-unicode-range-string nil)
211
212(defun utf-translate-cjk-set-unicode-range (range)
213 (setq utf-translate-cjk-unicode-range range)
214 (setq utf-translate-cjk-unicode-range-string
215 (let ((decode-char-no-trans
216 #'(lambda (x)
217 (cond ((< x #x100) (make-char 'latin-iso8859-1 x))
218 ((< x #x2500)
219 (setq x (- x #x100))
220 (make-char 'mule-unicode-0100-24ff
221 (+ (/ x 96) 32) (+ (% x 96) 32)))
bf247b6e 222 ((< x #x3400)
fce59e40
KH
223 (setq x (- x #x2500))
224 (make-char 'mule-unicode-2500-33ff
225 (+ (/ x 96) 32) (+ (% x 96) 32)))
226 (t
227 (setq x (- x #xe000))
228 (make-char 'mule-unicode-e000-ffff
229 (+ (/ x 96) 32) (+ (% x 96) 32))))))
230 ranges from to)
231 (dolist (elt range)
232 (setq from (max #xA0 (car elt)) to (min #xffff (cdr elt)))
233 (if (and (>= to #x3400) (< to #xE000))
234 (setq to #x33FF))
235 (cond ((< from #x100)
236 (if (>= to #xE000)
237 (setq ranges (cons (cons #xE000 to) ranges)
238 to #x33FF))
239 (if (>= to #x2500)
240 (setq ranges (cons (cons #x2500 to) ranges)
241 to #x24FF))
242 (if (>= to #x100)
243 (setq ranges (cons (cons #x100 to) ranges)
244 to #xFF)))
245 ((< from #x2500)
246 (if (>= to #xE000)
247 (setq ranges (cons (cons #xE000 to) ranges)
248 to #x33FF))
249 (if (>= to #x2500)
250 (setq ranges (cons (cons #x2500 to) ranges)
251 to #x24FF)))
252 ((< from #x3400)
253 (if (>= to #xE000)
254 (setq ranges (cons (cons #xE000 to) ranges)
255 to #x33FF))))
256 (if (<= from to)
257 (setq ranges (cons (cons from to) ranges))))
bf247b6e
KS
258 (mapconcat #'(lambda (x)
259 (format "%c-%c"
fce59e40
KH
260 (funcall decode-char-no-trans (car x))
261 (funcall decode-char-no-trans (cdr x))))
262 ranges "")))
3ccf95cb
KH
263 ;; These forces loading and settting tables for
264 ;; utf-translate-cjk-mode.
265 (setq utf-translate-cjk-lang-env nil
266 ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
267 ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)))
fce59e40
KH
268
269(defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3)
270 (#xff00 . #xffef))
271 "List of Unicode code ranges supported by `utf-translate-cjk-mode'.
272Setting this variable directly does not take effect;
273use either \\[customize] or the function
274`utf-translate-cjk-set-unicode-range'."
bf247b6e 275 :version "22.1"
fce59e40
KH
276 :type '(repeat (cons integer integer))
277 :set (lambda (symbol value)
278 (utf-translate-cjk-set-unicode-range value))
279 :group 'mule)
c71c26e9
KH
280
281;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
282(defsubst utf-translate-cjk-substitutable-p (code-point)
283 (let ((tail utf-translate-cjk-unicode-range)
284 elt)
285 (while tail
286 (setq elt (car tail) tail (cdr tail))
287 (if (and (>= code-point (car elt)) (<= code-point (cdr elt)))
288 (setq tail nil)
289 (setq elt nil)))
290 elt))
291
c71c26e9
KH
292(defun utf-translate-cjk-load-tables ()
293 "Load tables for `utf-translate-cjk-mode'."
294 ;; Fixme: Allow the use of the CJK charsets to be
295 ;; customized by reordering and possible omission.
296 (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
297 (if redefined
298 ;; Redefine them with realistic initial sizes and a
299 ;; smallish rehash size to avoid wasting significant
300 ;; space after they're built.
301 (setq ucs-mule-cjk-to-unicode
302 (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
303 ucs-unicode-to-mule-cjk
304 (make-hash-table :test 'eq :size 21500 :rehash-size 1000)))
305
306 ;; Load the files explicitly, to avoid having to keep
307 ;; around the large tables they contain (as well as the
308 ;; ones which get built).
e314a6e4
KH
309 ;; Here we bind coding-system-for-read to nil so that coding tags
310 ;; in the files are respected even if the files are not yet
311 ;; byte-compiled
312 (let ((coding-system-for-read nil))
313 (cond ((string= "Korean" current-language-environment)
314 (load "subst-jis")
315 (load "subst-big5")
316 (load "subst-gb2312")
317 (load "subst-ksc"))
318 ((string= "Chinese-BIG5" current-language-environment)
319 (load "subst-jis")
320 (load "subst-ksc")
321 (load "subst-gb2312")
322 (load "subst-big5"))
323 ((string= "Chinese-GB" current-language-environment)
324 (load "subst-jis")
325 (load "subst-ksc")
326 (load "subst-big5")
327 (load "subst-gb2312"))
328 (t
329 (load "subst-ksc")
330 (load "subst-gb2312")
331 (load "subst-big5")
332 (load "subst-jis")))) ; jis covers as much as big5, gb2312
c71c26e9
KH
333
334 (when redefined
335 (define-translation-hash-table 'utf-subst-table-for-decode
336 ucs-unicode-to-mule-cjk)
337 (define-translation-hash-table 'utf-subst-table-for-encode
338 ucs-mule-cjk-to-unicode)
339 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
340 'translation-table)
341 1 ucs-mule-cjk-to-unicode))
342
343 (setq utf-translate-cjk-lang-env current-language-environment)))
344
345(defun utf-lookup-subst-table-for-decode (code-point)
346 (if (and utf-translate-cjk-mode
347 (not utf-translate-cjk-lang-env)
348 (utf-translate-cjk-substitutable-p code-point))
349 (utf-translate-cjk-load-tables))
350 (gethash code-point
351 (get 'utf-subst-table-for-decode 'translation-hash-table)))
4bcce19c 352
c71c26e9
KH
353
354(defun utf-lookup-subst-table-for-encode (char)
355 (if (and utf-translate-cjk-mode
356 (not utf-translate-cjk-lang-env)
357 (memq (char-charset char) utf-translate-cjk-charsets))
358 (utf-translate-cjk-load-tables))
359 (gethash char
360 (get 'utf-subst-table-for-encode 'translation-hash-table)))
4bcce19c 361
fcfdeaf6 362(define-minor-mode utf-translate-cjk-mode
4bcce19c
LT
363 "Toggle whether UTF based coding systems de/encode CJK characters.
364If ARG is an integer, enable if ARG is positive and disable if
365zero or negative. This is a minor mode.
c71c26e9 366Enabling this allows the coding systems mule-utf-8,
72b338a2 367mule-utf-16le and mule-utf-16be to encode characters in the charsets
ccdd5c61
DL
368`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
369`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
370decode the corresponding unicodes into such characters.
371
372Where the charsets overlap, the one preferred for decoding is chosen
373according to the language environment in effect when this option is
374turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
375Chinese-Big5 and jisx for other environments.
376
4bcce19c 377This mode is on by default. If you are not interested in CJK
c71c26e9 378characters and want to avoid some overhead on encoding/decoding
4bcce19c
LT
379by the above coding systems, you can customize the user option
380`utf-translate-cjk-mode' to nil."
c71c26e9 381 :init-value t
bf247b6e 382 :version "22.1"
9ca2ac2d 383 :type 'boolean
fcfdeaf6
KG
384 :group 'mule
385 :global t
386 (if utf-translate-cjk-mode
fcfdeaf6 387 (progn
2bf07f07
KH
388 (define-translation-hash-table 'utf-subst-table-for-decode
389 ucs-unicode-to-mule-cjk)
390 (define-translation-hash-table 'utf-subst-table-for-encode
391 ucs-mule-cjk-to-unicode)
392 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
393 'translation-table)
394 1 ucs-mule-cjk-to-unicode))
aea797fc
KH
395 (define-translation-hash-table 'utf-subst-table-for-decode
396 (make-hash-table :test 'eq))
397 (define-translation-hash-table 'utf-subst-table-for-encode
2bf07f07
KH
398 (make-hash-table :test 'eq))
399 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
400 'translation-table)
c71c26e9
KH
401 1 nil))
402
403 ;; Update safe-chars of mule-utf-* coding systems.
404 (dolist (elt (coding-system-list t))
405 (if (string-match "^mule-utf" (symbol-name elt))
406 (let ((safe-charsets (coding-system-get elt 'safe-charsets))
407 (safe-chars (coding-system-get elt 'safe-chars))
408 (need-update nil))
409 (dolist (charset utf-translate-cjk-charsets)
410 (unless (eq utf-translate-cjk-mode (memq charset safe-charsets))
411 (setq safe-charsets
412 (if utf-translate-cjk-mode
413 (cons charset safe-charsets)
414 (delq charset safe-charsets))
415 need-update t)
416 (aset safe-chars (make-char charset) utf-translate-cjk-mode)))
417 (when need-update
418 (coding-system-put elt 'safe-charsets safe-charsets)
419 (define-coding-system-internal elt))))))
420
421(define-ccl-program ccl-mule-utf-untrans
422 ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or
423 ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF). Write
424 ;; eight-bit-control/graphic sequence (2 to 4 chars) representing
425 ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified.
426 ;;
427 ;; This is a subrountine because we assume that this is called very
428 ;; rarely (so we don't have to worry about the overhead of the
429 ;; call).
430 `(0
431 ((r5 = ,(charset-id 'eight-bit-control))
432 (r6 = ,(charset-id 'eight-bit-graphic))
433 (if (r0 < #x100)
434 ((r4 = ((r0 >> 6) | #xC0))
435 (write-multibyte-character r6 r4))
436 ((if (r0 < #x10000)
437 ((r4 = ((r0 >> 12) | #xE0))
438 (write-multibyte-character r6 r4))
439 ((r4 = ((r0 >> 18) | #xF0))
440 (write-multibyte-character r6 r4)
441 (r4 = (((r0 >> 12) & #x3F) | #x80))
442 (if (r4 < #xA0)
443 (write-multibyte-character r5 r4)
444 (write-multibyte-character r6 r4))))
445 (r4 = (((r0 >> 6) & #x3F) | #x80))
446 (if (r4 < #xA0)
447 (write-multibyte-character r5 r4)
448 (write-multibyte-character r6 r4))))
449 (r4 = ((r0 & #x3F) | #x80))
450 (if (r4 < #xA0)
451 (write-multibyte-character r5 r4)
452 (write-multibyte-character r6 r4)))))
9ca2ac2d 453
5ba7a870
KH
454(define-ccl-program ccl-decode-mule-utf-8
455 ;;
456 ;; charset | bytes in utf-8 | bytes in emacs
457 ;; -----------------------+----------------+---------------
458 ;; ascii | 1 | 1
459 ;; -----------------------+----------------+---------------
460 ;; eight-bit-control | 2 | 2
aa2e3f49 461 ;; eight-bit-graphic | 2 | 1
5ba7a870
KH
462 ;; latin-iso8859-1 | 2 | 2
463 ;; -----------------------+----------------+---------------
464 ;; mule-unicode-0100-24ff | 2 | 4
465 ;; (< 0800) | |
466 ;; -----------------------+----------------+---------------
467 ;; mule-unicode-0100-24ff | 3 | 4
468 ;; (>= 8000) | |
469 ;; mule-unicode-2500-33ff | 3 | 4
470 ;; mule-unicode-e000-ffff | 3 | 4
c71c26e9
KH
471 ;; -----------------------+----------------+---------------
472 ;; invalid byte | 1 | 2
5ba7a870
KH
473 ;;
474 ;; Thus magnification factor is two.
475 ;;
476 `(2
064cff0b
KH
477 ((r6 = ,(charset-id 'latin-iso8859-1))
478 (read r0)
3d0e328b 479 (loop
5ba7a870 480 (if (r0 < #x80)
c71c26e9 481 ;; 1-byte encoding, i.e., ascii
064cff0b
KH
482 (write-read-repeat r0))
483 (if (r0 < #xc2)
484 ;; continuation byte (invalid here) or 1st byte of overlong
485 ;; 2-byte sequence.
c71c26e9 486 ((call ccl-mule-utf-untrans)
064cff0b
KH
487 (r6 = ,(charset-id 'latin-iso8859-1))
488 (read r0)
c71c26e9
KH
489 (repeat)))
490
491 ;; Read the 2nd byte.
c71c26e9
KH
492 (read r1)
493 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
494 ((call ccl-mule-utf-untrans)
064cff0b 495 (r6 = ,(charset-id 'latin-iso8859-1))
c71c26e9
KH
496 ;; Handle it in the next loop.
497 (r0 = r1)
498 (repeat)))
499
500 (if (r0 < #xe0)
501 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
064cff0b
KH
502 ((r1 &= #x3F)
503 (r1 |= ((r0 & #x1F) << 6))
3ccf95cb 504 ;; Now r1 holds scalar value. We don't have to check
064cff0b 505 ;; `overlong sequence' because r0 >= 0xC2.
c71c26e9 506
064cff0b 507 (if (r1 >= 256)
c71c26e9 508 ;; mule-unicode-0100-24ff (< 0800)
3ccf95cb
KH
509 ((r0 = r1)
510 (lookup-integer utf-subst-table-for-decode r0 r1)
511 (if (r7 == 0)
512 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
513 (r1 -= #x0100)
514 (r2 = (((r1 / 96) + 32) << 7))
515 (r1 %= 96)
516 (r1 += (r2 + 32))
517 (translate-character
518 utf-translation-table-for-decode r0 r1)))
064cff0b
KH
519 (write-multibyte-character r0 r1)
520 (read r0)
521 (repeat))
522 (if (r1 >= 160)
523 ;; latin-iso8859-1
3ccf95cb
KH
524 ((r0 = r1)
525 (lookup-integer utf-subst-table-for-decode r0 r1)
526 (if (r7 == 0)
527 ((r1 -= 128)
528 (write-multibyte-character r6 r1))
529 ((write-multibyte-character r0 r1)))
064cff0b
KH
530 (read r0)
531 (repeat))
532 ;; eight-bit-control
533 ((r0 = ,(charset-id 'eight-bit-control))
534 (write-multibyte-character r0 r1)
535 (read r0)
536 (repeat))))))
c71c26e9
KH
537
538 ;; Read the 3rd bytes.
c71c26e9
KH
539 (read r2)
540 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
541 ((call ccl-mule-utf-untrans)
542 (r0 = r1)
543 (call ccl-mule-utf-untrans)
064cff0b 544 (r6 = ,(charset-id 'latin-iso8859-1))
c71c26e9
KH
545 ;; Handle it in the next loop.
546 (r0 = r2)
547 (repeat)))
548
549 (if (r0 < #xF0)
550 ;; 3byte encoding
551 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
552 ((r3 = ((r0 & #xF) << 12))
553 (r3 |= ((r1 & #x3F) << 6))
554 (r3 |= (r2 & #x3F))
555
556 (if (r3 < #x800) ; `overlong sequence'
557 ((call ccl-mule-utf-untrans)
558 (r0 = r1)
559 (call ccl-mule-utf-untrans)
560 (r0 = r2)
561 (call ccl-mule-utf-untrans)
064cff0b
KH
562 (r6 = ,(charset-id 'latin-iso8859-1))
563 (read r0)
c71c26e9
KH
564 (repeat)))
565
566 (if (r3 < #x2500)
567 ;; mule-unicode-0100-24ff (>= 0800)
3ccf95cb
KH
568 ((r0 = r3)
569 (lookup-integer utf-subst-table-for-decode r0 r1)
570 (if (r7 == 0)
571 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
572 (r3 -= #x0100)
573 (r3 //= 96)
574 (r1 = (r7 + 32))
575 (r1 += ((r3 + 32) << 7))
576 (translate-character
577 utf-translation-table-for-decode r0 r1)))
c71c26e9 578 (write-multibyte-character r0 r1)
064cff0b 579 (read r0)
c71c26e9
KH
580 (repeat)))
581
582 (if (r3 < #x3400)
583 ;; mule-unicode-2500-33ff
584 ((r0 = r3) ; don't zap r3
585 (lookup-integer utf-subst-table-for-decode r0 r1)
586 (if (r7 == 0)
587 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
588 (r3 -= #x2500)
589 (r3 //= 96)
590 (r1 = (r7 + 32))
591 (r1 += ((r3 + 32) << 7))))
592 (write-multibyte-character r0 r1)
064cff0b 593 (read r0)
c71c26e9
KH
594 (repeat)))
595
596 (if (r3 < #xE000)
597 ;; Try to convert to CJK chars, else
598 ;; keep them as eight-bit-{control|graphic}.
599 ((r0 = r3)
600 (lookup-integer utf-subst-table-for-decode r3 r1)
601 (if r7
602 ;; got a translation
064cff0b
KH
603 ((write-multibyte-character r3 r1)
604 (read r0)
605 (repeat))
606 ((call ccl-mule-utf-untrans)
607 (r6 = ,(charset-id 'latin-iso8859-1))
608 (read r0)
609 (repeat)))))
c71c26e9
KH
610
611 ;; mule-unicode-e000-ffff
612 ;; Fixme: fffe and ffff are invalid.
613 (r0 = r3) ; don't zap r3
614 (lookup-integer utf-subst-table-for-decode r0 r1)
615 (if (r7 == 0)
616 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
617 (r3 -= #xe000)
618 (r3 //= 96)
619 (r1 = (r7 + 32))
620 (r1 += ((r3 + 32) << 7))))
621 (write-multibyte-character r0 r1)
064cff0b 622 (read r0)
c71c26e9
KH
623 (repeat)))
624
625 ;; Read the 4th bytes.
c71c26e9
KH
626 (read r3)
627 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
628 ((call ccl-mule-utf-untrans)
629 (r0 = r1)
630 (call ccl-mule-utf-untrans)
064cff0b
KH
631 (r0 = r2)
632 (call ccl-mule-utf-untrans)
633 (r6 = ,(charset-id 'latin-iso8859-1))
c71c26e9
KH
634 ;; Handle it in the next loop.
635 (r0 = r3)
636 (repeat)))
637
064cff0b 638 (if (r0 < #xF8)
c71c26e9
KH
639 ;; 4-byte encoding:
640 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
641 ;; keep those bytes as eight-bit-{control|graphic}
642 ;; Fixme: allow lookup in utf-subst-table-for-decode.
643 ((r4 = ((r0 & #x7) << 18))
644 (r4 |= ((r1 & #x3F) << 12))
645 (r4 |= ((r2 & #x3F) << 6))
646 (r4 |= (r3 & #x3F))
647
648 (if (r4 < #x10000) ; `overlong sequence'
649 ((call ccl-mule-utf-untrans)
650 (r0 = r1)
651 (call ccl-mule-utf-untrans)
652 (r0 = r2)
653 (call ccl-mule-utf-untrans)
654 (r0 = r3)
655 (call ccl-mule-utf-untrans))
656 ((r0 = r4)
064cff0b 657 (call ccl-mule-utf-untrans))))
c71c26e9 658
064cff0b
KH
659 ;; Unsupported sequence.
660 ((call ccl-mule-utf-untrans)
661 (r0 = r1)
662 (call ccl-mule-utf-untrans)
663 (r0 = r2)
664 (call ccl-mule-utf-untrans)
665 (r0 = r3)
666 (call ccl-mule-utf-untrans)))
667 (r6 = ,(charset-id 'latin-iso8859-1))
668 (read r0)
67ff2216
KH
669 (repeat)))
670
064cff0b 671
67ff2216
KH
672 ;; At EOF...
673 (if (r0 >= 0)
c71c26e9
KH
674 ;; r0 >= #x80
675 ((call ccl-mule-utf-untrans)
67ff2216 676 (if (r1 >= 0)
c71c26e9
KH
677 ((r0 = r1)
678 (call ccl-mule-utf-untrans)
67ff2216 679 (if (r2 >= 0)
c71c26e9
KH
680 ((r0 = r2)
681 (call ccl-mule-utf-untrans)
67ff2216 682 (if (r3 >= 0)
c71c26e9
KH
683 ((r0 = r3)
684 (call ccl-mule-utf-untrans))))))))))
5ba7a870 685
c49b8288 686 "CCL program to decode UTF-8.
74ace46a 687Basic decoding is done into the charsets ascii, latin-iso8859-1 and
ad88f5c5
KH
688mule-unicode-*, but see also `utf-fragmentation-table' and
689`ucs-mule-cjk-to-unicode'.
9ca2ac2d
DL
690Encodings of un-representable Unicode characters are decoded asis into
691eight-bit-control and eight-bit-graphic characters.")
5ba7a870 692
c71c26e9
KH
693(define-ccl-program ccl-mule-utf-8-encode-untrans
694 ;; UTF-8 decoder generates an UTF-8 sequence represented by a
695 ;; sequence eight-bit-control/graphic chars for an untranslatable
696 ;; character and an invalid byte.
4bcce19c 697 ;;
c71c26e9
KH
698 ;; This CCL parses that sequence (the first byte is already in r1),
699 ;; writes out the original bytes of that sequence, and sets r5 to
700 ;; -1.
701 ;;
702 ;; If the eight-bit-control/graphic sequence is shorter than what r1
703 ;; suggests, it sets r5 and r6 to the last character read that
704 ;; should be handled by the next loop of a caller.
705 ;;
706 ;; Note: For UTF-8 validation, we only check if a character is
707 ;; eight-bit-control/graphic or not. It may result in incorrect
708 ;; handling of random binary data, but such a data can't be encoded
709 ;; by UTF-8 anyway. At least, UTF-8 decoders doesn't generate such
710 ;; a sequence even if a source contains invalid byte-sequence.
711 `(0
712 (;; Read the 2nd byte.
713 (read-multibyte-character r5 r6)
714 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
715 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
4bcce19c 716 ((write r1) ; invalid UTF-8
c71c26e9
KH
717 (r1 = -1)
718 (end)))
719
720 (if (r1 <= #xC3)
721 ;; 2-byte sequence for an originally invalid byte.
722 ((r6 &= #x3F)
723 (r6 |= ((r1 & #x1F) << 6))
724 (write r6)
725 (r5 = -1)
726 (end)))
727
728 (write r1 r6)
729 (r2 = r1)
730 (r1 = -1)
731 ;; Read the 3rd byte.
732 (read-multibyte-character r5 r6)
4bcce19c 733 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
c71c26e9
KH
734 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
735 (end)) ; invalid UTF-8
736 (write r6)
737 (if (r2 < #xF0)
738 ;; 3-byte sequence for an untranslated character.
739 ((r5 = -1)
740 (end)))
741 ;; Read the 4th byte.
742 (read-multibyte-character r5 r6)
4bcce19c 743 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
c71c26e9
KH
744 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
745 (end)) ; invalid UTF-8
746 ;; 4-byte sequence for an untranslated character.
747 (write r6)
748 (r5 = -1)
749 (end))
750
751 ;; At EOF...
752 ((r5 = -1)
753 (if (r1 >= 0)
754 (write r1)))))
755
5ba7a870
KH
756(define-ccl-program ccl-encode-mule-utf-8
757 `(1
aa15b3e5
KH
758 ((r5 = -1)
759 (loop
760 (if (r5 < 0)
c71c26e9
KH
761 (read-multibyte-character r0 r1)
762 ;; Pre-read character is in r5 (charset-ID) and r6 (code-point).
763 ((r0 = r5)
aa15b3e5
KH
764 (r1 = r6)
765 (r5 = -1)))
c71c26e9 766 (translate-character utf-translation-table-for-encode r0 r1)
aa15b3e5
KH
767
768 (if (r0 == ,(charset-id 'ascii))
c71c26e9
KH
769 (write-repeat r1))
770
771 (if (r0 == ,(charset-id 'latin-iso8859-1))
772 ;; r1 scalar utf-8
773 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
774 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
775 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
c1136bda 776 ((write ((r1 >> 6) | #xc2))
c71c26e9
KH
777 (r1 &= #x3f)
778 (r1 |= #x80)
c71c26e9
KH
779 (write-repeat r1)))
780
781 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
782 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
783 ;; #x3f80 == (0011 1111 1000 0000)b
784 (r1 &= #x7f)
785 (r1 += (r0 + 224)) ; 240 == -32 + #x0100
786 ;; now r1 holds scalar value
787 (if (r1 < #x0800)
788 ;; 2byte encoding
789 ((write ((r1 >> 6) | #xC0))
790 (r1 &= #x3F)
791 (r1 |= #x80)
792 (write-repeat r1))
793 ;; 3byte encoding
794 ((write ((r1 >> 12) | #xE0))
795 (write (((r1 & #x0FC0) >> 6) | #x80))
796 (r1 &= #x3F)
797 (r1 |= #x80)
798 (write-repeat r1)))))
799
800 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
801 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
802 (r1 &= #x7f)
803 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
804 ;; now r1 holds scalar value
805 (write ((r1 >> 12) | #xE0))
806 (write (((r1 & #x0FC0) >> 6) | #x80))
807 (r1 &= #x3F)
808 (r1 |= #x80)
809 (write-repeat r1)))
810
811 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
812 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
813 (r1 &= #x7f)
814 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
815 ;; now r1 holds scalar value
816 (write ((r1 >> 12) | #xE0))
817 (write (((r1 & #x0FC0) >> 6) | #x80))
818 (r1 &= #x3F)
819 (r1 |= #x80)
820 (write-repeat r1)))
821
822 (if (r0 == ,(charset-id 'eight-bit-control))
823 ;; r1 scalar utf-8
824 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
825 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
826 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
827 ((write #xC2)
828 (write-repeat r1)))
829
830 (if (r0 == ,(charset-id 'eight-bit-graphic))
831 ;; r1 scalar utf-8
832 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
833 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
834 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
835 ((r0 = (r1 >= #xC0))
836 (r0 &= (r1 <= #xC3))
837 (r4 = (r1 >= #xE1))
838 (r4 &= (r1 <= #xF7))
839 (r0 |= r4)
840 (if r0
841 ((call ccl-mule-utf-8-encode-untrans)
842 (repeat))
843 (write-repeat r1))))
844
845 (lookup-character utf-subst-table-for-encode r0 r1)
846 (if r7 ; lookup succeeded
847 (if (r0 < #x800)
848 ;; 2byte encoding
849 ((write ((r0 >> 6) | #xC0))
c1136bda
KH
850 (r0 = ((r0 & #x3F) | #x80))
851 (write-repeat r0))
c71c26e9
KH
852 ;; 3byte encoding
853 ((write ((r0 >> 12) | #xE0))
854 (write (((r0 & #x0FC0) >> 6) | #x80))
c1136bda
KH
855 (r0 = ((r0 & #x3F) | #x80))
856 (write-repeat r0))))
5ba7a870 857
c71c26e9
KH
858 ;; Unsupported character.
859 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
860 (write #xef)
861 (write #xbf)
862 (write-repeat #xbd))))
9ca2ac2d 863 "CCL program to encode into UTF-8.")
5ba7a870 864
aa2e3f49 865
9ca2ac2d
DL
866(define-ccl-program ccl-untranslated-to-ucs
867 `(0
c71c26e9
KH
868 (if (r1 == 0)
869 nil
870 (if (r0 <= #xC3) ; 2-byte encoding
871 ((r0 = ((r0 & #x3) << 6))
872 (r0 |= (r1 & #x3F))
873 (r1 = 2))
874 (if (r2 == 0)
875 (r1 = 0)
876 (if (r0 < #xF0) ; 3-byte encoding, as above
877 ((r0 = ((r0 & #xF) << 12))
878 (r0 |= ((r1 & #x3F) << 6))
064cff0b 879 (r0 |= (r2 & #x3F))
c71c26e9
KH
880 (r1 = 3))
881 (if (r3 == 0)
882 (r1 = 0)
883 ((r0 = ((r0 & #x7) << 18))
884 (r0 |= ((r1 & #x3F) << 12))
885 (r0 |= ((r2 & #x3F) << 6))
886 (r0 |= (r3 & #x3F))
887 (r1 = 4))))))))
888 "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
889Set r1 to the byte length. r0 == 0 for invalid sequence.")
9ca2ac2d
DL
890
891(defvar utf-8-ccl-regs (make-vector 8 0))
892
aa2e3f49 893(defsubst utf-8-untranslated-to-ucs ()
9ca2ac2d
DL
894 "Return the UCS code for an untranslated sequence of raw bytes t point.
895Only for 3- or 4-byte sequences."
896 (aset utf-8-ccl-regs 0 (or (char-after) 0))
897 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
898 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
899 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
c71c26e9 900 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
aa2e3f49
DL
901
902(defun utf-8-help-echo (window object position)
903 (format "Untranslated Unicode U+%04X"
904 (get-char-property position 'untranslated-utf-8 object)))
905
c71c26e9
KH
906;; We compose the untranslatable sequences into a single character,
907;; and move point to the next character.
aa2e3f49
DL
908;; This is infelicitous for editing, because there's currently no
909;; mechanism for treating compositions as atomic, but is OK for
9ca2ac2d
DL
910;; display. They are composed to U+FFFD with help-echo which
911;; indicates the unicodes they represent. This function GCs too much.
c71c26e9
KH
912
913;; If utf-translate-cjk-mode is non-nil, this function is called with
914;; HASH-TABLE which translates CJK characters into some of CJK
915;; charsets.
916
917(defsubst utf-8-compose (hash-table)
918 "Put a suitable composition on an untranslatable sequence at point.
919If HASH-TABLE is non-nil, try to translate CJK characters by it at first.
920Move point to the end of the sequence."
921 (utf-8-untranslated-to-ucs)
922 (let ((l (aref utf-8-ccl-regs 1))
923 ch)
924 (if (> l 0)
925 (if (and hash-table
926 (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table)))
927 (progn
928 (insert ch)
929 (delete-region (point) (min (point-max) (+ l (point)))))
930 (setq ch (aref utf-8-ccl-regs 0))
931 (put-text-property (point) (min (point-max) (+ l (point)))
932 'untranslated-utf-8 ch)
933 (put-text-property (point) (min (point-max) (+ l (point)))
934 'help-echo 'utf-8-help-echo)
935 (if (= l 2)
936 (put-text-property (point) (min (point-max) (+ l (point)))
937 'display (format "\\%03o" ch))
938 (compose-region (point) (+ l (point)) ?\e$,3u=\e(B))
939 (forward-char l))
940 (forward-char 1))))
aa2e3f49
DL
941
942(defcustom utf-8-compose-scripts nil
9ca2ac2d 943 "*Non-nil means compose various scripts on decoding utf-8 text."
aa2e3f49 944 :group 'mule
bf247b6e 945 :version "22.1"
9ca2ac2d 946 :type 'boolean)
aa2e3f49
DL
947
948(defun utf-8-post-read-conversion (length)
949 "Compose untranslated utf-8 sequences into single characters.
c71c26e9 950If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
aa2e3f49
DL
951Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
952 (save-excursion
c71c26e9
KH
953 (save-restriction
954 (narrow-to-region (point) (+ (point) length))
955 ;; Can't do eval-when-compile to insert a multibyte constant
956 ;; version of the string in the loop, since it's always loaded as
957 ;; unibyte from a byte-compiled file.
958 (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
11d2e01b 959 (buffer-multibyte enable-multibyte-characters)
c71c26e9 960 hash-table ch)
11d2e01b 961 (set-buffer-multibyte t)
c71c26e9 962 (when utf-translate-cjk-mode
fce59e40
KH
963 (unless utf-translate-cjk-lang-env
964 ;; Check these characters in utf-translate-cjk-range.
965 ;; We may have to translate them to CJK charsets.
966 (skip-chars-forward
967 (concat range utf-translate-cjk-unicode-range-string))
968 (unless (eobp)
969 (utf-translate-cjk-load-tables)
970 (setq range
3ccf95cb
KH
971 (concat range utf-translate-cjk-unicode-range-string)))
972 (setq hash-table (get 'utf-subst-table-for-decode
973 'translation-hash-table))))
c71c26e9
KH
974 (while (and (skip-chars-forward range)
975 (not (eobp)))
976 (setq ch (following-char))
977 (if (< ch 256)
978 (utf-8-compose hash-table)
979 (if (and hash-table
980 (setq ch (gethash (encode-char ch 'ucs) hash-table)))
981 (progn
982 (insert ch)
983 (delete-char 1))
11d2e01b
KH
984 (forward-char 1))))
985 (or buffer-multibyte
986 (set-buffer-multibyte nil)))
c71c26e9
KH
987
988 (when (and utf-8-compose-scripts (> length 1))
989 ;; These currently have definitions which cover the relevant
990 ;; unicodes. We could avoid loading thai-util &c by checking
991 ;; whether the region contains any characters with the appropriate
992 ;; categories. There aren't yet Unicode-based rules for Tibetan.
993 (diacritic-compose-region (point-max) (point-min))
994 (thai-compose-region (point-max) (point-min))
995 (lao-compose-region (point-max) (point-min))
996 (devanagari-compose-region (point-max) (point-min))
997 (malayalam-compose-region (point-max) (point-min))
998 (tamil-compose-region (point-max) (point-min)))
999 (- (point-max) (point-min)))))
1000
1001(defun utf-8-pre-write-conversion (beg end)
1002 "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
1003This is used as a post-read-conversion of utf-8 coding system."
1004 (if (and utf-translate-cjk-mode
1005 (not utf-translate-cjk-lang-env)
1006 (save-excursion
1007 (goto-char beg)
1008 (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))
1009 (utf-translate-cjk-load-tables))
1010 nil)
aa2e3f49 1011
5ba7a870
KH
1012(make-coding-system
1013 'mule-utf-8 4 ?u
1014 "UTF-8 encoding for Emacs-supported Unicode characters.
ad88f5c5
KH
1015It supports Unicode characters of these ranges:
1016 U+0000..U+33FF, U+E000..U+FFFF.
1017They correspond to these Emacs character sets:
1018 ascii, latin-iso8859-1, mule-unicode-0100-24ff,
1019 mule-unicode-2500-33ff, mule-unicode-e000-ffff
1020
1021On decoding (e.g. reading a file), Unicode characters not in the above
1022ranges are decoded into sequences of eight-bit-control and
1023eight-bit-graphic characters to preserve their byte sequences. The
1024byte sequence is preserved on i/o for valid utf-8, but not necessarily
1025for invalid utf-8.
1026
1027On encoding (e.g. writing a file), Emacs characters not belonging to
1028any of the character sets listed above are encoded into the UTF-8 byte
1029sequence representing U+FFFD (REPLACEMENT CHARACTER)."
5ba7a870
KH
1030
1031 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
c71c26e9 1032 `((safe-charsets
5ba7a870
KH
1033 ascii
1034 eight-bit-control
1035 eight-bit-graphic
1036 latin-iso8859-1
1037 mule-unicode-0100-24ff
1038 mule-unicode-2500-33ff
c71c26e9
KH
1039 mule-unicode-e000-ffff
1040 ,@(if utf-translate-cjk-mode
1041 utf-translate-cjk-charsets))
87ae7973 1042 (mime-charset . utf-8)
75f6d723 1043 (coding-category . coding-category-utf-8)
aa2e3f49 1044 (valid-codes (0 . 255))
c71c26e9 1045 (pre-write-conversion . utf-8-pre-write-conversion)
ad88f5c5 1046 (post-read-conversion . utf-8-post-read-conversion)
2bf07f07 1047 (translation-table-for-encode . utf-translation-table-for-encode)
ad88f5c5
KH
1048 (dependency unify-8859-on-encoding-mode
1049 unify-8859-on-decoding-mode
1050 utf-fragment-on-decoding
9e24a165 1051 utf-translate-cjk-mode)))
5ba7a870
KH
1052
1053(define-coding-system-alias 'utf-8 'mule-utf-8)
e8af40ee 1054
aa2e3f49
DL
1055;; I think this needs special private charsets defined for the
1056;; untranslated sequences, if it's going to work well.
1057
1058;;; (defun utf-8-compose-function (pos to pattern &optional string)
1059;;; (let* ((prop (get-char-property pos 'composition string))
1060;;; (l (and prop (- (cadr prop) (car prop)))))
1061;;; (cond ((and l (> l (- to pos)))
1062;;; (delete-region pos to))
1063;;; ((and (> (char-after pos) 224)
1064;;; (< (char-after pos) 256)
1065;;; (save-restriction
1066;;; (narrow-to-region pos to)
1067;;; (utf-8-compose)))
1068;;; t))))
1069
1070;;; (dotimes (i 96)
1071;;; (aset composition-function-table
1072;;; (+ 128 i)
1073;;; `((,(string-as-multibyte "[\200-\237\240-\377]")
1074;;; . utf-8-compose-function))))
1075
ab5796a9 1076;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5
e8af40ee 1077;;; utf-8.el ends here