HCoop Git - bpt/emacs.git/blame_incremental - lisp/international/utf-8.el

... / ...

Commit	Line	Data
	1	;;; utf-8.el --- UTF-8 decoding/encoding support -- coding: iso-2022-7bit --
	2
	3	;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
	4	;; Free Software Foundation, Inc.
	5	;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
	6	;; National Institute of Advanced Industrial Science and Technology (AIST)
	7	;; Registration Number H14PRO021
	8
	9	;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
	10	;; Maintainer: FSF
	11	;; Keywords: multilingual, Unicode, UTF-8, i18n
	12
	13	;; This file is part of GNU Emacs.
	14
	15	;; GNU Emacs is free software; you can redistribute it and/or modify
	16	;; it under the terms of the GNU General Public License as published by
	17	;; the Free Software Foundation; either version 3, or (at your option)
	18	;; any later version.
	19
	20	;; GNU Emacs is distributed in the hope that it will be useful,
	21	;; but WITHOUT ANY WARRANTY; without even the implied warranty of
	22	;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	23	;; GNU General Public License for more details.
	24
	25	;; You should have received a copy of the GNU General Public License
	26	;; along with GNU Emacs; see the file COPYING. If not, write to the
	27	;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
	28	;; Boston, MA 02110-1301, USA.
	29
	30	;;; Commentary:
	31
	32	;; The coding-system `mule-utf-8' basically supports encoding/decoding
	33	;; of the following character sets to and from UTF-8:
	34	;;
	35	;; ascii
	36	;; eight-bit-control
	37	;; latin-iso8859-1
	38	;; mule-unicode-0100-24ff
	39	;; mule-unicode-2500-33ff
	40	;; mule-unicode-e000-ffff
	41	;;
	42	;; On decoding, Unicode characters that do not fit into the above
	43	;; character sets are handled as `eight-bit-control' or
	44	;; `eight-bit-graphic' characters to retain the information about the
	45	;; original byte sequence and text properties record the corresponding
	46	;; unicode.
	47	;;
	48	;; Fixme: note that reading and writing invalid utf-8 may not be
	49	;; idempotent -- to represent the bytes to fix that needs a new charset.
	50	;;
	51	;; Characters from other character sets can be encoded with mule-utf-8
	52	;; by populating the translation table
	53	;; `utf-translation-table-for-encode'. Hash tables
	54	;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
	55	;; used to support encoding and decoding of about a quarter of the CJK
	56	;; space between U+3400 and U+DFFF.
	57
	58	;; UTF-8 is defined in RFC 3629. A sketch of the encoding is:
	59
	60	;; scalar \| utf-8
	61	;; value \| 1st byte \| 2nd byte \| 3rd byte
	62	;; --------------------+-----------+-----------+----------
	63	;; 0000 0000 0xxx xxxx \| 0xxx xxxx \| \|
	64	;; 0000 0yyy yyxx xxxx \| 110y yyyy \| 10xx xxxx \|
	65	;; zzzz yyyy yyxx xxxx \| 1110 zzzz \| 10yy yyyy \| 10xx xxxx
	66
	67	;;; Code:
	68
	69	(defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
	70	"Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
	71
	72	If `unify-8859-on-encoding-mode' is non-nil, this table populates the
	73	translation-table named `utf-translation-table-for-encode'.")
	74
	75	(define-translation-table 'utf-translation-table-for-encode)
	76
	77
	78	;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
	79	;; space of mule-unicode. For Latin scripts this isn't very
	80	;; important. Hebrew and Arabic might go here too when there's proper
	81	;; support for them.
	82
	83	(defvar utf-fragmentation-table (make-char-table 'translation-table nil)
	84	"Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
	85
	86	If `utf-fragment-on-decoding' is non-nil, this table populates the
	87	translation-table named `utf-translation-table-for-decode'")
	88
	89	(defvar utf-defragmentation-table (make-char-table 'translation-table nil)
	90	"Char-table for reverse mapping of `utf-fragmentation-table'.
	91
	92	If `utf-fragment-on-decoding' is non-nil and
	93	`unify-8859-on-encoding-mode' is nil, this table populates the
	94	translation-table named `utf-translation-table-for-encode'")
	95
	96	(define-translation-table 'utf-translation-table-for-decode)
	97
	98
	99	(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
	100	"Hash table mapping Emacs CJK character sets to Unicode code points.
	101
	102	If `utf-translate-cjk-mode' is non-nil, this table populates the
	103	translation-hash-table named `utf-subst-table-for-encode'.")
	104
	105	(define-translation-hash-table 'utf-subst-table-for-encode
	106	ucs-mule-cjk-to-unicode)
	107
	108	(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
	109	"Hash table mapping Unicode code points to Emacs CJK character sets.
	110
	111	If `utf-translate-cjk-mode' is non-nil, this table populates the
	112	translation-hash-table named `utf-subst-table-for-decode'.")
	113
	114	(define-translation-hash-table 'utf-subst-table-for-decode
	115	ucs-unicode-to-mule-cjk)
	116
	117	(mapc
	118	(lambda (pair)
	119	(aset utf-fragmentation-table (car pair) (cdr pair))
	120	(aset utf-defragmentation-table (cdr pair) (car pair)))
	121	'((?\e$,1&d\e(B . ?\e,F4\e(B) (?\e$,1&e\e(B . ?\e,F5\e(B) (?\e$,1&f\e(B . ?\e,F6\e(B) (?\e$,1&h\e(B . ?\e,F8\e(B) (?\e$,1&i\e(B . ?\e,F9\e(B)
	122	(?\e$,1&j\e(B . ?\e,F:\e(B) (?\e$,1&l\e(B . ?\e,F<\e(B) (?\e$,1&n\e(B . ?\e,F>\e(B) (?\e$,1&o\e(B . ?\e,F?\e(B) (?\e$,1&p\e(B . ?\e,F@\e(B)
	123	(?\e$,1&q\e(B . ?\e,FA\e(B) (?\e$,1&r\e(B . ?\e,FB\e(B) (?\e$,1&s\e(B . ?\e,FC\e(B) (?\e$,1&t\e(B . ?\e,FD\e(B) (?\e$,1&u\e(B . ?\e,FE\e(B)
	124	(?\e$,1&v\e(B . ?\e,FF\e(B) (?\e$,1&w\e(B . ?\e,FG\e(B) (?\e$,1&x\e(B . ?\e,FH\e(B) (?\e$,1&y\e(B . ?\e,FI\e(B) (?\e$,1&z\e(B . ?\e,FJ\e(B)
	125	(?\e$,1&{\e(B . ?\e,FK\e(B) (?\e$,1&\|\e(B . ?\e,FL\e(B) (?\e$,1&}\e(B . ?\e,FM\e(B) (?\e$,1&~\e(B . ?\e,FN\e(B) (?\e$,1&\7f\e(B . ?\e,FO\e(B)
	126	(?\e$,1' \e(B . ?\e,FP\e(B) (?\e$,1'!\e(B . ?\e,FQ\e(B) (?\e$,1'#\e(B . ?\e,FS\e(B) (?\e$,1'$\e(B . ?\e,FT\e(B) (?\e$,1'%\e(B . ?\e,FU\e(B)
	127	(?\e$,1'&\e(B . ?\e,FV\e(B) (?\e$,1''\e(B . ?\e,FW\e(B) (?\e$,1'(\e(B . ?\e,FX\e(B) (?\e$,1')\e(B . ?\e,FY\e(B) (?\e$,1'*\e(B . ?\e,FZ\e(B)
	128	(?\e$,1'+\e(B . ?\e,F[\e(B) (?\e$,1',\e(B . ?\e,F\\e(B) (?\e$,1'-\e(B . ?\e,F]\e(B) (?\e$,1'.\e(B . ?\e,F^\e(B) (?\e$,1'/\e(B . ?\e,F_\e(B)
	129	(?\e$,1'0\e(B . ?\e,F`\e(B) (?\e$,1'1\e(B . ?\e,Fa\e(B) (?\e$,1'2\e(B . ?\e,Fb\e(B) (?\e$,1'3\e(B . ?\e,Fc\e(B) (?\e$,1'4\e(B . ?\e,Fd\e(B)
	130	(?\e$,1'5\e(B . ?\e,Fe\e(B) (?\e$,1'6\e(B . ?\e,Ff\e(B) (?\e$,1'7\e(B . ?\e,Fg\e(B) (?\e$,1'8\e(B . ?\e,Fh\e(B) (?\e$,1'9\e(B . ?\e,Fi\e(B)
	131	(?\e$,1':\e(B . ?\e,Fj\e(B) (?\e$,1';\e(B . ?\e,Fk\e(B) (?\e$,1'<\e(B . ?\e,Fl\e(B) (?\e$,1'=\e(B . ?\e,Fm\e(B) (?\e$,1'>\e(B . ?\e,Fn\e(B)
	132	(?\e$,1'?\e(B . ?\e,Fo\e(B) (?\e$,1'@\e(B . ?\e,Fp\e(B) (?\e$,1'A\e(B . ?\e,Fq\e(B) (?\e$,1'B\e(B . ?\e,Fr\e(B) (?\e$,1'C\e(B . ?\e,Fs\e(B)
	133	(?\e$,1'D\e(B . ?\e,Ft\e(B) (?\e$,1'E\e(B . ?\e,Fu\e(B) (?\e$,1'F\e(B . ?\e,Fv\e(B) (?\e$,1'G\e(B . ?\e,Fw\e(B) (?\e$,1'H\e(B . ?\e,Fx\e(B)
	134	(?\e$,1'I\e(B . ?\e,Fy\e(B) (?\e$,1'J\e(B . ?\e,Fz\e(B) (?\e$,1'K\e(B . ?\e,F{\e(B) (?\e$,1'L\e(B . ?\e,F\|\e(B) (?\e$,1'M\e(B . ?\e,F}\e(B)
	135	(?\e$,1'N\e(B . ?\e,F~\e(B)
	136
	137	(?\e$,1(!\e(B . ?\e,L!\e(B) (?\e$,1("\e(B . ?\e,L"\e(B) (?\e$,1(#\e(B . ?\e,L#\e(B) (?\e$,1($\e(B . ?\e,L$\e(B)
	138	(?\e$,1(%\e(B . ?\e,L%\e(B) (?\e$,1(&\e(B . ?\e,L&\e(B) (?\e$,1('\e(B . ?\e,L'\e(B) (?\e$,1((\e(B . ?\e,L(\e(B) (?\e$,1()\e(B . ?\e,L)\e(B)
	139	(?\e$,1(\e(B . ?\e,L\e(B) (?\e$,1(+\e(B . ?\e,L+\e(B) (?\e$,1(,\e(B . ?\e,L,\e(B) (?\e$,1(.\e(B . ?\e,L.\e(B) (?\e$,1(/\e(B . ?\e,L/\e(B)
	140	(?\e$,1(0\e(B . ?\e,L0\e(B) (?\e$,1(1\e(B . ?\e,L1\e(B) (?\e$,1(2\e(B . ?\e,L2\e(B) (?\e$,1(3\e(B . ?\e,L3\e(B) (?\e$,1(4\e(B . ?\e,L4\e(B)
	141	(?\e$,1(5\e(B . ?\e,L5\e(B) (?\e$,1(6\e(B . ?\e,L6\e(B) (?\e$,1(7\e(B . ?\e,L7\e(B) (?\e$,1(8\e(B . ?\e,L8\e(B) (?\e$,1(9\e(B . ?\e,L9\e(B)
	142	(?\e$,1(:\e(B . ?\e,L:\e(B) (?\e$,1(;\e(B . ?\e,L;\e(B) (?\e$,1(<\e(B . ?\e,L<\e(B) (?\e$,1(=\e(B . ?\e,L=\e(B) (?\e$,1(>\e(B . ?\e,L>\e(B)
	143	(?\e$,1(?\e(B . ?\e,L?\e(B) (?\e$,1(@\e(B . ?\e,L@\e(B) (?\e$,1(A\e(B . ?\e,LA\e(B) (?\e$,1(B\e(B . ?\e,LB\e(B) (?\e$,1(C\e(B . ?\e,LC\e(B)
	144	(?\e$,1(D\e(B . ?\e,LD\e(B) (?\e$,1(E\e(B . ?\e,LE\e(B) (?\e$,1(F\e(B . ?\e,LF\e(B) (?\e$,1(G\e(B . ?\e,LG\e(B) (?\e$,1(H\e(B . ?\e,LH\e(B)
	145	(?\e$,1(I\e(B . ?\e,LI\e(B) (?\e$,1(J\e(B . ?\e,LJ\e(B) (?\e$,1(K\e(B . ?\e,LK\e(B) (?\e$,1(L\e(B . ?\e,LL\e(B) (?\e$,1(M\e(B . ?\e,LM\e(B)
	146	(?\e$,1(N\e(B . ?\e,LN\e(B) (?\e$,1(O\e(B . ?\e,LO\e(B) (?\e$,1(P\e(B . ?\e,LP\e(B) (?\e$,1(Q\e(B . ?\e,LQ\e(B) (?\e$,1(R\e(B . ?\e,LR\e(B)
	147	(?\e$,1(S\e(B . ?\e,LS\e(B) (?\e$,1(T\e(B . ?\e,LT\e(B) (?\e$,1(U\e(B . ?\e,LU\e(B) (?\e$,1(V\e(B . ?\e,LV\e(B) (?\e$,1(W\e(B . ?\e,LW\e(B)
	148	(?\e$,1(X\e(B . ?\e,LX\e(B) (?\e$,1(Y\e(B . ?\e,LY\e(B) (?\e$,1(Z\e(B . ?\e,LZ\e(B) (?\e$,1([\e(B . ?\e,L[\e(B) (?\e$,1(\\e(B . ?\e,L\\e(B)
	149	(?\e$,1(]\e(B . ?\e,L]\e(B) (?\e$,1(^\e(B . ?\e,L^\e(B) (?\e$,1(_\e(B . ?\e,L_\e(B) (?\e$,1(`\e(B . ?\e,L`\e(B) (?\e$,1(a\e(B . ?\e,La\e(B)
	150	(?\e$,1(b\e(B . ?\e,Lb\e(B) (?\e$,1(c\e(B . ?\e,Lc\e(B) (?\e$,1(d\e(B . ?\e,Ld\e(B) (?\e$,1(e\e(B . ?\e,Le\e(B) (?\e$,1(f\e(B . ?\e,Lf\e(B)
	151	(?\e$,1(g\e(B . ?\e,Lg\e(B) (?\e$,1(h\e(B . ?\e,Lh\e(B) (?\e$,1(i\e(B . ?\e,Li\e(B) (?\e$,1(j\e(B . ?\e,Lj\e(B) (?\e$,1(k\e(B . ?\e,Lk\e(B)
	152	(?\e$,1(l\e(B . ?\e,Ll\e(B) (?\e$,1(m\e(B . ?\e,Lm\e(B) (?\e$,1(n\e(B . ?\e,Ln\e(B) (?\e$,1(o\e(B . ?\e,Lo\e(B) (?\e$,1(q\e(B . ?\e,Lq\e(B)
	153	(?\e$,1(r\e(B . ?\e,Lr\e(B) (?\e$,1(s\e(B . ?\e,Ls\e(B) (?\e$,1(t\e(B . ?\e,Lt\e(B) (?\e$,1(u\e(B . ?\e,Lu\e(B) (?\e$,1(v\e(B . ?\e,Lv\e(B)
	154	(?\e$,1(w\e(B . ?\e,Lw\e(B) (?\e$,1(x\e(B . ?\e,Lx\e(B) (?\e$,1(y\e(B . ?\e,Ly\e(B) (?\e$,1(z\e(B . ?\e,Lz\e(B) (?\e$,1({\e(B . ?\e,L{\e(B)
	155	(?\e$,1(\|\e(B . ?\e,L\|\e(B) (?\e$,1(~\e(B . ?\e,L~\e(B) (?\e$,1(\7f\e(B . ?\e,L\7f\e(B)))
	156
	157
	158	(defcustom utf-fragment-on-decoding nil
	159	"Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
	160	Setting this means that the relevant Cyrillic and Greek characters are
	161	decoded into the iso8859 charsets rather than into
	162	mule-unicode-0100-24ff. The iso8859 charsets take half as much space
	163	in the buffer, but using them may affect how the buffer can be re-encoded
	164	and may require a different input method to search for them, for instance.
	165	See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
	166	for mechanisms to make this largely transparent.
	167
	168	Setting this variable outside customize has no effect."
	169	:set (lambda (s v)
	170	(if v
	171	(progn
	172	(define-translation-table 'utf-translation-table-for-decode
	173	utf-fragmentation-table)
	174	;; Even if unify-8859-on-encoding-mode is off, make
	175	;; mule-utf-* encode characters in
	176	;; utf-fragmentation-table.
	177	(unless (eq (get 'utf-translation-table-for-encode
	178	'translation-table)
	179	ucs-mule-to-mule-unicode)
	180	(define-translation-table 'utf-translation-table-for-encode
	181	utf-defragmentation-table)))
	182	(define-translation-table 'utf-translation-table-for-decode)
	183	;; When unify-8859-on-encoding-mode is off, be sure to make
	184	;; mule-utf-* disabled for characters in
	185	;; utf-fragmentation-table.
	186	(unless (eq (get 'utf-translation-table-for-encode
	187	'translation-table)
	188	ucs-mule-to-mule-unicode)
	189	(define-translation-table 'utf-translation-table-for-encode)))
	190	(set-default s v))
	191	:version "22.1"
	192	:type 'boolean
	193	:group 'mule)
	194
	195
	196	(defconst utf-translate-cjk-charsets '(chinese-gb2312
	197	chinese-big5-1 chinese-big5-2
	198	japanese-jisx0208 japanese-jisx0212
	199	katakana-jisx0201
	200	korean-ksc5601)
	201	"List of charsets supported by `utf-translate-cjk-mode'.")
	202
	203	(defvar utf-translate-cjk-lang-env nil
	204	"Language environment in which tables for `utf-translate-cjk-mode' is loaded.
	205	The value nil means that the tables are not yet loaded.")
	206
	207	(defvar utf-translate-cjk-unicode-range)
	208
	209	;; String generated from utf-translate-cjk-unicode-range. It is
	210	;; suitable for an argument to skip-chars-forward.
	211	(defvar utf-translate-cjk-unicode-range-string nil)
	212
	213	(defun utf-translate-cjk-set-unicode-range (range)
	214	(setq utf-translate-cjk-unicode-range range)
	215	(setq utf-translate-cjk-unicode-range-string
	216	(let ((decode-char-no-trans
	217	#'(lambda (x)
	218	(cond ((< x #x100) (make-char 'latin-iso8859-1 x))
	219	((< x #x2500)
	220	(setq x (- x #x100))
	221	(make-char 'mule-unicode-0100-24ff
	222	(+ (/ x 96) 32) (+ (% x 96) 32)))
	223	((< x #x3400)
	224	(setq x (- x #x2500))
	225	(make-char 'mule-unicode-2500-33ff
	226	(+ (/ x 96) 32) (+ (% x 96) 32)))
	227	(t
	228	(setq x (- x #xe000))
	229	(make-char 'mule-unicode-e000-ffff
	230	(+ (/ x 96) 32) (+ (% x 96) 32))))))
	231	ranges from to)
	232	(dolist (elt range)
	233	(setq from (max #xA0 (car elt)) to (min #xffff (cdr elt)))
	234	(if (and (>= to #x3400) (< to #xE000))
	235	(setq to #x33FF))
	236	(cond ((< from #x100)
	237	(if (>= to #xE000)
	238	(setq ranges (cons (cons #xE000 to) ranges)
	239	to #x33FF))
	240	(if (>= to #x2500)
	241	(setq ranges (cons (cons #x2500 to) ranges)
	242	to #x24FF))
	243	(if (>= to #x100)
	244	(setq ranges (cons (cons #x100 to) ranges)
	245	to #xFF)))
	246	((< from #x2500)
	247	(if (>= to #xE000)
	248	(setq ranges (cons (cons #xE000 to) ranges)
	249	to #x33FF))
	250	(if (>= to #x2500)
	251	(setq ranges (cons (cons #x2500 to) ranges)
	252	to #x24FF)))
	253	((< from #x3400)
	254	(if (>= to #xE000)
	255	(setq ranges (cons (cons #xE000 to) ranges)
	256	to #x33FF))))
	257	(if (<= from to)
	258	(setq ranges (cons (cons from to) ranges))))
	259	(mapconcat #'(lambda (x)
	260	(format "%c-%c"
	261	(funcall decode-char-no-trans (car x))
	262	(funcall decode-char-no-trans (cdr x))))
	263	ranges "")))
	264	;; These forces loading and settting tables for
	265	;; utf-translate-cjk-mode.
	266	(setq utf-translate-cjk-lang-env nil
	267	ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
	268	ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)))
	269
	270	(defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3)
	271	(#xff00 . #xffef))
	272	"List of Unicode code ranges supported by `utf-translate-cjk-mode'.
	273	Setting this variable directly does not take effect;
	274	use either \\[customize] or the function
	275	`utf-translate-cjk-set-unicode-range'."
	276	:version "22.1"
	277	:type '(repeat (cons integer integer))
	278	:set (lambda (symbol value)
	279	(utf-translate-cjk-set-unicode-range value))
	280	:group 'mule)
	281
	282	;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
	283	(defsubst utf-translate-cjk-substitutable-p (code-point)
	284	(let ((tail utf-translate-cjk-unicode-range)
	285	elt)
	286	(while tail
	287	(setq elt (car tail) tail (cdr tail))
	288	(if (and (>= code-point (car elt)) (<= code-point (cdr elt)))
	289	(setq tail nil)
	290	(setq elt nil)))
	291	elt))
	292
	293	(defun utf-translate-cjk-load-tables ()
	294	"Load tables for `utf-translate-cjk-mode'."
	295	;; Fixme: Allow the use of the CJK charsets to be
	296	;; customized by reordering and possible omission.
	297	(let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
	298	(if redefined
	299	;; Redefine them with realistic initial sizes and a
	300	;; smallish rehash size to avoid wasting significant
	301	;; space after they're built.
	302	(setq ucs-mule-cjk-to-unicode
	303	(make-hash-table :test 'eq :size 43000 :rehash-size 1000)
	304	ucs-unicode-to-mule-cjk
	305	(make-hash-table :test 'eq :size 21500 :rehash-size 1000)))
	306
	307	;; Load the files explicitly, to avoid having to keep
	308	;; around the large tables they contain (as well as the
	309	;; ones which get built).
	310	;; Here we bind coding-system-for-read to nil so that coding tags
	311	;; in the files are respected even if the files are not yet
	312	;; byte-compiled
	313	(let ((coding-system-for-read nil)
	314	;; We must avoid clobbering this variable, in case the load
	315	;; files below use different coding systems.
	316	(last-coding-system-used last-coding-system-used))
	317	(cond ((string= "Korean" current-language-environment)
	318	(load "subst-jis")
	319	(load "subst-big5")
	320	(load "subst-gb2312")
	321	(load "subst-ksc"))
	322	((string= "Chinese-BIG5" current-language-environment)
	323	(load "subst-jis")
	324	(load "subst-ksc")
	325	(load "subst-gb2312")
	326	(load "subst-big5"))
	327	((string= "Chinese-GB" current-language-environment)
	328	(load "subst-jis")
	329	(load "subst-ksc")
	330	(load "subst-big5")
	331	(load "subst-gb2312"))
	332	(t
	333	(load "subst-ksc")
	334	(load "subst-gb2312")
	335	(load "subst-big5")
	336	(load "subst-jis")))) ; jis covers as much as big5, gb2312
	337
	338	(when redefined
	339	(define-translation-hash-table 'utf-subst-table-for-decode
	340	ucs-unicode-to-mule-cjk)
	341	(define-translation-hash-table 'utf-subst-table-for-encode
	342	ucs-mule-cjk-to-unicode)
	343	(set-char-table-extra-slot (get 'utf-translation-table-for-encode
	344	'translation-table)
	345	1 ucs-mule-cjk-to-unicode))
	346
	347	(setq utf-translate-cjk-lang-env current-language-environment)))
	348
	349	(defun utf-lookup-subst-table-for-decode (code-point)
	350	(if (and utf-translate-cjk-mode
	351	(not utf-translate-cjk-lang-env)
	352	(utf-translate-cjk-substitutable-p code-point))
	353	(utf-translate-cjk-load-tables))
	354	(gethash code-point
	355	(get 'utf-subst-table-for-decode 'translation-hash-table)))
	356
	357
	358	(defun utf-lookup-subst-table-for-encode (char)
	359	(if (and utf-translate-cjk-mode
	360	(not utf-translate-cjk-lang-env)
	361	(memq (char-charset char) utf-translate-cjk-charsets))
	362	(utf-translate-cjk-load-tables))
	363	(gethash char
	364	(get 'utf-subst-table-for-encode 'translation-hash-table)))
	365
	366	(define-minor-mode utf-translate-cjk-mode
	367	"Toggle whether UTF based coding systems de/encode CJK characters.
	368	If ARG is an integer, enable if ARG is positive and disable if
	369	zero or negative. This is a minor mode.
	370	Enabling this allows the coding systems mule-utf-8,
	371	mule-utf-16le and mule-utf-16be to encode characters in the charsets
	372	`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
	373	`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
	374	decode the corresponding unicodes into such characters.
	375
	376	Where the charsets overlap, the one preferred for decoding is chosen
	377	according to the language environment in effect when this option is
	378	turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
	379	Chinese-Big5 and jisx for other environments.
	380
	381	This mode is on by default. If you are not interested in CJK
	382	characters and want to avoid some overhead on encoding/decoding
	383	by the above coding systems, you can customize the user option
	384	`utf-translate-cjk-mode' to nil."
	385	:init-value t
	386	:version "22.1"
	387	:type 'boolean
	388	:group 'mule
	389	:global t
	390	(if utf-translate-cjk-mode
	391	(progn
	392	(define-translation-hash-table 'utf-subst-table-for-decode
	393	ucs-unicode-to-mule-cjk)
	394	(define-translation-hash-table 'utf-subst-table-for-encode
	395	ucs-mule-cjk-to-unicode)
	396	(set-char-table-extra-slot (get 'utf-translation-table-for-encode
	397	'translation-table)
	398	1 ucs-mule-cjk-to-unicode))
	399	(define-translation-hash-table 'utf-subst-table-for-decode
	400	(make-hash-table :test 'eq))
	401	(define-translation-hash-table 'utf-subst-table-for-encode
	402	(make-hash-table :test 'eq))
	403	(set-char-table-extra-slot (get 'utf-translation-table-for-encode
	404	'translation-table)
	405	1 nil))
	406
	407	;; Update safe-chars of mule-utf-* coding systems.
	408	(dolist (elt (coding-system-list t))
	409	(if (string-match "^mule-utf" (symbol-name elt))
	410	(let ((safe-charsets (coding-system-get elt 'safe-charsets))
	411	(safe-chars (coding-system-get elt 'safe-chars))
	412	(need-update nil))
	413	(dolist (charset utf-translate-cjk-charsets)
	414	(unless (eq utf-translate-cjk-mode (memq charset safe-charsets))
	415	(setq safe-charsets
	416	(if utf-translate-cjk-mode
	417	(cons charset safe-charsets)
	418	(delq charset safe-charsets))
	419	need-update t)
	420	(aset safe-chars (make-char charset) utf-translate-cjk-mode)))
	421	(when need-update
	422	(coding-system-put elt 'safe-charsets safe-charsets)
	423	(define-coding-system-internal elt))))))
	424
	425	(define-ccl-program ccl-mule-utf-untrans
	426	;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or
	427	;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF). Write
	428	;; eight-bit-control/graphic sequence (2 to 4 chars) representing
	429	;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified.
	430	;;
	431	;; This is a subrountine because we assume that this is called very
	432	;; rarely (so we don't have to worry about the overhead of the
	433	;; call).
	434	`(0
	435	((r5 = ,(charset-id 'eight-bit-control))
	436	(r6 = ,(charset-id 'eight-bit-graphic))
	437	(if (r0 < #x100)
	438	((r4 = ((r0 >> 6) \| #xC0))
	439	(write-multibyte-character r6 r4))
	440	((if (r0 < #x10000)
	441	((r4 = ((r0 >> 12) \| #xE0))
	442	(write-multibyte-character r6 r4))
	443	((r4 = ((r0 >> 18) \| #xF0))
	444	(write-multibyte-character r6 r4)
	445	(r4 = (((r0 >> 12) & #x3F) \| #x80))
	446	(if (r4 < #xA0)
	447	(write-multibyte-character r5 r4)
	448	(write-multibyte-character r6 r4))))
	449	(r4 = (((r0 >> 6) & #x3F) \| #x80))
	450	(if (r4 < #xA0)
	451	(write-multibyte-character r5 r4)
	452	(write-multibyte-character r6 r4))))
	453	(r4 = ((r0 & #x3F) \| #x80))
	454	(if (r4 < #xA0)
	455	(write-multibyte-character r5 r4)
	456	(write-multibyte-character r6 r4)))))
	457
	458	(define-ccl-program ccl-decode-mule-utf-8
	459	;;
	460	;; charset \| bytes in utf-8 \| bytes in emacs
	461	;; -----------------------+----------------+---------------
	462	;; ascii \| 1 \| 1
	463	;; -----------------------+----------------+---------------
	464	;; eight-bit-control \| 2 \| 2
	465	;; eight-bit-graphic \| 2 \| 1
	466	;; latin-iso8859-1 \| 2 \| 2
	467	;; -----------------------+----------------+---------------
	468	;; mule-unicode-0100-24ff \| 2 \| 4
	469	;; (< 0800) \| \|
	470	;; -----------------------+----------------+---------------
	471	;; mule-unicode-0100-24ff \| 3 \| 4
	472	;; (>= 8000) \| \|
	473	;; mule-unicode-2500-33ff \| 3 \| 4
	474	;; mule-unicode-e000-ffff \| 3 \| 4
	475	;; -----------------------+----------------+---------------
	476	;; invalid byte \| 1 \| 2
	477	;;
	478	;; Thus magnification factor is two.
	479	;;
	480	`(2
	481	((r6 = ,(charset-id 'latin-iso8859-1))
	482	(read r0)
	483	(loop
	484	(if (r0 < #x80)
	485	;; 1-byte encoding, i.e., ascii
	486	(write-read-repeat r0))
	487	(if (r0 < #xc2)
	488	;; continuation byte (invalid here) or 1st byte of overlong
	489	;; 2-byte sequence.
	490	((call ccl-mule-utf-untrans)
	491	(r6 = ,(charset-id 'latin-iso8859-1))
	492	(read r0)
	493	(repeat)))
	494
	495	;; Read the 2nd byte.
	496	(read r1)
	497	(if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
	498	((call ccl-mule-utf-untrans)
	499	(r6 = ,(charset-id 'latin-iso8859-1))
	500	;; Handle it in the next loop.
	501	(r0 = r1)
	502	(repeat)))
	503
	504	(if (r0 < #xe0)
	505	;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
	506	((r1 &= #x3F)
	507	(r1 \|= ((r0 & #x1F) << 6))
	508	;; Now r1 holds scalar value. We don't have to check
	509	;; `overlong sequence' because r0 >= 0xC2.
	510
	511	(if (r1 >= 256)
	512	;; mule-unicode-0100-24ff (< 0800)
	513	((r0 = r1)
	514	(lookup-integer utf-subst-table-for-decode r0 r1)
	515	(if (r7 == 0)
	516	((r0 = ,(charset-id 'mule-unicode-0100-24ff))
	517	(r1 -= #x0100)
	518	(r2 = (((r1 / 96) + 32) << 7))
	519	(r1 %= 96)
	520	(r1 += (r2 + 32))
	521	(translate-character
	522	utf-translation-table-for-decode r0 r1)))
	523	(write-multibyte-character r0 r1)
	524	(read r0)
	525	(repeat))
	526	(if (r1 >= 160)
	527	;; latin-iso8859-1
	528	((r0 = r1)
	529	(lookup-integer utf-subst-table-for-decode r0 r1)
	530	(if (r7 == 0)
	531	((r1 -= 128)
	532	(write-multibyte-character r6 r1))
	533	((write-multibyte-character r0 r1)))
	534	(read r0)
	535	(repeat))
	536	;; eight-bit-control
	537	((r0 = ,(charset-id 'eight-bit-control))
	538	(write-multibyte-character r0 r1)
	539	(read r0)
	540	(repeat))))))
	541
	542	;; Read the 3rd bytes.
	543	(read r2)
	544	(if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
	545	((call ccl-mule-utf-untrans)
	546	(r0 = r1)
	547	(call ccl-mule-utf-untrans)
	548	(r6 = ,(charset-id 'latin-iso8859-1))
	549	;; Handle it in the next loop.
	550	(r0 = r2)
	551	(repeat)))
	552
	553	(if (r0 < #xF0)
	554	;; 3byte encoding
	555	;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
	556	((r3 = ((r0 & #xF) << 12))
	557	(r3 \|= ((r1 & #x3F) << 6))
	558	(r3 \|= (r2 & #x3F))
	559
	560	(if (r3 < #x800) ; `overlong sequence'
	561	((call ccl-mule-utf-untrans)
	562	(r0 = r1)
	563	(call ccl-mule-utf-untrans)
	564	(r0 = r2)
	565	(call ccl-mule-utf-untrans)
	566	(r6 = ,(charset-id 'latin-iso8859-1))
	567	(read r0)
	568	(repeat)))
	569
	570	(if (r3 < #x2500)
	571	;; mule-unicode-0100-24ff (>= 0800)
	572	((r0 = r3)
	573	(lookup-integer utf-subst-table-for-decode r0 r1)
	574	(if (r7 == 0)
	575	((r0 = ,(charset-id 'mule-unicode-0100-24ff))
	576	(r3 -= #x0100)
	577	(r3 //= 96)
	578	(r1 = (r7 + 32))
	579	(r1 += ((r3 + 32) << 7))
	580	(translate-character
	581	utf-translation-table-for-decode r0 r1)))
	582	(write-multibyte-character r0 r1)
	583	(read r0)
	584	(repeat)))
	585
	586	(if (r3 < #x3400)
	587	;; mule-unicode-2500-33ff
	588	((r0 = r3) ; don't zap r3
	589	(lookup-integer utf-subst-table-for-decode r0 r1)
	590	(if (r7 == 0)
	591	((r0 = ,(charset-id 'mule-unicode-2500-33ff))
	592	(r3 -= #x2500)
	593	(r3 //= 96)
	594	(r1 = (r7 + 32))
	595	(r1 += ((r3 + 32) << 7))))
	596	(write-multibyte-character r0 r1)
	597	(read r0)
	598	(repeat)))
	599
	600	(if (r3 < #xE000)
	601	;; Try to convert to CJK chars, else
	602	;; keep them as eight-bit-{control\|graphic}.
	603	((r0 = r3)
	604	(lookup-integer utf-subst-table-for-decode r3 r1)
	605	(if r7
	606	;; got a translation
	607	((write-multibyte-character r3 r1)
	608	(read r0)
	609	(repeat))
	610	((call ccl-mule-utf-untrans)
	611	(r6 = ,(charset-id 'latin-iso8859-1))
	612	(read r0)
	613	(repeat)))))
	614
	615	;; mule-unicode-e000-ffff
	616	;; Fixme: fffe and ffff are invalid.
	617	(r0 = r3) ; don't zap r3
	618	(lookup-integer utf-subst-table-for-decode r0 r1)
	619	(if (r7 == 0)
	620	((r0 = ,(charset-id 'mule-unicode-e000-ffff))
	621	(r3 -= #xe000)
	622	(r3 //= 96)
	623	(r1 = (r7 + 32))
	624	(r1 += ((r3 + 32) << 7))))
	625	(write-multibyte-character r0 r1)
	626	(read r0)
	627	(repeat)))
	628
	629	;; Read the 4th bytes.
	630	(read r3)
	631	(if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
	632	((call ccl-mule-utf-untrans)
	633	(r0 = r1)
	634	(call ccl-mule-utf-untrans)
	635	(r0 = r2)
	636	(call ccl-mule-utf-untrans)
	637	(r6 = ,(charset-id 'latin-iso8859-1))
	638	;; Handle it in the next loop.
	639	(r0 = r3)
	640	(repeat)))
	641
	642	(if (r0 < #xF8)
	643	;; 4-byte encoding:
	644	;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
	645	;; keep those bytes as eight-bit-{control\|graphic}
	646	;; Fixme: allow lookup in utf-subst-table-for-decode.
	647	((r4 = ((r0 & #x7) << 18))
	648	(r4 \|= ((r1 & #x3F) << 12))
	649	(r4 \|= ((r2 & #x3F) << 6))
	650	(r4 \|= (r3 & #x3F))
	651
	652	(if (r4 < #x10000) ; `overlong sequence'
	653	((call ccl-mule-utf-untrans)
	654	(r0 = r1)
	655	(call ccl-mule-utf-untrans)
	656	(r0 = r2)
	657	(call ccl-mule-utf-untrans)
	658	(r0 = r3)
	659	(call ccl-mule-utf-untrans))
	660	((r0 = r4)
	661	(call ccl-mule-utf-untrans))))
	662
	663	;; Unsupported sequence.
	664	((call ccl-mule-utf-untrans)
	665	(r0 = r1)
	666	(call ccl-mule-utf-untrans)
	667	(r0 = r2)
	668	(call ccl-mule-utf-untrans)
	669	(r0 = r3)
	670	(call ccl-mule-utf-untrans)))
	671	(r6 = ,(charset-id 'latin-iso8859-1))
	672	(read r0)
	673	(repeat)))
	674
	675
	676	;; At EOF...
	677	(if (r0 >= 0)
	678	;; r0 >= #x80
	679	((call ccl-mule-utf-untrans)
	680	(if (r1 >= 0)
	681	((r0 = r1)
	682	(call ccl-mule-utf-untrans)
	683	(if (r2 >= 0)
	684	((r0 = r2)
	685	(call ccl-mule-utf-untrans)
	686	(if (r3 >= 0)
	687	((r0 = r3)
	688	(call ccl-mule-utf-untrans))))))))))
	689
	690	"CCL program to decode UTF-8.
	691	Basic decoding is done into the charsets ascii, latin-iso8859-1 and
	692	mule-unicode-*, but see also `utf-fragmentation-table' and
	693	`ucs-mule-cjk-to-unicode'.
	694	Encodings of un-representable Unicode characters are decoded asis into
	695	eight-bit-control and eight-bit-graphic characters.")
	696
	697	(define-ccl-program ccl-mule-utf-8-encode-untrans
	698	;; UTF-8 decoder generates an UTF-8 sequence represented by a
	699	;; sequence eight-bit-control/graphic chars for an untranslatable
	700	;; character and an invalid byte.
	701	;;
	702	;; This CCL parses that sequence (the first byte is already in r1),
	703	;; writes out the original bytes of that sequence, and sets r5 to
	704	;; -1.
	705	;;
	706	;; If the eight-bit-control/graphic sequence is shorter than what r1
	707	;; suggests, it sets r5 and r6 to the last character read that
	708	;; should be handled by the next loop of a caller.
	709	;;
	710	;; Note: For UTF-8 validation, we only check if a character is
	711	;; eight-bit-control/graphic or not. It may result in incorrect
	712	;; handling of random binary data, but such a data can't be encoded
	713	;; by UTF-8 anyway. At least, UTF-8 decoders doesn't generate such
	714	;; a sequence even if a source contains invalid byte-sequence.
	715	`(0
	716	(;; Read the 2nd byte.
	717	(read-multibyte-character r5 r6)
	718	(r0 = (r5 != ,(charset-id 'eight-bit-control)))
	719	(if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
	720	((write r1) ; invalid UTF-8
	721	(r1 = -1)
	722	(end)))
	723
	724	(if (r1 <= #xC3)
	725	;; 2-byte sequence for an originally invalid byte.
	726	((r6 &= #x3F)
	727	(r6 \|= ((r1 & #x1F) << 6))
	728	(write r6)
	729	(r5 = -1)
	730	(end)))
	731
	732	(write r1 r6)
	733	(r2 = r1)
	734	(r1 = -1)
	735	;; Read the 3rd byte.
	736	(read-multibyte-character r5 r6)
	737	(r0 = (r5 != ,(charset-id 'eight-bit-control)))
	738	(if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
	739	(end)) ; invalid UTF-8
	740	(write r6)
	741	(if (r2 < #xF0)
	742	;; 3-byte sequence for an untranslated character.
	743	((r5 = -1)
	744	(end)))
	745	;; Read the 4th byte.
	746	(read-multibyte-character r5 r6)
	747	(r0 = (r5 != ,(charset-id 'eight-bit-control)))
	748	(if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
	749	(end)) ; invalid UTF-8
	750	;; 4-byte sequence for an untranslated character.
	751	(write r6)
	752	(r5 = -1)
	753	(end))
	754
	755	;; At EOF...
	756	((r5 = -1)
	757	(if (r1 >= 0)
	758	(write r1)))))
	759
	760	(define-ccl-program ccl-encode-mule-utf-8
	761	`(1
	762	((r5 = -1)
	763	(loop
	764	(if (r5 < 0)
	765	(read-multibyte-character r0 r1)
	766	;; Pre-read character is in r5 (charset-ID) and r6 (code-point).
	767	((r0 = r5)
	768	(r1 = r6)
	769	(r5 = -1)))
	770	(translate-character utf-translation-table-for-encode r0 r1)
	771
	772	(if (r0 == ,(charset-id 'ascii))
	773	(write-repeat r1))
	774
	775	(if (r0 == ,(charset-id 'latin-iso8859-1))
	776	;; r1 scalar utf-8
	777	;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
	778	;; 20 0000 0000 1010 0000 1100 0010 1010 0000
	779	;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
	780	((write ((r1 >> 6) \| #xc2))
	781	(r1 &= #x3f)
	782	(r1 \|= #x80)
	783	(write-repeat r1)))
	784
	785	(if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
	786	((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
	787	;; #x3f80 == (0011 1111 1000 0000)b
	788	(r1 &= #x7f)
	789	(r1 += (r0 + 224)) ; 240 == -32 + #x0100
	790	;; now r1 holds scalar value
	791	(if (r1 < #x0800)
	792	;; 2byte encoding
	793	((write ((r1 >> 6) \| #xC0))
	794	(r1 &= #x3F)
	795	(r1 \|= #x80)
	796	(write-repeat r1))
	797	;; 3byte encoding
	798	((write ((r1 >> 12) \| #xE0))
	799	(write (((r1 & #x0FC0) >> 6) \| #x80))
	800	(r1 &= #x3F)
	801	(r1 \|= #x80)
	802	(write-repeat r1)))))
	803
	804	(if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
	805	((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
	806	(r1 &= #x7f)
	807	(r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
	808	;; now r1 holds scalar value
	809	(write ((r1 >> 12) \| #xE0))
	810	(write (((r1 & #x0FC0) >> 6) \| #x80))
	811	(r1 &= #x3F)
	812	(r1 \|= #x80)
	813	(write-repeat r1)))
	814
	815	(if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
	816	((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
	817	(r1 &= #x7f)
	818	(r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
	819	;; now r1 holds scalar value
	820	(write ((r1 >> 12) \| #xE0))
	821	(write (((r1 & #x0FC0) >> 6) \| #x80))
	822	(r1 &= #x3F)
	823	(r1 \|= #x80)
	824	(write-repeat r1)))
	825
	826	(if (r0 == ,(charset-id 'eight-bit-control))
	827	;; r1 scalar utf-8
	828	;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
	829	;; 80 0000 0000 1000 0000 1100 0010 1000 0000
	830	;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
	831	((write #xC2)
	832	(write-repeat r1)))
	833
	834	(if (r0 == ,(charset-id 'eight-bit-graphic))
	835	;; r1 scalar utf-8
	836	;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
	837	;; a0 0000 0000 1010 0000 1100 0010 1010 0000
	838	;; ff 0000 0000 1111 1111 1101 1111 1011 1111
	839	((r0 = (r1 >= #xC0))
	840	(r0 &= (r1 <= #xC3))
	841	(r4 = (r1 >= #xE1))
	842	(r4 &= (r1 <= #xF7))
	843	(r0 \|= r4)
	844	(if r0
	845	((call ccl-mule-utf-8-encode-untrans)
	846	(repeat))
	847	(write-repeat r1))))
	848
	849	(lookup-character utf-subst-table-for-encode r0 r1)
	850	(if r7 ; lookup succeeded
	851	(if (r0 < #x800)
	852	;; 2byte encoding
	853	((write ((r0 >> 6) \| #xC0))
	854	(r0 = ((r0 & #x3F) \| #x80))
	855	(write-repeat r0))
	856	;; 3byte encoding
	857	((write ((r0 >> 12) \| #xE0))
	858	(write (((r0 & #x0FC0) >> 6) \| #x80))
	859	(r0 = ((r0 & #x3F) \| #x80))
	860	(write-repeat r0))))
	861
	862	;; Unsupported character.
	863	;; Output U+FFFD, which is `ef bf bd' in UTF-8.
	864	(write #xef)
	865	(write #xbf)
	866	(write-repeat #xbd))))
	867	"CCL program to encode into UTF-8.")
	868
	869
	870	(define-ccl-program ccl-untranslated-to-ucs
	871	`(0
	872	(if (r1 == 0)
	873	nil
	874	(if (r0 <= #xC3) ; 2-byte encoding
	875	((r0 = ((r0 & #x3) << 6))
	876	(r0 \|= (r1 & #x3F))
	877	(r1 = 2))
	878	(if (r2 == 0)
	879	(r1 = 0)
	880	(if (r0 < #xF0) ; 3-byte encoding, as above
	881	((r0 = ((r0 & #xF) << 12))
	882	(r0 \|= ((r1 & #x3F) << 6))
	883	(r0 \|= (r2 & #x3F))
	884	(r1 = 3))
	885	(if (r3 == 0)
	886	(r1 = 0)
	887	((r0 = ((r0 & #x7) << 18))
	888	(r0 \|= ((r1 & #x3F) << 12))
	889	(r0 \|= ((r2 & #x3F) << 6))
	890	(r0 \|= (r3 & #x3F))
	891	(r1 = 4))))))))
	892	"Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
	893	Set r1 to the byte length. r0 == 0 for invalid sequence.")
	894
	895	(defvar utf-8-ccl-regs (make-vector 8 0))
	896
	897	(defsubst utf-8-untranslated-to-ucs ()
	898	"Return the UCS code for an untranslated sequence of raw bytes t point.
	899	Only for 3- or 4-byte sequences."
	900	(aset utf-8-ccl-regs 0 (or (char-after) 0))
	901	(aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
	902	(aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
	903	(aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
	904	(ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
	905
	906	(defun utf-8-help-echo (window object position)
	907	(format "Untranslated Unicode U+%04X"
	908	(get-char-property position 'untranslated-utf-8 object)))
	909
	910	;; We compose the untranslatable sequences into a single character,
	911	;; and move point to the next character.
	912	;; This is infelicitous for editing, because there's currently no
	913	;; mechanism for treating compositions as atomic, but is OK for
	914	;; display. They are composed to U+FFFD with help-echo which
	915	;; indicates the unicodes they represent. This function GCs too much.
	916
	917	;; If utf-translate-cjk-mode is non-nil, this function is called with
	918	;; HASH-TABLE which translates CJK characters into some of CJK
	919	;; charsets.
	920
	921	(defsubst utf-8-compose (hash-table)
	922	"Put a suitable composition on an untranslatable sequence at point.
	923	If HASH-TABLE is non-nil, try to translate CJK characters by it at first.
	924	Move point to the end of the sequence."
	925	(utf-8-untranslated-to-ucs)
	926	(let ((l (aref utf-8-ccl-regs 1))
	927	ch)
	928	(if (> l 0)
	929	(if (and hash-table
	930	(setq ch (gethash (aref utf-8-ccl-regs 0) hash-table)))
	931	(progn
	932	(insert ch)
	933	(delete-region (point) (min (point-max) (+ l (point)))))
	934	(setq ch (aref utf-8-ccl-regs 0))
	935	(put-text-property (point) (min (point-max) (+ l (point)))
	936	'untranslated-utf-8 ch)
	937	(put-text-property (point) (min (point-max) (+ l (point)))
	938	'help-echo 'utf-8-help-echo)
	939	(if (= l 2)
	940	(put-text-property (point) (min (point-max) (+ l (point)))
	941	'display (propertize (format "\\%03o" ch)
	942	'face 'escape-glyph))
	943	(compose-region (point) (+ l (point)) ?\e$,3u=\e(B))
	944	(forward-char l))
	945	(forward-char 1))))
	946
	947	(defcustom utf-8-compose-scripts nil
	948	"*Non-nil means compose various scripts on decoding utf-8 text."
	949	:group 'mule
	950	:version "22.1"
	951	:type 'boolean)
	952
	953	(defun utf-8-post-read-conversion (length)
	954	"Compose untranslated utf-8 sequences into single characters.
	955	If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
	956	Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
	957	(save-excursion
	958	(save-restriction
	959	(narrow-to-region (point) (+ (point) length))
	960	;; Can't do eval-when-compile to insert a multibyte constant
	961	;; version of the string in the loop, since it's always loaded as
	962	;; unibyte from a byte-compiled file.
	963	(let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
	964	(buffer-multibyte enable-multibyte-characters)
	965	hash-table ch)
	966	(set-buffer-multibyte t)
	967	(when utf-translate-cjk-mode
	968	(unless utf-translate-cjk-lang-env
	969	;; Check these characters in utf-translate-cjk-range.
	970	;; We may have to translate them to CJK charsets.
	971	(skip-chars-forward
	972	(concat range utf-translate-cjk-unicode-range-string))
	973	(unless (eobp)
	974	(utf-translate-cjk-load-tables)
	975	(setq range
	976	(concat range utf-translate-cjk-unicode-range-string)))
	977	(setq hash-table (get 'utf-subst-table-for-decode
	978	'translation-hash-table))))
	979	(while (and (skip-chars-forward range)
	980	(not (eobp)))
	981	(setq ch (following-char))
	982	(if (< ch 256)
	983	(utf-8-compose hash-table)
	984	(if (and hash-table
	985	(setq ch (gethash (encode-char ch 'ucs) hash-table)))
	986	(progn
	987	(insert ch)
	988	(delete-char 1))
	989	(forward-char 1))))
	990	(or buffer-multibyte
	991	(set-buffer-multibyte nil)))
	992
	993	(when (and utf-8-compose-scripts (> length 1))
	994	;; This let-binding avoids recursive auto-loading. And, we
	995	;; anyway don't have to run the following code while
	996	;; auto-loading.
	997	(let ((utf-8-compose-scripts nil))
	998	;; These currently have definitions which cover the relevant
	999	;; unicodes. We could avoid loading thai-util &c by checking
	1000	;; whether the region contains any characters with the appropriate
	1001	;; categories. There aren't yet Unicode-based rules for Tibetan.
	1002	(diacritic-compose-region (point-max) (point-min))
	1003	(thai-compose-region (point-max) (point-min))
	1004	(lao-compose-region (point-max) (point-min))
	1005	(devanagari-compose-region (point-max) (point-min))
	1006	(malayalam-compose-region (point-max) (point-min))
	1007	(tamil-compose-region (point-max) (point-min))))
	1008	(- (point-max) (point-min)))))
	1009
	1010	(defun utf-8-pre-write-conversion (beg end)
	1011	"Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
	1012	This is used as a post-read-conversion of utf-8 coding system."
	1013	(if (and utf-translate-cjk-mode
	1014	(not utf-translate-cjk-lang-env)
	1015	(if (stringp beg)
	1016	(string-match "\\cc\\\|\\cj\\\|\\ch" beg)
	1017	(save-excursion
	1018	(goto-char beg)
	1019	(re-search-forward "\\cc\\\|\\cj\\\|\\ch" end t))))
	1020	(utf-translate-cjk-load-tables))
	1021	nil)
	1022
	1023	(make-coding-system
	1024	'mule-utf-8 4 ?u
	1025	"UTF-8 encoding for Emacs-supported Unicode characters.
	1026	It supports Unicode characters of these ranges:
	1027	U+0000..U+33FF, U+E000..U+FFFF.
	1028	They correspond to these Emacs character sets:
	1029	ascii, latin-iso8859-1, mule-unicode-0100-24ff,
	1030	mule-unicode-2500-33ff, mule-unicode-e000-ffff
	1031
	1032	On decoding (e.g. reading a file), Unicode characters not in the above
	1033	ranges are decoded into sequences of eight-bit-control and
	1034	eight-bit-graphic characters to preserve their byte sequences. The
	1035	byte sequence is preserved on i/o for valid utf-8, but not necessarily
	1036	for invalid utf-8.
	1037
	1038	On encoding (e.g. writing a file), Emacs characters not belonging to
	1039	any of the character sets listed above are encoded into the UTF-8 byte
	1040	sequence representing U+FFFD (REPLACEMENT CHARACTER)."
	1041
	1042	'(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
	1043	`((safe-charsets
	1044	ascii
	1045	eight-bit-control
	1046	eight-bit-graphic
	1047	latin-iso8859-1
	1048	mule-unicode-0100-24ff
	1049	mule-unicode-2500-33ff
	1050	mule-unicode-e000-ffff
	1051	,@(if utf-translate-cjk-mode
	1052	utf-translate-cjk-charsets))
	1053	(mime-charset . utf-8)
	1054	(coding-category . coding-category-utf-8)
	1055	(valid-codes (0 . 255))
	1056	(pre-write-conversion . utf-8-pre-write-conversion)
	1057	(post-read-conversion . utf-8-post-read-conversion)
	1058	(translation-table-for-encode . utf-translation-table-for-encode)
	1059	(dependency unify-8859-on-encoding-mode
	1060	unify-8859-on-decoding-mode
	1061	utf-fragment-on-decoding
	1062	utf-translate-cjk-mode)))
	1063
	1064	(define-coding-system-alias 'utf-8 'mule-utf-8)
	1065
	1066	;; I think this needs special private charsets defined for the
	1067	;; untranslated sequences, if it's going to work well.
	1068
	1069	;;; (defun utf-8-compose-function (pos to pattern &optional string)
	1070	;;; (let* ((prop (get-char-property pos 'composition string))
	1071	;;; (l (and prop (- (cadr prop) (car prop)))))
	1072	;;; (cond ((and l (> l (- to pos)))
	1073	;;; (delete-region pos to))
	1074	;;; ((and (> (char-after pos) 224)
	1075	;;; (< (char-after pos) 256)
	1076	;;; (save-restriction
	1077	;;; (narrow-to-region pos to)
	1078	;;; (utf-8-compose)))
	1079	;;; t))))
	1080
	1081	;;; (dotimes (i 96)
	1082	;;; (aset composition-function-table
	1083	;;; (+ 128 i)
	1084	;;; `((,(string-as-multibyte "[\200-\237\240-\377]")
	1085	;;; . utf-8-compose-function))))
	1086
	1087	;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5
	1088	;;; utf-8.el ends here