Commit | Line | Data |
---|---|---|
9ca2ac2d | 1 | ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
5ba7a870 KH |
2 | |
3 | ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
4 | ;; Licensed to the Free Software Foundation. | |
9ca2ac2d | 5 | ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. |
5ba7a870 | 6 | |
aa15b3e5 | 7 | ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
9ca2ac2d | 8 | ;; Maintainer: FSF |
c49b8288 | 9 | ;; Keywords: multilingual, Unicode, UTF-8, i18n |
5ba7a870 KH |
10 | |
11 | ;; This file is part of GNU Emacs. | |
12 | ||
13 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
14 | ;; it under the terms of the GNU General Public License as published by | |
15 | ;; the Free Software Foundation; either version 2, or (at your option) | |
16 | ;; any later version. | |
17 | ||
18 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
19 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | ;; GNU General Public License for more details. | |
22 | ||
23 | ;; You should have received a copy of the GNU General Public License | |
24 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
25 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
26 | ;; Boston, MA 02111-1307, USA. | |
27 | ||
28 | ;;; Commentary: | |
29 | ||
aa2e3f49 DL |
30 | ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
31 | ;; of the following character sets to and from UTF-8: | |
5ba7a870 KH |
32 | ;; |
33 | ;; ascii | |
34 | ;; eight-bit-control | |
35 | ;; latin-iso8859-1 | |
36 | ;; mule-unicode-0100-24ff | |
37 | ;; mule-unicode-2500-33ff | |
38 | ;; mule-unicode-e000-ffff | |
39 | ;; | |
c49b8288 DL |
40 | ;; On decoding, Unicode characters that do not fit into the above |
41 | ;; character sets are handled as `eight-bit-control' or | |
42 | ;; `eight-bit-graphic' characters to retain the information about the | |
9ca2ac2d DL |
43 | ;; original byte sequence and text properties record the corresponding |
44 | ;; unicode. | |
45 | ;; | |
46 | ;; Fixme: note that reading and writing invalid utf-8 may not be | |
47 | ;; idempotent -- to represent the bytes to fix that needs a new charset. | |
aa2e3f49 | 48 | ;; |
ad88f5c5 | 49 | ;; Characters from other character sets can be encoded with mule-utf-8 |
ccdd5c61 | 50 | ;; by populating the translation table |
535665b8 | 51 | ;; `utf-translation-table-for-encode'. Hash tables |
ad88f5c5 KH |
52 | ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
53 | ;; used to support encoding and decoding of about a quarter of the CJK | |
54 | ;; space between U+3400 and U+DFFF. | |
c49b8288 DL |
55 | |
56 | ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | |
5ba7a870 KH |
57 | |
58 | ;; scalar | utf-8 | |
59 | ;; value | 1st byte | 2nd byte | 3rd byte | |
60 | ;; --------------------+-----------+-----------+---------- | |
61 | ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
62 | ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
63 | ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
64 | ||
65 | ;;; Code: | |
66 | ||
ad88f5c5 KH |
67 | (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil) |
68 | "Char table mapping characters to latin-iso8859-1 or mule-unicode-*. | |
9ca2ac2d | 69 | |
ad88f5c5 KH |
70 | If `unify-8859-on-encoding-mode' is non-nil, this table populates the |
71 | translation-table named `utf-translation-table-for-encode'.") | |
72 | ||
73 | (define-translation-table 'utf-translation-table-for-encode) | |
9ca2ac2d | 74 | |
9ca2ac2d DL |
75 | |
76 | ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | |
77 | ;; space of mule-unicode. For Latin scripts this isn't very | |
78 | ;; important. Hebrew and Arabic might go here too when there's proper | |
79 | ;; support for them. | |
ad88f5c5 KH |
80 | |
81 | (defvar utf-fragmentation-table (make-char-table 'translation-table nil) | |
82 | "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*. | |
83 | ||
84 | If `utf-fragment-on-decoding' is non-nil, this table populates the | |
85 | translation-table named `utf-translation-table-for-decode'") | |
86 | ||
87 | (defvar utf-defragmentation-table (make-char-table 'translation-table nil) | |
88 | "Char-table for reverse mapping of `utf-fragmentation-table'. | |
89 | ||
90 | If `utf-fragment-on-decoding' is non-nil and | |
91 | `unify-8859-on-encoding-mode' is nil, this table populates the | |
92 | translation-table named `utf-translation-table-for-encode'") | |
93 | ||
94 | (define-translation-table 'utf-translation-table-for-decode) | |
95 | ||
96 | ||
0c8410d5 | 97 | (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
ad88f5c5 KH |
98 | "Hash table mapping Emacs CJK character sets to Unicode code points. |
99 | ||
100 | If `utf-translate-cjk' is non-nil, this table populates the | |
101 | translation-hash-table named `utf-subst-table-for-encode'.") | |
102 | ||
0c8410d5 DL |
103 | (define-translation-hash-table 'utf-subst-table-for-encode |
104 | ucs-mule-cjk-to-unicode) | |
ad88f5c5 | 105 | |
0c8410d5 | 106 | (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) |
ad88f5c5 KH |
107 | "Hash table mapping Unicode code points to Emacs CJK character sets. |
108 | ||
109 | If `utf-translate-cjk' is non-nil, this table populates the | |
110 | translation-hash-table named `utf-subst-table-for-decode'.") | |
111 | ||
112 | (define-translation-hash-table 'utf-subst-table-for-decode | |
0c8410d5 | 113 | ucs-unicode-to-mule-cjk) |
ad88f5c5 | 114 | |
9ca2ac2d DL |
115 | (mapc |
116 | (lambda (pair) | |
ad88f5c5 KH |
117 | (aset utf-fragmentation-table (car pair) (cdr pair)) |
118 | (aset utf-defragmentation-table (cdr pair) (car pair))) | |
9ca2ac2d DL |
119 | '((?\e$,1&d\e(B . ?\e,F4\e(B) (?\e$,1&e\e(B . ?\e,F5\e(B) (?\e$,1&f\e(B . ?\e,F6\e(B) (?\e$,1&h\e(B . ?\e,F8\e(B) (?\e$,1&i\e(B . ?\e,F9\e(B) |
120 | (?\e$,1&j\e(B . ?\e,F:\e(B) (?\e$,1&l\e(B . ?\e,F<\e(B) (?\e$,1&n\e(B . ?\e,F>\e(B) (?\e$,1&o\e(B . ?\e,F?\e(B) (?\e$,1&p\e(B . ?\e,F@\e(B) | |
121 | (?\e$,1&q\e(B . ?\e,FA\e(B) (?\e$,1&r\e(B . ?\e,FB\e(B) (?\e$,1&s\e(B . ?\e,FC\e(B) (?\e$,1&t\e(B . ?\e,FD\e(B) (?\e$,1&u\e(B . ?\e,FE\e(B) | |
122 | (?\e$,1&v\e(B . ?\e,FF\e(B) (?\e$,1&w\e(B . ?\e,FG\e(B) (?\e$,1&x\e(B . ?\e,FH\e(B) (?\e$,1&y\e(B . ?\e,FI\e(B) (?\e$,1&z\e(B . ?\e,FJ\e(B) | |
123 | (?\e$,1&{\e(B . ?\e,FK\e(B) (?\e$,1&|\e(B . ?\e,FL\e(B) (?\e$,1&}\e(B . ?\e,FM\e(B) (?\e$,1&~\e(B . ?\e,FN\e(B) (?\e$,1&\7f\e(B . ?\e,FO\e(B) | |
124 | (?\e$,1' \e(B . ?\e,FP\e(B) (?\e$,1'!\e(B . ?\e,FQ\e(B) (?\e$,1'#\e(B . ?\e,FS\e(B) (?\e$,1'$\e(B . ?\e,FT\e(B) (?\e$,1'%\e(B . ?\e,FU\e(B) | |
125 | (?\e$,1'&\e(B . ?\e,FV\e(B) (?\e$,1''\e(B . ?\e,FW\e(B) (?\e$,1'(\e(B . ?\e,FX\e(B) (?\e$,1')\e(B . ?\e,FY\e(B) (?\e$,1'*\e(B . ?\e,FZ\e(B) | |
126 | (?\e$,1'+\e(B . ?\e,F[\e(B) (?\e$,1',\e(B . ?\e,F\\e(B) (?\e$,1'-\e(B . ?\e,F]\e(B) (?\e$,1'.\e(B . ?\e,F^\e(B) (?\e$,1'/\e(B . ?\e,F_\e(B) | |
127 | (?\e$,1'0\e(B . ?\e,F`\e(B) (?\e$,1'1\e(B . ?\e,Fa\e(B) (?\e$,1'2\e(B . ?\e,Fb\e(B) (?\e$,1'3\e(B . ?\e,Fc\e(B) (?\e$,1'4\e(B . ?\e,Fd\e(B) | |
128 | (?\e$,1'5\e(B . ?\e,Fe\e(B) (?\e$,1'6\e(B . ?\e,Ff\e(B) (?\e$,1'7\e(B . ?\e,Fg\e(B) (?\e$,1'8\e(B . ?\e,Fh\e(B) (?\e$,1'9\e(B . ?\e,Fi\e(B) | |
129 | (?\e$,1':\e(B . ?\e,Fj\e(B) (?\e$,1';\e(B . ?\e,Fk\e(B) (?\e$,1'<\e(B . ?\e,Fl\e(B) (?\e$,1'=\e(B . ?\e,Fm\e(B) (?\e$,1'>\e(B . ?\e,Fn\e(B) | |
130 | (?\e$,1'?\e(B . ?\e,Fo\e(B) (?\e$,1'@\e(B . ?\e,Fp\e(B) (?\e$,1'A\e(B . ?\e,Fq\e(B) (?\e$,1'B\e(B . ?\e,Fr\e(B) (?\e$,1'C\e(B . ?\e,Fs\e(B) | |
131 | (?\e$,1'D\e(B . ?\e,Ft\e(B) (?\e$,1'E\e(B . ?\e,Fu\e(B) (?\e$,1'F\e(B . ?\e,Fv\e(B) (?\e$,1'G\e(B . ?\e,Fw\e(B) (?\e$,1'H\e(B . ?\e,Fx\e(B) | |
132 | (?\e$,1'I\e(B . ?\e,Fy\e(B) (?\e$,1'J\e(B . ?\e,Fz\e(B) (?\e$,1'K\e(B . ?\e,F{\e(B) (?\e$,1'L\e(B . ?\e,F|\e(B) (?\e$,1'M\e(B . ?\e,F}\e(B) | |
133 | (?\e$,1'N\e(B . ?\e,F~\e(B) | |
134 | ||
135 | (?\e$,1(!\e(B . ?\e,L!\e(B) (?\e$,1("\e(B . ?\e,L"\e(B) (?\e$,1(#\e(B . ?\e,L#\e(B) (?\e$,1($\e(B . ?\e,L$\e(B) | |
136 | (?\e$,1(%\e(B . ?\e,L%\e(B) (?\e$,1(&\e(B . ?\e,L&\e(B) (?\e$,1('\e(B . ?\e,L'\e(B) (?\e$,1((\e(B . ?\e,L(\e(B) (?\e$,1()\e(B . ?\e,L)\e(B) | |
137 | (?\e$,1(*\e(B . ?\e,L*\e(B) (?\e$,1(+\e(B . ?\e,L+\e(B) (?\e$,1(,\e(B . ?\e,L,\e(B) (?\e$,1(.\e(B . ?\e,L.\e(B) (?\e$,1(/\e(B . ?\e,L/\e(B) | |
138 | (?\e$,1(0\e(B . ?\e,L0\e(B) (?\e$,1(1\e(B . ?\e,L1\e(B) (?\e$,1(2\e(B . ?\e,L2\e(B) (?\e$,1(3\e(B . ?\e,L3\e(B) (?\e$,1(4\e(B . ?\e,L4\e(B) | |
139 | (?\e$,1(5\e(B . ?\e,L5\e(B) (?\e$,1(6\e(B . ?\e,L6\e(B) (?\e$,1(7\e(B . ?\e,L7\e(B) (?\e$,1(8\e(B . ?\e,L8\e(B) (?\e$,1(9\e(B . ?\e,L9\e(B) | |
140 | (?\e$,1(:\e(B . ?\e,L:\e(B) (?\e$,1(;\e(B . ?\e,L;\e(B) (?\e$,1(<\e(B . ?\e,L<\e(B) (?\e$,1(=\e(B . ?\e,L=\e(B) (?\e$,1(>\e(B . ?\e,L>\e(B) | |
141 | (?\e$,1(?\e(B . ?\e,L?\e(B) (?\e$,1(@\e(B . ?\e,L@\e(B) (?\e$,1(A\e(B . ?\e,LA\e(B) (?\e$,1(B\e(B . ?\e,LB\e(B) (?\e$,1(C\e(B . ?\e,LC\e(B) | |
142 | (?\e$,1(D\e(B . ?\e,LD\e(B) (?\e$,1(E\e(B . ?\e,LE\e(B) (?\e$,1(F\e(B . ?\e,LF\e(B) (?\e$,1(G\e(B . ?\e,LG\e(B) (?\e$,1(H\e(B . ?\e,LH\e(B) | |
143 | (?\e$,1(I\e(B . ?\e,LI\e(B) (?\e$,1(J\e(B . ?\e,LJ\e(B) (?\e$,1(K\e(B . ?\e,LK\e(B) (?\e$,1(L\e(B . ?\e,LL\e(B) (?\e$,1(M\e(B . ?\e,LM\e(B) | |
144 | (?\e$,1(N\e(B . ?\e,LN\e(B) (?\e$,1(O\e(B . ?\e,LO\e(B) (?\e$,1(P\e(B . ?\e,LP\e(B) (?\e$,1(Q\e(B . ?\e,LQ\e(B) (?\e$,1(R\e(B . ?\e,LR\e(B) | |
145 | (?\e$,1(S\e(B . ?\e,LS\e(B) (?\e$,1(T\e(B . ?\e,LT\e(B) (?\e$,1(U\e(B . ?\e,LU\e(B) (?\e$,1(V\e(B . ?\e,LV\e(B) (?\e$,1(W\e(B . ?\e,LW\e(B) | |
146 | (?\e$,1(X\e(B . ?\e,LX\e(B) (?\e$,1(Y\e(B . ?\e,LY\e(B) (?\e$,1(Z\e(B . ?\e,LZ\e(B) (?\e$,1([\e(B . ?\e,L[\e(B) (?\e$,1(\\e(B . ?\e,L\\e(B) | |
147 | (?\e$,1(]\e(B . ?\e,L]\e(B) (?\e$,1(^\e(B . ?\e,L^\e(B) (?\e$,1(_\e(B . ?\e,L_\e(B) (?\e$,1(`\e(B . ?\e,L`\e(B) (?\e$,1(a\e(B . ?\e,La\e(B) | |
148 | (?\e$,1(b\e(B . ?\e,Lb\e(B) (?\e$,1(c\e(B . ?\e,Lc\e(B) (?\e$,1(d\e(B . ?\e,Ld\e(B) (?\e$,1(e\e(B . ?\e,Le\e(B) (?\e$,1(f\e(B . ?\e,Lf\e(B) | |
149 | (?\e$,1(g\e(B . ?\e,Lg\e(B) (?\e$,1(h\e(B . ?\e,Lh\e(B) (?\e$,1(i\e(B . ?\e,Li\e(B) (?\e$,1(j\e(B . ?\e,Lj\e(B) (?\e$,1(k\e(B . ?\e,Lk\e(B) | |
150 | (?\e$,1(l\e(B . ?\e,Ll\e(B) (?\e$,1(m\e(B . ?\e,Lm\e(B) (?\e$,1(n\e(B . ?\e,Ln\e(B) (?\e$,1(o\e(B . ?\e,Lo\e(B) (?\e$,1(q\e(B . ?\e,Lq\e(B) | |
151 | (?\e$,1(r\e(B . ?\e,Lr\e(B) (?\e$,1(s\e(B . ?\e,Ls\e(B) (?\e$,1(t\e(B . ?\e,Lt\e(B) (?\e$,1(u\e(B . ?\e,Lu\e(B) (?\e$,1(v\e(B . ?\e,Lv\e(B) | |
152 | (?\e$,1(w\e(B . ?\e,Lw\e(B) (?\e$,1(x\e(B . ?\e,Lx\e(B) (?\e$,1(y\e(B . ?\e,Ly\e(B) (?\e$,1(z\e(B . ?\e,Lz\e(B) (?\e$,1({\e(B . ?\e,L{\e(B) | |
153 | (?\e$,1(|\e(B . ?\e,L|\e(B) (?\e$,1(~\e(B . ?\e,L~\e(B) (?\e$,1(\7f\e(B . ?\e,L\7f\e(B))) | |
154 | ||
ad88f5c5 KH |
155 | |
156 | (defcustom utf-fragment-on-decoding nil | |
157 | "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets. | |
9ca2ac2d DL |
158 | Setting this means that the relevant Cyrillic and Greek characters are |
159 | decoded into the iso8859 charsets rather than into | |
3873f5a5 | 160 | mule-unicode-0100-24ff. The iso8859 charsets take half as much space |
9ca2ac2d DL |
161 | in the buffer, but using them may affect how the buffer can be re-encoded |
162 | and may require a different input method to search for them, for instance. | |
163 | See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' | |
3873f5a5 KH |
164 | for mechanisms to make this largely transparent. |
165 | ||
166 | Setting this variable outside customize has no effect." | |
9ca2ac2d | 167 | :set (lambda (s v) |
ad88f5c5 KH |
168 | (if v |
169 | (progn | |
170 | (define-translation-table 'utf-translation-table-for-decode | |
171 | utf-fragmentation-table) | |
172 | ;; Even if unify-8859-on-encoding-mode is off, make | |
173 | ;; mule-utf-* encode characters in | |
174 | ;; utf-fragmentation-table. | |
175 | (unless (eq (get 'utf-translation-table-for-encode | |
176 | 'translation-table) | |
177 | ucs-mule-to-mule-unicode) | |
178 | (define-translation-table 'utf-translation-table-for-encode | |
535665b8 | 179 | utf-defragmentation-table))) |
ad88f5c5 KH |
180 | (define-translation-table 'utf-translation-table-for-decode) |
181 | ;; When unify-8859-on-encoding-mode is off, be sure to make | |
182 | ;; mule-utf-* disabled for characters in | |
183 | ;; utf-fragmentation-table. | |
184 | (unless (eq (get 'utf-translation-table-for-encode | |
185 | 'translation-table) | |
186 | ucs-mule-to-mule-unicode) | |
535665b8 | 187 | (define-translation-table 'utf-translation-table-for-encode))) |
9ca2ac2d DL |
188 | (set-default s v)) |
189 | :version "21.4" | |
190 | :type 'boolean | |
191 | :group 'mule) | |
192 | ||
ad88f5c5 KH |
193 | (defcustom utf-translate-cjk nil |
194 | "Whether the UTF based coding systems should decode/encode CJK characters. | |
ccdd5c61 DL |
195 | Enabling this loads tables which allow the coding systems mule-utf-8, |
196 | mule-utf-16-le and mule-utf-16-be to encode characters in the charsets | |
197 | `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', | |
198 | `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to | |
199 | decode the corresponding unicodes into such characters. | |
200 | ||
201 | Where the charsets overlap, the one preferred for decoding is chosen | |
202 | according to the language environment in effect when this option is | |
203 | turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for | |
204 | Chinese-Big5 and jisx for other environments. | |
205 | ||
206 | The tables are large (over 40000 entries), so this option is not the | |
207 | default. Also, installing them may be rather slow." | |
9ca2ac2d | 208 | :set (lambda (s v) |
ad88f5c5 | 209 | (if v |
0c8410d5 DL |
210 | ;; Fixme: Allow the use of the CJK charsets to be |
211 | ;; customized by reordering and possible omission. | |
ad88f5c5 | 212 | (progn |
0c8410d5 DL |
213 | ;; Redefine them with realistic initial sizes and a |
214 | ;; smallish rehash size to avoid wasting significant | |
215 | ;; space after they're built. | |
216 | (setq ucs-mule-cjk-to-unicode | |
217 | (make-hash-table :test 'eq :size 43000 :rehash-size 1000) | |
218 | ucs-unicode-to-mule-cjk | |
219 | (make-hash-table :test 'eq :size 43000 :rehash-size 1000)) | |
ccdd5c61 DL |
220 | ;; Load the files explicitly, to avoid having to keep |
221 | ;; around the large tables they contain (as well as the | |
222 | ;; ones which get built). | |
223 | (cond | |
224 | ((string= "Korean" current-language-environment) | |
225 | (load "subst-jis") | |
226 | (load "subst-big5") | |
227 | (load "subst-gb2312") | |
228 | (load "subst-ksc")) | |
229 | ((string= "Chinese-BIG5" current-language-environment) | |
230 | (load "subst-jis") | |
231 | (load "subst-ksc") | |
232 | (load "subst-gb2312") | |
233 | (load "subst-big5")) | |
234 | ((string= "Chinese-GB" current-language-environment) | |
235 | (load "subst-jis") | |
236 | (load "subst-ksc") | |
237 | (load "subst-big5") | |
238 | (load "subst-gb2312")) | |
239 | (t | |
240 | (load "subst-ksc") | |
241 | (load "subst-gb2312") | |
242 | (load "subst-big5") | |
243 | (load "subst-jis"))) ; jis covers as much as big5, gb2312 | |
ad88f5c5 KH |
244 | (let ((table (make-char-table 'translation-table))) |
245 | (maphash (lambda (k v) | |
246 | (aset table k t)) | |
247 | ucs-mule-cjk-to-unicode) | |
ad88f5c5 KH |
248 | (define-translation-hash-table 'utf-subst-table-for-decode |
249 | ucs-unicode-to-mule-cjk) | |
250 | (define-translation-hash-table 'utf-subst-table-for-encode | |
251 | ucs-mule-cjk-to-unicode)) | |
ad88f5c5 KH |
252 | (define-translation-hash-table 'utf-subst-table-for-decode |
253 | (make-hash-table :test 'eq)) | |
254 | (define-translation-hash-table 'utf-subst-table-for-encode | |
535665b8 | 255 | (make-hash-table :test 'eq)))) |
9ca2ac2d DL |
256 | (set-default s v)) |
257 | :version "21.4" | |
258 | :type 'boolean | |
ccdd5c61 | 259 | :set-after '(current-language-environment) |
9ca2ac2d DL |
260 | :group 'mule) |
261 | ||
5ba7a870 KH |
262 | (define-ccl-program ccl-decode-mule-utf-8 |
263 | ;; | |
264 | ;; charset | bytes in utf-8 | bytes in emacs | |
265 | ;; -----------------------+----------------+--------------- | |
266 | ;; ascii | 1 | 1 | |
267 | ;; -----------------------+----------------+--------------- | |
268 | ;; eight-bit-control | 2 | 2 | |
aa2e3f49 | 269 | ;; eight-bit-graphic | 2 | 1 |
5ba7a870 KH |
270 | ;; latin-iso8859-1 | 2 | 2 |
271 | ;; -----------------------+----------------+--------------- | |
272 | ;; mule-unicode-0100-24ff | 2 | 4 | |
273 | ;; (< 0800) | | | |
274 | ;; -----------------------+----------------+--------------- | |
275 | ;; mule-unicode-0100-24ff | 3 | 4 | |
276 | ;; (>= 8000) | | | |
277 | ;; mule-unicode-2500-33ff | 3 | 4 | |
278 | ;; mule-unicode-e000-ffff | 3 | 4 | |
279 | ;; | |
280 | ;; Thus magnification factor is two. | |
281 | ;; | |
282 | `(2 | |
3d0e328b GM |
283 | ((r5 = ,(charset-id 'eight-bit-control)) |
284 | (r6 = ,(charset-id 'eight-bit-graphic)) | |
285 | (loop | |
67ff2216 | 286 | (r0 = -1) |
5ba7a870 KH |
287 | (read r0) |
288 | ||
289 | ;; 1byte encoding, i.e., ascii | |
290 | (if (r0 < #x80) | |
67ff2216 | 291 | ((write r0)) |
9ca2ac2d | 292 | (if (r0 < #xc0) ; continuation byte (invalid here) |
67ff2216 KH |
293 | ((if (r0 < #xa0) |
294 | (write-multibyte-character r5 r0) | |
295 | (write-multibyte-character r6 r0))) | |
9ca2ac2d DL |
296 | ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
297 | (if (r0 < #xe0) | |
67ff2216 KH |
298 | ((r1 = -1) |
299 | (read r1) | |
9ca2ac2d DL |
300 | |
301 | (if ((r1 & #b11000000) != #b10000000) | |
302 | ;; Invalid 2-byte sequence | |
3d0e328b GM |
303 | ((if (r0 < #xa0) |
304 | (write-multibyte-character r5 r0) | |
305 | (write-multibyte-character r6 r0)) | |
306 | (if (r1 < #x80) | |
307 | (write r1) | |
308 | (if (r1 < #xa0) | |
309 | (write-multibyte-character r5 r1) | |
9ca2ac2d DL |
310 | (write-multibyte-character r6 r1)))) |
311 | ||
312 | ((r3 = r0) ; save in case of overlong sequence | |
313 | (r2 = r1) | |
314 | (r0 &= #x1f) | |
315 | (r0 <<= 6) | |
9ca2ac2d DL |
316 | (r1 &= #x3f) |
317 | (r1 += r0) | |
318 | ;; Now r1 holds scalar value | |
319 | ||
320 | (if (r1 < 128) ; `overlong sequence' | |
321 | ((if (r3 < #xa0) | |
322 | (write-multibyte-character r5 r3) | |
323 | (write-multibyte-character r6 r3)) | |
324 | (if (r2 < #x80) | |
325 | (write r2) | |
326 | (if (r2 < #xa0) | |
327 | (write-multibyte-character r5 r2) | |
328 | (write-multibyte-character r6 r2)))) | |
329 | ||
330 | ;; eight-bit-control | |
331 | (if (r1 < 160) | |
332 | ((write-multibyte-character r5 r1)) | |
333 | ||
334 | ;; latin-iso8859-1 | |
335 | (if (r1 < 256) | |
336 | ((r0 = ,(charset-id 'latin-iso8859-1)) | |
337 | (r1 -= 128) | |
338 | (write-multibyte-character r0 r1)) | |
339 | ||
340 | ;; mule-unicode-0100-24ff (< 0800) | |
341 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
342 | (r1 -= #x0100) | |
343 | (r2 = (((r1 / 96) + 32) << 7)) | |
344 | (r1 %= 96) | |
345 | (r1 += (r2 + 32)) | |
346 | (translate-character | |
ad88f5c5 | 347 | utf-translation-table-for-decode r0 r1) |
9ca2ac2d DL |
348 | (write-multibyte-character r0 r1)))))))) |
349 | ||
350 | ;; 3byte encoding | |
351 | ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
352 | (if (r0 < #xf0) | |
67ff2216 KH |
353 | ((r1 = -1) |
354 | (r2 = -1) | |
355 | (read r1 r2) | |
9ca2ac2d DL |
356 | |
357 | ;; This is set to 1 if the encoding is invalid. | |
358 | (r4 = 0) | |
359 | ||
360 | (r3 = (r1 & #b11000000)) | |
361 | (r3 |= ((r2 >> 2) & #b00110000)) | |
362 | (if (r3 != #b10100000) | |
363 | (r4 = 1) | |
364 | ((r3 = ((r0 & #x0f) << 12)) | |
365 | (r3 += ((r1 & #x3f) << 6)) | |
366 | (r3 += (r2 & #x3f)) | |
367 | (if (r3 < #x0800) | |
368 | (r4 = 1)))) | |
369 | ||
370 | (if (r4 != 0) | |
371 | ;; Invalid 3-byte sequence | |
372 | ((if (r0 < #xa0) | |
373 | (write-multibyte-character r5 r0) | |
374 | (write-multibyte-character r6 r0)) | |
375 | (if (r1 < #x80) | |
376 | (write r1) | |
377 | (if (r1 < #xa0) | |
378 | (write-multibyte-character r5 r1) | |
379 | (write-multibyte-character r6 r1))) | |
380 | (if (r2 < #x80) | |
381 | (write r2) | |
382 | (if (r2 < #xa0) | |
383 | (write-multibyte-character r5 r2) | |
384 | (write-multibyte-character r6 r2)))) | |
a1506d29 | 385 | |
9ca2ac2d DL |
386 | ;; mule-unicode-0100-24ff (>= 0800) |
387 | ((if (r3 < #x2500) | |
388 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
389 | (r3 -= #x0100) | |
3d0e328b GM |
390 | (r3 //= 96) |
391 | (r1 = (r7 + 32)) | |
392 | (r1 += ((r3 + 32) << 7)) | |
9ca2ac2d | 393 | (translate-character |
ad88f5c5 | 394 | utf-translation-table-for-decode r0 r1) |
3d0e328b | 395 | (write-multibyte-character r0 r1)) |
a1506d29 | 396 | |
9ca2ac2d | 397 | ;; mule-unicode-2500-33ff |
9ca2ac2d | 398 | (if (r3 < #x3400) |
ccdd5c61 DL |
399 | ((r4 = r3) ; don't zap r3 |
400 | (lookup-integer utf-subst-table-for-decode r4 r5) | |
401 | (if r7 | |
402 | ;; got a translation | |
403 | ((write-multibyte-character r4 r5) | |
404 | ;; Zapped through register starvation. | |
405 | (r5 = ,(charset-id 'eight-bit-control))) | |
406 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
407 | (r3 -= #x2500) | |
408 | (r3 //= 96) | |
409 | (r1 = (r7 + 32)) | |
410 | (r1 += ((r3 + 32) << 7)) | |
411 | (write-multibyte-character r0 r1)))) | |
9ca2ac2d DL |
412 | |
413 | ;; U+3400 .. U+D7FF | |
414 | ;; Try to convert to CJK chars, else keep | |
415 | ;; them as eight-bit-{control|graphic}. | |
416 | (if (r3 < #xd800) | |
417 | ((r4 = r3) ; don't zap r3 | |
ad88f5c5 | 418 | (lookup-integer utf-subst-table-for-decode r4 r5) |
9ca2ac2d DL |
419 | (if r7 |
420 | ;; got a translation | |
421 | ((write-multibyte-character r4 r5) | |
422 | ;; Zapped through register starvation. | |
423 | (r5 = ,(charset-id 'eight-bit-control))) | |
424 | ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | |
425 | ((r3 = r6) | |
426 | (write-multibyte-character r3 r0) | |
427 | (if (r1 < #xa0) | |
428 | (r3 = r5)) | |
429 | (write-multibyte-character r3 r1) | |
430 | (if (r2 < #xa0) | |
431 | (r3 = r5) | |
432 | (r3 = r6)) | |
433 | (write-multibyte-character r3 r2)))) | |
434 | ||
435 | ;; Surrogates, U+D800 .. U+DFFF | |
9ca2ac2d DL |
436 | (if (r3 < #xe000) |
437 | ((r3 = r6) | |
438 | (write-multibyte-character r3 r0) ; eight-bit-graphic | |
439 | (if (r1 < #xa0) | |
440 | (r3 = r5)) | |
441 | (write-multibyte-character r3 r1) | |
442 | (if (r2 < #xa0) | |
443 | (r3 = r5) | |
444 | (r3 = r6)) | |
445 | (write-multibyte-character r3 r2)) | |
a1506d29 | 446 | |
9ca2ac2d DL |
447 | ;; mule-unicode-e000-ffff |
448 | ;; Fixme: fffe and ffff are invalid. | |
449 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
450 | (r3 -= #xe000) | |
451 | (r3 //= 96) | |
452 | (r1 = (r7 + 32)) | |
453 | (r1 += ((r3 + 32) << 7)) | |
454 | (write-multibyte-character r0 r1))))))))) | |
455 | ||
456 | (if (r0 < #xfe) | |
457 | ;; 4byte encoding | |
458 | ;; keep those bytes as eight-bit-{control|graphic} | |
ad88f5c5 | 459 | ;; Fixme: allow lookup in utf-subst-table-for-decode. |
67ff2216 KH |
460 | ((r1 = -1) |
461 | (r2 = -1) | |
462 | (r3 = -1) | |
463 | (read r1 r2 r3) | |
9ca2ac2d DL |
464 | ;; r0 > #xf0, thus eight-bit-graphic |
465 | (write-multibyte-character r6 r0) | |
466 | (if (r1 < #xa0) | |
467 | (if (r1 < #x80) ; invalid byte | |
468 | (write r1) | |
469 | (write-multibyte-character r5 r1)) | |
470 | (write-multibyte-character r6 r1)) | |
471 | (if (r2 < #xa0) | |
472 | (if (r2 < #x80) ; invalid byte | |
473 | (write r2) | |
474 | (write-multibyte-character r5 r2)) | |
475 | (write-multibyte-character r6 r2)) | |
476 | (if (r3 < #xa0) | |
477 | (if (r3 < #x80) ; invalid byte | |
478 | (write r3) | |
479 | (write-multibyte-character r5 r3)) | |
480 | (write-multibyte-character r6 r3)) | |
481 | (if (r0 >= #xf8) ; 5- or 6-byte encoding | |
76b7fded KH |
482 | ((r0 = -1) |
483 | (read r0) | |
484 | (if (r0 < #xa0) | |
485 | (if (r0 < #x80) ; invalid byte | |
486 | (write r0) | |
487 | (write-multibyte-character r5 r0)) | |
488 | (write-multibyte-character r6 r0)) | |
9ca2ac2d | 489 | (if (r0 >= #xfc) ; 6-byte |
76b7fded KH |
490 | ((r0 = -1) |
491 | (read r0) | |
492 | (if (r0 < #xa0) | |
493 | (if (r0 < #x80) ; invalid byte | |
494 | (write r0) | |
495 | (write-multibyte-character r5 r0)) | |
496 | (write-multibyte-character r6 r0))))))) | |
9ca2ac2d DL |
497 | ;; else invalid byte >= #xfe |
498 | (write-multibyte-character r6 r0)))))) | |
67ff2216 KH |
499 | (repeat))) |
500 | ||
501 | ;; At EOF... | |
502 | (if (r0 >= 0) | |
503 | ((if (r0 < #x80) | |
504 | (write r0) | |
505 | (if (r0 < #xa0) | |
506 | (write-multibyte-character r5 r0) | |
507 | ((write-multibyte-character r6 r0)))) | |
508 | (if (r1 >= 0) | |
509 | ((if (r1 < #x80) | |
510 | (write r1) | |
511 | (if (r1 < #xa0) | |
512 | (write-multibyte-character r5 r1) | |
513 | ((write-multibyte-character r6 r1)))) | |
514 | (if (r2 >= 0) | |
515 | ((if (r2 < #x80) | |
516 | (write r2) | |
517 | (if (r2 < #xa0) | |
518 | (write-multibyte-character r5 r2) | |
519 | ((write-multibyte-character r6 r2)))) | |
520 | (if (r3 >= 0) | |
521 | (if (r3 < #x80) | |
522 | (write r3) | |
523 | (if (r3 < #xa0) | |
524 | (write-multibyte-character r5 r3) | |
525 | ((write-multibyte-character r6 r3)))))))))))) | |
5ba7a870 | 526 | |
c49b8288 | 527 | "CCL program to decode UTF-8. |
74ace46a | 528 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
ad88f5c5 KH |
529 | mule-unicode-*, but see also `utf-fragmentation-table' and |
530 | `ucs-mule-cjk-to-unicode'. | |
9ca2ac2d DL |
531 | Encodings of un-representable Unicode characters are decoded asis into |
532 | eight-bit-control and eight-bit-graphic characters.") | |
5ba7a870 KH |
533 | |
534 | (define-ccl-program ccl-encode-mule-utf-8 | |
535 | `(1 | |
aa15b3e5 KH |
536 | ((r5 = -1) |
537 | (loop | |
538 | (if (r5 < 0) | |
539 | ((r1 = -1) | |
aa2e3f49 | 540 | (read-multibyte-character r0 r1) |
ad88f5c5 | 541 | (translate-character utf-translation-table-for-encode r0 r1)) |
aa15b3e5 KH |
542 | (;; We have already done read-multibyte-character. |
543 | (r0 = r5) | |
544 | (r1 = r6) | |
545 | (r5 = -1))) | |
546 | ||
547 | (if (r0 == ,(charset-id 'ascii)) | |
548 | (write r1) | |
549 | ||
550 | (if (r0 == ,(charset-id 'latin-iso8859-1)) | |
551 | ;; r1 scalar utf-8 | |
552 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
553 | ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 | |
554 | ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 | |
555 | ((r0 = (((r1 & #x40) >> 6) | #xc2)) | |
556 | (r1 &= #x3f) | |
557 | (r1 |= #x80) | |
558 | (write r0 r1)) | |
559 | ||
560 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | |
561 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
562 | ;; #x3f80 == (0011 1111 1000 0000)b | |
563 | (r1 &= #x7f) | |
564 | (r1 += (r0 + 224)) ; 240 == -32 + #x0100 | |
565 | ;; now r1 holds scalar value | |
566 | (if (r1 < #x0800) | |
567 | ;; 2byte encoding | |
568 | ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) | |
569 | ;; #x07c0 == (0000 0111 1100 0000)b | |
570 | (r1 &= #x3f) | |
571 | (r1 |= #x80) | |
572 | (write r0 r1)) | |
573 | ;; 3byte encoding | |
574 | ((r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
5ba7a870 KH |
575 | (r2 = ((r1 & #x3f) | #x80)) |
576 | (r1 &= #x0fc0) | |
577 | (r1 >>= 6) | |
578 | (r1 |= #x80) | |
aa15b3e5 KH |
579 | (write r0 r1 r2)))) |
580 | ||
581 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
582 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
583 | (r1 &= #x7f) | |
584 | (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | |
585 | (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
586 | (r2 = ((r1 & #x3f) | #x80)) | |
587 | (r1 &= #x0fc0) | |
588 | (r1 >>= 6) | |
589 | (r1 |= #x80) | |
590 | (write r0 r1 r2)) | |
591 | ||
592 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
593 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
594 | (r1 &= #x7f) | |
9ca2ac2d | 595 | (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
aa15b3e5 KH |
596 | (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
597 | (r2 = ((r1 & #x3f) | #x80)) | |
598 | (r1 &= #x0fc0) | |
599 | (r1 >>= 6) | |
600 | (r1 |= #x80) | |
601 | (write r0 r1 r2)) | |
602 | ||
603 | (if (r0 == ,(charset-id 'eight-bit-control)) | |
604 | ;; r1 scalar utf-8 | |
605 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
606 | ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | |
607 | ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | |
608 | ((write #xc2) | |
609 | (write r1)) | |
610 | ||
611 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
612 | ;; r1 scalar utf-8 | |
613 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
614 | ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | |
615 | ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | |
616 | ((write r1) | |
617 | (r1 = -1) | |
618 | (read-multibyte-character r0 r1) | |
619 | (if (r0 != ,(charset-id 'eight-bit-graphic)) | |
620 | (if (r0 != ,(charset-id 'eight-bit-control)) | |
621 | ((r5 = r0) | |
622 | (r6 = r1)))) | |
623 | (if (r5 < 0) | |
624 | ((read-multibyte-character r0 r2) | |
625 | (if (r0 != ,(charset-id 'eight-bit-graphic)) | |
626 | (if (r0 != ,(charset-id 'eight-bit-control)) | |
627 | ((r5 = r0) | |
628 | (r6 = r2)))) | |
629 | (if (r5 < 0) | |
630 | (write r1 r2) | |
631 | (if (r1 < #xa0) | |
632 | (write r1) | |
633 | ((write #xc2) | |
634 | (write r1))))))) | |
635 | ||
ad88f5c5 | 636 | ((lookup-character utf-subst-table-for-encode r0 r1) |
9ca2ac2d DL |
637 | (if r7 ; lookup succeeded |
638 | ((r1 = (((r0 & #xf000) >> 12) | #xe0)) | |
639 | (r2 = ((r0 & #x3f) | #x80)) | |
640 | (r0 &= #x0fc0) | |
641 | (r0 >>= 6) | |
642 | (r0 |= #x80) | |
643 | (write r1 r0 r2)) | |
644 | ;; Unsupported character. | |
645 | ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
646 | ((write #xef) | |
647 | (write #xbf) | |
648 | (write #xbd))))))))))) | |
aa15b3e5 KH |
649 | (repeat))) |
650 | (if (r1 >= #xa0) | |
651 | (write r1) | |
652 | (if (r1 >= #x80) | |
653 | ((write #xc2) | |
654 | (write r1))))) | |
5ba7a870 | 655 | |
9ca2ac2d | 656 | "CCL program to encode into UTF-8.") |
5ba7a870 | 657 | |
aa2e3f49 | 658 | |
9ca2ac2d DL |
659 | (define-ccl-program ccl-untranslated-to-ucs |
660 | `(0 | |
661 | (if (r0 < #xf0) ; 3-byte encoding, as above | |
662 | ((r4 = 0) | |
663 | (r3 = (r1 & #b11000000)) | |
664 | (r3 |= ((r2 >> 2) & #b00110000)) | |
665 | (if (r3 != #b10100000) | |
666 | (r4 = 1) | |
667 | ((r3 = ((r0 & #x0f) << 12)) | |
668 | (r3 += ((r1 & #x3f) << 6)) | |
669 | (r3 += (r2 & #x3f)) | |
670 | (if (r3 < #x0800) | |
671 | (r4 = 1)))) | |
672 | (if (r4 != 0) | |
673 | (r0 = 0) | |
674 | (r0 = r3))) | |
675 | (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe) | |
676 | ((r4 = (r1 >> 6)) | |
677 | (if (r4 != #b10) | |
678 | (r0 = 0) | |
679 | ((r4 = (r2 >> 6)) | |
680 | (if (r4 != #b10) | |
681 | (r0 = 0) | |
682 | ((r4 = (r3 >> 6)) | |
683 | (if (r4 != #b10) | |
684 | (r0 = 0) | |
685 | ((r1 = ((r1 & #x3F) << 12)) | |
686 | (r2 = ((r2 & #x3F) << 6)) | |
687 | (r3 &= #x3F) | |
688 | (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3))))))))) | |
689 | (r0 = 0)))) | |
690 | "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. | |
691 | r0 == 0 for invalid sequence.") | |
692 | ||
693 | (defvar utf-8-ccl-regs (make-vector 8 0)) | |
694 | ||
aa2e3f49 | 695 | (defsubst utf-8-untranslated-to-ucs () |
9ca2ac2d DL |
696 | "Return the UCS code for an untranslated sequence of raw bytes t point. |
697 | Only for 3- or 4-byte sequences." | |
698 | (aset utf-8-ccl-regs 0 (or (char-after) 0)) | |
699 | (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) | |
700 | (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) | |
701 | (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) | |
702 | (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs) | |
703 | (aref utf-8-ccl-regs 0)) | |
aa2e3f49 DL |
704 | |
705 | (defun utf-8-help-echo (window object position) | |
706 | (format "Untranslated Unicode U+%04X" | |
707 | (get-char-property position 'untranslated-utf-8 object))) | |
708 | ||
aa2e3f49 DL |
709 | ;; We compose the untranslatable sequences into a single character. |
710 | ;; This is infelicitous for editing, because there's currently no | |
711 | ;; mechanism for treating compositions as atomic, but is OK for | |
9ca2ac2d DL |
712 | ;; display. They are composed to U+FFFD with help-echo which |
713 | ;; indicates the unicodes they represent. This function GCs too much. | |
aa2e3f49 DL |
714 | (defsubst utf-8-compose () |
715 | "Put a suitable composition on an untranslatable sequence. | |
716 | Return the sequence's length." | |
717 | (let* ((u (utf-8-untranslated-to-ucs)) | |
9ca2ac2d DL |
718 | (l (unless (zerop u) |
719 | (if (>= u #x10000) | |
aa2e3f49 | 720 | 4 |
9ca2ac2d DL |
721 | 3)))) |
722 | (when l | |
aa2e3f49 DL |
723 | (put-text-property (point) (min (point-max) (+ l (point))) |
724 | 'untranslated-utf-8 u) | |
9ca2ac2d DL |
725 | (put-text-property (point) (min (point-max) (+ l (point))) |
726 | 'help-echo 'utf-8-help-echo) | |
727 | (compose-region (point) (+ l (point)) ?\e$,3u=\e(B) | |
aa2e3f49 DL |
728 | l))) |
729 | ||
730 | (defcustom utf-8-compose-scripts nil | |
9ca2ac2d | 731 | "*Non-nil means compose various scripts on decoding utf-8 text." |
aa2e3f49 | 732 | :group 'mule |
9ca2ac2d DL |
733 | :version "21.4" |
734 | :type 'boolean) | |
aa2e3f49 DL |
735 | |
736 | (defun utf-8-post-read-conversion (length) | |
737 | "Compose untranslated utf-8 sequences into single characters. | |
738 | Also compose particular scripts if `utf-8-compose-scripts' is non-nil." | |
739 | (save-excursion | |
740 | ;; Can't do eval-when-compile to insert a multibyte constant | |
741 | ;; version of the string in the loop, since it's always loaded as | |
742 | ;; unibyte from a byte-compiled file. | |
9ca2ac2d DL |
743 | (let ((range (string-as-multibyte "^\xe1-\xf7"))) |
744 | (while (and (skip-chars-forward range) | |
aa2e3f49 DL |
745 | (not (eobp))) |
746 | (forward-char (utf-8-compose))))) | |
9ca2ac2d DL |
747 | ;; Fixme: Takahashi-san implies it may not work this easily. I |
748 | ;; asked why but didn't get a reply. -- fx | |
aa2e3f49 DL |
749 | (when (and utf-8-compose-scripts (> length 1)) |
750 | ;; These currently have definitions which cover the relevant | |
9ca2ac2d | 751 | ;; unicodes. We could avoid loading thai-util &c by checking |
aa2e3f49 DL |
752 | ;; whether the region contains any characters with the appropriate |
753 | ;; categories. There aren't yet Unicode-based rules for Tibetan. | |
754 | (save-excursion (setq length (diacritic-post-read-conversion length))) | |
755 | (save-excursion (setq length (thai-post-read-conversion length))) | |
756 | (save-excursion (setq length (lao-post-read-conversion length))) | |
9ca2ac2d DL |
757 | (save-excursion |
758 | (setq length (in-is13194-devanagari-post-read-conversion length)))) | |
aa2e3f49 DL |
759 | length) |
760 | ||
9ca2ac2d DL |
761 | ;; ucs-tables is preloaded |
762 | ;; (defun utf-8-pre-write-conversion (beg end) | |
763 | ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." | |
ad88f5c5 | 764 | ;; ;; Ensure translation-table is loaded. |
9ca2ac2d DL |
765 | ;; (require 'ucs-tables) |
766 | ;; ;; Don't do this again. | |
767 | ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) | |
768 | ;; nil) | |
aa2e3f49 | 769 | |
5ba7a870 KH |
770 | (make-coding-system |
771 | 'mule-utf-8 4 ?u | |
772 | "UTF-8 encoding for Emacs-supported Unicode characters. | |
ad88f5c5 KH |
773 | It supports Unicode characters of these ranges: |
774 | U+0000..U+33FF, U+E000..U+FFFF. | |
775 | They correspond to these Emacs character sets: | |
776 | ascii, latin-iso8859-1, mule-unicode-0100-24ff, | |
777 | mule-unicode-2500-33ff, mule-unicode-e000-ffff | |
778 | ||
779 | On decoding (e.g. reading a file), Unicode characters not in the above | |
780 | ranges are decoded into sequences of eight-bit-control and | |
781 | eight-bit-graphic characters to preserve their byte sequences. The | |
782 | byte sequence is preserved on i/o for valid utf-8, but not necessarily | |
783 | for invalid utf-8. | |
784 | ||
785 | On encoding (e.g. writing a file), Emacs characters not belonging to | |
786 | any of the character sets listed above are encoded into the UTF-8 byte | |
787 | sequence representing U+FFFD (REPLACEMENT CHARACTER)." | |
5ba7a870 KH |
788 | |
789 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
790 | '((safe-charsets | |
791 | ascii | |
792 | eight-bit-control | |
793 | eight-bit-graphic | |
794 | latin-iso8859-1 | |
795 | mule-unicode-0100-24ff | |
796 | mule-unicode-2500-33ff | |
797 | mule-unicode-e000-ffff) | |
87ae7973 | 798 | (mime-charset . utf-8) |
75f6d723 | 799 | (coding-category . coding-category-utf-8) |
aa2e3f49 | 800 | (valid-codes (0 . 255)) |
9ca2ac2d | 801 | ;; (pre-write-conversion . utf-8-pre-write-conversion) |
ad88f5c5 KH |
802 | (post-read-conversion . utf-8-post-read-conversion) |
803 | (dependency unify-8859-on-encoding-mode | |
804 | unify-8859-on-decoding-mode | |
805 | utf-fragment-on-decoding | |
806 | utf-translate-cjk))) | |
5ba7a870 KH |
807 | |
808 | (define-coding-system-alias 'utf-8 'mule-utf-8) | |
e8af40ee | 809 | |
aa2e3f49 DL |
810 | ;; I think this needs special private charsets defined for the |
811 | ;; untranslated sequences, if it's going to work well. | |
812 | ||
813 | ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
814 | ;;; (let* ((prop (get-char-property pos 'composition string)) | |
815 | ;;; (l (and prop (- (cadr prop) (car prop))))) | |
816 | ;;; (cond ((and l (> l (- to pos))) | |
817 | ;;; (delete-region pos to)) | |
818 | ;;; ((and (> (char-after pos) 224) | |
819 | ;;; (< (char-after pos) 256) | |
820 | ;;; (save-restriction | |
821 | ;;; (narrow-to-region pos to) | |
822 | ;;; (utf-8-compose))) | |
823 | ;;; t)))) | |
824 | ||
825 | ;;; (dotimes (i 96) | |
826 | ;;; (aset composition-function-table | |
827 | ;;; (+ 128 i) | |
828 | ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
829 | ;;; . utf-8-compose-function)))) | |
830 | ||
e8af40ee | 831 | ;;; utf-8.el ends here |