Commit | Line | Data |
---|---|---|
e8af40ee | 1 | ;;; utf-8.el --- limited UTF-8 decoding/encoding support |
5ba7a870 KH |
2 | |
3 | ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
4 | ;; Licensed to the Free Software Foundation. | |
3d0e328b | 5 | ;; Copyright (C) 2001 Free Software Foundation, Inc. |
5ba7a870 | 6 | |
aa15b3e5 | 7 | ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
c49b8288 | 8 | ;; Keywords: multilingual, Unicode, UTF-8, i18n |
5ba7a870 KH |
9 | |
10 | ;; This file is part of GNU Emacs. | |
11 | ||
12 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
13 | ;; it under the terms of the GNU General Public License as published by | |
14 | ;; the Free Software Foundation; either version 2, or (at your option) | |
15 | ;; any later version. | |
16 | ||
17 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
18 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ;; GNU General Public License for more details. | |
21 | ||
22 | ;; You should have received a copy of the GNU General Public License | |
23 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
24 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
25 | ;; Boston, MA 02111-1307, USA. | |
26 | ||
27 | ;;; Commentary: | |
28 | ||
29 | ;; The coding-system `mule-utf-8' supports encoding/decoding of the | |
c49b8288 | 30 | ;; following character sets to and from UTF-8: |
5ba7a870 KH |
31 | ;; |
32 | ;; ascii | |
33 | ;; eight-bit-control | |
34 | ;; latin-iso8859-1 | |
35 | ;; mule-unicode-0100-24ff | |
36 | ;; mule-unicode-2500-33ff | |
37 | ;; mule-unicode-e000-ffff | |
38 | ;; | |
39 | ;; Characters of other character sets cannot be encoded with | |
c49b8288 DL |
40 | ;; mule-utf-8. Note that the mule-unicode charsets currently lack |
41 | ;; case and syntax information, so things like `downcase' will only | |
42 | ;; work for characters from ASCII and Latin-1. | |
5ba7a870 | 43 | ;; |
c49b8288 DL |
44 | ;; On decoding, Unicode characters that do not fit into the above |
45 | ;; character sets are handled as `eight-bit-control' or | |
46 | ;; `eight-bit-graphic' characters to retain the information about the | |
47 | ;; original byte sequence. | |
48 | ||
49 | ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | |
5ba7a870 KH |
50 | |
51 | ;; scalar | utf-8 | |
52 | ;; value | 1st byte | 2nd byte | 3rd byte | |
53 | ;; --------------------+-----------+-----------+---------- | |
54 | ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
55 | ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
56 | ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
57 | ||
58 | ;;; Code: | |
59 | ||
60 | (define-ccl-program ccl-decode-mule-utf-8 | |
61 | ;; Per-charset byte counts of the UTF-8 input and of the Emacs representation: | |
62 | ;; charset | bytes in utf-8 | bytes in emacs | |
63 | ;; -----------------------+----------------+--------------- | |
64 | ;; ascii | 1 | 1 | |
65 | ;; -----------------------+----------------+--------------- | |
66 | ;; eight-bit-control | 2 | 2 | |
67 | ;; latin-iso8859-1 | 2 | 2 | |
68 | ;; -----------------------+----------------+--------------- | |
69 | ;; mule-unicode-0100-24ff | 2 | 4 | |
70 | ;; (< 0800) | | | |
71 | ;; -----------------------+----------------+--------------- | |
72 | ;; mule-unicode-0100-24ff | 3 | 4 | |
73 | ;; (>= 0800) | | | |
74 | ;; mule-unicode-2500-33ff | 3 | 4 | |
75 | ;; mule-unicode-e000-ffff | 3 | 4 | |
76 | ;; | |
77 | ;; Thus magnification factor is two. | |
78 | ;; r4 flags an invalid 3-byte sequence; r5/r6 hold the eight-bit-control/eight-bit-graphic charset ids; r7 is read after each `//=' (presumably the remainder -- confirm in the CCL docs). | |
79 | `(2 | |
3d0e328b GM |
80 | ((r5 = ,(charset-id 'eight-bit-control)) |
81 | (r6 = ,(charset-id 'eight-bit-graphic)) | |
82 | (loop | |
5ba7a870 KH |
83 | (read r0) |
84 | ||
85 | ;; 1byte encoding, i.e., ascii | |
86 | (if (r0 < #x80) | |
87 | (write r0) | |
88 | ||
3d0e328b | 89 | ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
5ba7a870 KH |
90 | (if (r0 < #xe0) |
91 | ((read r1) | |
3d0e328b GM |
92 | |
93 | (if ((r1 & #b11000000) != #b10000000) | |
94 | ;; Invalid 2-byte sequence | |
95 | ((if (r0 < #xa0) | |
96 | (write-multibyte-character r5 r0) | |
97 | (write-multibyte-character r6 r0)) | |
98 | (if (r1 < #x80) | |
99 | (write r1) | |
100 | (if (r1 < #xa0) | |
101 | (write-multibyte-character r5 r1) | |
102 | (write-multibyte-character r6 r1)))) | |
103 | ||
104 | ((r0 &= #x1f) | |
105 | (r0 <<= 6) | |
106 | (r1 &= #x3f) | |
107 | (r1 += r0) | |
108 | ;; Now r1 holds scalar value | |
109 | ||
110 | ;; eight-bit-control | |
111 | (if (r1 < 160) | |
112 | ((write-multibyte-character r5 r1)) | |
113 | ||
114 | ;; latin-iso8859-1 | |
115 | (if (r1 < 256) | |
116 | ((r0 = ,(charset-id 'latin-iso8859-1)) | |
117 | (r1 -= 128) | |
118 | (write-multibyte-character r0 r1)) | |
119 | ||
120 | ;; mule-unicode-0100-24ff (< 0800) | |
121 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
122 | (r1 -= #x0100) | |
123 | (r2 = (((r1 / 96) + 32) << 7)) | |
124 | (r1 %= 96) | |
125 | (r1 += (r2 + 32)) | |
126 | (write-multibyte-character r0 r1))))))) | |
5ba7a870 KH |
127 | |
128 | ;; 3byte encoding | |
3d0e328b | 129 | ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx |
5ba7a870 KH |
130 | (if (r0 < #xf0) |
131 | ((read r1 r2) | |
3d0e328b GM |
132 | |
133 | ;; This is set to 1 if the encoding is invalid. | |
134 | (r4 = 0) | |
135 | ||
136 | (r3 = (r1 & #b11000000)) | |
137 | (r3 |= ((r2 >> 2) & #b00110000)) | |
138 | (if (r3 != #b10100000) | |
139 | (r4 = 1) | |
140 | ((r3 = ((r0 & #x0f) << 12)) | |
141 | (r3 += ((r1 & #x3f) << 6)) | |
142 | (r3 += (r2 & #x3f)) | |
143 | (if (r3 < #x0800) | |
144 | (r4 = 1)))) | |
145 | ||
146 | (if (r4 != 0) | |
147 | ;; Invalid 3-byte sequence | |
148 | ((if (r0 < #xa0) | |
149 | (write-multibyte-character r5 r0) | |
150 | (write-multibyte-character r6 r0)) | |
151 | (if (r1 < #x80) | |
152 | (write r1) | |
153 | (if (r1 < #xa0) | |
154 | (write-multibyte-character r5 r1) | |
155 | (write-multibyte-character r6 r1))) | |
156 | (if (r2 < #x80) | |
157 | (write r2) | |
158 | (if (r2 < #xa0) | |
159 | (write-multibyte-character r5 r2) | |
160 | (write-multibyte-character r6 r2)))) | |
161 | ||
162 | ;; mule-unicode-0100-24ff (>= 0800) | |
163 | ((if (r3 < #x2500) | |
164 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
165 | (r3 -= #x0100) | |
166 | (r3 //= 96) | |
167 | (r1 = (r7 + 32)) | |
168 | (r1 += ((r3 + 32) << 7)) | |
169 | (write-multibyte-character r0 r1)) | |
170 | ||
171 | ;; mule-unicode-2500-33ff | |
172 | (if (r3 < #x3400) | |
173 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
174 | (r3 -= #x2500) | |
175 | (r3 //= 96) | |
176 | (r1 = (r7 + 32)) | |
177 | (r1 += ((r3 + 32) << 7)) | |
178 | (write-multibyte-character r0 r1)) | |
179 | ||
180 | ;; U+3400 .. U+DFFF | |
181 | ;; keep those bytes as eight-bit-{control|graphic} | |
182 | (if (r3 < #xe000) | |
183 | ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | |
184 | (r3 = r6) | |
185 | (write-multibyte-character r3 r0) | |
186 | (if (r1 < #xa0) | |
187 | (r3 = r5)) | |
188 | (write-multibyte-character r3 r1) | |
189 | (if (r2 < #xa0) | |
190 | (r3 = r5) | |
191 | (r3 = r6)) | |
192 | (write-multibyte-character r3 r2)) | |
193 | ||
194 | ;; mule-unicode-e000-ffff | |
195 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
196 | (r3 -= #xe000) | |
197 | (r3 //= 96) | |
198 | (r1 = (r7 + 32)) | |
199 | (r1 += ((r3 + 32) << 7)) | |
200 | (write-multibyte-character r0 r1)))))))) | |
5ba7a870 KH |
201 | |
202 | ;; 4byte encoding | |
203 | ;; keep those bytes as eight-bit-{control|graphic} | |
204 | ((read r1 r2 r3) | |
205 | ;; r0 >= #xf0, thus eight-bit-graphic | |
3d0e328b | 206 | (write-multibyte-character r6 r0) |
5ba7a870 | 207 | (if (r1 < #xa0) |
3d0e328b GM |
208 | (write-multibyte-character r5 r1) |
209 | (write-multibyte-character r6 r1)) | |
5ba7a870 | 210 | (if (r2 < #xa0) |
3d0e328b GM |
211 | (write-multibyte-character r5 r2) |
212 | (write-multibyte-character r6 r2)) | |
5ba7a870 | 213 | (if (r3 < #xa0) |
3d0e328b GM |
214 | (write-multibyte-character r5 r3) |
215 | (write-multibyte-character r6 r3)))))) | |
5ba7a870 KH |
216 | |
217 | (repeat)))) | |
218 | ||
c49b8288 | 219 | "CCL program to decode UTF-8. |
74ace46a DL |
220 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
221 | mule-unicode-*. Encodings of un-representable Unicode characters are | |
222 | decoded asis into eight-bit-control and eight-bit-graphic | |
223 | characters.") | |
5ba7a870 KH |
224 | |
225 | (define-ccl-program ccl-encode-mule-utf-8 | |
226 | `(1 | |
aa15b3e5 KH |
227 | ((r5 = -1) |
228 | (loop | |
229 | (if (r5 < 0) | |
230 | ((r1 = -1) | |
231 | (read-multibyte-character r0 r1)) | |
232 | (;; We have already done read-multibyte-character; r5/r6 hold the pending charset id and code. | |
233 | (r0 = r5) | |
234 | (r1 = r6) | |
235 | (r5 = -1))) | |
236 | ||
237 | (if (r0 == ,(charset-id 'ascii)) | |
238 | (write r1) | |
239 | ||
240 | (if (r0 == ,(charset-id 'latin-iso8859-1)) | |
241 | ;; r1 scalar utf-8 | |
242 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
243 | ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 | |
244 | ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 | |
245 | ((r0 = (((r1 & #x40) >> 6) | #xc2)) | |
246 | (r1 &= #x3f) | |
247 | (r1 |= #x80) | |
248 | (write r0 r1)) | |
249 | ||
250 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | |
251 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
252 | ;; #x3f80 == (0011 1111 1000 0000)b | |
253 | (r1 &= #x7f) | |
254 | (r1 += (r0 + 224)) ; 224 == -32 + #x0100 | |
255 | ;; now r1 holds scalar value | |
256 | (if (r1 < #x0800) | |
257 | ;; 2byte encoding | |
258 | ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) | |
259 | ;; #x07c0 == (0000 0111 1100 0000)b | |
260 | (r1 &= #x3f) | |
261 | (r1 |= #x80) | |
262 | (write r0 r1)) | |
263 | ;; 3byte encoding | |
264 | ((r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
5ba7a870 KH |
265 | (r2 = ((r1 & #x3f) | #x80)) |
266 | (r1 &= #x0fc0) | |
267 | (r1 >>= 6) | |
268 | (r1 |= #x80) | |
aa15b3e5 KH |
269 | (write r0 r1 r2)))) |
270 | ||
271 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
272 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
273 | (r1 &= #x7f) | |
274 | (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | |
275 | (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
276 | (r2 = ((r1 & #x3f) | #x80)) | |
277 | (r1 &= #x0fc0) | |
278 | (r1 >>= 6) | |
279 | (r1 |= #x80) | |
280 | (write r0 r1 r2)) | |
281 | ||
282 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
283 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
284 | (r1 &= #x7f) | |
285 | (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 | |
286 | (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
287 | (r2 = ((r1 & #x3f) | #x80)) | |
288 | (r1 &= #x0fc0) | |
289 | (r1 >>= 6) | |
290 | (r1 |= #x80) | |
291 | (write r0 r1 r2)) | |
292 | ||
293 | (if (r0 == ,(charset-id 'eight-bit-control)) | |
294 | ;; r1 scalar utf-8 | |
295 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
296 | ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | |
297 | ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | |
298 | ((write #xc2) | |
299 | (write r1)) | |
300 | ||
301 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
302 | ;; r1 scalar utf-8 | |
303 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
304 | ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | |
305 | ;; ff 0000 0000 1111 1111 1100 0011 1011 1111 | |
306 | ((write r1) | |
307 | (r1 = -1) | |
308 | (read-multibyte-character r0 r1) | |
309 | (if (r0 != ,(charset-id 'eight-bit-graphic)) | |
310 | (if (r0 != ,(charset-id 'eight-bit-control)) | |
311 | ((r5 = r0) | |
312 | (r6 = r1)))) | |
313 | (if (r5 < 0) | |
314 | ((read-multibyte-character r0 r2) | |
315 | (if (r0 != ,(charset-id 'eight-bit-graphic)) | |
316 | (if (r0 != ,(charset-id 'eight-bit-control)) | |
317 | ((r5 = r0) | |
318 | (r6 = r2)))) | |
319 | (if (r5 < 0) | |
320 | (write r1 r2) | |
321 | (if (r1 < #xa0) | |
322 | (write r1) | |
323 | ((write #xc2) | |
324 | (write r1))))))) | |
325 | ||
326 | ;; Unsupported character. | |
327 | ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
328 | ((write #xef) | |
329 | (write #xbf) | |
330 | (write #xbd))))))))) | |
331 | (repeat))) | |
332 | (if (r1 >= #xa0) | |
333 | (write r1) | |
334 | (if (r1 >= #x80) | |
335 | ((write #xc2) | |
336 | (write r1))))) | |
5ba7a870 | 337 | |
c49b8288 DL |
338 | "CCL program to encode into UTF-8. |
339 | Only characters from the charsets ascii, eight-bit-control, | |
74ace46a DL |
340 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. |
341 | Others are encoded as U+FFFD.") | |
5ba7a870 KH |
342 | |
343 | (make-coding-system | |
344 | 'mule-utf-8 4 ?u | |
345 | "UTF-8 encoding for Emacs-supported Unicode characters. | |
c49b8288 | 346 | The supported Emacs character sets are: |
5ba7a870 KH |
347 | ascii |
348 | eight-bit-control | |
349 | eight-bit-graphic | |
350 | latin-iso8859-1 | |
351 | mule-unicode-0100-24ff | |
352 | mule-unicode-2500-33ff | |
353 | mule-unicode-e000-ffff | |
354 | ||
c49b8288 DL |
355 | Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF |
356 | are decoded into sequences of eight-bit-control and eight-bit-graphic | |
357 | characters to preserve their byte sequences. Emacs characters out of | |
358 | these ranges are encoded into U+FFFD. | |
359 | ||
360 | Note that, currently, characters in the mule-unicode charsets have no | |
361 | syntax and case information. Thus, for instance, upper- and | |
362 | lower-casing commands won't work with them." | |
5ba7a870 KH |
363 | ;; Decoder/encoder CCL program pair, followed by the coding-system property list. |
364 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
365 | '((safe-charsets | |
366 | ascii | |
367 | eight-bit-control | |
368 | eight-bit-graphic | |
369 | latin-iso8859-1 | |
370 | mule-unicode-0100-24ff | |
371 | mule-unicode-2500-33ff | |
372 | mule-unicode-e000-ffff) | |
87ae7973 | 373 | (mime-charset . utf-8) |
75f6d723 | 374 | (coding-category . coding-category-utf-8) |
87ae7973 | 375 | (valid-codes (0 . 255)))) |
5ba7a870 KH |
376 | ;; Let the conventional name `utf-8' refer to the `mule-utf-8' coding system. |
377 | (define-coding-system-alias 'utf-8 'mule-utf-8) | |
e8af40ee PJ |
378 | |
379 | ;;; utf-8.el ends here |