Some fixes to follow coding conventions.
[bpt/emacs.git] / lisp / international / utf-8.el
CommitLineData
e8af40ee 1;;; utf-8.el --- limited UTF-8 decoding/encoding support
5ba7a870
KH
2
3;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4;; Licensed to the Free Software Foundation.
3d0e328b 5;; Copyright (C) 2001 Free Software Foundation, Inc.
5ba7a870 6
aa15b3e5 7;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
c49b8288 8;; Keywords: multilingual, Unicode, UTF-8, i18n
5ba7a870
KH
9
10;; This file is part of GNU Emacs.
11
12;; GNU Emacs is free software; you can redistribute it and/or modify
13;; it under the terms of the GNU General Public License as published by
14;; the Free Software Foundation; either version 2, or (at your option)
15;; any later version.
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
23;; along with GNU Emacs; see the file COPYING. If not, write to the
24;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25;; Boston, MA 02111-1307, USA.
26
27;;; Commentary:
28
29;; The coding-system `mule-utf-8' supports encoding/decoding of the
c49b8288 30;; following character sets to and from UTF-8:
5ba7a870
KH
31;;
32;; ascii
33;; eight-bit-control
34;; latin-iso8859-1
35;; mule-unicode-0100-24ff
36;; mule-unicode-2500-33ff
37;; mule-unicode-e000-ffff
38;;
39;; Characters of other character sets cannot be encoded with
c49b8288
DL
40;; mule-utf-8. Note that the mule-unicode charsets currently lack
41;; case and syntax information, so things like `downcase' will only
42;; work for characters from ASCII and Latin-1.
5ba7a870 43;;
c49b8288
DL
44;; On decoding, Unicode characters that do not fit into the above
45;; character sets are handled as `eight-bit-control' or
46;; `eight-bit-graphic' characters to retain the information about the
47;; original byte sequence.
48
49;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
5ba7a870
KH
50
51;; scalar | utf-8
52;; value | 1st byte | 2nd byte | 3rd byte
53;; --------------------+-----------+-----------+----------
54;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
55;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
56;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
57
58;;; Code:
59
60(define-ccl-program ccl-decode-mule-utf-8
61 ;; Byte counts per charset before (UTF-8) and after (Emacs) decoding:
62 ;; charset | bytes in utf-8 | bytes in emacs
63 ;; -----------------------+----------------+---------------
64 ;; ascii | 1 | 1
65 ;; -----------------------+----------------+---------------
66 ;; eight-bit-control | 2 | 2
67 ;; latin-iso8859-1 | 2 | 2
68 ;; -----------------------+----------------+---------------
69 ;; mule-unicode-0100-24ff | 2 | 4
70 ;; (< 0800) | |
71 ;; -----------------------+----------------+---------------
72 ;; mule-unicode-0100-24ff | 3 | 4
73 ;; (>= 0800) | |
74 ;; mule-unicode-2500-33ff | 3 | 4
75 ;; mule-unicode-e000-ffff | 3 | 4
76 ;;
77 ;; Thus magnification factor is two.
78 ;;
79 `(2
3d0e328b
GM
80 ((r5 = ,(charset-id 'eight-bit-control)) ; r5: charset used for raw bytes < #xa0
81 (r6 = ,(charset-id 'eight-bit-graphic)) ; r6: charset used for raw bytes >= #xa0
82 (loop
5ba7a870
KH
83 (read r0)
84
85 ;; 1byte encoding, i.e., ascii
86 (if (r0 < #x80)
87 (write r0)
88
3d0e328b 89 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
5ba7a870
KH
90 (if (r0 < #xe0)
91 ((read r1)
3d0e328b
GM
92
93 (if ((r1 & #b11000000) != #b10000000)
94 ;; Invalid 2-byte sequence
95 ((if (r0 < #xa0)
96 (write-multibyte-character r5 r0)
97 (write-multibyte-character r6 r0))
98 (if (r1 < #x80)
99 (write r1)
100 (if (r1 < #xa0)
101 (write-multibyte-character r5 r1)
102 (write-multibyte-character r6 r1))))
103
104 ((r0 &= #x1f)
105 (r0 <<= 6)
106 (r1 &= #x3f)
107 (r1 += r0)
108 ;; Now r1 holds scalar value
109
110 ;; eight-bit-control
111 (if (r1 < 160) ; NOTE(review): also taken for overlong forms (scalar < 128); confirm intended
112 ((write-multibyte-character r5 r1))
113
114 ;; latin-iso8859-1
115 (if (r1 < 256)
116 ((r0 = ,(charset-id 'latin-iso8859-1))
117 (r1 -= 128)
118 (write-multibyte-character r0 r1))
119
120 ;; mule-unicode-0100-24ff (< 0800)
121 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
122 (r1 -= #x0100)
123 (r2 = (((r1 / 96) + 32) << 7))
124 (r1 %= 96)
125 (r1 += (r2 + 32)) ; r1 = ((quotient + 32) << 7) + (remainder + 32)
126 (write-multibyte-character r0 r1)))))))
5ba7a870
KH
127
128 ;; 3byte encoding
3d0e328b 129 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
5ba7a870
KH
130 (if (r0 < #xf0)
131 ((read r1 r2)
3d0e328b
GM
132
133 ;; This is set to 1 if the encoding is invalid.
134 (r4 = 0)
135
136 (r3 = (r1 & #b11000000))
137 (r3 |= ((r2 >> 2) & #b00110000))
138 (if (r3 != #b10100000) ; both r1 and r2 must have the form 10xxxxxx
139 (r4 = 1)
140 ((r3 = ((r0 & #x0f) << 12))
141 (r3 += ((r1 & #x3f) << 6))
142 (r3 += (r2 & #x3f))
143 (if (r3 < #x0800) ; scalar value would have fit in fewer bytes
144 (r4 = 1))))
145
146 (if (r4 != 0)
147 ;; Invalid 3-byte sequence
148 ((if (r0 < #xa0)
149 (write-multibyte-character r5 r0)
150 (write-multibyte-character r6 r0))
151 (if (r1 < #x80)
152 (write r1)
153 (if (r1 < #xa0)
154 (write-multibyte-character r5 r1)
155 (write-multibyte-character r6 r1)))
156 (if (r2 < #x80)
157 (write r2)
158 (if (r2 < #xa0)
159 (write-multibyte-character r5 r2)
160 (write-multibyte-character r6 r2))))
161
162 ;; mule-unicode-0100-24ff (>= 0800)
163 ((if (r3 < #x2500)
164 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
165 (r3 -= #x0100)
166 (r3 //= 96) ; quotient stays in r3, remainder goes to r7
167 (r1 = (r7 + 32))
168 (r1 += ((r3 + 32) << 7))
169 (write-multibyte-character r0 r1))
170
171 ;; mule-unicode-2500-33ff
172 (if (r3 < #x3400)
173 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
174 (r3 -= #x2500)
175 (r3 //= 96) ; quotient stays in r3, remainder goes to r7
176 (r1 = (r7 + 32))
177 (r1 += ((r3 + 32) << 7))
178 (write-multibyte-character r0 r1))
179
180 ;; U+3400 .. U+DFFF
181 ;; keep those bytes as eight-bit-{control|graphic}
182 (if (r3 < #xe000)
183 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
184 (r3 = r6)
185 (write-multibyte-character r3 r0)
186 (if (r1 < #xa0)
187 (r3 = r5))
188 (write-multibyte-character r3 r1)
189 (if (r2 < #xa0)
190 (r3 = r5)
191 (r3 = r6))
192 (write-multibyte-character r3 r2))
193
194 ;; mule-unicode-e000-ffff
195 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
196 (r3 -= #xe000)
197 (r3 //= 96) ; quotient stays in r3, remainder goes to r7
198 (r1 = (r7 + 32))
199 (r1 += ((r3 + 32) << 7))
200 (write-multibyte-character r0 r1))))))))
5ba7a870
KH
201
202 ;; 4byte encoding
203 ;; keep those bytes as eight-bit-{control|graphic}
204 ((read r1 r2 r3)
205 ;; r0 >= #xf0, thus eight-bit-graphic
3d0e328b 206 (write-multibyte-character r6 r0)
5ba7a870 207 (if (r1 < #xa0)
3d0e328b
GM
208 (write-multibyte-character r5 r1)
209 (write-multibyte-character r6 r1))
5ba7a870 210 (if (r2 < #xa0)
3d0e328b
GM
211 (write-multibyte-character r5 r2)
212 (write-multibyte-character r6 r2))
5ba7a870 213 (if (r3 < #xa0)
3d0e328b
GM
214 (write-multibyte-character r5 r3)
215 (write-multibyte-character r6 r3))))))
5ba7a870
KH
216
217 (repeat))))
218
c49b8288 219 "CCL program to decode UTF-8.
74ace46a
DL
220Basic decoding is done into the charsets ascii, latin-iso8859-1 and
221mule-unicode-*. Encodings of un-representable Unicode characters are
222decoded asis into eight-bit-control and eight-bit-graphic
223characters.")
5ba7a870
KH
224
225(define-ccl-program ccl-encode-mule-utf-8
226 `(1
aa15b3e5
KH
227 ((r5 = -1) ; r5 < 0: no looked-ahead character is pending in r5/r6
228 (loop
229 (if (r5 < 0)
230 ((r1 = -1)
231 (read-multibyte-character r0 r1))
232 (;; We have already done read-multibyte-character.
233 (r0 = r5)
234 (r1 = r6)
235 (r5 = -1)))
236
237 (if (r0 == ,(charset-id 'ascii))
238 (write r1)
239
240 (if (r0 == ,(charset-id 'latin-iso8859-1))
241 ;; r1 scalar utf-8
242 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
243 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
244 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
245 ((r0 = (((r1 & #x40) >> 6) | #xc2))
246 (r1 &= #x3f)
247 (r1 |= #x80)
248 (write r0 r1))
249
250 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
251 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
252 ;; #x3f80 == (0011 1111 1000 0000)b
253 (r1 &= #x7f)
254 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
255 ;; now r1 holds scalar value
256 (if (r1 < #x0800)
257 ;; 2byte encoding
258 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
259 ;; #x07c0 == (0000 0111 1100 0000)b
260 (r1 &= #x3f)
261 (r1 |= #x80)
262 (write r0 r1))
263 ;; 3byte encoding
264 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
5ba7a870
KH
265 (r2 = ((r1 & #x3f) | #x80))
266 (r1 &= #x0fc0)
267 (r1 >>= 6)
268 (r1 |= #x80)
aa15b3e5
KH
269 (write r0 r1 r2))))
270
271 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
272 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
273 (r1 &= #x7f)
274 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
275 (r0 = (((r1 & #xf000) >> 12) | #xe0))
276 (r2 = ((r1 & #x3f) | #x80))
277 (r1 &= #x0fc0)
278 (r1 >>= 6)
279 (r1 |= #x80)
280 (write r0 r1 r2))
281
282 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
283 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
284 (r1 &= #x7f)
285 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
286 (r0 = (((r1 & #xf000) >> 12) | #xe0))
287 (r2 = ((r1 & #x3f) | #x80))
288 (r1 &= #x0fc0)
289 (r1 >>= 6)
290 (r1 |= #x80)
291 (write r0 r1 r2))
292
293 (if (r0 == ,(charset-id 'eight-bit-control))
294 ;; r1 scalar utf-8
295 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
296 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
297 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
298 ((write #xc2)
299 (write r1))
300
301 (if (r0 == ,(charset-id 'eight-bit-graphic))
302 ;; r1 scalar utf-8
303 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
304 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
305 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
306 ((write r1) ; NOTE: r1 is written as a single raw byte here, unlike the 2-byte rows above
307 (r1 = -1)
308 (read-multibyte-character r0 r1)
309 (if (r0 != ,(charset-id 'eight-bit-graphic))
310 (if (r0 != ,(charset-id 'eight-bit-control))
311 ((r5 = r0) ; not a raw byte: hand it to the next loop iteration
312 (r6 = r1))))
313 (if (r5 < 0)
314 ((read-multibyte-character r0 r2)
315 (if (r0 != ,(charset-id 'eight-bit-graphic))
316 (if (r0 != ,(charset-id 'eight-bit-control))
317 ((r5 = r0) ; not a raw byte: hand it to the next loop iteration
318 (r6 = r2))))
319 (if (r5 < 0)
320 (write r1 r2)
321 (if (r1 < #xa0)
322 (write r1)
323 ((write #xc2)
324 (write r1)))))))
325
326 ;; Unsupported character.
327 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
328 ((write #xef)
329 (write #xbf)
330 (write #xbd)))))))))
331 (repeat)))
332 (if (r1 >= #xa0) ; third element of the program: the CCL end-of-input block
333 (write r1)
334 (if (r1 >= #x80)
335 ((write #xc2)
336 (write r1)))))
5ba7a870 337
c49b8288
DL
338 "CCL program to encode into UTF-8.
339Only characters from the charsets ascii, eight-bit-control,
74ace46a
DL
340eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
341Others are encoded as U+FFFD.")
5ba7a870
KH
342
343(make-coding-system
344 'mule-utf-8 4 ?u ; TYPE 4 = coding system implemented by CCL programs; mnemonic ?u
345 "UTF-8 encoding for Emacs-supported Unicode characters.
c49b8288 346The supported Emacs character sets are:
5ba7a870
KH
347 ascii
348 eight-bit-control
349 eight-bit-graphic
350 latin-iso8859-1
351 mule-unicode-0100-24ff
352 mule-unicode-2500-33ff
353 mule-unicode-e000-ffff
354
c49b8288
DL
355Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
356are decoded into sequences of eight-bit-control and eight-bit-graphic
357characters to preserve their byte sequences. Emacs characters out of
358these ranges are encoded into U+FFFD.
359
360Note that, currently, characters in the mule-unicode charsets have no
361syntax and case information. Thus, for instance, upper- and
362lower-casing commands won't work with them."
5ba7a870
KH
363
364 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) ; (DECODER . ENCODER) CCL programs
365 '((safe-charsets
366 ascii
367 eight-bit-control
368 eight-bit-graphic
369 latin-iso8859-1
370 mule-unicode-0100-24ff
371 mule-unicode-2500-33ff
372 mule-unicode-e000-ffff)
87ae7973 373 (mime-charset . utf-8)
75f6d723 374 (coding-category . coding-category-utf-8)
87ae7973 375 (valid-codes (0 . 255))))
5ba7a870
KH
376
377(define-coding-system-alias 'utf-8 'mule-utf-8) ; make `utf-8' an alias of `mule-utf-8'
e8af40ee
PJ
378
379;;; utf-8.el ends here