Doc and commentary fixes.
[bpt/emacs.git] / lisp / international / utf-8.el
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
2
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
5
6 ;; Keywords: multilingual, Unicode, UTF-8, i18n
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Commentary:
26
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
28 ;; following character sets to and from UTF-8:
29 ;;
30 ;; ascii
31 ;; eight-bit-control
32 ;; latin-iso8859-1
33 ;; mule-unicode-0100-24ff
34 ;; mule-unicode-2500-33ff
35 ;; mule-unicode-e000-ffff
36 ;;
37 ;; Characters of other character sets cannot be encoded with
38 ;; mule-utf-8. Note that the mule-unicode charsets currently lack
39 ;; case and syntax information, so things like `downcase' will only
40 ;; work for characters from ASCII and Latin-1.
41 ;;
42 ;; On decoding, Unicode characters that do not fit into the above
43 ;; character sets are handled as `eight-bit-control' or
44 ;; `eight-bit-graphic' characters to retain the information about the
45 ;; original byte sequence.
46
47 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
48
49 ;; scalar | utf-8
50 ;; value | 1st byte | 2nd byte | 3rd byte
51 ;; --------------------+-----------+-----------+----------
52 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
53 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
54 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
55
56 ;;; Code:
57
58 (define-ccl-program ccl-decode-mule-utf-8
59 ;;
60 ;; charset | bytes in utf-8 | bytes in emacs
61 ;; -----------------------+----------------+---------------
62 ;; ascii | 1 | 1
63 ;; -----------------------+----------------+---------------
64 ;; eight-bit-control | 2 | 2
65 ;; latin-iso8859-1 | 2 | 2
66 ;; -----------------------+----------------+---------------
67 ;; mule-unicode-0100-24ff | 2 | 4
68 ;; (< 0800) | |
69 ;; -----------------------+----------------+---------------
70 ;; mule-unicode-0100-24ff | 3 | 4
71 ;; (>= 0800) | |
72 ;; mule-unicode-2500-33ff | 3 | 4
73 ;; mule-unicode-e000-ffff | 3 | 4
74 ;;
75 ;; Thus magnification factor is two.
76 ;;
77 `(2 ; buffer magnification factor, per the table above
78 ((loop
79 (read r0) ; r0 = first byte of the next UTF-8 sequence
80
81 ;; 1byte encoding, i.e., ascii
82 (if (r0 < #x80)
83 (write r0)
84
85 ;; 2byte encoding
86 (if (r0 < #xe0)
87 ((read r1)
88 (r0 &= #x1f) ; keep the 5 payload bits of the lead byte
89 (r0 <<= 6)
90 (r1 &= #x3f) ; keep the 6 payload bits of the trailing byte
91 (r1 += r0)
92 ;; now r1 holds scalar value
93
94 ;; eight-bit-control
95 (if (r1 < 160) ; i.e. scalar below U+00A0
96 ((r0 = ,(charset-id 'eight-bit-control))
97 (write-multibyte-character r0 r1))
98
99 ;; latin-iso8859-1
100 (if (r1 < 256)
101 ((r0 = ,(charset-id 'latin-iso8859-1))
102 (r1 -= 128) ; scalar 160..255 becomes code 32..127
103 (write-multibyte-character r0 r1))
104
105 ;; mule-unicode-0100-24ff (< 0800)
106 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
107 (r1 -= #x0100) ; offset from the start of the charset
108 (r2 = (((r1 / 96) + 32) << 7)) ; (offset / 96) + 32 goes into the bits above the low 7
109 (r1 %= 96)
110 (r1 += (r2 + 32)) ; (offset % 96) + 32 goes into the low 7 bits
111 (write-multibyte-character r0 r1)))))
112
113 ;; 3byte encoding
114 (if (r0 < #xf0)
115 ((read r1 r2)
116 (r3 = ((r0 & #x0f) << 12)) ; 4 payload bits from the lead byte
117 (r3 += ((r1 & #x3f) << 6)) ; 6 payload bits from the 2nd byte
118 (r3 += (r2 & #x3f)) ; 6 payload bits from the 3rd byte
119 ;; now r3 holds scalar value
120
121 ;; mule-unicode-0100-24ff (>= 0800)
122 (if (r3 < #x2500)
123 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
124 (r3 -= #x0100)
125 (r3 //= 96) ; r3 = quotient, r7 = remainder
126 (r1 = (r7 + 32)) ; remainder + 32 in the low 7 bits
127 (r1 += ((r3 + 32) << 7)) ; quotient + 32 in the bits above
128 (write-multibyte-character r0 r1))
129
130 ;; mule-unicode-2500-33ff
131 (if (r3 < #x3400)
132 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
133 (r3 -= #x2500)
134 (r3 //= 96) ; r3 = quotient, r7 = remainder
135 (r1 = (r7 + 32))
136 (r1 += ((r3 + 32) << 7))
137 (write-multibyte-character r0 r1))
138
139 ;; U+3400 .. U+DFFF
140 ;; keep those bytes as eight-bit-{control|graphic}
141 (if (r3 < #xe000)
142 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
143 (r3 = ,(charset-id 'eight-bit-graphic))
144 (write-multibyte-character r3 r0)
145 (if (r1 < #xa0) ; otherwise r3 still holds eight-bit-graphic
146 (r3 = ,(charset-id 'eight-bit-control)))
147 (write-multibyte-character r3 r1)
148 (if (r2 < #xa0) ; r3 may have been changed above, so set it both ways
149 (r3 = ,(charset-id 'eight-bit-control))
150 (r3 = ,(charset-id 'eight-bit-graphic)))
151 (write-multibyte-character r3 r2))
152
153 ;; mule-unicode-e000-ffff
154 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
155 (r3 -= #xe000)
156 (r3 //= 96) ; r3 = quotient, r7 = remainder
157 (r1 = (r7 + 32))
158 (r1 += ((r3 + 32) << 7))
159 (write-multibyte-character r0 r1))))))
160
161 ;; 4byte encoding
162 ;; keep those bytes as eight-bit-{control|graphic}
163 ((read r1 r2 r3)
164 ;; r0 >= #xf0, thus eight-bit-graphic
165 (r4 = ,(charset-id 'eight-bit-graphic))
166 (write-multibyte-character r4 r0)
167 (if (r1 < #xa0) ; otherwise r4 still holds eight-bit-graphic
168 (r4 = ,(charset-id 'eight-bit-control)))
169 (write-multibyte-character r4 r1)
170 (if (r2 < #xa0)
171 (r4 = ,(charset-id 'eight-bit-control))
172 (r4 = ,(charset-id 'eight-bit-graphic)))
173 (write-multibyte-character r4 r2)
174 (if (r3 < #xa0)
175 (r4 = ,(charset-id 'eight-bit-control))
176 (r4 = ,(charset-id 'eight-bit-graphic)))
177 (write-multibyte-character r4 r3)))))
178
179 (repeat)))) ; go back to the top of the loop for the next sequence
180
181 "CCL program to decode UTF-8.
182 Decoding is done into the charsets ascii, eight-bit-control,
183 latin-iso8859-1 and mule-unicode-* only.")
184
185 (define-ccl-program ccl-encode-mule-utf-8
186 `(1 ; buffer magnification factor
187 (loop
188 (read-multibyte-character r0 r1) ; r0 = charset id, r1 = code within that charset
189
190 (if (r0 == ,(charset-id 'ascii))
191 (write r1)
192
193 (if (r0 == ,(charset-id 'latin-iso8859-1))
194 ;; r1 scalar utf-8
195 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
196 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
197 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
198 ((r0 = (((r1 & #x40) >> 6) | #xc2)) ; lead byte is #xc2 or #xc3
199 (r1 &= #x3f)
200 (r1 |= #x80) ; trailing byte 10xx xxxx
201 (write r0 r1))
202
203 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
204 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
205 ;; #x3f80 == (0011 1111 1000 0000)b
206 (r1 &= #x7f)
207 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
208 ;; now r1 holds scalar value
209 (if (r1 < #x0800)
210 ;; 2byte encoding
211 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
212 ;; #x07c0 == (0000 0111 1100 0000)b
213 (r1 &= #x3f)
214 (r1 |= #x80)
215 (write r0 r1))
216 ;; 3byte encoding
217 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) ; lead byte 1110 zzzz
218 (r2 = ((r1 & #x3f) | #x80)) ; 3rd byte 10xx xxxx
219 (r1 &= #x0fc0)
220 (r1 >>= 6)
221 (r1 |= #x80) ; 2nd byte 10yy yyyy
222 (write r0 r1 r2))))
223
224 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
225 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
226 (r1 &= #x7f)
227 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
228 (r0 = (((r1 & #xf000) >> 12) | #xe0)) ; always 3byte encoding here
229 (r2 = ((r1 & #x3f) | #x80))
230 (r1 &= #x0fc0)
231 (r1 >>= 6)
232 (r1 |= #x80)
233 (write r0 r1 r2))
234
235 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
236 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
237 (r1 &= #x7f)
238 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
239 (r0 = (((r1 & #xf000) >> 12) | #xe0)) ; always 3byte encoding here
240 (r2 = ((r1 & #x3f) | #x80))
241 (r1 &= #x0fc0)
242 (r1 >>= 6)
243 (r1 |= #x80)
244 (write r0 r1 r2))
245
246 (if (r0 == ,(charset-id 'eight-bit-control))
247 ;; r1 scalar utf-8
248 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
249 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
250 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
251 (write r1) ; NOTE(review): writes the single raw byte r1, not the 2-byte form tabulated above
252
253 (if (r0 == ,(charset-id 'eight-bit-graphic))
254 ;; r1 scalar utf-8
255 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
256 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
257 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
258 (write r1) ; NOTE(review): writes the single raw byte r1, not the 2-byte form tabulated above
259
260 ;; Unsupported character.
261 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
262 ((write #xef)
263 (write #xbf)
264 (write #xbd)))))))))
265 (repeat))) ; go back to the top of the loop for the next character
266
267 "CCL program to encode into UTF-8.
268 Only characters from the charsets ascii, eight-bit-control,
269 latin-iso8859-1 and mule-unicode-* are recognized. Others are encoded
270 as U+FFFD.")
271
272 (make-coding-system
273 'mule-utf-8 4 ?u ; name, type 4 (conversion done by CCL programs), mnemonic character
274 "UTF-8 encoding for Emacs-supported Unicode characters.
275 The supported Emacs character sets are:
276 ascii
277 eight-bit-control
278 eight-bit-graphic
279 latin-iso8859-1
280 mule-unicode-0100-24ff
281 mule-unicode-2500-33ff
282 mule-unicode-e000-ffff
283
284 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
285 are decoded into sequences of eight-bit-control and eight-bit-graphic
286 characters to preserve their byte sequences. Emacs characters out of
287 these ranges are encoded into U+FFFD.
288
289 Note that, currently, characters in the mule-unicode charsets have no
290 syntax and case information. Thus, for instance, upper- and
291 lower-casing commands won't work with them."
292
293 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) ; (DECODER . ENCODER) CCL programs
294 '((safe-charsets ; same charsets as listed in the docstring
295 ascii
296 eight-bit-control
297 eight-bit-graphic
298 latin-iso8859-1
299 mule-unicode-0100-24ff
300 mule-unicode-2500-33ff
301 mule-unicode-e000-ffff)
302 (mime-charset . utf-8))) ; MIME charset name for this coding system
303
304 (define-coding-system-alias 'utf-8 'mule-utf-8) ; make `utf-8' an alias for `mule-utf-8'