1 ;;; utf-8.el --- limited UTF-8 decoding/encoding support
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 2001 Free Software Foundation, Inc.
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
8 ;; Keywords: multilingual, Unicode, UTF-8, i18n
10 ;; This file is part of GNU Emacs.
12 ;; GNU Emacs is free software; you can redistribute it and/or modify
13 ;; it under the terms of the GNU General Public License as published by
14 ;; the Free Software Foundation; either version 2, or (at your option)
17 ;; GNU Emacs is distributed in the hope that it will be useful,
18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;; GNU General Public License for more details.
22 ;; You should have received a copy of the GNU General Public License
23 ;; along with GNU Emacs; see the file COPYING. If not, write to the
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25 ;; Boston, MA 02111-1307, USA.
29 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
30 ;; following character sets to and from UTF-8:
35 ;; mule-unicode-0100-24ff
36 ;; mule-unicode-2500-33ff
37 ;; mule-unicode-e000-ffff
39 ;; Characters of other character sets cannot be encoded with
40 ;; mule-utf-8. Note that the mule-unicode charsets currently lack
41 ;; case and syntax information, so things like `downcase' will only
42 ;; work for characters from ASCII and Latin-1.
44 ;; On decoding, Unicode characters that do not fit into the above
45 ;; character sets are handled as `eight-bit-control' or
46 ;; `eight-bit-graphic' characters to retain the information about the
47 ;; original byte sequence.
49 ;; UTF-8 is defined in RFC 2279 (since obsoleted by RFC 3629). A sketch of the encoding is:
52 ;; value | 1st byte | 2nd byte | 3rd byte
53 ;; --------------------+-----------+-----------+----------
54 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
55 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
56 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
60 (define-ccl-program ccl-decode-mule-utf-8
62 ;; charset | bytes in utf-8 | bytes in emacs
63 ;; -----------------------+----------------+---------------
65 ;; -----------------------+----------------+---------------
66 ;; eight-bit-control | 2 | 2
67 ;; latin-iso8859-1 | 2 | 2
68 ;; -----------------------+----------------+---------------
69 ;; mule-unicode-0100-24ff | 2 | 4
71 ;; -----------------------+----------------+---------------
72 ;; mule-unicode-0100-24ff | 3 | 4
74 ;; mule-unicode-2500-33ff | 3 | 4
75 ;; mule-unicode-e000-ffff | 3 | 4
77 ;; Thus the magnification factor is two (worst case: 2 bytes in UTF-8 become 4 bytes in Emacs).
80 ((r5 = ,(charset-id 'eight-bit-control
))
81 (r6 = ,(charset-id 'eight-bit-graphic
))
85 ;; 1-byte encoding, i.e., ascii
89 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
93 (if ((r1 & #b11000000
) != #b10000000
)
94 ;; Invalid 2-byte sequence
96 (write-multibyte-character r5 r0
)
97 (write-multibyte-character r6 r0
))
101 (write-multibyte-character r5 r1
)
102 (write-multibyte-character r6 r1
))))
108 ;; Now r1 holds scalar value
112 ((write-multibyte-character r5 r1
))
116 ((r0 = ,(charset-id 'latin-iso8859-1
))
118 (write-multibyte-character r0 r1
))
120 ;; mule-unicode-0100-24ff (< 0800)
121 ((r0 = ,(charset-id 'mule-unicode-0100-24ff
))
123 (r2 = (((r1 / 96) + 32) << 7))
126 (write-multibyte-character r0 r1
)))))))
129 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
133 ;; This is set to 1 if the encoding is invalid.
136 (r3 = (r1 & #b11000000
))
137 (r3 |
= ((r2 >> 2) & #b00110000
))
138 (if (r3 != #b10100000
)
140 ((r3 = ((r0 & #x0f
) << 12))
141 (r3 += ((r1 & #x3f
) << 6))
147 ;; Invalid 3-byte sequence
149 (write-multibyte-character r5 r0
)
150 (write-multibyte-character r6 r0
))
154 (write-multibyte-character r5 r1
)
155 (write-multibyte-character r6 r1
)))
159 (write-multibyte-character r5 r2
)
160 (write-multibyte-character r6 r2
))))
162 ;; mule-unicode-0100-24ff (>= 0800)
164 ((r0 = ,(charset-id 'mule-unicode-0100-24ff
))
168 (r1 += ((r3 + 32) << 7))
169 (write-multibyte-character r0 r1
))
171 ;; mule-unicode-2500-33ff
173 ((r0 = ,(charset-id 'mule-unicode-2500-33ff
))
177 (r1 += ((r3 + 32) << 7))
178 (write-multibyte-character r0 r1
))
181 ;; keep those bytes as eight-bit-{control|graphic}
183 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
185 (write-multibyte-character r3 r0
)
188 (write-multibyte-character r3 r1
)
192 (write-multibyte-character r3 r2
))
194 ;; mule-unicode-e000-ffff
195 ((r0 = ,(charset-id 'mule-unicode-e000-ffff
))
199 (r1 += ((r3 + 32) << 7))
200 (write-multibyte-character r0 r1
))))))))
203 ;; keep those bytes as eight-bit-{control|graphic}
205 ;; r0 >= #xf0, thus eight-bit-graphic
206 (write-multibyte-character r6 r0
)
208 (write-multibyte-character r5 r1
)
209 (write-multibyte-character r6 r1
))
211 (write-multibyte-character r5 r2
)
212 (write-multibyte-character r6 r2
))
214 (write-multibyte-character r5 r3
)
215 (write-multibyte-character r6 r3
))))))
219 "CCL program to decode UTF-8.
220 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
221 mule-unicode-*. Encodings of un-representable Unicode characters are
222 decoded as is into eight-bit-control and eight-bit-graphic
225 (define-ccl-program ccl-encode-mule-utf-8
231 (read-multibyte-character r0 r1
))
232 (;; We have already done read-multibyte-character.
237 (if (r0 == ,(charset-id 'ascii
))
240 (if (r0 == ,(charset-id 'latin-iso8859-1
))
242 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
243 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
244 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
245 ((r0 = (((r1 & #x40
) >> 6) |
#xc2
))
250 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff
))
251 ((r0 = ((((r1 & #x3f80
) >> 7) -
32) * 96))
252 ;; #x3f80 == (0011 1111 1000 0000)b
254 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
255 ;; now r1 holds scalar value
258 ((r0 = (((r1 & #x07c0
) >> 6) |
#xc0
))
259 ;; #x07c0 == (0000 0111 1100 0000)b
264 ((r0 = (((r1 & #xf000
) >> 12) |
#xe0
))
265 (r2 = ((r1 & #x3f
) |
#x80
))
271 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff
))
272 ((r0 = ((((r1 & #x3f80
) >> 7) -
32) * 96))
274 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
275 (r0 = (((r1 & #xf000
) >> 12) |
#xe0
))
276 (r2 = ((r1 & #x3f
) |
#x80
))
282 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff
))
283 ((r0 = ((((r1 & #x3f80
) >> 7) -
32) * 96))
285 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
286 (r0 = (((r1 & #xf000
) >> 12) |
#xe0
))
287 (r2 = ((r1 & #x3f
) |
#x80
))
293 (if (r0 == ,(charset-id 'eight-bit-control
))
295 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
296 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
297 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
301 (if (r0 == ,(charset-id 'eight-bit-graphic
))
303 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
304 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
305 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
308 (read-multibyte-character r0 r1
)
309 (if (r0 != ,(charset-id 'eight-bit-graphic
))
310 (if (r0 != ,(charset-id 'eight-bit-control
))
314 ((read-multibyte-character r0 r2
)
315 (if (r0 != ,(charset-id 'eight-bit-graphic
))
316 (if (r0 != ,(charset-id 'eight-bit-control
))
326 ;; Unsupported character.
327 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
338 "CCL program to encode into UTF-8.
339 Only characters from the charsets ascii, eight-bit-control,
340 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
341 Others are encoded as U+FFFD.")
345 "UTF-8 encoding for Emacs-supported Unicode characters.
346 The supported Emacs character sets are:
351 mule-unicode-0100-24ff
352 mule-unicode-2500-33ff
353 mule-unicode-e000-ffff
355 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
356 are decoded into sequences of eight-bit-control and eight-bit-graphic
357 characters to preserve their byte sequences. Emacs characters out of
358 these ranges are encoded into U+FFFD.
360 Note that, currently, characters in the mule-unicode charsets have no
361 syntax and case information. Thus, for instance, upper- and
362 lower-casing commands won't work with them."
364 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8
)
370 mule-unicode-0100-24ff
371 mule-unicode-2500-33ff
372 mule-unicode-e000-ffff
)
373 (mime-charset . utf-8
)
374 (coding-category . coding-category-utf-8
)
375 (valid-codes (0 .
255))))
;; Register `utf-8' as an alias, so that name refers to the
;; `mule-utf-8' coding system.
377 (define-coding-system-alias 'utf-8 'mule-utf-8)
379 ;;; utf-8.el ends here