Commit | Line | Data |
---|---|---|
fc2938d1 DL |
1 | ;;; utf-16.el --- UTF-16 encoding/decoding |
2 | ||
2fd125a3 KH |
3 | ;; Copyright (C) 2001, 2002, 2003, 2004 Free Software Foundation, Inc. |
4 | ;; Copyright (C) 2002, 2003, 2004 | |
5 | ;; National Institute of Advanced Industrial Science and Technology (AIST) | |
6 | ;; Registration Number H14PRO021 | |
fc2938d1 DL |
7 | |
8 | ;; Author: Dave Love <fx@gnu.org> | |
9 | ;; Keywords: Unicode, UTF-16, i18n | |
10 | ||
11 | ;; This file is part of GNU Emacs. | |
12 | ||
13 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
14 | ;; it under the terms of the GNU General Public License as published by | |
15 | ;; the Free Software Foundation; either version 2, or (at your option) | |
16 | ;; any later version. | |
17 | ||
18 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
19 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | ;; GNU General Public License for more details. | |
22 | ||
23 | ;; You should have received a copy of the GNU General Public License | |
24 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
25 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
26 | ;; Boston, MA 02111-1307, USA. | |
27 | ||
28 | ;;; Commentary: | |
29 | ||
30 | ;; Support for UTF-16, which is a two-byte encoding (modulo | |
cbcd4dc9 DL |
31 | ;; surrogates) of Unicode, defined in RFC 2781. It is written either |
32 | ;; in little or big endian order and either with or without the | |
33 | ;; leading BOM (a two-byte signature which identifies their byte sex). | |
65a0e5fe | 34 | ;; |
cbcd4dc9 | 35 | ;; We provide these base coding systems. |
65a0e5fe KH |
36 | ;; name endian BOM |
37 | ;; ---- ------ --- | |
38 | ;; mule-utf-16le little no | |
39 | ;; mule-utf-16be big no | |
40 | ;; mule-utf-16le-with-signature little yes | |
41 | ;; mule-utf-16be-with-signature big yes | |
42 | ;; mule-utf-16 both yes | |
43 | ;; | |
fc2938d1 DL |
44 | ;; Note that un-decodable sequences aren't (yet?) preserved as raw |
45 | ;; bytes, as they are with utf-8, so reading and writing as utf-16 can | |
46 | ;; corrupt data. | |
47 | ||
48 | ;;; Code: | |
49 | ||
50 | ;; We end up with trivially different -le and -be versions of most | |
51 | ;; things below, sometimes with commonality abstracted into a let | |
52 | ;; binding for maintenance convenience. | |
53 | ||
fc2938d1 DL |
54 | ;; Needed in macro expansion, so can't be let-bound. Zapped after use. |
;; Decoder building blocks.  They live inside `eval-and-compile'
;; because the `define-ccl-program' forms further down splice them in
;; with `,' and `,@'.
55 | (eval-and-compile | |
;; utf-16-decode-ucs: CCL statements that turn the UTF-16 code unit in
;; r1 (plus a pending first surrogate in r5, if any) into a charset ID
;; in r0 and a code point in r1.
56 | (defconst utf-16-decode-ucs | |
95d2d433 KH |
57 | ;; If r5 is negative, r1 is a Unicode character code. Otherwise, r5 is |
58 | ;; the first of a surrogate pair and r1 is the second of the pair. | |
59 | ;; Output is charset ID in r0, code point in r1. R0 may be set to | |
60 | ;; -1 in which case a caller should not write out r1. | |
61 | `((if (r5 >= 0) | |
62 | ((r0 = (r1 < #xDC00)) | |
63 | (if ((r1 >= #xE000) | r0) | |
64 | ;; Invalid second code of surrogate pair. | |
65 | ((r0 = r5) | |
66 | (call ccl-mule-utf-untrans)) | |
;; Valid pair: r1 = #x10000 + ((r5 - #xD800) << 10) + (r1 - #xDC00).
67 | ((r1 -= #xDC00) | |
68 | (r1 += (((r5 - #xD800) << 10) + #x10000)))) | |
69 | (r5 = -1))) | |
70 | (if (r1 < 128) | |
71 | (r0 = ,(charset-id 'ascii)) | |
72 | ((lookup-integer utf-subst-table-for-decode r1 r3) | |
73 | (if r7 ; got a translation | |
74 | ((r0 = r1) (r1 = r3)) | |
75 | (if (r1 < 160) | |
76 | (r0 = ,(charset-id 'eight-bit-control)) | |
77 | (if (r1 < 256) | |
78 | ((r0 = ,(charset-id 'latin-iso8859-1)) | |
79 | (r1 -= 128)) | |
80 | (if (r1 < #x2500) | |
;; For the mule-unicode-* charsets the offset from the range start is
;; split into two parts:
;; r1 = ((offset / 96 + 32) << 7) + (offset % 96 + 32).
;; The same computation is repeated for the two ranges below.
81 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
82 | (r1 -= #x100) | |
83 | (r2 = (((r1 / 96) + 32) << 7)) | |
84 | (r1 %= 96) | |
85 | (r1 += (r2 + 32))) | |
86 | (if (r1 < #x3400) | |
87 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
88 | (r1 -= #x2500) | |
89 | (r2 = (((r1 / 96) + 32) << 7)) | |
90 | (r1 %= 96) | |
91 | (r1 += (r2 + 32))) | |
92 | (if (r1 < #xD800) | |
93 | ;; We can't have this character. | |
94 | ((r0 = r1) | |
95 | (call ccl-mule-utf-untrans) | |
96 | (r5 = -1) | |
97 | (r0 = -1)) | |
98 | (if (r1 < #xDC00) | |
99 | ;; The first code of a surrogate pair. | |
100 | ((r5 = r1) | |
101 | (r0 = -1)) | |
102 | (if (r1 < #xE000) | |
103 | ;; The second code of a surrogate pair, invalid. | |
104 | ((r0 = r1) | |
105 | (call ccl-mule-utf-untrans) | |
106 | (r5 = -1) | |
107 | (r0 = -1)) | |
108 | (if (r1 < #x10000) | |
109 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
110 | (r1 -= #xE000) | |
111 | (r2 = (((r1 / 96) + 32) << 7)) | |
112 | (r1 %= 96) | |
113 | (r1 += (r2 + 32))) | |
114 | ;; We can't have this character. | |
115 | ((r0 = r1) | |
116 | (call ccl-mule-utf-untrans) | |
117 | (r5 = -1) | |
118 | (r0 = -1))))))))))))))) | |
4fbc4b17 | 119 | |
;; Main loop for little-endian input.  Each iteration resets r3 to -1,
;; reads two bytes into r3 and r4, and builds r1 with r4 (the second
;; byte read) as the left operand of `<8'.  The result of
;; utf-16-decode-ucs is translated and written only when r0 >= 0.
;; r5 starts at -1, i.e. no pending first surrogate.
65a0e5fe | 120 | (defconst utf-16le-decode-loop
95d2d433 KH |
121 | `((r5 = -1) |
122 | (loop | |
123 | (r3 = -1) | |
124 | (read r3 r4) | |
125 | (r1 = (r4 <8 r3)) | |
126 | ,@utf-16-decode-ucs | |
127 | (if (r0 >= 0) | |
128 | ((translate-character utf-translation-table-for-decode r0 r1) | |
129 | (write-multibyte-character r0 r1))) | |
130 | (repeat)))) | |
4fbc4b17 | 131 | |
;; Main loop for big-endian input.  Identical to the little-endian
;; loop except that r3 (the first byte read) is the left operand of
;; `<8'.
65a0e5fe | 132 | (defconst utf-16be-decode-loop
95d2d433 KH |
133 | `((r5 = -1) |
134 | (loop | |
135 | (r3 = -1) | |
136 | (read r3 r4) | |
137 | (r1 = (r3 <8 r4)) | |
138 | ,@utf-16-decode-ucs | |
139 | (if (r0 >= 0) | |
140 | ((translate-character utf-translation-table-for-decode r0 r1) | |
141 | (write-multibyte-character r0 r1))) | |
142 | (repeat)))) | |
4fbc4b17 KH |
143 | |
144 | ) | |
fc2938d1 | 145 | |
;; CCL decoder for little-endian UTF-16 without a signature.  The main
;; code is utf-16le-decode-loop; the block after it handles what is
;; left over when the loop stops (presumably at end of input -- see
;; the CCL documentation for the third element of a CCL program).
65a0e5fe | 146 | (define-ccl-program ccl-decode-mule-utf-16le
fc2938d1 | 147 | `(2 ; 2 bytes -> 1 to 4 bytes
95d2d433 KH |
148 | ,utf-16le-decode-loop |
;; r5 >= 0: a first surrogate is still pending; hand it to
;; ccl-mule-utf-untrans.
149 | ((if (r5 >= 0) | |
150 | ((r0 = r5) | |
151 | (call ccl-mule-utf-untrans))) | |
;; r3 >= 0: one byte was read without a partner.  It is written as an
;; eight-bit-control character when below #xA0, otherwise as an
;; eight-bit-graphic character.
152 | (if (r3 < 0) | |
153 | nil | |
154 | ((if (r3 < #xA0) | |
155 | (r0 = ,(charset-id 'eight-bit-control)) | |
156 | (r0 = ,(charset-id 'eight-bit-graphic))) | |
157 | (write-multibyte-character r0 r3))))) | |
2217b8e1 | 158 | "Decode UTF-16LE (little endian without signature bytes).
fc2938d1 | 159 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and
278ce936 KH |
160 | mule-unicode-*. Un-representable Unicode characters are decoded as |
161 | U+fffd. The result is run through the translation-table named | |
162 | `utf-translation-table-for-decode'.") | |
fc2938d1 | 163 | |
;; CCL decoder for big-endian UTF-16 without a signature.  The main
;; code is utf-16be-decode-loop; the block after it handles leftovers:
;; a pending first surrogate (r5 >= 0) and a lone trailing byte
;; (r3 >= 0) are both passed to ccl-mule-utf-untrans in r0.
;; NOTE(review): the little-endian decoder writes the lone byte as an
;; eight-bit character instead -- confirm the difference is intended.
65a0e5fe | 164 | (define-ccl-program ccl-decode-mule-utf-16be
fc2938d1 | 165 | `(2 ; 2 bytes -> 1 to 4 bytes
95d2d433 KH |
166 | ,utf-16be-decode-loop |
167 | ((if (r5 >= 0) | |
168 | ((r0 = r5) | |
169 | (call ccl-mule-utf-untrans))) | |
170 | (if (r3 >= 0) | |
171 | ((r0 = r3) | |
172 | (call ccl-mule-utf-untrans))))) | |
2217b8e1 | 173 | "Decode UTF-16BE (big endian without signature bytes).
fc2938d1 DL |
174 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
175 | mule-unicode-*. Un-representable Unicode characters are | |
278ce936 KH |
176 | decoded as U+fffd. The result is run through the translation-table of |
177 | name `utf-translation-table-for-decode'.") | |
fc2938d1 | 178 | |
;; CCL decoder for little-endian UTF-16 with a signature.  The first
;; two bytes are read into r3/r4 and dropped without being inspected;
;; the statements of utf-16le-decode-loop are then spliced in.
;; The trailing block passes a lone last byte (r3 >= 0) to
;; ccl-mule-utf-untrans.
;; NOTE(review): unlike `ccl-decode-mule-utf-16le', a pending first
;; surrogate in r5 is not flushed here -- confirm this is intended.
65a0e5fe | 179 | (define-ccl-program ccl-decode-mule-utf-16le-with-signature
4fbc4b17 | 180 | `(2
95d2d433 KH |
181 | ((r3 = -1) |
182 | (read r3 r4) | |
183 | ,@utf-16le-decode-loop) | |
184 | (if (r3 >= 0) | |
185 | ((r0 = r3) | |
186 | (call ccl-mule-utf-untrans)))) | |
65a0e5fe | 187 | "Like `ccl-decode-mule-utf-16le' but skip the first 2-byte BOM.")
4fbc4b17 | 188 | |
;; CCL decoder for big-endian UTF-16 with a signature.  The first two
;; bytes are read into r3/r4 and dropped without being inspected; the
;; statements of utf-16be-decode-loop are then spliced in.
;; The trailing block passes a lone last byte (r3 >= 0) to
;; ccl-mule-utf-untrans.
;; NOTE(review): unlike `ccl-decode-mule-utf-16be', a pending first
;; surrogate in r5 is not flushed here -- confirm this is intended.
65a0e5fe | 189 | (define-ccl-program ccl-decode-mule-utf-16be-with-signature
4fbc4b17 | 190 | `(2
95d2d433 KH |
191 | ((r3 = -1) |
192 | (read r3 r4) | |
193 | ,@utf-16be-decode-loop) | |
194 | (if (r3 >= 0) | |
195 | ((r0 = r3) | |
196 | (call ccl-mule-utf-untrans)))) | |
65a0e5fe | 197 | "Like `ccl-decode-mule-utf-16be' but skip the first 2-byte BOM.")
4fbc4b17 KH |
198 | |
;; CCL decoder that chooses the byte order from the first two bytes.
;; They are combined big-endian style into r1 (r3 is the left operand
;; of `<8'):
;;   r1 == #xFFFE  -> little-endian BOM; the character is written out
;;                    as a marker and utf-16le-decode-loop takes over.
;;   r1 == #xFEFF  -> big-endian BOM; #xFFFF is written as the marker
;;                    and utf-16be-decode-loop takes over.
;;   otherwise     -> r1 is decoded as an ordinary character and
;;                    utf-16be-decode-loop takes over.
;; The markers are looked for by `mule-utf-16-post-read-conversion'.
;; The trailing block passes a lone last byte (r3 >= 0) to
;; ccl-mule-utf-untrans.
199 | (define-ccl-program ccl-decode-mule-utf-16 | |
200 | `(2 | |
95d2d433 KH |
201 | ((r3 = -1) |
202 | (read r3 r4) | |
4fbc4b17 | 203 | (r1 = (r3 <8 r4))
95d2d433 | 204 | (r5 = -1)
4fbc4b17 KH |
205 | (if (r1 == #xFFFE) |
206 | ;; R1 is a BOM for little endian. We keep this character as | |
207 | ;; is temporarily. It is removed by post-read-conversion | |
208 | ;; function. | |
209 | (,@utf-16-decode-ucs | |
210 | (write-multibyte-character r0 r1) | |
95d2d433 | 211 | ,@utf-16le-decode-loop)
4fbc4b17 KH |
212 | ((if (r1 == #xFEFF) |
213 | ;; R1 is a BOM for big endian, but we can't keep that | |
214 | ;; character in the output because it can't be | |
215 | ;; distinguished with the normal U+FEFF. So, we keep | |
216 | ;; #xFFFF instead. | |
217 | ((r1 = #xFFFF) | |
95d2d433 KH |
218 | ,@utf-16-decode-ucs |
219 | (write-multibyte-character r0 r1)) | |
220 | ;; R1 is a normal Unicode character. | |
4fbc4b17 | 221 | (,@utf-16-decode-ucs
95d2d433 KH |
222 | (if (r0 >= 0) |
223 | ((translate-character utf-translation-table-for-decode r0 r1) | |
224 | (write-multibyte-character r0 r1))))) | |
225 | ,@utf-16be-decode-loop))) | |
226 | (if (r3 >= 0) | |
227 | ((r0 = r3) | |
228 | (call ccl-mule-utf-untrans)))) | |
65a0e5fe | 229 | "Like `ccl-decode-mule-utf-16be'/`ccl-decode-mule-utf-16le' but check the first BOM.")
4fbc4b17 | 230 | |
;; The decoder fragments have been spliced into the CCL programs
;; above, so their variable bindings are removed again.
fc2938d1 | 231 | (makunbound 'utf-16-decode-ucs) ; done with it
65a0e5fe KH |
232 | (makunbound 'utf-16le-decode-loop) |
233 | (makunbound 'utf-16be-decode-loop) | |
fc2938d1 | 234 | |
95d2d433 KH |
235 | ;; The UTF-16 decoder generates a UTF-8 sequence represented by a |
236 | ;; sequence of eight-bit-control/graphic chars for an invalid byte (the | |
237 | ;; last byte of an odd length source) and an untranslatable character | |
238 | ;; (including an invalid surrogate-pair code-point). | |
239 | ;; | |
240 | ;; This CCL parses that sequence (the first byte is already in r1), | |
241 | ;; and if the sequence represents an untranslatable character, it sets | |
242 | ;; r1 to the original invalid code or untranslated Unicode character | |
243 | ;; code, sets r2 to -1 (to prevent r2 and r3 from being written), sets r5 to | |
244 | ;; -1 (to tell the caller that there's no pre-read character). | |
245 | ;; | |
246 | ;; If the sequence represents an invalid byte, it sets r1 to -1, r2 to | |
247 | ;; the byte, sets r3 and r5 to -1. | |
248 | ;; | |
249 | ;; Otherwise, don't change r1, set r2 and r3 to already read | |
250 | ;; eight-bit-control/graphic characters (if any), set r5 and r6 to the | |
251 | ;; last character that invalidates the UTF-8 form. | |
252 | ;; | |
253 | ;; Note: For UTF-8 validation, we only check if a character is | |
254 | ;; eight-bit-control/graphic or not. It may result in incorrect | |
255 | ;; handling of random binary data, but such data can't be encoded by | |
256 | ;; UTF-16 anyway. At least, the UTF-16 decoder doesn't generate such a | |
257 | ;; sequence even if a source contains an invalid byte sequence. | |
258 | ||
;; Helper called by the encoder fragment utf-16-decode-to-ucs when the
;; current character is in eight-bit-graphic.  On entry r1 holds the
;; first byte.  Up to three more characters are read into r5/r6; each
;; must be eight-bit-control or eight-bit-graphic, otherwise the
;; program ends early and leaves that character in r5/r6.
;; Register results: r1 = reassembled code or -1, r2/r3 = bytes still
;; to be written or -1, r5 = -1 when no pre-read character is left.
259 | (define-ccl-program ccl-mule-utf-16-encode-untrans | |
260 | `(0 | |
261 | ((r2 = -1) | |
262 | ;; Read the 2nd byte. | |
263 | (read-multibyte-character r5 r6) | |
264 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
265 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
266 | ((r2 = r1) | |
267 | (r3 = -1) | |
268 | (r1 = -1) | |
269 | (end))) ; invalid UTF-8 | |
270 | ||
271 | (r3 = -1) | |
272 | (r2 = r6) | |
;; NOTE(review): #xE0 itself is a 3-byte UTF-8 lead byte; confirm that
;; `<=' rather than `<' is intended here.
273 | (if (r1 <= #xE0) | |
274 | ;; 2-byte UTF-8, i.e. originally an invalid byte. | |
275 | ((r2 &= #x3F) | |
276 | (r2 |= ((r1 & #x1F) << 6)) | |
277 | (r1 = -1) | |
278 | (r5 = -1) | |
279 | (end))) | |
280 | ||
281 | ;; Read the 3rd byte. | |
282 | (read-multibyte-character r5 r6) | |
283 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
284 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
285 | ((end))) ; invalid UTF-8 | |
286 | ||
287 | (if (r1 < #xF0) ; valid 3-byte UTF-8 | |
288 | ((r1 = ((r1 & #x0F) << 12)) | |
289 | (r1 |= ((r2 & #x3F) << 6)) | |
290 | (r1 |= (r6 & #x3F)) | |
291 | (r2 = -1) | |
292 | (r5 = -1) | |
293 | (end))) | |
294 | ||
295 | (r3 = r6) | |
296 | ;; Read the 4th byte. | |
297 | (read-multibyte-character r5 r6) | |
298 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
299 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
300 | (end)) ; invalid UTF-8 | |
301 | ||
302 | ;; valid 4-byte UTF-8 | |
303 | (r1 = ((r1 & #x07) << 18)) | |
304 | (r1 |= ((r2 & #x3F) << 12)) | |
305 | (r1 |= ((r3 & #x3F) << 6)) | |
306 | (r1 |= (r6 & #x3F)) | |
307 | (r2 = -1) | |
308 | (r5 = -1) | |
309 | (end)) | |
310 | ||
;; Trailing block (presumably run when input ends in the middle of a
;; sequence): write whichever of r1, r2, r3 are non-negative, in that
;; order.
311 | (if (r1 >= 0) | |
312 | ((write r1) | |
313 | (if (r2 >= 0) | |
314 | ((write r2) | |
315 | (if (r3 >= 0) | |
316 | (write r3)))))))) | |
317 | ||
fc2938d1 DL |
;; Encoder building blocks.  They live inside `eval-and-compile'
;; because the `define-ccl-program' forms further down splice them in
;; with `,' and `,@'.
318 | (eval-and-compile |
;; utf-16-decode-to-ucs: CCL statements that fetch one Emacs character
;; (freshly read, or the pre-read one in r5/r6) and leave its Unicode
;; code in r1.  Characters of charsets not handled below become
;; #xfffd.
319 | (defconst utf-16-decode-to-ucs | |
95d2d433 KH |
320 | ;; Read a character and set r1 to the corresponding Unicode code. |
321 | ;; If r5 is not negative, it means that we have already read a | |
322 | ;; character into r5 and r6. | |
323 | ;; If an invalid eight-bit-control/graphic sequence is found, r2 and | |
324 | ;; r3 may contain a byte to be written out, r5 and r6 may contain a | |
325 | ;; pre-read character. Usually they are set to -1. | |
326 | `((if (r5 < 0) | |
327 | (read-multibyte-character r0 r1) | |
328 | ((r0 = r5) | |
329 | (r1 = r6) | |
330 | (r5 = -1))) | |
331 | (lookup-character utf-subst-table-for-encode r0 r1) | |
332 | (r2 = -1) | |
;; r7 > 0 after the lookup: the substitution table supplied the code
;; (taken from r0); otherwise map the charset by hand.
333 | (if (r7 > 0) | |
334 | (r1 = r0) | |
335 | ((translate-character utf-translation-table-for-encode r0 r1) | |
336 | (if (r0 == ,(charset-id 'ascii)) | |
337 | nil | |
338 | (if (r0 == ,(charset-id 'latin-iso8859-1)) | |
339 | (r1 += 128) | |
340 | (if (r0 == ,(charset-id 'eight-bit-control)) | |
341 | nil | |
342 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
343 | (call ccl-mule-utf-16-encode-untrans) | |
;; mule-unicode-* charsets:
;; r3 = ((r1 >> 7) - 32) * 96 + ((r1 & #x7f) - 32) is the offset
;; from the start of the charset's Unicode range.
344 | ((r2 = ((r1 & #x7f) - 32)) | |
345 | (r3 = ((((r1 >> 7) - 32) * 96) + r2)) | |
346 | (r2 = -1) | |
347 | (r5 = -1) | |
348 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | |
349 | (r1 = (r3 + #x100)) | |
350 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
351 | (r1 = (r3 + #x2500)) | |
352 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
353 | (r1 = (r3 + #xe000)) | |
354 | (r1 = #xfffd))))))))))))) | |
4fbc4b17 | 355 | |
;; Main loop for little-endian output.  A code >= #x10000 is split
;; into a surrogate pair: the first half is written at once (low byte
;; first) and the second half is left in r1.  r1 >= 0 is then written
;; low byte first; r2 and r3, when non-negative, are written as single
;; bytes.
65a0e5fe | 356 | (defconst utf-16le-encode-loop
95d2d433 KH |
357 | `((r5 = -1) |
358 | (loop | |
359 | ,@utf-16-decode-to-ucs | |
360 | (if (r1 >= #x10000) | |
361 | ((r1 -= #x10000) | |
362 | (r0 = ((r1 >> 10) + #xD800)) | |
363 | (write (r0 & 255)) | |
364 | (write (r0 >> 8)) | |
365 | (r1 = ((r1 & #x3FF) + #xDC00)))) | |
366 | (if (r1 >= 0) | |
367 | ((write (r1 & 255)) | |
368 | (write (r1 >> 8)))) | |
369 | (if (r2 >= 0) | |
370 | ((write r2) | |
371 | (if (r3 >= 0) | |
372 | (write r3)))) | |
373 | (repeat)))) | |
4fbc4b17 | 374 | |
;; Main loop for big-endian output.  Same as the little-endian loop
;; except that every 16-bit unit is written high byte first.
65a0e5fe | 375 | (defconst utf-16be-encode-loop
95d2d433 KH |
376 | `((r5 = -1) |
377 | (loop | |
378 | ,@utf-16-decode-to-ucs | |
379 | (if (r1 >= #x10000) | |
380 | ((r1 -= #x10000) | |
381 | (r0 = ((r1 >> 10) + #xD800)) | |
382 | (write (r0 >> 8)) | |
383 | (write (r0 & 255)) | |
384 | (r1 = ((r1 & #x3FF) + #xDC00)))) | |
385 | (if (r1 >= 0) | |
386 | ((write (r1 >> 8)) | |
387 | (write (r1 & 255)))) | |
388 | (if (r2 >= 0) | |
389 | ((write r2) | |
390 | (if (r3 >= 0) | |
391 | (write r3)))) | |
392 | (repeat)))) | |
4fbc4b17 | 393 | )
fc2938d1 | 394 | |
65a0e5fe KH |
395 | |
;; CCL encoder for little-endian UTF-16 without a signature; its main
;; code is utf-16le-encode-loop and it has no trailing block.
396 | (define-ccl-program ccl-encode-mule-utf-16le | |
dbaba2d2 | 397 | `(2
65a0e5fe | 398 | ,utf-16le-encode-loop)
2217b8e1 | 399 | "Encode to UTF-16LE (little endian without signature).
fc2938d1 DL |
400 | Characters from the charsets ascii, eight-bit-control, |
401 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
278ce936 KH |
402 | after translation through the translation-table of name |
403 | `utf-translation-table-for-encode'. | |
fc2938d1 DL |
404 | Others are encoded as U+FFFD.") |
405 | ||
;; CCL encoder for big-endian UTF-16 without a signature; its main
;; code is utf-16be-encode-loop and it has no trailing block.
65a0e5fe | 406 | (define-ccl-program ccl-encode-mule-utf-16be
dbaba2d2 | 407 | `(2
65a0e5fe | 408 | ,utf-16be-encode-loop)
2217b8e1 | 409 | "Encode to UTF-16BE (big endian without signature).
fc2938d1 DL |
410 | Characters from the charsets ascii, eight-bit-control, |
411 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
278ce936 KH |
412 | after translation through the translation-table named |
413 | `utf-translation-table-for-encode'. | |
fc2938d1 DL |
414 | Others are encoded as U+FFFD.") |
415 | ||
;; CCL encoder for little-endian UTF-16 with a signature: writes the
;; bytes #xFF #xFE first, then runs the statements of
;; utf-16le-encode-loop.
65a0e5fe | 416 | (define-ccl-program ccl-encode-mule-utf-16le-with-signature
f7c4d755 | 417 | `(2
4fbc4b17 KH |
418 | ((write #xFF) |
419 | (write #xFE) | |
95d2d433 | 420 | ,@utf-16le-encode-loop))
4fbc4b17 KH |
421 | "Encode to UTF-16 (little endian with signature). |
422 | Characters from the charsets ascii, eight-bit-control, | |
423 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
424 | after translation through the translation-table of name | |
425 | `utf-translation-table-for-encode'. | |
426 | Others are encoded as U+FFFD.") | |
427 | ||
;; CCL encoder for big-endian UTF-16 with a signature: writes the
;; bytes #xFE #xFF first, then runs the statements of
;; utf-16be-encode-loop.
65a0e5fe | 428 | (define-ccl-program ccl-encode-mule-utf-16be-with-signature
f7c4d755 | 429 | `(2
4fbc4b17 KH |
430 | ((write #xFE) |
431 | (write #xFF) | |
95d2d433 | 432 | ,@utf-16be-encode-loop))
4fbc4b17 KH |
433 | "Encode to UTF-16 (big endian with signature). |
434 | Characters from the charsets ascii, eight-bit-control, | |
435 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
436 | after translation through the translation-table named | |
437 | `utf-translation-table-for-encode'. | |
438 | Others are encoded as U+FFFD.") | |
439 | ||
;; The encoder fragments have been spliced into the CCL programs
;; above, so their variable bindings are removed again.
fc2938d1 | 440 | (makunbound 'utf-16-decode-to-ucs)
65a0e5fe KH |
441 | (makunbound 'utf-16le-encode-loop) |
442 | (makunbound 'utf-16be-encode-loop) | |
4fbc4b17 KH |
443 | |
;; Post-read-conversion function of the `mule-utf-16' coding system.
;; LENGTH is passed through `utf-8-post-read-conversion' first (only
;; when it is positive).  The character after point is then compared
;; with the markers left by `ccl-decode-mule-utf-16':
;;   U+FFFE -> delete it, switch `last-coding-system-used' to the
;;             little-endian with-signature system, LENGTH - 1;
;;   U+FFFF -> delete it, switch to the big-endian with-signature
;;             system, LENGTH - 1;
;;   other  -> `last-coding-system-used' becomes `mule-utf-16be'.
;; Returns the resulting LENGTH.
;; NOTE(review): the fallback branch sets the symbol directly instead
;; of going through `coding-system-change-text-conversion' like the
;; other two branches -- confirm this is intended.
444 | (defun mule-utf-16-post-read-conversion (length) | |
445 | (when (> length 0) | |
95d2d433 | 446 | (setq length (utf-8-post-read-conversion length))
4fbc4b17 KH |
447 | (let ((char (following-char))) |
448 | (cond ((= char (decode-char 'ucs #xFFFE)) | |
449 | (delete-char 1) | |
450 | (setq last-coding-system-used | |
451 | (coding-system-change-text-conversion | |
452 | last-coding-system-used | |
65a0e5fe | 453 | 'mule-utf-16le-with-signature))
4fbc4b17 KH |
454 | (setq length (1- length))) |
455 | ((= char (decode-char 'ucs #xFFFF)) | |
456 | (delete-char 1) | |
457 | (setq last-coding-system-used | |
458 | (coding-system-change-text-conversion | |
459 | last-coding-system-used | |
65a0e5fe | 460 | 'mule-utf-16be-with-signature))
4fbc4b17 KH |
461 | (setq length (1- length))) |
462 | (t | |
65a0e5fe | 463 | (setq last-coding-system-used 'mule-utf-16be)))))
4fbc4b17 | 464 | length)
fc2938d1 | 465 | |
fc2938d1 DL |
;; Define the five UTF-16 coding systems.  `doc' is the documentation
;; text appended to each system's one-line summary; `props' is the
;; property list shared by all of them.
466 | (let ((doc " |
467 | ||
278ce936 KH |
468 | It supports Unicode characters of these ranges: |
469 | U+0000..U+33FF, U+E000..U+FFFF. | |
470 | They correspond to these Emacs character sets: | |
471 | ascii, latin-iso8859-1, mule-unicode-0100-24ff, | |
472 | mule-unicode-2500-33ff, mule-unicode-e000-ffff | |
473 | ||
474 | On decoding (e.g. reading a file), Unicode characters not in the above | |
475 | ranges are decoded as U+FFFD, effectively corrupting the data | |
a1506d29 | 476 | if they are re-encoded.
278ce936 KH |
477 | |
478 | On encoding (e.g. writing a file), Emacs characters not belonging to | |
479 | any of the character sets listed above are encoded into the byte | |
95d2d433 KH |
480 | sequence representing U+FFFD (REPLACEMENT CHARACTER).") |
;; Shared properties.  The CJK charsets are added to safe-charsets
;; only when `utf-translate-cjk-mode' is non-nil at the time this form
;; is evaluated.
481 | (props `((safe-charsets | |
482 | ascii | |
483 | eight-bit-control | |
484 | eight-bit-graphic | |
485 | latin-iso8859-1 | |
486 | mule-unicode-0100-24ff | |
487 | mule-unicode-2500-33ff | |
488 | mule-unicode-e000-ffff | |
489 | ,@(if utf-translate-cjk-mode | |
490 | utf-translate-cjk-charsets)) | |
491 | (valid-codes (0 . 255)) | |
492 | (mime-text-unsuitable . t) | |
493 | (pre-write-conversion . utf-8-pre-write-conversion) | |
494 | (dependency unify-8859-on-encoding-mode | |
495 | unify-8859-on-decoding-mode | |
496 | utf-fragment-on-decoding | |
497 | utf-translate-cjk-mode)))) | |
;; Each system below is created with type 4 and a
;; (DECODER . ENCODER) pair of the CCL programs defined in this file.
fc2938d1 | 498 | (make-coding-system
65a0e5fe | 499 | 'mule-utf-16le 4
fc2938d1 DL |
500 | ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. |
501 | (concat | |
65a0e5fe | 502 | "UTF-16LE encoding for Emacs-supported Unicode characters."
fc2938d1 | 503 | doc)
65a0e5fe | 504 | '(ccl-decode-mule-utf-16le . ccl-encode-mule-utf-16le)
95d2d433 KH |
505 | `(,@props |
506 | (post-read-conversion . utf-8-post-read-conversion) | |
507 | (mime-charset . utf-16le))) | |
fc2938d1 DL |
508 | |
509 | (make-coding-system | |
65a0e5fe | 510 | 'mule-utf-16be 4 ?u
fc2938d1 | 511 | (concat
65a0e5fe | 512 | "UTF-16BE encoding for Emacs-supported Unicode characters."
fc2938d1 DL |
513 | doc) |
514 | ||
65a0e5fe | 515 | '(ccl-decode-mule-utf-16be . ccl-encode-mule-utf-16be)
95d2d433 KH |
516 | `(,@props |
517 | (post-read-conversion . utf-8-post-read-conversion) | |
518 | (mime-charset . utf-16be))) | |
4fbc4b17 KH |
519 | |
520 | (make-coding-system | |
65a0e5fe | 521 | 'mule-utf-16le-with-signature 4 ?u
4fbc4b17 KH |
522 | (concat |
523 | "Little endian UTF-16 (with BOM) for Emacs-supported Unicode characters." | |
524 | doc) | |
525 | ||
65a0e5fe KH |
526 | '(ccl-decode-mule-utf-16le-with-signature |
527 | . ccl-encode-mule-utf-16le-with-signature) | |
95d2d433 KH |
528 | `(,@props |
529 | (post-read-conversion . utf-8-post-read-conversion) | |
4fbc4b17 | 530 | (coding-category . coding-category-utf-16-le)
95d2d433 | 531 | (mime-charset . utf-16)))
4fbc4b17 KH |
532 | |
533 | (make-coding-system | |
65a0e5fe | 534 | 'mule-utf-16be-with-signature 4 ?u
4fbc4b17 KH |
535 | (concat |
536 | "Big endian UTF-16 (with BOM) for Emacs-supported Unicode characters." | |
537 | doc) | |
538 | ||
65a0e5fe KH |
539 | '(ccl-decode-mule-utf-16be-with-signature |
540 | . ccl-encode-mule-utf-16be-with-signature) | |
95d2d433 KH |
541 | `(,@props |
542 | (post-read-conversion . utf-8-post-read-conversion) | |
4fbc4b17 | 543 | (coding-category . coding-category-utf-16-be)
95d2d433 | 544 | (mime-charset . utf-16)))
4fbc4b17 KH |
545 | |
;; `mule-utf-16' decodes with the BOM-detecting program but always
;; encodes with the big-endian with-signature encoder; it is the only
;; system here that uses `mule-utf-16-post-read-conversion'.
546 | (make-coding-system | |
547 | 'mule-utf-16 4 ?u | |
548 | (concat | |
549 | "UTF-16 (with or without BOM) for Emacs-supported Unicode characters." | |
550 | doc) | |
551 | ||
65a0e5fe | 552 | '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16be-with-signature)
95d2d433 KH |
553 | `(,@props |
554 | (post-read-conversion . mule-utf-16-post-read-conversion) | |
fc2938d1 | 555 | (coding-category . coding-category-utf-16-be)
95d2d433 | 556 | (mime-charset . utf-16)))
4fbc4b17 | 557 | )
fc2938d1 | 558 | |
65a0e5fe KH |
;; Short names without the `mule-' prefix for the five coding systems.
559 | (define-coding-system-alias 'utf-16le 'mule-utf-16le) |
560 | (define-coding-system-alias 'utf-16be 'mule-utf-16be) | |
561 | (define-coding-system-alias 'utf-16le-with-signature | |
562 | 'mule-utf-16le-with-signature) | |
563 | (define-coding-system-alias 'utf-16be-with-signature | |
564 | 'mule-utf-16be-with-signature) | |
4fbc4b17 | 565 | (define-coding-system-alias 'utf-16 'mule-utf-16)
fc2938d1 | 566 |
65a0e5fe KH |
567 | ;; For backward compatibility.  Note that the hyphenated `-le'/`-be' |
;; names map to the with-signature systems, not to the plain ones.
568 | (define-coding-system-alias 'mule-utf-16-le 'mule-utf-16le-with-signature) | |
569 | (define-coding-system-alias 'utf-16-le 'mule-utf-16le-with-signature) | |
570 | (define-coding-system-alias 'mule-utf-16-be 'mule-utf-16be-with-signature) | |
571 | (define-coding-system-alias 'utf-16-be 'mule-utf-16be-with-signature) | |
572 | ||
ab5796a9 | 573 | ;;; arch-tag: 85455d46-d9c9-466d-a6f3-c3582a7367c4 |
fc2938d1 | 574 | ;;; utf-16.el ends here |