Commit | Line | Data |
---|---|---|
fc2938d1 DL |
1 | ;;; utf-16.el --- UTF-16 encoding/decoding |
2 | ||
9e24a165 | 3 | ;; Copyright (C) 2001, 2002, 2003, 2004 Free Software Foundation, Inc. |
fc2938d1 DL |
4 | |
5 | ;; Author: Dave Love <fx@gnu.org> | |
6 | ;; Keywords: Unicode, UTF-16, i18n | |
7 | ||
8 | ;; This file is part of GNU Emacs. | |
9 | ||
10 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
11 | ;; it under the terms of the GNU General Public License as published by | |
12 | ;; the Free Software Foundation; either version 2, or (at your option) | |
13 | ;; any later version. | |
14 | ||
15 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
16 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 | ;; GNU General Public License for more details. | |
19 | ||
20 | ;; You should have received a copy of the GNU General Public License | |
21 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
22 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
23 | ;; Boston, MA 02111-1307, USA. | |
24 | ||
25 | ;;; Commentary: | |
26 | ||
27 | ;; Support for UTF-16, which is a two-byte encoding (modulo | |
cbcd4dc9 DL |
28 | ;; surrogates) of Unicode, defined in RFC 2781. It is written either |
29 | ;; in little or big endian order and either with or without the | |
30 | ;; leading BOM (a two-byte signature which identifies their byte sex). | |
65a0e5fe | 31 | ;; |
cbcd4dc9 | 32 | ;; We provide these base coding systems. |
65a0e5fe KH |
33 | ;; name endian BOM |
34 | ;; ---- ------ --- | |
35 | ;; mule-utf-16le little no | |
36 | ;; mule-utf-16be big no | |
37 | ;; mule-utf-16le-with-signature little yes | |
38 | ;; mule-utf-16be-with-signature big yes | |
39 | ;; mule-utf-16 both yes | |
40 | ;; | |
fc2938d1 DL |
41 | ;; Note that un-decodable sequences aren't (yet?) preserved as raw |
42 | ;; bytes, as they are with utf-8, so reading and writing as utf-16 can | |
43 | ;; corrupt data. | |
44 | ||
45 | ;;; Code: | |
46 | ||
47 | ;; We end up with trivially different -le and -be versions of most | |
48 | ;; things below, sometimes with commonality abstracted into a let | |
49 | ;; binding for maintenance convenience. | |
50 | ||
fc2938d1 DL |
51 | ;; Needed in macro expansion, so can't be let-bound. Zapped after use. |
52 | (eval-and-compile | |
53 | (defconst utf-16-decode-ucs | |
95d2d433 KH |
54 | ;; If r5 is negative, r1 is a Unicode character code. Otherwise, r5 is |
55 | ;; the first of a surrogate pair and r1 is the second of the pair. | |
56 | ;; Output is charset ID in r0, code point in r1. R0 may be set to | |
57 | ;; -1 in which case a caller should not write out r1. | |
58 | `((if (r5 >= 0) | |
59 | ((r0 = (r1 < #xDC00)) | |
60 | (if ((r1 >= #xE000) | r0) | |
61 | ;; Invalid second code of surrogate pair. | |
62 | ((r0 = r5) | |
63 | (call ccl-mule-utf-untrans)) | |
64 | ((r1 -= #xDC00) | |
65 | (r1 += (((r5 - #xD800) << 10) + #x10000)))) | |
66 | (r5 = -1))) | |
;; From here on r1 is classified by range to pick the charset ID (r0).
67 | (if (r1 < 128) | |
68 | (r0 = ,(charset-id 'ascii)) | |
69 | ((lookup-integer utf-subst-table-for-decode r1 r3) | |
70 | (if r7 ; got a translation | |
71 | ((r0 = r1) (r1 = r3)) | |
72 | (if (r1 < 160) | |
73 | (r0 = ,(charset-id 'eight-bit-control)) | |
74 | (if (r1 < 256) | |
75 | ((r0 = ,(charset-id 'latin-iso8859-1)) | |
76 | (r1 -= 128)) | |
77 | (if (r1 < #x2500) | |
78 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
79 | (r1 -= #x100) | |
80 | (r2 = (((r1 / 96) + 32) << 7)) | |
81 | (r1 %= 96) | |
82 | (r1 += (r2 + 32))) | |
83 | (if (r1 < #x3400) | |
84 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
85 | (r1 -= #x2500) | |
86 | (r2 = (((r1 / 96) + 32) << 7)) | |
87 | (r1 %= 96) | |
88 | (r1 += (r2 + 32))) | |
89 | (if (r1 < #xD800) | |
90 | ;; We can't have this character. | |
91 | ((r0 = r1) | |
92 | (call ccl-mule-utf-untrans) | |
93 | (r5 = -1) | |
94 | (r0 = -1)) | |
95 | (if (r1 < #xDC00) | |
96 | ;; The first code of a surrogate pair. | |
97 | ((r5 = r1) | |
98 | (r0 = -1)) | |
99 | (if (r1 < #xE000) | |
100 | ;; The second code of a surrogate pair, invalid. | |
101 | ((r0 = r1) | |
102 | (call ccl-mule-utf-untrans) | |
103 | (r5 = -1) | |
104 | (r0 = -1)) | |
105 | (if (r1 < #x10000) | |
106 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
107 | (r1 -= #xE000) | |
108 | (r2 = (((r1 / 96) + 32) << 7)) | |
109 | (r1 %= 96) | |
110 | (r1 += (r2 + 32))) | |
111 | ;; We can't have this character. | |
112 | ((r0 = r1) | |
113 | (call ccl-mule-utf-untrans) | |
114 | (r5 = -1) | |
115 | (r0 = -1))))))))))))))) | |
4fbc4b17 | 116 |
;; Main loop shared by the little-endian decoders.  r5 starts at -1 (no
;; pending surrogate).  Each iteration resets r3 to -1, reads two bytes
;; into r3 and r4, builds the 16-bit unit in r1 from (r4 <8 r3), i.e.
;; the second byte read is the high-order one, runs the decoding code
;; above, and writes the character unless r0 came back negative.
65a0e5fe | 117 | (defconst utf-16le-decode-loop |
95d2d433 KH |
118 | `((r5 = -1) |
119 | (loop | |
120 | (r3 = -1) | |
121 | (read r3 r4) | |
122 | (r1 = (r4 <8 r3)) | |
123 | ,@utf-16-decode-ucs | |
124 | (if (r0 >= 0) | |
125 | ((translate-character utf-translation-table-for-decode r0 r1) | |
126 | (write-multibyte-character r0 r1))) | |
127 | (repeat)))) | |
4fbc4b17 | 128 |
;; Big-endian counterpart of the loop above; the only difference is
;; that r1 is built from (r3 <8 r4), the first byte read being the
;; high-order one.
65a0e5fe | 129 | (defconst utf-16be-decode-loop |
95d2d433 KH |
130 | `((r5 = -1) |
131 | (loop | |
132 | (r3 = -1) | |
133 | (read r3 r4) | |
134 | (r1 = (r3 <8 r4)) | |
135 | ,@utf-16-decode-ucs | |
136 | (if (r0 >= 0) | |
137 | ((translate-character utf-translation-table-for-decode r0 r1) | |
138 | (write-multibyte-character r0 r1))) | |
139 | (repeat)))) | |
4fbc4b17 KH |
140 | |
141 | ) | |
fc2938d1 | 142 | |
;; Decoder for little-endian input without a BOM.  The block after the
;; main loop (presumably the CCL end-of-input block -- confirm against
;; `define-ccl-program') passes a first surrogate still pending in r5
;; to ccl-mule-utf-untrans, and writes a leftover byte in r3 (r3 >= 0)
;; as an eight-bit-control (below #xA0) or eight-bit-graphic character.
;; NOTE(review): ccl-decode-mule-utf-16be hands a leftover r3 to
;; ccl-mule-utf-untrans instead; confirm the difference is intended.
65a0e5fe | 143 | (define-ccl-program ccl-decode-mule-utf-16le |
fc2938d1 | 144 | `(2 ; 2 bytes -> 1 to 4 bytes |
95d2d433 KH |
145 | ,utf-16le-decode-loop |
146 | ((if (r5 >= 0) | |
147 | ((r0 = r5) | |
148 | (call ccl-mule-utf-untrans))) | |
149 | (if (r3 < 0) | |
150 | nil | |
151 | ((if (r3 < #xA0) | |
152 | (r0 = ,(charset-id 'eight-bit-control)) | |
153 | (r0 = ,(charset-id 'eight-bit-graphic))) | |
154 | (write-multibyte-character r0 r3))))) | |
2217b8e1 | 155 | "Decode UTF-16LE (little endian without signature bytes). |
fc2938d1 | 156 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
278ce936 KH |
157 | mule-unicode-*. Un-representable Unicode characters are decoded as |
158 | U+fffd. The result is run through the translation-table named | |
159 | `utf-translation-table-for-decode'.") | |
fc2938d1 | 160 | |
;; Decoder for big-endian input without a BOM.  The block after the
;; main loop (presumably the CCL end-of-input block -- confirm against
;; `define-ccl-program') passes a first surrogate still pending in r5,
;; and then a leftover byte in r3 (r3 >= 0), to ccl-mule-utf-untrans.
65a0e5fe | 161 | (define-ccl-program ccl-decode-mule-utf-16be |
fc2938d1 | 162 | `(2 ; 2 bytes -> 1 to 4 bytes |
95d2d433 KH |
163 | ,utf-16be-decode-loop |
164 | ((if (r5 >= 0) | |
165 | ((r0 = r5) | |
166 | (call ccl-mule-utf-untrans))) | |
167 | (if (r3 >= 0) | |
168 | ((r0 = r3) | |
169 | (call ccl-mule-utf-untrans))))) | |
2217b8e1 | 170 | "Decode UTF-16BE (big endian without signature bytes). |
fc2938d1 DL |
171 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
172 | mule-unicode-*. Un-representable Unicode characters are | |
278ce936 KH |
173 | decoded as U+fffd. The result is run through the translation-table of |
174 | name `utf-translation-table-for-decode'.") | |
fc2938d1 | 175 | |
;; Little-endian decoder for input that starts with a BOM.  The first
;; two bytes are read into r3/r4 and dropped without being examined,
;; then the shared little-endian loop runs.  The trailing block hands a
;; leftover byte in r3 to ccl-mule-utf-untrans.
;; NOTE(review): the docstring names `ccl-decode-utf-16le'; the BOM-less
;; program defined in this file is `ccl-decode-mule-utf-16le'.
;; NOTE(review): unlike that program, the trailing block here does not
;; flush a surrogate pending in r5 -- confirm this is intended.
65a0e5fe | 176 | (define-ccl-program ccl-decode-mule-utf-16le-with-signature |
4fbc4b17 | 177 | `(2 |
95d2d433 KH |
178 | ((r3 = -1) |
179 | (read r3 r4) | |
180 | ,@utf-16le-decode-loop) | |
181 | (if (r3 >= 0) | |
182 | ((r0 = r3) | |
183 | (call ccl-mule-utf-untrans)))) | |
65a0e5fe | 184 | "Like ccl-decode-utf-16le but skip the first 2-byte BOM.") |
4fbc4b17 | 185 | |
;; Big-endian decoder for input that starts with a BOM.  The first two
;; bytes are read into r3/r4 and dropped without being examined, then
;; the shared big-endian loop runs.  The trailing block hands a
;; leftover byte in r3 to ccl-mule-utf-untrans.
;; NOTE(review): the docstring names `ccl-decode-utf-16be'; the BOM-less
;; program defined in this file is `ccl-decode-mule-utf-16be'.
;; NOTE(review): unlike that program, the trailing block here does not
;; flush a surrogate pending in r5 -- confirm this is intended.
65a0e5fe | 186 | (define-ccl-program ccl-decode-mule-utf-16be-with-signature |
4fbc4b17 | 187 | `(2 |
95d2d433 KH |
188 | ((r3 = -1) |
189 | (read r3 r4) | |
190 | ,@utf-16be-decode-loop) | |
191 | (if (r3 >= 0) | |
192 | ((r0 = r3) | |
193 | (call ccl-mule-utf-untrans)))) | |
65a0e5fe | 194 | "Like ccl-decode-utf-16be but skip the first 2-byte BOM.") |
4fbc4b17 KH |
195 | |
;; Decoder that chooses the byte order from the first 16-bit unit.  The
;; unit is assembled big-endian in r1: #xFFFE selects the little-endian
;; loop, anything else the big-endian loop.  A marker character is
;; written for either BOM (#xFFFE as is, #xFFFF in place of #xFEFF) so
;; that `mule-utf-16-post-read-conversion' can tell which one was seen.
196 | (define-ccl-program ccl-decode-mule-utf-16 | |
197 | `(2 | |
95d2d433 KH |
198 | ((r3 = -1) |
199 | (read r3 r4) | |
4fbc4b17 | 200 | (r1 = (r3 <8 r4)) |
95d2d433 | 201 | (r5 = -1) |
4fbc4b17 KH |
202 | (if (r1 == #xFFFE) |
203 | ;; R1 is a BOM for little endian. We keep this character as | |
204 | ;; is temporarily. It is removed by the post-read-conversion | |
205 | ;; function. | |
206 | (,@utf-16-decode-ucs | |
207 | (write-multibyte-character r0 r1) | |
95d2d433 | 208 | ,@utf-16le-decode-loop) |
4fbc4b17 KH |
209 | ((if (r1 == #xFEFF) |
210 | ;; R1 is a BOM for big endian, but we can't keep that | |
211 | ;; character in the output because it can't be | |
212 | ;; distinguished from the normal U+FEFF. So, we keep | |
213 | ;; #xFFFF instead. | |
214 | ((r1 = #xFFFF) | |
95d2d433 KH |
215 | ,@utf-16-decode-ucs |
216 | (write-multibyte-character r0 r1)) | |
217 | ;; R1 is a normal Unicode character. | |
4fbc4b17 | 218 | (,@utf-16-decode-ucs |
95d2d433 KH |
219 | (if (r0 >= 0) |
220 | ((translate-character utf-translation-table-for-decode r0 r1) | |
221 | (write-multibyte-character r0 r1))))) | |
222 | ,@utf-16be-decode-loop))) | |
223 | (if (r3 >= 0) | |
224 | ((r0 = r3) | |
225 | (call ccl-mule-utf-untrans)))) | |
65a0e5fe | 226 | "Like ccl-decode-utf-16be/le but check the first BOM.") |
4fbc4b17 | 227 | |
;; The decode templates were only needed while the CCL programs above
;; were being defined, so unbind them again.
fc2938d1 | 228 | (makunbound 'utf-16-decode-ucs) ; done with it |
65a0e5fe KH |
229 | (makunbound 'utf-16le-decode-loop) |
230 | (makunbound 'utf-16be-decode-loop) | |
fc2938d1 | 231 | |
95d2d433 KH |
232 | ;; UTF-16 decoder generates a UTF-8 sequence represented by a |
233 | ;; sequence of eight-bit-control/graphic chars for an invalid byte (the | |
234 | ;; last byte of an odd length source) and an untranslatable character | |
235 | ;; (including an invalid surrogate-pair code-point). | |
236 | ;; | |
237 | ;; This CCL parses that sequence (the first byte is already in r1), | |
238 | ;; and if the sequence represents an untranslatable character, it sets | |
239 | ;; r1 to the original invalid code or untranslated Unicode character | |
240 | ;; code, sets r2 to -1 (to prevent r2 and r3 from being written), and sets r5 to | |
241 | ;; -1 (to tell the caller that there's no pre-read character). | |
242 | ;; | |
243 | ;; If the sequence represents an invalid byte, it sets r1 to -1, r2 to | |
244 | ;; the byte, sets r3 and r5 to -1. | |
245 | ;; | |
246 | ;; Otherwise, don't change r1, set r2 and r3 to already read | |
247 | ;; eight-bit-control/graphic characters (if any), set r5 and r6 to the | |
248 | ;; last character that invalidates the UTF-8 form. | |
249 | ;; | |
250 | ;; Note: For UTF-8 validation, we only check if a character is | |
251 | ;; eight-bit-control/graphic or not. It may result in incorrect | |
252 | ;; handling of random binary data, but such data can't be encoded by | |
253 | ;; UTF-16 anyway. At least, UTF-16 decoder doesn't generate such a | |
254 | ;; sequence even if a source contains invalid byte-sequence. | |
255 | ||
256 | (define-ccl-program ccl-mule-utf-16-encode-untrans | |
257 | `(0 | |
258 | ((r2 = -1) | |
259 | ;; Read the 2nd byte. | |
260 | (read-multibyte-character r5 r6) | |
261 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
262 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
263 | ((r2 = r1) | |
264 | (r3 = -1) | |
265 | (r1 = -1) | |
266 | (end))) ; invalid UTF-8 | |
267 | ||
268 | (r3 = -1) | |
269 | (r2 = r6) | |
;; NOTE(review): #xE0 is itself a 3-byte UTF-8 lead byte, so `<' may
;; have been intended in the test below -- confirm before changing.
270 | (if (r1 <= #xE0) | |
271 | ;; 2-byte UTF-8, i.e. originally an invalid byte. | |
272 | ((r2 &= #x3F) | |
273 | (r2 |= ((r1 & #x1F) << 6)) | |
274 | (r1 = -1) | |
275 | (r5 = -1) | |
276 | (end))) | |
277 | ||
278 | ;; Read the 3rd byte. | |
279 | (read-multibyte-character r5 r6) | |
280 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
281 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
282 | ((end))) ; invalid UTF-8 | |
283 | ||
284 | (if (r1 < #xF0) ; valid 3-byte UTF-8 | |
285 | ((r1 = ((r1 & #x0F) << 12)) | |
286 | (r1 |= ((r2 & #x3F) << 6)) | |
287 | (r1 |= (r6 & #x3F)) | |
288 | (r2 = -1) | |
289 | (r5 = -1) | |
290 | (end))) | |
291 | ||
292 | (r3 = r6) | |
293 | ;; Read the 4th byte. | |
294 | (read-multibyte-character r5 r6) | |
295 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
296 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
297 | (end)) ; invalid UTF-8 | |
298 | ||
299 | ;; valid 4-byte UTF-8 | |
300 | (r1 = ((r1 & #x07) << 18)) | |
301 | (r1 |= ((r2 & #x3F) << 12)) | |
302 | (r1 |= ((r3 & #x3F) << 6)) | |
303 | (r1 |= (r6 & #x3F)) | |
304 | (r2 = -1) | |
305 | (r5 = -1) | |
306 | (end)) | |
307 | ||
;; Trailing block (presumably run when input ends in the middle of a
;; sequence -- confirm against `define-ccl-program'): write r1, then r2,
;; then r3, stopping at the first one that is negative.
308 | (if (r1 >= 0) | |
309 | ((write r1) | |
310 | (if (r2 >= 0) | |
311 | ((write r2) | |
312 | (if (r3 >= 0) | |
313 | (write r3)))))))) | |
314 | ||
fc2938d1 DL |
315 | (eval-and-compile |
316 | (defconst utf-16-decode-to-ucs | |
95d2d433 KH |
317 | ;; Read a character and set r1 to the corresponding Unicode code. |
318 | ;; If r5 is not negative, it means that we have already read a | |
319 | ;; character into r5 and r6. | |
320 | ;; If an invalid eight-bit-control/graphic sequence is found, r2 and | |
321 | ;; r3 may contain a byte to be written out, r5 and r6 may contain a | |
322 | ;; pre-read character. Usually they are set to -1. | |
323 | `((if (r5 < 0) | |
324 | (read-multibyte-character r0 r1) | |
325 | ((r0 = r5) | |
326 | (r1 = r6) | |
327 | (r5 = -1))) | |
328 | (lookup-character utf-subst-table-for-encode r0 r1) | |
329 | (r2 = -1) | |
330 | (if (r7 > 0) | |
331 | (r1 = r0) | |
332 | ((translate-character utf-translation-table-for-encode r0 r1) | |
333 | (if (r0 == ,(charset-id 'ascii)) | |
334 | nil | |
335 | (if (r0 == ,(charset-id 'latin-iso8859-1)) | |
336 | (r1 += 128) | |
337 | (if (r0 == ,(charset-id 'eight-bit-control)) | |
338 | nil | |
339 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
340 | (call ccl-mule-utf-16-encode-untrans) | |
341 | ((r2 = ((r1 & #x7f) - 32)) | |
342 | (r3 = ((((r1 >> 7) - 32) * 96) + r2)) | |
343 | (r2 = -1) | |
344 | (r5 = -1) | |
345 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | |
346 | (r1 = (r3 + #x100)) | |
347 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
348 | (r1 = (r3 + #x2500)) | |
349 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
350 | (r1 = (r3 + #xe000)) | |
351 | (r1 = #xfffd))))))))))))) | |
4fbc4b17 | 352 |
;; Main loop shared by the little-endian encoders.  A code in r1 at or
;; above #x10000 is split into a surrogate pair: the first surrogate is
;; written at once and r1 is replaced by the second.  r1 is then written
;; low byte first (when non-negative), followed by any raw bytes left
;; in r2 and r3.
65a0e5fe | 353 | (defconst utf-16le-encode-loop |
95d2d433 KH |
354 | `((r5 = -1) |
355 | (loop | |
356 | ,@utf-16-decode-to-ucs | |
357 | (if (r1 >= #x10000) | |
358 | ((r1 -= #x10000) | |
359 | (r0 = ((r1 >> 10) + #xD800)) | |
360 | (write (r0 & 255)) | |
361 | (write (r0 >> 8)) | |
362 | (r1 = ((r1 & #x3FF) + #xDC00)))) | |
363 | (if (r1 >= 0) | |
364 | ((write (r1 & 255)) | |
365 | (write (r1 >> 8)))) | |
366 | (if (r2 >= 0) | |
367 | ((write r2) | |
368 | (if (r3 >= 0) | |
369 | (write r3)))) | |
370 | (repeat)))) | |
4fbc4b17 | 371 |
;; Big-endian counterpart of the loop above; every 16-bit unit is
;; written high byte first.
65a0e5fe | 372 | (defconst utf-16be-encode-loop |
95d2d433 KH |
373 | `((r5 = -1) |
374 | (loop | |
375 | ,@utf-16-decode-to-ucs | |
376 | (if (r1 >= #x10000) | |
377 | ((r1 -= #x10000) | |
378 | (r0 = ((r1 >> 10) + #xD800)) | |
379 | (write (r0 >> 8)) | |
380 | (write (r0 & 255)) | |
381 | (r1 = ((r1 & #x3FF) + #xDC00)))) | |
382 | (if (r1 >= 0) | |
383 | ((write (r1 >> 8)) | |
384 | (write (r1 & 255)))) | |
385 | (if (r2 >= 0) | |
386 | ((write r2) | |
387 | (if (r3 >= 0) | |
388 | (write r3)))) | |
389 | (repeat)))) | |
4fbc4b17 | 390 | ) |
fc2938d1 | 391 | |
65a0e5fe KH |
392 | |
;; Little-endian encoder without a BOM: the program body is just the
;; shared little-endian encode loop.
393 | (define-ccl-program ccl-encode-mule-utf-16le | |
dbaba2d2 | 394 | `(2 |
65a0e5fe | 395 | ,utf-16le-encode-loop) |
2217b8e1 | 396 | "Encode to UTF-16LE (little endian without signature). |
fc2938d1 DL |
397 | Characters from the charsets ascii, eight-bit-control, |
398 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
278ce936 KH |
399 | after translation through the translation-table of name |
400 | `utf-translation-table-for-encode'. | |
fc2938d1 DL |
401 | Others are encoded as U+FFFD.") |
402 | ||
;; Big-endian encoder without a BOM: the program body is just the
;; shared big-endian encode loop.
65a0e5fe | 403 | (define-ccl-program ccl-encode-mule-utf-16be |
dbaba2d2 | 404 | `(2 |
65a0e5fe | 405 | ,utf-16be-encode-loop) |
2217b8e1 | 406 | "Encode to UTF-16BE (big endian without signature). |
fc2938d1 DL |
407 | Characters from the charsets ascii, eight-bit-control, |
408 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
278ce936 KH |
409 | after translation through the translation-table named |
410 | `utf-translation-table-for-encode'. | |
fc2938d1 DL |
411 | Others are encoded as U+FFFD.") |
412 | ||
;; Little-endian encoder with a BOM: writes the two signature bytes
;; #xFF #xFE, then runs the shared little-endian encode loop.
65a0e5fe | 413 | (define-ccl-program ccl-encode-mule-utf-16le-with-signature |
f7c4d755 | 414 | `(2 |
4fbc4b17 KH |
415 | ((write #xFF) |
416 | (write #xFE) | |
95d2d433 | 417 | ,@utf-16le-encode-loop)) |
4fbc4b17 KH |
418 | "Encode to UTF-16 (little endian with signature). |
419 | Characters from the charsets ascii, eight-bit-control, | |
420 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
421 | after translation through the translation-table of name | |
422 | `utf-translation-table-for-encode'. | |
423 | Others are encoded as U+FFFD.") | |
424 | ||
;; Big-endian encoder with a BOM: writes the two signature bytes
;; #xFE #xFF, then runs the shared big-endian encode loop.
65a0e5fe | 425 | (define-ccl-program ccl-encode-mule-utf-16be-with-signature |
f7c4d755 | 426 | `(2 |
4fbc4b17 KH |
427 | ((write #xFE) |
428 | (write #xFF) | |
95d2d433 | 429 | ,@utf-16be-encode-loop)) |
4fbc4b17 KH |
430 | "Encode to UTF-16 (big endian with signature). |
431 | Characters from the charsets ascii, eight-bit-control, | |
432 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
433 | after translation through the translation-table named | |
434 | `utf-translation-table-for-encode'. | |
435 | Others are encoded as U+FFFD.") | |
436 | ||
;; The encode templates are no longer needed once the CCL programs
;; above have been defined, so unbind them.
fc2938d1 | 437 | (makunbound 'utf-16-decode-to-ucs) |
65a0e5fe KH |
438 | (makunbound 'utf-16le-encode-loop) |
439 | (makunbound 'utf-16be-encode-loop) | |
4fbc4b17 KH |
440 | |
;; post-read-conversion function of the `mule-utf-16' coding system.
;; When LENGTH is positive it first runs `utf-8-post-read-conversion',
;; then looks at the character after point.  U+FFFE (left by the decoder
;; for a little-endian BOM) or U+FFFF (left in place of a big-endian
;; BOM) is deleted, LENGTH is decremented, and `last-coding-system-used'
;; is changed to the matching -with-signature coding system.  With no
;; marker, `last-coding-system-used' is set to `mule-utf-16be'.
;; Returns the resulting LENGTH.
441 | (defun mule-utf-16-post-read-conversion (length) | |
442 | (when (> length 0) | |
95d2d433 | 443 | (setq length (utf-8-post-read-conversion length)) |
4fbc4b17 KH |
444 | (let ((char (following-char))) |
445 | (cond ((= char (decode-char 'ucs #xFFFE)) | |
446 | (delete-char 1) | |
447 | (setq last-coding-system-used | |
448 | (coding-system-change-text-conversion | |
449 | last-coding-system-used | |
65a0e5fe | 450 | 'mule-utf-16le-with-signature)) |
4fbc4b17 KH |
451 | (setq length (1- length))) |
452 | ((= char (decode-char 'ucs #xFFFF)) | |
453 | (delete-char 1) | |
454 | (setq last-coding-system-used | |
455 | (coding-system-change-text-conversion | |
456 | last-coding-system-used | |
65a0e5fe | 457 | 'mule-utf-16be-with-signature)) |
4fbc4b17 KH |
458 | (setq length (1- length))) |
459 | (t | |
65a0e5fe | 460 | (setq last-coding-system-used 'mule-utf-16be))))) |
4fbc4b17 | 461 | length) |
fc2938d1 | 462 | |
fc2938d1 DL |
;; Define the five UTF-16 coding systems.  DOC is the text appended to
;; each one-line description and PROPS is the property list they all
;; share.  Every definition adds its own post-read-conversion and
;; mime-charset; the two -with-signature systems and `mule-utf-16' also
;; set a coding-category.  `mule-utf-16' decodes with the BOM-detecting
;; program and encodes as big endian with signature.
463 | (let ((doc " |
464 | ||
278ce936 KH |
465 | It supports Unicode characters of these ranges: |
466 | U+0000..U+33FF, U+E000..U+FFFF. | |
467 | They correspond to these Emacs character sets: | |
468 | ascii, latin-iso8859-1, mule-unicode-0100-24ff, | |
469 | mule-unicode-2500-33ff, mule-unicode-e000-ffff | |
470 | ||
471 | On decoding (e.g. reading a file), Unicode characters not in the above | |
472 | ranges are decoded as U+FFFD, effectively corrupting the data | |
a1506d29 | 473 | if they are re-encoded. |
278ce936 KH |
474 | |
475 | On encoding (e.g. writing a file), Emacs characters not belonging to | |
476 | any of the character sets listed above are encoded into the byte | |
95d2d433 KH |
477 | sequence representing U+FFFD (REPLACEMENT CHARACTER).") |
478 | (props `((safe-charsets | |
479 | ascii | |
480 | eight-bit-control | |
481 | eight-bit-graphic | |
482 | latin-iso8859-1 | |
483 | mule-unicode-0100-24ff | |
484 | mule-unicode-2500-33ff | |
485 | mule-unicode-e000-ffff | |
486 | ,@(if utf-translate-cjk-mode | |
487 | utf-translate-cjk-charsets)) | |
488 | (valid-codes (0 . 255)) | |
489 | (mime-text-unsuitable . t) | |
490 | (pre-write-conversion . utf-8-pre-write-conversion) | |
491 | (dependency unify-8859-on-encoding-mode | |
492 | unify-8859-on-decoding-mode | |
493 | utf-fragment-on-decoding | |
494 | utf-translate-cjk-mode)))) | |
fc2938d1 | 495 | (make-coding-system |
65a0e5fe | 496 | 'mule-utf-16le 4 |
fc2938d1 DL |
497 | ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. |
498 | (concat | |
65a0e5fe | 499 | "UTF-16LE encoding for Emacs-supported Unicode characters." |
fc2938d1 | 500 | doc) |
65a0e5fe | 501 | '(ccl-decode-mule-utf-16le . ccl-encode-mule-utf-16le) |
95d2d433 KH |
502 | `(,@props |
503 | (post-read-conversion . utf-8-post-read-conversion) | |
504 | (mime-charset . utf-16le))) | |
fc2938d1 DL |
505 | |
506 | (make-coding-system | |
65a0e5fe | 507 | 'mule-utf-16be 4 ?u |
fc2938d1 | 508 | (concat |
65a0e5fe | 509 | "UTF-16BE encoding for Emacs-supported Unicode characters." |
fc2938d1 DL |
510 | doc) |
511 | ||
65a0e5fe | 512 | '(ccl-decode-mule-utf-16be . ccl-encode-mule-utf-16be) |
95d2d433 KH |
513 | `(,@props |
514 | (post-read-conversion . utf-8-post-read-conversion) | |
515 | (mime-charset . utf-16be))) | |
4fbc4b17 KH |
516 | |
517 | (make-coding-system | |
65a0e5fe | 518 | 'mule-utf-16le-with-signature 4 ?u |
4fbc4b17 KH |
519 | (concat |
520 | "Little endian UTF-16 (with BOM) for Emacs-supported Unicode characters." | |
521 | doc) | |
522 | ||
65a0e5fe KH |
523 | '(ccl-decode-mule-utf-16le-with-signature |
524 | . ccl-encode-mule-utf-16le-with-signature) | |
95d2d433 KH |
525 | `(,@props |
526 | (post-read-conversion . utf-8-post-read-conversion) | |
4fbc4b17 | 527 | (coding-category . coding-category-utf-16-le) |
95d2d433 | 528 | (mime-charset . utf-16))) |
4fbc4b17 KH |
529 | |
530 | (make-coding-system | |
65a0e5fe | 531 | 'mule-utf-16be-with-signature 4 ?u |
4fbc4b17 KH |
532 | (concat |
533 | "Big endian UTF-16 (with BOM) for Emacs-supported Unicode characters." | |
534 | doc) | |
535 | ||
65a0e5fe KH |
536 | '(ccl-decode-mule-utf-16be-with-signature |
537 | . ccl-encode-mule-utf-16be-with-signature) | |
95d2d433 KH |
538 | `(,@props |
539 | (post-read-conversion . utf-8-post-read-conversion) | |
4fbc4b17 | 540 | (coding-category . coding-category-utf-16-be) |
95d2d433 | 541 | (mime-charset . utf-16))) |
4fbc4b17 KH |
542 | |
543 | (make-coding-system | |
544 | 'mule-utf-16 4 ?u | |
545 | (concat | |
546 | "UTF-16 (with or without BOM) for Emacs-supported Unicode characters." | |
547 | doc) | |
548 | ||
65a0e5fe | 549 | '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16be-with-signature) |
95d2d433 KH |
550 | `(,@props |
551 | (post-read-conversion . mule-utf-16-post-read-conversion) | |
fc2938d1 | 552 | (coding-category . coding-category-utf-16-be) |
95d2d433 | 553 | (mime-charset . utf-16))) |
4fbc4b17 | 554 | ) |
fc2938d1 | 555 | |
65a0e5fe KH |
;; Aliases without the `mule-' prefix for the five coding systems.
556 | (define-coding-system-alias 'utf-16le 'mule-utf-16le) |
557 | (define-coding-system-alias 'utf-16be 'mule-utf-16be) | |
558 | (define-coding-system-alias 'utf-16le-with-signature | |
559 | 'mule-utf-16le-with-signature) | |
560 | (define-coding-system-alias 'utf-16be-with-signature | |
561 | 'mule-utf-16be-with-signature) | |
4fbc4b17 | 562 | (define-coding-system-alias 'utf-16 'mule-utf-16) |
fc2938d1 | 563 |
65a0e5fe KH |
564 | ;; For backward compatibility. |
;; The hyphenated -le/-be names all point at the -with-signature systems.
565 | (define-coding-system-alias 'mule-utf-16-le 'mule-utf-16le-with-signature) | |
566 | (define-coding-system-alias 'utf-16-le 'mule-utf-16le-with-signature) | |
567 | (define-coding-system-alias 'mule-utf-16-be 'mule-utf-16be-with-signature) | |
568 | (define-coding-system-alias 'utf-16-be 'mule-utf-16be-with-signature) | |
569 | ||
ab5796a9 | 570 | ;;; arch-tag: 85455d46-d9c9-466d-a6f3-c3582a7367c4 |
fc2938d1 | 571 | ;;; utf-16.el ends here |