Commit | Line | Data |
---|---|---|
fc2938d1 DL |
1 | ;;; utf-16.el --- UTF-16 encoding/decoding |
2 | ||
d7a0267c | 3 | ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 |
d4877ac1 | 4 | ;; Free Software Foundation, Inc. |
d7a0267c | 5 | ;; Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 |
2fd125a3 KH |
6 | ;; National Institute of Advanced Industrial Science and Technology (AIST) |
7 | ;; Registration Number H14PRO021 | |
fc2938d1 DL |
8 | |
9 | ;; Author: Dave Love <fx@gnu.org> | |
10 | ;; Keywords: Unicode, UTF-16, i18n | |
11 | ||
12 | ;; This file is part of GNU Emacs. | |
13 | ||
14 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
15 | ;; it under the terms of the GNU General Public License as published by | |
d7142f3e | 16 | ;; the Free Software Foundation; either version 3, or (at your option) |
fc2938d1 DL |
17 | ;; any later version. |
18 | ||
19 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
20 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
22 | ;; GNU General Public License for more details. | |
23 | ||
24 | ;; You should have received a copy of the GNU General Public License | |
25 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
3a35cf56 LK |
26 | ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
27 | ;; Boston, MA 02110-1301, USA. | |
fc2938d1 DL |
28 | |
29 | ;;; Commentary: | |
30 | ||
31 | ;; Support for UTF-16, which is a two-byte encoding (modulo | |
cbcd4dc9 DL |
32 | ;; surrogates) of Unicode, defined in RFC 2781. It is written either |
33 | ;; in little or big endian order and either with or without the | |
34 | ;; leading BOM (a two-byte signature which identifies the byte order). | |
65a0e5fe | 35 | ;; |
cbcd4dc9 | 36 | ;; We provide these base coding systems. |
65a0e5fe KH |
37 | ;; name endian BOM |
38 | ;; ---- ------ --- | |
39 | ;; mule-utf-16le little no | |
40 | ;; mule-utf-16be big no | |
41 | ;; mule-utf-16le-with-signature little yes | |
42 | ;; mule-utf-16be-with-signature big yes | |
43 | ;; mule-utf-16 both yes | |
44 | ;; | |
fc2938d1 DL |
45 | ;; Note that un-decodable sequences aren't (yet?) preserved as raw |
46 | ;; bytes, as they are with utf-8, so reading and writing as utf-16 can | |
47 | ;; corrupt data. | |
48 | ||
49 | ;;; Code: | |
50 | ||
51 | ;; We end up with trivially different -le and -be versions of most | |
52 | ;; things below, sometimes with commonality abstracted into a let | |
53 | ;; binding for maintenance convenience. | |
54 | ||
fc2938d1 DL |
55 | ;; Needed in macro expansion, so can't be let-bound. Zapped after use. |
56 | (eval-and-compile | |
57 | (defconst utf-16-decode-ucs | |
95d2d433 KH |
58 | ;; If r5 is negative, r1 is a Unicode character code. Otherwise, r5 is |
59 | ;; the first of a surrogate pair and r1 is the second of the pair. | |
60 | ;; Output is charset ID in r0, code point in r1. R0 may be set to | |
61 | ;; -1 in which case a caller should not write out r1. | |
62 | `((if (r5 >= 0) | |
63 | ((r0 = (r1 < #xDC00)) ; r0 is nonzero when r1 lies below #xDC00 | |
64 | (if ((r1 >= #xE000) | r0) | |
65 | ;; Invalid second code of surrogate pair. | |
66 | ((r0 = r5) | |
67 | (call ccl-mule-utf-untrans)) | |
68 | ((r1 -= #xDC00) | |
69 | (r1 += (((r5 - #xD800) << 10) + #x10000)))) ; combine both halves into a code point >= #x10000 | |
70 | (r5 = -1))) ; the pending first surrogate has been dealt with | |
71 | (if (r1 < 128) | |
72 | (r0 = ,(charset-id 'ascii)) | |
73 | ((lookup-integer utf-subst-table-for-decode r1 r3) | |
74 | (if r7 ; got a translation | |
75 | ((r0 = r1) (r1 = r3)) | |
76 | (if (r1 < 160) | |
77 | (r0 = ,(charset-id 'eight-bit-control)) | |
78 | (if (r1 < 256) | |
79 | ((r0 = ,(charset-id 'latin-iso8859-1)) | |
80 | (r1 -= 128)) | |
81 | (if (r1 < #x2500) | |
82 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
83 | (r1 -= #x100) | |
84 | (r2 = (((r1 / 96) + 32) << 7)) | |
85 | (r1 %= 96) | |
86 | (r1 += (r2 + 32))) ; r1 = ((q + 32) << 7) + (rem + 32), with q, rem = offset div/mod 96 | |
87 | (if (r1 < #x3400) | |
88 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
89 | (r1 -= #x2500) | |
90 | (r2 = (((r1 / 96) + 32) << 7)) | |
91 | (r1 %= 96) | |
92 | (r1 += (r2 + 32))) | |
93 | (if (r1 < #xD800) | |
94 | ;; We can't have this character. | |
95 | ((r0 = r1) | |
96 | (call ccl-mule-utf-untrans) | |
97 | (r5 = -1) | |
98 | (r0 = -1)) | |
99 | (if (r1 < #xDC00) | |
100 | ;; The first code of a surrogate pair. | |
101 | ((r5 = r1) | |
102 | (r0 = -1)) | |
103 | (if (r1 < #xE000) | |
104 | ;; The second code of a surrogate pair, invalid. | |
105 | ((r0 = r1) | |
106 | (call ccl-mule-utf-untrans) | |
107 | (r5 = -1) | |
108 | (r0 = -1)) | |
109 | (if (r1 < #x10000) | |
110 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
111 | (r1 -= #xE000) | |
112 | (r2 = (((r1 / 96) + 32) << 7)) | |
113 | (r1 %= 96) | |
114 | (r1 += (r2 + 32))) | |
115 | ;; We can't have this character. | |
116 | ((r0 = r1) | |
117 | (call ccl-mule-utf-untrans) | |
118 | (r5 = -1) | |
119 | (r0 = -1))))))))))))))) | |
4fbc4b17 | 120 | |
65a0e5fe | 121 | (defconst utf-16le-decode-loop |
95d2d433 KH |
122 | `((r5 = -1) ; no first surrogate is pending yet |
123 | (loop | |
124 | (r3 = -1) ; sentinel; the CCL programs' end-of-input code tests r3 >= 0 | |
125 | (read r3 r4) | |
126 | (r1 = (r4 <8 r3)) ; r1 = (r4 << 8) | r3: the second byte is the high octet | |
127 | ,@utf-16-decode-ucs | |
128 | (if (r0 >= 0) ; r0 < 0 means utf-16-decode-ucs asked us not to write r1 | |
129 | ((translate-character utf-translation-table-for-decode r0 r1) | |
130 | (write-multibyte-character r0 r1))) | |
131 | (repeat)))) | |
4fbc4b17 | 132 | |
65a0e5fe | 133 | (defconst utf-16be-decode-loop |
95d2d433 KH |
134 | `((r5 = -1) ; no first surrogate is pending yet |
135 | (loop | |
136 | (r3 = -1) ; sentinel; the CCL programs' end-of-input code tests r3 >= 0 | |
137 | (read r3 r4) | |
138 | (r1 = (r3 <8 r4)) ; r1 = (r3 << 8) | r4: the first byte is the high octet | |
139 | ,@utf-16-decode-ucs | |
140 | (if (r0 >= 0) ; r0 < 0 means utf-16-decode-ucs asked us not to write r1 | |
141 | ((translate-character utf-translation-table-for-decode r0 r1) | |
142 | (write-multibyte-character r0 r1))) | |
143 | (repeat)))) | |
4fbc4b17 KH |
144 | |
145 | ) | |
fc2938d1 | 146 | |
65a0e5fe | 147 | (define-ccl-program ccl-decode-mule-utf-16le |
fc2938d1 | 148 | `(2 ; 2 bytes -> 1 to 4 bytes |
95d2d433 KH |
149 | ,utf-16le-decode-loop |
150 | ((if (r5 >= 0) ; end-of-input code: hand a still-pending first surrogate to ccl-mule-utf-untrans | |
151 | ((r0 = r5) | |
152 | (call ccl-mule-utf-untrans))) | |
153 | (if (r3 < 0) ; r3 >= 0 presumably means a trailing unpaired byte was read | |
154 | nil | |
155 | ((if (r3 < #xA0) ; NOTE(review): the -be variant passes this byte to ccl-mule-utf-untrans instead -- confirm the difference is intended | |
156 | (r0 = ,(charset-id 'eight-bit-control)) | |
157 | (r0 = ,(charset-id 'eight-bit-graphic))) | |
158 | (write-multibyte-character r0 r3))))) | |
2217b8e1 | 159 | "Decode UTF-16LE (little endian without signature bytes). |
fc2938d1 | 160 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
278ce936 KH |
161 | mule-unicode-*. Un-representable Unicode characters are decoded as |
162 | U+fffd. The result is run through the translation-table named | |
163 | `utf-translation-table-for-decode'.") | |
fc2938d1 | 164 | |
65a0e5fe | 165 | (define-ccl-program ccl-decode-mule-utf-16be |
fc2938d1 | 166 | `(2 ; 2 bytes -> 1 to 4 bytes |
95d2d433 KH |
167 | ,utf-16be-decode-loop |
168 | ((if (r5 >= 0) ; end-of-input code: hand a still-pending first surrogate to ccl-mule-utf-untrans | |
169 | ((r0 = r5) | |
170 | (call ccl-mule-utf-untrans))) | |
171 | (if (r3 >= 0) ; r3 >= 0 presumably means a trailing unpaired byte was read | |
172 | ((r0 = r3) | |
173 | (call ccl-mule-utf-untrans))))) | |
2217b8e1 | 174 | "Decode UTF-16BE (big endian without signature bytes). |
fc2938d1 DL |
175 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
176 | mule-unicode-*. Un-representable Unicode characters are | |
278ce936 KH |
177 | decoded as U+fffd. The result is run through the translation-table of |
178 | name `utf-translation-table-for-decode'.") | |
fc2938d1 | 179 | |
65a0e5fe | 180 | (define-ccl-program ccl-decode-mule-utf-16le-with-signature |
4fbc4b17 | 181 | `(2 |
95d2d433 KH |
182 | ((r3 = -1) |
183 | (read r3 r4) ; read the first two bytes (the BOM) without checking them; the loop overwrites r3 and r4 | |
184 | ,@utf-16le-decode-loop) | |
185 | (if (r3 >= 0) ; NOTE(review): unlike ccl-decode-mule-utf-16le, a pending first surrogate in r5 is not flushed here -- confirm | |
186 | ((r0 = r3) | |
187 | (call ccl-mule-utf-untrans)))) | |
65a0e5fe | 188 | "Like `ccl-decode-mule-utf-16le' but skip the first 2-byte BOM.") |
4fbc4b17 | 189 | |
65a0e5fe | 190 | (define-ccl-program ccl-decode-mule-utf-16be-with-signature |
4fbc4b17 | 191 | `(2 |
95d2d433 KH |
192 | ((r3 = -1) |
193 | (read r3 r4) ; read the first two bytes (the BOM) without checking them; the loop overwrites r3 and r4 | |
194 | ,@utf-16be-decode-loop) | |
195 | (if (r3 >= 0) ; NOTE(review): unlike ccl-decode-mule-utf-16be, a pending first surrogate in r5 is not flushed here -- confirm | |
196 | ((r0 = r3) | |
197 | (call ccl-mule-utf-untrans)))) | |
65a0e5fe | 198 | "Like `ccl-decode-mule-utf-16be' but skip the first 2-byte BOM.") |
4fbc4b17 KH |
199 | |
200 | (define-ccl-program ccl-decode-mule-utf-16 | |
201 | `(2 | |
95d2d433 KH |
202 | ((r3 = -1) |
203 | (read r3 r4) | |
4fbc4b17 | 204 | (r1 = (r3 <8 r4)) ; read the first unit as big endian, so a little-endian BOM shows up as #xFFFE |
95d2d433 | 205 | (r5 = -1) |
4fbc4b17 KH |
206 | (if (r1 == #xFFFE) |
207 | ;; R1 is a BOM for little endian. We keep this character as | |
208 | ;; is temporarily. It is removed by post-read-conversion | |
209 | ;; function. | |
210 | (,@utf-16-decode-ucs | |
211 | (write-multibyte-character r0 r1) | |
95d2d433 | 212 | ,@utf-16le-decode-loop) |
4fbc4b17 KH |
213 | ((if (r1 == #xFEFF) |
214 | ;; R1 is a BOM for big endian, but we can't keep that | |
215 | ;; character in the output because it can't be | |
216 | ;; distinguished from the normal U+FEFF. So, we keep | |
217 | ;; #xFFFF instead. | |
218 | ((r1 = #xFFFF) | |
95d2d433 KH |
219 | ,@utf-16-decode-ucs |
220 | (write-multibyte-character r0 r1)) | |
221 | ;; R1 is a normal Unicode character. | |
4fbc4b17 | 222 | (,@utf-16-decode-ucs |
95d2d433 KH |
223 | (if (r0 >= 0) |
224 | ((translate-character utf-translation-table-for-decode r0 r1) | |
225 | (write-multibyte-character r0 r1))))) | |
226 | ,@utf-16be-decode-loop))) ; no little-endian BOM: the rest of the input is decoded as big endian | |
227 | (if (r3 >= 0) | |
228 | ((r0 = r3) | |
229 | (call ccl-mule-utf-untrans)))) | |
65a0e5fe | 230 | "Like `ccl-decode-mule-utf-16be'/`ccl-decode-mule-utf-16le' but check the first BOM.") |
4fbc4b17 | 231 | |
fc2938d1 | 232 | (makunbound 'utf-16-decode-ucs) ; done with it |
65a0e5fe KH |
233 | (makunbound 'utf-16le-decode-loop) ; the loops were only spliced into the CCL programs above |
234 | (makunbound 'utf-16be-decode-loop) | |
fc2938d1 | 235 | |
95d2d433 KH |
236 | ;; The UTF-16 decoder generates a UTF-8 sequence represented by a |
237 | ;; sequence of eight-bit-control/graphic chars for an invalid byte (the | |
238 | ;; last byte of an odd length source) and an untranslatable character | |
239 | ;; (including an invalid surrogate-pair code-point). | |
240 | ;; | |
241 | ;; This CCL parses that sequence (the first byte is already in r1), | |
242 | ;; and if the sequence represents an untranslatable character, it sets | |
243 | ;; r1 to the original invalid code or untranslated Unicode character | |
244 | ;; code, sets r2 to -1 (to prevent r2 and r3 from being written), and sets r5 to | |
245 | ;; -1 (to tell the caller that there's no pre-read character). | |
246 | ;; | |
247 | ;; If the sequence represents an invalid byte, it sets r1 to -1, r2 to | |
248 | ;; the byte, sets r3 and r5 to -1. | |
249 | ;; | |
250 | ;; Otherwise, don't change r1, set r2 and r3 to already read | |
251 | ;; eight-bit-control/graphic characters (if any), set r5 and r6 to the | |
252 | ;; last character that invalidates the UTF-8 form. | |
253 | ;; | |
254 | ;; Note: For UTF-8 validation, we only check if a character is | |
255 | ;; eight-bit-control/graphic or not. It may result in incorrect | |
256 | ;; handling of random binary data, but such data can't be encoded by | |
257 | ;; UTF-16 anyway. At least, the UTF-16 decoder doesn't generate such a | |
258 | ;; sequence even if a source contains invalid byte-sequence. | |
259 | ||
260 | (define-ccl-program ccl-mule-utf-16-encode-untrans | |
261 | `(0 | |
262 | ((r2 = -1) | |
263 | ;; Read the 2nd byte. | |
264 | (read-multibyte-character r5 r6) | |
265 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
266 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
267 | ((r2 = r1) | |
268 | (r3 = -1) | |
269 | (r1 = -1) | |
270 | (end))) ; invalid UTF-8 | |
271 | ||
272 | (r3 = -1) | |
273 | (r2 = r6) | |
274 | (if (r1 < #xE0) ; strictly below: #xE0 itself is the first 3-byte UTF-8 lead byte | |
275 | ;; 2-byte UTF-8, i.e. originally an invalid byte. | |
276 | ((r2 &= #x3F) | |
277 | (r2 |= ((r1 & #x1F) << 6)) | |
278 | (r1 = -1) | |
279 | (r5 = -1) | |
280 | (end))) | |
281 | ||
282 | ;; Read the 3rd byte. | |
283 | (read-multibyte-character r5 r6) | |
284 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
285 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
286 | ((end))) ; invalid UTF-8 | |
287 | ||
288 | (if (r1 < #xF0) ; valid 3-byte UTF-8 | |
289 | ((r1 = ((r1 & #x0F) << 12)) | |
290 | (r1 |= ((r2 & #x3F) << 6)) | |
291 | (r1 |= (r6 & #x3F)) | |
292 | (r2 = -1) | |
293 | (r5 = -1) | |
294 | (end))) | |
295 | ||
296 | (r3 = r6) | |
297 | ;; Read the 4th byte. | |
298 | (read-multibyte-character r5 r6) | |
299 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
300 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
301 | (end)) ; invalid UTF-8 | |
302 | ||
303 | ;; valid 4-byte UTF-8 | |
304 | (r1 = ((r1 & #x07) << 18)) | |
305 | (r1 |= ((r2 & #x3F) << 12)) | |
306 | (r1 |= ((r3 & #x3F) << 6)) | |
307 | (r1 |= (r6 & #x3F)) | |
308 | (r2 = -1) | |
309 | (r5 = -1) | |
310 | (end)) | |
311 | ||
312 | (if (r1 >= 0) ; end-of-input code: write out whatever bytes were collected so far | |
313 | ((write r1) | |
314 | (if (r2 >= 0) | |
315 | ((write r2) | |
316 | (if (r3 >= 0) | |
317 | (write r3)))))))) | |
318 | ||
fc2938d1 DL |
319 | (eval-and-compile |
320 | (defconst utf-16-decode-to-ucs | |
95d2d433 KH |
321 | ;; Read a character and set r1 to the corresponding Unicode code. |
322 | ;; If r5 is not negative, it means that we have already read a | |
323 | ;; character into r5 and r6. | |
324 | ;; If an invalid eight-bit-control/graphic sequence is found, r2 and | |
325 | ;; r3 may contain a byte to be written out, r5 and r6 may contain a | |
326 | ;; pre-read character. Usually they are set to -1. | |
327 | `((if (r5 < 0) | |
328 | (read-multibyte-character r0 r1) | |
329 | ((r0 = r5) | |
330 | (r1 = r6) | |
331 | (r5 = -1))) | |
332 | (lookup-character utf-subst-table-for-encode r0 r1) | |
333 | (r2 = -1) | |
334 | (if (r7 > 0) | |
335 | (r1 = r0) | |
336 | ((translate-character utf-translation-table-for-encode r0 r1) | |
337 | (if (r0 == ,(charset-id 'ascii)) | |
338 | nil | |
339 | (if (r0 == ,(charset-id 'latin-iso8859-1)) | |
340 | (r1 += 128) | |
341 | (if (r0 == ,(charset-id 'eight-bit-control)) | |
342 | nil | |
343 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
344 | (call ccl-mule-utf-16-encode-untrans) | |
345 | ((r2 = ((r1 & #x7f) - 32)) | |
346 | (r3 = ((((r1 >> 7) - 32) * 96) + r2)) ; r3 = ((r1 >> 7) - 32) * 96 + ((r1 & #x7f) - 32), the inverse of the decoder's mapping | |
347 | (r2 = -1) | |
348 | (r5 = -1) | |
349 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | |
350 | (r1 = (r3 + #x100)) | |
351 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
352 | (r1 = (r3 + #x2500)) | |
353 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
354 | (r1 = (r3 + #xe000)) | |
355 | (r1 = #xfffd))))))))))))) ; any other charset is encoded as U+FFFD | |
4fbc4b17 | 356 | |
65a0e5fe | 357 | (defconst utf-16le-encode-loop |
95d2d433 KH |
358 | `((r5 = -1) ; no pre-read character yet |
359 | (loop | |
360 | ,@utf-16-decode-to-ucs | |
361 | (if (r1 >= #x10000) ; beyond #xFFFF: emit a surrogate pair | |
362 | ((r1 -= #x10000) | |
363 | (r0 = ((r1 >> 10) + #xD800)) ; r0 = first surrogate | |
364 | (write (r0 & 255)) | |
365 | (write (r0 >> 8)) | |
366 | (r1 = ((r1 & #x3FF) + #xDC00)))) ; r1 = second surrogate, written just below | |
367 | (if (r1 >= 0) | |
368 | ((write (r1 & 255)) ; low byte first (little endian) | |
369 | (write (r1 >> 8)))) | |
370 | (if (r2 >= 0) ; bytes left in r2/r3 by ccl-mule-utf-16-encode-untrans are written as is | |
371 | ((write r2) | |
372 | (if (r3 >= 0) | |
373 | (write r3)))) | |
374 | (repeat)))) | |
4fbc4b17 | 375 | |
65a0e5fe | 376 | (defconst utf-16be-encode-loop |
95d2d433 KH |
377 | `((r5 = -1) ; no pre-read character yet |
378 | (loop | |
379 | ,@utf-16-decode-to-ucs | |
380 | (if (r1 >= #x10000) ; beyond #xFFFF: emit a surrogate pair | |
381 | ((r1 -= #x10000) | |
382 | (r0 = ((r1 >> 10) + #xD800)) ; r0 = first surrogate | |
383 | (write (r0 >> 8)) | |
384 | (write (r0 & 255)) | |
385 | (r1 = ((r1 & #x3FF) + #xDC00)))) ; r1 = second surrogate, written just below | |
386 | (if (r1 >= 0) | |
387 | ((write (r1 >> 8)) ; high byte first (big endian) | |
388 | (write (r1 & 255)))) | |
389 | (if (r2 >= 0) ; bytes left in r2/r3 by ccl-mule-utf-16-encode-untrans are written as is | |
390 | ((write r2) | |
391 | (if (r3 >= 0) | |
392 | (write r3)))) | |
393 | (repeat)))) | |
4fbc4b17 | 394 | ) |
fc2938d1 | 395 | |
65a0e5fe KH |
396 | |
397 | (define-ccl-program ccl-encode-mule-utf-16le | |
dbaba2d2 | 398 | `(2 ; buffer magnification |
65a0e5fe | 399 | ,utf-16le-encode-loop) ; the loop is the whole main code; no end-of-input code is given |
2217b8e1 | 400 | "Encode to UTF-16LE (little endian without signature). |
fc2938d1 DL |
401 | Characters from the charsets ascii, eight-bit-control, |
402 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
278ce936 KH |
403 | after translation through the translation-table of name |
404 | `utf-translation-table-for-encode'. | |
fc2938d1 DL |
405 | Others are encoded as U+FFFD.") |
406 | ||
65a0e5fe | 407 | (define-ccl-program ccl-encode-mule-utf-16be |
dbaba2d2 | 408 | `(2 ; buffer magnification |
65a0e5fe | 409 | ,utf-16be-encode-loop) ; the loop is the whole main code; no end-of-input code is given |
2217b8e1 | 410 | "Encode to UTF-16BE (big endian without signature). |
fc2938d1 DL |
411 | Characters from the charsets ascii, eight-bit-control, |
412 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
278ce936 KH |
413 | after translation through the translation-table named |
414 | `utf-translation-table-for-encode'. | |
fc2938d1 DL |
415 | Others are encoded as U+FFFD.") |
416 | ||
65a0e5fe | 417 | (define-ccl-program ccl-encode-mule-utf-16le-with-signature |
f7c4d755 | 418 | `(2 ; buffer magnification |
4fbc4b17 KH |
419 | ((write #xFF) ; BOM U+FEFF, low byte first |
420 | (write #xFE) | |
95d2d433 | 421 | ,@utf-16le-encode-loop)) |
4fbc4b17 KH |
422 | "Encode to UTF-16 (little endian with signature). |
423 | Characters from the charsets ascii, eight-bit-control, | |
424 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
425 | after translation through the translation-table of name | |
426 | `utf-translation-table-for-encode'. | |
427 | Others are encoded as U+FFFD.") | |
428 | ||
65a0e5fe | 429 | (define-ccl-program ccl-encode-mule-utf-16be-with-signature |
f7c4d755 | 430 | `(2 ; buffer magnification |
4fbc4b17 KH |
431 | ((write #xFE) ; BOM U+FEFF, high byte first |
432 | (write #xFF) | |
95d2d433 | 433 | ,@utf-16be-encode-loop)) |
4fbc4b17 KH |
434 | "Encode to UTF-16 (big endian with signature). |
435 | Characters from the charsets ascii, eight-bit-control, | |
436 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | |
437 | after translation through the translation-table named | |
438 | `utf-translation-table-for-encode'. | |
439 | Others are encoded as U+FFFD.") | |
440 | ||
fc2938d1 | 441 | (makunbound 'utf-16-decode-to-ucs) ; the encoder fragments were only spliced into the CCL programs above |
65a0e5fe KH |
442 | (makunbound 'utf-16le-encode-loop) |
443 | (makunbound 'utf-16be-encode-loop) | |
4fbc4b17 KH |
444 | |
445 | (defun mule-utf-16-post-read-conversion (length) ; post-read-conversion of `mule-utf-16': drops the BOM marker, records the variant, returns the adjusted LENGTH | |
446 | (when (> length 0) | |
95d2d433 | 447 | (setq length (utf-8-post-read-conversion length)) |
4fbc4b17 KH |
448 | (let ((char (following-char))) |
449 | (cond ((= char (decode-char 'ucs #xFFFE)) ; marker kept by ccl-decode-mule-utf-16 for a little-endian BOM | |
450 | (delete-char 1) | |
451 | (setq last-coding-system-used | |
452 | (coding-system-change-text-conversion | |
453 | last-coding-system-used | |
65a0e5fe | 454 | 'mule-utf-16le-with-signature)) |
4fbc4b17 KH |
455 | (setq length (1- length))) |
456 | ((= char (decode-char 'ucs #xFFFF)) ; #xFFFF stands in for a big-endian BOM (see ccl-decode-mule-utf-16) | |
457 | (delete-char 1) | |
458 | (setq last-coding-system-used | |
459 | (coding-system-change-text-conversion | |
460 | last-coding-system-used | |
65a0e5fe | 461 | 'mule-utf-16be-with-signature)) |
4fbc4b17 KH |
462 | (setq length (1- length))) |
463 | (t ; no BOM marker at point | |
65a0e5fe | 464 | (setq last-coding-system-used 'mule-utf-16be))))) ; NOTE(review): unlike the branches above this bypasses coding-system-change-text-conversion -- confirm that is intended |
4fbc4b17 | 465 | length) |
fc2938d1 | 466 | |
fc2938d1 DL |
467 | (let ((doc " |
468 | ||
278ce936 KH |
469 | It supports Unicode characters of these ranges: |
470 | U+0000..U+33FF, U+E000..U+FFFF. | |
471 | They correspond to these Emacs character sets: | |
472 | ascii, latin-iso8859-1, mule-unicode-0100-24ff, | |
473 | mule-unicode-2500-33ff, mule-unicode-e000-ffff | |
474 | ||
475 | On decoding (e.g. reading a file), Unicode characters not in the above | |
476 | ranges are decoded as U+FFFD, effectively corrupting the data | |
a1506d29 | 477 | if they are re-encoded. |
278ce936 KH |
478 | |
479 | On encoding (e.g. writing a file), Emacs characters not belonging to | |
480 | any of the character sets listed above are encoded into the byte | |
95d2d433 KH |
481 | sequence representing U+FFFD (REPLACEMENT CHARACTER).") |
482 | (props `((safe-charsets ; property list shared by all five coding systems below | |
483 | ascii | |
484 | eight-bit-control | |
485 | eight-bit-graphic | |
486 | latin-iso8859-1 | |
487 | mule-unicode-0100-24ff | |
488 | mule-unicode-2500-33ff | |
489 | mule-unicode-e000-ffff | |
490 | ,@(if utf-translate-cjk-mode ; CJK charsets are added only if the mode is on when this form is evaluated | |
491 | utf-translate-cjk-charsets)) | |
492 | (valid-codes (0 . 255)) | |
493 | (mime-text-unsuitable . t) | |
494 | (pre-write-conversion . utf-8-pre-write-conversion) | |
495 | (dependency unify-8859-on-encoding-mode | |
496 | unify-8859-on-decoding-mode | |
497 | utf-fragment-on-decoding | |
498 | utf-translate-cjk-mode)))) | |
fc2938d1 | 499 | (make-coding-system |
65a0e5fe | 500 | 'mule-utf-16le 4 ; type 4: coding system driven by a pair of CCL programs |
fc2938d1 DL |
501 | ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. |
502 | (concat | |
65a0e5fe | 503 | "UTF-16LE encoding for Emacs-supported Unicode characters." |
fc2938d1 | 504 | doc) |
65a0e5fe | 505 | '(ccl-decode-mule-utf-16le . ccl-encode-mule-utf-16le) |
95d2d433 KH |
506 | `(,@props |
507 | (post-read-conversion . utf-8-post-read-conversion) | |
a9be2675 | 508 | (ascii-incompatible . t) |
95d2d433 | 509 | (mime-charset . utf-16le))) |
fc2938d1 DL |
510 | |
511 | (make-coding-system | |
65a0e5fe | 512 | 'mule-utf-16be 4 ?u |
fc2938d1 | 513 | (concat |
65a0e5fe | 514 | "UTF-16BE encoding for Emacs-supported Unicode characters." |
fc2938d1 DL |
515 | doc) |
516 | ||
65a0e5fe | 517 | '(ccl-decode-mule-utf-16be . ccl-encode-mule-utf-16be) |
95d2d433 KH |
518 | `(,@props |
519 | (post-read-conversion . utf-8-post-read-conversion) | |
a9be2675 | 520 | (ascii-incompatible . t) |
95d2d433 | 521 | (mime-charset . utf-16be))) |
4fbc4b17 KH |
522 | |
523 | (make-coding-system | |
65a0e5fe | 524 | 'mule-utf-16le-with-signature 4 ?u |
4fbc4b17 KH |
525 | (concat |
526 | "Little endian UTF-16 (with BOM) for Emacs-supported Unicode characters." | |
527 | doc) | |
528 | ||
65a0e5fe KH |
529 | '(ccl-decode-mule-utf-16le-with-signature |
530 | . ccl-encode-mule-utf-16le-with-signature) | |
95d2d433 KH |
531 | `(,@props |
532 | (post-read-conversion . utf-8-post-read-conversion) | |
4fbc4b17 | 533 | (coding-category . coding-category-utf-16-le) |
a9be2675 | 534 | (ascii-incompatible . t) |
95d2d433 | 535 | (mime-charset . utf-16))) |
4fbc4b17 KH |
536 | |
537 | (make-coding-system | |
65a0e5fe | 538 | 'mule-utf-16be-with-signature 4 ?u |
4fbc4b17 KH |
539 | (concat |
540 | "Big endian UTF-16 (with BOM) for Emacs-supported Unicode characters." | |
541 | doc) | |
542 | ||
65a0e5fe KH |
543 | '(ccl-decode-mule-utf-16be-with-signature |
544 | . ccl-encode-mule-utf-16be-with-signature) | |
95d2d433 KH |
545 | `(,@props |
546 | (post-read-conversion . utf-8-post-read-conversion) | |
4fbc4b17 | 547 | (coding-category . coding-category-utf-16-be) |
a9be2675 | 548 | (ascii-incompatible . t) |
95d2d433 | 549 | (mime-charset . utf-16))) |
4fbc4b17 KH |
550 | |
551 | (make-coding-system | |
552 | 'mule-utf-16 4 ?u | |
553 | (concat | |
554 | "UTF-16 (with or without BOM) for Emacs-supported Unicode characters." | |
555 | doc) | |
556 | ||
65a0e5fe | 557 | '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16be-with-signature) ; the decoder checks the BOM; encoding always writes big endian with BOM |
95d2d433 KH |
558 | `(,@props |
559 | (post-read-conversion . mule-utf-16-post-read-conversion) ; removes the BOM marker character left by the decoder | |
fc2938d1 | 560 | (coding-category . coding-category-utf-16-be) |
a9be2675 | 561 | (ascii-incompatible . t) |
95d2d433 | 562 | (mime-charset . utf-16))) |
4fbc4b17 | 563 | ) |
fc2938d1 | 564 | |
65a0e5fe KH |
565 | (define-coding-system-alias 'utf-16le 'mule-utf-16le) |
566 | (define-coding-system-alias 'utf-16be 'mule-utf-16be) | |
567 | (define-coding-system-alias 'utf-16le-with-signature | |
568 | 'mule-utf-16le-with-signature) | |
569 | (define-coding-system-alias 'utf-16be-with-signature | |
570 | 'mule-utf-16be-with-signature) | |
4fbc4b17 | 571 | (define-coding-system-alias 'utf-16 'mule-utf-16) |
fc2938d1 | 572 | |
65a0e5fe KH |
573 | ;; For backward compatibility: the hyphenated -le/-be names denote the with-signature variants. |
574 | (define-coding-system-alias 'mule-utf-16-le 'mule-utf-16le-with-signature) | |
575 | (define-coding-system-alias 'utf-16-le 'mule-utf-16le-with-signature) | |
576 | (define-coding-system-alias 'mule-utf-16-be 'mule-utf-16be-with-signature) | |
577 | (define-coding-system-alias 'utf-16-be 'mule-utf-16be-with-signature) | |
578 | ||
ab5796a9 | 579 | ;;; arch-tag: 85455d46-d9c9-466d-a6f3-c3582a7367c4 |
fc2938d1 | 580 | ;;; utf-16.el ends here |