Commit | Line | Data |
---|---|---|
4ed46869 | 1 | /* Header for coding system handler. |
ba318903 | 2 | Copyright (C) 2001-2014 Free Software Foundation, Inc. |
7976eda0 | 3 | Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, |
5df4f04c | 4 | 2005, 2006, 2007, 2008, 2009, 2010, 2011 |
ce03bf76 KH |
5 | National Institute of Advanced Industrial Science and Technology (AIST) |
6 | Registration Number H14PRO021 | |
8f924df7 | 7 | Copyright (C) 2003 |
df7492f9 KH |
8 | National Institute of Advanced Industrial Science and Technology (AIST) |
9 | Registration Number H13PRO009 | |
4ed46869 | 10 | |
369314dc KH |
11 | This file is part of GNU Emacs. |
12 | ||
b9b1cc14 | 13 | GNU Emacs is free software: you can redistribute it and/or modify |
369314dc | 14 | it under the terms of the GNU General Public License as published by |
b9b1cc14 GM |
15 | the Free Software Foundation, either version 3 of the License, or |
16 | (at your option) any later version. | |
4ed46869 | 17 | |
369314dc KH |
18 | GNU Emacs is distributed in the hope that it will be useful, |
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | GNU General Public License for more details. | |
4ed46869 | 22 | |
369314dc | 23 | You should have received a copy of the GNU General Public License |
b9b1cc14 | 24 | along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */ |
4ed46869 | 25 | |
6f776e81 KH |
26 | #ifndef EMACS_CODING_H |
27 | #define EMACS_CODING_H | |
4ed46869 | 28 | |
df7492f9 | 29 | /* Index to arguments of Fdefine_coding_system_internal. */ |
4ed46869 | 30 | |
df7492f9 KH |
31 | enum define_coding_system_arg_index |
32 | { | |
33 | coding_arg_name, | |
34 | coding_arg_mnemonic, | |
35 | coding_arg_coding_type, | |
36 | coding_arg_charset_list, | |
37 | coding_arg_ascii_compatible_p, | |
38 | coding_arg_decode_translation_table, | |
39 | coding_arg_encode_translation_table, | |
40 | coding_arg_post_read_conversion, | |
41 | coding_arg_pre_write_conversion, | |
42 | coding_arg_default_char, | |
8f924df7 | 43 | coding_arg_for_unibyte, |
df7492f9 KH |
44 | coding_arg_plist, |
45 | coding_arg_eol_type, | |
46 | coding_arg_max | |
47 | }; | |
4ed46869 | 48 | |
df7492f9 | 49 | enum define_coding_iso2022_arg_index |
4ed46869 | 50 | { |
df7492f9 KH |
51 | coding_arg_iso2022_initial = coding_arg_max, |
52 | coding_arg_iso2022_reg_usage, | |
53 | coding_arg_iso2022_request, | |
54 | coding_arg_iso2022_flags, | |
55 | coding_arg_iso2022_max | |
4ed46869 KH |
56 | }; |
57 | ||
e4215ddd KH |
58 | enum define_coding_utf8_arg_index |
59 | { | |
60 | coding_arg_utf8_bom = coding_arg_max, | |
61 | coding_arg_utf8_max | |
62 | }; | |
63 | ||
df7492f9 | 64 | enum define_coding_utf16_arg_index |
4ed46869 | 65 | { |
df7492f9 KH |
66 | coding_arg_utf16_bom = coding_arg_max, |
67 | coding_arg_utf16_endian, | |
68 | coding_arg_utf16_max | |
4ed46869 KH |
69 | }; |
70 | ||
df7492f9 KH |
71 | enum define_coding_ccl_arg_index |
72 | { | |
35d47d18 | 73 | coding_arg_ccl_decoder = coding_arg_max, |
df7492f9 KH |
74 | coding_arg_ccl_encoder, |
75 | coding_arg_ccl_valids, | |
76 | coding_arg_ccl_max | |
77 | }; | |
4ed46869 | 78 | |
270afa77 KH |
79 | enum define_coding_undecided_arg_index |
80 | { | |
81 | coding_arg_undecided_inhibit_null_byte_detection = coding_arg_max, | |
82 | coding_arg_undecided_inhibit_iso_escape_detection, | |
83 | coding_arg_undecided_prefer_utf_8, | |
84 | coding_arg_undecided_max | |
85 | }; | |
86 | ||
933373ed KH |
87 | /* Hash table for all coding systems. Keys are coding system symbols |
88 | and values are spec vectors of the corresponding coding system. A | |
89 | spec vector has the form [ ATTRS ALIASES EOL-TYPE ]. ATTRS is a | |
90 | vector of attributes of the coding system. ALIASES is a list of | |
91 | aliases (symbols) of the coding system. EOL-TYPE is `unix', `dos', | |
92 | `mac' or a vector of coding systems (symbols). */ | |
93 | ||
df7492f9 | 94 | extern Lisp_Object Vcoding_system_hash_table; |
4ed46869 | 95 | |
933373ed | 96 | |
df7492f9 | 97 | /* Enumeration of coding system type. */ |
4ed46869 | 98 | |
df7492f9 KH |
99 | enum coding_system_type |
100 | { | |
101 | coding_type_charset, | |
102 | coding_type_utf_8, | |
103 | coding_type_utf_16, | |
104 | coding_type_iso_2022, | |
105 | coding_type_emacs_mule, | |
106 | coding_type_sjis, | |
107 | coding_type_ccl, | |
108 | coding_type_raw_text, | |
109 | coding_type_undecided, | |
110 | coding_type_max | |
111 | }; | |
4ed46869 | 112 | |
4ed46869 | 113 | |
df7492f9 | 114 | /* Enumeration of end-of-line format type. */ |
4ed46869 | 115 | |
df7492f9 KH |
116 | enum end_of_line_type |
117 | { | |
118 | eol_lf, /* Line-feed only, same as Emacs' internal | |
119 | format. */ | |
120 | eol_crlf, /* Sequence of carriage-return and | |
121 | line-feed. */ | |
122 | eol_cr, /* Carriage-return only. */ | |
123 | eol_any, /* Accept any of above. Produce line-feed | |
124 | only. */ | |
125 | eol_undecided, /* This value is used to denote that the | |
126 | eol-type is not yet decided. */ | |
127 | eol_type_max | |
128 | }; | |
4ed46869 | 129 | |
df7492f9 | 130 | /* Enumeration of index to an attribute vector of a coding system. */ |
4ed46869 | 131 | |
df7492f9 KH |
132 | enum coding_attr_index |
133 | { | |
134 | coding_attr_base_name, | |
135 | coding_attr_docstring, | |
136 | coding_attr_mnemonic, | |
137 | coding_attr_type, | |
138 | coding_attr_charset_list, | |
139 | coding_attr_ascii_compat, | |
140 | coding_attr_decode_tbl, | |
141 | coding_attr_encode_tbl, | |
404202e7 | 142 | coding_attr_trans_tbl, |
df7492f9 KH |
143 | coding_attr_post_read, |
144 | coding_attr_pre_write, | |
145 | coding_attr_default_char, | |
8f924df7 | 146 | coding_attr_for_unibyte, |
df7492f9 KH |
147 | coding_attr_plist, |
148 | ||
149 | coding_attr_category, | |
150 | coding_attr_safe_charsets, | |
151 | ||
152 | /* The following are extra attributes for each type. */ | |
153 | coding_attr_charset_valids, | |
154 | ||
155 | coding_attr_ccl_decoder, | |
156 | coding_attr_ccl_encoder, | |
157 | coding_attr_ccl_valids, | |
158 | ||
159 | coding_attr_iso_initial, | |
160 | coding_attr_iso_usage, | |
161 | coding_attr_iso_request, | |
162 | coding_attr_iso_flags, | |
163 | ||
e4215ddd | 164 | coding_attr_utf_bom, |
df7492f9 KH |
165 | coding_attr_utf_16_endian, |
166 | ||
167 | coding_attr_emacs_mule_full, | |
168 | ||
270afa77 KH |
169 | coding_attr_undecided_inhibit_null_byte_detection, |
170 | coding_attr_undecided_inhibit_iso_escape_detection, | |
171 | coding_attr_undecided_prefer_utf_8, | |
172 | ||
df7492f9 KH |
173 | coding_attr_last_index |
174 | }; | |
4ed46869 | 175 | |
4ed46869 | 176 | |
933373ed KH |
177 | /* Macros to access an element of an attribute vector. */ |
178 | ||
04e05596 JB |
179 | #define CODING_ATTR_BASE_NAME(attrs) AREF (attrs, coding_attr_base_name) |
180 | #define CODING_ATTR_TYPE(attrs) AREF (attrs, coding_attr_type) | |
181 | #define CODING_ATTR_CHARSET_LIST(attrs) AREF (attrs, coding_attr_charset_list) | |
182 | #define CODING_ATTR_MNEMONIC(attrs) AREF (attrs, coding_attr_mnemonic) | |
183 | #define CODING_ATTR_DOCSTRING(attrs) AREF (attrs, coding_attr_docstring) | |
184 | #define CODING_ATTR_ASCII_COMPAT(attrs) AREF (attrs, coding_attr_ascii_compat) | |
185 | #define CODING_ATTR_DECODE_TBL(attrs) AREF (attrs, coding_attr_decode_tbl) | |
186 | #define CODING_ATTR_ENCODE_TBL(attrs) AREF (attrs, coding_attr_encode_tbl) | |
187 | #define CODING_ATTR_TRANS_TBL(attrs) AREF (attrs, coding_attr_trans_tbl) | |
188 | #define CODING_ATTR_POST_READ(attrs) AREF (attrs, coding_attr_post_read) | |
189 | #define CODING_ATTR_PRE_WRITE(attrs) AREF (attrs, coding_attr_pre_write) | |
190 | #define CODING_ATTR_DEFAULT_CHAR(attrs) AREF (attrs, coding_attr_default_char) | |
191 | #define CODING_ATTR_FOR_UNIBYTE(attrs) AREF (attrs, coding_attr_for_unibyte) | |
04e05596 JB |
192 | #define CODING_ATTR_PLIST(attrs) AREF (attrs, coding_attr_plist) |
193 | #define CODING_ATTR_CATEGORY(attrs) AREF (attrs, coding_attr_category) | |
194 | #define CODING_ATTR_SAFE_CHARSETS(attrs)AREF (attrs, coding_attr_safe_charsets) | |
8ddb35b2 | 195 | |
8ddb35b2 | 196 | |
933373ed KH |
197 | /* Return the name of a coding system specified by ID. */ |
198 | #define CODING_ID_NAME(id) \ | |
199 | (HASH_KEY (XHASH_TABLE (Vcoding_system_hash_table), id)) | |
200 | ||
201 | /* Return the attribute vector of a coding system specified by ID. */ | |
202 | ||
df7492f9 KH |
203 | #define CODING_ID_ATTRS(id) \ |
204 | (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 0)) | |
fbaa2ed9 | 205 | |
933373ed KH |
206 | /* Return the list of aliases of a coding system specified by ID. */ |
207 | ||
df7492f9 KH |
208 | #define CODING_ID_ALIASES(id) \ |
209 | (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 1)) | |
c0c69d45 | 210 | |
933373ed KH |
211 | /* Return the eol-type of a coding system specified by ID. */ |
212 | ||
df7492f9 KH |
213 | #define CODING_ID_EOL_TYPE(id) \ |
214 | (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 2)) | |
658cc252 | 215 | |
933373ed KH |
216 | |
217 | /* Return the spec vector of CODING_SYSTEM_SYMBOL. */ | |
fbaa2ed9 | 218 | |
df7492f9 KH |
219 | #define CODING_SYSTEM_SPEC(coding_system_symbol) \ |
220 | (Fgethash (coding_system_symbol, Vcoding_system_hash_table, Qnil)) | |
4ed46869 | 221 | |
933373ed KH |
222 | |
223 | /* Return the ID of CODING_SYSTEM_SYMBOL. */ | |
224 | ||
df7492f9 KH |
225 | #define CODING_SYSTEM_ID(coding_system_symbol) \ |
226 | hash_lookup (XHASH_TABLE (Vcoding_system_hash_table), \ | |
227 | coding_system_symbol, NULL) | |
4ed46869 | 228 | |
96c06863 | 229 | /* Return true if CODING_SYSTEM_SYMBOL is a coding system. */ |
933373ed | 230 | |
7c00e33d KH |
231 | #define CODING_SYSTEM_P(coding_system_symbol) \ |
232 | (CODING_SYSTEM_ID (coding_system_symbol) >= 0 \ | |
233 | || (! NILP (coding_system_symbol) \ | |
234 | && ! NILP (Fcoding_system_p (coding_system_symbol)))) | |
4ed46869 | 235 | |
933373ed KH |
236 | /* Check if X is a coding system or not. */ |
237 | ||
8f924df7 | 238 | #define CHECK_CODING_SYSTEM(x) \ |
df7492f9 | 239 | do { \ |
7c00e33d KH |
240 | if (CODING_SYSTEM_ID (x) < 0 \ |
241 | && NILP (Fcheck_coding_system (x))) \ | |
8f924df7 | 242 | wrong_type_argument (Qcoding_system_p, (x)); \ |
96c06863 | 243 | } while (false) |
658cc252 | 244 | |
4ed46869 | 245 | |
933373ed KH |
246 | /* Check if X is a coding system or not. If it is, set SPEC to the |
247 | spec vector of the coding system. */ | |
248 | ||
df7492f9 KH |
249 | #define CHECK_CODING_SYSTEM_GET_SPEC(x, spec) \ |
250 | do { \ | |
251 | spec = CODING_SYSTEM_SPEC (x); \ | |
7c00e33d KH |
252 | if (NILP (spec)) \ |
253 | { \ | |
254 | Fcheck_coding_system (x); \ | |
255 | spec = CODING_SYSTEM_SPEC (x); \ | |
256 | } \ | |
df7492f9 | 257 | if (NILP (spec)) \ |
02dfeba8 | 258 | wrong_type_argument (Qcoding_system_p, (x)); \ |
96c06863 | 259 | } while (false) |
e6de76f8 | 260 | |
8ddb35b2 | 261 | |
933373ed KH |
262 | /* Check if X is a coding system or not. If it is, set ID to the |
263 | ID of the coding system. */ | |
264 | ||
df7492f9 KH |
265 | #define CHECK_CODING_SYSTEM_GET_ID(x, id) \ |
266 | do \ | |
267 | { \ | |
268 | id = CODING_SYSTEM_ID (x); \ | |
7c00e33d KH |
269 | if (id < 0) \ |
270 | { \ | |
271 | Fcheck_coding_system (x); \ | |
272 | id = CODING_SYSTEM_ID (x); \ | |
273 | } \ | |
df7492f9 | 274 | if (id < 0) \ |
02dfeba8 | 275 | wrong_type_argument (Qcoding_system_p, (x)); \ |
96c06863 | 276 | } while (false) |
4ed46869 | 277 | |
4ed46869 KH |
278 | |
279 | /*** GENERAL section ***/ | |
280 | ||
df7492f9 KH |
281 | /* Enumeration of result code of code conversion. */ |
282 | enum coding_result_code | |
4ed46869 | 283 | { |
df7492f9 KH |
284 | CODING_RESULT_SUCCESS, |
285 | CODING_RESULT_INSUFFICIENT_SRC, | |
286 | CODING_RESULT_INSUFFICIENT_DST, | |
63e11478 | 287 | CODING_RESULT_INVALID_SRC, |
1af1a51a | 288 | CODING_RESULT_INTERRUPT |
4ed46869 KH |
289 | }; |
290 | ||
658cc252 | 291 | |
3b2d77fe | 292 | /* Macros used for the member `mode' of the struct coding_system. */ |
658cc252 | 293 | |
658cc252 | 294 | /* If set, the decoding/encoding routines treat the current data as |
5998373a | 295 | the last block of the whole text to be converted, and do the |
55496054 | 296 | appropriate finishing job. */ |
1af1a51a | 297 | #define CODING_MODE_LAST_BLOCK 0x01 |
658cc252 KH |
298 | |
299 | /* If set, it means that the current source text is in a buffer which | |
300 | enables selective display. */ | |
1af1a51a | 301 | #define CODING_MODE_SELECTIVE_DISPLAY 0x02 |
658cc252 KH |
302 | |
303 | /* This flag is used by the decoding/encoding routines on the fly. If | |
304 | set, it means that right-to-left text is being processed. */ | |
1af1a51a | 305 | #define CODING_MODE_DIRECTION 0x04 |
658cc252 | 306 | |
1af1a51a | 307 | #define CODING_MODE_FIXED_DESTINATION 0x08 |
df7492f9 | 308 | |
933373ed KH |
309 | /* If set, it means that the encoding routines produce some safe |
310 | ASCII characters (usually '?') for unsupported characters. */ | |
1af1a51a | 311 | #define CODING_MODE_SAFE_ENCODING 0x10 |
df7492f9 | 312 | |
825d0875 KH |
313 | /* For handling composition sequence. */ |
314 | #include "composite.h" | |
315 | ||
316 | enum composition_state | |
317 | { | |
318 | COMPOSING_NO, | |
319 | COMPOSING_CHAR, | |
320 | COMPOSING_RULE, | |
321 | COMPOSING_COMPONENT_CHAR, | |
322 | COMPOSING_COMPONENT_RULE | |
323 | }; | |
324 | ||
325 | /* Structure for the current composition status. */ | |
326 | struct composition_status | |
327 | { | |
328 | enum composition_state state; | |
329 | enum composition_method method; | |
f10fe38f | 330 | bool old_form; /* true if pre-21 form */ |
825d0875 KH |
331 | int length; /* number of elements produced in charbuf */ |
332 | int nchars; /* number of characters composed */ | |
333 | int ncomps; /* number of composition components */ | |
334 | /* Maximum carryover is for the case of COMPOSITION_WITH_RULE_ALTCHARS. | |
335 | See the comment in coding.c. */ | |
336 | int carryover[4 /* annotation header */ | |
337 | + MAX_COMPOSITION_COMPONENTS * 3 - 2 /* ALTs and RULEs */ | |
338 | + 2 /* intermediate -1 -1 */ | |
339 | + MAX_COMPOSITION_COMPONENTS /* CHARs */ | |
340 | ]; | |
341 | }; | |
342 | ||
343 | ||
df7492f9 KH |
344 | /* Structure of the field `spec.iso_2022' in the structure |
345 | `coding_system'. */ | |
346 | struct iso_2022_spec | |
4ed46869 | 347 | { |
2ec49574 | 348 | /* Bit-wise-or of CODING_ISO_FLAG_XXX. */ |
df7492f9 | 349 | unsigned flags; |
4ed46869 | 350 | |
df7492f9 KH |
351 | /* The current graphic register invoked to each graphic plane. */ |
352 | int current_invocation[2]; | |
658cc252 | 353 | |
df7492f9 KH |
354 | /* The current charset designated to each graphic register. The |
355 | value -1 means that no charset is designated, -2 means that | |
356 | there was an invalid designation previously. */ | |
357 | int current_designation[4]; | |
4ed46869 | 358 | |
825d0875 KH |
359 | /* If positive, we are now scanning CTEXT extended segment. */ |
360 | int ctext_extended_segment_len; | |
361 | ||
f10fe38f PE |
362 | /* True temporarily only when graphic register 2 or 3 is invoked by |
363 | single-shift while encoding. */ | |
96c06863 | 364 | bool_bf single_shifting : 1; |
f10fe38f PE |
365 | |
366 | /* True temporarily only when processing at beginning of line. */ | |
96c06863 | 367 | bool_bf bol : 1; |
f10fe38f PE |
368 | |
369 | /* If true, we are now scanning embedded UTF-8 sequence. */ | |
96c06863 | 370 | bool_bf embedded_utf_8 : 1; |
825d0875 KH |
371 | |
372 | /* The current composition. */ | |
373 | struct composition_status cmp_status; | |
374 | }; | |
375 | ||
376 | struct emacs_mule_spec | |
377 | { | |
825d0875 | 378 | struct composition_status cmp_status; |
df7492f9 | 379 | }; |
4ed46869 | 380 | |
270afa77 KH |
381 | struct undecided_spec |
382 | { | |
9c90cc06 PE |
383 | /* Inhibit null byte detection. 1 means always inhibit, |
384 | -1 means do not inhibit, 0 means rely on user variable. */ | |
385 | int inhibit_nbd; | |
386 | ||
387 | /* Inhibit ISO escape detection. -1, 0, 1 as above. */ | |
388 | int inhibit_ied; | |
389 | ||
390 | /* Prefer UTF-8 when the input could be other encodings. */ | |
391 | bool prefer_utf_8; | |
270afa77 KH |
392 | }; |
393 | ||
e4215ddd | 394 | enum utf_bom_type |
df7492f9 | 395 | { |
e4215ddd KH |
396 | utf_detect_bom, |
397 | utf_without_bom, | |
398 | utf_with_bom | |
df7492f9 | 399 | }; |
279d9f7b | 400 | |
df7492f9 KH |
401 | enum utf_16_endian_type |
402 | { | |
403 | utf_16_big_endian, | |
404 | utf_16_little_endian | |
405 | }; | |
279d9f7b | 406 | |
df7492f9 KH |
407 | struct utf_16_spec |
408 | { | |
e4215ddd | 409 | enum utf_bom_type bom; |
df7492f9 KH |
410 | enum utf_16_endian_type endian; |
411 | int surrogate; | |
412 | }; | |
279d9f7b | 413 | |
4fecac5c KH |
414 | struct coding_detection_info |
415 | { | |
416 | /* Values of these members are bitwise-OR of CATEGORY_MASK_XXXs. */ | |
417 | /* Which categories are already checked. */ | |
418 | int checked; | |
419 | /* Which categories are strongly found. */ | |
420 | int found; | |
421 | /* Which categories are rejected. */ | |
422 | int rejected; | |
423 | }; | |
279d9f7b | 424 | |
279d9f7b | 425 | |
df7492f9 KH |
426 | struct coding_system |
427 | { | |
428 | /* ID number of the coding system. This is an index to | |
429 | Vcoding_system_hash_table. This value is set by | |
430 | setup_coding_system. At the early stage of building time, this | |
431 | value is -1 in the array coding_categories to indicate that no | |
432 | coding-system of that category is yet defined. */ | |
d3411f89 | 433 | ptrdiff_t id; |
df7492f9 KH |
434 | |
435 | /* Flag bits of the coding system. The meaning of each bit is common | |
436 | to all types of coding systems. */ | |
437 | int common_flags; | |
438 | ||
439 | /* Mode bits of the coding system. See the comments of the macros | |
440 | CODING_MODE_XXX. */ | |
441 | unsigned int mode; | |
450c60a5 | 442 | |
4ed46869 | 443 | /* Detailed information specific to each type of coding system. */ |
df7492f9 | 444 | union |
4ed46869 | 445 | { |
df7492f9 KH |
446 | struct iso_2022_spec iso_2022; |
447 | struct ccl_spec *ccl; /* Defined in ccl.h. */ | |
448 | struct utf_16_spec utf_16; | |
e4215ddd | 449 | enum utf_bom_type utf_8_bom; |
825d0875 | 450 | struct emacs_mule_spec emacs_mule; |
270afa77 | 451 | struct undecided_spec undecided; |
4ed46869 KH |
452 | } spec; |
453 | ||
df7492f9 | 454 | int max_charset_id; |
1b3b981b | 455 | unsigned char *safe_charsets; |
658cc252 | 456 | |
df7492f9 | 457 | /* The following two members specify how binary 8-bit code 128..255 |
96c06863 PE |
458 | are represented in source and destination text respectively. True |
459 | means they are represented by 2-byte sequence, false means they are | |
df7492f9 | 460 | represented by 1-byte as is (see the comment in character.h). */ |
96c06863 PE |
461 | bool_bf src_multibyte : 1; |
462 | bool_bf dst_multibyte : 1; | |
811ea086 | 463 | |
a137bb00 KH |
464 | /* How many leading bytes we can skip for decoding. This is set to |
465 | -1 in setup_coding_system, and updated by detect_coding. So, | |
466 | when this is equal to the byte length of the text being | |
8a44e6d1 KH |
467 | converted, we can skip the actual conversion process except for |
468 | the eol format. */ | |
d311d28c | 469 | ptrdiff_t head_ascii; |
658cc252 | 470 | |
52840a9c KH |
471 | /* How many bytes/chars at the source are detected as valid utf-8 |
472 | sequence. Set by detect_coding_utf_8. */ | |
473 | ptrdiff_t detected_utf8_bytes, detected_utf8_chars; | |
e6d2f155 | 474 | |
8a44e6d1 KH |
475 | /* Used internally in coding.c. See the comment of detect_ascii. */ |
476 | int eol_seen; | |
477 | ||
658cc252 | 478 | /* The following members are set by encoding/decoding routine. */ |
d311d28c | 479 | ptrdiff_t produced, produced_char, consumed, consumed_char; |
658cc252 | 480 | |
811ea086 | 481 | /* Number of error source data found in a decoding routine. */ |
e6f29a68 | 482 | ptrdiff_t errors; |
811ea086 | 483 | |
6d5eb5b0 | 484 | /* Store the positions of error source data. */ |
d311d28c | 485 | ptrdiff_t *error_positions; |
e6a9a0bc | 486 | |
df7492f9 KH |
487 | /* Finish status of code conversion. */ |
488 | enum coding_result_code result; | |
6041c9ce | 489 | |
d311d28c | 490 | ptrdiff_t src_pos, src_pos_byte, src_chars, src_bytes; |
df7492f9 | 491 | Lisp_Object src_object; |
8f924df7 | 492 | const unsigned char *source; |
4ed46869 | 493 | |
d311d28c | 494 | ptrdiff_t dst_pos, dst_pos_byte, dst_bytes; |
df7492f9 KH |
495 | Lisp_Object dst_object; |
496 | unsigned char *destination; | |
4ed46869 | 497 | |
df7492f9 KH |
498 | /* If an element is non-negative, it is a character code. |
499 | ||
500 | If it is in the range -128..-1, it is a 8-bit character code | |
501 | minus 256. | |
502 | ||
503 | If it is less than -128, it specifies the start of an annotation | |
504 | chunk. The length of the chunk is -128 minus the value of the | |
505 | element. The following elements are OFFSET, ANNOTATION-TYPE, and | |
506 | a sequence of actual data for the annotation. OFFSET is a | |
507 | character position offset from dst_pos or src_pos, | |
22bcf204 | 508 | ANNOTATION-TYPE specifies the meaning of the annotation and how to |
df7492f9 KH |
509 | handle the following data. */ |
510 | int *charbuf; | |
511 | int charbuf_size, charbuf_used; | |
512 | ||
f10fe38f PE |
513 | /* True if the source of conversion is not in the member |
514 | `charbuf', but at `src_object'. */ | |
96c06863 | 515 | bool_bf chars_at_source : 1; |
f10fe38f | 516 | |
f8498081 DA |
517 | /* Nonzero if the result of conversion is in `destination' |
518 | buffer rather than in `dst_object'. */ | |
96c06863 | 519 | bool_bf raw_destination : 1; |
f8498081 | 520 | |
96c06863 PE |
521 | /* Set to true if charbuf contains an annotation. */ |
522 | bool_bf annotated : 1; | |
4ed46869 | 523 | |
df7492f9 KH |
524 | unsigned char carryover[64]; |
525 | int carryover_bytes; | |
a5ee738b | 526 | |
df7492f9 KH |
527 | int default_char; |
528 | ||
f10fe38f | 529 | bool (*detector) (struct coding_system *, struct coding_detection_info *); |
383e0970 | 530 | void (*decoder) (struct coding_system *); |
f10fe38f | 531 | bool (*encoder) (struct coding_system *); |
df7492f9 KH |
532 | }; |
533 | ||
534 | /* Meanings of bits in the member `common_flags' of the structure | |
535 | coding_system. The lowest 8 bits are reserved for various kinds of | |
536 | annotations (currently two of them are used). */ | |
537 | #define CODING_ANNOTATION_MASK 0x00FF | |
538 | #define CODING_ANNOTATE_COMPOSITION_MASK 0x0001 | |
539 | #define CODING_ANNOTATE_DIRECTION_MASK 0x0002 | |
4fecac5c | 540 | #define CODING_ANNOTATE_CHARSET_MASK 0x0003 |
df7492f9 KH |
541 | #define CODING_FOR_UNIBYTE_MASK 0x0100 |
542 | #define CODING_REQUIRE_FLUSHING_MASK 0x0200 | |
543 | #define CODING_REQUIRE_DECODING_MASK 0x0400 | |
544 | #define CODING_REQUIRE_ENCODING_MASK 0x0800 | |
545 | #define CODING_REQUIRE_DETECTION_MASK 0x1000 | |
546 | #define CODING_RESET_AT_BOL_MASK 0x2000 | |
547 | ||
96c06863 | 548 | /* Return nonzero if the coding context CODING requires annotation |
df7492f9 KH |
549 | handling. */ |
550 | #define CODING_REQUIRE_ANNOTATION(coding) \ | |
551 | ((coding)->common_flags & CODING_ANNOTATION_MASK) | |
552 | ||
96c06863 PE |
553 | /* Return nonzero if the coding context CODING prefers decoding into |
554 | unibyte. */ | |
df7492f9 KH |
555 | #define CODING_FOR_UNIBYTE(coding) \ |
556 | ((coding)->common_flags & CODING_FOR_UNIBYTE_MASK) | |
557 | ||
96c06863 | 558 | /* Return nonzero if the coding context CODING requires specific code to be |
a5ee738b KH |
559 | attached at the tail of converted text. */ |
560 | #define CODING_REQUIRE_FLUSHING(coding) \ | |
561 | ((coding)->common_flags & CODING_REQUIRE_FLUSHING_MASK) | |
562 | ||
96c06863 | 563 | /* Return nonzero if the coding context CODING requires code conversion on |
a5ee738b KH |
564 | decoding. */ |
565 | #define CODING_REQUIRE_DECODING(coding) \ | |
811ea086 KH |
566 | ((coding)->dst_multibyte \ |
567 | || (coding)->common_flags & CODING_REQUIRE_DECODING_MASK) | |
a5ee738b | 568 | |
df7492f9 | 569 | |
96c06863 | 570 | /* Return nonzero if the coding context CODING requires code conversion on |
c198294f KH |
571 | encoding. |
572 | The non-multibyte part of the condition is to support encoding of | |
573 | unibyte strings/buffers generated by string-as-unibyte or | |
574 | (set-buffer-multibyte nil) from multibyte strings/buffers. */ | |
df7492f9 KH |
575 | #define CODING_REQUIRE_ENCODING(coding) \ |
576 | ((coding)->src_multibyte \ | |
577 | || (coding)->common_flags & CODING_REQUIRE_ENCODING_MASK \ | |
578 | || (coding)->mode & CODING_MODE_SELECTIVE_DISPLAY) | |
579 | ||
a5ee738b | 580 | |
96c06863 | 581 | /* Return nonzero if the coding context CODING requires some kind of code |
a5ee738b KH |
582 | detection. */ |
583 | #define CODING_REQUIRE_DETECTION(coding) \ | |
584 | ((coding)->common_flags & CODING_REQUIRE_DETECTION_MASK) | |
585 | ||
96c06863 | 586 | /* Return nonzero if the coding context CODING requires code conversion on |
811ea086 | 587 | decoding or some kind of code detection. */ |
658cc252 | 588 | #define CODING_MAY_REQUIRE_DECODING(coding) \ |
811ea086 KH |
589 | (CODING_REQUIRE_DECODING (coding) \ |
590 | || CODING_REQUIRE_DETECTION (coding)) | |
4ed46869 | 591 | |
4ed46869 KH |
592 | /* Macros to decode or encode a character of JISX0208 in SJIS. CODE |
593 | packs two position-codes as a 16-bit value (1st in the high byte, | |
594 | 2nd in the low byte) and is overwritten in place. S1/S2 are the | |
595 | SJIS position-codes and J1/J2 the JIS ones. */ | |
596 | ||
df7492f9 KH |
597 | #define SJIS_TO_JIS(code) \ |
598 | do { \ | |
599 | int s1, s2, j1, j2; \ | |
600 | \ | |
601 | s1 = (code) >> 8, s2 = (code) & 0xFF; \ | |
602 | \ | |
603 | if (s2 >= 0x9F) \ | |
604 | (j1 = s1 * 2 - (s1 >= 0xE0 ? 0x160 : 0xE0), \ | |
605 | j2 = s2 - 0x7E); \ | |
606 | else \ | |
607 | (j1 = s1 * 2 - ((s1 >= 0xE0) ? 0x161 : 0xE1), \ | |
608 | j2 = s2 - ((s2 >= 0x7F) ? 0x20 : 0x1F)); \ | |
609 | (code) = (j1 << 8) | j2; \ | |
96c06863 | 610 | } while (false) |
4ed46869 | 611 | |
6e58724e KH |
612 | #define SJIS_TO_JIS2(code) \ |
613 | do { \ | |
614 | int s1, s2, j1, j2; \ | |
615 | \ | |
616 | s1 = (code) >> 8, s2 = (code) & 0xFF; \ | |
617 | \ | |
618 | if (s2 >= 0x9F) \ | |
619 | { \ | |
620 | j1 = (s1 == 0xF0 ? 0x28 \ | |
621 | : s1 == 0xF1 ? 0x24 \ | |
622 | : s1 == 0xF2 ? 0x2C \ | |
623 | : s1 == 0xF3 ? 0x2E \ | |
624 | : 0x6E + (s1 - 0xF4) * 2); \ | |
625 | j2 = s2 - 0x7E; \ | |
626 | } \ | |
627 | else \ | |
628 | { \ | |
629 | j1 = (s1 <= 0xF2 ? 0x21 + (s1 - 0xF0) * 2 \ | |
630 | : s1 <= 0xF4 ? 0x2D + (s1 - 0xF3) * 2 \ | |
631 | : 0x6F + (s1 - 0xF5) * 2); \ | |
632 | j2 = s2 - ((s2 >= 0x7F ? 0x20 : 0x1F)); \ | |
633 | } \ | |
634 | (code) = (j1 << 8) | j2; \ | |
96c06863 | 635 | } while (false) |
6e58724e | 636 | |
df7492f9 KH |
637 | |
638 | #define JIS_TO_SJIS(code) \ | |
4ed46869 | 639 | do { \ |
df7492f9 KH |
640 | int s1, s2, j1, j2; \ |
641 | \ | |
642 | j1 = (code) >> 8, j2 = (code) & 0xFF; \ | |
643 | if (j1 & 1) \ | |
644 | (s1 = j1 / 2 + ((j1 < 0x5F) ? 0x71 : 0xB1), \ | |
645 | s2 = j2 + ((j2 >= 0x60) ? 0x20 : 0x1F)); \ | |
4ed46869 | 646 | else \ |
df7492f9 KH |
647 | (s1 = j1 / 2 + ((j1 < 0x5F) ? 0x70 : 0xB0), \ |
648 | s2 = j2 + 0x7E); \ | |
5afaefc1 | 649 | (code) = (s1 << 8) | s2; \ |
96c06863 | 650 | } while (false) |
4ed46869 | 651 | |
6e58724e KH |
652 | #define JIS_TO_SJIS2(code) \ |
653 | do { \ | |
654 | int s1, s2, j1, j2; \ | |
655 | \ | |
656 | j1 = (code) >> 8, j2 = (code) & 0xFF; \ | |
657 | if (j1 & 1) \ | |
658 | { \ | |
659 | s1 = (j1 <= 0x25 ? 0xF0 + (j1 - 0x21) / 2 \ | |
edb61b39 | 660 | : j1 <= 0x2F ? 0xF3 + (j1 - 0x2D) / 2 \ |
6e58724e KH |
661 | : 0xF5 + (j1 - 0x6F) / 2); \ |
662 | s2 = j2 + ((j2 >= 0x60) ? 0x20 : 0x1F); \ | |
663 | } \ | |
664 | else \ | |
665 | { \ | |
666 | s1 = (j1 == 0x28 ? 0xF0 \ | |
667 | : j1 == 0x24 ? 0xF1 \ | |
668 | : j1 == 0x2C ? 0xF2 \ | |
669 | : j1 == 0x2E ? 0xF3 \ | |
670 | : 0xF4 + (j1 - 0x6E) / 2); \ | |
671 | s2 = j2 + 0x7E; \ | |
672 | } \ | |
673 | (code) = (s1 << 8) | s2; \ | |
96c06863 | 674 | } while (false) |
df7492f9 | 675 | |
290591c8 KH |
676 | /* Encode the file name NAME using the specified coding system |
677 | for file names, if any. */ | |
c3e9160b | 678 | #define ENCODE_FILE(NAME) encode_file_name (NAME) |
df7492f9 | 679 | |
290591c8 KH |
680 | /* Decode the file name NAME using the specified coding system |
681 | for file names, if any. */ | |
c3e9160b | 682 | #define DECODE_FILE(NAME) decode_file_name (NAME) |
df7492f9 | 683 | |
2dfda962 | 684 | /* Encode the string STR using the specified coding system |
53eda481 | 685 | for system functions, if any. */ |
2dfda962 | 686 | #define ENCODE_SYSTEM(str) \ |
9b58c683 | 687 | (! NILP (Vlocale_coding_system) \ |
96c06863 | 688 | ? code_convert_string_norecord (str, Vlocale_coding_system, true) \ |
2dfda962 JR |
689 | : str) |
690 | ||
691 | /* Decode the string STR using the specified coding system | |
53eda481 | 692 | for system functions, if any. */ |
581e7427 | 693 | #define DECODE_SYSTEM(str) \ |
9b58c683 | 694 | (! NILP (Vlocale_coding_system) \ |
96c06863 | 695 | ? code_convert_string_norecord (str, Vlocale_coding_system, false) \ |
2dfda962 | 696 | : str) |
cf29bf99 | 697 | |
5bbb4727 | 698 | /* Note that this encodes utf-8, not utf-8-emacs, so it's not a no-op. */ |
96c06863 | 699 | #define ENCODE_UTF_8(str) code_convert_string_norecord (str, Qutf_8, true) |
b3a208b0 | 700 | |
4ed46869 | 701 | /* Extern declarations. */ |
f10fe38f | 702 | extern Lisp_Object code_conversion_save (bool, bool); |
383e0970 J |
703 | extern void setup_coding_system (Lisp_Object, struct coding_system *); |
704 | extern Lisp_Object coding_charset_list (struct coding_system *); | |
705 | extern Lisp_Object coding_system_charset_list (Lisp_Object); | |
383e0970 | 706 | extern Lisp_Object code_convert_string (Lisp_Object, Lisp_Object, |
f10fe38f | 707 | Lisp_Object, bool, bool, bool); |
383e0970 | 708 | extern Lisp_Object code_convert_string_norecord (Lisp_Object, Lisp_Object, |
f10fe38f | 709 | bool); |
c3e9160b EZ |
710 | extern Lisp_Object encode_file_name (Lisp_Object); |
711 | extern Lisp_Object decode_file_name (Lisp_Object); | |
383e0970 J |
712 | extern Lisp_Object raw_text_coding_system (Lisp_Object); |
713 | extern Lisp_Object coding_inherit_eol_type (Lisp_Object, Lisp_Object); | |
4628bef1 | 714 | extern Lisp_Object complement_process_encoding_system (Lisp_Object); |
383e0970 | 715 | |
f10fe38f PE |
716 | extern void decode_coding_gap (struct coding_system *, |
717 | ptrdiff_t, ptrdiff_t); | |
383e0970 | 718 | extern void decode_coding_object (struct coding_system *, |
d311d28c PE |
719 | Lisp_Object, ptrdiff_t, ptrdiff_t, |
720 | ptrdiff_t, ptrdiff_t, Lisp_Object); | |
383e0970 | 721 | extern void encode_coding_object (struct coding_system *, |
d311d28c PE |
722 | Lisp_Object, ptrdiff_t, ptrdiff_t, |
723 | ptrdiff_t, ptrdiff_t, Lisp_Object); | |
df7492f9 | 724 | |
7f590b0c | 725 | #if defined (WINDOWSNT) || defined (CYGWIN) |
ba116008 DC |
726 | |
727 | /* These functions use Lisp string objects to store the UTF-16LE | |
728 | strings that modern versions of Windows expect. These strings are | |
729 | not particularly useful to Lisp, and all Lisp strings should be | |
730 | native Emacs multibyte. */ | |
731 | ||
732 | /* Access the wide-character string stored in a Lisp string object. */ | |
733 | #define WCSDATA(x) ((wchar_t *) SDATA (x)) | |
734 | ||
735 | /* Convert the multi-byte string in STR to UTF-16LE encoded unibyte | |
736 | string, and store it in *BUF. BUF may safely point to STR on entry. */ | |
737 | extern wchar_t *to_unicode (Lisp_Object str, Lisp_Object *buf); | |
738 | ||
739 | /* Convert STR, a UTF-16LE encoded string embedded in a unibyte string | |
740 | object, to a multi-byte Emacs string and return it. This function | |
741 | calls code_convert_string_norecord internally and has all its | |
742 | failure modes. STR itself is not modified. */ | |
743 | extern Lisp_Object from_unicode (Lisp_Object str); | |
744 | ||
819e2da9 | 745 | /* Convert WSTR to an Emacs string. */ |
faa52174 | 746 | extern Lisp_Object from_unicode_buffer (const wchar_t *wstr); |
819e2da9 | 747 | |
7f590b0c | 748 | #endif /* WINDOWSNT || CYGWIN */ |
ba116008 | 749 | |
933373ed KH |
750 | /* Macros for backward compatibility. */ |
751 | ||
df7492f9 | 752 | #define encode_coding_string(coding, string, nocopy) \ |
729eadda EZ |
753 | (STRING_MULTIBYTE(string) ? \ |
754 | (encode_coding_object (coding, string, 0, 0, SCHARS (string), \ | |
755 | SBYTES (string), Qt), \ | |
756 | (coding)->dst_object) : (string)) | |
df7492f9 KH |
757 | |
758 | ||
759 | #define decode_coding_c_string(coding, src, bytes, dst_object) \ | |
760 | do { \ | |
761 | (coding)->source = (src); \ | |
762 | (coding)->src_chars = (coding)->src_bytes = (bytes); \ | |
763 | decode_coding_object ((coding), Qnil, 0, 0, (bytes), (bytes), \ | |
764 | (dst_object)); \ | |
96c06863 | 765 | } while (false) |
df7492f9 KH |
766 | |
767 | ||
c532d349 | 768 | extern Lisp_Object preferred_coding_system (void); |
df7492f9 KH |
769 | |
770 | ||
8f924df7 KH |
771 | extern Lisp_Object Qutf_8, Qutf_8_emacs; |
772 | ||
955cbe7b | 773 | extern Lisp_Object Qcoding_category_index; |
df7492f9 KH |
774 | extern Lisp_Object Qcoding_system_p; |
775 | extern Lisp_Object Qraw_text, Qemacs_mule, Qno_conversion, Qundecided; | |
4ed46869 | 776 | extern Lisp_Object Qbuffer_file_coding_system; |
df7492f9 | 777 | |
84cc1ab6 | 778 | extern Lisp_Object Qunix, Qdos; |
4ed46869 | 779 | |
f967223b KH |
780 | extern Lisp_Object Qtranslation_table; |
781 | extern Lisp_Object Qtranslation_table_id; | |
ab45712c | 782 | |
4ed46869 KH |
783 | #ifdef emacs |
784 | extern Lisp_Object Qfile_coding_system; | |
387f6ba5 | 785 | extern Lisp_Object Qcall_process, Qcall_process_region; |
4ed46869 | 786 | extern Lisp_Object Qstart_process, Qopen_network_stream; |
d008a7cc | 787 | extern Lisp_Object Qwrite_region; |
4ed46869 | 788 | |
383e0970 | 789 | extern char *emacs_strerror (int); |
68c45bf0 | 790 | |
fbaa2ed9 KH |
791 | /* Coding system to be used to encode text for terminal display when |
792 | terminal coding system is nil. */ | |
793 | extern struct coding_system safe_terminal_coding; | |
794 | ||
4ed46869 KH |
795 | #endif |
796 | ||
d008a7cc GM |
797 | /* Error signaled when there's a problem with detecting a coding system. */ |
798 | extern Lisp_Object Qcoding_system_error; | |
799 | ||
df7492f9 | 800 | extern char emacs_mule_bytes[256]; |
df7492f9 | 801 | |
6f776e81 | 802 | #endif /* EMACS_CODING_H */ |