Merge from emacs--rel--22
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8 Copyright (C) 2003
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
11
12 This file is part of GNU Emacs.
13
14 GNU Emacs is free software; you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation; either version 3, or (at your option)
17 any later version.
18
19 GNU Emacs is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with GNU Emacs; see the file COPYING. If not, write to
26 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27 Boston, MA 02110-1301, USA. */
28
29 /*** TABLE OF CONTENTS ***
30
31 0. General comments
32 1. Preamble
33 2. Emacs' internal format (emacs-utf-8) handlers
34 3. UTF-8 handlers
35 4. UTF-16 handlers
36 5. Charset-base coding systems handlers
37 6. emacs-mule (old Emacs' internal format) handlers
38 7. ISO2022 handlers
39 8. Shift-JIS and BIG5 handlers
40 9. CCL handlers
41 10. C library functions
42 11. Emacs Lisp library functions
43 12. Postamble
44
45 */
46
47 /*** 0. General comments ***
48
49
50 CODING SYSTEM
51
52 A coding system is an object for an encoding mechanism that contains
53 information about how to convert byte sequences to character
54 sequences and vice versa. When we say "decode", it means converting
55 a byte sequence of a specific coding system into a character
56 sequence that is represented by Emacs' internal coding system
57 `emacs-utf-8', and when we say "encode", it means converting a
58 character sequence of emacs-utf-8 to a byte sequence of a specific
59 coding system.
60
61 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
62 C level, a coding system is represented by a vector of attributes
63 stored in the hash table Vcharset_hash_table. The conversion from
64 coding system symbol to attributes vector is done by looking up
65 Vcharset_hash_table by the symbol.
66
67 Coding systems are classified into the following types depending on
68 the encoding mechanism. Here's a brief description of the types.
69
70 o UTF-8
71
72 o UTF-16
73
74 o Charset-base coding system
75
76 A coding system defined by one or more (coded) character sets.
77 Decoding and encoding are done by a code converter defined for each
78 character set.
79
80 o Old Emacs internal format (emacs-mule)
81
82 The coding system adopted by old versions of Emacs (20 and 21).
83
84 o ISO2022-base coding system
85
86 The most famous coding system for multiple character sets. X's
87 Compound Text, various EUCs (Extended Unix Code), and coding systems
88 used in the Internet communication such as ISO-2022-JP are all
89 variants of ISO2022.
90
91 o SJIS (or Shift-JIS or MS-Kanji-Code)
92
93 A coding system to encode character sets: ASCII, JISX0201, and
94 JISX0208. Widely used for PC's in Japan. Details are described in
95 section 8.
96
97 o BIG5
98
99 A coding system to encode character sets: ASCII and Big5. Widely
100 used for Chinese (mainly in Taiwan and Hong Kong). Details are
101 described in section 8. In this file, when we write "big5" (all
102 lowercase), we mean the coding system, and when we write "Big5"
103 (capitalized), we mean the character set.
104
105 o CCL
106
107 If a user wants to decode/encode text encoded in a coding system
108 not listed above, he can supply a decoder and an encoder for it in
109 CCL (Code Conversion Language) programs. Emacs executes the CCL
110 program while decoding/encoding.
111
112 o Raw-text
113
114 A coding system for text containing raw eight-bit data. Emacs
115 treats each byte of source text as a character (except for
116 end-of-line conversion).
117
118 o No-conversion
119
120 Like raw text, but don't do end-of-line conversion.
121
122
123 END-OF-LINE FORMAT
124
125 How text end-of-line is encoded depends on operating system. For
126 instance, Unix's format is just one byte of LF (line-feed) code,
127 whereas DOS's format is two-byte sequence of `carriage-return' and
128 `line-feed' codes. MacOS's format is usually one byte of
129 `carriage-return'.
130
131 Since text character encoding and end-of-line encoding are
132 independent, any coding system described above can take any format
133 of end-of-line (except for no-conversion).
134
135 STRUCT CODING_SYSTEM
136
137 Before using a coding system for code conversion (i.e. decoding and
138 encoding), we setup a structure of type `struct coding_system'.
139 This structure keeps various information about a specific code
140 conversion (e.g. the location of source and destination data).
141
142 */
143
144 /* COMMON MACROS */
145
146
147 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
148
149 These functions check if a byte sequence specified as a source in
150 CODING conforms to the format of XXX, and update the members of
151 DETECT_INFO.
152
153 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
154
155 Below is the template of these functions. */
156
157 #if 0
158 static int
159 detect_coding_XXX (coding, detect_info)
160 struct coding_system *coding;
161 struct coding_detection_info *detect_info;
162 {
163 const unsigned char *src = coding->source;
164 const unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte;
166 int consumed_chars = 0;
167 int found = 0;
168 ...;
169
170 while (1)
171 {
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
174 ONE_MORE_BYTE (c);
175
176 if (! __C_conforms_to_XXX___ (c))
177 break;
178 if (! __C_strongly_suggests_XXX__ (c))
179 found = CATEGORY_MASK_XXX;
180 }
181 /* The byte sequence is invalid for XXX. */
182 detect_info->rejected |= CATEGORY_MASK_XXX;
183 return 0;
184
185 no_more_source:
186 /* The source exausted successfully. */
187 detect_info->found |= found;
188 return 1;
189 }
190 #endif
191
192 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
193
194 These functions decode a byte sequence specified as a source by
195 CODING. The resulting multibyte text goes to a place pointed to by
196 CODING->charbuf, the length of which should not exceed
197 CODING->charbuf_size;
198
199 These functions set the information of original and decoded texts in
200 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
201 They also set CODING->result to one of CODING_RESULT_XXX indicating
202 how the decoding is finished.
203
204 Below is the template of these functions. */
205
206 #if 0
207 static void
208 decode_coding_XXXX (coding)
209 struct coding_system *coding;
210 {
211 const unsigned char *src = coding->source + coding->consumed;
212 const unsigned char *src_end = coding->source + coding->src_bytes;
213 /* SRC_BASE remembers the start position in source in each loop.
214 The loop will be exited when there's not enough source code, or
215 when there's no room in CHARBUF for a decoded character. */
216 const unsigned char *src_base;
217 /* A buffer to produce decoded characters. */
218 int *charbuf = coding->charbuf + coding->charbuf_used;
219 int *charbuf_end = coding->charbuf + coding->charbuf_size;
220 int multibytep = coding->src_multibyte;
221
222 while (1)
223 {
224 src_base = src;
225 if (charbuf < charbuf_end)
226 /* No more room to produce a decoded character. */
227 break;
228 ONE_MORE_BYTE (c);
229 /* Decode it. */
230 }
231
232 no_more_source:
233 if (src_base < src_end
234 && coding->mode & CODING_MODE_LAST_BLOCK)
235 /* If the source ends by partial bytes to construct a character,
236 treat them as eight-bit raw data. */
237 while (src_base < src_end && charbuf < charbuf_end)
238 *charbuf++ = *src_base++;
239 /* Remember how many bytes and characters we consumed. If the
240 source is multibyte, the bytes and chars are not identical. */
241 coding->consumed = coding->consumed_char = src_base - coding->source;
242 /* Remember how many characters we produced. */
243 coding->charbuf_used = charbuf - coding->charbuf;
244 }
245 #endif
246
247 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
248
249 These functions encode SRC_BYTES length text at SOURCE of Emacs'
250 internal multibyte format by CODING. The resulting byte sequence
251 goes to a place pointed to by DESTINATION, the length of which
252 should not exceed DST_BYTES.
253
254 These functions set the information of original and encoded texts in
255 the members produced, produced_char, consumed, and consumed_char of
256 the structure *CODING. They also set the member result to one of
257 CODING_RESULT_XXX indicating how the encoding finished.
258
259 DST_BYTES zero means that source area and destination area are
260 overlapped, which means that we can produce a encoded text until it
261 reaches at the head of not-yet-encoded source text.
262
263 Below is a template of these functions. */
264 #if 0
265 static void
266 encode_coding_XXX (coding)
267 struct coding_system *coding;
268 {
269 int multibytep = coding->dst_multibyte;
270 int *charbuf = coding->charbuf;
271 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
272 unsigned char *dst = coding->destination + coding->produced;
273 unsigned char *dst_end = coding->destination + coding->dst_bytes;
274 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
275 int produced_chars = 0;
276
277 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
278 {
279 int c = *charbuf;
280 /* Encode C into DST, and increment DST. */
281 }
282 label_no_more_destination:
283 /* How many chars and bytes we produced. */
284 coding->produced_char += produced_chars;
285 coding->produced = dst - coding->destination;
286 }
287 #endif
288
289 \f
290 /*** 1. Preamble ***/
291
292 #include <config.h>
293 #include <stdio.h>
294
295 #include "lisp.h"
296 #include "buffer.h"
297 #include "character.h"
298 #include "charset.h"
299 #include "ccl.h"
300 #include "composite.h"
301 #include "coding.h"
302 #include "window.h"
303 #include "frame.h"
304 #include "termhooks.h"
305
306 Lisp_Object Vcoding_system_hash_table;
307
308 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
309 Lisp_Object Qunix, Qdos;
310 extern Lisp_Object Qmac; /* frame.c */
311 Lisp_Object Qbuffer_file_coding_system;
312 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
313 Lisp_Object Qdefault_char;
314 Lisp_Object Qno_conversion, Qundecided;
315 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
316 Lisp_Object Qbig, Qlittle;
317 Lisp_Object Qcoding_system_history;
318 Lisp_Object Qvalid_codes;
319 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
320 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
321 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
322 Lisp_Object QCascii_compatible_p;
323
324 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 Lisp_Object Qtarget_idx;
328
329 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
330 Lisp_Object Qinterrupted, Qinsufficient_memory;
331
332 extern Lisp_Object Qcompletion_ignore_case;
333
334 /* If a symbol has this property, evaluate the value to define the
335 symbol as a coding system. */
336 static Lisp_Object Qcoding_system_define_form;
337
338 int coding_system_require_warning;
339
340 Lisp_Object Vselect_safe_coding_system_function;
341
342 /* Mnemonic string for each format of end-of-line. */
343 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
344 /* Mnemonic string to indicate format of end-of-line is not yet
345 decided. */
346 Lisp_Object eol_mnemonic_undecided;
347
348 /* Format of end-of-line decided by system. This is Qunix on
349 Unix and Mac, Qdos on DOS/Windows.
350 This has an effect only for external encoding (i.e. for output to
351 file and process), not for in-buffer or Lisp string encoding. */
352 static Lisp_Object system_eol_type;
353
354 #ifdef emacs
355
356 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
357
358 Lisp_Object Qcoding_system_p, Qcoding_system_error;
359
360 /* Coding system emacs-mule and raw-text are for converting only
361 end-of-line format. */
362 Lisp_Object Qemacs_mule, Qraw_text;
363 Lisp_Object Qutf_8_emacs;
364
365 /* Coding-systems are handed between Emacs Lisp programs and C internal
366 routines by the following three variables. */
367 /* Coding-system for reading files and receiving data from process. */
368 Lisp_Object Vcoding_system_for_read;
369 /* Coding-system for writing files and sending data to process. */
370 Lisp_Object Vcoding_system_for_write;
371 /* Coding-system actually used in the latest I/O. */
372 Lisp_Object Vlast_coding_system_used;
373 /* Set to non-nil when an error is detected while code conversion. */
374 Lisp_Object Vlast_code_conversion_error;
375 /* A vector of length 256 which contains information about special
376 Latin codes (especially for dealing with Microsoft codes). */
377 Lisp_Object Vlatin_extra_code_table;
378
379 /* Flag to inhibit code conversion of end-of-line format. */
380 int inhibit_eol_conversion;
381
382 /* Flag to inhibit ISO2022 escape sequence detection. */
383 int inhibit_iso_escape_detection;
384
385 /* Flag to make buffer-file-coding-system inherit from process-coding. */
386 int inherit_process_coding_system;
387
388 /* Coding system to be used to encode text for terminal display when
389 terminal coding system is nil. */
390 struct coding_system safe_terminal_coding;
391
392 Lisp_Object Vfile_coding_system_alist;
393 Lisp_Object Vprocess_coding_system_alist;
394 Lisp_Object Vnetwork_coding_system_alist;
395
396 Lisp_Object Vlocale_coding_system;
397
398 #endif /* emacs */
399
400 /* Flag to tell if we look up translation table on character code
401 conversion. */
402 Lisp_Object Venable_character_translation;
403 /* Standard translation table to look up on decoding (reading). */
404 Lisp_Object Vstandard_translation_table_for_decode;
405 /* Standard translation table to look up on encoding (writing). */
406 Lisp_Object Vstandard_translation_table_for_encode;
407
408 Lisp_Object Qtranslation_table;
409 Lisp_Object Qtranslation_table_id;
410 Lisp_Object Qtranslation_table_for_decode;
411 Lisp_Object Qtranslation_table_for_encode;
412
413 /* Alist of charsets vs revision number. */
414 static Lisp_Object Vcharset_revision_table;
415
416 /* Default coding systems used for process I/O. */
417 Lisp_Object Vdefault_process_coding_system;
418
419 /* Char table for translating Quail and self-inserting input. */
420 Lisp_Object Vtranslation_table_for_input;
421
422 /* Two special coding systems. */
423 Lisp_Object Vsjis_coding_system;
424 Lisp_Object Vbig5_coding_system;
425
426 /* ISO2022 section */
427
428 #define CODING_ISO_INITIAL(coding, reg) \
429 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
430 coding_attr_iso_initial), \
431 reg)))
432
433
434 #define CODING_ISO_REQUEST(coding, charset_id) \
435 ((charset_id <= (coding)->max_charset_id \
436 ? (coding)->safe_charsets[charset_id] \
437 : -1))
438
439
440 #define CODING_ISO_FLAGS(coding) \
441 ((coding)->spec.iso_2022.flags)
442 #define CODING_ISO_DESIGNATION(coding, reg) \
443 ((coding)->spec.iso_2022.current_designation[reg])
444 #define CODING_ISO_INVOCATION(coding, plane) \
445 ((coding)->spec.iso_2022.current_invocation[plane])
446 #define CODING_ISO_SINGLE_SHIFTING(coding) \
447 ((coding)->spec.iso_2022.single_shifting)
448 #define CODING_ISO_BOL(coding) \
449 ((coding)->spec.iso_2022.bol)
450 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
451 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
452
453 /* Control characters of ISO2022. */
454 /* code */ /* function */
455 #define ISO_CODE_LF 0x0A /* line-feed */
456 #define ISO_CODE_CR 0x0D /* carriage-return */
457 #define ISO_CODE_SO 0x0E /* shift-out */
458 #define ISO_CODE_SI 0x0F /* shift-in */
459 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
460 #define ISO_CODE_ESC 0x1B /* escape */
461 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
462 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
463 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
464
465 /* All code (1-byte) of ISO2022 is classified into one of the
466 followings. */
467 enum iso_code_class_type
468 {
469 ISO_control_0, /* Control codes in the range
470 0x00..0x1F and 0x7F, except for the
471 following 5 codes. */
472 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
473 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
474 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
475 ISO_escape, /* ISO_CODE_SO (0x1B) */
476 ISO_control_1, /* Control codes in the range
477 0x80..0x9F, except for the
478 following 3 codes. */
479 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
480 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
481 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
482 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
483 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
484 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
485 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
486 };
487
488 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
489 `iso-flags' attribute of an iso2022 coding system. */
490
491 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
492 instead of the correct short-form sequence (e.g. ESC $ A). */
493 #define CODING_ISO_FLAG_LONG_FORM 0x0001
494
495 /* If set, reset graphic planes and registers at end-of-line to the
496 initial state. */
497 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
498
499 /* If set, reset graphic planes and registers before any control
500 characters to the initial state. */
501 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
502
503 /* If set, encode by 7-bit environment. */
504 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
505
506 /* If set, use locking-shift function. */
507 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
508
509 /* If set, use single-shift function. Overwrite
510 CODING_ISO_FLAG_LOCKING_SHIFT. */
511 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
512
513 /* If set, use designation escape sequence. */
514 #define CODING_ISO_FLAG_DESIGNATION 0x0040
515
516 /* If set, produce revision number sequence. */
517 #define CODING_ISO_FLAG_REVISION 0x0080
518
519 /* If set, produce ISO6429's direction specifying sequence. */
520 #define CODING_ISO_FLAG_DIRECTION 0x0100
521
522 /* If set, assume designation states are reset at beginning of line on
523 output. */
524 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
525
526 /* If set, designation sequence should be placed at beginning of line
527 on output. */
528 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
529
530 /* If set, do not encode unsafe charactes on output. */
531 #define CODING_ISO_FLAG_SAFE 0x0800
532
533 /* If set, extra latin codes (128..159) are accepted as a valid code
534 on input. */
535 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
536
537 #define CODING_ISO_FLAG_COMPOSITION 0x2000
538
539 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
540
541 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
542
543 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
544
545 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
546
547 /* A character to be produced on output if encoding of the original
548 character is prohibited by CODING_ISO_FLAG_SAFE. */
549 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
550
551
552 /* UTF-16 section */
553 #define CODING_UTF_16_BOM(coding) \
554 ((coding)->spec.utf_16.bom)
555
556 #define CODING_UTF_16_ENDIAN(coding) \
557 ((coding)->spec.utf_16.endian)
558
559 #define CODING_UTF_16_SURROGATE(coding) \
560 ((coding)->spec.utf_16.surrogate)
561
562
563 /* CCL section */
564 #define CODING_CCL_DECODER(coding) \
565 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
566 #define CODING_CCL_ENCODER(coding) \
567 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
568 #define CODING_CCL_VALIDS(coding) \
569 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
570
571 /* Index for each coding category in `coding_categories' */
572
573 enum coding_category
574 {
575 coding_category_iso_7,
576 coding_category_iso_7_tight,
577 coding_category_iso_8_1,
578 coding_category_iso_8_2,
579 coding_category_iso_7_else,
580 coding_category_iso_8_else,
581 coding_category_utf_8,
582 coding_category_utf_16_auto,
583 coding_category_utf_16_be,
584 coding_category_utf_16_le,
585 coding_category_utf_16_be_nosig,
586 coding_category_utf_16_le_nosig,
587 coding_category_charset,
588 coding_category_sjis,
589 coding_category_big5,
590 coding_category_ccl,
591 coding_category_emacs_mule,
592 /* All above are targets of code detection. */
593 coding_category_raw_text,
594 coding_category_undecided,
595 coding_category_max
596 };
597
598 /* Definitions of flag bits used in detect_coding_XXXX. */
599 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
600 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
601 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
602 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
603 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
604 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
605 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
606 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
607 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
608 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
609 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
610 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
611 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
612 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
613 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
614 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
615 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
616 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
617
618 /* This value is returned if detect_coding_mask () find nothing other
619 than ASCII characters. */
620 #define CATEGORY_MASK_ANY \
621 (CATEGORY_MASK_ISO_7 \
622 | CATEGORY_MASK_ISO_7_TIGHT \
623 | CATEGORY_MASK_ISO_8_1 \
624 | CATEGORY_MASK_ISO_8_2 \
625 | CATEGORY_MASK_ISO_7_ELSE \
626 | CATEGORY_MASK_ISO_8_ELSE \
627 | CATEGORY_MASK_UTF_8 \
628 | CATEGORY_MASK_UTF_16_BE \
629 | CATEGORY_MASK_UTF_16_LE \
630 | CATEGORY_MASK_UTF_16_BE_NOSIG \
631 | CATEGORY_MASK_UTF_16_LE_NOSIG \
632 | CATEGORY_MASK_CHARSET \
633 | CATEGORY_MASK_SJIS \
634 | CATEGORY_MASK_BIG5 \
635 | CATEGORY_MASK_CCL \
636 | CATEGORY_MASK_EMACS_MULE)
637
638
639 #define CATEGORY_MASK_ISO_7BIT \
640 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
641
642 #define CATEGORY_MASK_ISO_8BIT \
643 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
644
645 #define CATEGORY_MASK_ISO_ELSE \
646 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
647
648 #define CATEGORY_MASK_ISO_ESCAPE \
649 (CATEGORY_MASK_ISO_7 \
650 | CATEGORY_MASK_ISO_7_TIGHT \
651 | CATEGORY_MASK_ISO_7_ELSE \
652 | CATEGORY_MASK_ISO_8_ELSE)
653
654 #define CATEGORY_MASK_ISO \
655 ( CATEGORY_MASK_ISO_7BIT \
656 | CATEGORY_MASK_ISO_8BIT \
657 | CATEGORY_MASK_ISO_ELSE)
658
659 #define CATEGORY_MASK_UTF_16 \
660 (CATEGORY_MASK_UTF_16_BE \
661 | CATEGORY_MASK_UTF_16_LE \
662 | CATEGORY_MASK_UTF_16_BE_NOSIG \
663 | CATEGORY_MASK_UTF_16_LE_NOSIG)
664
665
666 /* List of symbols `coding-category-xxx' ordered by priority. This
667 variable is exposed to Emacs Lisp. */
668 static Lisp_Object Vcoding_category_list;
669
670 /* Table of coding categories (Lisp symbols). This variable is for
671 internal use oly. */
672 static Lisp_Object Vcoding_category_table;
673
674 /* Table of coding-categories ordered by priority. */
675 static enum coding_category coding_priorities[coding_category_max];
676
677 /* Nth element is a coding context for the coding system bound to the
678 Nth coding category. */
679 static struct coding_system coding_categories[coding_category_max];
680
681 /*** Commonly used macros and functions ***/
682
683 #ifndef min
684 #define min(a, b) ((a) < (b) ? (a) : (b))
685 #endif
686 #ifndef max
687 #define max(a, b) ((a) > (b) ? (a) : (b))
688 #endif
689
690 #define CODING_GET_INFO(coding, attrs, charset_list) \
691 do { \
692 (attrs) = CODING_ID_ATTRS ((coding)->id); \
693 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
694 } while (0)
695
696
697 /* Safely get one byte from the source text pointed by SRC which ends
698 at SRC_END, and set C to that byte. If there are not enough bytes
699 in the source, it jumps to `no_more_source'. If multibytep is
700 nonzero, and a multibyte character is found at SRC, set C to the
701 negative value of the character code. The caller should declare
702 and set these variables appropriately in advance:
703 src, src_end, multibytep */
704
705 #define ONE_MORE_BYTE(c) \
706 do { \
707 if (src == src_end) \
708 { \
709 if (src_base < src) \
710 record_conversion_result \
711 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
712 goto no_more_source; \
713 } \
714 c = *src++; \
715 if (multibytep && (c & 0x80)) \
716 { \
717 if ((c & 0xFE) == 0xC0) \
718 c = ((c & 1) << 6) | *src++; \
719 else \
720 { \
721 src--; \
722 c = - string_char (src, &src, NULL); \
723 record_conversion_result \
724 (coding, CODING_RESULT_INVALID_SRC); \
725 } \
726 } \
727 consumed_chars++; \
728 } while (0)
729
730
731 #define ONE_MORE_BYTE_NO_CHECK(c) \
732 do { \
733 c = *src++; \
734 if (multibytep && (c & 0x80)) \
735 { \
736 if ((c & 0xFE) == 0xC0) \
737 c = ((c & 1) << 6) | *src++; \
738 else \
739 { \
740 src--; \
741 c = - string_char (src, &src, NULL); \
742 record_conversion_result \
743 (coding, CODING_RESULT_INVALID_SRC); \
744 } \
745 } \
746 consumed_chars++; \
747 } while (0)
748
749
750 /* Store a byte C in the place pointed by DST and increment DST to the
751 next free point, and increment PRODUCED_CHARS. The caller should
752 assure that C is 0..127, and declare and set the variable `dst'
753 appropriately in advance.
754 */
755
756
757 #define EMIT_ONE_ASCII_BYTE(c) \
758 do { \
759 produced_chars++; \
760 *dst++ = (c); \
761 } while (0)
762
763
764 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
765
766 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
767 do { \
768 produced_chars += 2; \
769 *dst++ = (c1), *dst++ = (c2); \
770 } while (0)
771
772
773 /* Store a byte C in the place pointed by DST and increment DST to the
774 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
775 nonzero, store in an appropriate multibyte from. The caller should
776 declare and set the variables `dst' and `multibytep' appropriately
777 in advance. */
778
779 #define EMIT_ONE_BYTE(c) \
780 do { \
781 produced_chars++; \
782 if (multibytep) \
783 { \
784 int ch = (c); \
785 if (ch >= 0x80) \
786 ch = BYTE8_TO_CHAR (ch); \
787 CHAR_STRING_ADVANCE (ch, dst); \
788 } \
789 else \
790 *dst++ = (c); \
791 } while (0)
792
793
794 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
795
796 #define EMIT_TWO_BYTES(c1, c2) \
797 do { \
798 produced_chars += 2; \
799 if (multibytep) \
800 { \
801 int ch; \
802 \
803 ch = (c1); \
804 if (ch >= 0x80) \
805 ch = BYTE8_TO_CHAR (ch); \
806 CHAR_STRING_ADVANCE (ch, dst); \
807 ch = (c2); \
808 if (ch >= 0x80) \
809 ch = BYTE8_TO_CHAR (ch); \
810 CHAR_STRING_ADVANCE (ch, dst); \
811 } \
812 else \
813 { \
814 *dst++ = (c1); \
815 *dst++ = (c2); \
816 } \
817 } while (0)
818
819
820 #define EMIT_THREE_BYTES(c1, c2, c3) \
821 do { \
822 EMIT_ONE_BYTE (c1); \
823 EMIT_TWO_BYTES (c2, c3); \
824 } while (0)
825
826
827 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
828 do { \
829 EMIT_TWO_BYTES (c1, c2); \
830 EMIT_TWO_BYTES (c3, c4); \
831 } while (0)
832
833
834 /* Prototypes for static functions. */
835 static void record_conversion_result P_ ((struct coding_system *coding,
836 enum coding_result_code result));
837 static int detect_coding_utf_8 P_ ((struct coding_system *,
838 struct coding_detection_info *info));
839 static void decode_coding_utf_8 P_ ((struct coding_system *));
840 static int encode_coding_utf_8 P_ ((struct coding_system *));
841
842 static int detect_coding_utf_16 P_ ((struct coding_system *,
843 struct coding_detection_info *info));
844 static void decode_coding_utf_16 P_ ((struct coding_system *));
845 static int encode_coding_utf_16 P_ ((struct coding_system *));
846
847 static int detect_coding_iso_2022 P_ ((struct coding_system *,
848 struct coding_detection_info *info));
849 static void decode_coding_iso_2022 P_ ((struct coding_system *));
850 static int encode_coding_iso_2022 P_ ((struct coding_system *));
851
852 static int detect_coding_emacs_mule P_ ((struct coding_system *,
853 struct coding_detection_info *info));
854 static void decode_coding_emacs_mule P_ ((struct coding_system *));
855 static int encode_coding_emacs_mule P_ ((struct coding_system *));
856
857 static int detect_coding_sjis P_ ((struct coding_system *,
858 struct coding_detection_info *info));
859 static void decode_coding_sjis P_ ((struct coding_system *));
860 static int encode_coding_sjis P_ ((struct coding_system *));
861
862 static int detect_coding_big5 P_ ((struct coding_system *,
863 struct coding_detection_info *info));
864 static void decode_coding_big5 P_ ((struct coding_system *));
865 static int encode_coding_big5 P_ ((struct coding_system *));
866
867 static int detect_coding_ccl P_ ((struct coding_system *,
868 struct coding_detection_info *info));
869 static void decode_coding_ccl P_ ((struct coding_system *));
870 static int encode_coding_ccl P_ ((struct coding_system *));
871
872 static void decode_coding_raw_text P_ ((struct coding_system *));
873 static int encode_coding_raw_text P_ ((struct coding_system *));
874
875 static void coding_set_source P_ ((struct coding_system *));
876 static void coding_set_destination P_ ((struct coding_system *));
877 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
878 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
879 EMACS_INT, EMACS_INT));
880 static unsigned char *alloc_destination P_ ((struct coding_system *,
881 EMACS_INT, unsigned char *));
882 static void setup_iso_safe_charsets P_ ((Lisp_Object));
883 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
884 int *, int *,
885 unsigned char *));
886 static int detect_eol P_ ((const unsigned char *,
887 EMACS_INT, enum coding_category));
888 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
889 static void decode_eol P_ ((struct coding_system *));
890 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
891 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
892 int, int *, int *));
893 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
894 static INLINE void produce_composition P_ ((struct coding_system *, int *,
895 EMACS_INT));
896 static INLINE void produce_charset P_ ((struct coding_system *, int *,
897 EMACS_INT));
898 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
899 static int decode_coding P_ ((struct coding_system *));
900 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
901 struct coding_system *,
902 int *, EMACS_INT *));
903 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
904 struct coding_system *,
905 int *, EMACS_INT *));
906 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
907 static int encode_coding P_ ((struct coding_system *));
908 static Lisp_Object make_conversion_work_buffer P_ ((int));
909 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
910 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
911 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
912
913 static void
914 record_conversion_result (struct coding_system *coding,
915 enum coding_result_code result)
916 {
917 coding->result = result;
918 switch (result)
919 {
920 case CODING_RESULT_INSUFFICIENT_SRC:
921 Vlast_code_conversion_error = Qinsufficient_source;
922 break;
923 case CODING_RESULT_INCONSISTENT_EOL:
924 Vlast_code_conversion_error = Qinconsistent_eol;
925 break;
926 case CODING_RESULT_INVALID_SRC:
927 Vlast_code_conversion_error = Qinvalid_source;
928 break;
929 case CODING_RESULT_INTERRUPT:
930 Vlast_code_conversion_error = Qinterrupted;
931 break;
932 case CODING_RESULT_INSUFFICIENT_MEM:
933 Vlast_code_conversion_error = Qinsufficient_memory;
934 break;
935 default:
936 Vlast_code_conversion_error = intern ("Unknown error");
937 }
938 }
939
940 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
941 do { \
942 charset_map_loaded = 0; \
943 c = DECODE_CHAR (charset, code); \
944 if (charset_map_loaded) \
945 { \
946 const unsigned char *orig = coding->source; \
947 EMACS_INT offset; \
948 \
949 coding_set_source (coding); \
950 offset = coding->source - orig; \
951 src += offset; \
952 src_base += offset; \
953 src_end += offset; \
954 } \
955 } while (0)
956
957
958 /* If there are at least BYTES length of room at dst, allocate memory
959 for coding->destination and update dst and dst_end. We don't have
960 to take care of coding->source which will be relocated. It is
961 handled by calling coding_set_source in encode_coding. */
962
963 #define ASSURE_DESTINATION(bytes) \
964 do { \
965 if (dst + (bytes) >= dst_end) \
966 { \
967 int more_bytes = charbuf_end - charbuf + (bytes); \
968 \
969 dst = alloc_destination (coding, more_bytes, dst); \
970 dst_end = coding->destination + coding->dst_bytes; \
971 } \
972 } while (0)
973
974
975
976 static void
977 coding_set_source (coding)
978 struct coding_system *coding;
979 {
980 if (BUFFERP (coding->src_object))
981 {
982 struct buffer *buf = XBUFFER (coding->src_object);
983
984 if (coding->src_pos < 0)
985 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
986 else
987 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
988 }
989 else if (STRINGP (coding->src_object))
990 {
991 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
992 }
993 else
994 /* Otherwise, the source is C string and is never relocated
995 automatically. Thus we don't have to update anything. */
996 ;
997 }
998
999 static void
1000 coding_set_destination (coding)
1001 struct coding_system *coding;
1002 {
1003 if (BUFFERP (coding->dst_object))
1004 {
1005 if (coding->src_pos < 0)
1006 {
1007 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1008 coding->dst_bytes = (GAP_END_ADDR
1009 - (coding->src_bytes - coding->consumed)
1010 - coding->destination);
1011 }
1012 else
1013 {
1014 /* We are sure that coding->dst_pos_byte is before the gap
1015 of the buffer. */
1016 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1017 + coding->dst_pos_byte - BEG_BYTE);
1018 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1019 - coding->destination);
1020 }
1021 }
1022 else
1023 /* Otherwise, the destination is C string and is never relocated
1024 automatically. Thus we don't have to update anything. */
1025 ;
1026 }
1027
1028
1029 static void
1030 coding_alloc_by_realloc (coding, bytes)
1031 struct coding_system *coding;
1032 EMACS_INT bytes;
1033 {
1034 coding->destination = (unsigned char *) xrealloc (coding->destination,
1035 coding->dst_bytes + bytes);
1036 coding->dst_bytes += bytes;
1037 }
1038
1039 static void
1040 coding_alloc_by_making_gap (coding, offset, bytes)
1041 struct coding_system *coding;
1042 EMACS_INT offset, bytes;
1043 {
1044 if (BUFFERP (coding->dst_object)
1045 && EQ (coding->src_object, coding->dst_object))
1046 {
1047 EMACS_INT add = offset + (coding->src_bytes - coding->consumed);
1048
1049 GPT += offset, GPT_BYTE += offset;
1050 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1051 make_gap (bytes);
1052 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1053 GPT -= offset, GPT_BYTE -= offset;
1054 }
1055 else
1056 {
1057 Lisp_Object this_buffer;
1058
1059 this_buffer = Fcurrent_buffer ();
1060 set_buffer_internal (XBUFFER (coding->dst_object));
1061 make_gap (bytes);
1062 set_buffer_internal (XBUFFER (this_buffer));
1063 }
1064 }
1065
1066
1067 static unsigned char *
1068 alloc_destination (coding, nbytes, dst)
1069 struct coding_system *coding;
1070 EMACS_INT nbytes;
1071 unsigned char *dst;
1072 {
1073 EMACS_INT offset = dst - coding->destination;
1074
1075 if (BUFFERP (coding->dst_object))
1076 coding_alloc_by_making_gap (coding, offset, nbytes);
1077 else
1078 coding_alloc_by_realloc (coding, nbytes);
1079 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1080 coding_set_destination (coding);
1081 dst = coding->destination + offset;
1082 return dst;
1083 }
1084
1085 /** Macros for annotations. */
1086
1087 /* Maximum length of annotation data (sum of annotations for
1088 composition and charset). */
1089 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1090
1091 /* An annotation data is stored in the array coding->charbuf in this
1092 format:
1093 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1094 LENGTH is the number of elements in the annotation.
1095 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1096 NCHARS is the number of characters in the text annotated.
1097
1098 The format of the following elements depend on ANNOTATION_MASK.
1099
1100 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1101 follows:
1102 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1103 METHOD is one of enum composition_method.
1104 Optionnal COMPOSITION-COMPONENTS are characters and composition
1105 rules.
1106
1107 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1108 follows. */
1109
1110 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1111 do { \
1112 *(buf)++ = -(len); \
1113 *(buf)++ = (mask); \
1114 *(buf)++ = (nchars); \
1115 coding->annotated = 1; \
1116 } while (0);
1117
1118 #define ADD_COMPOSITION_DATA(buf, nchars, method) \
1119 do { \
1120 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1121 *buf++ = method; \
1122 } while (0)
1123
1124
1125 #define ADD_CHARSET_DATA(buf, nchars, id) \
1126 do { \
1127 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1128 *buf++ = id; \
1129 } while (0)
1130
1131 \f
1132 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1133
1134
1135
1136 \f
1137 /*** 3. UTF-8 ***/
1138
1139 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1140 Check if a text is encoded in UTF-8. If it is, return 1, else
1141 return 0. */
1142
1143 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1144 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1145 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1146 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1147 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1148 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1149
1150 static int
1151 detect_coding_utf_8 (coding, detect_info)
1152 struct coding_system *coding;
1153 struct coding_detection_info *detect_info;
1154 {
1155 const unsigned char *src = coding->source, *src_base;
1156 const unsigned char *src_end = coding->source + coding->src_bytes;
1157 int multibytep = coding->src_multibyte;
1158 int consumed_chars = 0;
1159 int found = 0;
1160
1161 detect_info->checked |= CATEGORY_MASK_UTF_8;
1162 /* A coding system of this category is always ASCII compatible. */
1163 src += coding->head_ascii;
1164
1165 while (1)
1166 {
1167 int c, c1, c2, c3, c4;
1168
1169 src_base = src;
1170 ONE_MORE_BYTE (c);
1171 if (c < 0 || UTF_8_1_OCTET_P (c))
1172 continue;
1173 ONE_MORE_BYTE (c1);
1174 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1175 break;
1176 if (UTF_8_2_OCTET_LEADING_P (c))
1177 {
1178 found = CATEGORY_MASK_UTF_8;
1179 continue;
1180 }
1181 ONE_MORE_BYTE (c2);
1182 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1183 break;
1184 if (UTF_8_3_OCTET_LEADING_P (c))
1185 {
1186 found = CATEGORY_MASK_UTF_8;
1187 continue;
1188 }
1189 ONE_MORE_BYTE (c3);
1190 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1191 break;
1192 if (UTF_8_4_OCTET_LEADING_P (c))
1193 {
1194 found = CATEGORY_MASK_UTF_8;
1195 continue;
1196 }
1197 ONE_MORE_BYTE (c4);
1198 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1199 break;
1200 if (UTF_8_5_OCTET_LEADING_P (c))
1201 {
1202 found = CATEGORY_MASK_UTF_8;
1203 continue;
1204 }
1205 break;
1206 }
1207 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1208 return 0;
1209
1210 no_more_source:
1211 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1212 {
1213 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1214 return 0;
1215 }
1216 detect_info->found |= found;
1217 return 1;
1218 }
1219
1220
1221 static void
1222 decode_coding_utf_8 (coding)
1223 struct coding_system *coding;
1224 {
1225 const unsigned char *src = coding->source + coding->consumed;
1226 const unsigned char *src_end = coding->source + coding->src_bytes;
1227 const unsigned char *src_base;
1228 int *charbuf = coding->charbuf + coding->charbuf_used;
1229 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1230 int consumed_chars = 0, consumed_chars_base;
1231 int multibytep = coding->src_multibyte;
1232 Lisp_Object attr, charset_list;
1233 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1234 int byte_after_cr = -1;
1235
1236 CODING_GET_INFO (coding, attr, charset_list);
1237
1238 while (1)
1239 {
1240 int c, c1, c2, c3, c4, c5;
1241
1242 src_base = src;
1243 consumed_chars_base = consumed_chars;
1244
1245 if (charbuf >= charbuf_end)
1246 break;
1247
1248 if (byte_after_cr >= 0)
1249 c1 = byte_after_cr, byte_after_cr = -1;
1250 else
1251 ONE_MORE_BYTE (c1);
1252 if (c1 < 0)
1253 {
1254 c = - c1;
1255 }
1256 else if (UTF_8_1_OCTET_P(c1))
1257 {
1258 if (eol_crlf && c1 == '\r')
1259 ONE_MORE_BYTE (byte_after_cr);
1260 c = c1;
1261 }
1262 else
1263 {
1264 ONE_MORE_BYTE (c2);
1265 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1266 goto invalid_code;
1267 if (UTF_8_2_OCTET_LEADING_P (c1))
1268 {
1269 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1270 /* Reject overlong sequences here and below. Encoders
1271 producing them are incorrect, they can be misleading,
1272 and they mess up read/write invariance. */
1273 if (c < 128)
1274 goto invalid_code;
1275 }
1276 else
1277 {
1278 ONE_MORE_BYTE (c3);
1279 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1280 goto invalid_code;
1281 if (UTF_8_3_OCTET_LEADING_P (c1))
1282 {
1283 c = (((c1 & 0xF) << 12)
1284 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1285 if (c < 0x800
1286 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1287 goto invalid_code;
1288 }
1289 else
1290 {
1291 ONE_MORE_BYTE (c4);
1292 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1293 goto invalid_code;
1294 if (UTF_8_4_OCTET_LEADING_P (c1))
1295 {
1296 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1297 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1298 if (c < 0x10000)
1299 goto invalid_code;
1300 }
1301 else
1302 {
1303 ONE_MORE_BYTE (c5);
1304 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1305 goto invalid_code;
1306 if (UTF_8_5_OCTET_LEADING_P (c1))
1307 {
1308 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1309 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1310 | (c5 & 0x3F));
1311 if ((c > MAX_CHAR) || (c < 0x200000))
1312 goto invalid_code;
1313 }
1314 else
1315 goto invalid_code;
1316 }
1317 }
1318 }
1319 }
1320
1321 *charbuf++ = c;
1322 continue;
1323
1324 invalid_code:
1325 src = src_base;
1326 consumed_chars = consumed_chars_base;
1327 ONE_MORE_BYTE (c);
1328 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1329 coding->errors++;
1330 }
1331
1332 no_more_source:
1333 coding->consumed_char += consumed_chars_base;
1334 coding->consumed = src_base - coding->source;
1335 coding->charbuf_used = charbuf - coding->charbuf;
1336 }
1337
1338
1339 static int
1340 encode_coding_utf_8 (coding)
1341 struct coding_system *coding;
1342 {
1343 int multibytep = coding->dst_multibyte;
1344 int *charbuf = coding->charbuf;
1345 int *charbuf_end = charbuf + coding->charbuf_used;
1346 unsigned char *dst = coding->destination + coding->produced;
1347 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1348 int produced_chars = 0;
1349 int c;
1350
1351 if (multibytep)
1352 {
1353 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1354
1355 while (charbuf < charbuf_end)
1356 {
1357 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1358
1359 ASSURE_DESTINATION (safe_room);
1360 c = *charbuf++;
1361 if (CHAR_BYTE8_P (c))
1362 {
1363 c = CHAR_TO_BYTE8 (c);
1364 EMIT_ONE_BYTE (c);
1365 }
1366 else
1367 {
1368 CHAR_STRING_ADVANCE (c, pend);
1369 for (p = str; p < pend; p++)
1370 EMIT_ONE_BYTE (*p);
1371 }
1372 }
1373 }
1374 else
1375 {
1376 int safe_room = MAX_MULTIBYTE_LENGTH;
1377
1378 while (charbuf < charbuf_end)
1379 {
1380 ASSURE_DESTINATION (safe_room);
1381 c = *charbuf++;
1382 if (CHAR_BYTE8_P (c))
1383 *dst++ = CHAR_TO_BYTE8 (c);
1384 else
1385 dst += CHAR_STRING (c, dst);
1386 produced_chars++;
1387 }
1388 }
1389 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1390 coding->produced_char += produced_chars;
1391 coding->produced = dst - coding->destination;
1392 return 0;
1393 }
1394
1395
1396 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1397 Check if a text is encoded in one of UTF-16 based coding systems.
1398 If it is, return 1, else return 0. */
1399
1400 #define UTF_16_HIGH_SURROGATE_P(val) \
1401 (((val) & 0xFC00) == 0xD800)
1402
1403 #define UTF_16_LOW_SURROGATE_P(val) \
1404 (((val) & 0xFC00) == 0xDC00)
1405
1406 #define UTF_16_INVALID_P(val) \
1407 (((val) == 0xFFFE) \
1408 || ((val) == 0xFFFF) \
1409 || UTF_16_LOW_SURROGATE_P (val))
1410
1411
1412 static int
1413 detect_coding_utf_16 (coding, detect_info)
1414 struct coding_system *coding;
1415 struct coding_detection_info *detect_info;
1416 {
1417 const unsigned char *src = coding->source, *src_base = src;
1418 const unsigned char *src_end = coding->source + coding->src_bytes;
1419 int multibytep = coding->src_multibyte;
1420 int consumed_chars = 0;
1421 int c1, c2;
1422
1423 detect_info->checked |= CATEGORY_MASK_UTF_16;
1424 if (coding->mode & CODING_MODE_LAST_BLOCK
1425 && (coding->src_chars & 1))
1426 {
1427 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1428 return 0;
1429 }
1430
1431 ONE_MORE_BYTE (c1);
1432 ONE_MORE_BYTE (c2);
1433 if ((c1 == 0xFF) && (c2 == 0xFE))
1434 {
1435 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1436 | CATEGORY_MASK_UTF_16_AUTO);
1437 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1438 | CATEGORY_MASK_UTF_16_BE_NOSIG
1439 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1440 }
1441 else if ((c1 == 0xFE) && (c2 == 0xFF))
1442 {
1443 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1444 | CATEGORY_MASK_UTF_16_AUTO);
1445 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1446 | CATEGORY_MASK_UTF_16_BE_NOSIG
1447 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1448 }
1449 else if (c1 >= 0 && c2 >= 0)
1450 {
1451 detect_info->rejected
1452 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1453 }
1454 no_more_source:
1455 return 1;
1456 }
1457
1458 static void
1459 decode_coding_utf_16 (coding)
1460 struct coding_system *coding;
1461 {
1462 const unsigned char *src = coding->source + coding->consumed;
1463 const unsigned char *src_end = coding->source + coding->src_bytes;
1464 const unsigned char *src_base;
1465 int *charbuf = coding->charbuf + coding->charbuf_used;
1466 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1467 int consumed_chars = 0, consumed_chars_base;
1468 int multibytep = coding->src_multibyte;
1469 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1470 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1471 int surrogate = CODING_UTF_16_SURROGATE (coding);
1472 Lisp_Object attr, charset_list;
1473 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1474 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1475
1476 CODING_GET_INFO (coding, attr, charset_list);
1477
1478 if (bom == utf_16_with_bom)
1479 {
1480 int c, c1, c2;
1481
1482 src_base = src;
1483 ONE_MORE_BYTE (c1);
1484 ONE_MORE_BYTE (c2);
1485 c = (c1 << 8) | c2;
1486
1487 if (endian == utf_16_big_endian
1488 ? c != 0xFEFF : c != 0xFFFE)
1489 {
1490 /* The first two bytes are not BOM. Treat them as bytes
1491 for a normal character. */
1492 src = src_base;
1493 coding->errors++;
1494 }
1495 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1496 }
1497 else if (bom == utf_16_detect_bom)
1498 {
1499 /* We have already tried to detect BOM and failed in
1500 detect_coding. */
1501 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1502 }
1503
1504 while (1)
1505 {
1506 int c, c1, c2;
1507
1508 src_base = src;
1509 consumed_chars_base = consumed_chars;
1510
1511 if (charbuf + 2 >= charbuf_end)
1512 break;
1513
1514 if (byte_after_cr1 >= 0)
1515 c1 = byte_after_cr1, byte_after_cr1 = -1;
1516 else
1517 ONE_MORE_BYTE (c1);
1518 if (c1 < 0)
1519 {
1520 *charbuf++ = -c1;
1521 continue;
1522 }
1523 if (byte_after_cr2 >= 0)
1524 c2 = byte_after_cr2, byte_after_cr2 = -1;
1525 else
1526 ONE_MORE_BYTE (c2);
1527 if (c2 < 0)
1528 {
1529 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1530 *charbuf++ = -c2;
1531 continue;
1532 }
1533 c = (endian == utf_16_big_endian
1534 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1535
1536 if (surrogate)
1537 {
1538 if (! UTF_16_LOW_SURROGATE_P (c))
1539 {
1540 if (endian == utf_16_big_endian)
1541 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1542 else
1543 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1544 *charbuf++ = c1;
1545 *charbuf++ = c2;
1546 coding->errors++;
1547 if (UTF_16_HIGH_SURROGATE_P (c))
1548 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1549 else
1550 *charbuf++ = c;
1551 }
1552 else
1553 {
1554 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1555 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1556 *charbuf++ = 0x10000 + c;
1557 }
1558 }
1559 else
1560 {
1561 if (UTF_16_HIGH_SURROGATE_P (c))
1562 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1563 else
1564 {
1565 if (eol_crlf && c == '\r')
1566 {
1567 ONE_MORE_BYTE (byte_after_cr1);
1568 ONE_MORE_BYTE (byte_after_cr2);
1569 }
1570 *charbuf++ = c;
1571 }
1572 }
1573 }
1574
1575 no_more_source:
1576 coding->consumed_char += consumed_chars_base;
1577 coding->consumed = src_base - coding->source;
1578 coding->charbuf_used = charbuf - coding->charbuf;
1579 }
1580
1581 static int
1582 encode_coding_utf_16 (coding)
1583 struct coding_system *coding;
1584 {
1585 int multibytep = coding->dst_multibyte;
1586 int *charbuf = coding->charbuf;
1587 int *charbuf_end = charbuf + coding->charbuf_used;
1588 unsigned char *dst = coding->destination + coding->produced;
1589 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1590 int safe_room = 8;
1591 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1592 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1593 int produced_chars = 0;
1594 Lisp_Object attrs, charset_list;
1595 int c;
1596
1597 CODING_GET_INFO (coding, attrs, charset_list);
1598
1599 if (bom != utf_16_without_bom)
1600 {
1601 ASSURE_DESTINATION (safe_room);
1602 if (big_endian)
1603 EMIT_TWO_BYTES (0xFE, 0xFF);
1604 else
1605 EMIT_TWO_BYTES (0xFF, 0xFE);
1606 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1607 }
1608
1609 while (charbuf < charbuf_end)
1610 {
1611 ASSURE_DESTINATION (safe_room);
1612 c = *charbuf++;
1613 if (c >= MAX_UNICODE_CHAR)
1614 c = coding->default_char;
1615
1616 if (c < 0x10000)
1617 {
1618 if (big_endian)
1619 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1620 else
1621 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1622 }
1623 else
1624 {
1625 int c1, c2;
1626
1627 c -= 0x10000;
1628 c1 = (c >> 10) + 0xD800;
1629 c2 = (c & 0x3FF) + 0xDC00;
1630 if (big_endian)
1631 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1632 else
1633 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1634 }
1635 }
1636 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1637 coding->produced = dst - coding->destination;
1638 coding->produced_char += produced_chars;
1639 return 0;
1640 }
1641
1642 \f
1643 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1644
1645 /* Emacs' internal format for representation of multiple character
1646 sets is a kind of multi-byte encoding, i.e. characters are
1647 represented by variable-length sequences of one-byte codes.
1648
1649 ASCII characters and control characters (e.g. `tab', `newline') are
1650 represented by one-byte sequences which are their ASCII codes, in
1651 the range 0x00 through 0x7F.
1652
1653 8-bit characters of the range 0x80..0x9F are represented by
1654 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1655 code + 0x20).
1656
1657 8-bit characters of the range 0xA0..0xFF are represented by
1658 one-byte sequences which are their 8-bit code.
1659
1660 The other characters are represented by a sequence of `base
1661 leading-code', optional `extended leading-code', and one or two
1662 `position-code's. The length of the sequence is determined by the
1663 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1664 whereas extended leading-code and position-code take the range 0xA0
1665 through 0xFF. See `charset.h' for more details about leading-code
1666 and position-code.
1667
1668 --- CODE RANGE of Emacs' internal format ---
1669 character set range
1670 ------------- -----
1671 ascii 0x00..0x7F
1672 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1673 eight-bit-graphic 0xA0..0xBF
1674 ELSE 0x81..0x9D + [0xA0..0xFF]+
1675 ---------------------------------------------
1676
1677 As this is the internal character representation, the format is
1678 usually not used externally (i.e. in a file or in a data sent to a
1679 process). But, it is possible to have a text externally in this
1680 format (i.e. by encoding by the coding system `emacs-mule').
1681
1682 In that case, a sequence of one-byte codes has a slightly different
1683 form.
1684
1685 At first, all characters in eight-bit-control are represented by
1686 one-byte sequences which are their 8-bit code.
1687
1688 Next, character composition data are represented by the byte
1689 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1690 where,
1691 METHOD is 0xF0 plus one of composition method (enum
1692 composition_method),
1693
1694 BYTES is 0xA0 plus a byte length of this composition data,
1695
1696 CHARS is 0x20 plus a number of characters composed by this
1697 data,
1698
1699 COMPONENTs are characters of multibye form or composition
1700 rules encoded by two-byte of ASCII codes.
1701
1702 In addition, for backward compatibility, the following formats are
1703 also recognized as composition data on decoding.
1704
1705 0x80 MSEQ ...
1706 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1707
1708 Here,
1709 MSEQ is a multibyte form but in these special format:
1710 ASCII: 0xA0 ASCII_CODE+0x80,
1711 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1712 RULE is a one byte code of the range 0xA0..0xF0 that
1713 represents a composition rule.
1714 */
1715
1716 char emacs_mule_bytes[256];
1717
1718 int
1719 emacs_mule_char (coding, src, nbytes, nchars, id)
1720 struct coding_system *coding;
1721 const unsigned char *src;
1722 int *nbytes, *nchars, *id;
1723 {
1724 const unsigned char *src_end = coding->source + coding->src_bytes;
1725 const unsigned char *src_base = src;
1726 int multibytep = coding->src_multibyte;
1727 struct charset *charset;
1728 unsigned code;
1729 int c;
1730 int consumed_chars = 0;
1731
1732 ONE_MORE_BYTE (c);
1733 if (c < 0)
1734 {
1735 c = -c;
1736 charset = emacs_mule_charset[0];
1737 }
1738 else
1739 {
1740 if (c >= 0xA0)
1741 {
1742 /* Old style component character of a composition. */
1743 if (c == 0xA0)
1744 {
1745 ONE_MORE_BYTE (c);
1746 c -= 0x80;
1747 }
1748 else
1749 c -= 0x20;
1750 }
1751
1752 switch (emacs_mule_bytes[c])
1753 {
1754 case 2:
1755 if (! (charset = emacs_mule_charset[c]))
1756 goto invalid_code;
1757 ONE_MORE_BYTE (c);
1758 if (c < 0xA0)
1759 goto invalid_code;
1760 code = c & 0x7F;
1761 break;
1762
1763 case 3:
1764 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1765 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1766 {
1767 ONE_MORE_BYTE (c);
1768 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1769 goto invalid_code;
1770 ONE_MORE_BYTE (c);
1771 if (c < 0xA0)
1772 goto invalid_code;
1773 code = c & 0x7F;
1774 }
1775 else
1776 {
1777 if (! (charset = emacs_mule_charset[c]))
1778 goto invalid_code;
1779 ONE_MORE_BYTE (c);
1780 if (c < 0xA0)
1781 goto invalid_code;
1782 code = (c & 0x7F) << 8;
1783 ONE_MORE_BYTE (c);
1784 if (c < 0xA0)
1785 goto invalid_code;
1786 code |= c & 0x7F;
1787 }
1788 break;
1789
1790 case 4:
1791 ONE_MORE_BYTE (c);
1792 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1793 goto invalid_code;
1794 ONE_MORE_BYTE (c);
1795 if (c < 0xA0)
1796 goto invalid_code;
1797 code = (c & 0x7F) << 8;
1798 ONE_MORE_BYTE (c);
1799 if (c < 0xA0)
1800 goto invalid_code;
1801 code |= c & 0x7F;
1802 break;
1803
1804 case 1:
1805 code = c;
1806 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1807 ? charset_ascii : charset_eight_bit);
1808 break;
1809
1810 default:
1811 abort ();
1812 }
1813 c = DECODE_CHAR (charset, code);
1814 if (c < 0)
1815 goto invalid_code;
1816 }
1817 *nbytes = src - src_base;
1818 *nchars = consumed_chars;
1819 if (id)
1820 *id = charset->id;
1821 return c;
1822
1823 no_more_source:
1824 return -2;
1825
1826 invalid_code:
1827 return -1;
1828 }
1829
1830
1831 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1832 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1833 else return 0. */
1834
1835 static int
1836 detect_coding_emacs_mule (coding, detect_info)
1837 struct coding_system *coding;
1838 struct coding_detection_info *detect_info;
1839 {
1840 const unsigned char *src = coding->source, *src_base;
1841 const unsigned char *src_end = coding->source + coding->src_bytes;
1842 int multibytep = coding->src_multibyte;
1843 int consumed_chars = 0;
1844 int c;
1845 int found = 0;
1846
1847 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1848 /* A coding system of this category is always ASCII compatible. */
1849 src += coding->head_ascii;
1850
1851 while (1)
1852 {
1853 src_base = src;
1854 ONE_MORE_BYTE (c);
1855 if (c < 0)
1856 continue;
1857 if (c == 0x80)
1858 {
1859 /* Perhaps the start of composite character. We simple skip
1860 it because analyzing it is too heavy for detecting. But,
1861 at least, we check that the composite character
1862 constitues of more than 4 bytes. */
1863 const unsigned char *src_base;
1864
1865 repeat:
1866 src_base = src;
1867 do
1868 {
1869 ONE_MORE_BYTE (c);
1870 }
1871 while (c >= 0xA0);
1872
1873 if (src - src_base <= 4)
1874 break;
1875 found = CATEGORY_MASK_EMACS_MULE;
1876 if (c == 0x80)
1877 goto repeat;
1878 }
1879
1880 if (c < 0x80)
1881 {
1882 if (c < 0x20
1883 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1884 break;
1885 }
1886 else
1887 {
1888 int more_bytes = emacs_mule_bytes[*src_base] - 1;
1889
1890 while (more_bytes > 0)
1891 {
1892 ONE_MORE_BYTE (c);
1893 if (c < 0xA0)
1894 {
1895 src--; /* Unread the last byte. */
1896 break;
1897 }
1898 more_bytes--;
1899 }
1900 if (more_bytes != 0)
1901 break;
1902 found = CATEGORY_MASK_EMACS_MULE;
1903 }
1904 }
1905 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1906 return 0;
1907
1908 no_more_source:
1909 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1910 {
1911 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1912 return 0;
1913 }
1914 detect_info->found |= found;
1915 return 1;
1916 }
1917
1918
1919 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1920
1921 /* Decode a character represented as a component of composition
1922 sequence of Emacs 20/21 style at SRC. Set C to that character and
1923 update SRC to the head of next character (or an encoded composition
1924 rule). If SRC doesn't points a composition component, set C to -1.
1925 If SRC points an invalid byte sequence, global exit by a return
1926 value 0. */
1927
1928 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1929 do \
1930 { \
1931 int c; \
1932 int nbytes, nchars; \
1933 \
1934 if (src == src_end) \
1935 break; \
1936 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1937 if (c < 0) \
1938 { \
1939 if (c == -2) \
1940 break; \
1941 goto invalid_code; \
1942 } \
1943 *buf++ = c; \
1944 src += nbytes; \
1945 consumed_chars += nchars; \
1946 } \
1947 while (0)
1948
1949
1950 /* Decode a composition rule represented as a component of composition
1951 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1952 and increment BUF. If SRC points an invalid byte sequence, set C
1953 to -1. */
1954
1955 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1956 do { \
1957 int c, gref, nref; \
1958 \
1959 if (src >= src_end) \
1960 goto invalid_code; \
1961 ONE_MORE_BYTE_NO_CHECK (c); \
1962 c -= 0xA0; \
1963 if (c < 0 || c >= 81) \
1964 goto invalid_code; \
1965 \
1966 gref = c / 9, nref = c % 9; \
1967 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1968 } while (0)
1969
1970
1971 /* Decode a composition rule represented as a component of composition
1972 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1973 and increment BUF. If SRC points an invalid byte sequence, set C
1974 to -1. */
1975
1976 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1977 do { \
1978 int gref, nref; \
1979 \
1980 if (src + 1>= src_end) \
1981 goto invalid_code; \
1982 ONE_MORE_BYTE_NO_CHECK (gref); \
1983 gref -= 0x20; \
1984 ONE_MORE_BYTE_NO_CHECK (nref); \
1985 nref -= 0x20; \
1986 if (gref < 0 || gref >= 81 \
1987 || nref < 0 || nref >= 81) \
1988 goto invalid_code; \
1989 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1990 } while (0)
1991
1992
1993 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1994 do { \
1995 /* Emacs 21 style format. The first three bytes at SRC are \
1996 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1997 the byte length of this composition information, CHARS is the \
1998 number of characters composed by this composition. */ \
1999 enum composition_method method = c - 0xF2; \
2000 int *charbuf_base = charbuf; \
2001 int consumed_chars_limit; \
2002 int nbytes, nchars; \
2003 \
2004 ONE_MORE_BYTE (c); \
2005 if (c < 0) \
2006 goto invalid_code; \
2007 nbytes = c - 0xA0; \
2008 if (nbytes < 3) \
2009 goto invalid_code; \
2010 ONE_MORE_BYTE (c); \
2011 if (c < 0) \
2012 goto invalid_code; \
2013 nchars = c - 0xA0; \
2014 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2015 consumed_chars_limit = consumed_chars_base + nbytes; \
2016 if (method != COMPOSITION_RELATIVE) \
2017 { \
2018 int i = 0; \
2019 while (consumed_chars < consumed_chars_limit) \
2020 { \
2021 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
2022 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
2023 else \
2024 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
2025 i++; \
2026 } \
2027 if (consumed_chars < consumed_chars_limit) \
2028 goto invalid_code; \
2029 charbuf_base[0] -= i; \
2030 } \
2031 } while (0)
2032
2033
2034 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
2035 do { \
2036 /* Emacs 20 style format for relative composition. */ \
2037 /* Store multibyte form of characters to be composed. */ \
2038 enum composition_method method = COMPOSITION_RELATIVE; \
2039 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2040 int *buf = components; \
2041 int i, j; \
2042 \
2043 src = src_base; \
2044 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2045 for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
2046 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2047 if (i < 2) \
2048 goto invalid_code; \
2049 ADD_COMPOSITION_DATA (charbuf, i, method); \
2050 for (j = 0; j < i; j++) \
2051 *charbuf++ = components[j]; \
2052 } while (0)
2053
2054
2055 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2056 do { \
2057 /* Emacs 20 style format for rule-base composition. */ \
2058 /* Store multibyte form of characters to be composed. */ \
2059 enum composition_method method = COMPOSITION_WITH_RULE; \
2060 int *charbuf_base = charbuf; \
2061 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2062 int *buf = components; \
2063 int i, j; \
2064 \
2065 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2066 for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
2067 { \
2068 if (*src < 0xA0) \
2069 break; \
2070 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
2071 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2072 } \
2073 if (i <= 1 || (buf - components) % 2 == 0) \
2074 goto invalid_code; \
2075 if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
2076 goto no_more_source; \
2077 ADD_COMPOSITION_DATA (charbuf, i, method); \
2078 i = i * 2 - 1; \
2079 for (j = 0; j < i; j++) \
2080 *charbuf++ = components[j]; \
2081 charbuf_base[0] -= i; \
2082 for (j = 0; j < i; j += 2) \
2083 *charbuf++ = components[j]; \
2084 } while (0)
2085
2086
2087 static void
2088 decode_coding_emacs_mule (coding)
2089 struct coding_system *coding;
2090 {
2091 const unsigned char *src = coding->source + coding->consumed;
2092 const unsigned char *src_end = coding->source + coding->src_bytes;
2093 const unsigned char *src_base;
2094 int *charbuf = coding->charbuf + coding->charbuf_used;
2095 int *charbuf_end
2096 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2097 int consumed_chars = 0, consumed_chars_base;
2098 int multibytep = coding->src_multibyte;
2099 Lisp_Object attrs, charset_list;
2100 int char_offset = coding->produced_char;
2101 int last_offset = char_offset;
2102 int last_id = charset_ascii;
2103 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2104 int byte_after_cr = -1;
2105
2106 CODING_GET_INFO (coding, attrs, charset_list);
2107
2108 while (1)
2109 {
2110 int c;
2111
2112 src_base = src;
2113 consumed_chars_base = consumed_chars;
2114
2115 if (charbuf >= charbuf_end)
2116 break;
2117
2118 if (byte_after_cr >= 0)
2119 c = byte_after_cr, byte_after_cr = -1;
2120 else
2121 ONE_MORE_BYTE (c);
2122 if (c < 0)
2123 {
2124 *charbuf++ = -c;
2125 char_offset++;
2126 }
2127 else if (c < 0x80)
2128 {
2129 if (eol_crlf && c == '\r')
2130 ONE_MORE_BYTE (byte_after_cr);
2131 *charbuf++ = c;
2132 char_offset++;
2133 }
2134 else if (c == 0x80)
2135 {
2136 ONE_MORE_BYTE (c);
2137 if (c < 0)
2138 goto invalid_code;
2139 if (c - 0xF2 >= COMPOSITION_RELATIVE
2140 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2141 DECODE_EMACS_MULE_21_COMPOSITION (c);
2142 else if (c < 0xC0)
2143 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2144 else if (c == 0xFF)
2145 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2146 else
2147 goto invalid_code;
2148 }
2149 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2150 {
2151 int nbytes, nchars;
2152 int id;
2153
2154 src = src_base;
2155 consumed_chars = consumed_chars_base;
2156 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2157 if (c < 0)
2158 {
2159 if (c == -2)
2160 break;
2161 goto invalid_code;
2162 }
2163 if (last_id != id)
2164 {
2165 if (last_id != charset_ascii)
2166 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2167 last_id = id;
2168 last_offset = char_offset;
2169 }
2170 *charbuf++ = c;
2171 src += nbytes;
2172 consumed_chars += nchars;
2173 char_offset++;
2174 }
2175 else
2176 goto invalid_code;
2177 continue;
2178
2179 invalid_code:
2180 src = src_base;
2181 consumed_chars = consumed_chars_base;
2182 ONE_MORE_BYTE (c);
2183 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2184 char_offset++;
2185 coding->errors++;
2186 }
2187
2188 no_more_source:
2189 if (last_id != charset_ascii)
2190 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2191 coding->consumed_char += consumed_chars_base;
2192 coding->consumed = src_base - coding->source;
2193 coding->charbuf_used = charbuf - coding->charbuf;
2194 }
2195
2196
2197 #define EMACS_MULE_LEADING_CODES(id, codes) \
2198 do { \
2199 if (id < 0xA0) \
2200 codes[0] = id, codes[1] = 0; \
2201 else if (id < 0xE0) \
2202 codes[0] = 0x9A, codes[1] = id; \
2203 else if (id < 0xF0) \
2204 codes[0] = 0x9B, codes[1] = id; \
2205 else if (id < 0xF5) \
2206 codes[0] = 0x9C, codes[1] = id; \
2207 else \
2208 codes[0] = 0x9D, codes[1] = id; \
2209 } while (0);
2210
2211
2212 static int
2213 encode_coding_emacs_mule (coding)
2214 struct coding_system *coding;
2215 {
2216 int multibytep = coding->dst_multibyte;
2217 int *charbuf = coding->charbuf;
2218 int *charbuf_end = charbuf + coding->charbuf_used;
2219 unsigned char *dst = coding->destination + coding->produced;
2220 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2221 int safe_room = 8;
2222 int produced_chars = 0;
2223 Lisp_Object attrs, charset_list;
2224 int c;
2225 int preferred_charset_id = -1;
2226
2227 CODING_GET_INFO (coding, attrs, charset_list);
2228 if (! EQ (charset_list, Vemacs_mule_charset_list))
2229 {
2230 CODING_ATTR_CHARSET_LIST (attrs)
2231 = charset_list = Vemacs_mule_charset_list;
2232 }
2233
2234 while (charbuf < charbuf_end)
2235 {
2236 ASSURE_DESTINATION (safe_room);
2237 c = *charbuf++;
2238
2239 if (c < 0)
2240 {
2241 /* Handle an annotation. */
2242 switch (*charbuf)
2243 {
2244 case CODING_ANNOTATE_COMPOSITION_MASK:
2245 /* Not yet implemented. */
2246 break;
2247 case CODING_ANNOTATE_CHARSET_MASK:
2248 preferred_charset_id = charbuf[3];
2249 if (preferred_charset_id >= 0
2250 && NILP (Fmemq (make_number (preferred_charset_id),
2251 charset_list)))
2252 preferred_charset_id = -1;
2253 break;
2254 default:
2255 abort ();
2256 }
2257 charbuf += -c - 1;
2258 continue;
2259 }
2260
2261 if (ASCII_CHAR_P (c))
2262 EMIT_ONE_ASCII_BYTE (c);
2263 else if (CHAR_BYTE8_P (c))
2264 {
2265 c = CHAR_TO_BYTE8 (c);
2266 EMIT_ONE_BYTE (c);
2267 }
2268 else
2269 {
2270 struct charset *charset;
2271 unsigned code;
2272 int dimension;
2273 int emacs_mule_id;
2274 unsigned char leading_codes[2];
2275
2276 if (preferred_charset_id >= 0)
2277 {
2278 charset = CHARSET_FROM_ID (preferred_charset_id);
2279 if (! CHAR_CHARSET_P (c, charset))
2280 charset = char_charset (c, charset_list, NULL);
2281 }
2282 else
2283 charset = char_charset (c, charset_list, &code);
2284 if (! charset)
2285 {
2286 c = coding->default_char;
2287 if (ASCII_CHAR_P (c))
2288 {
2289 EMIT_ONE_ASCII_BYTE (c);
2290 continue;
2291 }
2292 charset = char_charset (c, charset_list, &code);
2293 }
2294 dimension = CHARSET_DIMENSION (charset);
2295 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2296 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2297 EMIT_ONE_BYTE (leading_codes[0]);
2298 if (leading_codes[1])
2299 EMIT_ONE_BYTE (leading_codes[1]);
2300 if (dimension == 1)
2301 EMIT_ONE_BYTE (code | 0x80);
2302 else
2303 {
2304 code |= 0x8080;
2305 EMIT_ONE_BYTE (code >> 8);
2306 EMIT_ONE_BYTE (code & 0xFF);
2307 }
2308 }
2309 }
2310 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2311 coding->produced_char += produced_chars;
2312 coding->produced = dst - coding->destination;
2313 return 0;
2314 }
2315
2316 \f
2317 /*** 7. ISO2022 handlers ***/
2318
2319 /* The following note describes the coding system ISO2022 briefly.
2320 Since the intention of this note is to help understand the
2321 functions in this file, some parts are NOT ACCURATE or are OVERLY
2322 SIMPLIFIED. For thorough understanding, please refer to the
2323 original document of ISO2022. This is equivalent to the standard
2324 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2325
2326 ISO2022 provides many mechanisms to encode several character sets
2327 in 7-bit and 8-bit environments. For 7-bit environments, all text
2328 is encoded using bytes less than 128. This may make the encoded
2329 text a little bit longer, but the text passes more easily through
2330 several types of gateway, some of which strip off the MSB (Most
2331 Significant Bit).
2332
2333 There are two kinds of character sets: control character sets and
2334 graphic character sets. The former contain control characters such
2335 as `newline' and `escape' to provide control functions (control
2336 functions are also provided by escape sequences). The latter
2337 contain graphic characters such as 'A' and '-'. Emacs recognizes
2338 two control character sets and many graphic character sets.
2339
2340 Graphic character sets are classified into one of the following
2341 four classes, according to the number of bytes (DIMENSION) and
2342 number of characters in one dimension (CHARS) of the set:
2343 - DIMENSION1_CHARS94
2344 - DIMENSION1_CHARS96
2345 - DIMENSION2_CHARS94
2346 - DIMENSION2_CHARS96
2347
2348 In addition, each character set is assigned an identification tag,
2349 unique for each set, called the "final character" (denoted as <F>
2350 hereafter). The <F> of each character set is decided by ECMA(*)
2351 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2352 (0x30..0x3F are for private use only).
2353
2354 Note (*): ECMA = European Computer Manufacturers Association
2355
2356 Here are examples of graphic character sets [NAME(<F>)]:
2357 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2358 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2359 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2360 o DIMENSION2_CHARS96 -- none for the moment
2361
2362 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2363 C0 [0x00..0x1F] -- control character plane 0
2364 GL [0x20..0x7F] -- graphic character plane 0
2365 C1 [0x80..0x9F] -- control character plane 1
2366 GR [0xA0..0xFF] -- graphic character plane 1
2367
2368 A control character set is directly designated and invoked to C0 or
2369 C1 by an escape sequence. The most common case is that:
2370 - ISO646's control character set is designated/invoked to C0, and
2371 - ISO6429's control character set is designated/invoked to C1,
2372 and usually these designations/invocations are omitted in encoded
2373 text. In a 7-bit environment, only C0 can be used, and a control
2374 character for C1 is encoded by an appropriate escape sequence to
2375 fit into the environment. All control characters for C1 are
2376 defined to have corresponding escape sequences.
2377
2378 A graphic character set is at first designated to one of four
2379 graphic registers (G0 through G3), then these graphic registers are
2380 invoked to GL or GR. These designations and invocations can be
2381 done independently. The most common case is that G0 is invoked to
2382 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2383 these invocations and designations are omitted in encoded text.
2384 In a 7-bit environment, only GL can be used.
2385
2386 When a graphic character set of CHARS94 is invoked to GL, codes
2387 0x20 and 0x7F of the GL area work as control characters SPACE and
2388 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2389 be used.
2390
2391 There are two ways of invocation: locking-shift and single-shift.
2392 With locking-shift, the invocation lasts until the next different
2393 invocation, whereas with single-shift, the invocation affects the
2394 following character only and doesn't affect the locking-shift
2395 state. Invocations are done by the following control characters or
2396 escape sequences:
2397
2398 ----------------------------------------------------------------------
2399 abbrev function cntrl escape seq description
2400 ----------------------------------------------------------------------
2401 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2402 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2403 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2404 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2405 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2406 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2407 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2408 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2409 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2410 ----------------------------------------------------------------------
2411 (*) These are not used by any known coding system.
2412
2413 Control characters for these functions are defined by macros
2414 ISO_CODE_XXX in `coding.h'.
2415
2416 Designations are done by the following escape sequences:
2417 ----------------------------------------------------------------------
2418 escape sequence description
2419 ----------------------------------------------------------------------
2420 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2421 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2422 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2423 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2424 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2425 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2426 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2427 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2428 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2429 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2430 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2431 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2432 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2433 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2434 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2435 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2436 ----------------------------------------------------------------------
2437
2438 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2439 of dimension 1, chars 94, and final character <F>, etc...
2440
2441 Note (*): Although these designations are not allowed in ISO2022,
2442 Emacs accepts them on decoding, and produces them on encoding
2443 CHARS96 character sets in a coding system which is characterized as
2444 7-bit environment, non-locking-shift, and non-single-shift.
2445
2446 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2447 '(' must be omitted. We refer to this as "short-form" hereafter.
2448
2449 Now you may notice that there are a lot of ways of encoding the
2450 same multilingual text in ISO2022. Actually, there exist many
2451 coding systems such as Compound Text (used in X11's inter client
2452 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2453 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2454 localized platforms), and all of these are variants of ISO2022.
2455
2456 In addition to the above, Emacs handles two more kinds of escape
2457 sequences: ISO6429's direction specification and Emacs' private
2458 sequence for specifying character composition.
2459
2460 ISO6429's direction specification takes the following form:
2461 o CSI ']' -- end of the current direction
2462 o CSI '0' ']' -- end of the current direction
2463 o CSI '1' ']' -- start of left-to-right text
2464 o CSI '2' ']' -- start of right-to-left text
2465 The control character CSI (0x9B: control sequence introducer) is
2466 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2467
2468 Character composition specification takes the following form:
2469 o ESC '0' -- start relative composition
2470 o ESC '1' -- end composition
2471 o ESC '2' -- start rule-base composition (*)
2472 o ESC '3' -- start relative composition with alternate chars (**)
2473 o ESC '4' -- start rule-base composition with alternate chars (**)
2474 Since these are not standard escape sequences of any ISO standard,
2475 the use of them with these meanings is restricted to Emacs only.
2476
2477 (*) This form is used only in Emacs 20.7 and older versions,
2478 but newer versions can safely decode it.
2479 (**) This form is used only in Emacs 21.1 and newer versions,
2480 and older versions can't decode it.
2481
2482 Here's a list of example usages of these composition escape
2483 sequences (categorized by `enum composition_method').
2484
2485 COMPOSITION_RELATIVE:
2486 ESC 0 CHAR [ CHAR ] ESC 1
2487 COMPOSITION_WITH_RULE:
2488 ESC 2 CHAR [ RULE CHAR ] ESC 1
2489 COMPOSITION_WITH_ALTCHARS:
2490 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2491 COMPOSITION_WITH_RULE_ALTCHARS:
2492 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2493
2494 enum iso_code_class_type iso_code_class[256];
2495
2496 #define SAFE_CHARSET_P(coding, id) \
2497 ((id) <= (coding)->max_charset_id \
2498 && (coding)->safe_charsets[id] >= 0)
2499
2500
2501 #define SHIFT_OUT_OK(category) \
2502 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2503
2504 static void
2505 setup_iso_safe_charsets (attrs)
2506 Lisp_Object attrs;
2507 {
2508 Lisp_Object charset_list, safe_charsets;
2509 Lisp_Object request;
2510 Lisp_Object reg_usage;
2511 Lisp_Object tail;
2512 int reg94, reg96;
2513 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2514 int max_charset_id;
2515
2516 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2517 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2518 && ! EQ (charset_list, Viso_2022_charset_list))
2519 {
2520 CODING_ATTR_CHARSET_LIST (attrs)
2521 = charset_list = Viso_2022_charset_list;
2522 ASET (attrs, coding_attr_safe_charsets, Qnil);
2523 }
2524
2525 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2526 return;
2527
2528 max_charset_id = 0;
2529 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2530 {
2531 int id = XINT (XCAR (tail));
2532 if (max_charset_id < id)
2533 max_charset_id = id;
2534 }
2535
2536 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2537 make_number (255));
2538 request = AREF (attrs, coding_attr_iso_request);
2539 reg_usage = AREF (attrs, coding_attr_iso_usage);
2540 reg94 = XINT (XCAR (reg_usage));
2541 reg96 = XINT (XCDR (reg_usage));
2542
2543 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2544 {
2545 Lisp_Object id;
2546 Lisp_Object reg;
2547 struct charset *charset;
2548
2549 id = XCAR (tail);
2550 charset = CHARSET_FROM_ID (XINT (id));
2551 reg = Fcdr (Fassq (id, request));
2552 if (! NILP (reg))
2553 SSET (safe_charsets, XINT (id), XINT (reg));
2554 else if (charset->iso_chars_96)
2555 {
2556 if (reg96 < 4)
2557 SSET (safe_charsets, XINT (id), reg96);
2558 }
2559 else
2560 {
2561 if (reg94 < 4)
2562 SSET (safe_charsets, XINT (id), reg94);
2563 }
2564 }
2565 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2566 }
2567
2568
2569 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2570 Check if a text is encoded in one of ISO-2022 based codig systems.
2571 If it is, return 1, else return 0. */
2572
2573 static int
2574 detect_coding_iso_2022 (coding, detect_info)
2575 struct coding_system *coding;
2576 struct coding_detection_info *detect_info;
2577 {
2578 const unsigned char *src = coding->source, *src_base = src;
2579 const unsigned char *src_end = coding->source + coding->src_bytes;
2580 int multibytep = coding->src_multibyte;
2581 int single_shifting = 0;
2582 int id;
2583 int c, c1;
2584 int consumed_chars = 0;
2585 int i;
2586 int rejected = 0;
2587 int found = 0;
2588
2589 detect_info->checked |= CATEGORY_MASK_ISO;
2590
2591 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2592 {
2593 struct coding_system *this = &(coding_categories[i]);
2594 Lisp_Object attrs, val;
2595
2596 attrs = CODING_ID_ATTRS (this->id);
2597 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2598 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2599 setup_iso_safe_charsets (attrs);
2600 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2601 this->max_charset_id = SCHARS (val) - 1;
2602 this->safe_charsets = (char *) SDATA (val);
2603 }
2604
2605 /* A coding system of this category is always ASCII compatible. */
2606 src += coding->head_ascii;
2607
2608 while (rejected != CATEGORY_MASK_ISO)
2609 {
2610 src_base = src;
2611 ONE_MORE_BYTE (c);
2612 switch (c)
2613 {
2614 case ISO_CODE_ESC:
2615 if (inhibit_iso_escape_detection)
2616 break;
2617 single_shifting = 0;
2618 ONE_MORE_BYTE (c);
2619 if (c >= '(' && c <= '/')
2620 {
2621 /* Designation sequence for a charset of dimension 1. */
2622 ONE_MORE_BYTE (c1);
2623 if (c1 < ' ' || c1 >= 0x80
2624 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2625 /* Invalid designation sequence. Just ignore. */
2626 break;
2627 }
2628 else if (c == '$')
2629 {
2630 /* Designation sequence for a charset of dimension 2. */
2631 ONE_MORE_BYTE (c);
2632 if (c >= '@' && c <= 'B')
2633 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2634 id = iso_charset_table[1][0][c];
2635 else if (c >= '(' && c <= '/')
2636 {
2637 ONE_MORE_BYTE (c1);
2638 if (c1 < ' ' || c1 >= 0x80
2639 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2640 /* Invalid designation sequence. Just ignore. */
2641 break;
2642 }
2643 else
2644 /* Invalid designation sequence. Just ignore it. */
2645 break;
2646 }
2647 else if (c == 'N' || c == 'O')
2648 {
2649 /* ESC <Fe> for SS2 or SS3. */
2650 single_shifting = 1;
2651 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2652 break;
2653 }
2654 else if (c >= '0' && c <= '4')
2655 {
2656 /* ESC <Fp> for start/end composition. */
2657 found |= CATEGORY_MASK_ISO;
2658 break;
2659 }
2660 else
2661 {
2662 /* Invalid escape sequence. Just ignore it. */
2663 break;
2664 }
2665
2666 /* We found a valid designation sequence for CHARSET. */
2667 rejected |= CATEGORY_MASK_ISO_8BIT;
2668 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2669 id))
2670 found |= CATEGORY_MASK_ISO_7;
2671 else
2672 rejected |= CATEGORY_MASK_ISO_7;
2673 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2674 id))
2675 found |= CATEGORY_MASK_ISO_7_TIGHT;
2676 else
2677 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2678 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2679 id))
2680 found |= CATEGORY_MASK_ISO_7_ELSE;
2681 else
2682 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2683 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2684 id))
2685 found |= CATEGORY_MASK_ISO_8_ELSE;
2686 else
2687 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2688 break;
2689
2690 case ISO_CODE_SO:
2691 case ISO_CODE_SI:
2692 /* Locking shift out/in. */
2693 if (inhibit_iso_escape_detection)
2694 break;
2695 single_shifting = 0;
2696 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2697 break;
2698
2699 case ISO_CODE_CSI:
2700 /* Control sequence introducer. */
2701 single_shifting = 0;
2702 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2703 found |= CATEGORY_MASK_ISO_8_ELSE;
2704 goto check_extra_latin;
2705
2706 case ISO_CODE_SS2:
2707 case ISO_CODE_SS3:
2708 /* Single shift. */
2709 if (inhibit_iso_escape_detection)
2710 break;
2711 single_shifting = 0;
2712 rejected |= CATEGORY_MASK_ISO_7BIT;
2713 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2714 & CODING_ISO_FLAG_SINGLE_SHIFT)
2715 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2716 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2717 & CODING_ISO_FLAG_SINGLE_SHIFT)
2718 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2719 if (single_shifting)
2720 break;
2721 goto check_extra_latin;
2722
2723 default:
2724 if (c < 0)
2725 continue;
2726 if (c < 0x80)
2727 {
2728 single_shifting = 0;
2729 break;
2730 }
2731 if (c >= 0xA0)
2732 {
2733 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2734 found |= CATEGORY_MASK_ISO_8_1;
2735 /* Check the length of succeeding codes of the range
2736 0xA0..0FF. If the byte length is even, we include
2737 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2738 only when we are not single shifting. */
2739 if (! single_shifting
2740 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2741 {
2742 int i = 1;
2743 while (src < src_end)
2744 {
2745 ONE_MORE_BYTE (c);
2746 if (c < 0xA0)
2747 break;
2748 i++;
2749 }
2750
2751 if (i & 1 && src < src_end)
2752 rejected |= CATEGORY_MASK_ISO_8_2;
2753 else
2754 found |= CATEGORY_MASK_ISO_8_2;
2755 }
2756 break;
2757 }
2758 check_extra_latin:
2759 single_shifting = 0;
2760 if (! VECTORP (Vlatin_extra_code_table)
2761 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2762 {
2763 rejected = CATEGORY_MASK_ISO;
2764 break;
2765 }
2766 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2767 & CODING_ISO_FLAG_LATIN_EXTRA)
2768 found |= CATEGORY_MASK_ISO_8_1;
2769 else
2770 rejected |= CATEGORY_MASK_ISO_8_1;
2771 rejected |= CATEGORY_MASK_ISO_8_2;
2772 }
2773 }
2774 detect_info->rejected |= CATEGORY_MASK_ISO;
2775 return 0;
2776
2777 no_more_source:
2778 detect_info->rejected |= rejected;
2779 detect_info->found |= (found & ~rejected);
2780 return 1;
2781 }
2782
2783
2784 /* Set designation state into CODING. Set CHARS_96 to -1 if the
2785 escape sequence should be kept. */
2786 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2787 do { \
2788 int id, prev; \
2789 \
2790 if (final < '0' || final >= 128 \
2791 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2792 || !SAFE_CHARSET_P (coding, id)) \
2793 { \
2794 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2795 chars_96 = -1; \
2796 break; \
2797 } \
2798 prev = CODING_ISO_DESIGNATION (coding, reg); \
2799 if (id == charset_jisx0201_roman) \
2800 { \
2801 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2802 id = charset_ascii; \
2803 } \
2804 else if (id == charset_jisx0208_1978) \
2805 { \
2806 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2807 id = charset_jisx0208; \
2808 } \
2809 CODING_ISO_DESIGNATION (coding, reg) = id; \
2810 /* If there was an invalid designation to REG previously, and this \
2811 designation is ASCII to REG, we should keep this designation \
2812 sequence. */ \
2813 if (prev == -2 && id == charset_ascii) \
2814 chars_96 = -1; \
2815 } while (0)
2816
2817
2818 #define MAYBE_FINISH_COMPOSITION() \
2819 do { \
2820 int i; \
2821 if (composition_state == COMPOSING_NO) \
2822 break; \
2823 /* It is assured that we have enough room for producing \
2824 characters stored in the table `components'. */ \
2825 if (charbuf + component_idx > charbuf_end) \
2826 goto no_more_source; \
2827 composition_state = COMPOSING_NO; \
2828 if (method == COMPOSITION_RELATIVE \
2829 || method == COMPOSITION_WITH_ALTCHARS) \
2830 { \
2831 for (i = 0; i < component_idx; i++) \
2832 *charbuf++ = components[i]; \
2833 char_offset += component_idx; \
2834 } \
2835 else \
2836 { \
2837 for (i = 0; i < component_idx; i += 2) \
2838 *charbuf++ = components[i]; \
2839 char_offset += (component_idx / 2) + 1; \
2840 } \
2841 } while (0)
2842
2843
2844 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2845 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2846 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2847 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2848 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2849 */
2850
2851 #define DECODE_COMPOSITION_START(c1) \
2852 do { \
2853 if (c1 == '0' \
2854 && composition_state == COMPOSING_COMPONENT_RULE) \
2855 { \
2856 component_len = component_idx; \
2857 composition_state = COMPOSING_CHAR; \
2858 } \
2859 else \
2860 { \
2861 const unsigned char *p; \
2862 \
2863 MAYBE_FINISH_COMPOSITION (); \
2864 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2865 goto no_more_source; \
2866 for (p = src; p < src_end - 1; p++) \
2867 if (*p == ISO_CODE_ESC && p[1] == '1') \
2868 break; \
2869 if (p == src_end - 1) \
2870 { \
2871 /* The current composition doesn't end in the current \
2872 source. */ \
2873 record_conversion_result \
2874 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
2875 goto no_more_source; \
2876 } \
2877 \
2878 /* This is surely the start of a composition. */ \
2879 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2880 : c1 == '2' ? COMPOSITION_WITH_RULE \
2881 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2882 : COMPOSITION_WITH_RULE_ALTCHARS); \
2883 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2884 : COMPOSING_COMPONENT_CHAR); \
2885 component_idx = component_len = 0; \
2886 } \
2887 } while (0)
2888
2889
2890 /* Handle compositoin end sequence ESC 1. */
2891
2892 #define DECODE_COMPOSITION_END() \
2893 do { \
2894 int nchars = (component_len > 0 ? component_idx - component_len \
2895 : method == COMPOSITION_RELATIVE ? component_idx \
2896 : (component_idx + 1) / 2); \
2897 int i; \
2898 int *saved_charbuf = charbuf; \
2899 \
2900 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2901 if (method != COMPOSITION_RELATIVE) \
2902 { \
2903 if (component_len == 0) \
2904 for (i = 0; i < component_idx; i++) \
2905 *charbuf++ = components[i]; \
2906 else \
2907 for (i = 0; i < component_len; i++) \
2908 *charbuf++ = components[i]; \
2909 *saved_charbuf = saved_charbuf - charbuf; \
2910 } \
2911 if (method == COMPOSITION_WITH_RULE) \
2912 for (i = 0; i < component_idx; i += 2, char_offset++) \
2913 *charbuf++ = components[i]; \
2914 else \
2915 for (i = component_len; i < component_idx; i++, char_offset++) \
2916 *charbuf++ = components[i]; \
2917 coding->annotated = 1; \
2918 composition_state = COMPOSING_NO; \
2919 } while (0)
2920
2921
2922 /* Decode a composition rule from the byte C1 (and maybe one more byte
2923 from SRC) and store one encoded composition rule in
2924 coding->cmp_data. */
2925
2926 #define DECODE_COMPOSITION_RULE(c1) \
2927 do { \
2928 (c1) -= 32; \
2929 if (c1 < 81) /* old format (before ver.21) */ \
2930 { \
2931 int gref = (c1) / 9; \
2932 int nref = (c1) % 9; \
2933 if (gref == 4) gref = 10; \
2934 if (nref == 4) nref = 10; \
2935 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2936 } \
2937 else if (c1 < 93) /* new format (after ver.21) */ \
2938 { \
2939 ONE_MORE_BYTE (c2); \
2940 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2941 } \
2942 else \
2943 c1 = 0; \
2944 } while (0)
2945
2946
2947 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2948
2949 static void
2950 decode_coding_iso_2022 (coding)
2951 struct coding_system *coding;
2952 {
2953 const unsigned char *src = coding->source + coding->consumed;
2954 const unsigned char *src_end = coding->source + coding->src_bytes;
2955 const unsigned char *src_base;
2956 int *charbuf = coding->charbuf + coding->charbuf_used;
2957 int *charbuf_end
2958 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2959 int consumed_chars = 0, consumed_chars_base;
2960 int multibytep = coding->src_multibyte;
2961 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2962 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2963 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2964 int charset_id_2, charset_id_3;
2965 struct charset *charset;
2966 int c;
2967 /* For handling composition sequence. */
2968 #define COMPOSING_NO 0
2969 #define COMPOSING_CHAR 1
2970 #define COMPOSING_RULE 2
2971 #define COMPOSING_COMPONENT_CHAR 3
2972 #define COMPOSING_COMPONENT_RULE 4
2973
2974 int composition_state = COMPOSING_NO;
2975 enum composition_method method;
2976 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2977 int component_idx;
2978 int component_len;
2979 Lisp_Object attrs, charset_list;
2980 int char_offset = coding->produced_char;
2981 int last_offset = char_offset;
2982 int last_id = charset_ascii;
2983 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2984 int byte_after_cr = -1;
2985
2986 CODING_GET_INFO (coding, attrs, charset_list);
2987 setup_iso_safe_charsets (attrs);
2988 /* Charset list may have been changed. */
2989 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2990 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
2991
2992 while (1)
2993 {
2994 int c1, c2;
2995
2996 src_base = src;
2997 consumed_chars_base = consumed_chars;
2998
2999 if (charbuf >= charbuf_end)
3000 break;
3001
3002 if (byte_after_cr >= 0)
3003 c1 = byte_after_cr, byte_after_cr = -1;
3004 else
3005 ONE_MORE_BYTE (c1);
3006 if (c1 < 0)
3007 goto invalid_code;
3008
3009 /* We produce at most one character. */
3010 switch (iso_code_class [c1])
3011 {
3012 case ISO_0x20_or_0x7F:
3013 if (composition_state != COMPOSING_NO)
3014 {
3015 if (composition_state == COMPOSING_RULE
3016 || composition_state == COMPOSING_COMPONENT_RULE)
3017 {
3018 DECODE_COMPOSITION_RULE (c1);
3019 components[component_idx++] = c1;
3020 composition_state--;
3021 continue;
3022 }
3023 }
3024 if (charset_id_0 < 0
3025 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3026 /* This is SPACE or DEL. */
3027 charset = CHARSET_FROM_ID (charset_ascii);
3028 else
3029 charset = CHARSET_FROM_ID (charset_id_0);
3030 break;
3031
3032 case ISO_graphic_plane_0:
3033 if (composition_state != COMPOSING_NO)
3034 {
3035 if (composition_state == COMPOSING_RULE
3036 || composition_state == COMPOSING_COMPONENT_RULE)
3037 {
3038 DECODE_COMPOSITION_RULE (c1);
3039 components[component_idx++] = c1;
3040 composition_state--;
3041 continue;
3042 }
3043 }
3044 if (charset_id_0 < 0)
3045 charset = CHARSET_FROM_ID (charset_ascii);
3046 else
3047 charset = CHARSET_FROM_ID (charset_id_0);
3048 break;
3049
3050 case ISO_0xA0_or_0xFF:
3051 if (charset_id_1 < 0
3052 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3053 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3054 goto invalid_code;
3055 /* This is a graphic character, we fall down ... */
3056
3057 case ISO_graphic_plane_1:
3058 if (charset_id_1 < 0)
3059 goto invalid_code;
3060 charset = CHARSET_FROM_ID (charset_id_1);
3061 break;
3062
3063 case ISO_control_0:
3064 if (eol_crlf && c1 == '\r')
3065 ONE_MORE_BYTE (byte_after_cr);
3066 MAYBE_FINISH_COMPOSITION ();
3067 charset = CHARSET_FROM_ID (charset_ascii);
3068 break;
3069
3070 case ISO_control_1:
3071 MAYBE_FINISH_COMPOSITION ();
3072 goto invalid_code;
3073
3074 case ISO_shift_out:
3075 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3076 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3077 goto invalid_code;
3078 CODING_ISO_INVOCATION (coding, 0) = 1;
3079 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3080 continue;
3081
3082 case ISO_shift_in:
3083 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3084 goto invalid_code;
3085 CODING_ISO_INVOCATION (coding, 0) = 0;
3086 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3087 continue;
3088
3089 case ISO_single_shift_2_7:
3090 case ISO_single_shift_2:
3091 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3092 goto invalid_code;
3093 /* SS2 is handled as an escape sequence of ESC 'N' */
3094 c1 = 'N';
3095 goto label_escape_sequence;
3096
3097 case ISO_single_shift_3:
3098 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3099 goto invalid_code;
3100 /* SS2 is handled as an escape sequence of ESC 'O' */
3101 c1 = 'O';
3102 goto label_escape_sequence;
3103
3104 case ISO_control_sequence_introducer:
3105 /* CSI is handled as an escape sequence of ESC '[' ... */
3106 c1 = '[';
3107 goto label_escape_sequence;
3108
3109 case ISO_escape:
3110 ONE_MORE_BYTE (c1);
3111 label_escape_sequence:
3112 /* Escape sequences handled here are invocation,
3113 designation, direction specification, and character
3114 composition specification. */
3115 switch (c1)
3116 {
3117 case '&': /* revision of following character set */
3118 ONE_MORE_BYTE (c1);
3119 if (!(c1 >= '@' && c1 <= '~'))
3120 goto invalid_code;
3121 ONE_MORE_BYTE (c1);
3122 if (c1 != ISO_CODE_ESC)
3123 goto invalid_code;
3124 ONE_MORE_BYTE (c1);
3125 goto label_escape_sequence;
3126
3127 case '$': /* designation of 2-byte character set */
3128 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3129 goto invalid_code;
3130 {
3131 int reg, chars96;
3132
3133 ONE_MORE_BYTE (c1);
3134 if (c1 >= '@' && c1 <= 'B')
3135 { /* designation of JISX0208.1978, GB2312.1980,
3136 or JISX0208.1980 */
3137 reg = 0, chars96 = 0;
3138 }
3139 else if (c1 >= 0x28 && c1 <= 0x2B)
3140 { /* designation of DIMENSION2_CHARS94 character set */
3141 reg = c1 - 0x28, chars96 = 0;
3142 ONE_MORE_BYTE (c1);
3143 }
3144 else if (c1 >= 0x2C && c1 <= 0x2F)
3145 { /* designation of DIMENSION2_CHARS96 character set */
3146 reg = c1 - 0x2C, chars96 = 1;
3147 ONE_MORE_BYTE (c1);
3148 }
3149 else
3150 goto invalid_code;
3151 DECODE_DESIGNATION (reg, 2, chars96, c1);
3152 /* We must update these variables now. */
3153 if (reg == 0)
3154 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3155 else if (reg == 1)
3156 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3157 if (chars96 < 0)
3158 goto invalid_code;
3159 }
3160 continue;
3161
3162 case 'n': /* invocation of locking-shift-2 */
3163 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3164 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3165 goto invalid_code;
3166 CODING_ISO_INVOCATION (coding, 0) = 2;
3167 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3168 continue;
3169
3170 case 'o': /* invocation of locking-shift-3 */
3171 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3172 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3173 goto invalid_code;
3174 CODING_ISO_INVOCATION (coding, 0) = 3;
3175 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3176 continue;
3177
3178 case 'N': /* invocation of single-shift-2 */
3179 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3180 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3181 goto invalid_code;
3182 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3183 if (charset_id_2 < 0)
3184 charset = CHARSET_FROM_ID (charset_ascii);
3185 else
3186 charset = CHARSET_FROM_ID (charset_id_2);
3187 ONE_MORE_BYTE (c1);
3188 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3189 goto invalid_code;
3190 break;
3191
3192 case 'O': /* invocation of single-shift-3 */
3193 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3194 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3195 goto invalid_code;
3196 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3197 if (charset_id_3 < 0)
3198 charset = CHARSET_FROM_ID (charset_ascii);
3199 else
3200 charset = CHARSET_FROM_ID (charset_id_3);
3201 ONE_MORE_BYTE (c1);
3202 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3203 goto invalid_code;
3204 break;
3205
3206 case '0': case '2': case '3': case '4': /* start composition */
3207 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3208 goto invalid_code;
3209 DECODE_COMPOSITION_START (c1);
3210 continue;
3211
3212 case '1': /* end composition */
3213 if (composition_state == COMPOSING_NO)
3214 goto invalid_code;
3215 DECODE_COMPOSITION_END ();
3216 continue;
3217
3218 case '[': /* specification of direction */
3219 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3220 goto invalid_code;
3221 /* For the moment, nested direction is not supported.
3222 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3223 left-to-right, and nozero means right-to-left. */
3224 ONE_MORE_BYTE (c1);
3225 switch (c1)
3226 {
3227 case ']': /* end of the current direction */
3228 coding->mode &= ~CODING_MODE_DIRECTION;
3229
3230 case '0': /* end of the current direction */
3231 case '1': /* start of left-to-right direction */
3232 ONE_MORE_BYTE (c1);
3233 if (c1 == ']')
3234 coding->mode &= ~CODING_MODE_DIRECTION;
3235 else
3236 goto invalid_code;
3237 break;
3238
3239 case '2': /* start of right-to-left direction */
3240 ONE_MORE_BYTE (c1);
3241 if (c1 == ']')
3242 coding->mode |= CODING_MODE_DIRECTION;
3243 else
3244 goto invalid_code;
3245 break;
3246
3247 default:
3248 goto invalid_code;
3249 }
3250 continue;
3251
3252 case '%':
3253 ONE_MORE_BYTE (c1);
3254 if (c1 == '/')
3255 {
3256 /* CTEXT extended segment:
3257 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3258 We keep these bytes as is for the moment.
3259 They may be decoded by post-read-conversion. */
3260 int dim, M, L;
3261 int size;
3262
3263 ONE_MORE_BYTE (dim);
3264 ONE_MORE_BYTE (M);
3265 ONE_MORE_BYTE (L);
3266 size = ((M - 128) * 128) + (L - 128);
3267 if (charbuf + 8 + size > charbuf_end)
3268 goto break_loop;
3269 *charbuf++ = ISO_CODE_ESC;
3270 *charbuf++ = '%';
3271 *charbuf++ = '/';
3272 *charbuf++ = dim;
3273 *charbuf++ = BYTE8_TO_CHAR (M);
3274 *charbuf++ = BYTE8_TO_CHAR (L);
3275 while (size-- > 0)
3276 {
3277 ONE_MORE_BYTE (c1);
3278 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3279 }
3280 }
3281 else if (c1 == 'G')
3282 {
3283 /* XFree86 extension for embedding UTF-8 in CTEXT:
3284 ESC % G --UTF-8-BYTES-- ESC % @
3285 We keep these bytes as is for the moment.
3286 They may be decoded by post-read-conversion. */
3287 int *p = charbuf;
3288
3289 if (p + 6 > charbuf_end)
3290 goto break_loop;
3291 *p++ = ISO_CODE_ESC;
3292 *p++ = '%';
3293 *p++ = 'G';
3294 while (p < charbuf_end)
3295 {
3296 ONE_MORE_BYTE (c1);
3297 if (c1 == ISO_CODE_ESC
3298 && src + 1 < src_end
3299 && src[0] == '%'
3300 && src[1] == '@')
3301 {
3302 src += 2;
3303 break;
3304 }
3305 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3306 }
3307 if (p + 3 > charbuf_end)
3308 goto break_loop;
3309 *p++ = ISO_CODE_ESC;
3310 *p++ = '%';
3311 *p++ = '@';
3312 charbuf = p;
3313 }
3314 else
3315 goto invalid_code;
3316 continue;
3317 break;
3318
3319 default:
3320 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3321 goto invalid_code;
3322 {
3323 int reg, chars96;
3324
3325 if (c1 >= 0x28 && c1 <= 0x2B)
3326 { /* designation of DIMENSION1_CHARS94 character set */
3327 reg = c1 - 0x28, chars96 = 0;
3328 ONE_MORE_BYTE (c1);
3329 }
3330 else if (c1 >= 0x2C && c1 <= 0x2F)
3331 { /* designation of DIMENSION1_CHARS96 character set */
3332 reg = c1 - 0x2C, chars96 = 1;
3333 ONE_MORE_BYTE (c1);
3334 }
3335 else
3336 goto invalid_code;
3337 DECODE_DESIGNATION (reg, 1, chars96, c1);
3338 /* We must update these variables now. */
3339 if (reg == 0)
3340 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3341 else if (reg == 1)
3342 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3343 if (chars96 < 0)
3344 goto invalid_code;
3345 }
3346 continue;
3347 }
3348 }
3349
3350 if (charset->id != charset_ascii
3351 && last_id != charset->id)
3352 {
3353 if (last_id != charset_ascii)
3354 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3355 last_id = charset->id;
3356 last_offset = char_offset;
3357 }
3358
3359 /* Now we know CHARSET and 1st position code C1 of a character.
3360 Produce a decoded character while getting 2nd position code
3361 C2 if necessary. */
3362 c1 &= 0x7F;
3363 if (CHARSET_DIMENSION (charset) > 1)
3364 {
3365 ONE_MORE_BYTE (c2);
3366 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3367 /* C2 is not in a valid range. */
3368 goto invalid_code;
3369 c1 = (c1 << 8) | (c2 & 0x7F);
3370 if (CHARSET_DIMENSION (charset) > 2)
3371 {
3372 ONE_MORE_BYTE (c2);
3373 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3374 /* C2 is not in a valid range. */
3375 goto invalid_code;
3376 c1 = (c1 << 8) | (c2 & 0x7F);
3377 }
3378 }
3379
3380 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3381 if (c < 0)
3382 {
3383 MAYBE_FINISH_COMPOSITION ();
3384 for (; src_base < src; src_base++, char_offset++)
3385 {
3386 if (ASCII_BYTE_P (*src_base))
3387 *charbuf++ = *src_base;
3388 else
3389 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3390 }
3391 }
3392 else if (composition_state == COMPOSING_NO)
3393 {
3394 *charbuf++ = c;
3395 char_offset++;
3396 }
3397 else
3398 {
3399 components[component_idx++] = c;
3400 if (method == COMPOSITION_WITH_RULE
3401 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3402 && composition_state == COMPOSING_COMPONENT_CHAR))
3403 composition_state++;
3404 }
3405 continue;
3406
3407 invalid_code:
3408 MAYBE_FINISH_COMPOSITION ();
3409 src = src_base;
3410 consumed_chars = consumed_chars_base;
3411 ONE_MORE_BYTE (c);
3412 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3413 char_offset++;
3414 coding->errors++;
3415 continue;
3416
3417 break_loop:
3418 break;
3419 }
3420
3421 no_more_source:
3422 if (last_id != charset_ascii)
3423 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3424 coding->consumed_char += consumed_chars_base;
3425 coding->consumed = src_base - coding->source;
3426 coding->charbuf_used = charbuf - coding->charbuf;
3427 }
3428
3429
3430 /* ISO2022 encoding stuff. */
3431
3432 /*
3433 It is not enough to say just "ISO2022" on encoding, we have to
3434 specify more details. In Emacs, each coding system of ISO2022
3435 variant has the following specifications:
3436 1. Initial designation to G0 thru G3.
3437 2. Allows short-form designation?
3438 3. ASCII should be designated to G0 before control characters?
3439 4. ASCII should be designated to G0 at end of line?
3440 5. 7-bit environment or 8-bit environment?
3441 6. Use locking-shift?
3442 7. Use Single-shift?
3443 And the following two are only for Japanese:
3444 8. Use ASCII in place of JIS0201-1976-Roman?
3445 9. Use JISX0208-1983 in place of JISX0208-1978?
3446 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3447 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3448 details.
3449 */
3450
3451 /* Produce codes (escape sequence) for designating CHARSET to graphic
3452 register REG at DST, and increment DST. If <final-char> of CHARSET is
3453 '@', 'A', or 'B' and the coding system CODING allows, produce
3454 designation sequence of short-form. */
3455
3456 #define ENCODE_DESIGNATION(charset, reg, coding) \
3457 do { \
3458 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3459 char *intermediate_char_94 = "()*+"; \
3460 char *intermediate_char_96 = ",-./"; \
3461 int revision = -1; \
3462 int c; \
3463 \
3464 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3465 revision = CHARSET_ISO_REVISION (charset); \
3466 \
3467 if (revision >= 0) \
3468 { \
3469 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3470 EMIT_ONE_BYTE ('@' + revision); \
3471 } \
3472 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3473 if (CHARSET_DIMENSION (charset) == 1) \
3474 { \
3475 if (! CHARSET_ISO_CHARS_96 (charset)) \
3476 c = intermediate_char_94[reg]; \
3477 else \
3478 c = intermediate_char_96[reg]; \
3479 EMIT_ONE_ASCII_BYTE (c); \
3480 } \
3481 else \
3482 { \
3483 EMIT_ONE_ASCII_BYTE ('$'); \
3484 if (! CHARSET_ISO_CHARS_96 (charset)) \
3485 { \
3486 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3487 || reg != 0 \
3488 || final_char < '@' || final_char > 'B') \
3489 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3490 } \
3491 else \
3492 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3493 } \
3494 EMIT_ONE_ASCII_BYTE (final_char); \
3495 \
3496 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3497 } while (0)
3498
3499
3500 /* The following two macros produce codes (control character or escape
3501 sequence) for ISO2022 single-shift functions (single-shift-2 and
3502 single-shift-3). */
3503
3504 #define ENCODE_SINGLE_SHIFT_2 \
3505 do { \
3506 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3507 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3508 else \
3509 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3510 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3511 } while (0)
3512
3513
3514 #define ENCODE_SINGLE_SHIFT_3 \
3515 do { \
3516 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3517 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3518 else \
3519 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3520 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3521 } while (0)
3522
3523
3524 /* The following four macros produce codes (control character or
3525 escape sequence) for ISO2022 locking-shift functions (shift-in,
3526 shift-out, locking-shift-2, and locking-shift-3). */
3527
3528 #define ENCODE_SHIFT_IN \
3529 do { \
3530 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3531 CODING_ISO_INVOCATION (coding, 0) = 0; \
3532 } while (0)
3533
3534
3535 #define ENCODE_SHIFT_OUT \
3536 do { \
3537 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3538 CODING_ISO_INVOCATION (coding, 0) = 1; \
3539 } while (0)
3540
3541
3542 #define ENCODE_LOCKING_SHIFT_2 \
3543 do { \
3544 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3545 CODING_ISO_INVOCATION (coding, 0) = 2; \
3546 } while (0)
3547
3548
3549 #define ENCODE_LOCKING_SHIFT_3 \
3550 do { \
3551 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3552 CODING_ISO_INVOCATION (coding, 0) = 3; \
3553 } while (0)
3554
3555
3556 /* Produce codes for a DIMENSION1 character whose character set is
3557 CHARSET and whose position-code is C1. Designation and invocation
3558 sequences are also produced in advance if necessary. */
3559
3560 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3561 do { \
3562 int id = CHARSET_ID (charset); \
3563 \
3564 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3565 && id == charset_ascii) \
3566 { \
3567 id = charset_jisx0201_roman; \
3568 charset = CHARSET_FROM_ID (id); \
3569 } \
3570 \
3571 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3572 { \
3573 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3574 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3575 else \
3576 EMIT_ONE_BYTE (c1 | 0x80); \
3577 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3578 break; \
3579 } \
3580 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3581 { \
3582 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3583 break; \
3584 } \
3585 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3586 { \
3587 EMIT_ONE_BYTE (c1 | 0x80); \
3588 break; \
3589 } \
3590 else \
3591 /* Since CHARSET is not yet invoked to any graphic planes, we \
3592 must invoke it, or, at first, designate it to some graphic \
3593 register. Then repeat the loop to actually produce the \
3594 character. */ \
3595 dst = encode_invocation_designation (charset, coding, dst, \
3596 &produced_chars); \
3597 } while (1)
3598
3599
3600 /* Produce codes for a DIMENSION2 character whose character set is
3601 CHARSET and whose position-codes are C1 and C2. Designation and
3602 invocation codes are also produced in advance if necessary. */
3603
3604 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3605 do { \
3606 int id = CHARSET_ID (charset); \
3607 \
3608 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3609 && id == charset_jisx0208) \
3610 { \
3611 id = charset_jisx0208_1978; \
3612 charset = CHARSET_FROM_ID (id); \
3613 } \
3614 \
3615 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3616 { \
3617 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3618 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3619 else \
3620 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3621 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3622 break; \
3623 } \
3624 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3625 { \
3626 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3627 break; \
3628 } \
3629 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3630 { \
3631 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3632 break; \
3633 } \
3634 else \
3635 /* Since CHARSET is not yet invoked to any graphic planes, we \
3636 must invoke it, or, at first, designate it to some graphic \
3637 register. Then repeat the loop to actually produce the \
3638 character. */ \
3639 dst = encode_invocation_designation (charset, coding, dst, \
3640 &produced_chars); \
3641 } while (1)
3642
3643
3644 #define ENCODE_ISO_CHARACTER(charset, c) \
3645 do { \
3646 int code = ENCODE_CHAR ((charset),(c)); \
3647 \
3648 if (CHARSET_DIMENSION (charset) == 1) \
3649 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3650 else \
3651 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3652 } while (0)
3653
3654
3655 /* Produce designation and invocation codes at a place pointed by DST
3656 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3657 Return new DST. */
3658
3659 unsigned char *
3660 encode_invocation_designation (charset, coding, dst, p_nchars)
3661 struct charset *charset;
3662 struct coding_system *coding;
3663 unsigned char *dst;
3664 int *p_nchars;
3665 {
3666 int multibytep = coding->dst_multibyte;
3667 int produced_chars = *p_nchars;
3668 int reg; /* graphic register number */
3669 int id = CHARSET_ID (charset);
3670
3671 /* At first, check designations. */
3672 for (reg = 0; reg < 4; reg++)
3673 if (id == CODING_ISO_DESIGNATION (coding, reg))
3674 break;
3675
3676 if (reg >= 4)
3677 {
3678 /* CHARSET is not yet designated to any graphic registers. */
3679 /* At first check the requested designation. */
3680 reg = CODING_ISO_REQUEST (coding, id);
3681 if (reg < 0)
3682 /* Since CHARSET requests no special designation, designate it
3683 to graphic register 0. */
3684 reg = 0;
3685
3686 ENCODE_DESIGNATION (charset, reg, coding);
3687 }
3688
3689 if (CODING_ISO_INVOCATION (coding, 0) != reg
3690 && CODING_ISO_INVOCATION (coding, 1) != reg)
3691 {
3692 /* Since the graphic register REG is not invoked to any graphic
3693 planes, invoke it to graphic plane 0. */
3694 switch (reg)
3695 {
3696 case 0: /* graphic register 0 */
3697 ENCODE_SHIFT_IN;
3698 break;
3699
3700 case 1: /* graphic register 1 */
3701 ENCODE_SHIFT_OUT;
3702 break;
3703
3704 case 2: /* graphic register 2 */
3705 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3706 ENCODE_SINGLE_SHIFT_2;
3707 else
3708 ENCODE_LOCKING_SHIFT_2;
3709 break;
3710
3711 case 3: /* graphic register 3 */
3712 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3713 ENCODE_SINGLE_SHIFT_3;
3714 else
3715 ENCODE_LOCKING_SHIFT_3;
3716 break;
3717 }
3718 }
3719
3720 *p_nchars = produced_chars;
3721 return dst;
3722 }
3723
3724 /* The following three macros produce codes for indicating direction
3725 of text. */
3726 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3727 do { \
3728 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3729 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3730 else \
3731 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3732 } while (0)
3733
3734
3735 #define ENCODE_DIRECTION_R2L() \
3736 do { \
3737 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3738 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3739 } while (0)
3740
3741
3742 #define ENCODE_DIRECTION_L2R() \
3743 do { \
3744 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3745 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3746 } while (0)
3747
3748
3749 /* Produce codes for designation and invocation to reset the graphic
3750 planes and registers to initial state. */
3751 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3752 do { \
3753 int reg; \
3754 struct charset *charset; \
3755 \
3756 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3757 ENCODE_SHIFT_IN; \
3758 for (reg = 0; reg < 4; reg++) \
3759 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3760 && (CODING_ISO_DESIGNATION (coding, reg) \
3761 != CODING_ISO_INITIAL (coding, reg))) \
3762 { \
3763 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3764 ENCODE_DESIGNATION (charset, reg, coding); \
3765 } \
3766 } while (0)
3767
3768
3769 /* Produce designation sequences of charsets in the line started from
3770 SRC to a place pointed by DST, and return updated DST.
3771
3772 If the current block ends before any end-of-line, we may fail to
3773 find all the necessary designations. */
3774
3775 static unsigned char *
3776 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3777 struct coding_system *coding;
3778 int *charbuf, *charbuf_end;
3779 unsigned char *dst;
3780 {
3781 struct charset *charset;
3782 /* Table of charsets to be designated to each graphic register. */
3783 int r[4];
3784 int c, found = 0, reg;
3785 int produced_chars = 0;
3786 int multibytep = coding->dst_multibyte;
3787 Lisp_Object attrs;
3788 Lisp_Object charset_list;
3789
3790 attrs = CODING_ID_ATTRS (coding->id);
3791 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3792 if (EQ (charset_list, Qiso_2022))
3793 charset_list = Viso_2022_charset_list;
3794
3795 for (reg = 0; reg < 4; reg++)
3796 r[reg] = -1;
3797
3798 while (found < 4)
3799 {
3800 int id;
3801
3802 c = *charbuf++;
3803 if (c == '\n')
3804 break;
3805 charset = char_charset (c, charset_list, NULL);
3806 id = CHARSET_ID (charset);
3807 reg = CODING_ISO_REQUEST (coding, id);
3808 if (reg >= 0 && r[reg] < 0)
3809 {
3810 found++;
3811 r[reg] = id;
3812 }
3813 }
3814
3815 if (found)
3816 {
3817 for (reg = 0; reg < 4; reg++)
3818 if (r[reg] >= 0
3819 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3820 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3821 }
3822
3823 return dst;
3824 }
3825
3826 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3827
3828 static int
3829 encode_coding_iso_2022 (coding)
3830 struct coding_system *coding;
3831 {
3832 int multibytep = coding->dst_multibyte;
3833 int *charbuf = coding->charbuf;
3834 int *charbuf_end = charbuf + coding->charbuf_used;
3835 unsigned char *dst = coding->destination + coding->produced;
3836 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3837 int safe_room = 16;
3838 int bol_designation
3839 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3840 && CODING_ISO_BOL (coding));
3841 int produced_chars = 0;
3842 Lisp_Object attrs, eol_type, charset_list;
3843 int ascii_compatible;
3844 int c;
3845 int preferred_charset_id = -1;
3846
3847 CODING_GET_INFO (coding, attrs, charset_list);
3848 eol_type = CODING_ID_EOL_TYPE (coding->id);
3849 if (VECTORP (eol_type))
3850 eol_type = Qunix;
3851
3852 setup_iso_safe_charsets (attrs);
3853 /* Charset list may have been changed. */
3854 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3855 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3856
3857 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3858
3859 while (charbuf < charbuf_end)
3860 {
3861 ASSURE_DESTINATION (safe_room);
3862
3863 if (bol_designation)
3864 {
3865 unsigned char *dst_prev = dst;
3866
3867 /* We have to produce designation sequences if any now. */
3868 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3869 bol_designation = 0;
3870 /* We are sure that designation sequences are all ASCII bytes. */
3871 produced_chars += dst - dst_prev;
3872 }
3873
3874 c = *charbuf++;
3875
3876 if (c < 0)
3877 {
3878 /* Handle an annotation. */
3879 switch (*charbuf)
3880 {
3881 case CODING_ANNOTATE_COMPOSITION_MASK:
3882 /* Not yet implemented. */
3883 break;
3884 case CODING_ANNOTATE_CHARSET_MASK:
3885 preferred_charset_id = charbuf[2];
3886 if (preferred_charset_id >= 0
3887 && NILP (Fmemq (make_number (preferred_charset_id),
3888 charset_list)))
3889 preferred_charset_id = -1;
3890 break;
3891 default:
3892 abort ();
3893 }
3894 charbuf += -c - 1;
3895 continue;
3896 }
3897
3898 /* Now encode the character C. */
3899 if (c < 0x20 || c == 0x7F)
3900 {
3901 if (c == '\n'
3902 || (c == '\r' && EQ (eol_type, Qmac)))
3903 {
3904 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3905 ENCODE_RESET_PLANE_AND_REGISTER ();
3906 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3907 {
3908 int i;
3909
3910 for (i = 0; i < 4; i++)
3911 CODING_ISO_DESIGNATION (coding, i)
3912 = CODING_ISO_INITIAL (coding, i);
3913 }
3914 bol_designation
3915 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3916 }
3917 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3918 ENCODE_RESET_PLANE_AND_REGISTER ();
3919 EMIT_ONE_ASCII_BYTE (c);
3920 }
3921 else if (ASCII_CHAR_P (c))
3922 {
3923 if (ascii_compatible)
3924 EMIT_ONE_ASCII_BYTE (c);
3925 else
3926 {
3927 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3928 ENCODE_ISO_CHARACTER (charset, c);
3929 }
3930 }
3931 else if (CHAR_BYTE8_P (c))
3932 {
3933 c = CHAR_TO_BYTE8 (c);
3934 EMIT_ONE_BYTE (c);
3935 }
3936 else
3937 {
3938 struct charset *charset;
3939
3940 if (preferred_charset_id >= 0)
3941 {
3942 charset = CHARSET_FROM_ID (preferred_charset_id);
3943 if (! CHAR_CHARSET_P (c, charset))
3944 charset = char_charset (c, charset_list, NULL);
3945 }
3946 else
3947 charset = char_charset (c, charset_list, NULL);
3948 if (!charset)
3949 {
3950 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3951 {
3952 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3953 charset = CHARSET_FROM_ID (charset_ascii);
3954 }
3955 else
3956 {
3957 c = coding->default_char;
3958 charset = char_charset (c, charset_list, NULL);
3959 }
3960 }
3961 ENCODE_ISO_CHARACTER (charset, c);
3962 }
3963 }
3964
3965 if (coding->mode & CODING_MODE_LAST_BLOCK
3966 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3967 {
3968 ASSURE_DESTINATION (safe_room);
3969 ENCODE_RESET_PLANE_AND_REGISTER ();
3970 }
3971 record_conversion_result (coding, CODING_RESULT_SUCCESS);
3972 CODING_ISO_BOL (coding) = bol_designation;
3973 coding->produced_char += produced_chars;
3974 coding->produced = dst - coding->destination;
3975 return 0;
3976 }
3977
3978 \f
3979 /*** 8,9. SJIS and BIG5 handlers ***/
3980
3981 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3982 quite widely. So, for the moment, Emacs supports them in the bare
3983 C code. But, in the future, they may be supported only by CCL. */
3984
3985 /* SJIS is a coding system encoding three character sets: ASCII, right
3986 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3987 as is. A character of charset katakana-jisx0201 is encoded by
3988 "position-code + 0x80". A character of charset japanese-jisx0208
3989 is encoded in 2-byte but two position-codes are divided and shifted
3990 so that it fit in the range below.
3991
3992 --- CODE RANGE of SJIS ---
3993 (character set) (range)
3994 ASCII 0x00 .. 0x7F
3995 KATAKANA-JISX0201 0xA0 .. 0xDF
3996 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3997 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3998 -------------------------------
3999
4000 */
4001
4002 /* BIG5 is a coding system encoding two character sets: ASCII and
4003 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4004 character set and is encoded in two-byte.
4005
4006 --- CODE RANGE of BIG5 ---
4007 (character set) (range)
4008 ASCII 0x00 .. 0x7F
4009 Big5 (1st byte) 0xA1 .. 0xFE
4010 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4011 --------------------------
4012
4013 */
4014
4015 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4016 Check if a text is encoded in SJIS. If it is, return
4017 CATEGORY_MASK_SJIS, else return 0. */
4018
4019 static int
4020 detect_coding_sjis (coding, detect_info)
4021 struct coding_system *coding;
4022 struct coding_detection_info *detect_info;
4023 {
4024 const unsigned char *src = coding->source, *src_base;
4025 const unsigned char *src_end = coding->source + coding->src_bytes;
4026 int multibytep = coding->src_multibyte;
4027 int consumed_chars = 0;
4028 int found = 0;
4029 int c;
4030
4031 detect_info->checked |= CATEGORY_MASK_SJIS;
4032 /* A coding system of this category is always ASCII compatible. */
4033 src += coding->head_ascii;
4034
4035 while (1)
4036 {
4037 src_base = src;
4038 ONE_MORE_BYTE (c);
4039 if (c < 0x80)
4040 continue;
4041 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4042 {
4043 ONE_MORE_BYTE (c);
4044 if (c < 0x40 || c == 0x7F || c > 0xFC)
4045 break;
4046 found = CATEGORY_MASK_SJIS;
4047 }
4048 else if (c >= 0xA0 && c < 0xE0)
4049 found = CATEGORY_MASK_SJIS;
4050 else
4051 break;
4052 }
4053 detect_info->rejected |= CATEGORY_MASK_SJIS;
4054 return 0;
4055
4056 no_more_source:
4057 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4058 {
4059 detect_info->rejected |= CATEGORY_MASK_SJIS;
4060 return 0;
4061 }
4062 detect_info->found |= found;
4063 return 1;
4064 }
4065
4066 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4067 Check if a text is encoded in BIG5. If it is, return
4068 CATEGORY_MASK_BIG5, else return 0. */
4069
4070 static int
4071 detect_coding_big5 (coding, detect_info)
4072 struct coding_system *coding;
4073 struct coding_detection_info *detect_info;
4074 {
4075 const unsigned char *src = coding->source, *src_base;
4076 const unsigned char *src_end = coding->source + coding->src_bytes;
4077 int multibytep = coding->src_multibyte;
4078 int consumed_chars = 0;
4079 int found = 0;
4080 int c;
4081
4082 detect_info->checked |= CATEGORY_MASK_BIG5;
4083 /* A coding system of this category is always ASCII compatible. */
4084 src += coding->head_ascii;
4085
4086 while (1)
4087 {
4088 src_base = src;
4089 ONE_MORE_BYTE (c);
4090 if (c < 0x80)
4091 continue;
4092 if (c >= 0xA1)
4093 {
4094 ONE_MORE_BYTE (c);
4095 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4096 return 0;
4097 found = CATEGORY_MASK_BIG5;
4098 }
4099 else
4100 break;
4101 }
4102 detect_info->rejected |= CATEGORY_MASK_BIG5;
4103 return 0;
4104
4105 no_more_source:
4106 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4107 {
4108 detect_info->rejected |= CATEGORY_MASK_BIG5;
4109 return 0;
4110 }
4111 detect_info->found |= found;
4112 return 1;
4113 }
4114
4115 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4116 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4117
4118 static void
4119 decode_coding_sjis (coding)
4120 struct coding_system *coding;
4121 {
4122 const unsigned char *src = coding->source + coding->consumed;
4123 const unsigned char *src_end = coding->source + coding->src_bytes;
4124 const unsigned char *src_base;
4125 int *charbuf = coding->charbuf + coding->charbuf_used;
4126 int *charbuf_end
4127 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4128 int consumed_chars = 0, consumed_chars_base;
4129 int multibytep = coding->src_multibyte;
4130 struct charset *charset_roman, *charset_kanji, *charset_kana;
4131 struct charset *charset_kanji2;
4132 Lisp_Object attrs, charset_list, val;
4133 int char_offset = coding->produced_char;
4134 int last_offset = char_offset;
4135 int last_id = charset_ascii;
4136 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4137 int byte_after_cr = -1;
4138
4139 CODING_GET_INFO (coding, attrs, charset_list);
4140
4141 val = charset_list;
4142 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4143 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4144 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4145 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4146
4147 while (1)
4148 {
4149 int c, c1;
4150 struct charset *charset;
4151
4152 src_base = src;
4153 consumed_chars_base = consumed_chars;
4154
4155 if (charbuf >= charbuf_end)
4156 break;
4157
4158 if (byte_after_cr >= 0)
4159 c = byte_after_cr, byte_after_cr = -1;
4160 else
4161 ONE_MORE_BYTE (c);
4162 if (c < 0)
4163 goto invalid_code;
4164 if (c < 0x80)
4165 {
4166 if (eol_crlf && c == '\r')
4167 ONE_MORE_BYTE (byte_after_cr);
4168 charset = charset_roman;
4169 }
4170 else if (c == 0x80 || c == 0xA0)
4171 goto invalid_code;
4172 else if (c >= 0xA1 && c <= 0xDF)
4173 {
4174 /* SJIS -> JISX0201-Kana */
4175 c &= 0x7F;
4176 charset = charset_kana;
4177 }
4178 else if (c <= 0xEF)
4179 {
4180 /* SJIS -> JISX0208 */
4181 ONE_MORE_BYTE (c1);
4182 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4183 goto invalid_code;
4184 c = (c << 8) | c1;
4185 SJIS_TO_JIS (c);
4186 charset = charset_kanji;
4187 }
4188 else if (c <= 0xFC && charset_kanji2)
4189 {
4190 /* SJIS -> JISX0213-2 */
4191 ONE_MORE_BYTE (c1);
4192 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4193 goto invalid_code;
4194 c = (c << 8) | c1;
4195 SJIS_TO_JIS2 (c);
4196 charset = charset_kanji2;
4197 }
4198 else
4199 goto invalid_code;
4200 if (charset->id != charset_ascii
4201 && last_id != charset->id)
4202 {
4203 if (last_id != charset_ascii)
4204 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4205 last_id = charset->id;
4206 last_offset = char_offset;
4207 }
4208 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4209 *charbuf++ = c;
4210 char_offset++;
4211 continue;
4212
4213 invalid_code:
4214 src = src_base;
4215 consumed_chars = consumed_chars_base;
4216 ONE_MORE_BYTE (c);
4217 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4218 char_offset++;
4219 coding->errors++;
4220 }
4221
4222 no_more_source:
4223 if (last_id != charset_ascii)
4224 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4225 coding->consumed_char += consumed_chars_base;
4226 coding->consumed = src_base - coding->source;
4227 coding->charbuf_used = charbuf - coding->charbuf;
4228 }
4229
4230 static void
4231 decode_coding_big5 (coding)
4232 struct coding_system *coding;
4233 {
4234 const unsigned char *src = coding->source + coding->consumed;
4235 const unsigned char *src_end = coding->source + coding->src_bytes;
4236 const unsigned char *src_base;
4237 int *charbuf = coding->charbuf + coding->charbuf_used;
4238 int *charbuf_end
4239 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4240 int consumed_chars = 0, consumed_chars_base;
4241 int multibytep = coding->src_multibyte;
4242 struct charset *charset_roman, *charset_big5;
4243 Lisp_Object attrs, charset_list, val;
4244 int char_offset = coding->produced_char;
4245 int last_offset = char_offset;
4246 int last_id = charset_ascii;
4247 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4248 int byte_after_cr = -1;
4249
4250 CODING_GET_INFO (coding, attrs, charset_list);
4251 val = charset_list;
4252 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4253 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4254
4255 while (1)
4256 {
4257 int c, c1;
4258 struct charset *charset;
4259
4260 src_base = src;
4261 consumed_chars_base = consumed_chars;
4262
4263 if (charbuf >= charbuf_end)
4264 break;
4265
4266 if (byte_after_cr >= 0)
4267 c = byte_after_cr, byte_after_cr = -1;
4268 else
4269 ONE_MORE_BYTE (c);
4270
4271 if (c < 0)
4272 goto invalid_code;
4273 if (c < 0x80)
4274 {
4275 if (eol_crlf && c == '\r')
4276 ONE_MORE_BYTE (byte_after_cr);
4277 charset = charset_roman;
4278 }
4279 else
4280 {
4281 /* BIG5 -> Big5 */
4282 if (c < 0xA1 || c > 0xFE)
4283 goto invalid_code;
4284 ONE_MORE_BYTE (c1);
4285 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4286 goto invalid_code;
4287 c = c << 8 | c1;
4288 charset = charset_big5;
4289 }
4290 if (charset->id != charset_ascii
4291 && last_id != charset->id)
4292 {
4293 if (last_id != charset_ascii)
4294 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4295 last_id = charset->id;
4296 last_offset = char_offset;
4297 }
4298 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4299 *charbuf++ = c;
4300 char_offset++;
4301 continue;
4302
4303 invalid_code:
4304 src = src_base;
4305 consumed_chars = consumed_chars_base;
4306 ONE_MORE_BYTE (c);
4307 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4308 char_offset++;
4309 coding->errors++;
4310 }
4311
4312 no_more_source:
4313 if (last_id != charset_ascii)
4314 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4315 coding->consumed_char += consumed_chars_base;
4316 coding->consumed = src_base - coding->source;
4317 coding->charbuf_used = charbuf - coding->charbuf;
4318 }
4319
4320 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4321 This function can encode charsets `ascii', `katakana-jisx0201',
4322 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4323 are sure that all these charsets are registered as official charset
4324 (i.e. do not have extended leading-codes). Characters of other
4325 charsets are produced without any encoding. If SJIS_P is 1, encode
4326 SJIS text, else encode BIG5 text. */
4327
4328 static int
4329 encode_coding_sjis (coding)
4330 struct coding_system *coding;
4331 {
4332 int multibytep = coding->dst_multibyte;
4333 int *charbuf = coding->charbuf;
4334 int *charbuf_end = charbuf + coding->charbuf_used;
4335 unsigned char *dst = coding->destination + coding->produced;
4336 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4337 int safe_room = 4;
4338 int produced_chars = 0;
4339 Lisp_Object attrs, charset_list, val;
4340 int ascii_compatible;
4341 struct charset *charset_roman, *charset_kanji, *charset_kana;
4342 struct charset *charset_kanji2;
4343 int c;
4344
4345 CODING_GET_INFO (coding, attrs, charset_list);
4346 val = charset_list;
4347 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4348 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4349 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4350 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4351
4352 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4353
4354 while (charbuf < charbuf_end)
4355 {
4356 ASSURE_DESTINATION (safe_room);
4357 c = *charbuf++;
4358 /* Now encode the character C. */
4359 if (ASCII_CHAR_P (c) && ascii_compatible)
4360 EMIT_ONE_ASCII_BYTE (c);
4361 else if (CHAR_BYTE8_P (c))
4362 {
4363 c = CHAR_TO_BYTE8 (c);
4364 EMIT_ONE_BYTE (c);
4365 }
4366 else
4367 {
4368 unsigned code;
4369 struct charset *charset = char_charset (c, charset_list, &code);
4370
4371 if (!charset)
4372 {
4373 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4374 {
4375 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4376 charset = CHARSET_FROM_ID (charset_ascii);
4377 }
4378 else
4379 {
4380 c = coding->default_char;
4381 charset = char_charset (c, charset_list, &code);
4382 }
4383 }
4384 if (code == CHARSET_INVALID_CODE (charset))
4385 abort ();
4386 if (charset == charset_kanji)
4387 {
4388 int c1, c2;
4389 JIS_TO_SJIS (code);
4390 c1 = code >> 8, c2 = code & 0xFF;
4391 EMIT_TWO_BYTES (c1, c2);
4392 }
4393 else if (charset == charset_kana)
4394 EMIT_ONE_BYTE (code | 0x80);
4395 else if (charset_kanji2 && charset == charset_kanji2)
4396 {
4397 int c1, c2;
4398
4399 c1 = code >> 8;
4400 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4401 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4402 {
4403 JIS_TO_SJIS2 (code);
4404 c1 = code >> 8, c2 = code & 0xFF;
4405 EMIT_TWO_BYTES (c1, c2);
4406 }
4407 else
4408 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4409 }
4410 else
4411 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4412 }
4413 }
4414 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4415 coding->produced_char += produced_chars;
4416 coding->produced = dst - coding->destination;
4417 return 0;
4418 }
4419
4420 static int
4421 encode_coding_big5 (coding)
4422 struct coding_system *coding;
4423 {
4424 int multibytep = coding->dst_multibyte;
4425 int *charbuf = coding->charbuf;
4426 int *charbuf_end = charbuf + coding->charbuf_used;
4427 unsigned char *dst = coding->destination + coding->produced;
4428 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4429 int safe_room = 4;
4430 int produced_chars = 0;
4431 Lisp_Object attrs, charset_list, val;
4432 int ascii_compatible;
4433 struct charset *charset_roman, *charset_big5;
4434 int c;
4435
4436 CODING_GET_INFO (coding, attrs, charset_list);
4437 val = charset_list;
4438 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4439 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4440 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4441
4442 while (charbuf < charbuf_end)
4443 {
4444 ASSURE_DESTINATION (safe_room);
4445 c = *charbuf++;
4446 /* Now encode the character C. */
4447 if (ASCII_CHAR_P (c) && ascii_compatible)
4448 EMIT_ONE_ASCII_BYTE (c);
4449 else if (CHAR_BYTE8_P (c))
4450 {
4451 c = CHAR_TO_BYTE8 (c);
4452 EMIT_ONE_BYTE (c);
4453 }
4454 else
4455 {
4456 unsigned code;
4457 struct charset *charset = char_charset (c, charset_list, &code);
4458
4459 if (! charset)
4460 {
4461 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4462 {
4463 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4464 charset = CHARSET_FROM_ID (charset_ascii);
4465 }
4466 else
4467 {
4468 c = coding->default_char;
4469 charset = char_charset (c, charset_list, &code);
4470 }
4471 }
4472 if (code == CHARSET_INVALID_CODE (charset))
4473 abort ();
4474 if (charset == charset_big5)
4475 {
4476 int c1, c2;
4477
4478 c1 = code >> 8, c2 = code & 0xFF;
4479 EMIT_TWO_BYTES (c1, c2);
4480 }
4481 else
4482 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4483 }
4484 }
4485 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4486 coding->produced_char += produced_chars;
4487 coding->produced = dst - coding->destination;
4488 return 0;
4489 }
4490
4491 \f
4492 /*** 10. CCL handlers ***/
4493
4494 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4495 Check if a text is encoded in a coding system of which
4496 encoder/decoder are written in CCL program. If it is, return
4497 CATEGORY_MASK_CCL, else return 0. */
4498
4499 static int
4500 detect_coding_ccl (coding, detect_info)
4501 struct coding_system *coding;
4502 struct coding_detection_info *detect_info;
4503 {
4504 const unsigned char *src = coding->source, *src_base;
4505 const unsigned char *src_end = coding->source + coding->src_bytes;
4506 int multibytep = coding->src_multibyte;
4507 int consumed_chars = 0;
4508 int found = 0;
4509 unsigned char *valids;
4510 int head_ascii = coding->head_ascii;
4511 Lisp_Object attrs;
4512
4513 detect_info->checked |= CATEGORY_MASK_CCL;
4514
4515 coding = &coding_categories[coding_category_ccl];
4516 valids = CODING_CCL_VALIDS (coding);
4517 attrs = CODING_ID_ATTRS (coding->id);
4518 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4519 src += head_ascii;
4520
4521 while (1)
4522 {
4523 int c;
4524
4525 src_base = src;
4526 ONE_MORE_BYTE (c);
4527 if (c < 0 || ! valids[c])
4528 break;
4529 if ((valids[c] > 1))
4530 found = CATEGORY_MASK_CCL;
4531 }
4532 detect_info->rejected |= CATEGORY_MASK_CCL;
4533 return 0;
4534
4535 no_more_source:
4536 detect_info->found |= found;
4537 return 1;
4538 }
4539
4540 static void
4541 decode_coding_ccl (coding)
4542 struct coding_system *coding;
4543 {
4544 const unsigned char *src = coding->source + coding->consumed;
4545 const unsigned char *src_end = coding->source + coding->src_bytes;
4546 int *charbuf = coding->charbuf + coding->charbuf_used;
4547 int *charbuf_end = coding->charbuf + coding->charbuf_size;
4548 int consumed_chars = 0;
4549 int multibytep = coding->src_multibyte;
4550 struct ccl_program ccl;
4551 int source_charbuf[1024];
4552 int source_byteidx[1024];
4553 Lisp_Object attrs, charset_list;
4554
4555 CODING_GET_INFO (coding, attrs, charset_list);
4556 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4557
4558 while (src < src_end)
4559 {
4560 const unsigned char *p = src;
4561 int *source, *source_end;
4562 int i = 0;
4563
4564 if (multibytep)
4565 while (i < 1024 && p < src_end)
4566 {
4567 source_byteidx[i] = p - src;
4568 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4569 }
4570 else
4571 while (i < 1024 && p < src_end)
4572 source_charbuf[i++] = *p++;
4573
4574 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4575 ccl.last_block = 1;
4576
4577 source = source_charbuf;
4578 source_end = source + i;
4579 while (source < source_end)
4580 {
4581 ccl_driver (&ccl, source, charbuf,
4582 source_end - source, charbuf_end - charbuf,
4583 charset_list);
4584 source += ccl.consumed;
4585 charbuf += ccl.produced;
4586 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4587 break;
4588 }
4589 if (source < source_end)
4590 src += source_byteidx[source - source_charbuf];
4591 else
4592 src = p;
4593 consumed_chars += source - source_charbuf;
4594
4595 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4596 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4597 break;
4598 }
4599
4600 switch (ccl.status)
4601 {
4602 case CCL_STAT_SUSPEND_BY_SRC:
4603 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4604 break;
4605 case CCL_STAT_SUSPEND_BY_DST:
4606 break;
4607 case CCL_STAT_QUIT:
4608 case CCL_STAT_INVALID_CMD:
4609 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4610 break;
4611 default:
4612 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4613 break;
4614 }
4615 coding->consumed_char += consumed_chars;
4616 coding->consumed = src - coding->source;
4617 coding->charbuf_used = charbuf - coding->charbuf;
4618 }
4619
4620 static int
4621 encode_coding_ccl (coding)
4622 struct coding_system *coding;
4623 {
4624 struct ccl_program ccl;
4625 int multibytep = coding->dst_multibyte;
4626 int *charbuf = coding->charbuf;
4627 int *charbuf_end = charbuf + coding->charbuf_used;
4628 unsigned char *dst = coding->destination + coding->produced;
4629 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4630 int destination_charbuf[1024];
4631 int i, produced_chars = 0;
4632 Lisp_Object attrs, charset_list;
4633
4634 CODING_GET_INFO (coding, attrs, charset_list);
4635 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4636
4637 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4638 ccl.dst_multibyte = coding->dst_multibyte;
4639
4640 while (charbuf < charbuf_end)
4641 {
4642 ccl_driver (&ccl, charbuf, destination_charbuf,
4643 charbuf_end - charbuf, 1024, charset_list);
4644 if (multibytep)
4645 {
4646 ASSURE_DESTINATION (ccl.produced * 2);
4647 for (i = 0; i < ccl.produced; i++)
4648 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4649 }
4650 else
4651 {
4652 ASSURE_DESTINATION (ccl.produced);
4653 for (i = 0; i < ccl.produced; i++)
4654 *dst++ = destination_charbuf[i] & 0xFF;
4655 produced_chars += ccl.produced;
4656 }
4657 charbuf += ccl.consumed;
4658 if (ccl.status == CCL_STAT_QUIT
4659 || ccl.status == CCL_STAT_INVALID_CMD)
4660 break;
4661 }
4662
4663 switch (ccl.status)
4664 {
4665 case CCL_STAT_SUSPEND_BY_SRC:
4666 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4667 break;
4668 case CCL_STAT_SUSPEND_BY_DST:
4669 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4670 break;
4671 case CCL_STAT_QUIT:
4672 case CCL_STAT_INVALID_CMD:
4673 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4674 break;
4675 default:
4676 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4677 break;
4678 }
4679
4680 coding->produced_char += produced_chars;
4681 coding->produced = dst - coding->destination;
4682 return 0;
4683 }
4684
4685
4686 \f
4687 /*** 10, 11. no-conversion handlers ***/
4688
4689 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4690
4691 static void
4692 decode_coding_raw_text (coding)
4693 struct coding_system *coding;
4694 {
4695 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4696
4697 coding->chars_at_source = 1;
4698 coding->consumed_char = coding->src_chars;
4699 coding->consumed = coding->src_bytes;
4700 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4701 {
4702 coding->consumed_char--;
4703 coding->consumed--;
4704 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4705 }
4706 else
4707 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4708 }
4709
4710 static int
4711 encode_coding_raw_text (coding)
4712 struct coding_system *coding;
4713 {
4714 int multibytep = coding->dst_multibyte;
4715 int *charbuf = coding->charbuf;
4716 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4717 unsigned char *dst = coding->destination + coding->produced;
4718 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4719 int produced_chars = 0;
4720 int c;
4721
4722 if (multibytep)
4723 {
4724 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4725
4726 if (coding->src_multibyte)
4727 while (charbuf < charbuf_end)
4728 {
4729 ASSURE_DESTINATION (safe_room);
4730 c = *charbuf++;
4731 if (ASCII_CHAR_P (c))
4732 EMIT_ONE_ASCII_BYTE (c);
4733 else if (CHAR_BYTE8_P (c))
4734 {
4735 c = CHAR_TO_BYTE8 (c);
4736 EMIT_ONE_BYTE (c);
4737 }
4738 else
4739 {
4740 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4741
4742 CHAR_STRING_ADVANCE (c, p1);
4743 while (p0 < p1)
4744 {
4745 EMIT_ONE_BYTE (*p0);
4746 p0++;
4747 }
4748 }
4749 }
4750 else
4751 while (charbuf < charbuf_end)
4752 {
4753 ASSURE_DESTINATION (safe_room);
4754 c = *charbuf++;
4755 EMIT_ONE_BYTE (c);
4756 }
4757 }
4758 else
4759 {
4760 if (coding->src_multibyte)
4761 {
4762 int safe_room = MAX_MULTIBYTE_LENGTH;
4763
4764 while (charbuf < charbuf_end)
4765 {
4766 ASSURE_DESTINATION (safe_room);
4767 c = *charbuf++;
4768 if (ASCII_CHAR_P (c))
4769 *dst++ = c;
4770 else if (CHAR_BYTE8_P (c))
4771 *dst++ = CHAR_TO_BYTE8 (c);
4772 else
4773 CHAR_STRING_ADVANCE (c, dst);
4774 produced_chars++;
4775 }
4776 }
4777 else
4778 {
4779 ASSURE_DESTINATION (charbuf_end - charbuf);
4780 while (charbuf < charbuf_end && dst < dst_end)
4781 *dst++ = *charbuf++;
4782 produced_chars = dst - (coding->destination + coding->dst_bytes);
4783 }
4784 }
4785 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4786 coding->produced_char += produced_chars;
4787 coding->produced = dst - coding->destination;
4788 return 0;
4789 }
4790
4791 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4792 Check if a text is encoded in a charset-based coding system. If it
4793 is, return 1, else return 0. */
4794
4795 static int
4796 detect_coding_charset (coding, detect_info)
4797 struct coding_system *coding;
4798 struct coding_detection_info *detect_info;
4799 {
4800 const unsigned char *src = coding->source, *src_base;
4801 const unsigned char *src_end = coding->source + coding->src_bytes;
4802 int multibytep = coding->src_multibyte;
4803 int consumed_chars = 0;
4804 Lisp_Object attrs, valids;
4805 int found = 0;
4806 int head_ascii = coding->head_ascii;
4807
4808 detect_info->checked |= CATEGORY_MASK_CHARSET;
4809
4810 coding = &coding_categories[coding_category_charset];
4811 attrs = CODING_ID_ATTRS (coding->id);
4812 valids = AREF (attrs, coding_attr_charset_valids);
4813
4814 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4815 src += head_ascii;
4816
4817 while (1)
4818 {
4819 int c;
4820 Lisp_Object val;
4821 struct charset *charset;
4822 int dim, idx;
4823
4824 src_base = src;
4825 ONE_MORE_BYTE (c);
4826 if (c < 0)
4827 continue;
4828 val = AREF (valids, c);
4829 if (NILP (val))
4830 break;
4831 if (c >= 0x80)
4832 found = CATEGORY_MASK_CHARSET;
4833 if (INTEGERP (val))
4834 {
4835 charset = CHARSET_FROM_ID (XFASTINT (val));
4836 dim = CHARSET_DIMENSION (charset);
4837 for (idx = 1; idx < dim; idx++)
4838 {
4839 if (src == src_end)
4840 goto too_short;
4841 ONE_MORE_BYTE (c);
4842 if (c < charset->code_space[(dim - 1 - idx) * 2]
4843 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
4844 break;
4845 }
4846 if (idx < dim)
4847 break;
4848 }
4849 else
4850 {
4851 idx = 1;
4852 for (; CONSP (val); val = XCDR (val))
4853 {
4854 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4855 dim = CHARSET_DIMENSION (charset);
4856 while (idx < dim)
4857 {
4858 if (src == src_end)
4859 goto too_short;
4860 ONE_MORE_BYTE (c);
4861 if (c < charset->code_space[(dim - 1 - idx) * 4]
4862 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
4863 break;
4864 idx++;
4865 }
4866 if (idx == dim)
4867 {
4868 val = Qnil;
4869 break;
4870 }
4871 }
4872 if (CONSP (val))
4873 break;
4874 }
4875 }
4876 too_short:
4877 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4878 return 0;
4879
4880 no_more_source:
4881 detect_info->found |= found;
4882 return 1;
4883 }
4884
4885 static void
4886 decode_coding_charset (coding)
4887 struct coding_system *coding;
4888 {
4889 const unsigned char *src = coding->source + coding->consumed;
4890 const unsigned char *src_end = coding->source + coding->src_bytes;
4891 const unsigned char *src_base;
4892 int *charbuf = coding->charbuf + coding->charbuf_used;
4893 int *charbuf_end
4894 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4895 int consumed_chars = 0, consumed_chars_base;
4896 int multibytep = coding->src_multibyte;
4897 Lisp_Object attrs, charset_list, valids;
4898 int char_offset = coding->produced_char;
4899 int last_offset = char_offset;
4900 int last_id = charset_ascii;
4901 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4902 int byte_after_cr = -1;
4903
4904 CODING_GET_INFO (coding, attrs, charset_list);
4905 valids = AREF (attrs, coding_attr_charset_valids);
4906
4907 while (1)
4908 {
4909 int c;
4910 Lisp_Object val;
4911 struct charset *charset;
4912 int dim;
4913 int len = 1;
4914 unsigned code;
4915
4916 src_base = src;
4917 consumed_chars_base = consumed_chars;
4918
4919 if (charbuf >= charbuf_end)
4920 break;
4921
4922 if (byte_after_cr >= 0)
4923 {
4924 c = byte_after_cr;
4925 byte_after_cr = -1;
4926 }
4927 else
4928 {
4929 ONE_MORE_BYTE (c);
4930 if (eol_crlf && c == '\r')
4931 ONE_MORE_BYTE (byte_after_cr);
4932 }
4933 if (c < 0)
4934 goto invalid_code;
4935 code = c;
4936
4937 val = AREF (valids, c);
4938 if (NILP (val))
4939 goto invalid_code;
4940 if (INTEGERP (val))
4941 {
4942 charset = CHARSET_FROM_ID (XFASTINT (val));
4943 dim = CHARSET_DIMENSION (charset);
4944 while (len < dim)
4945 {
4946 ONE_MORE_BYTE (c);
4947 code = (code << 8) | c;
4948 len++;
4949 }
4950 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4951 charset, code, c);
4952 }
4953 else
4954 {
4955 /* VAL is a list of charset IDs. It is assured that the
4956 list is sorted by charset dimensions (smaller one
4957 comes first). */
4958 while (CONSP (val))
4959 {
4960 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4961 dim = CHARSET_DIMENSION (charset);
4962 while (len < dim)
4963 {
4964 ONE_MORE_BYTE (c);
4965 code = (code << 8) | c;
4966 len++;
4967 }
4968 CODING_DECODE_CHAR (coding, src, src_base,
4969 src_end, charset, code, c);
4970 if (c >= 0)
4971 break;
4972 val = XCDR (val);
4973 }
4974 }
4975 if (c < 0)
4976 goto invalid_code;
4977 if (charset->id != charset_ascii
4978 && last_id != charset->id)
4979 {
4980 if (last_id != charset_ascii)
4981 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4982 last_id = charset->id;
4983 last_offset = char_offset;
4984 }
4985
4986 *charbuf++ = c;
4987 char_offset++;
4988 continue;
4989
4990 invalid_code:
4991 src = src_base;
4992 consumed_chars = consumed_chars_base;
4993 ONE_MORE_BYTE (c);
4994 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4995 char_offset++;
4996 coding->errors++;
4997 }
4998
4999 no_more_source:
5000 if (last_id != charset_ascii)
5001 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5002 coding->consumed_char += consumed_chars_base;
5003 coding->consumed = src_base - coding->source;
5004 coding->charbuf_used = charbuf - coding->charbuf;
5005 }
5006
5007 static int
5008 encode_coding_charset (coding)
5009 struct coding_system *coding;
5010 {
5011 int multibytep = coding->dst_multibyte;
5012 int *charbuf = coding->charbuf;
5013 int *charbuf_end = charbuf + coding->charbuf_used;
5014 unsigned char *dst = coding->destination + coding->produced;
5015 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5016 int safe_room = MAX_MULTIBYTE_LENGTH;
5017 int produced_chars = 0;
5018 Lisp_Object attrs, charset_list;
5019 int ascii_compatible;
5020 int c;
5021
5022 CODING_GET_INFO (coding, attrs, charset_list);
5023 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5024
5025 while (charbuf < charbuf_end)
5026 {
5027 struct charset *charset;
5028 unsigned code;
5029
5030 ASSURE_DESTINATION (safe_room);
5031 c = *charbuf++;
5032 if (ascii_compatible && ASCII_CHAR_P (c))
5033 EMIT_ONE_ASCII_BYTE (c);
5034 else if (CHAR_BYTE8_P (c))
5035 {
5036 c = CHAR_TO_BYTE8 (c);
5037 EMIT_ONE_BYTE (c);
5038 }
5039 else
5040 {
5041 charset = char_charset (c, charset_list, &code);
5042 if (charset)
5043 {
5044 if (CHARSET_DIMENSION (charset) == 1)
5045 EMIT_ONE_BYTE (code);
5046 else if (CHARSET_DIMENSION (charset) == 2)
5047 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5048 else if (CHARSET_DIMENSION (charset) == 3)
5049 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5050 else
5051 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5052 (code >> 8) & 0xFF, code & 0xFF);
5053 }
5054 else
5055 {
5056 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5057 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5058 else
5059 c = coding->default_char;
5060 EMIT_ONE_BYTE (c);
5061 }
5062 }
5063 }
5064
5065 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5066 coding->produced_char += produced_chars;
5067 coding->produced = dst - coding->destination;
5068 return 0;
5069 }
5070
5071 \f
5072 /*** 7. C library functions ***/
5073
5074 /* Setup coding context CODING from information about CODING_SYSTEM.
5075 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5076 CODING_SYSTEM is invalid, signal an error. */
5077
5078 void
5079 setup_coding_system (coding_system, coding)
5080 Lisp_Object coding_system;
5081 struct coding_system *coding;
5082 {
5083 Lisp_Object attrs;
5084 Lisp_Object eol_type;
5085 Lisp_Object coding_type;
5086 Lisp_Object val;
5087
5088 if (NILP (coding_system))
5089 coding_system = Qundecided;
5090
5091 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5092
5093 attrs = CODING_ID_ATTRS (coding->id);
5094 eol_type = CODING_ID_EOL_TYPE (coding->id);
5095
5096 coding->mode = 0;
5097 coding->head_ascii = -1;
5098 if (VECTORP (eol_type))
5099 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5100 | CODING_REQUIRE_DETECTION_MASK);
5101 else if (! EQ (eol_type, Qunix))
5102 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5103 | CODING_REQUIRE_ENCODING_MASK);
5104 else
5105 coding->common_flags = 0;
5106 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5107 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5108 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5109 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5110 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5111 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5112
5113 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5114 coding->max_charset_id = SCHARS (val) - 1;
5115 coding->safe_charsets = (char *) SDATA (val);
5116 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5117
5118 coding_type = CODING_ATTR_TYPE (attrs);
5119 if (EQ (coding_type, Qundecided))
5120 {
5121 coding->detector = NULL;
5122 coding->decoder = decode_coding_raw_text;
5123 coding->encoder = encode_coding_raw_text;
5124 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5125 }
5126 else if (EQ (coding_type, Qiso_2022))
5127 {
5128 int i;
5129 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5130
5131 /* Invoke graphic register 0 to plane 0. */
5132 CODING_ISO_INVOCATION (coding, 0) = 0;
5133 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5134 CODING_ISO_INVOCATION (coding, 1)
5135 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5136 /* Setup the initial status of designation. */
5137 for (i = 0; i < 4; i++)
5138 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5139 /* Not single shifting initially. */
5140 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5141 /* Beginning of buffer should also be regarded as bol. */
5142 CODING_ISO_BOL (coding) = 1;
5143 coding->detector = detect_coding_iso_2022;
5144 coding->decoder = decode_coding_iso_2022;
5145 coding->encoder = encode_coding_iso_2022;
5146 if (flags & CODING_ISO_FLAG_SAFE)
5147 coding->mode |= CODING_MODE_SAFE_ENCODING;
5148 coding->common_flags
5149 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5150 | CODING_REQUIRE_FLUSHING_MASK);
5151 if (flags & CODING_ISO_FLAG_COMPOSITION)
5152 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5153 if (flags & CODING_ISO_FLAG_DESIGNATION)
5154 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5155 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5156 {
5157 setup_iso_safe_charsets (attrs);
5158 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5159 coding->max_charset_id = SCHARS (val) - 1;
5160 coding->safe_charsets = (char *) SDATA (val);
5161 }
5162 CODING_ISO_FLAGS (coding) = flags;
5163 }
5164 else if (EQ (coding_type, Qcharset))
5165 {
5166 coding->detector = detect_coding_charset;
5167 coding->decoder = decode_coding_charset;
5168 coding->encoder = encode_coding_charset;
5169 coding->common_flags
5170 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5171 }
5172 else if (EQ (coding_type, Qutf_8))
5173 {
5174 coding->detector = detect_coding_utf_8;
5175 coding->decoder = decode_coding_utf_8;
5176 coding->encoder = encode_coding_utf_8;
5177 coding->common_flags
5178 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5179 }
5180 else if (EQ (coding_type, Qutf_16))
5181 {
5182 val = AREF (attrs, coding_attr_utf_16_bom);
5183 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5184 : EQ (val, Qt) ? utf_16_with_bom
5185 : utf_16_without_bom);
5186 val = AREF (attrs, coding_attr_utf_16_endian);
5187 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5188 : utf_16_little_endian);
5189 CODING_UTF_16_SURROGATE (coding) = 0;
5190 coding->detector = detect_coding_utf_16;
5191 coding->decoder = decode_coding_utf_16;
5192 coding->encoder = encode_coding_utf_16;
5193 coding->common_flags
5194 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5195 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5196 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5197 }
5198 else if (EQ (coding_type, Qccl))
5199 {
5200 coding->detector = detect_coding_ccl;
5201 coding->decoder = decode_coding_ccl;
5202 coding->encoder = encode_coding_ccl;
5203 coding->common_flags
5204 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5205 | CODING_REQUIRE_FLUSHING_MASK);
5206 }
5207 else if (EQ (coding_type, Qemacs_mule))
5208 {
5209 coding->detector = detect_coding_emacs_mule;
5210 coding->decoder = decode_coding_emacs_mule;
5211 coding->encoder = encode_coding_emacs_mule;
5212 coding->common_flags
5213 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5214 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5215 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5216 {
5217 Lisp_Object tail, safe_charsets;
5218 int max_charset_id = 0;
5219
5220 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5221 tail = XCDR (tail))
5222 if (max_charset_id < XFASTINT (XCAR (tail)))
5223 max_charset_id = XFASTINT (XCAR (tail));
5224 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5225 make_number (255));
5226 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5227 tail = XCDR (tail))
5228 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5229 coding->max_charset_id = max_charset_id;
5230 coding->safe_charsets = (char *) SDATA (safe_charsets);
5231 }
5232 }
5233 else if (EQ (coding_type, Qshift_jis))
5234 {
5235 coding->detector = detect_coding_sjis;
5236 coding->decoder = decode_coding_sjis;
5237 coding->encoder = encode_coding_sjis;
5238 coding->common_flags
5239 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5240 }
5241 else if (EQ (coding_type, Qbig5))
5242 {
5243 coding->detector = detect_coding_big5;
5244 coding->decoder = decode_coding_big5;
5245 coding->encoder = encode_coding_big5;
5246 coding->common_flags
5247 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5248 }
5249 else /* EQ (coding_type, Qraw_text) */
5250 {
5251 coding->detector = NULL;
5252 coding->decoder = decode_coding_raw_text;
5253 coding->encoder = encode_coding_raw_text;
5254 if (! EQ (eol_type, Qunix))
5255 {
5256 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5257 if (! VECTORP (eol_type))
5258 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5259 }
5260
5261 }
5262
5263 return;
5264 }
5265
5266 /* Return a list of charsets supported by CODING. */
5267
5268 Lisp_Object
5269 coding_charset_list (coding)
5270 struct coding_system *coding;
5271 {
5272 Lisp_Object attrs, charset_list;
5273
5274 CODING_GET_INFO (coding, attrs, charset_list);
5275 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5276 {
5277 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5278
5279 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5280 charset_list = Viso_2022_charset_list;
5281 }
5282 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5283 {
5284 charset_list = Vemacs_mule_charset_list;
5285 }
5286 return charset_list;
5287 }
5288
5289
5290 /* Return raw-text or one of its subsidiaries that has the same
5291 eol_type as CODING-SYSTEM. */
5292
5293 Lisp_Object
5294 raw_text_coding_system (coding_system)
5295 Lisp_Object coding_system;
5296 {
5297 Lisp_Object spec, attrs;
5298 Lisp_Object eol_type, raw_text_eol_type;
5299
5300 if (NILP (coding_system))
5301 return Qraw_text;
5302 spec = CODING_SYSTEM_SPEC (coding_system);
5303 attrs = AREF (spec, 0);
5304
5305 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5306 return coding_system;
5307
5308 eol_type = AREF (spec, 2);
5309 if (VECTORP (eol_type))
5310 return Qraw_text;
5311 spec = CODING_SYSTEM_SPEC (Qraw_text);
5312 raw_text_eol_type = AREF (spec, 2);
5313 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5314 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5315 : AREF (raw_text_eol_type, 2));
5316 }
5317
5318
5319 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5320 does, return one of the subsidiary that has the same eol-spec as
5321 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5322 inherit end-of-line format from the system's setting
5323 (system_eol_type). */
5324
5325 Lisp_Object
5326 coding_inherit_eol_type (coding_system, parent)
5327 Lisp_Object coding_system, parent;
5328 {
5329 Lisp_Object spec, eol_type;
5330
5331 if (NILP (coding_system))
5332 coding_system = Qraw_text;
5333 spec = CODING_SYSTEM_SPEC (coding_system);
5334 eol_type = AREF (spec, 2);
5335 if (VECTORP (eol_type))
5336 {
5337 Lisp_Object parent_eol_type;
5338
5339 if (! NILP (parent))
5340 {
5341 Lisp_Object parent_spec;
5342
5343 parent_spec = CODING_SYSTEM_SPEC (parent);
5344 parent_eol_type = AREF (parent_spec, 2);
5345 }
5346 else
5347 parent_eol_type = system_eol_type;
5348 if (EQ (parent_eol_type, Qunix))
5349 coding_system = AREF (eol_type, 0);
5350 else if (EQ (parent_eol_type, Qdos))
5351 coding_system = AREF (eol_type, 1);
5352 else if (EQ (parent_eol_type, Qmac))
5353 coding_system = AREF (eol_type, 2);
5354 }
5355 return coding_system;
5356 }
5357
5358 /* Emacs has a mechanism to automatically detect a coding system if it
5359 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5360 it's impossible to distinguish some coding systems accurately
5361 because they use the same range of codes. So, at first, coding
5362 systems are categorized into 7, those are:
5363
5364 o coding-category-emacs-mule
5365
5366 The category for a coding system which has the same code range
5367 as Emacs' internal format. Assigned the coding-system (Lisp
5368 symbol) `emacs-mule' by default.
5369
5370 o coding-category-sjis
5371
5372 The category for a coding system which has the same code range
5373 as SJIS. Assigned the coding-system (Lisp
5374 symbol) `japanese-shift-jis' by default.
5375
5376 o coding-category-iso-7
5377
5378 The category for a coding system which has the same code range
5379 as ISO2022 of 7-bit environment. This doesn't use any locking
5380 shift and single shift functions. This can encode/decode all
5381 charsets. Assigned the coding-system (Lisp symbol)
5382 `iso-2022-7bit' by default.
5383
5384 o coding-category-iso-7-tight
5385
5386 Same as coding-category-iso-7 except that this can
5387 encode/decode only the specified charsets.
5388
5389 o coding-category-iso-8-1
5390
5391 The category for a coding system which has the same code range
5392 as ISO2022 of 8-bit environment and graphic plane 1 used only
5393 for DIMENSION1 charset. This doesn't use any locking shift
5394 and single shift functions. Assigned the coding-system (Lisp
5395 symbol) `iso-latin-1' by default.
5396
5397 o coding-category-iso-8-2
5398
5399 The category for a coding system which has the same code range
5400 as ISO2022 of 8-bit environment and graphic plane 1 used only
5401 for DIMENSION2 charset. This doesn't use any locking shift
5402 and single shift functions. Assigned the coding-system (Lisp
5403 symbol) `japanese-iso-8bit' by default.
5404
5405 o coding-category-iso-7-else
5406
5407 The category for a coding system which has the same code range
5408 as ISO2022 of 7-bit environemnt but uses locking shift or
5409 single shift functions. Assigned the coding-system (Lisp
5410 symbol) `iso-2022-7bit-lock' by default.
5411
5412 o coding-category-iso-8-else
5413
5414 The category for a coding system which has the same code range
5415 as ISO2022 of 8-bit environemnt but uses locking shift or
5416 single shift functions. Assigned the coding-system (Lisp
5417 symbol) `iso-2022-8bit-ss2' by default.
5418
5419 o coding-category-big5
5420
5421 The category for a coding system which has the same code range
5422 as BIG5. Assigned the coding-system (Lisp symbol)
5423 `cn-big5' by default.
5424
5425 o coding-category-utf-8
5426
5427 The category for a coding system which has the same code range
5428 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
5429 symbol) `utf-8' by default.
5430
5431 o coding-category-utf-16-be
5432
5433 The category for a coding system in which a text has an
5434 Unicode signature (cf. Unicode Standard) in the order of BIG
5435 endian at the head. Assigned the coding-system (Lisp symbol)
5436 `utf-16-be' by default.
5437
5438 o coding-category-utf-16-le
5439
5440 The category for a coding system in which a text has an
5441 Unicode signature (cf. Unicode Standard) in the order of
5442 LITTLE endian at the head. Assigned the coding-system (Lisp
5443 symbol) `utf-16-le' by default.
5444
5445 o coding-category-ccl
5446
5447 The category for a coding system of which encoder/decoder is
5448 written in CCL programs. The default value is nil, i.e., no
5449 coding system is assigned.
5450
5451 o coding-category-binary
5452
5453 The category for a coding system not categorized in any of the
5454 above. Assigned the coding-system (Lisp symbol)
5455 `no-conversion' by default.
5456
5457 Each of them is a Lisp symbol and the value is an actual
5458 `coding-system's (this is also a Lisp symbol) assigned by a user.
5459 What Emacs does actually is to detect a category of coding system.
5460 Then, it uses a `coding-system' assigned to it. If Emacs can't
5461 decide only one possible category, it selects a category of the
5462 highest priority. Priorities of categories are also specified by a
5463 user in a Lisp variable `coding-category-list'.
5464
5465 */
5466
5467 #define EOL_SEEN_NONE 0
5468 #define EOL_SEEN_LF 1
5469 #define EOL_SEEN_CR 2
5470 #define EOL_SEEN_CRLF 4
5471
5472 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5473 SOURCE is encoded. If CATEGORY is one of
5474 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5475 two-byte, else they are encoded by one-byte.
5476
5477 Return one of EOL_SEEN_XXX. */
5478
5479 #define MAX_EOL_CHECK_COUNT 3
5480
5481 static int
5482 detect_eol (source, src_bytes, category)
5483 const unsigned char *source;
5484 EMACS_INT src_bytes;
5485 enum coding_category category;
5486 {
5487 const unsigned char *src = source, *src_end = src + src_bytes;
5488 unsigned char c;
5489 int total = 0;
5490 int eol_seen = EOL_SEEN_NONE;
5491
5492 if ((1 << category) & CATEGORY_MASK_UTF_16)
5493 {
5494 int msb, lsb;
5495
5496 msb = category == (coding_category_utf_16_le
5497 | coding_category_utf_16_le_nosig);
5498 lsb = 1 - msb;
5499
5500 while (src + 1 < src_end)
5501 {
5502 c = src[lsb];
5503 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5504 {
5505 int this_eol;
5506
5507 if (c == '\n')
5508 this_eol = EOL_SEEN_LF;
5509 else if (src + 3 >= src_end
5510 || src[msb + 2] != 0
5511 || src[lsb + 2] != '\n')
5512 this_eol = EOL_SEEN_CR;
5513 else
5514 this_eol = EOL_SEEN_CRLF;
5515
5516 if (eol_seen == EOL_SEEN_NONE)
5517 /* This is the first end-of-line. */
5518 eol_seen = this_eol;
5519 else if (eol_seen != this_eol)
5520 {
5521 /* The found type is different from what found before. */
5522 eol_seen = EOL_SEEN_LF;
5523 break;
5524 }
5525 if (++total == MAX_EOL_CHECK_COUNT)
5526 break;
5527 }
5528 src += 2;
5529 }
5530 }
5531 else
5532 {
5533 while (src < src_end)
5534 {
5535 c = *src++;
5536 if (c == '\n' || c == '\r')
5537 {
5538 int this_eol;
5539
5540 if (c == '\n')
5541 this_eol = EOL_SEEN_LF;
5542 else if (src >= src_end || *src != '\n')
5543 this_eol = EOL_SEEN_CR;
5544 else
5545 this_eol = EOL_SEEN_CRLF, src++;
5546
5547 if (eol_seen == EOL_SEEN_NONE)
5548 /* This is the first end-of-line. */
5549 eol_seen = this_eol;
5550 else if (eol_seen != this_eol)
5551 {
5552 /* The found type is different from what found before. */
5553 eol_seen = EOL_SEEN_LF;
5554 break;
5555 }
5556 if (++total == MAX_EOL_CHECK_COUNT)
5557 break;
5558 }
5559 }
5560 }
5561 return eol_seen;
5562 }
5563
5564
5565 static Lisp_Object
5566 adjust_coding_eol_type (coding, eol_seen)
5567 struct coding_system *coding;
5568 int eol_seen;
5569 {
5570 Lisp_Object eol_type;
5571
5572 eol_type = CODING_ID_EOL_TYPE (coding->id);
5573 if (eol_seen & EOL_SEEN_LF)
5574 {
5575 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5576 eol_type = Qunix;
5577 }
5578 else if (eol_seen & EOL_SEEN_CRLF)
5579 {
5580 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5581 eol_type = Qdos;
5582 }
5583 else if (eol_seen & EOL_SEEN_CR)
5584 {
5585 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5586 eol_type = Qmac;
5587 }
5588 return eol_type;
5589 }
5590
5591 /* Detect how a text specified in CODING is encoded. If a coding
5592 system is detected, update fields of CODING by the detected coding
5593 system. */
5594
5595 void
5596 detect_coding (coding)
5597 struct coding_system *coding;
5598 {
5599 const unsigned char *src, *src_end;
5600
5601 coding->consumed = coding->consumed_char = 0;
5602 coding->produced = coding->produced_char = 0;
5603 coding_set_source (coding);
5604
5605 src_end = coding->source + coding->src_bytes;
5606
5607 /* If we have not yet decided the text encoding type, detect it
5608 now. */
5609 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5610 {
5611 int c, i;
5612 struct coding_detection_info detect_info;
5613
5614 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5615 for (i = 0, src = coding->source; src < src_end; i++, src++)
5616 {
5617 c = *src;
5618 if (c & 0x80)
5619 break;
5620 if (c < 0x20
5621 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5622 && ! inhibit_iso_escape_detection
5623 && ! detect_info.checked)
5624 {
5625 coding->head_ascii = src - (coding->source + coding->consumed);
5626 if (detect_coding_iso_2022 (coding, &detect_info))
5627 {
5628 /* We have scanned the whole data. */
5629 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5630 /* We didn't find an 8-bit code. */
5631 src = src_end;
5632 break;
5633 }
5634 }
5635 }
5636 coding->head_ascii = src - (coding->source + coding->consumed);
5637
5638 if (coding->head_ascii < coding->src_bytes
5639 || detect_info.found)
5640 {
5641 enum coding_category category;
5642 struct coding_system *this;
5643
5644 if (coding->head_ascii == coding->src_bytes)
5645 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5646 for (i = 0; i < coding_category_raw_text; i++)
5647 {
5648 category = coding_priorities[i];
5649 this = coding_categories + category;
5650 if (detect_info.found & (1 << category))
5651 break;
5652 }
5653 else
5654 for (i = 0; i < coding_category_raw_text; i++)
5655 {
5656 category = coding_priorities[i];
5657 this = coding_categories + category;
5658 if (this->id < 0)
5659 {
5660 /* No coding system of this category is defined. */
5661 detect_info.rejected |= (1 << category);
5662 }
5663 else if (category >= coding_category_raw_text)
5664 continue;
5665 else if (detect_info.checked & (1 << category))
5666 {
5667 if (detect_info.found & (1 << category))
5668 break;
5669 }
5670 else if ((*(this->detector)) (coding, &detect_info)
5671 && detect_info.found & (1 << category))
5672 {
5673 if (category == coding_category_utf_16_auto)
5674 {
5675 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5676 category = coding_category_utf_16_le;
5677 else
5678 category = coding_category_utf_16_be;
5679 }
5680 break;
5681 }
5682 }
5683
5684 if (i < coding_category_raw_text)
5685 setup_coding_system (CODING_ID_NAME (this->id), coding);
5686 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5687 setup_coding_system (Qraw_text, coding);
5688 else if (detect_info.rejected)
5689 for (i = 0; i < coding_category_raw_text; i++)
5690 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5691 {
5692 this = coding_categories + coding_priorities[i];
5693 setup_coding_system (CODING_ID_NAME (this->id), coding);
5694 break;
5695 }
5696 }
5697 }
5698 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5699 == coding_category_utf_16_auto)
5700 {
5701 Lisp_Object coding_systems;
5702 struct coding_detection_info detect_info;
5703
5704 coding_systems
5705 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5706 detect_info.found = detect_info.rejected = 0;
5707 if (CONSP (coding_systems)
5708 && detect_coding_utf_16 (coding, &detect_info))
5709 {
5710 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5711 setup_coding_system (XCAR (coding_systems), coding);
5712 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5713 setup_coding_system (XCDR (coding_systems), coding);
5714 }
5715 }
5716 }
5717
5718
5719 static void
5720 decode_eol (coding)
5721 struct coding_system *coding;
5722 {
5723 Lisp_Object eol_type;
5724 unsigned char *p, *pbeg, *pend;
5725
5726 eol_type = CODING_ID_EOL_TYPE (coding->id);
5727 if (EQ (eol_type, Qunix))
5728 return;
5729
5730 if (NILP (coding->dst_object))
5731 pbeg = coding->destination;
5732 else
5733 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5734 pend = pbeg + coding->produced;
5735
5736 if (VECTORP (eol_type))
5737 {
5738 int eol_seen = EOL_SEEN_NONE;
5739
5740 for (p = pbeg; p < pend; p++)
5741 {
5742 if (*p == '\n')
5743 eol_seen |= EOL_SEEN_LF;
5744 else if (*p == '\r')
5745 {
5746 if (p + 1 < pend && *(p + 1) == '\n')
5747 {
5748 eol_seen |= EOL_SEEN_CRLF;
5749 p++;
5750 }
5751 else
5752 eol_seen |= EOL_SEEN_CR;
5753 }
5754 }
5755 if (eol_seen != EOL_SEEN_NONE
5756 && eol_seen != EOL_SEEN_LF
5757 && eol_seen != EOL_SEEN_CRLF
5758 && eol_seen != EOL_SEEN_CR)
5759 eol_seen = EOL_SEEN_LF;
5760 if (eol_seen != EOL_SEEN_NONE)
5761 eol_type = adjust_coding_eol_type (coding, eol_seen);
5762 }
5763
5764 if (EQ (eol_type, Qmac))
5765 {
5766 for (p = pbeg; p < pend; p++)
5767 if (*p == '\r')
5768 *p = '\n';
5769 }
5770 else if (EQ (eol_type, Qdos))
5771 {
5772 int n = 0;
5773
5774 if (NILP (coding->dst_object))
5775 {
5776 /* Start deleting '\r' from the tail to minimize the memory
5777 movement. */
5778 for (p = pend - 2; p >= pbeg; p--)
5779 if (*p == '\r')
5780 {
5781 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5782 n++;
5783 }
5784 }
5785 else
5786 {
5787 int pos_byte = coding->dst_pos_byte;
5788 int pos = coding->dst_pos;
5789 int pos_end = pos + coding->produced_char - 1;
5790
5791 while (pos < pos_end)
5792 {
5793 p = BYTE_POS_ADDR (pos_byte);
5794 if (*p == '\r' && p[1] == '\n')
5795 {
5796 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5797 n++;
5798 pos_end--;
5799 }
5800 pos++;
5801 if (coding->dst_multibyte)
5802 pos_byte += BYTES_BY_CHAR_HEAD (*p);
5803 else
5804 pos_byte++;
5805 }
5806 }
5807 coding->produced -= n;
5808 coding->produced_char -= n;
5809 }
5810 }
5811
5812
5813 /* Return a translation table (or list of them) from coding system
5814 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5815 decoding (ENCODEP is zero). */
5816
5817 static Lisp_Object
5818 get_translation_table (attrs, encodep, max_lookup)
5819 Lisp_Object attrs;
5820 int encodep, *max_lookup;
5821 {
5822 Lisp_Object standard, translation_table;
5823 Lisp_Object val;
5824
5825 if (encodep)
5826 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5827 standard = Vstandard_translation_table_for_encode;
5828 else
5829 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5830 standard = Vstandard_translation_table_for_decode;
5831 if (NILP (translation_table))
5832 translation_table = standard;
5833 else
5834 {
5835 if (SYMBOLP (translation_table))
5836 translation_table = Fget (translation_table, Qtranslation_table);
5837 else if (CONSP (translation_table))
5838 {
5839 translation_table = Fcopy_sequence (translation_table);
5840 for (val = translation_table; CONSP (val); val = XCDR (val))
5841 if (SYMBOLP (XCAR (val)))
5842 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5843 }
5844 if (CHAR_TABLE_P (standard))
5845 {
5846 if (CONSP (translation_table))
5847 translation_table = nconc2 (translation_table,
5848 Fcons (standard, Qnil));
5849 else
5850 translation_table = Fcons (translation_table,
5851 Fcons (standard, Qnil));
5852 }
5853 }
5854
5855 if (max_lookup)
5856 {
5857 *max_lookup = 1;
5858 if (CHAR_TABLE_P (translation_table)
5859 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5860 {
5861 val = XCHAR_TABLE (translation_table)->extras[1];
5862 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5863 *max_lookup = XFASTINT (val);
5864 }
5865 else if (CONSP (translation_table))
5866 {
5867 Lisp_Object tail, val;
5868
5869 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5870 if (CHAR_TABLE_P (XCAR (tail))
5871 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5872 {
5873 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5874 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5875 *max_lookup = XFASTINT (val);
5876 }
5877 }
5878 }
5879 return translation_table;
5880 }
5881
5882 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
5883 do { \
5884 trans = Qnil; \
5885 if (CHAR_TABLE_P (table)) \
5886 { \
5887 trans = CHAR_TABLE_REF (table, c); \
5888 if (CHARACTERP (trans)) \
5889 c = XFASTINT (trans), trans = Qnil; \
5890 } \
5891 else if (CONSP (table)) \
5892 { \
5893 Lisp_Object tail; \
5894 \
5895 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
5896 if (CHAR_TABLE_P (XCAR (tail))) \
5897 { \
5898 trans = CHAR_TABLE_REF (XCAR (tail), c); \
5899 if (CHARACTERP (trans)) \
5900 c = XFASTINT (trans), trans = Qnil; \
5901 else if (! NILP (trans)) \
5902 break; \
5903 } \
5904 } \
5905 } while (0)
5906
5907
5908 static Lisp_Object
5909 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5910 Lisp_Object val;
5911 int *buf, *buf_end;
5912 int last_block;
5913 int *from_nchars, *to_nchars;
5914 {
5915 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
5916 [TO-CHAR ...]. */
5917 if (CONSP (val))
5918 {
5919 Lisp_Object from, tail;
5920 int i, len;
5921
5922 for (tail = val; CONSP (tail); tail = XCDR (tail))
5923 {
5924 val = XCAR (tail);
5925 from = XCAR (val);
5926 len = ASIZE (from);
5927 for (i = 0; i < len; i++)
5928 {
5929 if (buf + i == buf_end)
5930 {
5931 if (! last_block)
5932 return Qt;
5933 break;
5934 }
5935 if (XINT (AREF (from, i)) != buf[i])
5936 break;
5937 }
5938 if (i == len)
5939 {
5940 val = XCDR (val);
5941 *from_nchars = len;
5942 break;
5943 }
5944 }
5945 if (! CONSP (tail))
5946 return Qnil;
5947 }
5948 if (VECTORP (val))
5949 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5950 else
5951 *buf = XINT (val);
5952 return val;
5953 }
5954
5955
5956 static int
5957 produce_chars (coding, translation_table, last_block)
5958 struct coding_system *coding;
5959 Lisp_Object translation_table;
5960 int last_block;
5961 {
5962 unsigned char *dst = coding->destination + coding->produced;
5963 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5964 EMACS_INT produced;
5965 EMACS_INT produced_chars = 0;
5966 int carryover = 0;
5967
5968 if (! coding->chars_at_source)
5969 {
5970 /* Source characters are in coding->charbuf. */
5971 int *buf = coding->charbuf;
5972 int *buf_end = buf + coding->charbuf_used;
5973
5974 if (BUFFERP (coding->src_object)
5975 && EQ (coding->src_object, coding->dst_object))
5976 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5977
5978 while (buf < buf_end)
5979 {
5980 int c = *buf, i;
5981
5982 if (c >= 0)
5983 {
5984 int from_nchars = 1, to_nchars = 1;
5985 Lisp_Object trans = Qnil;
5986
5987 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5988 if (! NILP (trans))
5989 {
5990 trans = get_translation (trans, buf, buf_end, last_block,
5991 &from_nchars, &to_nchars);
5992 if (EQ (trans, Qt))
5993 break;
5994 c = *buf;
5995 }
5996
5997 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5998 {
5999 dst = alloc_destination (coding,
6000 buf_end - buf
6001 + MAX_MULTIBYTE_LENGTH * to_nchars,
6002 dst);
6003 dst_end = coding->destination + coding->dst_bytes;
6004 }
6005
6006 for (i = 0; i < to_nchars; i++)
6007 {
6008 if (i > 0)
6009 c = XINT (AREF (trans, i));
6010 if (coding->dst_multibyte
6011 || ! CHAR_BYTE8_P (c))
6012 CHAR_STRING_ADVANCE (c, dst);
6013 else
6014 *dst++ = CHAR_TO_BYTE8 (c);
6015 }
6016 produced_chars += to_nchars;
6017 *buf++ = to_nchars;
6018 while (--from_nchars > 0)
6019 *buf++ = 0;
6020 }
6021 else
6022 /* This is an annotation datum. (-C) is the length. */
6023 buf += -c;
6024 }
6025 carryover = buf_end - buf;
6026 }
6027 else
6028 {
6029 /* Source characters are at coding->source. */
6030 const unsigned char *src = coding->source;
6031 const unsigned char *src_end = src + coding->consumed;
6032
6033 if (coding->src_multibyte != coding->dst_multibyte)
6034 {
6035 if (coding->src_multibyte)
6036 {
6037 int multibytep = 1;
6038 EMACS_INT consumed_chars;
6039
6040 while (1)
6041 {
6042 const unsigned char *src_base = src;
6043 int c;
6044
6045 ONE_MORE_BYTE (c);
6046 if (dst == dst_end)
6047 {
6048 if (EQ (coding->src_object, coding->dst_object))
6049 dst_end = (unsigned char *) src;
6050 if (dst == dst_end)
6051 {
6052 EMACS_INT offset = src - coding->source;
6053
6054 dst = alloc_destination (coding, src_end - src + 1,
6055 dst);
6056 dst_end = coding->destination + coding->dst_bytes;
6057 coding_set_source (coding);
6058 src = coding->source + offset;
6059 src_end = coding->source + coding->src_bytes;
6060 }
6061 }
6062 *dst++ = c;
6063 produced_chars++;
6064 }
6065 no_more_source:
6066 ;
6067 }
6068 else
6069 while (src < src_end)
6070 {
6071 int multibytep = 1;
6072 int c = *src++;
6073
6074 if (dst >= dst_end - 1)
6075 {
6076 if (EQ (coding->src_object, coding->dst_object))
6077 dst_end = (unsigned char *) src;
6078 if (dst >= dst_end - 1)
6079 {
6080 EMACS_INT offset = src - coding->source;
6081
6082 dst = alloc_destination (coding, src_end - src + 2,
6083 dst);
6084 dst_end = coding->destination + coding->dst_bytes;
6085 coding_set_source (coding);
6086 src = coding->source + offset;
6087 src_end = coding->source + coding->src_bytes;
6088 }
6089 }
6090 EMIT_ONE_BYTE (c);
6091 }
6092 }
6093 else
6094 {
6095 if (!EQ (coding->src_object, coding->dst_object))
6096 {
6097 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6098
6099 if (require > 0)
6100 {
6101 EMACS_INT offset = src - coding->source;
6102
6103 dst = alloc_destination (coding, require, dst);
6104 coding_set_source (coding);
6105 src = coding->source + offset;
6106 src_end = coding->source + coding->src_bytes;
6107 }
6108 }
6109 produced_chars = coding->consumed_char;
6110 while (src < src_end)
6111 *dst++ = *src++;
6112 }
6113 }
6114
6115 produced = dst - (coding->destination + coding->produced);
6116 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6117 insert_from_gap (produced_chars, produced);
6118 coding->produced += produced;
6119 coding->produced_char += produced_chars;
6120 return carryover;
6121 }
6122
6123 /* Compose text in CODING->object according to the annotation data at
6124 CHARBUF. CHARBUF is an array:
6125 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6126 */
6127
6128 static INLINE void
6129 produce_composition (coding, charbuf, pos)
6130 struct coding_system *coding;
6131 int *charbuf;
6132 EMACS_INT pos;
6133 {
6134 int len;
6135 EMACS_INT to;
6136 enum composition_method method;
6137 Lisp_Object components;
6138
6139 len = -charbuf[0];
6140 to = pos + charbuf[2];
6141 if (to <= pos)
6142 return;
6143 method = (enum composition_method) (charbuf[3]);
6144
6145 if (method == COMPOSITION_RELATIVE)
6146 components = Qnil;
6147 else if (method >= COMPOSITION_WITH_RULE
6148 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6149 {
6150 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6151 int i;
6152
6153 len -= 4;
6154 charbuf += 4;
6155 for (i = 0; i < len; i++)
6156 {
6157 args[i] = make_number (charbuf[i]);
6158 if (charbuf[i] < 0)
6159 return;
6160 }
6161 components = (method == COMPOSITION_WITH_ALTCHARS
6162 ? Fstring (len, args) : Fvector (len, args));
6163 }
6164 else
6165 return;
6166 compose_text (pos, to, components, Qnil, coding->dst_object);
6167 }
6168
6169
6170 /* Put `charset' property on text in CODING->object according to
6171 the annotation data at CHARBUF. CHARBUF is an array:
6172 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6173 */
6174
6175 static INLINE void
6176 produce_charset (coding, charbuf, pos)
6177 struct coding_system *coding;
6178 int *charbuf;
6179 EMACS_INT pos;
6180 {
6181 EMACS_INT from = pos - charbuf[2];
6182 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6183
6184 Fput_text_property (make_number (from), make_number (pos),
6185 Qcharset, CHARSET_NAME (charset),
6186 coding->dst_object);
6187 }
6188
6189
6190 #define CHARBUF_SIZE 0x4000
6191
6192 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6193 do { \
6194 int size = CHARBUF_SIZE;; \
6195 \
6196 coding->charbuf = NULL; \
6197 while (size > 1024) \
6198 { \
6199 coding->charbuf = (int *) alloca (sizeof (int) * size); \
6200 if (coding->charbuf) \
6201 break; \
6202 size >>= 1; \
6203 } \
6204 if (! coding->charbuf) \
6205 { \
6206 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6207 return coding->result; \
6208 } \
6209 coding->charbuf_size = size; \
6210 } while (0)
6211
6212
6213 static void
6214 produce_annotation (coding, pos)
6215 struct coding_system *coding;
6216 EMACS_INT pos;
6217 {
6218 int *charbuf = coding->charbuf;
6219 int *charbuf_end = charbuf + coding->charbuf_used;
6220
6221 if (NILP (coding->dst_object))
6222 return;
6223
6224 while (charbuf < charbuf_end)
6225 {
6226 if (*charbuf >= 0)
6227 pos += *charbuf++;
6228 else
6229 {
6230 int len = -*charbuf;
6231 switch (charbuf[1])
6232 {
6233 case CODING_ANNOTATE_COMPOSITION_MASK:
6234 produce_composition (coding, charbuf, pos);
6235 break;
6236 case CODING_ANNOTATE_CHARSET_MASK:
6237 produce_charset (coding, charbuf, pos);
6238 break;
6239 default:
6240 abort ();
6241 }
6242 charbuf += len;
6243 }
6244 }
6245 }
6246
6247 /* Decode the data at CODING->src_object into CODING->dst_object.
6248 CODING->src_object is a buffer, a string, or nil.
6249 CODING->dst_object is a buffer.
6250
6251 If CODING->src_object is a buffer, it must be the current buffer.
6252 In this case, if CODING->src_pos is positive, it is a position of
6253 the source text in the buffer, otherwise, the source text is in the
6254 gap area of the buffer, and CODING->src_pos specifies the offset of
6255 the text from GPT (which must be the same as PT). If this is the
6256 same buffer as CODING->dst_object, CODING->src_pos must be
6257 negative.
6258
6259 If CODING->src_object is a string, CODING->src_pos is an index to
6260 that string.
6261
6262 If CODING->src_object is nil, CODING->source must already point to
6263 the non-relocatable memory area. In this case, CODING->src_pos is
6264 an offset from CODING->source.
6265
6266 The decoded data is inserted at the current point of the buffer
6267 CODING->dst_object.
6268 */
6269
6270 static int
6271 decode_coding (coding)
6272 struct coding_system *coding;
6273 {
6274 Lisp_Object attrs;
6275 Lisp_Object undo_list;
6276 Lisp_Object translation_table;
6277 int carryover;
6278 int i;
6279
6280 if (BUFFERP (coding->src_object)
6281 && coding->src_pos > 0
6282 && coding->src_pos < GPT
6283 && coding->src_pos + coding->src_chars > GPT)
6284 move_gap_both (coding->src_pos, coding->src_pos_byte);
6285
6286 undo_list = Qt;
6287 if (BUFFERP (coding->dst_object))
6288 {
6289 if (current_buffer != XBUFFER (coding->dst_object))
6290 set_buffer_internal (XBUFFER (coding->dst_object));
6291 if (GPT != PT)
6292 move_gap_both (PT, PT_BYTE);
6293 undo_list = current_buffer->undo_list;
6294 current_buffer->undo_list = Qt;
6295 }
6296
6297 coding->consumed = coding->consumed_char = 0;
6298 coding->produced = coding->produced_char = 0;
6299 coding->chars_at_source = 0;
6300 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6301 coding->errors = 0;
6302
6303 ALLOC_CONVERSION_WORK_AREA (coding);
6304
6305 attrs = CODING_ID_ATTRS (coding->id);
6306 translation_table = get_translation_table (attrs, 0, NULL);
6307
6308 carryover = 0;
6309 do
6310 {
6311 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6312
6313 coding_set_source (coding);
6314 coding->annotated = 0;
6315 coding->charbuf_used = carryover;
6316 (*(coding->decoder)) (coding);
6317 coding_set_destination (coding);
6318 carryover = produce_chars (coding, translation_table, 0);
6319 if (coding->annotated)
6320 produce_annotation (coding, pos);
6321 for (i = 0; i < carryover; i++)
6322 coding->charbuf[i]
6323 = coding->charbuf[coding->charbuf_used - carryover + i];
6324 }
6325 while (coding->consumed < coding->src_bytes
6326 && (coding->result == CODING_RESULT_SUCCESS
6327 || coding->result == CODING_RESULT_INVALID_SRC));
6328
6329 if (carryover > 0)
6330 {
6331 coding_set_destination (coding);
6332 coding->charbuf_used = carryover;
6333 produce_chars (coding, translation_table, 1);
6334 }
6335
6336 coding->carryover_bytes = 0;
6337 if (coding->consumed < coding->src_bytes)
6338 {
6339 int nbytes = coding->src_bytes - coding->consumed;
6340 const unsigned char *src;
6341
6342 coding_set_source (coding);
6343 coding_set_destination (coding);
6344 src = coding->source + coding->consumed;
6345
6346 if (coding->mode & CODING_MODE_LAST_BLOCK)
6347 {
6348 /* Flush out unprocessed data as binary chars. We are sure
6349 that the number of data is less than the size of
6350 coding->charbuf. */
6351 coding->charbuf_used = 0;
6352 while (nbytes-- > 0)
6353 {
6354 int c = *src++;
6355
6356 if (c & 0x80)
6357 c = BYTE8_TO_CHAR (c);
6358 coding->charbuf[coding->charbuf_used++] = c;
6359 }
6360 produce_chars (coding, Qnil, 1);
6361 }
6362 else
6363 {
6364 /* Record unprocessed bytes in coding->carryover. We are
6365 sure that the number of data is less than the size of
6366 coding->carryover. */
6367 unsigned char *p = coding->carryover;
6368
6369 coding->carryover_bytes = nbytes;
6370 while (nbytes-- > 0)
6371 *p++ = *src++;
6372 }
6373 coding->consumed = coding->src_bytes;
6374 }
6375
6376 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6377 decode_eol (coding);
6378 if (BUFFERP (coding->dst_object))
6379 {
6380 current_buffer->undo_list = undo_list;
6381 record_insert (coding->dst_pos, coding->produced_char);
6382 }
6383 return coding->result;
6384 }
6385
6386
6387 /* Extract an annotation datum from a composition starting at POS and
6388 ending before LIMIT of CODING->src_object (buffer or string), store
6389 the data in BUF, set *STOP to a starting position of the next
6390 composition (if any) or to LIMIT, and return the address of the
6391 next element of BUF.
6392
6393 If such an annotation is not found, set *STOP to a starting
6394 position of a composition after POS (if any) or to LIMIT, and
6395 return BUF. */
6396
6397 static INLINE int *
6398 handle_composition_annotation (pos, limit, coding, buf, stop)
6399 EMACS_INT pos, limit;
6400 struct coding_system *coding;
6401 int *buf;
6402 EMACS_INT *stop;
6403 {
6404 EMACS_INT start, end;
6405 Lisp_Object prop;
6406
6407 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6408 || end > limit)
6409 *stop = limit;
6410 else if (start > pos)
6411 *stop = start;
6412 else
6413 {
6414 if (start == pos)
6415 {
6416 /* We found a composition. Store the corresponding
6417 annotation data in BUF. */
6418 int *head = buf;
6419 enum composition_method method = COMPOSITION_METHOD (prop);
6420 int nchars = COMPOSITION_LENGTH (prop);
6421
6422 ADD_COMPOSITION_DATA (buf, nchars, method);
6423 if (method != COMPOSITION_RELATIVE)
6424 {
6425 Lisp_Object components;
6426 int len, i, i_byte;
6427
6428 components = COMPOSITION_COMPONENTS (prop);
6429 if (VECTORP (components))
6430 {
6431 len = XVECTOR (components)->size;
6432 for (i = 0; i < len; i++)
6433 *buf++ = XINT (AREF (components, i));
6434 }
6435 else if (STRINGP (components))
6436 {
6437 len = SCHARS (components);
6438 i = i_byte = 0;
6439 while (i < len)
6440 {
6441 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6442 buf++;
6443 }
6444 }
6445 else if (INTEGERP (components))
6446 {
6447 len = 1;
6448 *buf++ = XINT (components);
6449 }
6450 else if (CONSP (components))
6451 {
6452 for (len = 0; CONSP (components);
6453 len++, components = XCDR (components))
6454 *buf++ = XINT (XCAR (components));
6455 }
6456 else
6457 abort ();
6458 *head -= len;
6459 }
6460 }
6461
6462 if (find_composition (end, limit, &start, &end, &prop,
6463 coding->src_object)
6464 && end <= limit)
6465 *stop = start;
6466 else
6467 *stop = limit;
6468 }
6469 return buf;
6470 }
6471
6472
6473 /* Extract an annotation datum from a text property `charset' at POS of
6474 CODING->src_object (buffer of string), store the data in BUF, set
6475 *STOP to the position where the value of `charset' property changes
6476 (limiting by LIMIT), and return the address of the next element of
6477 BUF.
6478
6479 If the property value is nil, set *STOP to the position where the
6480 property value is non-nil (limiting by LIMIT), and return BUF. */
6481
6482 static INLINE int *
6483 handle_charset_annotation (pos, limit, coding, buf, stop)
6484 EMACS_INT pos, limit;
6485 struct coding_system *coding;
6486 int *buf;
6487 EMACS_INT *stop;
6488 {
6489 Lisp_Object val, next;
6490 int id;
6491
6492 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6493 if (! NILP (val) && CHARSETP (val))
6494 id = XINT (CHARSET_SYMBOL_ID (val));
6495 else
6496 id = -1;
6497 ADD_CHARSET_DATA (buf, 0, id);
6498 next = Fnext_single_property_change (make_number (pos), Qcharset,
6499 coding->src_object,
6500 make_number (limit));
6501 *stop = XINT (next);
6502 return buf;
6503 }
6504
6505
6506 static void
6507 consume_chars (coding, translation_table, max_lookup)
6508 struct coding_system *coding;
6509 Lisp_Object translation_table;
6510 int max_lookup;
6511 {
6512 int *buf = coding->charbuf;
6513 int *buf_end = coding->charbuf + coding->charbuf_size;
6514 const unsigned char *src = coding->source + coding->consumed;
6515 const unsigned char *src_end = coding->source + coding->src_bytes;
6516 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6517 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6518 int multibytep = coding->src_multibyte;
6519 Lisp_Object eol_type;
6520 int c;
6521 EMACS_INT stop, stop_composition, stop_charset;
6522 int *lookup_buf = NULL;
6523
6524 if (! NILP (translation_table))
6525 lookup_buf = alloca (sizeof (int) * max_lookup);
6526
6527 eol_type = CODING_ID_EOL_TYPE (coding->id);
6528 if (VECTORP (eol_type))
6529 eol_type = Qunix;
6530
6531 /* Note: composition handling is not yet implemented. */
6532 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6533
6534 if (NILP (coding->src_object))
6535 stop = stop_composition = stop_charset = end_pos;
6536 else
6537 {
6538 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6539 stop = stop_composition = pos;
6540 else
6541 stop = stop_composition = end_pos;
6542 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6543 stop = stop_charset = pos;
6544 else
6545 stop_charset = end_pos;
6546 }
6547
6548 /* Compensate for CRLF and conversion. */
6549 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6550 while (buf < buf_end)
6551 {
6552 Lisp_Object trans;
6553
6554 if (pos == stop)
6555 {
6556 if (pos == end_pos)
6557 break;
6558 if (pos == stop_composition)
6559 buf = handle_composition_annotation (pos, end_pos, coding,
6560 buf, &stop_composition);
6561 if (pos == stop_charset)
6562 buf = handle_charset_annotation (pos, end_pos, coding,
6563 buf, &stop_charset);
6564 stop = (stop_composition < stop_charset
6565 ? stop_composition : stop_charset);
6566 }
6567
6568 if (! multibytep)
6569 {
6570 EMACS_INT bytes;
6571
6572 if (coding->encoder == encode_coding_raw_text)
6573 c = *src++, pos++;
6574 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6575 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6576 else
6577 c = BYTE8_TO_CHAR (*src), src++, pos++;
6578 }
6579 else
6580 c = STRING_CHAR_ADVANCE (src), pos++;
6581 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6582 c = '\n';
6583 if (! EQ (eol_type, Qunix))
6584 {
6585 if (c == '\n')
6586 {
6587 if (EQ (eol_type, Qdos))
6588 *buf++ = '\r';
6589 else
6590 c = '\r';
6591 }
6592 }
6593
6594 trans = Qnil;
6595 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6596 if (NILP (trans))
6597 *buf++ = c;
6598 else
6599 {
6600 int from_nchars = 1, to_nchars = 1;
6601 int *lookup_buf_end;
6602 const unsigned char *p = src;
6603 int i;
6604
6605 lookup_buf[0] = c;
6606 for (i = 1; i < max_lookup && p < src_end; i++)
6607 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6608 lookup_buf_end = lookup_buf + i;
6609 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6610 &from_nchars, &to_nchars);
6611 if (EQ (trans, Qt)
6612 || buf + to_nchars > buf_end)
6613 break;
6614 *buf++ = *lookup_buf;
6615 for (i = 1; i < to_nchars; i++)
6616 *buf++ = XINT (AREF (trans, i));
6617 for (i = 1; i < from_nchars; i++, pos++)
6618 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6619 }
6620 }
6621
6622 coding->consumed = src - coding->source;
6623 coding->consumed_char = pos - coding->src_pos;
6624 coding->charbuf_used = buf - coding->charbuf;
6625 coding->chars_at_source = 0;
6626 }
6627
6628
6629 /* Encode the text at CODING->src_object into CODING->dst_object.
6630 CODING->src_object is a buffer or a string.
6631 CODING->dst_object is a buffer or nil.
6632
6633 If CODING->src_object is a buffer, it must be the current buffer.
6634 In this case, if CODING->src_pos is positive, it is a position of
6635 the source text in the buffer, otherwise. the source text is in the
6636 gap area of the buffer, and coding->src_pos specifies the offset of
6637 the text from GPT (which must be the same as PT). If this is the
6638 same buffer as CODING->dst_object, CODING->src_pos must be
6639 negative and CODING should not have `pre-write-conversion'.
6640
6641 If CODING->src_object is a string, CODING should not have
6642 `pre-write-conversion'.
6643
6644 If CODING->dst_object is a buffer, the encoded data is inserted at
6645 the current point of that buffer.
6646
6647 If CODING->dst_object is nil, the encoded data is placed at the
6648 memory area specified by CODING->destination. */
6649
6650 static int
6651 encode_coding (coding)
6652 struct coding_system *coding;
6653 {
6654 Lisp_Object attrs;
6655 Lisp_Object translation_table;
6656 int max_lookup;
6657
6658 attrs = CODING_ID_ATTRS (coding->id);
6659 if (coding->encoder == encode_coding_raw_text)
6660 translation_table = Qnil, max_lookup = 0;
6661 else
6662 translation_table = get_translation_table (attrs, 1, &max_lookup);
6663
6664 if (BUFFERP (coding->dst_object))
6665 {
6666 set_buffer_internal (XBUFFER (coding->dst_object));
6667 coding->dst_multibyte
6668 = ! NILP (current_buffer->enable_multibyte_characters);
6669 }
6670
6671 coding->consumed = coding->consumed_char = 0;
6672 coding->produced = coding->produced_char = 0;
6673 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6674 coding->errors = 0;
6675
6676 ALLOC_CONVERSION_WORK_AREA (coding);
6677
6678 do {
6679 coding_set_source (coding);
6680 consume_chars (coding, translation_table, max_lookup);
6681 coding_set_destination (coding);
6682 (*(coding->encoder)) (coding);
6683 } while (coding->consumed_char < coding->src_chars);
6684
6685 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
6686 insert_from_gap (coding->produced_char, coding->produced);
6687
6688 return (coding->result);
6689 }
6690
6691
6692 /* Name (or base name) of work buffer for code conversion. */
6693 static Lisp_Object Vcode_conversion_workbuf_name;
6694
6695 /* A working buffer used by the top level conversion. Once it is
6696 created, it is never destroyed. It has the name
6697 Vcode_conversion_workbuf_name. The other working buffers are
6698 destroyed after the use is finished, and their names are modified
6699 versions of Vcode_conversion_workbuf_name. */
6700 static Lisp_Object Vcode_conversion_reused_workbuf;
6701
6702 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6703 static int reused_workbuf_in_use;
6704
6705
6706 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6707 multibyteness of returning buffer. */
6708
6709 static Lisp_Object
6710 make_conversion_work_buffer (multibyte)
6711 int multibyte;
6712 {
6713 Lisp_Object name, workbuf;
6714 struct buffer *current;
6715
6716 if (reused_workbuf_in_use++)
6717 {
6718 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6719 workbuf = Fget_buffer_create (name);
6720 }
6721 else
6722 {
6723 name = Vcode_conversion_workbuf_name;
6724 workbuf = Fget_buffer_create (name);
6725 if (NILP (Vcode_conversion_reused_workbuf))
6726 Vcode_conversion_reused_workbuf = workbuf;
6727 }
6728 current = current_buffer;
6729 set_buffer_internal (XBUFFER (workbuf));
6730 Ferase_buffer ();
6731 current_buffer->undo_list = Qt;
6732 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6733 set_buffer_internal (current);
6734 return workbuf;
6735 }
6736
6737
6738 static Lisp_Object
6739 code_conversion_restore (arg)
6740 Lisp_Object arg;
6741 {
6742 Lisp_Object current, workbuf;
6743 struct gcpro gcpro1;
6744
6745 GCPRO1 (arg);
6746 current = XCAR (arg);
6747 workbuf = XCDR (arg);
6748 if (! NILP (workbuf))
6749 {
6750 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6751 reused_workbuf_in_use = 0;
6752 else if (! NILP (Fbuffer_live_p (workbuf)))
6753 Fkill_buffer (workbuf);
6754 }
6755 set_buffer_internal (XBUFFER (current));
6756 UNGCPRO;
6757 return Qnil;
6758 }
6759
6760 Lisp_Object
6761 code_conversion_save (with_work_buf, multibyte)
6762 int with_work_buf, multibyte;
6763 {
6764 Lisp_Object workbuf = Qnil;
6765
6766 if (with_work_buf)
6767 workbuf = make_conversion_work_buffer (multibyte);
6768 record_unwind_protect (code_conversion_restore,
6769 Fcons (Fcurrent_buffer (), workbuf));
6770 return workbuf;
6771 }
6772
6773 int
6774 decode_coding_gap (coding, chars, bytes)
6775 struct coding_system *coding;
6776 EMACS_INT chars, bytes;
6777 {
6778 int count = specpdl_ptr - specpdl;
6779 Lisp_Object attrs;
6780
6781 code_conversion_save (0, 0);
6782
6783 coding->src_object = Fcurrent_buffer ();
6784 coding->src_chars = chars;
6785 coding->src_bytes = bytes;
6786 coding->src_pos = -chars;
6787 coding->src_pos_byte = -bytes;
6788 coding->src_multibyte = chars < bytes;
6789 coding->dst_object = coding->src_object;
6790 coding->dst_pos = PT;
6791 coding->dst_pos_byte = PT_BYTE;
6792 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6793
6794 if (CODING_REQUIRE_DETECTION (coding))
6795 detect_coding (coding);
6796
6797 coding->mode |= CODING_MODE_LAST_BLOCK;
6798 current_buffer->text->inhibit_shrinking = 1;
6799 decode_coding (coding);
6800 current_buffer->text->inhibit_shrinking = 0;
6801
6802 attrs = CODING_ID_ATTRS (coding->id);
6803 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6804 {
6805 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6806 Lisp_Object val;
6807
6808 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6809 val = call1 (CODING_ATTR_POST_READ (attrs),
6810 make_number (coding->produced_char));
6811 CHECK_NATNUM (val);
6812 coding->produced_char += Z - prev_Z;
6813 coding->produced += Z_BYTE - prev_Z_BYTE;
6814 }
6815
6816 unbind_to (count, Qnil);
6817 return coding->result;
6818 }
6819
6820 int
6821 encode_coding_gap (coding, chars, bytes)
6822 struct coding_system *coding;
6823 EMACS_INT chars, bytes;
6824 {
6825 int count = specpdl_ptr - specpdl;
6826
6827 code_conversion_save (0, 0);
6828
6829 coding->src_object = Fcurrent_buffer ();
6830 coding->src_chars = chars;
6831 coding->src_bytes = bytes;
6832 coding->src_pos = -chars;
6833 coding->src_pos_byte = -bytes;
6834 coding->src_multibyte = chars < bytes;
6835 coding->dst_object = coding->src_object;
6836 coding->dst_pos = PT;
6837 coding->dst_pos_byte = PT_BYTE;
6838
6839 encode_coding (coding);
6840
6841 unbind_to (count, Qnil);
6842 return coding->result;
6843 }
6844
6845
6846 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6847 SRC_OBJECT into DST_OBJECT by coding context CODING.
6848
6849 SRC_OBJECT is a buffer, a string, or Qnil.
6850
6851 If it is a buffer, the text is at point of the buffer. FROM and TO
6852 are positions in the buffer.
6853
6854 If it is a string, the text is at the beginning of the string.
6855 FROM and TO are indices to the string.
6856
6857 If it is nil, the text is at coding->source. FROM and TO are
6858 indices to coding->source.
6859
6860 DST_OBJECT is a buffer, Qt, or Qnil.
6861
6862 If it is a buffer, the decoded text is inserted at point of the
6863 buffer. If the buffer is the same as SRC_OBJECT, the source text
6864 is deleted.
6865
6866 If it is Qt, a string is made from the decoded text, and
6867 set in CODING->dst_object.
6868
6869 If it is Qnil, the decoded text is stored at CODING->destination.
6870 The caller must allocate CODING->dst_bytes bytes at
6871 CODING->destination by xmalloc. If the decoded text is longer than
6872 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6873 */
6874
6875 void
6876 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6877 dst_object)
6878 struct coding_system *coding;
6879 Lisp_Object src_object;
6880 EMACS_INT from, from_byte, to, to_byte;
6881 Lisp_Object dst_object;
6882 {
6883 int count = specpdl_ptr - specpdl;
6884 unsigned char *destination;
6885 EMACS_INT dst_bytes;
6886 EMACS_INT chars = to - from;
6887 EMACS_INT bytes = to_byte - from_byte;
6888 Lisp_Object attrs;
6889 int saved_pt = -1, saved_pt_byte;
6890 int need_marker_adjustment = 0;
6891 Lisp_Object old_deactivate_mark;
6892
6893 old_deactivate_mark = Vdeactivate_mark;
6894
6895 if (NILP (dst_object))
6896 {
6897 destination = coding->destination;
6898 dst_bytes = coding->dst_bytes;
6899 }
6900
6901 coding->src_object = src_object;
6902 coding->src_chars = chars;
6903 coding->src_bytes = bytes;
6904 coding->src_multibyte = chars < bytes;
6905
6906 if (STRINGP (src_object))
6907 {
6908 coding->src_pos = from;
6909 coding->src_pos_byte = from_byte;
6910 }
6911 else if (BUFFERP (src_object))
6912 {
6913 set_buffer_internal (XBUFFER (src_object));
6914 if (from != GPT)
6915 move_gap_both (from, from_byte);
6916 if (EQ (src_object, dst_object))
6917 {
6918 struct Lisp_Marker *tail;
6919
6920 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
6921 {
6922 tail->need_adjustment
6923 = tail->charpos == (tail->insertion_type ? from : to);
6924 need_marker_adjustment |= tail->need_adjustment;
6925 }
6926 saved_pt = PT, saved_pt_byte = PT_BYTE;
6927 TEMP_SET_PT_BOTH (from, from_byte);
6928 current_buffer->text->inhibit_shrinking = 1;
6929 del_range_both (from, from_byte, to, to_byte, 1);
6930 coding->src_pos = -chars;
6931 coding->src_pos_byte = -bytes;
6932 }
6933 else
6934 {
6935 coding->src_pos = from;
6936 coding->src_pos_byte = from_byte;
6937 }
6938 }
6939
6940 if (CODING_REQUIRE_DETECTION (coding))
6941 detect_coding (coding);
6942 attrs = CODING_ID_ATTRS (coding->id);
6943
6944 if (EQ (dst_object, Qt)
6945 || (! NILP (CODING_ATTR_POST_READ (attrs))
6946 && NILP (dst_object)))
6947 {
6948 coding->dst_object = code_conversion_save (1, 1);
6949 coding->dst_pos = BEG;
6950 coding->dst_pos_byte = BEG_BYTE;
6951 coding->dst_multibyte = 1;
6952 }
6953 else if (BUFFERP (dst_object))
6954 {
6955 code_conversion_save (0, 0);
6956 coding->dst_object = dst_object;
6957 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6958 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6959 coding->dst_multibyte
6960 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6961 }
6962 else
6963 {
6964 code_conversion_save (0, 0);
6965 coding->dst_object = Qnil;
6966 coding->dst_multibyte = 1;
6967 }
6968
6969 decode_coding (coding);
6970
6971 if (BUFFERP (coding->dst_object))
6972 set_buffer_internal (XBUFFER (coding->dst_object));
6973
6974 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6975 {
6976 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
6977 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6978 Lisp_Object val;
6979
6980 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6981 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
6982 old_deactivate_mark);
6983 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6984 make_number (coding->produced_char));
6985 UNGCPRO;
6986 CHECK_NATNUM (val);
6987 coding->produced_char += Z - prev_Z;
6988 coding->produced += Z_BYTE - prev_Z_BYTE;
6989 }
6990
6991 if (EQ (dst_object, Qt))
6992 {
6993 coding->dst_object = Fbuffer_string ();
6994 }
6995 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6996 {
6997 set_buffer_internal (XBUFFER (coding->dst_object));
6998 if (dst_bytes < coding->produced)
6999 {
7000 destination = xrealloc (destination, coding->produced);
7001 if (! destination)
7002 {
7003 record_conversion_result (coding,
7004 CODING_RESULT_INSUFFICIENT_DST);
7005 unbind_to (count, Qnil);
7006 return;
7007 }
7008 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7009 move_gap_both (BEGV, BEGV_BYTE);
7010 bcopy (BEGV_ADDR, destination, coding->produced);
7011 coding->destination = destination;
7012 }
7013 }
7014
7015 if (saved_pt >= 0)
7016 {
7017 /* This is the case of:
7018 (BUFFERP (src_object) && EQ (src_object, dst_object))
7019 As we have moved PT while replacing the original buffer
7020 contents, we must recover it now. */
7021 set_buffer_internal (XBUFFER (src_object));
7022 current_buffer->text->inhibit_shrinking = 0;
7023 if (saved_pt < from)
7024 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7025 else if (saved_pt < from + chars)
7026 TEMP_SET_PT_BOTH (from, from_byte);
7027 else if (! NILP (current_buffer->enable_multibyte_characters))
7028 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7029 saved_pt_byte + (coding->produced - bytes));
7030 else
7031 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7032 saved_pt_byte + (coding->produced - bytes));
7033
7034 if (need_marker_adjustment)
7035 {
7036 struct Lisp_Marker *tail;
7037
7038 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7039 if (tail->need_adjustment)
7040 {
7041 tail->need_adjustment = 0;
7042 if (tail->insertion_type)
7043 {
7044 tail->bytepos = from_byte;
7045 tail->charpos = from;
7046 }
7047 else
7048 {
7049 tail->bytepos = from_byte + coding->produced;
7050 tail->charpos
7051 = (NILP (current_buffer->enable_multibyte_characters)
7052 ? tail->bytepos : from + coding->produced_char);
7053 }
7054 }
7055 }
7056 }
7057
7058 Vdeactivate_mark = old_deactivate_mark;
7059 unbind_to (count, coding->dst_object);
7060 }
7061
7062
7063 void
7064 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7065 dst_object)
7066 struct coding_system *coding;
7067 Lisp_Object src_object;
7068 EMACS_INT from, from_byte, to, to_byte;
7069 Lisp_Object dst_object;
7070 {
7071 int count = specpdl_ptr - specpdl;
7072 EMACS_INT chars = to - from;
7073 EMACS_INT bytes = to_byte - from_byte;
7074 Lisp_Object attrs;
7075 int saved_pt = -1, saved_pt_byte;
7076 int need_marker_adjustment = 0;
7077 int kill_src_buffer = 0;
7078 Lisp_Object old_deactivate_mark;
7079
7080 old_deactivate_mark = Vdeactivate_mark;
7081
7082 coding->src_object = src_object;
7083 coding->src_chars = chars;
7084 coding->src_bytes = bytes;
7085 coding->src_multibyte = chars < bytes;
7086
7087 attrs = CODING_ID_ATTRS (coding->id);
7088
7089 if (EQ (src_object, dst_object))
7090 {
7091 struct Lisp_Marker *tail;
7092
7093 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7094 {
7095 tail->need_adjustment
7096 = tail->charpos == (tail->insertion_type ? from : to);
7097 need_marker_adjustment |= tail->need_adjustment;
7098 }
7099 }
7100
7101 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7102 {
7103 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7104 set_buffer_internal (XBUFFER (coding->src_object));
7105 if (STRINGP (src_object))
7106 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7107 else if (BUFFERP (src_object))
7108 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7109 else
7110 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7111
7112 if (EQ (src_object, dst_object))
7113 {
7114 set_buffer_internal (XBUFFER (src_object));
7115 saved_pt = PT, saved_pt_byte = PT_BYTE;
7116 del_range_both (from, from_byte, to, to_byte, 1);
7117 set_buffer_internal (XBUFFER (coding->src_object));
7118 }
7119
7120 {
7121 Lisp_Object args[3];
7122 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7123
7124 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7125 old_deactivate_mark);
7126 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7127 args[1] = make_number (BEG);
7128 args[2] = make_number (Z);
7129 safe_call (3, args);
7130 UNGCPRO;
7131 }
7132 if (XBUFFER (coding->src_object) != current_buffer)
7133 kill_src_buffer = 1;
7134 coding->src_object = Fcurrent_buffer ();
7135 if (BEG != GPT)
7136 move_gap_both (BEG, BEG_BYTE);
7137 coding->src_chars = Z - BEG;
7138 coding->src_bytes = Z_BYTE - BEG_BYTE;
7139 coding->src_pos = BEG;
7140 coding->src_pos_byte = BEG_BYTE;
7141 coding->src_multibyte = Z < Z_BYTE;
7142 }
7143 else if (STRINGP (src_object))
7144 {
7145 code_conversion_save (0, 0);
7146 coding->src_pos = from;
7147 coding->src_pos_byte = from_byte;
7148 }
7149 else if (BUFFERP (src_object))
7150 {
7151 code_conversion_save (0, 0);
7152 set_buffer_internal (XBUFFER (src_object));
7153 if (EQ (src_object, dst_object))
7154 {
7155 saved_pt = PT, saved_pt_byte = PT_BYTE;
7156 coding->src_object = del_range_1 (from, to, 1, 1);
7157 coding->src_pos = 0;
7158 coding->src_pos_byte = 0;
7159 }
7160 else
7161 {
7162 if (from < GPT && to >= GPT)
7163 move_gap_both (from, from_byte);
7164 coding->src_pos = from;
7165 coding->src_pos_byte = from_byte;
7166 }
7167 }
7168 else
7169 code_conversion_save (0, 0);
7170
7171 if (BUFFERP (dst_object))
7172 {
7173 coding->dst_object = dst_object;
7174 if (EQ (src_object, dst_object))
7175 {
7176 coding->dst_pos = from;
7177 coding->dst_pos_byte = from_byte;
7178 }
7179 else
7180 {
7181 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7182 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7183 }
7184 coding->dst_multibyte
7185 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7186 }
7187 else if (EQ (dst_object, Qt))
7188 {
7189 coding->dst_object = Qnil;
7190 coding->dst_bytes = coding->src_chars;
7191 if (coding->dst_bytes == 0)
7192 coding->dst_bytes = 1;
7193 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7194 coding->dst_multibyte = 0;
7195 }
7196 else
7197 {
7198 coding->dst_object = Qnil;
7199 coding->dst_multibyte = 0;
7200 }
7201
7202 encode_coding (coding);
7203
7204 if (EQ (dst_object, Qt))
7205 {
7206 if (BUFFERP (coding->dst_object))
7207 coding->dst_object = Fbuffer_string ();
7208 else
7209 {
7210 coding->dst_object
7211 = make_unibyte_string ((char *) coding->destination,
7212 coding->produced);
7213 xfree (coding->destination);
7214 }
7215 }
7216
7217 if (saved_pt >= 0)
7218 {
7219 /* This is the case of:
7220 (BUFFERP (src_object) && EQ (src_object, dst_object))
7221 As we have moved PT while replacing the original buffer
7222 contents, we must recover it now. */
7223 set_buffer_internal (XBUFFER (src_object));
7224 if (saved_pt < from)
7225 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7226 else if (saved_pt < from + chars)
7227 TEMP_SET_PT_BOTH (from, from_byte);
7228 else if (! NILP (current_buffer->enable_multibyte_characters))
7229 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7230 saved_pt_byte + (coding->produced - bytes));
7231 else
7232 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7233 saved_pt_byte + (coding->produced - bytes));
7234
7235 if (need_marker_adjustment)
7236 {
7237 struct Lisp_Marker *tail;
7238
7239 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7240 if (tail->need_adjustment)
7241 {
7242 tail->need_adjustment = 0;
7243 if (tail->insertion_type)
7244 {
7245 tail->bytepos = from_byte;
7246 tail->charpos = from;
7247 }
7248 else
7249 {
7250 tail->bytepos = from_byte + coding->produced;
7251 tail->charpos
7252 = (NILP (current_buffer->enable_multibyte_characters)
7253 ? tail->bytepos : from + coding->produced_char);
7254 }
7255 }
7256 }
7257 }
7258
7259 if (kill_src_buffer)
7260 Fkill_buffer (coding->src_object);
7261
7262 Vdeactivate_mark = old_deactivate_mark;
7263 unbind_to (count, Qnil);
7264 }
7265
7266
7267 Lisp_Object
7268 preferred_coding_system ()
7269 {
7270 int id = coding_categories[coding_priorities[0]].id;
7271
7272 return CODING_ID_NAME (id);
7273 }
7274
7275 \f
7276 #ifdef emacs
7277 /*** 8. Emacs Lisp library functions ***/
7278
7279 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7280 doc: /* Return t if OBJECT is nil or a coding-system.
7281 See the documentation of `define-coding-system' for information
7282 about coding-system objects. */)
7283 (obj)
7284 Lisp_Object obj;
7285 {
7286 if (NILP (obj)
7287 || CODING_SYSTEM_ID (obj) >= 0)
7288 return Qt;
7289 if (! SYMBOLP (obj)
7290 || NILP (Fget (obj, Qcoding_system_define_form)))
7291 return Qnil;
7292 return Qt;
7293 }
7294
7295 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7296 Sread_non_nil_coding_system, 1, 1, 0,
7297 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7298 (prompt)
7299 Lisp_Object prompt;
7300 {
7301 Lisp_Object val;
7302 do
7303 {
7304 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7305 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7306 }
7307 while (SCHARS (val) == 0);
7308 return (Fintern (val, Qnil));
7309 }
7310
7311 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7312 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7313 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7314 Ignores case when completing coding systems (all Emacs coding systems
7315 are lower-case). */)
7316 (prompt, default_coding_system)
7317 Lisp_Object prompt, default_coding_system;
7318 {
7319 Lisp_Object val;
7320 int count = SPECPDL_INDEX ();
7321
7322 if (SYMBOLP (default_coding_system))
7323 default_coding_system = SYMBOL_NAME (default_coding_system);
7324 specbind (Qcompletion_ignore_case, Qt);
7325 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7326 Qt, Qnil, Qcoding_system_history,
7327 default_coding_system, Qnil);
7328 unbind_to (count, Qnil);
7329 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7330 }
7331
7332 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7333 1, 1, 0,
7334 doc: /* Check validity of CODING-SYSTEM.
7335 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7336 It is valid if it is nil or a symbol defined as a coding system by the
7337 function `define-coding-system'. */)
7338 (coding_system)
7339 Lisp_Object coding_system;
7340 {
7341 Lisp_Object define_form;
7342
7343 define_form = Fget (coding_system, Qcoding_system_define_form);
7344 if (! NILP (define_form))
7345 {
7346 Fput (coding_system, Qcoding_system_define_form, Qnil);
7347 safe_eval (define_form);
7348 }
7349 if (!NILP (Fcoding_system_p (coding_system)))
7350 return coding_system;
7351 xsignal1 (Qcoding_system_error, coding_system);
7352 }
7353
7354 \f
7355 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7356 HIGHEST is nonzero, return the coding system of the highest
7357 priority among the detected coding systems. Otherwize return a
7358 list of detected coding systems sorted by their priorities. If
7359 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7360 multibyte form but contains only ASCII and eight-bit chars.
7361 Otherwise, the bytes are raw bytes.
7362
7363 CODING-SYSTEM controls the detection as below:
7364
7365 If it is nil, detect both text-format and eol-format. If the
7366 text-format part of CODING-SYSTEM is already specified
7367 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7368 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7369 detect only text-format. */
7370
7371 Lisp_Object
7372 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7373 coding_system)
7374 const unsigned char *src;
7375 EMACS_INT src_chars, src_bytes;
7376 int highest;
7377 int multibytep;
7378 Lisp_Object coding_system;
7379 {
7380 const unsigned char *src_end = src + src_bytes;
7381 Lisp_Object attrs, eol_type;
7382 Lisp_Object val;
7383 struct coding_system coding;
7384 int id;
7385 struct coding_detection_info detect_info;
7386 enum coding_category base_category;
7387
7388 if (NILP (coding_system))
7389 coding_system = Qundecided;
7390 setup_coding_system (coding_system, &coding);
7391 attrs = CODING_ID_ATTRS (coding.id);
7392 eol_type = CODING_ID_EOL_TYPE (coding.id);
7393 coding_system = CODING_ATTR_BASE_NAME (attrs);
7394
7395 coding.source = src;
7396 coding.src_chars = src_chars;
7397 coding.src_bytes = src_bytes;
7398 coding.src_multibyte = multibytep;
7399 coding.consumed = 0;
7400 coding.mode |= CODING_MODE_LAST_BLOCK;
7401
7402 detect_info.checked = detect_info.found = detect_info.rejected = 0;
7403
7404 /* At first, detect text-format if necessary. */
7405 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7406 if (base_category == coding_category_undecided)
7407 {
7408 enum coding_category category;
7409 struct coding_system *this;
7410 int c, i;
7411
7412 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7413 for (i = 0; src < src_end; i++, src++)
7414 {
7415 c = *src;
7416 if (c & 0x80)
7417 break;
7418 if (c < 0x20
7419 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7420 && ! inhibit_iso_escape_detection)
7421 {
7422 coding.head_ascii = src - coding.source;
7423 if (detect_coding_iso_2022 (&coding, &detect_info))
7424 {
7425 /* We have scanned the whole data. */
7426 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7427 /* We didn't find an 8-bit code. */
7428 src = src_end;
7429 break;
7430 }
7431 }
7432 }
7433 coding.head_ascii = src - coding.source;
7434
7435 if (src < src_end
7436 || detect_info.found)
7437 {
7438 if (src == src_end)
7439 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7440 for (i = 0; i < coding_category_raw_text; i++)
7441 {
7442 category = coding_priorities[i];
7443 this = coding_categories + category;
7444 if (detect_info.found & (1 << category))
7445 break;
7446 }
7447 else
7448 for (i = 0; i < coding_category_raw_text; i++)
7449 {
7450 category = coding_priorities[i];
7451 this = coding_categories + category;
7452
7453 if (this->id < 0)
7454 {
7455 /* No coding system of this category is defined. */
7456 detect_info.rejected |= (1 << category);
7457 }
7458 else if (category >= coding_category_raw_text)
7459 continue;
7460 else if (detect_info.checked & (1 << category))
7461 {
7462 if (highest
7463 && (detect_info.found & (1 << category)))
7464 break;
7465 }
7466 else
7467 {
7468 if ((*(this->detector)) (&coding, &detect_info)
7469 && highest
7470 && (detect_info.found & (1 << category)))
7471 {
7472 if (category == coding_category_utf_16_auto)
7473 {
7474 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7475 category = coding_category_utf_16_le;
7476 else
7477 category = coding_category_utf_16_be;
7478 }
7479 break;
7480 }
7481 }
7482 }
7483 }
7484
7485 if (detect_info.rejected == CATEGORY_MASK_ANY)
7486 {
7487 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7488 id = coding_categories[coding_category_raw_text].id;
7489 val = Fcons (make_number (id), Qnil);
7490 }
7491 else if (! detect_info.rejected && ! detect_info.found)
7492 {
7493 detect_info.found = CATEGORY_MASK_ANY;
7494 id = coding_categories[coding_category_undecided].id;
7495 val = Fcons (make_number (id), Qnil);
7496 }
7497 else if (highest)
7498 {
7499 if (detect_info.found)
7500 {
7501 detect_info.found = 1 << category;
7502 val = Fcons (make_number (this->id), Qnil);
7503 }
7504 else
7505 for (i = 0; i < coding_category_raw_text; i++)
7506 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7507 {
7508 detect_info.found = 1 << coding_priorities[i];
7509 id = coding_categories[coding_priorities[i]].id;
7510 val = Fcons (make_number (id), Qnil);
7511 break;
7512 }
7513 }
7514 else
7515 {
7516 int mask = detect_info.rejected | detect_info.found;
7517 int found = 0;
7518 val = Qnil;
7519
7520 for (i = coding_category_raw_text - 1; i >= 0; i--)
7521 {
7522 category = coding_priorities[i];
7523 if (! (mask & (1 << category)))
7524 {
7525 found |= 1 << category;
7526 id = coding_categories[category].id;
7527 if (id >= 0)
7528 val = Fcons (make_number (id), val);
7529 }
7530 }
7531 for (i = coding_category_raw_text - 1; i >= 0; i--)
7532 {
7533 category = coding_priorities[i];
7534 if (detect_info.found & (1 << category))
7535 {
7536 id = coding_categories[category].id;
7537 val = Fcons (make_number (id), val);
7538 }
7539 }
7540 detect_info.found |= found;
7541 }
7542 }
7543 else if (base_category == coding_category_utf_16_auto)
7544 {
7545 if (detect_coding_utf_16 (&coding, &detect_info))
7546 {
7547 struct coding_system *this;
7548
7549 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7550 this = coding_categories + coding_category_utf_16_le;
7551 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7552 this = coding_categories + coding_category_utf_16_be;
7553 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7554 this = coding_categories + coding_category_utf_16_be_nosig;
7555 else
7556 this = coding_categories + coding_category_utf_16_le_nosig;
7557 val = Fcons (make_number (this->id), Qnil);
7558 }
7559 }
7560 else
7561 {
7562 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7563 val = Fcons (make_number (coding.id), Qnil);
7564 }
7565
7566 /* Then, detect eol-format if necessary. */
7567 {
7568 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7569 Lisp_Object tail;
7570
7571 if (VECTORP (eol_type))
7572 {
7573 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7574 normal_eol = detect_eol (coding.source, src_bytes,
7575 coding_category_raw_text);
7576 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7577 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7578 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7579 coding_category_utf_16_be);
7580 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7581 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7582 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7583 coding_category_utf_16_le);
7584 }
7585 else
7586 {
7587 if (EQ (eol_type, Qunix))
7588 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7589 else if (EQ (eol_type, Qdos))
7590 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7591 else
7592 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7593 }
7594
7595 for (tail = val; CONSP (tail); tail = XCDR (tail))
7596 {
7597 enum coding_category category;
7598 int this_eol;
7599
7600 id = XINT (XCAR (tail));
7601 attrs = CODING_ID_ATTRS (id);
7602 category = XINT (CODING_ATTR_CATEGORY (attrs));
7603 eol_type = CODING_ID_EOL_TYPE (id);
7604 if (VECTORP (eol_type))
7605 {
7606 if (category == coding_category_utf_16_be
7607 || category == coding_category_utf_16_be_nosig)
7608 this_eol = utf_16_be_eol;
7609 else if (category == coding_category_utf_16_le
7610 || category == coding_category_utf_16_le_nosig)
7611 this_eol = utf_16_le_eol;
7612 else
7613 this_eol = normal_eol;
7614
7615 if (this_eol == EOL_SEEN_LF)
7616 XSETCAR (tail, AREF (eol_type, 0));
7617 else if (this_eol == EOL_SEEN_CRLF)
7618 XSETCAR (tail, AREF (eol_type, 1));
7619 else if (this_eol == EOL_SEEN_CR)
7620 XSETCAR (tail, AREF (eol_type, 2));
7621 else
7622 XSETCAR (tail, CODING_ID_NAME (id));
7623 }
7624 else
7625 XSETCAR (tail, CODING_ID_NAME (id));
7626 }
7627 }
7628
7629 return (highest ? XCAR (val) : val);
7630 }
7631
7632
7633 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7634 2, 3, 0,
7635 doc: /* Detect coding system of the text in the region between START and END.
7636 Return a list of possible coding systems ordered by priority.
7637
7638 If only ASCII characters are found (except for such ISO-2022 control
7639 characters ISO-2022 as ESC), it returns a list of single element
7640 `undecided' or its subsidiary coding system according to a detected
7641 end-of-line format.
7642
7643 If optional argument HIGHEST is non-nil, return the coding system of
7644 highest priority. */)
7645 (start, end, highest)
7646 Lisp_Object start, end, highest;
7647 {
7648 int from, to;
7649 int from_byte, to_byte;
7650
7651 CHECK_NUMBER_COERCE_MARKER (start);
7652 CHECK_NUMBER_COERCE_MARKER (end);
7653
7654 validate_region (&start, &end);
7655 from = XINT (start), to = XINT (end);
7656 from_byte = CHAR_TO_BYTE (from);
7657 to_byte = CHAR_TO_BYTE (to);
7658
7659 if (from < GPT && to >= GPT)
7660 move_gap_both (to, to_byte);
7661
7662 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7663 to - from, to_byte - from_byte,
7664 !NILP (highest),
7665 !NILP (current_buffer
7666 ->enable_multibyte_characters),
7667 Qnil);
7668 }
7669
7670 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7671 1, 2, 0,
7672 doc: /* Detect coding system of the text in STRING.
7673 Return a list of possible coding systems ordered by priority.
7674
7675 If only ASCII characters are found (except for such ISO-2022 control
7676 characters ISO-2022 as ESC), it returns a list of single element
7677 `undecided' or its subsidiary coding system according to a detected
7678 end-of-line format.
7679
7680 If optional argument HIGHEST is non-nil, return the coding system of
7681 highest priority. */)
7682 (string, highest)
7683 Lisp_Object string, highest;
7684 {
7685 CHECK_STRING (string);
7686
7687 return detect_coding_system (SDATA (string),
7688 SCHARS (string), SBYTES (string),
7689 !NILP (highest), STRING_MULTIBYTE (string),
7690 Qnil);
7691 }
7692
7693
7694 static INLINE int
7695 char_encodable_p (c, attrs)
7696 int c;
7697 Lisp_Object attrs;
7698 {
7699 Lisp_Object tail;
7700 struct charset *charset;
7701 Lisp_Object translation_table;
7702
7703 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7704 if (! NILP (translation_table))
7705 c = translate_char (translation_table, c);
7706 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7707 CONSP (tail); tail = XCDR (tail))
7708 {
7709 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7710 if (CHAR_CHARSET_P (c, charset))
7711 break;
7712 }
7713 return (! NILP (tail));
7714 }
7715
7716
7717 /* Return a list of coding systems that safely encode the text between
7718 START and END. If EXCLUDE is non-nil, it is a list of coding
7719 systems not to check. The returned list doesn't contain any such
7720 coding systems. In any case, if the text contains only ASCII or is
7721 unibyte, return t. */
7722
7723 DEFUN ("find-coding-systems-region-internal",
7724 Ffind_coding_systems_region_internal,
7725 Sfind_coding_systems_region_internal, 2, 3, 0,
7726 doc: /* Internal use only. */)
7727 (start, end, exclude)
7728 Lisp_Object start, end, exclude;
7729 {
7730 Lisp_Object coding_attrs_list, safe_codings;
7731 EMACS_INT start_byte, end_byte;
7732 const unsigned char *p, *pbeg, *pend;
7733 int c;
7734 Lisp_Object tail, elt;
7735
7736 if (STRINGP (start))
7737 {
7738 if (!STRING_MULTIBYTE (start)
7739 || SCHARS (start) == SBYTES (start))
7740 return Qt;
7741 start_byte = 0;
7742 end_byte = SBYTES (start);
7743 }
7744 else
7745 {
7746 CHECK_NUMBER_COERCE_MARKER (start);
7747 CHECK_NUMBER_COERCE_MARKER (end);
7748 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7749 args_out_of_range (start, end);
7750 if (NILP (current_buffer->enable_multibyte_characters))
7751 return Qt;
7752 start_byte = CHAR_TO_BYTE (XINT (start));
7753 end_byte = CHAR_TO_BYTE (XINT (end));
7754 if (XINT (end) - XINT (start) == end_byte - start_byte)
7755 return Qt;
7756
7757 if (XINT (start) < GPT && XINT (end) > GPT)
7758 {
7759 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7760 move_gap_both (XINT (start), start_byte);
7761 else
7762 move_gap_both (XINT (end), end_byte);
7763 }
7764 }
7765
7766 coding_attrs_list = Qnil;
7767 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7768 if (NILP (exclude)
7769 || NILP (Fmemq (XCAR (tail), exclude)))
7770 {
7771 Lisp_Object attrs;
7772
7773 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7774 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7775 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7776 {
7777 ASET (attrs, coding_attr_trans_tbl,
7778 get_translation_table (attrs, 1, NULL));
7779 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7780 }
7781 }
7782
7783 if (STRINGP (start))
7784 p = pbeg = SDATA (start);
7785 else
7786 p = pbeg = BYTE_POS_ADDR (start_byte);
7787 pend = p + (end_byte - start_byte);
7788
7789 while (p < pend && ASCII_BYTE_P (*p)) p++;
7790 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7791
7792 while (p < pend)
7793 {
7794 if (ASCII_BYTE_P (*p))
7795 p++;
7796 else
7797 {
7798 c = STRING_CHAR_ADVANCE (p);
7799
7800 charset_map_loaded = 0;
7801 for (tail = coding_attrs_list; CONSP (tail);)
7802 {
7803 elt = XCAR (tail);
7804 if (NILP (elt))
7805 tail = XCDR (tail);
7806 else if (char_encodable_p (c, elt))
7807 tail = XCDR (tail);
7808 else if (CONSP (XCDR (tail)))
7809 {
7810 XSETCAR (tail, XCAR (XCDR (tail)));
7811 XSETCDR (tail, XCDR (XCDR (tail)));
7812 }
7813 else
7814 {
7815 XSETCAR (tail, Qnil);
7816 tail = XCDR (tail);
7817 }
7818 }
7819 if (charset_map_loaded)
7820 {
7821 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7822
7823 if (STRINGP (start))
7824 pbeg = SDATA (start);
7825 else
7826 pbeg = BYTE_POS_ADDR (start_byte);
7827 p = pbeg + p_offset;
7828 pend = pbeg + pend_offset;
7829 }
7830 }
7831 }
7832
7833 safe_codings = list2 (Qraw_text, Qno_conversion);
7834 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7835 if (! NILP (XCAR (tail)))
7836 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7837
7838 return safe_codings;
7839 }
7840
7841
7842 DEFUN ("unencodable-char-position", Funencodable_char_position,
7843 Sunencodable_char_position, 3, 5, 0,
7844 doc: /*
7845 Return position of first un-encodable character in a region.
7846 START and END specfiy the region and CODING-SYSTEM specifies the
7847 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7848
7849 If optional 4th argument COUNT is non-nil, it specifies at most how
7850 many un-encodable characters to search. In this case, the value is a
7851 list of positions.
7852
7853 If optional 5th argument STRING is non-nil, it is a string to search
7854 for un-encodable characters. In that case, START and END are indexes
7855 to the string. */)
7856 (start, end, coding_system, count, string)
7857 Lisp_Object start, end, coding_system, count, string;
7858 {
7859 int n;
7860 struct coding_system coding;
7861 Lisp_Object attrs, charset_list, translation_table;
7862 Lisp_Object positions;
7863 int from, to;
7864 const unsigned char *p, *stop, *pend;
7865 int ascii_compatible;
7866
7867 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7868 attrs = CODING_ID_ATTRS (coding.id);
7869 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7870 return Qnil;
7871 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7872 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7873 translation_table = get_translation_table (attrs, 1, NULL);
7874
7875 if (NILP (string))
7876 {
7877 validate_region (&start, &end);
7878 from = XINT (start);
7879 to = XINT (end);
7880 if (NILP (current_buffer->enable_multibyte_characters)
7881 || (ascii_compatible
7882 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7883 return Qnil;
7884 p = CHAR_POS_ADDR (from);
7885 pend = CHAR_POS_ADDR (to);
7886 if (from < GPT && to >= GPT)
7887 stop = GPT_ADDR;
7888 else
7889 stop = pend;
7890 }
7891 else
7892 {
7893 CHECK_STRING (string);
7894 CHECK_NATNUM (start);
7895 CHECK_NATNUM (end);
7896 from = XINT (start);
7897 to = XINT (end);
7898 if (from > to
7899 || to > SCHARS (string))
7900 args_out_of_range_3 (string, start, end);
7901 if (! STRING_MULTIBYTE (string))
7902 return Qnil;
7903 p = SDATA (string) + string_char_to_byte (string, from);
7904 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7905 if (ascii_compatible && (to - from) == (pend - p))
7906 return Qnil;
7907 }
7908
7909 if (NILP (count))
7910 n = 1;
7911 else
7912 {
7913 CHECK_NATNUM (count);
7914 n = XINT (count);
7915 }
7916
7917 positions = Qnil;
7918 while (1)
7919 {
7920 int c;
7921
7922 if (ascii_compatible)
7923 while (p < stop && ASCII_BYTE_P (*p))
7924 p++, from++;
7925 if (p >= stop)
7926 {
7927 if (p >= pend)
7928 break;
7929 stop = pend;
7930 p = GAP_END_ADDR;
7931 }
7932
7933 c = STRING_CHAR_ADVANCE (p);
7934 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7935 && ! char_charset (translate_char (translation_table, c),
7936 charset_list, NULL))
7937 {
7938 positions = Fcons (make_number (from), positions);
7939 n--;
7940 if (n == 0)
7941 break;
7942 }
7943
7944 from++;
7945 }
7946
7947 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7948 }
7949
7950
7951 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7952 Scheck_coding_systems_region, 3, 3, 0,
7953 doc: /* Check if the region is encodable by coding systems.
7954
7955 START and END are buffer positions specifying the region.
7956 CODING-SYSTEM-LIST is a list of coding systems to check.
7957
7958 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7959 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7960 whole region, POS0, POS1, ... are buffer positions where non-encodable
7961 characters are found.
7962
7963 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7964 value is nil.
7965
7966 START may be a string. In that case, check if the string is
7967 encodable, and the value contains indices to the string instead of
7968 buffer positions. END is ignored. */)
7969 (start, end, coding_system_list)
7970 Lisp_Object start, end, coding_system_list;
7971 {
7972 Lisp_Object list;
7973 EMACS_INT start_byte, end_byte;
7974 int pos;
7975 const unsigned char *p, *pbeg, *pend;
7976 int c;
7977 Lisp_Object tail, elt, attrs;
7978
7979 if (STRINGP (start))
7980 {
7981 if (!STRING_MULTIBYTE (start)
7982 && SCHARS (start) != SBYTES (start))
7983 return Qnil;
7984 start_byte = 0;
7985 end_byte = SBYTES (start);
7986 pos = 0;
7987 }
7988 else
7989 {
7990 CHECK_NUMBER_COERCE_MARKER (start);
7991 CHECK_NUMBER_COERCE_MARKER (end);
7992 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7993 args_out_of_range (start, end);
7994 if (NILP (current_buffer->enable_multibyte_characters))
7995 return Qnil;
7996 start_byte = CHAR_TO_BYTE (XINT (start));
7997 end_byte = CHAR_TO_BYTE (XINT (end));
7998 if (XINT (end) - XINT (start) == end_byte - start_byte)
7999 return Qt;
8000
8001 if (XINT (start) < GPT && XINT (end) > GPT)
8002 {
8003 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8004 move_gap_both (XINT (start), start_byte);
8005 else
8006 move_gap_both (XINT (end), end_byte);
8007 }
8008 pos = XINT (start);
8009 }
8010
8011 list = Qnil;
8012 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8013 {
8014 elt = XCAR (tail);
8015 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8016 ASET (attrs, coding_attr_trans_tbl,
8017 get_translation_table (attrs, 1, NULL));
8018 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8019 }
8020
8021 if (STRINGP (start))
8022 p = pbeg = SDATA (start);
8023 else
8024 p = pbeg = BYTE_POS_ADDR (start_byte);
8025 pend = p + (end_byte - start_byte);
8026
8027 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8028 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8029
8030 while (p < pend)
8031 {
8032 if (ASCII_BYTE_P (*p))
8033 p++;
8034 else
8035 {
8036 c = STRING_CHAR_ADVANCE (p);
8037
8038 charset_map_loaded = 0;
8039 for (tail = list; CONSP (tail); tail = XCDR (tail))
8040 {
8041 elt = XCDR (XCAR (tail));
8042 if (! char_encodable_p (c, XCAR (elt)))
8043 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8044 }
8045 if (charset_map_loaded)
8046 {
8047 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8048
8049 if (STRINGP (start))
8050 pbeg = SDATA (start);
8051 else
8052 pbeg = BYTE_POS_ADDR (start_byte);
8053 p = pbeg + p_offset;
8054 pend = pbeg + pend_offset;
8055 }
8056 }
8057 pos++;
8058 }
8059
8060 tail = list;
8061 list = Qnil;
8062 for (; CONSP (tail); tail = XCDR (tail))
8063 {
8064 elt = XCAR (tail);
8065 if (CONSP (XCDR (XCDR (elt))))
8066 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8067 list);
8068 }
8069
8070 return list;
8071 }
8072
8073
8074 Lisp_Object
8075 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8076 Lisp_Object start, end, coding_system, dst_object;
8077 int encodep, norecord;
8078 {
8079 struct coding_system coding;
8080 EMACS_INT from, from_byte, to, to_byte;
8081 Lisp_Object src_object;
8082
8083 CHECK_NUMBER_COERCE_MARKER (start);
8084 CHECK_NUMBER_COERCE_MARKER (end);
8085 if (NILP (coding_system))
8086 coding_system = Qno_conversion;
8087 else
8088 CHECK_CODING_SYSTEM (coding_system);
8089 src_object = Fcurrent_buffer ();
8090 if (NILP (dst_object))
8091 dst_object = src_object;
8092 else if (! EQ (dst_object, Qt))
8093 CHECK_BUFFER (dst_object);
8094
8095 validate_region (&start, &end);
8096 from = XFASTINT (start);
8097 from_byte = CHAR_TO_BYTE (from);
8098 to = XFASTINT (end);
8099 to_byte = CHAR_TO_BYTE (to);
8100
8101 setup_coding_system (coding_system, &coding);
8102 coding.mode |= CODING_MODE_LAST_BLOCK;
8103
8104 if (encodep)
8105 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8106 dst_object);
8107 else
8108 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8109 dst_object);
8110 if (! norecord)
8111 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8112
8113 return (BUFFERP (dst_object)
8114 ? make_number (coding.produced_char)
8115 : coding.dst_object);
8116 }
8117
8118
8119 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8120 3, 4, "r\nzCoding system: ",
8121 doc: /* Decode the current region from the specified coding system.
8122 When called from a program, takes four arguments:
8123 START, END, CODING-SYSTEM, and DESTINATION.
8124 START and END are buffer positions.
8125
8126 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8127 If nil, the region between START and END is replaced by the decoded text.
8128 If buffer, the decoded text is inserted in the buffer.
8129 If t, the decoded text is returned.
8130
8131 This function sets `last-coding-system-used' to the precise coding system
8132 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8133 not fully specified.)
8134 It returns the length of the decoded text. */)
8135 (start, end, coding_system, destination)
8136 Lisp_Object start, end, coding_system, destination;
8137 {
8138 return code_convert_region (start, end, coding_system, destination, 0, 0);
8139 }
8140
8141 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8142 3, 4, "r\nzCoding system: ",
8143 doc: /* Encode the current region by specified coding system.
8144 When called from a program, takes three arguments:
8145 START, END, and CODING-SYSTEM. START and END are buffer positions.
8146
8147 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8148 If nil, the region between START and END is replace by the encoded text.
8149 If buffer, the encoded text is inserted in the buffer.
8150 If t, the encoded text is returned.
8151
8152 This function sets `last-coding-system-used' to the precise coding system
8153 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8154 not fully specified.)
8155 It returns the length of the encoded text. */)
8156 (start, end, coding_system, destination)
8157 Lisp_Object start, end, coding_system, destination;
8158 {
8159 return code_convert_region (start, end, coding_system, destination, 1, 0);
8160 }
8161
8162 Lisp_Object
8163 code_convert_string (string, coding_system, dst_object,
8164 encodep, nocopy, norecord)
8165 Lisp_Object string, coding_system, dst_object;
8166 int encodep, nocopy, norecord;
8167 {
8168 struct coding_system coding;
8169 EMACS_INT chars, bytes;
8170
8171 CHECK_STRING (string);
8172 if (NILP (coding_system))
8173 {
8174 if (! norecord)
8175 Vlast_coding_system_used = Qno_conversion;
8176 if (NILP (dst_object))
8177 return (nocopy ? Fcopy_sequence (string) : string);
8178 }
8179
8180 if (NILP (coding_system))
8181 coding_system = Qno_conversion;
8182 else
8183 CHECK_CODING_SYSTEM (coding_system);
8184 if (NILP (dst_object))
8185 dst_object = Qt;
8186 else if (! EQ (dst_object, Qt))
8187 CHECK_BUFFER (dst_object);
8188
8189 setup_coding_system (coding_system, &coding);
8190 coding.mode |= CODING_MODE_LAST_BLOCK;
8191 chars = SCHARS (string);
8192 bytes = SBYTES (string);
8193 if (encodep)
8194 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8195 else
8196 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8197 if (! norecord)
8198 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8199
8200 return (BUFFERP (dst_object)
8201 ? make_number (coding.produced_char)
8202 : coding.dst_object);
8203 }
8204
8205
8206 /* Encode or decode STRING according to CODING_SYSTEM.
8207 Do not set Vlast_coding_system_used.
8208
8209 This function is called only from macros DECODE_FILE and
8210 ENCODE_FILE, thus we ignore character composition. */
8211
8212 Lisp_Object
8213 code_convert_string_norecord (string, coding_system, encodep)
8214 Lisp_Object string, coding_system;
8215 int encodep;
8216 {
8217 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8218 }
8219
8220
8221 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8222 2, 4, 0,
8223 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8224
8225 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8226 if the decoding operation is trivial.
8227
8228 Optional fourth arg BUFFER non-nil meant that the decoded text is
8229 inserted in BUFFER instead of returned as a string. In this case,
8230 the return value is BUFFER.
8231
8232 This function sets `last-coding-system-used' to the precise coding system
8233 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8234 not fully specified. */)
8235 (string, coding_system, nocopy, buffer)
8236 Lisp_Object string, coding_system, nocopy, buffer;
8237 {
8238 return code_convert_string (string, coding_system, buffer,
8239 0, ! NILP (nocopy), 0);
8240 }
8241
8242 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8243 2, 4, 0,
8244 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8245
8246 Optional third arg NOCOPY non-nil means it is OK to return STRING
8247 itself if the encoding operation is trivial.
8248
8249 Optional fourth arg BUFFER non-nil meant that the encoded text is
8250 inserted in BUFFER instead of returned as a string. In this case,
8251 the return value is BUFFER.
8252
8253 This function sets `last-coding-system-used' to the precise coding system
8254 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8255 not fully specified.) */)
8256 (string, coding_system, nocopy, buffer)
8257 Lisp_Object string, coding_system, nocopy, buffer;
8258 {
8259 return code_convert_string (string, coding_system, buffer,
8260 1, ! NILP (nocopy), 1);
8261 }
8262
8263 \f
8264 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8265 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8266 Return the corresponding character. */)
8267 (code)
8268 Lisp_Object code;
8269 {
8270 Lisp_Object spec, attrs, val;
8271 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8272 int c;
8273
8274 CHECK_NATNUM (code);
8275 c = XFASTINT (code);
8276 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8277 attrs = AREF (spec, 0);
8278
8279 if (ASCII_BYTE_P (c)
8280 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8281 return code;
8282
8283 val = CODING_ATTR_CHARSET_LIST (attrs);
8284 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8285 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8286 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8287
8288 if (c <= 0x7F)
8289 charset = charset_roman;
8290 else if (c >= 0xA0 && c < 0xDF)
8291 {
8292 charset = charset_kana;
8293 c -= 0x80;
8294 }
8295 else
8296 {
8297 int s1 = c >> 8, s2 = c & 0xFF;
8298
8299 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8300 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8301 error ("Invalid code: %d", code);
8302 SJIS_TO_JIS (c);
8303 charset = charset_kanji;
8304 }
8305 c = DECODE_CHAR (charset, c);
8306 if (c < 0)
8307 error ("Invalid code: %d", code);
8308 return make_number (c);
8309 }
8310
8311
8312 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8313 doc: /* Encode a Japanese character CH to shift_jis encoding.
8314 Return the corresponding code in SJIS. */)
8315 (ch)
8316 Lisp_Object ch;
8317 {
8318 Lisp_Object spec, attrs, charset_list;
8319 int c;
8320 struct charset *charset;
8321 unsigned code;
8322
8323 CHECK_CHARACTER (ch);
8324 c = XFASTINT (ch);
8325 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8326 attrs = AREF (spec, 0);
8327
8328 if (ASCII_CHAR_P (c)
8329 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8330 return ch;
8331
8332 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8333 charset = char_charset (c, charset_list, &code);
8334 if (code == CHARSET_INVALID_CODE (charset))
8335 error ("Can't encode by shift_jis encoding: %d", c);
8336 JIS_TO_SJIS (code);
8337
8338 return make_number (code);
8339 }
8340
8341 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8342 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8343 Return the corresponding character. */)
8344 (code)
8345 Lisp_Object code;
8346 {
8347 Lisp_Object spec, attrs, val;
8348 struct charset *charset_roman, *charset_big5, *charset;
8349 int c;
8350
8351 CHECK_NATNUM (code);
8352 c = XFASTINT (code);
8353 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8354 attrs = AREF (spec, 0);
8355
8356 if (ASCII_BYTE_P (c)
8357 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8358 return code;
8359
8360 val = CODING_ATTR_CHARSET_LIST (attrs);
8361 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8362 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8363
8364 if (c <= 0x7F)
8365 charset = charset_roman;
8366 else
8367 {
8368 int b1 = c >> 8, b2 = c & 0x7F;
8369 if (b1 < 0xA1 || b1 > 0xFE
8370 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8371 error ("Invalid code: %d", code);
8372 charset = charset_big5;
8373 }
8374 c = DECODE_CHAR (charset, (unsigned )c);
8375 if (c < 0)
8376 error ("Invalid code: %d", code);
8377 return make_number (c);
8378 }
8379
8380 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8381 doc: /* Encode the Big5 character CH to BIG5 coding system.
8382 Return the corresponding character code in Big5. */)
8383 (ch)
8384 Lisp_Object ch;
8385 {
8386 Lisp_Object spec, attrs, charset_list;
8387 struct charset *charset;
8388 int c;
8389 unsigned code;
8390
8391 CHECK_CHARACTER (ch);
8392 c = XFASTINT (ch);
8393 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8394 attrs = AREF (spec, 0);
8395 if (ASCII_CHAR_P (c)
8396 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8397 return ch;
8398
8399 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8400 charset = char_charset (c, charset_list, &code);
8401 if (code == CHARSET_INVALID_CODE (charset))
8402 error ("Can't encode by Big5 encoding: %d", c);
8403
8404 return make_number (code);
8405 }
8406
8407 \f
8408 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
8409 Sset_terminal_coding_system_internal, 1, 2, 0,
8410 doc: /* Internal use only. */)
8411 (coding_system, terminal)
8412 Lisp_Object coding_system;
8413 Lisp_Object terminal;
8414 {
8415 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8416 CHECK_SYMBOL (coding_system);
8417 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
8418 /* We had better not send unsafe characters to terminal. */
8419 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
8420 /* Characer composition should be disabled. */
8421 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8422 terminal_coding->src_multibyte = 1;
8423 terminal_coding->dst_multibyte = 0;
8424 return Qnil;
8425 }
8426
8427 DEFUN ("set-safe-terminal-coding-system-internal",
8428 Fset_safe_terminal_coding_system_internal,
8429 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8430 doc: /* Internal use only. */)
8431 (coding_system)
8432 Lisp_Object coding_system;
8433 {
8434 CHECK_SYMBOL (coding_system);
8435 setup_coding_system (Fcheck_coding_system (coding_system),
8436 &safe_terminal_coding);
8437 /* Characer composition should be disabled. */
8438 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8439 safe_terminal_coding.src_multibyte = 1;
8440 safe_terminal_coding.dst_multibyte = 0;
8441 return Qnil;
8442 }
8443
8444 DEFUN ("terminal-coding-system", Fterminal_coding_system,
8445 Sterminal_coding_system, 0, 1, 0,
8446 doc: /* Return coding system specified for terminal output on the given terminal.
8447 TERMINAL may be a terminal id, a frame, or nil for the selected
8448 frame's terminal device. */)
8449 (terminal)
8450 Lisp_Object terminal;
8451 {
8452 struct coding_system *terminal_coding
8453 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8454 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
8455
8456 /* For backward compatibility, return nil if it is `undecided'. */
8457 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8458 }
8459
8460 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
8461 Sset_keyboard_coding_system_internal, 1, 2, 0,
8462 doc: /* Internal use only. */)
8463 (coding_system, terminal)
8464 Lisp_Object coding_system;
8465 Lisp_Object terminal;
8466 {
8467 struct terminal *t = get_terminal (terminal, 1);
8468 CHECK_SYMBOL (coding_system);
8469 setup_coding_system (Fcheck_coding_system (coding_system),
8470 TERMINAL_KEYBOARD_CODING (t));
8471 /* Characer composition should be disabled. */
8472 TERMINAL_KEYBOARD_CODING (t)->common_flags
8473 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8474 return Qnil;
8475 }
8476
8477 DEFUN ("keyboard-coding-system",
8478 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
8479 doc: /* Return coding system specified for decoding keyboard input. */)
8480 (terminal)
8481 Lisp_Object terminal;
8482 {
8483 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8484 (get_terminal (terminal, 1))->id);
8485 }
8486
8487 \f
8488 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8489 Sfind_operation_coding_system, 1, MANY, 0,
8490 doc: /* Choose a coding system for an operation based on the target name.
8491 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8492 DECODING-SYSTEM is the coding system to use for decoding
8493 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8494 for encoding (in case OPERATION does encoding).
8495
8496 The first argument OPERATION specifies an I/O primitive:
8497 For file I/O, `insert-file-contents' or `write-region'.
8498 For process I/O, `call-process', `call-process-region', or `start-process'.
8499 For network I/O, `open-network-stream'.
8500
8501 The remaining arguments should be the same arguments that were passed
8502 to the primitive. Depending on which primitive, one of those arguments
8503 is selected as the TARGET. For example, if OPERATION does file I/O,
8504 whichever argument specifies the file name is TARGET.
8505
8506 TARGET has a meaning which depends on OPERATION:
8507 For file I/O, TARGET is a file name (except for the special case below).
8508 For process I/O, TARGET is a process name.
8509 For network I/O, TARGET is a service name or a port number
8510
8511 This function looks up what specified for TARGET in,
8512 `file-coding-system-alist', `process-coding-system-alist',
8513 or `network-coding-system-alist' depending on OPERATION.
8514 They may specify a coding system, a cons of coding systems,
8515 or a function symbol to call.
8516 In the last case, we call the function with one argument,
8517 which is a list of all the arguments given to this function.
8518 If the function can't decide a coding system, it can return
8519 `undecided' so that the normal code-detection is performed.
8520
8521 If OPERATION is `insert-file-contents', the argument corresponding to
8522 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8523 file name to look up, and BUFFER is a buffer that contains the file's
8524 contents (not yet decoded). If `file-coding-system-alist' specifies a
8525 function to call for FILENAME, that function should examine the
8526 contents of BUFFER instead of reading the file.
8527
8528 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
8529 (nargs, args)
8530 int nargs;
8531 Lisp_Object *args;
8532 {
8533 Lisp_Object operation, target_idx, target, val;
8534 register Lisp_Object chain;
8535
8536 if (nargs < 2)
8537 error ("Too few arguments");
8538 operation = args[0];
8539 if (!SYMBOLP (operation)
8540 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8541 error ("Invalid first arguement");
8542 if (nargs < 1 + XINT (target_idx))
8543 error ("Too few arguments for operation: %s",
8544 SDATA (SYMBOL_NAME (operation)));
8545 target = args[XINT (target_idx) + 1];
8546 if (!(STRINGP (target)
8547 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8548 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
8549 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8550 error ("Invalid %dth argument", XINT (target_idx) + 1);
8551 if (CONSP (target))
8552 target = XCAR (target);
8553
8554 chain = ((EQ (operation, Qinsert_file_contents)
8555 || EQ (operation, Qwrite_region))
8556 ? Vfile_coding_system_alist
8557 : (EQ (operation, Qopen_network_stream)
8558 ? Vnetwork_coding_system_alist
8559 : Vprocess_coding_system_alist));
8560 if (NILP (chain))
8561 return Qnil;
8562
8563 for (; CONSP (chain); chain = XCDR (chain))
8564 {
8565 Lisp_Object elt;
8566
8567 elt = XCAR (chain);
8568 if (CONSP (elt)
8569 && ((STRINGP (target)
8570 && STRINGP (XCAR (elt))
8571 && fast_string_match (XCAR (elt), target) >= 0)
8572 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8573 {
8574 val = XCDR (elt);
8575 /* Here, if VAL is both a valid coding system and a valid
8576 function symbol, we return VAL as a coding system. */
8577 if (CONSP (val))
8578 return val;
8579 if (! SYMBOLP (val))
8580 return Qnil;
8581 if (! NILP (Fcoding_system_p (val)))
8582 return Fcons (val, val);
8583 if (! NILP (Ffboundp (val)))
8584 {
8585 /* We use call1 rather than safe_call1
8586 so as to get bug reports about functions called here
8587 which don't handle the current interface. */
8588 val = call1 (val, Flist (nargs, args));
8589 if (CONSP (val))
8590 return val;
8591 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8592 return Fcons (val, val);
8593 }
8594 return Qnil;
8595 }
8596 }
8597 return Qnil;
8598 }
8599
8600 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8601 Sset_coding_system_priority, 0, MANY, 0,
8602 doc: /* Assign higher priority to the coding systems given as arguments.
8603 If multiple coding systems belongs to the same category,
8604 all but the first one are ignored.
8605
8606 usage: (set-coding-system-priority ...) */)
8607 (nargs, args)
8608 int nargs;
8609 Lisp_Object *args;
8610 {
8611 int i, j;
8612 int changed[coding_category_max];
8613 enum coding_category priorities[coding_category_max];
8614
8615 bzero (changed, sizeof changed);
8616
8617 for (i = j = 0; i < nargs; i++)
8618 {
8619 enum coding_category category;
8620 Lisp_Object spec, attrs;
8621
8622 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8623 attrs = AREF (spec, 0);
8624 category = XINT (CODING_ATTR_CATEGORY (attrs));
8625 if (changed[category])
8626 /* Ignore this coding system because a coding system of the
8627 same category already had a higher priority. */
8628 continue;
8629 changed[category] = 1;
8630 priorities[j++] = category;
8631 if (coding_categories[category].id >= 0
8632 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8633 setup_coding_system (args[i], &coding_categories[category]);
8634 Fset (AREF (Vcoding_category_table, category), args[i]);
8635 }
8636
8637 /* Now we have decided top J priorities. Reflect the order of the
8638 original priorities to the remaining priorities. */
8639
8640 for (i = j, j = 0; i < coding_category_max; i++, j++)
8641 {
8642 while (j < coding_category_max
8643 && changed[coding_priorities[j]])
8644 j++;
8645 if (j == coding_category_max)
8646 abort ();
8647 priorities[i] = coding_priorities[j];
8648 }
8649
8650 bcopy (priorities, coding_priorities, sizeof priorities);
8651
8652 /* Update `coding-category-list'. */
8653 Vcoding_category_list = Qnil;
8654 for (i = coding_category_max - 1; i >= 0; i--)
8655 Vcoding_category_list
8656 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8657 Vcoding_category_list);
8658
8659 return Qnil;
8660 }
8661
8662 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8663 Scoding_system_priority_list, 0, 1, 0,
8664 doc: /* Return a list of coding systems ordered by their priorities.
8665 HIGHESTP non-nil means just return the highest priority one. */)
8666 (highestp)
8667 Lisp_Object highestp;
8668 {
8669 int i;
8670 Lisp_Object val;
8671
8672 for (i = 0, val = Qnil; i < coding_category_max; i++)
8673 {
8674 enum coding_category category = coding_priorities[i];
8675 int id = coding_categories[category].id;
8676 Lisp_Object attrs;
8677
8678 if (id < 0)
8679 continue;
8680 attrs = CODING_ID_ATTRS (id);
8681 if (! NILP (highestp))
8682 return CODING_ATTR_BASE_NAME (attrs);
8683 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8684 }
8685 return Fnreverse (val);
8686 }
8687
8688 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8689
8690 static Lisp_Object
8691 make_subsidiaries (base)
8692 Lisp_Object base;
8693 {
8694 Lisp_Object subsidiaries;
8695 int base_name_len = SBYTES (SYMBOL_NAME (base));
8696 char *buf = (char *) alloca (base_name_len + 6);
8697 int i;
8698
8699 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8700 subsidiaries = Fmake_vector (make_number (3), Qnil);
8701 for (i = 0; i < 3; i++)
8702 {
8703 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8704 ASET (subsidiaries, i, intern (buf));
8705 }
8706 return subsidiaries;
8707 }
8708
8709
8710 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8711 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8712 doc: /* For internal use only.
8713 usage: (define-coding-system-internal ...) */)
8714 (nargs, args)
8715 int nargs;
8716 Lisp_Object *args;
8717 {
8718 Lisp_Object name;
8719 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8720 Lisp_Object attrs; /* Vector of attributes. */
8721 Lisp_Object eol_type;
8722 Lisp_Object aliases;
8723 Lisp_Object coding_type, charset_list, safe_charsets;
8724 enum coding_category category;
8725 Lisp_Object tail, val;
8726 int max_charset_id = 0;
8727 int i;
8728
8729 if (nargs < coding_arg_max)
8730 goto short_args;
8731
8732 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8733
8734 name = args[coding_arg_name];
8735 CHECK_SYMBOL (name);
8736 CODING_ATTR_BASE_NAME (attrs) = name;
8737
8738 val = args[coding_arg_mnemonic];
8739 if (! STRINGP (val))
8740 CHECK_CHARACTER (val);
8741 CODING_ATTR_MNEMONIC (attrs) = val;
8742
8743 coding_type = args[coding_arg_coding_type];
8744 CHECK_SYMBOL (coding_type);
8745 CODING_ATTR_TYPE (attrs) = coding_type;
8746
8747 charset_list = args[coding_arg_charset_list];
8748 if (SYMBOLP (charset_list))
8749 {
8750 if (EQ (charset_list, Qiso_2022))
8751 {
8752 if (! EQ (coding_type, Qiso_2022))
8753 error ("Invalid charset-list");
8754 charset_list = Viso_2022_charset_list;
8755 }
8756 else if (EQ (charset_list, Qemacs_mule))
8757 {
8758 if (! EQ (coding_type, Qemacs_mule))
8759 error ("Invalid charset-list");
8760 charset_list = Vemacs_mule_charset_list;
8761 }
8762 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8763 if (max_charset_id < XFASTINT (XCAR (tail)))
8764 max_charset_id = XFASTINT (XCAR (tail));
8765 }
8766 else
8767 {
8768 charset_list = Fcopy_sequence (charset_list);
8769 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8770 {
8771 struct charset *charset;
8772
8773 val = XCAR (tail);
8774 CHECK_CHARSET_GET_CHARSET (val, charset);
8775 if (EQ (coding_type, Qiso_2022)
8776 ? CHARSET_ISO_FINAL (charset) < 0
8777 : EQ (coding_type, Qemacs_mule)
8778 ? CHARSET_EMACS_MULE_ID (charset) < 0
8779 : 0)
8780 error ("Can't handle charset `%s'",
8781 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8782
8783 XSETCAR (tail, make_number (charset->id));
8784 if (max_charset_id < charset->id)
8785 max_charset_id = charset->id;
8786 }
8787 }
8788 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8789
8790 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8791 make_number (255));
8792 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8793 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8794 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8795
8796 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8797
8798 val = args[coding_arg_decode_translation_table];
8799 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8800 CHECK_SYMBOL (val);
8801 CODING_ATTR_DECODE_TBL (attrs) = val;
8802
8803 val = args[coding_arg_encode_translation_table];
8804 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8805 CHECK_SYMBOL (val);
8806 CODING_ATTR_ENCODE_TBL (attrs) = val;
8807
8808 val = args[coding_arg_post_read_conversion];
8809 CHECK_SYMBOL (val);
8810 CODING_ATTR_POST_READ (attrs) = val;
8811
8812 val = args[coding_arg_pre_write_conversion];
8813 CHECK_SYMBOL (val);
8814 CODING_ATTR_PRE_WRITE (attrs) = val;
8815
8816 val = args[coding_arg_default_char];
8817 if (NILP (val))
8818 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8819 else
8820 {
8821 CHECK_CHARACTER (val);
8822 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8823 }
8824
8825 val = args[coding_arg_for_unibyte];
8826 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8827
8828 val = args[coding_arg_plist];
8829 CHECK_LIST (val);
8830 CODING_ATTR_PLIST (attrs) = val;
8831
8832 if (EQ (coding_type, Qcharset))
8833 {
8834 /* Generate a lisp vector of 256 elements. Each element is nil,
8835 integer, or a list of charset IDs.
8836
8837 If Nth element is nil, the byte code N is invalid in this
8838 coding system.
8839
8840 If Nth element is a number NUM, N is the first byte of a
8841 charset whose ID is NUM.
8842
8843 If Nth element is a list of charset IDs, N is the first byte
8844 of one of them. The list is sorted by dimensions of the
8845 charsets. A charset of smaller dimension comes firtst. */
8846 val = Fmake_vector (make_number (256), Qnil);
8847
8848 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8849 {
8850 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8851 int dim = CHARSET_DIMENSION (charset);
8852 int idx = (dim - 1) * 4;
8853
8854 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8855 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8856
8857 for (i = charset->code_space[idx];
8858 i <= charset->code_space[idx + 1]; i++)
8859 {
8860 Lisp_Object tmp, tmp2;
8861 int dim2;
8862
8863 tmp = AREF (val, i);
8864 if (NILP (tmp))
8865 tmp = XCAR (tail);
8866 else if (NUMBERP (tmp))
8867 {
8868 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8869 if (dim < dim2)
8870 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8871 else
8872 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8873 }
8874 else
8875 {
8876 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8877 {
8878 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8879 if (dim < dim2)
8880 break;
8881 }
8882 if (NILP (tmp2))
8883 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8884 else
8885 {
8886 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8887 XSETCAR (tmp2, XCAR (tail));
8888 }
8889 }
8890 ASET (val, i, tmp);
8891 }
8892 }
8893 ASET (attrs, coding_attr_charset_valids, val);
8894 category = coding_category_charset;
8895 }
8896 else if (EQ (coding_type, Qccl))
8897 {
8898 Lisp_Object valids;
8899
8900 if (nargs < coding_arg_ccl_max)
8901 goto short_args;
8902
8903 val = args[coding_arg_ccl_decoder];
8904 CHECK_CCL_PROGRAM (val);
8905 if (VECTORP (val))
8906 val = Fcopy_sequence (val);
8907 ASET (attrs, coding_attr_ccl_decoder, val);
8908
8909 val = args[coding_arg_ccl_encoder];
8910 CHECK_CCL_PROGRAM (val);
8911 if (VECTORP (val))
8912 val = Fcopy_sequence (val);
8913 ASET (attrs, coding_attr_ccl_encoder, val);
8914
8915 val = args[coding_arg_ccl_valids];
8916 valids = Fmake_string (make_number (256), make_number (0));
8917 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8918 {
8919 int from, to;
8920
8921 val = Fcar (tail);
8922 if (INTEGERP (val))
8923 {
8924 from = to = XINT (val);
8925 if (from < 0 || from > 255)
8926 args_out_of_range_3 (val, make_number (0), make_number (255));
8927 }
8928 else
8929 {
8930 CHECK_CONS (val);
8931 CHECK_NATNUM_CAR (val);
8932 CHECK_NATNUM_CDR (val);
8933 from = XINT (XCAR (val));
8934 if (from > 255)
8935 args_out_of_range_3 (XCAR (val),
8936 make_number (0), make_number (255));
8937 to = XINT (XCDR (val));
8938 if (to < from || to > 255)
8939 args_out_of_range_3 (XCDR (val),
8940 XCAR (val), make_number (255));
8941 }
8942 for (i = from; i <= to; i++)
8943 SSET (valids, i, 1);
8944 }
8945 ASET (attrs, coding_attr_ccl_valids, valids);
8946
8947 category = coding_category_ccl;
8948 }
8949 else if (EQ (coding_type, Qutf_16))
8950 {
8951 Lisp_Object bom, endian;
8952
8953 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8954
8955 if (nargs < coding_arg_utf16_max)
8956 goto short_args;
8957
8958 bom = args[coding_arg_utf16_bom];
8959 if (! NILP (bom) && ! EQ (bom, Qt))
8960 {
8961 CHECK_CONS (bom);
8962 val = XCAR (bom);
8963 CHECK_CODING_SYSTEM (val);
8964 val = XCDR (bom);
8965 CHECK_CODING_SYSTEM (val);
8966 }
8967 ASET (attrs, coding_attr_utf_16_bom, bom);
8968
8969 endian = args[coding_arg_utf16_endian];
8970 CHECK_SYMBOL (endian);
8971 if (NILP (endian))
8972 endian = Qbig;
8973 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8974 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8975 ASET (attrs, coding_attr_utf_16_endian, endian);
8976
8977 category = (CONSP (bom)
8978 ? coding_category_utf_16_auto
8979 : NILP (bom)
8980 ? (EQ (endian, Qbig)
8981 ? coding_category_utf_16_be_nosig
8982 : coding_category_utf_16_le_nosig)
8983 : (EQ (endian, Qbig)
8984 ? coding_category_utf_16_be
8985 : coding_category_utf_16_le));
8986 }
8987 else if (EQ (coding_type, Qiso_2022))
8988 {
8989 Lisp_Object initial, reg_usage, request, flags;
8990 int i;
8991
8992 if (nargs < coding_arg_iso2022_max)
8993 goto short_args;
8994
8995 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8996 CHECK_VECTOR (initial);
8997 for (i = 0; i < 4; i++)
8998 {
8999 val = Faref (initial, make_number (i));
9000 if (! NILP (val))
9001 {
9002 struct charset *charset;
9003
9004 CHECK_CHARSET_GET_CHARSET (val, charset);
9005 ASET (initial, i, make_number (CHARSET_ID (charset)));
9006 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9007 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9008 }
9009 else
9010 ASET (initial, i, make_number (-1));
9011 }
9012
9013 reg_usage = args[coding_arg_iso2022_reg_usage];
9014 CHECK_CONS (reg_usage);
9015 CHECK_NUMBER_CAR (reg_usage);
9016 CHECK_NUMBER_CDR (reg_usage);
9017
9018 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9019 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9020 {
9021 int id;
9022 Lisp_Object tmp;
9023
9024 val = Fcar (tail);
9025 CHECK_CONS (val);
9026 tmp = XCAR (val);
9027 CHECK_CHARSET_GET_ID (tmp, id);
9028 CHECK_NATNUM_CDR (val);
9029 if (XINT (XCDR (val)) >= 4)
9030 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9031 XSETCAR (val, make_number (id));
9032 }
9033
9034 flags = args[coding_arg_iso2022_flags];
9035 CHECK_NATNUM (flags);
9036 i = XINT (flags);
9037 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9038 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9039
9040 ASET (attrs, coding_attr_iso_initial, initial);
9041 ASET (attrs, coding_attr_iso_usage, reg_usage);
9042 ASET (attrs, coding_attr_iso_request, request);
9043 ASET (attrs, coding_attr_iso_flags, flags);
9044 setup_iso_safe_charsets (attrs);
9045
9046 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9047 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9048 | CODING_ISO_FLAG_SINGLE_SHIFT))
9049 ? coding_category_iso_7_else
9050 : EQ (args[coding_arg_charset_list], Qiso_2022)
9051 ? coding_category_iso_7
9052 : coding_category_iso_7_tight);
9053 else
9054 {
9055 int id = XINT (AREF (initial, 1));
9056
9057 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9058 || EQ (args[coding_arg_charset_list], Qiso_2022)
9059 || id < 0)
9060 ? coding_category_iso_8_else
9061 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9062 ? coding_category_iso_8_1
9063 : coding_category_iso_8_2);
9064 }
9065 if (category != coding_category_iso_8_1
9066 && category != coding_category_iso_8_2)
9067 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9068 }
9069 else if (EQ (coding_type, Qemacs_mule))
9070 {
9071 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9072 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9073 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9074 category = coding_category_emacs_mule;
9075 }
9076 else if (EQ (coding_type, Qshift_jis))
9077 {
9078
9079 struct charset *charset;
9080
9081 if (XINT (Flength (charset_list)) != 3
9082 && XINT (Flength (charset_list)) != 4)
9083 error ("There should be three or four charsets");
9084
9085 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9086 if (CHARSET_DIMENSION (charset) != 1)
9087 error ("Dimension of charset %s is not one",
9088 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9089 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9090 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9091
9092 charset_list = XCDR (charset_list);
9093 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9094 if (CHARSET_DIMENSION (charset) != 1)
9095 error ("Dimension of charset %s is not one",
9096 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9097
9098 charset_list = XCDR (charset_list);
9099 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9100 if (CHARSET_DIMENSION (charset) != 2)
9101 error ("Dimension of charset %s is not two",
9102 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9103
9104 charset_list = XCDR (charset_list);
9105 if (! NILP (charset_list))
9106 {
9107 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9108 if (CHARSET_DIMENSION (charset) != 2)
9109 error ("Dimension of charset %s is not two",
9110 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9111 }
9112
9113 category = coding_category_sjis;
9114 Vsjis_coding_system = name;
9115 }
9116 else if (EQ (coding_type, Qbig5))
9117 {
9118 struct charset *charset;
9119
9120 if (XINT (Flength (charset_list)) != 2)
9121 error ("There should be just two charsets");
9122
9123 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9124 if (CHARSET_DIMENSION (charset) != 1)
9125 error ("Dimension of charset %s is not one",
9126 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9127 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9128 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9129
9130 charset_list = XCDR (charset_list);
9131 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9132 if (CHARSET_DIMENSION (charset) != 2)
9133 error ("Dimension of charset %s is not two",
9134 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9135
9136 category = coding_category_big5;
9137 Vbig5_coding_system = name;
9138 }
9139 else if (EQ (coding_type, Qraw_text))
9140 {
9141 category = coding_category_raw_text;
9142 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9143 }
9144 else if (EQ (coding_type, Qutf_8))
9145 {
9146 category = coding_category_utf_8;
9147 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9148 }
9149 else if (EQ (coding_type, Qundecided))
9150 category = coding_category_undecided;
9151 else
9152 error ("Invalid coding system type: %s",
9153 SDATA (SYMBOL_NAME (coding_type)));
9154
9155 CODING_ATTR_CATEGORY (attrs) = make_number (category);
9156 CODING_ATTR_PLIST (attrs)
9157 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9158 CODING_ATTR_PLIST (attrs)));
9159 CODING_ATTR_PLIST (attrs)
9160 = Fcons (QCascii_compatible_p,
9161 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9162 CODING_ATTR_PLIST (attrs)));
9163
9164 eol_type = args[coding_arg_eol_type];
9165 if (! NILP (eol_type)
9166 && ! EQ (eol_type, Qunix)
9167 && ! EQ (eol_type, Qdos)
9168 && ! EQ (eol_type, Qmac))
9169 error ("Invalid eol-type");
9170
9171 aliases = Fcons (name, Qnil);
9172
9173 if (NILP (eol_type))
9174 {
9175 eol_type = make_subsidiaries (name);
9176 for (i = 0; i < 3; i++)
9177 {
9178 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9179
9180 this_name = AREF (eol_type, i);
9181 this_aliases = Fcons (this_name, Qnil);
9182 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9183 this_spec = Fmake_vector (make_number (3), attrs);
9184 ASET (this_spec, 1, this_aliases);
9185 ASET (this_spec, 2, this_eol_type);
9186 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9187 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9188 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9189 if (NILP (val))
9190 Vcoding_system_alist
9191 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9192 Vcoding_system_alist);
9193 }
9194 }
9195
9196 spec_vec = Fmake_vector (make_number (3), attrs);
9197 ASET (spec_vec, 1, aliases);
9198 ASET (spec_vec, 2, eol_type);
9199
9200 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9201 Vcoding_system_list = Fcons (name, Vcoding_system_list);
9202 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9203 if (NILP (val))
9204 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9205 Vcoding_system_alist);
9206
9207 {
9208 int id = coding_categories[category].id;
9209
9210 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9211 setup_coding_system (name, &coding_categories[category]);
9212 }
9213
9214 return Qnil;
9215
9216 short_args:
9217 return Fsignal (Qwrong_number_of_arguments,
9218 Fcons (intern ("define-coding-system-internal"),
9219 make_number (nargs)));
9220 }
9221
9222
9223 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9224 3, 3, 0,
9225 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9226 (coding_system, prop, val)
9227 Lisp_Object coding_system, prop, val;
9228 {
9229 Lisp_Object spec, attrs;
9230
9231 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9232 attrs = AREF (spec, 0);
9233 if (EQ (prop, QCmnemonic))
9234 {
9235 if (! STRINGP (val))
9236 CHECK_CHARACTER (val);
9237 CODING_ATTR_MNEMONIC (attrs) = val;
9238 }
9239 else if (EQ (prop, QCdefalut_char))
9240 {
9241 if (NILP (val))
9242 val = make_number (' ');
9243 else
9244 CHECK_CHARACTER (val);
9245 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9246 }
9247 else if (EQ (prop, QCdecode_translation_table))
9248 {
9249 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9250 CHECK_SYMBOL (val);
9251 CODING_ATTR_DECODE_TBL (attrs) = val;
9252 }
9253 else if (EQ (prop, QCencode_translation_table))
9254 {
9255 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9256 CHECK_SYMBOL (val);
9257 CODING_ATTR_ENCODE_TBL (attrs) = val;
9258 }
9259 else if (EQ (prop, QCpost_read_conversion))
9260 {
9261 CHECK_SYMBOL (val);
9262 CODING_ATTR_POST_READ (attrs) = val;
9263 }
9264 else if (EQ (prop, QCpre_write_conversion))
9265 {
9266 CHECK_SYMBOL (val);
9267 CODING_ATTR_PRE_WRITE (attrs) = val;
9268 }
9269 else if (EQ (prop, QCascii_compatible_p))
9270 {
9271 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9272 }
9273
9274 CODING_ATTR_PLIST (attrs)
9275 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9276 return val;
9277 }
9278
9279
9280 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9281 Sdefine_coding_system_alias, 2, 2, 0,
9282 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9283 (alias, coding_system)
9284 Lisp_Object alias, coding_system;
9285 {
9286 Lisp_Object spec, aliases, eol_type, val;
9287
9288 CHECK_SYMBOL (alias);
9289 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9290 aliases = AREF (spec, 1);
9291 /* ALISES should be a list of length more than zero, and the first
9292 element is a base coding system. Append ALIAS at the tail of the
9293 list. */
9294 while (!NILP (XCDR (aliases)))
9295 aliases = XCDR (aliases);
9296 XSETCDR (aliases, Fcons (alias, Qnil));
9297
9298 eol_type = AREF (spec, 2);
9299 if (VECTORP (eol_type))
9300 {
9301 Lisp_Object subsidiaries;
9302 int i;
9303
9304 subsidiaries = make_subsidiaries (alias);
9305 for (i = 0; i < 3; i++)
9306 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9307 AREF (eol_type, i));
9308 }
9309
9310 Fputhash (alias, spec, Vcoding_system_hash_table);
9311 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9312 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9313 if (NILP (val))
9314 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9315 Vcoding_system_alist);
9316
9317 return Qnil;
9318 }
9319
9320 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9321 1, 1, 0,
9322 doc: /* Return the base of CODING-SYSTEM.
9323 Any alias or subsidiary coding system is not a base coding system. */)
9324 (coding_system)
9325 Lisp_Object coding_system;
9326 {
9327 Lisp_Object spec, attrs;
9328
9329 if (NILP (coding_system))
9330 return (Qno_conversion);
9331 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9332 attrs = AREF (spec, 0);
9333 return CODING_ATTR_BASE_NAME (attrs);
9334 }
9335
9336 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9337 1, 1, 0,
9338 doc: "Return the property list of CODING-SYSTEM.")
9339 (coding_system)
9340 Lisp_Object coding_system;
9341 {
9342 Lisp_Object spec, attrs;
9343
9344 if (NILP (coding_system))
9345 coding_system = Qno_conversion;
9346 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9347 attrs = AREF (spec, 0);
9348 return CODING_ATTR_PLIST (attrs);
9349 }
9350
9351
9352 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9353 1, 1, 0,
9354 doc: /* Return the list of aliases of CODING-SYSTEM. */)
9355 (coding_system)
9356 Lisp_Object coding_system;
9357 {
9358 Lisp_Object spec;
9359
9360 if (NILP (coding_system))
9361 coding_system = Qno_conversion;
9362 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9363 return AREF (spec, 1);
9364 }
9365
9366 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9367 Scoding_system_eol_type, 1, 1, 0,
9368 doc: /* Return eol-type of CODING-SYSTEM.
9369 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9370
9371 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9372 and CR respectively.
9373
9374 A vector value indicates that a format of end-of-line should be
9375 detected automatically. Nth element of the vector is the subsidiary
9376 coding system whose eol-type is N. */)
9377 (coding_system)
9378 Lisp_Object coding_system;
9379 {
9380 Lisp_Object spec, eol_type;
9381 int n;
9382
9383 if (NILP (coding_system))
9384 coding_system = Qno_conversion;
9385 if (! CODING_SYSTEM_P (coding_system))
9386 return Qnil;
9387 spec = CODING_SYSTEM_SPEC (coding_system);
9388 eol_type = AREF (spec, 2);
9389 if (VECTORP (eol_type))
9390 return Fcopy_sequence (eol_type);
9391 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9392 return make_number (n);
9393 }
9394
9395 #endif /* emacs */
9396
9397 \f
9398 /*** 9. Post-amble ***/
9399
9400 void
9401 init_coding_once ()
9402 {
9403 int i;
9404
9405 for (i = 0; i < coding_category_max; i++)
9406 {
9407 coding_categories[i].id = -1;
9408 coding_priorities[i] = i;
9409 }
9410
9411 /* ISO2022 specific initialize routine. */
9412 for (i = 0; i < 0x20; i++)
9413 iso_code_class[i] = ISO_control_0;
9414 for (i = 0x21; i < 0x7F; i++)
9415 iso_code_class[i] = ISO_graphic_plane_0;
9416 for (i = 0x80; i < 0xA0; i++)
9417 iso_code_class[i] = ISO_control_1;
9418 for (i = 0xA1; i < 0xFF; i++)
9419 iso_code_class[i] = ISO_graphic_plane_1;
9420 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9421 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9422 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9423 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9424 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9425 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9426 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9427 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9428 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9429
9430 for (i = 0; i < 256; i++)
9431 {
9432 emacs_mule_bytes[i] = 1;
9433 }
9434 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9435 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9436 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9437 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9438 }
9439
9440 #ifdef emacs
9441
9442 void
9443 syms_of_coding ()
9444 {
9445 staticpro (&Vcoding_system_hash_table);
9446 {
9447 Lisp_Object args[2];
9448 args[0] = QCtest;
9449 args[1] = Qeq;
9450 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9451 }
9452
9453 staticpro (&Vsjis_coding_system);
9454 Vsjis_coding_system = Qnil;
9455
9456 staticpro (&Vbig5_coding_system);
9457 Vbig5_coding_system = Qnil;
9458
9459 staticpro (&Vcode_conversion_reused_workbuf);
9460 Vcode_conversion_reused_workbuf = Qnil;
9461
9462 staticpro (&Vcode_conversion_workbuf_name);
9463 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9464
9465 reused_workbuf_in_use = 0;
9466
9467 DEFSYM (Qcharset, "charset");
9468 DEFSYM (Qtarget_idx, "target-idx");
9469 DEFSYM (Qcoding_system_history, "coding-system-history");
9470 Fset (Qcoding_system_history, Qnil);
9471
9472 /* Target FILENAME is the first argument. */
9473 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9474 /* Target FILENAME is the third argument. */
9475 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9476
9477 DEFSYM (Qcall_process, "call-process");
9478 /* Target PROGRAM is the first argument. */
9479 Fput (Qcall_process, Qtarget_idx, make_number (0));
9480
9481 DEFSYM (Qcall_process_region, "call-process-region");
9482 /* Target PROGRAM is the third argument. */
9483 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9484
9485 DEFSYM (Qstart_process, "start-process");
9486 /* Target PROGRAM is the third argument. */
9487 Fput (Qstart_process, Qtarget_idx, make_number (2));
9488
9489 DEFSYM (Qopen_network_stream, "open-network-stream");
9490 /* Target SERVICE is the fourth argument. */
9491 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9492
9493 DEFSYM (Qcoding_system, "coding-system");
9494 DEFSYM (Qcoding_aliases, "coding-aliases");
9495
9496 DEFSYM (Qeol_type, "eol-type");
9497 DEFSYM (Qunix, "unix");
9498 DEFSYM (Qdos, "dos");
9499
9500 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9501 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9502 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9503 DEFSYM (Qdefault_char, "default-char");
9504 DEFSYM (Qundecided, "undecided");
9505 DEFSYM (Qno_conversion, "no-conversion");
9506 DEFSYM (Qraw_text, "raw-text");
9507
9508 DEFSYM (Qiso_2022, "iso-2022");
9509
9510 DEFSYM (Qutf_8, "utf-8");
9511 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9512
9513 DEFSYM (Qutf_16, "utf-16");
9514 DEFSYM (Qbig, "big");
9515 DEFSYM (Qlittle, "little");
9516
9517 DEFSYM (Qshift_jis, "shift-jis");
9518 DEFSYM (Qbig5, "big5");
9519
9520 DEFSYM (Qcoding_system_p, "coding-system-p");
9521
9522 DEFSYM (Qcoding_system_error, "coding-system-error");
9523 Fput (Qcoding_system_error, Qerror_conditions,
9524 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9525 Fput (Qcoding_system_error, Qerror_message,
9526 build_string ("Invalid coding system"));
9527
9528 /* Intern this now in case it isn't already done.
9529 Setting this variable twice is harmless.
9530 But don't staticpro it here--that is done in alloc.c. */
9531 Qchar_table_extra_slots = intern ("char-table-extra-slots");
9532
9533 DEFSYM (Qtranslation_table, "translation-table");
9534 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9535 DEFSYM (Qtranslation_table_id, "translation-table-id");
9536 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9537 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9538
9539 DEFSYM (Qvalid_codes, "valid-codes");
9540
9541 DEFSYM (Qemacs_mule, "emacs-mule");
9542
9543 DEFSYM (QCcategory, ":category");
9544 DEFSYM (QCmnemonic, ":mnemonic");
9545 DEFSYM (QCdefalut_char, ":default-char");
9546 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9547 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9548 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9549 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9550 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9551
9552 Vcoding_category_table
9553 = Fmake_vector (make_number (coding_category_max), Qnil);
9554 staticpro (&Vcoding_category_table);
9555 /* Followings are target of code detection. */
9556 ASET (Vcoding_category_table, coding_category_iso_7,
9557 intern ("coding-category-iso-7"));
9558 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9559 intern ("coding-category-iso-7-tight"));
9560 ASET (Vcoding_category_table, coding_category_iso_8_1,
9561 intern ("coding-category-iso-8-1"));
9562 ASET (Vcoding_category_table, coding_category_iso_8_2,
9563 intern ("coding-category-iso-8-2"));
9564 ASET (Vcoding_category_table, coding_category_iso_7_else,
9565 intern ("coding-category-iso-7-else"));
9566 ASET (Vcoding_category_table, coding_category_iso_8_else,
9567 intern ("coding-category-iso-8-else"));
9568 ASET (Vcoding_category_table, coding_category_utf_8,
9569 intern ("coding-category-utf-8"));
9570 ASET (Vcoding_category_table, coding_category_utf_16_be,
9571 intern ("coding-category-utf-16-be"));
9572 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9573 intern ("coding-category-utf-16-auto"));
9574 ASET (Vcoding_category_table, coding_category_utf_16_le,
9575 intern ("coding-category-utf-16-le"));
9576 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9577 intern ("coding-category-utf-16-be-nosig"));
9578 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9579 intern ("coding-category-utf-16-le-nosig"));
9580 ASET (Vcoding_category_table, coding_category_charset,
9581 intern ("coding-category-charset"));
9582 ASET (Vcoding_category_table, coding_category_sjis,
9583 intern ("coding-category-sjis"));
9584 ASET (Vcoding_category_table, coding_category_big5,
9585 intern ("coding-category-big5"));
9586 ASET (Vcoding_category_table, coding_category_ccl,
9587 intern ("coding-category-ccl"));
9588 ASET (Vcoding_category_table, coding_category_emacs_mule,
9589 intern ("coding-category-emacs-mule"));
9590 /* Followings are NOT target of code detection. */
9591 ASET (Vcoding_category_table, coding_category_raw_text,
9592 intern ("coding-category-raw-text"));
9593 ASET (Vcoding_category_table, coding_category_undecided,
9594 intern ("coding-category-undecided"));
9595
9596 DEFSYM (Qinsufficient_source, "insufficient-source");
9597 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9598 DEFSYM (Qinvalid_source, "invalid-source");
9599 DEFSYM (Qinterrupted, "interrupted");
9600 DEFSYM (Qinsufficient_memory, "insufficient-memory");
9601 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9602
9603 defsubr (&Scoding_system_p);
9604 defsubr (&Sread_coding_system);
9605 defsubr (&Sread_non_nil_coding_system);
9606 defsubr (&Scheck_coding_system);
9607 defsubr (&Sdetect_coding_region);
9608 defsubr (&Sdetect_coding_string);
9609 defsubr (&Sfind_coding_systems_region_internal);
9610 defsubr (&Sunencodable_char_position);
9611 defsubr (&Scheck_coding_systems_region);
9612 defsubr (&Sdecode_coding_region);
9613 defsubr (&Sencode_coding_region);
9614 defsubr (&Sdecode_coding_string);
9615 defsubr (&Sencode_coding_string);
9616 defsubr (&Sdecode_sjis_char);
9617 defsubr (&Sencode_sjis_char);
9618 defsubr (&Sdecode_big5_char);
9619 defsubr (&Sencode_big5_char);
9620 defsubr (&Sset_terminal_coding_system_internal);
9621 defsubr (&Sset_safe_terminal_coding_system_internal);
9622 defsubr (&Sterminal_coding_system);
9623 defsubr (&Sset_keyboard_coding_system_internal);
9624 defsubr (&Skeyboard_coding_system);
9625 defsubr (&Sfind_operation_coding_system);
9626 defsubr (&Sset_coding_system_priority);
9627 defsubr (&Sdefine_coding_system_internal);
9628 defsubr (&Sdefine_coding_system_alias);
9629 defsubr (&Scoding_system_put);
9630 defsubr (&Scoding_system_base);
9631 defsubr (&Scoding_system_plist);
9632 defsubr (&Scoding_system_aliases);
9633 defsubr (&Scoding_system_eol_type);
9634 defsubr (&Scoding_system_priority_list);
9635
9636 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9637 doc: /* List of coding systems.
9638
9639 Do not alter the value of this variable manually. This variable should be
9640 updated by the functions `define-coding-system' and
9641 `define-coding-system-alias'. */);
9642 Vcoding_system_list = Qnil;
9643
9644 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9645 doc: /* Alist of coding system names.
9646 Each element is one element list of coding system name.
9647 This variable is given to `completing-read' as TABLE argument.
9648
9649 Do not alter the value of this variable manually. This variable should be
9650 updated by the functions `make-coding-system' and
9651 `define-coding-system-alias'. */);
9652 Vcoding_system_alist = Qnil;
9653
9654 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9655 doc: /* List of coding-categories (symbols) ordered by priority.
9656
9657 On detecting a coding system, Emacs tries code detection algorithms
9658 associated with each coding-category one by one in this order. When
9659 one algorithm agrees with a byte sequence of source text, the coding
9660 system bound to the corresponding coding-category is selected.
9661
9662 Don't modify this variable directly, but use `set-coding-priority'. */);
9663 {
9664 int i;
9665
9666 Vcoding_category_list = Qnil;
9667 for (i = coding_category_max - 1; i >= 0; i--)
9668 Vcoding_category_list
9669 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9670 Vcoding_category_list);
9671 }
9672
9673 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9674 doc: /* Specify the coding system for read operations.
9675 It is useful to bind this variable with `let', but do not set it globally.
9676 If the value is a coding system, it is used for decoding on read operation.
9677 If not, an appropriate element is used from one of the coding system alists:
9678 There are three such tables, `file-coding-system-alist',
9679 `process-coding-system-alist', and `network-coding-system-alist'. */);
9680 Vcoding_system_for_read = Qnil;
9681
9682 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9683 doc: /* Specify the coding system for write operations.
9684 Programs bind this variable with `let', but you should not set it globally.
9685 If the value is a coding system, it is used for encoding of output,
9686 when writing it to a file and when sending it to a file or subprocess.
9687
9688 If this does not specify a coding system, an appropriate element
9689 is used from one of the coding system alists:
9690 There are three such tables, `file-coding-system-alist',
9691 `process-coding-system-alist', and `network-coding-system-alist'.
9692 For output to files, if the above procedure does not specify a coding system,
9693 the value of `buffer-file-coding-system' is used. */);
9694 Vcoding_system_for_write = Qnil;
9695
9696 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9697 doc: /*
9698 Coding system used in the latest file or process I/O. */);
9699 Vlast_coding_system_used = Qnil;
9700
9701 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9702 doc: /*
9703 Error status of the last code conversion.
9704
9705 When an error was detected in the last code conversion, this variable
9706 is set to one of the following symbols.
9707 `insufficient-source'
9708 `inconsistent-eol'
9709 `invalid-source'
9710 `interrupted'
9711 `insufficient-memory'
9712 When no error was detected, the value doesn't change. So, to check
9713 the error status of a code conversion by this variable, you must
9714 explicitly set this variable to nil before performing code
9715 conversion. */);
9716 Vlast_code_conversion_error = Qnil;
9717
9718 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9719 doc: /*
9720 *Non-nil means always inhibit code conversion of end-of-line format.
9721 See info node `Coding Systems' and info node `Text and Binary' concerning
9722 such conversion. */);
9723 inhibit_eol_conversion = 0;
9724
9725 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9726 doc: /*
9727 Non-nil means process buffer inherits coding system of process output.
9728 Bind it to t if the process output is to be treated as if it were a file
9729 read from some filesystem. */);
9730 inherit_process_coding_system = 0;
9731
9732 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9733 doc: /*
9734 Alist to decide a coding system to use for a file I/O operation.
9735 The format is ((PATTERN . VAL) ...),
9736 where PATTERN is a regular expression matching a file name,
9737 VAL is a coding system, a cons of coding systems, or a function symbol.
9738 If VAL is a coding system, it is used for both decoding and encoding
9739 the file contents.
9740 If VAL is a cons of coding systems, the car part is used for decoding,
9741 and the cdr part is used for encoding.
9742 If VAL is a function symbol, the function must return a coding system
9743 or a cons of coding systems which are used as above. The function is
9744 called with an argument that is a list of the arguments with which
9745 `find-operation-coding-system' was called. If the function can't decide
9746 a coding system, it can return `undecided' so that the normal
9747 code-detection is performed.
9748
9749 See also the function `find-operation-coding-system'
9750 and the variable `auto-coding-alist'. */);
9751 Vfile_coding_system_alist = Qnil;
9752
9753 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9754 doc: /*
9755 Alist to decide a coding system to use for a process I/O operation.
9756 The format is ((PATTERN . VAL) ...),
9757 where PATTERN is a regular expression matching a program name,
9758 VAL is a coding system, a cons of coding systems, or a function symbol.
9759 If VAL is a coding system, it is used for both decoding what received
9760 from the program and encoding what sent to the program.
9761 If VAL is a cons of coding systems, the car part is used for decoding,
9762 and the cdr part is used for encoding.
9763 If VAL is a function symbol, the function must return a coding system
9764 or a cons of coding systems which are used as above.
9765
9766 See also the function `find-operation-coding-system'. */);
9767 Vprocess_coding_system_alist = Qnil;
9768
9769 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9770 doc: /*
9771 Alist to decide a coding system to use for a network I/O operation.
9772 The format is ((PATTERN . VAL) ...),
9773 where PATTERN is a regular expression matching a network service name
9774 or is a port number to connect to,
9775 VAL is a coding system, a cons of coding systems, or a function symbol.
9776 If VAL is a coding system, it is used for both decoding what received
9777 from the network stream and encoding what sent to the network stream.
9778 If VAL is a cons of coding systems, the car part is used for decoding,
9779 and the cdr part is used for encoding.
9780 If VAL is a function symbol, the function must return a coding system
9781 or a cons of coding systems which are used as above.
9782
9783 See also the function `find-operation-coding-system'. */);
9784 Vnetwork_coding_system_alist = Qnil;
9785
9786 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9787 doc: /* Coding system to use with system messages.
9788 Also used for decoding keyboard input on X Window system. */);
9789 Vlocale_coding_system = Qnil;
9790
9791 /* The eol mnemonics are reset in startup.el system-dependently. */
9792 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9793 doc: /*
9794 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9795 eol_mnemonic_unix = build_string (":");
9796
9797 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9798 doc: /*
9799 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9800 eol_mnemonic_dos = build_string ("\\");
9801
9802 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9803 doc: /*
9804 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9805 eol_mnemonic_mac = build_string ("/");
9806
9807 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9808 doc: /*
9809 *String displayed in mode line when end-of-line format is not yet determined. */);
9810 eol_mnemonic_undecided = build_string (":");
9811
9812 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9813 doc: /*
9814 *Non-nil enables character translation while encoding and decoding. */);
9815 Venable_character_translation = Qt;
9816
9817 DEFVAR_LISP ("standard-translation-table-for-decode",
9818 &Vstandard_translation_table_for_decode,
9819 doc: /* Table for translating characters while decoding. */);
9820 Vstandard_translation_table_for_decode = Qnil;
9821
9822 DEFVAR_LISP ("standard-translation-table-for-encode",
9823 &Vstandard_translation_table_for_encode,
9824 doc: /* Table for translating characters while encoding. */);
9825 Vstandard_translation_table_for_encode = Qnil;
9826
9827 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9828 doc: /* Alist of charsets vs revision numbers.
9829 While encoding, if a charset (car part of an element) is found,
9830 designate it with the escape sequence identifying revision (cdr part
9831 of the element). */);
9832 Vcharset_revision_table = Qnil;
9833
9834 DEFVAR_LISP ("default-process-coding-system",
9835 &Vdefault_process_coding_system,
9836 doc: /* Cons of coding systems used for process I/O by default.
9837 The car part is used for decoding a process output,
9838 the cdr part is used for encoding a text to be sent to a process. */);
9839 Vdefault_process_coding_system = Qnil;
9840
9841 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9842 doc: /*
9843 Table of extra Latin codes in the range 128..159 (inclusive).
9844 This is a vector of length 256.
9845 If Nth element is non-nil, the existence of code N in a file
9846 \(or output of subprocess) doesn't prevent it to be detected as
9847 a coding system of ISO 2022 variant which has a flag
9848 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9849 or reading output of a subprocess.
9850 Only 128th through 159th elements has a meaning. */);
9851 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9852
9853 DEFVAR_LISP ("select-safe-coding-system-function",
9854 &Vselect_safe_coding_system_function,
9855 doc: /*
9856 Function to call to select safe coding system for encoding a text.
9857
9858 If set, this function is called to force a user to select a proper
9859 coding system which can encode the text in the case that a default
9860 coding system used in each operation can't encode the text. The
9861 function should take care that the buffer is not modified while
9862 the coding system is being selected.
9863
9864 The default value is `select-safe-coding-system' (which see). */);
9865 Vselect_safe_coding_system_function = Qnil;
9866
9867 DEFVAR_BOOL ("coding-system-require-warning",
9868 &coding_system_require_warning,
9869 doc: /* Internal use only.
9870 If non-nil, on writing a file, `select-safe-coding-system-function' is
9871 called even if `coding-system-for-write' is non-nil. The command
9872 `universal-coding-system-argument' binds this variable to t temporarily. */);
9873 coding_system_require_warning = 0;
9874
9875
9876 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9877 &inhibit_iso_escape_detection,
9878 doc: /*
9879 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9880
9881 By default, on reading a file, Emacs tries to detect how the text is
9882 encoded. This code detection is sensitive to escape sequences. If
9883 the sequence is valid as ISO2022, the code is determined as one of
9884 the ISO2022 encodings, and the file is decoded by the corresponding
9885 coding system (e.g. `iso-2022-7bit').
9886
9887 However, there may be a case that you want to read escape sequences in
9888 a file as is. In such a case, you can set this variable to non-nil.
9889 Then, as the code detection ignores any escape sequences, no file is
9890 detected as encoded in some ISO2022 encoding. The result is that all
9891 escape sequences become visible in a buffer.
9892
9893 The default value is nil, and it is strongly recommended not to change
9894 it. That is because many Emacs Lisp source files that contain
9895 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9896 in Emacs's distribution, and they won't be decoded correctly on
9897 reading if you suppress escape sequence detection.
9898
9899 The other way to read escape sequences in a file without decoding is
9900 to explicitly specify some coding system that doesn't use ISO2022's
9901 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9902 inhibit_iso_escape_detection = 0;
9903
9904 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9905 doc: /* Char table for translating self-inserting characters.
9906 This is applied to the result of input methods, not their input. See also
9907 `keyboard-translate-table'. */);
9908 Vtranslation_table_for_input = Qnil;
9909
9910 {
9911 Lisp_Object args[coding_arg_max];
9912 Lisp_Object plist[16];
9913 int i;
9914
9915 for (i = 0; i < coding_arg_max; i++)
9916 args[i] = Qnil;
9917
9918 plist[0] = intern (":name");
9919 plist[1] = args[coding_arg_name] = Qno_conversion;
9920 plist[2] = intern (":mnemonic");
9921 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9922 plist[4] = intern (":coding-type");
9923 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9924 plist[6] = intern (":ascii-compatible-p");
9925 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9926 plist[8] = intern (":default-char");
9927 plist[9] = args[coding_arg_default_char] = make_number (0);
9928 plist[10] = intern (":for-unibyte");
9929 plist[11] = args[coding_arg_for_unibyte] = Qt;
9930 plist[12] = intern (":docstring");
9931 plist[13] = build_string ("Do no conversion.\n\
9932 \n\
9933 When you visit a file with this coding, the file is read into a\n\
9934 unibyte buffer as is, thus each byte of a file is treated as a\n\
9935 character.");
9936 plist[14] = intern (":eol-type");
9937 plist[15] = args[coding_arg_eol_type] = Qunix;
9938 args[coding_arg_plist] = Flist (16, plist);
9939 Fdefine_coding_system_internal (coding_arg_max, args);
9940
9941 plist[1] = args[coding_arg_name] = Qundecided;
9942 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9943 plist[5] = args[coding_arg_coding_type] = Qundecided;
9944 /* This is already set.
9945 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9946 plist[8] = intern (":charset-list");
9947 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9948 plist[11] = args[coding_arg_for_unibyte] = Qnil;
9949 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9950 plist[15] = args[coding_arg_eol_type] = Qnil;
9951 args[coding_arg_plist] = Flist (16, plist);
9952 Fdefine_coding_system_internal (coding_arg_max, args);
9953 }
9954
9955 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9956
9957 {
9958 int i;
9959
9960 for (i = 0; i < coding_category_max; i++)
9961 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9962 }
9963 #if defined (MSDOS) || defined (WINDOWSNT)
9964 system_eol_type = Qdos;
9965 #else
9966 system_eol_type = Qunix;
9967 #endif
9968 staticpro (&system_eol_type);
9969 }
9970
9971 char *
9972 emacs_strerror (error_number)
9973 int error_number;
9974 {
9975 char *str;
9976
9977 synchronize_system_messages_locale ();
9978 str = strerror (error_number);
9979
9980 if (! NILP (Vlocale_coding_system))
9981 {
9982 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9983 Vlocale_coding_system,
9984 0);
9985 str = (char *) SDATA (dec);
9986 }
9987
9988 return str;
9989 }
9990
9991 #endif /* emacs */
9992
9993 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9994 (do not change this comment) */