Include "font.h".
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
10
11 This file is part of GNU Emacs.
12
13 GNU Emacs is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs; see the file COPYING. If not, write to
25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 Boston, MA 02110-1301, USA. */
27
28 /*** TABLE OF CONTENTS ***
29
30 0. General comments
31 1. Preamble
32 2. Emacs' internal format (emacs-utf-8) handlers
33 3. UTF-8 handlers
34 4. UTF-16 handlers
35 5. Charset-base coding systems handlers
36 6. emacs-mule (old Emacs' internal format) handlers
37 7. ISO2022 handlers
38 8. Shift-JIS and BIG5 handlers
39 9. CCL handlers
40 10. C library functions
41 11. Emacs Lisp library functions
42 12. Postamble
43
44 */
45
46 /*** 0. General comments ***
47
48
49 CODING SYSTEM
50
51 A coding system is an object for an encoding mechanism that contains
52 information about how to convert byte sequences to character
53 sequences and vice versa. When we say "decode", it means converting
54 a byte sequence of a specific coding system into a character
55 sequence that is represented by Emacs' internal coding system
56 `emacs-utf-8', and when we say "encode", it means converting a
57 character sequence of emacs-utf-8 to a byte sequence of a specific
58 coding system.
59
60 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
61 C level, a coding system is represented by a vector of attributes
62 stored in the hash table Vcharset_hash_table. The conversion from
63 coding system symbol to attributes vector is done by looking up
64 Vcharset_hash_table by the symbol.
65
66 Coding systems are classified into the following types depending on
67 the encoding mechanism. Here's a brief description of the types.
68
69 o UTF-8
70
71 o UTF-16
72
73 o Charset-base coding system
74
75 A coding system defined by one or more (coded) character sets.
76 Decoding and encoding are done by a code converter defined for each
77 character set.
78
79 o Old Emacs internal format (emacs-mule)
80
81 The coding system adopted by old versions of Emacs (20 and 21).
82
83 o ISO2022-base coding system
84
85 The most famous coding system for multiple character sets. X's
86 Compound Text, various EUCs (Extended Unix Code), and coding systems
87 used in the Internet communication such as ISO-2022-JP are all
88 variants of ISO2022.
89
90 o SJIS (or Shift-JIS or MS-Kanji-Code)
91
92 A coding system to encode character sets: ASCII, JISX0201, and
93 JISX0208. Widely used for PC's in Japan. Details are described in
94 section 8.
95
96 o BIG5
97
98 A coding system to encode character sets: ASCII and Big5. Widely
99 used for Chinese (mainly in Taiwan and Hong Kong). Details are
100 described in section 8. In this file, when we write "big5" (all
101 lowercase), we mean the coding system, and when we write "Big5"
102 (capitalized), we mean the character set.
103
104 o CCL
105
106 If a user wants to decode/encode text encoded in a coding system
107 not listed above, he can supply a decoder and an encoder for it in
108 CCL (Code Conversion Language) programs. Emacs executes the CCL
109 program while decoding/encoding.
110
111 o Raw-text
112
113 A coding system for text containing raw eight-bit data. Emacs
114 treats each byte of source text as a character (except for
115 end-of-line conversion).
116
117 o No-conversion
118
119 Like raw text, but don't do end-of-line conversion.
120
121
122 END-OF-LINE FORMAT
123
124 How text end-of-line is encoded depends on operating system. For
125 instance, Unix's format is just one byte of LF (line-feed) code,
126 whereas DOS's format is two-byte sequence of `carriage-return' and
127 `line-feed' codes. MacOS's format is usually one byte of
128 `carriage-return'.
129
130 Since text character encoding and end-of-line encoding are
131 independent, any coding system described above can take any format
132 of end-of-line (except for no-conversion).
133
134 STRUCT CODING_SYSTEM
135
136 Before using a coding system for code conversion (i.e. decoding and
137 encoding), we setup a structure of type `struct coding_system'.
138 This structure keeps various information about a specific code
139 conversion (e.g. the location of source and destination data).
140
141 */
142
143 /* COMMON MACROS */
144
145
146 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
147
148 These functions check if a byte sequence specified as a source in
149 CODING conforms to the format of XXX, and update the members of
150 DETECT_INFO.
151
152 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
153
154 Below is the template of these functions. */
155
156 #if 0
157 static int
158 detect_coding_XXX (coding, detect_info)
159 struct coding_system *coding;
160 struct coding_detection_info *detect_info;
161 {
162 const unsigned char *src = coding->source;
163 const unsigned char *src_end = coding->source + coding->src_bytes;
164 int multibytep = coding->src_multibyte;
165 int consumed_chars = 0;
166 int found = 0;
167 ...;
168
169 while (1)
170 {
171 /* Get one byte from the source. If the souce is exausted, jump
172 to no_more_source:. */
173 ONE_MORE_BYTE (c);
174
175 if (! __C_conforms_to_XXX___ (c))
176 break;
177 if (! __C_strongly_suggests_XXX__ (c))
178 found = CATEGORY_MASK_XXX;
179 }
180 /* The byte sequence is invalid for XXX. */
181 detect_info->rejected |= CATEGORY_MASK_XXX;
182 return 0;
183
184 no_more_source:
185 /* The source exausted successfully. */
186 detect_info->found |= found;
187 return 1;
188 }
189 #endif
190
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
197
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
202
203 Below is the template of these functions. */
204
205 #if 0
206 static void
207 decode_coding_XXXX (coding)
208 struct coding_system *coding;
209 {
210 const unsigned char *src = coding->source + coding->consumed;
211 const unsigned char *src_end = coding->source + coding->src_bytes;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 const unsigned char *src_base;
216 /* A buffer to produce decoded characters. */
217 int *charbuf = coding->charbuf + coding->charbuf_used;
218 int *charbuf_end = coding->charbuf + coding->charbuf_size;
219 int multibytep = coding->src_multibyte;
220
221 while (1)
222 {
223 src_base = src;
224 if (charbuf < charbuf_end)
225 /* No more room to produce a decoded character. */
226 break;
227 ONE_MORE_BYTE (c);
228 /* Decode it. */
229 }
230
231 no_more_source:
232 if (src_base < src_end
233 && coding->mode & CODING_MODE_LAST_BLOCK)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base < src_end && charbuf < charbuf_end)
237 *charbuf++ = *src_base++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding->consumed = coding->consumed_char = src_base - coding->source;
241 /* Remember how many characters we produced. */
242 coding->charbuf_used = charbuf - coding->charbuf;
243 }
244 #endif
245
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
252
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
257
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce a encoded text until it
260 reaches at the head of not-yet-encoded source text.
261
262 Below is a template of these functions. */
263 #if 0
264 static void
265 encode_coding_XXX (coding)
266 struct coding_system *coding;
267 {
268 int multibytep = coding->dst_multibyte;
269 int *charbuf = coding->charbuf;
270 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
271 unsigned char *dst = coding->destination + coding->produced;
272 unsigned char *dst_end = coding->destination + coding->dst_bytes;
273 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
274 int produced_chars = 0;
275
276 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 {
278 int c = *charbuf;
279 /* Encode C into DST, and increment DST. */
280 }
281 label_no_more_destination:
282 /* How many chars and bytes we produced. */
283 coding->produced_char += produced_chars;
284 coding->produced = dst - coding->destination;
285 }
286 #endif
287
288 \f
289 /*** 1. Preamble ***/
290
291 #include <config.h>
292 #include <stdio.h>
293
294 #include "lisp.h"
295 #include "buffer.h"
296 #include "character.h"
297 #include "charset.h"
298 #include "ccl.h"
299 #include "composite.h"
300 #include "coding.h"
301 #include "window.h"
302
303 Lisp_Object Vcoding_system_hash_table;
304
305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
306 Lisp_Object Qunix, Qdos;
307 extern Lisp_Object Qmac; /* frame.c */
308 Lisp_Object Qbuffer_file_coding_system;
309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
310 Lisp_Object Qdefault_char;
311 Lisp_Object Qno_conversion, Qundecided;
312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
313 Lisp_Object Qbig, Qlittle;
314 Lisp_Object Qcoding_system_history;
315 Lisp_Object Qvalid_codes;
316 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
317 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
318 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
319 Lisp_Object QCascii_compatible_p;
320
321 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
322 Lisp_Object Qcall_process, Qcall_process_region;
323 Lisp_Object Qstart_process, Qopen_network_stream;
324 Lisp_Object Qtarget_idx;
325
326 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
327 Lisp_Object Qinterrupted, Qinsufficient_memory;
328
329 /* If a symbol has this property, evaluate the value to define the
330 symbol as a coding system. */
331 static Lisp_Object Qcoding_system_define_form;
332
333 int coding_system_require_warning;
334
335 Lisp_Object Vselect_safe_coding_system_function;
336
337 /* Mnemonic string for each format of end-of-line. */
338 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
339 /* Mnemonic string to indicate format of end-of-line is not yet
340 decided. */
341 Lisp_Object eol_mnemonic_undecided;
342
343 /* Format of end-of-line decided by system. This is Qunix on
344 Unix and Mac, Qdos on DOS/Windows.
345 This has an effect only for external encoding (i.e. for output to
346 file and process), not for in-buffer or Lisp string encoding. */
347 static Lisp_Object system_eol_type;
348
349 #ifdef emacs
350
351 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
352
353 Lisp_Object Qcoding_system_p, Qcoding_system_error;
354
355 /* Coding system emacs-mule and raw-text are for converting only
356 end-of-line format. */
357 Lisp_Object Qemacs_mule, Qraw_text;
358 Lisp_Object Qutf_8_emacs;
359
360 /* Coding-systems are handed between Emacs Lisp programs and C internal
361 routines by the following three variables. */
362 /* Coding-system for reading files and receiving data from process. */
363 Lisp_Object Vcoding_system_for_read;
364 /* Coding-system for writing files and sending data to process. */
365 Lisp_Object Vcoding_system_for_write;
366 /* Coding-system actually used in the latest I/O. */
367 Lisp_Object Vlast_coding_system_used;
368 /* Set to non-nil when an error is detected while code conversion. */
369 Lisp_Object Vlast_code_conversion_error;
370 /* A vector of length 256 which contains information about special
371 Latin codes (especially for dealing with Microsoft codes). */
372 Lisp_Object Vlatin_extra_code_table;
373
374 /* Flag to inhibit code conversion of end-of-line format. */
375 int inhibit_eol_conversion;
376
377 /* Flag to inhibit ISO2022 escape sequence detection. */
378 int inhibit_iso_escape_detection;
379
380 /* Flag to make buffer-file-coding-system inherit from process-coding. */
381 int inherit_process_coding_system;
382
383 /* Coding system to be used to encode text for terminal display. */
384 struct coding_system terminal_coding;
385
386 /* Coding system to be used to encode text for terminal display when
387 terminal coding system is nil. */
388 struct coding_system safe_terminal_coding;
389
390 /* Coding system of what is sent from terminal keyboard. */
391 struct coding_system keyboard_coding;
392
393 Lisp_Object Vfile_coding_system_alist;
394 Lisp_Object Vprocess_coding_system_alist;
395 Lisp_Object Vnetwork_coding_system_alist;
396
397 Lisp_Object Vlocale_coding_system;
398
399 #endif /* emacs */
400
401 /* Flag to tell if we look up translation table on character code
402 conversion. */
403 Lisp_Object Venable_character_translation;
404 /* Standard translation table to look up on decoding (reading). */
405 Lisp_Object Vstandard_translation_table_for_decode;
406 /* Standard translation table to look up on encoding (writing). */
407 Lisp_Object Vstandard_translation_table_for_encode;
408
409 Lisp_Object Qtranslation_table;
410 Lisp_Object Qtranslation_table_id;
411 Lisp_Object Qtranslation_table_for_decode;
412 Lisp_Object Qtranslation_table_for_encode;
413
414 /* Alist of charsets vs revision number. */
415 static Lisp_Object Vcharset_revision_table;
416
417 /* Default coding systems used for process I/O. */
418 Lisp_Object Vdefault_process_coding_system;
419
420 /* Char table for translating Quail and self-inserting input. */
421 Lisp_Object Vtranslation_table_for_input;
422
423 /* Two special coding systems. */
424 Lisp_Object Vsjis_coding_system;
425 Lisp_Object Vbig5_coding_system;
426
427 /* ISO2022 section */
428
429 #define CODING_ISO_INITIAL(coding, reg) \
430 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
431 coding_attr_iso_initial), \
432 reg)))
433
434
435 #define CODING_ISO_REQUEST(coding, charset_id) \
436 ((charset_id <= (coding)->max_charset_id \
437 ? (coding)->safe_charsets[charset_id] \
438 : -1))
439
440
441 #define CODING_ISO_FLAGS(coding) \
442 ((coding)->spec.iso_2022.flags)
443 #define CODING_ISO_DESIGNATION(coding, reg) \
444 ((coding)->spec.iso_2022.current_designation[reg])
445 #define CODING_ISO_INVOCATION(coding, plane) \
446 ((coding)->spec.iso_2022.current_invocation[plane])
447 #define CODING_ISO_SINGLE_SHIFTING(coding) \
448 ((coding)->spec.iso_2022.single_shifting)
449 #define CODING_ISO_BOL(coding) \
450 ((coding)->spec.iso_2022.bol)
451 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
452 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
453
454 /* Control characters of ISO2022. */
455 /* code */ /* function */
456 #define ISO_CODE_LF 0x0A /* line-feed */
457 #define ISO_CODE_CR 0x0D /* carriage-return */
458 #define ISO_CODE_SO 0x0E /* shift-out */
459 #define ISO_CODE_SI 0x0F /* shift-in */
460 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
461 #define ISO_CODE_ESC 0x1B /* escape */
462 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
463 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
464 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
465
466 /* All code (1-byte) of ISO2022 is classified into one of the
467 followings. */
468 enum iso_code_class_type
469 {
470 ISO_control_0, /* Control codes in the range
471 0x00..0x1F and 0x7F, except for the
472 following 5 codes. */
473 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
474 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
475 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
476 ISO_escape, /* ISO_CODE_SO (0x1B) */
477 ISO_control_1, /* Control codes in the range
478 0x80..0x9F, except for the
479 following 3 codes. */
480 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
481 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
482 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
483 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
484 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
485 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
486 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
487 };
488
489 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
490 `iso-flags' attribute of an iso2022 coding system. */
491
492 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
493 instead of the correct short-form sequence (e.g. ESC $ A). */
494 #define CODING_ISO_FLAG_LONG_FORM 0x0001
495
496 /* If set, reset graphic planes and registers at end-of-line to the
497 initial state. */
498 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
499
500 /* If set, reset graphic planes and registers before any control
501 characters to the initial state. */
502 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
503
504 /* If set, encode by 7-bit environment. */
505 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
506
507 /* If set, use locking-shift function. */
508 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
509
510 /* If set, use single-shift function. Overwrite
511 CODING_ISO_FLAG_LOCKING_SHIFT. */
512 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
513
514 /* If set, use designation escape sequence. */
515 #define CODING_ISO_FLAG_DESIGNATION 0x0040
516
517 /* If set, produce revision number sequence. */
518 #define CODING_ISO_FLAG_REVISION 0x0080
519
520 /* If set, produce ISO6429's direction specifying sequence. */
521 #define CODING_ISO_FLAG_DIRECTION 0x0100
522
523 /* If set, assume designation states are reset at beginning of line on
524 output. */
525 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
526
527 /* If set, designation sequence should be placed at beginning of line
528 on output. */
529 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
530
531 /* If set, do not encode unsafe charactes on output. */
532 #define CODING_ISO_FLAG_SAFE 0x0800
533
534 /* If set, extra latin codes (128..159) are accepted as a valid code
535 on input. */
536 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
537
538 #define CODING_ISO_FLAG_COMPOSITION 0x2000
539
540 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
541
542 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
543
544 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
545
546 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
547
548 /* A character to be produced on output if encoding of the original
549 character is prohibited by CODING_ISO_FLAG_SAFE. */
550 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
551
552
553 /* UTF-16 section */
554 #define CODING_UTF_16_BOM(coding) \
555 ((coding)->spec.utf_16.bom)
556
557 #define CODING_UTF_16_ENDIAN(coding) \
558 ((coding)->spec.utf_16.endian)
559
560 #define CODING_UTF_16_SURROGATE(coding) \
561 ((coding)->spec.utf_16.surrogate)
562
563
564 /* CCL section */
565 #define CODING_CCL_DECODER(coding) \
566 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
567 #define CODING_CCL_ENCODER(coding) \
568 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
569 #define CODING_CCL_VALIDS(coding) \
570 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
571
572 /* Index for each coding category in `coding_categories' */
573
574 enum coding_category
575 {
576 coding_category_iso_7,
577 coding_category_iso_7_tight,
578 coding_category_iso_8_1,
579 coding_category_iso_8_2,
580 coding_category_iso_7_else,
581 coding_category_iso_8_else,
582 coding_category_utf_8,
583 coding_category_utf_16_auto,
584 coding_category_utf_16_be,
585 coding_category_utf_16_le,
586 coding_category_utf_16_be_nosig,
587 coding_category_utf_16_le_nosig,
588 coding_category_charset,
589 coding_category_sjis,
590 coding_category_big5,
591 coding_category_ccl,
592 coding_category_emacs_mule,
593 /* All above are targets of code detection. */
594 coding_category_raw_text,
595 coding_category_undecided,
596 coding_category_max
597 };
598
599 /* Definitions of flag bits used in detect_coding_XXXX. */
600 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
601 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
602 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
603 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
604 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
605 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
606 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
607 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
608 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
609 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
610 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
611 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
612 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
613 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
614 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
615 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
616 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
617 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
618
619 /* This value is returned if detect_coding_mask () find nothing other
620 than ASCII characters. */
621 #define CATEGORY_MASK_ANY \
622 (CATEGORY_MASK_ISO_7 \
623 | CATEGORY_MASK_ISO_7_TIGHT \
624 | CATEGORY_MASK_ISO_8_1 \
625 | CATEGORY_MASK_ISO_8_2 \
626 | CATEGORY_MASK_ISO_7_ELSE \
627 | CATEGORY_MASK_ISO_8_ELSE \
628 | CATEGORY_MASK_UTF_8 \
629 | CATEGORY_MASK_UTF_16_BE \
630 | CATEGORY_MASK_UTF_16_LE \
631 | CATEGORY_MASK_UTF_16_BE_NOSIG \
632 | CATEGORY_MASK_UTF_16_LE_NOSIG \
633 | CATEGORY_MASK_CHARSET \
634 | CATEGORY_MASK_SJIS \
635 | CATEGORY_MASK_BIG5 \
636 | CATEGORY_MASK_CCL \
637 | CATEGORY_MASK_EMACS_MULE)
638
639
640 #define CATEGORY_MASK_ISO_7BIT \
641 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
642
643 #define CATEGORY_MASK_ISO_8BIT \
644 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
645
646 #define CATEGORY_MASK_ISO_ELSE \
647 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
648
649 #define CATEGORY_MASK_ISO_ESCAPE \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_7_ELSE \
653 | CATEGORY_MASK_ISO_8_ELSE)
654
655 #define CATEGORY_MASK_ISO \
656 ( CATEGORY_MASK_ISO_7BIT \
657 | CATEGORY_MASK_ISO_8BIT \
658 | CATEGORY_MASK_ISO_ELSE)
659
660 #define CATEGORY_MASK_UTF_16 \
661 (CATEGORY_MASK_UTF_16_BE \
662 | CATEGORY_MASK_UTF_16_LE \
663 | CATEGORY_MASK_UTF_16_BE_NOSIG \
664 | CATEGORY_MASK_UTF_16_LE_NOSIG)
665
666
667 /* List of symbols `coding-category-xxx' ordered by priority. This
668 variable is exposed to Emacs Lisp. */
669 static Lisp_Object Vcoding_category_list;
670
671 /* Table of coding categories (Lisp symbols). This variable is for
672 internal use oly. */
673 static Lisp_Object Vcoding_category_table;
674
675 /* Table of coding-categories ordered by priority. */
676 static enum coding_category coding_priorities[coding_category_max];
677
678 /* Nth element is a coding context for the coding system bound to the
679 Nth coding category. */
680 static struct coding_system coding_categories[coding_category_max];
681
682 /*** Commonly used macros and functions ***/
683
684 #ifndef min
685 #define min(a, b) ((a) < (b) ? (a) : (b))
686 #endif
687 #ifndef max
688 #define max(a, b) ((a) > (b) ? (a) : (b))
689 #endif
690
691 #define CODING_GET_INFO(coding, attrs, charset_list) \
692 do { \
693 (attrs) = CODING_ID_ATTRS ((coding)->id); \
694 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
695 } while (0)
696
697
698 /* Safely get one byte from the source text pointed by SRC which ends
699 at SRC_END, and set C to that byte. If there are not enough bytes
700 in the source, it jumps to `no_more_source'. If multibytep is
701 nonzero, and a multibyte character is found at SRC, set C to the
702 negative value of the character code. The caller should declare
703 and set these variables appropriately in advance:
704 src, src_end, multibytep */
705
706 #define ONE_MORE_BYTE(c) \
707 do { \
708 if (src == src_end) \
709 { \
710 if (src_base < src) \
711 record_conversion_result \
712 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
713 goto no_more_source; \
714 } \
715 c = *src++; \
716 if (multibytep && (c & 0x80)) \
717 { \
718 if ((c & 0xFE) == 0xC0) \
719 c = ((c & 1) << 6) | *src++; \
720 else \
721 { \
722 src--; \
723 c = - string_char (src, &src, NULL); \
724 record_conversion_result \
725 (coding, CODING_RESULT_INVALID_SRC); \
726 } \
727 } \
728 consumed_chars++; \
729 } while (0)
730
731
732 #define ONE_MORE_BYTE_NO_CHECK(c) \
733 do { \
734 c = *src++; \
735 if (multibytep && (c & 0x80)) \
736 { \
737 if ((c & 0xFE) == 0xC0) \
738 c = ((c & 1) << 6) | *src++; \
739 else \
740 { \
741 src--; \
742 c = - string_char (src, &src, NULL); \
743 record_conversion_result \
744 (coding, CODING_RESULT_INVALID_SRC); \
745 } \
746 } \
747 consumed_chars++; \
748 } while (0)
749
750
751 /* Store a byte C in the place pointed by DST and increment DST to the
752 next free point, and increment PRODUCED_CHARS. The caller should
753 assure that C is 0..127, and declare and set the variable `dst'
754 appropriately in advance.
755 */
756
757
758 #define EMIT_ONE_ASCII_BYTE(c) \
759 do { \
760 produced_chars++; \
761 *dst++ = (c); \
762 } while (0)
763
764
765 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
766
767 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
768 do { \
769 produced_chars += 2; \
770 *dst++ = (c1), *dst++ = (c2); \
771 } while (0)
772
773
774 /* Store a byte C in the place pointed by DST and increment DST to the
775 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
776 nonzero, store in an appropriate multibyte from. The caller should
777 declare and set the variables `dst' and `multibytep' appropriately
778 in advance. */
779
780 #define EMIT_ONE_BYTE(c) \
781 do { \
782 produced_chars++; \
783 if (multibytep) \
784 { \
785 int ch = (c); \
786 if (ch >= 0x80) \
787 ch = BYTE8_TO_CHAR (ch); \
788 CHAR_STRING_ADVANCE (ch, dst); \
789 } \
790 else \
791 *dst++ = (c); \
792 } while (0)
793
794
795 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
796
797 #define EMIT_TWO_BYTES(c1, c2) \
798 do { \
799 produced_chars += 2; \
800 if (multibytep) \
801 { \
802 int ch; \
803 \
804 ch = (c1); \
805 if (ch >= 0x80) \
806 ch = BYTE8_TO_CHAR (ch); \
807 CHAR_STRING_ADVANCE (ch, dst); \
808 ch = (c2); \
809 if (ch >= 0x80) \
810 ch = BYTE8_TO_CHAR (ch); \
811 CHAR_STRING_ADVANCE (ch, dst); \
812 } \
813 else \
814 { \
815 *dst++ = (c1); \
816 *dst++ = (c2); \
817 } \
818 } while (0)
819
820
821 #define EMIT_THREE_BYTES(c1, c2, c3) \
822 do { \
823 EMIT_ONE_BYTE (c1); \
824 EMIT_TWO_BYTES (c2, c3); \
825 } while (0)
826
827
828 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
829 do { \
830 EMIT_TWO_BYTES (c1, c2); \
831 EMIT_TWO_BYTES (c3, c4); \
832 } while (0)
833
834
835 /* Prototypes for static functions. */
836 static void record_conversion_result P_ ((struct coding_system *coding,
837 enum coding_result_code result));
838 static int detect_coding_utf_8 P_ ((struct coding_system *,
839 struct coding_detection_info *info));
840 static void decode_coding_utf_8 P_ ((struct coding_system *));
841 static int encode_coding_utf_8 P_ ((struct coding_system *));
842
843 static int detect_coding_utf_16 P_ ((struct coding_system *,
844 struct coding_detection_info *info));
845 static void decode_coding_utf_16 P_ ((struct coding_system *));
846 static int encode_coding_utf_16 P_ ((struct coding_system *));
847
848 static int detect_coding_iso_2022 P_ ((struct coding_system *,
849 struct coding_detection_info *info));
850 static void decode_coding_iso_2022 P_ ((struct coding_system *));
851 static int encode_coding_iso_2022 P_ ((struct coding_system *));
852
853 static int detect_coding_emacs_mule P_ ((struct coding_system *,
854 struct coding_detection_info *info));
855 static void decode_coding_emacs_mule P_ ((struct coding_system *));
856 static int encode_coding_emacs_mule P_ ((struct coding_system *));
857
858 static int detect_coding_sjis P_ ((struct coding_system *,
859 struct coding_detection_info *info));
860 static void decode_coding_sjis P_ ((struct coding_system *));
861 static int encode_coding_sjis P_ ((struct coding_system *));
862
863 static int detect_coding_big5 P_ ((struct coding_system *,
864 struct coding_detection_info *info));
865 static void decode_coding_big5 P_ ((struct coding_system *));
866 static int encode_coding_big5 P_ ((struct coding_system *));
867
868 static int detect_coding_ccl P_ ((struct coding_system *,
869 struct coding_detection_info *info));
870 static void decode_coding_ccl P_ ((struct coding_system *));
871 static int encode_coding_ccl P_ ((struct coding_system *));
872
873 static void decode_coding_raw_text P_ ((struct coding_system *));
874 static int encode_coding_raw_text P_ ((struct coding_system *));
875
876 static void coding_set_source P_ ((struct coding_system *));
877 static void coding_set_destination P_ ((struct coding_system *));
878 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
879 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
880 EMACS_INT));
881 static unsigned char *alloc_destination P_ ((struct coding_system *,
882 EMACS_INT, unsigned char *));
883 static void setup_iso_safe_charsets P_ ((Lisp_Object));
884 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
885 int *, int *,
886 unsigned char *));
887 static int detect_eol P_ ((const unsigned char *,
888 EMACS_INT, enum coding_category));
889 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
890 static void decode_eol P_ ((struct coding_system *));
891 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
892 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
893 int, int *, int *));
894 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
895 static INLINE void produce_composition P_ ((struct coding_system *, int *,
896 EMACS_INT));
897 static INLINE void produce_charset P_ ((struct coding_system *, int *,
898 EMACS_INT));
899 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
900 static int decode_coding P_ ((struct coding_system *));
901 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
902 struct coding_system *,
903 int *, EMACS_INT *));
904 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
905 struct coding_system *,
906 int *, EMACS_INT *));
907 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
908 static int encode_coding P_ ((struct coding_system *));
909 static Lisp_Object make_conversion_work_buffer P_ ((int));
910 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
911 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
912 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
913
914 static void
915 record_conversion_result (struct coding_system *coding,
916 enum coding_result_code result)
917 {
918 coding->result = result;
919 switch (result)
920 {
921 case CODING_RESULT_INSUFFICIENT_SRC:
922 Vlast_code_conversion_error = Qinsufficient_source;
923 break;
924 case CODING_RESULT_INCONSISTENT_EOL:
925 Vlast_code_conversion_error = Qinconsistent_eol;
926 break;
927 case CODING_RESULT_INVALID_SRC:
928 Vlast_code_conversion_error = Qinvalid_source;
929 break;
930 case CODING_RESULT_INTERRUPT:
931 Vlast_code_conversion_error = Qinterrupted;
932 break;
933 case CODING_RESULT_INSUFFICIENT_MEM:
934 Vlast_code_conversion_error = Qinsufficient_memory;
935 break;
936 default:
937 Vlast_code_conversion_error = intern ("Unknown error");
938 }
939 }
940
941 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
942 do { \
943 charset_map_loaded = 0; \
944 c = DECODE_CHAR (charset, code); \
945 if (charset_map_loaded) \
946 { \
947 const unsigned char *orig = coding->source; \
948 EMACS_INT offset; \
949 \
950 coding_set_source (coding); \
951 offset = coding->source - orig; \
952 src += offset; \
953 src_base += offset; \
954 src_end += offset; \
955 } \
956 } while (0)
957
958
959 #define ASSURE_DESTINATION(bytes) \
960 do { \
961 if (dst + (bytes) >= dst_end) \
962 { \
963 int more_bytes = charbuf_end - charbuf + (bytes); \
964 \
965 dst = alloc_destination (coding, more_bytes, dst); \
966 dst_end = coding->destination + coding->dst_bytes; \
967 } \
968 } while (0)
969
970
971
972 static void
973 coding_set_source (coding)
974 struct coding_system *coding;
975 {
976 if (BUFFERP (coding->src_object))
977 {
978 struct buffer *buf = XBUFFER (coding->src_object);
979
980 if (coding->src_pos < 0)
981 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
982 else
983 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
984 }
985 else if (STRINGP (coding->src_object))
986 {
987 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
988 }
989 else
990 /* Otherwise, the source is C string and is never relocated
991 automatically. Thus we don't have to update anything. */
992 ;
993 }
994
995 static void
996 coding_set_destination (coding)
997 struct coding_system *coding;
998 {
999 if (BUFFERP (coding->dst_object))
1000 {
1001 if (coding->src_pos < 0)
1002 {
1003 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
1004 coding->dst_bytes = (GAP_END_ADDR
1005 - (coding->src_bytes - coding->consumed)
1006 - coding->destination);
1007 }
1008 else
1009 {
1010 /* We are sure that coding->dst_pos_byte is before the gap
1011 of the buffer. */
1012 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1013 + coding->dst_pos_byte - 1);
1014 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1015 - coding->destination);
1016 }
1017 }
1018 else
1019 /* Otherwise, the destination is C string and is never relocated
1020 automatically. Thus we don't have to update anything. */
1021 ;
1022 }
1023
1024
1025 static void
1026 coding_alloc_by_realloc (coding, bytes)
1027 struct coding_system *coding;
1028 EMACS_INT bytes;
1029 {
1030 coding->destination = (unsigned char *) xrealloc (coding->destination,
1031 coding->dst_bytes + bytes);
1032 coding->dst_bytes += bytes;
1033 }
1034
1035 static void
1036 coding_alloc_by_making_gap (coding, bytes)
1037 struct coding_system *coding;
1038 EMACS_INT bytes;
1039 {
1040 if (BUFFERP (coding->dst_object)
1041 && EQ (coding->src_object, coding->dst_object))
1042 {
1043 EMACS_INT add = coding->src_bytes - coding->consumed;
1044
1045 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1046 make_gap (bytes);
1047 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1048 }
1049 else
1050 {
1051 Lisp_Object this_buffer;
1052
1053 this_buffer = Fcurrent_buffer ();
1054 set_buffer_internal (XBUFFER (coding->dst_object));
1055 make_gap (bytes);
1056 set_buffer_internal (XBUFFER (this_buffer));
1057 }
1058 }
1059
1060
1061 static unsigned char *
1062 alloc_destination (coding, nbytes, dst)
1063 struct coding_system *coding;
1064 EMACS_INT nbytes;
1065 unsigned char *dst;
1066 {
1067 EMACS_INT offset = dst - coding->destination;
1068
1069 if (BUFFERP (coding->dst_object))
1070 coding_alloc_by_making_gap (coding, nbytes);
1071 else
1072 coding_alloc_by_realloc (coding, nbytes);
1073 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1074 coding_set_destination (coding);
1075 dst = coding->destination + offset;
1076 return dst;
1077 }
1078
1079 /** Macros for annotations. */
1080
1081 /* Maximum length of annotation data (sum of annotations for
1082 composition and charset). */
1083 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1084
1085 /* An annotation data is stored in the array coding->charbuf in this
1086 format:
1087 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1088 LENGTH is the number of elements in the annotation.
1089 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1090 NCHARS is the number of characters in the text annotated.
1091
1092 The format of the following elements depend on ANNOTATION_MASK.
1093
1094 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1095 follows:
1096 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1097 METHOD is one of enum composition_method.
1098 Optionnal COMPOSITION-COMPONENTS are characters and composition
1099 rules.
1100
1101 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1102 follows. */
1103
1104 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1105 do { \
1106 *(buf)++ = -(len); \
1107 *(buf)++ = (mask); \
1108 *(buf)++ = (nchars); \
1109 coding->annotated = 1; \
1110 } while (0);
1111
1112 #define ADD_COMPOSITION_DATA(buf, nchars, method) \
1113 do { \
1114 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1115 *buf++ = method; \
1116 } while (0)
1117
1118
1119 #define ADD_CHARSET_DATA(buf, nchars, id) \
1120 do { \
1121 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1122 *buf++ = id; \
1123 } while (0)
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134 Check if a text is encoded in UTF-8. If it is, return 1, else
1135 return 0. */
1136
1137 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1138 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1139 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1140 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1141 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1142 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1143
1144 static int
1145 detect_coding_utf_8 (coding, detect_info)
1146 struct coding_system *coding;
1147 struct coding_detection_info *detect_info;
1148 {
1149 const unsigned char *src = coding->source, *src_base;
1150 const unsigned char *src_end = coding->source + coding->src_bytes;
1151 int multibytep = coding->src_multibyte;
1152 int consumed_chars = 0;
1153 int found = 0;
1154
1155 detect_info->checked |= CATEGORY_MASK_UTF_8;
1156 /* A coding system of this category is always ASCII compatible. */
1157 src += coding->head_ascii;
1158
1159 while (1)
1160 {
1161 int c, c1, c2, c3, c4;
1162
1163 src_base = src;
1164 ONE_MORE_BYTE (c);
1165 if (c < 0 || UTF_8_1_OCTET_P (c))
1166 continue;
1167 ONE_MORE_BYTE (c1);
1168 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1169 break;
1170 if (UTF_8_2_OCTET_LEADING_P (c))
1171 {
1172 found = CATEGORY_MASK_UTF_8;
1173 continue;
1174 }
1175 ONE_MORE_BYTE (c2);
1176 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1177 break;
1178 if (UTF_8_3_OCTET_LEADING_P (c))
1179 {
1180 found = CATEGORY_MASK_UTF_8;
1181 continue;
1182 }
1183 ONE_MORE_BYTE (c3);
1184 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1185 break;
1186 if (UTF_8_4_OCTET_LEADING_P (c))
1187 {
1188 found = CATEGORY_MASK_UTF_8;
1189 continue;
1190 }
1191 ONE_MORE_BYTE (c4);
1192 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1193 break;
1194 if (UTF_8_5_OCTET_LEADING_P (c))
1195 {
1196 found = CATEGORY_MASK_UTF_8;
1197 continue;
1198 }
1199 break;
1200 }
1201 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1202 return 0;
1203
1204 no_more_source:
1205 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1206 {
1207 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1208 return 0;
1209 }
1210 detect_info->found |= found;
1211 return 1;
1212 }
1213
1214
1215 static void
1216 decode_coding_utf_8 (coding)
1217 struct coding_system *coding;
1218 {
1219 const unsigned char *src = coding->source + coding->consumed;
1220 const unsigned char *src_end = coding->source + coding->src_bytes;
1221 const unsigned char *src_base;
1222 int *charbuf = coding->charbuf + coding->charbuf_used;
1223 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1224 int consumed_chars = 0, consumed_chars_base;
1225 int multibytep = coding->src_multibyte;
1226 Lisp_Object attr, charset_list;
1227
1228 CODING_GET_INFO (coding, attr, charset_list);
1229
1230 while (1)
1231 {
1232 int c, c1, c2, c3, c4, c5;
1233
1234 src_base = src;
1235 consumed_chars_base = consumed_chars;
1236
1237 if (charbuf >= charbuf_end)
1238 break;
1239
1240 ONE_MORE_BYTE (c1);
1241 if (c1 < 0)
1242 {
1243 c = - c1;
1244 }
1245 else if (UTF_8_1_OCTET_P(c1))
1246 {
1247 c = c1;
1248 }
1249 else
1250 {
1251 ONE_MORE_BYTE (c2);
1252 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1253 goto invalid_code;
1254 if (UTF_8_2_OCTET_LEADING_P (c1))
1255 {
1256 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1257 /* Reject overlong sequences here and below. Encoders
1258 producing them are incorrect, they can be misleading,
1259 and they mess up read/write invariance. */
1260 if (c < 128)
1261 goto invalid_code;
1262 }
1263 else
1264 {
1265 ONE_MORE_BYTE (c3);
1266 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1267 goto invalid_code;
1268 if (UTF_8_3_OCTET_LEADING_P (c1))
1269 {
1270 c = (((c1 & 0xF) << 12)
1271 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1272 if (c < 0x800
1273 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1274 goto invalid_code;
1275 }
1276 else
1277 {
1278 ONE_MORE_BYTE (c4);
1279 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1280 goto invalid_code;
1281 if (UTF_8_4_OCTET_LEADING_P (c1))
1282 {
1283 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1284 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1285 if (c < 0x10000)
1286 goto invalid_code;
1287 }
1288 else
1289 {
1290 ONE_MORE_BYTE (c5);
1291 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1292 goto invalid_code;
1293 if (UTF_8_5_OCTET_LEADING_P (c1))
1294 {
1295 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1296 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1297 | (c5 & 0x3F));
1298 if ((c > MAX_CHAR) || (c < 0x200000))
1299 goto invalid_code;
1300 }
1301 else
1302 goto invalid_code;
1303 }
1304 }
1305 }
1306 }
1307
1308 *charbuf++ = c;
1309 continue;
1310
1311 invalid_code:
1312 src = src_base;
1313 consumed_chars = consumed_chars_base;
1314 ONE_MORE_BYTE (c);
1315 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1316 coding->errors++;
1317 }
1318
1319 no_more_source:
1320 coding->consumed_char += consumed_chars_base;
1321 coding->consumed = src_base - coding->source;
1322 coding->charbuf_used = charbuf - coding->charbuf;
1323 }
1324
1325
1326 static int
1327 encode_coding_utf_8 (coding)
1328 struct coding_system *coding;
1329 {
1330 int multibytep = coding->dst_multibyte;
1331 int *charbuf = coding->charbuf;
1332 int *charbuf_end = charbuf + coding->charbuf_used;
1333 unsigned char *dst = coding->destination + coding->produced;
1334 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1335 int produced_chars = 0;
1336 int c;
1337
1338 if (multibytep)
1339 {
1340 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1341
1342 while (charbuf < charbuf_end)
1343 {
1344 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1345
1346 ASSURE_DESTINATION (safe_room);
1347 c = *charbuf++;
1348 if (CHAR_BYTE8_P (c))
1349 {
1350 c = CHAR_TO_BYTE8 (c);
1351 EMIT_ONE_BYTE (c);
1352 }
1353 else
1354 {
1355 CHAR_STRING_ADVANCE (c, pend);
1356 for (p = str; p < pend; p++)
1357 EMIT_ONE_BYTE (*p);
1358 }
1359 }
1360 }
1361 else
1362 {
1363 int safe_room = MAX_MULTIBYTE_LENGTH;
1364
1365 while (charbuf < charbuf_end)
1366 {
1367 ASSURE_DESTINATION (safe_room);
1368 c = *charbuf++;
1369 if (CHAR_BYTE8_P (c))
1370 *dst++ = CHAR_TO_BYTE8 (c);
1371 else
1372 dst += CHAR_STRING (c, dst);
1373 produced_chars++;
1374 }
1375 }
1376 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1377 coding->produced_char += produced_chars;
1378 coding->produced = dst - coding->destination;
1379 return 0;
1380 }
1381
1382
1383 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1384 Check if a text is encoded in one of UTF-16 based coding systems.
1385 If it is, return 1, else return 0. */
1386
1387 #define UTF_16_HIGH_SURROGATE_P(val) \
1388 (((val) & 0xFC00) == 0xD800)
1389
1390 #define UTF_16_LOW_SURROGATE_P(val) \
1391 (((val) & 0xFC00) == 0xDC00)
1392
1393 #define UTF_16_INVALID_P(val) \
1394 (((val) == 0xFFFE) \
1395 || ((val) == 0xFFFF) \
1396 || UTF_16_LOW_SURROGATE_P (val))
1397
1398
1399 static int
1400 detect_coding_utf_16 (coding, detect_info)
1401 struct coding_system *coding;
1402 struct coding_detection_info *detect_info;
1403 {
1404 const unsigned char *src = coding->source, *src_base = src;
1405 const unsigned char *src_end = coding->source + coding->src_bytes;
1406 int multibytep = coding->src_multibyte;
1407 int consumed_chars = 0;
1408 int c1, c2;
1409
1410 detect_info->checked |= CATEGORY_MASK_UTF_16;
1411 if (coding->mode & CODING_MODE_LAST_BLOCK
1412 && (coding->src_chars & 1))
1413 {
1414 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1415 return 0;
1416 }
1417
1418 ONE_MORE_BYTE (c1);
1419 ONE_MORE_BYTE (c2);
1420 if ((c1 == 0xFF) && (c2 == 0xFE))
1421 {
1422 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1423 | CATEGORY_MASK_UTF_16_AUTO);
1424 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1425 | CATEGORY_MASK_UTF_16_BE_NOSIG
1426 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1427 }
1428 else if ((c1 == 0xFE) && (c2 == 0xFF))
1429 {
1430 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1431 | CATEGORY_MASK_UTF_16_AUTO);
1432 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1433 | CATEGORY_MASK_UTF_16_BE_NOSIG
1434 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1435 }
1436 else if (c1 >= 0 && c2 >= 0)
1437 {
1438 detect_info->rejected
1439 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1440 }
1441 no_more_source:
1442 return 1;
1443 }
1444
1445 static void
1446 decode_coding_utf_16 (coding)
1447 struct coding_system *coding;
1448 {
1449 const unsigned char *src = coding->source + coding->consumed;
1450 const unsigned char *src_end = coding->source + coding->src_bytes;
1451 const unsigned char *src_base;
1452 int *charbuf = coding->charbuf + coding->charbuf_used;
1453 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1454 int consumed_chars = 0, consumed_chars_base;
1455 int multibytep = coding->src_multibyte;
1456 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1457 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1458 int surrogate = CODING_UTF_16_SURROGATE (coding);
1459 Lisp_Object attr, charset_list;
1460
1461 CODING_GET_INFO (coding, attr, charset_list);
1462
1463 if (bom == utf_16_with_bom)
1464 {
1465 int c, c1, c2;
1466
1467 src_base = src;
1468 ONE_MORE_BYTE (c1);
1469 ONE_MORE_BYTE (c2);
1470 c = (c1 << 8) | c2;
1471
1472 if (endian == utf_16_big_endian
1473 ? c != 0xFEFF : c != 0xFFFE)
1474 {
1475 /* The first two bytes are not BOM. Treat them as bytes
1476 for a normal character. */
1477 src = src_base;
1478 coding->errors++;
1479 }
1480 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1481 }
1482 else if (bom == utf_16_detect_bom)
1483 {
1484 /* We have already tried to detect BOM and failed in
1485 detect_coding. */
1486 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1487 }
1488
1489 while (1)
1490 {
1491 int c, c1, c2;
1492
1493 src_base = src;
1494 consumed_chars_base = consumed_chars;
1495
1496 if (charbuf + 2 >= charbuf_end)
1497 break;
1498
1499 ONE_MORE_BYTE (c1);
1500 if (c1 < 0)
1501 {
1502 *charbuf++ = -c1;
1503 continue;
1504 }
1505 ONE_MORE_BYTE (c2);
1506 if (c2 < 0)
1507 {
1508 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1509 *charbuf++ = -c2;
1510 continue;
1511 }
1512 c = (endian == utf_16_big_endian
1513 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1514 if (surrogate)
1515 {
1516 if (! UTF_16_LOW_SURROGATE_P (c))
1517 {
1518 if (endian == utf_16_big_endian)
1519 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1520 else
1521 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1522 *charbuf++ = c1;
1523 *charbuf++ = c2;
1524 coding->errors++;
1525 if (UTF_16_HIGH_SURROGATE_P (c))
1526 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1527 else
1528 *charbuf++ = c;
1529 }
1530 else
1531 {
1532 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1533 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1534 *charbuf++ = 0x10000 + c;
1535 }
1536 }
1537 else
1538 {
1539 if (UTF_16_HIGH_SURROGATE_P (c))
1540 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1541 else
1542 *charbuf++ = c;
1543 }
1544 }
1545
1546 no_more_source:
1547 coding->consumed_char += consumed_chars_base;
1548 coding->consumed = src_base - coding->source;
1549 coding->charbuf_used = charbuf - coding->charbuf;
1550 }
1551
1552 static int
1553 encode_coding_utf_16 (coding)
1554 struct coding_system *coding;
1555 {
1556 int multibytep = coding->dst_multibyte;
1557 int *charbuf = coding->charbuf;
1558 int *charbuf_end = charbuf + coding->charbuf_used;
1559 unsigned char *dst = coding->destination + coding->produced;
1560 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1561 int safe_room = 8;
1562 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1563 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1564 int produced_chars = 0;
1565 Lisp_Object attrs, charset_list;
1566 int c;
1567
1568 CODING_GET_INFO (coding, attrs, charset_list);
1569
1570 if (bom != utf_16_without_bom)
1571 {
1572 ASSURE_DESTINATION (safe_room);
1573 if (big_endian)
1574 EMIT_TWO_BYTES (0xFE, 0xFF);
1575 else
1576 EMIT_TWO_BYTES (0xFF, 0xFE);
1577 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1578 }
1579
1580 while (charbuf < charbuf_end)
1581 {
1582 ASSURE_DESTINATION (safe_room);
1583 c = *charbuf++;
1584 if (c >= MAX_UNICODE_CHAR)
1585 c = coding->default_char;
1586
1587 if (c < 0x10000)
1588 {
1589 if (big_endian)
1590 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1591 else
1592 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1593 }
1594 else
1595 {
1596 int c1, c2;
1597
1598 c -= 0x10000;
1599 c1 = (c >> 10) + 0xD800;
1600 c2 = (c & 0x3FF) + 0xDC00;
1601 if (big_endian)
1602 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1603 else
1604 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1605 }
1606 }
1607 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1608 coding->produced = dst - coding->destination;
1609 coding->produced_char += produced_chars;
1610 return 0;
1611 }
1612
1613 \f
1614 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1615
1616 /* Emacs' internal format for representation of multiple character
1617 sets is a kind of multi-byte encoding, i.e. characters are
1618 represented by variable-length sequences of one-byte codes.
1619
1620 ASCII characters and control characters (e.g. `tab', `newline') are
1621 represented by one-byte sequences which are their ASCII codes, in
1622 the range 0x00 through 0x7F.
1623
1624 8-bit characters of the range 0x80..0x9F are represented by
1625 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1626 code + 0x20).
1627
1628 8-bit characters of the range 0xA0..0xFF are represented by
1629 one-byte sequences which are their 8-bit code.
1630
1631 The other characters are represented by a sequence of `base
1632 leading-code', optional `extended leading-code', and one or two
1633 `position-code's. The length of the sequence is determined by the
1634 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1635 whereas extended leading-code and position-code take the range 0xA0
1636 through 0xFF. See `charset.h' for more details about leading-code
1637 and position-code.
1638
1639 --- CODE RANGE of Emacs' internal format ---
1640 character set range
1641 ------------- -----
1642 ascii 0x00..0x7F
1643 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1644 eight-bit-graphic 0xA0..0xBF
1645 ELSE 0x81..0x9D + [0xA0..0xFF]+
1646 ---------------------------------------------
1647
1648 As this is the internal character representation, the format is
1649 usually not used externally (i.e. in a file or in a data sent to a
1650 process). But, it is possible to have a text externally in this
1651 format (i.e. by encoding by the coding system `emacs-mule').
1652
1653 In that case, a sequence of one-byte codes has a slightly different
1654 form.
1655
1656 At first, all characters in eight-bit-control are represented by
1657 one-byte sequences which are their 8-bit code.
1658
1659 Next, character composition data are represented by the byte
1660 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1661 where,
1662 METHOD is 0xF0 plus one of composition method (enum
1663 composition_method),
1664
1665 BYTES is 0xA0 plus a byte length of this composition data,
1666
1667 CHARS is 0x20 plus a number of characters composed by this
1668 data,
1669
1670 COMPONENTs are characters of multibye form or composition
1671 rules encoded by two-byte of ASCII codes.
1672
1673 In addition, for backward compatibility, the following formats are
1674 also recognized as composition data on decoding.
1675
1676 0x80 MSEQ ...
1677 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1678
1679 Here,
1680 MSEQ is a multibyte form but in these special format:
1681 ASCII: 0xA0 ASCII_CODE+0x80,
1682 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1683 RULE is a one byte code of the range 0xA0..0xF0 that
1684 represents a composition rule.
1685 */
1686
1687 char emacs_mule_bytes[256];
1688
1689 int
1690 emacs_mule_char (coding, src, nbytes, nchars, id)
1691 struct coding_system *coding;
1692 const unsigned char *src;
1693 int *nbytes, *nchars, *id;
1694 {
1695 const unsigned char *src_end = coding->source + coding->src_bytes;
1696 const unsigned char *src_base = src;
1697 int multibytep = coding->src_multibyte;
1698 struct charset *charset;
1699 unsigned code;
1700 int c;
1701 int consumed_chars = 0;
1702
1703 ONE_MORE_BYTE (c);
1704 if (c < 0)
1705 {
1706 c = -c;
1707 charset = emacs_mule_charset[0];
1708 }
1709 else
1710 {
1711 if (c >= 0xA0)
1712 {
1713 /* Old style component character of a compostion. */
1714 if (c == 0xA0)
1715 {
1716 ONE_MORE_BYTE (c);
1717 c -= 0x80;
1718 }
1719 else
1720 c -= 0x20;
1721 }
1722
1723 switch (emacs_mule_bytes[c])
1724 {
1725 case 2:
1726 if (! (charset = emacs_mule_charset[c]))
1727 goto invalid_code;
1728 ONE_MORE_BYTE (c);
1729 if (c < 0xA0)
1730 goto invalid_code;
1731 code = c & 0x7F;
1732 break;
1733
1734 case 3:
1735 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1736 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1737 {
1738 ONE_MORE_BYTE (c);
1739 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1740 goto invalid_code;
1741 ONE_MORE_BYTE (c);
1742 if (c < 0xA0)
1743 goto invalid_code;
1744 code = c & 0x7F;
1745 }
1746 else
1747 {
1748 if (! (charset = emacs_mule_charset[c]))
1749 goto invalid_code;
1750 ONE_MORE_BYTE (c);
1751 if (c < 0xA0)
1752 goto invalid_code;
1753 code = (c & 0x7F) << 8;
1754 ONE_MORE_BYTE (c);
1755 if (c < 0xA0)
1756 goto invalid_code;
1757 code |= c & 0x7F;
1758 }
1759 break;
1760
1761 case 4:
1762 ONE_MORE_BYTE (c);
1763 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1764 goto invalid_code;
1765 ONE_MORE_BYTE (c);
1766 if (c < 0xA0)
1767 goto invalid_code;
1768 code = (c & 0x7F) << 8;
1769 ONE_MORE_BYTE (c);
1770 if (c < 0xA0)
1771 goto invalid_code;
1772 code |= c & 0x7F;
1773 break;
1774
1775 case 1:
1776 code = c;
1777 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1778 ? charset_ascii : charset_eight_bit);
1779 break;
1780
1781 default:
1782 abort ();
1783 }
1784 c = DECODE_CHAR (charset, code);
1785 if (c < 0)
1786 goto invalid_code;
1787 }
1788 *nbytes = src - src_base;
1789 *nchars = consumed_chars;
1790 if (id)
1791 *id = charset->id;
1792 return c;
1793
1794 no_more_source:
1795 return -2;
1796
1797 invalid_code:
1798 return -1;
1799 }
1800
1801
1802 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1803 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1804 else return 0. */
1805
1806 static int
1807 detect_coding_emacs_mule (coding, detect_info)
1808 struct coding_system *coding;
1809 struct coding_detection_info *detect_info;
1810 {
1811 const unsigned char *src = coding->source, *src_base;
1812 const unsigned char *src_end = coding->source + coding->src_bytes;
1813 int multibytep = coding->src_multibyte;
1814 int consumed_chars = 0;
1815 int c;
1816 int found = 0;
1817
1818 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1819 /* A coding system of this category is always ASCII compatible. */
1820 src += coding->head_ascii;
1821
1822 while (1)
1823 {
1824 src_base = src;
1825 ONE_MORE_BYTE (c);
1826 if (c < 0)
1827 continue;
1828 if (c == 0x80)
1829 {
1830 /* Perhaps the start of composite character. We simple skip
1831 it because analyzing it is too heavy for detecting. But,
1832 at least, we check that the composite character
1833 constitues of more than 4 bytes. */
1834 const unsigned char *src_base;
1835
1836 repeat:
1837 src_base = src;
1838 do
1839 {
1840 ONE_MORE_BYTE (c);
1841 }
1842 while (c >= 0xA0);
1843
1844 if (src - src_base <= 4)
1845 break;
1846 found = CATEGORY_MASK_EMACS_MULE;
1847 if (c == 0x80)
1848 goto repeat;
1849 }
1850
1851 if (c < 0x80)
1852 {
1853 if (c < 0x20
1854 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1855 break;
1856 }
1857 else
1858 {
1859 int more_bytes = emacs_mule_bytes[*src_base] - 1;
1860
1861 while (more_bytes > 0)
1862 {
1863 ONE_MORE_BYTE (c);
1864 if (c < 0xA0)
1865 {
1866 src--; /* Unread the last byte. */
1867 break;
1868 }
1869 more_bytes--;
1870 }
1871 if (more_bytes != 0)
1872 break;
1873 found = CATEGORY_MASK_EMACS_MULE;
1874 }
1875 }
1876 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1877 return 0;
1878
1879 no_more_source:
1880 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1881 {
1882 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1883 return 0;
1884 }
1885 detect_info->found |= found;
1886 return 1;
1887 }
1888
1889
1890 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1891
1892 /* Decode a character represented as a component of composition
1893 sequence of Emacs 20/21 style at SRC. Set C to that character and
1894 update SRC to the head of next character (or an encoded composition
1895 rule). If SRC doesn't points a composition component, set C to -1.
1896 If SRC points an invalid byte sequence, global exit by a return
1897 value 0. */
1898
1899 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1900 if (1) \
1901 { \
1902 int c; \
1903 int nbytes, nchars; \
1904 \
1905 if (src == src_end) \
1906 break; \
1907 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1908 if (c < 0) \
1909 { \
1910 if (c == -2) \
1911 break; \
1912 goto invalid_code; \
1913 } \
1914 *buf++ = c; \
1915 src += nbytes; \
1916 consumed_chars += nchars; \
1917 } \
1918 else
1919
1920
1921 /* Decode a composition rule represented as a component of composition
1922 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1923 and increment BUF. If SRC points an invalid byte sequence, set C
1924 to -1. */
1925
1926 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1927 do { \
1928 int c, gref, nref; \
1929 \
1930 if (src >= src_end) \
1931 goto invalid_code; \
1932 ONE_MORE_BYTE_NO_CHECK (c); \
1933 c -= 0xA0; \
1934 if (c < 0 || c >= 81) \
1935 goto invalid_code; \
1936 \
1937 gref = c / 9, nref = c % 9; \
1938 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1939 } while (0)
1940
1941
1942 /* Decode a composition rule represented as a component of composition
1943 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1944 and increment BUF. If SRC points an invalid byte sequence, set C
1945 to -1. */
1946
1947 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1948 do { \
1949 int gref, nref; \
1950 \
1951 if (src + 1>= src_end) \
1952 goto invalid_code; \
1953 ONE_MORE_BYTE_NO_CHECK (gref); \
1954 gref -= 0x20; \
1955 ONE_MORE_BYTE_NO_CHECK (nref); \
1956 nref -= 0x20; \
1957 if (gref < 0 || gref >= 81 \
1958 || nref < 0 || nref >= 81) \
1959 goto invalid_code; \
1960 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1961 } while (0)
1962
1963
1964 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1965 do { \
1966 /* Emacs 21 style format. The first three bytes at SRC are \
1967 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1968 the byte length of this composition information, CHARS is the \
1969 number of characters composed by this composition. */ \
1970 enum composition_method method = c - 0xF2; \
1971 int *charbuf_base = charbuf; \
1972 int consumed_chars_limit; \
1973 int nbytes, nchars; \
1974 \
1975 ONE_MORE_BYTE (c); \
1976 if (c < 0) \
1977 goto invalid_code; \
1978 nbytes = c - 0xA0; \
1979 if (nbytes < 3) \
1980 goto invalid_code; \
1981 ONE_MORE_BYTE (c); \
1982 if (c < 0) \
1983 goto invalid_code; \
1984 nchars = c - 0xA0; \
1985 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
1986 consumed_chars_limit = consumed_chars_base + nbytes; \
1987 if (method != COMPOSITION_RELATIVE) \
1988 { \
1989 int i = 0; \
1990 while (consumed_chars < consumed_chars_limit) \
1991 { \
1992 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1993 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1994 else \
1995 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1996 i++; \
1997 } \
1998 if (consumed_chars < consumed_chars_limit) \
1999 goto invalid_code; \
2000 charbuf_base[0] -= i; \
2001 } \
2002 } while (0)
2003
2004
2005 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
2006 do { \
2007 /* Emacs 20 style format for relative composition. */ \
2008 /* Store multibyte form of characters to be composed. */ \
2009 enum composition_method method = COMPOSITION_RELATIVE; \
2010 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2011 int *buf = components; \
2012 int i, j; \
2013 \
2014 src = src_base; \
2015 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2016 for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
2017 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2018 if (i < 2) \
2019 goto invalid_code; \
2020 ADD_COMPOSITION_DATA (charbuf, i, method); \
2021 for (j = 0; j < i; j++) \
2022 *charbuf++ = components[j]; \
2023 } while (0)
2024
2025
2026 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2027 do { \
2028 /* Emacs 20 style format for rule-base composition. */ \
2029 /* Store multibyte form of characters to be composed. */ \
2030 enum composition_method method = COMPOSITION_WITH_RULE; \
2031 int *charbuf_base = charbuf; \
2032 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2033 int *buf = components; \
2034 int i, j; \
2035 \
2036 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2037 for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
2038 { \
2039 if (*src < 0xA0) \
2040 break; \
2041 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
2042 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2043 } \
2044 if (i <= 1 || (buf - components) % 2 == 0) \
2045 goto invalid_code; \
2046 if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
2047 goto no_more_source; \
2048 ADD_COMPOSITION_DATA (charbuf, i, method); \
2049 i = i * 2 - 1; \
2050 for (j = 0; j < i; j++) \
2051 *charbuf++ = components[j]; \
2052 charbuf_base[0] -= i; \
2053 for (j = 0; j < i; j += 2) \
2054 *charbuf++ = components[j]; \
2055 } while (0)
2056
2057
2058 static void
2059 decode_coding_emacs_mule (coding)
2060 struct coding_system *coding;
2061 {
2062 const unsigned char *src = coding->source + coding->consumed;
2063 const unsigned char *src_end = coding->source + coding->src_bytes;
2064 const unsigned char *src_base;
2065 int *charbuf = coding->charbuf + coding->charbuf_used;
2066 int *charbuf_end
2067 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2068 int consumed_chars = 0, consumed_chars_base;
2069 int multibytep = coding->src_multibyte;
2070 Lisp_Object attrs, charset_list;
2071 int char_offset = coding->produced_char;
2072 int last_offset = char_offset;
2073 int last_id = charset_ascii;
2074
2075 CODING_GET_INFO (coding, attrs, charset_list);
2076
2077 while (1)
2078 {
2079 int c;
2080
2081 src_base = src;
2082 consumed_chars_base = consumed_chars;
2083
2084 if (charbuf >= charbuf_end)
2085 break;
2086
2087 ONE_MORE_BYTE (c);
2088 if (c < 0)
2089 {
2090 *charbuf++ = -c;
2091 char_offset++;
2092 }
2093 else if (c < 0x80)
2094 {
2095 *charbuf++ = c;
2096 char_offset++;
2097 }
2098 else if (c == 0x80)
2099 {
2100 ONE_MORE_BYTE (c);
2101 if (c < 0)
2102 goto invalid_code;
2103 if (c - 0xF2 >= COMPOSITION_RELATIVE
2104 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2105 DECODE_EMACS_MULE_21_COMPOSITION (c);
2106 else if (c < 0xC0)
2107 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2108 else if (c == 0xFF)
2109 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2110 else
2111 goto invalid_code;
2112 }
2113 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2114 {
2115 int nbytes, nchars;
2116 int id;
2117
2118 src = src_base;
2119 consumed_chars = consumed_chars_base;
2120 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2121 if (c < 0)
2122 {
2123 if (c == -2)
2124 break;
2125 goto invalid_code;
2126 }
2127 if (last_id != id)
2128 {
2129 if (last_id != charset_ascii)
2130 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2131 last_id = id;
2132 last_offset = char_offset;
2133 }
2134 *charbuf++ = c;
2135 src += nbytes;
2136 consumed_chars += nchars;
2137 char_offset++;
2138 }
2139 else
2140 goto invalid_code;
2141 continue;
2142
2143 invalid_code:
2144 src = src_base;
2145 consumed_chars = consumed_chars_base;
2146 ONE_MORE_BYTE (c);
2147 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2148 char_offset++;
2149 coding->errors++;
2150 }
2151
2152 no_more_source:
2153 if (last_id != charset_ascii)
2154 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2155 coding->consumed_char += consumed_chars_base;
2156 coding->consumed = src_base - coding->source;
2157 coding->charbuf_used = charbuf - coding->charbuf;
2158 }
2159
2160
2161 #define EMACS_MULE_LEADING_CODES(id, codes) \
2162 do { \
2163 if (id < 0xA0) \
2164 codes[0] = id, codes[1] = 0; \
2165 else if (id < 0xE0) \
2166 codes[0] = 0x9A, codes[1] = id; \
2167 else if (id < 0xF0) \
2168 codes[0] = 0x9B, codes[1] = id; \
2169 else if (id < 0xF5) \
2170 codes[0] = 0x9C, codes[1] = id; \
2171 else \
2172 codes[0] = 0x9D, codes[1] = id; \
2173 } while (0);
2174
2175
2176 static int
2177 encode_coding_emacs_mule (coding)
2178 struct coding_system *coding;
2179 {
2180 int multibytep = coding->dst_multibyte;
2181 int *charbuf = coding->charbuf;
2182 int *charbuf_end = charbuf + coding->charbuf_used;
2183 unsigned char *dst = coding->destination + coding->produced;
2184 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2185 int safe_room = 8;
2186 int produced_chars = 0;
2187 Lisp_Object attrs, charset_list;
2188 int c;
2189 int preferred_charset_id = -1;
2190
2191 CODING_GET_INFO (coding, attrs, charset_list);
2192 if (! EQ (charset_list, Vemacs_mule_charset_list))
2193 {
2194 CODING_ATTR_CHARSET_LIST (attrs)
2195 = charset_list = Vemacs_mule_charset_list;
2196 }
2197
2198 while (charbuf < charbuf_end)
2199 {
2200 ASSURE_DESTINATION (safe_room);
2201 c = *charbuf++;
2202
2203 if (c < 0)
2204 {
2205 /* Handle an annotation. */
2206 switch (*charbuf)
2207 {
2208 case CODING_ANNOTATE_COMPOSITION_MASK:
2209 /* Not yet implemented. */
2210 break;
2211 case CODING_ANNOTATE_CHARSET_MASK:
2212 preferred_charset_id = charbuf[3];
2213 if (preferred_charset_id >= 0
2214 && NILP (Fmemq (make_number (preferred_charset_id),
2215 charset_list)))
2216 preferred_charset_id = -1;
2217 break;
2218 default:
2219 abort ();
2220 }
2221 charbuf += -c - 1;
2222 continue;
2223 }
2224
2225 if (ASCII_CHAR_P (c))
2226 EMIT_ONE_ASCII_BYTE (c);
2227 else if (CHAR_BYTE8_P (c))
2228 {
2229 c = CHAR_TO_BYTE8 (c);
2230 EMIT_ONE_BYTE (c);
2231 }
2232 else
2233 {
2234 struct charset *charset;
2235 unsigned code;
2236 int dimension;
2237 int emacs_mule_id;
2238 unsigned char leading_codes[2];
2239
2240 if (preferred_charset_id >= 0)
2241 {
2242 charset = CHARSET_FROM_ID (preferred_charset_id);
2243 if (! CHAR_CHARSET_P (c, charset))
2244 charset = char_charset (c, charset_list, NULL);
2245 }
2246 else
2247 charset = char_charset (c, charset_list, &code);
2248 if (! charset)
2249 {
2250 c = coding->default_char;
2251 if (ASCII_CHAR_P (c))
2252 {
2253 EMIT_ONE_ASCII_BYTE (c);
2254 continue;
2255 }
2256 charset = char_charset (c, charset_list, &code);
2257 }
2258 dimension = CHARSET_DIMENSION (charset);
2259 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2260 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2261 EMIT_ONE_BYTE (leading_codes[0]);
2262 if (leading_codes[1])
2263 EMIT_ONE_BYTE (leading_codes[1]);
2264 if (dimension == 1)
2265 EMIT_ONE_BYTE (code | 0x80);
2266 else
2267 {
2268 code |= 0x8080;
2269 EMIT_ONE_BYTE (code >> 8);
2270 EMIT_ONE_BYTE (code & 0xFF);
2271 }
2272 }
2273 }
2274 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2275 coding->produced_char += produced_chars;
2276 coding->produced = dst - coding->destination;
2277 return 0;
2278 }
2279
2280 \f
2281 /*** 7. ISO2022 handlers ***/
2282
2283 /* The following note describes the coding system ISO2022 briefly.
2284 Since the intention of this note is to help understand the
2285 functions in this file, some parts are NOT ACCURATE or are OVERLY
2286 SIMPLIFIED. For thorough understanding, please refer to the
2287 original document of ISO2022. This is equivalent to the standard
2288 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2289
2290 ISO2022 provides many mechanisms to encode several character sets
2291 in 7-bit and 8-bit environments. For 7-bit environments, all text
2292 is encoded using bytes less than 128. This may make the encoded
2293 text a little bit longer, but the text passes more easily through
2294 several types of gateway, some of which strip off the MSB (Most
2295 Significant Bit).
2296
2297 There are two kinds of character sets: control character sets and
2298 graphic character sets. The former contain control characters such
2299 as `newline' and `escape' to provide control functions (control
2300 functions are also provided by escape sequences). The latter
2301 contain graphic characters such as 'A' and '-'. Emacs recognizes
2302 two control character sets and many graphic character sets.
2303
2304 Graphic character sets are classified into one of the following
2305 four classes, according to the number of bytes (DIMENSION) and
2306 number of characters in one dimension (CHARS) of the set:
2307 - DIMENSION1_CHARS94
2308 - DIMENSION1_CHARS96
2309 - DIMENSION2_CHARS94
2310 - DIMENSION2_CHARS96
2311
2312 In addition, each character set is assigned an identification tag,
2313 unique for each set, called the "final character" (denoted as <F>
2314 hereafter). The <F> of each character set is decided by ECMA(*)
2315 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2316 (0x30..0x3F are for private use only).
2317
2318 Note (*): ECMA = European Computer Manufacturers Association
2319
2320 Here are examples of graphic character sets [NAME(<F>)]:
2321 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2322 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2323 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2324 o DIMENSION2_CHARS96 -- none for the moment
2325
2326 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2327 C0 [0x00..0x1F] -- control character plane 0
2328 GL [0x20..0x7F] -- graphic character plane 0
2329 C1 [0x80..0x9F] -- control character plane 1
2330 GR [0xA0..0xFF] -- graphic character plane 1
2331
2332 A control character set is directly designated and invoked to C0 or
2333 C1 by an escape sequence. The most common case is that:
2334 - ISO646's control character set is designated/invoked to C0, and
2335 - ISO6429's control character set is designated/invoked to C1,
2336 and usually these designations/invocations are omitted in encoded
2337 text. In a 7-bit environment, only C0 can be used, and a control
2338 character for C1 is encoded by an appropriate escape sequence to
2339 fit into the environment. All control characters for C1 are
2340 defined to have corresponding escape sequences.
2341
2342 A graphic character set is at first designated to one of four
2343 graphic registers (G0 through G3), then these graphic registers are
2344 invoked to GL or GR. These designations and invocations can be
2345 done independently. The most common case is that G0 is invoked to
2346 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2347 these invocations and designations are omitted in encoded text.
2348 In a 7-bit environment, only GL can be used.
2349
2350 When a graphic character set of CHARS94 is invoked to GL, codes
2351 0x20 and 0x7F of the GL area work as control characters SPACE and
2352 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2353 be used.
2354
2355 There are two ways of invocation: locking-shift and single-shift.
2356 With locking-shift, the invocation lasts until the next different
2357 invocation, whereas with single-shift, the invocation affects the
2358 following character only and doesn't affect the locking-shift
2359 state. Invocations are done by the following control characters or
2360 escape sequences:
2361
2362 ----------------------------------------------------------------------
2363 abbrev function cntrl escape seq description
2364 ----------------------------------------------------------------------
2365 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2366 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2367 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2368 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2369 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2370 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2371 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2372 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2373 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2374 ----------------------------------------------------------------------
2375 (*) These are not used by any known coding system.
2376
2377 Control characters for these functions are defined by macros
2378 ISO_CODE_XXX in `coding.h'.
2379
2380 Designations are done by the following escape sequences:
2381 ----------------------------------------------------------------------
2382 escape sequence description
2383 ----------------------------------------------------------------------
2384 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2385 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2386 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2387 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2388 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2389 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2390 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2391 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2392 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2393 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2394 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2395 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2396 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2397 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2398 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2399 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2400 ----------------------------------------------------------------------
2401
2402 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2403 of dimension 1, chars 94, and final character <F>, etc...
2404
2405 Note (*): Although these designations are not allowed in ISO2022,
2406 Emacs accepts them on decoding, and produces them on encoding
2407 CHARS96 character sets in a coding system which is characterized as
2408 7-bit environment, non-locking-shift, and non-single-shift.
2409
2410 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2411 '(' must be omitted. We refer to this as "short-form" hereafter.
2412
2413 Now you may notice that there are a lot of ways of encoding the
2414 same multilingual text in ISO2022. Actually, there exist many
2415 coding systems such as Compound Text (used in X11's inter client
2416 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2417 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2418 localized platforms), and all of these are variants of ISO2022.
2419
2420 In addition to the above, Emacs handles two more kinds of escape
2421 sequences: ISO6429's direction specification and Emacs' private
2422 sequence for specifying character composition.
2423
2424 ISO6429's direction specification takes the following form:
2425 o CSI ']' -- end of the current direction
2426 o CSI '0' ']' -- end of the current direction
2427 o CSI '1' ']' -- start of left-to-right text
2428 o CSI '2' ']' -- start of right-to-left text
2429 The control character CSI (0x9B: control sequence introducer) is
2430 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2431
2432 Character composition specification takes the following form:
2433 o ESC '0' -- start relative composition
2434 o ESC '1' -- end composition
2435 o ESC '2' -- start rule-base composition (*)
2436 o ESC '3' -- start relative composition with alternate chars (**)
2437 o ESC '4' -- start rule-base composition with alternate chars (**)
2438 Since these are not standard escape sequences of any ISO standard,
2439 the use of them with these meanings is restricted to Emacs only.
2440
2441 (*) This form is used only in Emacs 20.7 and older versions,
2442 but newer versions can safely decode it.
2443 (**) This form is used only in Emacs 21.1 and newer versions,
2444 and older versions can't decode it.
2445
2446 Here's a list of example usages of these composition escape
2447 sequences (categorized by `enum composition_method').
2448
2449 COMPOSITION_RELATIVE:
2450 ESC 0 CHAR [ CHAR ] ESC 1
2451 COMPOSITION_WITH_RULE:
2452 ESC 2 CHAR [ RULE CHAR ] ESC 1
2453 COMPOSITION_WITH_ALTCHARS:
2454 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2455 COMPOSITION_WITH_RULE_ALTCHARS:
2456 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2457
2458 enum iso_code_class_type iso_code_class[256];
2459
2460 #define SAFE_CHARSET_P(coding, id) \
2461 ((id) <= (coding)->max_charset_id \
2462 && (coding)->safe_charsets[id] >= 0)
2463
2464
2465 #define SHIFT_OUT_OK(category) \
2466 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2467
2468 static void
2469 setup_iso_safe_charsets (attrs)
2470 Lisp_Object attrs;
2471 {
2472 Lisp_Object charset_list, safe_charsets;
2473 Lisp_Object request;
2474 Lisp_Object reg_usage;
2475 Lisp_Object tail;
2476 int reg94, reg96;
2477 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2478 int max_charset_id;
2479
2480 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2481 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2482 && ! EQ (charset_list, Viso_2022_charset_list))
2483 {
2484 CODING_ATTR_CHARSET_LIST (attrs)
2485 = charset_list = Viso_2022_charset_list;
2486 ASET (attrs, coding_attr_safe_charsets, Qnil);
2487 }
2488
2489 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2490 return;
2491
2492 max_charset_id = 0;
2493 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2494 {
2495 int id = XINT (XCAR (tail));
2496 if (max_charset_id < id)
2497 max_charset_id = id;
2498 }
2499
2500 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2501 make_number (255));
2502 request = AREF (attrs, coding_attr_iso_request);
2503 reg_usage = AREF (attrs, coding_attr_iso_usage);
2504 reg94 = XINT (XCAR (reg_usage));
2505 reg96 = XINT (XCDR (reg_usage));
2506
2507 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2508 {
2509 Lisp_Object id;
2510 Lisp_Object reg;
2511 struct charset *charset;
2512
2513 id = XCAR (tail);
2514 charset = CHARSET_FROM_ID (XINT (id));
2515 reg = Fcdr (Fassq (id, request));
2516 if (! NILP (reg))
2517 SSET (safe_charsets, XINT (id), XINT (reg));
2518 else if (charset->iso_chars_96)
2519 {
2520 if (reg96 < 4)
2521 SSET (safe_charsets, XINT (id), reg96);
2522 }
2523 else
2524 {
2525 if (reg94 < 4)
2526 SSET (safe_charsets, XINT (id), reg94);
2527 }
2528 }
2529 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2530 }
2531
2532
2533 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2534 Check if a text is encoded in one of ISO-2022 based codig systems.
2535 If it is, return 1, else return 0. */
2536
2537 static int
2538 detect_coding_iso_2022 (coding, detect_info)
2539 struct coding_system *coding;
2540 struct coding_detection_info *detect_info;
2541 {
2542 const unsigned char *src = coding->source, *src_base = src;
2543 const unsigned char *src_end = coding->source + coding->src_bytes;
2544 int multibytep = coding->src_multibyte;
2545 int single_shifting = 0;
2546 int id;
2547 int c, c1;
2548 int consumed_chars = 0;
2549 int i;
2550 int rejected = 0;
2551 int found = 0;
2552
2553 detect_info->checked |= CATEGORY_MASK_ISO;
2554
2555 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2556 {
2557 struct coding_system *this = &(coding_categories[i]);
2558 Lisp_Object attrs, val;
2559
2560 attrs = CODING_ID_ATTRS (this->id);
2561 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2562 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2563 setup_iso_safe_charsets (attrs);
2564 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2565 this->max_charset_id = SCHARS (val) - 1;
2566 this->safe_charsets = (char *) SDATA (val);
2567 }
2568
2569 /* A coding system of this category is always ASCII compatible. */
2570 src += coding->head_ascii;
2571
2572 while (rejected != CATEGORY_MASK_ISO)
2573 {
2574 src_base = src;
2575 ONE_MORE_BYTE (c);
2576 switch (c)
2577 {
2578 case ISO_CODE_ESC:
2579 if (inhibit_iso_escape_detection)
2580 break;
2581 single_shifting = 0;
2582 ONE_MORE_BYTE (c);
2583 if (c >= '(' && c <= '/')
2584 {
2585 /* Designation sequence for a charset of dimension 1. */
2586 ONE_MORE_BYTE (c1);
2587 if (c1 < ' ' || c1 >= 0x80
2588 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2589 /* Invalid designation sequence. Just ignore. */
2590 break;
2591 }
2592 else if (c == '$')
2593 {
2594 /* Designation sequence for a charset of dimension 2. */
2595 ONE_MORE_BYTE (c);
2596 if (c >= '@' && c <= 'B')
2597 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2598 id = iso_charset_table[1][0][c];
2599 else if (c >= '(' && c <= '/')
2600 {
2601 ONE_MORE_BYTE (c1);
2602 if (c1 < ' ' || c1 >= 0x80
2603 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2604 /* Invalid designation sequence. Just ignore. */
2605 break;
2606 }
2607 else
2608 /* Invalid designation sequence. Just ignore it. */
2609 break;
2610 }
2611 else if (c == 'N' || c == 'O')
2612 {
2613 /* ESC <Fe> for SS2 or SS3. */
2614 single_shifting = 1;
2615 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2616 break;
2617 }
2618 else if (c >= '0' && c <= '4')
2619 {
2620 /* ESC <Fp> for start/end composition. */
2621 found |= CATEGORY_MASK_ISO;
2622 break;
2623 }
2624 else
2625 {
2626 /* Invalid escape sequence. Just ignore it. */
2627 break;
2628 }
2629
2630 /* We found a valid designation sequence for CHARSET. */
2631 rejected |= CATEGORY_MASK_ISO_8BIT;
2632 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2633 id))
2634 found |= CATEGORY_MASK_ISO_7;
2635 else
2636 rejected |= CATEGORY_MASK_ISO_7;
2637 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2638 id))
2639 found |= CATEGORY_MASK_ISO_7_TIGHT;
2640 else
2641 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2642 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2643 id))
2644 found |= CATEGORY_MASK_ISO_7_ELSE;
2645 else
2646 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2647 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2648 id))
2649 found |= CATEGORY_MASK_ISO_8_ELSE;
2650 else
2651 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2652 break;
2653
2654 case ISO_CODE_SO:
2655 case ISO_CODE_SI:
2656 /* Locking shift out/in. */
2657 if (inhibit_iso_escape_detection)
2658 break;
2659 single_shifting = 0;
2660 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2661 found |= CATEGORY_MASK_ISO_ELSE;
2662 break;
2663
2664 case ISO_CODE_CSI:
2665 /* Control sequence introducer. */
2666 single_shifting = 0;
2667 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2668 found |= CATEGORY_MASK_ISO_8_ELSE;
2669 goto check_extra_latin;
2670
2671 case ISO_CODE_SS2:
2672 case ISO_CODE_SS3:
2673 /* Single shift. */
2674 if (inhibit_iso_escape_detection)
2675 break;
2676 single_shifting = 0;
2677 rejected |= CATEGORY_MASK_ISO_7BIT;
2678 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2679 & CODING_ISO_FLAG_SINGLE_SHIFT)
2680 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2681 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2682 & CODING_ISO_FLAG_SINGLE_SHIFT)
2683 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2684 if (single_shifting)
2685 break;
2686 goto check_extra_latin;
2687
2688 default:
2689 if (c < 0)
2690 continue;
2691 if (c < 0x80)
2692 {
2693 single_shifting = 0;
2694 break;
2695 }
2696 if (c >= 0xA0)
2697 {
2698 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2699 found |= CATEGORY_MASK_ISO_8_1;
2700 /* Check the length of succeeding codes of the range
2701 0xA0..0FF. If the byte length is even, we include
2702 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2703 only when we are not single shifting. */
2704 if (! single_shifting
2705 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2706 {
2707 int i = 1;
2708 while (src < src_end)
2709 {
2710 ONE_MORE_BYTE (c);
2711 if (c < 0xA0)
2712 break;
2713 i++;
2714 }
2715
2716 if (i & 1 && src < src_end)
2717 rejected |= CATEGORY_MASK_ISO_8_2;
2718 else
2719 found |= CATEGORY_MASK_ISO_8_2;
2720 }
2721 break;
2722 }
2723 check_extra_latin:
2724 single_shifting = 0;
2725 if (! VECTORP (Vlatin_extra_code_table)
2726 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2727 {
2728 rejected = CATEGORY_MASK_ISO;
2729 break;
2730 }
2731 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2732 & CODING_ISO_FLAG_LATIN_EXTRA)
2733 found |= CATEGORY_MASK_ISO_8_1;
2734 else
2735 rejected |= CATEGORY_MASK_ISO_8_1;
2736 rejected |= CATEGORY_MASK_ISO_8_2;
2737 }
2738 }
2739 detect_info->rejected |= CATEGORY_MASK_ISO;
2740 return 0;
2741
2742 no_more_source:
2743 detect_info->rejected |= rejected;
2744 detect_info->found |= (found & ~rejected);
2745 return 1;
2746 }
2747
2748
2749 /* Set designation state into CODING. Set CHARS_96 to -1 if the
2750 escape sequence should be kept. */
2751 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2752 do { \
2753 int id, prev; \
2754 \
2755 if (final < '0' || final >= 128 \
2756 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2757 || !SAFE_CHARSET_P (coding, id)) \
2758 { \
2759 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2760 chars_96 = -1; \
2761 break; \
2762 } \
2763 prev = CODING_ISO_DESIGNATION (coding, reg); \
2764 if (id == charset_jisx0201_roman) \
2765 { \
2766 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2767 id = charset_ascii; \
2768 } \
2769 else if (id == charset_jisx0208_1978) \
2770 { \
2771 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2772 id = charset_jisx0208; \
2773 } \
2774 CODING_ISO_DESIGNATION (coding, reg) = id; \
2775 /* If there was an invalid designation to REG previously, and this \
2776 designation is ASCII to REG, we should keep this designation \
2777 sequence. */ \
2778 if (prev == -2 && id == charset_ascii) \
2779 chars_96 = -1; \
2780 } while (0)
2781
2782
2783 #define MAYBE_FINISH_COMPOSITION() \
2784 do { \
2785 int i; \
2786 if (composition_state == COMPOSING_NO) \
2787 break; \
2788 /* It is assured that we have enough room for producing \
2789 characters stored in the table `components'. */ \
2790 if (charbuf + component_idx > charbuf_end) \
2791 goto no_more_source; \
2792 composition_state = COMPOSING_NO; \
2793 if (method == COMPOSITION_RELATIVE \
2794 || method == COMPOSITION_WITH_ALTCHARS) \
2795 { \
2796 for (i = 0; i < component_idx; i++) \
2797 *charbuf++ = components[i]; \
2798 char_offset += component_idx; \
2799 } \
2800 else \
2801 { \
2802 for (i = 0; i < component_idx; i += 2) \
2803 *charbuf++ = components[i]; \
2804 char_offset += (component_idx / 2) + 1; \
2805 } \
2806 } while (0)
2807
2808
2809 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2810 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2811 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2812 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2813 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2814 */
2815
2816 #define DECODE_COMPOSITION_START(c1) \
2817 do { \
2818 if (c1 == '0' \
2819 && composition_state == COMPOSING_COMPONENT_RULE) \
2820 { \
2821 component_len = component_idx; \
2822 composition_state = COMPOSING_CHAR; \
2823 } \
2824 else \
2825 { \
2826 const unsigned char *p; \
2827 \
2828 MAYBE_FINISH_COMPOSITION (); \
2829 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2830 goto no_more_source; \
2831 for (p = src; p < src_end - 1; p++) \
2832 if (*p == ISO_CODE_ESC && p[1] == '1') \
2833 break; \
2834 if (p == src_end - 1) \
2835 { \
2836 /* The current composition doesn't end in the current \
2837 source. */ \
2838 record_conversion_result \
2839 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
2840 goto no_more_source; \
2841 } \
2842 \
2843 /* This is surely the start of a composition. */ \
2844 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2845 : c1 == '2' ? COMPOSITION_WITH_RULE \
2846 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2847 : COMPOSITION_WITH_RULE_ALTCHARS); \
2848 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2849 : COMPOSING_COMPONENT_CHAR); \
2850 component_idx = component_len = 0; \
2851 } \
2852 } while (0)
2853
2854
2855 /* Handle compositoin end sequence ESC 1. */
2856
2857 #define DECODE_COMPOSITION_END() \
2858 do { \
2859 int nchars = (component_len > 0 ? component_idx - component_len \
2860 : method == COMPOSITION_RELATIVE ? component_idx \
2861 : (component_idx + 1) / 2); \
2862 int i; \
2863 int *saved_charbuf = charbuf; \
2864 \
2865 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2866 if (method != COMPOSITION_RELATIVE) \
2867 { \
2868 if (component_len == 0) \
2869 for (i = 0; i < component_idx; i++) \
2870 *charbuf++ = components[i]; \
2871 else \
2872 for (i = 0; i < component_len; i++) \
2873 *charbuf++ = components[i]; \
2874 *saved_charbuf = saved_charbuf - charbuf; \
2875 } \
2876 if (method == COMPOSITION_WITH_RULE) \
2877 for (i = 0; i < component_idx; i += 2, char_offset++) \
2878 *charbuf++ = components[i]; \
2879 else \
2880 for (i = component_len; i < component_idx; i++, char_offset++) \
2881 *charbuf++ = components[i]; \
2882 coding->annotated = 1; \
2883 composition_state = COMPOSING_NO; \
2884 } while (0)
2885
2886
2887 /* Decode a composition rule from the byte C1 (and maybe one more byte
2888 from SRC) and store one encoded composition rule in
2889 coding->cmp_data. */
2890
2891 #define DECODE_COMPOSITION_RULE(c1) \
2892 do { \
2893 (c1) -= 32; \
2894 if (c1 < 81) /* old format (before ver.21) */ \
2895 { \
2896 int gref = (c1) / 9; \
2897 int nref = (c1) % 9; \
2898 if (gref == 4) gref = 10; \
2899 if (nref == 4) nref = 10; \
2900 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2901 } \
2902 else if (c1 < 93) /* new format (after ver.21) */ \
2903 { \
2904 ONE_MORE_BYTE (c2); \
2905 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2906 } \
2907 else \
2908 c1 = 0; \
2909 } while (0)
2910
2911
2912 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2913
2914 static void
2915 decode_coding_iso_2022 (coding)
2916 struct coding_system *coding;
2917 {
2918 const unsigned char *src = coding->source + coding->consumed;
2919 const unsigned char *src_end = coding->source + coding->src_bytes;
2920 const unsigned char *src_base;
2921 int *charbuf = coding->charbuf + coding->charbuf_used;
2922 int *charbuf_end
2923 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2924 int consumed_chars = 0, consumed_chars_base;
2925 int multibytep = coding->src_multibyte;
2926 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2927 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2928 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2929 int charset_id_2, charset_id_3;
2930 struct charset *charset;
2931 int c;
2932 /* For handling composition sequence. */
2933 #define COMPOSING_NO 0
2934 #define COMPOSING_CHAR 1
2935 #define COMPOSING_RULE 2
2936 #define COMPOSING_COMPONENT_CHAR 3
2937 #define COMPOSING_COMPONENT_RULE 4
2938
2939 int composition_state = COMPOSING_NO;
2940 enum composition_method method;
2941 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2942 int component_idx;
2943 int component_len;
2944 Lisp_Object attrs, charset_list;
2945 int char_offset = coding->produced_char;
2946 int last_offset = char_offset;
2947 int last_id = charset_ascii;
2948
2949 CODING_GET_INFO (coding, attrs, charset_list);
2950 setup_iso_safe_charsets (attrs);
2951
2952 while (1)
2953 {
2954 int c1, c2;
2955
2956 src_base = src;
2957 consumed_chars_base = consumed_chars;
2958
2959 if (charbuf >= charbuf_end)
2960 break;
2961
2962 ONE_MORE_BYTE (c1);
2963 if (c1 < 0)
2964 goto invalid_code;
2965
2966 /* We produce at most one character. */
2967 switch (iso_code_class [c1])
2968 {
2969 case ISO_0x20_or_0x7F:
2970 if (composition_state != COMPOSING_NO)
2971 {
2972 if (composition_state == COMPOSING_RULE
2973 || composition_state == COMPOSING_COMPONENT_RULE)
2974 {
2975 DECODE_COMPOSITION_RULE (c1);
2976 components[component_idx++] = c1;
2977 composition_state--;
2978 continue;
2979 }
2980 }
2981 if (charset_id_0 < 0
2982 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2983 /* This is SPACE or DEL. */
2984 charset = CHARSET_FROM_ID (charset_ascii);
2985 else
2986 charset = CHARSET_FROM_ID (charset_id_0);
2987 break;
2988
2989 case ISO_graphic_plane_0:
2990 if (composition_state != COMPOSING_NO)
2991 {
2992 if (composition_state == COMPOSING_RULE
2993 || composition_state == COMPOSING_COMPONENT_RULE)
2994 {
2995 DECODE_COMPOSITION_RULE (c1);
2996 components[component_idx++] = c1;
2997 composition_state--;
2998 continue;
2999 }
3000 }
3001 if (charset_id_0 < 0)
3002 charset = CHARSET_FROM_ID (charset_ascii);
3003 else
3004 charset = CHARSET_FROM_ID (charset_id_0);
3005 break;
3006
3007 case ISO_0xA0_or_0xFF:
3008 if (charset_id_1 < 0
3009 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3010 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3011 goto invalid_code;
3012 /* This is a graphic character, we fall down ... */
3013
3014 case ISO_graphic_plane_1:
3015 if (charset_id_1 < 0)
3016 goto invalid_code;
3017 charset = CHARSET_FROM_ID (charset_id_1);
3018 break;
3019
3020 case ISO_control_0:
3021 MAYBE_FINISH_COMPOSITION ();
3022 charset = CHARSET_FROM_ID (charset_ascii);
3023 break;
3024
3025 case ISO_control_1:
3026 MAYBE_FINISH_COMPOSITION ();
3027 goto invalid_code;
3028
3029 case ISO_shift_out:
3030 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3031 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3032 goto invalid_code;
3033 CODING_ISO_INVOCATION (coding, 0) = 1;
3034 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3035 continue;
3036
3037 case ISO_shift_in:
3038 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3039 goto invalid_code;
3040 CODING_ISO_INVOCATION (coding, 0) = 0;
3041 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3042 continue;
3043
3044 case ISO_single_shift_2_7:
3045 case ISO_single_shift_2:
3046 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3047 goto invalid_code;
3048 /* SS2 is handled as an escape sequence of ESC 'N' */
3049 c1 = 'N';
3050 goto label_escape_sequence;
3051
3052 case ISO_single_shift_3:
3053 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3054 goto invalid_code;
3055 /* SS2 is handled as an escape sequence of ESC 'O' */
3056 c1 = 'O';
3057 goto label_escape_sequence;
3058
3059 case ISO_control_sequence_introducer:
3060 /* CSI is handled as an escape sequence of ESC '[' ... */
3061 c1 = '[';
3062 goto label_escape_sequence;
3063
3064 case ISO_escape:
3065 ONE_MORE_BYTE (c1);
3066 label_escape_sequence:
3067 /* Escape sequences handled here are invocation,
3068 designation, direction specification, and character
3069 composition specification. */
3070 switch (c1)
3071 {
3072 case '&': /* revision of following character set */
3073 ONE_MORE_BYTE (c1);
3074 if (!(c1 >= '@' && c1 <= '~'))
3075 goto invalid_code;
3076 ONE_MORE_BYTE (c1);
3077 if (c1 != ISO_CODE_ESC)
3078 goto invalid_code;
3079 ONE_MORE_BYTE (c1);
3080 goto label_escape_sequence;
3081
3082 case '$': /* designation of 2-byte character set */
3083 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3084 goto invalid_code;
3085 {
3086 int reg, chars96;
3087
3088 ONE_MORE_BYTE (c1);
3089 if (c1 >= '@' && c1 <= 'B')
3090 { /* designation of JISX0208.1978, GB2312.1980,
3091 or JISX0208.1980 */
3092 reg = 0, chars96 = 0;
3093 }
3094 else if (c1 >= 0x28 && c1 <= 0x2B)
3095 { /* designation of DIMENSION2_CHARS94 character set */
3096 reg = c1 - 0x28, chars96 = 0;
3097 ONE_MORE_BYTE (c1);
3098 }
3099 else if (c1 >= 0x2C && c1 <= 0x2F)
3100 { /* designation of DIMENSION2_CHARS96 character set */
3101 reg = c1 - 0x2C, chars96 = 1;
3102 ONE_MORE_BYTE (c1);
3103 }
3104 else
3105 goto invalid_code;
3106 DECODE_DESIGNATION (reg, 2, chars96, c1);
3107 /* We must update these variables now. */
3108 if (reg == 0)
3109 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3110 else if (reg == 1)
3111 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3112 if (chars96 < 0)
3113 goto invalid_code;
3114 }
3115 continue;
3116
3117 case 'n': /* invocation of locking-shift-2 */
3118 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3119 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3120 goto invalid_code;
3121 CODING_ISO_INVOCATION (coding, 0) = 2;
3122 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3123 continue;
3124
3125 case 'o': /* invocation of locking-shift-3 */
3126 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3127 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3128 goto invalid_code;
3129 CODING_ISO_INVOCATION (coding, 0) = 3;
3130 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3131 continue;
3132
3133 case 'N': /* invocation of single-shift-2 */
3134 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3135 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3136 goto invalid_code;
3137 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3138 if (charset_id_2 < 0)
3139 charset = CHARSET_FROM_ID (charset_ascii);
3140 else
3141 charset = CHARSET_FROM_ID (charset_id_2);
3142 ONE_MORE_BYTE (c1);
3143 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3144 goto invalid_code;
3145 break;
3146
3147 case 'O': /* invocation of single-shift-3 */
3148 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3149 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3150 goto invalid_code;
3151 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3152 if (charset_id_3 < 0)
3153 charset = CHARSET_FROM_ID (charset_ascii);
3154 else
3155 charset = CHARSET_FROM_ID (charset_id_3);
3156 ONE_MORE_BYTE (c1);
3157 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3158 goto invalid_code;
3159 break;
3160
3161 case '0': case '2': case '3': case '4': /* start composition */
3162 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3163 goto invalid_code;
3164 DECODE_COMPOSITION_START (c1);
3165 continue;
3166
3167 case '1': /* end composition */
3168 if (composition_state == COMPOSING_NO)
3169 goto invalid_code;
3170 DECODE_COMPOSITION_END ();
3171 continue;
3172
3173 case '[': /* specification of direction */
3174 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3175 goto invalid_code;
3176 /* For the moment, nested direction is not supported.
3177 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3178 left-to-right, and nozero means right-to-left. */
3179 ONE_MORE_BYTE (c1);
3180 switch (c1)
3181 {
3182 case ']': /* end of the current direction */
3183 coding->mode &= ~CODING_MODE_DIRECTION;
3184
3185 case '0': /* end of the current direction */
3186 case '1': /* start of left-to-right direction */
3187 ONE_MORE_BYTE (c1);
3188 if (c1 == ']')
3189 coding->mode &= ~CODING_MODE_DIRECTION;
3190 else
3191 goto invalid_code;
3192 break;
3193
3194 case '2': /* start of right-to-left direction */
3195 ONE_MORE_BYTE (c1);
3196 if (c1 == ']')
3197 coding->mode |= CODING_MODE_DIRECTION;
3198 else
3199 goto invalid_code;
3200 break;
3201
3202 default:
3203 goto invalid_code;
3204 }
3205 continue;
3206
3207 case '%':
3208 ONE_MORE_BYTE (c1);
3209 if (c1 == '/')
3210 {
3211 /* CTEXT extended segment:
3212 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3213 We keep these bytes as is for the moment.
3214 They may be decoded by post-read-conversion. */
3215 int dim, M, L;
3216 int size;
3217
3218 ONE_MORE_BYTE (dim);
3219 ONE_MORE_BYTE (M);
3220 ONE_MORE_BYTE (L);
3221 size = ((M - 128) * 128) + (L - 128);
3222 if (charbuf + 8 + size > charbuf_end)
3223 goto break_loop;
3224 *charbuf++ = ISO_CODE_ESC;
3225 *charbuf++ = '%';
3226 *charbuf++ = '/';
3227 *charbuf++ = dim;
3228 *charbuf++ = BYTE8_TO_CHAR (M);
3229 *charbuf++ = BYTE8_TO_CHAR (L);
3230 while (size-- > 0)
3231 {
3232 ONE_MORE_BYTE (c1);
3233 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3234 }
3235 }
3236 else if (c1 == 'G')
3237 {
3238 /* XFree86 extension for embedding UTF-8 in CTEXT:
3239 ESC % G --UTF-8-BYTES-- ESC % @
3240 We keep these bytes as is for the moment.
3241 They may be decoded by post-read-conversion. */
3242 int *p = charbuf;
3243
3244 if (p + 6 > charbuf_end)
3245 goto break_loop;
3246 *p++ = ISO_CODE_ESC;
3247 *p++ = '%';
3248 *p++ = 'G';
3249 while (p < charbuf_end)
3250 {
3251 ONE_MORE_BYTE (c1);
3252 if (c1 == ISO_CODE_ESC
3253 && src + 1 < src_end
3254 && src[0] == '%'
3255 && src[1] == '@')
3256 {
3257 src += 2;
3258 break;
3259 }
3260 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3261 }
3262 if (p + 3 > charbuf_end)
3263 goto break_loop;
3264 *p++ = ISO_CODE_ESC;
3265 *p++ = '%';
3266 *p++ = '@';
3267 charbuf = p;
3268 }
3269 else
3270 goto invalid_code;
3271 continue;
3272 break;
3273
3274 default:
3275 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3276 goto invalid_code;
3277 {
3278 int reg, chars96;
3279
3280 if (c1 >= 0x28 && c1 <= 0x2B)
3281 { /* designation of DIMENSION1_CHARS94 character set */
3282 reg = c1 - 0x28, chars96 = 0;
3283 ONE_MORE_BYTE (c1);
3284 }
3285 else if (c1 >= 0x2C && c1 <= 0x2F)
3286 { /* designation of DIMENSION1_CHARS96 character set */
3287 reg = c1 - 0x2C, chars96 = 1;
3288 ONE_MORE_BYTE (c1);
3289 }
3290 else
3291 goto invalid_code;
3292 DECODE_DESIGNATION (reg, 1, chars96, c1);
3293 /* We must update these variables now. */
3294 if (reg == 0)
3295 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3296 else if (reg == 1)
3297 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3298 if (chars96 < 0)
3299 goto invalid_code;
3300 }
3301 continue;
3302 }
3303 }
3304
3305 if (charset->id != charset_ascii
3306 && last_id != charset->id)
3307 {
3308 if (last_id != charset_ascii)
3309 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3310 last_id = charset->id;
3311 last_offset = char_offset;
3312 }
3313
3314 /* Now we know CHARSET and 1st position code C1 of a character.
3315 Produce a decoded character while getting 2nd position code
3316 C2 if necessary. */
3317 c1 &= 0x7F;
3318 if (CHARSET_DIMENSION (charset) > 1)
3319 {
3320 ONE_MORE_BYTE (c2);
3321 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3322 /* C2 is not in a valid range. */
3323 goto invalid_code;
3324 c1 = (c1 << 8) | (c2 & 0x7F);
3325 if (CHARSET_DIMENSION (charset) > 2)
3326 {
3327 ONE_MORE_BYTE (c2);
3328 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3329 /* C2 is not in a valid range. */
3330 goto invalid_code;
3331 c1 = (c1 << 8) | (c2 & 0x7F);
3332 }
3333 }
3334
3335 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3336 if (c < 0)
3337 {
3338 MAYBE_FINISH_COMPOSITION ();
3339 for (; src_base < src; src_base++, char_offset++)
3340 {
3341 if (ASCII_BYTE_P (*src_base))
3342 *charbuf++ = *src_base;
3343 else
3344 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3345 }
3346 }
3347 else if (composition_state == COMPOSING_NO)
3348 {
3349 *charbuf++ = c;
3350 char_offset++;
3351 }
3352 else
3353 {
3354 components[component_idx++] = c;
3355 if (method == COMPOSITION_WITH_RULE
3356 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3357 && composition_state == COMPOSING_COMPONENT_CHAR))
3358 composition_state++;
3359 }
3360 continue;
3361
3362 invalid_code:
3363 MAYBE_FINISH_COMPOSITION ();
3364 src = src_base;
3365 consumed_chars = consumed_chars_base;
3366 ONE_MORE_BYTE (c);
3367 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3368 char_offset++;
3369 coding->errors++;
3370 continue;
3371
3372 break_loop:
3373 break;
3374 }
3375
3376 no_more_source:
3377 if (last_id != charset_ascii)
3378 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3379 coding->consumed_char += consumed_chars_base;
3380 coding->consumed = src_base - coding->source;
3381 coding->charbuf_used = charbuf - coding->charbuf;
3382 }
3383
3384
3385 /* ISO2022 encoding stuff. */
3386
3387 /*
3388 It is not enough to say just "ISO2022" on encoding, we have to
3389 specify more details. In Emacs, each coding system of ISO2022
3390 variant has the following specifications:
3391 1. Initial designation to G0 thru G3.
3392 2. Allows short-form designation?
3393 3. ASCII should be designated to G0 before control characters?
3394 4. ASCII should be designated to G0 at end of line?
3395 5. 7-bit environment or 8-bit environment?
3396 6. Use locking-shift?
3397 7. Use Single-shift?
3398 And the following two are only for Japanese:
3399 8. Use ASCII in place of JIS0201-1976-Roman?
3400 9. Use JISX0208-1983 in place of JISX0208-1978?
3401 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3402 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3403 details.
3404 */
3405
3406 /* Produce codes (escape sequence) for designating CHARSET to graphic
3407 register REG at DST, and increment DST. If <final-char> of CHARSET is
3408 '@', 'A', or 'B' and the coding system CODING allows, produce
3409 designation sequence of short-form. */
3410
3411 #define ENCODE_DESIGNATION(charset, reg, coding) \
3412 do { \
3413 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3414 char *intermediate_char_94 = "()*+"; \
3415 char *intermediate_char_96 = ",-./"; \
3416 int revision = -1; \
3417 int c; \
3418 \
3419 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3420 revision = CHARSET_ISO_REVISION (charset); \
3421 \
3422 if (revision >= 0) \
3423 { \
3424 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3425 EMIT_ONE_BYTE ('@' + revision); \
3426 } \
3427 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3428 if (CHARSET_DIMENSION (charset) == 1) \
3429 { \
3430 if (! CHARSET_ISO_CHARS_96 (charset)) \
3431 c = intermediate_char_94[reg]; \
3432 else \
3433 c = intermediate_char_96[reg]; \
3434 EMIT_ONE_ASCII_BYTE (c); \
3435 } \
3436 else \
3437 { \
3438 EMIT_ONE_ASCII_BYTE ('$'); \
3439 if (! CHARSET_ISO_CHARS_96 (charset)) \
3440 { \
3441 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3442 || reg != 0 \
3443 || final_char < '@' || final_char > 'B') \
3444 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3445 } \
3446 else \
3447 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3448 } \
3449 EMIT_ONE_ASCII_BYTE (final_char); \
3450 \
3451 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3452 } while (0)
3453
3454
3455 /* The following two macros produce codes (control character or escape
3456 sequence) for ISO2022 single-shift functions (single-shift-2 and
3457 single-shift-3). */
3458
3459 #define ENCODE_SINGLE_SHIFT_2 \
3460 do { \
3461 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3462 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3463 else \
3464 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3465 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3466 } while (0)
3467
3468
3469 #define ENCODE_SINGLE_SHIFT_3 \
3470 do { \
3471 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3472 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3473 else \
3474 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3475 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3476 } while (0)
3477
3478
3479 /* The following four macros produce codes (control character or
3480 escape sequence) for ISO2022 locking-shift functions (shift-in,
3481 shift-out, locking-shift-2, and locking-shift-3). */
3482
3483 #define ENCODE_SHIFT_IN \
3484 do { \
3485 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3486 CODING_ISO_INVOCATION (coding, 0) = 0; \
3487 } while (0)
3488
3489
3490 #define ENCODE_SHIFT_OUT \
3491 do { \
3492 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3493 CODING_ISO_INVOCATION (coding, 0) = 1; \
3494 } while (0)
3495
3496
3497 #define ENCODE_LOCKING_SHIFT_2 \
3498 do { \
3499 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3500 CODING_ISO_INVOCATION (coding, 0) = 2; \
3501 } while (0)
3502
3503
3504 #define ENCODE_LOCKING_SHIFT_3 \
3505 do { \
3506 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3507 CODING_ISO_INVOCATION (coding, 0) = 3; \
3508 } while (0)
3509
3510
3511 /* Produce codes for a DIMENSION1 character whose character set is
3512 CHARSET and whose position-code is C1. Designation and invocation
3513 sequences are also produced in advance if necessary. */
3514
3515 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3516 do { \
3517 int id = CHARSET_ID (charset); \
3518 \
3519 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3520 && id == charset_ascii) \
3521 { \
3522 id = charset_jisx0201_roman; \
3523 charset = CHARSET_FROM_ID (id); \
3524 } \
3525 \
3526 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3527 { \
3528 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3529 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3530 else \
3531 EMIT_ONE_BYTE (c1 | 0x80); \
3532 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3533 break; \
3534 } \
3535 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3536 { \
3537 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3538 break; \
3539 } \
3540 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3541 { \
3542 EMIT_ONE_BYTE (c1 | 0x80); \
3543 break; \
3544 } \
3545 else \
3546 /* Since CHARSET is not yet invoked to any graphic planes, we \
3547 must invoke it, or, at first, designate it to some graphic \
3548 register. Then repeat the loop to actually produce the \
3549 character. */ \
3550 dst = encode_invocation_designation (charset, coding, dst, \
3551 &produced_chars); \
3552 } while (1)
3553
3554
3555 /* Produce codes for a DIMENSION2 character whose character set is
3556 CHARSET and whose position-codes are C1 and C2. Designation and
3557 invocation codes are also produced in advance if necessary. */
3558
3559 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3560 do { \
3561 int id = CHARSET_ID (charset); \
3562 \
3563 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3564 && id == charset_jisx0208) \
3565 { \
3566 id = charset_jisx0208_1978; \
3567 charset = CHARSET_FROM_ID (id); \
3568 } \
3569 \
3570 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3571 { \
3572 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3573 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3574 else \
3575 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3576 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3577 break; \
3578 } \
3579 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3580 { \
3581 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3582 break; \
3583 } \
3584 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3585 { \
3586 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3587 break; \
3588 } \
3589 else \
3590 /* Since CHARSET is not yet invoked to any graphic planes, we \
3591 must invoke it, or, at first, designate it to some graphic \
3592 register. Then repeat the loop to actually produce the \
3593 character. */ \
3594 dst = encode_invocation_designation (charset, coding, dst, \
3595 &produced_chars); \
3596 } while (1)
3597
3598
3599 #define ENCODE_ISO_CHARACTER(charset, c) \
3600 do { \
3601 int code = ENCODE_CHAR ((charset),(c)); \
3602 \
3603 if (CHARSET_DIMENSION (charset) == 1) \
3604 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3605 else \
3606 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3607 } while (0)
3608
3609
3610 /* Produce designation and invocation codes at a place pointed by DST
3611 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3612 Return new DST. */
3613
3614 unsigned char *
3615 encode_invocation_designation (charset, coding, dst, p_nchars)
3616 struct charset *charset;
3617 struct coding_system *coding;
3618 unsigned char *dst;
3619 int *p_nchars;
3620 {
3621 int multibytep = coding->dst_multibyte;
3622 int produced_chars = *p_nchars;
3623 int reg; /* graphic register number */
3624 int id = CHARSET_ID (charset);
3625
3626 /* At first, check designations. */
3627 for (reg = 0; reg < 4; reg++)
3628 if (id == CODING_ISO_DESIGNATION (coding, reg))
3629 break;
3630
3631 if (reg >= 4)
3632 {
3633 /* CHARSET is not yet designated to any graphic registers. */
3634 /* At first check the requested designation. */
3635 reg = CODING_ISO_REQUEST (coding, id);
3636 if (reg < 0)
3637 /* Since CHARSET requests no special designation, designate it
3638 to graphic register 0. */
3639 reg = 0;
3640
3641 ENCODE_DESIGNATION (charset, reg, coding);
3642 }
3643
3644 if (CODING_ISO_INVOCATION (coding, 0) != reg
3645 && CODING_ISO_INVOCATION (coding, 1) != reg)
3646 {
3647 /* Since the graphic register REG is not invoked to any graphic
3648 planes, invoke it to graphic plane 0. */
3649 switch (reg)
3650 {
3651 case 0: /* graphic register 0 */
3652 ENCODE_SHIFT_IN;
3653 break;
3654
3655 case 1: /* graphic register 1 */
3656 ENCODE_SHIFT_OUT;
3657 break;
3658
3659 case 2: /* graphic register 2 */
3660 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3661 ENCODE_SINGLE_SHIFT_2;
3662 else
3663 ENCODE_LOCKING_SHIFT_2;
3664 break;
3665
3666 case 3: /* graphic register 3 */
3667 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3668 ENCODE_SINGLE_SHIFT_3;
3669 else
3670 ENCODE_LOCKING_SHIFT_3;
3671 break;
3672 }
3673 }
3674
3675 *p_nchars = produced_chars;
3676 return dst;
3677 }
3678
3679 /* The following three macros produce codes for indicating direction
3680 of text. */
3681 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3682 do { \
3683 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3684 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3685 else \
3686 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3687 } while (0)
3688
3689
3690 #define ENCODE_DIRECTION_R2L() \
3691 do { \
3692 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3693 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3694 } while (0)
3695
3696
3697 #define ENCODE_DIRECTION_L2R() \
3698 do { \
3699 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3700 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3701 } while (0)
3702
3703
3704 /* Produce codes for designation and invocation to reset the graphic
3705 planes and registers to initial state. */
3706 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3707 do { \
3708 int reg; \
3709 struct charset *charset; \
3710 \
3711 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3712 ENCODE_SHIFT_IN; \
3713 for (reg = 0; reg < 4; reg++) \
3714 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3715 && (CODING_ISO_DESIGNATION (coding, reg) \
3716 != CODING_ISO_INITIAL (coding, reg))) \
3717 { \
3718 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3719 ENCODE_DESIGNATION (charset, reg, coding); \
3720 } \
3721 } while (0)
3722
3723
3724 /* Produce designation sequences of charsets in the line started from
3725 SRC to a place pointed by DST, and return updated DST.
3726
3727 If the current block ends before any end-of-line, we may fail to
3728 find all the necessary designations. */
3729
3730 static unsigned char *
3731 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3732 struct coding_system *coding;
3733 int *charbuf, *charbuf_end;
3734 unsigned char *dst;
3735 {
3736 struct charset *charset;
3737 /* Table of charsets to be designated to each graphic register. */
3738 int r[4];
3739 int c, found = 0, reg;
3740 int produced_chars = 0;
3741 int multibytep = coding->dst_multibyte;
3742 Lisp_Object attrs;
3743 Lisp_Object charset_list;
3744
3745 attrs = CODING_ID_ATTRS (coding->id);
3746 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3747 if (EQ (charset_list, Qiso_2022))
3748 charset_list = Viso_2022_charset_list;
3749
3750 for (reg = 0; reg < 4; reg++)
3751 r[reg] = -1;
3752
3753 while (found < 4)
3754 {
3755 int id;
3756
3757 c = *charbuf++;
3758 if (c == '\n')
3759 break;
3760 charset = char_charset (c, charset_list, NULL);
3761 id = CHARSET_ID (charset);
3762 reg = CODING_ISO_REQUEST (coding, id);
3763 if (reg >= 0 && r[reg] < 0)
3764 {
3765 found++;
3766 r[reg] = id;
3767 }
3768 }
3769
3770 if (found)
3771 {
3772 for (reg = 0; reg < 4; reg++)
3773 if (r[reg] >= 0
3774 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3775 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3776 }
3777
3778 return dst;
3779 }
3780
3781 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3782
3783 static int
3784 encode_coding_iso_2022 (coding)
3785 struct coding_system *coding;
3786 {
3787 int multibytep = coding->dst_multibyte;
3788 int *charbuf = coding->charbuf;
3789 int *charbuf_end = charbuf + coding->charbuf_used;
3790 unsigned char *dst = coding->destination + coding->produced;
3791 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3792 int safe_room = 16;
3793 int bol_designation
3794 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3795 && CODING_ISO_BOL (coding));
3796 int produced_chars = 0;
3797 Lisp_Object attrs, eol_type, charset_list;
3798 int ascii_compatible;
3799 int c;
3800 int preferred_charset_id = -1;
3801
3802 CODING_GET_INFO (coding, attrs, charset_list);
3803 eol_type = CODING_ID_EOL_TYPE (coding->id);
3804 if (VECTORP (eol_type))
3805 eol_type = Qunix;
3806
3807 setup_iso_safe_charsets (attrs);
3808 /* Charset list may have been changed. */
3809 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3810 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3811
3812 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3813
3814 while (charbuf < charbuf_end)
3815 {
3816 ASSURE_DESTINATION (safe_room);
3817
3818 if (bol_designation)
3819 {
3820 unsigned char *dst_prev = dst;
3821
3822 /* We have to produce designation sequences if any now. */
3823 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3824 bol_designation = 0;
3825 /* We are sure that designation sequences are all ASCII bytes. */
3826 produced_chars += dst - dst_prev;
3827 }
3828
3829 c = *charbuf++;
3830
3831 if (c < 0)
3832 {
3833 /* Handle an annotation. */
3834 switch (*charbuf)
3835 {
3836 case CODING_ANNOTATE_COMPOSITION_MASK:
3837 /* Not yet implemented. */
3838 break;
3839 case CODING_ANNOTATE_CHARSET_MASK:
3840 preferred_charset_id = charbuf[2];
3841 if (preferred_charset_id >= 0
3842 && NILP (Fmemq (make_number (preferred_charset_id),
3843 charset_list)))
3844 preferred_charset_id = -1;
3845 break;
3846 default:
3847 abort ();
3848 }
3849 charbuf += -c - 1;
3850 continue;
3851 }
3852
3853 /* Now encode the character C. */
3854 if (c < 0x20 || c == 0x7F)
3855 {
3856 if (c == '\n'
3857 || (c == '\r' && EQ (eol_type, Qmac)))
3858 {
3859 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3860 ENCODE_RESET_PLANE_AND_REGISTER ();
3861 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3862 {
3863 int i;
3864
3865 for (i = 0; i < 4; i++)
3866 CODING_ISO_DESIGNATION (coding, i)
3867 = CODING_ISO_INITIAL (coding, i);
3868 }
3869 bol_designation
3870 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3871 }
3872 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3873 ENCODE_RESET_PLANE_AND_REGISTER ();
3874 EMIT_ONE_ASCII_BYTE (c);
3875 }
3876 else if (ASCII_CHAR_P (c))
3877 {
3878 if (ascii_compatible)
3879 EMIT_ONE_ASCII_BYTE (c);
3880 else
3881 {
3882 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3883 ENCODE_ISO_CHARACTER (charset, c);
3884 }
3885 }
3886 else if (CHAR_BYTE8_P (c))
3887 {
3888 c = CHAR_TO_BYTE8 (c);
3889 EMIT_ONE_BYTE (c);
3890 }
3891 else
3892 {
3893 struct charset *charset;
3894
3895 if (preferred_charset_id >= 0)
3896 {
3897 charset = CHARSET_FROM_ID (preferred_charset_id);
3898 if (! CHAR_CHARSET_P (c, charset))
3899 charset = char_charset (c, charset_list, NULL);
3900 }
3901 else
3902 charset = char_charset (c, charset_list, NULL);
3903 if (!charset)
3904 {
3905 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3906 {
3907 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3908 charset = CHARSET_FROM_ID (charset_ascii);
3909 }
3910 else
3911 {
3912 c = coding->default_char;
3913 charset = char_charset (c, charset_list, NULL);
3914 }
3915 }
3916 ENCODE_ISO_CHARACTER (charset, c);
3917 }
3918 }
3919
3920 if (coding->mode & CODING_MODE_LAST_BLOCK
3921 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3922 {
3923 ASSURE_DESTINATION (safe_room);
3924 ENCODE_RESET_PLANE_AND_REGISTER ();
3925 }
3926 record_conversion_result (coding, CODING_RESULT_SUCCESS);
3927 CODING_ISO_BOL (coding) = bol_designation;
3928 coding->produced_char += produced_chars;
3929 coding->produced = dst - coding->destination;
3930 return 0;
3931 }
3932
3933 \f
3934 /*** 8,9. SJIS and BIG5 handlers ***/
3935
3936 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3937 quite widely. So, for the moment, Emacs supports them in the bare
3938 C code. But, in the future, they may be supported only by CCL. */
3939
3940 /* SJIS is a coding system encoding three character sets: ASCII, right
3941 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3942 as is. A character of charset katakana-jisx0201 is encoded by
3943 "position-code + 0x80". A character of charset japanese-jisx0208
3944 is encoded in 2-byte but two position-codes are divided and shifted
3945 so that it fit in the range below.
3946
3947 --- CODE RANGE of SJIS ---
3948 (character set) (range)
3949 ASCII 0x00 .. 0x7F
3950 KATAKANA-JISX0201 0xA0 .. 0xDF
3951 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3952 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3953 -------------------------------
3954
3955 */
3956
3957 /* BIG5 is a coding system encoding two character sets: ASCII and
3958 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3959 character set and is encoded in two-byte.
3960
3961 --- CODE RANGE of BIG5 ---
3962 (character set) (range)
3963 ASCII 0x00 .. 0x7F
3964 Big5 (1st byte) 0xA1 .. 0xFE
3965 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3966 --------------------------
3967
3968 */
3969
3970 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3971 Check if a text is encoded in SJIS. If it is, return
3972 CATEGORY_MASK_SJIS, else return 0. */
3973
3974 static int
3975 detect_coding_sjis (coding, detect_info)
3976 struct coding_system *coding;
3977 struct coding_detection_info *detect_info;
3978 {
3979 const unsigned char *src = coding->source, *src_base;
3980 const unsigned char *src_end = coding->source + coding->src_bytes;
3981 int multibytep = coding->src_multibyte;
3982 int consumed_chars = 0;
3983 int found = 0;
3984 int c;
3985
3986 detect_info->checked |= CATEGORY_MASK_SJIS;
3987 /* A coding system of this category is always ASCII compatible. */
3988 src += coding->head_ascii;
3989
3990 while (1)
3991 {
3992 src_base = src;
3993 ONE_MORE_BYTE (c);
3994 if (c < 0x80)
3995 continue;
3996 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3997 {
3998 ONE_MORE_BYTE (c);
3999 if (c < 0x40 || c == 0x7F || c > 0xFC)
4000 break;
4001 found = CATEGORY_MASK_SJIS;
4002 }
4003 else if (c >= 0xA0 && c < 0xE0)
4004 found = CATEGORY_MASK_SJIS;
4005 else
4006 break;
4007 }
4008 detect_info->rejected |= CATEGORY_MASK_SJIS;
4009 return 0;
4010
4011 no_more_source:
4012 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4013 {
4014 detect_info->rejected |= CATEGORY_MASK_SJIS;
4015 return 0;
4016 }
4017 detect_info->found |= found;
4018 return 1;
4019 }
4020
4021 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4022 Check if a text is encoded in BIG5. If it is, return
4023 CATEGORY_MASK_BIG5, else return 0. */
4024
4025 static int
4026 detect_coding_big5 (coding, detect_info)
4027 struct coding_system *coding;
4028 struct coding_detection_info *detect_info;
4029 {
4030 const unsigned char *src = coding->source, *src_base;
4031 const unsigned char *src_end = coding->source + coding->src_bytes;
4032 int multibytep = coding->src_multibyte;
4033 int consumed_chars = 0;
4034 int found = 0;
4035 int c;
4036
4037 detect_info->checked |= CATEGORY_MASK_BIG5;
4038 /* A coding system of this category is always ASCII compatible. */
4039 src += coding->head_ascii;
4040
4041 while (1)
4042 {
4043 src_base = src;
4044 ONE_MORE_BYTE (c);
4045 if (c < 0x80)
4046 continue;
4047 if (c >= 0xA1)
4048 {
4049 ONE_MORE_BYTE (c);
4050 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4051 return 0;
4052 found = CATEGORY_MASK_BIG5;
4053 }
4054 else
4055 break;
4056 }
4057 detect_info->rejected |= CATEGORY_MASK_BIG5;
4058 return 0;
4059
4060 no_more_source:
4061 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4062 {
4063 detect_info->rejected |= CATEGORY_MASK_BIG5;
4064 return 0;
4065 }
4066 detect_info->found |= found;
4067 return 1;
4068 }
4069
4070 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4071 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4072
4073 static void
4074 decode_coding_sjis (coding)
4075 struct coding_system *coding;
4076 {
4077 const unsigned char *src = coding->source + coding->consumed;
4078 const unsigned char *src_end = coding->source + coding->src_bytes;
4079 const unsigned char *src_base;
4080 int *charbuf = coding->charbuf + coding->charbuf_used;
4081 int *charbuf_end
4082 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4083 int consumed_chars = 0, consumed_chars_base;
4084 int multibytep = coding->src_multibyte;
4085 struct charset *charset_roman, *charset_kanji, *charset_kana;
4086 struct charset *charset_kanji2;
4087 Lisp_Object attrs, charset_list, val;
4088 int char_offset = coding->produced_char;
4089 int last_offset = char_offset;
4090 int last_id = charset_ascii;
4091
4092 CODING_GET_INFO (coding, attrs, charset_list);
4093
4094 val = charset_list;
4095 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4096 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4097 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4098 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4099
4100 while (1)
4101 {
4102 int c, c1;
4103 struct charset *charset;
4104
4105 src_base = src;
4106 consumed_chars_base = consumed_chars;
4107
4108 if (charbuf >= charbuf_end)
4109 break;
4110
4111 ONE_MORE_BYTE (c);
4112 if (c < 0)
4113 goto invalid_code;
4114 if (c < 0x80)
4115 charset = charset_roman;
4116 else if (c == 0x80 || c == 0xA0)
4117 goto invalid_code;
4118 else if (c >= 0xA1 && c <= 0xDF)
4119 {
4120 /* SJIS -> JISX0201-Kana */
4121 c &= 0x7F;
4122 charset = charset_kana;
4123 }
4124 else if (c <= 0xEF)
4125 {
4126 /* SJIS -> JISX0208 */
4127 ONE_MORE_BYTE (c1);
4128 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4129 goto invalid_code;
4130 c = (c << 8) | c1;
4131 SJIS_TO_JIS (c);
4132 charset = charset_kanji;
4133 }
4134 else if (c <= 0xFC && charset_kanji2)
4135 {
4136 /* SJIS -> JISX0213-2 */
4137 ONE_MORE_BYTE (c1);
4138 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4139 goto invalid_code;
4140 c = (c << 8) | c1;
4141 SJIS_TO_JIS2 (c);
4142 charset = charset_kanji2;
4143 }
4144 else
4145 goto invalid_code;
4146 if (charset->id != charset_ascii
4147 && last_id != charset->id)
4148 {
4149 if (last_id != charset_ascii)
4150 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4151 last_id = charset->id;
4152 last_offset = char_offset;
4153 }
4154 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4155 *charbuf++ = c;
4156 char_offset++;
4157 continue;
4158
4159 invalid_code:
4160 src = src_base;
4161 consumed_chars = consumed_chars_base;
4162 ONE_MORE_BYTE (c);
4163 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4164 char_offset++;
4165 coding->errors++;
4166 }
4167
4168 no_more_source:
4169 if (last_id != charset_ascii)
4170 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4171 coding->consumed_char += consumed_chars_base;
4172 coding->consumed = src_base - coding->source;
4173 coding->charbuf_used = charbuf - coding->charbuf;
4174 }
4175
4176 static void
4177 decode_coding_big5 (coding)
4178 struct coding_system *coding;
4179 {
4180 const unsigned char *src = coding->source + coding->consumed;
4181 const unsigned char *src_end = coding->source + coding->src_bytes;
4182 const unsigned char *src_base;
4183 int *charbuf = coding->charbuf + coding->charbuf_used;
4184 int *charbuf_end
4185 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4186 int consumed_chars = 0, consumed_chars_base;
4187 int multibytep = coding->src_multibyte;
4188 struct charset *charset_roman, *charset_big5;
4189 Lisp_Object attrs, charset_list, val;
4190 int char_offset = coding->produced_char;
4191 int last_offset = char_offset;
4192 int last_id = charset_ascii;
4193
4194 CODING_GET_INFO (coding, attrs, charset_list);
4195 val = charset_list;
4196 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4197 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4198
4199 while (1)
4200 {
4201 int c, c1;
4202 struct charset *charset;
4203
4204 src_base = src;
4205 consumed_chars_base = consumed_chars;
4206
4207 if (charbuf >= charbuf_end)
4208 break;
4209
4210 ONE_MORE_BYTE (c);
4211
4212 if (c < 0)
4213 goto invalid_code;
4214 if (c < 0x80)
4215 charset = charset_roman;
4216 else
4217 {
4218 /* BIG5 -> Big5 */
4219 if (c < 0xA1 || c > 0xFE)
4220 goto invalid_code;
4221 ONE_MORE_BYTE (c1);
4222 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4223 goto invalid_code;
4224 c = c << 8 | c1;
4225 charset = charset_big5;
4226 }
4227 if (charset->id != charset_ascii
4228 && last_id != charset->id)
4229 {
4230 if (last_id != charset_ascii)
4231 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4232 last_id = charset->id;
4233 last_offset = char_offset;
4234 }
4235 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4236 *charbuf++ = c;
4237 char_offset++;
4238 continue;
4239
4240 invalid_code:
4241 src = src_base;
4242 consumed_chars = consumed_chars_base;
4243 ONE_MORE_BYTE (c);
4244 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4245 char_offset++;
4246 coding->errors++;
4247 }
4248
4249 no_more_source:
4250 if (last_id != charset_ascii)
4251 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4252 coding->consumed_char += consumed_chars_base;
4253 coding->consumed = src_base - coding->source;
4254 coding->charbuf_used = charbuf - coding->charbuf;
4255 }
4256
4257 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4258 This function can encode charsets `ascii', `katakana-jisx0201',
4259 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4260 are sure that all these charsets are registered as official charset
4261 (i.e. do not have extended leading-codes). Characters of other
4262 charsets are produced without any encoding. If SJIS_P is 1, encode
4263 SJIS text, else encode BIG5 text. */
4264
4265 static int
4266 encode_coding_sjis (coding)
4267 struct coding_system *coding;
4268 {
4269 int multibytep = coding->dst_multibyte;
4270 int *charbuf = coding->charbuf;
4271 int *charbuf_end = charbuf + coding->charbuf_used;
4272 unsigned char *dst = coding->destination + coding->produced;
4273 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4274 int safe_room = 4;
4275 int produced_chars = 0;
4276 Lisp_Object attrs, charset_list, val;
4277 int ascii_compatible;
4278 struct charset *charset_roman, *charset_kanji, *charset_kana;
4279 struct charset *charset_kanji2;
4280 int c;
4281
4282 CODING_GET_INFO (coding, attrs, charset_list);
4283 val = charset_list;
4284 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4285 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4286 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4287 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4288
4289 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4290
4291 while (charbuf < charbuf_end)
4292 {
4293 ASSURE_DESTINATION (safe_room);
4294 c = *charbuf++;
4295 /* Now encode the character C. */
4296 if (ASCII_CHAR_P (c) && ascii_compatible)
4297 EMIT_ONE_ASCII_BYTE (c);
4298 else if (CHAR_BYTE8_P (c))
4299 {
4300 c = CHAR_TO_BYTE8 (c);
4301 EMIT_ONE_BYTE (c);
4302 }
4303 else
4304 {
4305 unsigned code;
4306 struct charset *charset = char_charset (c, charset_list, &code);
4307
4308 if (!charset)
4309 {
4310 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4311 {
4312 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4313 charset = CHARSET_FROM_ID (charset_ascii);
4314 }
4315 else
4316 {
4317 c = coding->default_char;
4318 charset = char_charset (c, charset_list, &code);
4319 }
4320 }
4321 if (code == CHARSET_INVALID_CODE (charset))
4322 abort ();
4323 if (charset == charset_kanji)
4324 {
4325 int c1, c2;
4326 JIS_TO_SJIS (code);
4327 c1 = code >> 8, c2 = code & 0xFF;
4328 EMIT_TWO_BYTES (c1, c2);
4329 }
4330 else if (charset == charset_kana)
4331 EMIT_ONE_BYTE (code | 0x80);
4332 else if (charset_kanji2 && charset == charset_kanji2)
4333 {
4334 int c1, c2;
4335
4336 c1 = code >> 8;
4337 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4338 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4339 {
4340 JIS_TO_SJIS2 (code);
4341 c1 = code >> 8, c2 = code & 0xFF;
4342 EMIT_TWO_BYTES (c1, c2);
4343 }
4344 else
4345 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4346 }
4347 else
4348 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4349 }
4350 }
4351 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4352 coding->produced_char += produced_chars;
4353 coding->produced = dst - coding->destination;
4354 return 0;
4355 }
4356
4357 static int
4358 encode_coding_big5 (coding)
4359 struct coding_system *coding;
4360 {
4361 int multibytep = coding->dst_multibyte;
4362 int *charbuf = coding->charbuf;
4363 int *charbuf_end = charbuf + coding->charbuf_used;
4364 unsigned char *dst = coding->destination + coding->produced;
4365 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4366 int safe_room = 4;
4367 int produced_chars = 0;
4368 Lisp_Object attrs, charset_list, val;
4369 int ascii_compatible;
4370 struct charset *charset_roman, *charset_big5;
4371 int c;
4372
4373 CODING_GET_INFO (coding, attrs, charset_list);
4374 val = charset_list;
4375 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4376 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4377 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4378
4379 while (charbuf < charbuf_end)
4380 {
4381 ASSURE_DESTINATION (safe_room);
4382 c = *charbuf++;
4383 /* Now encode the character C. */
4384 if (ASCII_CHAR_P (c) && ascii_compatible)
4385 EMIT_ONE_ASCII_BYTE (c);
4386 else if (CHAR_BYTE8_P (c))
4387 {
4388 c = CHAR_TO_BYTE8 (c);
4389 EMIT_ONE_BYTE (c);
4390 }
4391 else
4392 {
4393 unsigned code;
4394 struct charset *charset = char_charset (c, charset_list, &code);
4395
4396 if (! charset)
4397 {
4398 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4399 {
4400 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4401 charset = CHARSET_FROM_ID (charset_ascii);
4402 }
4403 else
4404 {
4405 c = coding->default_char;
4406 charset = char_charset (c, charset_list, &code);
4407 }
4408 }
4409 if (code == CHARSET_INVALID_CODE (charset))
4410 abort ();
4411 if (charset == charset_big5)
4412 {
4413 int c1, c2;
4414
4415 c1 = code >> 8, c2 = code & 0xFF;
4416 EMIT_TWO_BYTES (c1, c2);
4417 }
4418 else
4419 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4420 }
4421 }
4422 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4423 coding->produced_char += produced_chars;
4424 coding->produced = dst - coding->destination;
4425 return 0;
4426 }
4427
4428 \f
4429 /*** 10. CCL handlers ***/
4430
4431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4432 Check if a text is encoded in a coding system of which
4433 encoder/decoder are written in CCL program. If it is, return
4434 CATEGORY_MASK_CCL, else return 0. */
4435
4436 static int
4437 detect_coding_ccl (coding, detect_info)
4438 struct coding_system *coding;
4439 struct coding_detection_info *detect_info;
4440 {
4441 const unsigned char *src = coding->source, *src_base;
4442 const unsigned char *src_end = coding->source + coding->src_bytes;
4443 int multibytep = coding->src_multibyte;
4444 int consumed_chars = 0;
4445 int found = 0;
4446 unsigned char *valids;
4447 int head_ascii = coding->head_ascii;
4448 Lisp_Object attrs;
4449
4450 detect_info->checked |= CATEGORY_MASK_CCL;
4451
4452 coding = &coding_categories[coding_category_ccl];
4453 valids = CODING_CCL_VALIDS (coding);
4454 attrs = CODING_ID_ATTRS (coding->id);
4455 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4456 src += head_ascii;
4457
4458 while (1)
4459 {
4460 int c;
4461
4462 src_base = src;
4463 ONE_MORE_BYTE (c);
4464 if (c < 0 || ! valids[c])
4465 break;
4466 if ((valids[c] > 1))
4467 found = CATEGORY_MASK_CCL;
4468 }
4469 detect_info->rejected |= CATEGORY_MASK_CCL;
4470 return 0;
4471
4472 no_more_source:
4473 detect_info->found |= found;
4474 return 1;
4475 }
4476
4477 static void
4478 decode_coding_ccl (coding)
4479 struct coding_system *coding;
4480 {
4481 const unsigned char *src = coding->source + coding->consumed;
4482 const unsigned char *src_end = coding->source + coding->src_bytes;
4483 int *charbuf = coding->charbuf + coding->charbuf_used;
4484 int *charbuf_end = coding->charbuf + coding->charbuf_size;
4485 int consumed_chars = 0;
4486 int multibytep = coding->src_multibyte;
4487 struct ccl_program ccl;
4488 int source_charbuf[1024];
4489 int source_byteidx[1024];
4490 Lisp_Object attrs, charset_list;
4491
4492 CODING_GET_INFO (coding, attrs, charset_list);
4493 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4494
4495 while (src < src_end)
4496 {
4497 const unsigned char *p = src;
4498 int *source, *source_end;
4499 int i = 0;
4500
4501 if (multibytep)
4502 while (i < 1024 && p < src_end)
4503 {
4504 source_byteidx[i] = p - src;
4505 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4506 }
4507 else
4508 while (i < 1024 && p < src_end)
4509 source_charbuf[i++] = *p++;
4510
4511 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4512 ccl.last_block = 1;
4513
4514 source = source_charbuf;
4515 source_end = source + i;
4516 while (source < source_end)
4517 {
4518 ccl_driver (&ccl, source, charbuf,
4519 source_end - source, charbuf_end - charbuf,
4520 charset_list);
4521 source += ccl.consumed;
4522 charbuf += ccl.produced;
4523 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4524 break;
4525 }
4526 if (source < source_end)
4527 src += source_byteidx[source - source_charbuf];
4528 else
4529 src = p;
4530 consumed_chars += source - source_charbuf;
4531
4532 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4533 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4534 break;
4535 }
4536
4537 switch (ccl.status)
4538 {
4539 case CCL_STAT_SUSPEND_BY_SRC:
4540 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4541 break;
4542 case CCL_STAT_SUSPEND_BY_DST:
4543 break;
4544 case CCL_STAT_QUIT:
4545 case CCL_STAT_INVALID_CMD:
4546 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4547 break;
4548 default:
4549 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4550 break;
4551 }
4552 coding->consumed_char += consumed_chars;
4553 coding->consumed = src - coding->source;
4554 coding->charbuf_used = charbuf - coding->charbuf;
4555 }
4556
4557 static int
4558 encode_coding_ccl (coding)
4559 struct coding_system *coding;
4560 {
4561 struct ccl_program ccl;
4562 int multibytep = coding->dst_multibyte;
4563 int *charbuf = coding->charbuf;
4564 int *charbuf_end = charbuf + coding->charbuf_used;
4565 unsigned char *dst = coding->destination + coding->produced;
4566 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4567 int destination_charbuf[1024];
4568 int i, produced_chars = 0;
4569 Lisp_Object attrs, charset_list;
4570
4571 CODING_GET_INFO (coding, attrs, charset_list);
4572 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4573
4574 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4575 ccl.dst_multibyte = coding->dst_multibyte;
4576
4577 while (charbuf < charbuf_end)
4578 {
4579 ccl_driver (&ccl, charbuf, destination_charbuf,
4580 charbuf_end - charbuf, 1024, charset_list);
4581 if (multibytep)
4582 {
4583 ASSURE_DESTINATION (ccl.produced * 2);
4584 for (i = 0; i < ccl.produced; i++)
4585 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4586 }
4587 else
4588 {
4589 ASSURE_DESTINATION (ccl.produced);
4590 for (i = 0; i < ccl.produced; i++)
4591 *dst++ = destination_charbuf[i] & 0xFF;
4592 produced_chars += ccl.produced;
4593 }
4594 charbuf += ccl.consumed;
4595 if (ccl.status == CCL_STAT_QUIT
4596 || ccl.status == CCL_STAT_INVALID_CMD)
4597 break;
4598 }
4599
4600 switch (ccl.status)
4601 {
4602 case CCL_STAT_SUSPEND_BY_SRC:
4603 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4604 break;
4605 case CCL_STAT_SUSPEND_BY_DST:
4606 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4607 break;
4608 case CCL_STAT_QUIT:
4609 case CCL_STAT_INVALID_CMD:
4610 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4611 break;
4612 default:
4613 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4614 break;
4615 }
4616
4617 coding->produced_char += produced_chars;
4618 coding->produced = dst - coding->destination;
4619 return 0;
4620 }
4621
4622
4623 \f
4624 /*** 10, 11. no-conversion handlers ***/
4625
4626 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4627
4628 static void
4629 decode_coding_raw_text (coding)
4630 struct coding_system *coding;
4631 {
4632 coding->chars_at_source = 1;
4633 coding->consumed_char = 0;
4634 coding->consumed = 0;
4635 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4636 }
4637
4638 static int
4639 encode_coding_raw_text (coding)
4640 struct coding_system *coding;
4641 {
4642 int multibytep = coding->dst_multibyte;
4643 int *charbuf = coding->charbuf;
4644 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4645 unsigned char *dst = coding->destination + coding->produced;
4646 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4647 int produced_chars = 0;
4648 int c;
4649
4650 if (multibytep)
4651 {
4652 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4653
4654 if (coding->src_multibyte)
4655 while (charbuf < charbuf_end)
4656 {
4657 ASSURE_DESTINATION (safe_room);
4658 c = *charbuf++;
4659 if (ASCII_CHAR_P (c))
4660 EMIT_ONE_ASCII_BYTE (c);
4661 else if (CHAR_BYTE8_P (c))
4662 {
4663 c = CHAR_TO_BYTE8 (c);
4664 EMIT_ONE_BYTE (c);
4665 }
4666 else
4667 {
4668 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4669
4670 CHAR_STRING_ADVANCE (c, p1);
4671 while (p0 < p1)
4672 {
4673 EMIT_ONE_BYTE (*p0);
4674 p0++;
4675 }
4676 }
4677 }
4678 else
4679 while (charbuf < charbuf_end)
4680 {
4681 ASSURE_DESTINATION (safe_room);
4682 c = *charbuf++;
4683 EMIT_ONE_BYTE (c);
4684 }
4685 }
4686 else
4687 {
4688 if (coding->src_multibyte)
4689 {
4690 int safe_room = MAX_MULTIBYTE_LENGTH;
4691
4692 while (charbuf < charbuf_end)
4693 {
4694 ASSURE_DESTINATION (safe_room);
4695 c = *charbuf++;
4696 if (ASCII_CHAR_P (c))
4697 *dst++ = c;
4698 else if (CHAR_BYTE8_P (c))
4699 *dst++ = CHAR_TO_BYTE8 (c);
4700 else
4701 CHAR_STRING_ADVANCE (c, dst);
4702 produced_chars++;
4703 }
4704 }
4705 else
4706 {
4707 ASSURE_DESTINATION (charbuf_end - charbuf);
4708 while (charbuf < charbuf_end && dst < dst_end)
4709 *dst++ = *charbuf++;
4710 produced_chars = dst - (coding->destination + coding->dst_bytes);
4711 }
4712 }
4713 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4714 coding->produced_char += produced_chars;
4715 coding->produced = dst - coding->destination;
4716 return 0;
4717 }
4718
4719 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4720 Check if a text is encoded in a charset-based coding system. If it
4721 is, return 1, else return 0. */
4722
4723 static int
4724 detect_coding_charset (coding, detect_info)
4725 struct coding_system *coding;
4726 struct coding_detection_info *detect_info;
4727 {
4728 const unsigned char *src = coding->source, *src_base;
4729 const unsigned char *src_end = coding->source + coding->src_bytes;
4730 int multibytep = coding->src_multibyte;
4731 int consumed_chars = 0;
4732 Lisp_Object attrs, valids;
4733 int found = 0;
4734
4735 detect_info->checked |= CATEGORY_MASK_CHARSET;
4736
4737 coding = &coding_categories[coding_category_charset];
4738 attrs = CODING_ID_ATTRS (coding->id);
4739 valids = AREF (attrs, coding_attr_charset_valids);
4740
4741 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4742 src += coding->head_ascii;
4743
4744 while (1)
4745 {
4746 int c;
4747
4748 src_base = src;
4749 ONE_MORE_BYTE (c);
4750 if (c < 0)
4751 continue;
4752 if (NILP (AREF (valids, c)))
4753 break;
4754 if (c >= 0x80)
4755 found = CATEGORY_MASK_CHARSET;
4756 }
4757 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4758 return 0;
4759
4760 no_more_source:
4761 detect_info->found |= found;
4762 return 1;
4763 }
4764
4765 static void
4766 decode_coding_charset (coding)
4767 struct coding_system *coding;
4768 {
4769 const unsigned char *src = coding->source + coding->consumed;
4770 const unsigned char *src_end = coding->source + coding->src_bytes;
4771 const unsigned char *src_base;
4772 int *charbuf = coding->charbuf + coding->charbuf_used;
4773 int *charbuf_end
4774 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4775 int consumed_chars = 0, consumed_chars_base;
4776 int multibytep = coding->src_multibyte;
4777 Lisp_Object attrs, charset_list, valids;
4778 int char_offset = coding->produced_char;
4779 int last_offset = char_offset;
4780 int last_id = charset_ascii;
4781
4782 CODING_GET_INFO (coding, attrs, charset_list);
4783 valids = AREF (attrs, coding_attr_charset_valids);
4784
4785 while (1)
4786 {
4787 int c;
4788 Lisp_Object val;
4789 struct charset *charset;
4790 int dim;
4791 int len = 1;
4792 unsigned code;
4793
4794 src_base = src;
4795 consumed_chars_base = consumed_chars;
4796
4797 if (charbuf >= charbuf_end)
4798 break;
4799
4800 ONE_MORE_BYTE (c);
4801 if (c < 0)
4802 goto invalid_code;
4803 code = c;
4804
4805 val = AREF (valids, c);
4806 if (NILP (val))
4807 goto invalid_code;
4808 if (INTEGERP (val))
4809 {
4810 charset = CHARSET_FROM_ID (XFASTINT (val));
4811 dim = CHARSET_DIMENSION (charset);
4812 while (len < dim)
4813 {
4814 ONE_MORE_BYTE (c);
4815 code = (code << 8) | c;
4816 len++;
4817 }
4818 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4819 charset, code, c);
4820 }
4821 else
4822 {
4823 /* VAL is a list of charset IDs. It is assured that the
4824 list is sorted by charset dimensions (smaller one
4825 comes first). */
4826 while (CONSP (val))
4827 {
4828 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4829 dim = CHARSET_DIMENSION (charset);
4830 while (len < dim)
4831 {
4832 ONE_MORE_BYTE (c);
4833 code = (code << 8) | c;
4834 len++;
4835 }
4836 CODING_DECODE_CHAR (coding, src, src_base,
4837 src_end, charset, code, c);
4838 if (c >= 0)
4839 break;
4840 val = XCDR (val);
4841 }
4842 }
4843 if (c < 0)
4844 goto invalid_code;
4845 if (charset->id != charset_ascii
4846 && last_id != charset->id)
4847 {
4848 if (last_id != charset_ascii)
4849 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4850 last_id = charset->id;
4851 last_offset = char_offset;
4852 }
4853
4854 *charbuf++ = c;
4855 char_offset++;
4856 continue;
4857
4858 invalid_code:
4859 src = src_base;
4860 consumed_chars = consumed_chars_base;
4861 ONE_MORE_BYTE (c);
4862 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4863 char_offset++;
4864 coding->errors++;
4865 }
4866
4867 no_more_source:
4868 if (last_id != charset_ascii)
4869 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4870 coding->consumed_char += consumed_chars_base;
4871 coding->consumed = src_base - coding->source;
4872 coding->charbuf_used = charbuf - coding->charbuf;
4873 }
4874
4875 static int
4876 encode_coding_charset (coding)
4877 struct coding_system *coding;
4878 {
4879 int multibytep = coding->dst_multibyte;
4880 int *charbuf = coding->charbuf;
4881 int *charbuf_end = charbuf + coding->charbuf_used;
4882 unsigned char *dst = coding->destination + coding->produced;
4883 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4884 int safe_room = MAX_MULTIBYTE_LENGTH;
4885 int produced_chars = 0;
4886 Lisp_Object attrs, charset_list;
4887 int ascii_compatible;
4888 int c;
4889
4890 CODING_GET_INFO (coding, attrs, charset_list);
4891 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4892
4893 while (charbuf < charbuf_end)
4894 {
4895 struct charset *charset;
4896 unsigned code;
4897
4898 ASSURE_DESTINATION (safe_room);
4899 c = *charbuf++;
4900 if (ascii_compatible && ASCII_CHAR_P (c))
4901 EMIT_ONE_ASCII_BYTE (c);
4902 else if (CHAR_BYTE8_P (c))
4903 {
4904 c = CHAR_TO_BYTE8 (c);
4905 EMIT_ONE_BYTE (c);
4906 }
4907 else
4908 {
4909 charset = char_charset (c, charset_list, &code);
4910 if (charset)
4911 {
4912 if (CHARSET_DIMENSION (charset) == 1)
4913 EMIT_ONE_BYTE (code);
4914 else if (CHARSET_DIMENSION (charset) == 2)
4915 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4916 else if (CHARSET_DIMENSION (charset) == 3)
4917 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4918 else
4919 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4920 (code >> 8) & 0xFF, code & 0xFF);
4921 }
4922 else
4923 {
4924 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4925 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4926 else
4927 c = coding->default_char;
4928 EMIT_ONE_BYTE (c);
4929 }
4930 }
4931 }
4932
4933 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4934 coding->produced_char += produced_chars;
4935 coding->produced = dst - coding->destination;
4936 return 0;
4937 }
4938
4939 \f
4940 /*** 7. C library functions ***/
4941
4942 /* Setup coding context CODING from information about CODING_SYSTEM.
4943 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4944 CODING_SYSTEM is invalid, signal an error. */
4945
4946 void
4947 setup_coding_system (coding_system, coding)
4948 Lisp_Object coding_system;
4949 struct coding_system *coding;
4950 {
4951 Lisp_Object attrs;
4952 Lisp_Object eol_type;
4953 Lisp_Object coding_type;
4954 Lisp_Object val;
4955
4956 if (NILP (coding_system))
4957 coding_system = Qundecided;
4958
4959 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4960
4961 attrs = CODING_ID_ATTRS (coding->id);
4962 eol_type = CODING_ID_EOL_TYPE (coding->id);
4963
4964 coding->mode = 0;
4965 coding->head_ascii = -1;
4966 coding->common_flags
4967 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4968 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4969 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4970 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4971 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4972 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4973 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4974
4975 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4976 coding->max_charset_id = SCHARS (val) - 1;
4977 coding->safe_charsets = (char *) SDATA (val);
4978 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4979
4980 coding_type = CODING_ATTR_TYPE (attrs);
4981 if (EQ (coding_type, Qundecided))
4982 {
4983 coding->detector = NULL;
4984 coding->decoder = decode_coding_raw_text;
4985 coding->encoder = encode_coding_raw_text;
4986 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4987 }
4988 else if (EQ (coding_type, Qiso_2022))
4989 {
4990 int i;
4991 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4992
4993 /* Invoke graphic register 0 to plane 0. */
4994 CODING_ISO_INVOCATION (coding, 0) = 0;
4995 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4996 CODING_ISO_INVOCATION (coding, 1)
4997 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4998 /* Setup the initial status of designation. */
4999 for (i = 0; i < 4; i++)
5000 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5001 /* Not single shifting initially. */
5002 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5003 /* Beginning of buffer should also be regarded as bol. */
5004 CODING_ISO_BOL (coding) = 1;
5005 coding->detector = detect_coding_iso_2022;
5006 coding->decoder = decode_coding_iso_2022;
5007 coding->encoder = encode_coding_iso_2022;
5008 if (flags & CODING_ISO_FLAG_SAFE)
5009 coding->mode |= CODING_MODE_SAFE_ENCODING;
5010 coding->common_flags
5011 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5012 | CODING_REQUIRE_FLUSHING_MASK);
5013 if (flags & CODING_ISO_FLAG_COMPOSITION)
5014 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5015 if (flags & CODING_ISO_FLAG_DESIGNATION)
5016 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5017 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5018 {
5019 setup_iso_safe_charsets (attrs);
5020 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5021 coding->max_charset_id = SCHARS (val) - 1;
5022 coding->safe_charsets = (char *) SDATA (val);
5023 }
5024 CODING_ISO_FLAGS (coding) = flags;
5025 }
5026 else if (EQ (coding_type, Qcharset))
5027 {
5028 coding->detector = detect_coding_charset;
5029 coding->decoder = decode_coding_charset;
5030 coding->encoder = encode_coding_charset;
5031 coding->common_flags
5032 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5033 }
5034 else if (EQ (coding_type, Qutf_8))
5035 {
5036 coding->detector = detect_coding_utf_8;
5037 coding->decoder = decode_coding_utf_8;
5038 coding->encoder = encode_coding_utf_8;
5039 coding->common_flags
5040 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5041 }
5042 else if (EQ (coding_type, Qutf_16))
5043 {
5044 val = AREF (attrs, coding_attr_utf_16_bom);
5045 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5046 : EQ (val, Qt) ? utf_16_with_bom
5047 : utf_16_without_bom);
5048 val = AREF (attrs, coding_attr_utf_16_endian);
5049 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5050 : utf_16_little_endian);
5051 CODING_UTF_16_SURROGATE (coding) = 0;
5052 coding->detector = detect_coding_utf_16;
5053 coding->decoder = decode_coding_utf_16;
5054 coding->encoder = encode_coding_utf_16;
5055 coding->common_flags
5056 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5057 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5058 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5059 }
5060 else if (EQ (coding_type, Qccl))
5061 {
5062 coding->detector = detect_coding_ccl;
5063 coding->decoder = decode_coding_ccl;
5064 coding->encoder = encode_coding_ccl;
5065 coding->common_flags
5066 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5067 | CODING_REQUIRE_FLUSHING_MASK);
5068 }
5069 else if (EQ (coding_type, Qemacs_mule))
5070 {
5071 coding->detector = detect_coding_emacs_mule;
5072 coding->decoder = decode_coding_emacs_mule;
5073 coding->encoder = encode_coding_emacs_mule;
5074 coding->common_flags
5075 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5076 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5077 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5078 {
5079 Lisp_Object tail, safe_charsets;
5080 int max_charset_id = 0;
5081
5082 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5083 tail = XCDR (tail))
5084 if (max_charset_id < XFASTINT (XCAR (tail)))
5085 max_charset_id = XFASTINT (XCAR (tail));
5086 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5087 make_number (255));
5088 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5089 tail = XCDR (tail))
5090 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5091 coding->max_charset_id = max_charset_id;
5092 coding->safe_charsets = (char *) SDATA (safe_charsets);
5093 }
5094 }
5095 else if (EQ (coding_type, Qshift_jis))
5096 {
5097 coding->detector = detect_coding_sjis;
5098 coding->decoder = decode_coding_sjis;
5099 coding->encoder = encode_coding_sjis;
5100 coding->common_flags
5101 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5102 }
5103 else if (EQ (coding_type, Qbig5))
5104 {
5105 coding->detector = detect_coding_big5;
5106 coding->decoder = decode_coding_big5;
5107 coding->encoder = encode_coding_big5;
5108 coding->common_flags
5109 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5110 }
5111 else /* EQ (coding_type, Qraw_text) */
5112 {
5113 coding->detector = NULL;
5114 coding->decoder = decode_coding_raw_text;
5115 coding->encoder = encode_coding_raw_text;
5116 if (! EQ (eol_type, Qunix))
5117 {
5118 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5119 if (! VECTORP (eol_type))
5120 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5121 }
5122
5123 }
5124
5125 return;
5126 }
5127
5128 /* Return a list of charsets supported by CODING. */
5129
5130 Lisp_Object
5131 coding_charset_list (coding)
5132 struct coding_system *coding;
5133 {
5134 Lisp_Object attrs, charset_list;
5135
5136 CODING_GET_INFO (coding, attrs, charset_list);
5137 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5138 {
5139 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5140
5141 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5142 charset_list = Viso_2022_charset_list;
5143 }
5144 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5145 {
5146 charset_list = Vemacs_mule_charset_list;
5147 }
5148 return charset_list;
5149 }
5150
5151
5152 /* Return raw-text or one of its subsidiaries that has the same
5153 eol_type as CODING-SYSTEM. */
5154
5155 Lisp_Object
5156 raw_text_coding_system (coding_system)
5157 Lisp_Object coding_system;
5158 {
5159 Lisp_Object spec, attrs;
5160 Lisp_Object eol_type, raw_text_eol_type;
5161
5162 if (NILP (coding_system))
5163 return Qraw_text;
5164 spec = CODING_SYSTEM_SPEC (coding_system);
5165 attrs = AREF (spec, 0);
5166
5167 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5168 return coding_system;
5169
5170 eol_type = AREF (spec, 2);
5171 if (VECTORP (eol_type))
5172 return Qraw_text;
5173 spec = CODING_SYSTEM_SPEC (Qraw_text);
5174 raw_text_eol_type = AREF (spec, 2);
5175 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5176 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5177 : AREF (raw_text_eol_type, 2));
5178 }
5179
5180
5181 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5182 does, return one of the subsidiary that has the same eol-spec as
5183 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5184 inherit end-of-line format from the system's setting
5185 (system_eol_type). */
5186
5187 Lisp_Object
5188 coding_inherit_eol_type (coding_system, parent)
5189 Lisp_Object coding_system, parent;
5190 {
5191 Lisp_Object spec, eol_type;
5192
5193 if (NILP (coding_system))
5194 coding_system = Qraw_text;
5195 spec = CODING_SYSTEM_SPEC (coding_system);
5196 eol_type = AREF (spec, 2);
5197 if (VECTORP (eol_type))
5198 {
5199 Lisp_Object parent_eol_type;
5200
5201 if (! NILP (parent))
5202 {
5203 Lisp_Object parent_spec;
5204
5205 parent_spec
5206 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5207 parent_eol_type = AREF (parent_spec, 2);
5208 }
5209 else
5210 parent_eol_type = system_eol_type;
5211 if (EQ (parent_eol_type, Qunix))
5212 coding_system = AREF (eol_type, 0);
5213 else if (EQ (parent_eol_type, Qdos))
5214 coding_system = AREF (eol_type, 1);
5215 else if (EQ (parent_eol_type, Qmac))
5216 coding_system = AREF (eol_type, 2);
5217 }
5218 return coding_system;
5219 }
5220
5221 /* Emacs has a mechanism to automatically detect a coding system if it
5222 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5223 it's impossible to distinguish some coding systems accurately
5224 because they use the same range of codes. So, at first, coding
5225 systems are categorized into 7, those are:
5226
5227 o coding-category-emacs-mule
5228
5229 The category for a coding system which has the same code range
5230 as Emacs' internal format. Assigned the coding-system (Lisp
5231 symbol) `emacs-mule' by default.
5232
5233 o coding-category-sjis
5234
5235 The category for a coding system which has the same code range
5236 as SJIS. Assigned the coding-system (Lisp
5237 symbol) `japanese-shift-jis' by default.
5238
5239 o coding-category-iso-7
5240
5241 The category for a coding system which has the same code range
5242 as ISO2022 of 7-bit environment. This doesn't use any locking
5243 shift and single shift functions. This can encode/decode all
5244 charsets. Assigned the coding-system (Lisp symbol)
5245 `iso-2022-7bit' by default.
5246
5247 o coding-category-iso-7-tight
5248
5249 Same as coding-category-iso-7 except that this can
5250 encode/decode only the specified charsets.
5251
5252 o coding-category-iso-8-1
5253
5254 The category for a coding system which has the same code range
5255 as ISO2022 of 8-bit environment and graphic plane 1 used only
5256 for DIMENSION1 charset. This doesn't use any locking shift
5257 and single shift functions. Assigned the coding-system (Lisp
5258 symbol) `iso-latin-1' by default.
5259
5260 o coding-category-iso-8-2
5261
5262 The category for a coding system which has the same code range
5263 as ISO2022 of 8-bit environment and graphic plane 1 used only
5264 for DIMENSION2 charset. This doesn't use any locking shift
5265 and single shift functions. Assigned the coding-system (Lisp
5266 symbol) `japanese-iso-8bit' by default.
5267
5268 o coding-category-iso-7-else
5269
5270 The category for a coding system which has the same code range
5271 as ISO2022 of 7-bit environemnt but uses locking shift or
5272 single shift functions. Assigned the coding-system (Lisp
5273 symbol) `iso-2022-7bit-lock' by default.
5274
5275 o coding-category-iso-8-else
5276
5277 The category for a coding system which has the same code range
5278 as ISO2022 of 8-bit environemnt but uses locking shift or
5279 single shift functions. Assigned the coding-system (Lisp
5280 symbol) `iso-2022-8bit-ss2' by default.
5281
5282 o coding-category-big5
5283
5284 The category for a coding system which has the same code range
5285 as BIG5. Assigned the coding-system (Lisp symbol)
5286 `cn-big5' by default.
5287
5288 o coding-category-utf-8
5289
5290 The category for a coding system which has the same code range
5291 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
5292 symbol) `utf-8' by default.
5293
5294 o coding-category-utf-16-be
5295
5296 The category for a coding system in which a text has an
5297 Unicode signature (cf. Unicode Standard) in the order of BIG
5298 endian at the head. Assigned the coding-system (Lisp symbol)
5299 `utf-16-be' by default.
5300
5301 o coding-category-utf-16-le
5302
5303 The category for a coding system in which a text has an
5304 Unicode signature (cf. Unicode Standard) in the order of
5305 LITTLE endian at the head. Assigned the coding-system (Lisp
5306 symbol) `utf-16-le' by default.
5307
5308 o coding-category-ccl
5309
5310 The category for a coding system of which encoder/decoder is
5311 written in CCL programs. The default value is nil, i.e., no
5312 coding system is assigned.
5313
5314 o coding-category-binary
5315
5316 The category for a coding system not categorized in any of the
5317 above. Assigned the coding-system (Lisp symbol)
5318 `no-conversion' by default.
5319
5320 Each of them is a Lisp symbol and the value is an actual
5321 `coding-system's (this is also a Lisp symbol) assigned by a user.
5322 What Emacs does actually is to detect a category of coding system.
5323 Then, it uses a `coding-system' assigned to it. If Emacs can't
5324 decide only one possible category, it selects a category of the
5325 highest priority. Priorities of categories are also specified by a
5326 user in a Lisp variable `coding-category-list'.
5327
5328 */
5329
5330 #define EOL_SEEN_NONE 0
5331 #define EOL_SEEN_LF 1
5332 #define EOL_SEEN_CR 2
5333 #define EOL_SEEN_CRLF 4
5334
5335 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5336 SOURCE is encoded. If CATEGORY is one of
5337 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5338 two-byte, else they are encoded by one-byte.
5339
5340 Return one of EOL_SEEN_XXX. */
5341
5342 #define MAX_EOL_CHECK_COUNT 3
5343
5344 static int
5345 detect_eol (source, src_bytes, category)
5346 const unsigned char *source;
5347 EMACS_INT src_bytes;
5348 enum coding_category category;
5349 {
5350 const unsigned char *src = source, *src_end = src + src_bytes;
5351 unsigned char c;
5352 int total = 0;
5353 int eol_seen = EOL_SEEN_NONE;
5354
5355 if ((1 << category) & CATEGORY_MASK_UTF_16)
5356 {
5357 int msb, lsb;
5358
5359 msb = category == (coding_category_utf_16_le
5360 | coding_category_utf_16_le_nosig);
5361 lsb = 1 - msb;
5362
5363 while (src + 1 < src_end)
5364 {
5365 c = src[lsb];
5366 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5367 {
5368 int this_eol;
5369
5370 if (c == '\n')
5371 this_eol = EOL_SEEN_LF;
5372 else if (src + 3 >= src_end
5373 || src[msb + 2] != 0
5374 || src[lsb + 2] != '\n')
5375 this_eol = EOL_SEEN_CR;
5376 else
5377 this_eol = EOL_SEEN_CRLF;
5378
5379 if (eol_seen == EOL_SEEN_NONE)
5380 /* This is the first end-of-line. */
5381 eol_seen = this_eol;
5382 else if (eol_seen != this_eol)
5383 {
5384 /* The found type is different from what found before. */
5385 eol_seen = EOL_SEEN_LF;
5386 break;
5387 }
5388 if (++total == MAX_EOL_CHECK_COUNT)
5389 break;
5390 }
5391 src += 2;
5392 }
5393 }
5394 else
5395 {
5396 while (src < src_end)
5397 {
5398 c = *src++;
5399 if (c == '\n' || c == '\r')
5400 {
5401 int this_eol;
5402
5403 if (c == '\n')
5404 this_eol = EOL_SEEN_LF;
5405 else if (src >= src_end || *src != '\n')
5406 this_eol = EOL_SEEN_CR;
5407 else
5408 this_eol = EOL_SEEN_CRLF, src++;
5409
5410 if (eol_seen == EOL_SEEN_NONE)
5411 /* This is the first end-of-line. */
5412 eol_seen = this_eol;
5413 else if (eol_seen != this_eol)
5414 {
5415 /* The found type is different from what found before. */
5416 eol_seen = EOL_SEEN_LF;
5417 break;
5418 }
5419 if (++total == MAX_EOL_CHECK_COUNT)
5420 break;
5421 }
5422 }
5423 }
5424 return eol_seen;
5425 }
5426
5427
5428 static Lisp_Object
5429 adjust_coding_eol_type (coding, eol_seen)
5430 struct coding_system *coding;
5431 int eol_seen;
5432 {
5433 Lisp_Object eol_type;
5434
5435 eol_type = CODING_ID_EOL_TYPE (coding->id);
5436 if (eol_seen & EOL_SEEN_LF)
5437 {
5438 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5439 eol_type = Qunix;
5440 }
5441 else if (eol_seen & EOL_SEEN_CRLF)
5442 {
5443 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5444 eol_type = Qdos;
5445 }
5446 else if (eol_seen & EOL_SEEN_CR)
5447 {
5448 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5449 eol_type = Qmac;
5450 }
5451 return eol_type;
5452 }
5453
5454 /* Detect how a text specified in CODING is encoded. If a coding
5455 system is detected, update fields of CODING by the detected coding
5456 system. */
5457
5458 void
5459 detect_coding (coding)
5460 struct coding_system *coding;
5461 {
5462 const unsigned char *src, *src_end;
5463
5464 coding->consumed = coding->consumed_char = 0;
5465 coding->produced = coding->produced_char = 0;
5466 coding_set_source (coding);
5467
5468 src_end = coding->source + coding->src_bytes;
5469
5470 /* If we have not yet decided the text encoding type, detect it
5471 now. */
5472 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5473 {
5474 int c, i;
5475 struct coding_detection_info detect_info;
5476
5477 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5478 for (i = 0, src = coding->source; src < src_end; i++, src++)
5479 {
5480 c = *src;
5481 if (c & 0x80)
5482 break;
5483 if (c < 0x20
5484 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5485 && ! inhibit_iso_escape_detection
5486 && ! detect_info.checked)
5487 {
5488 coding->head_ascii = src - (coding->source + coding->consumed);
5489 if (detect_coding_iso_2022 (coding, &detect_info))
5490 {
5491 /* We have scanned the whole data. */
5492 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5493 /* We didn't find an 8-bit code. */
5494 src = src_end;
5495 break;
5496 }
5497 }
5498 }
5499 coding->head_ascii = src - (coding->source + coding->consumed);
5500
5501 if (coding->head_ascii < coding->src_bytes
5502 || detect_info.found)
5503 {
5504 enum coding_category category;
5505 struct coding_system *this;
5506
5507 if (coding->head_ascii == coding->src_bytes)
5508 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5509 for (i = 0; i < coding_category_raw_text; i++)
5510 {
5511 category = coding_priorities[i];
5512 this = coding_categories + category;
5513 if (detect_info.found & (1 << category))
5514 break;
5515 }
5516 else
5517 for (i = 0; i < coding_category_raw_text; i++)
5518 {
5519 category = coding_priorities[i];
5520 this = coding_categories + category;
5521 if (this->id < 0)
5522 {
5523 /* No coding system of this category is defined. */
5524 detect_info.rejected |= (1 << category);
5525 }
5526 else if (category >= coding_category_raw_text)
5527 continue;
5528 else if (detect_info.checked & (1 << category))
5529 {
5530 if (detect_info.found & (1 << category))
5531 break;
5532 }
5533 else if ((*(this->detector)) (coding, &detect_info)
5534 && detect_info.found & (1 << category))
5535 {
5536 if (category == coding_category_utf_16_auto)
5537 {
5538 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5539 category = coding_category_utf_16_le;
5540 else
5541 category = coding_category_utf_16_be;
5542 }
5543 break;
5544 }
5545 }
5546
5547 if (i < coding_category_raw_text)
5548 setup_coding_system (CODING_ID_NAME (this->id), coding);
5549 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5550 setup_coding_system (Qraw_text, coding);
5551 else if (detect_info.rejected)
5552 for (i = 0; i < coding_category_raw_text; i++)
5553 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5554 {
5555 this = coding_categories + coding_priorities[i];
5556 setup_coding_system (CODING_ID_NAME (this->id), coding);
5557 break;
5558 }
5559 }
5560 }
5561 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5562 == coding_category_utf_16_auto)
5563 {
5564 Lisp_Object coding_systems;
5565 struct coding_detection_info detect_info;
5566
5567 coding_systems
5568 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5569 detect_info.found = detect_info.rejected = 0;
5570 if (CONSP (coding_systems)
5571 && detect_coding_utf_16 (coding, &detect_info))
5572 {
5573 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5574 setup_coding_system (XCAR (coding_systems), coding);
5575 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5576 setup_coding_system (XCDR (coding_systems), coding);
5577 }
5578 }
5579 }
5580
5581
5582 static void
5583 decode_eol (coding)
5584 struct coding_system *coding;
5585 {
5586 Lisp_Object eol_type;
5587 unsigned char *p, *pbeg, *pend;
5588
5589 eol_type = CODING_ID_EOL_TYPE (coding->id);
5590 if (EQ (eol_type, Qunix))
5591 return;
5592
5593 if (NILP (coding->dst_object))
5594 pbeg = coding->destination;
5595 else
5596 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5597 pend = pbeg + coding->produced;
5598
5599 if (VECTORP (eol_type))
5600 {
5601 int eol_seen = EOL_SEEN_NONE;
5602
5603 for (p = pbeg; p < pend; p++)
5604 {
5605 if (*p == '\n')
5606 eol_seen |= EOL_SEEN_LF;
5607 else if (*p == '\r')
5608 {
5609 if (p + 1 < pend && *(p + 1) == '\n')
5610 {
5611 eol_seen |= EOL_SEEN_CRLF;
5612 p++;
5613 }
5614 else
5615 eol_seen |= EOL_SEEN_CR;
5616 }
5617 }
5618 if (eol_seen != EOL_SEEN_NONE
5619 && eol_seen != EOL_SEEN_LF
5620 && eol_seen != EOL_SEEN_CRLF
5621 && eol_seen != EOL_SEEN_CR)
5622 eol_seen = EOL_SEEN_LF;
5623 if (eol_seen != EOL_SEEN_NONE)
5624 eol_type = adjust_coding_eol_type (coding, eol_seen);
5625 }
5626
5627 if (EQ (eol_type, Qmac))
5628 {
5629 for (p = pbeg; p < pend; p++)
5630 if (*p == '\r')
5631 *p = '\n';
5632 }
5633 else if (EQ (eol_type, Qdos))
5634 {
5635 int n = 0;
5636
5637 if (NILP (coding->dst_object))
5638 {
5639 /* Start deleting '\r' from the tail to minimize the memory
5640 movement. */
5641 for (p = pend - 2; p >= pbeg; p--)
5642 if (*p == '\r')
5643 {
5644 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5645 n++;
5646 }
5647 }
5648 else
5649 {
5650 int pos_byte = coding->dst_pos_byte;
5651 int pos = coding->dst_pos;
5652 int pos_end = pos + coding->produced_char - 1;
5653
5654 while (pos < pos_end)
5655 {
5656 p = BYTE_POS_ADDR (pos_byte);
5657 if (*p == '\r' && p[1] == '\n')
5658 {
5659 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5660 n++;
5661 pos_end--;
5662 }
5663 pos++;
5664 pos_byte += BYTES_BY_CHAR_HEAD (*p);
5665 }
5666 }
5667 coding->produced -= n;
5668 coding->produced_char -= n;
5669 }
5670 }
5671
5672
5673 /* Return a translation table (or list of them) from coding system
5674 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5675 decoding (ENCODEP is zero). */
5676
5677 static Lisp_Object
5678 get_translation_table (attrs, encodep, max_lookup)
5679 Lisp_Object attrs;
5680 int encodep, *max_lookup;
5681 {
5682 Lisp_Object standard, translation_table;
5683 Lisp_Object val;
5684
5685 if (encodep)
5686 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5687 standard = Vstandard_translation_table_for_encode;
5688 else
5689 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5690 standard = Vstandard_translation_table_for_decode;
5691 if (NILP (translation_table))
5692 translation_table = standard;
5693 else
5694 {
5695 if (SYMBOLP (translation_table))
5696 translation_table = Fget (translation_table, Qtranslation_table);
5697 else if (CONSP (translation_table))
5698 {
5699 translation_table = Fcopy_sequence (translation_table);
5700 for (val = translation_table; CONSP (val); val = XCDR (val))
5701 if (SYMBOLP (XCAR (val)))
5702 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5703 }
5704 if (CHAR_TABLE_P (standard))
5705 {
5706 if (CONSP (translation_table))
5707 translation_table = nconc2 (translation_table,
5708 Fcons (standard, Qnil));
5709 else
5710 translation_table = Fcons (translation_table,
5711 Fcons (standard, Qnil));
5712 }
5713 }
5714
5715 if (max_lookup)
5716 {
5717 *max_lookup = 1;
5718 if (CHAR_TABLE_P (translation_table)
5719 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5720 {
5721 val = XCHAR_TABLE (translation_table)->extras[1];
5722 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5723 *max_lookup = XFASTINT (val);
5724 }
5725 else if (CONSP (translation_table))
5726 {
5727 Lisp_Object tail, val;
5728
5729 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5730 if (CHAR_TABLE_P (XCAR (tail))
5731 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5732 {
5733 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5734 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5735 *max_lookup = XFASTINT (val);
5736 }
5737 }
5738 }
5739 return translation_table;
5740 }
5741
5742 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
5743 do { \
5744 trans = Qnil; \
5745 if (CHAR_TABLE_P (table)) \
5746 { \
5747 trans = CHAR_TABLE_REF (table, c); \
5748 if (CHARACTERP (trans)) \
5749 c = XFASTINT (trans), trans = Qnil; \
5750 } \
5751 else if (CONSP (table)) \
5752 { \
5753 Lisp_Object tail; \
5754 \
5755 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
5756 if (CHAR_TABLE_P (XCAR (tail))) \
5757 { \
5758 trans = CHAR_TABLE_REF (XCAR (tail), c); \
5759 if (CHARACTERP (trans)) \
5760 c = XFASTINT (trans), trans = Qnil; \
5761 else if (! NILP (trans)) \
5762 break; \
5763 } \
5764 } \
5765 } while (0)
5766
5767
5768 static Lisp_Object
5769 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5770 Lisp_Object val;
5771 int *buf, *buf_end;
5772 int last_block;
5773 int *from_nchars, *to_nchars;
5774 {
5775 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
5776 [TO-CHAR ...]. */
5777 if (CONSP (val))
5778 {
5779 Lisp_Object from, tail;
5780 int i, len;
5781
5782 for (tail = val; CONSP (tail); tail = XCDR (tail))
5783 {
5784 val = XCAR (tail);
5785 from = XCAR (val);
5786 len = ASIZE (from);
5787 for (i = 0; i < len; i++)
5788 {
5789 if (buf + i == buf_end)
5790 {
5791 if (! last_block)
5792 return Qt;
5793 break;
5794 }
5795 if (XINT (AREF (from, i)) != buf[i])
5796 break;
5797 }
5798 if (i == len)
5799 {
5800 val = XCDR (val);
5801 *from_nchars = len;
5802 break;
5803 }
5804 }
5805 if (! CONSP (tail))
5806 return Qnil;
5807 }
5808 if (VECTORP (val))
5809 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5810 else
5811 *buf = XINT (val);
5812 return val;
5813 }
5814
5815
5816 static int
5817 produce_chars (coding, translation_table, last_block)
5818 struct coding_system *coding;
5819 Lisp_Object translation_table;
5820 int last_block;
5821 {
5822 unsigned char *dst = coding->destination + coding->produced;
5823 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5824 int produced;
5825 int produced_chars = 0;
5826 int carryover = 0;
5827
5828 if (! coding->chars_at_source)
5829 {
5830 /* Characters are in coding->charbuf. */
5831 int *buf = coding->charbuf;
5832 int *buf_end = buf + coding->charbuf_used;
5833
5834 if (BUFFERP (coding->src_object)
5835 && EQ (coding->src_object, coding->dst_object))
5836 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5837
5838 while (buf < buf_end)
5839 {
5840 int c = *buf, i;
5841
5842 if (c >= 0)
5843 {
5844 int from_nchars = 1, to_nchars = 1;
5845 Lisp_Object trans = Qnil;
5846
5847 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5848 if (! NILP (trans))
5849 {
5850 trans = get_translation (trans, buf, buf_end, last_block,
5851 &from_nchars, &to_nchars);
5852 if (EQ (trans, Qt))
5853 break;
5854 c = *buf;
5855 }
5856
5857 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5858 {
5859 dst = alloc_destination (coding,
5860 buf_end - buf
5861 + MAX_MULTIBYTE_LENGTH * to_nchars,
5862 dst);
5863 dst_end = coding->destination + coding->dst_bytes;
5864 }
5865
5866 for (i = 0; i < to_nchars; i++)
5867 {
5868 if (i > 0)
5869 c = XINT (AREF (trans, i));
5870 if (coding->dst_multibyte
5871 || ! CHAR_BYTE8_P (c))
5872 CHAR_STRING_ADVANCE (c, dst);
5873 else
5874 *dst++ = CHAR_TO_BYTE8 (c);
5875 }
5876 produced_chars += to_nchars;
5877 *buf++ = to_nchars;
5878 while (--from_nchars > 0)
5879 *buf++ = 0;
5880 }
5881 else
5882 /* This is an annotation datum. (-C) is the length. */
5883 buf += -c;
5884 }
5885 carryover = buf_end - buf;
5886 }
5887 else
5888 {
5889 const unsigned char *src = coding->source;
5890 const unsigned char *src_end = src + coding->src_bytes;
5891 Lisp_Object eol_type;
5892
5893 eol_type = CODING_ID_EOL_TYPE (coding->id);
5894
5895 if (coding->src_multibyte != coding->dst_multibyte)
5896 {
5897 if (coding->src_multibyte)
5898 {
5899 int multibytep = 1;
5900 int consumed_chars;
5901
5902 while (1)
5903 {
5904 const unsigned char *src_base = src;
5905 int c;
5906
5907 ONE_MORE_BYTE (c);
5908 if (c == '\r')
5909 {
5910 if (EQ (eol_type, Qdos))
5911 {
5912 if (src == src_end)
5913 {
5914 record_conversion_result
5915 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5916 goto no_more_source;
5917 }
5918 if (*src == '\n')
5919 c = *src++;
5920 }
5921 else if (EQ (eol_type, Qmac))
5922 c = '\n';
5923 }
5924 if (dst == dst_end)
5925 {
5926 coding->consumed = src - coding->source;
5927
5928 if (EQ (coding->src_object, coding->dst_object))
5929 dst_end = (unsigned char *) src;
5930 if (dst == dst_end)
5931 {
5932 dst = alloc_destination (coding, src_end - src + 1,
5933 dst);
5934 dst_end = coding->destination + coding->dst_bytes;
5935 coding_set_source (coding);
5936 src = coding->source + coding->consumed;
5937 src_end = coding->source + coding->src_bytes;
5938 }
5939 }
5940 *dst++ = c;
5941 produced_chars++;
5942 }
5943 no_more_source:
5944 ;
5945 }
5946 else
5947 while (src < src_end)
5948 {
5949 int multibytep = 1;
5950 int c = *src++;
5951
5952 if (c == '\r')
5953 {
5954 if (EQ (eol_type, Qdos))
5955 {
5956 if (src < src_end
5957 && *src == '\n')
5958 c = *src++;
5959 }
5960 else if (EQ (eol_type, Qmac))
5961 c = '\n';
5962 }
5963 if (dst >= dst_end - 1)
5964 {
5965 coding->consumed = src - coding->source;
5966
5967 if (EQ (coding->src_object, coding->dst_object))
5968 dst_end = (unsigned char *) src;
5969 if (dst >= dst_end - 1)
5970 {
5971 dst = alloc_destination (coding, src_end - src + 2,
5972 dst);
5973 dst_end = coding->destination + coding->dst_bytes;
5974 coding_set_source (coding);
5975 src = coding->source + coding->consumed;
5976 src_end = coding->source + coding->src_bytes;
5977 }
5978 }
5979 EMIT_ONE_BYTE (c);
5980 }
5981 }
5982 else
5983 {
5984 if (!EQ (coding->src_object, coding->dst_object))
5985 {
5986 int require = coding->src_bytes - coding->dst_bytes;
5987
5988 if (require > 0)
5989 {
5990 EMACS_INT offset = src - coding->source;
5991
5992 dst = alloc_destination (coding, require, dst);
5993 coding_set_source (coding);
5994 src = coding->source + offset;
5995 src_end = coding->source + coding->src_bytes;
5996 }
5997 }
5998 produced_chars = coding->src_chars;
5999 while (src < src_end)
6000 {
6001 int c = *src++;
6002
6003 if (c == '\r')
6004 {
6005 if (EQ (eol_type, Qdos))
6006 {
6007 if (src < src_end
6008 && *src == '\n')
6009 c = *src++;
6010 produced_chars--;
6011 }
6012 else if (EQ (eol_type, Qmac))
6013 c = '\n';
6014 }
6015 *dst++ = c;
6016 }
6017 }
6018 coding->consumed = coding->src_bytes;
6019 coding->consumed_char = coding->src_chars;
6020 }
6021
6022 produced = dst - (coding->destination + coding->produced);
6023 if (BUFFERP (coding->dst_object))
6024 insert_from_gap (produced_chars, produced);
6025 coding->produced += produced;
6026 coding->produced_char += produced_chars;
6027 return carryover;
6028 }
6029
6030 /* Compose text in CODING->object according to the annotation data at
6031 CHARBUF. CHARBUF is an array:
6032 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6033 */
6034
6035 static INLINE void
6036 produce_composition (coding, charbuf, pos)
6037 struct coding_system *coding;
6038 int *charbuf;
6039 EMACS_INT pos;
6040 {
6041 int len;
6042 EMACS_INT to;
6043 enum composition_method method;
6044 Lisp_Object components;
6045
6046 len = -charbuf[0];
6047 to = pos + charbuf[2];
6048 if (to <= pos)
6049 return;
6050 method = (enum composition_method) (charbuf[3]);
6051
6052 if (method == COMPOSITION_RELATIVE)
6053 components = Qnil;
6054 else if (method >= COMPOSITION_WITH_RULE
6055 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6056 {
6057 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6058 int i;
6059
6060 len -= 4;
6061 charbuf += 4;
6062 for (i = 0; i < len; i++)
6063 {
6064 args[i] = make_number (charbuf[i]);
6065 if (charbuf[i] < 0)
6066 return;
6067 }
6068 components = (method == COMPOSITION_WITH_ALTCHARS
6069 ? Fstring (len, args) : Fvector (len, args));
6070 }
6071 else
6072 return;
6073 compose_text (pos, to, components, Qnil, coding->dst_object);
6074 }
6075
6076
6077 /* Put `charset' property on text in CODING->object according to
6078 the annotation data at CHARBUF. CHARBUF is an array:
6079 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6080 */
6081
6082 static INLINE void
6083 produce_charset (coding, charbuf, pos)
6084 struct coding_system *coding;
6085 int *charbuf;
6086 EMACS_INT pos;
6087 {
6088 EMACS_INT from = pos - charbuf[2];
6089 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6090
6091 Fput_text_property (make_number (from), make_number (pos),
6092 Qcharset, CHARSET_NAME (charset),
6093 coding->dst_object);
6094 }
6095
6096
6097 #define CHARBUF_SIZE 0x4000
6098
6099 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6100 do { \
6101 int size = CHARBUF_SIZE;; \
6102 \
6103 coding->charbuf = NULL; \
6104 while (size > 1024) \
6105 { \
6106 coding->charbuf = (int *) alloca (sizeof (int) * size); \
6107 if (coding->charbuf) \
6108 break; \
6109 size >>= 1; \
6110 } \
6111 if (! coding->charbuf) \
6112 { \
6113 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6114 return coding->result; \
6115 } \
6116 coding->charbuf_size = size; \
6117 } while (0)
6118
6119
6120 static void
6121 produce_annotation (coding, pos)
6122 struct coding_system *coding;
6123 EMACS_INT pos;
6124 {
6125 int *charbuf = coding->charbuf;
6126 int *charbuf_end = charbuf + coding->charbuf_used;
6127
6128 if (NILP (coding->dst_object))
6129 return;
6130
6131 while (charbuf < charbuf_end)
6132 {
6133 if (*charbuf >= 0)
6134 pos += *charbuf++;
6135 else
6136 {
6137 int len = -*charbuf;
6138 switch (charbuf[1])
6139 {
6140 case CODING_ANNOTATE_COMPOSITION_MASK:
6141 produce_composition (coding, charbuf, pos);
6142 break;
6143 case CODING_ANNOTATE_CHARSET_MASK:
6144 produce_charset (coding, charbuf, pos);
6145 break;
6146 default:
6147 abort ();
6148 }
6149 charbuf += len;
6150 }
6151 }
6152 }
6153
6154 /* Decode the data at CODING->src_object into CODING->dst_object.
6155 CODING->src_object is a buffer, a string, or nil.
6156 CODING->dst_object is a buffer.
6157
6158 If CODING->src_object is a buffer, it must be the current buffer.
6159 In this case, if CODING->src_pos is positive, it is a position of
6160 the source text in the buffer, otherwise, the source text is in the
6161 gap area of the buffer, and CODING->src_pos specifies the offset of
6162 the text from GPT (which must be the same as PT). If this is the
6163 same buffer as CODING->dst_object, CODING->src_pos must be
6164 negative.
6165
6166 If CODING->src_object is a string, CODING->src_pos is an index to
6167 that string.
6168
6169 If CODING->src_object is nil, CODING->source must already point to
6170 the non-relocatable memory area. In this case, CODING->src_pos is
6171 an offset from CODING->source.
6172
6173 The decoded data is inserted at the current point of the buffer
6174 CODING->dst_object.
6175 */
6176
6177 static int
6178 decode_coding (coding)
6179 struct coding_system *coding;
6180 {
6181 Lisp_Object attrs;
6182 Lisp_Object undo_list;
6183 Lisp_Object translation_table;
6184 int carryover;
6185 int i;
6186
6187 if (BUFFERP (coding->src_object)
6188 && coding->src_pos > 0
6189 && coding->src_pos < GPT
6190 && coding->src_pos + coding->src_chars > GPT)
6191 move_gap_both (coding->src_pos, coding->src_pos_byte);
6192
6193 undo_list = Qt;
6194 if (BUFFERP (coding->dst_object))
6195 {
6196 if (current_buffer != XBUFFER (coding->dst_object))
6197 set_buffer_internal (XBUFFER (coding->dst_object));
6198 if (GPT != PT)
6199 move_gap_both (PT, PT_BYTE);
6200 undo_list = current_buffer->undo_list;
6201 current_buffer->undo_list = Qt;
6202 }
6203
6204 coding->consumed = coding->consumed_char = 0;
6205 coding->produced = coding->produced_char = 0;
6206 coding->chars_at_source = 0;
6207 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6208 coding->errors = 0;
6209
6210 ALLOC_CONVERSION_WORK_AREA (coding);
6211
6212 attrs = CODING_ID_ATTRS (coding->id);
6213 translation_table = get_translation_table (attrs, 0, NULL);
6214
6215 carryover = 0;
6216 do
6217 {
6218 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6219
6220 coding_set_source (coding);
6221 coding->annotated = 0;
6222 coding->charbuf_used = carryover;
6223 (*(coding->decoder)) (coding);
6224 coding_set_destination (coding);
6225 carryover = produce_chars (coding, translation_table, 0);
6226 if (coding->annotated)
6227 produce_annotation (coding, pos);
6228 for (i = 0; i < carryover; i++)
6229 coding->charbuf[i]
6230 = coding->charbuf[coding->charbuf_used - carryover + i];
6231 }
6232 while (coding->consumed < coding->src_bytes
6233 && (coding->result == CODING_RESULT_SUCCESS
6234 || coding->result == CODING_RESULT_INVALID_SRC));
6235
6236 if (carryover > 0)
6237 {
6238 coding_set_destination (coding);
6239 coding->charbuf_used = carryover;
6240 produce_chars (coding, translation_table, 1);
6241 }
6242
6243 coding->carryover_bytes = 0;
6244 if (coding->consumed < coding->src_bytes)
6245 {
6246 int nbytes = coding->src_bytes - coding->consumed;
6247 const unsigned char *src;
6248
6249 coding_set_source (coding);
6250 coding_set_destination (coding);
6251 src = coding->source + coding->consumed;
6252
6253 if (coding->mode & CODING_MODE_LAST_BLOCK)
6254 {
6255 /* Flush out unprocessed data as binary chars. We are sure
6256 that the number of data is less than the size of
6257 coding->charbuf. */
6258 coding->charbuf_used = 0;
6259 while (nbytes-- > 0)
6260 {
6261 int c = *src++;
6262
6263 if (c & 0x80)
6264 c = BYTE8_TO_CHAR (c);
6265 coding->charbuf[coding->charbuf_used++] = c;
6266 }
6267 produce_chars (coding, Qnil, 1);
6268 }
6269 else
6270 {
6271 /* Record unprocessed bytes in coding->carryover. We are
6272 sure that the number of data is less than the size of
6273 coding->carryover. */
6274 unsigned char *p = coding->carryover;
6275
6276 coding->carryover_bytes = nbytes;
6277 while (nbytes-- > 0)
6278 *p++ = *src++;
6279 }
6280 coding->consumed = coding->src_bytes;
6281 }
6282
6283 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6284 decode_eol (coding);
6285 if (BUFFERP (coding->dst_object))
6286 {
6287 current_buffer->undo_list = undo_list;
6288 record_insert (coding->dst_pos, coding->produced_char);
6289 }
6290 return coding->result;
6291 }
6292
6293
6294 /* Extract an annotation datum from a composition starting at POS and
6295 ending before LIMIT of CODING->src_object (buffer or string), store
6296 the data in BUF, set *STOP to a starting position of the next
6297 composition (if any) or to LIMIT, and return the address of the
6298 next element of BUF.
6299
6300 If such an annotation is not found, set *STOP to a starting
6301 position of a composition after POS (if any) or to LIMIT, and
6302 return BUF. */
6303
6304 static INLINE int *
6305 handle_composition_annotation (pos, limit, coding, buf, stop)
6306 EMACS_INT pos, limit;
6307 struct coding_system *coding;
6308 int *buf;
6309 EMACS_INT *stop;
6310 {
6311 EMACS_INT start, end;
6312 Lisp_Object prop;
6313
6314 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6315 || end > limit)
6316 *stop = limit;
6317 else if (start > pos)
6318 *stop = start;
6319 else
6320 {
6321 if (start == pos)
6322 {
6323 /* We found a composition. Store the corresponding
6324 annotation data in BUF. */
6325 int *head = buf;
6326 enum composition_method method = COMPOSITION_METHOD (prop);
6327 int nchars = COMPOSITION_LENGTH (prop);
6328
6329 ADD_COMPOSITION_DATA (buf, nchars, method);
6330 if (method != COMPOSITION_RELATIVE)
6331 {
6332 Lisp_Object components;
6333 int len, i, i_byte;
6334
6335 components = COMPOSITION_COMPONENTS (prop);
6336 if (VECTORP (components))
6337 {
6338 len = XVECTOR (components)->size;
6339 for (i = 0; i < len; i++)
6340 *buf++ = XINT (AREF (components, i));
6341 }
6342 else if (STRINGP (components))
6343 {
6344 len = SCHARS (components);
6345 i = i_byte = 0;
6346 while (i < len)
6347 {
6348 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6349 buf++;
6350 }
6351 }
6352 else if (INTEGERP (components))
6353 {
6354 len = 1;
6355 *buf++ = XINT (components);
6356 }
6357 else if (CONSP (components))
6358 {
6359 for (len = 0; CONSP (components);
6360 len++, components = XCDR (components))
6361 *buf++ = XINT (XCAR (components));
6362 }
6363 else
6364 abort ();
6365 *head -= len;
6366 }
6367 }
6368
6369 if (find_composition (end, limit, &start, &end, &prop,
6370 coding->src_object)
6371 && end <= limit)
6372 *stop = start;
6373 else
6374 *stop = limit;
6375 }
6376 return buf;
6377 }
6378
6379
6380 /* Extract an annotation datum from a text property `charset' at POS of
6381 CODING->src_object (buffer of string), store the data in BUF, set
6382 *STOP to the position where the value of `charset' property changes
6383 (limiting by LIMIT), and return the address of the next element of
6384 BUF.
6385
6386 If the property value is nil, set *STOP to the position where the
6387 property value is non-nil (limiting by LIMIT), and return BUF. */
6388
6389 static INLINE int *
6390 handle_charset_annotation (pos, limit, coding, buf, stop)
6391 EMACS_INT pos, limit;
6392 struct coding_system *coding;
6393 int *buf;
6394 EMACS_INT *stop;
6395 {
6396 Lisp_Object val, next;
6397 int id;
6398
6399 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6400 if (! NILP (val) && CHARSETP (val))
6401 id = XINT (CHARSET_SYMBOL_ID (val));
6402 else
6403 id = -1;
6404 ADD_CHARSET_DATA (buf, 0, id);
6405 next = Fnext_single_property_change (make_number (pos), Qcharset,
6406 coding->src_object,
6407 make_number (limit));
6408 *stop = XINT (next);
6409 return buf;
6410 }
6411
6412
6413 static void
6414 consume_chars (coding, translation_table, max_lookup)
6415 struct coding_system *coding;
6416 Lisp_Object translation_table;
6417 int max_lookup;
6418 {
6419 int *buf = coding->charbuf;
6420 int *buf_end = coding->charbuf + coding->charbuf_size;
6421 const unsigned char *src = coding->source + coding->consumed;
6422 const unsigned char *src_end = coding->source + coding->src_bytes;
6423 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6424 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6425 int multibytep = coding->src_multibyte;
6426 Lisp_Object eol_type;
6427 int c;
6428 EMACS_INT stop, stop_composition, stop_charset;
6429 int *lookup_buf = NULL;
6430
6431 if (! NILP (translation_table))
6432 lookup_buf = alloca (sizeof (int) * max_lookup);
6433
6434 eol_type = CODING_ID_EOL_TYPE (coding->id);
6435 if (VECTORP (eol_type))
6436 eol_type = Qunix;
6437
6438 /* Note: composition handling is not yet implemented. */
6439 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6440
6441 if (NILP (coding->src_object))
6442 stop = stop_composition = stop_charset = end_pos;
6443 else
6444 {
6445 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6446 stop = stop_composition = pos;
6447 else
6448 stop = stop_composition = end_pos;
6449 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6450 stop = stop_charset = pos;
6451 else
6452 stop_charset = end_pos;
6453 }
6454
6455 /* Compensate for CRLF and conversion. */
6456 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6457 while (buf < buf_end)
6458 {
6459 Lisp_Object trans;
6460
6461 if (pos == stop)
6462 {
6463 if (pos == end_pos)
6464 break;
6465 if (pos == stop_composition)
6466 buf = handle_composition_annotation (pos, end_pos, coding,
6467 buf, &stop_composition);
6468 if (pos == stop_charset)
6469 buf = handle_charset_annotation (pos, end_pos, coding,
6470 buf, &stop_charset);
6471 stop = (stop_composition < stop_charset
6472 ? stop_composition : stop_charset);
6473 }
6474
6475 if (! multibytep)
6476 {
6477 EMACS_INT bytes;
6478
6479 if (coding->encoder == encode_coding_raw_text)
6480 c = *src++, pos++;
6481 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6482 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6483 else
6484 c = BYTE8_TO_CHAR (*src), src++, pos++;
6485 }
6486 else
6487 c = STRING_CHAR_ADVANCE (src), pos++;
6488 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6489 c = '\n';
6490 if (! EQ (eol_type, Qunix))
6491 {
6492 if (c == '\n')
6493 {
6494 if (EQ (eol_type, Qdos))
6495 *buf++ = '\r';
6496 else
6497 c = '\r';
6498 }
6499 }
6500
6501 trans = Qnil;
6502 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6503 if (NILP (trans))
6504 *buf++ = c;
6505 else
6506 {
6507 int from_nchars = 1, to_nchars = 1;
6508 int *lookup_buf_end;
6509 const unsigned char *p = src;
6510 int i;
6511
6512 lookup_buf[0] = c;
6513 for (i = 1; i < max_lookup && p < src_end; i++)
6514 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6515 lookup_buf_end = lookup_buf + i;
6516 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6517 &from_nchars, &to_nchars);
6518 if (EQ (trans, Qt)
6519 || buf + to_nchars > buf_end)
6520 break;
6521 *buf++ = *lookup_buf;
6522 for (i = 1; i < to_nchars; i++)
6523 *buf++ = XINT (AREF (trans, i));
6524 for (i = 1; i < from_nchars; i++, pos++)
6525 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6526 }
6527 }
6528
6529 coding->consumed = src - coding->source;
6530 coding->consumed_char = pos - coding->src_pos;
6531 coding->charbuf_used = buf - coding->charbuf;
6532 coding->chars_at_source = 0;
6533 }
6534
6535
6536 /* Encode the text at CODING->src_object into CODING->dst_object.
6537 CODING->src_object is a buffer or a string.
6538 CODING->dst_object is a buffer or nil.
6539
6540 If CODING->src_object is a buffer, it must be the current buffer.
6541 In this case, if CODING->src_pos is positive, it is a position of
6542 the source text in the buffer, otherwise. the source text is in the
6543 gap area of the buffer, and coding->src_pos specifies the offset of
6544 the text from GPT (which must be the same as PT). If this is the
6545 same buffer as CODING->dst_object, CODING->src_pos must be
6546 negative and CODING should not have `pre-write-conversion'.
6547
6548 If CODING->src_object is a string, CODING should not have
6549 `pre-write-conversion'.
6550
6551 If CODING->dst_object is a buffer, the encoded data is inserted at
6552 the current point of that buffer.
6553
6554 If CODING->dst_object is nil, the encoded data is placed at the
6555 memory area specified by CODING->destination. */
6556
6557 static int
6558 encode_coding (coding)
6559 struct coding_system *coding;
6560 {
6561 Lisp_Object attrs;
6562 Lisp_Object translation_table;
6563 int max_lookup;
6564
6565 attrs = CODING_ID_ATTRS (coding->id);
6566 if (coding->encoder == encode_coding_raw_text)
6567 translation_table = Qnil, max_lookup = 0;
6568 else
6569 translation_table = get_translation_table (attrs, 1, &max_lookup);
6570
6571 if (BUFFERP (coding->dst_object))
6572 {
6573 set_buffer_internal (XBUFFER (coding->dst_object));
6574 coding->dst_multibyte
6575 = ! NILP (current_buffer->enable_multibyte_characters);
6576 }
6577
6578 coding->consumed = coding->consumed_char = 0;
6579 coding->produced = coding->produced_char = 0;
6580 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6581 coding->errors = 0;
6582
6583 ALLOC_CONVERSION_WORK_AREA (coding);
6584
6585 do {
6586 coding_set_source (coding);
6587 consume_chars (coding, translation_table, max_lookup);
6588 coding_set_destination (coding);
6589 (*(coding->encoder)) (coding);
6590 } while (coding->consumed_char < coding->src_chars);
6591
6592 if (BUFFERP (coding->dst_object))
6593 insert_from_gap (coding->produced_char, coding->produced);
6594
6595 return (coding->result);
6596 }
6597
6598
6599 /* Name (or base name) of work buffer for code conversion. */
6600 static Lisp_Object Vcode_conversion_workbuf_name;
6601
6602 /* A working buffer used by the top level conversion. Once it is
6603 created, it is never destroyed. It has the name
6604 Vcode_conversion_workbuf_name. The other working buffers are
6605 destroyed after the use is finished, and their names are modified
6606 versions of Vcode_conversion_workbuf_name. */
6607 static Lisp_Object Vcode_conversion_reused_workbuf;
6608
6609 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6610 static int reused_workbuf_in_use;
6611
6612
6613 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6614 multibyteness of returning buffer. */
6615
6616 static Lisp_Object
6617 make_conversion_work_buffer (multibyte)
6618 int multibyte;
6619 {
6620 Lisp_Object name, workbuf;
6621 struct buffer *current;
6622
6623 if (reused_workbuf_in_use++)
6624 {
6625 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6626 workbuf = Fget_buffer_create (name);
6627 }
6628 else
6629 {
6630 name = Vcode_conversion_workbuf_name;
6631 workbuf = Fget_buffer_create (name);
6632 if (NILP (Vcode_conversion_reused_workbuf))
6633 Vcode_conversion_reused_workbuf = workbuf;
6634 }
6635 current = current_buffer;
6636 set_buffer_internal (XBUFFER (workbuf));
6637 Ferase_buffer ();
6638 current_buffer->undo_list = Qt;
6639 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6640 set_buffer_internal (current);
6641 return workbuf;
6642 }
6643
6644
6645 static Lisp_Object
6646 code_conversion_restore (arg)
6647 Lisp_Object arg;
6648 {
6649 Lisp_Object current, workbuf;
6650 struct gcpro gcpro1;
6651
6652 GCPRO1 (arg);
6653 current = XCAR (arg);
6654 workbuf = XCDR (arg);
6655 if (! NILP (workbuf))
6656 {
6657 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6658 reused_workbuf_in_use = 0;
6659 else if (! NILP (Fbuffer_live_p (workbuf)))
6660 Fkill_buffer (workbuf);
6661 }
6662 set_buffer_internal (XBUFFER (current));
6663 UNGCPRO;
6664 return Qnil;
6665 }
6666
6667 Lisp_Object
6668 code_conversion_save (with_work_buf, multibyte)
6669 int with_work_buf, multibyte;
6670 {
6671 Lisp_Object workbuf = Qnil;
6672
6673 if (with_work_buf)
6674 workbuf = make_conversion_work_buffer (multibyte);
6675 record_unwind_protect (code_conversion_restore,
6676 Fcons (Fcurrent_buffer (), workbuf));
6677 return workbuf;
6678 }
6679
6680 int
6681 decode_coding_gap (coding, chars, bytes)
6682 struct coding_system *coding;
6683 EMACS_INT chars, bytes;
6684 {
6685 int count = specpdl_ptr - specpdl;
6686 Lisp_Object attrs;
6687
6688 code_conversion_save (0, 0);
6689
6690 coding->src_object = Fcurrent_buffer ();
6691 coding->src_chars = chars;
6692 coding->src_bytes = bytes;
6693 coding->src_pos = -chars;
6694 coding->src_pos_byte = -bytes;
6695 coding->src_multibyte = chars < bytes;
6696 coding->dst_object = coding->src_object;
6697 coding->dst_pos = PT;
6698 coding->dst_pos_byte = PT_BYTE;
6699 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6700
6701 if (CODING_REQUIRE_DETECTION (coding))
6702 detect_coding (coding);
6703
6704 coding->mode |= CODING_MODE_LAST_BLOCK;
6705 decode_coding (coding);
6706
6707 attrs = CODING_ID_ATTRS (coding->id);
6708 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6709 {
6710 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6711 Lisp_Object val;
6712
6713 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6714 val = call1 (CODING_ATTR_POST_READ (attrs),
6715 make_number (coding->produced_char));
6716 CHECK_NATNUM (val);
6717 coding->produced_char += Z - prev_Z;
6718 coding->produced += Z_BYTE - prev_Z_BYTE;
6719 }
6720
6721 unbind_to (count, Qnil);
6722 return coding->result;
6723 }
6724
6725 int
6726 encode_coding_gap (coding, chars, bytes)
6727 struct coding_system *coding;
6728 EMACS_INT chars, bytes;
6729 {
6730 int count = specpdl_ptr - specpdl;
6731
6732 code_conversion_save (0, 0);
6733
6734 coding->src_object = Fcurrent_buffer ();
6735 coding->src_chars = chars;
6736 coding->src_bytes = bytes;
6737 coding->src_pos = -chars;
6738 coding->src_pos_byte = -bytes;
6739 coding->src_multibyte = chars < bytes;
6740 coding->dst_object = coding->src_object;
6741 coding->dst_pos = PT;
6742 coding->dst_pos_byte = PT_BYTE;
6743
6744 encode_coding (coding);
6745
6746 unbind_to (count, Qnil);
6747 return coding->result;
6748 }
6749
6750
6751 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6752 SRC_OBJECT into DST_OBJECT by coding context CODING.
6753
6754 SRC_OBJECT is a buffer, a string, or Qnil.
6755
6756 If it is a buffer, the text is at point of the buffer. FROM and TO
6757 are positions in the buffer.
6758
6759 If it is a string, the text is at the beginning of the string.
6760 FROM and TO are indices to the string.
6761
6762 If it is nil, the text is at coding->source. FROM and TO are
6763 indices to coding->source.
6764
6765 DST_OBJECT is a buffer, Qt, or Qnil.
6766
6767 If it is a buffer, the decoded text is inserted at point of the
6768 buffer. If the buffer is the same as SRC_OBJECT, the source text
6769 is deleted.
6770
6771 If it is Qt, a string is made from the decoded text, and
6772 set in CODING->dst_object.
6773
6774 If it is Qnil, the decoded text is stored at CODING->destination.
6775 The caller must allocate CODING->dst_bytes bytes at
6776 CODING->destination by xmalloc. If the decoded text is longer than
6777 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6778 */
6779
6780 void
6781 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6782 dst_object)
6783 struct coding_system *coding;
6784 Lisp_Object src_object;
6785 EMACS_INT from, from_byte, to, to_byte;
6786 Lisp_Object dst_object;
6787 {
6788 int count = specpdl_ptr - specpdl;
6789 unsigned char *destination;
6790 EMACS_INT dst_bytes;
6791 EMACS_INT chars = to - from;
6792 EMACS_INT bytes = to_byte - from_byte;
6793 Lisp_Object attrs;
6794 Lisp_Object buffer;
6795 int saved_pt = -1, saved_pt_byte;
6796
6797 buffer = Fcurrent_buffer ();
6798
6799 if (NILP (dst_object))
6800 {
6801 destination = coding->destination;
6802 dst_bytes = coding->dst_bytes;
6803 }
6804
6805 coding->src_object = src_object;
6806 coding->src_chars = chars;
6807 coding->src_bytes = bytes;
6808 coding->src_multibyte = chars < bytes;
6809
6810 if (STRINGP (src_object))
6811 {
6812 coding->src_pos = from;
6813 coding->src_pos_byte = from_byte;
6814 }
6815 else if (BUFFERP (src_object))
6816 {
6817 set_buffer_internal (XBUFFER (src_object));
6818 if (from != GPT)
6819 move_gap_both (from, from_byte);
6820 if (EQ (src_object, dst_object))
6821 {
6822 saved_pt = PT, saved_pt_byte = PT_BYTE;
6823 TEMP_SET_PT_BOTH (from, from_byte);
6824 del_range_both (from, from_byte, to, to_byte, 1);
6825 coding->src_pos = -chars;
6826 coding->src_pos_byte = -bytes;
6827 }
6828 else
6829 {
6830 coding->src_pos = from;
6831 coding->src_pos_byte = from_byte;
6832 }
6833 }
6834
6835 if (CODING_REQUIRE_DETECTION (coding))
6836 detect_coding (coding);
6837 attrs = CODING_ID_ATTRS (coding->id);
6838
6839 if (EQ (dst_object, Qt)
6840 || (! NILP (CODING_ATTR_POST_READ (attrs))
6841 && NILP (dst_object)))
6842 {
6843 coding->dst_object = code_conversion_save (1, 1);
6844 coding->dst_pos = BEG;
6845 coding->dst_pos_byte = BEG_BYTE;
6846 coding->dst_multibyte = 1;
6847 }
6848 else if (BUFFERP (dst_object))
6849 {
6850 code_conversion_save (0, 0);
6851 coding->dst_object = dst_object;
6852 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6853 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6854 coding->dst_multibyte
6855 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6856 }
6857 else
6858 {
6859 code_conversion_save (0, 0);
6860 coding->dst_object = Qnil;
6861 coding->dst_multibyte = 1;
6862 }
6863
6864 decode_coding (coding);
6865
6866 if (BUFFERP (coding->dst_object))
6867 set_buffer_internal (XBUFFER (coding->dst_object));
6868
6869 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6870 {
6871 struct gcpro gcpro1, gcpro2;
6872 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6873 Lisp_Object val;
6874
6875 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6876 GCPRO2 (coding->src_object, coding->dst_object);
6877 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6878 make_number (coding->produced_char));
6879 UNGCPRO;
6880 CHECK_NATNUM (val);
6881 coding->produced_char += Z - prev_Z;
6882 coding->produced += Z_BYTE - prev_Z_BYTE;
6883 }
6884
6885 if (EQ (dst_object, Qt))
6886 {
6887 coding->dst_object = Fbuffer_string ();
6888 }
6889 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6890 {
6891 set_buffer_internal (XBUFFER (coding->dst_object));
6892 if (dst_bytes < coding->produced)
6893 {
6894 destination
6895 = (unsigned char *) xrealloc (destination, coding->produced);
6896 if (! destination)
6897 {
6898 record_conversion_result (coding,
6899 CODING_RESULT_INSUFFICIENT_DST);
6900 unbind_to (count, Qnil);
6901 return;
6902 }
6903 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6904 move_gap_both (BEGV, BEGV_BYTE);
6905 bcopy (BEGV_ADDR, destination, coding->produced);
6906 coding->destination = destination;
6907 }
6908 }
6909
6910 if (saved_pt >= 0)
6911 {
6912 /* This is the case of:
6913 (BUFFERP (src_object) && EQ (src_object, dst_object))
6914 As we have moved PT while replacing the original buffer
6915 contents, we must recover it now. */
6916 set_buffer_internal (XBUFFER (src_object));
6917 if (saved_pt < from)
6918 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6919 else if (saved_pt < from + chars)
6920 TEMP_SET_PT_BOTH (from, from_byte);
6921 else if (! NILP (current_buffer->enable_multibyte_characters))
6922 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6923 saved_pt_byte + (coding->produced - bytes));
6924 else
6925 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6926 saved_pt_byte + (coding->produced - bytes));
6927 }
6928
6929 unbind_to (count, coding->dst_object);
6930 }
6931
6932
6933 void
6934 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6935 dst_object)
6936 struct coding_system *coding;
6937 Lisp_Object src_object;
6938 EMACS_INT from, from_byte, to, to_byte;
6939 Lisp_Object dst_object;
6940 {
6941 int count = specpdl_ptr - specpdl;
6942 EMACS_INT chars = to - from;
6943 EMACS_INT bytes = to_byte - from_byte;
6944 Lisp_Object attrs;
6945 Lisp_Object buffer;
6946 int saved_pt = -1, saved_pt_byte;
6947 int kill_src_buffer = 0;
6948
6949 buffer = Fcurrent_buffer ();
6950
6951 coding->src_object = src_object;
6952 coding->src_chars = chars;
6953 coding->src_bytes = bytes;
6954 coding->src_multibyte = chars < bytes;
6955
6956 attrs = CODING_ID_ATTRS (coding->id);
6957
6958 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6959 {
6960 coding->src_object = code_conversion_save (1, coding->src_multibyte);
6961 set_buffer_internal (XBUFFER (coding->src_object));
6962 if (STRINGP (src_object))
6963 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6964 else if (BUFFERP (src_object))
6965 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6966 else
6967 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6968
6969 if (EQ (src_object, dst_object))
6970 {
6971 set_buffer_internal (XBUFFER (src_object));
6972 saved_pt = PT, saved_pt_byte = PT_BYTE;
6973 del_range_both (from, from_byte, to, to_byte, 1);
6974 set_buffer_internal (XBUFFER (coding->src_object));
6975 }
6976
6977 {
6978 Lisp_Object args[3];
6979
6980 args[0] = CODING_ATTR_PRE_WRITE (attrs);
6981 args[1] = make_number (BEG);
6982 args[2] = make_number (Z);
6983 safe_call (3, args);
6984 }
6985 if (XBUFFER (coding->src_object) != current_buffer)
6986 kill_src_buffer = 1;
6987 coding->src_object = Fcurrent_buffer ();
6988 if (BEG != GPT)
6989 move_gap_both (BEG, BEG_BYTE);
6990 coding->src_chars = Z - BEG;
6991 coding->src_bytes = Z_BYTE - BEG_BYTE;
6992 coding->src_pos = BEG;
6993 coding->src_pos_byte = BEG_BYTE;
6994 coding->src_multibyte = Z < Z_BYTE;
6995 }
6996 else if (STRINGP (src_object))
6997 {
6998 code_conversion_save (0, 0);
6999 coding->src_pos = from;
7000 coding->src_pos_byte = from_byte;
7001 }
7002 else if (BUFFERP (src_object))
7003 {
7004 code_conversion_save (0, 0);
7005 set_buffer_internal (XBUFFER (src_object));
7006 if (EQ (src_object, dst_object))
7007 {
7008 saved_pt = PT, saved_pt_byte = PT_BYTE;
7009 coding->src_object = del_range_1 (from, to, 1, 1);
7010 coding->src_pos = 0;
7011 coding->src_pos_byte = 0;
7012 }
7013 else
7014 {
7015 if (from < GPT && to >= GPT)
7016 move_gap_both (from, from_byte);
7017 coding->src_pos = from;
7018 coding->src_pos_byte = from_byte;
7019 }
7020 }
7021 else
7022 code_conversion_save (0, 0);
7023
7024 if (BUFFERP (dst_object))
7025 {
7026 coding->dst_object = dst_object;
7027 if (EQ (src_object, dst_object))
7028 {
7029 coding->dst_pos = from;
7030 coding->dst_pos_byte = from_byte;
7031 }
7032 else
7033 {
7034 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7035 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7036 }
7037 coding->dst_multibyte
7038 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7039 }
7040 else if (EQ (dst_object, Qt))
7041 {
7042 coding->dst_object = Qnil;
7043 coding->dst_bytes = coding->src_chars;
7044 if (coding->dst_bytes == 0)
7045 coding->dst_bytes = 1;
7046 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7047 coding->dst_multibyte = 0;
7048 }
7049 else
7050 {
7051 coding->dst_object = Qnil;
7052 coding->dst_multibyte = 0;
7053 }
7054
7055 encode_coding (coding);
7056
7057 if (EQ (dst_object, Qt))
7058 {
7059 if (BUFFERP (coding->dst_object))
7060 coding->dst_object = Fbuffer_string ();
7061 else
7062 {
7063 coding->dst_object
7064 = make_unibyte_string ((char *) coding->destination,
7065 coding->produced);
7066 xfree (coding->destination);
7067 }
7068 }
7069
7070 if (saved_pt >= 0)
7071 {
7072 /* This is the case of:
7073 (BUFFERP (src_object) && EQ (src_object, dst_object))
7074 As we have moved PT while replacing the original buffer
7075 contents, we must recover it now. */
7076 set_buffer_internal (XBUFFER (src_object));
7077 if (saved_pt < from)
7078 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7079 else if (saved_pt < from + chars)
7080 TEMP_SET_PT_BOTH (from, from_byte);
7081 else if (! NILP (current_buffer->enable_multibyte_characters))
7082 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7083 saved_pt_byte + (coding->produced - bytes));
7084 else
7085 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7086 saved_pt_byte + (coding->produced - bytes));
7087 }
7088
7089 if (kill_src_buffer)
7090 Fkill_buffer (coding->src_object);
7091 unbind_to (count, Qnil);
7092 }
7093
7094
7095 Lisp_Object
7096 preferred_coding_system ()
7097 {
7098 int id = coding_categories[coding_priorities[0]].id;
7099
7100 return CODING_ID_NAME (id);
7101 }
7102
7103 \f
7104 #ifdef emacs
7105 /*** 8. Emacs Lisp library functions ***/
7106
7107 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7108 doc: /* Return t if OBJECT is nil or a coding-system.
7109 See the documentation of `define-coding-system' for information
7110 about coding-system objects. */)
7111 (obj)
7112 Lisp_Object obj;
7113 {
7114 if (NILP (obj)
7115 || CODING_SYSTEM_ID (obj) >= 0)
7116 return Qt;
7117 if (! SYMBOLP (obj)
7118 || NILP (Fget (obj, Qcoding_system_define_form)))
7119 return Qnil;
7120 return Qt;
7121 }
7122
7123 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7124 Sread_non_nil_coding_system, 1, 1, 0,
7125 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7126 (prompt)
7127 Lisp_Object prompt;
7128 {
7129 Lisp_Object val;
7130 do
7131 {
7132 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7133 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7134 }
7135 while (SCHARS (val) == 0);
7136 return (Fintern (val, Qnil));
7137 }
7138
7139 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7140 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7141 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
7142 (prompt, default_coding_system)
7143 Lisp_Object prompt, default_coding_system;
7144 {
7145 Lisp_Object val;
7146 if (SYMBOLP (default_coding_system))
7147 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7148 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7149 Qt, Qnil, Qcoding_system_history,
7150 default_coding_system, Qnil);
7151 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7152 }
7153
7154 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7155 1, 1, 0,
7156 doc: /* Check validity of CODING-SYSTEM.
7157 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7158 It is valid if it is nil or a symbol defined as a coding system by the
7159 function `define-coding-system'. */)
7160 (coding_system)
7161 Lisp_Object coding_system;
7162 {
7163 Lisp_Object define_form;
7164
7165 define_form = Fget (coding_system, Qcoding_system_define_form);
7166 if (! NILP (define_form))
7167 {
7168 Fput (coding_system, Qcoding_system_define_form, Qnil);
7169 safe_eval (define_form);
7170 }
7171 if (!NILP (Fcoding_system_p (coding_system)))
7172 return coding_system;
7173 while (1)
7174 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7175 }
7176
7177 \f
7178 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7179 HIGHEST is nonzero, return the coding system of the highest
7180 priority among the detected coding systems. Otherwize return a
7181 list of detected coding systems sorted by their priorities. If
7182 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7183 multibyte form but contains only ASCII and eight-bit chars.
7184 Otherwise, the bytes are raw bytes.
7185
7186 CODING-SYSTEM controls the detection as below:
7187
7188 If it is nil, detect both text-format and eol-format. If the
7189 text-format part of CODING-SYSTEM is already specified
7190 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7191 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7192 detect only text-format. */
7193
7194 Lisp_Object
7195 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7196 coding_system)
7197 const unsigned char *src;
7198 int src_chars, src_bytes, highest;
7199 int multibytep;
7200 Lisp_Object coding_system;
7201 {
7202 const unsigned char *src_end = src + src_bytes;
7203 Lisp_Object attrs, eol_type;
7204 Lisp_Object val;
7205 struct coding_system coding;
7206 int id;
7207 struct coding_detection_info detect_info;
7208 enum coding_category base_category;
7209
7210 if (NILP (coding_system))
7211 coding_system = Qundecided;
7212 setup_coding_system (coding_system, &coding);
7213 attrs = CODING_ID_ATTRS (coding.id);
7214 eol_type = CODING_ID_EOL_TYPE (coding.id);
7215 coding_system = CODING_ATTR_BASE_NAME (attrs);
7216
7217 coding.source = src;
7218 coding.src_chars = src_chars;
7219 coding.src_bytes = src_bytes;
7220 coding.src_multibyte = multibytep;
7221 coding.consumed = 0;
7222 coding.mode |= CODING_MODE_LAST_BLOCK;
7223
7224 detect_info.checked = detect_info.found = detect_info.rejected = 0;
7225
7226 /* At first, detect text-format if necessary. */
7227 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7228 if (base_category == coding_category_undecided)
7229 {
7230 enum coding_category category;
7231 struct coding_system *this;
7232 int c, i;
7233
7234 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7235 for (i = 0; src < src_end; i++, src++)
7236 {
7237 c = *src;
7238 if (c & 0x80)
7239 break;
7240 if (c < 0x20
7241 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7242 && inhibit_iso_escape_detection)
7243 {
7244 coding.head_ascii = src - coding.source;
7245 if (detect_coding_iso_2022 (&coding, &detect_info))
7246 {
7247 /* We have scanned the whole data. */
7248 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7249 /* We didn't find an 8-bit code. */
7250 src = src_end;
7251 break;
7252 }
7253 }
7254 }
7255 coding.head_ascii = src - coding.source;
7256
7257 if (src < src_end
7258 || detect_info.found)
7259 {
7260 if (src == src_end)
7261 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7262 for (i = 0; i < coding_category_raw_text; i++)
7263 {
7264 category = coding_priorities[i];
7265 if (detect_info.found & (1 << category))
7266 break;
7267 }
7268 else
7269 for (i = 0; i < coding_category_raw_text; i++)
7270 {
7271 category = coding_priorities[i];
7272 this = coding_categories + category;
7273
7274 if (this->id < 0)
7275 {
7276 /* No coding system of this category is defined. */
7277 detect_info.rejected |= (1 << category);
7278 }
7279 else if (category >= coding_category_raw_text)
7280 continue;
7281 else if (detect_info.checked & (1 << category))
7282 {
7283 if (highest
7284 && (detect_info.found & (1 << category)))
7285 break;
7286 }
7287 else
7288 {
7289 if ((*(this->detector)) (&coding, &detect_info)
7290 && highest
7291 && (detect_info.found & (1 << category)))
7292 {
7293 if (category == coding_category_utf_16_auto)
7294 {
7295 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7296 category = coding_category_utf_16_le;
7297 else
7298 category = coding_category_utf_16_be;
7299 }
7300 break;
7301 }
7302 }
7303 }
7304 }
7305
7306 if (detect_info.rejected == CATEGORY_MASK_ANY)
7307 {
7308 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7309 id = coding_categories[coding_category_raw_text].id;
7310 val = Fcons (make_number (id), Qnil);
7311 }
7312 else if (! detect_info.rejected && ! detect_info.found)
7313 {
7314 detect_info.found = CATEGORY_MASK_ANY;
7315 id = coding_categories[coding_category_undecided].id;
7316 val = Fcons (make_number (id), Qnil);
7317 }
7318 else if (highest)
7319 {
7320 if (detect_info.found)
7321 {
7322 detect_info.found = 1 << category;
7323 val = Fcons (make_number (this->id), Qnil);
7324 }
7325 else
7326 for (i = 0; i < coding_category_raw_text; i++)
7327 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7328 {
7329 detect_info.found = 1 << coding_priorities[i];
7330 id = coding_categories[coding_priorities[i]].id;
7331 val = Fcons (make_number (id), Qnil);
7332 break;
7333 }
7334 }
7335 else
7336 {
7337 int mask = detect_info.rejected | detect_info.found;
7338 int found = 0;
7339 val = Qnil;
7340
7341 for (i = coding_category_raw_text - 1; i >= 0; i--)
7342 {
7343 category = coding_priorities[i];
7344 if (! (mask & (1 << category)))
7345 {
7346 found |= 1 << category;
7347 id = coding_categories[category].id;
7348 val = Fcons (make_number (id), val);
7349 }
7350 }
7351 for (i = coding_category_raw_text - 1; i >= 0; i--)
7352 {
7353 category = coding_priorities[i];
7354 if (detect_info.found & (1 << category))
7355 {
7356 id = coding_categories[category].id;
7357 val = Fcons (make_number (id), val);
7358 }
7359 }
7360 detect_info.found |= found;
7361 }
7362 }
7363 else if (base_category == coding_category_utf_16_auto)
7364 {
7365 if (detect_coding_utf_16 (&coding, &detect_info))
7366 {
7367 struct coding_system *this;
7368
7369 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7370 this = coding_categories + coding_category_utf_16_le;
7371 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7372 this = coding_categories + coding_category_utf_16_be;
7373 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7374 this = coding_categories + coding_category_utf_16_be_nosig;
7375 else
7376 this = coding_categories + coding_category_utf_16_le_nosig;
7377 val = Fcons (make_number (this->id), Qnil);
7378 }
7379 }
7380 else
7381 {
7382 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7383 val = Fcons (make_number (coding.id), Qnil);
7384 }
7385
7386 /* Then, detect eol-format if necessary. */
7387 {
7388 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7389 Lisp_Object tail;
7390
7391 if (VECTORP (eol_type))
7392 {
7393 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7394 normal_eol = detect_eol (coding.source, src_bytes,
7395 coding_category_raw_text);
7396 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7397 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7398 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7399 coding_category_utf_16_be);
7400 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7401 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7402 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7403 coding_category_utf_16_le);
7404 }
7405 else
7406 {
7407 if (EQ (eol_type, Qunix))
7408 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7409 else if (EQ (eol_type, Qdos))
7410 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7411 else
7412 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7413 }
7414
7415 for (tail = val; CONSP (tail); tail = XCDR (tail))
7416 {
7417 enum coding_category category;
7418 int this_eol;
7419
7420 id = XINT (XCAR (tail));
7421 attrs = CODING_ID_ATTRS (id);
7422 category = XINT (CODING_ATTR_CATEGORY (attrs));
7423 eol_type = CODING_ID_EOL_TYPE (id);
7424 if (VECTORP (eol_type))
7425 {
7426 if (category == coding_category_utf_16_be
7427 || category == coding_category_utf_16_be_nosig)
7428 this_eol = utf_16_be_eol;
7429 else if (category == coding_category_utf_16_le
7430 || category == coding_category_utf_16_le_nosig)
7431 this_eol = utf_16_le_eol;
7432 else
7433 this_eol = normal_eol;
7434
7435 if (this_eol == EOL_SEEN_LF)
7436 XSETCAR (tail, AREF (eol_type, 0));
7437 else if (this_eol == EOL_SEEN_CRLF)
7438 XSETCAR (tail, AREF (eol_type, 1));
7439 else if (this_eol == EOL_SEEN_CR)
7440 XSETCAR (tail, AREF (eol_type, 2));
7441 else
7442 XSETCAR (tail, CODING_ID_NAME (id));
7443 }
7444 else
7445 XSETCAR (tail, CODING_ID_NAME (id));
7446 }
7447 }
7448
7449 return (highest ? XCAR (val) : val);
7450 }
7451
7452
7453 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7454 2, 3, 0,
7455 doc: /* Detect coding system of the text in the region between START and END.
7456 Return a list of possible coding systems ordered by priority.
7457
7458 If only ASCII characters are found, it returns a list of single element
7459 `undecided' or its subsidiary coding system according to a detected
7460 end-of-line format.
7461
7462 If optional argument HIGHEST is non-nil, return the coding system of
7463 highest priority. */)
7464 (start, end, highest)
7465 Lisp_Object start, end, highest;
7466 {
7467 int from, to;
7468 int from_byte, to_byte;
7469
7470 CHECK_NUMBER_COERCE_MARKER (start);
7471 CHECK_NUMBER_COERCE_MARKER (end);
7472
7473 validate_region (&start, &end);
7474 from = XINT (start), to = XINT (end);
7475 from_byte = CHAR_TO_BYTE (from);
7476 to_byte = CHAR_TO_BYTE (to);
7477
7478 if (from < GPT && to >= GPT)
7479 move_gap_both (to, to_byte);
7480
7481 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7482 to - from, to_byte - from_byte,
7483 !NILP (highest),
7484 !NILP (current_buffer
7485 ->enable_multibyte_characters),
7486 Qnil);
7487 }
7488
7489 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7490 1, 2, 0,
7491 doc: /* Detect coding system of the text in STRING.
7492 Return a list of possible coding systems ordered by priority.
7493
7494 If only ASCII characters are found, it returns a list of single element
7495 `undecided' or its subsidiary coding system according to a detected
7496 end-of-line format.
7497
7498 If optional argument HIGHEST is non-nil, return the coding system of
7499 highest priority. */)
7500 (string, highest)
7501 Lisp_Object string, highest;
7502 {
7503 CHECK_STRING (string);
7504
7505 return detect_coding_system (SDATA (string),
7506 SCHARS (string), SBYTES (string),
7507 !NILP (highest), STRING_MULTIBYTE (string),
7508 Qnil);
7509 }
7510
7511
7512 static INLINE int
7513 char_encodable_p (c, attrs)
7514 int c;
7515 Lisp_Object attrs;
7516 {
7517 Lisp_Object tail;
7518 struct charset *charset;
7519 Lisp_Object translation_table;
7520
7521 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7522 if (! NILP (translation_table))
7523 c = translate_char (translation_table, c);
7524 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7525 CONSP (tail); tail = XCDR (tail))
7526 {
7527 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7528 if (CHAR_CHARSET_P (c, charset))
7529 break;
7530 }
7531 return (! NILP (tail));
7532 }
7533
7534
7535 /* Return a list of coding systems that safely encode the text between
7536 START and END. If EXCLUDE is non-nil, it is a list of coding
7537 systems not to check. The returned list doesn't contain any such
7538 coding systems. In any case, if the text contains only ASCII or is
7539 unibyte, return t. */
7540
7541 DEFUN ("find-coding-systems-region-internal",
7542 Ffind_coding_systems_region_internal,
7543 Sfind_coding_systems_region_internal, 2, 3, 0,
7544 doc: /* Internal use only. */)
7545 (start, end, exclude)
7546 Lisp_Object start, end, exclude;
7547 {
7548 Lisp_Object coding_attrs_list, safe_codings;
7549 EMACS_INT start_byte, end_byte;
7550 const unsigned char *p, *pbeg, *pend;
7551 int c;
7552 Lisp_Object tail, elt;
7553
7554 if (STRINGP (start))
7555 {
7556 if (!STRING_MULTIBYTE (start)
7557 || SCHARS (start) == SBYTES (start))
7558 return Qt;
7559 start_byte = 0;
7560 end_byte = SBYTES (start);
7561 }
7562 else
7563 {
7564 CHECK_NUMBER_COERCE_MARKER (start);
7565 CHECK_NUMBER_COERCE_MARKER (end);
7566 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7567 args_out_of_range (start, end);
7568 if (NILP (current_buffer->enable_multibyte_characters))
7569 return Qt;
7570 start_byte = CHAR_TO_BYTE (XINT (start));
7571 end_byte = CHAR_TO_BYTE (XINT (end));
7572 if (XINT (end) - XINT (start) == end_byte - start_byte)
7573 return Qt;
7574
7575 if (XINT (start) < GPT && XINT (end) > GPT)
7576 {
7577 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7578 move_gap_both (XINT (start), start_byte);
7579 else
7580 move_gap_both (XINT (end), end_byte);
7581 }
7582 }
7583
7584 coding_attrs_list = Qnil;
7585 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7586 if (NILP (exclude)
7587 || NILP (Fmemq (XCAR (tail), exclude)))
7588 {
7589 Lisp_Object attrs;
7590
7591 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7592 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7593 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7594 {
7595 ASET (attrs, coding_attr_trans_tbl,
7596 get_translation_table (attrs, 1, NULL));
7597 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7598 }
7599 }
7600
7601 if (STRINGP (start))
7602 p = pbeg = SDATA (start);
7603 else
7604 p = pbeg = BYTE_POS_ADDR (start_byte);
7605 pend = p + (end_byte - start_byte);
7606
7607 while (p < pend && ASCII_BYTE_P (*p)) p++;
7608 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7609
7610 while (p < pend)
7611 {
7612 if (ASCII_BYTE_P (*p))
7613 p++;
7614 else
7615 {
7616 c = STRING_CHAR_ADVANCE (p);
7617
7618 charset_map_loaded = 0;
7619 for (tail = coding_attrs_list; CONSP (tail);)
7620 {
7621 elt = XCAR (tail);
7622 if (NILP (elt))
7623 tail = XCDR (tail);
7624 else if (char_encodable_p (c, elt))
7625 tail = XCDR (tail);
7626 else if (CONSP (XCDR (tail)))
7627 {
7628 XSETCAR (tail, XCAR (XCDR (tail)));
7629 XSETCDR (tail, XCDR (XCDR (tail)));
7630 }
7631 else
7632 {
7633 XSETCAR (tail, Qnil);
7634 tail = XCDR (tail);
7635 }
7636 }
7637 if (charset_map_loaded)
7638 {
7639 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7640
7641 if (STRINGP (start))
7642 pbeg = SDATA (start);
7643 else
7644 pbeg = BYTE_POS_ADDR (start_byte);
7645 p = pbeg + p_offset;
7646 pend = pbeg + pend_offset;
7647 }
7648 }
7649 }
7650
7651 safe_codings = list2 (Qraw_text, Qno_conversion);
7652 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7653 if (! NILP (XCAR (tail)))
7654 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7655
7656 return safe_codings;
7657 }
7658
7659
7660 DEFUN ("unencodable-char-position", Funencodable_char_position,
7661 Sunencodable_char_position, 3, 5, 0,
7662 doc: /*
7663 Return position of first un-encodable character in a region.
7664 START and END specfiy the region and CODING-SYSTEM specifies the
7665 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7666
7667 If optional 4th argument COUNT is non-nil, it specifies at most how
7668 many un-encodable characters to search. In this case, the value is a
7669 list of positions.
7670
7671 If optional 5th argument STRING is non-nil, it is a string to search
7672 for un-encodable characters. In that case, START and END are indexes
7673 to the string. */)
7674 (start, end, coding_system, count, string)
7675 Lisp_Object start, end, coding_system, count, string;
7676 {
7677 int n;
7678 struct coding_system coding;
7679 Lisp_Object attrs, charset_list, translation_table;
7680 Lisp_Object positions;
7681 int from, to;
7682 const unsigned char *p, *stop, *pend;
7683 int ascii_compatible;
7684
7685 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7686 attrs = CODING_ID_ATTRS (coding.id);
7687 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7688 return Qnil;
7689 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7690 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7691 translation_table = get_translation_table (attrs, 1, NULL);
7692
7693 if (NILP (string))
7694 {
7695 validate_region (&start, &end);
7696 from = XINT (start);
7697 to = XINT (end);
7698 if (NILP (current_buffer->enable_multibyte_characters)
7699 || (ascii_compatible
7700 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7701 return Qnil;
7702 p = CHAR_POS_ADDR (from);
7703 pend = CHAR_POS_ADDR (to);
7704 if (from < GPT && to >= GPT)
7705 stop = GPT_ADDR;
7706 else
7707 stop = pend;
7708 }
7709 else
7710 {
7711 CHECK_STRING (string);
7712 CHECK_NATNUM (start);
7713 CHECK_NATNUM (end);
7714 from = XINT (start);
7715 to = XINT (end);
7716 if (from > to
7717 || to > SCHARS (string))
7718 args_out_of_range_3 (string, start, end);
7719 if (! STRING_MULTIBYTE (string))
7720 return Qnil;
7721 p = SDATA (string) + string_char_to_byte (string, from);
7722 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7723 if (ascii_compatible && (to - from) == (pend - p))
7724 return Qnil;
7725 }
7726
7727 if (NILP (count))
7728 n = 1;
7729 else
7730 {
7731 CHECK_NATNUM (count);
7732 n = XINT (count);
7733 }
7734
7735 positions = Qnil;
7736 while (1)
7737 {
7738 int c;
7739
7740 if (ascii_compatible)
7741 while (p < stop && ASCII_BYTE_P (*p))
7742 p++, from++;
7743 if (p >= stop)
7744 {
7745 if (p >= pend)
7746 break;
7747 stop = pend;
7748 p = GAP_END_ADDR;
7749 }
7750
7751 c = STRING_CHAR_ADVANCE (p);
7752 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7753 && ! char_charset (translate_char (translation_table, c),
7754 charset_list, NULL))
7755 {
7756 positions = Fcons (make_number (from), positions);
7757 n--;
7758 if (n == 0)
7759 break;
7760 }
7761
7762 from++;
7763 }
7764
7765 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7766 }
7767
7768
7769 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7770 Scheck_coding_systems_region, 3, 3, 0,
7771 doc: /* Check if the region is encodable by coding systems.
7772
7773 START and END are buffer positions specifying the region.
7774 CODING-SYSTEM-LIST is a list of coding systems to check.
7775
7776 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7777 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7778 whole region, POS0, POS1, ... are buffer positions where non-encodable
7779 characters are found.
7780
7781 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7782 value is nil.
7783
7784 START may be a string. In that case, check if the string is
7785 encodable, and the value contains indices to the string instead of
7786 buffer positions. END is ignored. */)
7787 (start, end, coding_system_list)
7788 Lisp_Object start, end, coding_system_list;
7789 {
7790 Lisp_Object list;
7791 EMACS_INT start_byte, end_byte;
7792 int pos;
7793 const unsigned char *p, *pbeg, *pend;
7794 int c;
7795 Lisp_Object tail, elt, attrs;
7796
7797 if (STRINGP (start))
7798 {
7799 if (!STRING_MULTIBYTE (start)
7800 && SCHARS (start) != SBYTES (start))
7801 return Qnil;
7802 start_byte = 0;
7803 end_byte = SBYTES (start);
7804 pos = 0;
7805 }
7806 else
7807 {
7808 CHECK_NUMBER_COERCE_MARKER (start);
7809 CHECK_NUMBER_COERCE_MARKER (end);
7810 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7811 args_out_of_range (start, end);
7812 if (NILP (current_buffer->enable_multibyte_characters))
7813 return Qnil;
7814 start_byte = CHAR_TO_BYTE (XINT (start));
7815 end_byte = CHAR_TO_BYTE (XINT (end));
7816 if (XINT (end) - XINT (start) == end_byte - start_byte)
7817 return Qt;
7818
7819 if (XINT (start) < GPT && XINT (end) > GPT)
7820 {
7821 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7822 move_gap_both (XINT (start), start_byte);
7823 else
7824 move_gap_both (XINT (end), end_byte);
7825 }
7826 pos = XINT (start);
7827 }
7828
7829 list = Qnil;
7830 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7831 {
7832 elt = XCAR (tail);
7833 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7834 ASET (attrs, coding_attr_trans_tbl,
7835 get_translation_table (attrs, 1, NULL));
7836 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7837 }
7838
7839 if (STRINGP (start))
7840 p = pbeg = SDATA (start);
7841 else
7842 p = pbeg = BYTE_POS_ADDR (start_byte);
7843 pend = p + (end_byte - start_byte);
7844
7845 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7846 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7847
7848 while (p < pend)
7849 {
7850 if (ASCII_BYTE_P (*p))
7851 p++;
7852 else
7853 {
7854 c = STRING_CHAR_ADVANCE (p);
7855
7856 charset_map_loaded = 0;
7857 for (tail = list; CONSP (tail); tail = XCDR (tail))
7858 {
7859 elt = XCDR (XCAR (tail));
7860 if (! char_encodable_p (c, XCAR (elt)))
7861 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7862 }
7863 if (charset_map_loaded)
7864 {
7865 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7866
7867 if (STRINGP (start))
7868 pbeg = SDATA (start);
7869 else
7870 pbeg = BYTE_POS_ADDR (start_byte);
7871 p = pbeg + p_offset;
7872 pend = pbeg + pend_offset;
7873 }
7874 }
7875 pos++;
7876 }
7877
7878 tail = list;
7879 list = Qnil;
7880 for (; CONSP (tail); tail = XCDR (tail))
7881 {
7882 elt = XCAR (tail);
7883 if (CONSP (XCDR (XCDR (elt))))
7884 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7885 list);
7886 }
7887
7888 return list;
7889 }
7890
7891
7892 Lisp_Object
7893 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7894 Lisp_Object start, end, coding_system, dst_object;
7895 int encodep, norecord;
7896 {
7897 struct coding_system coding;
7898 EMACS_INT from, from_byte, to, to_byte;
7899 Lisp_Object src_object;
7900
7901 CHECK_NUMBER_COERCE_MARKER (start);
7902 CHECK_NUMBER_COERCE_MARKER (end);
7903 if (NILP (coding_system))
7904 coding_system = Qno_conversion;
7905 else
7906 CHECK_CODING_SYSTEM (coding_system);
7907 src_object = Fcurrent_buffer ();
7908 if (NILP (dst_object))
7909 dst_object = src_object;
7910 else if (! EQ (dst_object, Qt))
7911 CHECK_BUFFER (dst_object);
7912
7913 validate_region (&start, &end);
7914 from = XFASTINT (start);
7915 from_byte = CHAR_TO_BYTE (from);
7916 to = XFASTINT (end);
7917 to_byte = CHAR_TO_BYTE (to);
7918
7919 setup_coding_system (coding_system, &coding);
7920 coding.mode |= CODING_MODE_LAST_BLOCK;
7921
7922 if (encodep)
7923 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7924 dst_object);
7925 else
7926 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7927 dst_object);
7928 if (! norecord)
7929 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7930
7931 return (BUFFERP (dst_object)
7932 ? make_number (coding.produced_char)
7933 : coding.dst_object);
7934 }
7935
7936
7937 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7938 3, 4, "r\nzCoding system: ",
7939 doc: /* Decode the current region from the specified coding system.
7940 When called from a program, takes four arguments:
7941 START, END, CODING-SYSTEM, and DESTINATION.
7942 START and END are buffer positions.
7943
7944 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7945 If nil, the region between START and END is replace by the decoded text.
7946 If buffer, the decoded text is inserted in the buffer.
7947 If t, the decoded text is returned.
7948
7949 This function sets `last-coding-system-used' to the precise coding system
7950 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7951 not fully specified.)
7952 It returns the length of the decoded text. */)
7953 (start, end, coding_system, destination)
7954 Lisp_Object start, end, coding_system, destination;
7955 {
7956 return code_convert_region (start, end, coding_system, destination, 0, 0);
7957 }
7958
7959 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7960 3, 4, "r\nzCoding system: ",
7961 doc: /* Encode the current region by specified coding system.
7962 When called from a program, takes three arguments:
7963 START, END, and CODING-SYSTEM. START and END are buffer positions.
7964
7965 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7966 If nil, the region between START and END is replace by the encoded text.
7967 If buffer, the encoded text is inserted in the buffer.
7968 If t, the encoded text is returned.
7969
7970 This function sets `last-coding-system-used' to the precise coding system
7971 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7972 not fully specified.)
7973 It returns the length of the encoded text. */)
7974 (start, end, coding_system, destination)
7975 Lisp_Object start, end, coding_system, destination;
7976 {
7977 return code_convert_region (start, end, coding_system, destination, 1, 0);
7978 }
7979
7980 Lisp_Object
7981 code_convert_string (string, coding_system, dst_object,
7982 encodep, nocopy, norecord)
7983 Lisp_Object string, coding_system, dst_object;
7984 int encodep, nocopy, norecord;
7985 {
7986 struct coding_system coding;
7987 EMACS_INT chars, bytes;
7988
7989 CHECK_STRING (string);
7990 if (NILP (coding_system))
7991 {
7992 if (! norecord)
7993 Vlast_coding_system_used = Qno_conversion;
7994 if (NILP (dst_object))
7995 return (nocopy ? Fcopy_sequence (string) : string);
7996 }
7997
7998 if (NILP (coding_system))
7999 coding_system = Qno_conversion;
8000 else
8001 CHECK_CODING_SYSTEM (coding_system);
8002 if (NILP (dst_object))
8003 dst_object = Qt;
8004 else if (! EQ (dst_object, Qt))
8005 CHECK_BUFFER (dst_object);
8006
8007 setup_coding_system (coding_system, &coding);
8008 coding.mode |= CODING_MODE_LAST_BLOCK;
8009 chars = SCHARS (string);
8010 bytes = SBYTES (string);
8011 if (encodep)
8012 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8013 else
8014 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8015 if (! norecord)
8016 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8017
8018 return (BUFFERP (dst_object)
8019 ? make_number (coding.produced_char)
8020 : coding.dst_object);
8021 }
8022
8023
8024 /* Encode or decode STRING according to CODING_SYSTEM.
8025 Do not set Vlast_coding_system_used.
8026
8027 This function is called only from macros DECODE_FILE and
8028 ENCODE_FILE, thus we ignore character composition. */
8029
8030 Lisp_Object
8031 code_convert_string_norecord (string, coding_system, encodep)
8032 Lisp_Object string, coding_system;
8033 int encodep;
8034 {
8035 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8036 }
8037
8038
8039 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8040 2, 4, 0,
8041 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8042
8043 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8044 if the decoding operation is trivial.
8045
8046 Optional fourth arg BUFFER non-nil meant that the decoded text is
8047 inserted in BUFFER instead of returned as a string. In this case,
8048 the return value is BUFFER.
8049
8050 This function sets `last-coding-system-used' to the precise coding system
8051 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8052 not fully specified. */)
8053 (string, coding_system, nocopy, buffer)
8054 Lisp_Object string, coding_system, nocopy, buffer;
8055 {
8056 return code_convert_string (string, coding_system, buffer,
8057 0, ! NILP (nocopy), 0);
8058 }
8059
8060 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8061 2, 4, 0,
8062 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8063
8064 Optional third arg NOCOPY non-nil means it is OK to return STRING
8065 itself if the encoding operation is trivial.
8066
8067 Optional fourth arg BUFFER non-nil meant that the encoded text is
8068 inserted in BUFFER instead of returned as a string. In this case,
8069 the return value is BUFFER.
8070
8071 This function sets `last-coding-system-used' to the precise coding system
8072 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8073 not fully specified.) */)
8074 (string, coding_system, nocopy, buffer)
8075 Lisp_Object string, coding_system, nocopy, buffer;
8076 {
8077 return code_convert_string (string, coding_system, buffer,
8078 1, ! NILP (nocopy), 1);
8079 }
8080
8081 \f
8082 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8083 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8084 Return the corresponding character. */)
8085 (code)
8086 Lisp_Object code;
8087 {
8088 Lisp_Object spec, attrs, val;
8089 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8090 int c;
8091
8092 CHECK_NATNUM (code);
8093 c = XFASTINT (code);
8094 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8095 attrs = AREF (spec, 0);
8096
8097 if (ASCII_BYTE_P (c)
8098 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8099 return code;
8100
8101 val = CODING_ATTR_CHARSET_LIST (attrs);
8102 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8103 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8104 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8105
8106 if (c <= 0x7F)
8107 charset = charset_roman;
8108 else if (c >= 0xA0 && c < 0xDF)
8109 {
8110 charset = charset_kana;
8111 c -= 0x80;
8112 }
8113 else
8114 {
8115 int s1 = c >> 8, s2 = c & 0xFF;
8116
8117 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8118 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8119 error ("Invalid code: %d", code);
8120 SJIS_TO_JIS (c);
8121 charset = charset_kanji;
8122 }
8123 c = DECODE_CHAR (charset, c);
8124 if (c < 0)
8125 error ("Invalid code: %d", code);
8126 return make_number (c);
8127 }
8128
8129
8130 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8131 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
8132 Return the corresponding code in SJIS. */)
8133 (ch)
8134 Lisp_Object ch;
8135 {
8136 Lisp_Object spec, attrs, charset_list;
8137 int c;
8138 struct charset *charset;
8139 unsigned code;
8140
8141 CHECK_CHARACTER (ch);
8142 c = XFASTINT (ch);
8143 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8144 attrs = AREF (spec, 0);
8145
8146 if (ASCII_CHAR_P (c)
8147 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8148 return ch;
8149
8150 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8151 charset = char_charset (c, charset_list, &code);
8152 if (code == CHARSET_INVALID_CODE (charset))
8153 error ("Can't encode by shift_jis encoding: %d", c);
8154 JIS_TO_SJIS (code);
8155
8156 return make_number (code);
8157 }
8158
8159 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8160 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8161 Return the corresponding character. */)
8162 (code)
8163 Lisp_Object code;
8164 {
8165 Lisp_Object spec, attrs, val;
8166 struct charset *charset_roman, *charset_big5, *charset;
8167 int c;
8168
8169 CHECK_NATNUM (code);
8170 c = XFASTINT (code);
8171 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8172 attrs = AREF (spec, 0);
8173
8174 if (ASCII_BYTE_P (c)
8175 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8176 return code;
8177
8178 val = CODING_ATTR_CHARSET_LIST (attrs);
8179 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8180 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8181
8182 if (c <= 0x7F)
8183 charset = charset_roman;
8184 else
8185 {
8186 int b1 = c >> 8, b2 = c & 0x7F;
8187 if (b1 < 0xA1 || b1 > 0xFE
8188 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8189 error ("Invalid code: %d", code);
8190 charset = charset_big5;
8191 }
8192 c = DECODE_CHAR (charset, (unsigned )c);
8193 if (c < 0)
8194 error ("Invalid code: %d", code);
8195 return make_number (c);
8196 }
8197
8198 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8199 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8200 Return the corresponding character code in Big5. */)
8201 (ch)
8202 Lisp_Object ch;
8203 {
8204 Lisp_Object spec, attrs, charset_list;
8205 struct charset *charset;
8206 int c;
8207 unsigned code;
8208
8209 CHECK_CHARACTER (ch);
8210 c = XFASTINT (ch);
8211 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8212 attrs = AREF (spec, 0);
8213 if (ASCII_CHAR_P (c)
8214 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8215 return ch;
8216
8217 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8218 charset = char_charset (c, charset_list, &code);
8219 if (code == CHARSET_INVALID_CODE (charset))
8220 error ("Can't encode by Big5 encoding: %d", c);
8221
8222 return make_number (code);
8223 }
8224
8225 \f
8226 DEFUN ("set-terminal-coding-system-internal",
8227 Fset_terminal_coding_system_internal,
8228 Sset_terminal_coding_system_internal, 1, 1, 0,
8229 doc: /* Internal use only. */)
8230 (coding_system)
8231 Lisp_Object coding_system;
8232 {
8233 CHECK_SYMBOL (coding_system);
8234 setup_coding_system (Fcheck_coding_system (coding_system),
8235 &terminal_coding);
8236
8237 /* We had better not send unsafe characters to terminal. */
8238 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8239 /* Characer composition should be disabled. */
8240 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8241 terminal_coding.src_multibyte = 1;
8242 terminal_coding.dst_multibyte = 0;
8243 return Qnil;
8244 }
8245
8246 DEFUN ("set-safe-terminal-coding-system-internal",
8247 Fset_safe_terminal_coding_system_internal,
8248 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8249 doc: /* Internal use only. */)
8250 (coding_system)
8251 Lisp_Object coding_system;
8252 {
8253 CHECK_SYMBOL (coding_system);
8254 setup_coding_system (Fcheck_coding_system (coding_system),
8255 &safe_terminal_coding);
8256 /* Characer composition should be disabled. */
8257 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8258 safe_terminal_coding.src_multibyte = 1;
8259 safe_terminal_coding.dst_multibyte = 0;
8260 return Qnil;
8261 }
8262
8263 DEFUN ("terminal-coding-system",
8264 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8265 doc: /* Return coding system specified for terminal output. */)
8266 ()
8267 {
8268 Lisp_Object coding_system;
8269
8270 coding_system = CODING_ID_NAME (terminal_coding.id);
8271 /* For backward compatibility, return nil if it is `undecided'. */
8272 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8273 }
8274
8275 DEFUN ("set-keyboard-coding-system-internal",
8276 Fset_keyboard_coding_system_internal,
8277 Sset_keyboard_coding_system_internal, 1, 1, 0,
8278 doc: /* Internal use only. */)
8279 (coding_system)
8280 Lisp_Object coding_system;
8281 {
8282 CHECK_SYMBOL (coding_system);
8283 setup_coding_system (Fcheck_coding_system (coding_system),
8284 &keyboard_coding);
8285 /* Characer composition should be disabled. */
8286 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8287 return Qnil;
8288 }
8289
8290 DEFUN ("keyboard-coding-system",
8291 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8292 doc: /* Return coding system specified for decoding keyboard input. */)
8293 ()
8294 {
8295 return CODING_ID_NAME (keyboard_coding.id);
8296 }
8297
8298 \f
8299 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8300 Sfind_operation_coding_system, 1, MANY, 0,
8301 doc: /* Choose a coding system for an operation based on the target name.
8302 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8303 DECODING-SYSTEM is the coding system to use for decoding
8304 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8305 for encoding (in case OPERATION does encoding).
8306
8307 The first argument OPERATION specifies an I/O primitive:
8308 For file I/O, `insert-file-contents' or `write-region'.
8309 For process I/O, `call-process', `call-process-region', or `start-process'.
8310 For network I/O, `open-network-stream'.
8311
8312 The remaining arguments should be the same arguments that were passed
8313 to the primitive. Depending on which primitive, one of those arguments
8314 is selected as the TARGET. For example, if OPERATION does file I/O,
8315 whichever argument specifies the file name is TARGET.
8316
8317 TARGET has a meaning which depends on OPERATION:
8318 For file I/O, TARGET is a file name.
8319 For process I/O, TARGET is a process name.
8320 For network I/O, TARGET is a service name or a port number
8321
8322 This function looks up what specified for TARGET in,
8323 `file-coding-system-alist', `process-coding-system-alist',
8324 or `network-coding-system-alist' depending on OPERATION.
8325 They may specify a coding system, a cons of coding systems,
8326 or a function symbol to call.
8327 In the last case, we call the function with one argument,
8328 which is a list of all the arguments given to this function.
8329
8330 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
8331 (nargs, args)
8332 int nargs;
8333 Lisp_Object *args;
8334 {
8335 Lisp_Object operation, target_idx, target, val;
8336 register Lisp_Object chain;
8337
8338 if (nargs < 2)
8339 error ("Too few arguments");
8340 operation = args[0];
8341 if (!SYMBOLP (operation)
8342 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8343 error ("Invalid first arguement");
8344 if (nargs < 1 + XINT (target_idx))
8345 error ("Too few arguments for operation: %s",
8346 SDATA (SYMBOL_NAME (operation)));
8347 target = args[XINT (target_idx) + 1];
8348 if (!(STRINGP (target)
8349 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8350 error ("Invalid %dth argument", XINT (target_idx) + 1);
8351
8352 chain = ((EQ (operation, Qinsert_file_contents)
8353 || EQ (operation, Qwrite_region))
8354 ? Vfile_coding_system_alist
8355 : (EQ (operation, Qopen_network_stream)
8356 ? Vnetwork_coding_system_alist
8357 : Vprocess_coding_system_alist));
8358 if (NILP (chain))
8359 return Qnil;
8360
8361 for (; CONSP (chain); chain = XCDR (chain))
8362 {
8363 Lisp_Object elt;
8364
8365 elt = XCAR (chain);
8366 if (CONSP (elt)
8367 && ((STRINGP (target)
8368 && STRINGP (XCAR (elt))
8369 && fast_string_match (XCAR (elt), target) >= 0)
8370 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8371 {
8372 val = XCDR (elt);
8373 /* Here, if VAL is both a valid coding system and a valid
8374 function symbol, we return VAL as a coding system. */
8375 if (CONSP (val))
8376 return val;
8377 if (! SYMBOLP (val))
8378 return Qnil;
8379 if (! NILP (Fcoding_system_p (val)))
8380 return Fcons (val, val);
8381 if (! NILP (Ffboundp (val)))
8382 {
8383 val = call1 (val, Flist (nargs, args));
8384 if (CONSP (val))
8385 return val;
8386 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8387 return Fcons (val, val);
8388 }
8389 return Qnil;
8390 }
8391 }
8392 return Qnil;
8393 }
8394
8395 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8396 Sset_coding_system_priority, 0, MANY, 0,
8397 doc: /* Assign higher priority to the coding systems given as arguments.
8398 If multiple coding systems belongs to the same category,
8399 all but the first one are ignored.
8400
8401 usage: (set-coding-system-priority ...) */)
8402 (nargs, args)
8403 int nargs;
8404 Lisp_Object *args;
8405 {
8406 int i, j;
8407 int changed[coding_category_max];
8408 enum coding_category priorities[coding_category_max];
8409
8410 bzero (changed, sizeof changed);
8411
8412 for (i = j = 0; i < nargs; i++)
8413 {
8414 enum coding_category category;
8415 Lisp_Object spec, attrs;
8416
8417 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8418 attrs = AREF (spec, 0);
8419 category = XINT (CODING_ATTR_CATEGORY (attrs));
8420 if (changed[category])
8421 /* Ignore this coding system because a coding system of the
8422 same category already had a higher priority. */
8423 continue;
8424 changed[category] = 1;
8425 priorities[j++] = category;
8426 if (coding_categories[category].id >= 0
8427 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8428 setup_coding_system (args[i], &coding_categories[category]);
8429 Fset (AREF (Vcoding_category_table, category), args[i]);
8430 }
8431
8432 /* Now we have decided top J priorities. Reflect the order of the
8433 original priorities to the remaining priorities. */
8434
8435 for (i = j, j = 0; i < coding_category_max; i++, j++)
8436 {
8437 while (j < coding_category_max
8438 && changed[coding_priorities[j]])
8439 j++;
8440 if (j == coding_category_max)
8441 abort ();
8442 priorities[i] = coding_priorities[j];
8443 }
8444
8445 bcopy (priorities, coding_priorities, sizeof priorities);
8446
8447 /* Update `coding-category-list'. */
8448 Vcoding_category_list = Qnil;
8449 for (i = coding_category_max - 1; i >= 0; i--)
8450 Vcoding_category_list
8451 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8452 Vcoding_category_list);
8453
8454 return Qnil;
8455 }
8456
8457 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8458 Scoding_system_priority_list, 0, 1, 0,
8459 doc: /* Return a list of coding systems ordered by their priorities.
8460 HIGHESTP non-nil means just return the highest priority one. */)
8461 (highestp)
8462 Lisp_Object highestp;
8463 {
8464 int i;
8465 Lisp_Object val;
8466
8467 for (i = 0, val = Qnil; i < coding_category_max; i++)
8468 {
8469 enum coding_category category = coding_priorities[i];
8470 int id = coding_categories[category].id;
8471 Lisp_Object attrs;
8472
8473 if (id < 0)
8474 continue;
8475 attrs = CODING_ID_ATTRS (id);
8476 if (! NILP (highestp))
8477 return CODING_ATTR_BASE_NAME (attrs);
8478 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8479 }
8480 return Fnreverse (val);
8481 }
8482
8483 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8484
8485 static Lisp_Object
8486 make_subsidiaries (base)
8487 Lisp_Object base;
8488 {
8489 Lisp_Object subsidiaries;
8490 int base_name_len = SBYTES (SYMBOL_NAME (base));
8491 char *buf = (char *) alloca (base_name_len + 6);
8492 int i;
8493
8494 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8495 subsidiaries = Fmake_vector (make_number (3), Qnil);
8496 for (i = 0; i < 3; i++)
8497 {
8498 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8499 ASET (subsidiaries, i, intern (buf));
8500 }
8501 return subsidiaries;
8502 }
8503
8504
8505 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8506 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8507 doc: /* For internal use only.
8508 usage: (define-coding-system-internal ...) */)
8509 (nargs, args)
8510 int nargs;
8511 Lisp_Object *args;
8512 {
8513 Lisp_Object name;
8514 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8515 Lisp_Object attrs; /* Vector of attributes. */
8516 Lisp_Object eol_type;
8517 Lisp_Object aliases;
8518 Lisp_Object coding_type, charset_list, safe_charsets;
8519 enum coding_category category;
8520 Lisp_Object tail, val;
8521 int max_charset_id = 0;
8522 int i;
8523
8524 if (nargs < coding_arg_max)
8525 goto short_args;
8526
8527 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8528
8529 name = args[coding_arg_name];
8530 CHECK_SYMBOL (name);
8531 CODING_ATTR_BASE_NAME (attrs) = name;
8532
8533 val = args[coding_arg_mnemonic];
8534 if (! STRINGP (val))
8535 CHECK_CHARACTER (val);
8536 CODING_ATTR_MNEMONIC (attrs) = val;
8537
8538 coding_type = args[coding_arg_coding_type];
8539 CHECK_SYMBOL (coding_type);
8540 CODING_ATTR_TYPE (attrs) = coding_type;
8541
8542 charset_list = args[coding_arg_charset_list];
8543 if (SYMBOLP (charset_list))
8544 {
8545 if (EQ (charset_list, Qiso_2022))
8546 {
8547 if (! EQ (coding_type, Qiso_2022))
8548 error ("Invalid charset-list");
8549 charset_list = Viso_2022_charset_list;
8550 }
8551 else if (EQ (charset_list, Qemacs_mule))
8552 {
8553 if (! EQ (coding_type, Qemacs_mule))
8554 error ("Invalid charset-list");
8555 charset_list = Vemacs_mule_charset_list;
8556 }
8557 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8558 if (max_charset_id < XFASTINT (XCAR (tail)))
8559 max_charset_id = XFASTINT (XCAR (tail));
8560 }
8561 else
8562 {
8563 charset_list = Fcopy_sequence (charset_list);
8564 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8565 {
8566 struct charset *charset;
8567
8568 val = Fcar (tail);
8569 CHECK_CHARSET_GET_CHARSET (val, charset);
8570 if (EQ (coding_type, Qiso_2022)
8571 ? CHARSET_ISO_FINAL (charset) < 0
8572 : EQ (coding_type, Qemacs_mule)
8573 ? CHARSET_EMACS_MULE_ID (charset) < 0
8574 : 0)
8575 error ("Can't handle charset `%s'",
8576 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8577
8578 XSETCAR (tail, make_number (charset->id));
8579 if (max_charset_id < charset->id)
8580 max_charset_id = charset->id;
8581 }
8582 }
8583 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8584
8585 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8586 make_number (255));
8587 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8588 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8589 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8590
8591 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8592
8593 val = args[coding_arg_decode_translation_table];
8594 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8595 CHECK_SYMBOL (val);
8596 CODING_ATTR_DECODE_TBL (attrs) = val;
8597
8598 val = args[coding_arg_encode_translation_table];
8599 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8600 CHECK_SYMBOL (val);
8601 CODING_ATTR_ENCODE_TBL (attrs) = val;
8602
8603 val = args[coding_arg_post_read_conversion];
8604 CHECK_SYMBOL (val);
8605 CODING_ATTR_POST_READ (attrs) = val;
8606
8607 val = args[coding_arg_pre_write_conversion];
8608 CHECK_SYMBOL (val);
8609 CODING_ATTR_PRE_WRITE (attrs) = val;
8610
8611 val = args[coding_arg_default_char];
8612 if (NILP (val))
8613 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8614 else
8615 {
8616 CHECK_CHARACTER (val);
8617 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8618 }
8619
8620 val = args[coding_arg_for_unibyte];
8621 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8622
8623 val = args[coding_arg_plist];
8624 CHECK_LIST (val);
8625 CODING_ATTR_PLIST (attrs) = val;
8626
8627 if (EQ (coding_type, Qcharset))
8628 {
8629 /* Generate a lisp vector of 256 elements. Each element is nil,
8630 integer, or a list of charset IDs.
8631
8632 If Nth element is nil, the byte code N is invalid in this
8633 coding system.
8634
8635 If Nth element is a number NUM, N is the first byte of a
8636 charset whose ID is NUM.
8637
8638 If Nth element is a list of charset IDs, N is the first byte
8639 of one of them. The list is sorted by dimensions of the
8640 charsets. A charset of smaller dimension comes firtst. */
8641 val = Fmake_vector (make_number (256), Qnil);
8642
8643 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8644 {
8645 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8646 int dim = CHARSET_DIMENSION (charset);
8647 int idx = (dim - 1) * 4;
8648
8649 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8650 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8651
8652 for (i = charset->code_space[idx];
8653 i <= charset->code_space[idx + 1]; i++)
8654 {
8655 Lisp_Object tmp, tmp2;
8656 int dim2;
8657
8658 tmp = AREF (val, i);
8659 if (NILP (tmp))
8660 tmp = XCAR (tail);
8661 else if (NUMBERP (tmp))
8662 {
8663 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8664 if (dim < dim2)
8665 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8666 else
8667 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8668 }
8669 else
8670 {
8671 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8672 {
8673 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8674 if (dim < dim2)
8675 break;
8676 }
8677 if (NILP (tmp2))
8678 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8679 else
8680 {
8681 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8682 XSETCAR (tmp2, XCAR (tail));
8683 }
8684 }
8685 ASET (val, i, tmp);
8686 }
8687 }
8688 ASET (attrs, coding_attr_charset_valids, val);
8689 category = coding_category_charset;
8690 }
8691 else if (EQ (coding_type, Qccl))
8692 {
8693 Lisp_Object valids;
8694
8695 if (nargs < coding_arg_ccl_max)
8696 goto short_args;
8697
8698 val = args[coding_arg_ccl_decoder];
8699 CHECK_CCL_PROGRAM (val);
8700 if (VECTORP (val))
8701 val = Fcopy_sequence (val);
8702 ASET (attrs, coding_attr_ccl_decoder, val);
8703
8704 val = args[coding_arg_ccl_encoder];
8705 CHECK_CCL_PROGRAM (val);
8706 if (VECTORP (val))
8707 val = Fcopy_sequence (val);
8708 ASET (attrs, coding_attr_ccl_encoder, val);
8709
8710 val = args[coding_arg_ccl_valids];
8711 valids = Fmake_string (make_number (256), make_number (0));
8712 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8713 {
8714 int from, to;
8715
8716 val = Fcar (tail);
8717 if (INTEGERP (val))
8718 {
8719 from = to = XINT (val);
8720 if (from < 0 || from > 255)
8721 args_out_of_range_3 (val, make_number (0), make_number (255));
8722 }
8723 else
8724 {
8725 CHECK_CONS (val);
8726 CHECK_NATNUM_CAR (val);
8727 CHECK_NATNUM_CDR (val);
8728 from = XINT (XCAR (val));
8729 if (from > 255)
8730 args_out_of_range_3 (XCAR (val),
8731 make_number (0), make_number (255));
8732 to = XINT (XCDR (val));
8733 if (to < from || to > 255)
8734 args_out_of_range_3 (XCDR (val),
8735 XCAR (val), make_number (255));
8736 }
8737 for (i = from; i <= to; i++)
8738 SSET (valids, i, 1);
8739 }
8740 ASET (attrs, coding_attr_ccl_valids, valids);
8741
8742 category = coding_category_ccl;
8743 }
8744 else if (EQ (coding_type, Qutf_16))
8745 {
8746 Lisp_Object bom, endian;
8747
8748 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8749
8750 if (nargs < coding_arg_utf16_max)
8751 goto short_args;
8752
8753 bom = args[coding_arg_utf16_bom];
8754 if (! NILP (bom) && ! EQ (bom, Qt))
8755 {
8756 CHECK_CONS (bom);
8757 val = XCAR (bom);
8758 CHECK_CODING_SYSTEM (val);
8759 val = XCDR (bom);
8760 CHECK_CODING_SYSTEM (val);
8761 }
8762 ASET (attrs, coding_attr_utf_16_bom, bom);
8763
8764 endian = args[coding_arg_utf16_endian];
8765 CHECK_SYMBOL (endian);
8766 if (NILP (endian))
8767 endian = Qbig;
8768 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8769 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8770 ASET (attrs, coding_attr_utf_16_endian, endian);
8771
8772 category = (CONSP (bom)
8773 ? coding_category_utf_16_auto
8774 : NILP (bom)
8775 ? (EQ (endian, Qbig)
8776 ? coding_category_utf_16_be_nosig
8777 : coding_category_utf_16_le_nosig)
8778 : (EQ (endian, Qbig)
8779 ? coding_category_utf_16_be
8780 : coding_category_utf_16_le));
8781 }
8782 else if (EQ (coding_type, Qiso_2022))
8783 {
8784 Lisp_Object initial, reg_usage, request, flags;
8785 int i;
8786
8787 if (nargs < coding_arg_iso2022_max)
8788 goto short_args;
8789
8790 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8791 CHECK_VECTOR (initial);
8792 for (i = 0; i < 4; i++)
8793 {
8794 val = Faref (initial, make_number (i));
8795 if (! NILP (val))
8796 {
8797 struct charset *charset;
8798
8799 CHECK_CHARSET_GET_CHARSET (val, charset);
8800 ASET (initial, i, make_number (CHARSET_ID (charset)));
8801 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8802 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8803 }
8804 else
8805 ASET (initial, i, make_number (-1));
8806 }
8807
8808 reg_usage = args[coding_arg_iso2022_reg_usage];
8809 CHECK_CONS (reg_usage);
8810 CHECK_NUMBER_CAR (reg_usage);
8811 CHECK_NUMBER_CDR (reg_usage);
8812
8813 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8814 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8815 {
8816 int id;
8817 Lisp_Object tmp;
8818
8819 val = Fcar (tail);
8820 CHECK_CONS (val);
8821 tmp = XCAR (val);
8822 CHECK_CHARSET_GET_ID (tmp, id);
8823 CHECK_NATNUM_CDR (val);
8824 if (XINT (XCDR (val)) >= 4)
8825 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8826 XSETCAR (val, make_number (id));
8827 }
8828
8829 flags = args[coding_arg_iso2022_flags];
8830 CHECK_NATNUM (flags);
8831 i = XINT (flags);
8832 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8833 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8834
8835 ASET (attrs, coding_attr_iso_initial, initial);
8836 ASET (attrs, coding_attr_iso_usage, reg_usage);
8837 ASET (attrs, coding_attr_iso_request, request);
8838 ASET (attrs, coding_attr_iso_flags, flags);
8839 setup_iso_safe_charsets (attrs);
8840
8841 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8842 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8843 | CODING_ISO_FLAG_SINGLE_SHIFT))
8844 ? coding_category_iso_7_else
8845 : EQ (args[coding_arg_charset_list], Qiso_2022)
8846 ? coding_category_iso_7
8847 : coding_category_iso_7_tight);
8848 else
8849 {
8850 int id = XINT (AREF (initial, 1));
8851
8852 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8853 || EQ (args[coding_arg_charset_list], Qiso_2022)
8854 || id < 0)
8855 ? coding_category_iso_8_else
8856 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8857 ? coding_category_iso_8_1
8858 : coding_category_iso_8_2);
8859 }
8860 if (category != coding_category_iso_8_1
8861 && category != coding_category_iso_8_2)
8862 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8863 }
8864 else if (EQ (coding_type, Qemacs_mule))
8865 {
8866 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8867 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8868 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8869 category = coding_category_emacs_mule;
8870 }
8871 else if (EQ (coding_type, Qshift_jis))
8872 {
8873
8874 struct charset *charset;
8875
8876 if (XINT (Flength (charset_list)) != 3
8877 && XINT (Flength (charset_list)) != 4)
8878 error ("There should be three or four charsets");
8879
8880 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8881 if (CHARSET_DIMENSION (charset) != 1)
8882 error ("Dimension of charset %s is not one",
8883 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8884 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8885 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8886
8887 charset_list = XCDR (charset_list);
8888 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8889 if (CHARSET_DIMENSION (charset) != 1)
8890 error ("Dimension of charset %s is not one",
8891 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8892
8893 charset_list = XCDR (charset_list);
8894 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8895 if (CHARSET_DIMENSION (charset) != 2)
8896 error ("Dimension of charset %s is not two",
8897 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8898
8899 charset_list = XCDR (charset_list);
8900 if (! NILP (charset_list))
8901 {
8902 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8903 if (CHARSET_DIMENSION (charset) != 2)
8904 error ("Dimension of charset %s is not two",
8905 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8906 }
8907
8908 category = coding_category_sjis;
8909 Vsjis_coding_system = name;
8910 }
8911 else if (EQ (coding_type, Qbig5))
8912 {
8913 struct charset *charset;
8914
8915 if (XINT (Flength (charset_list)) != 2)
8916 error ("There should be just two charsets");
8917
8918 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8919 if (CHARSET_DIMENSION (charset) != 1)
8920 error ("Dimension of charset %s is not one",
8921 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8922 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8923 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8924
8925 charset_list = XCDR (charset_list);
8926 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8927 if (CHARSET_DIMENSION (charset) != 2)
8928 error ("Dimension of charset %s is not two",
8929 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8930
8931 category = coding_category_big5;
8932 Vbig5_coding_system = name;
8933 }
8934 else if (EQ (coding_type, Qraw_text))
8935 {
8936 category = coding_category_raw_text;
8937 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8938 }
8939 else if (EQ (coding_type, Qutf_8))
8940 {
8941 category = coding_category_utf_8;
8942 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8943 }
8944 else if (EQ (coding_type, Qundecided))
8945 category = coding_category_undecided;
8946 else
8947 error ("Invalid coding system type: %s",
8948 SDATA (SYMBOL_NAME (coding_type)));
8949
8950 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8951 CODING_ATTR_PLIST (attrs)
8952 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8953 CODING_ATTR_PLIST (attrs)));
8954 CODING_ATTR_PLIST (attrs)
8955 = Fcons (QCascii_compatible_p,
8956 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
8957 CODING_ATTR_PLIST (attrs)));
8958
8959 eol_type = args[coding_arg_eol_type];
8960 if (! NILP (eol_type)
8961 && ! EQ (eol_type, Qunix)
8962 && ! EQ (eol_type, Qdos)
8963 && ! EQ (eol_type, Qmac))
8964 error ("Invalid eol-type");
8965
8966 aliases = Fcons (name, Qnil);
8967
8968 if (NILP (eol_type))
8969 {
8970 eol_type = make_subsidiaries (name);
8971 for (i = 0; i < 3; i++)
8972 {
8973 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8974
8975 this_name = AREF (eol_type, i);
8976 this_aliases = Fcons (this_name, Qnil);
8977 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8978 this_spec = Fmake_vector (make_number (3), attrs);
8979 ASET (this_spec, 1, this_aliases);
8980 ASET (this_spec, 2, this_eol_type);
8981 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8982 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8983 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
8984 if (NILP (val))
8985 Vcoding_system_alist
8986 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8987 Vcoding_system_alist);
8988 }
8989 }
8990
8991 spec_vec = Fmake_vector (make_number (3), attrs);
8992 ASET (spec_vec, 1, aliases);
8993 ASET (spec_vec, 2, eol_type);
8994
8995 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8996 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8997 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
8998 if (NILP (val))
8999 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9000 Vcoding_system_alist);
9001
9002 {
9003 int id = coding_categories[category].id;
9004
9005 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9006 setup_coding_system (name, &coding_categories[category]);
9007 }
9008
9009 return Qnil;
9010
9011 short_args:
9012 return Fsignal (Qwrong_number_of_arguments,
9013 Fcons (intern ("define-coding-system-internal"),
9014 make_number (nargs)));
9015 }
9016
9017
9018 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9019 3, 3, 0,
9020 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9021 (coding_system, prop, val)
9022 Lisp_Object coding_system, prop, val;
9023 {
9024 Lisp_Object spec, attrs;
9025
9026 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9027 attrs = AREF (spec, 0);
9028 if (EQ (prop, QCmnemonic))
9029 {
9030 if (! STRINGP (val))
9031 CHECK_CHARACTER (val);
9032 CODING_ATTR_MNEMONIC (attrs) = val;
9033 }
9034 else if (EQ (prop, QCdefalut_char))
9035 {
9036 if (NILP (val))
9037 val = make_number (' ');
9038 else
9039 CHECK_CHARACTER (val);
9040 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9041 }
9042 else if (EQ (prop, QCdecode_translation_table))
9043 {
9044 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9045 CHECK_SYMBOL (val);
9046 CODING_ATTR_DECODE_TBL (attrs) = val;
9047 }
9048 else if (EQ (prop, QCencode_translation_table))
9049 {
9050 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9051 CHECK_SYMBOL (val);
9052 CODING_ATTR_ENCODE_TBL (attrs) = val;
9053 }
9054 else if (EQ (prop, QCpost_read_conversion))
9055 {
9056 CHECK_SYMBOL (val);
9057 CODING_ATTR_POST_READ (attrs) = val;
9058 }
9059 else if (EQ (prop, QCpre_write_conversion))
9060 {
9061 CHECK_SYMBOL (val);
9062 CODING_ATTR_PRE_WRITE (attrs) = val;
9063 }
9064 else if (EQ (prop, QCascii_compatible_p))
9065 {
9066 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9067 }
9068
9069 CODING_ATTR_PLIST (attrs)
9070 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9071 return val;
9072 }
9073
9074
9075 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9076 Sdefine_coding_system_alias, 2, 2, 0,
9077 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9078 (alias, coding_system)
9079 Lisp_Object alias, coding_system;
9080 {
9081 Lisp_Object spec, aliases, eol_type, val;
9082
9083 CHECK_SYMBOL (alias);
9084 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9085 aliases = AREF (spec, 1);
9086 /* ALISES should be a list of length more than zero, and the first
9087 element is a base coding system. Append ALIAS at the tail of the
9088 list. */
9089 while (!NILP (XCDR (aliases)))
9090 aliases = XCDR (aliases);
9091 XSETCDR (aliases, Fcons (alias, Qnil));
9092
9093 eol_type = AREF (spec, 2);
9094 if (VECTORP (eol_type))
9095 {
9096 Lisp_Object subsidiaries;
9097 int i;
9098
9099 subsidiaries = make_subsidiaries (alias);
9100 for (i = 0; i < 3; i++)
9101 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9102 AREF (eol_type, i));
9103 }
9104
9105 Fputhash (alias, spec, Vcoding_system_hash_table);
9106 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9107 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9108 if (NILP (val))
9109 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9110 Vcoding_system_alist);
9111
9112 return Qnil;
9113 }
9114
9115 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9116 1, 1, 0,
9117 doc: /* Return the base of CODING-SYSTEM.
9118 Any alias or subsidiary coding system is not a base coding system. */)
9119 (coding_system)
9120 Lisp_Object coding_system;
9121 {
9122 Lisp_Object spec, attrs;
9123
9124 if (NILP (coding_system))
9125 return (Qno_conversion);
9126 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9127 attrs = AREF (spec, 0);
9128 return CODING_ATTR_BASE_NAME (attrs);
9129 }
9130
9131 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9132 1, 1, 0,
9133 doc: "Return the property list of CODING-SYSTEM.")
9134 (coding_system)
9135 Lisp_Object coding_system;
9136 {
9137 Lisp_Object spec, attrs;
9138
9139 if (NILP (coding_system))
9140 coding_system = Qno_conversion;
9141 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9142 attrs = AREF (spec, 0);
9143 return CODING_ATTR_PLIST (attrs);
9144 }
9145
9146
9147 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9148 1, 1, 0,
9149 doc: /* Return the list of aliases of CODING-SYSTEM. */)
9150 (coding_system)
9151 Lisp_Object coding_system;
9152 {
9153 Lisp_Object spec;
9154
9155 if (NILP (coding_system))
9156 coding_system = Qno_conversion;
9157 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9158 return AREF (spec, 1);
9159 }
9160
9161 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9162 Scoding_system_eol_type, 1, 1, 0,
9163 doc: /* Return eol-type of CODING-SYSTEM.
9164 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9165
9166 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9167 and CR respectively.
9168
9169 A vector value indicates that a format of end-of-line should be
9170 detected automatically. Nth element of the vector is the subsidiary
9171 coding system whose eol-type is N. */)
9172 (coding_system)
9173 Lisp_Object coding_system;
9174 {
9175 Lisp_Object spec, eol_type;
9176 int n;
9177
9178 if (NILP (coding_system))
9179 coding_system = Qno_conversion;
9180 if (! CODING_SYSTEM_P (coding_system))
9181 return Qnil;
9182 spec = CODING_SYSTEM_SPEC (coding_system);
9183 eol_type = AREF (spec, 2);
9184 if (VECTORP (eol_type))
9185 return Fcopy_sequence (eol_type);
9186 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9187 return make_number (n);
9188 }
9189
9190 #endif /* emacs */
9191
9192 \f
9193 /*** 9. Post-amble ***/
9194
9195 void
9196 init_coding_once ()
9197 {
9198 int i;
9199
9200 for (i = 0; i < coding_category_max; i++)
9201 {
9202 coding_categories[i].id = -1;
9203 coding_priorities[i] = i;
9204 }
9205
9206 /* ISO2022 specific initialize routine. */
9207 for (i = 0; i < 0x20; i++)
9208 iso_code_class[i] = ISO_control_0;
9209 for (i = 0x21; i < 0x7F; i++)
9210 iso_code_class[i] = ISO_graphic_plane_0;
9211 for (i = 0x80; i < 0xA0; i++)
9212 iso_code_class[i] = ISO_control_1;
9213 for (i = 0xA1; i < 0xFF; i++)
9214 iso_code_class[i] = ISO_graphic_plane_1;
9215 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9216 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9217 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9218 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9219 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9220 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9221 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9222 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9223 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9224
9225 for (i = 0; i < 256; i++)
9226 {
9227 emacs_mule_bytes[i] = 1;
9228 }
9229 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9230 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9231 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9232 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9233 }
9234
9235 #ifdef emacs
9236
9237 void
9238 syms_of_coding ()
9239 {
9240 staticpro (&Vcoding_system_hash_table);
9241 {
9242 Lisp_Object args[2];
9243 args[0] = QCtest;
9244 args[1] = Qeq;
9245 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9246 }
9247
9248 staticpro (&Vsjis_coding_system);
9249 Vsjis_coding_system = Qnil;
9250
9251 staticpro (&Vbig5_coding_system);
9252 Vbig5_coding_system = Qnil;
9253
9254 staticpro (&Vcode_conversion_reused_workbuf);
9255 Vcode_conversion_reused_workbuf = Qnil;
9256
9257 staticpro (&Vcode_conversion_workbuf_name);
9258 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9259
9260 reused_workbuf_in_use = 0;
9261
9262 DEFSYM (Qcharset, "charset");
9263 DEFSYM (Qtarget_idx, "target-idx");
9264 DEFSYM (Qcoding_system_history, "coding-system-history");
9265 Fset (Qcoding_system_history, Qnil);
9266
9267 /* Target FILENAME is the first argument. */
9268 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9269 /* Target FILENAME is the third argument. */
9270 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9271
9272 DEFSYM (Qcall_process, "call-process");
9273 /* Target PROGRAM is the first argument. */
9274 Fput (Qcall_process, Qtarget_idx, make_number (0));
9275
9276 DEFSYM (Qcall_process_region, "call-process-region");
9277 /* Target PROGRAM is the third argument. */
9278 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9279
9280 DEFSYM (Qstart_process, "start-process");
9281 /* Target PROGRAM is the third argument. */
9282 Fput (Qstart_process, Qtarget_idx, make_number (2));
9283
9284 DEFSYM (Qopen_network_stream, "open-network-stream");
9285 /* Target SERVICE is the fourth argument. */
9286 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9287
9288 DEFSYM (Qcoding_system, "coding-system");
9289 DEFSYM (Qcoding_aliases, "coding-aliases");
9290
9291 DEFSYM (Qeol_type, "eol-type");
9292 DEFSYM (Qunix, "unix");
9293 DEFSYM (Qdos, "dos");
9294
9295 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9296 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9297 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9298 DEFSYM (Qdefault_char, "default-char");
9299 DEFSYM (Qundecided, "undecided");
9300 DEFSYM (Qno_conversion, "no-conversion");
9301 DEFSYM (Qraw_text, "raw-text");
9302
9303 DEFSYM (Qiso_2022, "iso-2022");
9304
9305 DEFSYM (Qutf_8, "utf-8");
9306 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9307
9308 DEFSYM (Qutf_16, "utf-16");
9309 DEFSYM (Qbig, "big");
9310 DEFSYM (Qlittle, "little");
9311
9312 DEFSYM (Qshift_jis, "shift-jis");
9313 DEFSYM (Qbig5, "big5");
9314
9315 DEFSYM (Qcoding_system_p, "coding-system-p");
9316
9317 DEFSYM (Qcoding_system_error, "coding-system-error");
9318 Fput (Qcoding_system_error, Qerror_conditions,
9319 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9320 Fput (Qcoding_system_error, Qerror_message,
9321 build_string ("Invalid coding system"));
9322
9323 /* Intern this now in case it isn't already done.
9324 Setting this variable twice is harmless.
9325 But don't staticpro it here--that is done in alloc.c. */
9326 Qchar_table_extra_slots = intern ("char-table-extra-slots");
9327
9328 DEFSYM (Qtranslation_table, "translation-table");
9329 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9330 DEFSYM (Qtranslation_table_id, "translation-table-id");
9331 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9332 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9333
9334 DEFSYM (Qvalid_codes, "valid-codes");
9335
9336 DEFSYM (Qemacs_mule, "emacs-mule");
9337
9338 DEFSYM (QCcategory, ":category");
9339 DEFSYM (QCmnemonic, ":mnemonic");
9340 DEFSYM (QCdefalut_char, ":default-char");
9341 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9342 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9343 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9344 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9345 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9346
9347 Vcoding_category_table
9348 = Fmake_vector (make_number (coding_category_max), Qnil);
9349 staticpro (&Vcoding_category_table);
9350 /* Followings are target of code detection. */
9351 ASET (Vcoding_category_table, coding_category_iso_7,
9352 intern ("coding-category-iso-7"));
9353 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9354 intern ("coding-category-iso-7-tight"));
9355 ASET (Vcoding_category_table, coding_category_iso_8_1,
9356 intern ("coding-category-iso-8-1"));
9357 ASET (Vcoding_category_table, coding_category_iso_8_2,
9358 intern ("coding-category-iso-8-2"));
9359 ASET (Vcoding_category_table, coding_category_iso_7_else,
9360 intern ("coding-category-iso-7-else"));
9361 ASET (Vcoding_category_table, coding_category_iso_8_else,
9362 intern ("coding-category-iso-8-else"));
9363 ASET (Vcoding_category_table, coding_category_utf_8,
9364 intern ("coding-category-utf-8"));
9365 ASET (Vcoding_category_table, coding_category_utf_16_be,
9366 intern ("coding-category-utf-16-be"));
9367 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9368 intern ("coding-category-utf-16-auto"));
9369 ASET (Vcoding_category_table, coding_category_utf_16_le,
9370 intern ("coding-category-utf-16-le"));
9371 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9372 intern ("coding-category-utf-16-be-nosig"));
9373 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9374 intern ("coding-category-utf-16-le-nosig"));
9375 ASET (Vcoding_category_table, coding_category_charset,
9376 intern ("coding-category-charset"));
9377 ASET (Vcoding_category_table, coding_category_sjis,
9378 intern ("coding-category-sjis"));
9379 ASET (Vcoding_category_table, coding_category_big5,
9380 intern ("coding-category-big5"));
9381 ASET (Vcoding_category_table, coding_category_ccl,
9382 intern ("coding-category-ccl"));
9383 ASET (Vcoding_category_table, coding_category_emacs_mule,
9384 intern ("coding-category-emacs-mule"));
9385 /* Followings are NOT target of code detection. */
9386 ASET (Vcoding_category_table, coding_category_raw_text,
9387 intern ("coding-category-raw-text"));
9388 ASET (Vcoding_category_table, coding_category_undecided,
9389 intern ("coding-category-undecided"));
9390
9391 DEFSYM (Qinsufficient_source, "insufficient-source");
9392 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9393 DEFSYM (Qinvalid_source, "invalid-source");
9394 DEFSYM (Qinterrupted, "interrupted");
9395 DEFSYM (Qinsufficient_memory, "insufficient-memory");
9396 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9397
9398 defsubr (&Scoding_system_p);
9399 defsubr (&Sread_coding_system);
9400 defsubr (&Sread_non_nil_coding_system);
9401 defsubr (&Scheck_coding_system);
9402 defsubr (&Sdetect_coding_region);
9403 defsubr (&Sdetect_coding_string);
9404 defsubr (&Sfind_coding_systems_region_internal);
9405 defsubr (&Sunencodable_char_position);
9406 defsubr (&Scheck_coding_systems_region);
9407 defsubr (&Sdecode_coding_region);
9408 defsubr (&Sencode_coding_region);
9409 defsubr (&Sdecode_coding_string);
9410 defsubr (&Sencode_coding_string);
9411 defsubr (&Sdecode_sjis_char);
9412 defsubr (&Sencode_sjis_char);
9413 defsubr (&Sdecode_big5_char);
9414 defsubr (&Sencode_big5_char);
9415 defsubr (&Sset_terminal_coding_system_internal);
9416 defsubr (&Sset_safe_terminal_coding_system_internal);
9417 defsubr (&Sterminal_coding_system);
9418 defsubr (&Sset_keyboard_coding_system_internal);
9419 defsubr (&Skeyboard_coding_system);
9420 defsubr (&Sfind_operation_coding_system);
9421 defsubr (&Sset_coding_system_priority);
9422 defsubr (&Sdefine_coding_system_internal);
9423 defsubr (&Sdefine_coding_system_alias);
9424 defsubr (&Scoding_system_put);
9425 defsubr (&Scoding_system_base);
9426 defsubr (&Scoding_system_plist);
9427 defsubr (&Scoding_system_aliases);
9428 defsubr (&Scoding_system_eol_type);
9429 defsubr (&Scoding_system_priority_list);
9430
9431 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9432 doc: /* List of coding systems.
9433
9434 Do not alter the value of this variable manually. This variable should be
9435 updated by the functions `define-coding-system' and
9436 `define-coding-system-alias'. */);
9437 Vcoding_system_list = Qnil;
9438
9439 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9440 doc: /* Alist of coding system names.
9441 Each element is one element list of coding system name.
9442 This variable is given to `completing-read' as TABLE argument.
9443
9444 Do not alter the value of this variable manually. This variable should be
9445 updated by the functions `make-coding-system' and
9446 `define-coding-system-alias'. */);
9447 Vcoding_system_alist = Qnil;
9448
9449 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9450 doc: /* List of coding-categories (symbols) ordered by priority.
9451
9452 On detecting a coding system, Emacs tries code detection algorithms
9453 associated with each coding-category one by one in this order. When
9454 one algorithm agrees with a byte sequence of source text, the coding
9455 system bound to the corresponding coding-category is selected.
9456
9457 Don't modify this variable directly, but use `set-coding-priority'. */);
9458 {
9459 int i;
9460
9461 Vcoding_category_list = Qnil;
9462 for (i = coding_category_max - 1; i >= 0; i--)
9463 Vcoding_category_list
9464 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9465 Vcoding_category_list);
9466 }
9467
9468 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9469 doc: /* Specify the coding system for read operations.
9470 It is useful to bind this variable with `let', but do not set it globally.
9471 If the value is a coding system, it is used for decoding on read operation.
9472 If not, an appropriate element is used from one of the coding system alists:
9473 There are three such tables, `file-coding-system-alist',
9474 `process-coding-system-alist', and `network-coding-system-alist'. */);
9475 Vcoding_system_for_read = Qnil;
9476
9477 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9478 doc: /* Specify the coding system for write operations.
9479 Programs bind this variable with `let', but you should not set it globally.
9480 If the value is a coding system, it is used for encoding of output,
9481 when writing it to a file and when sending it to a file or subprocess.
9482
9483 If this does not specify a coding system, an appropriate element
9484 is used from one of the coding system alists:
9485 There are three such tables, `file-coding-system-alist',
9486 `process-coding-system-alist', and `network-coding-system-alist'.
9487 For output to files, if the above procedure does not specify a coding system,
9488 the value of `buffer-file-coding-system' is used. */);
9489 Vcoding_system_for_write = Qnil;
9490
9491 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9492 doc: /*
9493 Coding system used in the latest file or process I/O. */);
9494 Vlast_coding_system_used = Qnil;
9495
9496 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9497 doc: /*
9498 Error status of the last code conversion.
9499
9500 When an error was detected in the last code conversion, this variable
9501 is set to one of the following symbols.
9502 `insufficient-source'
9503 `inconsistent-eol'
9504 `invalid-source'
9505 `interrupted'
9506 `insufficient-memory'
9507 When no error was detected, the value doesn't change. So, to check
9508 the error status of a code conversion by this variable, you must
9509 explicitly set this variable to nil before performing code
9510 conversion. */);
9511 Vlast_code_conversion_error = Qnil;
9512
9513 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9514 doc: /*
9515 *Non-nil means always inhibit code conversion of end-of-line format.
9516 See info node `Coding Systems' and info node `Text and Binary' concerning
9517 such conversion. */);
9518 inhibit_eol_conversion = 0;
9519
9520 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9521 doc: /*
9522 Non-nil means process buffer inherits coding system of process output.
9523 Bind it to t if the process output is to be treated as if it were a file
9524 read from some filesystem. */);
9525 inherit_process_coding_system = 0;
9526
9527 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9528 doc: /*
9529 Alist to decide a coding system to use for a file I/O operation.
9530 The format is ((PATTERN . VAL) ...),
9531 where PATTERN is a regular expression matching a file name,
9532 VAL is a coding system, a cons of coding systems, or a function symbol.
9533 If VAL is a coding system, it is used for both decoding and encoding
9534 the file contents.
9535 If VAL is a cons of coding systems, the car part is used for decoding,
9536 and the cdr part is used for encoding.
9537 If VAL is a function symbol, the function must return a coding system
9538 or a cons of coding systems which are used as above. The function gets
9539 the arguments with which `find-operation-coding-systems' was called.
9540
9541 See also the function `find-operation-coding-system'
9542 and the variable `auto-coding-alist'. */);
9543 Vfile_coding_system_alist = Qnil;
9544
9545 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9546 doc: /*
9547 Alist to decide a coding system to use for a process I/O operation.
9548 The format is ((PATTERN . VAL) ...),
9549 where PATTERN is a regular expression matching a program name,
9550 VAL is a coding system, a cons of coding systems, or a function symbol.
9551 If VAL is a coding system, it is used for both decoding what received
9552 from the program and encoding what sent to the program.
9553 If VAL is a cons of coding systems, the car part is used for decoding,
9554 and the cdr part is used for encoding.
9555 If VAL is a function symbol, the function must return a coding system
9556 or a cons of coding systems which are used as above.
9557
9558 See also the function `find-operation-coding-system'. */);
9559 Vprocess_coding_system_alist = Qnil;
9560
9561 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9562 doc: /*
9563 Alist to decide a coding system to use for a network I/O operation.
9564 The format is ((PATTERN . VAL) ...),
9565 where PATTERN is a regular expression matching a network service name
9566 or is a port number to connect to,
9567 VAL is a coding system, a cons of coding systems, or a function symbol.
9568 If VAL is a coding system, it is used for both decoding what received
9569 from the network stream and encoding what sent to the network stream.
9570 If VAL is a cons of coding systems, the car part is used for decoding,
9571 and the cdr part is used for encoding.
9572 If VAL is a function symbol, the function must return a coding system
9573 or a cons of coding systems which are used as above.
9574
9575 See also the function `find-operation-coding-system'. */);
9576 Vnetwork_coding_system_alist = Qnil;
9577
9578 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9579 doc: /* Coding system to use with system messages.
9580 Also used for decoding keyboard input on X Window system. */);
9581 Vlocale_coding_system = Qnil;
9582
9583 /* The eol mnemonics are reset in startup.el system-dependently. */
9584 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9585 doc: /*
9586 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9587 eol_mnemonic_unix = build_string (":");
9588
9589 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9590 doc: /*
9591 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9592 eol_mnemonic_dos = build_string ("\\");
9593
9594 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9595 doc: /*
9596 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9597 eol_mnemonic_mac = build_string ("/");
9598
9599 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9600 doc: /*
9601 *String displayed in mode line when end-of-line format is not yet determined. */);
9602 eol_mnemonic_undecided = build_string (":");
9603
9604 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9605 doc: /*
9606 *Non-nil enables character translation while encoding and decoding. */);
9607 Venable_character_translation = Qt;
9608
9609 DEFVAR_LISP ("standard-translation-table-for-decode",
9610 &Vstandard_translation_table_for_decode,
9611 doc: /* Table for translating characters while decoding. */);
9612 Vstandard_translation_table_for_decode = Qnil;
9613
9614 DEFVAR_LISP ("standard-translation-table-for-encode",
9615 &Vstandard_translation_table_for_encode,
9616 doc: /* Table for translating characters while encoding. */);
9617 Vstandard_translation_table_for_encode = Qnil;
9618
9619 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9620 doc: /* Alist of charsets vs revision numbers.
9621 While encoding, if a charset (car part of an element) is found,
9622 designate it with the escape sequence identifying revision (cdr part
9623 of the element). */);
9624 Vcharset_revision_table = Qnil;
9625
9626 DEFVAR_LISP ("default-process-coding-system",
9627 &Vdefault_process_coding_system,
9628 doc: /* Cons of coding systems used for process I/O by default.
9629 The car part is used for decoding a process output,
9630 the cdr part is used for encoding a text to be sent to a process. */);
9631 Vdefault_process_coding_system = Qnil;
9632
9633 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9634 doc: /*
9635 Table of extra Latin codes in the range 128..159 (inclusive).
9636 This is a vector of length 256.
9637 If Nth element is non-nil, the existence of code N in a file
9638 \(or output of subprocess) doesn't prevent it to be detected as
9639 a coding system of ISO 2022 variant which has a flag
9640 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9641 or reading output of a subprocess.
9642 Only 128th through 159th elements has a meaning. */);
9643 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9644
9645 DEFVAR_LISP ("select-safe-coding-system-function",
9646 &Vselect_safe_coding_system_function,
9647 doc: /*
9648 Function to call to select safe coding system for encoding a text.
9649
9650 If set, this function is called to force a user to select a proper
9651 coding system which can encode the text in the case that a default
9652 coding system used in each operation can't encode the text.
9653
9654 The default value is `select-safe-coding-system' (which see). */);
9655 Vselect_safe_coding_system_function = Qnil;
9656
9657 DEFVAR_BOOL ("coding-system-require-warning",
9658 &coding_system_require_warning,
9659 doc: /* Internal use only.
9660 If non-nil, on writing a file, `select-safe-coding-system-function' is
9661 called even if `coding-system-for-write' is non-nil. The command
9662 `universal-coding-system-argument' binds this variable to t temporarily. */);
9663 coding_system_require_warning = 0;
9664
9665
9666 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9667 &inhibit_iso_escape_detection,
9668 doc: /*
9669 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9670
9671 By default, on reading a file, Emacs tries to detect how the text is
9672 encoded. This code detection is sensitive to escape sequences. If
9673 the sequence is valid as ISO2022, the code is determined as one of
9674 the ISO2022 encodings, and the file is decoded by the corresponding
9675 coding system (e.g. `iso-2022-7bit').
9676
9677 However, there may be a case that you want to read escape sequences in
9678 a file as is. In such a case, you can set this variable to non-nil.
9679 Then, as the code detection ignores any escape sequences, no file is
9680 detected as encoded in some ISO2022 encoding. The result is that all
9681 escape sequences become visible in a buffer.
9682
9683 The default value is nil, and it is strongly recommended not to change
9684 it. That is because many Emacs Lisp source files that contain
9685 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9686 in Emacs's distribution, and they won't be decoded correctly on
9687 reading if you suppress escape sequence detection.
9688
9689 The other way to read escape sequences in a file without decoding is
9690 to explicitly specify some coding system that doesn't use ISO2022's
9691 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9692 inhibit_iso_escape_detection = 0;
9693
9694 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9695 doc: /* Char table for translating self-inserting characters.
9696 This is applied to the result of input methods, not their input. See also
9697 `keyboard-translate-table'. */);
9698 Vtranslation_table_for_input = Qnil;
9699
9700 {
9701 Lisp_Object args[coding_arg_max];
9702 Lisp_Object plist[16];
9703 int i;
9704
9705 for (i = 0; i < coding_arg_max; i++)
9706 args[i] = Qnil;
9707
9708 plist[0] = intern (":name");
9709 plist[1] = args[coding_arg_name] = Qno_conversion;
9710 plist[2] = intern (":mnemonic");
9711 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9712 plist[4] = intern (":coding-type");
9713 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9714 plist[6] = intern (":ascii-compatible-p");
9715 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9716 plist[8] = intern (":default-char");
9717 plist[9] = args[coding_arg_default_char] = make_number (0);
9718 plist[10] = intern (":for-unibyte");
9719 plist[11] = args[coding_arg_for_unibyte] = Qt;
9720 plist[12] = intern (":docstring");
9721 plist[13] = build_string ("Do no conversion.\n\
9722 \n\
9723 When you visit a file with this coding, the file is read into a\n\
9724 unibyte buffer as is, thus each byte of a file is treated as a\n\
9725 character.");
9726 plist[14] = intern (":eol-type");
9727 plist[15] = args[coding_arg_eol_type] = Qunix;
9728 args[coding_arg_plist] = Flist (16, plist);
9729 Fdefine_coding_system_internal (coding_arg_max, args);
9730
9731 plist[1] = args[coding_arg_name] = Qundecided;
9732 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9733 plist[5] = args[coding_arg_coding_type] = Qundecided;
9734 /* This is already set.
9735 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9736 plist[8] = intern (":charset-list");
9737 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9738 plist[11] = args[coding_arg_for_unibyte] = Qnil;
9739 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9740 plist[15] = args[coding_arg_eol_type] = Qnil;
9741 args[coding_arg_plist] = Flist (16, plist);
9742 Fdefine_coding_system_internal (coding_arg_max, args);
9743 }
9744
9745 setup_coding_system (Qno_conversion, &keyboard_coding);
9746 setup_coding_system (Qundecided, &terminal_coding);
9747 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9748
9749 {
9750 int i;
9751
9752 for (i = 0; i < coding_category_max; i++)
9753 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9754 }
9755 #if defined (MSDOS) || defined (WINDOWSNT)
9756 system_eol_type = Qdos;
9757 #else
9758 system_eol_type = Qunix;
9759 #endif
9760 staticpro (&system_eol_type);
9761 }
9762
9763 char *
9764 emacs_strerror (error_number)
9765 int error_number;
9766 {
9767 char *str;
9768
9769 synchronize_system_messages_locale ();
9770 str = strerror (error_number);
9771
9772 if (! NILP (Vlocale_coding_system))
9773 {
9774 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9775 Vlocale_coding_system,
9776 0);
9777 str = (char *) SDATA (dec);
9778 }
9779
9780 return str;
9781 }
9782
9783 #endif /* emacs */
9784
9785 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9786 (do not change this comment) */