Merge from emacs--devo--0
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
10
11 This file is part of GNU Emacs.
12
13 GNU Emacs is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs; see the file COPYING. If not, write to
25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 Boston, MA 02110-1301, USA. */
27
28 /*** TABLE OF CONTENTS ***
29
30 0. General comments
31 1. Preamble
32 2. Emacs' internal format (emacs-utf-8) handlers
33 3. UTF-8 handlers
34 4. UTF-16 handlers
35 5. Charset-base coding systems handlers
36 6. emacs-mule (old Emacs' internal format) handlers
37 7. ISO2022 handlers
38 8. Shift-JIS and BIG5 handlers
39 9. CCL handlers
40 10. C library functions
41 11. Emacs Lisp library functions
42 12. Postamble
43
44 */
45
46 /*** 0. General comments ***
47
48
49 CODING SYSTEM
50
51 A coding system is an object for an encoding mechanism that contains
52 information about how to convert byte sequences to character
53 sequences and vice versa. When we say "decode", it means converting
54 a byte sequence of a specific coding system into a character
55 sequence that is represented by Emacs' internal coding system
56 `emacs-utf-8', and when we say "encode", it means converting a
57 character sequence of emacs-utf-8 to a byte sequence of a specific
58 coding system.
59
60 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
61 C level, a coding system is represented by a vector of attributes
62 stored in the hash table Vcharset_hash_table. The conversion from
63 coding system symbol to attributes vector is done by looking up
64 Vcharset_hash_table by the symbol.
65
66 Coding systems are classified into the following types depending on
67 the encoding mechanism. Here's a brief description of the types.
68
69 o UTF-8
70
71 o UTF-16
72
73 o Charset-base coding system
74
75 A coding system defined by one or more (coded) character sets.
76 Decoding and encoding are done by a code converter defined for each
77 character set.
78
79 o Old Emacs internal format (emacs-mule)
80
81 The coding system adopted by old versions of Emacs (20 and 21).
82
83 o ISO2022-base coding system
84
85 The most famous coding system for multiple character sets. X's
86 Compound Text, various EUCs (Extended Unix Code), and coding systems
87 used in the Internet communication such as ISO-2022-JP are all
88 variants of ISO2022.
89
90 o SJIS (or Shift-JIS or MS-Kanji-Code)
91
92 A coding system to encode character sets: ASCII, JISX0201, and
93 JISX0208. Widely used for PC's in Japan. Details are described in
94 section 8.
95
96 o BIG5
97
98 A coding system to encode character sets: ASCII and Big5. Widely
99 used for Chinese (mainly in Taiwan and Hong Kong). Details are
100 described in section 8. In this file, when we write "big5" (all
101 lowercase), we mean the coding system, and when we write "Big5"
102 (capitalized), we mean the character set.
103
104 o CCL
105
106 If a user wants to decode/encode text encoded in a coding system
107 not listed above, he can supply a decoder and an encoder for it in
108 CCL (Code Conversion Language) programs. Emacs executes the CCL
109 program while decoding/encoding.
110
111 o Raw-text
112
113 A coding system for text containing raw eight-bit data. Emacs
114 treats each byte of source text as a character (except for
115 end-of-line conversion).
116
117 o No-conversion
118
119 Like raw text, but don't do end-of-line conversion.
120
121
122 END-OF-LINE FORMAT
123
124 How text end-of-line is encoded depends on operating system. For
125 instance, Unix's format is just one byte of LF (line-feed) code,
126 whereas DOS's format is two-byte sequence of `carriage-return' and
127 `line-feed' codes. MacOS's format is usually one byte of
128 `carriage-return'.
129
130 Since text character encoding and end-of-line encoding are
131 independent, any coding system described above can take any format
132 of end-of-line (except for no-conversion).
133
134 STRUCT CODING_SYSTEM
135
136 Before using a coding system for code conversion (i.e. decoding and
137 encoding), we setup a structure of type `struct coding_system'.
138 This structure keeps various information about a specific code
139 conversion (e.g. the location of source and destination data).
140
141 */
142
143 /* COMMON MACROS */
144
145
146 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
147
148 These functions check if a byte sequence specified as a source in
149 CODING conforms to the format of XXX, and update the members of
150 DETECT_INFO.
151
152 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
153
154 Below is the template of these functions. */
155
156 #if 0
157 static int
158 detect_coding_XXX (coding, detect_info)
159 struct coding_system *coding;
160 struct coding_detection_info *detect_info;
161 {
162 const unsigned char *src = coding->source;
163 const unsigned char *src_end = coding->source + coding->src_bytes;
164 int multibytep = coding->src_multibyte;
165 int consumed_chars = 0;
166 int found = 0;
167 ...;
168
169 while (1)
170 {
171 /* Get one byte from the source. If the souce is exausted, jump
172 to no_more_source:. */
173 ONE_MORE_BYTE (c);
174
175 if (! __C_conforms_to_XXX___ (c))
176 break;
177 if (! __C_strongly_suggests_XXX__ (c))
178 found = CATEGORY_MASK_XXX;
179 }
180 /* The byte sequence is invalid for XXX. */
181 detect_info->rejected |= CATEGORY_MASK_XXX;
182 return 0;
183
184 no_more_source:
185 /* The source exausted successfully. */
186 detect_info->found |= found;
187 return 1;
188 }
189 #endif
190
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
197
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
202
203 Below is the template of these functions. */
204
205 #if 0
206 static void
207 decode_coding_XXXX (coding)
208 struct coding_system *coding;
209 {
210 const unsigned char *src = coding->source + coding->consumed;
211 const unsigned char *src_end = coding->source + coding->src_bytes;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 const unsigned char *src_base;
216 /* A buffer to produce decoded characters. */
217 int *charbuf = coding->charbuf + coding->charbuf_used;
218 int *charbuf_end = coding->charbuf + coding->charbuf_size;
219 int multibytep = coding->src_multibyte;
220
221 while (1)
222 {
223 src_base = src;
224 if (charbuf < charbuf_end)
225 /* No more room to produce a decoded character. */
226 break;
227 ONE_MORE_BYTE (c);
228 /* Decode it. */
229 }
230
231 no_more_source:
232 if (src_base < src_end
233 && coding->mode & CODING_MODE_LAST_BLOCK)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base < src_end && charbuf < charbuf_end)
237 *charbuf++ = *src_base++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding->consumed = coding->consumed_char = src_base - coding->source;
241 /* Remember how many characters we produced. */
242 coding->charbuf_used = charbuf - coding->charbuf;
243 }
244 #endif
245
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
252
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
257
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce a encoded text until it
260 reaches at the head of not-yet-encoded source text.
261
262 Below is a template of these functions. */
263 #if 0
264 static void
265 encode_coding_XXX (coding)
266 struct coding_system *coding;
267 {
268 int multibytep = coding->dst_multibyte;
269 int *charbuf = coding->charbuf;
270 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
271 unsigned char *dst = coding->destination + coding->produced;
272 unsigned char *dst_end = coding->destination + coding->dst_bytes;
273 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
274 int produced_chars = 0;
275
276 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 {
278 int c = *charbuf;
279 /* Encode C into DST, and increment DST. */
280 }
281 label_no_more_destination:
282 /* How many chars and bytes we produced. */
283 coding->produced_char += produced_chars;
284 coding->produced = dst - coding->destination;
285 }
286 #endif
287
288 \f
289 /*** 1. Preamble ***/
290
291 #include <config.h>
292 #include <stdio.h>
293
294 #include "lisp.h"
295 #include "buffer.h"
296 #include "character.h"
297 #include "charset.h"
298 #include "ccl.h"
299 #include "composite.h"
300 #include "coding.h"
301 #include "window.h"
302
303 Lisp_Object Vcoding_system_hash_table;
304
305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
306 Lisp_Object Qunix, Qdos;
307 extern Lisp_Object Qmac; /* frame.c */
308 Lisp_Object Qbuffer_file_coding_system;
309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
310 Lisp_Object Qdefault_char;
311 Lisp_Object Qno_conversion, Qundecided;
312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
313 Lisp_Object Qbig, Qlittle;
314 Lisp_Object Qcoding_system_history;
315 Lisp_Object Qvalid_codes;
316 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
317 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
318 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
319 Lisp_Object QCascii_compatible_p;
320
321 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
322 Lisp_Object Qcall_process, Qcall_process_region;
323 Lisp_Object Qstart_process, Qopen_network_stream;
324 Lisp_Object Qtarget_idx;
325
326 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
327 Lisp_Object Qinterrupted, Qinsufficient_memory;
328
329 /* If a symbol has this property, evaluate the value to define the
330 symbol as a coding system. */
331 static Lisp_Object Qcoding_system_define_form;
332
333 int coding_system_require_warning;
334
335 Lisp_Object Vselect_safe_coding_system_function;
336
337 /* Mnemonic string for each format of end-of-line. */
338 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
339 /* Mnemonic string to indicate format of end-of-line is not yet
340 decided. */
341 Lisp_Object eol_mnemonic_undecided;
342
343 /* Format of end-of-line decided by system. This is Qunix on
344 Unix and Mac, Qdos on DOS/Windows.
345 This has an effect only for external encoding (i.e. for output to
346 file and process), not for in-buffer or Lisp string encoding. */
347 static Lisp_Object system_eol_type;
348
349 #ifdef emacs
350
351 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
352
353 Lisp_Object Qcoding_system_p, Qcoding_system_error;
354
355 /* Coding system emacs-mule and raw-text are for converting only
356 end-of-line format. */
357 Lisp_Object Qemacs_mule, Qraw_text;
358 Lisp_Object Qutf_8_emacs;
359
360 /* Coding-systems are handed between Emacs Lisp programs and C internal
361 routines by the following three variables. */
362 /* Coding-system for reading files and receiving data from process. */
363 Lisp_Object Vcoding_system_for_read;
364 /* Coding-system for writing files and sending data to process. */
365 Lisp_Object Vcoding_system_for_write;
366 /* Coding-system actually used in the latest I/O. */
367 Lisp_Object Vlast_coding_system_used;
368 /* Set to non-nil when an error is detected while code conversion. */
369 Lisp_Object Vlast_code_conversion_error;
370 /* A vector of length 256 which contains information about special
371 Latin codes (especially for dealing with Microsoft codes). */
372 Lisp_Object Vlatin_extra_code_table;
373
374 /* Flag to inhibit code conversion of end-of-line format. */
375 int inhibit_eol_conversion;
376
377 /* Flag to inhibit ISO2022 escape sequence detection. */
378 int inhibit_iso_escape_detection;
379
380 /* Flag to make buffer-file-coding-system inherit from process-coding. */
381 int inherit_process_coding_system;
382
383 /* Coding system to be used to encode text for terminal display. */
384 struct coding_system terminal_coding;
385
386 /* Coding system to be used to encode text for terminal display when
387 terminal coding system is nil. */
388 struct coding_system safe_terminal_coding;
389
390 /* Coding system of what is sent from terminal keyboard. */
391 struct coding_system keyboard_coding;
392
393 Lisp_Object Vfile_coding_system_alist;
394 Lisp_Object Vprocess_coding_system_alist;
395 Lisp_Object Vnetwork_coding_system_alist;
396
397 Lisp_Object Vlocale_coding_system;
398
399 #endif /* emacs */
400
401 /* Flag to tell if we look up translation table on character code
402 conversion. */
403 Lisp_Object Venable_character_translation;
404 /* Standard translation table to look up on decoding (reading). */
405 Lisp_Object Vstandard_translation_table_for_decode;
406 /* Standard translation table to look up on encoding (writing). */
407 Lisp_Object Vstandard_translation_table_for_encode;
408
409 Lisp_Object Qtranslation_table;
410 Lisp_Object Qtranslation_table_id;
411 Lisp_Object Qtranslation_table_for_decode;
412 Lisp_Object Qtranslation_table_for_encode;
413
414 /* Alist of charsets vs revision number. */
415 static Lisp_Object Vcharset_revision_table;
416
417 /* Default coding systems used for process I/O. */
418 Lisp_Object Vdefault_process_coding_system;
419
420 /* Char table for translating Quail and self-inserting input. */
421 Lisp_Object Vtranslation_table_for_input;
422
423 /* Two special coding systems. */
424 Lisp_Object Vsjis_coding_system;
425 Lisp_Object Vbig5_coding_system;
426
427 /* ISO2022 section */
428
429 #define CODING_ISO_INITIAL(coding, reg) \
430 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
431 coding_attr_iso_initial), \
432 reg)))
433
434
435 #define CODING_ISO_REQUEST(coding, charset_id) \
436 ((charset_id <= (coding)->max_charset_id \
437 ? (coding)->safe_charsets[charset_id] \
438 : -1))
439
440
441 #define CODING_ISO_FLAGS(coding) \
442 ((coding)->spec.iso_2022.flags)
443 #define CODING_ISO_DESIGNATION(coding, reg) \
444 ((coding)->spec.iso_2022.current_designation[reg])
445 #define CODING_ISO_INVOCATION(coding, plane) \
446 ((coding)->spec.iso_2022.current_invocation[plane])
447 #define CODING_ISO_SINGLE_SHIFTING(coding) \
448 ((coding)->spec.iso_2022.single_shifting)
449 #define CODING_ISO_BOL(coding) \
450 ((coding)->spec.iso_2022.bol)
451 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
452 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
453
454 /* Control characters of ISO2022. */
455 /* code */ /* function */
456 #define ISO_CODE_LF 0x0A /* line-feed */
457 #define ISO_CODE_CR 0x0D /* carriage-return */
458 #define ISO_CODE_SO 0x0E /* shift-out */
459 #define ISO_CODE_SI 0x0F /* shift-in */
460 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
461 #define ISO_CODE_ESC 0x1B /* escape */
462 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
463 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
464 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
465
466 /* All code (1-byte) of ISO2022 is classified into one of the
467 followings. */
468 enum iso_code_class_type
469 {
470 ISO_control_0, /* Control codes in the range
471 0x00..0x1F and 0x7F, except for the
472 following 5 codes. */
473 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
474 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
475 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
476 ISO_escape, /* ISO_CODE_SO (0x1B) */
477 ISO_control_1, /* Control codes in the range
478 0x80..0x9F, except for the
479 following 3 codes. */
480 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
481 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
482 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
483 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
484 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
485 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
486 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
487 };
488
489 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
490 `iso-flags' attribute of an iso2022 coding system. */
491
492 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
493 instead of the correct short-form sequence (e.g. ESC $ A). */
494 #define CODING_ISO_FLAG_LONG_FORM 0x0001
495
496 /* If set, reset graphic planes and registers at end-of-line to the
497 initial state. */
498 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
499
500 /* If set, reset graphic planes and registers before any control
501 characters to the initial state. */
502 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
503
504 /* If set, encode by 7-bit environment. */
505 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
506
507 /* If set, use locking-shift function. */
508 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
509
510 /* If set, use single-shift function. Overwrite
511 CODING_ISO_FLAG_LOCKING_SHIFT. */
512 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
513
514 /* If set, use designation escape sequence. */
515 #define CODING_ISO_FLAG_DESIGNATION 0x0040
516
517 /* If set, produce revision number sequence. */
518 #define CODING_ISO_FLAG_REVISION 0x0080
519
520 /* If set, produce ISO6429's direction specifying sequence. */
521 #define CODING_ISO_FLAG_DIRECTION 0x0100
522
523 /* If set, assume designation states are reset at beginning of line on
524 output. */
525 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
526
527 /* If set, designation sequence should be placed at beginning of line
528 on output. */
529 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
530
531 /* If set, do not encode unsafe charactes on output. */
532 #define CODING_ISO_FLAG_SAFE 0x0800
533
534 /* If set, extra latin codes (128..159) are accepted as a valid code
535 on input. */
536 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
537
538 #define CODING_ISO_FLAG_COMPOSITION 0x2000
539
540 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
541
542 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
543
544 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
545
546 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
547
548 /* A character to be produced on output if encoding of the original
549 character is prohibited by CODING_ISO_FLAG_SAFE. */
550 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
551
552
553 /* UTF-16 section */
554 #define CODING_UTF_16_BOM(coding) \
555 ((coding)->spec.utf_16.bom)
556
557 #define CODING_UTF_16_ENDIAN(coding) \
558 ((coding)->spec.utf_16.endian)
559
560 #define CODING_UTF_16_SURROGATE(coding) \
561 ((coding)->spec.utf_16.surrogate)
562
563
564 /* CCL section */
565 #define CODING_CCL_DECODER(coding) \
566 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
567 #define CODING_CCL_ENCODER(coding) \
568 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
569 #define CODING_CCL_VALIDS(coding) \
570 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
571
572 /* Index for each coding category in `coding_categories' */
573
574 enum coding_category
575 {
576 coding_category_iso_7,
577 coding_category_iso_7_tight,
578 coding_category_iso_8_1,
579 coding_category_iso_8_2,
580 coding_category_iso_7_else,
581 coding_category_iso_8_else,
582 coding_category_utf_8,
583 coding_category_utf_16_auto,
584 coding_category_utf_16_be,
585 coding_category_utf_16_le,
586 coding_category_utf_16_be_nosig,
587 coding_category_utf_16_le_nosig,
588 coding_category_charset,
589 coding_category_sjis,
590 coding_category_big5,
591 coding_category_ccl,
592 coding_category_emacs_mule,
593 /* All above are targets of code detection. */
594 coding_category_raw_text,
595 coding_category_undecided,
596 coding_category_max
597 };
598
599 /* Definitions of flag bits used in detect_coding_XXXX. */
600 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
601 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
602 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
603 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
604 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
605 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
606 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
607 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
608 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
609 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
610 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
611 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
612 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
613 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
614 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
615 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
616 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
617 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
618
619 /* This value is returned if detect_coding_mask () find nothing other
620 than ASCII characters. */
621 #define CATEGORY_MASK_ANY \
622 (CATEGORY_MASK_ISO_7 \
623 | CATEGORY_MASK_ISO_7_TIGHT \
624 | CATEGORY_MASK_ISO_8_1 \
625 | CATEGORY_MASK_ISO_8_2 \
626 | CATEGORY_MASK_ISO_7_ELSE \
627 | CATEGORY_MASK_ISO_8_ELSE \
628 | CATEGORY_MASK_UTF_8 \
629 | CATEGORY_MASK_UTF_16_BE \
630 | CATEGORY_MASK_UTF_16_LE \
631 | CATEGORY_MASK_UTF_16_BE_NOSIG \
632 | CATEGORY_MASK_UTF_16_LE_NOSIG \
633 | CATEGORY_MASK_CHARSET \
634 | CATEGORY_MASK_SJIS \
635 | CATEGORY_MASK_BIG5 \
636 | CATEGORY_MASK_CCL \
637 | CATEGORY_MASK_EMACS_MULE)
638
639
640 #define CATEGORY_MASK_ISO_7BIT \
641 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
642
643 #define CATEGORY_MASK_ISO_8BIT \
644 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
645
646 #define CATEGORY_MASK_ISO_ELSE \
647 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
648
649 #define CATEGORY_MASK_ISO_ESCAPE \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_7_ELSE \
653 | CATEGORY_MASK_ISO_8_ELSE)
654
655 #define CATEGORY_MASK_ISO \
656 ( CATEGORY_MASK_ISO_7BIT \
657 | CATEGORY_MASK_ISO_8BIT \
658 | CATEGORY_MASK_ISO_ELSE)
659
660 #define CATEGORY_MASK_UTF_16 \
661 (CATEGORY_MASK_UTF_16_BE \
662 | CATEGORY_MASK_UTF_16_LE \
663 | CATEGORY_MASK_UTF_16_BE_NOSIG \
664 | CATEGORY_MASK_UTF_16_LE_NOSIG)
665
666
667 /* List of symbols `coding-category-xxx' ordered by priority. This
668 variable is exposed to Emacs Lisp. */
669 static Lisp_Object Vcoding_category_list;
670
671 /* Table of coding categories (Lisp symbols). This variable is for
672 internal use oly. */
673 static Lisp_Object Vcoding_category_table;
674
675 /* Table of coding-categories ordered by priority. */
676 static enum coding_category coding_priorities[coding_category_max];
677
678 /* Nth element is a coding context for the coding system bound to the
679 Nth coding category. */
680 static struct coding_system coding_categories[coding_category_max];
681
682 /*** Commonly used macros and functions ***/
683
684 #ifndef min
685 #define min(a, b) ((a) < (b) ? (a) : (b))
686 #endif
687 #ifndef max
688 #define max(a, b) ((a) > (b) ? (a) : (b))
689 #endif
690
691 #define CODING_GET_INFO(coding, attrs, charset_list) \
692 do { \
693 (attrs) = CODING_ID_ATTRS ((coding)->id); \
694 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
695 } while (0)
696
697
698 /* Safely get one byte from the source text pointed by SRC which ends
699 at SRC_END, and set C to that byte. If there are not enough bytes
700 in the source, it jumps to `no_more_source'. If multibytep is
701 nonzero, and a multibyte character is found at SRC, set C to the
702 negative value of the character code. The caller should declare
703 and set these variables appropriately in advance:
704 src, src_end, multibytep */
705
706 #define ONE_MORE_BYTE(c) \
707 do { \
708 if (src == src_end) \
709 { \
710 if (src_base < src) \
711 record_conversion_result \
712 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
713 goto no_more_source; \
714 } \
715 c = *src++; \
716 if (multibytep && (c & 0x80)) \
717 { \
718 if ((c & 0xFE) == 0xC0) \
719 c = ((c & 1) << 6) | *src++; \
720 else \
721 { \
722 src--; \
723 c = - string_char (src, &src, NULL); \
724 record_conversion_result \
725 (coding, CODING_RESULT_INVALID_SRC); \
726 } \
727 } \
728 consumed_chars++; \
729 } while (0)
730
731
732 #define ONE_MORE_BYTE_NO_CHECK(c) \
733 do { \
734 c = *src++; \
735 if (multibytep && (c & 0x80)) \
736 { \
737 if ((c & 0xFE) == 0xC0) \
738 c = ((c & 1) << 6) | *src++; \
739 else \
740 { \
741 src--; \
742 c = - string_char (src, &src, NULL); \
743 record_conversion_result \
744 (coding, CODING_RESULT_INVALID_SRC); \
745 } \
746 } \
747 consumed_chars++; \
748 } while (0)
749
750
751 /* Store a byte C in the place pointed by DST and increment DST to the
752 next free point, and increment PRODUCED_CHARS. The caller should
753 assure that C is 0..127, and declare and set the variable `dst'
754 appropriately in advance.
755 */
756
757
758 #define EMIT_ONE_ASCII_BYTE(c) \
759 do { \
760 produced_chars++; \
761 *dst++ = (c); \
762 } while (0)
763
764
765 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
766
767 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
768 do { \
769 produced_chars += 2; \
770 *dst++ = (c1), *dst++ = (c2); \
771 } while (0)
772
773
774 /* Store a byte C in the place pointed by DST and increment DST to the
775 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
776 nonzero, store in an appropriate multibyte from. The caller should
777 declare and set the variables `dst' and `multibytep' appropriately
778 in advance. */
779
780 #define EMIT_ONE_BYTE(c) \
781 do { \
782 produced_chars++; \
783 if (multibytep) \
784 { \
785 int ch = (c); \
786 if (ch >= 0x80) \
787 ch = BYTE8_TO_CHAR (ch); \
788 CHAR_STRING_ADVANCE (ch, dst); \
789 } \
790 else \
791 *dst++ = (c); \
792 } while (0)
793
794
795 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
796
797 #define EMIT_TWO_BYTES(c1, c2) \
798 do { \
799 produced_chars += 2; \
800 if (multibytep) \
801 { \
802 int ch; \
803 \
804 ch = (c1); \
805 if (ch >= 0x80) \
806 ch = BYTE8_TO_CHAR (ch); \
807 CHAR_STRING_ADVANCE (ch, dst); \
808 ch = (c2); \
809 if (ch >= 0x80) \
810 ch = BYTE8_TO_CHAR (ch); \
811 CHAR_STRING_ADVANCE (ch, dst); \
812 } \
813 else \
814 { \
815 *dst++ = (c1); \
816 *dst++ = (c2); \
817 } \
818 } while (0)
819
820
821 #define EMIT_THREE_BYTES(c1, c2, c3) \
822 do { \
823 EMIT_ONE_BYTE (c1); \
824 EMIT_TWO_BYTES (c2, c3); \
825 } while (0)
826
827
828 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
829 do { \
830 EMIT_TWO_BYTES (c1, c2); \
831 EMIT_TWO_BYTES (c3, c4); \
832 } while (0)
833
834
835 /* Prototypes for static functions. */
836 static void record_conversion_result P_ ((struct coding_system *coding,
837 enum coding_result_code result));
838 static int detect_coding_utf_8 P_ ((struct coding_system *,
839 struct coding_detection_info *info));
840 static void decode_coding_utf_8 P_ ((struct coding_system *));
841 static int encode_coding_utf_8 P_ ((struct coding_system *));
842
843 static int detect_coding_utf_16 P_ ((struct coding_system *,
844 struct coding_detection_info *info));
845 static void decode_coding_utf_16 P_ ((struct coding_system *));
846 static int encode_coding_utf_16 P_ ((struct coding_system *));
847
848 static int detect_coding_iso_2022 P_ ((struct coding_system *,
849 struct coding_detection_info *info));
850 static void decode_coding_iso_2022 P_ ((struct coding_system *));
851 static int encode_coding_iso_2022 P_ ((struct coding_system *));
852
853 static int detect_coding_emacs_mule P_ ((struct coding_system *,
854 struct coding_detection_info *info));
855 static void decode_coding_emacs_mule P_ ((struct coding_system *));
856 static int encode_coding_emacs_mule P_ ((struct coding_system *));
857
858 static int detect_coding_sjis P_ ((struct coding_system *,
859 struct coding_detection_info *info));
860 static void decode_coding_sjis P_ ((struct coding_system *));
861 static int encode_coding_sjis P_ ((struct coding_system *));
862
863 static int detect_coding_big5 P_ ((struct coding_system *,
864 struct coding_detection_info *info));
865 static void decode_coding_big5 P_ ((struct coding_system *));
866 static int encode_coding_big5 P_ ((struct coding_system *));
867
868 static int detect_coding_ccl P_ ((struct coding_system *,
869 struct coding_detection_info *info));
870 static void decode_coding_ccl P_ ((struct coding_system *));
871 static int encode_coding_ccl P_ ((struct coding_system *));
872
873 static void decode_coding_raw_text P_ ((struct coding_system *));
874 static int encode_coding_raw_text P_ ((struct coding_system *));
875
876 static void coding_set_source P_ ((struct coding_system *));
877 static void coding_set_destination P_ ((struct coding_system *));
878 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
879 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
880 EMACS_INT));
881 static unsigned char *alloc_destination P_ ((struct coding_system *,
882 EMACS_INT, unsigned char *));
883 static void setup_iso_safe_charsets P_ ((Lisp_Object));
884 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
885 int *, int *,
886 unsigned char *));
887 static int detect_eol P_ ((const unsigned char *,
888 EMACS_INT, enum coding_category));
889 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
890 static void decode_eol P_ ((struct coding_system *));
891 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
892 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
893 int, int *, int *));
894 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
895 static INLINE void produce_composition P_ ((struct coding_system *, int *,
896 EMACS_INT));
897 static INLINE void produce_charset P_ ((struct coding_system *, int *,
898 EMACS_INT));
899 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
900 static int decode_coding P_ ((struct coding_system *));
901 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
902 struct coding_system *,
903 int *, EMACS_INT *));
904 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
905 struct coding_system *,
906 int *, EMACS_INT *));
907 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
908 static int encode_coding P_ ((struct coding_system *));
909 static Lisp_Object make_conversion_work_buffer P_ ((int));
910 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
911 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
912 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
913
914 static void
915 record_conversion_result (struct coding_system *coding,
916 enum coding_result_code result)
917 {
918 coding->result = result;
919 switch (result)
920 {
921 case CODING_RESULT_INSUFFICIENT_SRC:
922 Vlast_code_conversion_error = Qinsufficient_source;
923 break;
924 case CODING_RESULT_INCONSISTENT_EOL:
925 Vlast_code_conversion_error = Qinconsistent_eol;
926 break;
927 case CODING_RESULT_INVALID_SRC:
928 Vlast_code_conversion_error = Qinvalid_source;
929 break;
930 case CODING_RESULT_INTERRUPT:
931 Vlast_code_conversion_error = Qinterrupted;
932 break;
933 case CODING_RESULT_INSUFFICIENT_MEM:
934 Vlast_code_conversion_error = Qinsufficient_memory;
935 break;
936 default:
937 Vlast_code_conversion_error = intern ("Unknown error");
938 }
939 }
940
941 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
942 do { \
943 charset_map_loaded = 0; \
944 c = DECODE_CHAR (charset, code); \
945 if (charset_map_loaded) \
946 { \
947 const unsigned char *orig = coding->source; \
948 EMACS_INT offset; \
949 \
950 coding_set_source (coding); \
951 offset = coding->source - orig; \
952 src += offset; \
953 src_base += offset; \
954 src_end += offset; \
955 } \
956 } while (0)
957
958
959 #define ASSURE_DESTINATION(bytes) \
960 do { \
961 if (dst + (bytes) >= dst_end) \
962 { \
963 int more_bytes = charbuf_end - charbuf + (bytes); \
964 \
965 dst = alloc_destination (coding, more_bytes, dst); \
966 dst_end = coding->destination + coding->dst_bytes; \
967 } \
968 } while (0)
969
970
971
972 static void
973 coding_set_source (coding)
974 struct coding_system *coding;
975 {
976 if (BUFFERP (coding->src_object))
977 {
978 struct buffer *buf = XBUFFER (coding->src_object);
979
980 if (coding->src_pos < 0)
981 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
982 else
983 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
984 }
985 else if (STRINGP (coding->src_object))
986 {
987 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
988 }
989 else
990 /* Otherwise, the source is C string and is never relocated
991 automatically. Thus we don't have to update anything. */
992 ;
993 }
994
995 static void
996 coding_set_destination (coding)
997 struct coding_system *coding;
998 {
999 if (BUFFERP (coding->dst_object))
1000 {
1001 if (coding->src_pos < 0)
1002 {
1003 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
1004 coding->dst_bytes = (GAP_END_ADDR
1005 - (coding->src_bytes - coding->consumed)
1006 - coding->destination);
1007 }
1008 else
1009 {
1010 /* We are sure that coding->dst_pos_byte is before the gap
1011 of the buffer. */
1012 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1013 + coding->dst_pos_byte - 1);
1014 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1015 - coding->destination);
1016 }
1017 }
1018 else
1019 /* Otherwise, the destination is C string and is never relocated
1020 automatically. Thus we don't have to update anything. */
1021 ;
1022 }
1023
1024
1025 static void
1026 coding_alloc_by_realloc (coding, bytes)
1027 struct coding_system *coding;
1028 EMACS_INT bytes;
1029 {
1030 coding->destination = (unsigned char *) xrealloc (coding->destination,
1031 coding->dst_bytes + bytes);
1032 coding->dst_bytes += bytes;
1033 }
1034
1035 static void
1036 coding_alloc_by_making_gap (coding, bytes)
1037 struct coding_system *coding;
1038 EMACS_INT bytes;
1039 {
1040 if (BUFFERP (coding->dst_object)
1041 && EQ (coding->src_object, coding->dst_object))
1042 {
1043 EMACS_INT add = coding->src_bytes - coding->consumed;
1044
1045 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1046 make_gap (bytes);
1047 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1048 }
1049 else
1050 {
1051 Lisp_Object this_buffer;
1052
1053 this_buffer = Fcurrent_buffer ();
1054 set_buffer_internal (XBUFFER (coding->dst_object));
1055 make_gap (bytes);
1056 set_buffer_internal (XBUFFER (this_buffer));
1057 }
1058 }
1059
1060
1061 static unsigned char *
1062 alloc_destination (coding, nbytes, dst)
1063 struct coding_system *coding;
1064 EMACS_INT nbytes;
1065 unsigned char *dst;
1066 {
1067 EMACS_INT offset = dst - coding->destination;
1068
1069 if (BUFFERP (coding->dst_object))
1070 coding_alloc_by_making_gap (coding, nbytes);
1071 else
1072 coding_alloc_by_realloc (coding, nbytes);
1073 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1074 coding_set_destination (coding);
1075 dst = coding->destination + offset;
1076 return dst;
1077 }
1078
1079 /** Macros for annotations. */
1080
1081 /* Maximum length of annotation data (sum of annotations for
1082 composition and charset). */
1083 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1084
1085 /* An annotation data is stored in the array coding->charbuf in this
1086 format:
1087 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1088 LENGTH is the number of elements in the annotation.
1089 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1090 NCHARS is the number of characters in the text annotated.
1091
1092 The format of the following elements depend on ANNOTATION_MASK.
1093
1094 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1095 follows:
1096 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1097 METHOD is one of enum composition_method.
1098 Optionnal COMPOSITION-COMPONENTS are characters and composition
1099 rules.
1100
1101 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1102 follows. */
1103
1104 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1105 do { \
1106 *(buf)++ = -(len); \
1107 *(buf)++ = (mask); \
1108 *(buf)++ = (nchars); \
1109 coding->annotated = 1; \
1110 } while (0);
1111
1112 #define ADD_COMPOSITION_DATA(buf, nchars, method) \
1113 do { \
1114 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1115 *buf++ = method; \
1116 } while (0)
1117
1118
1119 #define ADD_CHARSET_DATA(buf, nchars, id) \
1120 do { \
1121 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1122 *buf++ = id; \
1123 } while (0)
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134 Check if a text is encoded in UTF-8. If it is, return 1, else
1135 return 0. */
1136
1137 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1138 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1139 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1140 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1141 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1142 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1143
1144 static int
1145 detect_coding_utf_8 (coding, detect_info)
1146 struct coding_system *coding;
1147 struct coding_detection_info *detect_info;
1148 {
1149 const unsigned char *src = coding->source, *src_base;
1150 const unsigned char *src_end = coding->source + coding->src_bytes;
1151 int multibytep = coding->src_multibyte;
1152 int consumed_chars = 0;
1153 int found = 0;
1154
1155 detect_info->checked |= CATEGORY_MASK_UTF_8;
1156 /* A coding system of this category is always ASCII compatible. */
1157 src += coding->head_ascii;
1158
1159 while (1)
1160 {
1161 int c, c1, c2, c3, c4;
1162
1163 src_base = src;
1164 ONE_MORE_BYTE (c);
1165 if (c < 0 || UTF_8_1_OCTET_P (c))
1166 continue;
1167 ONE_MORE_BYTE (c1);
1168 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1169 break;
1170 if (UTF_8_2_OCTET_LEADING_P (c))
1171 {
1172 found = CATEGORY_MASK_UTF_8;
1173 continue;
1174 }
1175 ONE_MORE_BYTE (c2);
1176 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1177 break;
1178 if (UTF_8_3_OCTET_LEADING_P (c))
1179 {
1180 found = CATEGORY_MASK_UTF_8;
1181 continue;
1182 }
1183 ONE_MORE_BYTE (c3);
1184 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1185 break;
1186 if (UTF_8_4_OCTET_LEADING_P (c))
1187 {
1188 found = CATEGORY_MASK_UTF_8;
1189 continue;
1190 }
1191 ONE_MORE_BYTE (c4);
1192 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1193 break;
1194 if (UTF_8_5_OCTET_LEADING_P (c))
1195 {
1196 found = CATEGORY_MASK_UTF_8;
1197 continue;
1198 }
1199 break;
1200 }
1201 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1202 return 0;
1203
1204 no_more_source:
1205 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1206 {
1207 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1208 return 0;
1209 }
1210 detect_info->found |= found;
1211 return 1;
1212 }
1213
1214
1215 static void
1216 decode_coding_utf_8 (coding)
1217 struct coding_system *coding;
1218 {
1219 const unsigned char *src = coding->source + coding->consumed;
1220 const unsigned char *src_end = coding->source + coding->src_bytes;
1221 const unsigned char *src_base;
1222 int *charbuf = coding->charbuf + coding->charbuf_used;
1223 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1224 int consumed_chars = 0, consumed_chars_base;
1225 int multibytep = coding->src_multibyte;
1226 Lisp_Object attr, charset_list;
1227
1228 CODING_GET_INFO (coding, attr, charset_list);
1229
1230 while (1)
1231 {
1232 int c, c1, c2, c3, c4, c5;
1233
1234 src_base = src;
1235 consumed_chars_base = consumed_chars;
1236
1237 if (charbuf >= charbuf_end)
1238 break;
1239
1240 ONE_MORE_BYTE (c1);
1241 if (c1 < 0)
1242 {
1243 c = - c1;
1244 }
1245 else if (UTF_8_1_OCTET_P(c1))
1246 {
1247 c = c1;
1248 }
1249 else
1250 {
1251 ONE_MORE_BYTE (c2);
1252 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1253 goto invalid_code;
1254 if (UTF_8_2_OCTET_LEADING_P (c1))
1255 {
1256 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1257 /* Reject overlong sequences here and below. Encoders
1258 producing them are incorrect, they can be misleading,
1259 and they mess up read/write invariance. */
1260 if (c < 128)
1261 goto invalid_code;
1262 }
1263 else
1264 {
1265 ONE_MORE_BYTE (c3);
1266 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1267 goto invalid_code;
1268 if (UTF_8_3_OCTET_LEADING_P (c1))
1269 {
1270 c = (((c1 & 0xF) << 12)
1271 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1272 if (c < 0x800
1273 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1274 goto invalid_code;
1275 }
1276 else
1277 {
1278 ONE_MORE_BYTE (c4);
1279 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1280 goto invalid_code;
1281 if (UTF_8_4_OCTET_LEADING_P (c1))
1282 {
1283 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1284 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1285 if (c < 0x10000)
1286 goto invalid_code;
1287 }
1288 else
1289 {
1290 ONE_MORE_BYTE (c5);
1291 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1292 goto invalid_code;
1293 if (UTF_8_5_OCTET_LEADING_P (c1))
1294 {
1295 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1296 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1297 | (c5 & 0x3F));
1298 if ((c > MAX_CHAR) || (c < 0x200000))
1299 goto invalid_code;
1300 }
1301 else
1302 goto invalid_code;
1303 }
1304 }
1305 }
1306 }
1307
1308 *charbuf++ = c;
1309 continue;
1310
1311 invalid_code:
1312 src = src_base;
1313 consumed_chars = consumed_chars_base;
1314 ONE_MORE_BYTE (c);
1315 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1316 coding->errors++;
1317 }
1318
1319 no_more_source:
1320 coding->consumed_char += consumed_chars_base;
1321 coding->consumed = src_base - coding->source;
1322 coding->charbuf_used = charbuf - coding->charbuf;
1323 }
1324
1325
1326 static int
1327 encode_coding_utf_8 (coding)
1328 struct coding_system *coding;
1329 {
1330 int multibytep = coding->dst_multibyte;
1331 int *charbuf = coding->charbuf;
1332 int *charbuf_end = charbuf + coding->charbuf_used;
1333 unsigned char *dst = coding->destination + coding->produced;
1334 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1335 int produced_chars = 0;
1336 int c;
1337
1338 if (multibytep)
1339 {
1340 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1341
1342 while (charbuf < charbuf_end)
1343 {
1344 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1345
1346 ASSURE_DESTINATION (safe_room);
1347 c = *charbuf++;
1348 if (CHAR_BYTE8_P (c))
1349 {
1350 c = CHAR_TO_BYTE8 (c);
1351 EMIT_ONE_BYTE (c);
1352 }
1353 else
1354 {
1355 CHAR_STRING_ADVANCE (c, pend);
1356 for (p = str; p < pend; p++)
1357 EMIT_ONE_BYTE (*p);
1358 }
1359 }
1360 }
1361 else
1362 {
1363 int safe_room = MAX_MULTIBYTE_LENGTH;
1364
1365 while (charbuf < charbuf_end)
1366 {
1367 ASSURE_DESTINATION (safe_room);
1368 c = *charbuf++;
1369 if (CHAR_BYTE8_P (c))
1370 *dst++ = CHAR_TO_BYTE8 (c);
1371 else
1372 dst += CHAR_STRING (c, dst);
1373 produced_chars++;
1374 }
1375 }
1376 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1377 coding->produced_char += produced_chars;
1378 coding->produced = dst - coding->destination;
1379 return 0;
1380 }
1381
1382
1383 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1384 Check if a text is encoded in one of UTF-16 based coding systems.
1385 If it is, return 1, else return 0. */
1386
1387 #define UTF_16_HIGH_SURROGATE_P(val) \
1388 (((val) & 0xFC00) == 0xD800)
1389
1390 #define UTF_16_LOW_SURROGATE_P(val) \
1391 (((val) & 0xFC00) == 0xDC00)
1392
1393 #define UTF_16_INVALID_P(val) \
1394 (((val) == 0xFFFE) \
1395 || ((val) == 0xFFFF) \
1396 || UTF_16_LOW_SURROGATE_P (val))
1397
1398
1399 static int
1400 detect_coding_utf_16 (coding, detect_info)
1401 struct coding_system *coding;
1402 struct coding_detection_info *detect_info;
1403 {
1404 const unsigned char *src = coding->source, *src_base = src;
1405 const unsigned char *src_end = coding->source + coding->src_bytes;
1406 int multibytep = coding->src_multibyte;
1407 int consumed_chars = 0;
1408 int c1, c2;
1409
1410 detect_info->checked |= CATEGORY_MASK_UTF_16;
1411 if (coding->mode & CODING_MODE_LAST_BLOCK
1412 && (coding->src_chars & 1))
1413 {
1414 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1415 return 0;
1416 }
1417
1418 ONE_MORE_BYTE (c1);
1419 ONE_MORE_BYTE (c2);
1420 if ((c1 == 0xFF) && (c2 == 0xFE))
1421 {
1422 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1423 | CATEGORY_MASK_UTF_16_AUTO);
1424 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1425 | CATEGORY_MASK_UTF_16_BE_NOSIG
1426 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1427 }
1428 else if ((c1 == 0xFE) && (c2 == 0xFF))
1429 {
1430 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1431 | CATEGORY_MASK_UTF_16_AUTO);
1432 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1433 | CATEGORY_MASK_UTF_16_BE_NOSIG
1434 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1435 }
1436 else if (c1 >= 0 && c2 >= 0)
1437 {
1438 detect_info->rejected
1439 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1440 }
1441 no_more_source:
1442 return 1;
1443 }
1444
1445 static void
1446 decode_coding_utf_16 (coding)
1447 struct coding_system *coding;
1448 {
1449 const unsigned char *src = coding->source + coding->consumed;
1450 const unsigned char *src_end = coding->source + coding->src_bytes;
1451 const unsigned char *src_base;
1452 int *charbuf = coding->charbuf + coding->charbuf_used;
1453 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1454 int consumed_chars = 0, consumed_chars_base;
1455 int multibytep = coding->src_multibyte;
1456 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1457 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1458 int surrogate = CODING_UTF_16_SURROGATE (coding);
1459 Lisp_Object attr, charset_list;
1460
1461 CODING_GET_INFO (coding, attr, charset_list);
1462
1463 if (bom == utf_16_with_bom)
1464 {
1465 int c, c1, c2;
1466
1467 src_base = src;
1468 ONE_MORE_BYTE (c1);
1469 ONE_MORE_BYTE (c2);
1470 c = (c1 << 8) | c2;
1471
1472 if (endian == utf_16_big_endian
1473 ? c != 0xFEFF : c != 0xFFFE)
1474 {
1475 /* The first two bytes are not BOM. Treat them as bytes
1476 for a normal character. */
1477 src = src_base;
1478 coding->errors++;
1479 }
1480 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1481 }
1482 else if (bom == utf_16_detect_bom)
1483 {
1484 /* We have already tried to detect BOM and failed in
1485 detect_coding. */
1486 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1487 }
1488
1489 while (1)
1490 {
1491 int c, c1, c2;
1492
1493 src_base = src;
1494 consumed_chars_base = consumed_chars;
1495
1496 if (charbuf + 2 >= charbuf_end)
1497 break;
1498
1499 ONE_MORE_BYTE (c1);
1500 if (c1 < 0)
1501 {
1502 *charbuf++ = -c1;
1503 continue;
1504 }
1505 ONE_MORE_BYTE (c2);
1506 if (c2 < 0)
1507 {
1508 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1509 *charbuf++ = -c2;
1510 continue;
1511 }
1512 c = (endian == utf_16_big_endian
1513 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1514 if (surrogate)
1515 {
1516 if (! UTF_16_LOW_SURROGATE_P (c))
1517 {
1518 if (endian == utf_16_big_endian)
1519 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1520 else
1521 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1522 *charbuf++ = c1;
1523 *charbuf++ = c2;
1524 coding->errors++;
1525 if (UTF_16_HIGH_SURROGATE_P (c))
1526 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1527 else
1528 *charbuf++ = c;
1529 }
1530 else
1531 {
1532 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1533 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1534 *charbuf++ = 0x10000 + c;
1535 }
1536 }
1537 else
1538 {
1539 if (UTF_16_HIGH_SURROGATE_P (c))
1540 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1541 else
1542 *charbuf++ = c;
1543 }
1544 }
1545
1546 no_more_source:
1547 coding->consumed_char += consumed_chars_base;
1548 coding->consumed = src_base - coding->source;
1549 coding->charbuf_used = charbuf - coding->charbuf;
1550 }
1551
1552 static int
1553 encode_coding_utf_16 (coding)
1554 struct coding_system *coding;
1555 {
1556 int multibytep = coding->dst_multibyte;
1557 int *charbuf = coding->charbuf;
1558 int *charbuf_end = charbuf + coding->charbuf_used;
1559 unsigned char *dst = coding->destination + coding->produced;
1560 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1561 int safe_room = 8;
1562 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1563 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1564 int produced_chars = 0;
1565 Lisp_Object attrs, charset_list;
1566 int c;
1567
1568 CODING_GET_INFO (coding, attrs, charset_list);
1569
1570 if (bom != utf_16_without_bom)
1571 {
1572 ASSURE_DESTINATION (safe_room);
1573 if (big_endian)
1574 EMIT_TWO_BYTES (0xFE, 0xFF);
1575 else
1576 EMIT_TWO_BYTES (0xFF, 0xFE);
1577 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1578 }
1579
1580 while (charbuf < charbuf_end)
1581 {
1582 ASSURE_DESTINATION (safe_room);
1583 c = *charbuf++;
1584 if (c >= MAX_UNICODE_CHAR)
1585 c = coding->default_char;
1586
1587 if (c < 0x10000)
1588 {
1589 if (big_endian)
1590 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1591 else
1592 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1593 }
1594 else
1595 {
1596 int c1, c2;
1597
1598 c -= 0x10000;
1599 c1 = (c >> 10) + 0xD800;
1600 c2 = (c & 0x3FF) + 0xDC00;
1601 if (big_endian)
1602 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1603 else
1604 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1605 }
1606 }
1607 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1608 coding->produced = dst - coding->destination;
1609 coding->produced_char += produced_chars;
1610 return 0;
1611 }
1612
1613 \f
1614 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1615
1616 /* Emacs' internal format for representation of multiple character
1617 sets is a kind of multi-byte encoding, i.e. characters are
1618 represented by variable-length sequences of one-byte codes.
1619
1620 ASCII characters and control characters (e.g. `tab', `newline') are
1621 represented by one-byte sequences which are their ASCII codes, in
1622 the range 0x00 through 0x7F.
1623
1624 8-bit characters of the range 0x80..0x9F are represented by
1625 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1626 code + 0x20).
1627
1628 8-bit characters of the range 0xA0..0xFF are represented by
1629 one-byte sequences which are their 8-bit code.
1630
1631 The other characters are represented by a sequence of `base
1632 leading-code', optional `extended leading-code', and one or two
1633 `position-code's. The length of the sequence is determined by the
1634 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1635 whereas extended leading-code and position-code take the range 0xA0
1636 through 0xFF. See `charset.h' for more details about leading-code
1637 and position-code.
1638
1639 --- CODE RANGE of Emacs' internal format ---
1640 character set range
1641 ------------- -----
1642 ascii 0x00..0x7F
1643 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1644 eight-bit-graphic 0xA0..0xBF
1645 ELSE 0x81..0x9D + [0xA0..0xFF]+
1646 ---------------------------------------------
1647
1648 As this is the internal character representation, the format is
1649 usually not used externally (i.e. in a file or in a data sent to a
1650 process). But, it is possible to have a text externally in this
1651 format (i.e. by encoding by the coding system `emacs-mule').
1652
1653 In that case, a sequence of one-byte codes has a slightly different
1654 form.
1655
1656 At first, all characters in eight-bit-control are represented by
1657 one-byte sequences which are their 8-bit code.
1658
1659 Next, character composition data are represented by the byte
1660 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1661 where,
1662 METHOD is 0xF0 plus one of composition method (enum
1663 composition_method),
1664
1665 BYTES is 0xA0 plus a byte length of this composition data,
1666
1667 CHARS is 0x20 plus a number of characters composed by this
1668 data,
1669
1670 COMPONENTs are characters of multibye form or composition
1671 rules encoded by two-byte of ASCII codes.
1672
1673 In addition, for backward compatibility, the following formats are
1674 also recognized as composition data on decoding.
1675
1676 0x80 MSEQ ...
1677 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1678
1679 Here,
1680 MSEQ is a multibyte form but in these special format:
1681 ASCII: 0xA0 ASCII_CODE+0x80,
1682 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1683 RULE is a one byte code of the range 0xA0..0xF0 that
1684 represents a composition rule.
1685 */
1686
1687 char emacs_mule_bytes[256];
1688
1689 int
1690 emacs_mule_char (coding, src, nbytes, nchars, id)
1691 struct coding_system *coding;
1692 const unsigned char *src;
1693 int *nbytes, *nchars, *id;
1694 {
1695 const unsigned char *src_end = coding->source + coding->src_bytes;
1696 const unsigned char *src_base = src;
1697 int multibytep = coding->src_multibyte;
1698 struct charset *charset;
1699 unsigned code;
1700 int c;
1701 int consumed_chars = 0;
1702
1703 ONE_MORE_BYTE (c);
1704 if (c < 0)
1705 {
1706 c = -c;
1707 charset = emacs_mule_charset[0];
1708 }
1709 else
1710 {
1711 if (c >= 0xA0)
1712 {
1713 /* Old style component character of a compostion. */
1714 if (c == 0xA0)
1715 {
1716 ONE_MORE_BYTE (c);
1717 c -= 0x80;
1718 }
1719 else
1720 c -= 0x20;
1721 }
1722
1723 switch (emacs_mule_bytes[c])
1724 {
1725 case 2:
1726 if (! (charset = emacs_mule_charset[c]))
1727 goto invalid_code;
1728 ONE_MORE_BYTE (c);
1729 if (c < 0xA0)
1730 goto invalid_code;
1731 code = c & 0x7F;
1732 break;
1733
1734 case 3:
1735 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1736 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1737 {
1738 ONE_MORE_BYTE (c);
1739 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1740 goto invalid_code;
1741 ONE_MORE_BYTE (c);
1742 if (c < 0xA0)
1743 goto invalid_code;
1744 code = c & 0x7F;
1745 }
1746 else
1747 {
1748 if (! (charset = emacs_mule_charset[c]))
1749 goto invalid_code;
1750 ONE_MORE_BYTE (c);
1751 if (c < 0xA0)
1752 goto invalid_code;
1753 code = (c & 0x7F) << 8;
1754 ONE_MORE_BYTE (c);
1755 if (c < 0xA0)
1756 goto invalid_code;
1757 code |= c & 0x7F;
1758 }
1759 break;
1760
1761 case 4:
1762 ONE_MORE_BYTE (c);
1763 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1764 goto invalid_code;
1765 ONE_MORE_BYTE (c);
1766 if (c < 0xA0)
1767 goto invalid_code;
1768 code = (c & 0x7F) << 8;
1769 ONE_MORE_BYTE (c);
1770 if (c < 0xA0)
1771 goto invalid_code;
1772 code |= c & 0x7F;
1773 break;
1774
1775 case 1:
1776 code = c;
1777 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1778 ? charset_ascii : charset_eight_bit);
1779 break;
1780
1781 default:
1782 abort ();
1783 }
1784 c = DECODE_CHAR (charset, code);
1785 if (c < 0)
1786 goto invalid_code;
1787 }
1788 *nbytes = src - src_base;
1789 *nchars = consumed_chars;
1790 if (id)
1791 *id = charset->id;
1792 return c;
1793
1794 no_more_source:
1795 return -2;
1796
1797 invalid_code:
1798 return -1;
1799 }
1800
1801
1802 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1803 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1804 else return 0. */
1805
1806 static int
1807 detect_coding_emacs_mule (coding, detect_info)
1808 struct coding_system *coding;
1809 struct coding_detection_info *detect_info;
1810 {
1811 const unsigned char *src = coding->source, *src_base;
1812 const unsigned char *src_end = coding->source + coding->src_bytes;
1813 int multibytep = coding->src_multibyte;
1814 int consumed_chars = 0;
1815 int c;
1816 int found = 0;
1817
1818 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1819 /* A coding system of this category is always ASCII compatible. */
1820 src += coding->head_ascii;
1821
1822 while (1)
1823 {
1824 src_base = src;
1825 ONE_MORE_BYTE (c);
1826 if (c < 0)
1827 continue;
1828 if (c == 0x80)
1829 {
1830 /* Perhaps the start of composite character. We simple skip
1831 it because analyzing it is too heavy for detecting. But,
1832 at least, we check that the composite character
1833 constitues of more than 4 bytes. */
1834 const unsigned char *src_base;
1835
1836 repeat:
1837 src_base = src;
1838 do
1839 {
1840 ONE_MORE_BYTE (c);
1841 }
1842 while (c >= 0xA0);
1843
1844 if (src - src_base <= 4)
1845 break;
1846 found = CATEGORY_MASK_EMACS_MULE;
1847 if (c == 0x80)
1848 goto repeat;
1849 }
1850
1851 if (c < 0x80)
1852 {
1853 if (c < 0x20
1854 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1855 break;
1856 }
1857 else
1858 {
1859 int more_bytes = emacs_mule_bytes[*src_base] - 1;
1860
1861 while (more_bytes > 0)
1862 {
1863 ONE_MORE_BYTE (c);
1864 if (c < 0xA0)
1865 {
1866 src--; /* Unread the last byte. */
1867 break;
1868 }
1869 more_bytes--;
1870 }
1871 if (more_bytes != 0)
1872 break;
1873 found = CATEGORY_MASK_EMACS_MULE;
1874 }
1875 }
1876 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1877 return 0;
1878
1879 no_more_source:
1880 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1881 {
1882 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1883 return 0;
1884 }
1885 detect_info->found |= found;
1886 return 1;
1887 }
1888
1889
1890 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1891
1892 /* Decode a character represented as a component of composition
1893 sequence of Emacs 20/21 style at SRC. Set C to that character and
1894 update SRC to the head of next character (or an encoded composition
1895 rule). If SRC doesn't points a composition component, set C to -1.
1896 If SRC points an invalid byte sequence, global exit by a return
1897 value 0. */
1898
1899 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1900 if (1) \
1901 { \
1902 int c; \
1903 int nbytes, nchars; \
1904 \
1905 if (src == src_end) \
1906 break; \
1907 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1908 if (c < 0) \
1909 { \
1910 if (c == -2) \
1911 break; \
1912 goto invalid_code; \
1913 } \
1914 *buf++ = c; \
1915 src += nbytes; \
1916 consumed_chars += nchars; \
1917 } \
1918 else
1919
1920
1921 /* Decode a composition rule represented as a component of composition
1922 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1923 and increment BUF. If SRC points an invalid byte sequence, set C
1924 to -1. */
1925
1926 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1927 do { \
1928 int c, gref, nref; \
1929 \
1930 if (src >= src_end) \
1931 goto invalid_code; \
1932 ONE_MORE_BYTE_NO_CHECK (c); \
1933 c -= 0xA0; \
1934 if (c < 0 || c >= 81) \
1935 goto invalid_code; \
1936 \
1937 gref = c / 9, nref = c % 9; \
1938 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1939 } while (0)
1940
1941
1942 /* Decode a composition rule represented as a component of composition
1943 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1944 and increment BUF. If SRC points an invalid byte sequence, set C
1945 to -1. */
1946
1947 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1948 do { \
1949 int gref, nref; \
1950 \
1951 if (src + 1>= src_end) \
1952 goto invalid_code; \
1953 ONE_MORE_BYTE_NO_CHECK (gref); \
1954 gref -= 0x20; \
1955 ONE_MORE_BYTE_NO_CHECK (nref); \
1956 nref -= 0x20; \
1957 if (gref < 0 || gref >= 81 \
1958 || nref < 0 || nref >= 81) \
1959 goto invalid_code; \
1960 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1961 } while (0)
1962
1963
1964 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1965 do { \
1966 /* Emacs 21 style format. The first three bytes at SRC are \
1967 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1968 the byte length of this composition information, CHARS is the \
1969 number of characters composed by this composition. */ \
1970 enum composition_method method = c - 0xF2; \
1971 int *charbuf_base = charbuf; \
1972 int consumed_chars_limit; \
1973 int nbytes, nchars; \
1974 \
1975 ONE_MORE_BYTE (c); \
1976 if (c < 0) \
1977 goto invalid_code; \
1978 nbytes = c - 0xA0; \
1979 if (nbytes < 3) \
1980 goto invalid_code; \
1981 ONE_MORE_BYTE (c); \
1982 if (c < 0) \
1983 goto invalid_code; \
1984 nchars = c - 0xA0; \
1985 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
1986 consumed_chars_limit = consumed_chars_base + nbytes; \
1987 if (method != COMPOSITION_RELATIVE) \
1988 { \
1989 int i = 0; \
1990 while (consumed_chars < consumed_chars_limit) \
1991 { \
1992 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1993 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1994 else \
1995 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1996 i++; \
1997 } \
1998 if (consumed_chars < consumed_chars_limit) \
1999 goto invalid_code; \
2000 charbuf_base[0] -= i; \
2001 } \
2002 } while (0)
2003
2004
2005 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
2006 do { \
2007 /* Emacs 20 style format for relative composition. */ \
2008 /* Store multibyte form of characters to be composed. */ \
2009 enum composition_method method = COMPOSITION_RELATIVE; \
2010 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2011 int *buf = components; \
2012 int i, j; \
2013 \
2014 src = src_base; \
2015 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2016 for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
2017 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2018 if (i < 2) \
2019 goto invalid_code; \
2020 ADD_COMPOSITION_DATA (charbuf, i, method); \
2021 for (j = 0; j < i; j++) \
2022 *charbuf++ = components[j]; \
2023 } while (0)
2024
2025
2026 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2027 do { \
2028 /* Emacs 20 style format for rule-base composition. */ \
2029 /* Store multibyte form of characters to be composed. */ \
2030 enum composition_method method = COMPOSITION_WITH_RULE; \
2031 int *charbuf_base = charbuf; \
2032 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2033 int *buf = components; \
2034 int i, j; \
2035 \
2036 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2037 for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
2038 { \
2039 if (*src < 0xA0) \
2040 break; \
2041 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
2042 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2043 } \
2044 if (i <= 1 || (buf - components) % 2 == 0) \
2045 goto invalid_code; \
2046 if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
2047 goto no_more_source; \
2048 ADD_COMPOSITION_DATA (charbuf, i, method); \
2049 i = i * 2 - 1; \
2050 for (j = 0; j < i; j++) \
2051 *charbuf++ = components[j]; \
2052 charbuf_base[0] -= i; \
2053 for (j = 0; j < i; j += 2) \
2054 *charbuf++ = components[j]; \
2055 } while (0)
2056
2057
2058 static void
2059 decode_coding_emacs_mule (coding)
2060 struct coding_system *coding;
2061 {
2062 const unsigned char *src = coding->source + coding->consumed;
2063 const unsigned char *src_end = coding->source + coding->src_bytes;
2064 const unsigned char *src_base;
2065 int *charbuf = coding->charbuf + coding->charbuf_used;
2066 int *charbuf_end
2067 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2068 int consumed_chars = 0, consumed_chars_base;
2069 int multibytep = coding->src_multibyte;
2070 Lisp_Object attrs, charset_list;
2071 int char_offset = coding->produced_char;
2072 int last_offset = char_offset;
2073 int last_id = charset_ascii;
2074
2075 CODING_GET_INFO (coding, attrs, charset_list);
2076
2077 while (1)
2078 {
2079 int c;
2080
2081 src_base = src;
2082 consumed_chars_base = consumed_chars;
2083
2084 if (charbuf >= charbuf_end)
2085 break;
2086
2087 ONE_MORE_BYTE (c);
2088 if (c < 0)
2089 {
2090 *charbuf++ = -c;
2091 char_offset++;
2092 }
2093 else if (c < 0x80)
2094 {
2095 *charbuf++ = c;
2096 char_offset++;
2097 }
2098 else if (c == 0x80)
2099 {
2100 ONE_MORE_BYTE (c);
2101 if (c < 0)
2102 goto invalid_code;
2103 if (c - 0xF2 >= COMPOSITION_RELATIVE
2104 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2105 DECODE_EMACS_MULE_21_COMPOSITION (c);
2106 else if (c < 0xC0)
2107 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2108 else if (c == 0xFF)
2109 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2110 else
2111 goto invalid_code;
2112 }
2113 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2114 {
2115 int nbytes, nchars;
2116 int id;
2117
2118 src = src_base;
2119 consumed_chars = consumed_chars_base;
2120 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2121 if (c < 0)
2122 {
2123 if (c == -2)
2124 break;
2125 goto invalid_code;
2126 }
2127 if (last_id != id)
2128 {
2129 if (last_id != charset_ascii)
2130 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2131 last_id = id;
2132 last_offset = char_offset;
2133 }
2134 *charbuf++ = c;
2135 src += nbytes;
2136 consumed_chars += nchars;
2137 char_offset++;
2138 }
2139 else
2140 goto invalid_code;
2141 continue;
2142
2143 invalid_code:
2144 src = src_base;
2145 consumed_chars = consumed_chars_base;
2146 ONE_MORE_BYTE (c);
2147 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2148 char_offset++;
2149 coding->errors++;
2150 }
2151
2152 no_more_source:
2153 if (last_id != charset_ascii)
2154 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2155 coding->consumed_char += consumed_chars_base;
2156 coding->consumed = src_base - coding->source;
2157 coding->charbuf_used = charbuf - coding->charbuf;
2158 }
2159
2160
2161 #define EMACS_MULE_LEADING_CODES(id, codes) \
2162 do { \
2163 if (id < 0xA0) \
2164 codes[0] = id, codes[1] = 0; \
2165 else if (id < 0xE0) \
2166 codes[0] = 0x9A, codes[1] = id; \
2167 else if (id < 0xF0) \
2168 codes[0] = 0x9B, codes[1] = id; \
2169 else if (id < 0xF5) \
2170 codes[0] = 0x9C, codes[1] = id; \
2171 else \
2172 codes[0] = 0x9D, codes[1] = id; \
2173 } while (0);
2174
2175
2176 static int
2177 encode_coding_emacs_mule (coding)
2178 struct coding_system *coding;
2179 {
2180 int multibytep = coding->dst_multibyte;
2181 int *charbuf = coding->charbuf;
2182 int *charbuf_end = charbuf + coding->charbuf_used;
2183 unsigned char *dst = coding->destination + coding->produced;
2184 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2185 int safe_room = 8;
2186 int produced_chars = 0;
2187 Lisp_Object attrs, charset_list;
2188 int c;
2189 int preferred_charset_id = -1;
2190
2191 CODING_GET_INFO (coding, attrs, charset_list);
2192 if (! EQ (charset_list, Vemacs_mule_charset_list))
2193 {
2194 CODING_ATTR_CHARSET_LIST (attrs)
2195 = charset_list = Vemacs_mule_charset_list;
2196 }
2197
2198 while (charbuf < charbuf_end)
2199 {
2200 ASSURE_DESTINATION (safe_room);
2201 c = *charbuf++;
2202
2203 if (c < 0)
2204 {
2205 /* Handle an annotation. */
2206 switch (*charbuf)
2207 {
2208 case CODING_ANNOTATE_COMPOSITION_MASK:
2209 /* Not yet implemented. */
2210 break;
2211 case CODING_ANNOTATE_CHARSET_MASK:
2212 preferred_charset_id = charbuf[3];
2213 if (preferred_charset_id >= 0
2214 && NILP (Fmemq (make_number (preferred_charset_id),
2215 charset_list)))
2216 preferred_charset_id = -1;
2217 break;
2218 default:
2219 abort ();
2220 }
2221 charbuf += -c - 1;
2222 continue;
2223 }
2224
2225 if (ASCII_CHAR_P (c))
2226 EMIT_ONE_ASCII_BYTE (c);
2227 else if (CHAR_BYTE8_P (c))
2228 {
2229 c = CHAR_TO_BYTE8 (c);
2230 EMIT_ONE_BYTE (c);
2231 }
2232 else
2233 {
2234 struct charset *charset;
2235 unsigned code;
2236 int dimension;
2237 int emacs_mule_id;
2238 unsigned char leading_codes[2];
2239
2240 if (preferred_charset_id >= 0)
2241 {
2242 charset = CHARSET_FROM_ID (preferred_charset_id);
2243 if (! CHAR_CHARSET_P (c, charset))
2244 charset = char_charset (c, charset_list, NULL);
2245 }
2246 else
2247 charset = char_charset (c, charset_list, &code);
2248 if (! charset)
2249 {
2250 c = coding->default_char;
2251 if (ASCII_CHAR_P (c))
2252 {
2253 EMIT_ONE_ASCII_BYTE (c);
2254 continue;
2255 }
2256 charset = char_charset (c, charset_list, &code);
2257 }
2258 dimension = CHARSET_DIMENSION (charset);
2259 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2260 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2261 EMIT_ONE_BYTE (leading_codes[0]);
2262 if (leading_codes[1])
2263 EMIT_ONE_BYTE (leading_codes[1]);
2264 if (dimension == 1)
2265 EMIT_ONE_BYTE (code | 0x80);
2266 else
2267 {
2268 code |= 0x8080;
2269 EMIT_ONE_BYTE (code >> 8);
2270 EMIT_ONE_BYTE (code & 0xFF);
2271 }
2272 }
2273 }
2274 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2275 coding->produced_char += produced_chars;
2276 coding->produced = dst - coding->destination;
2277 return 0;
2278 }
2279
2280 \f
2281 /*** 7. ISO2022 handlers ***/
2282
2283 /* The following note describes the coding system ISO2022 briefly.
2284 Since the intention of this note is to help understand the
2285 functions in this file, some parts are NOT ACCURATE or are OVERLY
2286 SIMPLIFIED. For thorough understanding, please refer to the
2287 original document of ISO2022. This is equivalent to the standard
2288 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2289
2290 ISO2022 provides many mechanisms to encode several character sets
2291 in 7-bit and 8-bit environments. For 7-bit environments, all text
2292 is encoded using bytes less than 128. This may make the encoded
2293 text a little bit longer, but the text passes more easily through
2294 several types of gateway, some of which strip off the MSB (Most
2295 Significant Bit).
2296
2297 There are two kinds of character sets: control character sets and
2298 graphic character sets. The former contain control characters such
2299 as `newline' and `escape' to provide control functions (control
2300 functions are also provided by escape sequences). The latter
2301 contain graphic characters such as 'A' and '-'. Emacs recognizes
2302 two control character sets and many graphic character sets.
2303
2304 Graphic character sets are classified into one of the following
2305 four classes, according to the number of bytes (DIMENSION) and
2306 number of characters in one dimension (CHARS) of the set:
2307 - DIMENSION1_CHARS94
2308 - DIMENSION1_CHARS96
2309 - DIMENSION2_CHARS94
2310 - DIMENSION2_CHARS96
2311
2312 In addition, each character set is assigned an identification tag,
2313 unique for each set, called the "final character" (denoted as <F>
2314 hereafter). The <F> of each character set is decided by ECMA(*)
2315 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2316 (0x30..0x3F are for private use only).
2317
2318 Note (*): ECMA = European Computer Manufacturers Association
2319
2320 Here are examples of graphic character sets [NAME(<F>)]:
2321 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2322 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2323 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2324 o DIMENSION2_CHARS96 -- none for the moment
2325
2326 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2327 C0 [0x00..0x1F] -- control character plane 0
2328 GL [0x20..0x7F] -- graphic character plane 0
2329 C1 [0x80..0x9F] -- control character plane 1
2330 GR [0xA0..0xFF] -- graphic character plane 1
2331
2332 A control character set is directly designated and invoked to C0 or
2333 C1 by an escape sequence. The most common case is that:
2334 - ISO646's control character set is designated/invoked to C0, and
2335 - ISO6429's control character set is designated/invoked to C1,
2336 and usually these designations/invocations are omitted in encoded
2337 text. In a 7-bit environment, only C0 can be used, and a control
2338 character for C1 is encoded by an appropriate escape sequence to
2339 fit into the environment. All control characters for C1 are
2340 defined to have corresponding escape sequences.
2341
2342 A graphic character set is at first designated to one of four
2343 graphic registers (G0 through G3), then these graphic registers are
2344 invoked to GL or GR. These designations and invocations can be
2345 done independently. The most common case is that G0 is invoked to
2346 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2347 these invocations and designations are omitted in encoded text.
2348 In a 7-bit environment, only GL can be used.
2349
2350 When a graphic character set of CHARS94 is invoked to GL, codes
2351 0x20 and 0x7F of the GL area work as control characters SPACE and
2352 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2353 be used.
2354
2355 There are two ways of invocation: locking-shift and single-shift.
2356 With locking-shift, the invocation lasts until the next different
2357 invocation, whereas with single-shift, the invocation affects the
2358 following character only and doesn't affect the locking-shift
2359 state. Invocations are done by the following control characters or
2360 escape sequences:
2361
2362 ----------------------------------------------------------------------
2363 abbrev function cntrl escape seq description
2364 ----------------------------------------------------------------------
2365 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2366 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2367 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2368 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2369 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2370 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2371 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2372 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2373 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2374 ----------------------------------------------------------------------
2375 (*) These are not used by any known coding system.
2376
2377 Control characters for these functions are defined by macros
2378 ISO_CODE_XXX in `coding.h'.
2379
2380 Designations are done by the following escape sequences:
2381 ----------------------------------------------------------------------
2382 escape sequence description
2383 ----------------------------------------------------------------------
2384 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2385 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2386 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2387 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2388 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2389 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2390 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2391 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2392 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2393 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2394 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2395 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2396 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2397 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2398 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2399 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2400 ----------------------------------------------------------------------
2401
2402 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2403 of dimension 1, chars 94, and final character <F>, etc...
2404
2405 Note (*): Although these designations are not allowed in ISO2022,
2406 Emacs accepts them on decoding, and produces them on encoding
2407 CHARS96 character sets in a coding system which is characterized as
2408 7-bit environment, non-locking-shift, and non-single-shift.
2409
2410 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2411 '(' must be omitted. We refer to this as "short-form" hereafter.
2412
2413 Now you may notice that there are a lot of ways of encoding the
2414 same multilingual text in ISO2022. Actually, there exist many
2415 coding systems such as Compound Text (used in X11's inter client
2416 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2417 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2418 localized platforms), and all of these are variants of ISO2022.
2419
2420 In addition to the above, Emacs handles two more kinds of escape
2421 sequences: ISO6429's direction specification and Emacs' private
2422 sequence for specifying character composition.
2423
2424 ISO6429's direction specification takes the following form:
2425 o CSI ']' -- end of the current direction
2426 o CSI '0' ']' -- end of the current direction
2427 o CSI '1' ']' -- start of left-to-right text
2428 o CSI '2' ']' -- start of right-to-left text
2429 The control character CSI (0x9B: control sequence introducer) is
2430 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2431
2432 Character composition specification takes the following form:
2433 o ESC '0' -- start relative composition
2434 o ESC '1' -- end composition
2435 o ESC '2' -- start rule-base composition (*)
2436 o ESC '3' -- start relative composition with alternate chars (**)
2437 o ESC '4' -- start rule-base composition with alternate chars (**)
2438 Since these are not standard escape sequences of any ISO standard,
2439 the use of them with these meanings is restricted to Emacs only.
2440
2441 (*) This form is used only in Emacs 20.7 and older versions,
2442 but newer versions can safely decode it.
2443 (**) This form is used only in Emacs 21.1 and newer versions,
2444 and older versions can't decode it.
2445
2446 Here's a list of example usages of these composition escape
2447 sequences (categorized by `enum composition_method').
2448
2449 COMPOSITION_RELATIVE:
2450 ESC 0 CHAR [ CHAR ] ESC 1
2451 COMPOSITION_WITH_RULE:
2452 ESC 2 CHAR [ RULE CHAR ] ESC 1
2453 COMPOSITION_WITH_ALTCHARS:
2454 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2455 COMPOSITION_WITH_RULE_ALTCHARS:
2456 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2457
2458 enum iso_code_class_type iso_code_class[256];
2459
2460 #define SAFE_CHARSET_P(coding, id) \
2461 ((id) <= (coding)->max_charset_id \
2462 && (coding)->safe_charsets[id] >= 0)
2463
2464
2465 #define SHIFT_OUT_OK(category) \
2466 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2467
2468 static void
2469 setup_iso_safe_charsets (attrs)
2470 Lisp_Object attrs;
2471 {
2472 Lisp_Object charset_list, safe_charsets;
2473 Lisp_Object request;
2474 Lisp_Object reg_usage;
2475 Lisp_Object tail;
2476 int reg94, reg96;
2477 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2478 int max_charset_id;
2479
2480 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2481 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2482 && ! EQ (charset_list, Viso_2022_charset_list))
2483 {
2484 CODING_ATTR_CHARSET_LIST (attrs)
2485 = charset_list = Viso_2022_charset_list;
2486 ASET (attrs, coding_attr_safe_charsets, Qnil);
2487 }
2488
2489 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2490 return;
2491
2492 max_charset_id = 0;
2493 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2494 {
2495 int id = XINT (XCAR (tail));
2496 if (max_charset_id < id)
2497 max_charset_id = id;
2498 }
2499
2500 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2501 make_number (255));
2502 request = AREF (attrs, coding_attr_iso_request);
2503 reg_usage = AREF (attrs, coding_attr_iso_usage);
2504 reg94 = XINT (XCAR (reg_usage));
2505 reg96 = XINT (XCDR (reg_usage));
2506
2507 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2508 {
2509 Lisp_Object id;
2510 Lisp_Object reg;
2511 struct charset *charset;
2512
2513 id = XCAR (tail);
2514 charset = CHARSET_FROM_ID (XINT (id));
2515 reg = Fcdr (Fassq (id, request));
2516 if (! NILP (reg))
2517 SSET (safe_charsets, XINT (id), XINT (reg));
2518 else if (charset->iso_chars_96)
2519 {
2520 if (reg96 < 4)
2521 SSET (safe_charsets, XINT (id), reg96);
2522 }
2523 else
2524 {
2525 if (reg94 < 4)
2526 SSET (safe_charsets, XINT (id), reg94);
2527 }
2528 }
2529 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2530 }
2531
2532
2533 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2534 Check if a text is encoded in one of ISO-2022 based codig systems.
2535 If it is, return 1, else return 0. */
2536
2537 static int
2538 detect_coding_iso_2022 (coding, detect_info)
2539 struct coding_system *coding;
2540 struct coding_detection_info *detect_info;
2541 {
2542 const unsigned char *src = coding->source, *src_base = src;
2543 const unsigned char *src_end = coding->source + coding->src_bytes;
2544 int multibytep = coding->src_multibyte;
2545 int single_shifting = 0;
2546 int id;
2547 int c, c1;
2548 int consumed_chars = 0;
2549 int i;
2550 int rejected = 0;
2551 int found = 0;
2552
2553 detect_info->checked |= CATEGORY_MASK_ISO;
2554
2555 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2556 {
2557 struct coding_system *this = &(coding_categories[i]);
2558 Lisp_Object attrs, val;
2559
2560 attrs = CODING_ID_ATTRS (this->id);
2561 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2562 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2563 setup_iso_safe_charsets (attrs);
2564 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2565 this->max_charset_id = SCHARS (val) - 1;
2566 this->safe_charsets = (char *) SDATA (val);
2567 }
2568
2569 /* A coding system of this category is always ASCII compatible. */
2570 src += coding->head_ascii;
2571
2572 while (rejected != CATEGORY_MASK_ISO)
2573 {
2574 src_base = src;
2575 ONE_MORE_BYTE (c);
2576 switch (c)
2577 {
2578 case ISO_CODE_ESC:
2579 if (inhibit_iso_escape_detection)
2580 break;
2581 single_shifting = 0;
2582 ONE_MORE_BYTE (c);
2583 if (c >= '(' && c <= '/')
2584 {
2585 /* Designation sequence for a charset of dimension 1. */
2586 ONE_MORE_BYTE (c1);
2587 if (c1 < ' ' || c1 >= 0x80
2588 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2589 /* Invalid designation sequence. Just ignore. */
2590 break;
2591 }
2592 else if (c == '$')
2593 {
2594 /* Designation sequence for a charset of dimension 2. */
2595 ONE_MORE_BYTE (c);
2596 if (c >= '@' && c <= 'B')
2597 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2598 id = iso_charset_table[1][0][c];
2599 else if (c >= '(' && c <= '/')
2600 {
2601 ONE_MORE_BYTE (c1);
2602 if (c1 < ' ' || c1 >= 0x80
2603 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2604 /* Invalid designation sequence. Just ignore. */
2605 break;
2606 }
2607 else
2608 /* Invalid designation sequence. Just ignore it. */
2609 break;
2610 }
2611 else if (c == 'N' || c == 'O')
2612 {
2613 /* ESC <Fe> for SS2 or SS3. */
2614 single_shifting = 1;
2615 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2616 break;
2617 }
2618 else if (c >= '0' && c <= '4')
2619 {
2620 /* ESC <Fp> for start/end composition. */
2621 found |= CATEGORY_MASK_ISO;
2622 break;
2623 }
2624 else
2625 {
2626 /* Invalid escape sequence. Just ignore it. */
2627 break;
2628 }
2629
2630 /* We found a valid designation sequence for CHARSET. */
2631 rejected |= CATEGORY_MASK_ISO_8BIT;
2632 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2633 id))
2634 found |= CATEGORY_MASK_ISO_7;
2635 else
2636 rejected |= CATEGORY_MASK_ISO_7;
2637 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2638 id))
2639 found |= CATEGORY_MASK_ISO_7_TIGHT;
2640 else
2641 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2642 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2643 id))
2644 found |= CATEGORY_MASK_ISO_7_ELSE;
2645 else
2646 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2647 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2648 id))
2649 found |= CATEGORY_MASK_ISO_8_ELSE;
2650 else
2651 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2652 break;
2653
2654 case ISO_CODE_SO:
2655 case ISO_CODE_SI:
2656 /* Locking shift out/in. */
2657 if (inhibit_iso_escape_detection)
2658 break;
2659 single_shifting = 0;
2660 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2661 found |= CATEGORY_MASK_ISO_ELSE;
2662 break;
2663
2664 case ISO_CODE_CSI:
2665 /* Control sequence introducer. */
2666 single_shifting = 0;
2667 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2668 found |= CATEGORY_MASK_ISO_8_ELSE;
2669 goto check_extra_latin;
2670
2671 case ISO_CODE_SS2:
2672 case ISO_CODE_SS3:
2673 /* Single shift. */
2674 if (inhibit_iso_escape_detection)
2675 break;
2676 single_shifting = 0;
2677 rejected |= CATEGORY_MASK_ISO_7BIT;
2678 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2679 & CODING_ISO_FLAG_SINGLE_SHIFT)
2680 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2681 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2682 & CODING_ISO_FLAG_SINGLE_SHIFT)
2683 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2684 if (single_shifting)
2685 break;
2686 goto check_extra_latin;
2687
2688 default:
2689 if (c < 0)
2690 continue;
2691 if (c < 0x80)
2692 {
2693 single_shifting = 0;
2694 break;
2695 }
2696 if (c >= 0xA0)
2697 {
2698 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2699 found |= CATEGORY_MASK_ISO_8_1;
2700 /* Check the length of succeeding codes of the range
2701 0xA0..0FF. If the byte length is even, we include
2702 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2703 only when we are not single shifting. */
2704 if (! single_shifting
2705 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2706 {
2707 int i = 1;
2708 while (src < src_end)
2709 {
2710 ONE_MORE_BYTE (c);
2711 if (c < 0xA0)
2712 break;
2713 i++;
2714 }
2715
2716 if (i & 1 && src < src_end)
2717 rejected |= CATEGORY_MASK_ISO_8_2;
2718 else
2719 found |= CATEGORY_MASK_ISO_8_2;
2720 }
2721 break;
2722 }
2723 check_extra_latin:
2724 single_shifting = 0;
2725 if (! VECTORP (Vlatin_extra_code_table)
2726 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2727 {
2728 rejected = CATEGORY_MASK_ISO;
2729 break;
2730 }
2731 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2732 & CODING_ISO_FLAG_LATIN_EXTRA)
2733 found |= CATEGORY_MASK_ISO_8_1;
2734 else
2735 rejected |= CATEGORY_MASK_ISO_8_1;
2736 rejected |= CATEGORY_MASK_ISO_8_2;
2737 }
2738 }
2739 detect_info->rejected |= CATEGORY_MASK_ISO;
2740 return 0;
2741
2742 no_more_source:
2743 detect_info->rejected |= rejected;
2744 detect_info->found |= (found & ~rejected);
2745 return 1;
2746 }
2747
2748
2749 /* Set designation state into CODING. Set CHARS_96 to -1 if the
2750 escape sequence should be kept. */
2751 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2752 do { \
2753 int id, prev; \
2754 \
2755 if (final < '0' || final >= 128 \
2756 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2757 || !SAFE_CHARSET_P (coding, id)) \
2758 { \
2759 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2760 chars_96 = -1; \
2761 break; \
2762 } \
2763 prev = CODING_ISO_DESIGNATION (coding, reg); \
2764 if (id == charset_jisx0201_roman) \
2765 { \
2766 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2767 id = charset_ascii; \
2768 } \
2769 else if (id == charset_jisx0208_1978) \
2770 { \
2771 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2772 id = charset_jisx0208; \
2773 } \
2774 CODING_ISO_DESIGNATION (coding, reg) = id; \
2775 /* If there was an invalid designation to REG previously, and this \
2776 designation is ASCII to REG, we should keep this designation \
2777 sequence. */ \
2778 if (prev == -2 && id == charset_ascii) \
2779 chars_96 = -1; \
2780 } while (0)
2781
2782
2783 #define MAYBE_FINISH_COMPOSITION() \
2784 do { \
2785 int i; \
2786 if (composition_state == COMPOSING_NO) \
2787 break; \
2788 /* It is assured that we have enough room for producing \
2789 characters stored in the table `components'. */ \
2790 if (charbuf + component_idx > charbuf_end) \
2791 goto no_more_source; \
2792 composition_state = COMPOSING_NO; \
2793 if (method == COMPOSITION_RELATIVE \
2794 || method == COMPOSITION_WITH_ALTCHARS) \
2795 { \
2796 for (i = 0; i < component_idx; i++) \
2797 *charbuf++ = components[i]; \
2798 char_offset += component_idx; \
2799 } \
2800 else \
2801 { \
2802 for (i = 0; i < component_idx; i += 2) \
2803 *charbuf++ = components[i]; \
2804 char_offset += (component_idx / 2) + 1; \
2805 } \
2806 } while (0)
2807
2808
2809 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2810 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2811 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2812 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2813 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2814 */
2815
2816 #define DECODE_COMPOSITION_START(c1) \
2817 do { \
2818 if (c1 == '0' \
2819 && composition_state == COMPOSING_COMPONENT_RULE) \
2820 { \
2821 component_len = component_idx; \
2822 composition_state = COMPOSING_CHAR; \
2823 } \
2824 else \
2825 { \
2826 const unsigned char *p; \
2827 \
2828 MAYBE_FINISH_COMPOSITION (); \
2829 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2830 goto no_more_source; \
2831 for (p = src; p < src_end - 1; p++) \
2832 if (*p == ISO_CODE_ESC && p[1] == '1') \
2833 break; \
2834 if (p == src_end - 1) \
2835 { \
2836 /* The current composition doesn't end in the current \
2837 source. */ \
2838 record_conversion_result \
2839 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
2840 goto no_more_source; \
2841 } \
2842 \
2843 /* This is surely the start of a composition. */ \
2844 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2845 : c1 == '2' ? COMPOSITION_WITH_RULE \
2846 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2847 : COMPOSITION_WITH_RULE_ALTCHARS); \
2848 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2849 : COMPOSING_COMPONENT_CHAR); \
2850 component_idx = component_len = 0; \
2851 } \
2852 } while (0)
2853
2854
2855 /* Handle compositoin end sequence ESC 1. */
2856
2857 #define DECODE_COMPOSITION_END() \
2858 do { \
2859 int nchars = (component_len > 0 ? component_idx - component_len \
2860 : method == COMPOSITION_RELATIVE ? component_idx \
2861 : (component_idx + 1) / 2); \
2862 int i; \
2863 int *saved_charbuf = charbuf; \
2864 \
2865 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2866 if (method != COMPOSITION_RELATIVE) \
2867 { \
2868 if (component_len == 0) \
2869 for (i = 0; i < component_idx; i++) \
2870 *charbuf++ = components[i]; \
2871 else \
2872 for (i = 0; i < component_len; i++) \
2873 *charbuf++ = components[i]; \
2874 *saved_charbuf = saved_charbuf - charbuf; \
2875 } \
2876 if (method == COMPOSITION_WITH_RULE) \
2877 for (i = 0; i < component_idx; i += 2, char_offset++) \
2878 *charbuf++ = components[i]; \
2879 else \
2880 for (i = component_len; i < component_idx; i++, char_offset++) \
2881 *charbuf++ = components[i]; \
2882 coding->annotated = 1; \
2883 composition_state = COMPOSING_NO; \
2884 } while (0)
2885
2886
2887 /* Decode a composition rule from the byte C1 (and maybe one more byte
2888 from SRC) and store one encoded composition rule in
2889 coding->cmp_data. */
2890
2891 #define DECODE_COMPOSITION_RULE(c1) \
2892 do { \
2893 (c1) -= 32; \
2894 if (c1 < 81) /* old format (before ver.21) */ \
2895 { \
2896 int gref = (c1) / 9; \
2897 int nref = (c1) % 9; \
2898 if (gref == 4) gref = 10; \
2899 if (nref == 4) nref = 10; \
2900 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2901 } \
2902 else if (c1 < 93) /* new format (after ver.21) */ \
2903 { \
2904 ONE_MORE_BYTE (c2); \
2905 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2906 } \
2907 else \
2908 c1 = 0; \
2909 } while (0)
2910
2911
2912 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2913
2914 static void
2915 decode_coding_iso_2022 (coding)
2916 struct coding_system *coding;
2917 {
2918 const unsigned char *src = coding->source + coding->consumed;
2919 const unsigned char *src_end = coding->source + coding->src_bytes;
2920 const unsigned char *src_base;
2921 int *charbuf = coding->charbuf + coding->charbuf_used;
2922 int *charbuf_end
2923 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2924 int consumed_chars = 0, consumed_chars_base;
2925 int multibytep = coding->src_multibyte;
2926 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2927 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2928 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2929 int charset_id_2, charset_id_3;
2930 struct charset *charset;
2931 int c;
2932 /* For handling composition sequence. */
2933 #define COMPOSING_NO 0
2934 #define COMPOSING_CHAR 1
2935 #define COMPOSING_RULE 2
2936 #define COMPOSING_COMPONENT_CHAR 3
2937 #define COMPOSING_COMPONENT_RULE 4
2938
2939 int composition_state = COMPOSING_NO;
2940 enum composition_method method;
2941 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2942 int component_idx;
2943 int component_len;
2944 Lisp_Object attrs, charset_list;
2945 int char_offset = coding->produced_char;
2946 int last_offset = char_offset;
2947 int last_id = charset_ascii;
2948
2949 CODING_GET_INFO (coding, attrs, charset_list);
2950 setup_iso_safe_charsets (attrs);
2951
2952 while (1)
2953 {
2954 int c1, c2;
2955
2956 src_base = src;
2957 consumed_chars_base = consumed_chars;
2958
2959 if (charbuf >= charbuf_end)
2960 break;
2961
2962 ONE_MORE_BYTE (c1);
2963 if (c1 < 0)
2964 goto invalid_code;
2965
2966 /* We produce at most one character. */
2967 switch (iso_code_class [c1])
2968 {
2969 case ISO_0x20_or_0x7F:
2970 if (composition_state != COMPOSING_NO)
2971 {
2972 if (composition_state == COMPOSING_RULE
2973 || composition_state == COMPOSING_COMPONENT_RULE)
2974 {
2975 DECODE_COMPOSITION_RULE (c1);
2976 components[component_idx++] = c1;
2977 composition_state--;
2978 continue;
2979 }
2980 }
2981 if (charset_id_0 < 0
2982 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2983 /* This is SPACE or DEL. */
2984 charset = CHARSET_FROM_ID (charset_ascii);
2985 else
2986 charset = CHARSET_FROM_ID (charset_id_0);
2987 break;
2988
2989 case ISO_graphic_plane_0:
2990 if (composition_state != COMPOSING_NO)
2991 {
2992 if (composition_state == COMPOSING_RULE
2993 || composition_state == COMPOSING_COMPONENT_RULE)
2994 {
2995 DECODE_COMPOSITION_RULE (c1);
2996 components[component_idx++] = c1;
2997 composition_state--;
2998 continue;
2999 }
3000 }
3001 if (charset_id_0 < 0)
3002 charset = CHARSET_FROM_ID (charset_ascii);
3003 else
3004 charset = CHARSET_FROM_ID (charset_id_0);
3005 break;
3006
3007 case ISO_0xA0_or_0xFF:
3008 if (charset_id_1 < 0
3009 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3010 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3011 goto invalid_code;
3012 /* This is a graphic character, we fall down ... */
3013
3014 case ISO_graphic_plane_1:
3015 if (charset_id_1 < 0)
3016 goto invalid_code;
3017 charset = CHARSET_FROM_ID (charset_id_1);
3018 break;
3019
3020 case ISO_control_0:
3021 MAYBE_FINISH_COMPOSITION ();
3022 charset = CHARSET_FROM_ID (charset_ascii);
3023 break;
3024
3025 case ISO_control_1:
3026 MAYBE_FINISH_COMPOSITION ();
3027 goto invalid_code;
3028
3029 case ISO_shift_out:
3030 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3031 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3032 goto invalid_code;
3033 CODING_ISO_INVOCATION (coding, 0) = 1;
3034 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3035 continue;
3036
3037 case ISO_shift_in:
3038 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3039 goto invalid_code;
3040 CODING_ISO_INVOCATION (coding, 0) = 0;
3041 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3042 continue;
3043
3044 case ISO_single_shift_2_7:
3045 case ISO_single_shift_2:
3046 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3047 goto invalid_code;
3048 /* SS2 is handled as an escape sequence of ESC 'N' */
3049 c1 = 'N';
3050 goto label_escape_sequence;
3051
3052 case ISO_single_shift_3:
3053 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3054 goto invalid_code;
3055 /* SS2 is handled as an escape sequence of ESC 'O' */
3056 c1 = 'O';
3057 goto label_escape_sequence;
3058
3059 case ISO_control_sequence_introducer:
3060 /* CSI is handled as an escape sequence of ESC '[' ... */
3061 c1 = '[';
3062 goto label_escape_sequence;
3063
3064 case ISO_escape:
3065 ONE_MORE_BYTE (c1);
3066 label_escape_sequence:
3067 /* Escape sequences handled here are invocation,
3068 designation, direction specification, and character
3069 composition specification. */
3070 switch (c1)
3071 {
3072 case '&': /* revision of following character set */
3073 ONE_MORE_BYTE (c1);
3074 if (!(c1 >= '@' && c1 <= '~'))
3075 goto invalid_code;
3076 ONE_MORE_BYTE (c1);
3077 if (c1 != ISO_CODE_ESC)
3078 goto invalid_code;
3079 ONE_MORE_BYTE (c1);
3080 goto label_escape_sequence;
3081
3082 case '$': /* designation of 2-byte character set */
3083 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3084 goto invalid_code;
3085 {
3086 int reg, chars96;
3087
3088 ONE_MORE_BYTE (c1);
3089 if (c1 >= '@' && c1 <= 'B')
3090 { /* designation of JISX0208.1978, GB2312.1980,
3091 or JISX0208.1980 */
3092 reg = 0, chars96 = 0;
3093 }
3094 else if (c1 >= 0x28 && c1 <= 0x2B)
3095 { /* designation of DIMENSION2_CHARS94 character set */
3096 reg = c1 - 0x28, chars96 = 0;
3097 ONE_MORE_BYTE (c1);
3098 }
3099 else if (c1 >= 0x2C && c1 <= 0x2F)
3100 { /* designation of DIMENSION2_CHARS96 character set */
3101 reg = c1 - 0x2C, chars96 = 1;
3102 ONE_MORE_BYTE (c1);
3103 }
3104 else
3105 goto invalid_code;
3106 DECODE_DESIGNATION (reg, 2, chars96, c1);
3107 /* We must update these variables now. */
3108 if (reg == 0)
3109 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3110 else if (reg == 1)
3111 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3112 if (chars96 < 0)
3113 goto invalid_code;
3114 }
3115 continue;
3116
3117 case 'n': /* invocation of locking-shift-2 */
3118 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3119 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3120 goto invalid_code;
3121 CODING_ISO_INVOCATION (coding, 0) = 2;
3122 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3123 continue;
3124
3125 case 'o': /* invocation of locking-shift-3 */
3126 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3127 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3128 goto invalid_code;
3129 CODING_ISO_INVOCATION (coding, 0) = 3;
3130 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3131 continue;
3132
3133 case 'N': /* invocation of single-shift-2 */
3134 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3135 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3136 goto invalid_code;
3137 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3138 if (charset_id_2 < 0)
3139 charset = CHARSET_FROM_ID (charset_ascii);
3140 else
3141 charset = CHARSET_FROM_ID (charset_id_2);
3142 ONE_MORE_BYTE (c1);
3143 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3144 goto invalid_code;
3145 break;
3146
3147 case 'O': /* invocation of single-shift-3 */
3148 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3149 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3150 goto invalid_code;
3151 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3152 if (charset_id_3 < 0)
3153 charset = CHARSET_FROM_ID (charset_ascii);
3154 else
3155 charset = CHARSET_FROM_ID (charset_id_3);
3156 ONE_MORE_BYTE (c1);
3157 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3158 goto invalid_code;
3159 break;
3160
3161 case '0': case '2': case '3': case '4': /* start composition */
3162 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3163 goto invalid_code;
3164 DECODE_COMPOSITION_START (c1);
3165 continue;
3166
3167 case '1': /* end composition */
3168 if (composition_state == COMPOSING_NO)
3169 goto invalid_code;
3170 DECODE_COMPOSITION_END ();
3171 continue;
3172
3173 case '[': /* specification of direction */
3174 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3175 goto invalid_code;
3176 /* For the moment, nested direction is not supported.
3177 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3178 left-to-right, and nozero means right-to-left. */
3179 ONE_MORE_BYTE (c1);
3180 switch (c1)
3181 {
3182 case ']': /* end of the current direction */
3183 coding->mode &= ~CODING_MODE_DIRECTION;
3184
3185 case '0': /* end of the current direction */
3186 case '1': /* start of left-to-right direction */
3187 ONE_MORE_BYTE (c1);
3188 if (c1 == ']')
3189 coding->mode &= ~CODING_MODE_DIRECTION;
3190 else
3191 goto invalid_code;
3192 break;
3193
3194 case '2': /* start of right-to-left direction */
3195 ONE_MORE_BYTE (c1);
3196 if (c1 == ']')
3197 coding->mode |= CODING_MODE_DIRECTION;
3198 else
3199 goto invalid_code;
3200 break;
3201
3202 default:
3203 goto invalid_code;
3204 }
3205 continue;
3206
3207 case '%':
3208 ONE_MORE_BYTE (c1);
3209 if (c1 == '/')
3210 {
3211 /* CTEXT extended segment:
3212 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3213 We keep these bytes as is for the moment.
3214 They may be decoded by post-read-conversion. */
3215 int dim, M, L;
3216 int size;
3217
3218 ONE_MORE_BYTE (dim);
3219 ONE_MORE_BYTE (M);
3220 ONE_MORE_BYTE (L);
3221 size = ((M - 128) * 128) + (L - 128);
3222 if (charbuf + 8 + size > charbuf_end)
3223 goto break_loop;
3224 *charbuf++ = ISO_CODE_ESC;
3225 *charbuf++ = '%';
3226 *charbuf++ = '/';
3227 *charbuf++ = dim;
3228 *charbuf++ = BYTE8_TO_CHAR (M);
3229 *charbuf++ = BYTE8_TO_CHAR (L);
3230 while (size-- > 0)
3231 {
3232 ONE_MORE_BYTE (c1);
3233 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3234 }
3235 }
3236 else if (c1 == 'G')
3237 {
3238 /* XFree86 extension for embedding UTF-8 in CTEXT:
3239 ESC % G --UTF-8-BYTES-- ESC % @
3240 We keep these bytes as is for the moment.
3241 They may be decoded by post-read-conversion. */
3242 int *p = charbuf;
3243
3244 if (p + 6 > charbuf_end)
3245 goto break_loop;
3246 *p++ = ISO_CODE_ESC;
3247 *p++ = '%';
3248 *p++ = 'G';
3249 while (p < charbuf_end)
3250 {
3251 ONE_MORE_BYTE (c1);
3252 if (c1 == ISO_CODE_ESC
3253 && src + 1 < src_end
3254 && src[0] == '%'
3255 && src[1] == '@')
3256 {
3257 src += 2;
3258 break;
3259 }
3260 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3261 }
3262 if (p + 3 > charbuf_end)
3263 goto break_loop;
3264 *p++ = ISO_CODE_ESC;
3265 *p++ = '%';
3266 *p++ = '@';
3267 charbuf = p;
3268 }
3269 else
3270 goto invalid_code;
3271 continue;
3272 break;
3273
3274 default:
3275 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3276 goto invalid_code;
3277 {
3278 int reg, chars96;
3279
3280 if (c1 >= 0x28 && c1 <= 0x2B)
3281 { /* designation of DIMENSION1_CHARS94 character set */
3282 reg = c1 - 0x28, chars96 = 0;
3283 ONE_MORE_BYTE (c1);
3284 }
3285 else if (c1 >= 0x2C && c1 <= 0x2F)
3286 { /* designation of DIMENSION1_CHARS96 character set */
3287 reg = c1 - 0x2C, chars96 = 1;
3288 ONE_MORE_BYTE (c1);
3289 }
3290 else
3291 goto invalid_code;
3292 DECODE_DESIGNATION (reg, 1, chars96, c1);
3293 /* We must update these variables now. */
3294 if (reg == 0)
3295 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3296 else if (reg == 1)
3297 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3298 if (chars96 < 0)
3299 goto invalid_code;
3300 }
3301 continue;
3302 }
3303 }
3304
3305 if (charset->id != charset_ascii
3306 && last_id != charset->id)
3307 {
3308 if (last_id != charset_ascii)
3309 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3310 last_id = charset->id;
3311 last_offset = char_offset;
3312 }
3313
3314 /* Now we know CHARSET and 1st position code C1 of a character.
3315 Produce a decoded character while getting 2nd position code
3316 C2 if necessary. */
3317 c1 &= 0x7F;
3318 if (CHARSET_DIMENSION (charset) > 1)
3319 {
3320 ONE_MORE_BYTE (c2);
3321 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3322 /* C2 is not in a valid range. */
3323 goto invalid_code;
3324 c1 = (c1 << 8) | (c2 & 0x7F);
3325 if (CHARSET_DIMENSION (charset) > 2)
3326 {
3327 ONE_MORE_BYTE (c2);
3328 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3329 /* C2 is not in a valid range. */
3330 goto invalid_code;
3331 c1 = (c1 << 8) | (c2 & 0x7F);
3332 }
3333 }
3334
3335 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3336 if (c < 0)
3337 {
3338 MAYBE_FINISH_COMPOSITION ();
3339 for (; src_base < src; src_base++, char_offset++)
3340 {
3341 if (ASCII_BYTE_P (*src_base))
3342 *charbuf++ = *src_base;
3343 else
3344 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3345 }
3346 }
3347 else if (composition_state == COMPOSING_NO)
3348 {
3349 *charbuf++ = c;
3350 char_offset++;
3351 }
3352 else
3353 {
3354 components[component_idx++] = c;
3355 if (method == COMPOSITION_WITH_RULE
3356 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3357 && composition_state == COMPOSING_COMPONENT_CHAR))
3358 composition_state++;
3359 }
3360 continue;
3361
3362 invalid_code:
3363 MAYBE_FINISH_COMPOSITION ();
3364 src = src_base;
3365 consumed_chars = consumed_chars_base;
3366 ONE_MORE_BYTE (c);
3367 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3368 char_offset++;
3369 coding->errors++;
3370 continue;
3371
3372 break_loop:
3373 break;
3374 }
3375
3376 no_more_source:
3377 if (last_id != charset_ascii)
3378 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3379 coding->consumed_char += consumed_chars_base;
3380 coding->consumed = src_base - coding->source;
3381 coding->charbuf_used = charbuf - coding->charbuf;
3382 }
3383
3384
3385 /* ISO2022 encoding stuff. */
3386
3387 /*
3388 It is not enough to say just "ISO2022" on encoding, we have to
3389 specify more details. In Emacs, each coding system of ISO2022
3390 variant has the following specifications:
3391 1. Initial designation to G0 thru G3.
3392 2. Allows short-form designation?
3393 3. ASCII should be designated to G0 before control characters?
3394 4. ASCII should be designated to G0 at end of line?
3395 5. 7-bit environment or 8-bit environment?
3396 6. Use locking-shift?
3397 7. Use Single-shift?
3398 And the following two are only for Japanese:
3399 8. Use ASCII in place of JIS0201-1976-Roman?
3400 9. Use JISX0208-1983 in place of JISX0208-1978?
3401 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3402 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3403 details.
3404 */
3405
3406 /* Produce codes (escape sequence) for designating CHARSET to graphic
3407 register REG at DST, and increment DST. If <final-char> of CHARSET is
3408 '@', 'A', or 'B' and the coding system CODING allows, produce
3409 designation sequence of short-form. */
3410
3411 #define ENCODE_DESIGNATION(charset, reg, coding) \
3412 do { \
3413 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3414 char *intermediate_char_94 = "()*+"; \
3415 char *intermediate_char_96 = ",-./"; \
3416 int revision = -1; \
3417 int c; \
3418 \
3419 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3420 revision = CHARSET_ISO_REVISION (charset); \
3421 \
3422 if (revision >= 0) \
3423 { \
3424 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3425 EMIT_ONE_BYTE ('@' + revision); \
3426 } \
3427 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3428 if (CHARSET_DIMENSION (charset) == 1) \
3429 { \
3430 if (! CHARSET_ISO_CHARS_96 (charset)) \
3431 c = intermediate_char_94[reg]; \
3432 else \
3433 c = intermediate_char_96[reg]; \
3434 EMIT_ONE_ASCII_BYTE (c); \
3435 } \
3436 else \
3437 { \
3438 EMIT_ONE_ASCII_BYTE ('$'); \
3439 if (! CHARSET_ISO_CHARS_96 (charset)) \
3440 { \
3441 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3442 || reg != 0 \
3443 || final_char < '@' || final_char > 'B') \
3444 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3445 } \
3446 else \
3447 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3448 } \
3449 EMIT_ONE_ASCII_BYTE (final_char); \
3450 \
3451 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3452 } while (0)
3453
3454
3455 /* The following two macros produce codes (control character or escape
3456 sequence) for ISO2022 single-shift functions (single-shift-2 and
3457 single-shift-3). */
3458
3459 #define ENCODE_SINGLE_SHIFT_2 \
3460 do { \
3461 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3462 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3463 else \
3464 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3465 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3466 } while (0)
3467
3468
3469 #define ENCODE_SINGLE_SHIFT_3 \
3470 do { \
3471 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3472 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3473 else \
3474 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3475 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3476 } while (0)
3477
3478
3479 /* The following four macros produce codes (control character or
3480 escape sequence) for ISO2022 locking-shift functions (shift-in,
3481 shift-out, locking-shift-2, and locking-shift-3). */
3482
3483 #define ENCODE_SHIFT_IN \
3484 do { \
3485 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3486 CODING_ISO_INVOCATION (coding, 0) = 0; \
3487 } while (0)
3488
3489
3490 #define ENCODE_SHIFT_OUT \
3491 do { \
3492 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3493 CODING_ISO_INVOCATION (coding, 0) = 1; \
3494 } while (0)
3495
3496
3497 #define ENCODE_LOCKING_SHIFT_2 \
3498 do { \
3499 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3500 CODING_ISO_INVOCATION (coding, 0) = 2; \
3501 } while (0)
3502
3503
3504 #define ENCODE_LOCKING_SHIFT_3 \
3505 do { \
3506 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3507 CODING_ISO_INVOCATION (coding, 0) = 3; \
3508 } while (0)
3509
3510
3511 /* Produce codes for a DIMENSION1 character whose character set is
3512 CHARSET and whose position-code is C1. Designation and invocation
3513 sequences are also produced in advance if necessary. */
3514
3515 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3516 do { \
3517 int id = CHARSET_ID (charset); \
3518 \
3519 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3520 && id == charset_ascii) \
3521 { \
3522 id = charset_jisx0201_roman; \
3523 charset = CHARSET_FROM_ID (id); \
3524 } \
3525 \
3526 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3527 { \
3528 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3529 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3530 else \
3531 EMIT_ONE_BYTE (c1 | 0x80); \
3532 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3533 break; \
3534 } \
3535 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3536 { \
3537 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3538 break; \
3539 } \
3540 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3541 { \
3542 EMIT_ONE_BYTE (c1 | 0x80); \
3543 break; \
3544 } \
3545 else \
3546 /* Since CHARSET is not yet invoked to any graphic planes, we \
3547 must invoke it, or, at first, designate it to some graphic \
3548 register. Then repeat the loop to actually produce the \
3549 character. */ \
3550 dst = encode_invocation_designation (charset, coding, dst, \
3551 &produced_chars); \
3552 } while (1)
3553
3554
3555 /* Produce codes for a DIMENSION2 character whose character set is
3556 CHARSET and whose position-codes are C1 and C2. Designation and
3557 invocation codes are also produced in advance if necessary. */
3558
3559 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3560 do { \
3561 int id = CHARSET_ID (charset); \
3562 \
3563 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3564 && id == charset_jisx0208) \
3565 { \
3566 id = charset_jisx0208_1978; \
3567 charset = CHARSET_FROM_ID (id); \
3568 } \
3569 \
3570 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3571 { \
3572 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3573 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3574 else \
3575 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3576 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3577 break; \
3578 } \
3579 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3580 { \
3581 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3582 break; \
3583 } \
3584 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3585 { \
3586 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3587 break; \
3588 } \
3589 else \
3590 /* Since CHARSET is not yet invoked to any graphic planes, we \
3591 must invoke it, or, at first, designate it to some graphic \
3592 register. Then repeat the loop to actually produce the \
3593 character. */ \
3594 dst = encode_invocation_designation (charset, coding, dst, \
3595 &produced_chars); \
3596 } while (1)
3597
3598
3599 #define ENCODE_ISO_CHARACTER(charset, c) \
3600 do { \
3601 int code = ENCODE_CHAR ((charset),(c)); \
3602 \
3603 if (CHARSET_DIMENSION (charset) == 1) \
3604 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3605 else \
3606 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3607 } while (0)
3608
3609
3610 /* Produce designation and invocation codes at a place pointed by DST
3611 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3612 Return new DST. */
3613
3614 unsigned char *
3615 encode_invocation_designation (charset, coding, dst, p_nchars)
3616 struct charset *charset;
3617 struct coding_system *coding;
3618 unsigned char *dst;
3619 int *p_nchars;
3620 {
3621 int multibytep = coding->dst_multibyte;
3622 int produced_chars = *p_nchars;
3623 int reg; /* graphic register number */
3624 int id = CHARSET_ID (charset);
3625
3626 /* At first, check designations. */
3627 for (reg = 0; reg < 4; reg++)
3628 if (id == CODING_ISO_DESIGNATION (coding, reg))
3629 break;
3630
3631 if (reg >= 4)
3632 {
3633 /* CHARSET is not yet designated to any graphic registers. */
3634 /* At first check the requested designation. */
3635 reg = CODING_ISO_REQUEST (coding, id);
3636 if (reg < 0)
3637 /* Since CHARSET requests no special designation, designate it
3638 to graphic register 0. */
3639 reg = 0;
3640
3641 ENCODE_DESIGNATION (charset, reg, coding);
3642 }
3643
3644 if (CODING_ISO_INVOCATION (coding, 0) != reg
3645 && CODING_ISO_INVOCATION (coding, 1) != reg)
3646 {
3647 /* Since the graphic register REG is not invoked to any graphic
3648 planes, invoke it to graphic plane 0. */
3649 switch (reg)
3650 {
3651 case 0: /* graphic register 0 */
3652 ENCODE_SHIFT_IN;
3653 break;
3654
3655 case 1: /* graphic register 1 */
3656 ENCODE_SHIFT_OUT;
3657 break;
3658
3659 case 2: /* graphic register 2 */
3660 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3661 ENCODE_SINGLE_SHIFT_2;
3662 else
3663 ENCODE_LOCKING_SHIFT_2;
3664 break;
3665
3666 case 3: /* graphic register 3 */
3667 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3668 ENCODE_SINGLE_SHIFT_3;
3669 else
3670 ENCODE_LOCKING_SHIFT_3;
3671 break;
3672 }
3673 }
3674
3675 *p_nchars = produced_chars;
3676 return dst;
3677 }
3678
3679 /* The following three macros produce codes for indicating direction
3680 of text. */
3681 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3682 do { \
3683 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3684 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3685 else \
3686 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3687 } while (0)
3688
3689
3690 #define ENCODE_DIRECTION_R2L() \
3691 do { \
3692 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3693 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3694 } while (0)
3695
3696
3697 #define ENCODE_DIRECTION_L2R() \
3698 do { \
3699 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3700 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3701 } while (0)
3702
3703
3704 /* Produce codes for designation and invocation to reset the graphic
3705 planes and registers to initial state. */
3706 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3707 do { \
3708 int reg; \
3709 struct charset *charset; \
3710 \
3711 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3712 ENCODE_SHIFT_IN; \
3713 for (reg = 0; reg < 4; reg++) \
3714 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3715 && (CODING_ISO_DESIGNATION (coding, reg) \
3716 != CODING_ISO_INITIAL (coding, reg))) \
3717 { \
3718 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3719 ENCODE_DESIGNATION (charset, reg, coding); \
3720 } \
3721 } while (0)
3722
3723
3724 /* Produce designation sequences of charsets in the line started from
3725 SRC to a place pointed by DST, and return updated DST.
3726
3727 If the current block ends before any end-of-line, we may fail to
3728 find all the necessary designations. */
3729
3730 static unsigned char *
3731 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3732 struct coding_system *coding;
3733 int *charbuf, *charbuf_end;
3734 unsigned char *dst;
3735 {
3736 struct charset *charset;
3737 /* Table of charsets to be designated to each graphic register. */
3738 int r[4];
3739 int c, found = 0, reg;
3740 int produced_chars = 0;
3741 int multibytep = coding->dst_multibyte;
3742 Lisp_Object attrs;
3743 Lisp_Object charset_list;
3744
3745 attrs = CODING_ID_ATTRS (coding->id);
3746 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3747 if (EQ (charset_list, Qiso_2022))
3748 charset_list = Viso_2022_charset_list;
3749
3750 for (reg = 0; reg < 4; reg++)
3751 r[reg] = -1;
3752
3753 while (found < 4)
3754 {
3755 int id;
3756
3757 c = *charbuf++;
3758 if (c == '\n')
3759 break;
3760 charset = char_charset (c, charset_list, NULL);
3761 id = CHARSET_ID (charset);
3762 reg = CODING_ISO_REQUEST (coding, id);
3763 if (reg >= 0 && r[reg] < 0)
3764 {
3765 found++;
3766 r[reg] = id;
3767 }
3768 }
3769
3770 if (found)
3771 {
3772 for (reg = 0; reg < 4; reg++)
3773 if (r[reg] >= 0
3774 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3775 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3776 }
3777
3778 return dst;
3779 }
3780
3781 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3782
3783 static int
3784 encode_coding_iso_2022 (coding)
3785 struct coding_system *coding;
3786 {
3787 int multibytep = coding->dst_multibyte;
3788 int *charbuf = coding->charbuf;
3789 int *charbuf_end = charbuf + coding->charbuf_used;
3790 unsigned char *dst = coding->destination + coding->produced;
3791 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3792 int safe_room = 16;
3793 int bol_designation
3794 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3795 && CODING_ISO_BOL (coding));
3796 int produced_chars = 0;
3797 Lisp_Object attrs, eol_type, charset_list;
3798 int ascii_compatible;
3799 int c;
3800 int preferred_charset_id = -1;
3801
3802 CODING_GET_INFO (coding, attrs, charset_list);
3803 eol_type = CODING_ID_EOL_TYPE (coding->id);
3804 if (VECTORP (eol_type))
3805 eol_type = Qunix;
3806
3807 setup_iso_safe_charsets (attrs);
3808 /* Charset list may have been changed. */
3809 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3810 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3811
3812 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3813
3814 while (charbuf < charbuf_end)
3815 {
3816 ASSURE_DESTINATION (safe_room);
3817
3818 if (bol_designation)
3819 {
3820 unsigned char *dst_prev = dst;
3821
3822 /* We have to produce designation sequences if any now. */
3823 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3824 bol_designation = 0;
3825 /* We are sure that designation sequences are all ASCII bytes. */
3826 produced_chars += dst - dst_prev;
3827 }
3828
3829 c = *charbuf++;
3830
3831 if (c < 0)
3832 {
3833 /* Handle an annotation. */
3834 switch (*charbuf)
3835 {
3836 case CODING_ANNOTATE_COMPOSITION_MASK:
3837 /* Not yet implemented. */
3838 break;
3839 case CODING_ANNOTATE_CHARSET_MASK:
3840 preferred_charset_id = charbuf[2];
3841 if (preferred_charset_id >= 0
3842 && NILP (Fmemq (make_number (preferred_charset_id),
3843 charset_list)))
3844 preferred_charset_id = -1;
3845 break;
3846 default:
3847 abort ();
3848 }
3849 charbuf += -c - 1;
3850 continue;
3851 }
3852
3853 /* Now encode the character C. */
3854 if (c < 0x20 || c == 0x7F)
3855 {
3856 if (c == '\n'
3857 || (c == '\r' && EQ (eol_type, Qmac)))
3858 {
3859 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3860 ENCODE_RESET_PLANE_AND_REGISTER ();
3861 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3862 {
3863 int i;
3864
3865 for (i = 0; i < 4; i++)
3866 CODING_ISO_DESIGNATION (coding, i)
3867 = CODING_ISO_INITIAL (coding, i);
3868 }
3869 bol_designation
3870 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3871 }
3872 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3873 ENCODE_RESET_PLANE_AND_REGISTER ();
3874 EMIT_ONE_ASCII_BYTE (c);
3875 }
3876 else if (ASCII_CHAR_P (c))
3877 {
3878 if (ascii_compatible)
3879 EMIT_ONE_ASCII_BYTE (c);
3880 else
3881 {
3882 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3883 ENCODE_ISO_CHARACTER (charset, c);
3884 }
3885 }
3886 else if (CHAR_BYTE8_P (c))
3887 {
3888 c = CHAR_TO_BYTE8 (c);
3889 EMIT_ONE_BYTE (c);
3890 }
3891 else
3892 {
3893 struct charset *charset;
3894
3895 if (preferred_charset_id >= 0)
3896 {
3897 charset = CHARSET_FROM_ID (preferred_charset_id);
3898 if (! CHAR_CHARSET_P (c, charset))
3899 charset = char_charset (c, charset_list, NULL);
3900 }
3901 else
3902 charset = char_charset (c, charset_list, NULL);
3903 if (!charset)
3904 {
3905 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3906 {
3907 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3908 charset = CHARSET_FROM_ID (charset_ascii);
3909 }
3910 else
3911 {
3912 c = coding->default_char;
3913 charset = char_charset (c, charset_list, NULL);
3914 }
3915 }
3916 ENCODE_ISO_CHARACTER (charset, c);
3917 }
3918 }
3919
3920 if (coding->mode & CODING_MODE_LAST_BLOCK
3921 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3922 {
3923 ASSURE_DESTINATION (safe_room);
3924 ENCODE_RESET_PLANE_AND_REGISTER ();
3925 }
3926 record_conversion_result (coding, CODING_RESULT_SUCCESS);
3927 CODING_ISO_BOL (coding) = bol_designation;
3928 coding->produced_char += produced_chars;
3929 coding->produced = dst - coding->destination;
3930 return 0;
3931 }
3932
3933 \f
3934 /*** 8,9. SJIS and BIG5 handlers ***/
3935
3936 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3937 quite widely. So, for the moment, Emacs supports them in the bare
3938 C code. But, in the future, they may be supported only by CCL. */
3939
3940 /* SJIS is a coding system encoding three character sets: ASCII, right
3941 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3942 as is. A character of charset katakana-jisx0201 is encoded by
3943 "position-code + 0x80". A character of charset japanese-jisx0208
3944 is encoded in 2-byte but two position-codes are divided and shifted
3945 so that it fit in the range below.
3946
3947 --- CODE RANGE of SJIS ---
3948 (character set) (range)
3949 ASCII 0x00 .. 0x7F
3950 KATAKANA-JISX0201 0xA0 .. 0xDF
3951 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3952 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3953 -------------------------------
3954
3955 */
3956
3957 /* BIG5 is a coding system encoding two character sets: ASCII and
3958 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3959 character set and is encoded in two-byte.
3960
3961 --- CODE RANGE of BIG5 ---
3962 (character set) (range)
3963 ASCII 0x00 .. 0x7F
3964 Big5 (1st byte) 0xA1 .. 0xFE
3965 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3966 --------------------------
3967
3968 */
3969
3970 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3971 Check if a text is encoded in SJIS. If it is, return
3972 CATEGORY_MASK_SJIS, else return 0. */
3973
3974 static int
3975 detect_coding_sjis (coding, detect_info)
3976 struct coding_system *coding;
3977 struct coding_detection_info *detect_info;
3978 {
3979 const unsigned char *src = coding->source, *src_base;
3980 const unsigned char *src_end = coding->source + coding->src_bytes;
3981 int multibytep = coding->src_multibyte;
3982 int consumed_chars = 0;
3983 int found = 0;
3984 int c;
3985
3986 detect_info->checked |= CATEGORY_MASK_SJIS;
3987 /* A coding system of this category is always ASCII compatible. */
3988 src += coding->head_ascii;
3989
3990 while (1)
3991 {
3992 src_base = src;
3993 ONE_MORE_BYTE (c);
3994 if (c < 0x80)
3995 continue;
3996 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3997 {
3998 ONE_MORE_BYTE (c);
3999 if (c < 0x40 || c == 0x7F || c > 0xFC)
4000 break;
4001 found = CATEGORY_MASK_SJIS;
4002 }
4003 else if (c >= 0xA0 && c < 0xE0)
4004 found = CATEGORY_MASK_SJIS;
4005 else
4006 break;
4007 }
4008 detect_info->rejected |= CATEGORY_MASK_SJIS;
4009 return 0;
4010
4011 no_more_source:
4012 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4013 {
4014 detect_info->rejected |= CATEGORY_MASK_SJIS;
4015 return 0;
4016 }
4017 detect_info->found |= found;
4018 return 1;
4019 }
4020
4021 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4022 Check if a text is encoded in BIG5. If it is, return
4023 CATEGORY_MASK_BIG5, else return 0. */
4024
4025 static int
4026 detect_coding_big5 (coding, detect_info)
4027 struct coding_system *coding;
4028 struct coding_detection_info *detect_info;
4029 {
4030 const unsigned char *src = coding->source, *src_base;
4031 const unsigned char *src_end = coding->source + coding->src_bytes;
4032 int multibytep = coding->src_multibyte;
4033 int consumed_chars = 0;
4034 int found = 0;
4035 int c;
4036
4037 detect_info->checked |= CATEGORY_MASK_BIG5;
4038 /* A coding system of this category is always ASCII compatible. */
4039 src += coding->head_ascii;
4040
4041 while (1)
4042 {
4043 src_base = src;
4044 ONE_MORE_BYTE (c);
4045 if (c < 0x80)
4046 continue;
4047 if (c >= 0xA1)
4048 {
4049 ONE_MORE_BYTE (c);
4050 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4051 return 0;
4052 found = CATEGORY_MASK_BIG5;
4053 }
4054 else
4055 break;
4056 }
4057 detect_info->rejected |= CATEGORY_MASK_BIG5;
4058 return 0;
4059
4060 no_more_source:
4061 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4062 {
4063 detect_info->rejected |= CATEGORY_MASK_BIG5;
4064 return 0;
4065 }
4066 detect_info->found |= found;
4067 return 1;
4068 }
4069
4070 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4071 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4072
4073 static void
4074 decode_coding_sjis (coding)
4075 struct coding_system *coding;
4076 {
4077 const unsigned char *src = coding->source + coding->consumed;
4078 const unsigned char *src_end = coding->source + coding->src_bytes;
4079 const unsigned char *src_base;
4080 int *charbuf = coding->charbuf + coding->charbuf_used;
4081 int *charbuf_end
4082 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4083 int consumed_chars = 0, consumed_chars_base;
4084 int multibytep = coding->src_multibyte;
4085 struct charset *charset_roman, *charset_kanji, *charset_kana;
4086 struct charset *charset_kanji2;
4087 Lisp_Object attrs, charset_list, val;
4088 int char_offset = coding->produced_char;
4089 int last_offset = char_offset;
4090 int last_id = charset_ascii;
4091
4092 CODING_GET_INFO (coding, attrs, charset_list);
4093
4094 val = charset_list;
4095 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4096 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4097 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4098 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4099
4100 while (1)
4101 {
4102 int c, c1;
4103 struct charset *charset;
4104
4105 src_base = src;
4106 consumed_chars_base = consumed_chars;
4107
4108 if (charbuf >= charbuf_end)
4109 break;
4110
4111 ONE_MORE_BYTE (c);
4112 if (c < 0)
4113 goto invalid_code;
4114 if (c < 0x80)
4115 charset = charset_roman;
4116 else if (c == 0x80 || c == 0xA0)
4117 goto invalid_code;
4118 else if (c >= 0xA1 && c <= 0xDF)
4119 {
4120 /* SJIS -> JISX0201-Kana */
4121 c &= 0x7F;
4122 charset = charset_kana;
4123 }
4124 else if (c <= 0xEF)
4125 {
4126 /* SJIS -> JISX0208 */
4127 ONE_MORE_BYTE (c1);
4128 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4129 goto invalid_code;
4130 c = (c << 8) | c1;
4131 SJIS_TO_JIS (c);
4132 charset = charset_kanji;
4133 }
4134 else if (c <= 0xFC && charset_kanji2)
4135 {
4136 /* SJIS -> JISX0213-2 */
4137 ONE_MORE_BYTE (c1);
4138 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4139 goto invalid_code;
4140 c = (c << 8) | c1;
4141 SJIS_TO_JIS2 (c);
4142 charset = charset_kanji2;
4143 }
4144 else
4145 goto invalid_code;
4146 if (charset->id != charset_ascii
4147 && last_id != charset->id)
4148 {
4149 if (last_id != charset_ascii)
4150 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4151 last_id = charset->id;
4152 last_offset = char_offset;
4153 }
4154 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4155 *charbuf++ = c;
4156 char_offset++;
4157 continue;
4158
4159 invalid_code:
4160 src = src_base;
4161 consumed_chars = consumed_chars_base;
4162 ONE_MORE_BYTE (c);
4163 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4164 char_offset++;
4165 coding->errors++;
4166 }
4167
4168 no_more_source:
4169 if (last_id != charset_ascii)
4170 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4171 coding->consumed_char += consumed_chars_base;
4172 coding->consumed = src_base - coding->source;
4173 coding->charbuf_used = charbuf - coding->charbuf;
4174 }
4175
4176 static void
4177 decode_coding_big5 (coding)
4178 struct coding_system *coding;
4179 {
4180 const unsigned char *src = coding->source + coding->consumed;
4181 const unsigned char *src_end = coding->source + coding->src_bytes;
4182 const unsigned char *src_base;
4183 int *charbuf = coding->charbuf + coding->charbuf_used;
4184 int *charbuf_end
4185 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4186 int consumed_chars = 0, consumed_chars_base;
4187 int multibytep = coding->src_multibyte;
4188 struct charset *charset_roman, *charset_big5;
4189 Lisp_Object attrs, charset_list, val;
4190 int char_offset = coding->produced_char;
4191 int last_offset = char_offset;
4192 int last_id = charset_ascii;
4193
4194 CODING_GET_INFO (coding, attrs, charset_list);
4195 val = charset_list;
4196 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4197 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4198
4199 while (1)
4200 {
4201 int c, c1;
4202 struct charset *charset;
4203
4204 src_base = src;
4205 consumed_chars_base = consumed_chars;
4206
4207 if (charbuf >= charbuf_end)
4208 break;
4209
4210 ONE_MORE_BYTE (c);
4211
4212 if (c < 0)
4213 goto invalid_code;
4214 if (c < 0x80)
4215 charset = charset_roman;
4216 else
4217 {
4218 /* BIG5 -> Big5 */
4219 if (c < 0xA1 || c > 0xFE)
4220 goto invalid_code;
4221 ONE_MORE_BYTE (c1);
4222 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4223 goto invalid_code;
4224 c = c << 8 | c1;
4225 charset = charset_big5;
4226 }
4227 if (charset->id != charset_ascii
4228 && last_id != charset->id)
4229 {
4230 if (last_id != charset_ascii)
4231 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4232 last_id = charset->id;
4233 last_offset = char_offset;
4234 }
4235 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4236 *charbuf++ = c;
4237 char_offset++;
4238 continue;
4239
4240 invalid_code:
4241 src = src_base;
4242 consumed_chars = consumed_chars_base;
4243 ONE_MORE_BYTE (c);
4244 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4245 char_offset++;
4246 coding->errors++;
4247 }
4248
4249 no_more_source:
4250 if (last_id != charset_ascii)
4251 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4252 coding->consumed_char += consumed_chars_base;
4253 coding->consumed = src_base - coding->source;
4254 coding->charbuf_used = charbuf - coding->charbuf;
4255 }
4256
4257 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4258 This function can encode charsets `ascii', `katakana-jisx0201',
4259 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4260 are sure that all these charsets are registered as official charset
4261 (i.e. do not have extended leading-codes). Characters of other
4262 charsets are produced without any encoding. If SJIS_P is 1, encode
4263 SJIS text, else encode BIG5 text. */
4264
4265 static int
4266 encode_coding_sjis (coding)
4267 struct coding_system *coding;
4268 {
4269 int multibytep = coding->dst_multibyte;
4270 int *charbuf = coding->charbuf;
4271 int *charbuf_end = charbuf + coding->charbuf_used;
4272 unsigned char *dst = coding->destination + coding->produced;
4273 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4274 int safe_room = 4;
4275 int produced_chars = 0;
4276 Lisp_Object attrs, charset_list, val;
4277 int ascii_compatible;
4278 struct charset *charset_roman, *charset_kanji, *charset_kana;
4279 struct charset *charset_kanji2;
4280 int c;
4281
4282 CODING_GET_INFO (coding, attrs, charset_list);
4283 val = charset_list;
4284 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4285 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4286 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4287 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4288
4289 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4290
4291 while (charbuf < charbuf_end)
4292 {
4293 ASSURE_DESTINATION (safe_room);
4294 c = *charbuf++;
4295 /* Now encode the character C. */
4296 if (ASCII_CHAR_P (c) && ascii_compatible)
4297 EMIT_ONE_ASCII_BYTE (c);
4298 else if (CHAR_BYTE8_P (c))
4299 {
4300 c = CHAR_TO_BYTE8 (c);
4301 EMIT_ONE_BYTE (c);
4302 }
4303 else
4304 {
4305 unsigned code;
4306 struct charset *charset = char_charset (c, charset_list, &code);
4307
4308 if (!charset)
4309 {
4310 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4311 {
4312 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4313 charset = CHARSET_FROM_ID (charset_ascii);
4314 }
4315 else
4316 {
4317 c = coding->default_char;
4318 charset = char_charset (c, charset_list, &code);
4319 }
4320 }
4321 if (code == CHARSET_INVALID_CODE (charset))
4322 abort ();
4323 if (charset == charset_kanji)
4324 {
4325 int c1, c2;
4326 JIS_TO_SJIS (code);
4327 c1 = code >> 8, c2 = code & 0xFF;
4328 EMIT_TWO_BYTES (c1, c2);
4329 }
4330 else if (charset == charset_kana)
4331 EMIT_ONE_BYTE (code | 0x80);
4332 else if (charset_kanji2 && charset == charset_kanji2)
4333 {
4334 int c1, c2;
4335
4336 c1 = code >> 8;
4337 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4338 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4339 {
4340 JIS_TO_SJIS2 (code);
4341 c1 = code >> 8, c2 = code & 0xFF;
4342 EMIT_TWO_BYTES (c1, c2);
4343 }
4344 else
4345 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4346 }
4347 else
4348 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4349 }
4350 }
4351 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4352 coding->produced_char += produced_chars;
4353 coding->produced = dst - coding->destination;
4354 return 0;
4355 }
4356
4357 static int
4358 encode_coding_big5 (coding)
4359 struct coding_system *coding;
4360 {
4361 int multibytep = coding->dst_multibyte;
4362 int *charbuf = coding->charbuf;
4363 int *charbuf_end = charbuf + coding->charbuf_used;
4364 unsigned char *dst = coding->destination + coding->produced;
4365 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4366 int safe_room = 4;
4367 int produced_chars = 0;
4368 Lisp_Object attrs, charset_list, val;
4369 int ascii_compatible;
4370 struct charset *charset_roman, *charset_big5;
4371 int c;
4372
4373 CODING_GET_INFO (coding, attrs, charset_list);
4374 val = charset_list;
4375 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4376 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4377 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4378
4379 while (charbuf < charbuf_end)
4380 {
4381 ASSURE_DESTINATION (safe_room);
4382 c = *charbuf++;
4383 /* Now encode the character C. */
4384 if (ASCII_CHAR_P (c) && ascii_compatible)
4385 EMIT_ONE_ASCII_BYTE (c);
4386 else if (CHAR_BYTE8_P (c))
4387 {
4388 c = CHAR_TO_BYTE8 (c);
4389 EMIT_ONE_BYTE (c);
4390 }
4391 else
4392 {
4393 unsigned code;
4394 struct charset *charset = char_charset (c, charset_list, &code);
4395
4396 if (! charset)
4397 {
4398 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4399 {
4400 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4401 charset = CHARSET_FROM_ID (charset_ascii);
4402 }
4403 else
4404 {
4405 c = coding->default_char;
4406 charset = char_charset (c, charset_list, &code);
4407 }
4408 }
4409 if (code == CHARSET_INVALID_CODE (charset))
4410 abort ();
4411 if (charset == charset_big5)
4412 {
4413 int c1, c2;
4414
4415 c1 = code >> 8, c2 = code & 0xFF;
4416 EMIT_TWO_BYTES (c1, c2);
4417 }
4418 else
4419 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4420 }
4421 }
4422 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4423 coding->produced_char += produced_chars;
4424 coding->produced = dst - coding->destination;
4425 return 0;
4426 }
4427
4428 \f
4429 /*** 10. CCL handlers ***/
4430
4431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4432 Check if a text is encoded in a coding system of which
4433 encoder/decoder are written in CCL program. If it is, return
4434 CATEGORY_MASK_CCL, else return 0. */
4435
4436 static int
4437 detect_coding_ccl (coding, detect_info)
4438 struct coding_system *coding;
4439 struct coding_detection_info *detect_info;
4440 {
4441 const unsigned char *src = coding->source, *src_base;
4442 const unsigned char *src_end = coding->source + coding->src_bytes;
4443 int multibytep = coding->src_multibyte;
4444 int consumed_chars = 0;
4445 int found = 0;
4446 unsigned char *valids;
4447 int head_ascii = coding->head_ascii;
4448 Lisp_Object attrs;
4449
4450 detect_info->checked |= CATEGORY_MASK_CCL;
4451
4452 coding = &coding_categories[coding_category_ccl];
4453 valids = CODING_CCL_VALIDS (coding);
4454 attrs = CODING_ID_ATTRS (coding->id);
4455 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4456 src += head_ascii;
4457
4458 while (1)
4459 {
4460 int c;
4461
4462 src_base = src;
4463 ONE_MORE_BYTE (c);
4464 if (c < 0 || ! valids[c])
4465 break;
4466 if ((valids[c] > 1))
4467 found = CATEGORY_MASK_CCL;
4468 }
4469 detect_info->rejected |= CATEGORY_MASK_CCL;
4470 return 0;
4471
4472 no_more_source:
4473 detect_info->found |= found;
4474 return 1;
4475 }
4476
4477 static void
4478 decode_coding_ccl (coding)
4479 struct coding_system *coding;
4480 {
4481 const unsigned char *src = coding->source + coding->consumed;
4482 const unsigned char *src_end = coding->source + coding->src_bytes;
4483 int *charbuf = coding->charbuf + coding->charbuf_used;
4484 int *charbuf_end = coding->charbuf + coding->charbuf_size;
4485 int consumed_chars = 0;
4486 int multibytep = coding->src_multibyte;
4487 struct ccl_program ccl;
4488 int source_charbuf[1024];
4489 int source_byteidx[1024];
4490 Lisp_Object attrs, charset_list;
4491
4492 CODING_GET_INFO (coding, attrs, charset_list);
4493 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4494
4495 while (src < src_end)
4496 {
4497 const unsigned char *p = src;
4498 int *source, *source_end;
4499 int i = 0;
4500
4501 if (multibytep)
4502 while (i < 1024 && p < src_end)
4503 {
4504 source_byteidx[i] = p - src;
4505 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4506 }
4507 else
4508 while (i < 1024 && p < src_end)
4509 source_charbuf[i++] = *p++;
4510
4511 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4512 ccl.last_block = 1;
4513
4514 source = source_charbuf;
4515 source_end = source + i;
4516 while (source < source_end)
4517 {
4518 ccl_driver (&ccl, source, charbuf,
4519 source_end - source, charbuf_end - charbuf,
4520 charset_list);
4521 source += ccl.consumed;
4522 charbuf += ccl.produced;
4523 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4524 break;
4525 }
4526 if (source < source_end)
4527 src += source_byteidx[source - source_charbuf];
4528 else
4529 src = p;
4530 consumed_chars += source - source_charbuf;
4531
4532 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4533 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4534 break;
4535 }
4536
4537 switch (ccl.status)
4538 {
4539 case CCL_STAT_SUSPEND_BY_SRC:
4540 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4541 break;
4542 case CCL_STAT_SUSPEND_BY_DST:
4543 break;
4544 case CCL_STAT_QUIT:
4545 case CCL_STAT_INVALID_CMD:
4546 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4547 break;
4548 default:
4549 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4550 break;
4551 }
4552 coding->consumed_char += consumed_chars;
4553 coding->consumed = src - coding->source;
4554 coding->charbuf_used = charbuf - coding->charbuf;
4555 }
4556
4557 static int
4558 encode_coding_ccl (coding)
4559 struct coding_system *coding;
4560 {
4561 struct ccl_program ccl;
4562 int multibytep = coding->dst_multibyte;
4563 int *charbuf = coding->charbuf;
4564 int *charbuf_end = charbuf + coding->charbuf_used;
4565 unsigned char *dst = coding->destination + coding->produced;
4566 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4567 int destination_charbuf[1024];
4568 int i, produced_chars = 0;
4569 Lisp_Object attrs, charset_list;
4570
4571 CODING_GET_INFO (coding, attrs, charset_list);
4572 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4573
4574 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4575 ccl.dst_multibyte = coding->dst_multibyte;
4576
4577 while (charbuf < charbuf_end)
4578 {
4579 ccl_driver (&ccl, charbuf, destination_charbuf,
4580 charbuf_end - charbuf, 1024, charset_list);
4581 if (multibytep)
4582 {
4583 ASSURE_DESTINATION (ccl.produced * 2);
4584 for (i = 0; i < ccl.produced; i++)
4585 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4586 }
4587 else
4588 {
4589 ASSURE_DESTINATION (ccl.produced);
4590 for (i = 0; i < ccl.produced; i++)
4591 *dst++ = destination_charbuf[i] & 0xFF;
4592 produced_chars += ccl.produced;
4593 }
4594 charbuf += ccl.consumed;
4595 if (ccl.status == CCL_STAT_QUIT
4596 || ccl.status == CCL_STAT_INVALID_CMD)
4597 break;
4598 }
4599
4600 switch (ccl.status)
4601 {
4602 case CCL_STAT_SUSPEND_BY_SRC:
4603 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4604 break;
4605 case CCL_STAT_SUSPEND_BY_DST:
4606 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4607 break;
4608 case CCL_STAT_QUIT:
4609 case CCL_STAT_INVALID_CMD:
4610 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4611 break;
4612 default:
4613 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4614 break;
4615 }
4616
4617 coding->produced_char += produced_chars;
4618 coding->produced = dst - coding->destination;
4619 return 0;
4620 }
4621
4622
4623 \f
4624 /*** 10, 11. no-conversion handlers ***/
4625
4626 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4627
4628 static void
4629 decode_coding_raw_text (coding)
4630 struct coding_system *coding;
4631 {
4632 coding->chars_at_source = 1;
4633 coding->consumed_char = 0;
4634 coding->consumed = 0;
4635 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4636 }
4637
4638 static int
4639 encode_coding_raw_text (coding)
4640 struct coding_system *coding;
4641 {
4642 int multibytep = coding->dst_multibyte;
4643 int *charbuf = coding->charbuf;
4644 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4645 unsigned char *dst = coding->destination + coding->produced;
4646 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4647 int produced_chars = 0;
4648 int c;
4649
4650 if (multibytep)
4651 {
4652 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4653
4654 if (coding->src_multibyte)
4655 while (charbuf < charbuf_end)
4656 {
4657 ASSURE_DESTINATION (safe_room);
4658 c = *charbuf++;
4659 if (ASCII_CHAR_P (c))
4660 EMIT_ONE_ASCII_BYTE (c);
4661 else if (CHAR_BYTE8_P (c))
4662 {
4663 c = CHAR_TO_BYTE8 (c);
4664 EMIT_ONE_BYTE (c);
4665 }
4666 else
4667 {
4668 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4669
4670 CHAR_STRING_ADVANCE (c, p1);
4671 while (p0 < p1)
4672 {
4673 EMIT_ONE_BYTE (*p0);
4674 p0++;
4675 }
4676 }
4677 }
4678 else
4679 while (charbuf < charbuf_end)
4680 {
4681 ASSURE_DESTINATION (safe_room);
4682 c = *charbuf++;
4683 EMIT_ONE_BYTE (c);
4684 }
4685 }
4686 else
4687 {
4688 if (coding->src_multibyte)
4689 {
4690 int safe_room = MAX_MULTIBYTE_LENGTH;
4691
4692 while (charbuf < charbuf_end)
4693 {
4694 ASSURE_DESTINATION (safe_room);
4695 c = *charbuf++;
4696 if (ASCII_CHAR_P (c))
4697 *dst++ = c;
4698 else if (CHAR_BYTE8_P (c))
4699 *dst++ = CHAR_TO_BYTE8 (c);
4700 else
4701 CHAR_STRING_ADVANCE (c, dst);
4702 produced_chars++;
4703 }
4704 }
4705 else
4706 {
4707 ASSURE_DESTINATION (charbuf_end - charbuf);
4708 while (charbuf < charbuf_end && dst < dst_end)
4709 *dst++ = *charbuf++;
4710 produced_chars = dst - (coding->destination + coding->dst_bytes);
4711 }
4712 }
4713 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4714 coding->produced_char += produced_chars;
4715 coding->produced = dst - coding->destination;
4716 return 0;
4717 }
4718
4719 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4720 Check if a text is encoded in a charset-based coding system. If it
4721 is, return 1, else return 0. */
4722
4723 static int
4724 detect_coding_charset (coding, detect_info)
4725 struct coding_system *coding;
4726 struct coding_detection_info *detect_info;
4727 {
4728 const unsigned char *src = coding->source, *src_base;
4729 const unsigned char *src_end = coding->source + coding->src_bytes;
4730 int multibytep = coding->src_multibyte;
4731 int consumed_chars = 0;
4732 Lisp_Object attrs, valids;
4733 int found = 0;
4734 int head_ascii = coding->head_ascii;
4735
4736 detect_info->checked |= CATEGORY_MASK_CHARSET;
4737
4738 coding = &coding_categories[coding_category_charset];
4739 attrs = CODING_ID_ATTRS (coding->id);
4740 valids = AREF (attrs, coding_attr_charset_valids);
4741
4742 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4743 src += head_ascii;
4744
4745 while (1)
4746 {
4747 int c;
4748 Lisp_Object val;
4749 struct charset *charset;
4750 int dim, idx;
4751
4752 src_base = src;
4753 ONE_MORE_BYTE (c);
4754 if (c < 0)
4755 continue;
4756 val = AREF (valids, c);
4757 if (NILP (val))
4758 break;
4759 if (c >= 0x80)
4760 found = CATEGORY_MASK_CHARSET;
4761 if (INTEGERP (val))
4762 {
4763 charset = CHARSET_FROM_ID (XFASTINT (val));
4764 dim = CHARSET_DIMENSION (charset);
4765 for (idx = 1; idx < dim; idx++)
4766 {
4767 if (src == src_end)
4768 goto too_short;
4769 ONE_MORE_BYTE (c);
4770 if (c < charset->code_space[(dim - 1 - idx) * 2]
4771 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
4772 break;
4773 }
4774 if (idx < dim)
4775 break;
4776 }
4777 else
4778 {
4779 idx = 1;
4780 for (; CONSP (val); val = XCDR (val))
4781 {
4782 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4783 dim = CHARSET_DIMENSION (charset);
4784 while (idx < dim)
4785 {
4786 if (src == src_end)
4787 goto too_short;
4788 ONE_MORE_BYTE (c);
4789 if (c < charset->code_space[(dim - 1 - idx) * 4]
4790 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
4791 break;
4792 idx++;
4793 }
4794 if (idx == dim)
4795 {
4796 val = Qnil;
4797 break;
4798 }
4799 }
4800 if (CONSP (val))
4801 break;
4802 }
4803 }
4804 too_short:
4805 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4806 return 0;
4807
4808 no_more_source:
4809 detect_info->found |= found;
4810 return 1;
4811 }
4812
4813 static void
4814 decode_coding_charset (coding)
4815 struct coding_system *coding;
4816 {
4817 const unsigned char *src = coding->source + coding->consumed;
4818 const unsigned char *src_end = coding->source + coding->src_bytes;
4819 const unsigned char *src_base;
4820 int *charbuf = coding->charbuf + coding->charbuf_used;
4821 int *charbuf_end
4822 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4823 int consumed_chars = 0, consumed_chars_base;
4824 int multibytep = coding->src_multibyte;
4825 Lisp_Object attrs, charset_list, valids;
4826 int char_offset = coding->produced_char;
4827 int last_offset = char_offset;
4828 int last_id = charset_ascii;
4829
4830 CODING_GET_INFO (coding, attrs, charset_list);
4831 valids = AREF (attrs, coding_attr_charset_valids);
4832
4833 while (1)
4834 {
4835 int c;
4836 Lisp_Object val;
4837 struct charset *charset;
4838 int dim;
4839 int len = 1;
4840 unsigned code;
4841
4842 src_base = src;
4843 consumed_chars_base = consumed_chars;
4844
4845 if (charbuf >= charbuf_end)
4846 break;
4847
4848 ONE_MORE_BYTE (c);
4849 if (c < 0)
4850 goto invalid_code;
4851 code = c;
4852
4853 val = AREF (valids, c);
4854 if (NILP (val))
4855 goto invalid_code;
4856 if (INTEGERP (val))
4857 {
4858 charset = CHARSET_FROM_ID (XFASTINT (val));
4859 dim = CHARSET_DIMENSION (charset);
4860 while (len < dim)
4861 {
4862 ONE_MORE_BYTE (c);
4863 code = (code << 8) | c;
4864 len++;
4865 }
4866 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4867 charset, code, c);
4868 }
4869 else
4870 {
4871 /* VAL is a list of charset IDs. It is assured that the
4872 list is sorted by charset dimensions (smaller one
4873 comes first). */
4874 while (CONSP (val))
4875 {
4876 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4877 dim = CHARSET_DIMENSION (charset);
4878 while (len < dim)
4879 {
4880 ONE_MORE_BYTE (c);
4881 code = (code << 8) | c;
4882 len++;
4883 }
4884 CODING_DECODE_CHAR (coding, src, src_base,
4885 src_end, charset, code, c);
4886 if (c >= 0)
4887 break;
4888 val = XCDR (val);
4889 }
4890 }
4891 if (c < 0)
4892 goto invalid_code;
4893 if (charset->id != charset_ascii
4894 && last_id != charset->id)
4895 {
4896 if (last_id != charset_ascii)
4897 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4898 last_id = charset->id;
4899 last_offset = char_offset;
4900 }
4901
4902 *charbuf++ = c;
4903 char_offset++;
4904 continue;
4905
4906 invalid_code:
4907 src = src_base;
4908 consumed_chars = consumed_chars_base;
4909 ONE_MORE_BYTE (c);
4910 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4911 char_offset++;
4912 coding->errors++;
4913 }
4914
4915 no_more_source:
4916 if (last_id != charset_ascii)
4917 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4918 coding->consumed_char += consumed_chars_base;
4919 coding->consumed = src_base - coding->source;
4920 coding->charbuf_used = charbuf - coding->charbuf;
4921 }
4922
4923 static int
4924 encode_coding_charset (coding)
4925 struct coding_system *coding;
4926 {
4927 int multibytep = coding->dst_multibyte;
4928 int *charbuf = coding->charbuf;
4929 int *charbuf_end = charbuf + coding->charbuf_used;
4930 unsigned char *dst = coding->destination + coding->produced;
4931 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4932 int safe_room = MAX_MULTIBYTE_LENGTH;
4933 int produced_chars = 0;
4934 Lisp_Object attrs, charset_list;
4935 int ascii_compatible;
4936 int c;
4937
4938 CODING_GET_INFO (coding, attrs, charset_list);
4939 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4940
4941 while (charbuf < charbuf_end)
4942 {
4943 struct charset *charset;
4944 unsigned code;
4945
4946 ASSURE_DESTINATION (safe_room);
4947 c = *charbuf++;
4948 if (ascii_compatible && ASCII_CHAR_P (c))
4949 EMIT_ONE_ASCII_BYTE (c);
4950 else if (CHAR_BYTE8_P (c))
4951 {
4952 c = CHAR_TO_BYTE8 (c);
4953 EMIT_ONE_BYTE (c);
4954 }
4955 else
4956 {
4957 charset = char_charset (c, charset_list, &code);
4958 if (charset)
4959 {
4960 if (CHARSET_DIMENSION (charset) == 1)
4961 EMIT_ONE_BYTE (code);
4962 else if (CHARSET_DIMENSION (charset) == 2)
4963 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4964 else if (CHARSET_DIMENSION (charset) == 3)
4965 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4966 else
4967 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4968 (code >> 8) & 0xFF, code & 0xFF);
4969 }
4970 else
4971 {
4972 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4973 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4974 else
4975 c = coding->default_char;
4976 EMIT_ONE_BYTE (c);
4977 }
4978 }
4979 }
4980
4981 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4982 coding->produced_char += produced_chars;
4983 coding->produced = dst - coding->destination;
4984 return 0;
4985 }
4986
4987 \f
4988 /*** 7. C library functions ***/
4989
4990 /* Setup coding context CODING from information about CODING_SYSTEM.
4991 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4992 CODING_SYSTEM is invalid, signal an error. */
4993
4994 void
4995 setup_coding_system (coding_system, coding)
4996 Lisp_Object coding_system;
4997 struct coding_system *coding;
4998 {
4999 Lisp_Object attrs;
5000 Lisp_Object eol_type;
5001 Lisp_Object coding_type;
5002 Lisp_Object val;
5003
5004 if (NILP (coding_system))
5005 coding_system = Qundecided;
5006
5007 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5008
5009 attrs = CODING_ID_ATTRS (coding->id);
5010 eol_type = CODING_ID_EOL_TYPE (coding->id);
5011
5012 coding->mode = 0;
5013 coding->head_ascii = -1;
5014 if (VECTORP (eol_type))
5015 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5016 | CODING_REQUIRE_DETECTION_MASK);
5017 else if (! EQ (eol_type, Qunix))
5018 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5019 | CODING_REQUIRE_ENCODING_MASK);
5020 else
5021 coding->common_flags = 0;
5022 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5023 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5024 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5025 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5026 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5027 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5028
5029 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5030 coding->max_charset_id = SCHARS (val) - 1;
5031 coding->safe_charsets = (char *) SDATA (val);
5032 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5033
5034 coding_type = CODING_ATTR_TYPE (attrs);
5035 if (EQ (coding_type, Qundecided))
5036 {
5037 coding->detector = NULL;
5038 coding->decoder = decode_coding_raw_text;
5039 coding->encoder = encode_coding_raw_text;
5040 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5041 }
5042 else if (EQ (coding_type, Qiso_2022))
5043 {
5044 int i;
5045 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5046
5047 /* Invoke graphic register 0 to plane 0. */
5048 CODING_ISO_INVOCATION (coding, 0) = 0;
5049 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5050 CODING_ISO_INVOCATION (coding, 1)
5051 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5052 /* Setup the initial status of designation. */
5053 for (i = 0; i < 4; i++)
5054 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5055 /* Not single shifting initially. */
5056 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5057 /* Beginning of buffer should also be regarded as bol. */
5058 CODING_ISO_BOL (coding) = 1;
5059 coding->detector = detect_coding_iso_2022;
5060 coding->decoder = decode_coding_iso_2022;
5061 coding->encoder = encode_coding_iso_2022;
5062 if (flags & CODING_ISO_FLAG_SAFE)
5063 coding->mode |= CODING_MODE_SAFE_ENCODING;
5064 coding->common_flags
5065 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5066 | CODING_REQUIRE_FLUSHING_MASK);
5067 if (flags & CODING_ISO_FLAG_COMPOSITION)
5068 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5069 if (flags & CODING_ISO_FLAG_DESIGNATION)
5070 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5071 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5072 {
5073 setup_iso_safe_charsets (attrs);
5074 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5075 coding->max_charset_id = SCHARS (val) - 1;
5076 coding->safe_charsets = (char *) SDATA (val);
5077 }
5078 CODING_ISO_FLAGS (coding) = flags;
5079 }
5080 else if (EQ (coding_type, Qcharset))
5081 {
5082 coding->detector = detect_coding_charset;
5083 coding->decoder = decode_coding_charset;
5084 coding->encoder = encode_coding_charset;
5085 coding->common_flags
5086 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5087 }
5088 else if (EQ (coding_type, Qutf_8))
5089 {
5090 coding->detector = detect_coding_utf_8;
5091 coding->decoder = decode_coding_utf_8;
5092 coding->encoder = encode_coding_utf_8;
5093 coding->common_flags
5094 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5095 }
5096 else if (EQ (coding_type, Qutf_16))
5097 {
5098 val = AREF (attrs, coding_attr_utf_16_bom);
5099 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5100 : EQ (val, Qt) ? utf_16_with_bom
5101 : utf_16_without_bom);
5102 val = AREF (attrs, coding_attr_utf_16_endian);
5103 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5104 : utf_16_little_endian);
5105 CODING_UTF_16_SURROGATE (coding) = 0;
5106 coding->detector = detect_coding_utf_16;
5107 coding->decoder = decode_coding_utf_16;
5108 coding->encoder = encode_coding_utf_16;
5109 coding->common_flags
5110 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5111 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5112 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5113 }
5114 else if (EQ (coding_type, Qccl))
5115 {
5116 coding->detector = detect_coding_ccl;
5117 coding->decoder = decode_coding_ccl;
5118 coding->encoder = encode_coding_ccl;
5119 coding->common_flags
5120 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5121 | CODING_REQUIRE_FLUSHING_MASK);
5122 }
5123 else if (EQ (coding_type, Qemacs_mule))
5124 {
5125 coding->detector = detect_coding_emacs_mule;
5126 coding->decoder = decode_coding_emacs_mule;
5127 coding->encoder = encode_coding_emacs_mule;
5128 coding->common_flags
5129 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5130 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5131 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5132 {
5133 Lisp_Object tail, safe_charsets;
5134 int max_charset_id = 0;
5135
5136 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5137 tail = XCDR (tail))
5138 if (max_charset_id < XFASTINT (XCAR (tail)))
5139 max_charset_id = XFASTINT (XCAR (tail));
5140 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5141 make_number (255));
5142 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5143 tail = XCDR (tail))
5144 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5145 coding->max_charset_id = max_charset_id;
5146 coding->safe_charsets = (char *) SDATA (safe_charsets);
5147 }
5148 }
5149 else if (EQ (coding_type, Qshift_jis))
5150 {
5151 coding->detector = detect_coding_sjis;
5152 coding->decoder = decode_coding_sjis;
5153 coding->encoder = encode_coding_sjis;
5154 coding->common_flags
5155 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5156 }
5157 else if (EQ (coding_type, Qbig5))
5158 {
5159 coding->detector = detect_coding_big5;
5160 coding->decoder = decode_coding_big5;
5161 coding->encoder = encode_coding_big5;
5162 coding->common_flags
5163 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5164 }
5165 else /* EQ (coding_type, Qraw_text) */
5166 {
5167 coding->detector = NULL;
5168 coding->decoder = decode_coding_raw_text;
5169 coding->encoder = encode_coding_raw_text;
5170 if (! EQ (eol_type, Qunix))
5171 {
5172 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5173 if (! VECTORP (eol_type))
5174 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5175 }
5176
5177 }
5178
5179 return;
5180 }
5181
5182 /* Return a list of charsets supported by CODING. */
5183
5184 Lisp_Object
5185 coding_charset_list (coding)
5186 struct coding_system *coding;
5187 {
5188 Lisp_Object attrs, charset_list;
5189
5190 CODING_GET_INFO (coding, attrs, charset_list);
5191 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5192 {
5193 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5194
5195 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5196 charset_list = Viso_2022_charset_list;
5197 }
5198 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5199 {
5200 charset_list = Vemacs_mule_charset_list;
5201 }
5202 return charset_list;
5203 }
5204
5205
5206 /* Return raw-text or one of its subsidiaries that has the same
5207 eol_type as CODING-SYSTEM. */
5208
5209 Lisp_Object
5210 raw_text_coding_system (coding_system)
5211 Lisp_Object coding_system;
5212 {
5213 Lisp_Object spec, attrs;
5214 Lisp_Object eol_type, raw_text_eol_type;
5215
5216 if (NILP (coding_system))
5217 return Qraw_text;
5218 spec = CODING_SYSTEM_SPEC (coding_system);
5219 attrs = AREF (spec, 0);
5220
5221 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5222 return coding_system;
5223
5224 eol_type = AREF (spec, 2);
5225 if (VECTORP (eol_type))
5226 return Qraw_text;
5227 spec = CODING_SYSTEM_SPEC (Qraw_text);
5228 raw_text_eol_type = AREF (spec, 2);
5229 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5230 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5231 : AREF (raw_text_eol_type, 2));
5232 }
5233
5234
5235 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5236 does, return one of the subsidiary that has the same eol-spec as
5237 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5238 inherit end-of-line format from the system's setting
5239 (system_eol_type). */
5240
5241 Lisp_Object
5242 coding_inherit_eol_type (coding_system, parent)
5243 Lisp_Object coding_system, parent;
5244 {
5245 Lisp_Object spec, eol_type;
5246
5247 if (NILP (coding_system))
5248 coding_system = Qraw_text;
5249 spec = CODING_SYSTEM_SPEC (coding_system);
5250 eol_type = AREF (spec, 2);
5251 if (VECTORP (eol_type))
5252 {
5253 Lisp_Object parent_eol_type;
5254
5255 if (! NILP (parent))
5256 {
5257 Lisp_Object parent_spec;
5258
5259 parent_spec = CODING_SYSTEM_SPEC (parent);
5260 parent_eol_type = AREF (parent_spec, 2);
5261 }
5262 else
5263 parent_eol_type = system_eol_type;
5264 if (EQ (parent_eol_type, Qunix))
5265 coding_system = AREF (eol_type, 0);
5266 else if (EQ (parent_eol_type, Qdos))
5267 coding_system = AREF (eol_type, 1);
5268 else if (EQ (parent_eol_type, Qmac))
5269 coding_system = AREF (eol_type, 2);
5270 }
5271 return coding_system;
5272 }
5273
5274 /* Emacs has a mechanism to automatically detect a coding system if it
5275 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5276 it's impossible to distinguish some coding systems accurately
5277 because they use the same range of codes. So, at first, coding
5278 systems are categorized into 7, those are:
5279
5280 o coding-category-emacs-mule
5281
5282 The category for a coding system which has the same code range
5283 as Emacs' internal format. Assigned the coding-system (Lisp
5284 symbol) `emacs-mule' by default.
5285
5286 o coding-category-sjis
5287
5288 The category for a coding system which has the same code range
5289 as SJIS. Assigned the coding-system (Lisp
5290 symbol) `japanese-shift-jis' by default.
5291
5292 o coding-category-iso-7
5293
5294 The category for a coding system which has the same code range
5295 as ISO2022 of 7-bit environment. This doesn't use any locking
5296 shift and single shift functions. This can encode/decode all
5297 charsets. Assigned the coding-system (Lisp symbol)
5298 `iso-2022-7bit' by default.
5299
5300 o coding-category-iso-7-tight
5301
5302 Same as coding-category-iso-7 except that this can
5303 encode/decode only the specified charsets.
5304
5305 o coding-category-iso-8-1
5306
5307 The category for a coding system which has the same code range
5308 as ISO2022 of 8-bit environment and graphic plane 1 used only
5309 for DIMENSION1 charset. This doesn't use any locking shift
5310 and single shift functions. Assigned the coding-system (Lisp
5311 symbol) `iso-latin-1' by default.
5312
5313 o coding-category-iso-8-2
5314
5315 The category for a coding system which has the same code range
5316 as ISO2022 of 8-bit environment and graphic plane 1 used only
5317 for DIMENSION2 charset. This doesn't use any locking shift
5318 and single shift functions. Assigned the coding-system (Lisp
5319 symbol) `japanese-iso-8bit' by default.
5320
5321 o coding-category-iso-7-else
5322
5323 The category for a coding system which has the same code range
5324 as ISO2022 of 7-bit environemnt but uses locking shift or
5325 single shift functions. Assigned the coding-system (Lisp
5326 symbol) `iso-2022-7bit-lock' by default.
5327
5328 o coding-category-iso-8-else
5329
5330 The category for a coding system which has the same code range
5331 as ISO2022 of 8-bit environemnt but uses locking shift or
5332 single shift functions. Assigned the coding-system (Lisp
5333 symbol) `iso-2022-8bit-ss2' by default.
5334
5335 o coding-category-big5
5336
5337 The category for a coding system which has the same code range
5338 as BIG5. Assigned the coding-system (Lisp symbol)
5339 `cn-big5' by default.
5340
5341 o coding-category-utf-8
5342
5343 The category for a coding system which has the same code range
5344 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
5345 symbol) `utf-8' by default.
5346
5347 o coding-category-utf-16-be
5348
5349 The category for a coding system in which a text has an
5350 Unicode signature (cf. Unicode Standard) in the order of BIG
5351 endian at the head. Assigned the coding-system (Lisp symbol)
5352 `utf-16-be' by default.
5353
5354 o coding-category-utf-16-le
5355
5356 The category for a coding system in which a text has an
5357 Unicode signature (cf. Unicode Standard) in the order of
5358 LITTLE endian at the head. Assigned the coding-system (Lisp
5359 symbol) `utf-16-le' by default.
5360
5361 o coding-category-ccl
5362
5363 The category for a coding system of which encoder/decoder is
5364 written in CCL programs. The default value is nil, i.e., no
5365 coding system is assigned.
5366
5367 o coding-category-binary
5368
5369 The category for a coding system not categorized in any of the
5370 above. Assigned the coding-system (Lisp symbol)
5371 `no-conversion' by default.
5372
5373 Each of them is a Lisp symbol and the value is an actual
5374 `coding-system's (this is also a Lisp symbol) assigned by a user.
5375 What Emacs does actually is to detect a category of coding system.
5376 Then, it uses a `coding-system' assigned to it. If Emacs can't
5377 decide only one possible category, it selects a category of the
5378 highest priority. Priorities of categories are also specified by a
5379 user in a Lisp variable `coding-category-list'.
5380
5381 */
5382
5383 #define EOL_SEEN_NONE 0
5384 #define EOL_SEEN_LF 1
5385 #define EOL_SEEN_CR 2
5386 #define EOL_SEEN_CRLF 4
5387
5388 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5389 SOURCE is encoded. If CATEGORY is one of
5390 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5391 two-byte, else they are encoded by one-byte.
5392
5393 Return one of EOL_SEEN_XXX. */
5394
5395 #define MAX_EOL_CHECK_COUNT 3
5396
5397 static int
5398 detect_eol (source, src_bytes, category)
5399 const unsigned char *source;
5400 EMACS_INT src_bytes;
5401 enum coding_category category;
5402 {
5403 const unsigned char *src = source, *src_end = src + src_bytes;
5404 unsigned char c;
5405 int total = 0;
5406 int eol_seen = EOL_SEEN_NONE;
5407
5408 if ((1 << category) & CATEGORY_MASK_UTF_16)
5409 {
5410 int msb, lsb;
5411
5412 msb = category == (coding_category_utf_16_le
5413 | coding_category_utf_16_le_nosig);
5414 lsb = 1 - msb;
5415
5416 while (src + 1 < src_end)
5417 {
5418 c = src[lsb];
5419 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5420 {
5421 int this_eol;
5422
5423 if (c == '\n')
5424 this_eol = EOL_SEEN_LF;
5425 else if (src + 3 >= src_end
5426 || src[msb + 2] != 0
5427 || src[lsb + 2] != '\n')
5428 this_eol = EOL_SEEN_CR;
5429 else
5430 this_eol = EOL_SEEN_CRLF;
5431
5432 if (eol_seen == EOL_SEEN_NONE)
5433 /* This is the first end-of-line. */
5434 eol_seen = this_eol;
5435 else if (eol_seen != this_eol)
5436 {
5437 /* The found type is different from what found before. */
5438 eol_seen = EOL_SEEN_LF;
5439 break;
5440 }
5441 if (++total == MAX_EOL_CHECK_COUNT)
5442 break;
5443 }
5444 src += 2;
5445 }
5446 }
5447 else
5448 {
5449 while (src < src_end)
5450 {
5451 c = *src++;
5452 if (c == '\n' || c == '\r')
5453 {
5454 int this_eol;
5455
5456 if (c == '\n')
5457 this_eol = EOL_SEEN_LF;
5458 else if (src >= src_end || *src != '\n')
5459 this_eol = EOL_SEEN_CR;
5460 else
5461 this_eol = EOL_SEEN_CRLF, src++;
5462
5463 if (eol_seen == EOL_SEEN_NONE)
5464 /* This is the first end-of-line. */
5465 eol_seen = this_eol;
5466 else if (eol_seen != this_eol)
5467 {
5468 /* The found type is different from what found before. */
5469 eol_seen = EOL_SEEN_LF;
5470 break;
5471 }
5472 if (++total == MAX_EOL_CHECK_COUNT)
5473 break;
5474 }
5475 }
5476 }
5477 return eol_seen;
5478 }
5479
5480
5481 static Lisp_Object
5482 adjust_coding_eol_type (coding, eol_seen)
5483 struct coding_system *coding;
5484 int eol_seen;
5485 {
5486 Lisp_Object eol_type;
5487
5488 eol_type = CODING_ID_EOL_TYPE (coding->id);
5489 if (eol_seen & EOL_SEEN_LF)
5490 {
5491 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5492 eol_type = Qunix;
5493 }
5494 else if (eol_seen & EOL_SEEN_CRLF)
5495 {
5496 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5497 eol_type = Qdos;
5498 }
5499 else if (eol_seen & EOL_SEEN_CR)
5500 {
5501 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5502 eol_type = Qmac;
5503 }
5504 return eol_type;
5505 }
5506
5507 /* Detect how a text specified in CODING is encoded. If a coding
5508 system is detected, update fields of CODING by the detected coding
5509 system. */
5510
5511 void
5512 detect_coding (coding)
5513 struct coding_system *coding;
5514 {
5515 const unsigned char *src, *src_end;
5516
5517 coding->consumed = coding->consumed_char = 0;
5518 coding->produced = coding->produced_char = 0;
5519 coding_set_source (coding);
5520
5521 src_end = coding->source + coding->src_bytes;
5522
5523 /* If we have not yet decided the text encoding type, detect it
5524 now. */
5525 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5526 {
5527 int c, i;
5528 struct coding_detection_info detect_info;
5529
5530 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5531 for (i = 0, src = coding->source; src < src_end; i++, src++)
5532 {
5533 c = *src;
5534 if (c & 0x80)
5535 break;
5536 if (c < 0x20
5537 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5538 && ! inhibit_iso_escape_detection
5539 && ! detect_info.checked)
5540 {
5541 coding->head_ascii = src - (coding->source + coding->consumed);
5542 if (detect_coding_iso_2022 (coding, &detect_info))
5543 {
5544 /* We have scanned the whole data. */
5545 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5546 /* We didn't find an 8-bit code. */
5547 src = src_end;
5548 break;
5549 }
5550 }
5551 }
5552 coding->head_ascii = src - (coding->source + coding->consumed);
5553
5554 if (coding->head_ascii < coding->src_bytes
5555 || detect_info.found)
5556 {
5557 enum coding_category category;
5558 struct coding_system *this;
5559
5560 if (coding->head_ascii == coding->src_bytes)
5561 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5562 for (i = 0; i < coding_category_raw_text; i++)
5563 {
5564 category = coding_priorities[i];
5565 this = coding_categories + category;
5566 if (detect_info.found & (1 << category))
5567 break;
5568 }
5569 else
5570 for (i = 0; i < coding_category_raw_text; i++)
5571 {
5572 category = coding_priorities[i];
5573 this = coding_categories + category;
5574 if (this->id < 0)
5575 {
5576 /* No coding system of this category is defined. */
5577 detect_info.rejected |= (1 << category);
5578 }
5579 else if (category >= coding_category_raw_text)
5580 continue;
5581 else if (detect_info.checked & (1 << category))
5582 {
5583 if (detect_info.found & (1 << category))
5584 break;
5585 }
5586 else if ((*(this->detector)) (coding, &detect_info)
5587 && detect_info.found & (1 << category))
5588 {
5589 if (category == coding_category_utf_16_auto)
5590 {
5591 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5592 category = coding_category_utf_16_le;
5593 else
5594 category = coding_category_utf_16_be;
5595 }
5596 break;
5597 }
5598 }
5599
5600 if (i < coding_category_raw_text)
5601 setup_coding_system (CODING_ID_NAME (this->id), coding);
5602 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5603 setup_coding_system (Qraw_text, coding);
5604 else if (detect_info.rejected)
5605 for (i = 0; i < coding_category_raw_text; i++)
5606 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5607 {
5608 this = coding_categories + coding_priorities[i];
5609 setup_coding_system (CODING_ID_NAME (this->id), coding);
5610 break;
5611 }
5612 }
5613 }
5614 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5615 == coding_category_utf_16_auto)
5616 {
5617 Lisp_Object coding_systems;
5618 struct coding_detection_info detect_info;
5619
5620 coding_systems
5621 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5622 detect_info.found = detect_info.rejected = 0;
5623 if (CONSP (coding_systems)
5624 && detect_coding_utf_16 (coding, &detect_info))
5625 {
5626 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5627 setup_coding_system (XCAR (coding_systems), coding);
5628 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5629 setup_coding_system (XCDR (coding_systems), coding);
5630 }
5631 }
5632 }
5633
5634
5635 static void
5636 decode_eol (coding)
5637 struct coding_system *coding;
5638 {
5639 Lisp_Object eol_type;
5640 unsigned char *p, *pbeg, *pend;
5641
5642 eol_type = CODING_ID_EOL_TYPE (coding->id);
5643 if (EQ (eol_type, Qunix))
5644 return;
5645
5646 if (NILP (coding->dst_object))
5647 pbeg = coding->destination;
5648 else
5649 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5650 pend = pbeg + coding->produced;
5651
5652 if (VECTORP (eol_type))
5653 {
5654 int eol_seen = EOL_SEEN_NONE;
5655
5656 for (p = pbeg; p < pend; p++)
5657 {
5658 if (*p == '\n')
5659 eol_seen |= EOL_SEEN_LF;
5660 else if (*p == '\r')
5661 {
5662 if (p + 1 < pend && *(p + 1) == '\n')
5663 {
5664 eol_seen |= EOL_SEEN_CRLF;
5665 p++;
5666 }
5667 else
5668 eol_seen |= EOL_SEEN_CR;
5669 }
5670 }
5671 if (eol_seen != EOL_SEEN_NONE
5672 && eol_seen != EOL_SEEN_LF
5673 && eol_seen != EOL_SEEN_CRLF
5674 && eol_seen != EOL_SEEN_CR)
5675 eol_seen = EOL_SEEN_LF;
5676 if (eol_seen != EOL_SEEN_NONE)
5677 eol_type = adjust_coding_eol_type (coding, eol_seen);
5678 }
5679
5680 if (EQ (eol_type, Qmac))
5681 {
5682 for (p = pbeg; p < pend; p++)
5683 if (*p == '\r')
5684 *p = '\n';
5685 }
5686 else if (EQ (eol_type, Qdos))
5687 {
5688 int n = 0;
5689
5690 if (NILP (coding->dst_object))
5691 {
5692 /* Start deleting '\r' from the tail to minimize the memory
5693 movement. */
5694 for (p = pend - 2; p >= pbeg; p--)
5695 if (*p == '\r')
5696 {
5697 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5698 n++;
5699 }
5700 }
5701 else
5702 {
5703 int pos_byte = coding->dst_pos_byte;
5704 int pos = coding->dst_pos;
5705 int pos_end = pos + coding->produced_char - 1;
5706
5707 while (pos < pos_end)
5708 {
5709 p = BYTE_POS_ADDR (pos_byte);
5710 if (*p == '\r' && p[1] == '\n')
5711 {
5712 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5713 n++;
5714 pos_end--;
5715 }
5716 pos++;
5717 pos_byte += BYTES_BY_CHAR_HEAD (*p);
5718 }
5719 }
5720 coding->produced -= n;
5721 coding->produced_char -= n;
5722 }
5723 }
5724
5725
5726 /* Return a translation table (or list of them) from coding system
5727 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5728 decoding (ENCODEP is zero). */
5729
5730 static Lisp_Object
5731 get_translation_table (attrs, encodep, max_lookup)
5732 Lisp_Object attrs;
5733 int encodep, *max_lookup;
5734 {
5735 Lisp_Object standard, translation_table;
5736 Lisp_Object val;
5737
5738 if (encodep)
5739 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5740 standard = Vstandard_translation_table_for_encode;
5741 else
5742 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5743 standard = Vstandard_translation_table_for_decode;
5744 if (NILP (translation_table))
5745 translation_table = standard;
5746 else
5747 {
5748 if (SYMBOLP (translation_table))
5749 translation_table = Fget (translation_table, Qtranslation_table);
5750 else if (CONSP (translation_table))
5751 {
5752 translation_table = Fcopy_sequence (translation_table);
5753 for (val = translation_table; CONSP (val); val = XCDR (val))
5754 if (SYMBOLP (XCAR (val)))
5755 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5756 }
5757 if (CHAR_TABLE_P (standard))
5758 {
5759 if (CONSP (translation_table))
5760 translation_table = nconc2 (translation_table,
5761 Fcons (standard, Qnil));
5762 else
5763 translation_table = Fcons (translation_table,
5764 Fcons (standard, Qnil));
5765 }
5766 }
5767
5768 if (max_lookup)
5769 {
5770 *max_lookup = 1;
5771 if (CHAR_TABLE_P (translation_table)
5772 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5773 {
5774 val = XCHAR_TABLE (translation_table)->extras[1];
5775 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5776 *max_lookup = XFASTINT (val);
5777 }
5778 else if (CONSP (translation_table))
5779 {
5780 Lisp_Object tail, val;
5781
5782 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5783 if (CHAR_TABLE_P (XCAR (tail))
5784 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5785 {
5786 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5787 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5788 *max_lookup = XFASTINT (val);
5789 }
5790 }
5791 }
5792 return translation_table;
5793 }
5794
5795 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
5796 do { \
5797 trans = Qnil; \
5798 if (CHAR_TABLE_P (table)) \
5799 { \
5800 trans = CHAR_TABLE_REF (table, c); \
5801 if (CHARACTERP (trans)) \
5802 c = XFASTINT (trans), trans = Qnil; \
5803 } \
5804 else if (CONSP (table)) \
5805 { \
5806 Lisp_Object tail; \
5807 \
5808 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
5809 if (CHAR_TABLE_P (XCAR (tail))) \
5810 { \
5811 trans = CHAR_TABLE_REF (XCAR (tail), c); \
5812 if (CHARACTERP (trans)) \
5813 c = XFASTINT (trans), trans = Qnil; \
5814 else if (! NILP (trans)) \
5815 break; \
5816 } \
5817 } \
5818 } while (0)
5819
5820
5821 static Lisp_Object
5822 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5823 Lisp_Object val;
5824 int *buf, *buf_end;
5825 int last_block;
5826 int *from_nchars, *to_nchars;
5827 {
5828 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
5829 [TO-CHAR ...]. */
5830 if (CONSP (val))
5831 {
5832 Lisp_Object from, tail;
5833 int i, len;
5834
5835 for (tail = val; CONSP (tail); tail = XCDR (tail))
5836 {
5837 val = XCAR (tail);
5838 from = XCAR (val);
5839 len = ASIZE (from);
5840 for (i = 0; i < len; i++)
5841 {
5842 if (buf + i == buf_end)
5843 {
5844 if (! last_block)
5845 return Qt;
5846 break;
5847 }
5848 if (XINT (AREF (from, i)) != buf[i])
5849 break;
5850 }
5851 if (i == len)
5852 {
5853 val = XCDR (val);
5854 *from_nchars = len;
5855 break;
5856 }
5857 }
5858 if (! CONSP (tail))
5859 return Qnil;
5860 }
5861 if (VECTORP (val))
5862 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5863 else
5864 *buf = XINT (val);
5865 return val;
5866 }
5867
5868
5869 static int
5870 produce_chars (coding, translation_table, last_block)
5871 struct coding_system *coding;
5872 Lisp_Object translation_table;
5873 int last_block;
5874 {
5875 unsigned char *dst = coding->destination + coding->produced;
5876 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5877 int produced;
5878 int produced_chars = 0;
5879 int carryover = 0;
5880
5881 if (! coding->chars_at_source)
5882 {
5883 /* Characters are in coding->charbuf. */
5884 int *buf = coding->charbuf;
5885 int *buf_end = buf + coding->charbuf_used;
5886
5887 if (BUFFERP (coding->src_object)
5888 && EQ (coding->src_object, coding->dst_object))
5889 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5890
5891 while (buf < buf_end)
5892 {
5893 int c = *buf, i;
5894
5895 if (c >= 0)
5896 {
5897 int from_nchars = 1, to_nchars = 1;
5898 Lisp_Object trans = Qnil;
5899
5900 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5901 if (! NILP (trans))
5902 {
5903 trans = get_translation (trans, buf, buf_end, last_block,
5904 &from_nchars, &to_nchars);
5905 if (EQ (trans, Qt))
5906 break;
5907 c = *buf;
5908 }
5909
5910 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5911 {
5912 dst = alloc_destination (coding,
5913 buf_end - buf
5914 + MAX_MULTIBYTE_LENGTH * to_nchars,
5915 dst);
5916 dst_end = coding->destination + coding->dst_bytes;
5917 }
5918
5919 for (i = 0; i < to_nchars; i++)
5920 {
5921 if (i > 0)
5922 c = XINT (AREF (trans, i));
5923 if (coding->dst_multibyte
5924 || ! CHAR_BYTE8_P (c))
5925 CHAR_STRING_ADVANCE (c, dst);
5926 else
5927 *dst++ = CHAR_TO_BYTE8 (c);
5928 }
5929 produced_chars += to_nchars;
5930 *buf++ = to_nchars;
5931 while (--from_nchars > 0)
5932 *buf++ = 0;
5933 }
5934 else
5935 /* This is an annotation datum. (-C) is the length. */
5936 buf += -c;
5937 }
5938 carryover = buf_end - buf;
5939 }
5940 else
5941 {
5942 const unsigned char *src = coding->source;
5943 const unsigned char *src_end = src + coding->src_bytes;
5944 Lisp_Object eol_type;
5945
5946 eol_type = CODING_ID_EOL_TYPE (coding->id);
5947
5948 if (coding->src_multibyte != coding->dst_multibyte)
5949 {
5950 if (coding->src_multibyte)
5951 {
5952 int multibytep = 1;
5953 int consumed_chars;
5954
5955 while (1)
5956 {
5957 const unsigned char *src_base = src;
5958 int c;
5959
5960 ONE_MORE_BYTE (c);
5961 if (c == '\r')
5962 {
5963 if (EQ (eol_type, Qdos))
5964 {
5965 if (src == src_end)
5966 {
5967 record_conversion_result
5968 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5969 goto no_more_source;
5970 }
5971 if (*src == '\n')
5972 c = *src++;
5973 }
5974 else if (EQ (eol_type, Qmac))
5975 c = '\n';
5976 }
5977 if (dst == dst_end)
5978 {
5979 coding->consumed = src - coding->source;
5980
5981 if (EQ (coding->src_object, coding->dst_object))
5982 dst_end = (unsigned char *) src;
5983 if (dst == dst_end)
5984 {
5985 dst = alloc_destination (coding, src_end - src + 1,
5986 dst);
5987 dst_end = coding->destination + coding->dst_bytes;
5988 coding_set_source (coding);
5989 src = coding->source + coding->consumed;
5990 src_end = coding->source + coding->src_bytes;
5991 }
5992 }
5993 *dst++ = c;
5994 produced_chars++;
5995 }
5996 no_more_source:
5997 ;
5998 }
5999 else
6000 while (src < src_end)
6001 {
6002 int multibytep = 1;
6003 int c = *src++;
6004
6005 if (c == '\r')
6006 {
6007 if (EQ (eol_type, Qdos))
6008 {
6009 if (src < src_end
6010 && *src == '\n')
6011 c = *src++;
6012 }
6013 else if (EQ (eol_type, Qmac))
6014 c = '\n';
6015 }
6016 if (dst >= dst_end - 1)
6017 {
6018 coding->consumed = src - coding->source;
6019
6020 if (EQ (coding->src_object, coding->dst_object))
6021 dst_end = (unsigned char *) src;
6022 if (dst >= dst_end - 1)
6023 {
6024 dst = alloc_destination (coding, src_end - src + 2,
6025 dst);
6026 dst_end = coding->destination + coding->dst_bytes;
6027 coding_set_source (coding);
6028 src = coding->source + coding->consumed;
6029 src_end = coding->source + coding->src_bytes;
6030 }
6031 }
6032 EMIT_ONE_BYTE (c);
6033 }
6034 }
6035 else
6036 {
6037 if (!EQ (coding->src_object, coding->dst_object))
6038 {
6039 int require = coding->src_bytes - coding->dst_bytes;
6040
6041 if (require > 0)
6042 {
6043 EMACS_INT offset = src - coding->source;
6044
6045 dst = alloc_destination (coding, require, dst);
6046 coding_set_source (coding);
6047 src = coding->source + offset;
6048 src_end = coding->source + coding->src_bytes;
6049 }
6050 }
6051 produced_chars = coding->src_chars;
6052 while (src < src_end)
6053 {
6054 int c = *src++;
6055
6056 if (c == '\r')
6057 {
6058 if (EQ (eol_type, Qdos))
6059 {
6060 if (src < src_end
6061 && *src == '\n')
6062 c = *src++;
6063 produced_chars--;
6064 }
6065 else if (EQ (eol_type, Qmac))
6066 c = '\n';
6067 }
6068 *dst++ = c;
6069 }
6070 }
6071 coding->consumed = coding->src_bytes;
6072 coding->consumed_char = coding->src_chars;
6073 }
6074
6075 produced = dst - (coding->destination + coding->produced);
6076 if (BUFFERP (coding->dst_object))
6077 insert_from_gap (produced_chars, produced);
6078 coding->produced += produced;
6079 coding->produced_char += produced_chars;
6080 return carryover;
6081 }
6082
6083 /* Compose text in CODING->object according to the annotation data at
6084 CHARBUF. CHARBUF is an array:
6085 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6086 */
6087
6088 static INLINE void
6089 produce_composition (coding, charbuf, pos)
6090 struct coding_system *coding;
6091 int *charbuf;
6092 EMACS_INT pos;
6093 {
6094 int len;
6095 EMACS_INT to;
6096 enum composition_method method;
6097 Lisp_Object components;
6098
6099 len = -charbuf[0];
6100 to = pos + charbuf[2];
6101 if (to <= pos)
6102 return;
6103 method = (enum composition_method) (charbuf[3]);
6104
6105 if (method == COMPOSITION_RELATIVE)
6106 components = Qnil;
6107 else if (method >= COMPOSITION_WITH_RULE
6108 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6109 {
6110 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6111 int i;
6112
6113 len -= 4;
6114 charbuf += 4;
6115 for (i = 0; i < len; i++)
6116 {
6117 args[i] = make_number (charbuf[i]);
6118 if (charbuf[i] < 0)
6119 return;
6120 }
6121 components = (method == COMPOSITION_WITH_ALTCHARS
6122 ? Fstring (len, args) : Fvector (len, args));
6123 }
6124 else
6125 return;
6126 compose_text (pos, to, components, Qnil, coding->dst_object);
6127 }
6128
6129
6130 /* Put `charset' property on text in CODING->object according to
6131 the annotation data at CHARBUF. CHARBUF is an array:
6132 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6133 */
6134
6135 static INLINE void
6136 produce_charset (coding, charbuf, pos)
6137 struct coding_system *coding;
6138 int *charbuf;
6139 EMACS_INT pos;
6140 {
6141 EMACS_INT from = pos - charbuf[2];
6142 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6143
6144 Fput_text_property (make_number (from), make_number (pos),
6145 Qcharset, CHARSET_NAME (charset),
6146 coding->dst_object);
6147 }
6148
6149
6150 #define CHARBUF_SIZE 0x4000
6151
6152 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6153 do { \
6154 int size = CHARBUF_SIZE;; \
6155 \
6156 coding->charbuf = NULL; \
6157 while (size > 1024) \
6158 { \
6159 coding->charbuf = (int *) alloca (sizeof (int) * size); \
6160 if (coding->charbuf) \
6161 break; \
6162 size >>= 1; \
6163 } \
6164 if (! coding->charbuf) \
6165 { \
6166 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6167 return coding->result; \
6168 } \
6169 coding->charbuf_size = size; \
6170 } while (0)
6171
6172
6173 static void
6174 produce_annotation (coding, pos)
6175 struct coding_system *coding;
6176 EMACS_INT pos;
6177 {
6178 int *charbuf = coding->charbuf;
6179 int *charbuf_end = charbuf + coding->charbuf_used;
6180
6181 if (NILP (coding->dst_object))
6182 return;
6183
6184 while (charbuf < charbuf_end)
6185 {
6186 if (*charbuf >= 0)
6187 pos += *charbuf++;
6188 else
6189 {
6190 int len = -*charbuf;
6191 switch (charbuf[1])
6192 {
6193 case CODING_ANNOTATE_COMPOSITION_MASK:
6194 produce_composition (coding, charbuf, pos);
6195 break;
6196 case CODING_ANNOTATE_CHARSET_MASK:
6197 produce_charset (coding, charbuf, pos);
6198 break;
6199 default:
6200 abort ();
6201 }
6202 charbuf += len;
6203 }
6204 }
6205 }
6206
6207 /* Decode the data at CODING->src_object into CODING->dst_object.
6208 CODING->src_object is a buffer, a string, or nil.
6209 CODING->dst_object is a buffer.
6210
6211 If CODING->src_object is a buffer, it must be the current buffer.
6212 In this case, if CODING->src_pos is positive, it is a position of
6213 the source text in the buffer, otherwise, the source text is in the
6214 gap area of the buffer, and CODING->src_pos specifies the offset of
6215 the text from GPT (which must be the same as PT). If this is the
6216 same buffer as CODING->dst_object, CODING->src_pos must be
6217 negative.
6218
6219 If CODING->src_object is a string, CODING->src_pos is an index to
6220 that string.
6221
6222 If CODING->src_object is nil, CODING->source must already point to
6223 the non-relocatable memory area. In this case, CODING->src_pos is
6224 an offset from CODING->source.
6225
6226 The decoded data is inserted at the current point of the buffer
6227 CODING->dst_object.
6228 */
6229
6230 static int
6231 decode_coding (coding)
6232 struct coding_system *coding;
6233 {
6234 Lisp_Object attrs;
6235 Lisp_Object undo_list;
6236 Lisp_Object translation_table;
6237 int carryover;
6238 int i;
6239
6240 if (BUFFERP (coding->src_object)
6241 && coding->src_pos > 0
6242 && coding->src_pos < GPT
6243 && coding->src_pos + coding->src_chars > GPT)
6244 move_gap_both (coding->src_pos, coding->src_pos_byte);
6245
6246 undo_list = Qt;
6247 if (BUFFERP (coding->dst_object))
6248 {
6249 if (current_buffer != XBUFFER (coding->dst_object))
6250 set_buffer_internal (XBUFFER (coding->dst_object));
6251 if (GPT != PT)
6252 move_gap_both (PT, PT_BYTE);
6253 undo_list = current_buffer->undo_list;
6254 current_buffer->undo_list = Qt;
6255 }
6256
6257 coding->consumed = coding->consumed_char = 0;
6258 coding->produced = coding->produced_char = 0;
6259 coding->chars_at_source = 0;
6260 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6261 coding->errors = 0;
6262
6263 ALLOC_CONVERSION_WORK_AREA (coding);
6264
6265 attrs = CODING_ID_ATTRS (coding->id);
6266 translation_table = get_translation_table (attrs, 0, NULL);
6267
6268 carryover = 0;
6269 do
6270 {
6271 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6272
6273 coding_set_source (coding);
6274 coding->annotated = 0;
6275 coding->charbuf_used = carryover;
6276 (*(coding->decoder)) (coding);
6277 coding_set_destination (coding);
6278 carryover = produce_chars (coding, translation_table, 0);
6279 if (coding->annotated)
6280 produce_annotation (coding, pos);
6281 for (i = 0; i < carryover; i++)
6282 coding->charbuf[i]
6283 = coding->charbuf[coding->charbuf_used - carryover + i];
6284 }
6285 while (coding->consumed < coding->src_bytes
6286 && (coding->result == CODING_RESULT_SUCCESS
6287 || coding->result == CODING_RESULT_INVALID_SRC));
6288
6289 if (carryover > 0)
6290 {
6291 coding_set_destination (coding);
6292 coding->charbuf_used = carryover;
6293 produce_chars (coding, translation_table, 1);
6294 }
6295
6296 coding->carryover_bytes = 0;
6297 if (coding->consumed < coding->src_bytes)
6298 {
6299 int nbytes = coding->src_bytes - coding->consumed;
6300 const unsigned char *src;
6301
6302 coding_set_source (coding);
6303 coding_set_destination (coding);
6304 src = coding->source + coding->consumed;
6305
6306 if (coding->mode & CODING_MODE_LAST_BLOCK)
6307 {
6308 /* Flush out unprocessed data as binary chars. We are sure
6309 that the number of data is less than the size of
6310 coding->charbuf. */
6311 coding->charbuf_used = 0;
6312 while (nbytes-- > 0)
6313 {
6314 int c = *src++;
6315
6316 if (c & 0x80)
6317 c = BYTE8_TO_CHAR (c);
6318 coding->charbuf[coding->charbuf_used++] = c;
6319 }
6320 produce_chars (coding, Qnil, 1);
6321 }
6322 else
6323 {
6324 /* Record unprocessed bytes in coding->carryover. We are
6325 sure that the number of data is less than the size of
6326 coding->carryover. */
6327 unsigned char *p = coding->carryover;
6328
6329 coding->carryover_bytes = nbytes;
6330 while (nbytes-- > 0)
6331 *p++ = *src++;
6332 }
6333 coding->consumed = coding->src_bytes;
6334 }
6335
6336 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6337 decode_eol (coding);
6338 if (BUFFERP (coding->dst_object))
6339 {
6340 current_buffer->undo_list = undo_list;
6341 record_insert (coding->dst_pos, coding->produced_char);
6342 }
6343 return coding->result;
6344 }
6345
6346
6347 /* Extract an annotation datum from a composition starting at POS and
6348 ending before LIMIT of CODING->src_object (buffer or string), store
6349 the data in BUF, set *STOP to a starting position of the next
6350 composition (if any) or to LIMIT, and return the address of the
6351 next element of BUF.
6352
6353 If such an annotation is not found, set *STOP to a starting
6354 position of a composition after POS (if any) or to LIMIT, and
6355 return BUF. */
6356
6357 static INLINE int *
6358 handle_composition_annotation (pos, limit, coding, buf, stop)
6359 EMACS_INT pos, limit;
6360 struct coding_system *coding;
6361 int *buf;
6362 EMACS_INT *stop;
6363 {
6364 EMACS_INT start, end;
6365 Lisp_Object prop;
6366
6367 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6368 || end > limit)
6369 *stop = limit;
6370 else if (start > pos)
6371 *stop = start;
6372 else
6373 {
6374 if (start == pos)
6375 {
6376 /* We found a composition. Store the corresponding
6377 annotation data in BUF. */
6378 int *head = buf;
6379 enum composition_method method = COMPOSITION_METHOD (prop);
6380 int nchars = COMPOSITION_LENGTH (prop);
6381
6382 ADD_COMPOSITION_DATA (buf, nchars, method);
6383 if (method != COMPOSITION_RELATIVE)
6384 {
6385 Lisp_Object components;
6386 int len, i, i_byte;
6387
6388 components = COMPOSITION_COMPONENTS (prop);
6389 if (VECTORP (components))
6390 {
6391 len = XVECTOR (components)->size;
6392 for (i = 0; i < len; i++)
6393 *buf++ = XINT (AREF (components, i));
6394 }
6395 else if (STRINGP (components))
6396 {
6397 len = SCHARS (components);
6398 i = i_byte = 0;
6399 while (i < len)
6400 {
6401 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6402 buf++;
6403 }
6404 }
6405 else if (INTEGERP (components))
6406 {
6407 len = 1;
6408 *buf++ = XINT (components);
6409 }
6410 else if (CONSP (components))
6411 {
6412 for (len = 0; CONSP (components);
6413 len++, components = XCDR (components))
6414 *buf++ = XINT (XCAR (components));
6415 }
6416 else
6417 abort ();
6418 *head -= len;
6419 }
6420 }
6421
6422 if (find_composition (end, limit, &start, &end, &prop,
6423 coding->src_object)
6424 && end <= limit)
6425 *stop = start;
6426 else
6427 *stop = limit;
6428 }
6429 return buf;
6430 }
6431
6432
6433 /* Extract an annotation datum from a text property `charset' at POS of
6434 CODING->src_object (buffer of string), store the data in BUF, set
6435 *STOP to the position where the value of `charset' property changes
6436 (limiting by LIMIT), and return the address of the next element of
6437 BUF.
6438
6439 If the property value is nil, set *STOP to the position where the
6440 property value is non-nil (limiting by LIMIT), and return BUF. */
6441
6442 static INLINE int *
6443 handle_charset_annotation (pos, limit, coding, buf, stop)
6444 EMACS_INT pos, limit;
6445 struct coding_system *coding;
6446 int *buf;
6447 EMACS_INT *stop;
6448 {
6449 Lisp_Object val, next;
6450 int id;
6451
6452 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6453 if (! NILP (val) && CHARSETP (val))
6454 id = XINT (CHARSET_SYMBOL_ID (val));
6455 else
6456 id = -1;
6457 ADD_CHARSET_DATA (buf, 0, id);
6458 next = Fnext_single_property_change (make_number (pos), Qcharset,
6459 coding->src_object,
6460 make_number (limit));
6461 *stop = XINT (next);
6462 return buf;
6463 }
6464
6465
6466 static void
6467 consume_chars (coding, translation_table, max_lookup)
6468 struct coding_system *coding;
6469 Lisp_Object translation_table;
6470 int max_lookup;
6471 {
6472 int *buf = coding->charbuf;
6473 int *buf_end = coding->charbuf + coding->charbuf_size;
6474 const unsigned char *src = coding->source + coding->consumed;
6475 const unsigned char *src_end = coding->source + coding->src_bytes;
6476 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6477 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6478 int multibytep = coding->src_multibyte;
6479 Lisp_Object eol_type;
6480 int c;
6481 EMACS_INT stop, stop_composition, stop_charset;
6482 int *lookup_buf = NULL;
6483
6484 if (! NILP (translation_table))
6485 lookup_buf = alloca (sizeof (int) * max_lookup);
6486
6487 eol_type = CODING_ID_EOL_TYPE (coding->id);
6488 if (VECTORP (eol_type))
6489 eol_type = Qunix;
6490
6491 /* Note: composition handling is not yet implemented. */
6492 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6493
6494 if (NILP (coding->src_object))
6495 stop = stop_composition = stop_charset = end_pos;
6496 else
6497 {
6498 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6499 stop = stop_composition = pos;
6500 else
6501 stop = stop_composition = end_pos;
6502 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6503 stop = stop_charset = pos;
6504 else
6505 stop_charset = end_pos;
6506 }
6507
6508 /* Compensate for CRLF and conversion. */
6509 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6510 while (buf < buf_end)
6511 {
6512 Lisp_Object trans;
6513
6514 if (pos == stop)
6515 {
6516 if (pos == end_pos)
6517 break;
6518 if (pos == stop_composition)
6519 buf = handle_composition_annotation (pos, end_pos, coding,
6520 buf, &stop_composition);
6521 if (pos == stop_charset)
6522 buf = handle_charset_annotation (pos, end_pos, coding,
6523 buf, &stop_charset);
6524 stop = (stop_composition < stop_charset
6525 ? stop_composition : stop_charset);
6526 }
6527
6528 if (! multibytep)
6529 {
6530 EMACS_INT bytes;
6531
6532 if (coding->encoder == encode_coding_raw_text)
6533 c = *src++, pos++;
6534 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6535 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6536 else
6537 c = BYTE8_TO_CHAR (*src), src++, pos++;
6538 }
6539 else
6540 c = STRING_CHAR_ADVANCE (src), pos++;
6541 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6542 c = '\n';
6543 if (! EQ (eol_type, Qunix))
6544 {
6545 if (c == '\n')
6546 {
6547 if (EQ (eol_type, Qdos))
6548 *buf++ = '\r';
6549 else
6550 c = '\r';
6551 }
6552 }
6553
6554 trans = Qnil;
6555 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6556 if (NILP (trans))
6557 *buf++ = c;
6558 else
6559 {
6560 int from_nchars = 1, to_nchars = 1;
6561 int *lookup_buf_end;
6562 const unsigned char *p = src;
6563 int i;
6564
6565 lookup_buf[0] = c;
6566 for (i = 1; i < max_lookup && p < src_end; i++)
6567 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6568 lookup_buf_end = lookup_buf + i;
6569 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6570 &from_nchars, &to_nchars);
6571 if (EQ (trans, Qt)
6572 || buf + to_nchars > buf_end)
6573 break;
6574 *buf++ = *lookup_buf;
6575 for (i = 1; i < to_nchars; i++)
6576 *buf++ = XINT (AREF (trans, i));
6577 for (i = 1; i < from_nchars; i++, pos++)
6578 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6579 }
6580 }
6581
6582 coding->consumed = src - coding->source;
6583 coding->consumed_char = pos - coding->src_pos;
6584 coding->charbuf_used = buf - coding->charbuf;
6585 coding->chars_at_source = 0;
6586 }
6587
6588
6589 /* Encode the text at CODING->src_object into CODING->dst_object.
6590 CODING->src_object is a buffer or a string.
6591 CODING->dst_object is a buffer or nil.
6592
6593 If CODING->src_object is a buffer, it must be the current buffer.
6594 In this case, if CODING->src_pos is positive, it is a position of
6595 the source text in the buffer, otherwise. the source text is in the
6596 gap area of the buffer, and coding->src_pos specifies the offset of
6597 the text from GPT (which must be the same as PT). If this is the
6598 same buffer as CODING->dst_object, CODING->src_pos must be
6599 negative and CODING should not have `pre-write-conversion'.
6600
6601 If CODING->src_object is a string, CODING should not have
6602 `pre-write-conversion'.
6603
6604 If CODING->dst_object is a buffer, the encoded data is inserted at
6605 the current point of that buffer.
6606
6607 If CODING->dst_object is nil, the encoded data is placed at the
6608 memory area specified by CODING->destination. */
6609
6610 static int
6611 encode_coding (coding)
6612 struct coding_system *coding;
6613 {
6614 Lisp_Object attrs;
6615 Lisp_Object translation_table;
6616 int max_lookup;
6617
6618 attrs = CODING_ID_ATTRS (coding->id);
6619 if (coding->encoder == encode_coding_raw_text)
6620 translation_table = Qnil, max_lookup = 0;
6621 else
6622 translation_table = get_translation_table (attrs, 1, &max_lookup);
6623
6624 if (BUFFERP (coding->dst_object))
6625 {
6626 set_buffer_internal (XBUFFER (coding->dst_object));
6627 coding->dst_multibyte
6628 = ! NILP (current_buffer->enable_multibyte_characters);
6629 }
6630
6631 coding->consumed = coding->consumed_char = 0;
6632 coding->produced = coding->produced_char = 0;
6633 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6634 coding->errors = 0;
6635
6636 ALLOC_CONVERSION_WORK_AREA (coding);
6637
6638 do {
6639 coding_set_source (coding);
6640 consume_chars (coding, translation_table, max_lookup);
6641 coding_set_destination (coding);
6642 (*(coding->encoder)) (coding);
6643 } while (coding->consumed_char < coding->src_chars);
6644
6645 if (BUFFERP (coding->dst_object))
6646 insert_from_gap (coding->produced_char, coding->produced);
6647
6648 return (coding->result);
6649 }
6650
6651
6652 /* Name (or base name) of work buffer for code conversion. */
6653 static Lisp_Object Vcode_conversion_workbuf_name;
6654
6655 /* A working buffer used by the top level conversion. Once it is
6656 created, it is never destroyed. It has the name
6657 Vcode_conversion_workbuf_name. The other working buffers are
6658 destroyed after the use is finished, and their names are modified
6659 versions of Vcode_conversion_workbuf_name. */
6660 static Lisp_Object Vcode_conversion_reused_workbuf;
6661
6662 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6663 static int reused_workbuf_in_use;
6664
6665
6666 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6667 multibyteness of returning buffer. */
6668
6669 static Lisp_Object
6670 make_conversion_work_buffer (multibyte)
6671 int multibyte;
6672 {
6673 Lisp_Object name, workbuf;
6674 struct buffer *current;
6675
6676 if (reused_workbuf_in_use++)
6677 {
6678 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6679 workbuf = Fget_buffer_create (name);
6680 }
6681 else
6682 {
6683 name = Vcode_conversion_workbuf_name;
6684 workbuf = Fget_buffer_create (name);
6685 if (NILP (Vcode_conversion_reused_workbuf))
6686 Vcode_conversion_reused_workbuf = workbuf;
6687 }
6688 current = current_buffer;
6689 set_buffer_internal (XBUFFER (workbuf));
6690 Ferase_buffer ();
6691 current_buffer->undo_list = Qt;
6692 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6693 set_buffer_internal (current);
6694 return workbuf;
6695 }
6696
6697
6698 static Lisp_Object
6699 code_conversion_restore (arg)
6700 Lisp_Object arg;
6701 {
6702 Lisp_Object current, workbuf;
6703 struct gcpro gcpro1;
6704
6705 GCPRO1 (arg);
6706 current = XCAR (arg);
6707 workbuf = XCDR (arg);
6708 if (! NILP (workbuf))
6709 {
6710 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6711 reused_workbuf_in_use = 0;
6712 else if (! NILP (Fbuffer_live_p (workbuf)))
6713 Fkill_buffer (workbuf);
6714 }
6715 set_buffer_internal (XBUFFER (current));
6716 UNGCPRO;
6717 return Qnil;
6718 }
6719
6720 Lisp_Object
6721 code_conversion_save (with_work_buf, multibyte)
6722 int with_work_buf, multibyte;
6723 {
6724 Lisp_Object workbuf = Qnil;
6725
6726 if (with_work_buf)
6727 workbuf = make_conversion_work_buffer (multibyte);
6728 record_unwind_protect (code_conversion_restore,
6729 Fcons (Fcurrent_buffer (), workbuf));
6730 return workbuf;
6731 }
6732
6733 int
6734 decode_coding_gap (coding, chars, bytes)
6735 struct coding_system *coding;
6736 EMACS_INT chars, bytes;
6737 {
6738 int count = specpdl_ptr - specpdl;
6739 Lisp_Object attrs;
6740
6741 code_conversion_save (0, 0);
6742
6743 coding->src_object = Fcurrent_buffer ();
6744 coding->src_chars = chars;
6745 coding->src_bytes = bytes;
6746 coding->src_pos = -chars;
6747 coding->src_pos_byte = -bytes;
6748 coding->src_multibyte = chars < bytes;
6749 coding->dst_object = coding->src_object;
6750 coding->dst_pos = PT;
6751 coding->dst_pos_byte = PT_BYTE;
6752 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6753
6754 if (CODING_REQUIRE_DETECTION (coding))
6755 detect_coding (coding);
6756
6757 coding->mode |= CODING_MODE_LAST_BLOCK;
6758 decode_coding (coding);
6759
6760 attrs = CODING_ID_ATTRS (coding->id);
6761 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6762 {
6763 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6764 Lisp_Object val;
6765
6766 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6767 val = call1 (CODING_ATTR_POST_READ (attrs),
6768 make_number (coding->produced_char));
6769 CHECK_NATNUM (val);
6770 coding->produced_char += Z - prev_Z;
6771 coding->produced += Z_BYTE - prev_Z_BYTE;
6772 }
6773
6774 unbind_to (count, Qnil);
6775 return coding->result;
6776 }
6777
6778 int
6779 encode_coding_gap (coding, chars, bytes)
6780 struct coding_system *coding;
6781 EMACS_INT chars, bytes;
6782 {
6783 int count = specpdl_ptr - specpdl;
6784
6785 code_conversion_save (0, 0);
6786
6787 coding->src_object = Fcurrent_buffer ();
6788 coding->src_chars = chars;
6789 coding->src_bytes = bytes;
6790 coding->src_pos = -chars;
6791 coding->src_pos_byte = -bytes;
6792 coding->src_multibyte = chars < bytes;
6793 coding->dst_object = coding->src_object;
6794 coding->dst_pos = PT;
6795 coding->dst_pos_byte = PT_BYTE;
6796
6797 encode_coding (coding);
6798
6799 unbind_to (count, Qnil);
6800 return coding->result;
6801 }
6802
6803
6804 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6805 SRC_OBJECT into DST_OBJECT by coding context CODING.
6806
6807 SRC_OBJECT is a buffer, a string, or Qnil.
6808
6809 If it is a buffer, the text is at point of the buffer. FROM and TO
6810 are positions in the buffer.
6811
6812 If it is a string, the text is at the beginning of the string.
6813 FROM and TO are indices to the string.
6814
6815 If it is nil, the text is at coding->source. FROM and TO are
6816 indices to coding->source.
6817
6818 DST_OBJECT is a buffer, Qt, or Qnil.
6819
6820 If it is a buffer, the decoded text is inserted at point of the
6821 buffer. If the buffer is the same as SRC_OBJECT, the source text
6822 is deleted.
6823
6824 If it is Qt, a string is made from the decoded text, and
6825 set in CODING->dst_object.
6826
6827 If it is Qnil, the decoded text is stored at CODING->destination.
6828 The caller must allocate CODING->dst_bytes bytes at
6829 CODING->destination by xmalloc. If the decoded text is longer than
6830 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6831 */
6832
6833 void
6834 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6835 dst_object)
6836 struct coding_system *coding;
6837 Lisp_Object src_object;
6838 EMACS_INT from, from_byte, to, to_byte;
6839 Lisp_Object dst_object;
6840 {
6841 int count = specpdl_ptr - specpdl;
6842 unsigned char *destination;
6843 EMACS_INT dst_bytes;
6844 EMACS_INT chars = to - from;
6845 EMACS_INT bytes = to_byte - from_byte;
6846 Lisp_Object attrs;
6847 Lisp_Object buffer;
6848 int saved_pt = -1, saved_pt_byte;
6849
6850 buffer = Fcurrent_buffer ();
6851
6852 if (NILP (dst_object))
6853 {
6854 destination = coding->destination;
6855 dst_bytes = coding->dst_bytes;
6856 }
6857
6858 coding->src_object = src_object;
6859 coding->src_chars = chars;
6860 coding->src_bytes = bytes;
6861 coding->src_multibyte = chars < bytes;
6862
6863 if (STRINGP (src_object))
6864 {
6865 coding->src_pos = from;
6866 coding->src_pos_byte = from_byte;
6867 }
6868 else if (BUFFERP (src_object))
6869 {
6870 set_buffer_internal (XBUFFER (src_object));
6871 if (from != GPT)
6872 move_gap_both (from, from_byte);
6873 if (EQ (src_object, dst_object))
6874 {
6875 saved_pt = PT, saved_pt_byte = PT_BYTE;
6876 TEMP_SET_PT_BOTH (from, from_byte);
6877 del_range_both (from, from_byte, to, to_byte, 1);
6878 coding->src_pos = -chars;
6879 coding->src_pos_byte = -bytes;
6880 }
6881 else
6882 {
6883 coding->src_pos = from;
6884 coding->src_pos_byte = from_byte;
6885 }
6886 }
6887
6888 if (CODING_REQUIRE_DETECTION (coding))
6889 detect_coding (coding);
6890 attrs = CODING_ID_ATTRS (coding->id);
6891
6892 if (EQ (dst_object, Qt)
6893 || (! NILP (CODING_ATTR_POST_READ (attrs))
6894 && NILP (dst_object)))
6895 {
6896 coding->dst_object = code_conversion_save (1, 1);
6897 coding->dst_pos = BEG;
6898 coding->dst_pos_byte = BEG_BYTE;
6899 coding->dst_multibyte = 1;
6900 }
6901 else if (BUFFERP (dst_object))
6902 {
6903 code_conversion_save (0, 0);
6904 coding->dst_object = dst_object;
6905 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6906 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6907 coding->dst_multibyte
6908 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6909 }
6910 else
6911 {
6912 code_conversion_save (0, 0);
6913 coding->dst_object = Qnil;
6914 coding->dst_multibyte = 1;
6915 }
6916
6917 decode_coding (coding);
6918
6919 if (BUFFERP (coding->dst_object))
6920 set_buffer_internal (XBUFFER (coding->dst_object));
6921
6922 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6923 {
6924 struct gcpro gcpro1, gcpro2;
6925 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6926 Lisp_Object val;
6927
6928 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6929 GCPRO2 (coding->src_object, coding->dst_object);
6930 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6931 make_number (coding->produced_char));
6932 UNGCPRO;
6933 CHECK_NATNUM (val);
6934 coding->produced_char += Z - prev_Z;
6935 coding->produced += Z_BYTE - prev_Z_BYTE;
6936 }
6937
6938 if (EQ (dst_object, Qt))
6939 {
6940 coding->dst_object = Fbuffer_string ();
6941 }
6942 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6943 {
6944 set_buffer_internal (XBUFFER (coding->dst_object));
6945 if (dst_bytes < coding->produced)
6946 {
6947 destination
6948 = (unsigned char *) xrealloc (destination, coding->produced);
6949 if (! destination)
6950 {
6951 record_conversion_result (coding,
6952 CODING_RESULT_INSUFFICIENT_DST);
6953 unbind_to (count, Qnil);
6954 return;
6955 }
6956 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6957 move_gap_both (BEGV, BEGV_BYTE);
6958 bcopy (BEGV_ADDR, destination, coding->produced);
6959 coding->destination = destination;
6960 }
6961 }
6962
6963 if (saved_pt >= 0)
6964 {
6965 /* This is the case of:
6966 (BUFFERP (src_object) && EQ (src_object, dst_object))
6967 As we have moved PT while replacing the original buffer
6968 contents, we must recover it now. */
6969 set_buffer_internal (XBUFFER (src_object));
6970 if (saved_pt < from)
6971 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6972 else if (saved_pt < from + chars)
6973 TEMP_SET_PT_BOTH (from, from_byte);
6974 else if (! NILP (current_buffer->enable_multibyte_characters))
6975 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6976 saved_pt_byte + (coding->produced - bytes));
6977 else
6978 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6979 saved_pt_byte + (coding->produced - bytes));
6980 }
6981
6982 unbind_to (count, coding->dst_object);
6983 }
6984
6985
6986 void
6987 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6988 dst_object)
6989 struct coding_system *coding;
6990 Lisp_Object src_object;
6991 EMACS_INT from, from_byte, to, to_byte;
6992 Lisp_Object dst_object;
6993 {
6994 int count = specpdl_ptr - specpdl;
6995 EMACS_INT chars = to - from;
6996 EMACS_INT bytes = to_byte - from_byte;
6997 Lisp_Object attrs;
6998 Lisp_Object buffer;
6999 int saved_pt = -1, saved_pt_byte;
7000 int kill_src_buffer = 0;
7001
7002 buffer = Fcurrent_buffer ();
7003
7004 coding->src_object = src_object;
7005 coding->src_chars = chars;
7006 coding->src_bytes = bytes;
7007 coding->src_multibyte = chars < bytes;
7008
7009 attrs = CODING_ID_ATTRS (coding->id);
7010
7011 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7012 {
7013 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7014 set_buffer_internal (XBUFFER (coding->src_object));
7015 if (STRINGP (src_object))
7016 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7017 else if (BUFFERP (src_object))
7018 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7019 else
7020 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7021
7022 if (EQ (src_object, dst_object))
7023 {
7024 set_buffer_internal (XBUFFER (src_object));
7025 saved_pt = PT, saved_pt_byte = PT_BYTE;
7026 del_range_both (from, from_byte, to, to_byte, 1);
7027 set_buffer_internal (XBUFFER (coding->src_object));
7028 }
7029
7030 {
7031 Lisp_Object args[3];
7032
7033 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7034 args[1] = make_number (BEG);
7035 args[2] = make_number (Z);
7036 safe_call (3, args);
7037 }
7038 if (XBUFFER (coding->src_object) != current_buffer)
7039 kill_src_buffer = 1;
7040 coding->src_object = Fcurrent_buffer ();
7041 if (BEG != GPT)
7042 move_gap_both (BEG, BEG_BYTE);
7043 coding->src_chars = Z - BEG;
7044 coding->src_bytes = Z_BYTE - BEG_BYTE;
7045 coding->src_pos = BEG;
7046 coding->src_pos_byte = BEG_BYTE;
7047 coding->src_multibyte = Z < Z_BYTE;
7048 }
7049 else if (STRINGP (src_object))
7050 {
7051 code_conversion_save (0, 0);
7052 coding->src_pos = from;
7053 coding->src_pos_byte = from_byte;
7054 }
7055 else if (BUFFERP (src_object))
7056 {
7057 code_conversion_save (0, 0);
7058 set_buffer_internal (XBUFFER (src_object));
7059 if (EQ (src_object, dst_object))
7060 {
7061 saved_pt = PT, saved_pt_byte = PT_BYTE;
7062 coding->src_object = del_range_1 (from, to, 1, 1);
7063 coding->src_pos = 0;
7064 coding->src_pos_byte = 0;
7065 }
7066 else
7067 {
7068 if (from < GPT && to >= GPT)
7069 move_gap_both (from, from_byte);
7070 coding->src_pos = from;
7071 coding->src_pos_byte = from_byte;
7072 }
7073 }
7074 else
7075 code_conversion_save (0, 0);
7076
7077 if (BUFFERP (dst_object))
7078 {
7079 coding->dst_object = dst_object;
7080 if (EQ (src_object, dst_object))
7081 {
7082 coding->dst_pos = from;
7083 coding->dst_pos_byte = from_byte;
7084 }
7085 else
7086 {
7087 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7088 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7089 }
7090 coding->dst_multibyte
7091 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7092 }
7093 else if (EQ (dst_object, Qt))
7094 {
7095 coding->dst_object = Qnil;
7096 coding->dst_bytes = coding->src_chars;
7097 if (coding->dst_bytes == 0)
7098 coding->dst_bytes = 1;
7099 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7100 coding->dst_multibyte = 0;
7101 }
7102 else
7103 {
7104 coding->dst_object = Qnil;
7105 coding->dst_multibyte = 0;
7106 }
7107
7108 encode_coding (coding);
7109
7110 if (EQ (dst_object, Qt))
7111 {
7112 if (BUFFERP (coding->dst_object))
7113 coding->dst_object = Fbuffer_string ();
7114 else
7115 {
7116 coding->dst_object
7117 = make_unibyte_string ((char *) coding->destination,
7118 coding->produced);
7119 xfree (coding->destination);
7120 }
7121 }
7122
7123 if (saved_pt >= 0)
7124 {
7125 /* This is the case of:
7126 (BUFFERP (src_object) && EQ (src_object, dst_object))
7127 As we have moved PT while replacing the original buffer
7128 contents, we must recover it now. */
7129 set_buffer_internal (XBUFFER (src_object));
7130 if (saved_pt < from)
7131 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7132 else if (saved_pt < from + chars)
7133 TEMP_SET_PT_BOTH (from, from_byte);
7134 else if (! NILP (current_buffer->enable_multibyte_characters))
7135 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7136 saved_pt_byte + (coding->produced - bytes));
7137 else
7138 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7139 saved_pt_byte + (coding->produced - bytes));
7140 }
7141
7142 if (kill_src_buffer)
7143 Fkill_buffer (coding->src_object);
7144 unbind_to (count, Qnil);
7145 }
7146
7147
7148 Lisp_Object
7149 preferred_coding_system ()
7150 {
7151 int id = coding_categories[coding_priorities[0]].id;
7152
7153 return CODING_ID_NAME (id);
7154 }
7155
7156 \f
7157 #ifdef emacs
7158 /*** 8. Emacs Lisp library functions ***/
7159
7160 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7161 doc: /* Return t if OBJECT is nil or a coding-system.
7162 See the documentation of `define-coding-system' for information
7163 about coding-system objects. */)
7164 (obj)
7165 Lisp_Object obj;
7166 {
7167 if (NILP (obj)
7168 || CODING_SYSTEM_ID (obj) >= 0)
7169 return Qt;
7170 if (! SYMBOLP (obj)
7171 || NILP (Fget (obj, Qcoding_system_define_form)))
7172 return Qnil;
7173 return Qt;
7174 }
7175
7176 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7177 Sread_non_nil_coding_system, 1, 1, 0,
7178 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7179 (prompt)
7180 Lisp_Object prompt;
7181 {
7182 Lisp_Object val;
7183 do
7184 {
7185 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7186 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7187 }
7188 while (SCHARS (val) == 0);
7189 return (Fintern (val, Qnil));
7190 }
7191
7192 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7193 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7194 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
7195 (prompt, default_coding_system)
7196 Lisp_Object prompt, default_coding_system;
7197 {
7198 Lisp_Object val;
7199 if (SYMBOLP (default_coding_system))
7200 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7201 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7202 Qt, Qnil, Qcoding_system_history,
7203 default_coding_system, Qnil);
7204 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7205 }
7206
7207 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7208 1, 1, 0,
7209 doc: /* Check validity of CODING-SYSTEM.
7210 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7211 It is valid if it is nil or a symbol defined as a coding system by the
7212 function `define-coding-system'. */)
7213 (coding_system)
7214 Lisp_Object coding_system;
7215 {
7216 Lisp_Object define_form;
7217
7218 define_form = Fget (coding_system, Qcoding_system_define_form);
7219 if (! NILP (define_form))
7220 {
7221 Fput (coding_system, Qcoding_system_define_form, Qnil);
7222 safe_eval (define_form);
7223 }
7224 if (!NILP (Fcoding_system_p (coding_system)))
7225 return coding_system;
7226 xsignal1 (Qcoding_system_error, coding_system);
7227 }
7228
7229 \f
7230 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7231 HIGHEST is nonzero, return the coding system of the highest
7232 priority among the detected coding systems. Otherwize return a
7233 list of detected coding systems sorted by their priorities. If
7234 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7235 multibyte form but contains only ASCII and eight-bit chars.
7236 Otherwise, the bytes are raw bytes.
7237
7238 CODING-SYSTEM controls the detection as below:
7239
7240 If it is nil, detect both text-format and eol-format. If the
7241 text-format part of CODING-SYSTEM is already specified
7242 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7243 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7244 detect only text-format. */
7245
7246 Lisp_Object
7247 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7248 coding_system)
7249 const unsigned char *src;
7250 int src_chars, src_bytes, highest;
7251 int multibytep;
7252 Lisp_Object coding_system;
7253 {
7254 const unsigned char *src_end = src + src_bytes;
7255 Lisp_Object attrs, eol_type;
7256 Lisp_Object val;
7257 struct coding_system coding;
7258 int id;
7259 struct coding_detection_info detect_info;
7260 enum coding_category base_category;
7261
7262 if (NILP (coding_system))
7263 coding_system = Qundecided;
7264 setup_coding_system (coding_system, &coding);
7265 attrs = CODING_ID_ATTRS (coding.id);
7266 eol_type = CODING_ID_EOL_TYPE (coding.id);
7267 coding_system = CODING_ATTR_BASE_NAME (attrs);
7268
7269 coding.source = src;
7270 coding.src_chars = src_chars;
7271 coding.src_bytes = src_bytes;
7272 coding.src_multibyte = multibytep;
7273 coding.consumed = 0;
7274 coding.mode |= CODING_MODE_LAST_BLOCK;
7275
7276 detect_info.checked = detect_info.found = detect_info.rejected = 0;
7277
7278 /* At first, detect text-format if necessary. */
7279 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7280 if (base_category == coding_category_undecided)
7281 {
7282 enum coding_category category;
7283 struct coding_system *this;
7284 int c, i;
7285
7286 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7287 for (i = 0; src < src_end; i++, src++)
7288 {
7289 c = *src;
7290 if (c & 0x80)
7291 break;
7292 if (c < 0x20
7293 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7294 && ! inhibit_iso_escape_detection)
7295 {
7296 coding.head_ascii = src - coding.source;
7297 if (detect_coding_iso_2022 (&coding, &detect_info))
7298 {
7299 /* We have scanned the whole data. */
7300 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7301 /* We didn't find an 8-bit code. */
7302 src = src_end;
7303 break;
7304 }
7305 }
7306 }
7307 coding.head_ascii = src - coding.source;
7308
7309 if (src < src_end
7310 || detect_info.found)
7311 {
7312 if (src == src_end)
7313 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7314 for (i = 0; i < coding_category_raw_text; i++)
7315 {
7316 category = coding_priorities[i];
7317 this = coding_categories + category;
7318 if (detect_info.found & (1 << category))
7319 break;
7320 }
7321 else
7322 for (i = 0; i < coding_category_raw_text; i++)
7323 {
7324 category = coding_priorities[i];
7325 this = coding_categories + category;
7326
7327 if (this->id < 0)
7328 {
7329 /* No coding system of this category is defined. */
7330 detect_info.rejected |= (1 << category);
7331 }
7332 else if (category >= coding_category_raw_text)
7333 continue;
7334 else if (detect_info.checked & (1 << category))
7335 {
7336 if (highest
7337 && (detect_info.found & (1 << category)))
7338 break;
7339 }
7340 else
7341 {
7342 if ((*(this->detector)) (&coding, &detect_info)
7343 && highest
7344 && (detect_info.found & (1 << category)))
7345 {
7346 if (category == coding_category_utf_16_auto)
7347 {
7348 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7349 category = coding_category_utf_16_le;
7350 else
7351 category = coding_category_utf_16_be;
7352 }
7353 break;
7354 }
7355 }
7356 }
7357 }
7358
7359 if (detect_info.rejected == CATEGORY_MASK_ANY)
7360 {
7361 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7362 id = coding_categories[coding_category_raw_text].id;
7363 val = Fcons (make_number (id), Qnil);
7364 }
7365 else if (! detect_info.rejected && ! detect_info.found)
7366 {
7367 detect_info.found = CATEGORY_MASK_ANY;
7368 id = coding_categories[coding_category_undecided].id;
7369 val = Fcons (make_number (id), Qnil);
7370 }
7371 else if (highest)
7372 {
7373 if (detect_info.found)
7374 {
7375 detect_info.found = 1 << category;
7376 val = Fcons (make_number (this->id), Qnil);
7377 }
7378 else
7379 for (i = 0; i < coding_category_raw_text; i++)
7380 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7381 {
7382 detect_info.found = 1 << coding_priorities[i];
7383 id = coding_categories[coding_priorities[i]].id;
7384 val = Fcons (make_number (id), Qnil);
7385 break;
7386 }
7387 }
7388 else
7389 {
7390 int mask = detect_info.rejected | detect_info.found;
7391 int found = 0;
7392 val = Qnil;
7393
7394 for (i = coding_category_raw_text - 1; i >= 0; i--)
7395 {
7396 category = coding_priorities[i];
7397 if (! (mask & (1 << category)))
7398 {
7399 found |= 1 << category;
7400 id = coding_categories[category].id;
7401 if (id >= 0)
7402 val = Fcons (make_number (id), val);
7403 }
7404 }
7405 for (i = coding_category_raw_text - 1; i >= 0; i--)
7406 {
7407 category = coding_priorities[i];
7408 if (detect_info.found & (1 << category))
7409 {
7410 id = coding_categories[category].id;
7411 val = Fcons (make_number (id), val);
7412 }
7413 }
7414 detect_info.found |= found;
7415 }
7416 }
7417 else if (base_category == coding_category_utf_16_auto)
7418 {
7419 if (detect_coding_utf_16 (&coding, &detect_info))
7420 {
7421 struct coding_system *this;
7422
7423 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7424 this = coding_categories + coding_category_utf_16_le;
7425 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7426 this = coding_categories + coding_category_utf_16_be;
7427 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7428 this = coding_categories + coding_category_utf_16_be_nosig;
7429 else
7430 this = coding_categories + coding_category_utf_16_le_nosig;
7431 val = Fcons (make_number (this->id), Qnil);
7432 }
7433 }
7434 else
7435 {
7436 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7437 val = Fcons (make_number (coding.id), Qnil);
7438 }
7439
7440 /* Then, detect eol-format if necessary. */
7441 {
7442 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7443 Lisp_Object tail;
7444
7445 if (VECTORP (eol_type))
7446 {
7447 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7448 normal_eol = detect_eol (coding.source, src_bytes,
7449 coding_category_raw_text);
7450 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7451 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7452 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7453 coding_category_utf_16_be);
7454 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7455 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7456 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7457 coding_category_utf_16_le);
7458 }
7459 else
7460 {
7461 if (EQ (eol_type, Qunix))
7462 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7463 else if (EQ (eol_type, Qdos))
7464 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7465 else
7466 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7467 }
7468
7469 for (tail = val; CONSP (tail); tail = XCDR (tail))
7470 {
7471 enum coding_category category;
7472 int this_eol;
7473
7474 id = XINT (XCAR (tail));
7475 attrs = CODING_ID_ATTRS (id);
7476 category = XINT (CODING_ATTR_CATEGORY (attrs));
7477 eol_type = CODING_ID_EOL_TYPE (id);
7478 if (VECTORP (eol_type))
7479 {
7480 if (category == coding_category_utf_16_be
7481 || category == coding_category_utf_16_be_nosig)
7482 this_eol = utf_16_be_eol;
7483 else if (category == coding_category_utf_16_le
7484 || category == coding_category_utf_16_le_nosig)
7485 this_eol = utf_16_le_eol;
7486 else
7487 this_eol = normal_eol;
7488
7489 if (this_eol == EOL_SEEN_LF)
7490 XSETCAR (tail, AREF (eol_type, 0));
7491 else if (this_eol == EOL_SEEN_CRLF)
7492 XSETCAR (tail, AREF (eol_type, 1));
7493 else if (this_eol == EOL_SEEN_CR)
7494 XSETCAR (tail, AREF (eol_type, 2));
7495 else
7496 XSETCAR (tail, CODING_ID_NAME (id));
7497 }
7498 else
7499 XSETCAR (tail, CODING_ID_NAME (id));
7500 }
7501 }
7502
7503 return (highest ? XCAR (val) : val);
7504 }
7505
7506
7507 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7508 2, 3, 0,
7509 doc: /* Detect coding system of the text in the region between START and END.
7510 Return a list of possible coding systems ordered by priority.
7511
7512 If only ASCII characters are found (except for such ISO-2022 control
7513 characters ISO-2022 as ESC), it returns a list of single element
7514 `undecided' or its subsidiary coding system according to a detected
7515 end-of-line format.
7516
7517 If optional argument HIGHEST is non-nil, return the coding system of
7518 highest priority. */)
7519 (start, end, highest)
7520 Lisp_Object start, end, highest;
7521 {
7522 int from, to;
7523 int from_byte, to_byte;
7524
7525 CHECK_NUMBER_COERCE_MARKER (start);
7526 CHECK_NUMBER_COERCE_MARKER (end);
7527
7528 validate_region (&start, &end);
7529 from = XINT (start), to = XINT (end);
7530 from_byte = CHAR_TO_BYTE (from);
7531 to_byte = CHAR_TO_BYTE (to);
7532
7533 if (from < GPT && to >= GPT)
7534 move_gap_both (to, to_byte);
7535
7536 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7537 to - from, to_byte - from_byte,
7538 !NILP (highest),
7539 !NILP (current_buffer
7540 ->enable_multibyte_characters),
7541 Qnil);
7542 }
7543
7544 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7545 1, 2, 0,
7546 doc: /* Detect coding system of the text in STRING.
7547 Return a list of possible coding systems ordered by priority.
7548
7549 If only ASCII characters are found (except for such ISO-2022 control
7550 characters ISO-2022 as ESC), it returns a list of single element
7551 `undecided' or its subsidiary coding system according to a detected
7552 end-of-line format.
7553
7554 If optional argument HIGHEST is non-nil, return the coding system of
7555 highest priority. */)
7556 (string, highest)
7557 Lisp_Object string, highest;
7558 {
7559 CHECK_STRING (string);
7560
7561 return detect_coding_system (SDATA (string),
7562 SCHARS (string), SBYTES (string),
7563 !NILP (highest), STRING_MULTIBYTE (string),
7564 Qnil);
7565 }
7566
7567
7568 static INLINE int
7569 char_encodable_p (c, attrs)
7570 int c;
7571 Lisp_Object attrs;
7572 {
7573 Lisp_Object tail;
7574 struct charset *charset;
7575 Lisp_Object translation_table;
7576
7577 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7578 if (! NILP (translation_table))
7579 c = translate_char (translation_table, c);
7580 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7581 CONSP (tail); tail = XCDR (tail))
7582 {
7583 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7584 if (CHAR_CHARSET_P (c, charset))
7585 break;
7586 }
7587 return (! NILP (tail));
7588 }
7589
7590
7591 /* Return a list of coding systems that safely encode the text between
7592 START and END. If EXCLUDE is non-nil, it is a list of coding
7593 systems not to check. The returned list doesn't contain any such
7594 coding systems. In any case, if the text contains only ASCII or is
7595 unibyte, return t. */
7596
7597 DEFUN ("find-coding-systems-region-internal",
7598 Ffind_coding_systems_region_internal,
7599 Sfind_coding_systems_region_internal, 2, 3, 0,
7600 doc: /* Internal use only. */)
7601 (start, end, exclude)
7602 Lisp_Object start, end, exclude;
7603 {
7604 Lisp_Object coding_attrs_list, safe_codings;
7605 EMACS_INT start_byte, end_byte;
7606 const unsigned char *p, *pbeg, *pend;
7607 int c;
7608 Lisp_Object tail, elt;
7609
7610 if (STRINGP (start))
7611 {
7612 if (!STRING_MULTIBYTE (start)
7613 || SCHARS (start) == SBYTES (start))
7614 return Qt;
7615 start_byte = 0;
7616 end_byte = SBYTES (start);
7617 }
7618 else
7619 {
7620 CHECK_NUMBER_COERCE_MARKER (start);
7621 CHECK_NUMBER_COERCE_MARKER (end);
7622 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7623 args_out_of_range (start, end);
7624 if (NILP (current_buffer->enable_multibyte_characters))
7625 return Qt;
7626 start_byte = CHAR_TO_BYTE (XINT (start));
7627 end_byte = CHAR_TO_BYTE (XINT (end));
7628 if (XINT (end) - XINT (start) == end_byte - start_byte)
7629 return Qt;
7630
7631 if (XINT (start) < GPT && XINT (end) > GPT)
7632 {
7633 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7634 move_gap_both (XINT (start), start_byte);
7635 else
7636 move_gap_both (XINT (end), end_byte);
7637 }
7638 }
7639
7640 coding_attrs_list = Qnil;
7641 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7642 if (NILP (exclude)
7643 || NILP (Fmemq (XCAR (tail), exclude)))
7644 {
7645 Lisp_Object attrs;
7646
7647 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7648 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7649 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7650 {
7651 ASET (attrs, coding_attr_trans_tbl,
7652 get_translation_table (attrs, 1, NULL));
7653 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7654 }
7655 }
7656
7657 if (STRINGP (start))
7658 p = pbeg = SDATA (start);
7659 else
7660 p = pbeg = BYTE_POS_ADDR (start_byte);
7661 pend = p + (end_byte - start_byte);
7662
7663 while (p < pend && ASCII_BYTE_P (*p)) p++;
7664 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7665
7666 while (p < pend)
7667 {
7668 if (ASCII_BYTE_P (*p))
7669 p++;
7670 else
7671 {
7672 c = STRING_CHAR_ADVANCE (p);
7673
7674 charset_map_loaded = 0;
7675 for (tail = coding_attrs_list; CONSP (tail);)
7676 {
7677 elt = XCAR (tail);
7678 if (NILP (elt))
7679 tail = XCDR (tail);
7680 else if (char_encodable_p (c, elt))
7681 tail = XCDR (tail);
7682 else if (CONSP (XCDR (tail)))
7683 {
7684 XSETCAR (tail, XCAR (XCDR (tail)));
7685 XSETCDR (tail, XCDR (XCDR (tail)));
7686 }
7687 else
7688 {
7689 XSETCAR (tail, Qnil);
7690 tail = XCDR (tail);
7691 }
7692 }
7693 if (charset_map_loaded)
7694 {
7695 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7696
7697 if (STRINGP (start))
7698 pbeg = SDATA (start);
7699 else
7700 pbeg = BYTE_POS_ADDR (start_byte);
7701 p = pbeg + p_offset;
7702 pend = pbeg + pend_offset;
7703 }
7704 }
7705 }
7706
7707 safe_codings = list2 (Qraw_text, Qno_conversion);
7708 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7709 if (! NILP (XCAR (tail)))
7710 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7711
7712 return safe_codings;
7713 }
7714
7715
7716 DEFUN ("unencodable-char-position", Funencodable_char_position,
7717 Sunencodable_char_position, 3, 5, 0,
7718 doc: /*
7719 Return position of first un-encodable character in a region.
7720 START and END specfiy the region and CODING-SYSTEM specifies the
7721 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7722
7723 If optional 4th argument COUNT is non-nil, it specifies at most how
7724 many un-encodable characters to search. In this case, the value is a
7725 list of positions.
7726
7727 If optional 5th argument STRING is non-nil, it is a string to search
7728 for un-encodable characters. In that case, START and END are indexes
7729 to the string. */)
7730 (start, end, coding_system, count, string)
7731 Lisp_Object start, end, coding_system, count, string;
7732 {
7733 int n;
7734 struct coding_system coding;
7735 Lisp_Object attrs, charset_list, translation_table;
7736 Lisp_Object positions;
7737 int from, to;
7738 const unsigned char *p, *stop, *pend;
7739 int ascii_compatible;
7740
7741 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7742 attrs = CODING_ID_ATTRS (coding.id);
7743 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7744 return Qnil;
7745 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7746 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7747 translation_table = get_translation_table (attrs, 1, NULL);
7748
7749 if (NILP (string))
7750 {
7751 validate_region (&start, &end);
7752 from = XINT (start);
7753 to = XINT (end);
7754 if (NILP (current_buffer->enable_multibyte_characters)
7755 || (ascii_compatible
7756 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7757 return Qnil;
7758 p = CHAR_POS_ADDR (from);
7759 pend = CHAR_POS_ADDR (to);
7760 if (from < GPT && to >= GPT)
7761 stop = GPT_ADDR;
7762 else
7763 stop = pend;
7764 }
7765 else
7766 {
7767 CHECK_STRING (string);
7768 CHECK_NATNUM (start);
7769 CHECK_NATNUM (end);
7770 from = XINT (start);
7771 to = XINT (end);
7772 if (from > to
7773 || to > SCHARS (string))
7774 args_out_of_range_3 (string, start, end);
7775 if (! STRING_MULTIBYTE (string))
7776 return Qnil;
7777 p = SDATA (string) + string_char_to_byte (string, from);
7778 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7779 if (ascii_compatible && (to - from) == (pend - p))
7780 return Qnil;
7781 }
7782
7783 if (NILP (count))
7784 n = 1;
7785 else
7786 {
7787 CHECK_NATNUM (count);
7788 n = XINT (count);
7789 }
7790
7791 positions = Qnil;
7792 while (1)
7793 {
7794 int c;
7795
7796 if (ascii_compatible)
7797 while (p < stop && ASCII_BYTE_P (*p))
7798 p++, from++;
7799 if (p >= stop)
7800 {
7801 if (p >= pend)
7802 break;
7803 stop = pend;
7804 p = GAP_END_ADDR;
7805 }
7806
7807 c = STRING_CHAR_ADVANCE (p);
7808 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7809 && ! char_charset (translate_char (translation_table, c),
7810 charset_list, NULL))
7811 {
7812 positions = Fcons (make_number (from), positions);
7813 n--;
7814 if (n == 0)
7815 break;
7816 }
7817
7818 from++;
7819 }
7820
7821 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7822 }
7823
7824
7825 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7826 Scheck_coding_systems_region, 3, 3, 0,
7827 doc: /* Check if the region is encodable by coding systems.
7828
7829 START and END are buffer positions specifying the region.
7830 CODING-SYSTEM-LIST is a list of coding systems to check.
7831
7832 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7833 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7834 whole region, POS0, POS1, ... are buffer positions where non-encodable
7835 characters are found.
7836
7837 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7838 value is nil.
7839
7840 START may be a string. In that case, check if the string is
7841 encodable, and the value contains indices to the string instead of
7842 buffer positions. END is ignored. */)
7843 (start, end, coding_system_list)
7844 Lisp_Object start, end, coding_system_list;
7845 {
7846 Lisp_Object list;
7847 EMACS_INT start_byte, end_byte;
7848 int pos;
7849 const unsigned char *p, *pbeg, *pend;
7850 int c;
7851 Lisp_Object tail, elt, attrs;
7852
7853 if (STRINGP (start))
7854 {
7855 if (!STRING_MULTIBYTE (start)
7856 && SCHARS (start) != SBYTES (start))
7857 return Qnil;
7858 start_byte = 0;
7859 end_byte = SBYTES (start);
7860 pos = 0;
7861 }
7862 else
7863 {
7864 CHECK_NUMBER_COERCE_MARKER (start);
7865 CHECK_NUMBER_COERCE_MARKER (end);
7866 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7867 args_out_of_range (start, end);
7868 if (NILP (current_buffer->enable_multibyte_characters))
7869 return Qnil;
7870 start_byte = CHAR_TO_BYTE (XINT (start));
7871 end_byte = CHAR_TO_BYTE (XINT (end));
7872 if (XINT (end) - XINT (start) == end_byte - start_byte)
7873 return Qt;
7874
7875 if (XINT (start) < GPT && XINT (end) > GPT)
7876 {
7877 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7878 move_gap_both (XINT (start), start_byte);
7879 else
7880 move_gap_both (XINT (end), end_byte);
7881 }
7882 pos = XINT (start);
7883 }
7884
7885 list = Qnil;
7886 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7887 {
7888 elt = XCAR (tail);
7889 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7890 ASET (attrs, coding_attr_trans_tbl,
7891 get_translation_table (attrs, 1, NULL));
7892 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7893 }
7894
7895 if (STRINGP (start))
7896 p = pbeg = SDATA (start);
7897 else
7898 p = pbeg = BYTE_POS_ADDR (start_byte);
7899 pend = p + (end_byte - start_byte);
7900
7901 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7902 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7903
7904 while (p < pend)
7905 {
7906 if (ASCII_BYTE_P (*p))
7907 p++;
7908 else
7909 {
7910 c = STRING_CHAR_ADVANCE (p);
7911
7912 charset_map_loaded = 0;
7913 for (tail = list; CONSP (tail); tail = XCDR (tail))
7914 {
7915 elt = XCDR (XCAR (tail));
7916 if (! char_encodable_p (c, XCAR (elt)))
7917 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7918 }
7919 if (charset_map_loaded)
7920 {
7921 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7922
7923 if (STRINGP (start))
7924 pbeg = SDATA (start);
7925 else
7926 pbeg = BYTE_POS_ADDR (start_byte);
7927 p = pbeg + p_offset;
7928 pend = pbeg + pend_offset;
7929 }
7930 }
7931 pos++;
7932 }
7933
7934 tail = list;
7935 list = Qnil;
7936 for (; CONSP (tail); tail = XCDR (tail))
7937 {
7938 elt = XCAR (tail);
7939 if (CONSP (XCDR (XCDR (elt))))
7940 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7941 list);
7942 }
7943
7944 return list;
7945 }
7946
7947
7948 Lisp_Object
7949 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7950 Lisp_Object start, end, coding_system, dst_object;
7951 int encodep, norecord;
7952 {
7953 struct coding_system coding;
7954 EMACS_INT from, from_byte, to, to_byte;
7955 Lisp_Object src_object;
7956
7957 CHECK_NUMBER_COERCE_MARKER (start);
7958 CHECK_NUMBER_COERCE_MARKER (end);
7959 if (NILP (coding_system))
7960 coding_system = Qno_conversion;
7961 else
7962 CHECK_CODING_SYSTEM (coding_system);
7963 src_object = Fcurrent_buffer ();
7964 if (NILP (dst_object))
7965 dst_object = src_object;
7966 else if (! EQ (dst_object, Qt))
7967 CHECK_BUFFER (dst_object);
7968
7969 validate_region (&start, &end);
7970 from = XFASTINT (start);
7971 from_byte = CHAR_TO_BYTE (from);
7972 to = XFASTINT (end);
7973 to_byte = CHAR_TO_BYTE (to);
7974
7975 setup_coding_system (coding_system, &coding);
7976 coding.mode |= CODING_MODE_LAST_BLOCK;
7977
7978 if (encodep)
7979 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7980 dst_object);
7981 else
7982 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7983 dst_object);
7984 if (! norecord)
7985 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7986
7987 return (BUFFERP (dst_object)
7988 ? make_number (coding.produced_char)
7989 : coding.dst_object);
7990 }
7991
7992
7993 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7994 3, 4, "r\nzCoding system: ",
7995 doc: /* Decode the current region from the specified coding system.
7996 When called from a program, takes four arguments:
7997 START, END, CODING-SYSTEM, and DESTINATION.
7998 START and END are buffer positions.
7999
8000 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8001 If nil, the region between START and END is replace by the decoded text.
8002 If buffer, the decoded text is inserted in the buffer.
8003 If t, the decoded text is returned.
8004
8005 This function sets `last-coding-system-used' to the precise coding system
8006 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8007 not fully specified.)
8008 It returns the length of the decoded text. */)
8009 (start, end, coding_system, destination)
8010 Lisp_Object start, end, coding_system, destination;
8011 {
8012 return code_convert_region (start, end, coding_system, destination, 0, 0);
8013 }
8014
8015 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8016 3, 4, "r\nzCoding system: ",
8017 doc: /* Encode the current region by specified coding system.
8018 When called from a program, takes three arguments:
8019 START, END, and CODING-SYSTEM. START and END are buffer positions.
8020
8021 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8022 If nil, the region between START and END is replace by the encoded text.
8023 If buffer, the encoded text is inserted in the buffer.
8024 If t, the encoded text is returned.
8025
8026 This function sets `last-coding-system-used' to the precise coding system
8027 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8028 not fully specified.)
8029 It returns the length of the encoded text. */)
8030 (start, end, coding_system, destination)
8031 Lisp_Object start, end, coding_system, destination;
8032 {
8033 return code_convert_region (start, end, coding_system, destination, 1, 0);
8034 }
8035
8036 Lisp_Object
8037 code_convert_string (string, coding_system, dst_object,
8038 encodep, nocopy, norecord)
8039 Lisp_Object string, coding_system, dst_object;
8040 int encodep, nocopy, norecord;
8041 {
8042 struct coding_system coding;
8043 EMACS_INT chars, bytes;
8044
8045 CHECK_STRING (string);
8046 if (NILP (coding_system))
8047 {
8048 if (! norecord)
8049 Vlast_coding_system_used = Qno_conversion;
8050 if (NILP (dst_object))
8051 return (nocopy ? Fcopy_sequence (string) : string);
8052 }
8053
8054 if (NILP (coding_system))
8055 coding_system = Qno_conversion;
8056 else
8057 CHECK_CODING_SYSTEM (coding_system);
8058 if (NILP (dst_object))
8059 dst_object = Qt;
8060 else if (! EQ (dst_object, Qt))
8061 CHECK_BUFFER (dst_object);
8062
8063 setup_coding_system (coding_system, &coding);
8064 coding.mode |= CODING_MODE_LAST_BLOCK;
8065 chars = SCHARS (string);
8066 bytes = SBYTES (string);
8067 if (encodep)
8068 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8069 else
8070 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8071 if (! norecord)
8072 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8073
8074 return (BUFFERP (dst_object)
8075 ? make_number (coding.produced_char)
8076 : coding.dst_object);
8077 }
8078
8079
8080 /* Encode or decode STRING according to CODING_SYSTEM.
8081 Do not set Vlast_coding_system_used.
8082
8083 This function is called only from macros DECODE_FILE and
8084 ENCODE_FILE, thus we ignore character composition. */
8085
8086 Lisp_Object
8087 code_convert_string_norecord (string, coding_system, encodep)
8088 Lisp_Object string, coding_system;
8089 int encodep;
8090 {
8091 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8092 }
8093
8094
8095 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8096 2, 4, 0,
8097 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8098
8099 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8100 if the decoding operation is trivial.
8101
8102 Optional fourth arg BUFFER non-nil meant that the decoded text is
8103 inserted in BUFFER instead of returned as a string. In this case,
8104 the return value is BUFFER.
8105
8106 This function sets `last-coding-system-used' to the precise coding system
8107 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8108 not fully specified. */)
8109 (string, coding_system, nocopy, buffer)
8110 Lisp_Object string, coding_system, nocopy, buffer;
8111 {
8112 return code_convert_string (string, coding_system, buffer,
8113 0, ! NILP (nocopy), 0);
8114 }
8115
8116 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8117 2, 4, 0,
8118 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8119
8120 Optional third arg NOCOPY non-nil means it is OK to return STRING
8121 itself if the encoding operation is trivial.
8122
8123 Optional fourth arg BUFFER non-nil meant that the encoded text is
8124 inserted in BUFFER instead of returned as a string. In this case,
8125 the return value is BUFFER.
8126
8127 This function sets `last-coding-system-used' to the precise coding system
8128 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8129 not fully specified.) */)
8130 (string, coding_system, nocopy, buffer)
8131 Lisp_Object string, coding_system, nocopy, buffer;
8132 {
8133 return code_convert_string (string, coding_system, buffer,
8134 1, ! NILP (nocopy), 1);
8135 }
8136
8137 \f
8138 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8139 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8140 Return the corresponding character. */)
8141 (code)
8142 Lisp_Object code;
8143 {
8144 Lisp_Object spec, attrs, val;
8145 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8146 int c;
8147
8148 CHECK_NATNUM (code);
8149 c = XFASTINT (code);
8150 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8151 attrs = AREF (spec, 0);
8152
8153 if (ASCII_BYTE_P (c)
8154 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8155 return code;
8156
8157 val = CODING_ATTR_CHARSET_LIST (attrs);
8158 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8159 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8160 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8161
8162 if (c <= 0x7F)
8163 charset = charset_roman;
8164 else if (c >= 0xA0 && c < 0xDF)
8165 {
8166 charset = charset_kana;
8167 c -= 0x80;
8168 }
8169 else
8170 {
8171 int s1 = c >> 8, s2 = c & 0xFF;
8172
8173 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8174 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8175 error ("Invalid code: %d", code);
8176 SJIS_TO_JIS (c);
8177 charset = charset_kanji;
8178 }
8179 c = DECODE_CHAR (charset, c);
8180 if (c < 0)
8181 error ("Invalid code: %d", code);
8182 return make_number (c);
8183 }
8184
8185
8186 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8187 doc: /* Encode a Japanese character CH to shift_jis encoding.
8188 Return the corresponding code in SJIS. */)
8189 (ch)
8190 Lisp_Object ch;
8191 {
8192 Lisp_Object spec, attrs, charset_list;
8193 int c;
8194 struct charset *charset;
8195 unsigned code;
8196
8197 CHECK_CHARACTER (ch);
8198 c = XFASTINT (ch);
8199 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8200 attrs = AREF (spec, 0);
8201
8202 if (ASCII_CHAR_P (c)
8203 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8204 return ch;
8205
8206 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8207 charset = char_charset (c, charset_list, &code);
8208 if (code == CHARSET_INVALID_CODE (charset))
8209 error ("Can't encode by shift_jis encoding: %d", c);
8210 JIS_TO_SJIS (code);
8211
8212 return make_number (code);
8213 }
8214
8215 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8216 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8217 Return the corresponding character. */)
8218 (code)
8219 Lisp_Object code;
8220 {
8221 Lisp_Object spec, attrs, val;
8222 struct charset *charset_roman, *charset_big5, *charset;
8223 int c;
8224
8225 CHECK_NATNUM (code);
8226 c = XFASTINT (code);
8227 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8228 attrs = AREF (spec, 0);
8229
8230 if (ASCII_BYTE_P (c)
8231 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8232 return code;
8233
8234 val = CODING_ATTR_CHARSET_LIST (attrs);
8235 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8236 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8237
8238 if (c <= 0x7F)
8239 charset = charset_roman;
8240 else
8241 {
8242 int b1 = c >> 8, b2 = c & 0x7F;
8243 if (b1 < 0xA1 || b1 > 0xFE
8244 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8245 error ("Invalid code: %d", code);
8246 charset = charset_big5;
8247 }
8248 c = DECODE_CHAR (charset, (unsigned )c);
8249 if (c < 0)
8250 error ("Invalid code: %d", code);
8251 return make_number (c);
8252 }
8253
8254 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8255 doc: /* Encode the Big5 character CH to BIG5 coding system.
8256 Return the corresponding character code in Big5. */)
8257 (ch)
8258 Lisp_Object ch;
8259 {
8260 Lisp_Object spec, attrs, charset_list;
8261 struct charset *charset;
8262 int c;
8263 unsigned code;
8264
8265 CHECK_CHARACTER (ch);
8266 c = XFASTINT (ch);
8267 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8268 attrs = AREF (spec, 0);
8269 if (ASCII_CHAR_P (c)
8270 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8271 return ch;
8272
8273 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8274 charset = char_charset (c, charset_list, &code);
8275 if (code == CHARSET_INVALID_CODE (charset))
8276 error ("Can't encode by Big5 encoding: %d", c);
8277
8278 return make_number (code);
8279 }
8280
8281 \f
8282 DEFUN ("set-terminal-coding-system-internal",
8283 Fset_terminal_coding_system_internal,
8284 Sset_terminal_coding_system_internal, 1, 1, 0,
8285 doc: /* Internal use only. */)
8286 (coding_system)
8287 Lisp_Object coding_system;
8288 {
8289 CHECK_SYMBOL (coding_system);
8290 setup_coding_system (Fcheck_coding_system (coding_system),
8291 &terminal_coding);
8292
8293 /* We had better not send unsafe characters to terminal. */
8294 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8295 /* Characer composition should be disabled. */
8296 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8297 terminal_coding.src_multibyte = 1;
8298 terminal_coding.dst_multibyte = 0;
8299 return Qnil;
8300 }
8301
8302 DEFUN ("set-safe-terminal-coding-system-internal",
8303 Fset_safe_terminal_coding_system_internal,
8304 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8305 doc: /* Internal use only. */)
8306 (coding_system)
8307 Lisp_Object coding_system;
8308 {
8309 CHECK_SYMBOL (coding_system);
8310 setup_coding_system (Fcheck_coding_system (coding_system),
8311 &safe_terminal_coding);
8312 /* Characer composition should be disabled. */
8313 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8314 safe_terminal_coding.src_multibyte = 1;
8315 safe_terminal_coding.dst_multibyte = 0;
8316 return Qnil;
8317 }
8318
8319 DEFUN ("terminal-coding-system",
8320 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8321 doc: /* Return coding system specified for terminal output. */)
8322 ()
8323 {
8324 Lisp_Object coding_system;
8325
8326 coding_system = CODING_ID_NAME (terminal_coding.id);
8327 /* For backward compatibility, return nil if it is `undecided'. */
8328 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8329 }
8330
8331 DEFUN ("set-keyboard-coding-system-internal",
8332 Fset_keyboard_coding_system_internal,
8333 Sset_keyboard_coding_system_internal, 1, 1, 0,
8334 doc: /* Internal use only. */)
8335 (coding_system)
8336 Lisp_Object coding_system;
8337 {
8338 CHECK_SYMBOL (coding_system);
8339 setup_coding_system (Fcheck_coding_system (coding_system),
8340 &keyboard_coding);
8341 /* Characer composition should be disabled. */
8342 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8343 return Qnil;
8344 }
8345
8346 DEFUN ("keyboard-coding-system",
8347 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8348 doc: /* Return coding system specified for decoding keyboard input. */)
8349 ()
8350 {
8351 return CODING_ID_NAME (keyboard_coding.id);
8352 }
8353
8354 \f
8355 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8356 Sfind_operation_coding_system, 1, MANY, 0,
8357 doc: /* Choose a coding system for an operation based on the target name.
8358 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8359 DECODING-SYSTEM is the coding system to use for decoding
8360 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8361 for encoding (in case OPERATION does encoding).
8362
8363 The first argument OPERATION specifies an I/O primitive:
8364 For file I/O, `insert-file-contents' or `write-region'.
8365 For process I/O, `call-process', `call-process-region', or `start-process'.
8366 For network I/O, `open-network-stream'.
8367
8368 The remaining arguments should be the same arguments that were passed
8369 to the primitive. Depending on which primitive, one of those arguments
8370 is selected as the TARGET. For example, if OPERATION does file I/O,
8371 whichever argument specifies the file name is TARGET.
8372
8373 TARGET has a meaning which depends on OPERATION:
8374 For file I/O, TARGET is a file name (except for the special case below).
8375 For process I/O, TARGET is a process name.
8376 For network I/O, TARGET is a service name or a port number
8377
8378 This function looks up what specified for TARGET in,
8379 `file-coding-system-alist', `process-coding-system-alist',
8380 or `network-coding-system-alist' depending on OPERATION.
8381 They may specify a coding system, a cons of coding systems,
8382 or a function symbol to call.
8383 In the last case, we call the function with one argument,
8384 which is a list of all the arguments given to this function.
8385
8386 If OPERATION is `insert-file-contents', the argument corresponding to
8387 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8388 file name to look up, and BUFFER is a buffer that contains the file's
8389 contents (not yet decoded). If `file-coding-system-alist' specifies a
8390 function to call for FILENAME, that function should examine the
8391 contents of BUFFER instead of reading the file.
8392
8393 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
8394 (nargs, args)
8395 int nargs;
8396 Lisp_Object *args;
8397 {
8398 Lisp_Object operation, target_idx, target, val;
8399 register Lisp_Object chain;
8400
8401 if (nargs < 2)
8402 error ("Too few arguments");
8403 operation = args[0];
8404 if (!SYMBOLP (operation)
8405 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8406 error ("Invalid first arguement");
8407 if (nargs < 1 + XINT (target_idx))
8408 error ("Too few arguments for operation: %s",
8409 SDATA (SYMBOL_NAME (operation)));
8410 target = args[XINT (target_idx) + 1];
8411 if (!(STRINGP (target)
8412 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8413 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
8414 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8415 error ("Invalid %dth argument", XINT (target_idx) + 1);
8416 if (CONSP (target))
8417 target = XCAR (target);
8418
8419 chain = ((EQ (operation, Qinsert_file_contents)
8420 || EQ (operation, Qwrite_region))
8421 ? Vfile_coding_system_alist
8422 : (EQ (operation, Qopen_network_stream)
8423 ? Vnetwork_coding_system_alist
8424 : Vprocess_coding_system_alist));
8425 if (NILP (chain))
8426 return Qnil;
8427
8428 for (; CONSP (chain); chain = XCDR (chain))
8429 {
8430 Lisp_Object elt;
8431
8432 elt = XCAR (chain);
8433 if (CONSP (elt)
8434 && ((STRINGP (target)
8435 && STRINGP (XCAR (elt))
8436 && fast_string_match (XCAR (elt), target) >= 0)
8437 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8438 {
8439 val = XCDR (elt);
8440 /* Here, if VAL is both a valid coding system and a valid
8441 function symbol, we return VAL as a coding system. */
8442 if (CONSP (val))
8443 return val;
8444 if (! SYMBOLP (val))
8445 return Qnil;
8446 if (! NILP (Fcoding_system_p (val)))
8447 return Fcons (val, val);
8448 if (! NILP (Ffboundp (val)))
8449 {
8450 /* We use call1 rather than safe_call1
8451 so as to get bug reports about functions called here
8452 which don't handle the current interface. */
8453 val = call1 (val, Flist (nargs, args));
8454 if (CONSP (val))
8455 return val;
8456 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8457 return Fcons (val, val);
8458 }
8459 return Qnil;
8460 }
8461 }
8462 return Qnil;
8463 }
8464
8465 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8466 Sset_coding_system_priority, 0, MANY, 0,
8467 doc: /* Assign higher priority to the coding systems given as arguments.
8468 If multiple coding systems belongs to the same category,
8469 all but the first one are ignored.
8470
8471 usage: (set-coding-system-priority ...) */)
8472 (nargs, args)
8473 int nargs;
8474 Lisp_Object *args;
8475 {
8476 int i, j;
8477 int changed[coding_category_max];
8478 enum coding_category priorities[coding_category_max];
8479
8480 bzero (changed, sizeof changed);
8481
8482 for (i = j = 0; i < nargs; i++)
8483 {
8484 enum coding_category category;
8485 Lisp_Object spec, attrs;
8486
8487 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8488 attrs = AREF (spec, 0);
8489 category = XINT (CODING_ATTR_CATEGORY (attrs));
8490 if (changed[category])
8491 /* Ignore this coding system because a coding system of the
8492 same category already had a higher priority. */
8493 continue;
8494 changed[category] = 1;
8495 priorities[j++] = category;
8496 if (coding_categories[category].id >= 0
8497 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8498 setup_coding_system (args[i], &coding_categories[category]);
8499 Fset (AREF (Vcoding_category_table, category), args[i]);
8500 }
8501
8502 /* Now we have decided top J priorities. Reflect the order of the
8503 original priorities to the remaining priorities. */
8504
8505 for (i = j, j = 0; i < coding_category_max; i++, j++)
8506 {
8507 while (j < coding_category_max
8508 && changed[coding_priorities[j]])
8509 j++;
8510 if (j == coding_category_max)
8511 abort ();
8512 priorities[i] = coding_priorities[j];
8513 }
8514
8515 bcopy (priorities, coding_priorities, sizeof priorities);
8516
8517 /* Update `coding-category-list'. */
8518 Vcoding_category_list = Qnil;
8519 for (i = coding_category_max - 1; i >= 0; i--)
8520 Vcoding_category_list
8521 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8522 Vcoding_category_list);
8523
8524 return Qnil;
8525 }
8526
8527 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8528 Scoding_system_priority_list, 0, 1, 0,
8529 doc: /* Return a list of coding systems ordered by their priorities.
8530 HIGHESTP non-nil means just return the highest priority one. */)
8531 (highestp)
8532 Lisp_Object highestp;
8533 {
8534 int i;
8535 Lisp_Object val;
8536
8537 for (i = 0, val = Qnil; i < coding_category_max; i++)
8538 {
8539 enum coding_category category = coding_priorities[i];
8540 int id = coding_categories[category].id;
8541 Lisp_Object attrs;
8542
8543 if (id < 0)
8544 continue;
8545 attrs = CODING_ID_ATTRS (id);
8546 if (! NILP (highestp))
8547 return CODING_ATTR_BASE_NAME (attrs);
8548 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8549 }
8550 return Fnreverse (val);
8551 }
8552
8553 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8554
8555 static Lisp_Object
8556 make_subsidiaries (base)
8557 Lisp_Object base;
8558 {
8559 Lisp_Object subsidiaries;
8560 int base_name_len = SBYTES (SYMBOL_NAME (base));
8561 char *buf = (char *) alloca (base_name_len + 6);
8562 int i;
8563
8564 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8565 subsidiaries = Fmake_vector (make_number (3), Qnil);
8566 for (i = 0; i < 3; i++)
8567 {
8568 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8569 ASET (subsidiaries, i, intern (buf));
8570 }
8571 return subsidiaries;
8572 }
8573
8574
8575 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8576 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8577 doc: /* For internal use only.
8578 usage: (define-coding-system-internal ...) */)
8579 (nargs, args)
8580 int nargs;
8581 Lisp_Object *args;
8582 {
8583 Lisp_Object name;
8584 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8585 Lisp_Object attrs; /* Vector of attributes. */
8586 Lisp_Object eol_type;
8587 Lisp_Object aliases;
8588 Lisp_Object coding_type, charset_list, safe_charsets;
8589 enum coding_category category;
8590 Lisp_Object tail, val;
8591 int max_charset_id = 0;
8592 int i;
8593
8594 if (nargs < coding_arg_max)
8595 goto short_args;
8596
8597 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8598
8599 name = args[coding_arg_name];
8600 CHECK_SYMBOL (name);
8601 CODING_ATTR_BASE_NAME (attrs) = name;
8602
8603 val = args[coding_arg_mnemonic];
8604 if (! STRINGP (val))
8605 CHECK_CHARACTER (val);
8606 CODING_ATTR_MNEMONIC (attrs) = val;
8607
8608 coding_type = args[coding_arg_coding_type];
8609 CHECK_SYMBOL (coding_type);
8610 CODING_ATTR_TYPE (attrs) = coding_type;
8611
8612 charset_list = args[coding_arg_charset_list];
8613 if (SYMBOLP (charset_list))
8614 {
8615 if (EQ (charset_list, Qiso_2022))
8616 {
8617 if (! EQ (coding_type, Qiso_2022))
8618 error ("Invalid charset-list");
8619 charset_list = Viso_2022_charset_list;
8620 }
8621 else if (EQ (charset_list, Qemacs_mule))
8622 {
8623 if (! EQ (coding_type, Qemacs_mule))
8624 error ("Invalid charset-list");
8625 charset_list = Vemacs_mule_charset_list;
8626 }
8627 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8628 if (max_charset_id < XFASTINT (XCAR (tail)))
8629 max_charset_id = XFASTINT (XCAR (tail));
8630 }
8631 else
8632 {
8633 charset_list = Fcopy_sequence (charset_list);
8634 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8635 {
8636 struct charset *charset;
8637
8638 val = Fcar (tail);
8639 CHECK_CHARSET_GET_CHARSET (val, charset);
8640 if (EQ (coding_type, Qiso_2022)
8641 ? CHARSET_ISO_FINAL (charset) < 0
8642 : EQ (coding_type, Qemacs_mule)
8643 ? CHARSET_EMACS_MULE_ID (charset) < 0
8644 : 0)
8645 error ("Can't handle charset `%s'",
8646 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8647
8648 XSETCAR (tail, make_number (charset->id));
8649 if (max_charset_id < charset->id)
8650 max_charset_id = charset->id;
8651 }
8652 }
8653 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8654
8655 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8656 make_number (255));
8657 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8658 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8659 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8660
8661 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8662
8663 val = args[coding_arg_decode_translation_table];
8664 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8665 CHECK_SYMBOL (val);
8666 CODING_ATTR_DECODE_TBL (attrs) = val;
8667
8668 val = args[coding_arg_encode_translation_table];
8669 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8670 CHECK_SYMBOL (val);
8671 CODING_ATTR_ENCODE_TBL (attrs) = val;
8672
8673 val = args[coding_arg_post_read_conversion];
8674 CHECK_SYMBOL (val);
8675 CODING_ATTR_POST_READ (attrs) = val;
8676
8677 val = args[coding_arg_pre_write_conversion];
8678 CHECK_SYMBOL (val);
8679 CODING_ATTR_PRE_WRITE (attrs) = val;
8680
8681 val = args[coding_arg_default_char];
8682 if (NILP (val))
8683 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8684 else
8685 {
8686 CHECK_CHARACTER (val);
8687 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8688 }
8689
8690 val = args[coding_arg_for_unibyte];
8691 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8692
8693 val = args[coding_arg_plist];
8694 CHECK_LIST (val);
8695 CODING_ATTR_PLIST (attrs) = val;
8696
8697 if (EQ (coding_type, Qcharset))
8698 {
8699 /* Generate a lisp vector of 256 elements. Each element is nil,
8700 integer, or a list of charset IDs.
8701
8702 If Nth element is nil, the byte code N is invalid in this
8703 coding system.
8704
8705 If Nth element is a number NUM, N is the first byte of a
8706 charset whose ID is NUM.
8707
8708 If Nth element is a list of charset IDs, N is the first byte
8709 of one of them. The list is sorted by dimensions of the
8710 charsets. A charset of smaller dimension comes firtst. */
8711 val = Fmake_vector (make_number (256), Qnil);
8712
8713 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8714 {
8715 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8716 int dim = CHARSET_DIMENSION (charset);
8717 int idx = (dim - 1) * 4;
8718
8719 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8720 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8721
8722 for (i = charset->code_space[idx];
8723 i <= charset->code_space[idx + 1]; i++)
8724 {
8725 Lisp_Object tmp, tmp2;
8726 int dim2;
8727
8728 tmp = AREF (val, i);
8729 if (NILP (tmp))
8730 tmp = XCAR (tail);
8731 else if (NUMBERP (tmp))
8732 {
8733 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8734 if (dim < dim2)
8735 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8736 else
8737 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8738 }
8739 else
8740 {
8741 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8742 {
8743 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8744 if (dim < dim2)
8745 break;
8746 }
8747 if (NILP (tmp2))
8748 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8749 else
8750 {
8751 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8752 XSETCAR (tmp2, XCAR (tail));
8753 }
8754 }
8755 ASET (val, i, tmp);
8756 }
8757 }
8758 ASET (attrs, coding_attr_charset_valids, val);
8759 category = coding_category_charset;
8760 }
8761 else if (EQ (coding_type, Qccl))
8762 {
8763 Lisp_Object valids;
8764
8765 if (nargs < coding_arg_ccl_max)
8766 goto short_args;
8767
8768 val = args[coding_arg_ccl_decoder];
8769 CHECK_CCL_PROGRAM (val);
8770 if (VECTORP (val))
8771 val = Fcopy_sequence (val);
8772 ASET (attrs, coding_attr_ccl_decoder, val);
8773
8774 val = args[coding_arg_ccl_encoder];
8775 CHECK_CCL_PROGRAM (val);
8776 if (VECTORP (val))
8777 val = Fcopy_sequence (val);
8778 ASET (attrs, coding_attr_ccl_encoder, val);
8779
8780 val = args[coding_arg_ccl_valids];
8781 valids = Fmake_string (make_number (256), make_number (0));
8782 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8783 {
8784 int from, to;
8785
8786 val = Fcar (tail);
8787 if (INTEGERP (val))
8788 {
8789 from = to = XINT (val);
8790 if (from < 0 || from > 255)
8791 args_out_of_range_3 (val, make_number (0), make_number (255));
8792 }
8793 else
8794 {
8795 CHECK_CONS (val);
8796 CHECK_NATNUM_CAR (val);
8797 CHECK_NATNUM_CDR (val);
8798 from = XINT (XCAR (val));
8799 if (from > 255)
8800 args_out_of_range_3 (XCAR (val),
8801 make_number (0), make_number (255));
8802 to = XINT (XCDR (val));
8803 if (to < from || to > 255)
8804 args_out_of_range_3 (XCDR (val),
8805 XCAR (val), make_number (255));
8806 }
8807 for (i = from; i <= to; i++)
8808 SSET (valids, i, 1);
8809 }
8810 ASET (attrs, coding_attr_ccl_valids, valids);
8811
8812 category = coding_category_ccl;
8813 }
8814 else if (EQ (coding_type, Qutf_16))
8815 {
8816 Lisp_Object bom, endian;
8817
8818 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8819
8820 if (nargs < coding_arg_utf16_max)
8821 goto short_args;
8822
8823 bom = args[coding_arg_utf16_bom];
8824 if (! NILP (bom) && ! EQ (bom, Qt))
8825 {
8826 CHECK_CONS (bom);
8827 val = XCAR (bom);
8828 CHECK_CODING_SYSTEM (val);
8829 val = XCDR (bom);
8830 CHECK_CODING_SYSTEM (val);
8831 }
8832 ASET (attrs, coding_attr_utf_16_bom, bom);
8833
8834 endian = args[coding_arg_utf16_endian];
8835 CHECK_SYMBOL (endian);
8836 if (NILP (endian))
8837 endian = Qbig;
8838 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8839 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8840 ASET (attrs, coding_attr_utf_16_endian, endian);
8841
8842 category = (CONSP (bom)
8843 ? coding_category_utf_16_auto
8844 : NILP (bom)
8845 ? (EQ (endian, Qbig)
8846 ? coding_category_utf_16_be_nosig
8847 : coding_category_utf_16_le_nosig)
8848 : (EQ (endian, Qbig)
8849 ? coding_category_utf_16_be
8850 : coding_category_utf_16_le));
8851 }
8852 else if (EQ (coding_type, Qiso_2022))
8853 {
8854 Lisp_Object initial, reg_usage, request, flags;
8855 int i;
8856
8857 if (nargs < coding_arg_iso2022_max)
8858 goto short_args;
8859
8860 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8861 CHECK_VECTOR (initial);
8862 for (i = 0; i < 4; i++)
8863 {
8864 val = Faref (initial, make_number (i));
8865 if (! NILP (val))
8866 {
8867 struct charset *charset;
8868
8869 CHECK_CHARSET_GET_CHARSET (val, charset);
8870 ASET (initial, i, make_number (CHARSET_ID (charset)));
8871 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8872 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8873 }
8874 else
8875 ASET (initial, i, make_number (-1));
8876 }
8877
8878 reg_usage = args[coding_arg_iso2022_reg_usage];
8879 CHECK_CONS (reg_usage);
8880 CHECK_NUMBER_CAR (reg_usage);
8881 CHECK_NUMBER_CDR (reg_usage);
8882
8883 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8884 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8885 {
8886 int id;
8887 Lisp_Object tmp;
8888
8889 val = Fcar (tail);
8890 CHECK_CONS (val);
8891 tmp = XCAR (val);
8892 CHECK_CHARSET_GET_ID (tmp, id);
8893 CHECK_NATNUM_CDR (val);
8894 if (XINT (XCDR (val)) >= 4)
8895 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8896 XSETCAR (val, make_number (id));
8897 }
8898
8899 flags = args[coding_arg_iso2022_flags];
8900 CHECK_NATNUM (flags);
8901 i = XINT (flags);
8902 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8903 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8904
8905 ASET (attrs, coding_attr_iso_initial, initial);
8906 ASET (attrs, coding_attr_iso_usage, reg_usage);
8907 ASET (attrs, coding_attr_iso_request, request);
8908 ASET (attrs, coding_attr_iso_flags, flags);
8909 setup_iso_safe_charsets (attrs);
8910
8911 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8912 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8913 | CODING_ISO_FLAG_SINGLE_SHIFT))
8914 ? coding_category_iso_7_else
8915 : EQ (args[coding_arg_charset_list], Qiso_2022)
8916 ? coding_category_iso_7
8917 : coding_category_iso_7_tight);
8918 else
8919 {
8920 int id = XINT (AREF (initial, 1));
8921
8922 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8923 || EQ (args[coding_arg_charset_list], Qiso_2022)
8924 || id < 0)
8925 ? coding_category_iso_8_else
8926 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8927 ? coding_category_iso_8_1
8928 : coding_category_iso_8_2);
8929 }
8930 if (category != coding_category_iso_8_1
8931 && category != coding_category_iso_8_2)
8932 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8933 }
8934 else if (EQ (coding_type, Qemacs_mule))
8935 {
8936 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8937 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8938 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8939 category = coding_category_emacs_mule;
8940 }
8941 else if (EQ (coding_type, Qshift_jis))
8942 {
8943
8944 struct charset *charset;
8945
8946 if (XINT (Flength (charset_list)) != 3
8947 && XINT (Flength (charset_list)) != 4)
8948 error ("There should be three or four charsets");
8949
8950 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8951 if (CHARSET_DIMENSION (charset) != 1)
8952 error ("Dimension of charset %s is not one",
8953 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8954 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8955 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8956
8957 charset_list = XCDR (charset_list);
8958 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8959 if (CHARSET_DIMENSION (charset) != 1)
8960 error ("Dimension of charset %s is not one",
8961 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8962
8963 charset_list = XCDR (charset_list);
8964 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8965 if (CHARSET_DIMENSION (charset) != 2)
8966 error ("Dimension of charset %s is not two",
8967 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8968
8969 charset_list = XCDR (charset_list);
8970 if (! NILP (charset_list))
8971 {
8972 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8973 if (CHARSET_DIMENSION (charset) != 2)
8974 error ("Dimension of charset %s is not two",
8975 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8976 }
8977
8978 category = coding_category_sjis;
8979 Vsjis_coding_system = name;
8980 }
8981 else if (EQ (coding_type, Qbig5))
8982 {
8983 struct charset *charset;
8984
8985 if (XINT (Flength (charset_list)) != 2)
8986 error ("There should be just two charsets");
8987
8988 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8989 if (CHARSET_DIMENSION (charset) != 1)
8990 error ("Dimension of charset %s is not one",
8991 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8992 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8993 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8994
8995 charset_list = XCDR (charset_list);
8996 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8997 if (CHARSET_DIMENSION (charset) != 2)
8998 error ("Dimension of charset %s is not two",
8999 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9000
9001 category = coding_category_big5;
9002 Vbig5_coding_system = name;
9003 }
9004 else if (EQ (coding_type, Qraw_text))
9005 {
9006 category = coding_category_raw_text;
9007 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9008 }
9009 else if (EQ (coding_type, Qutf_8))
9010 {
9011 category = coding_category_utf_8;
9012 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9013 }
9014 else if (EQ (coding_type, Qundecided))
9015 category = coding_category_undecided;
9016 else
9017 error ("Invalid coding system type: %s",
9018 SDATA (SYMBOL_NAME (coding_type)));
9019
9020 CODING_ATTR_CATEGORY (attrs) = make_number (category);
9021 CODING_ATTR_PLIST (attrs)
9022 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9023 CODING_ATTR_PLIST (attrs)));
9024 CODING_ATTR_PLIST (attrs)
9025 = Fcons (QCascii_compatible_p,
9026 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9027 CODING_ATTR_PLIST (attrs)));
9028
9029 eol_type = args[coding_arg_eol_type];
9030 if (! NILP (eol_type)
9031 && ! EQ (eol_type, Qunix)
9032 && ! EQ (eol_type, Qdos)
9033 && ! EQ (eol_type, Qmac))
9034 error ("Invalid eol-type");
9035
9036 aliases = Fcons (name, Qnil);
9037
9038 if (NILP (eol_type))
9039 {
9040 eol_type = make_subsidiaries (name);
9041 for (i = 0; i < 3; i++)
9042 {
9043 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9044
9045 this_name = AREF (eol_type, i);
9046 this_aliases = Fcons (this_name, Qnil);
9047 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9048 this_spec = Fmake_vector (make_number (3), attrs);
9049 ASET (this_spec, 1, this_aliases);
9050 ASET (this_spec, 2, this_eol_type);
9051 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9052 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9053 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9054 if (NILP (val))
9055 Vcoding_system_alist
9056 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9057 Vcoding_system_alist);
9058 }
9059 }
9060
9061 spec_vec = Fmake_vector (make_number (3), attrs);
9062 ASET (spec_vec, 1, aliases);
9063 ASET (spec_vec, 2, eol_type);
9064
9065 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9066 Vcoding_system_list = Fcons (name, Vcoding_system_list);
9067 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9068 if (NILP (val))
9069 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9070 Vcoding_system_alist);
9071
9072 {
9073 int id = coding_categories[category].id;
9074
9075 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9076 setup_coding_system (name, &coding_categories[category]);
9077 }
9078
9079 return Qnil;
9080
9081 short_args:
9082 return Fsignal (Qwrong_number_of_arguments,
9083 Fcons (intern ("define-coding-system-internal"),
9084 make_number (nargs)));
9085 }
9086
9087
9088 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9089 3, 3, 0,
9090 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9091 (coding_system, prop, val)
9092 Lisp_Object coding_system, prop, val;
9093 {
9094 Lisp_Object spec, attrs;
9095
9096 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9097 attrs = AREF (spec, 0);
9098 if (EQ (prop, QCmnemonic))
9099 {
9100 if (! STRINGP (val))
9101 CHECK_CHARACTER (val);
9102 CODING_ATTR_MNEMONIC (attrs) = val;
9103 }
9104 else if (EQ (prop, QCdefalut_char))
9105 {
9106 if (NILP (val))
9107 val = make_number (' ');
9108 else
9109 CHECK_CHARACTER (val);
9110 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9111 }
9112 else if (EQ (prop, QCdecode_translation_table))
9113 {
9114 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9115 CHECK_SYMBOL (val);
9116 CODING_ATTR_DECODE_TBL (attrs) = val;
9117 }
9118 else if (EQ (prop, QCencode_translation_table))
9119 {
9120 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9121 CHECK_SYMBOL (val);
9122 CODING_ATTR_ENCODE_TBL (attrs) = val;
9123 }
9124 else if (EQ (prop, QCpost_read_conversion))
9125 {
9126 CHECK_SYMBOL (val);
9127 CODING_ATTR_POST_READ (attrs) = val;
9128 }
9129 else if (EQ (prop, QCpre_write_conversion))
9130 {
9131 CHECK_SYMBOL (val);
9132 CODING_ATTR_PRE_WRITE (attrs) = val;
9133 }
9134 else if (EQ (prop, QCascii_compatible_p))
9135 {
9136 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9137 }
9138
9139 CODING_ATTR_PLIST (attrs)
9140 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9141 return val;
9142 }
9143
9144
9145 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9146 Sdefine_coding_system_alias, 2, 2, 0,
9147 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9148 (alias, coding_system)
9149 Lisp_Object alias, coding_system;
9150 {
9151 Lisp_Object spec, aliases, eol_type, val;
9152
9153 CHECK_SYMBOL (alias);
9154 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9155 aliases = AREF (spec, 1);
9156 /* ALISES should be a list of length more than zero, and the first
9157 element is a base coding system. Append ALIAS at the tail of the
9158 list. */
9159 while (!NILP (XCDR (aliases)))
9160 aliases = XCDR (aliases);
9161 XSETCDR (aliases, Fcons (alias, Qnil));
9162
9163 eol_type = AREF (spec, 2);
9164 if (VECTORP (eol_type))
9165 {
9166 Lisp_Object subsidiaries;
9167 int i;
9168
9169 subsidiaries = make_subsidiaries (alias);
9170 for (i = 0; i < 3; i++)
9171 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9172 AREF (eol_type, i));
9173 }
9174
9175 Fputhash (alias, spec, Vcoding_system_hash_table);
9176 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9177 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9178 if (NILP (val))
9179 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9180 Vcoding_system_alist);
9181
9182 return Qnil;
9183 }
9184
9185 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9186 1, 1, 0,
9187 doc: /* Return the base of CODING-SYSTEM.
9188 Any alias or subsidiary coding system is not a base coding system. */)
9189 (coding_system)
9190 Lisp_Object coding_system;
9191 {
9192 Lisp_Object spec, attrs;
9193
9194 if (NILP (coding_system))
9195 return (Qno_conversion);
9196 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9197 attrs = AREF (spec, 0);
9198 return CODING_ATTR_BASE_NAME (attrs);
9199 }
9200
9201 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9202 1, 1, 0,
9203 doc: "Return the property list of CODING-SYSTEM.")
9204 (coding_system)
9205 Lisp_Object coding_system;
9206 {
9207 Lisp_Object spec, attrs;
9208
9209 if (NILP (coding_system))
9210 coding_system = Qno_conversion;
9211 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9212 attrs = AREF (spec, 0);
9213 return CODING_ATTR_PLIST (attrs);
9214 }
9215
9216
9217 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9218 1, 1, 0,
9219 doc: /* Return the list of aliases of CODING-SYSTEM. */)
9220 (coding_system)
9221 Lisp_Object coding_system;
9222 {
9223 Lisp_Object spec;
9224
9225 if (NILP (coding_system))
9226 coding_system = Qno_conversion;
9227 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9228 return AREF (spec, 1);
9229 }
9230
9231 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9232 Scoding_system_eol_type, 1, 1, 0,
9233 doc: /* Return eol-type of CODING-SYSTEM.
9234 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9235
9236 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9237 and CR respectively.
9238
9239 A vector value indicates that a format of end-of-line should be
9240 detected automatically. Nth element of the vector is the subsidiary
9241 coding system whose eol-type is N. */)
9242 (coding_system)
9243 Lisp_Object coding_system;
9244 {
9245 Lisp_Object spec, eol_type;
9246 int n;
9247
9248 if (NILP (coding_system))
9249 coding_system = Qno_conversion;
9250 if (! CODING_SYSTEM_P (coding_system))
9251 return Qnil;
9252 spec = CODING_SYSTEM_SPEC (coding_system);
9253 eol_type = AREF (spec, 2);
9254 if (VECTORP (eol_type))
9255 return Fcopy_sequence (eol_type);
9256 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9257 return make_number (n);
9258 }
9259
9260 #endif /* emacs */
9261
9262 \f
9263 /*** 9. Post-amble ***/
9264
9265 void
9266 init_coding_once ()
9267 {
9268 int i;
9269
9270 for (i = 0; i < coding_category_max; i++)
9271 {
9272 coding_categories[i].id = -1;
9273 coding_priorities[i] = i;
9274 }
9275
9276 /* ISO2022 specific initialize routine. */
9277 for (i = 0; i < 0x20; i++)
9278 iso_code_class[i] = ISO_control_0;
9279 for (i = 0x21; i < 0x7F; i++)
9280 iso_code_class[i] = ISO_graphic_plane_0;
9281 for (i = 0x80; i < 0xA0; i++)
9282 iso_code_class[i] = ISO_control_1;
9283 for (i = 0xA1; i < 0xFF; i++)
9284 iso_code_class[i] = ISO_graphic_plane_1;
9285 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9286 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9287 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9288 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9289 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9290 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9291 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9292 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9293 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9294
9295 for (i = 0; i < 256; i++)
9296 {
9297 emacs_mule_bytes[i] = 1;
9298 }
9299 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9300 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9301 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9302 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9303 }
9304
9305 #ifdef emacs
9306
9307 void
9308 syms_of_coding ()
9309 {
9310 staticpro (&Vcoding_system_hash_table);
9311 {
9312 Lisp_Object args[2];
9313 args[0] = QCtest;
9314 args[1] = Qeq;
9315 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9316 }
9317
9318 staticpro (&Vsjis_coding_system);
9319 Vsjis_coding_system = Qnil;
9320
9321 staticpro (&Vbig5_coding_system);
9322 Vbig5_coding_system = Qnil;
9323
9324 staticpro (&Vcode_conversion_reused_workbuf);
9325 Vcode_conversion_reused_workbuf = Qnil;
9326
9327 staticpro (&Vcode_conversion_workbuf_name);
9328 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9329
9330 reused_workbuf_in_use = 0;
9331
9332 DEFSYM (Qcharset, "charset");
9333 DEFSYM (Qtarget_idx, "target-idx");
9334 DEFSYM (Qcoding_system_history, "coding-system-history");
9335 Fset (Qcoding_system_history, Qnil);
9336
9337 /* Target FILENAME is the first argument. */
9338 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9339 /* Target FILENAME is the third argument. */
9340 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9341
9342 DEFSYM (Qcall_process, "call-process");
9343 /* Target PROGRAM is the first argument. */
9344 Fput (Qcall_process, Qtarget_idx, make_number (0));
9345
9346 DEFSYM (Qcall_process_region, "call-process-region");
9347 /* Target PROGRAM is the third argument. */
9348 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9349
9350 DEFSYM (Qstart_process, "start-process");
9351 /* Target PROGRAM is the third argument. */
9352 Fput (Qstart_process, Qtarget_idx, make_number (2));
9353
9354 DEFSYM (Qopen_network_stream, "open-network-stream");
9355 /* Target SERVICE is the fourth argument. */
9356 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9357
9358 DEFSYM (Qcoding_system, "coding-system");
9359 DEFSYM (Qcoding_aliases, "coding-aliases");
9360
9361 DEFSYM (Qeol_type, "eol-type");
9362 DEFSYM (Qunix, "unix");
9363 DEFSYM (Qdos, "dos");
9364
9365 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9366 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9367 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9368 DEFSYM (Qdefault_char, "default-char");
9369 DEFSYM (Qundecided, "undecided");
9370 DEFSYM (Qno_conversion, "no-conversion");
9371 DEFSYM (Qraw_text, "raw-text");
9372
9373 DEFSYM (Qiso_2022, "iso-2022");
9374
9375 DEFSYM (Qutf_8, "utf-8");
9376 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9377
9378 DEFSYM (Qutf_16, "utf-16");
9379 DEFSYM (Qbig, "big");
9380 DEFSYM (Qlittle, "little");
9381
9382 DEFSYM (Qshift_jis, "shift-jis");
9383 DEFSYM (Qbig5, "big5");
9384
9385 DEFSYM (Qcoding_system_p, "coding-system-p");
9386
9387 DEFSYM (Qcoding_system_error, "coding-system-error");
9388 Fput (Qcoding_system_error, Qerror_conditions,
9389 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9390 Fput (Qcoding_system_error, Qerror_message,
9391 build_string ("Invalid coding system"));
9392
9393 /* Intern this now in case it isn't already done.
9394 Setting this variable twice is harmless.
9395 But don't staticpro it here--that is done in alloc.c. */
9396 Qchar_table_extra_slots = intern ("char-table-extra-slots");
9397
9398 DEFSYM (Qtranslation_table, "translation-table");
9399 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9400 DEFSYM (Qtranslation_table_id, "translation-table-id");
9401 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9402 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9403
9404 DEFSYM (Qvalid_codes, "valid-codes");
9405
9406 DEFSYM (Qemacs_mule, "emacs-mule");
9407
9408 DEFSYM (QCcategory, ":category");
9409 DEFSYM (QCmnemonic, ":mnemonic");
9410 DEFSYM (QCdefalut_char, ":default-char");
9411 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9412 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9413 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9414 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9415 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9416
9417 Vcoding_category_table
9418 = Fmake_vector (make_number (coding_category_max), Qnil);
9419 staticpro (&Vcoding_category_table);
9420 /* Followings are target of code detection. */
9421 ASET (Vcoding_category_table, coding_category_iso_7,
9422 intern ("coding-category-iso-7"));
9423 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9424 intern ("coding-category-iso-7-tight"));
9425 ASET (Vcoding_category_table, coding_category_iso_8_1,
9426 intern ("coding-category-iso-8-1"));
9427 ASET (Vcoding_category_table, coding_category_iso_8_2,
9428 intern ("coding-category-iso-8-2"));
9429 ASET (Vcoding_category_table, coding_category_iso_7_else,
9430 intern ("coding-category-iso-7-else"));
9431 ASET (Vcoding_category_table, coding_category_iso_8_else,
9432 intern ("coding-category-iso-8-else"));
9433 ASET (Vcoding_category_table, coding_category_utf_8,
9434 intern ("coding-category-utf-8"));
9435 ASET (Vcoding_category_table, coding_category_utf_16_be,
9436 intern ("coding-category-utf-16-be"));
9437 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9438 intern ("coding-category-utf-16-auto"));
9439 ASET (Vcoding_category_table, coding_category_utf_16_le,
9440 intern ("coding-category-utf-16-le"));
9441 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9442 intern ("coding-category-utf-16-be-nosig"));
9443 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9444 intern ("coding-category-utf-16-le-nosig"));
9445 ASET (Vcoding_category_table, coding_category_charset,
9446 intern ("coding-category-charset"));
9447 ASET (Vcoding_category_table, coding_category_sjis,
9448 intern ("coding-category-sjis"));
9449 ASET (Vcoding_category_table, coding_category_big5,
9450 intern ("coding-category-big5"));
9451 ASET (Vcoding_category_table, coding_category_ccl,
9452 intern ("coding-category-ccl"));
9453 ASET (Vcoding_category_table, coding_category_emacs_mule,
9454 intern ("coding-category-emacs-mule"));
9455 /* Followings are NOT target of code detection. */
9456 ASET (Vcoding_category_table, coding_category_raw_text,
9457 intern ("coding-category-raw-text"));
9458 ASET (Vcoding_category_table, coding_category_undecided,
9459 intern ("coding-category-undecided"));
9460
9461 DEFSYM (Qinsufficient_source, "insufficient-source");
9462 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9463 DEFSYM (Qinvalid_source, "invalid-source");
9464 DEFSYM (Qinterrupted, "interrupted");
9465 DEFSYM (Qinsufficient_memory, "insufficient-memory");
9466 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9467
9468 defsubr (&Scoding_system_p);
9469 defsubr (&Sread_coding_system);
9470 defsubr (&Sread_non_nil_coding_system);
9471 defsubr (&Scheck_coding_system);
9472 defsubr (&Sdetect_coding_region);
9473 defsubr (&Sdetect_coding_string);
9474 defsubr (&Sfind_coding_systems_region_internal);
9475 defsubr (&Sunencodable_char_position);
9476 defsubr (&Scheck_coding_systems_region);
9477 defsubr (&Sdecode_coding_region);
9478 defsubr (&Sencode_coding_region);
9479 defsubr (&Sdecode_coding_string);
9480 defsubr (&Sencode_coding_string);
9481 defsubr (&Sdecode_sjis_char);
9482 defsubr (&Sencode_sjis_char);
9483 defsubr (&Sdecode_big5_char);
9484 defsubr (&Sencode_big5_char);
9485 defsubr (&Sset_terminal_coding_system_internal);
9486 defsubr (&Sset_safe_terminal_coding_system_internal);
9487 defsubr (&Sterminal_coding_system);
9488 defsubr (&Sset_keyboard_coding_system_internal);
9489 defsubr (&Skeyboard_coding_system);
9490 defsubr (&Sfind_operation_coding_system);
9491 defsubr (&Sset_coding_system_priority);
9492 defsubr (&Sdefine_coding_system_internal);
9493 defsubr (&Sdefine_coding_system_alias);
9494 defsubr (&Scoding_system_put);
9495 defsubr (&Scoding_system_base);
9496 defsubr (&Scoding_system_plist);
9497 defsubr (&Scoding_system_aliases);
9498 defsubr (&Scoding_system_eol_type);
9499 defsubr (&Scoding_system_priority_list);
9500
9501 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9502 doc: /* List of coding systems.
9503
9504 Do not alter the value of this variable manually. This variable should be
9505 updated by the functions `define-coding-system' and
9506 `define-coding-system-alias'. */);
9507 Vcoding_system_list = Qnil;
9508
9509 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9510 doc: /* Alist of coding system names.
9511 Each element is one element list of coding system name.
9512 This variable is given to `completing-read' as TABLE argument.
9513
9514 Do not alter the value of this variable manually. This variable should be
9515 updated by the functions `make-coding-system' and
9516 `define-coding-system-alias'. */);
9517 Vcoding_system_alist = Qnil;
9518
9519 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9520 doc: /* List of coding-categories (symbols) ordered by priority.
9521
9522 On detecting a coding system, Emacs tries code detection algorithms
9523 associated with each coding-category one by one in this order. When
9524 one algorithm agrees with a byte sequence of source text, the coding
9525 system bound to the corresponding coding-category is selected.
9526
9527 Don't modify this variable directly, but use `set-coding-priority'. */);
9528 {
9529 int i;
9530
9531 Vcoding_category_list = Qnil;
9532 for (i = coding_category_max - 1; i >= 0; i--)
9533 Vcoding_category_list
9534 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9535 Vcoding_category_list);
9536 }
9537
9538 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9539 doc: /* Specify the coding system for read operations.
9540 It is useful to bind this variable with `let', but do not set it globally.
9541 If the value is a coding system, it is used for decoding on read operation.
9542 If not, an appropriate element is used from one of the coding system alists:
9543 There are three such tables, `file-coding-system-alist',
9544 `process-coding-system-alist', and `network-coding-system-alist'. */);
9545 Vcoding_system_for_read = Qnil;
9546
9547 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9548 doc: /* Specify the coding system for write operations.
9549 Programs bind this variable with `let', but you should not set it globally.
9550 If the value is a coding system, it is used for encoding of output,
9551 when writing it to a file and when sending it to a file or subprocess.
9552
9553 If this does not specify a coding system, an appropriate element
9554 is used from one of the coding system alists:
9555 There are three such tables, `file-coding-system-alist',
9556 `process-coding-system-alist', and `network-coding-system-alist'.
9557 For output to files, if the above procedure does not specify a coding system,
9558 the value of `buffer-file-coding-system' is used. */);
9559 Vcoding_system_for_write = Qnil;
9560
9561 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9562 doc: /*
9563 Coding system used in the latest file or process I/O. */);
9564 Vlast_coding_system_used = Qnil;
9565
9566 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9567 doc: /*
9568 Error status of the last code conversion.
9569
9570 When an error was detected in the last code conversion, this variable
9571 is set to one of the following symbols.
9572 `insufficient-source'
9573 `inconsistent-eol'
9574 `invalid-source'
9575 `interrupted'
9576 `insufficient-memory'
9577 When no error was detected, the value doesn't change. So, to check
9578 the error status of a code conversion by this variable, you must
9579 explicitly set this variable to nil before performing code
9580 conversion. */);
9581 Vlast_code_conversion_error = Qnil;
9582
9583 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9584 doc: /*
9585 *Non-nil means always inhibit code conversion of end-of-line format.
9586 See info node `Coding Systems' and info node `Text and Binary' concerning
9587 such conversion. */);
9588 inhibit_eol_conversion = 0;
9589
9590 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9591 doc: /*
9592 Non-nil means process buffer inherits coding system of process output.
9593 Bind it to t if the process output is to be treated as if it were a file
9594 read from some filesystem. */);
9595 inherit_process_coding_system = 0;
9596
9597 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9598 doc: /*
9599 Alist to decide a coding system to use for a file I/O operation.
9600 The format is ((PATTERN . VAL) ...),
9601 where PATTERN is a regular expression matching a file name,
9602 VAL is a coding system, a cons of coding systems, or a function symbol.
9603 If VAL is a coding system, it is used for both decoding and encoding
9604 the file contents.
9605 If VAL is a cons of coding systems, the car part is used for decoding,
9606 and the cdr part is used for encoding.
9607 If VAL is a function symbol, the function must return a coding system
9608 or a cons of coding systems which are used as above. The function gets
9609 the arguments with which `find-operation-coding-systems' was called.
9610
9611 See also the function `find-operation-coding-system'
9612 and the variable `auto-coding-alist'. */);
9613 Vfile_coding_system_alist = Qnil;
9614
9615 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9616 doc: /*
9617 Alist to decide a coding system to use for a process I/O operation.
9618 The format is ((PATTERN . VAL) ...),
9619 where PATTERN is a regular expression matching a program name,
9620 VAL is a coding system, a cons of coding systems, or a function symbol.
9621 If VAL is a coding system, it is used for both decoding what received
9622 from the program and encoding what sent to the program.
9623 If VAL is a cons of coding systems, the car part is used for decoding,
9624 and the cdr part is used for encoding.
9625 If VAL is a function symbol, the function must return a coding system
9626 or a cons of coding systems which are used as above.
9627
9628 See also the function `find-operation-coding-system'. */);
9629 Vprocess_coding_system_alist = Qnil;
9630
9631 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9632 doc: /*
9633 Alist to decide a coding system to use for a network I/O operation.
9634 The format is ((PATTERN . VAL) ...),
9635 where PATTERN is a regular expression matching a network service name
9636 or is a port number to connect to,
9637 VAL is a coding system, a cons of coding systems, or a function symbol.
9638 If VAL is a coding system, it is used for both decoding what received
9639 from the network stream and encoding what sent to the network stream.
9640 If VAL is a cons of coding systems, the car part is used for decoding,
9641 and the cdr part is used for encoding.
9642 If VAL is a function symbol, the function must return a coding system
9643 or a cons of coding systems which are used as above.
9644
9645 See also the function `find-operation-coding-system'. */);
9646 Vnetwork_coding_system_alist = Qnil;
9647
9648 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9649 doc: /* Coding system to use with system messages.
9650 Also used for decoding keyboard input on X Window system. */);
9651 Vlocale_coding_system = Qnil;
9652
9653 /* The eol mnemonics are reset in startup.el system-dependently. */
9654 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9655 doc: /*
9656 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9657 eol_mnemonic_unix = build_string (":");
9658
9659 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9660 doc: /*
9661 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9662 eol_mnemonic_dos = build_string ("\\");
9663
9664 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9665 doc: /*
9666 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9667 eol_mnemonic_mac = build_string ("/");
9668
9669 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9670 doc: /*
9671 *String displayed in mode line when end-of-line format is not yet determined. */);
9672 eol_mnemonic_undecided = build_string (":");
9673
9674 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9675 doc: /*
9676 *Non-nil enables character translation while encoding and decoding. */);
9677 Venable_character_translation = Qt;
9678
9679 DEFVAR_LISP ("standard-translation-table-for-decode",
9680 &Vstandard_translation_table_for_decode,
9681 doc: /* Table for translating characters while decoding. */);
9682 Vstandard_translation_table_for_decode = Qnil;
9683
9684 DEFVAR_LISP ("standard-translation-table-for-encode",
9685 &Vstandard_translation_table_for_encode,
9686 doc: /* Table for translating characters while encoding. */);
9687 Vstandard_translation_table_for_encode = Qnil;
9688
9689 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9690 doc: /* Alist of charsets vs revision numbers.
9691 While encoding, if a charset (car part of an element) is found,
9692 designate it with the escape sequence identifying revision (cdr part
9693 of the element). */);
9694 Vcharset_revision_table = Qnil;
9695
9696 DEFVAR_LISP ("default-process-coding-system",
9697 &Vdefault_process_coding_system,
9698 doc: /* Cons of coding systems used for process I/O by default.
9699 The car part is used for decoding a process output,
9700 the cdr part is used for encoding a text to be sent to a process. */);
9701 Vdefault_process_coding_system = Qnil;
9702
9703 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9704 doc: /*
9705 Table of extra Latin codes in the range 128..159 (inclusive).
9706 This is a vector of length 256.
9707 If Nth element is non-nil, the existence of code N in a file
9708 \(or output of subprocess) doesn't prevent it to be detected as
9709 a coding system of ISO 2022 variant which has a flag
9710 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9711 or reading output of a subprocess.
9712 Only 128th through 159th elements has a meaning. */);
9713 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9714
9715 DEFVAR_LISP ("select-safe-coding-system-function",
9716 &Vselect_safe_coding_system_function,
9717 doc: /*
9718 Function to call to select safe coding system for encoding a text.
9719
9720 If set, this function is called to force a user to select a proper
9721 coding system which can encode the text in the case that a default
9722 coding system used in each operation can't encode the text.
9723
9724 The default value is `select-safe-coding-system' (which see). */);
9725 Vselect_safe_coding_system_function = Qnil;
9726
9727 DEFVAR_BOOL ("coding-system-require-warning",
9728 &coding_system_require_warning,
9729 doc: /* Internal use only.
9730 If non-nil, on writing a file, `select-safe-coding-system-function' is
9731 called even if `coding-system-for-write' is non-nil. The command
9732 `universal-coding-system-argument' binds this variable to t temporarily. */);
9733 coding_system_require_warning = 0;
9734
9735
9736 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9737 &inhibit_iso_escape_detection,
9738 doc: /*
9739 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9740
9741 By default, on reading a file, Emacs tries to detect how the text is
9742 encoded. This code detection is sensitive to escape sequences. If
9743 the sequence is valid as ISO2022, the code is determined as one of
9744 the ISO2022 encodings, and the file is decoded by the corresponding
9745 coding system (e.g. `iso-2022-7bit').
9746
9747 However, there may be a case that you want to read escape sequences in
9748 a file as is. In such a case, you can set this variable to non-nil.
9749 Then, as the code detection ignores any escape sequences, no file is
9750 detected as encoded in some ISO2022 encoding. The result is that all
9751 escape sequences become visible in a buffer.
9752
9753 The default value is nil, and it is strongly recommended not to change
9754 it. That is because many Emacs Lisp source files that contain
9755 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9756 in Emacs's distribution, and they won't be decoded correctly on
9757 reading if you suppress escape sequence detection.
9758
9759 The other way to read escape sequences in a file without decoding is
9760 to explicitly specify some coding system that doesn't use ISO2022's
9761 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9762 inhibit_iso_escape_detection = 0;
9763
9764 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9765 doc: /* Char table for translating self-inserting characters.
9766 This is applied to the result of input methods, not their input. See also
9767 `keyboard-translate-table'. */);
9768 Vtranslation_table_for_input = Qnil;
9769
9770 {
9771 Lisp_Object args[coding_arg_max];
9772 Lisp_Object plist[16];
9773 int i;
9774
9775 for (i = 0; i < coding_arg_max; i++)
9776 args[i] = Qnil;
9777
9778 plist[0] = intern (":name");
9779 plist[1] = args[coding_arg_name] = Qno_conversion;
9780 plist[2] = intern (":mnemonic");
9781 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9782 plist[4] = intern (":coding-type");
9783 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9784 plist[6] = intern (":ascii-compatible-p");
9785 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9786 plist[8] = intern (":default-char");
9787 plist[9] = args[coding_arg_default_char] = make_number (0);
9788 plist[10] = intern (":for-unibyte");
9789 plist[11] = args[coding_arg_for_unibyte] = Qt;
9790 plist[12] = intern (":docstring");
9791 plist[13] = build_string ("Do no conversion.\n\
9792 \n\
9793 When you visit a file with this coding, the file is read into a\n\
9794 unibyte buffer as is, thus each byte of a file is treated as a\n\
9795 character.");
9796 plist[14] = intern (":eol-type");
9797 plist[15] = args[coding_arg_eol_type] = Qunix;
9798 args[coding_arg_plist] = Flist (16, plist);
9799 Fdefine_coding_system_internal (coding_arg_max, args);
9800
9801 plist[1] = args[coding_arg_name] = Qundecided;
9802 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9803 plist[5] = args[coding_arg_coding_type] = Qundecided;
9804 /* This is already set.
9805 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9806 plist[8] = intern (":charset-list");
9807 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9808 plist[11] = args[coding_arg_for_unibyte] = Qnil;
9809 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9810 plist[15] = args[coding_arg_eol_type] = Qnil;
9811 args[coding_arg_plist] = Flist (16, plist);
9812 Fdefine_coding_system_internal (coding_arg_max, args);
9813 }
9814
9815 setup_coding_system (Qno_conversion, &keyboard_coding);
9816 setup_coding_system (Qundecided, &terminal_coding);
9817 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9818
9819 {
9820 int i;
9821
9822 for (i = 0; i < coding_category_max; i++)
9823 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9824 }
9825 #if defined (MSDOS) || defined (WINDOWSNT)
9826 system_eol_type = Qdos;
9827 #else
9828 system_eol_type = Qunix;
9829 #endif
9830 staticpro (&system_eol_type);
9831 }
9832
9833 char *
9834 emacs_strerror (error_number)
9835 int error_number;
9836 {
9837 char *str;
9838
9839 synchronize_system_messages_locale ();
9840 str = strerror (error_number);
9841
9842 if (! NILP (Vlocale_coding_system))
9843 {
9844 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9845 Vlocale_coding_system,
9846 0);
9847 str = (char *) SDATA (dec);
9848 }
9849
9850 return str;
9851 }
9852
9853 #endif /* emacs */
9854
9855 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9856 (do not change this comment) */