(inhibit_pre_post_conversion): Removed (unused).
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory;
315
316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
318 Lisp_Object Qstart_process, Qopen_network_stream;
319 Lisp_Object Qtarget_idx;
320
321 int coding_system_require_warning;
322
323 Lisp_Object Vselect_safe_coding_system_function;
324
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
327 /* Mnemonic string to indicate format of end-of-line is not yet
328 decided. */
329 Lisp_Object eol_mnemonic_undecided;
330
331 #ifdef emacs
332
333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
334
335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
336
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule, Qraw_text;
340 Lisp_Object Qutf_8_emacs;
341
342 /* Coding-systems are handed between Emacs Lisp programs and C internal
343 routines by the following three variables. */
344 /* Coding-system for reading files and receiving data from process. */
345 Lisp_Object Vcoding_system_for_read;
346 /* Coding-system for writing files and sending data to process. */
347 Lisp_Object Vcoding_system_for_write;
348 /* Coding-system actually used in the latest I/O. */
349 Lisp_Object Vlast_coding_system_used;
350
351 /* A vector of length 256 which contains information about special
352 Latin codes (especially for dealing with Microsoft codes). */
353 Lisp_Object Vlatin_extra_code_table;
354
355 /* Flag to inhibit code conversion of end-of-line format. */
356 int inhibit_eol_conversion;
357
358 /* Flag to inhibit ISO2022 escape sequence detection. */
359 int inhibit_iso_escape_detection;
360
361 /* Flag to make buffer-file-coding-system inherit from process-coding. */
362 int inherit_process_coding_system;
363
364 /* Coding system to be used to encode text for terminal display. */
365 struct coding_system terminal_coding;
366
367 /* Coding system to be used to encode text for terminal display when
368 terminal coding system is nil. */
369 struct coding_system safe_terminal_coding;
370
371 /* Coding system of what is sent from terminal keyboard. */
372 struct coding_system keyboard_coding;
373
374 Lisp_Object Vfile_coding_system_alist;
375 Lisp_Object Vprocess_coding_system_alist;
376 Lisp_Object Vnetwork_coding_system_alist;
377
378 Lisp_Object Vlocale_coding_system;
379
380 #endif /* emacs */
381
382 /* Flag to tell if we look up translation table on character code
383 conversion. */
384 Lisp_Object Venable_character_translation;
385 /* Standard translation table to look up on decoding (reading). */
386 Lisp_Object Vstandard_translation_table_for_decode;
387 /* Standard translation table to look up on encoding (writing). */
388 Lisp_Object Vstandard_translation_table_for_encode;
389
390 Lisp_Object Qtranslation_table;
391 Lisp_Object Qtranslation_table_id;
392 Lisp_Object Qtranslation_table_for_decode;
393 Lisp_Object Qtranslation_table_for_encode;
394
395 /* Alist of charsets vs revision number. */
396 static Lisp_Object Vcharset_revision_table;
397
398 /* Default coding systems used for process I/O. */
399 Lisp_Object Vdefault_process_coding_system;
400
401 /* Char table for translating Quail and self-inserting input. */
402 Lisp_Object Vtranslation_table_for_input;
403
404 /* Two special coding systems. */
405 Lisp_Object Vsjis_coding_system;
406 Lisp_Object Vbig5_coding_system;
407
408
409 static int detect_coding_utf_8 P_ ((struct coding_system *,
410 struct coding_detection_info *info));
411 static void decode_coding_utf_8 P_ ((struct coding_system *));
412 static int encode_coding_utf_8 P_ ((struct coding_system *));
413
414 static int detect_coding_utf_16 P_ ((struct coding_system *,
415 struct coding_detection_info *info));
416 static void decode_coding_utf_16 P_ ((struct coding_system *));
417 static int encode_coding_utf_16 P_ ((struct coding_system *));
418
419 static int detect_coding_iso_2022 P_ ((struct coding_system *,
420 struct coding_detection_info *info));
421 static void decode_coding_iso_2022 P_ ((struct coding_system *));
422 static int encode_coding_iso_2022 P_ ((struct coding_system *));
423
424 static int detect_coding_emacs_mule P_ ((struct coding_system *,
425 struct coding_detection_info *info));
426 static void decode_coding_emacs_mule P_ ((struct coding_system *));
427 static int encode_coding_emacs_mule P_ ((struct coding_system *));
428
429 static int detect_coding_sjis P_ ((struct coding_system *,
430 struct coding_detection_info *info));
431 static void decode_coding_sjis P_ ((struct coding_system *));
432 static int encode_coding_sjis P_ ((struct coding_system *));
433
434 static int detect_coding_big5 P_ ((struct coding_system *,
435 struct coding_detection_info *info));
436 static void decode_coding_big5 P_ ((struct coding_system *));
437 static int encode_coding_big5 P_ ((struct coding_system *));
438
439 static int detect_coding_ccl P_ ((struct coding_system *,
440 struct coding_detection_info *info));
441 static void decode_coding_ccl P_ ((struct coding_system *));
442 static int encode_coding_ccl P_ ((struct coding_system *));
443
444 static void decode_coding_raw_text P_ ((struct coding_system *));
445 static int encode_coding_raw_text P_ ((struct coding_system *));
446
447
448 /* ISO2022 section */
449
450 #define CODING_ISO_INITIAL(coding, reg) \
451 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
452 coding_attr_iso_initial), \
453 reg)))
454
455
456 #define CODING_ISO_REQUEST(coding, charset_id) \
457 ((charset_id <= (coding)->max_charset_id \
458 ? (coding)->safe_charsets[charset_id] \
459 : -1))
460
461
462 #define CODING_ISO_FLAGS(coding) \
463 ((coding)->spec.iso_2022.flags)
464 #define CODING_ISO_DESIGNATION(coding, reg) \
465 ((coding)->spec.iso_2022.current_designation[reg])
466 #define CODING_ISO_INVOCATION(coding, plane) \
467 ((coding)->spec.iso_2022.current_invocation[plane])
468 #define CODING_ISO_SINGLE_SHIFTING(coding) \
469 ((coding)->spec.iso_2022.single_shifting)
470 #define CODING_ISO_BOL(coding) \
471 ((coding)->spec.iso_2022.bol)
472 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
473 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
474
475 /* Control characters of ISO2022. */
476 /* code */ /* function */
477 #define ISO_CODE_LF 0x0A /* line-feed */
478 #define ISO_CODE_CR 0x0D /* carriage-return */
479 #define ISO_CODE_SO 0x0E /* shift-out */
480 #define ISO_CODE_SI 0x0F /* shift-in */
481 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
482 #define ISO_CODE_ESC 0x1B /* escape */
483 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
484 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
485 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
486
487 /* All code (1-byte) of ISO2022 is classified into one of the
488 followings. */
489 enum iso_code_class_type
490 {
491 ISO_control_0, /* Control codes in the range
492 0x00..0x1F and 0x7F, except for the
493 following 5 codes. */
494 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
495 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
496 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
497 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
498 ISO_escape, /* ISO_CODE_SO (0x1B) */
499 ISO_control_1, /* Control codes in the range
500 0x80..0x9F, except for the
501 following 3 codes. */
502 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
503 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
504 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
505 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
506 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
507 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
508 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
509 };
510
511 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
512 `iso-flags' attribute of an iso2022 coding system. */
513
514 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
515 instead of the correct short-form sequence (e.g. ESC $ A). */
516 #define CODING_ISO_FLAG_LONG_FORM 0x0001
517
518 /* If set, reset graphic planes and registers at end-of-line to the
519 initial state. */
520 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
521
522 /* If set, reset graphic planes and registers before any control
523 characters to the initial state. */
524 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
525
526 /* If set, encode by 7-bit environment. */
527 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
528
529 /* If set, use locking-shift function. */
530 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
531
532 /* If set, use single-shift function. Overwrite
533 CODING_ISO_FLAG_LOCKING_SHIFT. */
534 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
535
536 /* If set, use designation escape sequence. */
537 #define CODING_ISO_FLAG_DESIGNATION 0x0040
538
539 /* If set, produce revision number sequence. */
540 #define CODING_ISO_FLAG_REVISION 0x0080
541
542 /* If set, produce ISO6429's direction specifying sequence. */
543 #define CODING_ISO_FLAG_DIRECTION 0x0100
544
545 /* If set, assume designation states are reset at beginning of line on
546 output. */
547 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
548
549 /* If set, designation sequence should be placed at beginning of line
550 on output. */
551 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
552
553 /* If set, do not encode unsafe charactes on output. */
554 #define CODING_ISO_FLAG_SAFE 0x0800
555
556 /* If set, extra latin codes (128..159) are accepted as a valid code
557 on input. */
558 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
559
560 #define CODING_ISO_FLAG_COMPOSITION 0x2000
561
562 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
563
564 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
565
566 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
567
568 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
569
570 /* A character to be produced on output if encoding of the original
571 character is prohibited by CODING_ISO_FLAG_SAFE. */
572 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
573
574
575 /* UTF-16 section */
576 #define CODING_UTF_16_BOM(coding) \
577 ((coding)->spec.utf_16.bom)
578
579 #define CODING_UTF_16_ENDIAN(coding) \
580 ((coding)->spec.utf_16.endian)
581
582 #define CODING_UTF_16_SURROGATE(coding) \
583 ((coding)->spec.utf_16.surrogate)
584
585
586 /* CCL section */
587 #define CODING_CCL_DECODER(coding) \
588 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
589 #define CODING_CCL_ENCODER(coding) \
590 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
591 #define CODING_CCL_VALIDS(coding) \
592 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
593
594 /* Index for each coding category in `coding_categories' */
595
596 enum coding_category
597 {
598 coding_category_iso_7,
599 coding_category_iso_7_tight,
600 coding_category_iso_8_1,
601 coding_category_iso_8_2,
602 coding_category_iso_7_else,
603 coding_category_iso_8_else,
604 coding_category_utf_8,
605 coding_category_utf_16_auto,
606 coding_category_utf_16_be,
607 coding_category_utf_16_le,
608 coding_category_utf_16_be_nosig,
609 coding_category_utf_16_le_nosig,
610 coding_category_charset,
611 coding_category_sjis,
612 coding_category_big5,
613 coding_category_ccl,
614 coding_category_emacs_mule,
615 /* All above are targets of code detection. */
616 coding_category_raw_text,
617 coding_category_undecided,
618 coding_category_max
619 };
620
621 /* Definitions of flag bits used in detect_coding_XXXX. */
622 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
623 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
629 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
630 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
631 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
632 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
633 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
634 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
635 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
636 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
637 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
638 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
639 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
640
641 /* This value is returned if detect_coding_mask () find nothing other
642 than ASCII characters. */
643 #define CATEGORY_MASK_ANY \
644 (CATEGORY_MASK_ISO_7 \
645 | CATEGORY_MASK_ISO_7_TIGHT \
646 | CATEGORY_MASK_ISO_8_1 \
647 | CATEGORY_MASK_ISO_8_2 \
648 | CATEGORY_MASK_ISO_7_ELSE \
649 | CATEGORY_MASK_ISO_8_ELSE \
650 | CATEGORY_MASK_UTF_8 \
651 | CATEGORY_MASK_UTF_16_BE \
652 | CATEGORY_MASK_UTF_16_LE \
653 | CATEGORY_MASK_UTF_16_BE_NOSIG \
654 | CATEGORY_MASK_UTF_16_LE_NOSIG \
655 | CATEGORY_MASK_CHARSET \
656 | CATEGORY_MASK_SJIS \
657 | CATEGORY_MASK_BIG5 \
658 | CATEGORY_MASK_CCL \
659 | CATEGORY_MASK_EMACS_MULE)
660
661
662 #define CATEGORY_MASK_ISO_7BIT \
663 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
664
665 #define CATEGORY_MASK_ISO_8BIT \
666 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
667
668 #define CATEGORY_MASK_ISO_ELSE \
669 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
670
671 #define CATEGORY_MASK_ISO_ESCAPE \
672 (CATEGORY_MASK_ISO_7 \
673 | CATEGORY_MASK_ISO_7_TIGHT \
674 | CATEGORY_MASK_ISO_7_ELSE \
675 | CATEGORY_MASK_ISO_8_ELSE)
676
677 #define CATEGORY_MASK_ISO \
678 ( CATEGORY_MASK_ISO_7BIT \
679 | CATEGORY_MASK_ISO_8BIT \
680 | CATEGORY_MASK_ISO_ELSE)
681
682 #define CATEGORY_MASK_UTF_16 \
683 (CATEGORY_MASK_UTF_16_BE \
684 | CATEGORY_MASK_UTF_16_LE \
685 | CATEGORY_MASK_UTF_16_BE_NOSIG \
686 | CATEGORY_MASK_UTF_16_LE_NOSIG)
687
688
689 /* List of symbols `coding-category-xxx' ordered by priority. This
690 variable is exposed to Emacs Lisp. */
691 static Lisp_Object Vcoding_category_list;
692
693 /* Table of coding categories (Lisp symbols). This variable is for
694 internal use oly. */
695 static Lisp_Object Vcoding_category_table;
696
697 /* Table of coding-categories ordered by priority. */
698 static enum coding_category coding_priorities[coding_category_max];
699
700 /* Nth element is a coding context for the coding system bound to the
701 Nth coding category. */
702 static struct coding_system coding_categories[coding_category_max];
703
704 /*** Commonly used macros and functions ***/
705
706 #ifndef min
707 #define min(a, b) ((a) < (b) ? (a) : (b))
708 #endif
709 #ifndef max
710 #define max(a, b) ((a) > (b) ? (a) : (b))
711 #endif
712
713 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
714 do { \
715 attrs = CODING_ID_ATTRS (coding->id); \
716 eol_type = CODING_ID_EOL_TYPE (coding->id); \
717 if (VECTORP (eol_type)) \
718 eol_type = Qunix; \
719 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
720 } while (0)
721
722
723 /* Safely get one byte from the source text pointed by SRC which ends
724 at SRC_END, and set C to that byte. If there are not enough bytes
725 in the source, it jumps to `no_more_source'. The caller
726 should declare and set these variables appropriately in advance:
727 src, src_end, multibytep
728 */
729
730 #define ONE_MORE_BYTE(c) \
731 do { \
732 if (src == src_end) \
733 { \
734 if (src_base < src) \
735 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
736 goto no_more_source; \
737 } \
738 c = *src++; \
739 if (multibytep && (c & 0x80)) \
740 { \
741 if ((c & 0xFE) != 0xC0) \
742 error ("Undecodable char found"); \
743 c = ((c & 1) << 6) | *src++; \
744 } \
745 consumed_chars++; \
746 } while (0)
747
748
749 #define ONE_MORE_BYTE_NO_CHECK(c) \
750 do { \
751 c = *src++; \
752 if (multibytep && (c & 0x80)) \
753 { \
754 if ((c & 0xFE) != 0xC0) \
755 error ("Undecodable char found"); \
756 c = ((c & 1) << 6) | *src++; \
757 } \
758 consumed_chars++; \
759 } while (0)
760
761
762 /* Store a byte C in the place pointed by DST and increment DST to the
763 next free point, and increment PRODUCED_CHARS. The caller should
764 assure that C is 0..127, and declare and set the variable `dst'
765 appropriately in advance.
766 */
767
768
769 #define EMIT_ONE_ASCII_BYTE(c) \
770 do { \
771 produced_chars++; \
772 *dst++ = (c); \
773 } while (0)
774
775
776 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
777
778 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
779 do { \
780 produced_chars += 2; \
781 *dst++ = (c1), *dst++ = (c2); \
782 } while (0)
783
784
785 /* Store a byte C in the place pointed by DST and increment DST to the
786 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
787 nonzero, store in an appropriate multibyte from. The caller should
788 declare and set the variables `dst' and `multibytep' appropriately
789 in advance. */
790
791 #define EMIT_ONE_BYTE(c) \
792 do { \
793 produced_chars++; \
794 if (multibytep) \
795 { \
796 int ch = (c); \
797 if (ch >= 0x80) \
798 ch = BYTE8_TO_CHAR (ch); \
799 CHAR_STRING_ADVANCE (ch, dst); \
800 } \
801 else \
802 *dst++ = (c); \
803 } while (0)
804
805
806 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
807
808 #define EMIT_TWO_BYTES(c1, c2) \
809 do { \
810 produced_chars += 2; \
811 if (multibytep) \
812 { \
813 int ch; \
814 \
815 ch = (c1); \
816 if (ch >= 0x80) \
817 ch = BYTE8_TO_CHAR (ch); \
818 CHAR_STRING_ADVANCE (ch, dst); \
819 ch = (c2); \
820 if (ch >= 0x80) \
821 ch = BYTE8_TO_CHAR (ch); \
822 CHAR_STRING_ADVANCE (ch, dst); \
823 } \
824 else \
825 { \
826 *dst++ = (c1); \
827 *dst++ = (c2); \
828 } \
829 } while (0)
830
831
832 #define EMIT_THREE_BYTES(c1, c2, c3) \
833 do { \
834 EMIT_ONE_BYTE (c1); \
835 EMIT_TWO_BYTES (c2, c3); \
836 } while (0)
837
838
839 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
840 do { \
841 EMIT_TWO_BYTES (c1, c2); \
842 EMIT_TWO_BYTES (c3, c4); \
843 } while (0)
844
845
846 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
847 do { \
848 charset_map_loaded = 0; \
849 c = DECODE_CHAR (charset, code); \
850 if (charset_map_loaded) \
851 { \
852 const unsigned char *orig = coding->source; \
853 EMACS_INT offset; \
854 \
855 coding_set_source (coding); \
856 offset = coding->source - orig; \
857 src += offset; \
858 src_base += offset; \
859 src_end += offset; \
860 } \
861 } while (0)
862
863
864 #define ASSURE_DESTINATION(bytes) \
865 do { \
866 if (dst + (bytes) >= dst_end) \
867 { \
868 int more_bytes = charbuf_end - charbuf + (bytes); \
869 \
870 dst = alloc_destination (coding, more_bytes, dst); \
871 dst_end = coding->destination + coding->dst_bytes; \
872 } \
873 } while (0)
874
875
876
877 static void
878 coding_set_source (coding)
879 struct coding_system *coding;
880 {
881 if (BUFFERP (coding->src_object))
882 {
883 struct buffer *buf = XBUFFER (coding->src_object);
884
885 if (coding->src_pos < 0)
886 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
887 else
888 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
889 }
890 else if (STRINGP (coding->src_object))
891 {
892 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
893 }
894 else
895 /* Otherwise, the source is C string and is never relocated
896 automatically. Thus we don't have to update anything. */
897 ;
898 }
899
900 static void
901 coding_set_destination (coding)
902 struct coding_system *coding;
903 {
904 if (BUFFERP (coding->dst_object))
905 {
906 if (coding->src_pos < 0)
907 {
908 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
909 coding->dst_bytes = (GAP_END_ADDR
910 - (coding->src_bytes - coding->consumed)
911 - coding->destination);
912 }
913 else
914 {
915 /* We are sure that coding->dst_pos_byte is before the gap
916 of the buffer. */
917 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
918 + coding->dst_pos_byte - 1);
919 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
920 - coding->destination);
921 }
922 }
923 else
924 /* Otherwise, the destination is C string and is never relocated
925 automatically. Thus we don't have to update anything. */
926 ;
927 }
928
929
930 static void
931 coding_alloc_by_realloc (coding, bytes)
932 struct coding_system *coding;
933 EMACS_INT bytes;
934 {
935 coding->destination = (unsigned char *) xrealloc (coding->destination,
936 coding->dst_bytes + bytes);
937 coding->dst_bytes += bytes;
938 }
939
940 static void
941 coding_alloc_by_making_gap (coding, bytes)
942 struct coding_system *coding;
943 EMACS_INT bytes;
944 {
945 if (BUFFERP (coding->dst_object)
946 && EQ (coding->src_object, coding->dst_object))
947 {
948 EMACS_INT add = coding->src_bytes - coding->consumed;
949
950 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
951 make_gap (bytes);
952 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
953 }
954 else
955 {
956 Lisp_Object this_buffer;
957
958 this_buffer = Fcurrent_buffer ();
959 set_buffer_internal (XBUFFER (coding->dst_object));
960 make_gap (bytes);
961 set_buffer_internal (XBUFFER (this_buffer));
962 }
963 }
964
965
966 static unsigned char *
967 alloc_destination (coding, nbytes, dst)
968 struct coding_system *coding;
969 EMACS_INT nbytes;
970 unsigned char *dst;
971 {
972 EMACS_INT offset = dst - coding->destination;
973
974 if (BUFFERP (coding->dst_object))
975 coding_alloc_by_making_gap (coding, nbytes);
976 else
977 coding_alloc_by_realloc (coding, nbytes);
978 coding->result = CODING_RESULT_SUCCESS;
979 coding_set_destination (coding);
980 dst = coding->destination + offset;
981 return dst;
982 }
983
984 /** Macros for annotations. */
985
986 /* Maximum length of annotation data (sum of annotations for
987 composition and charset). */
988 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
989
990 /* An annotation data is stored in the array coding->charbuf in this
991 format:
992 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
993 LENGTH is the number of elements in the annotation.
994 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
995 FROM and TO specify the range of text annotated. They are relative
996 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
997
998 The format of the following elements depend on ANNOTATION_MASK.
999
1000 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1001 follows:
1002 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1003 METHOD is one of enum composition_method.
1004 Optionnal COMPOSITION-COMPONENTS are characters and composition
1005 rules.
1006
1007 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1008 follows. */
1009
1010 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1011 do { \
1012 *(buf)++ = -(len); \
1013 *(buf)++ = (mask); \
1014 *(buf)++ = (from); \
1015 *(buf)++ = (to); \
1016 coding->annotated = 1; \
1017 } while (0);
1018
1019 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1020 do { \
1021 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1022 *buf++ = method; \
1023 } while (0)
1024
1025
1026 #define ADD_CHARSET_DATA(buf, from, to, id) \
1027 do { \
1028 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1029 *buf++ = id; \
1030 } while (0)
1031
1032 \f
1033 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1034
1035
1036
1037 \f
1038 /*** 3. UTF-8 ***/
1039
1040 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1041 Check if a text is encoded in UTF-8. If it is, return 1, else
1042 return 0. */
1043
1044 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1045 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1046 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1047 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1048 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1049 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1050
1051 static int
1052 detect_coding_utf_8 (coding, detect_info)
1053 struct coding_system *coding;
1054 struct coding_detection_info *detect_info;
1055 {
1056 const unsigned char *src = coding->source, *src_base = src;
1057 const unsigned char *src_end = coding->source + coding->src_bytes;
1058 int multibytep = coding->src_multibyte;
1059 int consumed_chars = 0;
1060 int found = 0;
1061 int incomplete;
1062
1063 detect_info->checked |= CATEGORY_MASK_UTF_8;
1064 /* A coding system of this category is always ASCII compatible. */
1065 src += coding->head_ascii;
1066
1067 while (1)
1068 {
1069 int c, c1, c2, c3, c4;
1070
1071 incomplete = 0;
1072 ONE_MORE_BYTE (c);
1073 if (UTF_8_1_OCTET_P (c))
1074 continue;
1075 incomplete = 1;
1076 ONE_MORE_BYTE (c1);
1077 if (! UTF_8_EXTRA_OCTET_P (c1))
1078 break;
1079 if (UTF_8_2_OCTET_LEADING_P (c))
1080 {
1081 found = CATEGORY_MASK_UTF_8;
1082 continue;
1083 }
1084 ONE_MORE_BYTE (c2);
1085 if (! UTF_8_EXTRA_OCTET_P (c2))
1086 break;
1087 if (UTF_8_3_OCTET_LEADING_P (c))
1088 {
1089 found = CATEGORY_MASK_UTF_8;
1090 continue;
1091 }
1092 ONE_MORE_BYTE (c3);
1093 if (! UTF_8_EXTRA_OCTET_P (c3))
1094 break;
1095 if (UTF_8_4_OCTET_LEADING_P (c))
1096 {
1097 found = CATEGORY_MASK_UTF_8;
1098 continue;
1099 }
1100 ONE_MORE_BYTE (c4);
1101 if (! UTF_8_EXTRA_OCTET_P (c4))
1102 break;
1103 if (UTF_8_5_OCTET_LEADING_P (c))
1104 {
1105 found = CATEGORY_MASK_UTF_8;
1106 continue;
1107 }
1108 break;
1109 }
1110 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1111 return 0;
1112
1113 no_more_source:
1114 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1115 {
1116 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1117 return 0;
1118 }
1119 detect_info->found |= found;
1120 return 1;
1121 }
1122
1123
1124 static void
1125 decode_coding_utf_8 (coding)
1126 struct coding_system *coding;
1127 {
1128 const unsigned char *src = coding->source + coding->consumed;
1129 const unsigned char *src_end = coding->source + coding->src_bytes;
1130 const unsigned char *src_base;
1131 int *charbuf = coding->charbuf;
1132 int *charbuf_end = charbuf + coding->charbuf_size;
1133 int consumed_chars = 0, consumed_chars_base;
1134 int multibytep = coding->src_multibyte;
1135 Lisp_Object attr, eol_type, charset_list;
1136
1137 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1138
1139 while (1)
1140 {
1141 int c, c1, c2, c3, c4, c5;
1142
1143 src_base = src;
1144 consumed_chars_base = consumed_chars;
1145
1146 if (charbuf >= charbuf_end)
1147 break;
1148
1149 ONE_MORE_BYTE (c1);
1150 if (UTF_8_1_OCTET_P(c1))
1151 {
1152 c = c1;
1153 if (c == '\r')
1154 {
1155 if (EQ (eol_type, Qdos))
1156 {
1157 if (src == src_end)
1158 {
1159 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1160 goto no_more_source;
1161 }
1162 if (*src == '\n')
1163 ONE_MORE_BYTE (c);
1164 }
1165 else if (EQ (eol_type, Qmac))
1166 c = '\n';
1167 }
1168 }
1169 else
1170 {
1171 ONE_MORE_BYTE (c2);
1172 if (! UTF_8_EXTRA_OCTET_P (c2))
1173 goto invalid_code;
1174 if (UTF_8_2_OCTET_LEADING_P (c1))
1175 {
1176 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1177 /* Reject overlong sequences here and below. Encoders
1178 producing them are incorrect, they can be misleading,
1179 and they mess up read/write invariance. */
1180 if (c < 128)
1181 goto invalid_code;
1182 }
1183 else
1184 {
1185 ONE_MORE_BYTE (c3);
1186 if (! UTF_8_EXTRA_OCTET_P (c3))
1187 goto invalid_code;
1188 if (UTF_8_3_OCTET_LEADING_P (c1))
1189 {
1190 c = (((c1 & 0xF) << 12)
1191 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1192 if (c < 0x800
1193 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1194 goto invalid_code;
1195 }
1196 else
1197 {
1198 ONE_MORE_BYTE (c4);
1199 if (! UTF_8_EXTRA_OCTET_P (c4))
1200 goto invalid_code;
1201 if (UTF_8_4_OCTET_LEADING_P (c1))
1202 {
1203 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1204 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1205 if (c < 0x10000)
1206 goto invalid_code;
1207 }
1208 else
1209 {
1210 ONE_MORE_BYTE (c5);
1211 if (! UTF_8_EXTRA_OCTET_P (c5))
1212 goto invalid_code;
1213 if (UTF_8_5_OCTET_LEADING_P (c1))
1214 {
1215 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1216 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1217 | (c5 & 0x3F));
1218 if ((c > MAX_CHAR) || (c < 0x200000))
1219 goto invalid_code;
1220 }
1221 else
1222 goto invalid_code;
1223 }
1224 }
1225 }
1226 }
1227
1228 *charbuf++ = c;
1229 continue;
1230
1231 invalid_code:
1232 src = src_base;
1233 consumed_chars = consumed_chars_base;
1234 ONE_MORE_BYTE (c);
1235 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1236 coding->errors++;
1237 }
1238
1239 no_more_source:
1240 coding->consumed_char += consumed_chars_base;
1241 coding->consumed = src_base - coding->source;
1242 coding->charbuf_used = charbuf - coding->charbuf;
1243 }
1244
1245
1246 static int
1247 encode_coding_utf_8 (coding)
1248 struct coding_system *coding;
1249 {
1250 int multibytep = coding->dst_multibyte;
1251 int *charbuf = coding->charbuf;
1252 int *charbuf_end = charbuf + coding->charbuf_used;
1253 unsigned char *dst = coding->destination + coding->produced;
1254 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1255 int produced_chars = 0;
1256 int c;
1257
1258 if (multibytep)
1259 {
1260 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1261
1262 while (charbuf < charbuf_end)
1263 {
1264 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1265
1266 ASSURE_DESTINATION (safe_room);
1267 c = *charbuf++;
1268 if (CHAR_BYTE8_P (c))
1269 {
1270 c = CHAR_TO_BYTE8 (c);
1271 EMIT_ONE_BYTE (c);
1272 }
1273 else
1274 {
1275 CHAR_STRING_ADVANCE (c, pend);
1276 for (p = str; p < pend; p++)
1277 EMIT_ONE_BYTE (*p);
1278 }
1279 }
1280 }
1281 else
1282 {
1283 int safe_room = MAX_MULTIBYTE_LENGTH;
1284
1285 while (charbuf < charbuf_end)
1286 {
1287 ASSURE_DESTINATION (safe_room);
1288 c = *charbuf++;
1289 dst += CHAR_STRING (c, dst);
1290 produced_chars++;
1291 }
1292 }
1293 coding->result = CODING_RESULT_SUCCESS;
1294 coding->produced_char += produced_chars;
1295 coding->produced = dst - coding->destination;
1296 return 0;
1297 }
1298
1299
1300 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1301 Check if a text is encoded in one of UTF-16 based coding systems.
1302 If it is, return 1, else return 0. */
1303
1304 #define UTF_16_HIGH_SURROGATE_P(val) \
1305 (((val) & 0xFC00) == 0xD800)
1306
1307 #define UTF_16_LOW_SURROGATE_P(val) \
1308 (((val) & 0xFC00) == 0xDC00)
1309
1310 #define UTF_16_INVALID_P(val) \
1311 (((val) == 0xFFFE) \
1312 || ((val) == 0xFFFF) \
1313 || UTF_16_LOW_SURROGATE_P (val))
1314
1315
1316 static int
1317 detect_coding_utf_16 (coding, detect_info)
1318 struct coding_system *coding;
1319 struct coding_detection_info *detect_info;
1320 {
1321 const unsigned char *src = coding->source, *src_base = src;
1322 const unsigned char *src_end = coding->source + coding->src_bytes;
1323 int multibytep = coding->src_multibyte;
1324 int consumed_chars = 0;
1325 int c1, c2;
1326
1327 detect_info->checked |= CATEGORY_MASK_UTF_16;
1328
1329 if (coding->mode & CODING_MODE_LAST_BLOCK
1330 && (coding->src_bytes & 1))
1331 {
1332 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1333 return 0;
1334 }
1335 ONE_MORE_BYTE (c1);
1336 ONE_MORE_BYTE (c2);
1337
1338 if ((c1 == 0xFF) && (c2 == 0xFE))
1339 {
1340 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1341 | CATEGORY_MASK_UTF_16_AUTO);
1342 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1343 }
1344 else if ((c1 == 0xFE) && (c2 == 0xFF))
1345 {
1346 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1347 | CATEGORY_MASK_UTF_16_AUTO);
1348 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1349 }
1350 no_more_source:
1351 return 1;
1352 }
1353
1354 static void
1355 decode_coding_utf_16 (coding)
1356 struct coding_system *coding;
1357 {
1358 const unsigned char *src = coding->source + coding->consumed;
1359 const unsigned char *src_end = coding->source + coding->src_bytes;
1360 const unsigned char *src_base;
1361 int *charbuf = coding->charbuf;
1362 int *charbuf_end = charbuf + coding->charbuf_size;
1363 int consumed_chars = 0, consumed_chars_base;
1364 int multibytep = coding->src_multibyte;
1365 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1366 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1367 int surrogate = CODING_UTF_16_SURROGATE (coding);
1368 Lisp_Object attr, eol_type, charset_list;
1369
1370 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1371
1372 if (bom == utf_16_with_bom)
1373 {
1374 int c, c1, c2;
1375
1376 src_base = src;
1377 ONE_MORE_BYTE (c1);
1378 ONE_MORE_BYTE (c2);
1379 c = (c1 << 8) | c2;
1380
1381 if (endian == utf_16_big_endian
1382 ? c != 0xFEFF : c != 0xFFFE)
1383 {
1384 /* The first two bytes are not BOM. Treat them as bytes
1385 for a normal character. */
1386 src = src_base;
1387 coding->errors++;
1388 }
1389 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1390 }
1391 else if (bom == utf_16_detect_bom)
1392 {
1393 /* We have already tried to detect BOM and failed in
1394 detect_coding. */
1395 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1396 }
1397
1398 while (1)
1399 {
1400 int c, c1, c2;
1401
1402 src_base = src;
1403 consumed_chars_base = consumed_chars;
1404
1405 if (charbuf + 2 >= charbuf_end)
1406 break;
1407
1408 ONE_MORE_BYTE (c1);
1409 ONE_MORE_BYTE (c2);
1410 c = (endian == utf_16_big_endian
1411 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1412 if (surrogate)
1413 {
1414 if (! UTF_16_LOW_SURROGATE_P (c))
1415 {
1416 if (endian == utf_16_big_endian)
1417 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1418 else
1419 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1420 *charbuf++ = c1;
1421 *charbuf++ = c2;
1422 coding->errors++;
1423 if (UTF_16_HIGH_SURROGATE_P (c))
1424 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1425 else
1426 *charbuf++ = c;
1427 }
1428 else
1429 {
1430 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1431 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1432 *charbuf++ = c;
1433 }
1434 }
1435 else
1436 {
1437 if (UTF_16_HIGH_SURROGATE_P (c))
1438 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1439 else
1440 *charbuf++ = c;
1441 }
1442 }
1443
1444 no_more_source:
1445 coding->consumed_char += consumed_chars_base;
1446 coding->consumed = src_base - coding->source;
1447 coding->charbuf_used = charbuf - coding->charbuf;
1448 }
1449
1450 static int
1451 encode_coding_utf_16 (coding)
1452 struct coding_system *coding;
1453 {
1454 int multibytep = coding->dst_multibyte;
1455 int *charbuf = coding->charbuf;
1456 int *charbuf_end = charbuf + coding->charbuf_used;
1457 unsigned char *dst = coding->destination + coding->produced;
1458 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1459 int safe_room = 8;
1460 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1461 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1462 int produced_chars = 0;
1463 Lisp_Object attrs, eol_type, charset_list;
1464 int c;
1465
1466 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1467
1468 if (bom != utf_16_without_bom)
1469 {
1470 ASSURE_DESTINATION (safe_room);
1471 if (big_endian)
1472 EMIT_TWO_BYTES (0xFE, 0xFF);
1473 else
1474 EMIT_TWO_BYTES (0xFF, 0xFE);
1475 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1476 }
1477
1478 while (charbuf < charbuf_end)
1479 {
1480 ASSURE_DESTINATION (safe_room);
1481 c = *charbuf++;
1482 if (c >= MAX_UNICODE_CHAR)
1483 c = coding->default_char;
1484
1485 if (c < 0x10000)
1486 {
1487 if (big_endian)
1488 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1489 else
1490 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1491 }
1492 else
1493 {
1494 int c1, c2;
1495
1496 c -= 0x10000;
1497 c1 = (c >> 10) + 0xD800;
1498 c2 = (c & 0x3FF) + 0xDC00;
1499 if (big_endian)
1500 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1501 else
1502 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1503 }
1504 }
1505 coding->result = CODING_RESULT_SUCCESS;
1506 coding->produced = dst - coding->destination;
1507 coding->produced_char += produced_chars;
1508 return 0;
1509 }
1510
1511 \f
1512 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1513
1514 /* Emacs' internal format for representation of multiple character
1515 sets is a kind of multi-byte encoding, i.e. characters are
1516 represented by variable-length sequences of one-byte codes.
1517
1518 ASCII characters and control characters (e.g. `tab', `newline') are
1519 represented by one-byte sequences which are their ASCII codes, in
1520 the range 0x00 through 0x7F.
1521
1522 8-bit characters of the range 0x80..0x9F are represented by
1523 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1524 code + 0x20).
1525
1526 8-bit characters of the range 0xA0..0xFF are represented by
1527 one-byte sequences which are their 8-bit code.
1528
1529 The other characters are represented by a sequence of `base
1530 leading-code', optional `extended leading-code', and one or two
1531 `position-code's. The length of the sequence is determined by the
1532 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1533 whereas extended leading-code and position-code take the range 0xA0
1534 through 0xFF. See `charset.h' for more details about leading-code
1535 and position-code.
1536
1537 --- CODE RANGE of Emacs' internal format ---
1538 character set range
1539 ------------- -----
1540 ascii 0x00..0x7F
1541 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1542 eight-bit-graphic 0xA0..0xBF
1543 ELSE 0x81..0x9D + [0xA0..0xFF]+
1544 ---------------------------------------------
1545
1546 As this is the internal character representation, the format is
1547 usually not used externally (i.e. in a file or in a data sent to a
1548 process). But, it is possible to have a text externally in this
1549 format (i.e. by encoding by the coding system `emacs-mule').
1550
1551 In that case, a sequence of one-byte codes has a slightly different
1552 form.
1553
1554 At first, all characters in eight-bit-control are represented by
1555 one-byte sequences which are their 8-bit code.
1556
1557 Next, character composition data are represented by the byte
1558 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1559 where,
1560 METHOD is 0xF0 plus one of composition method (enum
1561 composition_method),
1562
1563 BYTES is 0xA0 plus a byte length of this composition data,
1564
1565 CHARS is 0x20 plus a number of characters composed by this
1566 data,
1567
1568 COMPONENTs are characters of multibye form or composition
1569 rules encoded by two-byte of ASCII codes.
1570
1571 In addition, for backward compatibility, the following formats are
1572 also recognized as composition data on decoding.
1573
1574 0x80 MSEQ ...
1575 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1576
1577 Here,
1578 MSEQ is a multibyte form but in these special format:
1579 ASCII: 0xA0 ASCII_CODE+0x80,
1580 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1581 RULE is a one byte code of the range 0xA0..0xF0 that
1582 represents a composition rule.
1583 */
1584
1585 char emacs_mule_bytes[256];
1586
1587 int
1588 emacs_mule_char (coding, src, nbytes, nchars, id)
1589 struct coding_system *coding;
1590 unsigned char *src;
1591 int *nbytes, *nchars, *id;
1592 {
1593 const unsigned char *src_end = coding->source + coding->src_bytes;
1594 const unsigned char *src_base = src;
1595 int multibytep = coding->src_multibyte;
1596 struct charset *charset;
1597 unsigned code;
1598 int c;
1599 int consumed_chars = 0;
1600
1601 ONE_MORE_BYTE (c);
1602 switch (emacs_mule_bytes[c])
1603 {
1604 case 2:
1605 if (! (charset = emacs_mule_charset[c]))
1606 goto invalid_code;
1607 ONE_MORE_BYTE (c);
1608 code = c & 0x7F;
1609 break;
1610
1611 case 3:
1612 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1613 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1614 {
1615 ONE_MORE_BYTE (c);
1616 if (! (charset = emacs_mule_charset[c]))
1617 goto invalid_code;
1618 ONE_MORE_BYTE (c);
1619 code = c & 0x7F;
1620 }
1621 else
1622 {
1623 if (! (charset = emacs_mule_charset[c]))
1624 goto invalid_code;
1625 ONE_MORE_BYTE (c);
1626 code = (c & 0x7F) << 8;
1627 ONE_MORE_BYTE (c);
1628 code |= c & 0x7F;
1629 }
1630 break;
1631
1632 case 4:
1633 ONE_MORE_BYTE (c);
1634 if (! (charset = emacs_mule_charset[c]))
1635 goto invalid_code;
1636 ONE_MORE_BYTE (c);
1637 code = (c & 0x7F) << 8;
1638 ONE_MORE_BYTE (c);
1639 code |= c & 0x7F;
1640 break;
1641
1642 case 1:
1643 code = c;
1644 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1645 ? charset_ascii : charset_eight_bit);
1646 break;
1647
1648 default:
1649 abort ();
1650 }
1651 c = DECODE_CHAR (charset, code);
1652 if (c < 0)
1653 goto invalid_code;
1654 *nbytes = src - src_base;
1655 *nchars = consumed_chars;
1656 if (id)
1657 *id = charset->id;
1658 return c;
1659
1660 no_more_source:
1661 return -2;
1662
1663 invalid_code:
1664 return -1;
1665 }
1666
1667
1668 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1669 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1670 else return 0. */
1671
1672 static int
1673 detect_coding_emacs_mule (coding, detect_info)
1674 struct coding_system *coding;
1675 struct coding_detection_info *detect_info;
1676 {
1677 const unsigned char *src = coding->source, *src_base = src;
1678 const unsigned char *src_end = coding->source + coding->src_bytes;
1679 int multibytep = coding->src_multibyte;
1680 int consumed_chars = 0;
1681 int c;
1682 int found = 0;
1683 int incomplete;
1684
1685 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1686 /* A coding system of this category is always ASCII compatible. */
1687 src += coding->head_ascii;
1688
1689 while (1)
1690 {
1691 incomplete = 0;
1692 ONE_MORE_BYTE (c);
1693 incomplete = 1;
1694
1695 if (c == 0x80)
1696 {
1697 /* Perhaps the start of composite character. We simple skip
1698 it because analyzing it is too heavy for detecting. But,
1699 at least, we check that the composite character
1700 constitues of more than 4 bytes. */
1701 const unsigned char *src_base;
1702
1703 repeat:
1704 src_base = src;
1705 do
1706 {
1707 ONE_MORE_BYTE (c);
1708 }
1709 while (c >= 0xA0);
1710
1711 if (src - src_base <= 4)
1712 break;
1713 found = CATEGORY_MASK_EMACS_MULE;
1714 if (c == 0x80)
1715 goto repeat;
1716 }
1717
1718 if (c < 0x80)
1719 {
1720 if (c < 0x20
1721 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1722 break;
1723 }
1724 else
1725 {
1726 const unsigned char *src_base = src - 1;
1727
1728 do
1729 {
1730 ONE_MORE_BYTE (c);
1731 }
1732 while (c >= 0xA0);
1733 if (src - src_base != emacs_mule_bytes[*src_base])
1734 break;
1735 found = CATEGORY_MASK_EMACS_MULE;
1736 }
1737 }
1738 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1739 return 0;
1740
1741 no_more_source:
1742 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1743 {
1744 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1745 return 0;
1746 }
1747 detect_info->found |= found;
1748 return 1;
1749 }
1750
1751
1752 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1753
1754 /* Decode a character represented as a component of composition
1755 sequence of Emacs 20/21 style at SRC. Set C to that character and
1756 update SRC to the head of next character (or an encoded composition
1757 rule). If SRC doesn't points a composition component, set C to -1.
1758 If SRC points an invalid byte sequence, global exit by a return
1759 value 0. */
1760
1761 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1762 if (1) \
1763 { \
1764 int c; \
1765 int nbytes, nchars; \
1766 \
1767 if (src == src_end) \
1768 break; \
1769 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1770 if (c < 0) \
1771 { \
1772 if (c == -2) \
1773 break; \
1774 goto invalid_code; \
1775 } \
1776 *buf++ = c; \
1777 src += nbytes; \
1778 consumed_chars += nchars; \
1779 } \
1780 else
1781
1782
1783 /* Decode a composition rule represented as a component of composition
1784 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1785 and increment BUF. If SRC points an invalid byte sequence, set C
1786 to -1. */
1787
1788 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1789 do { \
1790 int c, gref, nref; \
1791 \
1792 if (src >= src_end) \
1793 goto invalid_code; \
1794 ONE_MORE_BYTE_NO_CHECK (c); \
1795 c -= 0x20; \
1796 if (c < 0 || c >= 81) \
1797 goto invalid_code; \
1798 \
1799 gref = c / 9, nref = c % 9; \
1800 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1801 } while (0)
1802
1803
1804 /* Decode a composition rule represented as a component of composition
1805 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1806 and increment BUF. If SRC points an invalid byte sequence, set C
1807 to -1. */
1808
1809 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1810 do { \
1811 int gref, nref; \
1812 \
1813 if (src + 1>= src_end) \
1814 goto invalid_code; \
1815 ONE_MORE_BYTE_NO_CHECK (gref); \
1816 gref -= 0x20; \
1817 ONE_MORE_BYTE_NO_CHECK (nref); \
1818 nref -= 0x20; \
1819 if (gref < 0 || gref >= 81 \
1820 || nref < 0 || nref >= 81) \
1821 goto invalid_code; \
1822 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1823 } while (0)
1824
1825
1826 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1827 do { \
1828 /* Emacs 21 style format. The first three bytes at SRC are \
1829 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1830 the byte length of this composition information, CHARS is the \
1831 number of characters composed by this composition. */ \
1832 enum composition_method method = c - 0xF2; \
1833 int *charbuf_base = charbuf; \
1834 int from, to; \
1835 int consumed_chars_limit; \
1836 int nbytes, nchars; \
1837 \
1838 ONE_MORE_BYTE (c); \
1839 nbytes = c - 0xA0; \
1840 if (nbytes < 3) \
1841 goto invalid_code; \
1842 ONE_MORE_BYTE (c); \
1843 nchars = c - 0xA0; \
1844 from = coding->produced + char_offset; \
1845 to = from + nchars; \
1846 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1847 consumed_chars_limit = consumed_chars_base + nbytes; \
1848 if (method != COMPOSITION_RELATIVE) \
1849 { \
1850 int i = 0; \
1851 while (consumed_chars < consumed_chars_limit) \
1852 { \
1853 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1854 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1855 else \
1856 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1857 i++; \
1858 } \
1859 if (consumed_chars < consumed_chars_limit) \
1860 goto invalid_code; \
1861 charbuf_base[0] -= i; \
1862 } \
1863 } while (0)
1864
1865
1866 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1867 do { \
1868 /* Emacs 20 style format for relative composition. */ \
1869 /* Store multibyte form of characters to be composed. */ \
1870 enum composition_method method = COMPOSITION_RELATIVE; \
1871 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1872 int *buf = components; \
1873 int i, j; \
1874 int from, to; \
1875 \
1876 src = src_base; \
1877 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1878 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1879 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1880 if (i < 2) \
1881 goto invalid_code; \
1882 from = coding->produced_char + char_offset; \
1883 to = from + i; \
1884 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1885 for (j = 0; j < i; j++) \
1886 *charbuf++ = components[j]; \
1887 } while (0)
1888
1889
1890 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1891 do { \
1892 /* Emacs 20 style format for rule-base composition. */ \
1893 /* Store multibyte form of characters to be composed. */ \
1894 enum composition_method method = COMPOSITION_WITH_RULE; \
1895 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1896 int *buf = components; \
1897 int i, j; \
1898 int from, to; \
1899 \
1900 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1901 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1902 { \
1903 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1904 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1905 } \
1906 if (i < 1 || (buf - components) % 2 == 0) \
1907 goto invalid_code; \
1908 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1909 goto no_more_source; \
1910 from = coding->produced_char + char_offset; \
1911 to = from + i; \
1912 ADD_COMPOSITION_DATA (buf, from, to, method); \
1913 for (j = 0; j < i; j++) \
1914 *charbuf++ = components[j]; \
1915 for (j = 0; j < i; j += 2) \
1916 *charbuf++ = components[j]; \
1917 } while (0)
1918
1919
1920 static void
1921 decode_coding_emacs_mule (coding)
1922 struct coding_system *coding;
1923 {
1924 const unsigned char *src = coding->source + coding->consumed;
1925 const unsigned char *src_end = coding->source + coding->src_bytes;
1926 const unsigned char *src_base;
1927 int *charbuf = coding->charbuf;
1928 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1929 int consumed_chars = 0, consumed_chars_base;
1930 int multibytep = coding->src_multibyte;
1931 Lisp_Object attrs, eol_type, charset_list;
1932 int char_offset = coding->produced_char;
1933 int last_offset = char_offset;
1934 int last_id = charset_ascii;
1935
1936 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1937
1938 while (1)
1939 {
1940 int c;
1941
1942 src_base = src;
1943 consumed_chars_base = consumed_chars;
1944
1945 if (charbuf >= charbuf_end)
1946 break;
1947
1948 ONE_MORE_BYTE (c);
1949
1950 if (c < 0x80)
1951 {
1952 if (c == '\r')
1953 {
1954 if (EQ (eol_type, Qdos))
1955 {
1956 if (src == src_end)
1957 {
1958 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1959 goto no_more_source;
1960 }
1961 if (*src == '\n')
1962 ONE_MORE_BYTE (c);
1963 }
1964 else if (EQ (eol_type, Qmac))
1965 c = '\n';
1966 }
1967 *charbuf++ = c;
1968 char_offset++;
1969 }
1970 else if (c == 0x80)
1971 {
1972 ONE_MORE_BYTE (c);
1973 if (c - 0xF2 >= COMPOSITION_RELATIVE
1974 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1975 DECODE_EMACS_MULE_21_COMPOSITION (c);
1976 else if (c < 0xC0)
1977 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1978 else if (c == 0xFF)
1979 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1980 else
1981 goto invalid_code;
1982 }
1983 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1984 {
1985 int nbytes, nchars;
1986 int id;
1987
1988 src = src_base;
1989 consumed_chars = consumed_chars_base;
1990 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1991 if (c < 0)
1992 {
1993 if (c == -2)
1994 break;
1995 goto invalid_code;
1996 }
1997 if (last_id != id)
1998 {
1999 if (last_id != charset_ascii)
2000 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2001 last_id = id;
2002 last_offset = char_offset;
2003 }
2004 *charbuf++ = c;
2005 src += nbytes;
2006 consumed_chars += nchars;
2007 char_offset++;
2008 }
2009 continue;
2010
2011 invalid_code:
2012 src = src_base;
2013 consumed_chars = consumed_chars_base;
2014 ONE_MORE_BYTE (c);
2015 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2016 char_offset++;
2017 coding->errors++;
2018 }
2019
2020 no_more_source:
2021 if (last_id != charset_ascii)
2022 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2023 coding->consumed_char += consumed_chars_base;
2024 coding->consumed = src_base - coding->source;
2025 coding->charbuf_used = charbuf - coding->charbuf;
2026 }
2027
2028
2029 #define EMACS_MULE_LEADING_CODES(id, codes) \
2030 do { \
2031 if (id < 0xA0) \
2032 codes[0] = id, codes[1] = 0; \
2033 else if (id < 0xE0) \
2034 codes[0] = 0x9A, codes[1] = id; \
2035 else if (id < 0xF0) \
2036 codes[0] = 0x9B, codes[1] = id; \
2037 else if (id < 0xF5) \
2038 codes[0] = 0x9C, codes[1] = id; \
2039 else \
2040 codes[0] = 0x9D, codes[1] = id; \
2041 } while (0);
2042
2043
2044 static int
2045 encode_coding_emacs_mule (coding)
2046 struct coding_system *coding;
2047 {
2048 int multibytep = coding->dst_multibyte;
2049 int *charbuf = coding->charbuf;
2050 int *charbuf_end = charbuf + coding->charbuf_used;
2051 unsigned char *dst = coding->destination + coding->produced;
2052 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2053 int safe_room = 8;
2054 int produced_chars = 0;
2055 Lisp_Object attrs, eol_type, charset_list;
2056 int c;
2057 int preferred_charset_id = -1;
2058
2059 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2060
2061 while (charbuf < charbuf_end)
2062 {
2063 ASSURE_DESTINATION (safe_room);
2064 c = *charbuf++;
2065
2066 if (c < 0)
2067 {
2068 /* Handle an annotation. */
2069 switch (*charbuf)
2070 {
2071 case CODING_ANNOTATE_COMPOSITION_MASK:
2072 /* Not yet implemented. */
2073 break;
2074 case CODING_ANNOTATE_CHARSET_MASK:
2075 preferred_charset_id = charbuf[3];
2076 if (preferred_charset_id >= 0
2077 && NILP (Fmemq (make_number (preferred_charset_id),
2078 charset_list)))
2079 preferred_charset_id = -1;
2080 break;
2081 default:
2082 abort ();
2083 }
2084 charbuf += -c - 1;
2085 continue;
2086 }
2087
2088 if (ASCII_CHAR_P (c))
2089 EMIT_ONE_ASCII_BYTE (c);
2090 else if (CHAR_BYTE8_P (c))
2091 {
2092 c = CHAR_TO_BYTE8 (c);
2093 EMIT_ONE_BYTE (c);
2094 }
2095 else
2096 {
2097 struct charset *charset;
2098 unsigned code;
2099 int dimension;
2100 int emacs_mule_id;
2101 unsigned char leading_codes[2];
2102
2103 if (preferred_charset_id >= 0)
2104 {
2105 charset = CHARSET_FROM_ID (preferred_charset_id);
2106 if (! CHAR_CHARSET_P (c, charset))
2107 charset = char_charset (c, charset_list, NULL);
2108 }
2109 else
2110 charset = char_charset (c, charset_list, &code);
2111 if (! charset)
2112 {
2113 c = coding->default_char;
2114 if (ASCII_CHAR_P (c))
2115 {
2116 EMIT_ONE_ASCII_BYTE (c);
2117 continue;
2118 }
2119 charset = char_charset (c, charset_list, &code);
2120 }
2121 dimension = CHARSET_DIMENSION (charset);
2122 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2123 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2124 EMIT_ONE_BYTE (leading_codes[0]);
2125 if (leading_codes[1])
2126 EMIT_ONE_BYTE (leading_codes[1]);
2127 if (dimension == 1)
2128 EMIT_ONE_BYTE (code);
2129 else
2130 {
2131 EMIT_ONE_BYTE (code >> 8);
2132 EMIT_ONE_BYTE (code & 0xFF);
2133 }
2134 }
2135 }
2136 coding->result = CODING_RESULT_SUCCESS;
2137 coding->produced_char += produced_chars;
2138 coding->produced = dst - coding->destination;
2139 return 0;
2140 }
2141
2142 \f
2143 /*** 7. ISO2022 handlers ***/
2144
2145 /* The following note describes the coding system ISO2022 briefly.
2146 Since the intention of this note is to help understand the
2147 functions in this file, some parts are NOT ACCURATE or are OVERLY
2148 SIMPLIFIED. For thorough understanding, please refer to the
2149 original document of ISO2022. This is equivalent to the standard
2150 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2151
2152 ISO2022 provides many mechanisms to encode several character sets
2153 in 7-bit and 8-bit environments. For 7-bit environments, all text
2154 is encoded using bytes less than 128. This may make the encoded
2155 text a little bit longer, but the text passes more easily through
2156 several types of gateway, some of which strip off the MSB (Most
2157 Significant Bit).
2158
2159 There are two kinds of character sets: control character sets and
2160 graphic character sets. The former contain control characters such
2161 as `newline' and `escape' to provide control functions (control
2162 functions are also provided by escape sequences). The latter
2163 contain graphic characters such as 'A' and '-'. Emacs recognizes
2164 two control character sets and many graphic character sets.
2165
2166 Graphic character sets are classified into one of the following
2167 four classes, according to the number of bytes (DIMENSION) and
2168 number of characters in one dimension (CHARS) of the set:
2169 - DIMENSION1_CHARS94
2170 - DIMENSION1_CHARS96
2171 - DIMENSION2_CHARS94
2172 - DIMENSION2_CHARS96
2173
2174 In addition, each character set is assigned an identification tag,
2175 unique for each set, called the "final character" (denoted as <F>
2176 hereafter). The <F> of each character set is decided by ECMA(*)
2177 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2178 (0x30..0x3F are for private use only).
2179
2180 Note (*): ECMA = European Computer Manufacturers Association
2181
2182 Here are examples of graphic character sets [NAME(<F>)]:
2183 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2184 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2185 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2186 o DIMENSION2_CHARS96 -- none for the moment
2187
2188 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2189 C0 [0x00..0x1F] -- control character plane 0
2190 GL [0x20..0x7F] -- graphic character plane 0
2191 C1 [0x80..0x9F] -- control character plane 1
2192 GR [0xA0..0xFF] -- graphic character plane 1
2193
2194 A control character set is directly designated and invoked to C0 or
2195 C1 by an escape sequence. The most common case is that:
2196 - ISO646's control character set is designated/invoked to C0, and
2197 - ISO6429's control character set is designated/invoked to C1,
2198 and usually these designations/invocations are omitted in encoded
2199 text. In a 7-bit environment, only C0 can be used, and a control
2200 character for C1 is encoded by an appropriate escape sequence to
2201 fit into the environment. All control characters for C1 are
2202 defined to have corresponding escape sequences.
2203
2204 A graphic character set is at first designated to one of four
2205 graphic registers (G0 through G3), then these graphic registers are
2206 invoked to GL or GR. These designations and invocations can be
2207 done independently. The most common case is that G0 is invoked to
2208 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2209 these invocations and designations are omitted in encoded text.
2210 In a 7-bit environment, only GL can be used.
2211
2212 When a graphic character set of CHARS94 is invoked to GL, codes
2213 0x20 and 0x7F of the GL area work as control characters SPACE and
2214 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2215 be used.
2216
2217 There are two ways of invocation: locking-shift and single-shift.
2218 With locking-shift, the invocation lasts until the next different
2219 invocation, whereas with single-shift, the invocation affects the
2220 following character only and doesn't affect the locking-shift
2221 state. Invocations are done by the following control characters or
2222 escape sequences:
2223
2224 ----------------------------------------------------------------------
2225 abbrev function cntrl escape seq description
2226 ----------------------------------------------------------------------
2227 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2228 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2229 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2230 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2231 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2232 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2233 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2234 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2235 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2236 ----------------------------------------------------------------------
2237 (*) These are not used by any known coding system.
2238
2239 Control characters for these functions are defined by macros
2240 ISO_CODE_XXX in `coding.h'.
2241
2242 Designations are done by the following escape sequences:
2243 ----------------------------------------------------------------------
2244 escape sequence description
2245 ----------------------------------------------------------------------
2246 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2247 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2248 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2249 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2250 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2251 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2252 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2253 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2254 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2255 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2256 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2257 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2258 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2259 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2260 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2261 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2262 ----------------------------------------------------------------------
2263
2264 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2265 of dimension 1, chars 94, and final character <F>, etc...
2266
2267 Note (*): Although these designations are not allowed in ISO2022,
2268 Emacs accepts them on decoding, and produces them on encoding
2269 CHARS96 character sets in a coding system which is characterized as
2270 7-bit environment, non-locking-shift, and non-single-shift.
2271
2272 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2273 '(' must be omitted. We refer to this as "short-form" hereafter.
2274
2275 Now you may notice that there are a lot of ways of encoding the
2276 same multilingual text in ISO2022. Actually, there exist many
2277 coding systems such as Compound Text (used in X11's inter client
2278 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2279 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2280 localized platforms), and all of these are variants of ISO2022.
2281
2282 In addition to the above, Emacs handles two more kinds of escape
2283 sequences: ISO6429's direction specification and Emacs' private
2284 sequence for specifying character composition.
2285
2286 ISO6429's direction specification takes the following form:
2287 o CSI ']' -- end of the current direction
2288 o CSI '0' ']' -- end of the current direction
2289 o CSI '1' ']' -- start of left-to-right text
2290 o CSI '2' ']' -- start of right-to-left text
2291 The control character CSI (0x9B: control sequence introducer) is
2292 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2293
2294 Character composition specification takes the following form:
2295 o ESC '0' -- start relative composition
2296 o ESC '1' -- end composition
2297 o ESC '2' -- start rule-base composition (*)
2298 o ESC '3' -- start relative composition with alternate chars (**)
2299 o ESC '4' -- start rule-base composition with alternate chars (**)
2300 Since these are not standard escape sequences of any ISO standard,
2301 the use of them with these meanings is restricted to Emacs only.
2302
2303 (*) This form is used only in Emacs 20.7 and older versions,
2304 but newer versions can safely decode it.
2305 (**) This form is used only in Emacs 21.1 and newer versions,
2306 and older versions can't decode it.
2307
2308 Here's a list of example usages of these composition escape
2309 sequences (categorized by `enum composition_method').
2310
2311 COMPOSITION_RELATIVE:
2312 ESC 0 CHAR [ CHAR ] ESC 1
2313 COMPOSITION_WITH_RULE:
2314 ESC 2 CHAR [ RULE CHAR ] ESC 1
2315 COMPOSITION_WITH_ALTCHARS:
2316 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2317 COMPOSITION_WITH_RULE_ALTCHARS:
2318 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2319
2320 enum iso_code_class_type iso_code_class[256];
2321
2322 #define SAFE_CHARSET_P(coding, id) \
2323 ((id) <= (coding)->max_charset_id \
2324 && (coding)->safe_charsets[id] >= 0)
2325
2326
2327 #define SHIFT_OUT_OK(category) \
2328 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2329
2330 static void
2331 setup_iso_safe_charsets (attrs)
2332 Lisp_Object attrs;
2333 {
2334 Lisp_Object charset_list, safe_charsets;
2335 Lisp_Object request;
2336 Lisp_Object reg_usage;
2337 Lisp_Object tail;
2338 int reg94, reg96;
2339 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2340 int max_charset_id;
2341
2342 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2343 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2344 && ! EQ (charset_list, Viso_2022_charset_list))
2345 {
2346 CODING_ATTR_CHARSET_LIST (attrs)
2347 = charset_list = Viso_2022_charset_list;
2348 ASET (attrs, coding_attr_safe_charsets, Qnil);
2349 }
2350
2351 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2352 return;
2353
2354 max_charset_id = 0;
2355 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2356 {
2357 int id = XINT (XCAR (tail));
2358 if (max_charset_id < id)
2359 max_charset_id = id;
2360 }
2361
2362 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2363 make_number (255));
2364 request = AREF (attrs, coding_attr_iso_request);
2365 reg_usage = AREF (attrs, coding_attr_iso_usage);
2366 reg94 = XINT (XCAR (reg_usage));
2367 reg96 = XINT (XCDR (reg_usage));
2368
2369 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2370 {
2371 Lisp_Object id;
2372 Lisp_Object reg;
2373 struct charset *charset;
2374
2375 id = XCAR (tail);
2376 charset = CHARSET_FROM_ID (XINT (id));
2377 reg = Fcdr (Fassq (id, request));
2378 if (! NILP (reg))
2379 SSET (safe_charsets, XINT (id), XINT (reg));
2380 else if (charset->iso_chars_96)
2381 {
2382 if (reg96 < 4)
2383 SSET (safe_charsets, XINT (id), reg96);
2384 }
2385 else
2386 {
2387 if (reg94 < 4)
2388 SSET (safe_charsets, XINT (id), reg94);
2389 }
2390 }
2391 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2392 }
2393
2394
2395 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2396 Check if a text is encoded in one of ISO-2022 based codig systems.
2397 If it is, return 1, else return 0. */
2398
2399 static int
2400 detect_coding_iso_2022 (coding, detect_info)
2401 struct coding_system *coding;
2402 struct coding_detection_info *detect_info;
2403 {
2404 const unsigned char *src = coding->source, *src_base = src;
2405 const unsigned char *src_end = coding->source + coding->src_bytes;
2406 int multibytep = coding->src_multibyte;
2407 int single_shifting = 0;
2408 int id;
2409 int c, c1;
2410 int consumed_chars = 0;
2411 int i;
2412 int rejected = 0;
2413 int found = 0;
2414
2415 detect_info->checked |= CATEGORY_MASK_ISO;
2416
2417 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2418 {
2419 struct coding_system *this = &(coding_categories[i]);
2420 Lisp_Object attrs, val;
2421
2422 attrs = CODING_ID_ATTRS (this->id);
2423 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2424 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2425 setup_iso_safe_charsets (attrs);
2426 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2427 this->max_charset_id = SCHARS (val) - 1;
2428 this->safe_charsets = (char *) SDATA (val);
2429 }
2430
2431 /* A coding system of this category is always ASCII compatible. */
2432 src += coding->head_ascii;
2433
2434 while (rejected != CATEGORY_MASK_ISO)
2435 {
2436 ONE_MORE_BYTE (c);
2437 switch (c)
2438 {
2439 case ISO_CODE_ESC:
2440 if (inhibit_iso_escape_detection)
2441 break;
2442 single_shifting = 0;
2443 ONE_MORE_BYTE (c);
2444 if (c >= '(' && c <= '/')
2445 {
2446 /* Designation sequence for a charset of dimension 1. */
2447 ONE_MORE_BYTE (c1);
2448 if (c1 < ' ' || c1 >= 0x80
2449 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2450 /* Invalid designation sequence. Just ignore. */
2451 break;
2452 }
2453 else if (c == '$')
2454 {
2455 /* Designation sequence for a charset of dimension 2. */
2456 ONE_MORE_BYTE (c);
2457 if (c >= '@' && c <= 'B')
2458 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2459 id = iso_charset_table[1][0][c];
2460 else if (c >= '(' && c <= '/')
2461 {
2462 ONE_MORE_BYTE (c1);
2463 if (c1 < ' ' || c1 >= 0x80
2464 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2465 /* Invalid designation sequence. Just ignore. */
2466 break;
2467 }
2468 else
2469 /* Invalid designation sequence. Just ignore it. */
2470 break;
2471 }
2472 else if (c == 'N' || c == 'O')
2473 {
2474 /* ESC <Fe> for SS2 or SS3. */
2475 single_shifting = 1;
2476 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2477 break;
2478 }
2479 else if (c >= '0' && c <= '4')
2480 {
2481 /* ESC <Fp> for start/end composition. */
2482 found |= CATEGORY_MASK_ISO;
2483 break;
2484 }
2485 else
2486 {
2487 /* Invalid escape sequence. Just ignore it. */
2488 break;
2489 }
2490
2491 /* We found a valid designation sequence for CHARSET. */
2492 rejected |= CATEGORY_MASK_ISO_8BIT;
2493 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2494 id))
2495 found |= CATEGORY_MASK_ISO_7;
2496 else
2497 rejected |= CATEGORY_MASK_ISO_7;
2498 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2499 id))
2500 found |= CATEGORY_MASK_ISO_7_TIGHT;
2501 else
2502 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2503 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2504 id))
2505 found |= CATEGORY_MASK_ISO_7_ELSE;
2506 else
2507 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2508 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2509 id))
2510 found |= CATEGORY_MASK_ISO_8_ELSE;
2511 else
2512 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2513 break;
2514
2515 case ISO_CODE_SO:
2516 case ISO_CODE_SI:
2517 /* Locking shift out/in. */
2518 if (inhibit_iso_escape_detection)
2519 break;
2520 single_shifting = 0;
2521 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2522 found |= CATEGORY_MASK_ISO_ELSE;
2523 break;
2524
2525 case ISO_CODE_CSI:
2526 /* Control sequence introducer. */
2527 single_shifting = 0;
2528 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2529 found |= CATEGORY_MASK_ISO_8_ELSE;
2530 goto check_extra_latin;
2531
2532
2533 case ISO_CODE_SS2:
2534 case ISO_CODE_SS3:
2535 /* Single shift. */
2536 if (inhibit_iso_escape_detection)
2537 break;
2538 single_shifting = 1;
2539 rejected |= CATEGORY_MASK_ISO_7BIT;
2540 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2541 & CODING_ISO_FLAG_SINGLE_SHIFT)
2542 found |= CATEGORY_MASK_ISO_8_1;
2543 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2544 & CODING_ISO_FLAG_SINGLE_SHIFT)
2545 found |= CATEGORY_MASK_ISO_8_2;
2546 goto check_extra_latin;
2547
2548 default:
2549 if (c < 0x80)
2550 {
2551 single_shifting = 0;
2552 break;
2553 }
2554 if (c >= 0xA0)
2555 {
2556 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2557 found |= CATEGORY_MASK_ISO_8_1;
2558 /* Check the length of succeeding codes of the range
2559 0xA0..0FF. If the byte length is even, we include
2560 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2561 only when we are not single shifting. */
2562 if (! single_shifting
2563 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2564 {
2565 int i = 1;
2566 while (src < src_end)
2567 {
2568 ONE_MORE_BYTE (c);
2569 if (c < 0xA0)
2570 break;
2571 i++;
2572 }
2573
2574 if (i & 1 && src < src_end)
2575 rejected |= CATEGORY_MASK_ISO_8_2;
2576 else
2577 found |= CATEGORY_MASK_ISO_8_2;
2578 }
2579 break;
2580 }
2581 check_extra_latin:
2582 single_shifting = 0;
2583 if (! VECTORP (Vlatin_extra_code_table)
2584 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2585 {
2586 rejected = CATEGORY_MASK_ISO;
2587 break;
2588 }
2589 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2590 & CODING_ISO_FLAG_LATIN_EXTRA)
2591 found |= CATEGORY_MASK_ISO_8_1;
2592 else
2593 rejected |= CATEGORY_MASK_ISO_8_1;
2594 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2595 & CODING_ISO_FLAG_LATIN_EXTRA)
2596 found |= CATEGORY_MASK_ISO_8_2;
2597 else
2598 rejected |= CATEGORY_MASK_ISO_8_2;
2599 }
2600 }
2601 detect_info->rejected |= CATEGORY_MASK_ISO;
2602 return 0;
2603
2604 no_more_source:
2605 detect_info->rejected |= rejected;
2606 detect_info->found |= (found & ~rejected);
2607 return 1;
2608 }
2609
2610
2611 /* Set designation state into CODING. */
2612 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2613 do { \
2614 int id, prev; \
2615 \
2616 if (final < '0' || final >= 128 \
2617 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2618 || !SAFE_CHARSET_P (coding, id)) \
2619 { \
2620 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2621 goto invalid_code; \
2622 } \
2623 prev = CODING_ISO_DESIGNATION (coding, reg); \
2624 if (id == charset_jisx0201_roman) \
2625 { \
2626 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2627 id = charset_ascii; \
2628 } \
2629 else if (id == charset_jisx0208_1978) \
2630 { \
2631 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2632 id = charset_jisx0208; \
2633 } \
2634 CODING_ISO_DESIGNATION (coding, reg) = id; \
2635 /* If there was an invalid designation to REG previously, and this \
2636 designation is ASCII to REG, we should keep this designation \
2637 sequence. */ \
2638 if (prev == -2 && id == charset_ascii) \
2639 goto invalid_code; \
2640 } while (0)
2641
2642
2643 #define MAYBE_FINISH_COMPOSITION() \
2644 do { \
2645 int i; \
2646 if (composition_state == COMPOSING_NO) \
2647 break; \
2648 /* It is assured that we have enough room for producing \
2649 characters stored in the table `components'. */ \
2650 if (charbuf + component_idx > charbuf_end) \
2651 goto no_more_source; \
2652 composition_state = COMPOSING_NO; \
2653 if (method == COMPOSITION_RELATIVE \
2654 || method == COMPOSITION_WITH_ALTCHARS) \
2655 { \
2656 for (i = 0; i < component_idx; i++) \
2657 *charbuf++ = components[i]; \
2658 char_offset += component_idx; \
2659 } \
2660 else \
2661 { \
2662 for (i = 0; i < component_idx; i += 2) \
2663 *charbuf++ = components[i]; \
2664 char_offset += (component_idx / 2) + 1; \
2665 } \
2666 } while (0)
2667
2668
2669 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2670 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2671 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2672 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2673 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2674 */
2675
2676 #define DECODE_COMPOSITION_START(c1) \
2677 do { \
2678 if (c1 == '0' \
2679 && composition_state == COMPOSING_COMPONENT_RULE) \
2680 { \
2681 component_len = component_idx; \
2682 composition_state = COMPOSING_CHAR; \
2683 } \
2684 else \
2685 { \
2686 const unsigned char *p; \
2687 \
2688 MAYBE_FINISH_COMPOSITION (); \
2689 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2690 goto no_more_source; \
2691 for (p = src; p < src_end - 1; p++) \
2692 if (*p == ISO_CODE_ESC && p[1] == '1') \
2693 break; \
2694 if (p == src_end - 1) \
2695 { \
2696 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2697 goto invalid_code; \
2698 goto no_more_source; \
2699 } \
2700 \
2701 /* This is surely the start of a composition. */ \
2702 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2703 : c1 == '2' ? COMPOSITION_WITH_RULE \
2704 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2705 : COMPOSITION_WITH_RULE_ALTCHARS); \
2706 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2707 : COMPOSING_COMPONENT_CHAR); \
2708 component_idx = component_len = 0; \
2709 } \
2710 } while (0)
2711
2712
2713 /* Handle compositoin end sequence ESC 1. */
2714
2715 #define DECODE_COMPOSITION_END() \
2716 do { \
2717 int nchars = (component_len > 0 ? component_idx - component_len \
2718 : method == COMPOSITION_RELATIVE ? component_idx \
2719 : (component_idx + 1) / 2); \
2720 int i; \
2721 int *saved_charbuf = charbuf; \
2722 int from = char_offset; \
2723 int to = from + nchars; \
2724 \
2725 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2726 if (method != COMPOSITION_RELATIVE) \
2727 { \
2728 if (component_len == 0) \
2729 for (i = 0; i < component_idx; i++) \
2730 *charbuf++ = components[i]; \
2731 else \
2732 for (i = 0; i < component_len; i++) \
2733 *charbuf++ = components[i]; \
2734 *saved_charbuf = saved_charbuf - charbuf; \
2735 } \
2736 if (method == COMPOSITION_WITH_RULE) \
2737 for (i = 0; i < component_idx; i += 2, char_offset++) \
2738 *charbuf++ = components[i]; \
2739 else \
2740 for (i = component_len; i < component_idx; i++, char_offset++) \
2741 *charbuf++ = components[i]; \
2742 coding->annotated = 1; \
2743 composition_state = COMPOSING_NO; \
2744 } while (0)
2745
2746
2747 /* Decode a composition rule from the byte C1 (and maybe one more byte
2748 from SRC) and store one encoded composition rule in
2749 coding->cmp_data. */
2750
2751 #define DECODE_COMPOSITION_RULE(c1) \
2752 do { \
2753 (c1) -= 32; \
2754 if (c1 < 81) /* old format (before ver.21) */ \
2755 { \
2756 int gref = (c1) / 9; \
2757 int nref = (c1) % 9; \
2758 if (gref == 4) gref = 10; \
2759 if (nref == 4) nref = 10; \
2760 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2761 } \
2762 else if (c1 < 93) /* new format (after ver.21) */ \
2763 { \
2764 ONE_MORE_BYTE (c2); \
2765 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2766 } \
2767 else \
2768 c1 = 0; \
2769 } while (0)
2770
2771
2772 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2773
2774 static void
2775 decode_coding_iso_2022 (coding)
2776 struct coding_system *coding;
2777 {
2778 const unsigned char *src = coding->source + coding->consumed;
2779 const unsigned char *src_end = coding->source + coding->src_bytes;
2780 const unsigned char *src_base;
2781 int *charbuf = coding->charbuf;
2782 int *charbuf_end
2783 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2784 int consumed_chars = 0, consumed_chars_base;
2785 int multibytep = coding->src_multibyte;
2786 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2787 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2788 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2789 struct charset *charset;
2790 int c;
2791 /* For handling composition sequence. */
2792 #define COMPOSING_NO 0
2793 #define COMPOSING_CHAR 1
2794 #define COMPOSING_RULE 2
2795 #define COMPOSING_COMPONENT_CHAR 3
2796 #define COMPOSING_COMPONENT_RULE 4
2797
2798 int composition_state = COMPOSING_NO;
2799 enum composition_method method;
2800 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2801 int component_idx;
2802 int component_len;
2803 Lisp_Object attrs, eol_type, charset_list;
2804 int char_offset = coding->produced_char;
2805 int last_offset = char_offset;
2806 int last_id = charset_ascii;
2807
2808 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2809 setup_iso_safe_charsets (attrs);
2810
2811 while (1)
2812 {
2813 int c1, c2;
2814
2815 src_base = src;
2816 consumed_chars_base = consumed_chars;
2817
2818 if (charbuf >= charbuf_end)
2819 break;
2820
2821 ONE_MORE_BYTE (c1);
2822
2823 /* We produce at most one character. */
2824 switch (iso_code_class [c1])
2825 {
2826 case ISO_0x20_or_0x7F:
2827 if (composition_state != COMPOSING_NO)
2828 {
2829 if (composition_state == COMPOSING_RULE
2830 || composition_state == COMPOSING_COMPONENT_RULE)
2831 {
2832 DECODE_COMPOSITION_RULE (c1);
2833 components[component_idx++] = c1;
2834 composition_state--;
2835 continue;
2836 }
2837 }
2838 if (charset_id_0 < 0
2839 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2840 /* This is SPACE or DEL. */
2841 charset = CHARSET_FROM_ID (charset_ascii);
2842 else
2843 charset = CHARSET_FROM_ID (charset_id_0);
2844 break;
2845
2846 case ISO_graphic_plane_0:
2847 if (composition_state != COMPOSING_NO)
2848 {
2849 if (composition_state == COMPOSING_RULE
2850 || composition_state == COMPOSING_COMPONENT_RULE)
2851 {
2852 DECODE_COMPOSITION_RULE (c1);
2853 components[component_idx++] = c1;
2854 composition_state--;
2855 continue;
2856 }
2857 }
2858 charset = CHARSET_FROM_ID (charset_id_0);
2859 break;
2860
2861 case ISO_0xA0_or_0xFF:
2862 if (charset_id_1 < 0
2863 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2864 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2865 goto invalid_code;
2866 /* This is a graphic character, we fall down ... */
2867
2868 case ISO_graphic_plane_1:
2869 if (charset_id_1 < 0)
2870 goto invalid_code;
2871 charset = CHARSET_FROM_ID (charset_id_1);
2872 break;
2873
2874 case ISO_carriage_return:
2875 if (c1 == '\r')
2876 {
2877 if (EQ (eol_type, Qdos))
2878 {
2879 if (src == src_end)
2880 {
2881 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2882 goto no_more_source;
2883 }
2884 if (*src == '\n')
2885 ONE_MORE_BYTE (c1);
2886 }
2887 else if (EQ (eol_type, Qmac))
2888 c1 = '\n';
2889 }
2890 /* fall through */
2891
2892 case ISO_control_0:
2893 MAYBE_FINISH_COMPOSITION ();
2894 charset = CHARSET_FROM_ID (charset_ascii);
2895 break;
2896
2897 case ISO_control_1:
2898 MAYBE_FINISH_COMPOSITION ();
2899 goto invalid_code;
2900
2901 case ISO_shift_out:
2902 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2903 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2904 goto invalid_code;
2905 CODING_ISO_INVOCATION (coding, 0) = 1;
2906 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2907 continue;
2908
2909 case ISO_shift_in:
2910 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2911 goto invalid_code;
2912 CODING_ISO_INVOCATION (coding, 0) = 0;
2913 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2914 continue;
2915
2916 case ISO_single_shift_2_7:
2917 case ISO_single_shift_2:
2918 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2919 goto invalid_code;
2920 /* SS2 is handled as an escape sequence of ESC 'N' */
2921 c1 = 'N';
2922 goto label_escape_sequence;
2923
2924 case ISO_single_shift_3:
2925 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2926 goto invalid_code;
2927 /* SS2 is handled as an escape sequence of ESC 'O' */
2928 c1 = 'O';
2929 goto label_escape_sequence;
2930
2931 case ISO_control_sequence_introducer:
2932 /* CSI is handled as an escape sequence of ESC '[' ... */
2933 c1 = '[';
2934 goto label_escape_sequence;
2935
2936 case ISO_escape:
2937 ONE_MORE_BYTE (c1);
2938 label_escape_sequence:
2939 /* Escape sequences handled here are invocation,
2940 designation, direction specification, and character
2941 composition specification. */
2942 switch (c1)
2943 {
2944 case '&': /* revision of following character set */
2945 ONE_MORE_BYTE (c1);
2946 if (!(c1 >= '@' && c1 <= '~'))
2947 goto invalid_code;
2948 ONE_MORE_BYTE (c1);
2949 if (c1 != ISO_CODE_ESC)
2950 goto invalid_code;
2951 ONE_MORE_BYTE (c1);
2952 goto label_escape_sequence;
2953
2954 case '$': /* designation of 2-byte character set */
2955 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2956 goto invalid_code;
2957 ONE_MORE_BYTE (c1);
2958 if (c1 >= '@' && c1 <= 'B')
2959 { /* designation of JISX0208.1978, GB2312.1980,
2960 or JISX0208.1980 */
2961 DECODE_DESIGNATION (0, 2, 0, c1);
2962 }
2963 else if (c1 >= 0x28 && c1 <= 0x2B)
2964 { /* designation of DIMENSION2_CHARS94 character set */
2965 ONE_MORE_BYTE (c2);
2966 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2967 }
2968 else if (c1 >= 0x2C && c1 <= 0x2F)
2969 { /* designation of DIMENSION2_CHARS96 character set */
2970 ONE_MORE_BYTE (c2);
2971 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2972 }
2973 else
2974 goto invalid_code;
2975 /* We must update these variables now. */
2976 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2977 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2978 continue;
2979
2980 case 'n': /* invocation of locking-shift-2 */
2981 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2982 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2983 goto invalid_code;
2984 CODING_ISO_INVOCATION (coding, 0) = 2;
2985 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2986 continue;
2987
2988 case 'o': /* invocation of locking-shift-3 */
2989 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2990 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2991 goto invalid_code;
2992 CODING_ISO_INVOCATION (coding, 0) = 3;
2993 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2994 continue;
2995
2996 case 'N': /* invocation of single-shift-2 */
2997 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2998 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2999 goto invalid_code;
3000 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3001 ONE_MORE_BYTE (c1);
3002 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3003 goto invalid_code;
3004 break;
3005
3006 case 'O': /* invocation of single-shift-3 */
3007 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3008 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3009 goto invalid_code;
3010 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3011 ONE_MORE_BYTE (c1);
3012 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3013 goto invalid_code;
3014 break;
3015
3016 case '0': case '2': case '3': case '4': /* start composition */
3017 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3018 goto invalid_code;
3019 DECODE_COMPOSITION_START (c1);
3020 continue;
3021
3022 case '1': /* end composition */
3023 if (composition_state == COMPOSING_NO)
3024 goto invalid_code;
3025 DECODE_COMPOSITION_END ();
3026 continue;
3027
3028 case '[': /* specification of direction */
3029 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3030 goto invalid_code;
3031 /* For the moment, nested direction is not supported.
3032 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3033 left-to-right, and nozero means right-to-left. */
3034 ONE_MORE_BYTE (c1);
3035 switch (c1)
3036 {
3037 case ']': /* end of the current direction */
3038 coding->mode &= ~CODING_MODE_DIRECTION;
3039
3040 case '0': /* end of the current direction */
3041 case '1': /* start of left-to-right direction */
3042 ONE_MORE_BYTE (c1);
3043 if (c1 == ']')
3044 coding->mode &= ~CODING_MODE_DIRECTION;
3045 else
3046 goto invalid_code;
3047 break;
3048
3049 case '2': /* start of right-to-left direction */
3050 ONE_MORE_BYTE (c1);
3051 if (c1 == ']')
3052 coding->mode |= CODING_MODE_DIRECTION;
3053 else
3054 goto invalid_code;
3055 break;
3056
3057 default:
3058 goto invalid_code;
3059 }
3060 continue;
3061
3062 case '%':
3063 ONE_MORE_BYTE (c1);
3064 if (c1 == '/')
3065 {
3066 /* CTEXT extended segment:
3067 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3068 We keep these bytes as is for the moment.
3069 They may be decoded by post-read-conversion. */
3070 int dim, M, L;
3071 int size;
3072
3073 ONE_MORE_BYTE (dim);
3074 ONE_MORE_BYTE (M);
3075 ONE_MORE_BYTE (L);
3076 size = ((M - 128) * 128) + (L - 128);
3077 if (charbuf + 8 + size > charbuf_end)
3078 goto break_loop;
3079 *charbuf++ = ISO_CODE_ESC;
3080 *charbuf++ = '%';
3081 *charbuf++ = '/';
3082 *charbuf++ = dim;
3083 *charbuf++ = BYTE8_TO_CHAR (M);
3084 *charbuf++ = BYTE8_TO_CHAR (L);
3085 while (size-- > 0)
3086 {
3087 ONE_MORE_BYTE (c1);
3088 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3089 }
3090 }
3091 else if (c1 == 'G')
3092 {
3093 /* XFree86 extension for embedding UTF-8 in CTEXT:
3094 ESC % G --UTF-8-BYTES-- ESC % @
3095 We keep these bytes as is for the moment.
3096 They may be decoded by post-read-conversion. */
3097 int *p = charbuf;
3098
3099 if (p + 6 > charbuf_end)
3100 goto break_loop;
3101 *p++ = ISO_CODE_ESC;
3102 *p++ = '%';
3103 *p++ = 'G';
3104 while (p < charbuf_end)
3105 {
3106 ONE_MORE_BYTE (c1);
3107 if (c1 == ISO_CODE_ESC
3108 && src + 1 < src_end
3109 && src[0] == '%'
3110 && src[1] == '@')
3111 break;
3112 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3113 }
3114 if (p + 3 > charbuf_end)
3115 goto break_loop;
3116 *p++ = ISO_CODE_ESC;
3117 *p++ = '%';
3118 *p++ = '@';
3119 charbuf = p;
3120 }
3121 else
3122 goto invalid_code;
3123 continue;
3124 break;
3125
3126 default:
3127 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3128 goto invalid_code;
3129 if (c1 >= 0x28 && c1 <= 0x2B)
3130 { /* designation of DIMENSION1_CHARS94 character set */
3131 ONE_MORE_BYTE (c2);
3132 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3133 }
3134 else if (c1 >= 0x2C && c1 <= 0x2F)
3135 { /* designation of DIMENSION1_CHARS96 character set */
3136 ONE_MORE_BYTE (c2);
3137 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3138 }
3139 else
3140 goto invalid_code;
3141 /* We must update these variables now. */
3142 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3143 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3144 continue;
3145 }
3146 }
3147
3148 if (charset->id != charset_ascii
3149 && last_id != charset->id)
3150 {
3151 if (last_id != charset_ascii)
3152 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3153 last_id = charset->id;
3154 last_offset = char_offset;
3155 }
3156
3157 /* Now we know CHARSET and 1st position code C1 of a character.
3158 Produce a decoded character while getting 2nd position code
3159 C2 if necessary. */
3160 c1 &= 0x7F;
3161 if (CHARSET_DIMENSION (charset) > 1)
3162 {
3163 ONE_MORE_BYTE (c2);
3164 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3165 /* C2 is not in a valid range. */
3166 goto invalid_code;
3167 c1 = (c1 << 8) | (c2 & 0x7F);
3168 if (CHARSET_DIMENSION (charset) > 2)
3169 {
3170 ONE_MORE_BYTE (c2);
3171 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3172 /* C2 is not in a valid range. */
3173 goto invalid_code;
3174 c1 = (c1 << 8) | (c2 & 0x7F);
3175 }
3176 }
3177
3178 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3179 if (c < 0)
3180 {
3181 MAYBE_FINISH_COMPOSITION ();
3182 for (; src_base < src; src_base++, char_offset++)
3183 {
3184 if (ASCII_BYTE_P (*src_base))
3185 *charbuf++ = *src_base;
3186 else
3187 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3188 }
3189 }
3190 else if (composition_state == COMPOSING_NO)
3191 {
3192 *charbuf++ = c;
3193 char_offset++;
3194 }
3195 else
3196 {
3197 components[component_idx++] = c;
3198 if (method == COMPOSITION_WITH_RULE
3199 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3200 && composition_state == COMPOSING_COMPONENT_CHAR))
3201 composition_state++;
3202 }
3203 continue;
3204
3205 invalid_code:
3206 MAYBE_FINISH_COMPOSITION ();
3207 src = src_base;
3208 consumed_chars = consumed_chars_base;
3209 ONE_MORE_BYTE (c);
3210 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3211 char_offset++;
3212 coding->errors++;
3213 continue;
3214
3215 break_loop:
3216 break;
3217 }
3218
3219 no_more_source:
3220 if (last_id != charset_ascii)
3221 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3222 coding->consumed_char += consumed_chars_base;
3223 coding->consumed = src_base - coding->source;
3224 coding->charbuf_used = charbuf - coding->charbuf;
3225 }
3226
3227
3228 /* ISO2022 encoding stuff. */
3229
3230 /*
3231 It is not enough to say just "ISO2022" on encoding, we have to
3232 specify more details. In Emacs, each coding system of ISO2022
3233 variant has the following specifications:
3234 1. Initial designation to G0 thru G3.
3235 2. Allows short-form designation?
3236 3. ASCII should be designated to G0 before control characters?
3237 4. ASCII should be designated to G0 at end of line?
3238 5. 7-bit environment or 8-bit environment?
3239 6. Use locking-shift?
3240 7. Use Single-shift?
3241 And the following two are only for Japanese:
3242 8. Use ASCII in place of JIS0201-1976-Roman?
3243 9. Use JISX0208-1983 in place of JISX0208-1978?
3244 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3245 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3246 details.
3247 */
3248
3249 /* Produce codes (escape sequence) for designating CHARSET to graphic
3250 register REG at DST, and increment DST. If <final-char> of CHARSET is
3251 '@', 'A', or 'B' and the coding system CODING allows, produce
3252 designation sequence of short-form. */
3253
3254 #define ENCODE_DESIGNATION(charset, reg, coding) \
3255 do { \
3256 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3257 char *intermediate_char_94 = "()*+"; \
3258 char *intermediate_char_96 = ",-./"; \
3259 int revision = -1; \
3260 int c; \
3261 \
3262 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3263 revision = CHARSET_ISO_REVISION (charset); \
3264 \
3265 if (revision >= 0) \
3266 { \
3267 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3268 EMIT_ONE_BYTE ('@' + revision); \
3269 } \
3270 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3271 if (CHARSET_DIMENSION (charset) == 1) \
3272 { \
3273 if (! CHARSET_ISO_CHARS_96 (charset)) \
3274 c = intermediate_char_94[reg]; \
3275 else \
3276 c = intermediate_char_96[reg]; \
3277 EMIT_ONE_ASCII_BYTE (c); \
3278 } \
3279 else \
3280 { \
3281 EMIT_ONE_ASCII_BYTE ('$'); \
3282 if (! CHARSET_ISO_CHARS_96 (charset)) \
3283 { \
3284 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3285 || reg != 0 \
3286 || final_char < '@' || final_char > 'B') \
3287 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3288 } \
3289 else \
3290 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3291 } \
3292 EMIT_ONE_ASCII_BYTE (final_char); \
3293 \
3294 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3295 } while (0)
3296
3297
3298 /* The following two macros produce codes (control character or escape
3299 sequence) for ISO2022 single-shift functions (single-shift-2 and
3300 single-shift-3). */
3301
3302 #define ENCODE_SINGLE_SHIFT_2 \
3303 do { \
3304 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3305 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3306 else \
3307 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3308 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3309 } while (0)
3310
3311
3312 #define ENCODE_SINGLE_SHIFT_3 \
3313 do { \
3314 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3315 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3316 else \
3317 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3318 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3319 } while (0)
3320
3321
3322 /* The following four macros produce codes (control character or
3323 escape sequence) for ISO2022 locking-shift functions (shift-in,
3324 shift-out, locking-shift-2, and locking-shift-3). */
3325
3326 #define ENCODE_SHIFT_IN \
3327 do { \
3328 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3329 CODING_ISO_INVOCATION (coding, 0) = 0; \
3330 } while (0)
3331
3332
3333 #define ENCODE_SHIFT_OUT \
3334 do { \
3335 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3336 CODING_ISO_INVOCATION (coding, 0) = 1; \
3337 } while (0)
3338
3339
3340 #define ENCODE_LOCKING_SHIFT_2 \
3341 do { \
3342 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3343 CODING_ISO_INVOCATION (coding, 0) = 2; \
3344 } while (0)
3345
3346
3347 #define ENCODE_LOCKING_SHIFT_3 \
3348 do { \
3349 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3350 CODING_ISO_INVOCATION (coding, 0) = 3; \
3351 } while (0)
3352
3353
3354 /* Produce codes for a DIMENSION1 character whose character set is
3355 CHARSET and whose position-code is C1. Designation and invocation
3356 sequences are also produced in advance if necessary. */
3357
3358 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3359 do { \
3360 int id = CHARSET_ID (charset); \
3361 \
3362 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3363 && id == charset_ascii) \
3364 { \
3365 id = charset_jisx0201_roman; \
3366 charset = CHARSET_FROM_ID (id); \
3367 } \
3368 \
3369 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3370 { \
3371 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3372 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3373 else \
3374 EMIT_ONE_BYTE (c1 | 0x80); \
3375 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3376 break; \
3377 } \
3378 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3379 { \
3380 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3381 break; \
3382 } \
3383 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3384 { \
3385 EMIT_ONE_BYTE (c1 | 0x80); \
3386 break; \
3387 } \
3388 else \
3389 /* Since CHARSET is not yet invoked to any graphic planes, we \
3390 must invoke it, or, at first, designate it to some graphic \
3391 register. Then repeat the loop to actually produce the \
3392 character. */ \
3393 dst = encode_invocation_designation (charset, coding, dst, \
3394 &produced_chars); \
3395 } while (1)
3396
3397
3398 /* Produce codes for a DIMENSION2 character whose character set is
3399 CHARSET and whose position-codes are C1 and C2. Designation and
3400 invocation codes are also produced in advance if necessary. */
3401
3402 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3403 do { \
3404 int id = CHARSET_ID (charset); \
3405 \
3406 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3407 && id == charset_jisx0208) \
3408 { \
3409 id = charset_jisx0208_1978; \
3410 charset = CHARSET_FROM_ID (id); \
3411 } \
3412 \
3413 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3414 { \
3415 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3416 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3417 else \
3418 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3419 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3420 break; \
3421 } \
3422 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3423 { \
3424 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3425 break; \
3426 } \
3427 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3428 { \
3429 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3430 break; \
3431 } \
3432 else \
3433 /* Since CHARSET is not yet invoked to any graphic planes, we \
3434 must invoke it, or, at first, designate it to some graphic \
3435 register. Then repeat the loop to actually produce the \
3436 character. */ \
3437 dst = encode_invocation_designation (charset, coding, dst, \
3438 &produced_chars); \
3439 } while (1)
3440
3441
3442 #define ENCODE_ISO_CHARACTER(charset, c) \
3443 do { \
3444 int code = ENCODE_CHAR ((charset),(c)); \
3445 \
3446 if (CHARSET_DIMENSION (charset) == 1) \
3447 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3448 else \
3449 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3450 } while (0)
3451
3452
3453 /* Produce designation and invocation codes at a place pointed by DST
3454 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3455 Return new DST. */
3456
3457 unsigned char *
3458 encode_invocation_designation (charset, coding, dst, p_nchars)
3459 struct charset *charset;
3460 struct coding_system *coding;
3461 unsigned char *dst;
3462 int *p_nchars;
3463 {
3464 int multibytep = coding->dst_multibyte;
3465 int produced_chars = *p_nchars;
3466 int reg; /* graphic register number */
3467 int id = CHARSET_ID (charset);
3468
3469 /* At first, check designations. */
3470 for (reg = 0; reg < 4; reg++)
3471 if (id == CODING_ISO_DESIGNATION (coding, reg))
3472 break;
3473
3474 if (reg >= 4)
3475 {
3476 /* CHARSET is not yet designated to any graphic registers. */
3477 /* At first check the requested designation. */
3478 reg = CODING_ISO_REQUEST (coding, id);
3479 if (reg < 0)
3480 /* Since CHARSET requests no special designation, designate it
3481 to graphic register 0. */
3482 reg = 0;
3483
3484 ENCODE_DESIGNATION (charset, reg, coding);
3485 }
3486
3487 if (CODING_ISO_INVOCATION (coding, 0) != reg
3488 && CODING_ISO_INVOCATION (coding, 1) != reg)
3489 {
3490 /* Since the graphic register REG is not invoked to any graphic
3491 planes, invoke it to graphic plane 0. */
3492 switch (reg)
3493 {
3494 case 0: /* graphic register 0 */
3495 ENCODE_SHIFT_IN;
3496 break;
3497
3498 case 1: /* graphic register 1 */
3499 ENCODE_SHIFT_OUT;
3500 break;
3501
3502 case 2: /* graphic register 2 */
3503 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3504 ENCODE_SINGLE_SHIFT_2;
3505 else
3506 ENCODE_LOCKING_SHIFT_2;
3507 break;
3508
3509 case 3: /* graphic register 3 */
3510 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3511 ENCODE_SINGLE_SHIFT_3;
3512 else
3513 ENCODE_LOCKING_SHIFT_3;
3514 break;
3515 }
3516 }
3517
3518 *p_nchars = produced_chars;
3519 return dst;
3520 }
3521
3522 /* The following three macros produce codes for indicating direction
3523 of text. */
3524 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3525 do { \
3526 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3527 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3528 else \
3529 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3530 } while (0)
3531
3532
3533 #define ENCODE_DIRECTION_R2L() \
3534 do { \
3535 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3536 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3537 } while (0)
3538
3539
3540 #define ENCODE_DIRECTION_L2R() \
3541 do { \
3542 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3543 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3544 } while (0)
3545
3546
3547 /* Produce codes for designation and invocation to reset the graphic
3548 planes and registers to initial state. */
3549 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3550 do { \
3551 int reg; \
3552 struct charset *charset; \
3553 \
3554 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3555 ENCODE_SHIFT_IN; \
3556 for (reg = 0; reg < 4; reg++) \
3557 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3558 && (CODING_ISO_DESIGNATION (coding, reg) \
3559 != CODING_ISO_INITIAL (coding, reg))) \
3560 { \
3561 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3562 ENCODE_DESIGNATION (charset, reg, coding); \
3563 } \
3564 } while (0)
3565
3566
3567 /* Produce designation sequences of charsets in the line started from
3568 SRC to a place pointed by DST, and return updated DST.
3569
3570 If the current block ends before any end-of-line, we may fail to
3571 find all the necessary designations. */
3572
3573 static unsigned char *
3574 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3575 struct coding_system *coding;
3576 int *charbuf, *charbuf_end;
3577 unsigned char *dst;
3578 {
3579 struct charset *charset;
3580 /* Table of charsets to be designated to each graphic register. */
3581 int r[4];
3582 int c, found = 0, reg;
3583 int produced_chars = 0;
3584 int multibytep = coding->dst_multibyte;
3585 Lisp_Object attrs;
3586 Lisp_Object charset_list;
3587
3588 attrs = CODING_ID_ATTRS (coding->id);
3589 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3590 if (EQ (charset_list, Qiso_2022))
3591 charset_list = Viso_2022_charset_list;
3592
3593 for (reg = 0; reg < 4; reg++)
3594 r[reg] = -1;
3595
3596 while (found < 4)
3597 {
3598 int id;
3599
3600 c = *charbuf++;
3601 if (c == '\n')
3602 break;
3603 charset = char_charset (c, charset_list, NULL);
3604 id = CHARSET_ID (charset);
3605 reg = CODING_ISO_REQUEST (coding, id);
3606 if (reg >= 0 && r[reg] < 0)
3607 {
3608 found++;
3609 r[reg] = id;
3610 }
3611 }
3612
3613 if (found)
3614 {
3615 for (reg = 0; reg < 4; reg++)
3616 if (r[reg] >= 0
3617 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3618 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3619 }
3620
3621 return dst;
3622 }
3623
3624 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3625
3626 static int
3627 encode_coding_iso_2022 (coding)
3628 struct coding_system *coding;
3629 {
3630 int multibytep = coding->dst_multibyte;
3631 int *charbuf = coding->charbuf;
3632 int *charbuf_end = charbuf + coding->charbuf_used;
3633 unsigned char *dst = coding->destination + coding->produced;
3634 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3635 int safe_room = 16;
3636 int bol_designation
3637 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3638 && CODING_ISO_BOL (coding));
3639 int produced_chars = 0;
3640 Lisp_Object attrs, eol_type, charset_list;
3641 int ascii_compatible;
3642 int c;
3643 int preferred_charset_id = -1;
3644
3645 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3646 setup_iso_safe_charsets (attrs);
3647 /* Charset list may have been changed. */
3648 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3649 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3650
3651 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3652
3653 while (charbuf < charbuf_end)
3654 {
3655 ASSURE_DESTINATION (safe_room);
3656
3657 if (bol_designation)
3658 {
3659 unsigned char *dst_prev = dst;
3660
3661 /* We have to produce designation sequences if any now. */
3662 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3663 bol_designation = 0;
3664 /* We are sure that designation sequences are all ASCII bytes. */
3665 produced_chars += dst - dst_prev;
3666 }
3667
3668 c = *charbuf++;
3669
3670 if (c < 0)
3671 {
3672 /* Handle an annotation. */
3673 switch (*charbuf)
3674 {
3675 case CODING_ANNOTATE_COMPOSITION_MASK:
3676 /* Not yet implemented. */
3677 break;
3678 case CODING_ANNOTATE_CHARSET_MASK:
3679 preferred_charset_id = charbuf[3];
3680 if (preferred_charset_id >= 0
3681 && NILP (Fmemq (make_number (preferred_charset_id),
3682 charset_list)))
3683 preferred_charset_id = -1;
3684 break;
3685 default:
3686 abort ();
3687 }
3688 charbuf += -c - 1;
3689 continue;
3690 }
3691
3692 /* Now encode the character C. */
3693 if (c < 0x20 || c == 0x7F)
3694 {
3695 if (c == '\n'
3696 || (c == '\r' && EQ (eol_type, Qmac)))
3697 {
3698 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3699 ENCODE_RESET_PLANE_AND_REGISTER ();
3700 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3701 {
3702 int i;
3703
3704 for (i = 0; i < 4; i++)
3705 CODING_ISO_DESIGNATION (coding, i)
3706 = CODING_ISO_INITIAL (coding, i);
3707 }
3708 bol_designation
3709 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3710 }
3711 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3712 ENCODE_RESET_PLANE_AND_REGISTER ();
3713 EMIT_ONE_ASCII_BYTE (c);
3714 }
3715 else if (ASCII_CHAR_P (c))
3716 {
3717 if (ascii_compatible)
3718 EMIT_ONE_ASCII_BYTE (c);
3719 else
3720 {
3721 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3722 ENCODE_ISO_CHARACTER (charset, c);
3723 }
3724 }
3725 else if (CHAR_BYTE8_P (c))
3726 {
3727 c = CHAR_TO_BYTE8 (c);
3728 EMIT_ONE_BYTE (c);
3729 }
3730 else
3731 {
3732 struct charset *charset;
3733
3734 if (preferred_charset_id >= 0)
3735 {
3736 charset = CHARSET_FROM_ID (preferred_charset_id);
3737 if (! CHAR_CHARSET_P (c, charset))
3738 charset = char_charset (c, charset_list, NULL);
3739 }
3740 else
3741 charset = char_charset (c, charset_list, NULL);
3742 if (!charset)
3743 {
3744 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3745 {
3746 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3747 charset = CHARSET_FROM_ID (charset_ascii);
3748 }
3749 else
3750 {
3751 c = coding->default_char;
3752 charset = char_charset (c, charset_list, NULL);
3753 }
3754 }
3755 ENCODE_ISO_CHARACTER (charset, c);
3756 }
3757 }
3758
3759 if (coding->mode & CODING_MODE_LAST_BLOCK
3760 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3761 {
3762 ASSURE_DESTINATION (safe_room);
3763 ENCODE_RESET_PLANE_AND_REGISTER ();
3764 }
3765 coding->result = CODING_RESULT_SUCCESS;
3766 CODING_ISO_BOL (coding) = bol_designation;
3767 coding->produced_char += produced_chars;
3768 coding->produced = dst - coding->destination;
3769 return 0;
3770 }
3771
3772 \f
3773 /*** 8,9. SJIS and BIG5 handlers ***/
3774
3775 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3776 quite widely. So, for the moment, Emacs supports them in the bare
3777 C code. But, in the future, they may be supported only by CCL. */
3778
3779 /* SJIS is a coding system encoding three character sets: ASCII, right
3780 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3781 as is. A character of charset katakana-jisx0201 is encoded by
3782 "position-code + 0x80". A character of charset japanese-jisx0208
3783 is encoded in 2-byte but two position-codes are divided and shifted
3784 so that it fit in the range below.
3785
3786 --- CODE RANGE of SJIS ---
3787 (character set) (range)
3788 ASCII 0x00 .. 0x7F
3789 KATAKANA-JISX0201 0xA0 .. 0xDF
3790 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3791 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3792 -------------------------------
3793
3794 */
3795
3796 /* BIG5 is a coding system encoding two character sets: ASCII and
3797 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3798 character set and is encoded in two-byte.
3799
3800 --- CODE RANGE of BIG5 ---
3801 (character set) (range)
3802 ASCII 0x00 .. 0x7F
3803 Big5 (1st byte) 0xA1 .. 0xFE
3804 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3805 --------------------------
3806
3807 */
3808
3809 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3810 Check if a text is encoded in SJIS. If it is, return
3811 CATEGORY_MASK_SJIS, else return 0. */
3812
3813 static int
3814 detect_coding_sjis (coding, detect_info)
3815 struct coding_system *coding;
3816 struct coding_detection_info *detect_info;
3817 {
3818 const unsigned char *src = coding->source, *src_base = src;
3819 const unsigned char *src_end = coding->source + coding->src_bytes;
3820 int multibytep = coding->src_multibyte;
3821 int consumed_chars = 0;
3822 int found = 0;
3823 int c;
3824 int incomplete;
3825
3826 detect_info->checked |= CATEGORY_MASK_SJIS;
3827 /* A coding system of this category is always ASCII compatible. */
3828 src += coding->head_ascii;
3829
3830 while (1)
3831 {
3832 incomplete = 0;
3833 ONE_MORE_BYTE (c);
3834 incomplete = 1;
3835 if (c < 0x80)
3836 continue;
3837 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3838 {
3839 ONE_MORE_BYTE (c);
3840 if (c < 0x40 || c == 0x7F || c > 0xFC)
3841 break;
3842 found = CATEGORY_MASK_SJIS;
3843 }
3844 else if (c >= 0xA0 && c < 0xE0)
3845 found = CATEGORY_MASK_SJIS;
3846 else
3847 break;
3848 }
3849 detect_info->rejected |= CATEGORY_MASK_SJIS;
3850 return 0;
3851
3852 no_more_source:
3853 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3854 {
3855 detect_info->rejected |= CATEGORY_MASK_SJIS;
3856 return 0;
3857 }
3858 detect_info->found |= found;
3859 return 1;
3860 }
3861
3862 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3863 Check if a text is encoded in BIG5. If it is, return
3864 CATEGORY_MASK_BIG5, else return 0. */
3865
3866 static int
3867 detect_coding_big5 (coding, detect_info)
3868 struct coding_system *coding;
3869 struct coding_detection_info *detect_info;
3870 {
3871 const unsigned char *src = coding->source, *src_base = src;
3872 const unsigned char *src_end = coding->source + coding->src_bytes;
3873 int multibytep = coding->src_multibyte;
3874 int consumed_chars = 0;
3875 int found = 0;
3876 int c;
3877 int incomplete;
3878
3879 detect_info->checked |= CATEGORY_MASK_BIG5;
3880 /* A coding system of this category is always ASCII compatible. */
3881 src += coding->head_ascii;
3882
3883 while (1)
3884 {
3885 incomplete = 0;
3886 ONE_MORE_BYTE (c);
3887 incomplete = 1;
3888 if (c < 0x80)
3889 continue;
3890 if (c >= 0xA1)
3891 {
3892 ONE_MORE_BYTE (c);
3893 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3894 return 0;
3895 found = CATEGORY_MASK_BIG5;
3896 }
3897 else
3898 break;
3899 }
3900 detect_info->rejected |= CATEGORY_MASK_BIG5;
3901 return 0;
3902
3903 no_more_source:
3904 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3905 {
3906 detect_info->rejected |= CATEGORY_MASK_BIG5;
3907 return 0;
3908 }
3909 detect_info->found |= found;
3910 return 1;
3911 }
3912
3913 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3914 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3915
3916 static void
3917 decode_coding_sjis (coding)
3918 struct coding_system *coding;
3919 {
3920 const unsigned char *src = coding->source + coding->consumed;
3921 const unsigned char *src_end = coding->source + coding->src_bytes;
3922 const unsigned char *src_base;
3923 int *charbuf = coding->charbuf;
3924 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3925 int consumed_chars = 0, consumed_chars_base;
3926 int multibytep = coding->src_multibyte;
3927 struct charset *charset_roman, *charset_kanji, *charset_kana;
3928 Lisp_Object attrs, eol_type, charset_list, val;
3929 int char_offset = coding->produced_char;
3930 int last_offset = char_offset;
3931 int last_id = charset_ascii;
3932
3933 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3934
3935 val = charset_list;
3936 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3937 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3938 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3939
3940 while (1)
3941 {
3942 int c, c1;
3943
3944 src_base = src;
3945 consumed_chars_base = consumed_chars;
3946
3947 if (charbuf >= charbuf_end)
3948 break;
3949
3950 ONE_MORE_BYTE (c);
3951
3952 if (c == '\r')
3953 {
3954 if (EQ (eol_type, Qdos))
3955 {
3956 if (src == src_end)
3957 {
3958 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3959 goto no_more_source;
3960 }
3961 if (*src == '\n')
3962 ONE_MORE_BYTE (c);
3963 }
3964 else if (EQ (eol_type, Qmac))
3965 c = '\n';
3966 }
3967 else
3968 {
3969 struct charset *charset;
3970
3971 if (c < 0x80)
3972 charset = charset_roman;
3973 else
3974 {
3975 if (c >= 0xF0)
3976 goto invalid_code;
3977 if (c < 0xA0 || c >= 0xE0)
3978 {
3979 /* SJIS -> JISX0208 */
3980 ONE_MORE_BYTE (c1);
3981 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3982 goto invalid_code;
3983 c = (c << 8) | c1;
3984 SJIS_TO_JIS (c);
3985 charset = charset_kanji;
3986 }
3987 else if (c > 0xA0)
3988 {
3989 /* SJIS -> JISX0201-Kana */
3990 c &= 0x7F;
3991 charset = charset_kana;
3992 }
3993 else
3994 goto invalid_code;
3995 }
3996 if (charset->id != charset_ascii
3997 && last_id != charset->id)
3998 {
3999 if (last_id != charset_ascii)
4000 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4001 last_id = charset->id;
4002 last_offset = char_offset;
4003 }
4004 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4005 }
4006 *charbuf++ = c;
4007 char_offset++;
4008 continue;
4009
4010 invalid_code:
4011 src = src_base;
4012 consumed_chars = consumed_chars_base;
4013 ONE_MORE_BYTE (c);
4014 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4015 char_offset++;
4016 coding->errors++;
4017 }
4018
4019 no_more_source:
4020 if (last_id != charset_ascii)
4021 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4022 coding->consumed_char += consumed_chars_base;
4023 coding->consumed = src_base - coding->source;
4024 coding->charbuf_used = charbuf - coding->charbuf;
4025 }
4026
4027 static void
4028 decode_coding_big5 (coding)
4029 struct coding_system *coding;
4030 {
4031 const unsigned char *src = coding->source + coding->consumed;
4032 const unsigned char *src_end = coding->source + coding->src_bytes;
4033 const unsigned char *src_base;
4034 int *charbuf = coding->charbuf;
4035 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4036 int consumed_chars = 0, consumed_chars_base;
4037 int multibytep = coding->src_multibyte;
4038 struct charset *charset_roman, *charset_big5;
4039 Lisp_Object attrs, eol_type, charset_list, val;
4040 int char_offset = coding->produced_char;
4041 int last_offset = char_offset;
4042 int last_id = charset_ascii;
4043
4044 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4045 val = charset_list;
4046 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4047 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4048
4049 while (1)
4050 {
4051 int c, c1;
4052
4053 src_base = src;
4054 consumed_chars_base = consumed_chars;
4055
4056 if (charbuf >= charbuf_end)
4057 break;
4058
4059 ONE_MORE_BYTE (c);
4060
4061 if (c == '\r')
4062 {
4063 if (EQ (eol_type, Qdos))
4064 {
4065 if (src == src_end)
4066 {
4067 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4068 goto no_more_source;
4069 }
4070 if (*src == '\n')
4071 ONE_MORE_BYTE (c);
4072 }
4073 else if (EQ (eol_type, Qmac))
4074 c = '\n';
4075 }
4076 else
4077 {
4078 struct charset *charset;
4079 if (c < 0x80)
4080 charset = charset_roman;
4081 else
4082 {
4083 /* BIG5 -> Big5 */
4084 if (c < 0xA1 || c > 0xFE)
4085 goto invalid_code;
4086 ONE_MORE_BYTE (c1);
4087 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4088 goto invalid_code;
4089 c = c << 8 | c1;
4090 charset = charset_big5;
4091 }
4092 if (charset->id != charset_ascii
4093 && last_id != charset->id)
4094 {
4095 if (last_id != charset_ascii)
4096 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4097 last_id = charset->id;
4098 last_offset = char_offset;
4099 }
4100 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4101 }
4102
4103 *charbuf++ = c;
4104 char_offset++;
4105 continue;
4106
4107 invalid_code:
4108 src = src_base;
4109 consumed_chars = consumed_chars_base;
4110 ONE_MORE_BYTE (c);
4111 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4112 char_offset++;
4113 coding->errors++;
4114 }
4115
4116 no_more_source:
4117 if (last_id != charset_ascii)
4118 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4119 coding->consumed_char += consumed_chars_base;
4120 coding->consumed = src_base - coding->source;
4121 coding->charbuf_used = charbuf - coding->charbuf;
4122 }
4123
4124 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4125 This function can encode charsets `ascii', `katakana-jisx0201',
4126 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4127 are sure that all these charsets are registered as official charset
4128 (i.e. do not have extended leading-codes). Characters of other
4129 charsets are produced without any encoding. If SJIS_P is 1, encode
4130 SJIS text, else encode BIG5 text. */
4131
4132 static int
4133 encode_coding_sjis (coding)
4134 struct coding_system *coding;
4135 {
4136 int multibytep = coding->dst_multibyte;
4137 int *charbuf = coding->charbuf;
4138 int *charbuf_end = charbuf + coding->charbuf_used;
4139 unsigned char *dst = coding->destination + coding->produced;
4140 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4141 int safe_room = 4;
4142 int produced_chars = 0;
4143 Lisp_Object attrs, eol_type, charset_list, val;
4144 int ascii_compatible;
4145 struct charset *charset_roman, *charset_kanji, *charset_kana;
4146 int c;
4147
4148 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4149 val = charset_list;
4150 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4151 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4152 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4153
4154 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4155
4156 while (charbuf < charbuf_end)
4157 {
4158 ASSURE_DESTINATION (safe_room);
4159 c = *charbuf++;
4160 /* Now encode the character C. */
4161 if (ASCII_CHAR_P (c) && ascii_compatible)
4162 EMIT_ONE_ASCII_BYTE (c);
4163 else if (CHAR_BYTE8_P (c))
4164 {
4165 c = CHAR_TO_BYTE8 (c);
4166 EMIT_ONE_BYTE (c);
4167 }
4168 else
4169 {
4170 unsigned code;
4171 struct charset *charset = char_charset (c, charset_list, &code);
4172
4173 if (!charset)
4174 {
4175 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4176 {
4177 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4178 charset = CHARSET_FROM_ID (charset_ascii);
4179 }
4180 else
4181 {
4182 c = coding->default_char;
4183 charset = char_charset (c, charset_list, &code);
4184 }
4185 }
4186 if (code == CHARSET_INVALID_CODE (charset))
4187 abort ();
4188 if (charset == charset_kanji)
4189 {
4190 int c1, c2;
4191 JIS_TO_SJIS (code);
4192 c1 = code >> 8, c2 = code & 0xFF;
4193 EMIT_TWO_BYTES (c1, c2);
4194 }
4195 else if (charset == charset_kana)
4196 EMIT_ONE_BYTE (code | 0x80);
4197 else
4198 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4199 }
4200 }
4201 coding->result = CODING_RESULT_SUCCESS;
4202 coding->produced_char += produced_chars;
4203 coding->produced = dst - coding->destination;
4204 return 0;
4205 }
4206
4207 static int
4208 encode_coding_big5 (coding)
4209 struct coding_system *coding;
4210 {
4211 int multibytep = coding->dst_multibyte;
4212 int *charbuf = coding->charbuf;
4213 int *charbuf_end = charbuf + coding->charbuf_used;
4214 unsigned char *dst = coding->destination + coding->produced;
4215 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4216 int safe_room = 4;
4217 int produced_chars = 0;
4218 Lisp_Object attrs, eol_type, charset_list, val;
4219 int ascii_compatible;
4220 struct charset *charset_roman, *charset_big5;
4221 int c;
4222
4223 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4224 val = charset_list;
4225 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4226 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4227 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4228
4229 while (charbuf < charbuf_end)
4230 {
4231 ASSURE_DESTINATION (safe_room);
4232 c = *charbuf++;
4233 /* Now encode the character C. */
4234 if (ASCII_CHAR_P (c) && ascii_compatible)
4235 EMIT_ONE_ASCII_BYTE (c);
4236 else if (CHAR_BYTE8_P (c))
4237 {
4238 c = CHAR_TO_BYTE8 (c);
4239 EMIT_ONE_BYTE (c);
4240 }
4241 else
4242 {
4243 unsigned code;
4244 struct charset *charset = char_charset (c, charset_list, &code);
4245
4246 if (! charset)
4247 {
4248 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4249 {
4250 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4251 charset = CHARSET_FROM_ID (charset_ascii);
4252 }
4253 else
4254 {
4255 c = coding->default_char;
4256 charset = char_charset (c, charset_list, &code);
4257 }
4258 }
4259 if (code == CHARSET_INVALID_CODE (charset))
4260 abort ();
4261 if (charset == charset_big5)
4262 {
4263 int c1, c2;
4264
4265 c1 = code >> 8, c2 = code & 0xFF;
4266 EMIT_TWO_BYTES (c1, c2);
4267 }
4268 else
4269 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4270 }
4271 }
4272 coding->result = CODING_RESULT_SUCCESS;
4273 coding->produced_char += produced_chars;
4274 coding->produced = dst - coding->destination;
4275 return 0;
4276 }
4277
4278 \f
4279 /*** 10. CCL handlers ***/
4280
4281 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4282 Check if a text is encoded in a coding system of which
4283 encoder/decoder are written in CCL program. If it is, return
4284 CATEGORY_MASK_CCL, else return 0. */
4285
4286 static int
4287 detect_coding_ccl (coding, detect_info)
4288 struct coding_system *coding;
4289 struct coding_detection_info *detect_info;
4290 {
4291 const unsigned char *src = coding->source, *src_base = src;
4292 const unsigned char *src_end = coding->source + coding->src_bytes;
4293 int multibytep = coding->src_multibyte;
4294 int consumed_chars = 0;
4295 int found = 0;
4296 unsigned char *valids = CODING_CCL_VALIDS (coding);
4297 int head_ascii = coding->head_ascii;
4298 Lisp_Object attrs;
4299
4300 detect_info->checked |= CATEGORY_MASK_CCL;
4301
4302 coding = &coding_categories[coding_category_ccl];
4303 attrs = CODING_ID_ATTRS (coding->id);
4304 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4305 src += head_ascii;
4306
4307 while (1)
4308 {
4309 int c;
4310 ONE_MORE_BYTE (c);
4311 if (! valids[c])
4312 break;
4313 if ((valids[c] > 1))
4314 found = CATEGORY_MASK_CCL;
4315 }
4316 detect_info->rejected |= CATEGORY_MASK_CCL;
4317 return 0;
4318
4319 no_more_source:
4320 detect_info->found |= found;
4321 return 1;
4322 }
4323
4324 static void
4325 decode_coding_ccl (coding)
4326 struct coding_system *coding;
4327 {
4328 const unsigned char *src = coding->source + coding->consumed;
4329 const unsigned char *src_end = coding->source + coding->src_bytes;
4330 int *charbuf = coding->charbuf;
4331 int *charbuf_end = charbuf + coding->charbuf_size;
4332 int consumed_chars = 0;
4333 int multibytep = coding->src_multibyte;
4334 struct ccl_program ccl;
4335 int source_charbuf[1024];
4336 int source_byteidx[1024];
4337 Lisp_Object attrs, eol_type, charset_list;
4338
4339 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4340 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4341
4342 while (src < src_end)
4343 {
4344 const unsigned char *p = src;
4345 int *source, *source_end;
4346 int i = 0;
4347
4348 if (multibytep)
4349 while (i < 1024 && p < src_end)
4350 {
4351 source_byteidx[i] = p - src;
4352 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4353 }
4354 else
4355 while (i < 1024 && p < src_end)
4356 source_charbuf[i++] = *p++;
4357
4358 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4359 ccl.last_block = 1;
4360
4361 source = source_charbuf;
4362 source_end = source + i;
4363 while (source < source_end)
4364 {
4365 ccl_driver (&ccl, source, charbuf,
4366 source_end - source, charbuf_end - charbuf,
4367 charset_list);
4368 source += ccl.consumed;
4369 charbuf += ccl.produced;
4370 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4371 break;
4372 }
4373 if (source < source_end)
4374 src += source_byteidx[source - source_charbuf];
4375 else
4376 src = p;
4377 consumed_chars += source - source_charbuf;
4378
4379 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4380 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4381 break;
4382 }
4383
4384 switch (ccl.status)
4385 {
4386 case CCL_STAT_SUSPEND_BY_SRC:
4387 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4388 break;
4389 case CCL_STAT_SUSPEND_BY_DST:
4390 break;
4391 case CCL_STAT_QUIT:
4392 case CCL_STAT_INVALID_CMD:
4393 coding->result = CODING_RESULT_INTERRUPT;
4394 break;
4395 default:
4396 coding->result = CODING_RESULT_SUCCESS;
4397 break;
4398 }
4399 coding->consumed_char += consumed_chars;
4400 coding->consumed = src - coding->source;
4401 coding->charbuf_used = charbuf - coding->charbuf;
4402 }
4403
4404 static int
4405 encode_coding_ccl (coding)
4406 struct coding_system *coding;
4407 {
4408 struct ccl_program ccl;
4409 int multibytep = coding->dst_multibyte;
4410 int *charbuf = coding->charbuf;
4411 int *charbuf_end = charbuf + coding->charbuf_used;
4412 unsigned char *dst = coding->destination + coding->produced;
4413 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4414 unsigned char *adjusted_dst_end = dst_end - 1;
4415 int destination_charbuf[1024];
4416 int i, produced_chars = 0;
4417 Lisp_Object attrs, eol_type, charset_list;
4418
4419 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4420 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4421
4422 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4423 ccl.dst_multibyte = coding->dst_multibyte;
4424
4425 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4426 {
4427 int dst_bytes = dst_end - dst;
4428 if (dst_bytes > 1024)
4429 dst_bytes = 1024;
4430
4431 ccl_driver (&ccl, charbuf, destination_charbuf,
4432 charbuf_end - charbuf, dst_bytes, charset_list);
4433 charbuf += ccl.consumed;
4434 if (multibytep)
4435 for (i = 0; i < ccl.produced; i++)
4436 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4437 else
4438 {
4439 for (i = 0; i < ccl.produced; i++)
4440 *dst++ = destination_charbuf[i] & 0xFF;
4441 produced_chars += ccl.produced;
4442 }
4443 }
4444
4445 switch (ccl.status)
4446 {
4447 case CCL_STAT_SUSPEND_BY_SRC:
4448 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4449 break;
4450 case CCL_STAT_SUSPEND_BY_DST:
4451 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4452 break;
4453 case CCL_STAT_QUIT:
4454 case CCL_STAT_INVALID_CMD:
4455 coding->result = CODING_RESULT_INTERRUPT;
4456 break;
4457 default:
4458 coding->result = CODING_RESULT_SUCCESS;
4459 break;
4460 }
4461
4462 coding->produced_char += produced_chars;
4463 coding->produced = dst - coding->destination;
4464 return 0;
4465 }
4466
4467
4468 \f
4469 /*** 10, 11. no-conversion handlers ***/
4470
4471 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4472
4473 static void
4474 decode_coding_raw_text (coding)
4475 struct coding_system *coding;
4476 {
4477 coding->chars_at_source = 1;
4478 coding->consumed_char = 0;
4479 coding->consumed = 0;
4480 coding->result = CODING_RESULT_SUCCESS;
4481 }
4482
4483 static int
4484 encode_coding_raw_text (coding)
4485 struct coding_system *coding;
4486 {
4487 int multibytep = coding->dst_multibyte;
4488 int *charbuf = coding->charbuf;
4489 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4490 unsigned char *dst = coding->destination + coding->produced;
4491 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4492 int produced_chars = 0;
4493 int c;
4494
4495 if (multibytep)
4496 {
4497 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4498
4499 if (coding->src_multibyte)
4500 while (charbuf < charbuf_end)
4501 {
4502 ASSURE_DESTINATION (safe_room);
4503 c = *charbuf++;
4504 if (ASCII_CHAR_P (c))
4505 EMIT_ONE_ASCII_BYTE (c);
4506 else if (CHAR_BYTE8_P (c))
4507 {
4508 c = CHAR_TO_BYTE8 (c);
4509 EMIT_ONE_BYTE (c);
4510 }
4511 else
4512 {
4513 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4514
4515 CHAR_STRING_ADVANCE (c, p1);
4516 while (p0 < p1)
4517 {
4518 EMIT_ONE_BYTE (*p0);
4519 p0++;
4520 }
4521 }
4522 }
4523 else
4524 while (charbuf < charbuf_end)
4525 {
4526 ASSURE_DESTINATION (safe_room);
4527 c = *charbuf++;
4528 EMIT_ONE_BYTE (c);
4529 }
4530 }
4531 else
4532 {
4533 if (coding->src_multibyte)
4534 {
4535 int safe_room = MAX_MULTIBYTE_LENGTH;
4536
4537 while (charbuf < charbuf_end)
4538 {
4539 ASSURE_DESTINATION (safe_room);
4540 c = *charbuf++;
4541 if (ASCII_CHAR_P (c))
4542 *dst++ = c;
4543 else if (CHAR_BYTE8_P (c))
4544 *dst++ = CHAR_TO_BYTE8 (c);
4545 else
4546 CHAR_STRING_ADVANCE (c, dst);
4547 produced_chars++;
4548 }
4549 }
4550 else
4551 {
4552 ASSURE_DESTINATION (charbuf_end - charbuf);
4553 while (charbuf < charbuf_end && dst < dst_end)
4554 *dst++ = *charbuf++;
4555 produced_chars = dst - (coding->destination + coding->dst_bytes);
4556 }
4557 }
4558 coding->result = CODING_RESULT_SUCCESS;
4559 coding->produced_char += produced_chars;
4560 coding->produced = dst - coding->destination;
4561 return 0;
4562 }
4563
4564 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4565 Check if a text is encoded in a charset-based coding system. If it
4566 is, return 1, else return 0. */
4567
4568 static int
4569 detect_coding_charset (coding, detect_info)
4570 struct coding_system *coding;
4571 struct coding_detection_info *detect_info;
4572 {
4573 const unsigned char *src = coding->source, *src_base = src;
4574 const unsigned char *src_end = coding->source + coding->src_bytes;
4575 int multibytep = coding->src_multibyte;
4576 int consumed_chars = 0;
4577 Lisp_Object attrs, valids;
4578 int found = 0;
4579
4580 detect_info->checked |= CATEGORY_MASK_CHARSET;
4581
4582 coding = &coding_categories[coding_category_charset];
4583 attrs = CODING_ID_ATTRS (coding->id);
4584 valids = AREF (attrs, coding_attr_charset_valids);
4585
4586 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4587 src += coding->head_ascii;
4588
4589 while (1)
4590 {
4591 int c;
4592
4593 ONE_MORE_BYTE (c);
4594 if (NILP (AREF (valids, c)))
4595 break;
4596 if (c >= 0x80)
4597 found = CATEGORY_MASK_CHARSET;
4598 }
4599 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4600 return 0;
4601
4602 no_more_source:
4603 detect_info->found |= found;
4604 return 1;
4605 }
4606
4607 static void
4608 decode_coding_charset (coding)
4609 struct coding_system *coding;
4610 {
4611 const unsigned char *src = coding->source + coding->consumed;
4612 const unsigned char *src_end = coding->source + coding->src_bytes;
4613 const unsigned char *src_base;
4614 int *charbuf = coding->charbuf;
4615 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4616 int consumed_chars = 0, consumed_chars_base;
4617 int multibytep = coding->src_multibyte;
4618 Lisp_Object attrs, eol_type, charset_list, valids;
4619 int char_offset = coding->produced_char;
4620 int last_offset = char_offset;
4621 int last_id = charset_ascii;
4622
4623 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4624 valids = AREF (attrs, coding_attr_charset_valids);
4625
4626 while (1)
4627 {
4628 int c;
4629
4630 src_base = src;
4631 consumed_chars_base = consumed_chars;
4632
4633 if (charbuf >= charbuf_end)
4634 break;
4635
4636 ONE_MORE_BYTE (c);
4637 if (c == '\r')
4638 {
4639 /* Here we assume that no charset maps '\r' to something
4640 else. */
4641 if (EQ (eol_type, Qdos))
4642 {
4643 if (src == src_end)
4644 {
4645 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4646 goto no_more_source;
4647 }
4648 if (*src == '\n')
4649 ONE_MORE_BYTE (c);
4650 }
4651 else if (EQ (eol_type, Qmac))
4652 c = '\n';
4653 }
4654 else
4655 {
4656 Lisp_Object val;
4657 struct charset *charset;
4658 int dim;
4659 int len = 1;
4660 unsigned code = c;
4661
4662 val = AREF (valids, c);
4663 if (NILP (val))
4664 goto invalid_code;
4665 if (INTEGERP (val))
4666 {
4667 charset = CHARSET_FROM_ID (XFASTINT (val));
4668 dim = CHARSET_DIMENSION (charset);
4669 while (len < dim)
4670 {
4671 ONE_MORE_BYTE (c);
4672 code = (code << 8) | c;
4673 len++;
4674 }
4675 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4676 charset, code, c);
4677 }
4678 else
4679 {
4680 /* VAL is a list of charset IDs. It is assured that the
4681 list is sorted by charset dimensions (smaller one
4682 comes first). */
4683 while (CONSP (val))
4684 {
4685 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4686 dim = CHARSET_DIMENSION (charset);
4687 while (len < dim)
4688 {
4689 ONE_MORE_BYTE (c);
4690 code = (code << 8) | c;
4691 len++;
4692 }
4693 CODING_DECODE_CHAR (coding, src, src_base,
4694 src_end, charset, code, c);
4695 if (c >= 0)
4696 break;
4697 val = XCDR (val);
4698 }
4699 }
4700 if (c < 0)
4701 goto invalid_code;
4702 if (charset->id != charset_ascii
4703 && last_id != charset->id)
4704 {
4705 if (last_id != charset_ascii)
4706 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4707 last_id = charset->id;
4708 last_offset = char_offset;
4709 }
4710 }
4711 *charbuf++ = c;
4712 char_offset++;
4713 continue;
4714
4715 invalid_code:
4716 src = src_base;
4717 consumed_chars = consumed_chars_base;
4718 ONE_MORE_BYTE (c);
4719 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4720 char_offset++;
4721 coding->errors++;
4722 }
4723
4724 no_more_source:
4725 if (last_id != charset_ascii)
4726 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4727 coding->consumed_char += consumed_chars_base;
4728 coding->consumed = src_base - coding->source;
4729 coding->charbuf_used = charbuf - coding->charbuf;
4730 }
4731
4732 static int
4733 encode_coding_charset (coding)
4734 struct coding_system *coding;
4735 {
4736 int multibytep = coding->dst_multibyte;
4737 int *charbuf = coding->charbuf;
4738 int *charbuf_end = charbuf + coding->charbuf_used;
4739 unsigned char *dst = coding->destination + coding->produced;
4740 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4741 int safe_room = MAX_MULTIBYTE_LENGTH;
4742 int produced_chars = 0;
4743 Lisp_Object attrs, eol_type, charset_list;
4744 int ascii_compatible;
4745 int c;
4746
4747 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4748 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4749
4750 while (charbuf < charbuf_end)
4751 {
4752 struct charset *charset;
4753 unsigned code;
4754
4755 ASSURE_DESTINATION (safe_room);
4756 c = *charbuf++;
4757 if (ascii_compatible && ASCII_CHAR_P (c))
4758 EMIT_ONE_ASCII_BYTE (c);
4759 else if (CHAR_BYTE8_P (c))
4760 {
4761 c = CHAR_TO_BYTE8 (c);
4762 EMIT_ONE_BYTE (c);
4763 }
4764 else
4765 {
4766 charset = char_charset (c, charset_list, &code);
4767 if (charset)
4768 {
4769 if (CHARSET_DIMENSION (charset) == 1)
4770 EMIT_ONE_BYTE (code);
4771 else if (CHARSET_DIMENSION (charset) == 2)
4772 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4773 else if (CHARSET_DIMENSION (charset) == 3)
4774 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4775 else
4776 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4777 (code >> 8) & 0xFF, code & 0xFF);
4778 }
4779 else
4780 {
4781 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4782 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4783 else
4784 c = coding->default_char;
4785 EMIT_ONE_BYTE (c);
4786 }
4787 }
4788 }
4789
4790 coding->result = CODING_RESULT_SUCCESS;
4791 coding->produced_char += produced_chars;
4792 coding->produced = dst - coding->destination;
4793 return 0;
4794 }
4795
4796 \f
4797 /*** 7. C library functions ***/
4798
4799 /* Setup coding context CODING from information about CODING_SYSTEM.
4800 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4801 CODING_SYSTEM is invalid, signal an error. */
4802
4803 void
4804 setup_coding_system (coding_system, coding)
4805 Lisp_Object coding_system;
4806 struct coding_system *coding;
4807 {
4808 Lisp_Object attrs;
4809 Lisp_Object eol_type;
4810 Lisp_Object coding_type;
4811 Lisp_Object val;
4812
4813 if (NILP (coding_system))
4814 coding_system = Qno_conversion;
4815
4816 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4817
4818 attrs = CODING_ID_ATTRS (coding->id);
4819 eol_type = CODING_ID_EOL_TYPE (coding->id);
4820
4821 coding->mode = 0;
4822 coding->head_ascii = -1;
4823 coding->common_flags
4824 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4825 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4826 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4827 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4828 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4829 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4830 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4831
4832 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4833 coding->max_charset_id = SCHARS (val) - 1;
4834 coding->safe_charsets = (char *) SDATA (val);
4835 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4836
4837 coding_type = CODING_ATTR_TYPE (attrs);
4838 if (EQ (coding_type, Qundecided))
4839 {
4840 coding->detector = NULL;
4841 coding->decoder = decode_coding_raw_text;
4842 coding->encoder = encode_coding_raw_text;
4843 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4844 }
4845 else if (EQ (coding_type, Qiso_2022))
4846 {
4847 int i;
4848 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4849
4850 /* Invoke graphic register 0 to plane 0. */
4851 CODING_ISO_INVOCATION (coding, 0) = 0;
4852 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4853 CODING_ISO_INVOCATION (coding, 1)
4854 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4855 /* Setup the initial status of designation. */
4856 for (i = 0; i < 4; i++)
4857 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4858 /* Not single shifting initially. */
4859 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4860 /* Beginning of buffer should also be regarded as bol. */
4861 CODING_ISO_BOL (coding) = 1;
4862 coding->detector = detect_coding_iso_2022;
4863 coding->decoder = decode_coding_iso_2022;
4864 coding->encoder = encode_coding_iso_2022;
4865 if (flags & CODING_ISO_FLAG_SAFE)
4866 coding->mode |= CODING_MODE_SAFE_ENCODING;
4867 coding->common_flags
4868 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4869 | CODING_REQUIRE_FLUSHING_MASK);
4870 if (flags & CODING_ISO_FLAG_COMPOSITION)
4871 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4872 if (flags & CODING_ISO_FLAG_DESIGNATION)
4873 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4874 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4875 {
4876 setup_iso_safe_charsets (attrs);
4877 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4878 coding->max_charset_id = SCHARS (val) - 1;
4879 coding->safe_charsets = (char *) SDATA (val);
4880 }
4881 CODING_ISO_FLAGS (coding) = flags;
4882 }
4883 else if (EQ (coding_type, Qcharset))
4884 {
4885 coding->detector = detect_coding_charset;
4886 coding->decoder = decode_coding_charset;
4887 coding->encoder = encode_coding_charset;
4888 coding->common_flags
4889 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4890 }
4891 else if (EQ (coding_type, Qutf_8))
4892 {
4893 coding->detector = detect_coding_utf_8;
4894 coding->decoder = decode_coding_utf_8;
4895 coding->encoder = encode_coding_utf_8;
4896 coding->common_flags
4897 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4898 }
4899 else if (EQ (coding_type, Qutf_16))
4900 {
4901 val = AREF (attrs, coding_attr_utf_16_bom);
4902 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4903 : EQ (val, Qt) ? utf_16_with_bom
4904 : utf_16_without_bom);
4905 val = AREF (attrs, coding_attr_utf_16_endian);
4906 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4907 : utf_16_little_endian);
4908 CODING_UTF_16_SURROGATE (coding) = 0;
4909 coding->detector = detect_coding_utf_16;
4910 coding->decoder = decode_coding_utf_16;
4911 coding->encoder = encode_coding_utf_16;
4912 coding->common_flags
4913 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4914 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4915 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4916 }
4917 else if (EQ (coding_type, Qccl))
4918 {
4919 coding->detector = detect_coding_ccl;
4920 coding->decoder = decode_coding_ccl;
4921 coding->encoder = encode_coding_ccl;
4922 coding->common_flags
4923 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4924 | CODING_REQUIRE_FLUSHING_MASK);
4925 }
4926 else if (EQ (coding_type, Qemacs_mule))
4927 {
4928 coding->detector = detect_coding_emacs_mule;
4929 coding->decoder = decode_coding_emacs_mule;
4930 coding->encoder = encode_coding_emacs_mule;
4931 coding->common_flags
4932 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4933 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4934 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4935 {
4936 Lisp_Object tail, safe_charsets;
4937 int max_charset_id = 0;
4938
4939 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4940 tail = XCDR (tail))
4941 if (max_charset_id < XFASTINT (XCAR (tail)))
4942 max_charset_id = XFASTINT (XCAR (tail));
4943 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4944 make_number (255));
4945 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4946 tail = XCDR (tail))
4947 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4948 coding->max_charset_id = max_charset_id;
4949 coding->safe_charsets = (char *) SDATA (safe_charsets);
4950 }
4951 }
4952 else if (EQ (coding_type, Qshift_jis))
4953 {
4954 coding->detector = detect_coding_sjis;
4955 coding->decoder = decode_coding_sjis;
4956 coding->encoder = encode_coding_sjis;
4957 coding->common_flags
4958 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4959 }
4960 else if (EQ (coding_type, Qbig5))
4961 {
4962 coding->detector = detect_coding_big5;
4963 coding->decoder = decode_coding_big5;
4964 coding->encoder = encode_coding_big5;
4965 coding->common_flags
4966 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4967 }
4968 else /* EQ (coding_type, Qraw_text) */
4969 {
4970 coding->detector = NULL;
4971 coding->decoder = decode_coding_raw_text;
4972 coding->encoder = encode_coding_raw_text;
4973 }
4974
4975 return;
4976 }
4977
4978 /* Return raw-text or one of its subsidiaries that has the same
4979 eol_type as CODING-SYSTEM. */
4980
4981 Lisp_Object
4982 raw_text_coding_system (coding_system)
4983 Lisp_Object coding_system;
4984 {
4985 Lisp_Object spec, attrs;
4986 Lisp_Object eol_type, raw_text_eol_type;
4987
4988 if (NILP (coding_system))
4989 return Qraw_text;
4990 spec = CODING_SYSTEM_SPEC (coding_system);
4991 attrs = AREF (spec, 0);
4992
4993 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4994 return coding_system;
4995
4996 eol_type = AREF (spec, 2);
4997 if (VECTORP (eol_type))
4998 return Qraw_text;
4999 spec = CODING_SYSTEM_SPEC (Qraw_text);
5000 raw_text_eol_type = AREF (spec, 2);
5001 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5002 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5003 : AREF (raw_text_eol_type, 2));
5004 }
5005
5006
5007 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5008 does, return one of the subsidiary that has the same eol-spec as
5009 PARENT. Otherwise, return CODING_SYSTEM. */
5010
5011 Lisp_Object
5012 coding_inherit_eol_type (coding_system, parent)
5013 Lisp_Object coding_system, parent;
5014 {
5015 Lisp_Object spec, eol_type;
5016
5017 if (NILP (coding_system))
5018 coding_system = Qraw_text;
5019 spec = CODING_SYSTEM_SPEC (coding_system);
5020 eol_type = AREF (spec, 2);
5021 if (VECTORP (eol_type)
5022 && ! NILP (parent))
5023 {
5024 Lisp_Object parent_spec;
5025 Lisp_Object parent_eol_type;
5026
5027 parent_spec
5028 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5029 parent_eol_type = AREF (parent_spec, 2);
5030 if (EQ (parent_eol_type, Qunix))
5031 coding_system = AREF (eol_type, 0);
5032 else if (EQ (parent_eol_type, Qdos))
5033 coding_system = AREF (eol_type, 1);
5034 else if (EQ (parent_eol_type, Qmac))
5035 coding_system = AREF (eol_type, 2);
5036 }
5037 return coding_system;
5038 }
5039
5040 /* Emacs has a mechanism to automatically detect a coding system if it
5041 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5042 it's impossible to distinguish some coding systems accurately
5043 because they use the same range of codes. So, at first, coding
5044 systems are categorized into 7, those are:
5045
5046 o coding-category-emacs-mule
5047
5048 The category for a coding system which has the same code range
5049 as Emacs' internal format. Assigned the coding-system (Lisp
5050 symbol) `emacs-mule' by default.
5051
5052 o coding-category-sjis
5053
5054 The category for a coding system which has the same code range
5055 as SJIS. Assigned the coding-system (Lisp
5056 symbol) `japanese-shift-jis' by default.
5057
5058 o coding-category-iso-7
5059
5060 The category for a coding system which has the same code range
5061 as ISO2022 of 7-bit environment. This doesn't use any locking
5062 shift and single shift functions. This can encode/decode all
5063 charsets. Assigned the coding-system (Lisp symbol)
5064 `iso-2022-7bit' by default.
5065
5066 o coding-category-iso-7-tight
5067
5068 Same as coding-category-iso-7 except that this can
5069 encode/decode only the specified charsets.
5070
5071 o coding-category-iso-8-1
5072
5073 The category for a coding system which has the same code range
5074 as ISO2022 of 8-bit environment and graphic plane 1 used only
5075 for DIMENSION1 charset. This doesn't use any locking shift
5076 and single shift functions. Assigned the coding-system (Lisp
5077 symbol) `iso-latin-1' by default.
5078
5079 o coding-category-iso-8-2
5080
5081 The category for a coding system which has the same code range
5082 as ISO2022 of 8-bit environment and graphic plane 1 used only
5083 for DIMENSION2 charset. This doesn't use any locking shift
5084 and single shift functions. Assigned the coding-system (Lisp
5085 symbol) `japanese-iso-8bit' by default.
5086
5087 o coding-category-iso-7-else
5088
5089 The category for a coding system which has the same code range
5090 as ISO2022 of 7-bit environemnt but uses locking shift or
5091 single shift functions. Assigned the coding-system (Lisp
5092 symbol) `iso-2022-7bit-lock' by default.
5093
5094 o coding-category-iso-8-else
5095
5096 The category for a coding system which has the same code range
5097 as ISO2022 of 8-bit environemnt but uses locking shift or
5098 single shift functions. Assigned the coding-system (Lisp
5099 symbol) `iso-2022-8bit-ss2' by default.
5100
5101 o coding-category-big5
5102
5103 The category for a coding system which has the same code range
5104 as BIG5. Assigned the coding-system (Lisp symbol)
5105 `cn-big5' by default.
5106
5107 o coding-category-utf-8
5108
5109 The category for a coding system which has the same code range
5110 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5111 symbol) `utf-8' by default.
5112
5113 o coding-category-utf-16-be
5114
5115 The category for a coding system in which a text has an
5116 Unicode signature (cf. Unicode Standard) in the order of BIG
5117 endian at the head. Assigned the coding-system (Lisp symbol)
5118 `utf-16-be' by default.
5119
5120 o coding-category-utf-16-le
5121
5122 The category for a coding system in which a text has an
5123 Unicode signature (cf. Unicode Standard) in the order of
5124 LITTLE endian at the head. Assigned the coding-system (Lisp
5125 symbol) `utf-16-le' by default.
5126
5127 o coding-category-ccl
5128
5129 The category for a coding system of which encoder/decoder is
5130 written in CCL programs. The default value is nil, i.e., no
5131 coding system is assigned.
5132
5133 o coding-category-binary
5134
5135 The category for a coding system not categorized in any of the
5136 above. Assigned the coding-system (Lisp symbol)
5137 `no-conversion' by default.
5138
5139 Each of them is a Lisp symbol and the value is an actual
5140 `coding-system's (this is also a Lisp symbol) assigned by a user.
5141 What Emacs does actually is to detect a category of coding system.
5142 Then, it uses a `coding-system' assigned to it. If Emacs can't
5143 decide only one possible category, it selects a category of the
5144 highest priority. Priorities of categories are also specified by a
5145 user in a Lisp variable `coding-category-list'.
5146
5147 */
5148
5149 #define EOL_SEEN_NONE 0
5150 #define EOL_SEEN_LF 1
5151 #define EOL_SEEN_CR 2
5152 #define EOL_SEEN_CRLF 4
5153
5154 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5155 SOURCE is encoded. If CATEGORY is one of
5156 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5157 two-byte, else they are encoded by one-byte.
5158
5159 Return one of EOL_SEEN_XXX. */
5160
5161 #define MAX_EOL_CHECK_COUNT 3
5162
5163 static int
5164 detect_eol (source, src_bytes, category)
5165 unsigned char *source;
5166 EMACS_INT src_bytes;
5167 enum coding_category category;
5168 {
5169 unsigned char *src = source, *src_end = src + src_bytes;
5170 unsigned char c;
5171 int total = 0;
5172 int eol_seen = EOL_SEEN_NONE;
5173
5174 if ((1 << category) & CATEGORY_MASK_UTF_16)
5175 {
5176 int msb, lsb;
5177
5178 msb = category == (coding_category_utf_16_le
5179 | coding_category_utf_16_le_nosig);
5180 lsb = 1 - msb;
5181
5182 while (src + 1 < src_end)
5183 {
5184 c = src[lsb];
5185 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5186 {
5187 int this_eol;
5188
5189 if (c == '\n')
5190 this_eol = EOL_SEEN_LF;
5191 else if (src + 3 >= src_end
5192 || src[msb + 2] != 0
5193 || src[lsb + 2] != '\n')
5194 this_eol = EOL_SEEN_CR;
5195 else
5196 this_eol = EOL_SEEN_CRLF;
5197
5198 if (eol_seen == EOL_SEEN_NONE)
5199 /* This is the first end-of-line. */
5200 eol_seen = this_eol;
5201 else if (eol_seen != this_eol)
5202 {
5203 /* The found type is different from what found before. */
5204 eol_seen = EOL_SEEN_LF;
5205 break;
5206 }
5207 if (++total == MAX_EOL_CHECK_COUNT)
5208 break;
5209 }
5210 src += 2;
5211 }
5212 }
5213 else
5214 {
5215 while (src < src_end)
5216 {
5217 c = *src++;
5218 if (c == '\n' || c == '\r')
5219 {
5220 int this_eol;
5221
5222 if (c == '\n')
5223 this_eol = EOL_SEEN_LF;
5224 else if (src >= src_end || *src != '\n')
5225 this_eol = EOL_SEEN_CR;
5226 else
5227 this_eol = EOL_SEEN_CRLF, src++;
5228
5229 if (eol_seen == EOL_SEEN_NONE)
5230 /* This is the first end-of-line. */
5231 eol_seen = this_eol;
5232 else if (eol_seen != this_eol)
5233 {
5234 /* The found type is different from what found before. */
5235 eol_seen = EOL_SEEN_LF;
5236 break;
5237 }
5238 if (++total == MAX_EOL_CHECK_COUNT)
5239 break;
5240 }
5241 }
5242 }
5243 return eol_seen;
5244 }
5245
5246
5247 static void
5248 adjust_coding_eol_type (coding, eol_seen)
5249 struct coding_system *coding;
5250 int eol_seen;
5251 {
5252 Lisp_Object eol_type;
5253
5254 eol_type = CODING_ID_EOL_TYPE (coding->id);
5255 if (eol_seen & EOL_SEEN_LF)
5256 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5257 else if (eol_seen & EOL_SEEN_CRLF)
5258 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5259 else if (eol_seen & EOL_SEEN_CR)
5260 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5261 }
5262
5263 /* Detect how a text specified in CODING is encoded. If a coding
5264 system is detected, update fields of CODING by the detected coding
5265 system. */
5266
5267 void
5268 detect_coding (coding)
5269 struct coding_system *coding;
5270 {
5271 const unsigned char *src, *src_end;
5272 Lisp_Object attrs, coding_type;
5273
5274 coding->consumed = coding->consumed_char = 0;
5275 coding->produced = coding->produced_char = 0;
5276 coding_set_source (coding);
5277
5278 src_end = coding->source + coding->src_bytes;
5279
5280 /* If we have not yet decided the text encoding type, detect it
5281 now. */
5282 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5283 {
5284 int c, i;
5285
5286 for (src = coding->source; src < src_end; src++)
5287 {
5288 c = *src;
5289 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5290 || c == ISO_CODE_SI
5291 || c == ISO_CODE_SO)))
5292 break;
5293 }
5294 coding->head_ascii = src - (coding->source + coding->consumed);
5295
5296 if (coding->head_ascii < coding->src_bytes)
5297 {
5298 struct coding_detection_info detect_info;
5299 enum coding_category category;
5300 struct coding_system *this;
5301
5302 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5303 for (i = 0; i < coding_category_raw_text; i++)
5304 {
5305 category = coding_priorities[i];
5306 this = coding_categories + category;
5307 if (this->id < 0)
5308 {
5309 /* No coding system of this category is defined. */
5310 detect_info.rejected |= (1 << category);
5311 }
5312 else if (category >= coding_category_raw_text)
5313 continue;
5314 else if (detect_info.checked & (1 << category))
5315 {
5316 if (detect_info.found & (1 << category))
5317 break;
5318 }
5319 else if ((*(this->detector)) (coding, &detect_info)
5320 && detect_info.found & (1 << category))
5321 break;
5322 }
5323 if (i < coding_category_raw_text)
5324 setup_coding_system (CODING_ID_NAME (this->id), coding);
5325 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5326 setup_coding_system (Qraw_text, coding);
5327 else if (detect_info.rejected)
5328 for (i = 0; i < coding_category_raw_text; i++)
5329 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5330 {
5331 this = coding_categories + coding_priorities[i];
5332 setup_coding_system (CODING_ID_NAME (this->id), coding);
5333 break;
5334 }
5335 }
5336 }
5337 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5338 {
5339 Lisp_Object coding_systems;
5340 struct coding_detection_info detect_info;
5341
5342 coding_systems
5343 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5344 detect_info.found = detect_info.rejected = 0;
5345 if (CONSP (coding_systems)
5346 && detect_coding_utf_16 (coding, &detect_info)
5347 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5348 | CATEGORY_MASK_UTF_16_BE)))
5349 {
5350 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5351 setup_coding_system (XCAR (coding_systems), coding);
5352 else
5353 setup_coding_system (XCDR (coding_systems), coding);
5354 }
5355 }
5356
5357 attrs = CODING_ID_ATTRS (coding->id);
5358 coding_type = CODING_ATTR_TYPE (attrs);
5359
5360 /* If we have not yet decided the EOL type, detect it now. But, the
5361 detection is impossible for a CCL based coding system, in which
5362 case, we detct the EOL type after decoding. */
5363 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5364 && ! EQ (coding_type, Qccl))
5365 {
5366 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5367 (enum coding_category) XINT (CODING_ATTR_CATEGORY (attrs)));
5368
5369 if (eol_seen != EOL_SEEN_NONE)
5370 adjust_coding_eol_type (coding, eol_seen);
5371 }
5372 }
5373
5374
5375 static void
5376 decode_eol (coding)
5377 struct coding_system *coding;
5378 {
5379 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5380 {
5381 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5382 unsigned char *pend = p + coding->produced;
5383 int eol_seen = EOL_SEEN_NONE;
5384
5385 for (; p < pend; p++)
5386 {
5387 if (*p == '\n')
5388 eol_seen |= EOL_SEEN_LF;
5389 else if (*p == '\r')
5390 {
5391 if (p + 1 < pend && *(p + 1) == '\n')
5392 {
5393 eol_seen |= EOL_SEEN_CRLF;
5394 p++;
5395 }
5396 else
5397 eol_seen |= EOL_SEEN_CR;
5398 }
5399 }
5400 if (eol_seen != EOL_SEEN_NONE)
5401 adjust_coding_eol_type (coding, eol_seen);
5402 }
5403
5404 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5405 {
5406 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5407 unsigned char *pend = p + coding->produced;
5408
5409 for (; p < pend; p++)
5410 if (*p == '\r')
5411 *p = '\n';
5412 }
5413 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5414 {
5415 unsigned char *p, *pbeg, *pend;
5416 Lisp_Object undo_list;
5417
5418 move_gap_both (coding->dst_pos + coding->produced_char,
5419 coding->dst_pos_byte + coding->produced);
5420 undo_list = current_buffer->undo_list;
5421 current_buffer->undo_list = Qt;
5422 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5423 current_buffer->undo_list = undo_list;
5424 pbeg = GPT_ADDR;
5425 pend = pbeg + coding->produced;
5426
5427 for (p = pend - 1; p >= pbeg; p--)
5428 if (*p == '\r')
5429 {
5430 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5431 pend--;
5432 }
5433 coding->produced_char -= coding->produced - (pend - pbeg);
5434 coding->produced = pend - pbeg;
5435 insert_from_gap (coding->produced_char, coding->produced);
5436 }
5437 }
5438
5439 static void
5440 translate_chars (coding, table)
5441 struct coding_system *coding;
5442 Lisp_Object table;
5443 {
5444 int *charbuf = coding->charbuf;
5445 int *charbuf_end = charbuf + coding->charbuf_used;
5446 int c;
5447
5448 if (coding->chars_at_source)
5449 return;
5450
5451 while (charbuf < charbuf_end)
5452 {
5453 c = *charbuf;
5454 if (c < 0)
5455 charbuf += c;
5456 else
5457 *charbuf++ = translate_char (table, c);
5458 }
5459 }
5460
5461 static int
5462 produce_chars (coding)
5463 struct coding_system *coding;
5464 {
5465 unsigned char *dst = coding->destination + coding->produced;
5466 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5467 int produced;
5468 int produced_chars = 0;
5469
5470 if (! coding->chars_at_source)
5471 {
5472 /* Characters are in coding->charbuf. */
5473 EMACS_INT *buf = coding->charbuf;
5474 EMACS_INT *buf_end = buf + coding->charbuf_used;
5475 unsigned char *adjusted_dst_end;
5476
5477 if (BUFFERP (coding->src_object)
5478 && EQ (coding->src_object, coding->dst_object))
5479 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5480 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5481
5482 while (buf < buf_end)
5483 {
5484 int c = *buf++;
5485
5486 if (dst >= adjusted_dst_end)
5487 {
5488 dst = alloc_destination (coding,
5489 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5490 dst);
5491 dst_end = coding->destination + coding->dst_bytes;
5492 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5493 }
5494 if (c >= 0)
5495 {
5496 if (coding->dst_multibyte
5497 || ! CHAR_BYTE8_P (c))
5498 CHAR_STRING_ADVANCE (c, dst);
5499 else
5500 *dst++ = CHAR_TO_BYTE8 (c);
5501 produced_chars++;
5502 }
5503 else
5504 /* This is an annotation datum. (-C) is the length of
5505 it. */
5506 buf += -c - 1;
5507 }
5508 }
5509 else
5510 {
5511 const unsigned char *src = coding->source;
5512 const unsigned char *src_end = src + coding->src_bytes;
5513 Lisp_Object eol_type;
5514
5515 eol_type = CODING_ID_EOL_TYPE (coding->id);
5516
5517 if (coding->src_multibyte != coding->dst_multibyte)
5518 {
5519 if (coding->src_multibyte)
5520 {
5521 int multibytep = 1;
5522 int consumed_chars;
5523
5524 while (1)
5525 {
5526 const unsigned char *src_base = src;
5527 int c;
5528
5529 ONE_MORE_BYTE (c);
5530 if (c == '\r')
5531 {
5532 if (EQ (eol_type, Qdos))
5533 {
5534 if (src == src_end)
5535 {
5536 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5537 goto no_more_source;
5538 }
5539 if (*src == '\n')
5540 c = *src++;
5541 }
5542 else if (EQ (eol_type, Qmac))
5543 c = '\n';
5544 }
5545 if (dst == dst_end)
5546 {
5547 coding->consumed = src - coding->source;
5548
5549 if (EQ (coding->src_object, coding->dst_object))
5550 dst_end = (unsigned char *) src;
5551 if (dst == dst_end)
5552 {
5553 dst = alloc_destination (coding, src_end - src + 1,
5554 dst);
5555 dst_end = coding->destination + coding->dst_bytes;
5556 coding_set_source (coding);
5557 src = coding->source + coding->consumed;
5558 src_end = coding->source + coding->src_bytes;
5559 }
5560 }
5561 *dst++ = c;
5562 produced_chars++;
5563 }
5564 no_more_source:
5565 ;
5566 }
5567 else
5568 while (src < src_end)
5569 {
5570 int multibytep = 1;
5571 int c = *src++;
5572
5573 if (c == '\r')
5574 {
5575 if (EQ (eol_type, Qdos))
5576 {
5577 if (src < src_end
5578 && *src == '\n')
5579 c = *src++;
5580 }
5581 else if (EQ (eol_type, Qmac))
5582 c = '\n';
5583 }
5584 if (dst >= dst_end - 1)
5585 {
5586 coding->consumed = src - coding->source;
5587
5588 if (EQ (coding->src_object, coding->dst_object))
5589 dst_end = (unsigned char *) src;
5590 if (dst >= dst_end - 1)
5591 {
5592 dst = alloc_destination (coding, src_end - src + 2,
5593 dst);
5594 dst_end = coding->destination + coding->dst_bytes;
5595 coding_set_source (coding);
5596 src = coding->source + coding->consumed;
5597 src_end = coding->source + coding->src_bytes;
5598 }
5599 }
5600 EMIT_ONE_BYTE (c);
5601 }
5602 }
5603 else
5604 {
5605 if (!EQ (coding->src_object, coding->dst_object))
5606 {
5607 int require = coding->src_bytes - coding->dst_bytes;
5608
5609 if (require > 0)
5610 {
5611 EMACS_INT offset = src - coding->source;
5612
5613 dst = alloc_destination (coding, require, dst);
5614 coding_set_source (coding);
5615 src = coding->source + offset;
5616 src_end = coding->source + coding->src_bytes;
5617 }
5618 }
5619 produced_chars = coding->src_chars;
5620 while (src < src_end)
5621 {
5622 int c = *src++;
5623
5624 if (c == '\r')
5625 {
5626 if (EQ (eol_type, Qdos))
5627 {
5628 if (src < src_end
5629 && *src == '\n')
5630 c = *src++;
5631 produced_chars--;
5632 }
5633 else if (EQ (eol_type, Qmac))
5634 c = '\n';
5635 }
5636 *dst++ = c;
5637 }
5638 }
5639 coding->consumed = coding->src_bytes;
5640 coding->consumed_char = coding->src_chars;
5641 }
5642
5643 produced = dst - (coding->destination + coding->produced);
5644 if (BUFFERP (coding->dst_object))
5645 insert_from_gap (produced_chars, produced);
5646 coding->produced += produced;
5647 coding->produced_char += produced_chars;
5648 return produced_chars;
5649 }
5650
5651 /* Compose text in CODING->object according to the annotation data at
5652 CHARBUF. CHARBUF is an array:
5653 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5654 */
5655
5656 static INLINE void
5657 produce_composition (coding, charbuf)
5658 struct coding_system *coding;
5659 int *charbuf;
5660 {
5661 int len;
5662 EMACS_INT from, to;
5663 enum composition_method method;
5664 Lisp_Object components;
5665
5666 len = -charbuf[0];
5667 from = coding->dst_pos + charbuf[2];
5668 to = coding->dst_pos + charbuf[3];
5669 method = (enum composition_method) (charbuf[4]);
5670
5671 if (method == COMPOSITION_RELATIVE)
5672 components = Qnil;
5673 else
5674 {
5675 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5676 int i;
5677
5678 len -= 5;
5679 charbuf += 5;
5680 for (i = 0; i < len; i++)
5681 args[i] = make_number (charbuf[i]);
5682 components = (method == COMPOSITION_WITH_ALTCHARS
5683 ? Fstring (len, args) : Fvector (len, args));
5684 }
5685 compose_text (from, to, components, Qnil, coding->dst_object);
5686 }
5687
5688
5689 /* Put `charset' property on text in CODING->object according to
5690 the annotation data at CHARBUF. CHARBUF is an array:
5691 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5692 */
5693
5694 static INLINE void
5695 produce_charset (coding, charbuf)
5696 struct coding_system *coding;
5697 int *charbuf;
5698 {
5699 EMACS_INT from = coding->dst_pos + charbuf[2];
5700 EMACS_INT to = coding->dst_pos + charbuf[3];
5701 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5702
5703 Fput_text_property (make_number (from), make_number (to),
5704 Qcharset, CHARSET_NAME (charset),
5705 coding->dst_object);
5706 }
5707
5708
5709 #define CHARBUF_SIZE 0x4000
5710
5711 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5712 do { \
5713 int size = CHARBUF_SIZE;; \
5714 \
5715 coding->charbuf = NULL; \
5716 while (size > 1024) \
5717 { \
5718 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5719 if (coding->charbuf) \
5720 break; \
5721 size >>= 1; \
5722 } \
5723 if (! coding->charbuf) \
5724 { \
5725 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5726 return coding->result; \
5727 } \
5728 coding->charbuf_size = size; \
5729 } while (0)
5730
5731
5732 static void
5733 produce_annotation (coding)
5734 struct coding_system *coding;
5735 {
5736 int *charbuf = coding->charbuf;
5737 int *charbuf_end = charbuf + coding->charbuf_used;
5738
5739 if (NILP (coding->dst_object))
5740 return;
5741
5742 while (charbuf < charbuf_end)
5743 {
5744 if (*charbuf >= 0)
5745 charbuf++;
5746 else
5747 {
5748 int len = -*charbuf;
5749 switch (charbuf[1])
5750 {
5751 case CODING_ANNOTATE_COMPOSITION_MASK:
5752 produce_composition (coding, charbuf);
5753 break;
5754 case CODING_ANNOTATE_CHARSET_MASK:
5755 produce_charset (coding, charbuf);
5756 break;
5757 default:
5758 abort ();
5759 }
5760 charbuf += len;
5761 }
5762 }
5763 }
5764
5765 /* Decode the data at CODING->src_object into CODING->dst_object.
5766 CODING->src_object is a buffer, a string, or nil.
5767 CODING->dst_object is a buffer.
5768
5769 If CODING->src_object is a buffer, it must be the current buffer.
5770 In this case, if CODING->src_pos is positive, it is a position of
5771 the source text in the buffer, otherwise, the source text is in the
5772 gap area of the buffer, and CODING->src_pos specifies the offset of
5773 the text from GPT (which must be the same as PT). If this is the
5774 same buffer as CODING->dst_object, CODING->src_pos must be
5775 negative.
5776
5777 If CODING->src_object is a string, CODING->src_pos in an index to
5778 that string.
5779
5780 If CODING->src_object is nil, CODING->source must already point to
5781 the non-relocatable memory area. In this case, CODING->src_pos is
5782 an offset from CODING->source.
5783
5784 The decoded data is inserted at the current point of the buffer
5785 CODING->dst_object.
5786 */
5787
5788 static int
5789 decode_coding (coding)
5790 struct coding_system *coding;
5791 {
5792 Lisp_Object attrs;
5793
5794 if (BUFFERP (coding->src_object)
5795 && coding->src_pos > 0
5796 && coding->src_pos < GPT
5797 && coding->src_pos + coding->src_chars > GPT)
5798 move_gap_both (coding->src_pos, coding->src_pos_byte);
5799
5800 if (BUFFERP (coding->dst_object))
5801 {
5802 if (current_buffer != XBUFFER (coding->dst_object))
5803 set_buffer_internal (XBUFFER (coding->dst_object));
5804 if (GPT != PT)
5805 move_gap_both (PT, PT_BYTE);
5806 }
5807
5808 coding->consumed = coding->consumed_char = 0;
5809 coding->produced = coding->produced_char = 0;
5810 coding->chars_at_source = 0;
5811 coding->result = CODING_RESULT_SUCCESS;
5812 coding->errors = 0;
5813
5814 ALLOC_CONVERSION_WORK_AREA (coding);
5815
5816 attrs = CODING_ID_ATTRS (coding->id);
5817
5818 do
5819 {
5820 coding_set_source (coding);
5821 coding->annotated = 0;
5822 (*(coding->decoder)) (coding);
5823 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5824 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5825 else if (!NILP (Vstandard_translation_table_for_decode))
5826 translate_chars (coding, Vstandard_translation_table_for_decode);
5827 coding_set_destination (coding);
5828 produce_chars (coding);
5829 if (coding->annotated)
5830 produce_annotation (coding);
5831 }
5832 while (coding->consumed < coding->src_bytes
5833 && ! coding->result);
5834
5835 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5836 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5837 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5838 decode_eol (coding);
5839
5840 coding->carryover_bytes = 0;
5841 if (coding->consumed < coding->src_bytes)
5842 {
5843 int nbytes = coding->src_bytes - coding->consumed;
5844 const unsigned char *src;
5845
5846 coding_set_source (coding);
5847 coding_set_destination (coding);
5848 src = coding->source + coding->consumed;
5849
5850 if (coding->mode & CODING_MODE_LAST_BLOCK)
5851 {
5852 /* Flush out unprocessed data as binary chars. We are sure
5853 that the number of data is less than the size of
5854 coding->charbuf. */
5855 while (nbytes-- > 0)
5856 {
5857 int c = *src++;
5858
5859 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5860 }
5861 produce_chars (coding);
5862 }
5863 else
5864 {
5865 /* Record unprocessed bytes in coding->carryover. We are
5866 sure that the number of data is less than the size of
5867 coding->carryover. */
5868 unsigned char *p = coding->carryover;
5869
5870 coding->carryover_bytes = nbytes;
5871 while (nbytes-- > 0)
5872 *p++ = *src++;
5873 }
5874 coding->consumed = coding->src_bytes;
5875 }
5876
5877 return coding->result;
5878 }
5879
5880
5881 /* Extract an annotation datum from a composition starting at POS and
5882 ending before LIMIT of CODING->src_object (buffer or string), store
5883 the data in BUF, set *STOP to a starting position of the next
5884 composition (if any) or to LIMIT, and return the address of the
5885 next element of BUF.
5886
5887 If such an annotation is not found, set *STOP to a starting
5888 position of a composition after POS (if any) or to LIMIT, and
5889 return BUF. */
5890
5891 static INLINE int *
5892 handle_composition_annotation (pos, limit, coding, buf, stop)
5893 EMACS_INT pos, limit;
5894 struct coding_system *coding;
5895 int *buf;
5896 EMACS_INT *stop;
5897 {
5898 EMACS_INT start, end;
5899 Lisp_Object prop;
5900
5901 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5902 || end > limit)
5903 *stop = limit;
5904 else if (start > pos)
5905 *stop = start;
5906 else
5907 {
5908 if (start == pos)
5909 {
5910 /* We found a composition. Store the corresponding
5911 annotation data in BUF. */
5912 int *head = buf;
5913 enum composition_method method = COMPOSITION_METHOD (prop);
5914 int nchars = COMPOSITION_LENGTH (prop);
5915
5916 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5917 if (method != COMPOSITION_RELATIVE)
5918 {
5919 Lisp_Object components;
5920 int len, i, i_byte;
5921
5922 components = COMPOSITION_COMPONENTS (prop);
5923 if (VECTORP (components))
5924 {
5925 len = XVECTOR (components)->size;
5926 for (i = 0; i < len; i++)
5927 *buf++ = XINT (AREF (components, i));
5928 }
5929 else if (STRINGP (components))
5930 {
5931 len = SCHARS (components);
5932 i = i_byte = 0;
5933 while (i < len)
5934 {
5935 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5936 buf++;
5937 }
5938 }
5939 else if (INTEGERP (components))
5940 {
5941 len = 1;
5942 *buf++ = XINT (components);
5943 }
5944 else if (CONSP (components))
5945 {
5946 for (len = 0; CONSP (components);
5947 len++, components = XCDR (components))
5948 *buf++ = XINT (XCAR (components));
5949 }
5950 else
5951 abort ();
5952 *head -= len;
5953 }
5954 }
5955
5956 if (find_composition (end, limit, &start, &end, &prop,
5957 coding->src_object)
5958 && end <= limit)
5959 *stop = start;
5960 else
5961 *stop = limit;
5962 }
5963 return buf;
5964 }
5965
5966
5967 /* Extract an annotation datum from a text property `charset' at POS of
5968 CODING->src_object (buffer of string), store the data in BUF, set
5969 *STOP to the position where the value of `charset' property changes
5970 (limiting by LIMIT), and return the address of the next element of
5971 BUF.
5972
5973 If the property value is nil, set *STOP to the position where the
5974 property value is non-nil (limiting by LIMIT), and return BUF. */
5975
5976 static INLINE int *
5977 handle_charset_annotation (pos, limit, coding, buf, stop)
5978 EMACS_INT pos, limit;
5979 struct coding_system *coding;
5980 int *buf;
5981 EMACS_INT *stop;
5982 {
5983 Lisp_Object val, next;
5984 int id;
5985
5986 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5987 if (! NILP (val) && CHARSETP (val))
5988 id = XINT (CHARSET_SYMBOL_ID (val));
5989 else
5990 id = -1;
5991 ADD_CHARSET_DATA (buf, 0, 0, id);
5992 next = Fnext_single_property_change (make_number (pos), Qcharset,
5993 coding->src_object,
5994 make_number (limit));
5995 *stop = XINT (next);
5996 return buf;
5997 }
5998
5999
6000 static void
6001 consume_chars (coding)
6002 struct coding_system *coding;
6003 {
6004 int *buf = coding->charbuf;
6005 int *buf_end = coding->charbuf + coding->charbuf_size;
6006 const unsigned char *src = coding->source + coding->consumed;
6007 const unsigned char *src_end = coding->source + coding->src_bytes;
6008 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6009 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6010 int multibytep = coding->src_multibyte;
6011 Lisp_Object eol_type;
6012 int c;
6013 EMACS_INT stop, stop_composition, stop_charset;
6014
6015 eol_type = CODING_ID_EOL_TYPE (coding->id);
6016 if (VECTORP (eol_type))
6017 eol_type = Qunix;
6018
6019 /* Note: composition handling is not yet implemented. */
6020 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6021
6022 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6023 stop = stop_composition = pos;
6024 else
6025 stop = stop_composition = end_pos;
6026 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6027 stop = stop_charset = pos;
6028 else
6029 stop_charset = end_pos;
6030
6031 /* Compensate for CRLF and annotation. */
6032 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6033 while (buf < buf_end)
6034 {
6035 if (pos == stop)
6036 {
6037 if (pos == end_pos)
6038 break;
6039 if (pos == stop_composition)
6040 buf = handle_composition_annotation (pos, end_pos, coding,
6041 buf, &stop_composition);
6042 if (pos == stop_charset)
6043 buf = handle_charset_annotation (pos, end_pos, coding,
6044 buf, &stop_charset);
6045 stop = (stop_composition < stop_charset
6046 ? stop_composition : stop_charset);
6047 }
6048
6049 if (! multibytep)
6050 {
6051 EMACS_INT bytes;
6052
6053 if (! CODING_FOR_UNIBYTE (coding)
6054 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6055 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6056 else
6057 c = *src++, pos++;
6058 }
6059 else
6060 c = STRING_CHAR_ADVANCE (src), pos++;
6061 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6062 c = '\n';
6063 if (! EQ (eol_type, Qunix))
6064 {
6065 if (c == '\n')
6066 {
6067 if (EQ (eol_type, Qdos))
6068 *buf++ = '\r';
6069 else
6070 c = '\r';
6071 }
6072 }
6073 *buf++ = c;
6074 }
6075
6076 coding->consumed = src - coding->source;
6077 coding->consumed_char = pos - coding->src_pos;
6078 coding->charbuf_used = buf - coding->charbuf;
6079 coding->chars_at_source = 0;
6080 }
6081
6082
6083 /* Encode the text at CODING->src_object into CODING->dst_object.
6084 CODING->src_object is a buffer or a string.
6085 CODING->dst_object is a buffer or nil.
6086
6087 If CODING->src_object is a buffer, it must be the current buffer.
6088 In this case, if CODING->src_pos is positive, it is a position of
6089 the source text in the buffer, otherwise. the source text is in the
6090 gap area of the buffer, and coding->src_pos specifies the offset of
6091 the text from GPT (which must be the same as PT). If this is the
6092 same buffer as CODING->dst_object, CODING->src_pos must be
6093 negative and CODING should not have `pre-write-conversion'.
6094
6095 If CODING->src_object is a string, CODING should not have
6096 `pre-write-conversion'.
6097
6098 If CODING->dst_object is a buffer, the encoded data is inserted at
6099 the current point of that buffer.
6100
6101 If CODING->dst_object is nil, the encoded data is placed at the
6102 memory area specified by CODING->destination. */
6103
6104 static int
6105 encode_coding (coding)
6106 struct coding_system *coding;
6107 {
6108 Lisp_Object attrs;
6109
6110 attrs = CODING_ID_ATTRS (coding->id);
6111
6112 if (BUFFERP (coding->dst_object))
6113 {
6114 set_buffer_internal (XBUFFER (coding->dst_object));
6115 coding->dst_multibyte
6116 = ! NILP (current_buffer->enable_multibyte_characters);
6117 }
6118
6119 coding->consumed = coding->consumed_char = 0;
6120 coding->produced = coding->produced_char = 0;
6121 coding->result = CODING_RESULT_SUCCESS;
6122 coding->errors = 0;
6123
6124 ALLOC_CONVERSION_WORK_AREA (coding);
6125
6126 do {
6127 coding_set_source (coding);
6128 consume_chars (coding);
6129
6130 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6131 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6132 else if (!NILP (Vstandard_translation_table_for_encode))
6133 translate_chars (coding, Vstandard_translation_table_for_encode);
6134
6135 coding_set_destination (coding);
6136 (*(coding->encoder)) (coding);
6137 } while (coding->consumed_char < coding->src_chars);
6138
6139 if (BUFFERP (coding->dst_object))
6140 insert_from_gap (coding->produced_char, coding->produced);
6141
6142 return (coding->result);
6143 }
6144
6145
6146 /* Stack of working buffers used in code conversion. An nil element
6147 means that the code conversion of that level is not using a working
6148 buffer. */
6149 Lisp_Object Vcode_conversion_work_buf_list;
6150
6151 /* A working buffer used by the top level conversion. */
6152 Lisp_Object Vcode_conversion_reused_work_buf;
6153
6154
6155 /* Return a working buffer that can be freely used by the following
6156 code conversion. MULTIBYTEP specifies the multibyteness of the
6157 buffer. */
6158
6159 Lisp_Object
6160 make_conversion_work_buffer (multibytep, depth)
6161 int multibytep, depth;
6162 {
6163 struct buffer *current = current_buffer;
6164 Lisp_Object buf, name;
6165
6166 if (depth == 0)
6167 {
6168 if (NILP (Vcode_conversion_reused_work_buf))
6169 Vcode_conversion_reused_work_buf
6170 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6171 buf = Vcode_conversion_reused_work_buf;
6172 }
6173 else
6174 {
6175 if (depth < 0)
6176 {
6177 name = build_string (" *code-converting-work*");
6178 name = Fgenerate_new_buffer_name (name, Qnil);
6179 }
6180 else
6181 {
6182 char str[128];
6183
6184 sprintf (str, " *code-converting-work*<%d>", depth);
6185 name = build_string (str);
6186 }
6187 buf = Fget_buffer_create (name);
6188 }
6189 set_buffer_internal (XBUFFER (buf));
6190 current_buffer->undo_list = Qt;
6191 Ferase_buffer ();
6192 Fset_buffer_multibyte (multibytep ? Qt : Qnil);
6193 set_buffer_internal (current);
6194 return buf;
6195 }
6196
6197 static Lisp_Object
6198 code_conversion_restore (buffer)
6199 Lisp_Object buffer;
6200 {
6201 Lisp_Object workbuf;
6202
6203 workbuf = XCAR (Vcode_conversion_work_buf_list);
6204 if (! NILP (workbuf)
6205 && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6206 && ! NILP (Fbuffer_live_p (workbuf)))
6207 Fkill_buffer (workbuf);
6208 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6209 set_buffer_internal (XBUFFER (buffer));
6210 return Qnil;
6211 }
6212
6213 static Lisp_Object
6214 code_conversion_save (buffer, with_work_buf, multibyte)
6215 Lisp_Object buffer;
6216 int with_work_buf, multibyte;
6217 {
6218 Lisp_Object workbuf;
6219
6220 if (with_work_buf)
6221 {
6222 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6223
6224 workbuf = make_conversion_work_buffer (multibyte, depth);
6225 }
6226 else
6227 workbuf = Qnil;
6228 Vcode_conversion_work_buf_list
6229 = Fcons (workbuf, Vcode_conversion_work_buf_list);
6230 record_unwind_protect (code_conversion_restore, buffer);
6231 return workbuf;
6232 }
6233
6234 int
6235 decode_coding_gap (coding, chars, bytes)
6236 struct coding_system *coding;
6237 EMACS_INT chars, bytes;
6238 {
6239 int count = specpdl_ptr - specpdl;
6240 Lisp_Object attrs;
6241 Lisp_Object buffer;
6242
6243 buffer = Fcurrent_buffer ();
6244 code_conversion_save (buffer, 0, 0);
6245
6246 coding->src_object = buffer;
6247 coding->src_chars = chars;
6248 coding->src_bytes = bytes;
6249 coding->src_pos = -chars;
6250 coding->src_pos_byte = -bytes;
6251 coding->src_multibyte = chars < bytes;
6252 coding->dst_object = buffer;
6253 coding->dst_pos = PT;
6254 coding->dst_pos_byte = PT_BYTE;
6255 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6256 coding->mode |= CODING_MODE_LAST_BLOCK;
6257
6258 if (CODING_REQUIRE_DETECTION (coding))
6259 detect_coding (coding);
6260
6261 decode_coding (coding);
6262
6263 attrs = CODING_ID_ATTRS (coding->id);
6264 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6265 {
6266 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6267 Lisp_Object val;
6268
6269 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6270 val = call1 (CODING_ATTR_POST_READ (attrs),
6271 make_number (coding->produced_char));
6272 CHECK_NATNUM (val);
6273 coding->produced_char += Z - prev_Z;
6274 coding->produced += Z_BYTE - prev_Z_BYTE;
6275 }
6276
6277 unbind_to (count, Qnil);
6278 return coding->result;
6279 }
6280
6281 int
6282 encode_coding_gap (coding, chars, bytes)
6283 struct coding_system *coding;
6284 EMACS_INT chars, bytes;
6285 {
6286 int count = specpdl_ptr - specpdl;
6287 Lisp_Object buffer;
6288
6289 buffer = Fcurrent_buffer ();
6290 code_conversion_save (buffer, 0, 0);
6291
6292 coding->src_object = buffer;
6293 coding->src_chars = chars;
6294 coding->src_bytes = bytes;
6295 coding->src_pos = -chars;
6296 coding->src_pos_byte = -bytes;
6297 coding->src_multibyte = chars < bytes;
6298 coding->dst_object = coding->src_object;
6299 coding->dst_pos = PT;
6300 coding->dst_pos_byte = PT_BYTE;
6301
6302 encode_coding (coding);
6303
6304 unbind_to (count, Qnil);
6305 return coding->result;
6306 }
6307
6308
6309 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6310 SRC_OBJECT into DST_OBJECT by coding context CODING.
6311
6312 SRC_OBJECT is a buffer, a string, or Qnil.
6313
6314 If it is a buffer, the text is at point of the buffer. FROM and TO
6315 are positions in the buffer.
6316
6317 If it is a string, the text is at the beginning of the string.
6318 FROM and TO are indices to the string.
6319
6320 If it is nil, the text is at coding->source. FROM and TO are
6321 indices to coding->source.
6322
6323 DST_OBJECT is a buffer, Qt, or Qnil.
6324
6325 If it is a buffer, the decoded text is inserted at point of the
6326 buffer. If the buffer is the same as SRC_OBJECT, the source text
6327 is deleted.
6328
6329 If it is Qt, a string is made from the decoded text, and
6330 set in CODING->dst_object.
6331
6332 If it is Qnil, the decoded text is stored at CODING->destination.
6333 The caller must allocate CODING->dst_bytes bytes at
6334 CODING->destination by xmalloc. If the decoded text is longer than
6335 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6336 */
6337
6338 void
6339 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6340 dst_object)
6341 struct coding_system *coding;
6342 Lisp_Object src_object;
6343 EMACS_INT from, from_byte, to, to_byte;
6344 Lisp_Object dst_object;
6345 {
6346 int count = specpdl_ptr - specpdl;
6347 unsigned char *destination;
6348 EMACS_INT dst_bytes;
6349 EMACS_INT chars = to - from;
6350 EMACS_INT bytes = to_byte - from_byte;
6351 Lisp_Object attrs;
6352 Lisp_Object buffer;
6353 int saved_pt = -1, saved_pt_byte;
6354
6355 buffer = Fcurrent_buffer ();
6356
6357 if (NILP (dst_object))
6358 {
6359 destination = coding->destination;
6360 dst_bytes = coding->dst_bytes;
6361 }
6362
6363 coding->src_object = src_object;
6364 coding->src_chars = chars;
6365 coding->src_bytes = bytes;
6366 coding->src_multibyte = chars < bytes;
6367
6368 if (STRINGP (src_object))
6369 {
6370 coding->src_pos = from;
6371 coding->src_pos_byte = from_byte;
6372 }
6373 else if (BUFFERP (src_object))
6374 {
6375 set_buffer_internal (XBUFFER (src_object));
6376 if (from != GPT)
6377 move_gap_both (from, from_byte);
6378 if (EQ (src_object, dst_object))
6379 {
6380 saved_pt = PT, saved_pt_byte = PT_BYTE;
6381 TEMP_SET_PT_BOTH (from, from_byte);
6382 del_range_both (from, from_byte, to, to_byte, 1);
6383 coding->src_pos = -chars;
6384 coding->src_pos_byte = -bytes;
6385 }
6386 else
6387 {
6388 coding->src_pos = from;
6389 coding->src_pos_byte = from_byte;
6390 }
6391 }
6392
6393 if (CODING_REQUIRE_DETECTION (coding))
6394 detect_coding (coding);
6395 attrs = CODING_ID_ATTRS (coding->id);
6396
6397 if (EQ (dst_object, Qt)
6398 || (! NILP (CODING_ATTR_POST_READ (attrs))
6399 && NILP (dst_object)))
6400 {
6401 coding->dst_object = code_conversion_save (buffer, 1, 1);
6402 coding->dst_pos = BEG;
6403 coding->dst_pos_byte = BEG_BYTE;
6404 coding->dst_multibyte = 1;
6405 }
6406 else if (BUFFERP (dst_object))
6407 {
6408 code_conversion_save (buffer, 0, 0);
6409 coding->dst_object = dst_object;
6410 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6411 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6412 coding->dst_multibyte
6413 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6414 }
6415 else
6416 {
6417 code_conversion_save (buffer, 0, 0);
6418 coding->dst_object = Qnil;
6419 coding->dst_multibyte = 1;
6420 }
6421
6422 decode_coding (coding);
6423
6424 if (BUFFERP (coding->dst_object))
6425 set_buffer_internal (XBUFFER (coding->dst_object));
6426
6427 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6428 {
6429 struct gcpro gcpro1, gcpro2;
6430 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6431 Lisp_Object val;
6432
6433 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6434 GCPRO2 (coding->src_object, coding->dst_object);
6435 val = call1 (CODING_ATTR_POST_READ (attrs),
6436 make_number (coding->produced_char));
6437 UNGCPRO;
6438 CHECK_NATNUM (val);
6439 coding->produced_char += Z - prev_Z;
6440 coding->produced += Z_BYTE - prev_Z_BYTE;
6441 }
6442
6443 if (EQ (dst_object, Qt))
6444 {
6445 coding->dst_object = Fbuffer_string ();
6446 }
6447 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6448 {
6449 set_buffer_internal (XBUFFER (coding->dst_object));
6450 if (dst_bytes < coding->produced)
6451 {
6452 destination
6453 = (unsigned char *) xrealloc (destination, coding->produced);
6454 if (! destination)
6455 {
6456 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6457 unbind_to (count, Qnil);
6458 return;
6459 }
6460 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6461 move_gap_both (BEGV, BEGV_BYTE);
6462 bcopy (BEGV_ADDR, destination, coding->produced);
6463 coding->destination = destination;
6464 }
6465 }
6466
6467 if (saved_pt >= 0)
6468 {
6469 /* This is the case of:
6470 (BUFFERP (src_object) && EQ (src_object, dst_object))
6471 As we have moved PT while replacing the original buffer
6472 contents, we must recover it now. */
6473 set_buffer_internal (XBUFFER (src_object));
6474 if (saved_pt < from)
6475 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6476 else if (saved_pt < from + chars)
6477 TEMP_SET_PT_BOTH (from, from_byte);
6478 else if (! NILP (current_buffer->enable_multibyte_characters))
6479 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6480 saved_pt_byte + (coding->produced - bytes));
6481 else
6482 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6483 saved_pt_byte + (coding->produced - bytes));
6484 }
6485
6486 unbind_to (count, Qnil);
6487 }
6488
6489
6490 void
6491 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6492 dst_object)
6493 struct coding_system *coding;
6494 Lisp_Object src_object;
6495 EMACS_INT from, from_byte, to, to_byte;
6496 Lisp_Object dst_object;
6497 {
6498 int count = specpdl_ptr - specpdl;
6499 EMACS_INT chars = to - from;
6500 EMACS_INT bytes = to_byte - from_byte;
6501 Lisp_Object attrs;
6502 Lisp_Object buffer;
6503 int saved_pt = -1, saved_pt_byte;
6504
6505 buffer = Fcurrent_buffer ();
6506
6507 coding->src_object = src_object;
6508 coding->src_chars = chars;
6509 coding->src_bytes = bytes;
6510 coding->src_multibyte = chars < bytes;
6511
6512 attrs = CODING_ID_ATTRS (coding->id);
6513
6514 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6515 {
6516 coding->src_object = code_conversion_save (buffer, 1,
6517 coding->src_multibyte);
6518 set_buffer_internal (XBUFFER (coding->src_object));
6519 if (STRINGP (src_object))
6520 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6521 else if (BUFFERP (src_object))
6522 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6523 else
6524 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6525
6526 if (EQ (src_object, dst_object))
6527 {
6528 set_buffer_internal (XBUFFER (src_object));
6529 saved_pt = PT, saved_pt_byte = PT_BYTE;
6530 del_range_both (from, from_byte, to, to_byte, 1);
6531 set_buffer_internal (XBUFFER (coding->src_object));
6532 }
6533
6534 call2 (CODING_ATTR_PRE_WRITE (attrs),
6535 make_number (BEG), make_number (Z));
6536 coding->src_object = Fcurrent_buffer ();
6537 if (BEG != GPT)
6538 move_gap_both (BEG, BEG_BYTE);
6539 coding->src_chars = Z - BEG;
6540 coding->src_bytes = Z_BYTE - BEG_BYTE;
6541 coding->src_pos = BEG;
6542 coding->src_pos_byte = BEG_BYTE;
6543 coding->src_multibyte = Z < Z_BYTE;
6544 }
6545 else if (STRINGP (src_object))
6546 {
6547 code_conversion_save (buffer, 0, 0);
6548 coding->src_pos = from;
6549 coding->src_pos_byte = from_byte;
6550 }
6551 else if (BUFFERP (src_object))
6552 {
6553 code_conversion_save (buffer, 0, 0);
6554 set_buffer_internal (XBUFFER (src_object));
6555 if (EQ (src_object, dst_object))
6556 {
6557 saved_pt = PT, saved_pt_byte = PT_BYTE;
6558 coding->src_object = del_range_1 (from, to, 1, 1);
6559 coding->src_pos = 0;
6560 coding->src_pos_byte = 0;
6561 }
6562 else
6563 {
6564 if (from < GPT && to >= GPT)
6565 move_gap_both (from, from_byte);
6566 coding->src_pos = from;
6567 coding->src_pos_byte = from_byte;
6568 }
6569 }
6570 else
6571 code_conversion_save (buffer, 0, 0);
6572
6573 if (BUFFERP (dst_object))
6574 {
6575 coding->dst_object = dst_object;
6576 if (EQ (src_object, dst_object))
6577 {
6578 coding->dst_pos = from;
6579 coding->dst_pos_byte = from_byte;
6580 }
6581 else
6582 {
6583 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6584 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6585 }
6586 coding->dst_multibyte
6587 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6588 }
6589 else if (EQ (dst_object, Qt))
6590 {
6591 coding->dst_object = Qnil;
6592 coding->dst_bytes = coding->src_chars;
6593 if (coding->dst_bytes == 0)
6594 coding->dst_bytes = 1;
6595 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6596 coding->dst_multibyte = 0;
6597 }
6598 else
6599 {
6600 coding->dst_object = Qnil;
6601 coding->dst_multibyte = 0;
6602 }
6603
6604 encode_coding (coding);
6605
6606 if (EQ (dst_object, Qt))
6607 {
6608 if (BUFFERP (coding->dst_object))
6609 coding->dst_object = Fbuffer_string ();
6610 else
6611 {
6612 coding->dst_object
6613 = make_unibyte_string ((char *) coding->destination,
6614 coding->produced);
6615 xfree (coding->destination);
6616 }
6617 }
6618
6619 if (saved_pt >= 0)
6620 {
6621 /* This is the case of:
6622 (BUFFERP (src_object) && EQ (src_object, dst_object))
6623 As we have moved PT while replacing the original buffer
6624 contents, we must recover it now. */
6625 set_buffer_internal (XBUFFER (src_object));
6626 if (saved_pt < from)
6627 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6628 else if (saved_pt < from + chars)
6629 TEMP_SET_PT_BOTH (from, from_byte);
6630 else if (! NILP (current_buffer->enable_multibyte_characters))
6631 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6632 saved_pt_byte + (coding->produced - bytes));
6633 else
6634 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6635 saved_pt_byte + (coding->produced - bytes));
6636 }
6637
6638 unbind_to (count, Qnil);
6639 }
6640
6641
6642 Lisp_Object
6643 preferred_coding_system ()
6644 {
6645 int id = coding_categories[coding_priorities[0]].id;
6646
6647 return CODING_ID_NAME (id);
6648 }
6649
6650 \f
6651 #ifdef emacs
6652 /*** 8. Emacs Lisp library functions ***/
6653
6654 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6655 doc: /* Return t if OBJECT is nil or a coding-system.
6656 See the documentation of `define-coding-system' for information
6657 about coding-system objects. */)
6658 (obj)
6659 Lisp_Object obj;
6660 {
6661 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6662 }
6663
6664 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6665 Sread_non_nil_coding_system, 1, 1, 0,
6666 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6667 (prompt)
6668 Lisp_Object prompt;
6669 {
6670 Lisp_Object val;
6671 do
6672 {
6673 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6674 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6675 }
6676 while (SCHARS (val) == 0);
6677 return (Fintern (val, Qnil));
6678 }
6679
6680 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6681 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6682 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6683 (prompt, default_coding_system)
6684 Lisp_Object prompt, default_coding_system;
6685 {
6686 Lisp_Object val;
6687 if (SYMBOLP (default_coding_system))
6688 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6689 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6690 Qt, Qnil, Qcoding_system_history,
6691 default_coding_system, Qnil);
6692 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6693 }
6694
6695 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6696 1, 1, 0,
6697 doc: /* Check validity of CODING-SYSTEM.
6698 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6699 (coding_system)
6700 Lisp_Object coding_system;
6701 {
6702 CHECK_SYMBOL (coding_system);
6703 if (!NILP (Fcoding_system_p (coding_system)))
6704 return coding_system;
6705 while (1)
6706 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6707 }
6708
6709 \f
6710 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6711 HIGHEST is nonzero, return the coding system of the highest
6712 priority among the detected coding systems. Otherwize return a
6713 list of detected coding systems sorted by their priorities. If
6714 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6715 multibyte form but contains only ASCII and eight-bit chars.
6716 Otherwise, the bytes are raw bytes.
6717
6718 CODING-SYSTEM controls the detection as below:
6719
6720 If it is nil, detect both text-format and eol-format. If the
6721 text-format part of CODING-SYSTEM is already specified
6722 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6723 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6724 detect only text-format. */
6725
6726 Lisp_Object
6727 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6728 const unsigned char *src;
6729 int src_bytes, highest;
6730 int multibytep;
6731 Lisp_Object coding_system;
6732 {
6733 const unsigned char *src_end = src + src_bytes;
6734 Lisp_Object attrs, eol_type;
6735 Lisp_Object val;
6736 struct coding_system coding;
6737 int id;
6738 struct coding_detection_info detect_info;
6739
6740 if (NILP (coding_system))
6741 coding_system = Qundecided;
6742 setup_coding_system (coding_system, &coding);
6743 attrs = CODING_ID_ATTRS (coding.id);
6744 eol_type = CODING_ID_EOL_TYPE (coding.id);
6745 coding_system = CODING_ATTR_BASE_NAME (attrs);
6746
6747 coding.source = src;
6748 coding.src_bytes = src_bytes;
6749 coding.src_multibyte = multibytep;
6750 coding.consumed = 0;
6751 coding.mode |= CODING_MODE_LAST_BLOCK;
6752
6753 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6754
6755 /* At first, detect text-format if necessary. */
6756 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6757 {
6758 enum coding_category category;
6759 struct coding_system *this;
6760 int c, i;
6761
6762 for (; src < src_end; src++)
6763 {
6764 c = *src;
6765 if (c & 0x80
6766 || (c < 0x20 && (c == ISO_CODE_ESC
6767 || c == ISO_CODE_SI
6768 || c == ISO_CODE_SO)))
6769 break;
6770 }
6771 coding.head_ascii = src - coding.source;
6772
6773 if (src < src_end)
6774 for (i = 0; i < coding_category_raw_text; i++)
6775 {
6776 category = coding_priorities[i];
6777 this = coding_categories + category;
6778
6779 if (this->id < 0)
6780 {
6781 /* No coding system of this category is defined. */
6782 detect_info.rejected |= (1 << category);
6783 }
6784 else if (category >= coding_category_raw_text)
6785 continue;
6786 else if (detect_info.checked & (1 << category))
6787 {
6788 if (highest
6789 && (detect_info.found & (1 << category)))
6790 break;
6791 }
6792 else
6793 {
6794 if ((*(this->detector)) (&coding, &detect_info)
6795 && highest
6796 && (detect_info.found & (1 << category)))
6797 break;
6798 }
6799 }
6800
6801
6802 if (detect_info.rejected == CATEGORY_MASK_ANY)
6803 {
6804 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6805 id = coding_categories[coding_category_raw_text].id;
6806 val = Fcons (make_number (id), Qnil);
6807 }
6808 else if (! detect_info.rejected && ! detect_info.found)
6809 {
6810 detect_info.found = CATEGORY_MASK_ANY;
6811 id = coding_categories[coding_category_undecided].id;
6812 val = Fcons (make_number (id), Qnil);
6813 }
6814 else if (highest)
6815 {
6816 if (detect_info.found)
6817 {
6818 detect_info.found = 1 << category;
6819 val = Fcons (make_number (this->id), Qnil);
6820 }
6821 else
6822 for (i = 0; i < coding_category_raw_text; i++)
6823 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6824 {
6825 detect_info.found = 1 << coding_priorities[i];
6826 id = coding_categories[coding_priorities[i]].id;
6827 val = Fcons (make_number (id), Qnil);
6828 break;
6829 }
6830 }
6831 else
6832 {
6833 int mask = detect_info.rejected | detect_info.found;
6834 int found = 0;
6835 val = Qnil;
6836
6837 for (i = coding_category_raw_text - 1; i >= 0; i--)
6838 {
6839 category = coding_priorities[i];
6840 if (! (mask & (1 << category)))
6841 {
6842 found |= 1 << category;
6843 id = coding_categories[category].id;
6844 val = Fcons (make_number (id), val);
6845 }
6846 }
6847 for (i = coding_category_raw_text - 1; i >= 0; i--)
6848 {
6849 category = coding_priorities[i];
6850 if (detect_info.found & (1 << category))
6851 {
6852 id = coding_categories[category].id;
6853 val = Fcons (make_number (id), val);
6854 }
6855 }
6856 detect_info.found |= found;
6857 }
6858 }
6859 else
6860 {
6861 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6862 val = Fcons (make_number (coding.id), Qnil);
6863 }
6864
6865 /* Then, detect eol-format if necessary. */
6866 {
6867 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6868 Lisp_Object tail;
6869
6870 if (VECTORP (eol_type))
6871 {
6872 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6873 normal_eol = detect_eol (coding.source, src_bytes,
6874 coding_category_raw_text);
6875 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6876 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6877 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6878 coding_category_utf_16_be);
6879 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6880 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6881 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6882 coding_category_utf_16_le);
6883 }
6884 else
6885 {
6886 if (EQ (eol_type, Qunix))
6887 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6888 else if (EQ (eol_type, Qdos))
6889 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6890 else
6891 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6892 }
6893
6894 for (tail = val; CONSP (tail); tail = XCDR (tail))
6895 {
6896 enum coding_category category;
6897 int this_eol;
6898
6899 id = XINT (XCAR (tail));
6900 attrs = CODING_ID_ATTRS (id);
6901 category = XINT (CODING_ATTR_CATEGORY (attrs));
6902 eol_type = CODING_ID_EOL_TYPE (id);
6903 if (VECTORP (eol_type))
6904 {
6905 if (category == coding_category_utf_16_be
6906 || category == coding_category_utf_16_be_nosig)
6907 this_eol = utf_16_be_eol;
6908 else if (category == coding_category_utf_16_le
6909 || category == coding_category_utf_16_le_nosig)
6910 this_eol = utf_16_le_eol;
6911 else
6912 this_eol = normal_eol;
6913
6914 if (this_eol == EOL_SEEN_LF)
6915 XSETCAR (tail, AREF (eol_type, 0));
6916 else if (this_eol == EOL_SEEN_CRLF)
6917 XSETCAR (tail, AREF (eol_type, 1));
6918 else if (this_eol == EOL_SEEN_CR)
6919 XSETCAR (tail, AREF (eol_type, 2));
6920 else
6921 XSETCAR (tail, CODING_ID_NAME (id));
6922 }
6923 else
6924 XSETCAR (tail, CODING_ID_NAME (id));
6925 }
6926 }
6927
6928 return (highest ? XCAR (val) : val);
6929 }
6930
6931
6932 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6933 2, 3, 0,
6934 doc: /* Detect coding system of the text in the region between START and END.
6935 Return a list of possible coding systems ordered by priority.
6936
6937 If only ASCII characters are found, it returns a list of single element
6938 `undecided' or its subsidiary coding system according to a detected
6939 end-of-line format.
6940
6941 If optional argument HIGHEST is non-nil, return the coding system of
6942 highest priority. */)
6943 (start, end, highest)
6944 Lisp_Object start, end, highest;
6945 {
6946 int from, to;
6947 int from_byte, to_byte;
6948
6949 CHECK_NUMBER_COERCE_MARKER (start);
6950 CHECK_NUMBER_COERCE_MARKER (end);
6951
6952 validate_region (&start, &end);
6953 from = XINT (start), to = XINT (end);
6954 from_byte = CHAR_TO_BYTE (from);
6955 to_byte = CHAR_TO_BYTE (to);
6956
6957 if (from < GPT && to >= GPT)
6958 move_gap_both (to, to_byte);
6959
6960 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6961 to_byte - from_byte,
6962 !NILP (highest),
6963 !NILP (current_buffer
6964 ->enable_multibyte_characters),
6965 Qnil);
6966 }
6967
6968 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6969 1, 2, 0,
6970 doc: /* Detect coding system of the text in STRING.
6971 Return a list of possible coding systems ordered by priority.
6972
6973 If only ASCII characters are found, it returns a list of single element
6974 `undecided' or its subsidiary coding system according to a detected
6975 end-of-line format.
6976
6977 If optional argument HIGHEST is non-nil, return the coding system of
6978 highest priority. */)
6979 (string, highest)
6980 Lisp_Object string, highest;
6981 {
6982 CHECK_STRING (string);
6983
6984 return detect_coding_system (SDATA (string), SBYTES (string),
6985 !NILP (highest), STRING_MULTIBYTE (string),
6986 Qnil);
6987 }
6988
6989
6990 static INLINE int
6991 char_encodable_p (c, attrs)
6992 int c;
6993 Lisp_Object attrs;
6994 {
6995 Lisp_Object tail;
6996 struct charset *charset;
6997
6998 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6999 CONSP (tail); tail = XCDR (tail))
7000 {
7001 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7002 if (CHAR_CHARSET_P (c, charset))
7003 break;
7004 }
7005 return (! NILP (tail));
7006 }
7007
7008
7009 /* Return a list of coding systems that safely encode the text between
7010 START and END. If EXCLUDE is non-nil, it is a list of coding
7011 systems not to check. The returned list doesn't contain any such
7012 coding systems. In any case, if the text contains only ASCII or is
7013 unibyte, return t. */
7014
7015 DEFUN ("find-coding-systems-region-internal",
7016 Ffind_coding_systems_region_internal,
7017 Sfind_coding_systems_region_internal, 2, 3, 0,
7018 doc: /* Internal use only. */)
7019 (start, end, exclude)
7020 Lisp_Object start, end, exclude;
7021 {
7022 Lisp_Object coding_attrs_list, safe_codings;
7023 EMACS_INT start_byte, end_byte;
7024 const unsigned char *p, *pbeg, *pend;
7025 int c;
7026 Lisp_Object tail, elt;
7027
7028 if (STRINGP (start))
7029 {
7030 if (!STRING_MULTIBYTE (start)
7031 || SCHARS (start) == SBYTES (start))
7032 return Qt;
7033 start_byte = 0;
7034 end_byte = SBYTES (start);
7035 }
7036 else
7037 {
7038 CHECK_NUMBER_COERCE_MARKER (start);
7039 CHECK_NUMBER_COERCE_MARKER (end);
7040 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7041 args_out_of_range (start, end);
7042 if (NILP (current_buffer->enable_multibyte_characters))
7043 return Qt;
7044 start_byte = CHAR_TO_BYTE (XINT (start));
7045 end_byte = CHAR_TO_BYTE (XINT (end));
7046 if (XINT (end) - XINT (start) == end_byte - start_byte)
7047 return Qt;
7048
7049 if (XINT (start) < GPT && XINT (end) > GPT)
7050 {
7051 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7052 move_gap_both (XINT (start), start_byte);
7053 else
7054 move_gap_both (XINT (end), end_byte);
7055 }
7056 }
7057
7058 coding_attrs_list = Qnil;
7059 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7060 if (NILP (exclude)
7061 || NILP (Fmemq (XCAR (tail), exclude)))
7062 {
7063 Lisp_Object attrs;
7064
7065 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7066 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7067 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7068 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7069 }
7070
7071 if (STRINGP (start))
7072 p = pbeg = SDATA (start);
7073 else
7074 p = pbeg = BYTE_POS_ADDR (start_byte);
7075 pend = p + (end_byte - start_byte);
7076
7077 while (p < pend && ASCII_BYTE_P (*p)) p++;
7078 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7079
7080 while (p < pend)
7081 {
7082 if (ASCII_BYTE_P (*p))
7083 p++;
7084 else
7085 {
7086 c = STRING_CHAR_ADVANCE (p);
7087
7088 charset_map_loaded = 0;
7089 for (tail = coding_attrs_list; CONSP (tail);)
7090 {
7091 elt = XCAR (tail);
7092 if (NILP (elt))
7093 tail = XCDR (tail);
7094 else if (char_encodable_p (c, elt))
7095 tail = XCDR (tail);
7096 else if (CONSP (XCDR (tail)))
7097 {
7098 XSETCAR (tail, XCAR (XCDR (tail)));
7099 XSETCDR (tail, XCDR (XCDR (tail)));
7100 }
7101 else
7102 {
7103 XSETCAR (tail, Qnil);
7104 tail = XCDR (tail);
7105 }
7106 }
7107 if (charset_map_loaded)
7108 {
7109 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7110
7111 if (STRINGP (start))
7112 pbeg = SDATA (start);
7113 else
7114 pbeg = BYTE_POS_ADDR (start_byte);
7115 p = pbeg + p_offset;
7116 pend = pbeg + pend_offset;
7117 }
7118 }
7119 }
7120
7121 safe_codings = Qnil;
7122 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7123 if (! NILP (XCAR (tail)))
7124 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7125
7126 return safe_codings;
7127 }
7128
7129
7130 DEFUN ("unencodable-char-position", Funencodable_char_position,
7131 Sunencodable_char_position, 3, 5, 0,
7132 doc: /*
7133 Return position of first un-encodable character in a region.
7134 START and END specfiy the region and CODING-SYSTEM specifies the
7135 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7136
7137 If optional 4th argument COUNT is non-nil, it specifies at most how
7138 many un-encodable characters to search. In this case, the value is a
7139 list of positions.
7140
7141 If optional 5th argument STRING is non-nil, it is a string to search
7142 for un-encodable characters. In that case, START and END are indexes
7143 to the string. */)
7144 (start, end, coding_system, count, string)
7145 Lisp_Object start, end, coding_system, count, string;
7146 {
7147 int n;
7148 struct coding_system coding;
7149 Lisp_Object attrs, charset_list;
7150 Lisp_Object positions;
7151 int from, to;
7152 const unsigned char *p, *stop, *pend;
7153 int ascii_compatible;
7154
7155 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7156 attrs = CODING_ID_ATTRS (coding.id);
7157 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7158 return Qnil;
7159 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7160 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7161
7162 if (NILP (string))
7163 {
7164 validate_region (&start, &end);
7165 from = XINT (start);
7166 to = XINT (end);
7167 if (NILP (current_buffer->enable_multibyte_characters)
7168 || (ascii_compatible
7169 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7170 return Qnil;
7171 p = CHAR_POS_ADDR (from);
7172 pend = CHAR_POS_ADDR (to);
7173 if (from < GPT && to >= GPT)
7174 stop = GPT_ADDR;
7175 else
7176 stop = pend;
7177 }
7178 else
7179 {
7180 CHECK_STRING (string);
7181 CHECK_NATNUM (start);
7182 CHECK_NATNUM (end);
7183 from = XINT (start);
7184 to = XINT (end);
7185 if (from > to
7186 || to > SCHARS (string))
7187 args_out_of_range_3 (string, start, end);
7188 if (! STRING_MULTIBYTE (string))
7189 return Qnil;
7190 p = SDATA (string) + string_char_to_byte (string, from);
7191 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7192 if (ascii_compatible && (to - from) == (pend - p))
7193 return Qnil;
7194 }
7195
7196 if (NILP (count))
7197 n = 1;
7198 else
7199 {
7200 CHECK_NATNUM (count);
7201 n = XINT (count);
7202 }
7203
7204 positions = Qnil;
7205 while (1)
7206 {
7207 int c;
7208
7209 if (ascii_compatible)
7210 while (p < stop && ASCII_BYTE_P (*p))
7211 p++, from++;
7212 if (p >= stop)
7213 {
7214 if (p >= pend)
7215 break;
7216 stop = pend;
7217 p = GAP_END_ADDR;
7218 }
7219
7220 c = STRING_CHAR_ADVANCE (p);
7221 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7222 && ! char_charset (c, charset_list, NULL))
7223 {
7224 positions = Fcons (make_number (from), positions);
7225 n--;
7226 if (n == 0)
7227 break;
7228 }
7229
7230 from++;
7231 }
7232
7233 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7234 }
7235
7236
7237 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7238 Scheck_coding_systems_region, 3, 3, 0,
7239 doc: /* Check if the region is encodable by coding systems.
7240
7241 START and END are buffer positions specifying the region.
7242 CODING-SYSTEM-LIST is a list of coding systems to check.
7243
7244 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7245 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7246 whole region, POS0, POS1, ... are buffer positions where non-encodable
7247 characters are found.
7248
7249 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7250 value is nil.
7251
7252 START may be a string. In that case, check if the string is
7253 encodable, and the value contains indices to the string instead of
7254 buffer positions. END is ignored. */)
7255 (start, end, coding_system_list)
7256 Lisp_Object start, end, coding_system_list;
7257 {
7258 Lisp_Object list;
7259 EMACS_INT start_byte, end_byte;
7260 int pos;
7261 const unsigned char *p, *pbeg, *pend;
7262 int c;
7263 Lisp_Object tail, elt;
7264
7265 if (STRINGP (start))
7266 {
7267 if (!STRING_MULTIBYTE (start)
7268 && SCHARS (start) != SBYTES (start))
7269 return Qnil;
7270 start_byte = 0;
7271 end_byte = SBYTES (start);
7272 pos = 0;
7273 }
7274 else
7275 {
7276 CHECK_NUMBER_COERCE_MARKER (start);
7277 CHECK_NUMBER_COERCE_MARKER (end);
7278 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7279 args_out_of_range (start, end);
7280 if (NILP (current_buffer->enable_multibyte_characters))
7281 return Qnil;
7282 start_byte = CHAR_TO_BYTE (XINT (start));
7283 end_byte = CHAR_TO_BYTE (XINT (end));
7284 if (XINT (end) - XINT (start) == end_byte - start_byte)
7285 return Qt;
7286
7287 if (XINT (start) < GPT && XINT (end) > GPT)
7288 {
7289 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7290 move_gap_both (XINT (start), start_byte);
7291 else
7292 move_gap_both (XINT (end), end_byte);
7293 }
7294 pos = XINT (start);
7295 }
7296
7297 list = Qnil;
7298 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7299 {
7300 elt = XCAR (tail);
7301 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7302 Qnil)),
7303 list);
7304 }
7305
7306 if (STRINGP (start))
7307 p = pbeg = SDATA (start);
7308 else
7309 p = pbeg = BYTE_POS_ADDR (start_byte);
7310 pend = p + (end_byte - start_byte);
7311
7312 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7313 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7314
7315 while (p < pend)
7316 {
7317 if (ASCII_BYTE_P (*p))
7318 p++;
7319 else
7320 {
7321 c = STRING_CHAR_ADVANCE (p);
7322
7323 charset_map_loaded = 0;
7324 for (tail = list; CONSP (tail); tail = XCDR (tail))
7325 {
7326 elt = XCDR (XCAR (tail));
7327 if (! char_encodable_p (c, XCAR (elt)))
7328 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7329 }
7330 if (charset_map_loaded)
7331 {
7332 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7333
7334 if (STRINGP (start))
7335 pbeg = SDATA (start);
7336 else
7337 pbeg = BYTE_POS_ADDR (start_byte);
7338 p = pbeg + p_offset;
7339 pend = pbeg + pend_offset;
7340 }
7341 }
7342 pos++;
7343 }
7344
7345 tail = list;
7346 list = Qnil;
7347 for (; CONSP (tail); tail = XCDR (tail))
7348 {
7349 elt = XCAR (tail);
7350 if (CONSP (XCDR (XCDR (elt))))
7351 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7352 list);
7353 }
7354
7355 return list;
7356 }
7357
7358
7359
7360 Lisp_Object
7361 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7362 Lisp_Object start, end, coding_system, dst_object;
7363 int encodep, norecord;
7364 {
7365 struct coding_system coding;
7366 EMACS_INT from, from_byte, to, to_byte;
7367 Lisp_Object src_object;
7368
7369 CHECK_NUMBER_COERCE_MARKER (start);
7370 CHECK_NUMBER_COERCE_MARKER (end);
7371 if (NILP (coding_system))
7372 coding_system = Qno_conversion;
7373 else
7374 CHECK_CODING_SYSTEM (coding_system);
7375 src_object = Fcurrent_buffer ();
7376 if (NILP (dst_object))
7377 dst_object = src_object;
7378 else if (! EQ (dst_object, Qt))
7379 CHECK_BUFFER (dst_object);
7380
7381 validate_region (&start, &end);
7382 from = XFASTINT (start);
7383 from_byte = CHAR_TO_BYTE (from);
7384 to = XFASTINT (end);
7385 to_byte = CHAR_TO_BYTE (to);
7386
7387 setup_coding_system (coding_system, &coding);
7388 coding.mode |= CODING_MODE_LAST_BLOCK;
7389
7390 if (encodep)
7391 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7392 dst_object);
7393 else
7394 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7395 dst_object);
7396 if (! norecord)
7397 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7398
7399 if (coding.result != CODING_RESULT_SUCCESS)
7400 error ("Code conversion error: %d", coding.result);
7401
7402 return (BUFFERP (dst_object)
7403 ? make_number (coding.produced_char)
7404 : coding.dst_object);
7405 }
7406
7407
7408 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7409 3, 4, "r\nzCoding system: ",
7410 doc: /* Decode the current region from the specified coding system.
7411 When called from a program, takes four arguments:
7412 START, END, CODING-SYSTEM, and DESTINATION.
7413 START and END are buffer positions.
7414
7415 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7416 If nil, the region between START and END is replace by the decoded text.
7417 If buffer, the decoded text is inserted in the buffer.
7418 If t, the decoded text is returned.
7419
7420 This function sets `last-coding-system-used' to the precise coding system
7421 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7422 not fully specified.)
7423 It returns the length of the decoded text. */)
7424 (start, end, coding_system, destination)
7425 Lisp_Object start, end, coding_system, destination;
7426 {
7427 return code_convert_region (start, end, coding_system, destination, 0, 0);
7428 }
7429
7430 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7431 3, 4, "r\nzCoding system: ",
7432 doc: /* Encode the current region by specified coding system.
7433 When called from a program, takes three arguments:
7434 START, END, and CODING-SYSTEM. START and END are buffer positions.
7435
7436 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7437 If nil, the region between START and END is replace by the encoded text.
7438 If buffer, the encoded text is inserted in the buffer.
7439 If t, the encoded text is returned.
7440
7441 This function sets `last-coding-system-used' to the precise coding system
7442 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7443 not fully specified.)
7444 It returns the length of the encoded text. */)
7445 (start, end, coding_system, destination)
7446 Lisp_Object start, end, coding_system, destination;
7447 {
7448 return code_convert_region (start, end, coding_system, destination, 1, 0);
7449 }
7450
7451 Lisp_Object
7452 code_convert_string (string, coding_system, dst_object,
7453 encodep, nocopy, norecord)
7454 Lisp_Object string, coding_system, dst_object;
7455 int encodep, nocopy, norecord;
7456 {
7457 struct coding_system coding;
7458 EMACS_INT chars, bytes;
7459
7460 CHECK_STRING (string);
7461 if (NILP (coding_system))
7462 {
7463 if (! norecord)
7464 Vlast_coding_system_used = Qno_conversion;
7465 if (NILP (dst_object))
7466 return (nocopy ? Fcopy_sequence (string) : string);
7467 }
7468
7469 if (NILP (coding_system))
7470 coding_system = Qno_conversion;
7471 else
7472 CHECK_CODING_SYSTEM (coding_system);
7473 if (NILP (dst_object))
7474 dst_object = Qt;
7475 else if (! EQ (dst_object, Qt))
7476 CHECK_BUFFER (dst_object);
7477
7478 setup_coding_system (coding_system, &coding);
7479 coding.mode |= CODING_MODE_LAST_BLOCK;
7480 chars = SCHARS (string);
7481 bytes = SBYTES (string);
7482 if (encodep)
7483 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7484 else
7485 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7486 if (! norecord)
7487 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7488
7489 if (coding.result != CODING_RESULT_SUCCESS)
7490 error ("Code conversion error: %d", coding.result);
7491
7492 return (BUFFERP (dst_object)
7493 ? make_number (coding.produced_char)
7494 : coding.dst_object);
7495 }
7496
7497
7498 /* Encode or decode STRING according to CODING_SYSTEM.
7499 Do not set Vlast_coding_system_used.
7500
7501 This function is called only from macros DECODE_FILE and
7502 ENCODE_FILE, thus we ignore character composition. */
7503
7504 Lisp_Object
7505 code_convert_string_norecord (string, coding_system, encodep)
7506 Lisp_Object string, coding_system;
7507 int encodep;
7508 {
7509 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7510 }
7511
7512
7513 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7514 2, 4, 0,
7515 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7516
7517 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7518 if the decoding operation is trivial.
7519
7520 Optional fourth arg BUFFER non-nil meant that the decoded text is
7521 inserted in BUFFER instead of returned as a string. In this case,
7522 the return value is BUFFER.
7523
7524 This function sets `last-coding-system-used' to the precise coding system
7525 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7526 not fully specified. */)
7527 (string, coding_system, nocopy, buffer)
7528 Lisp_Object string, coding_system, nocopy, buffer;
7529 {
7530 return code_convert_string (string, coding_system, buffer,
7531 0, ! NILP (nocopy), 0);
7532 }
7533
7534 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7535 2, 4, 0,
7536 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7537
7538 Optional third arg NOCOPY non-nil means it is OK to return STRING
7539 itself if the encoding operation is trivial.
7540
7541 Optional fourth arg BUFFER non-nil meant that the encoded text is
7542 inserted in BUFFER instead of returned as a string. In this case,
7543 the return value is BUFFER.
7544
7545 This function sets `last-coding-system-used' to the precise coding system
7546 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7547 not fully specified.) */)
7548 (string, coding_system, nocopy, buffer)
7549 Lisp_Object string, coding_system, nocopy, buffer;
7550 {
7551 return code_convert_string (string, coding_system, buffer,
7552 1, ! NILP (nocopy), 1);
7553 }
7554
7555 \f
7556 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7557 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7558 Return the corresponding character. */)
7559 (code)
7560 Lisp_Object code;
7561 {
7562 Lisp_Object spec, attrs, val;
7563 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7564 int c;
7565
7566 CHECK_NATNUM (code);
7567 c = XFASTINT (code);
7568 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7569 attrs = AREF (spec, 0);
7570
7571 if (ASCII_BYTE_P (c)
7572 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7573 return code;
7574
7575 val = CODING_ATTR_CHARSET_LIST (attrs);
7576 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7577 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7578 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7579
7580 if (c <= 0x7F)
7581 charset = charset_roman;
7582 else if (c >= 0xA0 && c < 0xDF)
7583 {
7584 charset = charset_kana;
7585 c -= 0x80;
7586 }
7587 else
7588 {
7589 int s1 = c >> 8, s2 = c & 0xFF;
7590
7591 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7592 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7593 error ("Invalid code: %d", code);
7594 SJIS_TO_JIS (c);
7595 charset = charset_kanji;
7596 }
7597 c = DECODE_CHAR (charset, c);
7598 if (c < 0)
7599 error ("Invalid code: %d", code);
7600 return make_number (c);
7601 }
7602
7603
7604 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7605 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7606 Return the corresponding code in SJIS. */)
7607 (ch)
7608 Lisp_Object ch;
7609 {
7610 Lisp_Object spec, attrs, charset_list;
7611 int c;
7612 struct charset *charset;
7613 unsigned code;
7614
7615 CHECK_CHARACTER (ch);
7616 c = XFASTINT (ch);
7617 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7618 attrs = AREF (spec, 0);
7619
7620 if (ASCII_CHAR_P (c)
7621 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7622 return ch;
7623
7624 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7625 charset = char_charset (c, charset_list, &code);
7626 if (code == CHARSET_INVALID_CODE (charset))
7627 error ("Can't encode by shift_jis encoding: %d", c);
7628 JIS_TO_SJIS (code);
7629
7630 return make_number (code);
7631 }
7632
7633 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7634 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7635 Return the corresponding character. */)
7636 (code)
7637 Lisp_Object code;
7638 {
7639 Lisp_Object spec, attrs, val;
7640 struct charset *charset_roman, *charset_big5, *charset;
7641 int c;
7642
7643 CHECK_NATNUM (code);
7644 c = XFASTINT (code);
7645 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7646 attrs = AREF (spec, 0);
7647
7648 if (ASCII_BYTE_P (c)
7649 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7650 return code;
7651
7652 val = CODING_ATTR_CHARSET_LIST (attrs);
7653 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7654 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7655
7656 if (c <= 0x7F)
7657 charset = charset_roman;
7658 else
7659 {
7660 int b1 = c >> 8, b2 = c & 0x7F;
7661 if (b1 < 0xA1 || b1 > 0xFE
7662 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7663 error ("Invalid code: %d", code);
7664 charset = charset_big5;
7665 }
7666 c = DECODE_CHAR (charset, (unsigned )c);
7667 if (c < 0)
7668 error ("Invalid code: %d", code);
7669 return make_number (c);
7670 }
7671
7672 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7673 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7674 Return the corresponding character code in Big5. */)
7675 (ch)
7676 Lisp_Object ch;
7677 {
7678 Lisp_Object spec, attrs, charset_list;
7679 struct charset *charset;
7680 int c;
7681 unsigned code;
7682
7683 CHECK_CHARACTER (ch);
7684 c = XFASTINT (ch);
7685 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7686 attrs = AREF (spec, 0);
7687 if (ASCII_CHAR_P (c)
7688 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7689 return ch;
7690
7691 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7692 charset = char_charset (c, charset_list, &code);
7693 if (code == CHARSET_INVALID_CODE (charset))
7694 error ("Can't encode by Big5 encoding: %d", c);
7695
7696 return make_number (code);
7697 }
7698
7699 \f
7700 DEFUN ("set-terminal-coding-system-internal",
7701 Fset_terminal_coding_system_internal,
7702 Sset_terminal_coding_system_internal, 1, 1, 0,
7703 doc: /* Internal use only. */)
7704 (coding_system)
7705 Lisp_Object coding_system;
7706 {
7707 CHECK_SYMBOL (coding_system);
7708 setup_coding_system (Fcheck_coding_system (coding_system),
7709 &terminal_coding);
7710
7711 /* We had better not send unsafe characters to terminal. */
7712 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7713 /* Characer composition should be disabled. */
7714 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7715 terminal_coding.src_multibyte = 1;
7716 terminal_coding.dst_multibyte = 0;
7717 return Qnil;
7718 }
7719
7720 DEFUN ("set-safe-terminal-coding-system-internal",
7721 Fset_safe_terminal_coding_system_internal,
7722 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7723 doc: /* Internal use only. */)
7724 (coding_system)
7725 Lisp_Object coding_system;
7726 {
7727 CHECK_SYMBOL (coding_system);
7728 setup_coding_system (Fcheck_coding_system (coding_system),
7729 &safe_terminal_coding);
7730 /* Characer composition should be disabled. */
7731 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7732 safe_terminal_coding.src_multibyte = 1;
7733 safe_terminal_coding.dst_multibyte = 0;
7734 return Qnil;
7735 }
7736
7737 DEFUN ("terminal-coding-system",
7738 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7739 doc: /* Return coding system specified for terminal output. */)
7740 ()
7741 {
7742 return CODING_ID_NAME (terminal_coding.id);
7743 }
7744
7745 DEFUN ("set-keyboard-coding-system-internal",
7746 Fset_keyboard_coding_system_internal,
7747 Sset_keyboard_coding_system_internal, 1, 1, 0,
7748 doc: /* Internal use only. */)
7749 (coding_system)
7750 Lisp_Object coding_system;
7751 {
7752 CHECK_SYMBOL (coding_system);
7753 setup_coding_system (Fcheck_coding_system (coding_system),
7754 &keyboard_coding);
7755 /* Characer composition should be disabled. */
7756 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7757 return Qnil;
7758 }
7759
7760 DEFUN ("keyboard-coding-system",
7761 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7762 doc: /* Return coding system specified for decoding keyboard input. */)
7763 ()
7764 {
7765 return CODING_ID_NAME (keyboard_coding.id);
7766 }
7767
7768 \f
7769 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7770 Sfind_operation_coding_system, 1, MANY, 0,
7771 doc: /* Choose a coding system for an operation based on the target name.
7772 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7773 DECODING-SYSTEM is the coding system to use for decoding
7774 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7775 for encoding (in case OPERATION does encoding).
7776
7777 The first argument OPERATION specifies an I/O primitive:
7778 For file I/O, `insert-file-contents' or `write-region'.
7779 For process I/O, `call-process', `call-process-region', or `start-process'.
7780 For network I/O, `open-network-stream'.
7781
7782 The remaining arguments should be the same arguments that were passed
7783 to the primitive. Depending on which primitive, one of those arguments
7784 is selected as the TARGET. For example, if OPERATION does file I/O,
7785 whichever argument specifies the file name is TARGET.
7786
7787 TARGET has a meaning which depends on OPERATION:
7788 For file I/O, TARGET is a file name.
7789 For process I/O, TARGET is a process name.
7790 For network I/O, TARGET is a service name or a port number
7791
7792 This function looks up what specified for TARGET in,
7793 `file-coding-system-alist', `process-coding-system-alist',
7794 or `network-coding-system-alist' depending on OPERATION.
7795 They may specify a coding system, a cons of coding systems,
7796 or a function symbol to call.
7797 In the last case, we call the function with one argument,
7798 which is a list of all the arguments given to this function.
7799
7800 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7801 (nargs, args)
7802 int nargs;
7803 Lisp_Object *args;
7804 {
7805 Lisp_Object operation, target_idx, target, val;
7806 register Lisp_Object chain;
7807
7808 if (nargs < 2)
7809 error ("Too few arguments");
7810 operation = args[0];
7811 if (!SYMBOLP (operation)
7812 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7813 error ("Invalid first arguement");
7814 if (nargs < 1 + XINT (target_idx))
7815 error ("Too few arguments for operation: %s",
7816 SDATA (SYMBOL_NAME (operation)));
7817 target = args[XINT (target_idx) + 1];
7818 if (!(STRINGP (target)
7819 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7820 error ("Invalid %dth argument", XINT (target_idx) + 1);
7821
7822 chain = ((EQ (operation, Qinsert_file_contents)
7823 || EQ (operation, Qwrite_region))
7824 ? Vfile_coding_system_alist
7825 : (EQ (operation, Qopen_network_stream)
7826 ? Vnetwork_coding_system_alist
7827 : Vprocess_coding_system_alist));
7828 if (NILP (chain))
7829 return Qnil;
7830
7831 for (; CONSP (chain); chain = XCDR (chain))
7832 {
7833 Lisp_Object elt;
7834
7835 elt = XCAR (chain);
7836 if (CONSP (elt)
7837 && ((STRINGP (target)
7838 && STRINGP (XCAR (elt))
7839 && fast_string_match (XCAR (elt), target) >= 0)
7840 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7841 {
7842 val = XCDR (elt);
7843 /* Here, if VAL is both a valid coding system and a valid
7844 function symbol, we return VAL as a coding system. */
7845 if (CONSP (val))
7846 return val;
7847 if (! SYMBOLP (val))
7848 return Qnil;
7849 if (! NILP (Fcoding_system_p (val)))
7850 return Fcons (val, val);
7851 if (! NILP (Ffboundp (val)))
7852 {
7853 val = call1 (val, Flist (nargs, args));
7854 if (CONSP (val))
7855 return val;
7856 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7857 return Fcons (val, val);
7858 }
7859 return Qnil;
7860 }
7861 }
7862 return Qnil;
7863 }
7864
7865 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7866 Sset_coding_system_priority, 0, MANY, 0,
7867 doc: /* Assign higher priority to the coding systems given as arguments.
7868 If multiple coding systems belongs to the same category,
7869 all but the first one are ignored.
7870
7871 usage: (set-coding-system-priority ...) */)
7872 (nargs, args)
7873 int nargs;
7874 Lisp_Object *args;
7875 {
7876 int i, j;
7877 int changed[coding_category_max];
7878 enum coding_category priorities[coding_category_max];
7879
7880 bzero (changed, sizeof changed);
7881
7882 for (i = j = 0; i < nargs; i++)
7883 {
7884 enum coding_category category;
7885 Lisp_Object spec, attrs;
7886
7887 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7888 attrs = AREF (spec, 0);
7889 category = XINT (CODING_ATTR_CATEGORY (attrs));
7890 if (changed[category])
7891 /* Ignore this coding system because a coding system of the
7892 same category already had a higher priority. */
7893 continue;
7894 changed[category] = 1;
7895 priorities[j++] = category;
7896 if (coding_categories[category].id >= 0
7897 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7898 setup_coding_system (args[i], &coding_categories[category]);
7899 Fset (AREF (Vcoding_category_table, category), args[i]);
7900 }
7901
7902 /* Now we have decided top J priorities. Reflect the order of the
7903 original priorities to the remaining priorities. */
7904
7905 for (i = j, j = 0; i < coding_category_max; i++, j++)
7906 {
7907 while (j < coding_category_max
7908 && changed[coding_priorities[j]])
7909 j++;
7910 if (j == coding_category_max)
7911 abort ();
7912 priorities[i] = coding_priorities[j];
7913 }
7914
7915 bcopy (priorities, coding_priorities, sizeof priorities);
7916
7917 /* Update `coding-category-list'. */
7918 Vcoding_category_list = Qnil;
7919 for (i = coding_category_max - 1; i >= 0; i--)
7920 Vcoding_category_list
7921 = Fcons (AREF (Vcoding_category_table, priorities[i]),
7922 Vcoding_category_list);
7923
7924 return Qnil;
7925 }
7926
7927 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7928 Scoding_system_priority_list, 0, 1, 0,
7929 doc: /* Return a list of coding systems ordered by their priorities.
7930 HIGHESTP non-nil means just return the highest priority one. */)
7931 (highestp)
7932 Lisp_Object highestp;
7933 {
7934 int i;
7935 Lisp_Object val;
7936
7937 for (i = 0, val = Qnil; i < coding_category_max; i++)
7938 {
7939 enum coding_category category = coding_priorities[i];
7940 int id = coding_categories[category].id;
7941 Lisp_Object attrs;
7942
7943 if (id < 0)
7944 continue;
7945 attrs = CODING_ID_ATTRS (id);
7946 if (! NILP (highestp))
7947 return CODING_ATTR_BASE_NAME (attrs);
7948 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7949 }
7950 return Fnreverse (val);
7951 }
7952
7953 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7954
7955 static Lisp_Object
7956 make_subsidiaries (base)
7957 Lisp_Object base;
7958 {
7959 Lisp_Object subsidiaries;
7960 int base_name_len = SBYTES (SYMBOL_NAME (base));
7961 char *buf = (char *) alloca (base_name_len + 6);
7962 int i;
7963
7964 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
7965 subsidiaries = Fmake_vector (make_number (3), Qnil);
7966 for (i = 0; i < 3; i++)
7967 {
7968 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7969 ASET (subsidiaries, i, intern (buf));
7970 }
7971 return subsidiaries;
7972 }
7973
7974
7975 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7976 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7977 doc: /* For internal use only.
7978 usage: (define-coding-system-internal ...) */)
7979 (nargs, args)
7980 int nargs;
7981 Lisp_Object *args;
7982 {
7983 Lisp_Object name;
7984 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7985 Lisp_Object attrs; /* Vector of attributes. */
7986 Lisp_Object eol_type;
7987 Lisp_Object aliases;
7988 Lisp_Object coding_type, charset_list, safe_charsets;
7989 enum coding_category category;
7990 Lisp_Object tail, val;
7991 int max_charset_id = 0;
7992 int i;
7993
7994 if (nargs < coding_arg_max)
7995 goto short_args;
7996
7997 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7998
7999 name = args[coding_arg_name];
8000 CHECK_SYMBOL (name);
8001 CODING_ATTR_BASE_NAME (attrs) = name;
8002
8003 val = args[coding_arg_mnemonic];
8004 if (! STRINGP (val))
8005 CHECK_CHARACTER (val);
8006 CODING_ATTR_MNEMONIC (attrs) = val;
8007
8008 coding_type = args[coding_arg_coding_type];
8009 CHECK_SYMBOL (coding_type);
8010 CODING_ATTR_TYPE (attrs) = coding_type;
8011
8012 charset_list = args[coding_arg_charset_list];
8013 if (SYMBOLP (charset_list))
8014 {
8015 if (EQ (charset_list, Qiso_2022))
8016 {
8017 if (! EQ (coding_type, Qiso_2022))
8018 error ("Invalid charset-list");
8019 charset_list = Viso_2022_charset_list;
8020 }
8021 else if (EQ (charset_list, Qemacs_mule))
8022 {
8023 if (! EQ (coding_type, Qemacs_mule))
8024 error ("Invalid charset-list");
8025 charset_list = Vemacs_mule_charset_list;
8026 }
8027 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8028 if (max_charset_id < XFASTINT (XCAR (tail)))
8029 max_charset_id = XFASTINT (XCAR (tail));
8030 }
8031 else
8032 {
8033 charset_list = Fcopy_sequence (charset_list);
8034 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8035 {
8036 struct charset *charset;
8037
8038 val = Fcar (tail);
8039 CHECK_CHARSET_GET_CHARSET (val, charset);
8040 if (EQ (coding_type, Qiso_2022)
8041 ? CHARSET_ISO_FINAL (charset) < 0
8042 : EQ (coding_type, Qemacs_mule)
8043 ? CHARSET_EMACS_MULE_ID (charset) < 0
8044 : 0)
8045 error ("Can't handle charset `%s'",
8046 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8047
8048 XSETCAR (tail, make_number (charset->id));
8049 if (max_charset_id < charset->id)
8050 max_charset_id = charset->id;
8051 }
8052 }
8053 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8054
8055 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8056 make_number (255));
8057 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8058 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8059 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8060
8061 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8062
8063 val = args[coding_arg_decode_translation_table];
8064 if (! NILP (val))
8065 CHECK_CHAR_TABLE (val);
8066 CODING_ATTR_DECODE_TBL (attrs) = val;
8067
8068 val = args[coding_arg_encode_translation_table];
8069 if (! NILP (val))
8070 CHECK_CHAR_TABLE (val);
8071 CODING_ATTR_ENCODE_TBL (attrs) = val;
8072
8073 val = args[coding_arg_post_read_conversion];
8074 CHECK_SYMBOL (val);
8075 CODING_ATTR_POST_READ (attrs) = val;
8076
8077 val = args[coding_arg_pre_write_conversion];
8078 CHECK_SYMBOL (val);
8079 CODING_ATTR_PRE_WRITE (attrs) = val;
8080
8081 val = args[coding_arg_default_char];
8082 if (NILP (val))
8083 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8084 else
8085 {
8086 CHECK_CHARACTER (val);
8087 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8088 }
8089
8090 val = args[coding_arg_for_unibyte];
8091 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8092
8093 val = args[coding_arg_plist];
8094 CHECK_LIST (val);
8095 CODING_ATTR_PLIST (attrs) = val;
8096
8097 if (EQ (coding_type, Qcharset))
8098 {
8099 Lisp_Object list;
8100 /* Generate a lisp vector of 256 elements. Each element is nil,
8101 integer, or a list of charset IDs.
8102
8103 If Nth element is nil, the byte code N is invalid in this
8104 coding system.
8105
8106 If Nth element is a number NUM, N is the first byte of a
8107 charset whose ID is NUM.
8108
8109 If Nth element is a list of charset IDs, N is the first byte
8110 of one of them. The list is sorted by dimensions of the
8111 charsets. A charset of smaller dimension comes firtst.
8112 */
8113 for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
8114 {
8115 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8116
8117 if (charset->method == CHARSET_METHOD_SUPERSET)
8118 {
8119 val = CHARSET_SUPERSET (charset);
8120 for (; CONSP (val); val = XCDR (val))
8121 list = Fcons (XCAR (XCAR (val)), list);
8122 }
8123 else
8124 list = Fcons (XCAR (tail), list);
8125 }
8126
8127 val = Fmake_vector (make_number (256), Qnil);
8128
8129 for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
8130 {
8131 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8132 int dim = CHARSET_DIMENSION (charset);
8133 int idx = (dim - 1) * 4;
8134
8135 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8136 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8137
8138 for (i = charset->code_space[idx];
8139 i <= charset->code_space[idx + 1]; i++)
8140 {
8141 Lisp_Object tmp, tmp2;
8142 int dim2;
8143
8144 tmp = AREF (val, i);
8145 if (NILP (tmp))
8146 tmp = XCAR (tail);
8147 else if (NUMBERP (tmp))
8148 {
8149 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8150 if (dim < dim2)
8151 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8152 else
8153 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8154 }
8155 else
8156 {
8157 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8158 {
8159 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8160 if (dim < dim2)
8161 break;
8162 }
8163 if (NILP (tmp2))
8164 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8165 else
8166 {
8167 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8168 XSETCAR (tmp2, XCAR (tail));
8169 }
8170 }
8171 ASET (val, i, tmp);
8172 }
8173 }
8174 ASET (attrs, coding_attr_charset_valids, val);
8175 category = coding_category_charset;
8176 }
8177 else if (EQ (coding_type, Qccl))
8178 {
8179 Lisp_Object valids;
8180
8181 if (nargs < coding_arg_ccl_max)
8182 goto short_args;
8183
8184 val = args[coding_arg_ccl_decoder];
8185 CHECK_CCL_PROGRAM (val);
8186 if (VECTORP (val))
8187 val = Fcopy_sequence (val);
8188 ASET (attrs, coding_attr_ccl_decoder, val);
8189
8190 val = args[coding_arg_ccl_encoder];
8191 CHECK_CCL_PROGRAM (val);
8192 if (VECTORP (val))
8193 val = Fcopy_sequence (val);
8194 ASET (attrs, coding_attr_ccl_encoder, val);
8195
8196 val = args[coding_arg_ccl_valids];
8197 valids = Fmake_string (make_number (256), make_number (0));
8198 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8199 {
8200 int from, to;
8201
8202 val = Fcar (tail);
8203 if (INTEGERP (val))
8204 {
8205 from = to = XINT (val);
8206 if (from < 0 || from > 255)
8207 args_out_of_range_3 (val, make_number (0), make_number (255));
8208 }
8209 else
8210 {
8211 CHECK_CONS (val);
8212 CHECK_NATNUM_CAR (val);
8213 CHECK_NATNUM_CDR (val);
8214 from = XINT (XCAR (val));
8215 if (from > 255)
8216 args_out_of_range_3 (XCAR (val),
8217 make_number (0), make_number (255));
8218 to = XINT (XCDR (val));
8219 if (to < from || to > 255)
8220 args_out_of_range_3 (XCDR (val),
8221 XCAR (val), make_number (255));
8222 }
8223 for (i = from; i <= to; i++)
8224 SSET (valids, i, 1);
8225 }
8226 ASET (attrs, coding_attr_ccl_valids, valids);
8227
8228 category = coding_category_ccl;
8229 }
8230 else if (EQ (coding_type, Qutf_16))
8231 {
8232 Lisp_Object bom, endian;
8233
8234 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8235
8236 if (nargs < coding_arg_utf16_max)
8237 goto short_args;
8238
8239 bom = args[coding_arg_utf16_bom];
8240 if (! NILP (bom) && ! EQ (bom, Qt))
8241 {
8242 CHECK_CONS (bom);
8243 val = XCAR (bom);
8244 CHECK_CODING_SYSTEM (val);
8245 val = XCDR (bom);
8246 CHECK_CODING_SYSTEM (val);
8247 }
8248 ASET (attrs, coding_attr_utf_16_bom, bom);
8249
8250 endian = args[coding_arg_utf16_endian];
8251 CHECK_SYMBOL (endian);
8252 if (NILP (endian))
8253 endian = Qbig;
8254 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8255 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8256 ASET (attrs, coding_attr_utf_16_endian, endian);
8257
8258 category = (CONSP (bom)
8259 ? coding_category_utf_16_auto
8260 : NILP (bom)
8261 ? (EQ (endian, Qbig)
8262 ? coding_category_utf_16_be_nosig
8263 : coding_category_utf_16_le_nosig)
8264 : (EQ (endian, Qbig)
8265 ? coding_category_utf_16_be
8266 : coding_category_utf_16_le));
8267 }
8268 else if (EQ (coding_type, Qiso_2022))
8269 {
8270 Lisp_Object initial, reg_usage, request, flags;
8271 int i;
8272
8273 if (nargs < coding_arg_iso2022_max)
8274 goto short_args;
8275
8276 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8277 CHECK_VECTOR (initial);
8278 for (i = 0; i < 4; i++)
8279 {
8280 val = Faref (initial, make_number (i));
8281 if (! NILP (val))
8282 {
8283 struct charset *charset;
8284
8285 CHECK_CHARSET_GET_CHARSET (val, charset);
8286 ASET (initial, i, make_number (CHARSET_ID (charset)));
8287 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8288 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8289 }
8290 else
8291 ASET (initial, i, make_number (-1));
8292 }
8293
8294 reg_usage = args[coding_arg_iso2022_reg_usage];
8295 CHECK_CONS (reg_usage);
8296 CHECK_NUMBER_CAR (reg_usage);
8297 CHECK_NUMBER_CDR (reg_usage);
8298
8299 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8300 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8301 {
8302 int id;
8303 Lisp_Object tmp;
8304
8305 val = Fcar (tail);
8306 CHECK_CONS (val);
8307 tmp = XCAR (val);
8308 CHECK_CHARSET_GET_ID (tmp, id);
8309 CHECK_NATNUM_CDR (val);
8310 if (XINT (XCDR (val)) >= 4)
8311 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8312 XSETCAR (val, make_number (id));
8313 }
8314
8315 flags = args[coding_arg_iso2022_flags];
8316 CHECK_NATNUM (flags);
8317 i = XINT (flags);
8318 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8319 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8320
8321 ASET (attrs, coding_attr_iso_initial, initial);
8322 ASET (attrs, coding_attr_iso_usage, reg_usage);
8323 ASET (attrs, coding_attr_iso_request, request);
8324 ASET (attrs, coding_attr_iso_flags, flags);
8325 setup_iso_safe_charsets (attrs);
8326
8327 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8328 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8329 | CODING_ISO_FLAG_SINGLE_SHIFT))
8330 ? coding_category_iso_7_else
8331 : EQ (args[coding_arg_charset_list], Qiso_2022)
8332 ? coding_category_iso_7
8333 : coding_category_iso_7_tight);
8334 else
8335 {
8336 int id = XINT (AREF (initial, 1));
8337
8338 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8339 || EQ (args[coding_arg_charset_list], Qiso_2022)
8340 || id < 0)
8341 ? coding_category_iso_8_else
8342 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8343 ? coding_category_iso_8_1
8344 : coding_category_iso_8_2);
8345 }
8346 if (category != coding_category_iso_8_1
8347 && category != coding_category_iso_8_2)
8348 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8349 }
8350 else if (EQ (coding_type, Qemacs_mule))
8351 {
8352 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8353 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8354 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8355 category = coding_category_emacs_mule;
8356 }
8357 else if (EQ (coding_type, Qshift_jis))
8358 {
8359
8360 struct charset *charset;
8361
8362 if (XINT (Flength (charset_list)) != 3)
8363 error ("There should be just three charsets");
8364
8365 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8366 if (CHARSET_DIMENSION (charset) != 1)
8367 error ("Dimension of charset %s is not one",
8368 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8369 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8370 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8371
8372 charset_list = XCDR (charset_list);
8373 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8374 if (CHARSET_DIMENSION (charset) != 1)
8375 error ("Dimension of charset %s is not one",
8376 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8377
8378 charset_list = XCDR (charset_list);
8379 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8380 if (CHARSET_DIMENSION (charset) != 2)
8381 error ("Dimension of charset %s is not two",
8382 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8383
8384 category = coding_category_sjis;
8385 Vsjis_coding_system = name;
8386 }
8387 else if (EQ (coding_type, Qbig5))
8388 {
8389 struct charset *charset;
8390
8391 if (XINT (Flength (charset_list)) != 2)
8392 error ("There should be just two charsets");
8393
8394 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8395 if (CHARSET_DIMENSION (charset) != 1)
8396 error ("Dimension of charset %s is not one",
8397 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8398 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8399 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8400
8401 charset_list = XCDR (charset_list);
8402 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8403 if (CHARSET_DIMENSION (charset) != 2)
8404 error ("Dimension of charset %s is not two",
8405 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8406
8407 category = coding_category_big5;
8408 Vbig5_coding_system = name;
8409 }
8410 else if (EQ (coding_type, Qraw_text))
8411 {
8412 category = coding_category_raw_text;
8413 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8414 }
8415 else if (EQ (coding_type, Qutf_8))
8416 {
8417 category = coding_category_utf_8;
8418 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8419 }
8420 else if (EQ (coding_type, Qundecided))
8421 category = coding_category_undecided;
8422 else
8423 error ("Invalid coding system type: %s",
8424 SDATA (SYMBOL_NAME (coding_type)));
8425
8426 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8427 CODING_ATTR_PLIST (attrs)
8428 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8429 CODING_ATTR_PLIST (attrs)));
8430
8431 eol_type = args[coding_arg_eol_type];
8432 if (! NILP (eol_type)
8433 && ! EQ (eol_type, Qunix)
8434 && ! EQ (eol_type, Qdos)
8435 && ! EQ (eol_type, Qmac))
8436 error ("Invalid eol-type");
8437
8438 aliases = Fcons (name, Qnil);
8439
8440 if (NILP (eol_type))
8441 {
8442 eol_type = make_subsidiaries (name);
8443 for (i = 0; i < 3; i++)
8444 {
8445 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8446
8447 this_name = AREF (eol_type, i);
8448 this_aliases = Fcons (this_name, Qnil);
8449 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8450 this_spec = Fmake_vector (make_number (3), attrs);
8451 ASET (this_spec, 1, this_aliases);
8452 ASET (this_spec, 2, this_eol_type);
8453 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8454 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8455 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8456 Vcoding_system_alist);
8457 }
8458 }
8459
8460 spec_vec = Fmake_vector (make_number (3), attrs);
8461 ASET (spec_vec, 1, aliases);
8462 ASET (spec_vec, 2, eol_type);
8463
8464 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8465 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8466 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8467 Vcoding_system_alist);
8468
8469 {
8470 int id = coding_categories[category].id;
8471
8472 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8473 setup_coding_system (name, &coding_categories[category]);
8474 }
8475
8476 return Qnil;
8477
8478 short_args:
8479 return Fsignal (Qwrong_number_of_arguments,
8480 Fcons (intern ("define-coding-system-internal"),
8481 make_number (nargs)));
8482 }
8483
8484 /* Fixme: should this record the alias relationships for
8485 diagnostics? Should it update coding-system-list? */
8486 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8487 Sdefine_coding_system_alias, 2, 2, 0,
8488 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8489 (alias, coding_system)
8490 Lisp_Object alias, coding_system;
8491 {
8492 Lisp_Object spec, aliases, eol_type;
8493
8494 CHECK_SYMBOL (alias);
8495 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8496 aliases = AREF (spec, 1);
8497 while (!NILP (XCDR (aliases)))
8498 aliases = XCDR (aliases);
8499 XSETCDR (aliases, Fcons (alias, Qnil));
8500
8501 eol_type = AREF (spec, 2);
8502 if (VECTORP (eol_type))
8503 {
8504 Lisp_Object subsidiaries;
8505 int i;
8506
8507 subsidiaries = make_subsidiaries (alias);
8508 for (i = 0; i < 3; i++)
8509 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8510 AREF (eol_type, i));
8511
8512 ASET (spec, 2, subsidiaries);
8513 }
8514
8515 Fputhash (alias, spec, Vcoding_system_hash_table);
8516 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8517 Vcoding_system_alist);
8518
8519 return Qnil;
8520 }
8521
8522 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8523 1, 1, 0,
8524 doc: /* Return the base of CODING-SYSTEM.
8525 Any alias or subsidiary coding system is not a base coding system. */)
8526 (coding_system)
8527 Lisp_Object coding_system;
8528 {
8529 Lisp_Object spec, attrs;
8530
8531 if (NILP (coding_system))
8532 return (Qno_conversion);
8533 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8534 attrs = AREF (spec, 0);
8535 return CODING_ATTR_BASE_NAME (attrs);
8536 }
8537
8538 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8539 1, 1, 0,
8540 doc: "Return the property list of CODING-SYSTEM.")
8541 (coding_system)
8542 Lisp_Object coding_system;
8543 {
8544 Lisp_Object spec, attrs;
8545
8546 if (NILP (coding_system))
8547 coding_system = Qno_conversion;
8548 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8549 attrs = AREF (spec, 0);
8550 return CODING_ATTR_PLIST (attrs);
8551 }
8552
8553
8554 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8555 1, 1, 0,
8556 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8557 (coding_system)
8558 Lisp_Object coding_system;
8559 {
8560 Lisp_Object spec;
8561
8562 if (NILP (coding_system))
8563 coding_system = Qno_conversion;
8564 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8565 return AREF (spec, 1);
8566 }
8567
8568 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8569 Scoding_system_eol_type, 1, 1, 0,
8570 doc: /* Return eol-type of CODING-SYSTEM.
8571 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8572
8573 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8574 and CR respectively.
8575
8576 A vector value indicates that a format of end-of-line should be
8577 detected automatically. Nth element of the vector is the subsidiary
8578 coding system whose eol-type is N. */)
8579 (coding_system)
8580 Lisp_Object coding_system;
8581 {
8582 Lisp_Object spec, eol_type;
8583 int n;
8584
8585 if (NILP (coding_system))
8586 coding_system = Qno_conversion;
8587 if (! CODING_SYSTEM_P (coding_system))
8588 return Qnil;
8589 spec = CODING_SYSTEM_SPEC (coding_system);
8590 eol_type = AREF (spec, 2);
8591 if (VECTORP (eol_type))
8592 return Fcopy_sequence (eol_type);
8593 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8594 return make_number (n);
8595 }
8596
8597 #endif /* emacs */
8598
8599 \f
8600 /*** 9. Post-amble ***/
8601
8602 void
8603 init_coding_once ()
8604 {
8605 int i;
8606
8607 for (i = 0; i < coding_category_max; i++)
8608 {
8609 coding_categories[i].id = -1;
8610 coding_priorities[i] = i;
8611 }
8612
8613 /* ISO2022 specific initialize routine. */
8614 for (i = 0; i < 0x20; i++)
8615 iso_code_class[i] = ISO_control_0;
8616 for (i = 0x21; i < 0x7F; i++)
8617 iso_code_class[i] = ISO_graphic_plane_0;
8618 for (i = 0x80; i < 0xA0; i++)
8619 iso_code_class[i] = ISO_control_1;
8620 for (i = 0xA1; i < 0xFF; i++)
8621 iso_code_class[i] = ISO_graphic_plane_1;
8622 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8623 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8624 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8625 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8626 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8627 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8628 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8629 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8630 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8631 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8632
8633 for (i = 0; i < 256; i++)
8634 {
8635 emacs_mule_bytes[i] = 1;
8636 }
8637 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8638 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8639 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8640 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8641 }
8642
8643 #ifdef emacs
8644
8645 void
8646 syms_of_coding ()
8647 {
8648 staticpro (&Vcoding_system_hash_table);
8649 {
8650 Lisp_Object args[2];
8651 args[0] = QCtest;
8652 args[1] = Qeq;
8653 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8654 }
8655
8656 staticpro (&Vsjis_coding_system);
8657 Vsjis_coding_system = Qnil;
8658
8659 staticpro (&Vbig5_coding_system);
8660 Vbig5_coding_system = Qnil;
8661
8662 staticpro (&Vcode_conversion_work_buf_list);
8663 Vcode_conversion_work_buf_list = Qnil;
8664
8665 staticpro (&Vcode_conversion_reused_work_buf);
8666 Vcode_conversion_reused_work_buf = Qnil;
8667
8668 DEFSYM (Qcharset, "charset");
8669 DEFSYM (Qtarget_idx, "target-idx");
8670 DEFSYM (Qcoding_system_history, "coding-system-history");
8671 Fset (Qcoding_system_history, Qnil);
8672
8673 /* Target FILENAME is the first argument. */
8674 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8675 /* Target FILENAME is the third argument. */
8676 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8677
8678 DEFSYM (Qcall_process, "call-process");
8679 /* Target PROGRAM is the first argument. */
8680 Fput (Qcall_process, Qtarget_idx, make_number (0));
8681
8682 DEFSYM (Qcall_process_region, "call-process-region");
8683 /* Target PROGRAM is the third argument. */
8684 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8685
8686 DEFSYM (Qstart_process, "start-process");
8687 /* Target PROGRAM is the third argument. */
8688 Fput (Qstart_process, Qtarget_idx, make_number (2));
8689
8690 DEFSYM (Qopen_network_stream, "open-network-stream");
8691 /* Target SERVICE is the fourth argument. */
8692 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8693
8694 DEFSYM (Qcoding_system, "coding-system");
8695 DEFSYM (Qcoding_aliases, "coding-aliases");
8696
8697 DEFSYM (Qeol_type, "eol-type");
8698 DEFSYM (Qunix, "unix");
8699 DEFSYM (Qdos, "dos");
8700
8701 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8702 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8703 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8704 DEFSYM (Qdefault_char, "default-char");
8705 DEFSYM (Qundecided, "undecided");
8706 DEFSYM (Qno_conversion, "no-conversion");
8707 DEFSYM (Qraw_text, "raw-text");
8708
8709 DEFSYM (Qiso_2022, "iso-2022");
8710
8711 DEFSYM (Qutf_8, "utf-8");
8712 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8713
8714 DEFSYM (Qutf_16, "utf-16");
8715 DEFSYM (Qbig, "big");
8716 DEFSYM (Qlittle, "little");
8717
8718 DEFSYM (Qshift_jis, "shift-jis");
8719 DEFSYM (Qbig5, "big5");
8720
8721 DEFSYM (Qcoding_system_p, "coding-system-p");
8722
8723 DEFSYM (Qcoding_system_error, "coding-system-error");
8724 Fput (Qcoding_system_error, Qerror_conditions,
8725 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8726 Fput (Qcoding_system_error, Qerror_message,
8727 build_string ("Invalid coding system"));
8728
8729 /* Intern this now in case it isn't already done.
8730 Setting this variable twice is harmless.
8731 But don't staticpro it here--that is done in alloc.c. */
8732 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8733
8734 DEFSYM (Qtranslation_table, "translation-table");
8735 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8736 DEFSYM (Qtranslation_table_id, "translation-table-id");
8737 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8738 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8739
8740 DEFSYM (Qvalid_codes, "valid-codes");
8741
8742 DEFSYM (Qemacs_mule, "emacs-mule");
8743
8744 DEFSYM (QCcategory, ":category");
8745
8746 Vcoding_category_table
8747 = Fmake_vector (make_number (coding_category_max), Qnil);
8748 staticpro (&Vcoding_category_table);
8749 /* Followings are target of code detection. */
8750 ASET (Vcoding_category_table, coding_category_iso_7,
8751 intern ("coding-category-iso-7"));
8752 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8753 intern ("coding-category-iso-7-tight"));
8754 ASET (Vcoding_category_table, coding_category_iso_8_1,
8755 intern ("coding-category-iso-8-1"));
8756 ASET (Vcoding_category_table, coding_category_iso_8_2,
8757 intern ("coding-category-iso-8-2"));
8758 ASET (Vcoding_category_table, coding_category_iso_7_else,
8759 intern ("coding-category-iso-7-else"));
8760 ASET (Vcoding_category_table, coding_category_iso_8_else,
8761 intern ("coding-category-iso-8-else"));
8762 ASET (Vcoding_category_table, coding_category_utf_8,
8763 intern ("coding-category-utf-8"));
8764 ASET (Vcoding_category_table, coding_category_utf_16_be,
8765 intern ("coding-category-utf-16-be"));
8766 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8767 intern ("coding-category-utf-16-auto"));
8768 ASET (Vcoding_category_table, coding_category_utf_16_le,
8769 intern ("coding-category-utf-16-le"));
8770 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8771 intern ("coding-category-utf-16-be-nosig"));
8772 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8773 intern ("coding-category-utf-16-le-nosig"));
8774 ASET (Vcoding_category_table, coding_category_charset,
8775 intern ("coding-category-charset"));
8776 ASET (Vcoding_category_table, coding_category_sjis,
8777 intern ("coding-category-sjis"));
8778 ASET (Vcoding_category_table, coding_category_big5,
8779 intern ("coding-category-big5"));
8780 ASET (Vcoding_category_table, coding_category_ccl,
8781 intern ("coding-category-ccl"));
8782 ASET (Vcoding_category_table, coding_category_emacs_mule,
8783 intern ("coding-category-emacs-mule"));
8784 /* Followings are NOT target of code detection. */
8785 ASET (Vcoding_category_table, coding_category_raw_text,
8786 intern ("coding-category-raw-text"));
8787 ASET (Vcoding_category_table, coding_category_undecided,
8788 intern ("coding-category-undecided"));
8789
8790 defsubr (&Scoding_system_p);
8791 defsubr (&Sread_coding_system);
8792 defsubr (&Sread_non_nil_coding_system);
8793 defsubr (&Scheck_coding_system);
8794 defsubr (&Sdetect_coding_region);
8795 defsubr (&Sdetect_coding_string);
8796 defsubr (&Sfind_coding_systems_region_internal);
8797 defsubr (&Sunencodable_char_position);
8798 defsubr (&Scheck_coding_systems_region);
8799 defsubr (&Sdecode_coding_region);
8800 defsubr (&Sencode_coding_region);
8801 defsubr (&Sdecode_coding_string);
8802 defsubr (&Sencode_coding_string);
8803 defsubr (&Sdecode_sjis_char);
8804 defsubr (&Sencode_sjis_char);
8805 defsubr (&Sdecode_big5_char);
8806 defsubr (&Sencode_big5_char);
8807 defsubr (&Sset_terminal_coding_system_internal);
8808 defsubr (&Sset_safe_terminal_coding_system_internal);
8809 defsubr (&Sterminal_coding_system);
8810 defsubr (&Sset_keyboard_coding_system_internal);
8811 defsubr (&Skeyboard_coding_system);
8812 defsubr (&Sfind_operation_coding_system);
8813 defsubr (&Sset_coding_system_priority);
8814 defsubr (&Sdefine_coding_system_internal);
8815 defsubr (&Sdefine_coding_system_alias);
8816 defsubr (&Scoding_system_base);
8817 defsubr (&Scoding_system_plist);
8818 defsubr (&Scoding_system_aliases);
8819 defsubr (&Scoding_system_eol_type);
8820 defsubr (&Scoding_system_priority_list);
8821
8822 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8823 doc: /* List of coding systems.
8824
8825 Do not alter the value of this variable manually. This variable should be
8826 updated by the functions `define-coding-system' and
8827 `define-coding-system-alias'. */);
8828 Vcoding_system_list = Qnil;
8829
8830 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8831 doc: /* Alist of coding system names.
8832 Each element is one element list of coding system name.
8833 This variable is given to `completing-read' as TABLE argument.
8834
8835 Do not alter the value of this variable manually. This variable should be
8836 updated by the functions `make-coding-system' and
8837 `define-coding-system-alias'. */);
8838 Vcoding_system_alist = Qnil;
8839
8840 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8841 doc: /* List of coding-categories (symbols) ordered by priority.
8842
8843 On detecting a coding system, Emacs tries code detection algorithms
8844 associated with each coding-category one by one in this order. When
8845 one algorithm agrees with a byte sequence of source text, the coding
8846 system bound to the corresponding coding-category is selected. */);
8847 {
8848 int i;
8849
8850 Vcoding_category_list = Qnil;
8851 for (i = coding_category_max - 1; i >= 0; i--)
8852 Vcoding_category_list
8853 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8854 Vcoding_category_list);
8855 }
8856
8857 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8858 doc: /* Specify the coding system for read operations.
8859 It is useful to bind this variable with `let', but do not set it globally.
8860 If the value is a coding system, it is used for decoding on read operation.
8861 If not, an appropriate element is used from one of the coding system alists:
8862 There are three such tables, `file-coding-system-alist',
8863 `process-coding-system-alist', and `network-coding-system-alist'. */);
8864 Vcoding_system_for_read = Qnil;
8865
8866 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8867 doc: /* Specify the coding system for write operations.
8868 Programs bind this variable with `let', but you should not set it globally.
8869 If the value is a coding system, it is used for encoding of output,
8870 when writing it to a file and when sending it to a file or subprocess.
8871
8872 If this does not specify a coding system, an appropriate element
8873 is used from one of the coding system alists:
8874 There are three such tables, `file-coding-system-alist',
8875 `process-coding-system-alist', and `network-coding-system-alist'.
8876 For output to files, if the above procedure does not specify a coding system,
8877 the value of `buffer-file-coding-system' is used. */);
8878 Vcoding_system_for_write = Qnil;
8879
8880 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8881 doc: /*
8882 Coding system used in the latest file or process I/O. */);
8883 Vlast_coding_system_used = Qnil;
8884
8885 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8886 doc: /*
8887 *Non-nil means always inhibit code conversion of end-of-line format.
8888 See info node `Coding Systems' and info node `Text and Binary' concerning
8889 such conversion. */);
8890 inhibit_eol_conversion = 0;
8891
8892 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8893 doc: /*
8894 Non-nil means process buffer inherits coding system of process output.
8895 Bind it to t if the process output is to be treated as if it were a file
8896 read from some filesystem. */);
8897 inherit_process_coding_system = 0;
8898
8899 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8900 doc: /*
8901 Alist to decide a coding system to use for a file I/O operation.
8902 The format is ((PATTERN . VAL) ...),
8903 where PATTERN is a regular expression matching a file name,
8904 VAL is a coding system, a cons of coding systems, or a function symbol.
8905 If VAL is a coding system, it is used for both decoding and encoding
8906 the file contents.
8907 If VAL is a cons of coding systems, the car part is used for decoding,
8908 and the cdr part is used for encoding.
8909 If VAL is a function symbol, the function must return a coding system
8910 or a cons of coding systems which are used as above. The function gets
8911 the arguments with which `find-operation-coding-systems' was called.
8912
8913 See also the function `find-operation-coding-system'
8914 and the variable `auto-coding-alist'. */);
8915 Vfile_coding_system_alist = Qnil;
8916
8917 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8918 doc: /*
8919 Alist to decide a coding system to use for a process I/O operation.
8920 The format is ((PATTERN . VAL) ...),
8921 where PATTERN is a regular expression matching a program name,
8922 VAL is a coding system, a cons of coding systems, or a function symbol.
8923 If VAL is a coding system, it is used for both decoding what received
8924 from the program and encoding what sent to the program.
8925 If VAL is a cons of coding systems, the car part is used for decoding,
8926 and the cdr part is used for encoding.
8927 If VAL is a function symbol, the function must return a coding system
8928 or a cons of coding systems which are used as above.
8929
8930 See also the function `find-operation-coding-system'. */);
8931 Vprocess_coding_system_alist = Qnil;
8932
8933 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8934 doc: /*
8935 Alist to decide a coding system to use for a network I/O operation.
8936 The format is ((PATTERN . VAL) ...),
8937 where PATTERN is a regular expression matching a network service name
8938 or is a port number to connect to,
8939 VAL is a coding system, a cons of coding systems, or a function symbol.
8940 If VAL is a coding system, it is used for both decoding what received
8941 from the network stream and encoding what sent to the network stream.
8942 If VAL is a cons of coding systems, the car part is used for decoding,
8943 and the cdr part is used for encoding.
8944 If VAL is a function symbol, the function must return a coding system
8945 or a cons of coding systems which are used as above.
8946
8947 See also the function `find-operation-coding-system'. */);
8948 Vnetwork_coding_system_alist = Qnil;
8949
8950 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8951 doc: /* Coding system to use with system messages.
8952 Also used for decoding keyboard input on X Window system. */);
8953 Vlocale_coding_system = Qnil;
8954
8955 /* The eol mnemonics are reset in startup.el system-dependently. */
8956 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8957 doc: /*
8958 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8959 eol_mnemonic_unix = build_string (":");
8960
8961 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8962 doc: /*
8963 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8964 eol_mnemonic_dos = build_string ("\\");
8965
8966 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8967 doc: /*
8968 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8969 eol_mnemonic_mac = build_string ("/");
8970
8971 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8972 doc: /*
8973 *String displayed in mode line when end-of-line format is not yet determined. */);
8974 eol_mnemonic_undecided = build_string (":");
8975
8976 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8977 doc: /*
8978 *Non-nil enables character translation while encoding and decoding. */);
8979 Venable_character_translation = Qt;
8980
8981 DEFVAR_LISP ("standard-translation-table-for-decode",
8982 &Vstandard_translation_table_for_decode,
8983 doc: /* Table for translating characters while decoding. */);
8984 Vstandard_translation_table_for_decode = Qnil;
8985
8986 DEFVAR_LISP ("standard-translation-table-for-encode",
8987 &Vstandard_translation_table_for_encode,
8988 doc: /* Table for translating characters while encoding. */);
8989 Vstandard_translation_table_for_encode = Qnil;
8990
8991 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8992 doc: /* Alist of charsets vs revision numbers.
8993 While encoding, if a charset (car part of an element) is found,
8994 designate it with the escape sequence identifying revision (cdr part
8995 of the element). */);
8996 Vcharset_revision_table = Qnil;
8997
8998 DEFVAR_LISP ("default-process-coding-system",
8999 &Vdefault_process_coding_system,
9000 doc: /* Cons of coding systems used for process I/O by default.
9001 The car part is used for decoding a process output,
9002 the cdr part is used for encoding a text to be sent to a process. */);
9003 Vdefault_process_coding_system = Qnil;
9004
9005 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9006 doc: /*
9007 Table of extra Latin codes in the range 128..159 (inclusive).
9008 This is a vector of length 256.
9009 If Nth element is non-nil, the existence of code N in a file
9010 \(or output of subprocess) doesn't prevent it to be detected as
9011 a coding system of ISO 2022 variant which has a flag
9012 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9013 or reading output of a subprocess.
9014 Only 128th through 159th elements has a meaning. */);
9015 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9016
9017 DEFVAR_LISP ("select-safe-coding-system-function",
9018 &Vselect_safe_coding_system_function,
9019 doc: /*
9020 Function to call to select safe coding system for encoding a text.
9021
9022 If set, this function is called to force a user to select a proper
9023 coding system which can encode the text in the case that a default
9024 coding system used in each operation can't encode the text.
9025
9026 The default value is `select-safe-coding-system' (which see). */);
9027 Vselect_safe_coding_system_function = Qnil;
9028
9029 DEFVAR_BOOL ("coding-system-require-warning",
9030 &coding_system_require_warning,
9031 doc: /* Internal use only.
9032 If non-nil, on writing a file, `select-safe-coding-system-function' is
9033 called even if `coding-system-for-write' is non-nil. The command
9034 `universal-coding-system-argument' binds this variable to t temporarily. */);
9035 coding_system_require_warning = 0;
9036
9037
9038 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9039 &inhibit_iso_escape_detection,
9040 doc: /*
9041 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9042
9043 By default, on reading a file, Emacs tries to detect how the text is
9044 encoded. This code detection is sensitive to escape sequences. If
9045 the sequence is valid as ISO2022, the code is determined as one of
9046 the ISO2022 encodings, and the file is decoded by the corresponding
9047 coding system (e.g. `iso-2022-7bit').
9048
9049 However, there may be a case that you want to read escape sequences in
9050 a file as is. In such a case, you can set this variable to non-nil.
9051 Then, as the code detection ignores any escape sequences, no file is
9052 detected as encoded in some ISO2022 encoding. The result is that all
9053 escape sequences become visible in a buffer.
9054
9055 The default value is nil, and it is strongly recommended not to change
9056 it. That is because many Emacs Lisp source files that contain
9057 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9058 in Emacs's distribution, and they won't be decoded correctly on
9059 reading if you suppress escape sequence detection.
9060
9061 The other way to read escape sequences in a file without decoding is
9062 to explicitly specify some coding system that doesn't use ISO2022's
9063 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9064 inhibit_iso_escape_detection = 0;
9065
9066 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9067 doc: /* Char table for translating self-inserting characters.
9068 This is applied to the result of input methods, not their input. See also
9069 `keyboard-translate-table'. */);
9070 Vtranslation_table_for_input = Qnil;
9071
9072 {
9073 Lisp_Object args[coding_arg_max];
9074 Lisp_Object plist[16];
9075 int i;
9076
9077 for (i = 0; i < coding_arg_max; i++)
9078 args[i] = Qnil;
9079
9080 plist[0] = intern (":name");
9081 plist[1] = args[coding_arg_name] = Qno_conversion;
9082 plist[2] = intern (":mnemonic");
9083 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9084 plist[4] = intern (":coding-type");
9085 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9086 plist[6] = intern (":ascii-compatible-p");
9087 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9088 plist[8] = intern (":default-char");
9089 plist[9] = args[coding_arg_default_char] = make_number (0);
9090 plist[10] = intern (":for-unibyte");
9091 plist[11] = args[coding_arg_for_unibyte] = Qt;
9092 plist[12] = intern (":docstring");
9093 plist[13] = build_string ("Do no conversion.\n\
9094 \n\
9095 When you visit a file with this coding, the file is read into a\n\
9096 unibyte buffer as is, thus each byte of a file is treated as a\n\
9097 character.");
9098 plist[14] = intern (":eol-type");
9099 plist[15] = args[coding_arg_eol_type] = Qunix;
9100 args[coding_arg_plist] = Flist (16, plist);
9101 Fdefine_coding_system_internal (coding_arg_max, args);
9102 }
9103
9104 setup_coding_system (Qno_conversion, &keyboard_coding);
9105 setup_coding_system (Qno_conversion, &terminal_coding);
9106 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9107
9108 {
9109 int i;
9110
9111 for (i = 0; i < coding_category_max; i++)
9112 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9113 }
9114 }
9115
9116 char *
9117 emacs_strerror (error_number)
9118 int error_number;
9119 {
9120 char *str;
9121
9122 synchronize_system_messages_locale ();
9123 str = strerror (error_number);
9124
9125 if (! NILP (Vlocale_coding_system))
9126 {
9127 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9128 Vlocale_coding_system,
9129 0);
9130 str = (char *) SDATA (dec);
9131 }
9132
9133 return str;
9134 }
9135
9136 #endif /* emacs */